| author | alexv-smirnov <alex@ydb.tech> | 2022-08-18 16:52:30 +0300 |
|---|---|---|
| committer | alexv-smirnov <alex@ydb.tech> | 2022-08-18 16:52:30 +0300 |
| commit | c140abc954b61ab7d86af80bdeced01482d9971a (patch) | |
| tree | c47d70fa3213240d5e0eb59787a5325782a360de /contrib/go/_std_1.18/src/runtime | |
| parent | 0ce07b9705ed20e3fce2759eae41496014ca4c33 (diff) | |
| download | ydb-c140abc954b61ab7d86af80bdeced01482d9971a.tar.gz | |
temp fix ydb oss sync config to unlock sync on /vendor dependency
Diffstat (limited to 'contrib/go/_std_1.18/src/runtime')
194 files changed, 66316 insertions, 0 deletions
diff --git a/contrib/go/_std_1.18/src/runtime/alg.go b/contrib/go/_std_1.18/src/runtime/alg.go new file mode 100644 index 0000000000..5d7d1c77f4 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/alg.go @@ -0,0 +1,353 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "internal/goarch" + "unsafe" +) + +const ( + c0 = uintptr((8-goarch.PtrSize)/4*2860486313 + (goarch.PtrSize-4)/4*33054211828000289) + c1 = uintptr((8-goarch.PtrSize)/4*3267000013 + (goarch.PtrSize-4)/4*23344194077549503) +) + +func memhash0(p unsafe.Pointer, h uintptr) uintptr { + return h +} + +func memhash8(p unsafe.Pointer, h uintptr) uintptr { + return memhash(p, h, 1) +} + +func memhash16(p unsafe.Pointer, h uintptr) uintptr { + return memhash(p, h, 2) +} + +func memhash128(p unsafe.Pointer, h uintptr) uintptr { + return memhash(p, h, 16) +} + +//go:nosplit +func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr { + ptr := getclosureptr() + size := *(*uintptr)(unsafe.Pointer(ptr + unsafe.Sizeof(h))) + return memhash(p, h, size) +} + +// runtime variable to check if the processor we're running on +// actually supports the instructions used by the AES-based +// hash implementation. +var useAeshash bool + +// in asm_*.s +func memhash(p unsafe.Pointer, h, s uintptr) uintptr +func memhash32(p unsafe.Pointer, h uintptr) uintptr +func memhash64(p unsafe.Pointer, h uintptr) uintptr +func strhash(p unsafe.Pointer, h uintptr) uintptr + +func strhashFallback(a unsafe.Pointer, h uintptr) uintptr { + x := (*stringStruct)(a) + return memhashFallback(x.str, h, uintptr(x.len)) +} + +// NOTE: Because NaN != NaN, a map can contain any +// number of (mostly useless) entries keyed with NaNs. +// To avoid long hash chains, we assign a random number +// as the hash value for a NaN. + +func f32hash(p unsafe.Pointer, h uintptr) uintptr { + f := *(*float32)(p) + switch { + case f == 0: + return c1 * (c0 ^ h) // +0, -0 + case f != f: + return c1 * (c0 ^ h ^ uintptr(fastrand())) // any kind of NaN + default: + return memhash(p, h, 4) + } +} + +func f64hash(p unsafe.Pointer, h uintptr) uintptr { + f := *(*float64)(p) + switch { + case f == 0: + return c1 * (c0 ^ h) // +0, -0 + case f != f: + return c1 * (c0 ^ h ^ uintptr(fastrand())) // any kind of NaN + default: + return memhash(p, h, 8) + } +} + +func c64hash(p unsafe.Pointer, h uintptr) uintptr { + x := (*[2]float32)(p) + return f32hash(unsafe.Pointer(&x[1]), f32hash(unsafe.Pointer(&x[0]), h)) +} + +func c128hash(p unsafe.Pointer, h uintptr) uintptr { + x := (*[2]float64)(p) + return f64hash(unsafe.Pointer(&x[1]), f64hash(unsafe.Pointer(&x[0]), h)) +} + +func interhash(p unsafe.Pointer, h uintptr) uintptr { + a := (*iface)(p) + tab := a.tab + if tab == nil { + return h + } + t := tab._type + if t.equal == nil { + // Check hashability here. We could do this check inside + // typehash, but we want to report the topmost type in + // the error text (e.g. in a struct with a field of slice type + // we want to report the struct, not the slice). + panic(errorString("hash of unhashable type " + t.string())) + } + if isDirectIface(t) { + return c1 * typehash(t, unsafe.Pointer(&a.data), h^c0) + } else { + return c1 * typehash(t, a.data, h^c0) + } +} + +func nilinterhash(p unsafe.Pointer, h uintptr) uintptr { + a := (*eface)(p) + t := a._type + if t == nil { + return h + } + if t.equal == nil { + // See comment in interhash above. 
+ panic(errorString("hash of unhashable type " + t.string())) + } + if isDirectIface(t) { + return c1 * typehash(t, unsafe.Pointer(&a.data), h^c0) + } else { + return c1 * typehash(t, a.data, h^c0) + } +} + +// typehash computes the hash of the object of type t at address p. +// h is the seed. +// This function is seldom used. Most maps use for hashing either +// fixed functions (e.g. f32hash) or compiler-generated functions +// (e.g. for a type like struct { x, y string }). This implementation +// is slower but more general and is used for hashing interface types +// (called from interhash or nilinterhash, above) or for hashing in +// maps generated by reflect.MapOf (reflect_typehash, below). +// Note: this function must match the compiler generated +// functions exactly. See issue 37716. +func typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr { + if t.tflag&tflagRegularMemory != 0 { + // Handle ptr sizes specially, see issue 37086. + switch t.size { + case 4: + return memhash32(p, h) + case 8: + return memhash64(p, h) + default: + return memhash(p, h, t.size) + } + } + switch t.kind & kindMask { + case kindFloat32: + return f32hash(p, h) + case kindFloat64: + return f64hash(p, h) + case kindComplex64: + return c64hash(p, h) + case kindComplex128: + return c128hash(p, h) + case kindString: + return strhash(p, h) + case kindInterface: + i := (*interfacetype)(unsafe.Pointer(t)) + if len(i.mhdr) == 0 { + return nilinterhash(p, h) + } + return interhash(p, h) + case kindArray: + a := (*arraytype)(unsafe.Pointer(t)) + for i := uintptr(0); i < a.len; i++ { + h = typehash(a.elem, add(p, i*a.elem.size), h) + } + return h + case kindStruct: + s := (*structtype)(unsafe.Pointer(t)) + for _, f := range s.fields { + if f.name.isBlank() { + continue + } + h = typehash(f.typ, add(p, f.offset()), h) + } + return h + default: + // Should never happen, as typehash should only be called + // with comparable types. 
+ panic(errorString("hash of unhashable type " + t.string())) + } +} + +//go:linkname reflect_typehash reflect.typehash +func reflect_typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr { + return typehash(t, p, h) +} + +func memequal0(p, q unsafe.Pointer) bool { + return true +} +func memequal8(p, q unsafe.Pointer) bool { + return *(*int8)(p) == *(*int8)(q) +} +func memequal16(p, q unsafe.Pointer) bool { + return *(*int16)(p) == *(*int16)(q) +} +func memequal32(p, q unsafe.Pointer) bool { + return *(*int32)(p) == *(*int32)(q) +} +func memequal64(p, q unsafe.Pointer) bool { + return *(*int64)(p) == *(*int64)(q) +} +func memequal128(p, q unsafe.Pointer) bool { + return *(*[2]int64)(p) == *(*[2]int64)(q) +} +func f32equal(p, q unsafe.Pointer) bool { + return *(*float32)(p) == *(*float32)(q) +} +func f64equal(p, q unsafe.Pointer) bool { + return *(*float64)(p) == *(*float64)(q) +} +func c64equal(p, q unsafe.Pointer) bool { + return *(*complex64)(p) == *(*complex64)(q) +} +func c128equal(p, q unsafe.Pointer) bool { + return *(*complex128)(p) == *(*complex128)(q) +} +func strequal(p, q unsafe.Pointer) bool { + return *(*string)(p) == *(*string)(q) +} +func interequal(p, q unsafe.Pointer) bool { + x := *(*iface)(p) + y := *(*iface)(q) + return x.tab == y.tab && ifaceeq(x.tab, x.data, y.data) +} +func nilinterequal(p, q unsafe.Pointer) bool { + x := *(*eface)(p) + y := *(*eface)(q) + return x._type == y._type && efaceeq(x._type, x.data, y.data) +} +func efaceeq(t *_type, x, y unsafe.Pointer) bool { + if t == nil { + return true + } + eq := t.equal + if eq == nil { + panic(errorString("comparing uncomparable type " + t.string())) + } + if isDirectIface(t) { + // Direct interface types are ptr, chan, map, func, and single-element structs/arrays thereof. + // Maps and funcs are not comparable, so they can't reach here. + // Ptrs, chans, and single-element items can be compared directly using ==. + return x == y + } + return eq(x, y) +} +func ifaceeq(tab *itab, x, y unsafe.Pointer) bool { + if tab == nil { + return true + } + t := tab._type + eq := t.equal + if eq == nil { + panic(errorString("comparing uncomparable type " + t.string())) + } + if isDirectIface(t) { + // See comment in efaceeq. + return x == y + } + return eq(x, y) +} + +// Testing adapters for hash quality tests (see hash_test.go) +func stringHash(s string, seed uintptr) uintptr { + return strhash(noescape(unsafe.Pointer(&s)), seed) +} + +func bytesHash(b []byte, seed uintptr) uintptr { + s := (*slice)(unsafe.Pointer(&b)) + return memhash(s.array, seed, uintptr(s.len)) +} + +func int32Hash(i uint32, seed uintptr) uintptr { + return memhash32(noescape(unsafe.Pointer(&i)), seed) +} + +func int64Hash(i uint64, seed uintptr) uintptr { + return memhash64(noescape(unsafe.Pointer(&i)), seed) +} + +func efaceHash(i any, seed uintptr) uintptr { + return nilinterhash(noescape(unsafe.Pointer(&i)), seed) +} + +func ifaceHash(i interface { + F() +}, seed uintptr) uintptr { + return interhash(noescape(unsafe.Pointer(&i)), seed) +} + +const hashRandomBytes = goarch.PtrSize / 4 * 64 + +// used in asm_{386,amd64,arm64}.s to seed the hash function +var aeskeysched [hashRandomBytes]byte + +// used in hash{32,64}.go to seed the hash function +var hashkey [4]uintptr + +func alginit() { + // Install AES hash algorithms if the instructions needed are present. 
+ if (GOARCH == "386" || GOARCH == "amd64") && + cpu.X86.HasAES && // AESENC + cpu.X86.HasSSSE3 && // PSHUFB + cpu.X86.HasSSE41 { // PINSR{D,Q} + initAlgAES() + return + } + if GOARCH == "arm64" && cpu.ARM64.HasAES { + initAlgAES() + return + } + getRandomData((*[len(hashkey) * goarch.PtrSize]byte)(unsafe.Pointer(&hashkey))[:]) + hashkey[0] |= 1 // make sure these numbers are odd + hashkey[1] |= 1 + hashkey[2] |= 1 + hashkey[3] |= 1 +} + +func initAlgAES() { + useAeshash = true + // Initialize with random data so hash collisions will be hard to engineer. + getRandomData(aeskeysched[:]) +} + +// Note: These routines perform the read with a native endianness. +func readUnaligned32(p unsafe.Pointer) uint32 { + q := (*[4]byte)(p) + if goarch.BigEndian { + return uint32(q[3]) | uint32(q[2])<<8 | uint32(q[1])<<16 | uint32(q[0])<<24 + } + return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24 +} + +func readUnaligned64(p unsafe.Pointer) uint64 { + q := (*[8]byte)(p) + if goarch.BigEndian { + return uint64(q[7]) | uint64(q[6])<<8 | uint64(q[5])<<16 | uint64(q[4])<<24 | + uint64(q[3])<<32 | uint64(q[2])<<40 | uint64(q[1])<<48 | uint64(q[0])<<56 + } + return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56 +} diff --git a/contrib/go/_std_1.18/src/runtime/asan0.go b/contrib/go/_std_1.18/src/runtime/asan0.go new file mode 100644 index 0000000000..d5478d6bee --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/asan0.go @@ -0,0 +1,22 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !asan + +// Dummy ASan support API, used when not built with -asan. + +package runtime + +import ( + "unsafe" +) + +const asanenabled = false + +// Because asanenabled is false, none of these functions should be called. + +func asanread(addr unsafe.Pointer, sz uintptr) { throw("asan") } +func asanwrite(addr unsafe.Pointer, sz uintptr) { throw("asan") } +func asanunpoison(addr unsafe.Pointer, sz uintptr) { throw("asan") } +func asanpoison(addr unsafe.Pointer, sz uintptr) { throw("asan") } diff --git a/contrib/go/_std_1.18/src/runtime/asm.s b/contrib/go/_std_1.18/src/runtime/asm.s new file mode 100644 index 0000000000..84d56de7dd --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/asm.s @@ -0,0 +1,10 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#ifndef GOARCH_amd64 +TEXT ·sigpanic0(SB),NOSPLIT,$0-0 + JMP ·sigpanic<ABIInternal>(SB) +#endif diff --git a/contrib/go/_std_1.18/src/runtime/asm_amd64.s b/contrib/go/_std_1.18/src/runtime/asm_amd64.s new file mode 100644 index 0000000000..c08ae610fb --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/asm_amd64.s @@ -0,0 +1,2036 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "go_tls.h" +#include "funcdata.h" +#include "textflag.h" +#include "cgo/abi_amd64.h" + +// _rt0_amd64 is common startup code for most amd64 systems when using +// internal linking. This is the entry point for the program from the +// kernel for an ordinary -buildmode=exe program. The stack holds the +// number of arguments and the C-style argv. 
+TEXT _rt0_amd64(SB),NOSPLIT,$-8 + MOVQ 0(SP), DI // argc + LEAQ 8(SP), SI // argv + JMP runtime·rt0_go(SB) + +// main is common startup code for most amd64 systems when using +// external linking. The C startup code will call the symbol "main" +// passing argc and argv in the usual C ABI registers DI and SI. +TEXT main(SB),NOSPLIT,$-8 + JMP runtime·rt0_go(SB) + +// _rt0_amd64_lib is common startup code for most amd64 systems when +// using -buildmode=c-archive or -buildmode=c-shared. The linker will +// arrange to invoke this function as a global constructor (for +// c-archive) or when the shared library is loaded (for c-shared). +// We expect argc and argv to be passed in the usual C ABI registers +// DI and SI. +TEXT _rt0_amd64_lib(SB),NOSPLIT,$0 + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + MOVQ DI, _rt0_amd64_lib_argc<>(SB) + MOVQ SI, _rt0_amd64_lib_argv<>(SB) + + // Synchronous initialization. + CALL runtime·libpreinit(SB) + + // Create a new thread to finish Go runtime initialization. + MOVQ _cgo_sys_thread_create(SB), AX + TESTQ AX, AX + JZ nocgo + + // We're calling back to C. + // Align stack per ELF ABI requirements. + MOVQ SP, BX // Callee-save in C ABI + ANDQ $~15, SP + MOVQ $_rt0_amd64_lib_go(SB), DI + MOVQ $0, SI + CALL AX + MOVQ BX, SP + JMP restore + +nocgo: + ADJSP $16 + MOVQ $0x800000, 0(SP) // stacksize + MOVQ $_rt0_amd64_lib_go(SB), AX + MOVQ AX, 8(SP) // fn + CALL runtime·newosproc0(SB) + ADJSP $-16 + +restore: + POP_REGS_HOST_TO_ABI0() + RET + +// _rt0_amd64_lib_go initializes the Go runtime. +// This is started in a separate thread by _rt0_amd64_lib. +TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0 + MOVQ _rt0_amd64_lib_argc<>(SB), DI + MOVQ _rt0_amd64_lib_argv<>(SB), SI + JMP runtime·rt0_go(SB) + +DATA _rt0_amd64_lib_argc<>(SB)/8, $0 +GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8 +DATA _rt0_amd64_lib_argv<>(SB)/8, $0 +GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8 + +#ifdef GOAMD64_v2 +DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v2 microarchitecture support.\n" +#endif + +#ifdef GOAMD64_v3 +DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v3 microarchitecture support.\n" +#endif + +#ifdef GOAMD64_v4 +DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v4 microarchitecture support.\n" +#endif + +GLOBL bad_cpu_msg<>(SB), RODATA, $84 + +// Define a list of AMD64 microarchitecture level features +// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels + + // SSE3 SSSE3 CMPXCHNG16 SSE4.1 SSE4.2 POPCNT +#define V2_FEATURES_CX (1 << 0 | 1 << 9 | 1 << 13 | 1 << 19 | 1 << 20 | 1 << 23) + // LAHF/SAHF +#define V2_EXT_FEATURES_CX (1 << 0) + // FMA MOVBE OSXSAVE AVX F16C +#define V3_FEATURES_CX (V2_FEATURES_CX | 1 << 12 | 1 << 22 | 1 << 27 | 1 << 28 | 1 << 29) + // ABM (FOR LZNCT) +#define V3_EXT_FEATURES_CX (V2_EXT_FEATURES_CX | 1 << 5) + // BMI1 AVX2 BMI2 +#define V3_EXT_FEATURES_BX (1 << 3 | 1 << 5 | 1 << 8) + // XMM YMM +#define V3_OS_SUPPORT_AX (1 << 1 | 1 << 2) + +#define V4_FEATURES_CX V3_FEATURES_CX + +#define V4_EXT_FEATURES_CX V3_EXT_FEATURES_CX + // AVX512F AVX512DQ AVX512CD AVX512BW AVX512VL +#define V4_EXT_FEATURES_BX (V3_EXT_FEATURES_BX | 1 << 16 | 1 << 17 | 1 << 28 | 1 << 30 | 1 << 31) + // OPMASK ZMM +#define V4_OS_SUPPORT_AX (V3_OS_SUPPORT_AX | 1 << 5 | (1 << 6 | 1 << 7)) + +#ifdef GOAMD64_v2 +#define NEED_MAX_CPUID 0x80000001 +#define NEED_FEATURES_CX V2_FEATURES_CX +#define NEED_EXT_FEATURES_CX V2_EXT_FEATURES_CX +#endif + 
+#ifdef GOAMD64_v3 +#define NEED_MAX_CPUID 0x80000001 +#define NEED_FEATURES_CX V3_FEATURES_CX +#define NEED_EXT_FEATURES_CX V3_EXT_FEATURES_CX +#define NEED_EXT_FEATURES_BX V3_EXT_FEATURES_BX +#define NEED_OS_SUPPORT_AX V3_OS_SUPPORT_AX +#endif + +#ifdef GOAMD64_v4 +#define NEED_MAX_CPUID 0x80000001 +#define NEED_FEATURES_CX V4_FEATURES_CX +#define NEED_EXT_FEATURES_CX V4_EXT_FEATURES_CX +#define NEED_EXT_FEATURES_BX V4_EXT_FEATURES_BX + +// Downgrading v4 OS checks on Darwin for now, see CL 285572. +#ifdef GOOS_darwin +#define NEED_OS_SUPPORT_AX V3_OS_SUPPORT_AX +#else +#define NEED_OS_SUPPORT_AX V4_OS_SUPPORT_AX +#endif + +#endif + +TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 + // copy arguments forward on an even stack + MOVQ DI, AX // argc + MOVQ SI, BX // argv + SUBQ $(5*8), SP // 3args 2auto + ANDQ $~15, SP + MOVQ AX, 24(SP) + MOVQ BX, 32(SP) + + // create istack out of the given (operating system) stack. + // _cgo_init may update stackguard. + MOVQ $runtime·g0(SB), DI + LEAQ (-64*1024+104)(SP), BX + MOVQ BX, g_stackguard0(DI) + MOVQ BX, g_stackguard1(DI) + MOVQ BX, (g_stack+stack_lo)(DI) + MOVQ SP, (g_stack+stack_hi)(DI) + + // find out information about the processor we're on + MOVL $0, AX + CPUID + CMPL AX, $0 + JE nocpuinfo + + CMPL BX, $0x756E6547 // "Genu" + JNE notintel + CMPL DX, $0x49656E69 // "ineI" + JNE notintel + CMPL CX, $0x6C65746E // "ntel" + JNE notintel + MOVB $1, runtime·isIntel(SB) + +notintel: + // Load EAX=1 cpuid flags + MOVL $1, AX + CPUID + MOVL AX, runtime·processorVersionInfo(SB) + +nocpuinfo: + // if there is an _cgo_init, call it. + MOVQ _cgo_init(SB), AX + TESTQ AX, AX + JZ needtls + // arg 1: g0, already in DI + MOVQ $setg_gcc<>(SB), SI // arg 2: setg_gcc +#ifdef GOOS_android + MOVQ $runtime·tls_g(SB), DX // arg 3: &tls_g + // arg 4: TLS base, stored in slot 0 (Android's TLS_SLOT_SELF). + // Compensate for tls_g (+16). + MOVQ -16(TLS), CX +#else + MOVQ $0, DX // arg 3, 4: not used when using platform's TLS + MOVQ $0, CX +#endif +#ifdef GOOS_windows + // Adjust for the Win64 calling convention. + MOVQ CX, R9 // arg 4 + MOVQ DX, R8 // arg 3 + MOVQ SI, DX // arg 2 + MOVQ DI, CX // arg 1 +#endif + CALL AX + + // update stackguard after _cgo_init + MOVQ $runtime·g0(SB), CX + MOVQ (g_stack+stack_lo)(CX), AX + ADDQ $const__StackGuard, AX + MOVQ AX, g_stackguard0(CX) + MOVQ AX, g_stackguard1(CX) + +#ifndef GOOS_windows + JMP ok +#endif +needtls: +#ifdef GOOS_plan9 + // skip TLS setup on Plan 9 + JMP ok +#endif +#ifdef GOOS_solaris + // skip TLS setup on Solaris + JMP ok +#endif +#ifdef GOOS_illumos + // skip TLS setup on illumos + JMP ok +#endif +#ifdef GOOS_darwin + // skip TLS setup on Darwin + JMP ok +#endif +#ifdef GOOS_openbsd + // skip TLS setup on OpenBSD + JMP ok +#endif + + LEAQ runtime·m0+m_tls(SB), DI + CALL runtime·settls(SB) + + // store through it, to make sure it works + get_tls(BX) + MOVQ $0x123, g(BX) + MOVQ runtime·m0+m_tls(SB), AX + CMPQ AX, $0x123 + JEQ 2(PC) + CALL runtime·abort(SB) +ok: + // set the per-goroutine and per-mach "registers" + get_tls(BX) + LEAQ runtime·g0(SB), CX + MOVQ CX, g(BX) + LEAQ runtime·m0(SB), AX + + // save m->g0 = g0 + MOVQ CX, m_g0(AX) + // save m0 to g0->m + MOVQ AX, g_m(CX) + + CLD // convention is D is always left cleared + + // Check GOAMD64 reqirements + // We need to do this after setting up TLS, so that + // we can report an error if there is a failure. See issue 49586. 
+#ifdef NEED_FEATURES_CX + MOVL $0, AX + CPUID + CMPL AX, $0 + JE bad_cpu + MOVL $1, AX + CPUID + ANDL $NEED_FEATURES_CX, CX + CMPL CX, $NEED_FEATURES_CX + JNE bad_cpu +#endif + +#ifdef NEED_MAX_CPUID + MOVL $0x80000000, AX + CPUID + CMPL AX, $NEED_MAX_CPUID + JL bad_cpu +#endif + +#ifdef NEED_EXT_FEATURES_BX + MOVL $7, AX + MOVL $0, CX + CPUID + ANDL $NEED_EXT_FEATURES_BX, BX + CMPL BX, $NEED_EXT_FEATURES_BX + JNE bad_cpu +#endif + +#ifdef NEED_EXT_FEATURES_CX + MOVL $0x80000001, AX + CPUID + ANDL $NEED_EXT_FEATURES_CX, CX + CMPL CX, $NEED_EXT_FEATURES_CX + JNE bad_cpu +#endif + +#ifdef NEED_OS_SUPPORT_AX + XORL CX, CX + XGETBV + ANDL $NEED_OS_SUPPORT_AX, AX + CMPL AX, $NEED_OS_SUPPORT_AX + JNE bad_cpu +#endif + + CALL runtime·check(SB) + + MOVL 24(SP), AX // copy argc + MOVL AX, 0(SP) + MOVQ 32(SP), AX // copy argv + MOVQ AX, 8(SP) + CALL runtime·args(SB) + CALL runtime·osinit(SB) + CALL runtime·schedinit(SB) + + // create a new goroutine to start program + MOVQ $runtime·mainPC(SB), AX // entry + PUSHQ AX + CALL runtime·newproc(SB) + POPQ AX + + // start this M + CALL runtime·mstart(SB) + + CALL runtime·abort(SB) // mstart should never return + RET + +bad_cpu: // show that the program requires a certain microarchitecture level. + MOVQ $2, 0(SP) + MOVQ $bad_cpu_msg<>(SB), AX + MOVQ AX, 8(SP) + MOVQ $84, 16(SP) + CALL runtime·write(SB) + MOVQ $1, 0(SP) + CALL runtime·exit(SB) + CALL runtime·abort(SB) + RET + + // Prevent dead-code elimination of debugCallV2, which is + // intended to be called by debuggers. + MOVQ $runtime·debugCallV2<ABIInternal>(SB), AX + RET + +// mainPC is a function value for runtime.main, to be passed to newproc. +// The reference to runtime.main is made via ABIInternal, since the +// actual function (not the ABI0 wrapper) is needed by newproc. +DATA runtime·mainPC+0(SB)/8,$runtime·main<ABIInternal>(SB) +GLOBL runtime·mainPC(SB),RODATA,$8 + +TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 + BYTE $0xcc + RET + +TEXT runtime·asminit(SB),NOSPLIT,$0-0 + // No per-thread init. + RET + +TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME,$0 + CALL runtime·mstart0(SB) + RET // not reached + +/* + * go-routine + */ + +// func gogo(buf *gobuf) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB), NOSPLIT, $0-8 + MOVQ buf+0(FP), BX // gobuf + MOVQ gobuf_g(BX), DX + MOVQ 0(DX), CX // make sure g != nil + JMP gogo<>(SB) + +TEXT gogo<>(SB), NOSPLIT, $0 + get_tls(CX) + MOVQ DX, g(CX) + MOVQ DX, R14 // set the g register + MOVQ gobuf_sp(BX), SP // restore SP + MOVQ gobuf_ret(BX), AX + MOVQ gobuf_ctxt(BX), DX + MOVQ gobuf_bp(BX), BP + MOVQ $0, gobuf_sp(BX) // clear to help garbage collector + MOVQ $0, gobuf_ret(BX) + MOVQ $0, gobuf_ctxt(BX) + MOVQ $0, gobuf_bp(BX) + MOVQ gobuf_pc(BX), BX + JMP BX + +// func mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. 
+TEXT runtime·mcall<ABIInternal>(SB), NOSPLIT, $0-8 + MOVQ AX, DX // DX = fn + + // save state in g->sched + MOVQ 0(SP), BX // caller's PC + MOVQ BX, (g_sched+gobuf_pc)(R14) + LEAQ fn+0(FP), BX // caller's SP + MOVQ BX, (g_sched+gobuf_sp)(R14) + MOVQ BP, (g_sched+gobuf_bp)(R14) + + // switch to m->g0 & its stack, call fn + MOVQ g_m(R14), BX + MOVQ m_g0(BX), SI // SI = g.m.g0 + CMPQ SI, R14 // if g == m->g0 call badmcall + JNE goodm + JMP runtime·badmcall(SB) +goodm: + MOVQ R14, AX // AX (and arg 0) = g + MOVQ SI, R14 // g = g.m.g0 + get_tls(CX) // Set G in TLS + MOVQ R14, g(CX) + MOVQ (g_sched+gobuf_sp)(R14), SP // sp = g0.sched.sp + PUSHQ AX // open up space for fn's arg spill slot + MOVQ 0(DX), R12 + CALL R12 // fn(g) + POPQ AX + JMP runtime·badmcall2(SB) + RET + +// systemstack_switch is a dummy routine that systemstack leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the system stack because the one at the top of +// the system stack terminates the stack walk (see topofstack()). +TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0 + RET + +// func systemstack(fn func()) +TEXT runtime·systemstack(SB), NOSPLIT, $0-8 + MOVQ fn+0(FP), DI // DI = fn + get_tls(CX) + MOVQ g(CX), AX // AX = g + MOVQ g_m(AX), BX // BX = m + + CMPQ AX, m_gsignal(BX) + JEQ noswitch + + MOVQ m_g0(BX), DX // DX = g0 + CMPQ AX, DX + JEQ noswitch + + CMPQ AX, m_curg(BX) + JNE bad + + // switch stacks + // save our state in g->sched. Pretend to + // be systemstack_switch if the G stack is scanned. + CALL gosave_systemstack_switch<>(SB) + + // switch to g0 + MOVQ DX, g(CX) + MOVQ DX, R14 // set the g register + MOVQ (g_sched+gobuf_sp)(DX), BX + MOVQ BX, SP + + // call target function + MOVQ DI, DX + MOVQ 0(DI), DI + CALL DI + + // switch back to g + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), BX + MOVQ m_curg(BX), AX + MOVQ AX, g(CX) + MOVQ (g_sched+gobuf_sp)(AX), SP + MOVQ $0, (g_sched+gobuf_sp)(AX) + RET + +noswitch: + // already on m stack; tail call the function + // Using a tail call here cleans up tracebacks since we won't stop + // at an intermediate systemstack. + MOVQ DI, DX + MOVQ 0(DI), DI + JMP DI + +bad: + // Bad: g is not gsignal, not g0, not curg. What is it? + MOVQ $runtime·badsystemstack(SB), AX + CALL AX + INT $3 + + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. +// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$0-0 + // Cannot grow scheduler stack (m->g0). + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_g0(BX), SI + CMPQ g(CX), SI + JNE 3(PC) + CALL runtime·badmorestackg0(SB) + CALL runtime·abort(SB) + + // Cannot grow signal stack (m->gsignal). + MOVQ m_gsignal(BX), SI + CMPQ g(CX), SI + JNE 3(PC) + CALL runtime·badmorestackgsignal(SB) + CALL runtime·abort(SB) + + // Called from f. + // Set m->morebuf to f's caller. + NOP SP // tell vet SP changed - stop checking offsets + MOVQ 8(SP), AX // f's caller's PC + MOVQ AX, (m_morebuf+gobuf_pc)(BX) + LEAQ 16(SP), AX // f's caller's SP + MOVQ AX, (m_morebuf+gobuf_sp)(BX) + get_tls(CX) + MOVQ g(CX), SI + MOVQ SI, (m_morebuf+gobuf_g)(BX) + + // Set g->sched to context in f. 
+ MOVQ 0(SP), AX // f's PC + MOVQ AX, (g_sched+gobuf_pc)(SI) + LEAQ 8(SP), AX // f's SP + MOVQ AX, (g_sched+gobuf_sp)(SI) + MOVQ BP, (g_sched+gobuf_bp)(SI) + MOVQ DX, (g_sched+gobuf_ctxt)(SI) + + // Call newstack on m->g0's stack. + MOVQ m_g0(BX), BX + MOVQ BX, g(CX) + MOVQ (g_sched+gobuf_sp)(BX), SP + CALL runtime·newstack(SB) + CALL runtime·abort(SB) // crash if newstack returns + RET + +// morestack but not preserving ctxt. +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 + MOVL $0, DX + JMP runtime·morestack(SB) + +// spillArgs stores return values from registers to a *internal/abi.RegArgs in R12. +TEXT ·spillArgs(SB),NOSPLIT,$0-0 + MOVQ AX, 0(R12) + MOVQ BX, 8(R12) + MOVQ CX, 16(R12) + MOVQ DI, 24(R12) + MOVQ SI, 32(R12) + MOVQ R8, 40(R12) + MOVQ R9, 48(R12) + MOVQ R10, 56(R12) + MOVQ R11, 64(R12) + MOVQ X0, 72(R12) + MOVQ X1, 80(R12) + MOVQ X2, 88(R12) + MOVQ X3, 96(R12) + MOVQ X4, 104(R12) + MOVQ X5, 112(R12) + MOVQ X6, 120(R12) + MOVQ X7, 128(R12) + MOVQ X8, 136(R12) + MOVQ X9, 144(R12) + MOVQ X10, 152(R12) + MOVQ X11, 160(R12) + MOVQ X12, 168(R12) + MOVQ X13, 176(R12) + MOVQ X14, 184(R12) + RET + +// unspillArgs loads args into registers from a *internal/abi.RegArgs in R12. +TEXT ·unspillArgs(SB),NOSPLIT,$0-0 + MOVQ 0(R12), AX + MOVQ 8(R12), BX + MOVQ 16(R12), CX + MOVQ 24(R12), DI + MOVQ 32(R12), SI + MOVQ 40(R12), R8 + MOVQ 48(R12), R9 + MOVQ 56(R12), R10 + MOVQ 64(R12), R11 + MOVQ 72(R12), X0 + MOVQ 80(R12), X1 + MOVQ 88(R12), X2 + MOVQ 96(R12), X3 + MOVQ 104(R12), X4 + MOVQ 112(R12), X5 + MOVQ 120(R12), X6 + MOVQ 128(R12), X7 + MOVQ 136(R12), X8 + MOVQ 144(R12), X9 + MOVQ 152(R12), X10 + MOVQ 160(R12), X11 + MOVQ 168(R12), X12 + MOVQ 176(R12), X13 + MOVQ 184(R12), X14 + RET + +// reflectcall: call a function with the given argument list +// func call(stackArgsType *_type, f *FuncVal, stackArgs *byte, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. +// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + CMPQ CX, $MAXSIZE; \ + JA 3(PC); \ + MOVQ $NAME(SB), AX; \ + JMP AX +// Note: can't just "JMP NAME(SB)" - bad inlining results. 
+ +TEXT ·reflectcall(SB), NOSPLIT, $0-48 + MOVLQZX frameSize+32(FP), CX + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVQ $runtime·badreflectcall(SB), AX + JMP AX + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-48; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVQ stackArgs+16(FP), SI; \ + MOVLQZX stackArgsSize+24(FP), CX; \ + MOVQ SP, DI; \ + REP;MOVSB; \ + /* set up argument registers */ \ + MOVQ regArgs+40(FP), R12; \ + CALL ·unspillArgs(SB); \ + /* call function */ \ + MOVQ f+8(FP), DX; \ + PCDATA $PCDATA_StackMapIndex, $0; \ + MOVQ (DX), R12; \ + CALL R12; \ + /* copy register return values back */ \ + MOVQ regArgs+40(FP), R12; \ + CALL ·spillArgs(SB); \ + MOVLQZX stackArgsSize+24(FP), CX; \ + MOVLQZX stackRetOffset+28(FP), BX; \ + MOVQ stackArgs+16(FP), DI; \ + MOVQ stackArgsType+0(FP), DX; \ + MOVQ SP, SI; \ + ADDQ BX, DI; \ + ADDQ BX, SI; \ + SUBQ BX, CX; \ + CALL callRet<>(SB); \ + RET + +// callRet copies return values back at the end of call*. This is a +// separate function so it can allocate stack space for the arguments +// to reflectcallmove. It does not follow the Go ABI; it expects its +// arguments in registers. +TEXT callRet<>(SB), NOSPLIT, $40-0 + NO_LOCAL_POINTERS + MOVQ DX, 0(SP) + MOVQ DI, 8(SP) + MOVQ SI, 16(SP) + MOVQ CX, 24(SP) + MOVQ R12, 32(SP) + CALL runtime·reflectcallmove(SB) + RET + +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +TEXT runtime·procyield(SB),NOSPLIT,$0-0 + MOVL cycles+0(FP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + + +TEXT ·publicationBarrier(SB),NOSPLIT,$0-0 + // Stores are already ordered on x86, so this is just a + // compile barrier. + RET + +// Save state of caller into g->sched, +// but using fake PC from systemstack_switch. 
+// Must only be called from functions with no locals ($0) +// or else unwinding from systemstack_switch is incorrect. +// Smashes R9. +TEXT gosave_systemstack_switch<>(SB),NOSPLIT,$0 + MOVQ $runtime·systemstack_switch(SB), R9 + MOVQ R9, (g_sched+gobuf_pc)(R14) + LEAQ 8(SP), R9 + MOVQ R9, (g_sched+gobuf_sp)(R14) + MOVQ $0, (g_sched+gobuf_ret)(R14) + MOVQ BP, (g_sched+gobuf_bp)(R14) + // Assert ctxt is zero. See func save. + MOVQ (g_sched+gobuf_ctxt)(R14), R9 + TESTQ R9, R9 + JZ 2(PC) + CALL runtime·abort(SB) + RET + +// func asmcgocall_no_g(fn, arg unsafe.Pointer) +// Call fn(arg) aligned appropriately for the gcc ABI. +// Called on a system stack, and there may be no g yet (during needm). +TEXT ·asmcgocall_no_g(SB),NOSPLIT,$0-16 + MOVQ fn+0(FP), AX + MOVQ arg+8(FP), BX + MOVQ SP, DX + SUBQ $32, SP + ANDQ $~15, SP // alignment + MOVQ DX, 8(SP) + MOVQ BX, DI // DI = first argument in AMD64 ABI + MOVQ BX, CX // CX = first argument in Win64 + CALL AX + MOVQ 8(SP), DX + MOVQ DX, SP + RET + +// func asmcgocall(fn, arg unsafe.Pointer) int32 +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.go for more details. +TEXT ·asmcgocall(SB),NOSPLIT,$0-20 + MOVQ fn+0(FP), AX + MOVQ arg+8(FP), BX + + MOVQ SP, DX + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. Or we might already + // be on the m->gsignal stack. + get_tls(CX) + MOVQ g(CX), DI + CMPQ DI, $0 + JEQ nosave + MOVQ g_m(DI), R8 + MOVQ m_gsignal(R8), SI + CMPQ DI, SI + JEQ nosave + MOVQ m_g0(R8), SI + CMPQ DI, SI + JEQ nosave + + // Switch to system stack. + CALL gosave_systemstack_switch<>(SB) + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), SP + + // Now on a scheduling stack (a pthread-created stack). + // Make sure we have enough room for 4 stack-backed fast-call + // registers as per windows amd64 calling convention. + SUBQ $64, SP + ANDQ $~15, SP // alignment for gcc ABI + MOVQ DI, 48(SP) // save g + MOVQ (g_stack+stack_hi)(DI), DI + SUBQ DX, DI + MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) + MOVQ BX, DI // DI = first argument in AMD64 ABI + MOVQ BX, CX // CX = first argument in Win64 + CALL AX + + // Restore registers, g, stack pointer. + get_tls(CX) + MOVQ 48(SP), DI + MOVQ (g_stack+stack_hi)(DI), SI + SUBQ 40(SP), SI + MOVQ DI, g(CX) + MOVQ SI, SP + + MOVL AX, ret+16(FP) + RET + +nosave: + // Running on a system stack, perhaps even without a g. + // Having no g can happen during thread creation or thread teardown + // (see needm/dropm on Solaris, for example). + // This code is like the above sequence but without saving/restoring g + // and without worrying about the stack moving out from under us + // (because we're on a system stack, not a goroutine stack). + // The above code could be used directly if already on a system stack, + // but then the only path through this code would be a rare case on Solaris. + // Using this code for all "already on system stack" calls exercises it more, + // which should help keep it correct. 
+ SUBQ $64, SP + ANDQ $~15, SP + MOVQ $0, 48(SP) // where above code stores g, in case someone looks during debugging + MOVQ DX, 40(SP) // save original stack pointer + MOVQ BX, DI // DI = first argument in AMD64 ABI + MOVQ BX, CX // CX = first argument in Win64 + CALL AX + MOVQ 40(SP), SI // restore original stack pointer + MOVQ SI, SP + MOVL AX, ret+16(FP) + RET + +#ifdef GOOS_windows +// Dummy TLS that's used on Windows so that we don't crash trying +// to restore the G register in needm. needm and its callees are +// very careful never to actually use the G, the TLS just can't be +// unset since we're in Go code. +GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize +#endif + +// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr) +// See cgocall.go for more details. +TEXT ·cgocallback(SB),NOSPLIT,$24-24 + NO_LOCAL_POINTERS + + // If g is nil, Go did not create the current thread. + // Call needm to obtain one m for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call through AX. + get_tls(CX) +#ifdef GOOS_windows + MOVL $0, BX + CMPQ CX, $0 + JEQ 2(PC) +#endif + MOVQ g(CX), BX + CMPQ BX, $0 + JEQ needm + MOVQ g_m(BX), BX + MOVQ BX, savedm-8(SP) // saved copy of oldm + JMP havem +needm: +#ifdef GOOS_windows + // Set up a dummy TLS value. needm is careful not to use it, + // but it needs to be there to prevent autogenerated code from + // crashing when it loads from it. + // We don't need to clear it or anything later because needm + // will set up TLS properly. + MOVQ $zeroTLS<>(SB), DI + CALL runtime·settls(SB) +#endif + // On some platforms (Windows) we cannot call needm through + // an ABI wrapper because there's no TLS set up, and the ABI + // wrapper will try to restore the G register (R14) from TLS. + // Clear X15 because Go expects it and we're not calling + // through a wrapper, but otherwise avoid setting the G + // register in the wrapper and call needm directly. It + // takes no arguments and doesn't return any values so + // there's no need to handle that. Clear R14 so that there's + // a bad value in there, in case needm tries to use it. + XORPS X15, X15 + XORQ R14, R14 + MOVQ $runtime·needm<ABIInternal>(SB), AX + CALL AX + MOVQ $0, savedm-8(SP) // dropm on return + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + + // Set m->sched.sp = SP, so that if a panic happens + // during the function we are about to execute, it will + // have a valid SP to run on the g0 stack. + // The next few lines (after the havem label) + // will save this SP onto the stack and then write + // the same SP back to m->sched.sp. That seems redundant, + // but if an unrecovered panic happens, unwindm will + // restore the g->sched.sp from the stack location + // and then systemstack will try to use it. If we don't set it here, + // that restored SP will be uninitialized (typically 0) and + // will not be usable. + MOVQ m_g0(BX), SI + MOVQ SP, (g_sched+gobuf_sp)(SI) + +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). + MOVQ m_g0(BX), SI + MOVQ (g_sched+gobuf_sp)(SI), AX + MOVQ AX, 0(SP) + MOVQ SP, (g_sched+gobuf_sp)(SI) + + // Switch to m->curg stack and call runtime.cgocallbackg. 
+ // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. + // We can restore m->curg->sched.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. + // To save m->curg->sched.pc, we push it onto the curg stack and + // open a frame the same size as cgocallback's g0 frame. + // Once we switch to the curg stack, the pushed PC will appear + // to be the return PC of cgocallback, so that the traceback + // will seamlessly trace back into the earlier calls. + MOVQ m_curg(BX), SI + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI + MOVQ (g_sched+gobuf_pc)(SI), BX + MOVQ BX, -8(DI) // "push" return PC on the g stack + // Gather our arguments into registers. + MOVQ fn+0(FP), BX + MOVQ frame+8(FP), CX + MOVQ ctxt+16(FP), DX + // Compute the size of the frame, including return PC and, if + // GOEXPERIMENT=framepointer, the saved base pointer + LEAQ fn+0(FP), AX + SUBQ SP, AX // AX is our actual frame size + SUBQ AX, DI // Allocate the same frame size on the g stack + MOVQ DI, SP + + MOVQ BX, 0(SP) + MOVQ CX, 8(SP) + MOVQ DX, 16(SP) + MOVQ $runtime·cgocallbackg(SB), AX + CALL AX // indirect call to bypass nosplit check. We're on a different stack now. + + // Compute the size of the frame again. FP and SP have + // completely different values here than they did above, + // but only their difference matters. + LEAQ fn+0(FP), AX + SUBQ SP, AX + + // Restore g->sched (== m->curg->sched) from saved values. + get_tls(CX) + MOVQ g(CX), SI + MOVQ SP, DI + ADDQ AX, DI + MOVQ -8(DI), BX + MOVQ BX, (g_sched+gobuf_pc)(SI) + MOVQ DI, (g_sched+gobuf_sp)(SI) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_g0(BX), SI + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), SP + MOVQ 0(SP), AX + MOVQ AX, (g_sched+gobuf_sp)(SI) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + MOVQ savedm-8(SP), BX + CMPQ BX, $0 + JNE done + MOVQ $runtime·dropm(SB), AX + CALL AX +#ifdef GOOS_windows + // We need to clear the TLS pointer in case the next + // thread that comes into Go tries to reuse that space + // but uses the same M. + XORQ DI, DI + CALL runtime·settls(SB) +#endif +done: + + // Done! + RET + +// func setg(gg *g) +// set g. for use by needm. +TEXT runtime·setg(SB), NOSPLIT, $0-8 + MOVQ gg+0(FP), BX + get_tls(CX) + MOVQ BX, g(CX) + RET + +// void setg_gcc(G*); set g called from gcc. +TEXT setg_gcc<>(SB),NOSPLIT,$0 + get_tls(AX) + MOVQ DI, g(AX) + MOVQ DI, R14 // set the g register + RET + +TEXT runtime·abort(SB),NOSPLIT,$0-0 + INT $3 +loop: + JMP loop + +// check that SP is in range [g->stack.lo, g->stack.hi) +TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 + get_tls(CX) + MOVQ g(CX), AX + CMPQ (g_stack+stack_hi)(AX), SP + JHI 2(PC) + CALL runtime·abort(SB) + CMPQ SP, (g_stack+stack_lo)(AX) + JHI 2(PC) + CALL runtime·abort(SB) + RET + +// func cputicks() int64 +TEXT runtime·cputicks(SB),NOSPLIT,$0-0 + CMPB internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1 + JNE fences + // Instruction stream serializing RDTSCP is supported. + // RDTSCP is supported by Intel Nehalem (2008) and + // AMD K8 Rev. F (2006) and newer. 
+ RDTSCP +done: + SHLQ $32, DX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET +fences: + // MFENCE is instruction stream serializing and flushes the + // store buffers on AMD. The serialization semantics of LFENCE on AMD + // are dependent on MSR C001_1029 and CPU generation. + // LFENCE on Intel does wait for all previous instructions to have executed. + // Intel recommends MFENCE;LFENCE in its manuals before RDTSC to have all + // previous instructions executed and all previous loads and stores to globally visible. + // Using MFENCE;LFENCE here aligns the serializing properties without + // runtime detection of CPU manufacturer. + MFENCE + LFENCE + RDTSC + JMP done + +// func memhash(p unsafe.Pointer, h, s uintptr) uintptr +// hash function using AES hardware instructions +TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32 + // AX = ptr to data + // BX = seed + // CX = size + CMPB runtime·useAeshash(SB), $0 + JEQ noaes + JMP aeshashbody<>(SB) +noaes: + JMP runtime·memhashFallback<ABIInternal>(SB) + +// func strhash(p unsafe.Pointer, h uintptr) uintptr +TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24 + // AX = ptr to string struct + // BX = seed + CMPB runtime·useAeshash(SB), $0 + JEQ noaes + MOVQ 8(AX), CX // length of string + MOVQ (AX), AX // string data + JMP aeshashbody<>(SB) +noaes: + JMP runtime·strhashFallback<ABIInternal>(SB) + +// AX: data +// BX: hash seed +// CX: length +// At return: AX = return value +TEXT aeshashbody<>(SB),NOSPLIT,$0-0 + // Fill an SSE register with our seeds. + MOVQ BX, X0 // 64 bits of per-table hash seed + PINSRW $4, CX, X0 // 16 bits of length + PSHUFHW $0, X0, X0 // repeat length 4 times total + MOVO X0, X1 // save unscrambled seed + PXOR runtime·aeskeysched(SB), X0 // xor in per-process seed + AESENC X0, X0 // scramble seed + + CMPQ CX, $16 + JB aes0to15 + JE aes16 + CMPQ CX, $32 + JBE aes17to32 + CMPQ CX, $64 + JBE aes33to64 + CMPQ CX, $128 + JBE aes65to128 + JMP aes129plus + +aes0to15: + TESTQ CX, CX + JE aes0 + + ADDQ $16, AX + TESTW $0xff0, AX + JE endofpage + + // 16 bytes loaded at this address won't cross + // a page boundary, so we can load it directly. + MOVOU -16(AX), X1 + ADDQ CX, CX + MOVQ $masks<>(SB), AX + PAND (AX)(CX*8), X1 +final1: + PXOR X0, X1 // xor data with seed + AESENC X1, X1 // scramble combo 3 times + AESENC X1, X1 + AESENC X1, X1 + MOVQ X1, AX // return X1 + RET + +endofpage: + // address ends in 1111xxxx. Might be up against + // a page boundary, so load ending at last byte. + // Then shift bytes down using pshufb. 
+ MOVOU -32(AX)(CX*1), X1 + ADDQ CX, CX + MOVQ $shifts<>(SB), AX + PSHUFB (AX)(CX*8), X1 + JMP final1 + +aes0: + // Return scrambled input seed + AESENC X0, X0 + MOVQ X0, AX // return X0 + RET + +aes16: + MOVOU (AX), X1 + JMP final1 + +aes17to32: + // make second starting seed + PXOR runtime·aeskeysched+16(SB), X1 + AESENC X1, X1 + + // load data to be hashed + MOVOU (AX), X2 + MOVOU -16(AX)(CX*1), X3 + + // xor with seed + PXOR X0, X2 + PXOR X1, X3 + + // scramble 3 times + AESENC X2, X2 + AESENC X3, X3 + AESENC X2, X2 + AESENC X3, X3 + AESENC X2, X2 + AESENC X3, X3 + + // combine results + PXOR X3, X2 + MOVQ X2, AX // return X2 + RET + +aes33to64: + // make 3 more starting seeds + MOVO X1, X2 + MOVO X1, X3 + PXOR runtime·aeskeysched+16(SB), X1 + PXOR runtime·aeskeysched+32(SB), X2 + PXOR runtime·aeskeysched+48(SB), X3 + AESENC X1, X1 + AESENC X2, X2 + AESENC X3, X3 + + MOVOU (AX), X4 + MOVOU 16(AX), X5 + MOVOU -32(AX)(CX*1), X6 + MOVOU -16(AX)(CX*1), X7 + + PXOR X0, X4 + PXOR X1, X5 + PXOR X2, X6 + PXOR X3, X7 + + AESENC X4, X4 + AESENC X5, X5 + AESENC X6, X6 + AESENC X7, X7 + + AESENC X4, X4 + AESENC X5, X5 + AESENC X6, X6 + AESENC X7, X7 + + AESENC X4, X4 + AESENC X5, X5 + AESENC X6, X6 + AESENC X7, X7 + + PXOR X6, X4 + PXOR X7, X5 + PXOR X5, X4 + MOVQ X4, AX // return X4 + RET + +aes65to128: + // make 7 more starting seeds + MOVO X1, X2 + MOVO X1, X3 + MOVO X1, X4 + MOVO X1, X5 + MOVO X1, X6 + MOVO X1, X7 + PXOR runtime·aeskeysched+16(SB), X1 + PXOR runtime·aeskeysched+32(SB), X2 + PXOR runtime·aeskeysched+48(SB), X3 + PXOR runtime·aeskeysched+64(SB), X4 + PXOR runtime·aeskeysched+80(SB), X5 + PXOR runtime·aeskeysched+96(SB), X6 + PXOR runtime·aeskeysched+112(SB), X7 + AESENC X1, X1 + AESENC X2, X2 + AESENC X3, X3 + AESENC X4, X4 + AESENC X5, X5 + AESENC X6, X6 + AESENC X7, X7 + + // load data + MOVOU (AX), X8 + MOVOU 16(AX), X9 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + MOVOU -64(AX)(CX*1), X12 + MOVOU -48(AX)(CX*1), X13 + MOVOU -32(AX)(CX*1), X14 + MOVOU -16(AX)(CX*1), X15 + + // xor with seed + PXOR X0, X8 + PXOR X1, X9 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X13 + PXOR X6, X14 + PXOR X7, X15 + + // scramble 3 times + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + + // combine results + PXOR X12, X8 + PXOR X13, X9 + PXOR X14, X10 + PXOR X15, X11 + PXOR X10, X8 + PXOR X11, X9 + PXOR X9, X8 + // X15 must be zero on return + PXOR X15, X15 + MOVQ X8, AX // return X8 + RET + +aes129plus: + // make 7 more starting seeds + MOVO X1, X2 + MOVO X1, X3 + MOVO X1, X4 + MOVO X1, X5 + MOVO X1, X6 + MOVO X1, X7 + PXOR runtime·aeskeysched+16(SB), X1 + PXOR runtime·aeskeysched+32(SB), X2 + PXOR runtime·aeskeysched+48(SB), X3 + PXOR runtime·aeskeysched+64(SB), X4 + PXOR runtime·aeskeysched+80(SB), X5 + PXOR runtime·aeskeysched+96(SB), X6 + PXOR runtime·aeskeysched+112(SB), X7 + AESENC X1, X1 + AESENC X2, X2 + AESENC X3, X3 + AESENC X4, X4 + AESENC X5, X5 + AESENC X6, X6 + AESENC X7, X7 + + // start with last (possibly overlapping) block + MOVOU -128(AX)(CX*1), X8 + MOVOU -112(AX)(CX*1), X9 + MOVOU -96(AX)(CX*1), X10 + MOVOU -80(AX)(CX*1), X11 + MOVOU -64(AX)(CX*1), X12 + MOVOU -48(AX)(CX*1), X13 + MOVOU 
-32(AX)(CX*1), X14 + MOVOU -16(AX)(CX*1), X15 + + // xor in seed + PXOR X0, X8 + PXOR X1, X9 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X13 + PXOR X6, X14 + PXOR X7, X15 + + // compute number of remaining 128-byte blocks + DECQ CX + SHRQ $7, CX + +aesloop: + // scramble state + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + + // scramble state, xor in a block + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + AESENC X0, X8 + AESENC X1, X9 + AESENC X2, X10 + AESENC X3, X11 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + AESENC X4, X12 + AESENC X5, X13 + AESENC X6, X14 + AESENC X7, X15 + + ADDQ $128, AX + DECQ CX + JNE aesloop + + // 3 more scrambles to finish + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + AESENC X8, X8 + AESENC X9, X9 + AESENC X10, X10 + AESENC X11, X11 + AESENC X12, X12 + AESENC X13, X13 + AESENC X14, X14 + AESENC X15, X15 + + PXOR X12, X8 + PXOR X13, X9 + PXOR X14, X10 + PXOR X15, X11 + PXOR X10, X8 + PXOR X11, X9 + PXOR X9, X8 + // X15 must be zero on return + PXOR X15, X15 + MOVQ X8, AX // return X8 + RET + +// func memhash32(p unsafe.Pointer, h uintptr) uintptr +// ABIInternal for performance. +TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24 + // AX = ptr to data + // BX = seed + CMPB runtime·useAeshash(SB), $0 + JEQ noaes + MOVQ BX, X0 // X0 = seed + PINSRD $2, (AX), X0 // data + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+32(SB), X0 + MOVQ X0, AX // return X0 + RET +noaes: + JMP runtime·memhash32Fallback<ABIInternal>(SB) + +// func memhash64(p unsafe.Pointer, h uintptr) uintptr +// ABIInternal for performance. +TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24 + // AX = ptr to data + // BX = seed + CMPB runtime·useAeshash(SB), $0 + JEQ noaes + MOVQ BX, X0 // X0 = seed + PINSRQ $1, (AX), X0 // data + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+32(SB), X0 + MOVQ X0, AX // return X0 + RET +noaes: + JMP runtime·memhash64Fallback<ABIInternal>(SB) + +// simple mask to get rid of data in the high part of the register. 
+DATA masks<>+0x00(SB)/8, $0x0000000000000000 +DATA masks<>+0x08(SB)/8, $0x0000000000000000 +DATA masks<>+0x10(SB)/8, $0x00000000000000ff +DATA masks<>+0x18(SB)/8, $0x0000000000000000 +DATA masks<>+0x20(SB)/8, $0x000000000000ffff +DATA masks<>+0x28(SB)/8, $0x0000000000000000 +DATA masks<>+0x30(SB)/8, $0x0000000000ffffff +DATA masks<>+0x38(SB)/8, $0x0000000000000000 +DATA masks<>+0x40(SB)/8, $0x00000000ffffffff +DATA masks<>+0x48(SB)/8, $0x0000000000000000 +DATA masks<>+0x50(SB)/8, $0x000000ffffffffff +DATA masks<>+0x58(SB)/8, $0x0000000000000000 +DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff +DATA masks<>+0x68(SB)/8, $0x0000000000000000 +DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff +DATA masks<>+0x78(SB)/8, $0x0000000000000000 +DATA masks<>+0x80(SB)/8, $0xffffffffffffffff +DATA masks<>+0x88(SB)/8, $0x0000000000000000 +DATA masks<>+0x90(SB)/8, $0xffffffffffffffff +DATA masks<>+0x98(SB)/8, $0x00000000000000ff +DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xa8(SB)/8, $0x000000000000ffff +DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff +DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff +DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff +DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff +DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff +GLOBL masks<>(SB),RODATA,$256 + +// func checkASM() bool +TEXT ·checkASM(SB),NOSPLIT,$0-1 + // check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte + MOVQ $masks<>(SB), AX + MOVQ $shifts<>(SB), BX + ORQ BX, AX + TESTQ $15, AX + SETEQ ret+0(FP) + RET + +// these are arguments to pshufb. They move data down from +// the high bytes of the register to the low bytes of the register. +// index is how many bytes to move. 
+DATA shifts<>+0x00(SB)/8, $0x0000000000000000 +DATA shifts<>+0x08(SB)/8, $0x0000000000000000 +DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f +DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e +DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d +DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c +DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b +DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a +DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 +DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 +DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 +DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f +DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 +DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e +DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 +DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d +DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 +DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c +DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 +DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b +DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 +DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a +DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 +DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 +GLOBL shifts<>(SB),RODATA,$256 + +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVL $0, AX + RET + + +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$0 + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), AX + MOVQ m_curg(AX), AX + MOVQ (g_stack+stack_hi)(AX), AX + RET + +// The top-most function running on a goroutine +// returns to goexit+PCQuantum. +TEXT runtime·goexit(SB),NOSPLIT|TOPFRAME,$0-0 + BYTE $0x90 // NOP + CALL runtime·goexit1(SB) // does not return + // traceback from goexit1 must hit code range of goexit + BYTE $0x90 // NOP + +// This is called from .init_array and follows the platform, not Go, ABI. +TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 + PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save + MOVQ runtime·lastmoduledatap(SB), AX + MOVQ DI, moduledata_next(AX) + MOVQ DI, runtime·lastmoduledatap(SB) + POPQ R15 + RET + +// Initialize special registers then jump to sigpanic. +// This function is injected from the signal handler for panicking +// signals. It is quite painful to set X15 in the signal context, +// so we do it here. +TEXT ·sigpanic0(SB),NOSPLIT,$0-0 + get_tls(R14) + MOVQ g(R14), R14 +#ifndef GOOS_plan9 + XORPS X15, X15 +#endif + JMP ·sigpanic<ABIInternal>(SB) + +// gcWriteBarrier performs a heap pointer write and informs the GC. +// +// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments: +// - DI is the destination of the write +// - AX is the value being written at DI +// It clobbers FLAGS. It does not clobber any general-purpose registers, +// but may clobber others (e.g., SSE registers). +// Defined as ABIInternal since it does not use the stack-based Go ABI. +TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$112 + // Save the registers clobbered by the fast path. This is slightly + // faster than having the caller spill these. 
+ MOVQ R12, 96(SP) + MOVQ R13, 104(SP) + // TODO: Consider passing g.m.p in as an argument so they can be shared + // across a sequence of write barriers. + MOVQ g_m(R14), R13 + MOVQ m_p(R13), R13 + MOVQ (p_wbBuf+wbBuf_next)(R13), R12 + // Increment wbBuf.next position. + LEAQ 16(R12), R12 + MOVQ R12, (p_wbBuf+wbBuf_next)(R13) + CMPQ R12, (p_wbBuf+wbBuf_end)(R13) + // Record the write. + MOVQ AX, -16(R12) // Record value + // Note: This turns bad pointer writes into bad + // pointer reads, which could be confusing. We could avoid + // reading from obviously bad pointers, which would + // take care of the vast majority of these. We could + // patch this up in the signal handler, or use XCHG to + // combine the read and the write. + MOVQ (DI), R13 + MOVQ R13, -8(R12) // Record *slot + // Is the buffer full? (flags set in CMPQ above) + JEQ flush +ret: + MOVQ 96(SP), R12 + MOVQ 104(SP), R13 + // Do the write. + MOVQ AX, (DI) + RET + +flush: + // Save all general purpose registers since these could be + // clobbered by wbBufFlush and were not saved by the caller. + // It is possible for wbBufFlush to clobber other registers + // (e.g., SSE registers), but the compiler takes care of saving + // those in the caller if necessary. This strikes a balance + // with registers that are likely to be used. + // + // We don't have type information for these, but all code under + // here is NOSPLIT, so nothing will observe these. + // + // TODO: We could strike a different balance; e.g., saving X0 + // and not saving GP registers that are less likely to be used. + MOVQ DI, 0(SP) // Also first argument to wbBufFlush + MOVQ AX, 8(SP) // Also second argument to wbBufFlush + MOVQ BX, 16(SP) + MOVQ CX, 24(SP) + MOVQ DX, 32(SP) + // DI already saved + MOVQ SI, 40(SP) + MOVQ BP, 48(SP) + MOVQ R8, 56(SP) + MOVQ R9, 64(SP) + MOVQ R10, 72(SP) + MOVQ R11, 80(SP) + // R12 already saved + // R13 already saved + // R14 is g + MOVQ R15, 88(SP) + + // This takes arguments DI and AX + CALL runtime·wbBufFlush(SB) + + MOVQ 0(SP), DI + MOVQ 8(SP), AX + MOVQ 16(SP), BX + MOVQ 24(SP), CX + MOVQ 32(SP), DX + MOVQ 40(SP), SI + MOVQ 48(SP), BP + MOVQ 56(SP), R8 + MOVQ 64(SP), R9 + MOVQ 72(SP), R10 + MOVQ 80(SP), R11 + MOVQ 88(SP), R15 + JMP ret + +// gcWriteBarrierCX is gcWriteBarrier, but with args in DI and CX. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierCX<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ CX, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ CX, AX + RET + +// gcWriteBarrierDX is gcWriteBarrier, but with args in DI and DX. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierDX<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ DX, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ DX, AX + RET + +// gcWriteBarrierBX is gcWriteBarrier, but with args in DI and BX. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierBX<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ BX, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ BX, AX + RET + +// gcWriteBarrierBP is gcWriteBarrier, but with args in DI and BP. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierBP<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ BP, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ BP, AX + RET + +// gcWriteBarrierSI is gcWriteBarrier, but with args in DI and SI. +// Defined as ABIInternal since it does not use the stable Go ABI. 
+TEXT runtime·gcWriteBarrierSI<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ SI, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ SI, AX + RET + +// gcWriteBarrierR8 is gcWriteBarrier, but with args in DI and R8. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierR8<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ R8, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ R8, AX + RET + +// gcWriteBarrierR9 is gcWriteBarrier, but with args in DI and R9. +// Defined as ABIInternal since it does not use the stable Go ABI. +TEXT runtime·gcWriteBarrierR9<ABIInternal>(SB),NOSPLIT,$0 + XCHGQ R9, AX + CALL runtime·gcWriteBarrier<ABIInternal>(SB) + XCHGQ R9, AX + RET + +DATA debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large" +GLOBL debugCallFrameTooLarge<>(SB), RODATA, $20 // Size duplicated below + +// debugCallV2 is the entry point for debugger-injected function +// calls on running goroutines. It informs the runtime that a +// debug call has been injected and creates a call frame for the +// debugger to fill in. +// +// To inject a function call, a debugger should: +// 1. Check that the goroutine is in state _Grunning and that +// there are at least 256 bytes free on the stack. +// 2. Push the current PC on the stack (updating SP). +// 3. Write the desired argument frame size at SP-16 (using the SP +// after step 2). +// 4. Save all machine registers (including flags and XMM reigsters) +// so they can be restored later by the debugger. +// 5. Set the PC to debugCallV2 and resume execution. +// +// If the goroutine is in state _Grunnable, then it's not generally +// safe to inject a call because it may return out via other runtime +// operations. Instead, the debugger should unwind the stack to find +// the return to non-runtime code, add a temporary breakpoint there, +// and inject the call once that breakpoint is hit. +// +// If the goroutine is in any other state, it's not safe to inject a call. +// +// This function communicates back to the debugger by setting R12 and +// invoking INT3 to raise a breakpoint signal. See the comments in the +// implementation for the protocol the debugger is expected to +// follow. InjectDebugCall in the runtime tests demonstrates this protocol. +// +// The debugger must ensure that any pointers passed to the function +// obey escape analysis requirements. Specifically, it must not pass +// a stack pointer to an escaping argument. debugCallV2 cannot check +// this invariant. +// +// This is ABIInternal because Go code injects its PC directly into new +// goroutine stacks. +TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT,$152-0 + // Save all registers that may contain pointers so they can be + // conservatively scanned. + // + // We can't do anything that might clobber any of these + // registers before this. + MOVQ R15, r15-(14*8+8)(SP) + MOVQ R14, r14-(13*8+8)(SP) + MOVQ R13, r13-(12*8+8)(SP) + MOVQ R12, r12-(11*8+8)(SP) + MOVQ R11, r11-(10*8+8)(SP) + MOVQ R10, r10-(9*8+8)(SP) + MOVQ R9, r9-(8*8+8)(SP) + MOVQ R8, r8-(7*8+8)(SP) + MOVQ DI, di-(6*8+8)(SP) + MOVQ SI, si-(5*8+8)(SP) + MOVQ BP, bp-(4*8+8)(SP) + MOVQ BX, bx-(3*8+8)(SP) + MOVQ DX, dx-(2*8+8)(SP) + // Save the frame size before we clobber it. Either of the last + // saves could clobber this depending on whether there's a saved BP. + MOVQ frameSize-24(FP), DX // aka -16(RSP) before prologue + MOVQ CX, cx-(1*8+8)(SP) + MOVQ AX, ax-(0*8+8)(SP) + + // Save the argument frame size. + MOVQ DX, frameSize-128(SP) + + // Perform a safe-point check. 
+ MOVQ retpc-8(FP), AX // Caller's PC + MOVQ AX, 0(SP) + CALL runtime·debugCallCheck(SB) + MOVQ 8(SP), AX + TESTQ AX, AX + JZ good + // The safety check failed. Put the reason string at the top + // of the stack. + MOVQ AX, 0(SP) + MOVQ 16(SP), AX + MOVQ AX, 8(SP) + // Set R12 to 8 and invoke INT3. The debugger should get the + // reason a call can't be injected from the top of the stack + // and resume execution. + MOVQ $8, R12 + BYTE $0xcc + JMP restore + +good: + // Registers are saved and it's safe to make a call. + // Open up a call frame, moving the stack if necessary. + // + // Once the frame is allocated, this will set R12 to 0 and + // invoke INT3. The debugger should write the argument + // frame for the call at SP, set up argument registers, push + // the trapping PC on the stack, set the PC to the function to + // call, set RDX to point to the closure (if a closure call), + // and resume execution. + // + // If the function returns, this will set R12 to 1 and invoke + // INT3. The debugger can then inspect any return value saved + // on the stack at SP and in registers and resume execution again. + // + // If the function panics, this will set R12 to 2 and invoke INT3. + // The interface{} value of the panic will be at SP. The debugger + // can inspect the panic value and resume execution again. +#define DEBUG_CALL_DISPATCH(NAME,MAXSIZE) \ + CMPQ AX, $MAXSIZE; \ + JA 5(PC); \ + MOVQ $NAME(SB), AX; \ + MOVQ AX, 0(SP); \ + CALL runtime·debugCallWrap(SB); \ + JMP restore + + MOVQ frameSize-128(SP), AX + DEBUG_CALL_DISPATCH(debugCall32<>, 32) + DEBUG_CALL_DISPATCH(debugCall64<>, 64) + DEBUG_CALL_DISPATCH(debugCall128<>, 128) + DEBUG_CALL_DISPATCH(debugCall256<>, 256) + DEBUG_CALL_DISPATCH(debugCall512<>, 512) + DEBUG_CALL_DISPATCH(debugCall1024<>, 1024) + DEBUG_CALL_DISPATCH(debugCall2048<>, 2048) + DEBUG_CALL_DISPATCH(debugCall4096<>, 4096) + DEBUG_CALL_DISPATCH(debugCall8192<>, 8192) + DEBUG_CALL_DISPATCH(debugCall16384<>, 16384) + DEBUG_CALL_DISPATCH(debugCall32768<>, 32768) + DEBUG_CALL_DISPATCH(debugCall65536<>, 65536) + // The frame size is too large. Report the error. + MOVQ $debugCallFrameTooLarge<>(SB), AX + MOVQ AX, 0(SP) + MOVQ $20, 8(SP) // length of debugCallFrameTooLarge string + MOVQ $8, R12 + BYTE $0xcc + JMP restore + +restore: + // Calls and failures resume here. + // + // Set R12 to 16 and invoke INT3. The debugger should restore + // all registers except RIP and RSP and resume execution. + MOVQ $16, R12 + BYTE $0xcc + // We must not modify flags after this point. + + // Restore pointer-containing registers, which may have been + // modified from the debugger's copy by stack copying. + MOVQ ax-(0*8+8)(SP), AX + MOVQ cx-(1*8+8)(SP), CX + MOVQ dx-(2*8+8)(SP), DX + MOVQ bx-(3*8+8)(SP), BX + MOVQ bp-(4*8+8)(SP), BP + MOVQ si-(5*8+8)(SP), SI + MOVQ di-(6*8+8)(SP), DI + MOVQ r8-(7*8+8)(SP), R8 + MOVQ r9-(8*8+8)(SP), R9 + MOVQ r10-(9*8+8)(SP), R10 + MOVQ r11-(10*8+8)(SP), R11 + MOVQ r12-(11*8+8)(SP), R12 + MOVQ r13-(12*8+8)(SP), R13 + MOVQ r14-(13*8+8)(SP), R14 + MOVQ r15-(14*8+8)(SP), R15 + + RET + +// runtime.debugCallCheck assumes that functions defined with the +// DEBUG_CALL_FN macro are safe points to inject calls. 
+#define DEBUG_CALL_FN(NAME,MAXSIZE) \ +TEXT NAME(SB),WRAPPER,$MAXSIZE-0; \ + NO_LOCAL_POINTERS; \ + MOVQ $0, R12; \ + BYTE $0xcc; \ + MOVQ $1, R12; \ + BYTE $0xcc; \ + RET +DEBUG_CALL_FN(debugCall32<>, 32) +DEBUG_CALL_FN(debugCall64<>, 64) +DEBUG_CALL_FN(debugCall128<>, 128) +DEBUG_CALL_FN(debugCall256<>, 256) +DEBUG_CALL_FN(debugCall512<>, 512) +DEBUG_CALL_FN(debugCall1024<>, 1024) +DEBUG_CALL_FN(debugCall2048<>, 2048) +DEBUG_CALL_FN(debugCall4096<>, 4096) +DEBUG_CALL_FN(debugCall8192<>, 8192) +DEBUG_CALL_FN(debugCall16384<>, 16384) +DEBUG_CALL_FN(debugCall32768<>, 32768) +DEBUG_CALL_FN(debugCall65536<>, 65536) + +// func debugCallPanicked(val interface{}) +TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16 + // Copy the panic value to the top of stack. + MOVQ val_type+0(FP), AX + MOVQ AX, 0(SP) + MOVQ val_data+8(FP), AX + MOVQ AX, 8(SP) + MOVQ $2, R12 + BYTE $0xcc + RET + +// Note: these functions use a special calling convention to save generated code space. +// Arguments are passed in registers, but the space for those arguments are allocated +// in the caller's stack frame. These stubs write the args into that stack space and +// then tail call to the corresponding runtime handler. +// The tail call makes these stubs disappear in backtraces. +// Defined as ABIInternal since they do not use the stack-based Go ABI. +TEXT runtime·panicIndex<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicIndex<ABIInternal>(SB) +TEXT runtime·panicIndexU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicIndexU<ABIInternal>(SB) +TEXT runtime·panicSliceAlen<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSliceAlen<ABIInternal>(SB) +TEXT runtime·panicSliceAlenU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSliceAlenU<ABIInternal>(SB) +TEXT runtime·panicSliceAcap<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSliceAcap<ABIInternal>(SB) +TEXT runtime·panicSliceAcapU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSliceAcapU<ABIInternal>(SB) +TEXT runtime·panicSliceB<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicSliceB<ABIInternal>(SB) +TEXT runtime·panicSliceBU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicSliceBU<ABIInternal>(SB) +TEXT runtime·panicSlice3Alen<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ DX, AX + JMP runtime·goPanicSlice3Alen<ABIInternal>(SB) +TEXT runtime·panicSlice3AlenU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ DX, AX + JMP runtime·goPanicSlice3AlenU<ABIInternal>(SB) +TEXT runtime·panicSlice3Acap<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ DX, AX + JMP runtime·goPanicSlice3Acap<ABIInternal>(SB) +TEXT runtime·panicSlice3AcapU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ DX, AX + JMP runtime·goPanicSlice3AcapU<ABIInternal>(SB) +TEXT runtime·panicSlice3B<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSlice3B<ABIInternal>(SB) +TEXT runtime·panicSlice3BU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, AX + MOVQ DX, BX + JMP runtime·goPanicSlice3BU<ABIInternal>(SB) +TEXT runtime·panicSlice3C<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicSlice3C<ABIInternal>(SB) +TEXT runtime·panicSlice3CU<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ CX, BX + JMP runtime·goPanicSlice3CU<ABIInternal>(SB) +TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16 + MOVQ DX, AX + JMP runtime·goPanicSliceConvert<ABIInternal>(SB) + +#ifdef GOOS_android +// Use the free TLS_SLOT_APP slot #2 on 
Android Q. +// Earlier androids are set up in gcc_android.c. +DATA runtime·tls_g+0(SB)/8, $16 +GLOBL runtime·tls_g+0(SB), NOPTR, $8 +#endif + +// The compiler and assembler's -spectre=ret mode rewrites +// all indirect CALL AX / JMP AX instructions to be +// CALL retpolineAX / JMP retpolineAX. +// See https://support.google.com/faqs/answer/7625886. +#define RETPOLINE(reg) \ + /* CALL setup */ BYTE $0xE8; BYTE $(2+2); BYTE $0; BYTE $0; BYTE $0; \ + /* nospec: */ \ + /* PAUSE */ BYTE $0xF3; BYTE $0x90; \ + /* JMP nospec */ BYTE $0xEB; BYTE $-(2+2); \ + /* setup: */ \ + /* MOVQ AX, 0(SP) */ BYTE $0x48|((reg&8)>>1); BYTE $0x89; \ + BYTE $0x04|((reg&7)<<3); BYTE $0x24; \ + /* RET */ BYTE $0xC3 + +TEXT runtime·retpolineAX(SB),NOSPLIT,$0; RETPOLINE(0) +TEXT runtime·retpolineCX(SB),NOSPLIT,$0; RETPOLINE(1) +TEXT runtime·retpolineDX(SB),NOSPLIT,$0; RETPOLINE(2) +TEXT runtime·retpolineBX(SB),NOSPLIT,$0; RETPOLINE(3) +/* SP is 4, can't happen / magic encodings */ +TEXT runtime·retpolineBP(SB),NOSPLIT,$0; RETPOLINE(5) +TEXT runtime·retpolineSI(SB),NOSPLIT,$0; RETPOLINE(6) +TEXT runtime·retpolineDI(SB),NOSPLIT,$0; RETPOLINE(7) +TEXT runtime·retpolineR8(SB),NOSPLIT,$0; RETPOLINE(8) +TEXT runtime·retpolineR9(SB),NOSPLIT,$0; RETPOLINE(9) +TEXT runtime·retpolineR10(SB),NOSPLIT,$0; RETPOLINE(10) +TEXT runtime·retpolineR11(SB),NOSPLIT,$0; RETPOLINE(11) +TEXT runtime·retpolineR12(SB),NOSPLIT,$0; RETPOLINE(12) +TEXT runtime·retpolineR13(SB),NOSPLIT,$0; RETPOLINE(13) +TEXT runtime·retpolineR14(SB),NOSPLIT,$0; RETPOLINE(14) +TEXT runtime·retpolineR15(SB),NOSPLIT,$0; RETPOLINE(15) diff --git a/contrib/go/_std_1.18/src/runtime/atomic_pointer.go b/contrib/go/_std_1.18/src/runtime/atomic_pointer.go new file mode 100644 index 0000000000..b8f0c22c63 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/atomic_pointer.go @@ -0,0 +1,77 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// These functions cannot have go:noescape annotations, +// because while ptr does not escape, new does. +// If new is marked as not escaping, the compiler will make incorrect +// escape analysis decisions about the pointer value being stored. + +// atomicwb performs a write barrier before an atomic pointer write. +// The caller should guard the call with "if writeBarrier.enabled". +// +//go:nosplit +func atomicwb(ptr *unsafe.Pointer, new unsafe.Pointer) { + slot := (*uintptr)(unsafe.Pointer(ptr)) + if !getg().m.p.ptr().wbBuf.putFast(*slot, uintptr(new)) { + wbBufFlush(slot, uintptr(new)) + } +} + +// atomicstorep performs *ptr = new atomically and invokes a write barrier. +// +//go:nosplit +func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) { + if writeBarrier.enabled { + atomicwb((*unsafe.Pointer)(ptr), new) + } + atomic.StorepNoWB(noescape(ptr), new) +} + +// Like above, but implement in terms of sync/atomic's uintptr operations. +// We cannot just call the runtime routines, because the race detector expects +// to be able to intercept the sync/atomic forms but not the runtime forms. 
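A minimal user-level sketch of the path these wrappers serve, using hypothetical names (node, slot): an ordinary sync/atomic pointer store in application code resolves, through the go:linkname declarations that follow, to the write-barrier-aware versions defined in this file.

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type node struct{ v int }

func main() {
	var slot unsafe.Pointer
	n := &node{v: 42}
	// sync/atomic.StorePointer has no body in the sync/atomic package;
	// the runtime supplies it via go:linkname below, applying the GC
	// write barrier before the raw atomic store.
	atomic.StorePointer(&slot, unsafe.Pointer(n))
	got := (*node)(atomic.LoadPointer(&slot))
	fmt.Println(got.v) // 42
}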
+ +//go:linkname sync_atomic_StoreUintptr sync/atomic.StoreUintptr +func sync_atomic_StoreUintptr(ptr *uintptr, new uintptr) + +//go:linkname sync_atomic_StorePointer sync/atomic.StorePointer +//go:nosplit +func sync_atomic_StorePointer(ptr *unsafe.Pointer, new unsafe.Pointer) { + if writeBarrier.enabled { + atomicwb(ptr, new) + } + sync_atomic_StoreUintptr((*uintptr)(unsafe.Pointer(ptr)), uintptr(new)) +} + +//go:linkname sync_atomic_SwapUintptr sync/atomic.SwapUintptr +func sync_atomic_SwapUintptr(ptr *uintptr, new uintptr) uintptr + +//go:linkname sync_atomic_SwapPointer sync/atomic.SwapPointer +//go:nosplit +func sync_atomic_SwapPointer(ptr *unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer { + if writeBarrier.enabled { + atomicwb(ptr, new) + } + old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(unsafe.Pointer(ptr))), uintptr(new))) + return old +} + +//go:linkname sync_atomic_CompareAndSwapUintptr sync/atomic.CompareAndSwapUintptr +func sync_atomic_CompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool + +//go:linkname sync_atomic_CompareAndSwapPointer sync/atomic.CompareAndSwapPointer +//go:nosplit +func sync_atomic_CompareAndSwapPointer(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool { + if writeBarrier.enabled { + atomicwb(ptr, new) + } + return sync_atomic_CompareAndSwapUintptr((*uintptr)(noescape(unsafe.Pointer(ptr))), uintptr(old), uintptr(new)) +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo.go b/contrib/go/_std_1.18/src/runtime/cgo.go new file mode 100644 index 0000000000..d90468240d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo.go @@ -0,0 +1,54 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +//go:cgo_export_static main + +// Filled in by runtime/cgo when linked into binary. + +//go:linkname _cgo_init _cgo_init +//go:linkname _cgo_thread_start _cgo_thread_start +//go:linkname _cgo_sys_thread_create _cgo_sys_thread_create +//go:linkname _cgo_notify_runtime_init_done _cgo_notify_runtime_init_done +//go:linkname _cgo_callers _cgo_callers +//go:linkname _cgo_set_context_function _cgo_set_context_function +//go:linkname _cgo_yield _cgo_yield + +var ( + _cgo_init unsafe.Pointer + _cgo_thread_start unsafe.Pointer + _cgo_sys_thread_create unsafe.Pointer + _cgo_notify_runtime_init_done unsafe.Pointer + _cgo_callers unsafe.Pointer + _cgo_set_context_function unsafe.Pointer + _cgo_yield unsafe.Pointer +) + +// iscgo is set to true by the runtime/cgo package +var iscgo bool + +// cgoHasExtraM is set on startup when an extra M is created for cgo. +// The extra M must be created before any C/C++ code calls cgocallback. +var cgoHasExtraM bool + +// cgoUse is called by cgo-generated code (using go:linkname to get at +// an unexported name). The calls serve two purposes: +// 1) they are opaque to escape analysis, so the argument is considered to +// escape to the heap. +// 2) they keep the argument alive until the call site; the call is emitted after +// the end of the (presumed) use of the argument by C. +// cgoUse should not actually be called (see cgoAlwaysFalse). +func cgoUse(any) { throw("cgoUse should not be called") } + +// cgoAlwaysFalse is a boolean value that is always false. +// The cgo-generated code says if cgoAlwaysFalse { cgoUse(p) }. 
+// The compiler cannot see that cgoAlwaysFalse is always false, +// so it emits the test and keeps the call, giving the desired +// escape analysis result. The test is cheaper than the call. +var cgoAlwaysFalse bool + +var cgo_yield = &_cgo_yield diff --git a/contrib/go/_std_1.18/src/runtime/cgo/abi_amd64.h b/contrib/go/_std_1.18/src/runtime/cgo/abi_amd64.h new file mode 100644 index 0000000000..9949435fe9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/abi_amd64.h @@ -0,0 +1,99 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Macros for transitioning from the host ABI to Go ABI0. +// +// These save the frame pointer, so in general, functions that use +// these should have zero frame size to suppress the automatic frame +// pointer, though it's harmless to not do this. + +#ifdef GOOS_windows + +// REGS_HOST_TO_ABI0_STACK is the stack bytes used by +// PUSH_REGS_HOST_TO_ABI0. +#define REGS_HOST_TO_ABI0_STACK (28*8 + 8) + +// PUSH_REGS_HOST_TO_ABI0 prepares for transitioning from +// the host ABI to Go ABI0 code. It saves all registers that are +// callee-save in the host ABI and caller-save in Go ABI0 and prepares +// for entry to Go. +// +// Save DI SI BP BX R12 R13 R14 R15 X6-X15 registers and the DF flag. +// Clear the DF flag for the Go ABI. +// MXCSR matches the Go ABI, so we don't have to set that, +// and Go doesn't modify it, so we don't have to save it. +#define PUSH_REGS_HOST_TO_ABI0() \ + PUSHFQ \ + CLD \ + ADJSP $(REGS_HOST_TO_ABI0_STACK - 8) \ + MOVQ DI, (0*0)(SP) \ + MOVQ SI, (1*8)(SP) \ + MOVQ BP, (2*8)(SP) \ + MOVQ BX, (3*8)(SP) \ + MOVQ R12, (4*8)(SP) \ + MOVQ R13, (5*8)(SP) \ + MOVQ R14, (6*8)(SP) \ + MOVQ R15, (7*8)(SP) \ + MOVUPS X6, (8*8)(SP) \ + MOVUPS X7, (10*8)(SP) \ + MOVUPS X8, (12*8)(SP) \ + MOVUPS X9, (14*8)(SP) \ + MOVUPS X10, (16*8)(SP) \ + MOVUPS X11, (18*8)(SP) \ + MOVUPS X12, (20*8)(SP) \ + MOVUPS X13, (22*8)(SP) \ + MOVUPS X14, (24*8)(SP) \ + MOVUPS X15, (26*8)(SP) + +#define POP_REGS_HOST_TO_ABI0() \ + MOVQ (0*0)(SP), DI \ + MOVQ (1*8)(SP), SI \ + MOVQ (2*8)(SP), BP \ + MOVQ (3*8)(SP), BX \ + MOVQ (4*8)(SP), R12 \ + MOVQ (5*8)(SP), R13 \ + MOVQ (6*8)(SP), R14 \ + MOVQ (7*8)(SP), R15 \ + MOVUPS (8*8)(SP), X6 \ + MOVUPS (10*8)(SP), X7 \ + MOVUPS (12*8)(SP), X8 \ + MOVUPS (14*8)(SP), X9 \ + MOVUPS (16*8)(SP), X10 \ + MOVUPS (18*8)(SP), X11 \ + MOVUPS (20*8)(SP), X12 \ + MOVUPS (22*8)(SP), X13 \ + MOVUPS (24*8)(SP), X14 \ + MOVUPS (26*8)(SP), X15 \ + ADJSP $-(REGS_HOST_TO_ABI0_STACK - 8) \ + POPFQ + +#else +// SysV ABI + +#define REGS_HOST_TO_ABI0_STACK (6*8) + +// SysV MXCSR matches the Go ABI, so we don't have to set that, +// and Go doesn't modify it, so we don't have to save it. +// Both SysV and Go require DF to be cleared, so that's already clear. +// The SysV and Go frame pointer conventions are compatible. 
+#define PUSH_REGS_HOST_TO_ABI0() \ + ADJSP $(REGS_HOST_TO_ABI0_STACK) \ + MOVQ BP, (5*8)(SP) \ + LEAQ (5*8)(SP), BP \ + MOVQ BX, (0*8)(SP) \ + MOVQ R12, (1*8)(SP) \ + MOVQ R13, (2*8)(SP) \ + MOVQ R14, (3*8)(SP) \ + MOVQ R15, (4*8)(SP) + +#define POP_REGS_HOST_TO_ABI0() \ + MOVQ (0*8)(SP), BX \ + MOVQ (1*8)(SP), R12 \ + MOVQ (2*8)(SP), R13 \ + MOVQ (3*8)(SP), R14 \ + MOVQ (4*8)(SP), R15 \ + MOVQ (5*8)(SP), BP \ + ADJSP $-(REGS_HOST_TO_ABI0_STACK) + +#endif diff --git a/contrib/go/_std_1.18/src/runtime/cgo/asm_amd64.s b/contrib/go/_std_1.18/src/runtime/cgo/asm_amd64.s new file mode 100644 index 0000000000..386299c548 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/asm_amd64.s @@ -0,0 +1,34 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" +#include "abi_amd64.h" + +// Called by C code generated by cmd/cgo. +// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr) +// Saves C callee-saved registers and calls cgocallback with three arguments. +// fn is the PC of a func(a unsafe.Pointer) function. +// This signature is known to SWIG, so we can't change it. +TEXT crosscall2(SB),NOSPLIT,$0-0 + PUSH_REGS_HOST_TO_ABI0() + + // Make room for arguments to cgocallback. + ADJSP $0x18 +#ifndef GOOS_windows + MOVQ DI, 0x0(SP) /* fn */ + MOVQ SI, 0x8(SP) /* arg */ + // Skip n in DX. + MOVQ CX, 0x10(SP) /* ctxt */ +#else + MOVQ CX, 0x0(SP) /* fn */ + MOVQ DX, 0x8(SP) /* arg */ + // Skip n in R8. + MOVQ R9, 0x10(SP) /* ctxt */ +#endif + + CALL runtime·cgocallback(SB) + + ADJSP $-0x18 + POP_REGS_HOST_TO_ABI0() + RET diff --git a/contrib/go/_std_1.18/src/runtime/cgo/callbacks.go b/contrib/go/_std_1.18/src/runtime/cgo/callbacks.go new file mode 100644 index 0000000000..cd8b795387 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/callbacks.go @@ -0,0 +1,106 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgo + +import "unsafe" + +// These utility functions are available to be called from code +// compiled with gcc via crosscall2. + +// The declaration of crosscall2 is: +// void crosscall2(void (*fn)(void *), void *, int); +// +// We need to export the symbol crosscall2 in order to support +// callbacks from shared libraries. This applies regardless of +// linking mode. +// +// Compatibility note: SWIG uses crosscall2 in exactly one situation: +// to call _cgo_panic using the pattern shown below. We need to keep +// that pattern working. In particular, crosscall2 actually takes four +// arguments, but it works to call it with three arguments when +// calling _cgo_panic. +//go:cgo_export_static crosscall2 +//go:cgo_export_dynamic crosscall2 + +// Panic. The argument is converted into a Go string. + +// Call like this in code compiled with gcc: +// struct { const char *p; } a; +// a.p = /* string to pass to panic */; +// crosscall2(_cgo_panic, &a, sizeof a); +// /* The function call will not return. */ + +// TODO: We should export a regular C function to panic, change SWIG +// to use that instead of the above pattern, and then we can drop +// backwards-compatibility from crosscall2 and stop exporting it. 
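An illustrative cgo sketch (invented names GoNotify and fire) of the callback path that crosscall2 exists to support: C code calls an exported Go function, and the cgo-generated bridge re-enters Go through crosscall2 and runtime.cgocallback. This mirrors, in miniature, the shared-library callback case described above.

package main

/*
// Prototype of the exported Go function below; cgo generates the bridge.
extern void GoNotify(int code);

static inline void fire(void) { GoNotify(7); }
*/
import "C"
import "fmt"

//export GoNotify
func GoNotify(code C.int) {
	fmt.Println("callback from C, code =", int(code))
}

func main() {
	C.fire() // enters C, which immediately calls back into Go
}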
+ +//go:linkname _runtime_cgo_panic_internal runtime._cgo_panic_internal +func _runtime_cgo_panic_internal(p *byte) + +//go:linkname _cgo_panic _cgo_panic +//go:cgo_export_static _cgo_panic +//go:cgo_export_dynamic _cgo_panic +func _cgo_panic(a *struct{ cstr *byte }) { + _runtime_cgo_panic_internal(a.cstr) +} + +//go:cgo_import_static x_cgo_init +//go:linkname x_cgo_init x_cgo_init +//go:linkname _cgo_init _cgo_init +var x_cgo_init byte +var _cgo_init = &x_cgo_init + +//go:cgo_import_static x_cgo_thread_start +//go:linkname x_cgo_thread_start x_cgo_thread_start +//go:linkname _cgo_thread_start _cgo_thread_start +var x_cgo_thread_start byte +var _cgo_thread_start = &x_cgo_thread_start + +// Creates a new system thread without updating any Go state. +// +// This method is invoked during shared library loading to create a new OS +// thread to perform the runtime initialization. This method is similar to +// _cgo_sys_thread_start except that it doesn't update any Go state. + +//go:cgo_import_static x_cgo_sys_thread_create +//go:linkname x_cgo_sys_thread_create x_cgo_sys_thread_create +//go:linkname _cgo_sys_thread_create _cgo_sys_thread_create +var x_cgo_sys_thread_create byte +var _cgo_sys_thread_create = &x_cgo_sys_thread_create + +// Notifies that the runtime has been initialized. +// +// We currently block at every CGO entry point (via _cgo_wait_runtime_init_done) +// to ensure that the runtime has been initialized before the CGO call is +// executed. This is necessary for shared libraries where we kickoff runtime +// initialization in a separate thread and return without waiting for this +// thread to complete the init. + +//go:cgo_import_static x_cgo_notify_runtime_init_done +//go:linkname x_cgo_notify_runtime_init_done x_cgo_notify_runtime_init_done +//go:linkname _cgo_notify_runtime_init_done _cgo_notify_runtime_init_done +var x_cgo_notify_runtime_init_done byte +var _cgo_notify_runtime_init_done = &x_cgo_notify_runtime_init_done + +// Sets the traceback context function. See runtime.SetCgoTraceback. + +//go:cgo_import_static x_cgo_set_context_function +//go:linkname x_cgo_set_context_function x_cgo_set_context_function +//go:linkname _cgo_set_context_function _cgo_set_context_function +var x_cgo_set_context_function byte +var _cgo_set_context_function = &x_cgo_set_context_function + +// Calls a libc function to execute background work injected via libc +// interceptors, such as processing pending signals under the thread +// sanitizer. +// +// Left as a nil pointer if no libc interceptors are expected. + +//go:cgo_import_static _cgo_yield +//go:linkname _cgo_yield _cgo_yield +var _cgo_yield unsafe.Pointer + +//go:cgo_export_static _cgo_topofstack +//go:cgo_export_dynamic _cgo_topofstack diff --git a/contrib/go/_std_1.18/src/runtime/cgo/callbacks_traceback.go b/contrib/go/_std_1.18/src/runtime/cgo/callbacks_traceback.go new file mode 100644 index 0000000000..dae31a8fcd --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/callbacks_traceback.go @@ -0,0 +1,17 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build darwin || linux + +package cgo + +import _ "unsafe" // for go:linkname + +// Calls the traceback function passed to SetCgoTraceback. 
+ +//go:cgo_import_static x_cgo_callers +//go:linkname x_cgo_callers x_cgo_callers +//go:linkname _cgo_callers _cgo_callers +var x_cgo_callers byte +var _cgo_callers = &x_cgo_callers diff --git a/contrib/go/_std_1.18/src/runtime/cgo/cgo.go b/contrib/go/_std_1.18/src/runtime/cgo/cgo.go new file mode 100644 index 0000000000..4d2caf6c4f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/cgo.go @@ -0,0 +1,34 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package cgo contains runtime support for code generated +by the cgo tool. See the documentation for the cgo command +for details on using cgo. +*/ +package cgo + +/* + +#cgo darwin,!arm64 LDFLAGS: -lpthread +#cgo darwin,arm64 LDFLAGS: -framework CoreFoundation +#cgo dragonfly LDFLAGS: -lpthread +#cgo freebsd LDFLAGS: -lpthread +#cgo android LDFLAGS: -llog +#cgo !android,linux LDFLAGS: -lpthread +#cgo netbsd LDFLAGS: -lpthread +#cgo openbsd LDFLAGS: -lpthread +#cgo aix LDFLAGS: -Wl,-berok +#cgo solaris LDFLAGS: -lxnet +#cgo illumos LDFLAGS: -lsocket + +// Issue 35247. +#cgo darwin CFLAGS: -Wno-nullability-completeness + +#cgo CFLAGS: -Wall -Werror + +#cgo solaris CPPFLAGS: -D_POSIX_PTHREAD_SEMANTICS + +*/ +import "C" diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_amd64.S b/contrib/go/_std_1.18/src/runtime/cgo/gcc_amd64.S new file mode 100644 index 0000000000..46699d1d9c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_amd64.S @@ -0,0 +1,53 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Apple still insists on underscore prefixes for C function names. + */ +#if defined(__APPLE__) +#define EXT(s) _##s +#else +#define EXT(s) s +#endif + +/* + * void crosscall_amd64(void (*fn)(void), void (*setg_gcc)(void*), void *g) + * + * Calling into the 6c tool chain, where all registers are caller save. + * Called from standard x86-64 ABI, where %rbx, %rbp, %r12-%r15 + * are callee-save so they must be saved explicitly. + * The standard x86-64 ABI passes the three arguments m, g, fn + * in %rdi, %rsi, %rdx. + */ +.globl EXT(crosscall_amd64) +EXT(crosscall_amd64): + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +#if defined(_WIN64) + movq %r8, %rdi /* arg of setg_gcc */ + call *%rdx /* setg_gcc */ + call *%rcx /* fn */ +#else + movq %rdi, %rbx + movq %rdx, %rdi /* arg of setg_gcc */ + call *%rsi /* setg_gcc */ + call *%rbx /* fn */ +#endif + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#ifdef __ELF__ +.section .note.GNU-stack,"",@progbits +#endif diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_context.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_context.c new file mode 100644 index 0000000000..5fc0abb8bc --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_context.c @@ -0,0 +1,21 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build cgo +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows + +#include "libcgo.h" + +// Releases the cgo traceback context. 
+void _cgo_release_context(uintptr_t ctxt) { + void (*pfn)(struct context_arg*); + + pfn = _cgo_get_context_function(); + if (ctxt != 0 && pfn != nil) { + struct context_arg arg; + + arg.Context = ctxt; + (*pfn)(&arg); + } +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_darwin_amd64.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_darwin_amd64.c new file mode 100644 index 0000000000..d5b7fd8fd8 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_darwin_amd64.c @@ -0,0 +1,65 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <string.h> /* for strerror */ +#include <pthread.h> +#include <signal.h> +#include "libcgo.h" +#include "libcgo_unix.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstart will do the rest. + ts->g->stackhi = size; + err = _cgo_try_pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + crosscall_amd64(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_fatalf.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_fatalf.c new file mode 100644 index 0000000000..597e750f12 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_fatalf.c @@ -0,0 +1,23 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build aix !android,linux freebsd + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include "libcgo.h" + +void +fatalf(const char* format, ...) +{ + va_list ap; + + fprintf(stderr, "runtime/cgo: "); + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + fprintf(stderr, "\n"); + abort(); +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_libinit.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_libinit.c new file mode 100644 index 0000000000..3304d95fdf --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_libinit.c @@ -0,0 +1,113 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build cgo +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +#include <pthread.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // strerror +#include <time.h> +#include "libcgo.h" +#include "libcgo_unix.h" + +static pthread_cond_t runtime_init_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t runtime_init_mu = PTHREAD_MUTEX_INITIALIZER; +static int runtime_init_done; + +// The context function, used when tracing back C calls into Go. +static void (*cgo_context_function)(struct context_arg*); + +void +x_cgo_sys_thread_create(void* (*func)(void*), void* arg) { + pthread_t p; + int err = _cgo_try_pthread_create(&p, NULL, func, arg); + if (err != 0) { + fprintf(stderr, "pthread_create failed: %s", strerror(err)); + abort(); + } +} + +uintptr_t +_cgo_wait_runtime_init_done(void) { + void (*pfn)(struct context_arg*); + + pthread_mutex_lock(&runtime_init_mu); + while (runtime_init_done == 0) { + pthread_cond_wait(&runtime_init_cond, &runtime_init_mu); + } + + // TODO(iant): For the case of a new C thread calling into Go, such + // as when using -buildmode=c-archive, we know that Go runtime + // initialization is complete but we do not know that all Go init + // functions have been run. We should not fetch cgo_context_function + // until they have been, because that is where a call to + // SetCgoTraceback is likely to occur. We are going to wait for Go + // initialization to be complete anyhow, later, by waiting for + // main_init_done to be closed in cgocallbackg1. We should wait here + // instead. See also issue #15943. + pfn = cgo_context_function; + + pthread_mutex_unlock(&runtime_init_mu); + if (pfn != nil) { + struct context_arg arg; + + arg.Context = 0; + (*pfn)(&arg); + return arg.Context; + } + return 0; +} + +void +x_cgo_notify_runtime_init_done(void* dummy __attribute__ ((unused))) { + pthread_mutex_lock(&runtime_init_mu); + runtime_init_done = 1; + pthread_cond_broadcast(&runtime_init_cond); + pthread_mutex_unlock(&runtime_init_mu); +} + +// Sets the context function to call to record the traceback context +// when calling a Go function from C code. Called from runtime.SetCgoTraceback. +void x_cgo_set_context_function(void (*context)(struct context_arg*)) { + pthread_mutex_lock(&runtime_init_mu); + cgo_context_function = context; + pthread_mutex_unlock(&runtime_init_mu); +} + +// Gets the context function. +void (*(_cgo_get_context_function(void)))(struct context_arg*) { + void (*ret)(struct context_arg*); + + pthread_mutex_lock(&runtime_init_mu); + ret = cgo_context_function; + pthread_mutex_unlock(&runtime_init_mu); + return ret; +} + +// _cgo_try_pthread_create retries pthread_create if it fails with +// EAGAIN. +int +_cgo_try_pthread_create(pthread_t* thread, const pthread_attr_t* attr, void* (*pfn)(void*), void* arg) { + int tries; + int err; + struct timespec ts; + + for (tries = 0; tries < 20; tries++) { + err = pthread_create(thread, attr, pfn, arg); + if (err == 0) { + pthread_detach(*thread); + return 0; + } + if (err != EAGAIN) { + return err; + } + ts.tv_sec = 0; + ts.tv_nsec = (tries + 1) * 1000 * 1000; // Milliseconds. + nanosleep(&ts, nil); + } + return EAGAIN; +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_linux_amd64.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_linux_amd64.c new file mode 100644 index 0000000000..c25e7e769b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_linux_amd64.c @@ -0,0 +1,94 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <errno.h> +#include <string.h> // strerror +#include <signal.h> +#include <stdlib.h> +#include "libcgo.h" +#include "libcgo_unix.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +// This will be set in gcc_android.c for android-specific customization. +void (*x_cgo_inittls)(void **tlsg, void **tlsbase) __attribute__((common)); + +void +x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) +{ + pthread_attr_t *attr; + size_t size; + + /* The memory sanitizer distributed with versions of clang + before 3.8 has a bug: if you call mmap before malloc, mmap + may return an address that is later overwritten by the msan + library. Avoid this problem by forcing a call to malloc + here, before we ever call malloc. + + This is only required for the memory sanitizer, so it's + unfortunate that we always run it. It should be possible + to remove this when we no longer care about versions of + clang before 3.8. The test for this is + misc/cgo/testsanitizers. + + GCC works hard to eliminate a seemingly unnecessary call to + malloc, so we actually use the memory we allocate. */ + + setg_gcc = setg; + attr = (pthread_attr_t*)malloc(sizeof *attr); + if (attr == NULL) { + fatalf("malloc failed: %s", strerror(errno)); + } + pthread_attr_init(attr); + pthread_attr_getstacksize(attr, &size); + g->stacklo = (uintptr)&size - size + 4096; + pthread_attr_destroy(attr); + free(attr); + + if (x_cgo_inittls) { + x_cgo_inittls(tlsg, tlsbase); + } +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstart will do the rest. + ts->g->stackhi = size; + err = _cgo_try_pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fatalf("pthread_create failed: %s", strerror(err)); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + _cgo_tsan_acquire(); + free(v); + _cgo_tsan_release(); + + crosscall_amd64(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_mmap.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_mmap.c new file mode 100644 index 0000000000..698a7e3cd2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_mmap.c @@ -0,0 +1,39 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux,amd64 linux,arm64 linux,ppc64le + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <sys/mman.h> + +#include "libcgo.h" + +uintptr_t +x_cgo_mmap(void *addr, uintptr_t length, int32_t prot, int32_t flags, int32_t fd, uint32_t offset) { + void *p; + + _cgo_tsan_acquire(); + p = mmap(addr, length, prot, flags, fd, offset); + _cgo_tsan_release(); + if (p == MAP_FAILED) { + /* This is what the Go code expects on failure. */ + return (uintptr_t)errno; + } + return (uintptr_t)p; +} + +void +x_cgo_munmap(void *addr, uintptr_t length) { + int r; + + _cgo_tsan_acquire(); + r = munmap(addr, length); + _cgo_tsan_release(); + if (r < 0) { + /* The Go runtime is not prepared for munmap to fail. 
*/ + abort(); + } +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_setenv.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_setenv.c new file mode 100644 index 0000000000..d4f798357a --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_setenv.c @@ -0,0 +1,28 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build cgo +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +#include "libcgo.h" + +#include <stdlib.h> + +/* Stub for calling setenv */ +void +x_cgo_setenv(char **arg) +{ + _cgo_tsan_acquire(); + setenv(arg[0], arg[1], 1); + _cgo_tsan_release(); +} + +/* Stub for calling unsetenv */ +void +x_cgo_unsetenv(char **arg) +{ + _cgo_tsan_acquire(); + unsetenv(arg[0]); + _cgo_tsan_release(); +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_sigaction.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_sigaction.c new file mode 100644 index 0000000000..fcf1e50740 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_sigaction.c @@ -0,0 +1,82 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux,amd64 linux,arm64 linux,ppc64le + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <signal.h> + +#include "libcgo.h" + +// go_sigaction_t is a C version of the sigactiont struct from +// defs_linux_amd64.go. This definition — and its conversion to and from struct +// sigaction — are specific to linux/amd64. +typedef struct { + uintptr_t handler; + uint64_t flags; + uintptr_t restorer; + uint64_t mask; +} go_sigaction_t; + +// SA_RESTORER is part of the kernel interface. +// This is Linux i386/amd64 specific. +#ifndef SA_RESTORER +#define SA_RESTORER 0x4000000 +#endif + +int32_t +x_cgo_sigaction(intptr_t signum, const go_sigaction_t *goact, go_sigaction_t *oldgoact) { + int32_t ret; + struct sigaction act; + struct sigaction oldact; + size_t i; + + _cgo_tsan_acquire(); + + memset(&act, 0, sizeof act); + memset(&oldact, 0, sizeof oldact); + + if (goact) { + if (goact->flags & SA_SIGINFO) { + act.sa_sigaction = (void(*)(int, siginfo_t*, void*))(goact->handler); + } else { + act.sa_handler = (void(*)(int))(goact->handler); + } + sigemptyset(&act.sa_mask); + for (i = 0; i < 8 * sizeof(goact->mask); i++) { + if (goact->mask & ((uint64_t)(1)<<i)) { + sigaddset(&act.sa_mask, (int)(i+1)); + } + } + act.sa_flags = (int)(goact->flags & ~(uint64_t)SA_RESTORER); + } + + ret = sigaction((int)signum, goact ? &act : NULL, oldgoact ? &oldact : NULL); + if (ret == -1) { + // runtime.rt_sigaction expects _cgo_sigaction to return errno on error. 
+ _cgo_tsan_release(); + return errno; + } + + if (oldgoact) { + if (oldact.sa_flags & SA_SIGINFO) { + oldgoact->handler = (uintptr_t)(oldact.sa_sigaction); + } else { + oldgoact->handler = (uintptr_t)(oldact.sa_handler); + } + oldgoact->mask = 0; + for (i = 0; i < 8 * sizeof(oldgoact->mask); i++) { + if (sigismember(&oldact.sa_mask, (int)(i+1)) == 1) { + oldgoact->mask |= (uint64_t)(1)<<i; + } + } + oldgoact->flags = (uint64_t)oldact.sa_flags; + } + + _cgo_tsan_release(); + return ret; +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_traceback.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_traceback.c new file mode 100644 index 0000000000..6e9470c43c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_traceback.c @@ -0,0 +1,44 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build cgo,darwin cgo,linux + +#include <stdint.h> +#include "libcgo.h" + +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +#if __has_feature(memory_sanitizer) +#include <sanitizer/msan_interface.h> +#endif + +// Call the user's traceback function and then call sigtramp. +// The runtime signal handler will jump to this code. +// We do it this way so that the user's traceback function will be called +// by a C function with proper unwind info. +void +x_cgo_callers(uintptr_t sig, void *info, void *context, void (*cgoTraceback)(struct cgoTracebackArg*), uintptr_t* cgoCallers, void (*sigtramp)(uintptr_t, void*, void*)) { + struct cgoTracebackArg arg; + + arg.Context = 0; + arg.SigContext = (uintptr_t)(context); + arg.Buf = cgoCallers; + arg.Max = 32; // must match len(runtime.cgoCallers) + +#if __has_feature(memory_sanitizer) + // This function is called directly from the signal handler. + // The arguments are passed in registers, so whether msan + // considers cgoCallers to be initialized depends on whether + // it considers the appropriate register to be initialized. + // That can cause false reports in rare cases. + // Explicitly unpoison the memory to avoid that. + // See issue #47543 for more details. + __msan_unpoison(&arg, sizeof arg); +#endif + + (*cgoTraceback)(&arg); + sigtramp(sig, info, context); +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/gcc_util.c b/contrib/go/_std_1.18/src/runtime/cgo/gcc_util.c new file mode 100644 index 0000000000..3fcb48cc8d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/gcc_util.c @@ -0,0 +1,69 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "libcgo.h" + +/* Stub for creating a new thread */ +void +x_cgo_thread_start(ThreadStart *arg) +{ + ThreadStart *ts; + + /* Make our own copy that can persist after we return. */ + _cgo_tsan_acquire(); + ts = malloc(sizeof *ts); + _cgo_tsan_release(); + if(ts == nil) { + fprintf(stderr, "runtime/cgo: out of memory in thread_start\n"); + abort(); + } + *ts = *arg; + + _cgo_sys_thread_start(ts); /* OS-dependent half */ +} + +#ifndef CGO_TSAN +void(* const _cgo_yield)() = NULL; +#else + +#include <string.h> + +char x_cgo_yield_strncpy_src = 0; +char x_cgo_yield_strncpy_dst = 0; +size_t x_cgo_yield_strncpy_n = 0; + +/* +Stub for allowing libc interceptors to execute. + +_cgo_yield is set to NULL if we do not expect libc interceptors to exist. 
+*/ +static void +x_cgo_yield() +{ + /* + The libc function(s) we call here must form a no-op and include at least one + call that triggers TSAN to process pending asynchronous signals. + + sleep(0) would be fine, but it's not portable C (so it would need more header + guards). + free(NULL) has a fast-path special case in TSAN, so it doesn't + trigger signal delivery. + free(malloc(0)) would work (triggering the interceptors in malloc), but + it also runs a bunch of user-supplied malloc hooks. + + So we choose strncpy(_, _, 0): it requires an extra header, + but it's standard and should be very efficient. + + GCC 7 has an unfortunate habit of optimizing out strncpy calls (see + https://golang.org/issue/21196), so the arguments here need to be global + variables with external linkage in order to ensure that the call traps all the + way down into libc. + */ + strncpy(&x_cgo_yield_strncpy_dst, &x_cgo_yield_strncpy_src, + x_cgo_yield_strncpy_n); +} + +void(* const _cgo_yield)() = &x_cgo_yield; + +#endif /* GO_TSAN */ diff --git a/contrib/go/_std_1.18/src/runtime/cgo/handle.go b/contrib/go/_std_1.18/src/runtime/cgo/handle.go new file mode 100644 index 0000000000..d711900d79 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/handle.go @@ -0,0 +1,144 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgo + +import ( + "sync" + "sync/atomic" +) + +// Handle provides a way to pass values that contain Go pointers +// (pointers to memory allocated by Go) between Go and C without +// breaking the cgo pointer passing rules. A Handle is an integer +// value that can represent any Go value. A Handle can be passed +// through C and back to Go, and Go code can use the Handle to +// retrieve the original Go value. +// +// The underlying type of Handle is guaranteed to fit in an integer type +// that is large enough to hold the bit pattern of any pointer. The zero +// value of a Handle is not valid, and thus is safe to use as a sentinel +// in C APIs. +// +// For instance, on the Go side: +// +// package main +// +// /* +// #include <stdint.h> // for uintptr_t +// +// extern void MyGoPrint(uintptr_t handle); +// void myprint(uintptr_t handle); +// */ +// import "C" +// import "runtime/cgo" +// +// //export MyGoPrint +// func MyGoPrint(handle C.uintptr_t) { +// h := cgo.Handle(handle) +// val := h.Value().(string) +// println(val) +// h.Delete() +// } +// +// func main() { +// val := "hello Go" +// C.myprint(C.uintptr_t(cgo.NewHandle(val))) +// // Output: hello Go +// } +// +// and on the C side: +// +// #include <stdint.h> // for uintptr_t +// +// // A Go function +// extern void MyGoPrint(uintptr_t handle); +// +// // A C function +// void myprint(uintptr_t handle) { +// MyGoPrint(handle); +// } +// +// Some C functions accept a void* argument that points to an arbitrary +// data value supplied by the caller. 
It is not safe to coerce a cgo.Handle +// (an integer) to a Go unsafe.Pointer, but instead we can pass the address +// of the cgo.Handle to the void* parameter, as in this variant of the +// previous example: +// +// package main +// +// /* +// extern void MyGoPrint(void *context); +// static inline void myprint(void *context) { +// MyGoPrint(context); +// } +// */ +// import "C" +// import ( +// "runtime/cgo" +// "unsafe" +// ) +// +// //export MyGoPrint +// func MyGoPrint(context unsafe.Pointer) { +// h := *(*cgo.Handle)(context) +// val := h.Value().(string) +// println(val) +// h.Delete() +// } +// +// func main() { +// val := "hello Go" +// h := cgo.NewHandle(val) +// C.myprint(unsafe.Pointer(&h)) +// // Output: hello Go +// } +type Handle uintptr + +// NewHandle returns a handle for a given value. +// +// The handle is valid until the program calls Delete on it. The handle +// uses resources, and this package assumes that C code may hold on to +// the handle, so a program must explicitly call Delete when the handle +// is no longer needed. +// +// The intended use is to pass the returned handle to C code, which +// passes it back to Go, which calls Value. +func NewHandle(v any) Handle { + h := atomic.AddUintptr(&handleIdx, 1) + if h == 0 { + panic("runtime/cgo: ran out of handle space") + } + + handles.Store(h, v) + return Handle(h) +} + +// Value returns the associated Go value for a valid handle. +// +// The method panics if the handle is invalid. +func (h Handle) Value() any { + v, ok := handles.Load(uintptr(h)) + if !ok { + panic("runtime/cgo: misuse of an invalid Handle") + } + return v +} + +// Delete invalidates a handle. This method should only be called once +// the program no longer needs to pass the handle to C and the C code +// no longer has a copy of the handle value. +// +// The method panics if the handle is invalid. +func (h Handle) Delete() { + _, ok := handles.LoadAndDelete(uintptr(h)) + if !ok { + panic("runtime/cgo: misuse of an invalid Handle") + } +} + +var ( + handles = sync.Map{} // map[Handle]interface{} + handleIdx uintptr // atomic +) diff --git a/contrib/go/_std_1.18/src/runtime/cgo/iscgo.go b/contrib/go/_std_1.18/src/runtime/cgo/iscgo.go new file mode 100644 index 0000000000..e12d0f4b95 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/iscgo.go @@ -0,0 +1,17 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The runtime package contains an uninitialized definition +// for runtime·iscgo. Override it to tell the runtime we're here. +// There are various function pointers that should be set too, +// but those depend on dynamic linker magic to get initialized +// correctly, and sometimes they break. This variable is a +// backup: it depends only on old C style static linking rules. + +package cgo + +import _ "unsafe" // for go:linkname + +//go:linkname _iscgo runtime.iscgo +var _iscgo bool = true diff --git a/contrib/go/_std_1.18/src/runtime/cgo/libcgo.h b/contrib/go/_std_1.18/src/runtime/cgo/libcgo.h new file mode 100644 index 0000000000..af4960e7e9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/libcgo.h @@ -0,0 +1,151 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> + +#undef nil +#define nil ((void*)0) +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef uintptr_t uintptr; + +/* + * The beginning of the per-goroutine structure, + * as defined in ../pkg/runtime/runtime.h. + * Just enough to edit these two fields. + */ +typedef struct G G; +struct G +{ + uintptr stacklo; + uintptr stackhi; +}; + +/* + * Arguments to the _cgo_thread_start call. + * Also known to ../pkg/runtime/runtime.h. + */ +typedef struct ThreadStart ThreadStart; +struct ThreadStart +{ + G *g; + uintptr *tls; + void (*fn)(void); +}; + +/* + * Called by 5c/6c/8c world. + * Makes a local copy of the ThreadStart and + * calls _cgo_sys_thread_start(ts). + */ +extern void (*_cgo_thread_start)(ThreadStart *ts); + +/* + * Creates a new operating system thread without updating any Go state + * (OS dependent). + */ +extern void (*_cgo_sys_thread_create)(void* (*func)(void*), void* arg); + +/* + * Creates the new operating system thread (OS, arch dependent). + */ +void _cgo_sys_thread_start(ThreadStart *ts); + +/* + * Waits for the Go runtime to be initialized (OS dependent). + * If runtime.SetCgoTraceback is used to set a context function, + * calls the context function and returns the context value. + */ +uintptr_t _cgo_wait_runtime_init_done(void); + +/* + * Call fn in the 6c world. + */ +void crosscall_amd64(void (*fn)(void), void (*setg_gcc)(void*), void *g); + +/* + * Call fn in the 8c world. + */ +void crosscall_386(void (*fn)(void)); + +/* + * Prints error then calls abort. For linux and android. + */ +void fatalf(const char* format, ...); + +/* + * Registers the current mach thread port for EXC_BAD_ACCESS processing. + */ +void darwin_arm_init_thread_exception_port(void); + +/* + * Starts a mach message server processing EXC_BAD_ACCESS. + */ +void darwin_arm_init_mach_exception_handler(void); + +/* + * The cgo context function. See runtime.SetCgoTraceback. + */ +struct context_arg { + uintptr_t Context; +}; +extern void (*(_cgo_get_context_function(void)))(struct context_arg*); + +/* + * The argument for the cgo traceback callback. See runtime.SetCgoTraceback. + */ +struct cgoTracebackArg { + uintptr_t Context; + uintptr_t SigContext; + uintptr_t* Buf; + uintptr_t Max; +}; + +/* + * TSAN support. This is only useful when building with + * CGO_CFLAGS="-fsanitize=thread" CGO_LDFLAGS="-fsanitize=thread" go install + */ +#undef CGO_TSAN +#if defined(__has_feature) +# if __has_feature(thread_sanitizer) +# define CGO_TSAN +# endif +#elif defined(__SANITIZE_THREAD__) +# define CGO_TSAN +#endif + +#ifdef CGO_TSAN + +// These must match the definitions in yesTsanProlog in cmd/cgo/out.go. +// In general we should call _cgo_tsan_acquire when we enter C code, +// and call _cgo_tsan_release when we return to Go code. +// This is only necessary when calling code that might be instrumented +// by TSAN, which mostly means system library calls that TSAN intercepts. +// See the comment in cmd/cgo/out.go for more details. 
+ +long long _cgo_sync __attribute__ ((common)); + +extern void __tsan_acquire(void*); +extern void __tsan_release(void*); + +__attribute__ ((unused)) +static void _cgo_tsan_acquire() { + __tsan_acquire(&_cgo_sync); +} + +__attribute__ ((unused)) +static void _cgo_tsan_release() { + __tsan_release(&_cgo_sync); +} + +#else // !defined(CGO_TSAN) + +#define _cgo_tsan_acquire() +#define _cgo_tsan_release() + +#endif // !defined(CGO_TSAN) diff --git a/contrib/go/_std_1.18/src/runtime/cgo/libcgo_unix.h b/contrib/go/_std_1.18/src/runtime/cgo/libcgo_unix.h new file mode 100644 index 0000000000..a56a366f23 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/libcgo_unix.h @@ -0,0 +1,15 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Call pthread_create, retrying on EAGAIN. + */ +extern int _cgo_try_pthread_create(pthread_t*, const pthread_attr_t*, void* (*)(void*), void*); + +/* + * Same as _cgo_try_pthread_create, but passing on the pthread_create function. + * Only defined on OpenBSD. + */ +extern int _cgo_openbsd_try_pthread_create(int (*)(pthread_t*, const pthread_attr_t*, void *(*pfn)(void*), void*), + pthread_t*, const pthread_attr_t*, void* (*)(void*), void* arg); diff --git a/contrib/go/_std_1.18/src/runtime/cgo/linux.go b/contrib/go/_std_1.18/src/runtime/cgo/linux.go new file mode 100644 index 0000000000..1d6fe03917 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/linux.go @@ -0,0 +1,74 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Linux system call wrappers that provide POSIX semantics through the +// corresponding cgo->libc (nptl) wrappers for various system calls. + +//go:build linux + +package cgo + +import "unsafe" + +// Each of the following entries is needed to ensure that the +// syscall.syscall_linux code can conditionally call these +// function pointers: +// +// 1. find the C-defined function start +// 2. force the local byte alias to be mapped to that location +// 3. 
map the Go pointer to the function to the syscall package + +//go:cgo_import_static _cgo_libc_setegid +//go:linkname _cgo_libc_setegid _cgo_libc_setegid +//go:linkname cgo_libc_setegid syscall.cgo_libc_setegid +var _cgo_libc_setegid byte +var cgo_libc_setegid = unsafe.Pointer(&_cgo_libc_setegid) + +//go:cgo_import_static _cgo_libc_seteuid +//go:linkname _cgo_libc_seteuid _cgo_libc_seteuid +//go:linkname cgo_libc_seteuid syscall.cgo_libc_seteuid +var _cgo_libc_seteuid byte +var cgo_libc_seteuid = unsafe.Pointer(&_cgo_libc_seteuid) + +//go:cgo_import_static _cgo_libc_setregid +//go:linkname _cgo_libc_setregid _cgo_libc_setregid +//go:linkname cgo_libc_setregid syscall.cgo_libc_setregid +var _cgo_libc_setregid byte +var cgo_libc_setregid = unsafe.Pointer(&_cgo_libc_setregid) + +//go:cgo_import_static _cgo_libc_setresgid +//go:linkname _cgo_libc_setresgid _cgo_libc_setresgid +//go:linkname cgo_libc_setresgid syscall.cgo_libc_setresgid +var _cgo_libc_setresgid byte +var cgo_libc_setresgid = unsafe.Pointer(&_cgo_libc_setresgid) + +//go:cgo_import_static _cgo_libc_setresuid +//go:linkname _cgo_libc_setresuid _cgo_libc_setresuid +//go:linkname cgo_libc_setresuid syscall.cgo_libc_setresuid +var _cgo_libc_setresuid byte +var cgo_libc_setresuid = unsafe.Pointer(&_cgo_libc_setresuid) + +//go:cgo_import_static _cgo_libc_setreuid +//go:linkname _cgo_libc_setreuid _cgo_libc_setreuid +//go:linkname cgo_libc_setreuid syscall.cgo_libc_setreuid +var _cgo_libc_setreuid byte +var cgo_libc_setreuid = unsafe.Pointer(&_cgo_libc_setreuid) + +//go:cgo_import_static _cgo_libc_setgroups +//go:linkname _cgo_libc_setgroups _cgo_libc_setgroups +//go:linkname cgo_libc_setgroups syscall.cgo_libc_setgroups +var _cgo_libc_setgroups byte +var cgo_libc_setgroups = unsafe.Pointer(&_cgo_libc_setgroups) + +//go:cgo_import_static _cgo_libc_setgid +//go:linkname _cgo_libc_setgid _cgo_libc_setgid +//go:linkname cgo_libc_setgid syscall.cgo_libc_setgid +var _cgo_libc_setgid byte +var cgo_libc_setgid = unsafe.Pointer(&_cgo_libc_setgid) + +//go:cgo_import_static _cgo_libc_setuid +//go:linkname _cgo_libc_setuid _cgo_libc_setuid +//go:linkname cgo_libc_setuid syscall.cgo_libc_setuid +var _cgo_libc_setuid byte +var cgo_libc_setuid = unsafe.Pointer(&_cgo_libc_setuid) diff --git a/contrib/go/_std_1.18/src/runtime/cgo/linux_syscall.c b/contrib/go/_std_1.18/src/runtime/cgo/linux_syscall.c new file mode 100644 index 0000000000..59761c8b40 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/linux_syscall.c @@ -0,0 +1,85 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +#ifndef _GNU_SOURCE // setres[ug]id() API. +#define _GNU_SOURCE +#endif + +#include <grp.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> +#include "libcgo.h" + +/* + * Assumed POSIX compliant libc system call wrappers. For linux, the + * glibc/nptl/setxid mechanism ensures that POSIX semantics are + * honored for all pthreads (by default), and this in turn with cgo + * ensures that all Go threads launched with cgo are kept in sync for + * these function calls. + */ + +// argset_t matches runtime/cgocall.go:argset. +typedef struct { + uintptr_t* args; + uintptr_t retval; +} argset_t; + +// libc backed posix-compliant syscalls. 
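At the usage level, the Go declarations above and the C wrappers below are what let the syscall package honor POSIX identity-change semantics across all threads. A hypothetical sketch, not part of the runtime (the gid value 65534 is an arbitrary assumption and the call needs privilege): with cgo linked in, syscall.Setgid is dispatched through the cgo_libc_setgid pointer to _cgo_libc_setgid, so glibc's setxid machinery applies the change to every thread.

package main

import "C" // linking cgo brings in runtime/cgo, which provides _cgo_libc_setgid

import (
	"fmt"
	"syscall"
)

func main() {
	// Routed through the cgo_libc_setgid pointer declared above to the
	// _cgo_libc_setgid wrapper below, so the new gid applies to all threads.
	if err := syscall.Setgid(65534); err != nil { // 65534 ("nogroup") is only an example; requires privilege
		fmt.Println("setgid failed:", err)
		return
	}
	fmt.Println("gid is now", syscall.Getgid())
}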
+ +#define SET_RETVAL(fn) \ + uintptr_t ret = (uintptr_t) fn ; \ + if (ret == (uintptr_t) -1) { \ + x->retval = (uintptr_t) errno; \ + } else \ + x->retval = ret + +void +_cgo_libc_setegid(argset_t* x) { + SET_RETVAL(setegid((gid_t) x->args[0])); +} + +void +_cgo_libc_seteuid(argset_t* x) { + SET_RETVAL(seteuid((uid_t) x->args[0])); +} + +void +_cgo_libc_setgid(argset_t* x) { + SET_RETVAL(setgid((gid_t) x->args[0])); +} + +void +_cgo_libc_setgroups(argset_t* x) { + SET_RETVAL(setgroups((size_t) x->args[0], (const gid_t *) x->args[1])); +} + +void +_cgo_libc_setregid(argset_t* x) { + SET_RETVAL(setregid((gid_t) x->args[0], (gid_t) x->args[1])); +} + +void +_cgo_libc_setresgid(argset_t* x) { + SET_RETVAL(setresgid((gid_t) x->args[0], (gid_t) x->args[1], + (gid_t) x->args[2])); +} + +void +_cgo_libc_setresuid(argset_t* x) { + SET_RETVAL(setresuid((uid_t) x->args[0], (uid_t) x->args[1], + (uid_t) x->args[2])); +} + +void +_cgo_libc_setreuid(argset_t* x) { + SET_RETVAL(setreuid((uid_t) x->args[0], (uid_t) x->args[1])); +} + +void +_cgo_libc_setuid(argset_t* x) { + SET_RETVAL(setuid((uid_t) x->args[0])); +} diff --git a/contrib/go/_std_1.18/src/runtime/cgo/mmap.go b/contrib/go/_std_1.18/src/runtime/cgo/mmap.go new file mode 100644 index 0000000000..eae0a9e7cc --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/mmap.go @@ -0,0 +1,31 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (linux && amd64) || (linux && arm64) + +package cgo + +// Import "unsafe" because we use go:linkname. +import _ "unsafe" + +// When using cgo, call the C library for mmap, so that we call into +// any sanitizer interceptors. This supports using the memory +// sanitizer with Go programs. The memory sanitizer only applies to +// C/C++ code; this permits that code to see the Go code as normal +// program addresses that have been initialized. + +// To support interceptors that look for both mmap and munmap, +// also call the C library for munmap. + +//go:cgo_import_static x_cgo_mmap +//go:linkname x_cgo_mmap x_cgo_mmap +//go:linkname _cgo_mmap _cgo_mmap +var x_cgo_mmap byte +var _cgo_mmap = &x_cgo_mmap + +//go:cgo_import_static x_cgo_munmap +//go:linkname x_cgo_munmap x_cgo_munmap +//go:linkname _cgo_munmap _cgo_munmap +var x_cgo_munmap byte +var _cgo_munmap = &x_cgo_munmap diff --git a/contrib/go/_std_1.18/src/runtime/cgo/setenv.go b/contrib/go/_std_1.18/src/runtime/cgo/setenv.go new file mode 100644 index 0000000000..0f4780a945 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/setenv.go @@ -0,0 +1,21 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris + +package cgo + +import _ "unsafe" // for go:linkname + +//go:cgo_import_static x_cgo_setenv +//go:linkname x_cgo_setenv x_cgo_setenv +//go:linkname _cgo_setenv runtime._cgo_setenv +var x_cgo_setenv byte +var _cgo_setenv = &x_cgo_setenv + +//go:cgo_import_static x_cgo_unsetenv +//go:linkname x_cgo_unsetenv x_cgo_unsetenv +//go:linkname _cgo_unsetenv runtime._cgo_unsetenv +var x_cgo_unsetenv byte +var _cgo_unsetenv = &x_cgo_unsetenv diff --git a/contrib/go/_std_1.18/src/runtime/cgo/sigaction.go b/contrib/go/_std_1.18/src/runtime/cgo/sigaction.go new file mode 100644 index 0000000000..dc714f7ef4 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo/sigaction.go @@ -0,0 +1,22 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (linux && amd64) || (freebsd && amd64) || (linux && arm64) || (linux && ppc64le) + +package cgo + +// Import "unsafe" because we use go:linkname. +import _ "unsafe" + +// When using cgo, call the C library for sigaction, so that we call into +// any sanitizer interceptors. This supports using the sanitizers +// with Go programs. The thread and memory sanitizers only apply to +// C/C++ code; this permits that code to see the Go runtime's existing signal +// handlers when registering new signal handlers for the process. + +//go:cgo_import_static x_cgo_sigaction +//go:linkname x_cgo_sigaction x_cgo_sigaction +//go:linkname _cgo_sigaction _cgo_sigaction +var x_cgo_sigaction byte +var _cgo_sigaction = &x_cgo_sigaction diff --git a/contrib/go/_std_1.18/src/runtime/cgo_mmap.go b/contrib/go/_std_1.18/src/runtime/cgo_mmap.go new file mode 100644 index 0000000000..0cb25bdcda --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo_mmap.go @@ -0,0 +1,67 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Support for memory sanitizer. See runtime/cgo/mmap.go. + +//go:build (linux && amd64) || (linux && arm64) + +package runtime + +import "unsafe" + +// _cgo_mmap is filled in by runtime/cgo when it is linked into the +// program, so it is only non-nil when using cgo. +//go:linkname _cgo_mmap _cgo_mmap +var _cgo_mmap unsafe.Pointer + +// _cgo_munmap is filled in by runtime/cgo when it is linked into the +// program, so it is only non-nil when using cgo. +//go:linkname _cgo_munmap _cgo_munmap +var _cgo_munmap unsafe.Pointer + +// mmap is used to route the mmap system call through C code when using cgo, to +// support sanitizer interceptors. Don't allow stack splits, since this function +// (used by sysAlloc) is called in a lot of low-level parts of the runtime and +// callers often assume it won't acquire any locks. +//go:nosplit +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) { + if _cgo_mmap != nil { + // Make ret a uintptr so that writing to it in the + // function literal does not trigger a write barrier. + // A write barrier here could break because of the way + // that mmap uses the same value both as a pointer and + // an errno value. 
+ var ret uintptr + systemstack(func() { + ret = callCgoMmap(addr, n, prot, flags, fd, off) + }) + if ret < 4096 { + return nil, int(ret) + } + return unsafe.Pointer(ret), 0 + } + return sysMmap(addr, n, prot, flags, fd, off) +} + +func munmap(addr unsafe.Pointer, n uintptr) { + if _cgo_munmap != nil { + systemstack(func() { callCgoMunmap(addr, n) }) + return + } + sysMunmap(addr, n) +} + +// sysMmap calls the mmap system call. It is implemented in assembly. +func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) + +// callCgoMmap calls the mmap function in the runtime/cgo package +// using the GCC calling convention. It is implemented in assembly. +func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) uintptr + +// sysMunmap calls the munmap system call. It is implemented in assembly. +func sysMunmap(addr unsafe.Pointer, n uintptr) + +// callCgoMunmap calls the munmap function in the runtime/cgo package +// using the GCC calling convention. It is implemented in assembly. +func callCgoMunmap(addr unsafe.Pointer, n uintptr) diff --git a/contrib/go/_std_1.18/src/runtime/cgo_sigaction.go b/contrib/go/_std_1.18/src/runtime/cgo_sigaction.go new file mode 100644 index 0000000000..a2e12f0f0e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgo_sigaction.go @@ -0,0 +1,92 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Support for sanitizers. See runtime/cgo/sigaction.go. + +//go:build (linux && amd64) || (freebsd && amd64) || (linux && arm64) || (linux && ppc64le) + +package runtime + +import "unsafe" + +// _cgo_sigaction is filled in by runtime/cgo when it is linked into the +// program, so it is only non-nil when using cgo. +//go:linkname _cgo_sigaction _cgo_sigaction +var _cgo_sigaction unsafe.Pointer + +//go:nosplit +//go:nowritebarrierrec +func sigaction(sig uint32, new, old *sigactiont) { + // racewalk.go avoids adding sanitizing instrumentation to package runtime, + // but we might be calling into instrumented C functions here, + // so we need the pointer parameters to be properly marked. + // + // Mark the input as having been written before the call + // and the output as read after. + if msanenabled && new != nil { + msanwrite(unsafe.Pointer(new), unsafe.Sizeof(*new)) + } + if asanenabled && new != nil { + asanwrite(unsafe.Pointer(new), unsafe.Sizeof(*new)) + } + if _cgo_sigaction == nil || inForkedChild { + sysSigaction(sig, new, old) + } else { + // We need to call _cgo_sigaction, which means we need a big enough stack + // for C. To complicate matters, we may be in libpreinit (before the + // runtime has been initialized) or in an asynchronous signal handler (with + // the current thread in transition between goroutines, or with the g0 + // system stack already in use). + + var ret int32 + + var g *g + if mainStarted { + g = getg() + } + sp := uintptr(unsafe.Pointer(&sig)) + switch { + case g == nil: + // No g: we're on a C stack or a signal stack. + ret = callCgoSigaction(uintptr(sig), new, old) + case sp < g.stack.lo || sp >= g.stack.hi: + // We're no longer on g's stack, so we must be handling a signal. It's + // possible that we interrupted the thread during a transition between g + // and g0, so we should stay on the current stack to avoid corrupting g0. 
+ ret = callCgoSigaction(uintptr(sig), new, old) + default: + // We're running on g's stack, so either we're not in a signal handler or + // the signal handler has set the correct g. If we're on gsignal or g0, + // systemstack will make the call directly; otherwise, it will switch to + // g0 to ensure we have enough room to call a libc function. + // + // The function literal that we pass to systemstack is not nosplit, but + // that's ok: we'll be running on a fresh, clean system stack so the stack + // check will always succeed anyway. + systemstack(func() { + ret = callCgoSigaction(uintptr(sig), new, old) + }) + } + + const EINVAL = 22 + if ret == EINVAL { + // libc reserves certain signals — normally 32-33 — for pthreads, and + // returns EINVAL for sigaction calls on those signals. If we get EINVAL, + // fall back to making the syscall directly. + sysSigaction(sig, new, old) + } + } + + if msanenabled && old != nil { + msanread(unsafe.Pointer(old), unsafe.Sizeof(*old)) + } + if asanenabled && old != nil { + asanread(unsafe.Pointer(old), unsafe.Sizeof(*old)) + } +} + +// callCgoSigaction calls the sigaction function in the runtime/cgo package +// using the GCC calling convention. It is implemented in assembly. +//go:noescape +func callCgoSigaction(sig uintptr, new, old *sigactiont) int32 diff --git a/contrib/go/_std_1.18/src/runtime/cgocall.go b/contrib/go/_std_1.18/src/runtime/cgocall.go new file mode 100644 index 0000000000..a0c9560fd0 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgocall.go @@ -0,0 +1,639 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Cgo call and callback support. +// +// To call into the C function f from Go, the cgo-generated code calls +// runtime.cgocall(_cgo_Cfunc_f, frame), where _cgo_Cfunc_f is a +// gcc-compiled function written by cgo. +// +// runtime.cgocall (below) calls entersyscall so as not to block +// other goroutines or the garbage collector, and then calls +// runtime.asmcgocall(_cgo_Cfunc_f, frame). +// +// runtime.asmcgocall (in asm_$GOARCH.s) switches to the m->g0 stack +// (assumed to be an operating system-allocated stack, so safe to run +// gcc-compiled code on) and calls _cgo_Cfunc_f(frame). +// +// _cgo_Cfunc_f invokes the actual C function f with arguments +// taken from the frame structure, records the results in the frame, +// and returns to runtime.asmcgocall. +// +// After it regains control, runtime.asmcgocall switches back to the +// original g (m->curg)'s stack and returns to runtime.cgocall. +// +// After it regains control, runtime.cgocall calls exitsyscall, which blocks +// until this m can run Go code without violating the $GOMAXPROCS limit, +// and then unlocks g from m. +// +// The above description skipped over the possibility of the gcc-compiled +// function f calling back into Go. If that happens, we continue down +// the rabbit hole during the execution of f. +// +// To make it possible for gcc-compiled C code to call a Go function p.GoF, +// cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't +// know about packages). The gcc-compiled C function f calls GoF. +// +// GoF initializes "frame", a structure containing all of its +// arguments and slots for p.GoF's results. It calls +// crosscall2(_cgoexp_GoF, frame, framesize, ctxt) using the gcc ABI. 
+// +// crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from +// the gcc function call ABI to the gc function call ABI. At this +// point we're in the Go runtime, but we're still running on m.g0's +// stack and outside the $GOMAXPROCS limit. crosscall2 calls +// runtime.cgocallback(_cgoexp_GoF, frame, ctxt) using the gc ABI. +// (crosscall2's framesize argument is no longer used, but there's one +// case where SWIG calls crosscall2 directly and expects to pass this +// argument. See _cgo_panic.) +// +// runtime.cgocallback (in asm_$GOARCH.s) switches from m.g0's stack +// to the original g (m.curg)'s stack, on which it calls +// runtime.cgocallbackg(_cgoexp_GoF, frame, ctxt). As part of the +// stack switch, runtime.cgocallback saves the current SP as +// m.g0.sched.sp, so that any use of m.g0's stack during the execution +// of the callback will be done below the existing stack frames. +// Before overwriting m.g0.sched.sp, it pushes the old value on the +// m.g0 stack, so that it can be restored later. +// +// runtime.cgocallbackg (below) is now running on a real goroutine +// stack (not an m.g0 stack). First it calls runtime.exitsyscall, which will +// block until the $GOMAXPROCS limit allows running this goroutine. +// Once exitsyscall has returned, it is safe to do things like call the memory +// allocator or invoke the Go callback function. runtime.cgocallbackg +// first defers a function to unwind m.g0.sched.sp, so that if p.GoF +// panics, m.g0.sched.sp will be restored to its old value: the m.g0 stack +// and the m.curg stack will be unwound in lock step. +// Then it calls _cgoexp_GoF(frame). +// +// _cgoexp_GoF, which was generated by cmd/cgo, unpacks the arguments +// from frame, calls p.GoF, writes the results back to frame, and +// returns. Now we start unwinding this whole process. +// +// runtime.cgocallbackg pops but does not execute the deferred +// function to unwind m.g0.sched.sp, calls runtime.entersyscall, and +// returns to runtime.cgocallback. +// +// After it regains control, runtime.cgocallback switches back to +// m.g0's stack (the pointer is still in m.g0.sched.sp), restores the old +// m.g0.sched.sp value from the stack, and returns to crosscall2. +// +// crosscall2 restores the callee-save registers for gcc and returns +// to GoF, which unpacks any result values and returns to f. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// Addresses collected in a cgo backtrace when crashing. +// Length must match arg.Max in x_cgo_callers in runtime/cgo/gcc_traceback.c. +type cgoCallers [32]uintptr + +// argset matches runtime/cgo/linux_syscall.c:argset_t +type argset struct { + args unsafe.Pointer + retval uintptr +} + +// wrapper for syscall package to call cgocall for libc (cgo) calls. +//go:linkname syscall_cgocaller syscall.cgocaller +//go:nosplit +//go:uintptrescapes +func syscall_cgocaller(fn unsafe.Pointer, args ...uintptr) uintptr { + as := argset{args: unsafe.Pointer(&args[0])} + cgocall(fn, unsafe.Pointer(&as)) + return as.retval +} + +var ncgocall uint64 // number of cgo calls in total for dead m + +// Call from Go to C. +// +// This must be nosplit because it's used for syscalls on some +// platforms. Syscalls may have untyped arguments on the stack, so +// it's not safe to grow or scan the stack. 
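For orientation, a minimal hypothetical cgo program (the C helper add and the program itself are assumptions, not part of this file) that exercises the Go-to-C path described above: the call C.add is turned by cmd/cgo into a gcc-compiled stub and enters C through cgocall and asmcgocall, while an exported Go function called from C would take the crosscall2/cgocallback path instead.

package main

/*
static int add(int a, int b) { return a + b; }
*/
import "C"

import "fmt"

func main() {
	// Each call crosses the Go/C boundary: entersyscall, switch to the g0
	// stack in asmcgocall, run the gcc-compiled stub for add, exitsyscall.
	fmt.Println(int(C.add(40, 2))) // prints 42
}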
+// +//go:nosplit +func cgocall(fn, arg unsafe.Pointer) int32 { + if !iscgo && GOOS != "solaris" && GOOS != "illumos" && GOOS != "windows" { + throw("cgocall unavailable") + } + + if fn == nil { + throw("cgocall nil") + } + + if raceenabled { + racereleasemerge(unsafe.Pointer(&racecgosync)) + } + + mp := getg().m + mp.ncgocall++ + mp.ncgo++ + + // Reset traceback. + mp.cgoCallers[0] = 0 + + // Announce we are entering a system call + // so that the scheduler knows to create another + // M to run goroutines while we are in the + // foreign code. + // + // The call to asmcgocall is guaranteed not to + // grow the stack and does not allocate memory, + // so it is safe to call while "in a system call", outside + // the $GOMAXPROCS accounting. + // + // fn may call back into Go code, in which case we'll exit the + // "system call", run the Go code (which may grow the stack), + // and then re-enter the "system call" reusing the PC and SP + // saved by entersyscall here. + entersyscall() + + // Tell asynchronous preemption that we're entering external + // code. We do this after entersyscall because this may block + // and cause an async preemption to fail, but at this point a + // sync preemption will succeed (though this is not a matter + // of correctness). + osPreemptExtEnter(mp) + + mp.incgo = true + errno := asmcgocall(fn, arg) + + // Update accounting before exitsyscall because exitsyscall may + // reschedule us on to a different M. + mp.incgo = false + mp.ncgo-- + + osPreemptExtExit(mp) + + exitsyscall() + + // Note that raceacquire must be called only after exitsyscall has + // wired this M to a P. + if raceenabled { + raceacquire(unsafe.Pointer(&racecgosync)) + } + + // From the garbage collector's perspective, time can move + // backwards in the sequence above. If there's a callback into + // Go code, GC will see this function at the call to + // asmcgocall. When the Go call later returns to C, the + // syscall PC/SP is rolled back and the GC sees this function + // back at the call to entersyscall. Normally, fn and arg + // would be live at entersyscall and dead at asmcgocall, so if + // time moved backwards, GC would see these arguments as dead + // and then live. Prevent these undead arguments from crashing + // GC by forcing them to stay live across this time warp. + KeepAlive(fn) + KeepAlive(arg) + KeepAlive(mp) + + return errno +} + +// Call from C back to Go. fn must point to an ABIInternal Go entry-point. +//go:nosplit +func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) { + gp := getg() + if gp != gp.m.curg { + println("runtime: bad g in cgocallback") + exit(2) + } + + // The call from C is on gp.m's g0 stack, so we must ensure + // that we stay on that M. We have to do this before calling + // exitsyscall, since it would otherwise be free to move us to + // a different M. The call to unlockOSThread is in unwindm. + lockOSThread() + + checkm := gp.m + + // Save current syscall parameters, so m.syscall can be + // used again if callback decide to make syscall. + syscall := gp.m.syscall + + // entersyscall saves the caller's SP to allow the GC to trace the Go + // stack. However, since we're returning to an earlier stack frame and + // need to pair with the entersyscall() call made by cgocall, we must + // save syscall* and let reentersyscall restore them. 
+ savedsp := unsafe.Pointer(gp.syscallsp) + savedpc := gp.syscallpc + exitsyscall() // coming out of cgo call + gp.m.incgo = false + + osPreemptExtExit(gp.m) + + cgocallbackg1(fn, frame, ctxt) // will call unlockOSThread + + // At this point unlockOSThread has been called. + // The following code must not change to a different m. + // This is enforced by checking incgo in the schedule function. + + gp.m.incgo = true + + if gp.m != checkm { + throw("m changed unexpectedly in cgocallbackg") + } + + osPreemptExtEnter(gp.m) + + // going back to cgo call + reentersyscall(savedpc, uintptr(savedsp)) + + gp.m.syscall = syscall +} + +func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) { + gp := getg() + + // When we return, undo the call to lockOSThread in cgocallbackg. + // We must still stay on the same m. + defer unlockOSThread() + + if gp.m.needextram || atomic.Load(&extraMWaiters) > 0 { + gp.m.needextram = false + systemstack(newextram) + } + + if ctxt != 0 { + s := append(gp.cgoCtxt, ctxt) + + // Now we need to set gp.cgoCtxt = s, but we could get + // a SIGPROF signal while manipulating the slice, and + // the SIGPROF handler could pick up gp.cgoCtxt while + // tracing up the stack. We need to ensure that the + // handler always sees a valid slice, so set the + // values in an order such that it always does. + p := (*slice)(unsafe.Pointer(&gp.cgoCtxt)) + atomicstorep(unsafe.Pointer(&p.array), unsafe.Pointer(&s[0])) + p.cap = cap(s) + p.len = len(s) + + defer func(gp *g) { + // Decrease the length of the slice by one, safely. + p := (*slice)(unsafe.Pointer(&gp.cgoCtxt)) + p.len-- + }(gp) + } + + if gp.m.ncgo == 0 { + // The C call to Go came from a thread not currently running + // any Go. In the case of -buildmode=c-archive or c-shared, + // this call may be coming in before package initialization + // is complete. Wait until it is. + <-main_init_done + } + + // Check whether the profiler needs to be turned on or off; this route to + // run Go code does not use runtime.execute, so bypasses the check there. + hz := sched.profilehz + if gp.m.profilehz != hz { + setThreadCPUProfiler(hz) + } + + // Add entry to defer stack in case of panic. + restore := true + defer unwindm(&restore) + + if raceenabled { + raceacquire(unsafe.Pointer(&racecgosync)) + } + + // Invoke callback. This function is generated by cmd/cgo and + // will unpack the argument frame and call the Go function. + var cb func(frame unsafe.Pointer) + cbFV := funcval{uintptr(fn)} + *(*unsafe.Pointer)(unsafe.Pointer(&cb)) = noescape(unsafe.Pointer(&cbFV)) + cb(frame) + + if raceenabled { + racereleasemerge(unsafe.Pointer(&racecgosync)) + } + + // Do not unwind m->g0->sched.sp. + // Our caller, cgocallback, will do that. + restore = false +} + +func unwindm(restore *bool) { + if *restore { + // Restore sp saved by cgocallback during + // unwind of g's stack (see comment at top of file). + mp := acquirem() + sched := &mp.g0.sched + sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + alignUp(sys.MinFrameSize, sys.StackAlign))) + + // Do the accounting that cgocall will not have a chance to do + // during an unwind. + // + // In the case where a Go call originates from C, ncgo is 0 + // and there is no matching cgocall to end. 
+ if mp.ncgo > 0 { + mp.incgo = false + mp.ncgo-- + osPreemptExtExit(mp) + } + + releasem(mp) + } +} + +// called from assembly +func badcgocallback() { + throw("misaligned stack in cgocallback") +} + +// called from (incomplete) assembly +func cgounimpl() { + throw("cgo not implemented") +} + +var racecgosync uint64 // represents possible synchronization in C code + +// Pointer checking for cgo code. + +// We want to detect all cases where a program that does not use +// unsafe makes a cgo call passing a Go pointer to memory that +// contains a Go pointer. Here a Go pointer is defined as a pointer +// to memory allocated by the Go runtime. Programs that use unsafe +// can evade this restriction easily, so we don't try to catch them. +// The cgo program will rewrite all possibly bad pointer arguments to +// call cgoCheckPointer, where we can catch cases of a Go pointer +// pointing to a Go pointer. + +// Complicating matters, taking the address of a slice or array +// element permits the C program to access all elements of the slice +// or array. In that case we will see a pointer to a single element, +// but we need to check the entire data structure. + +// The cgoCheckPointer call takes additional arguments indicating that +// it was called on an address expression. An additional argument of +// true means that it only needs to check a single element. An +// additional argument of a slice or array means that it needs to +// check the entire slice/array, but nothing else. Otherwise, the +// pointer could be anything, and we check the entire heap object, +// which is conservative but safe. + +// When and if we implement a moving garbage collector, +// cgoCheckPointer will pin the pointer for the duration of the cgo +// call. (This is necessary but not sufficient; the cgo program will +// also have to change to pin Go pointers that cannot point to Go +// pointers.) + +// cgoCheckPointer checks if the argument contains a Go pointer that +// points to a Go pointer, and panics if it does. +func cgoCheckPointer(ptr any, arg any) { + if debug.cgocheck == 0 { + return + } + + ep := efaceOf(&ptr) + t := ep._type + + top := true + if arg != nil && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) { + p := ep.data + if t.kind&kindDirectIface == 0 { + p = *(*unsafe.Pointer)(p) + } + if p == nil || !cgoIsGoPointer(p) { + return + } + aep := efaceOf(&arg) + switch aep._type.kind & kindMask { + case kindBool: + if t.kind&kindMask == kindUnsafePointer { + // We don't know the type of the element. + break + } + pt := (*ptrtype)(unsafe.Pointer(t)) + cgoCheckArg(pt.elem, p, true, false, cgoCheckPointerFail) + return + case kindSlice: + // Check the slice rather than the pointer. + ep = aep + t = ep._type + case kindArray: + // Check the array rather than the pointer. + // Pass top as false since we have a pointer + // to the array. + ep = aep + t = ep._type + top = false + default: + throw("can't happen") + } + } + + cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, top, cgoCheckPointerFail) +} + +const cgoCheckPointerFail = "cgo argument has Go pointer to Go pointer" +const cgoResultFail = "cgo result has Go pointer" + +// cgoCheckArg is the real work of cgoCheckPointer. The argument p +// is either a pointer to the value (of type t), or the value itself, +// depending on indir. The top parameter is whether we are at the top +// level, where Go pointers are allowed. 
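As a usage-level illustration of the rule described above, a hypothetical program (the C helper take and the holder type are assumptions, not part of the runtime): with the default GODEBUG=cgocheck=1, passing a pointer to pointer-free memory is accepted, while passing a pointer to memory that itself holds a Go pointer panics with the cgoCheckPointerFail message.

package main

/*
static void take(void *p) { (void)p; }
*/
import "C"

import "unsafe"

type holder struct{ p *int }

func main() {
	x := 42
	C.take(unsafe.Pointer(&x)) // accepted: the pointed-to int holds no Go pointers

	h := holder{p: &x}
	C.take(unsafe.Pointer(&h)) // panics: "cgo argument has Go pointer to Go pointer"
}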
+func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) { + if t.ptrdata == 0 || p == nil { + // If the type has no pointers there is nothing to do. + return + } + + switch t.kind & kindMask { + default: + throw("can't happen") + case kindArray: + at := (*arraytype)(unsafe.Pointer(t)) + if !indir { + if at.len != 1 { + throw("can't happen") + } + cgoCheckArg(at.elem, p, at.elem.kind&kindDirectIface == 0, top, msg) + return + } + for i := uintptr(0); i < at.len; i++ { + cgoCheckArg(at.elem, p, true, top, msg) + p = add(p, at.elem.size) + } + case kindChan, kindMap: + // These types contain internal pointers that will + // always be allocated in the Go heap. It's never OK + // to pass them to C. + panic(errorString(msg)) + case kindFunc: + if indir { + p = *(*unsafe.Pointer)(p) + } + if !cgoIsGoPointer(p) { + return + } + panic(errorString(msg)) + case kindInterface: + it := *(**_type)(p) + if it == nil { + return + } + // A type known at compile time is OK since it's + // constant. A type not known at compile time will be + // in the heap and will not be OK. + if inheap(uintptr(unsafe.Pointer(it))) { + panic(errorString(msg)) + } + p = *(*unsafe.Pointer)(add(p, goarch.PtrSize)) + if !cgoIsGoPointer(p) { + return + } + if !top { + panic(errorString(msg)) + } + cgoCheckArg(it, p, it.kind&kindDirectIface == 0, false, msg) + case kindSlice: + st := (*slicetype)(unsafe.Pointer(t)) + s := (*slice)(p) + p = s.array + if p == nil || !cgoIsGoPointer(p) { + return + } + if !top { + panic(errorString(msg)) + } + if st.elem.ptrdata == 0 { + return + } + for i := 0; i < s.cap; i++ { + cgoCheckArg(st.elem, p, true, false, msg) + p = add(p, st.elem.size) + } + case kindString: + ss := (*stringStruct)(p) + if !cgoIsGoPointer(ss.str) { + return + } + if !top { + panic(errorString(msg)) + } + case kindStruct: + st := (*structtype)(unsafe.Pointer(t)) + if !indir { + if len(st.fields) != 1 { + throw("can't happen") + } + cgoCheckArg(st.fields[0].typ, p, st.fields[0].typ.kind&kindDirectIface == 0, top, msg) + return + } + for _, f := range st.fields { + if f.typ.ptrdata == 0 { + continue + } + cgoCheckArg(f.typ, add(p, f.offset()), true, top, msg) + } + case kindPtr, kindUnsafePointer: + if indir { + p = *(*unsafe.Pointer)(p) + if p == nil { + return + } + } + + if !cgoIsGoPointer(p) { + return + } + if !top { + panic(errorString(msg)) + } + + cgoCheckUnknownPointer(p, msg) + } +} + +// cgoCheckUnknownPointer is called for an arbitrary pointer into Go +// memory. It checks whether that Go memory contains any other +// pointer into Go memory. If it does, we panic. +// The return values are unused but useful to see in panic tracebacks. +func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { + if inheap(uintptr(p)) { + b, span, _ := findObject(uintptr(p), 0, 0) + base = b + if base == 0 { + return + } + hbits := heapBitsForAddr(base) + n := span.elemsize + for i = uintptr(0); i < n; i += goarch.PtrSize { + if !hbits.morePointers() { + // No more possible pointers. + break + } + if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { + panic(errorString(msg)) + } + hbits = hbits.next() + } + + return + } + + for _, datap := range activeModules() { + if cgoInRange(p, datap.data, datap.edata) || cgoInRange(p, datap.bss, datap.ebss) { + // We have no way to know the size of the object. + // We have to assume that it might contain a pointer. 
+ panic(errorString(msg)) + } + // In the text or noptr sections, we know that the + // pointer does not point to a Go pointer. + } + + return +} + +// cgoIsGoPointer reports whether the pointer is a Go pointer--a +// pointer to Go memory. We only care about Go memory that might +// contain pointers. +//go:nosplit +//go:nowritebarrierrec +func cgoIsGoPointer(p unsafe.Pointer) bool { + if p == nil { + return false + } + + if inHeapOrStack(uintptr(p)) { + return true + } + + for _, datap := range activeModules() { + if cgoInRange(p, datap.data, datap.edata) || cgoInRange(p, datap.bss, datap.ebss) { + return true + } + } + + return false +} + +// cgoInRange reports whether p is between start and end. +//go:nosplit +//go:nowritebarrierrec +func cgoInRange(p unsafe.Pointer, start, end uintptr) bool { + return start <= uintptr(p) && uintptr(p) < end +} + +// cgoCheckResult is called to check the result parameter of an +// exported Go function. It panics if the result is or contains a Go +// pointer. +func cgoCheckResult(val any) { + if debug.cgocheck == 0 { + return + } + + ep := efaceOf(&val) + t := ep._type + cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail) +} diff --git a/contrib/go/_std_1.18/src/runtime/cgocallback.go b/contrib/go/_std_1.18/src/runtime/cgocallback.go new file mode 100644 index 0000000000..59953f1cee --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgocallback.go @@ -0,0 +1,13 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// These functions are called from C code via cgo/callbacks.go. + +// Panic. + +func _cgo_panic_internal(p *byte) { + panic(gostringnocopy(p)) +} diff --git a/contrib/go/_std_1.18/src/runtime/cgocheck.go b/contrib/go/_std_1.18/src/runtime/cgocheck.go new file mode 100644 index 0000000000..3acbadf803 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cgocheck.go @@ -0,0 +1,263 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code to check that pointer writes follow the cgo rules. +// These functions are invoked via the write barrier when debug.cgocheck > 1. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +const cgoWriteBarrierFail = "Go pointer stored into non-Go memory" + +// cgoCheckWriteBarrier is called whenever a pointer is stored into memory. +// It throws if the program is storing a Go pointer into non-Go memory. +// +// This is called from the write barrier, so its entire call tree must +// be nosplit. +// +//go:nosplit +//go:nowritebarrier +func cgoCheckWriteBarrier(dst *uintptr, src uintptr) { + if !cgoIsGoPointer(unsafe.Pointer(src)) { + return + } + if cgoIsGoPointer(unsafe.Pointer(dst)) { + return + } + + // If we are running on the system stack then dst might be an + // address on the stack, which is OK. + g := getg() + if g == g.m.g0 || g == g.m.gsignal { + return + } + + // Allocating memory can write to various mfixalloc structs + // that look like they are non-Go memory. + if g.m.mallocing != 0 { + return + } + + // It's OK if writing to memory allocated by persistentalloc. + // Do this check last because it is more expensive and rarely true. + // If it is false the expense doesn't matter since we are crashing. 
+ if inPersistentAlloc(uintptr(unsafe.Pointer(dst))) { + return + } + + systemstack(func() { + println("write of Go pointer", hex(src), "to non-Go memory", hex(uintptr(unsafe.Pointer(dst)))) + throw(cgoWriteBarrierFail) + }) +} + +// cgoCheckMemmove is called when moving a block of memory. +// dst and src point off bytes into the value to copy. +// size is the number of bytes to copy. +// It throws if the program is copying a block that contains a Go pointer +// into non-Go memory. +//go:nosplit +//go:nowritebarrier +func cgoCheckMemmove(typ *_type, dst, src unsafe.Pointer, off, size uintptr) { + if typ.ptrdata == 0 { + return + } + if !cgoIsGoPointer(src) { + return + } + if cgoIsGoPointer(dst) { + return + } + cgoCheckTypedBlock(typ, src, off, size) +} + +// cgoCheckSliceCopy is called when copying n elements of a slice. +// src and dst are pointers to the first element of the slice. +// typ is the element type of the slice. +// It throws if the program is copying slice elements that contain Go pointers +// into non-Go memory. +//go:nosplit +//go:nowritebarrier +func cgoCheckSliceCopy(typ *_type, dst, src unsafe.Pointer, n int) { + if typ.ptrdata == 0 { + return + } + if !cgoIsGoPointer(src) { + return + } + if cgoIsGoPointer(dst) { + return + } + p := src + for i := 0; i < n; i++ { + cgoCheckTypedBlock(typ, p, 0, typ.size) + p = add(p, typ.size) + } +} + +// cgoCheckTypedBlock checks the block of memory at src, for up to size bytes, +// and throws if it finds a Go pointer. The type of the memory is typ, +// and src is off bytes into that type. +//go:nosplit +//go:nowritebarrier +func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) { + // Anything past typ.ptrdata is not a pointer. + if typ.ptrdata <= off { + return + } + if ptrdataSize := typ.ptrdata - off; size > ptrdataSize { + size = ptrdataSize + } + + if typ.kind&kindGCProg == 0 { + cgoCheckBits(src, typ.gcdata, off, size) + return + } + + // The type has a GC program. Try to find GC bits somewhere else. + for _, datap := range activeModules() { + if cgoInRange(src, datap.data, datap.edata) { + doff := uintptr(src) - datap.data + cgoCheckBits(add(src, -doff), datap.gcdatamask.bytedata, off+doff, size) + return + } + if cgoInRange(src, datap.bss, datap.ebss) { + boff := uintptr(src) - datap.bss + cgoCheckBits(add(src, -boff), datap.gcbssmask.bytedata, off+boff, size) + return + } + } + + s := spanOfUnchecked(uintptr(src)) + if s.state.get() == mSpanManual { + // There are no heap bits for value stored on the stack. + // For a channel receive src might be on the stack of some + // other goroutine, so we can't unwind the stack even if + // we wanted to. + // We can't expand the GC program without extra storage + // space we can't easily get. + // Fortunately we have the type information. + systemstack(func() { + cgoCheckUsingType(typ, src, off, size) + }) + return + } + + // src must be in the regular heap. + + hbits := heapBitsForAddr(uintptr(src)) + for i := uintptr(0); i < off+size; i += goarch.PtrSize { + bits := hbits.bits() + if i >= off && bits&bitPointer != 0 { + v := *(*unsafe.Pointer)(add(src, i)) + if cgoIsGoPointer(v) { + throw(cgoWriteBarrierFail) + } + } + hbits = hbits.next() + } +} + +// cgoCheckBits checks the block of memory at src, for up to size +// bytes, and throws if it finds a Go pointer. The gcbits mark each +// pointer value. The src pointer is off bytes into the gcbits. 
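These checks only run when GODEBUG=cgocheck=2 turns on the cgo write barrier. A hypothetical sketch of the kind of store they catch (the program is an assumption, not part of the runtime): writing a Go pointer into C-allocated memory should make cgoCheckWriteBarrier throw the cgoWriteBarrierFail message, whereas at the default cgocheck=1 the store goes unobserved.

package main

/*
#include <stdlib.h>
*/
import "C"

import "unsafe"

func main() {
	x := 7
	// A pointer-sized slot in C memory, which the Go GC does not scan.
	p := (**int)(C.malloc(C.size_t(unsafe.Sizeof(uintptr(0)))))

	// With GODEBUG=cgocheck=2 the write barrier routes this store through
	// cgoCheckWriteBarrier, which throws "Go pointer stored into non-Go memory".
	*p = &x

	C.free(unsafe.Pointer(p))
}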
+//go:nosplit +//go:nowritebarrier +func cgoCheckBits(src unsafe.Pointer, gcbits *byte, off, size uintptr) { + skipMask := off / goarch.PtrSize / 8 + skipBytes := skipMask * goarch.PtrSize * 8 + ptrmask := addb(gcbits, skipMask) + src = add(src, skipBytes) + off -= skipBytes + size += off + var bits uint32 + for i := uintptr(0); i < size; i += goarch.PtrSize { + if i&(goarch.PtrSize*8-1) == 0 { + bits = uint32(*ptrmask) + ptrmask = addb(ptrmask, 1) + } else { + bits >>= 1 + } + if off > 0 { + off -= goarch.PtrSize + } else { + if bits&1 != 0 { + v := *(*unsafe.Pointer)(add(src, i)) + if cgoIsGoPointer(v) { + throw(cgoWriteBarrierFail) + } + } + } + } +} + +// cgoCheckUsingType is like cgoCheckTypedBlock, but is a last ditch +// fall back to look for pointers in src using the type information. +// We only use this when looking at a value on the stack when the type +// uses a GC program, because otherwise it's more efficient to use the +// GC bits. This is called on the system stack. +//go:nowritebarrier +//go:systemstack +func cgoCheckUsingType(typ *_type, src unsafe.Pointer, off, size uintptr) { + if typ.ptrdata == 0 { + return + } + + // Anything past typ.ptrdata is not a pointer. + if typ.ptrdata <= off { + return + } + if ptrdataSize := typ.ptrdata - off; size > ptrdataSize { + size = ptrdataSize + } + + if typ.kind&kindGCProg == 0 { + cgoCheckBits(src, typ.gcdata, off, size) + return + } + switch typ.kind & kindMask { + default: + throw("can't happen") + case kindArray: + at := (*arraytype)(unsafe.Pointer(typ)) + for i := uintptr(0); i < at.len; i++ { + if off < at.elem.size { + cgoCheckUsingType(at.elem, src, off, size) + } + src = add(src, at.elem.size) + skipped := off + if skipped > at.elem.size { + skipped = at.elem.size + } + checked := at.elem.size - skipped + off -= skipped + if size <= checked { + return + } + size -= checked + } + case kindStruct: + st := (*structtype)(unsafe.Pointer(typ)) + for _, f := range st.fields { + if off < f.typ.size { + cgoCheckUsingType(f.typ, src, off, size) + } + src = add(src, f.typ.size) + skipped := off + if skipped > f.typ.size { + skipped = f.typ.size + } + checked := f.typ.size - skipped + off -= skipped + if size <= checked { + return + } + size -= checked + } + } +} diff --git a/contrib/go/_std_1.18/src/runtime/chan.go b/contrib/go/_std_1.18/src/runtime/chan.go new file mode 100644 index 0000000000..3cdb5dce11 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/chan.go @@ -0,0 +1,846 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go channels. + +// Invariants: +// At least one of c.sendq and c.recvq is empty, +// except for the case of an unbuffered channel with a single goroutine +// blocked on it for both sending and receiving using a select statement, +// in which case the length of c.sendq and c.recvq is limited only by the +// size of the select statement. +// +// For buffered channels, also: +// c.qcount > 0 implies that c.recvq is empty. +// c.qcount < c.dataqsiz implies that c.sendq is empty. 
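The select exception noted above is observable from user code. A hypothetical sketch: a single goroutine that selects on both a send and a receive on the same unbuffered channel can be queued on that channel's sendq and recvq at once, until the main goroutine receives.

package main

import "fmt"

func main() {
	ch := make(chan int) // unbuffered
	done := make(chan struct{})

	go func() {
		defer close(done)
		// If this goroutine parks before main receives, it is queued on both
		// ch.sendq (for the send case) and ch.recvq (for the receive case).
		select {
		case ch <- 1:
		case <-ch:
		}
	}()

	<-ch // completes the send case and releases the goroutine
	<-done
	fmt.Println("select resolved")
}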
+ +import ( + "internal/abi" + "runtime/internal/atomic" + "runtime/internal/math" + "unsafe" +) + +const ( + maxAlign = 8 + hchanSize = unsafe.Sizeof(hchan{}) + uintptr(-int(unsafe.Sizeof(hchan{}))&(maxAlign-1)) + debugChan = false +) + +type hchan struct { + qcount uint // total data in the queue + dataqsiz uint // size of the circular queue + buf unsafe.Pointer // points to an array of dataqsiz elements + elemsize uint16 + closed uint32 + elemtype *_type // element type + sendx uint // send index + recvx uint // receive index + recvq waitq // list of recv waiters + sendq waitq // list of send waiters + + // lock protects all fields in hchan, as well as several + // fields in sudogs blocked on this channel. + // + // Do not change another G's status while holding this lock + // (in particular, do not ready a G), as this can deadlock + // with stack shrinking. + lock mutex +} + +type waitq struct { + first *sudog + last *sudog +} + +//go:linkname reflect_makechan reflect.makechan +func reflect_makechan(t *chantype, size int) *hchan { + return makechan(t, size) +} + +func makechan64(t *chantype, size int64) *hchan { + if int64(int(size)) != size { + panic(plainError("makechan: size out of range")) + } + + return makechan(t, int(size)) +} + +func makechan(t *chantype, size int) *hchan { + elem := t.elem + + // compiler checks this but be safe. + if elem.size >= 1<<16 { + throw("makechan: invalid channel element type") + } + if hchanSize%maxAlign != 0 || elem.align > maxAlign { + throw("makechan: bad alignment") + } + + mem, overflow := math.MulUintptr(elem.size, uintptr(size)) + if overflow || mem > maxAlloc-hchanSize || size < 0 { + panic(plainError("makechan: size out of range")) + } + + // Hchan does not contain pointers interesting for GC when elements stored in buf do not contain pointers. + // buf points into the same allocation, elemtype is persistent. + // SudoG's are referenced from their owning thread so they can't be collected. + // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. + var c *hchan + switch { + case mem == 0: + // Queue or element size is zero. + c = (*hchan)(mallocgc(hchanSize, nil, true)) + // Race detector uses this location for synchronization. + c.buf = c.raceaddr() + case elem.ptrdata == 0: + // Elements do not contain pointers. + // Allocate hchan and buf in one call. + c = (*hchan)(mallocgc(hchanSize+mem, nil, true)) + c.buf = add(unsafe.Pointer(c), hchanSize) + default: + // Elements contain pointers. + c = new(hchan) + c.buf = mallocgc(mem, elem, true) + } + + c.elemsize = uint16(elem.size) + c.elemtype = elem + c.dataqsiz = uint(size) + lockInit(&c.lock, lockRankHchan) + + if debugChan { + print("makechan: chan=", c, "; elemsize=", elem.size, "; dataqsiz=", size, "\n") + } + return c +} + +// chanbuf(c, i) is pointer to the i'th slot in the buffer. +func chanbuf(c *hchan, i uint) unsafe.Pointer { + return add(c.buf, uintptr(i)*uintptr(c.elemsize)) +} + +// full reports whether a send on c would block (that is, the channel is full). +// It uses a single word-sized read of mutable state, so although +// the answer is instantaneously true, the correct answer may have changed +// by the time the calling function receives the return value. +func full(c *hchan) bool { + // c.dataqsiz is immutable (never written after the channel is created) + // so it is safe to read at any time during channel operation. + if c.dataqsiz == 0 { + // Assumes that a pointer read is relaxed-atomic. 
+ return c.recvq.first == nil + } + // Assumes that a uint read is relaxed-atomic. + return c.qcount == c.dataqsiz +} + +// entry point for c <- x from compiled code +//go:nosplit +func chansend1(c *hchan, elem unsafe.Pointer) { + chansend(c, elem, true, getcallerpc()) +} + +/* + * generic single channel send/recv + * If block is not nil, + * then the protocol will not + * sleep but return if it could + * not complete. + * + * sleep can wake up with g.param == nil + * when a channel involved in the sleep has + * been closed. it is easiest to loop and re-run + * the operation; we'll see that it's now closed. + */ +func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { + if c == nil { + if !block { + return false + } + gopark(nil, nil, waitReasonChanSendNilChan, traceEvGoStop, 2) + throw("unreachable") + } + + if debugChan { + print("chansend: chan=", c, "\n") + } + + if raceenabled { + racereadpc(c.raceaddr(), callerpc, abi.FuncPCABIInternal(chansend)) + } + + // Fast path: check for failed non-blocking operation without acquiring the lock. + // + // After observing that the channel is not closed, we observe that the channel is + // not ready for sending. Each of these observations is a single word-sized read + // (first c.closed and second full()). + // Because a closed channel cannot transition from 'ready for sending' to + // 'not ready for sending', even if the channel is closed between the two observations, + // they imply a moment between the two when the channel was both not yet closed + // and not ready for sending. We behave as if we observed the channel at that moment, + // and report that the send cannot proceed. + // + // It is okay if the reads are reordered here: if we observe that the channel is not + // ready for sending and then observe that it is not closed, that implies that the + // channel wasn't closed during the first observation. However, nothing here + // guarantees forward progress. We rely on the side effects of lock release in + // chanrecv() and closechan() to update this thread's view of c.closed and full(). + if !block && c.closed == 0 && full(c) { + return false + } + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + } + + lock(&c.lock) + + if c.closed != 0 { + unlock(&c.lock) + panic(plainError("send on closed channel")) + } + + if sg := c.recvq.dequeue(); sg != nil { + // Found a waiting receiver. We pass the value we want to send + // directly to the receiver, bypassing the channel buffer (if any). + send(c, sg, ep, func() { unlock(&c.lock) }, 3) + return true + } + + if c.qcount < c.dataqsiz { + // Space is available in the channel buffer. Enqueue the element to send. + qp := chanbuf(c, c.sendx) + if raceenabled { + racenotify(c, c.sendx, nil) + } + typedmemmove(c.elemtype, qp, ep) + c.sendx++ + if c.sendx == c.dataqsiz { + c.sendx = 0 + } + c.qcount++ + unlock(&c.lock) + return true + } + + if !block { + unlock(&c.lock) + return false + } + + // Block on the channel. Some receiver will complete our operation for us. + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + // No stack splits between assigning elem and enqueuing mysg + // on gp.waiting where copystack can find it. + mysg.elem = ep + mysg.waitlink = nil + mysg.g = gp + mysg.isSelect = false + mysg.c = c + gp.waiting = mysg + gp.param = nil + c.sendq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. 
The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) + gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2) + // Ensure the value being sent is kept alive until the + // receiver copies it out. The sudog has a pointer to the + // stack object, but sudogs aren't considered as roots of the + // stack tracer. + KeepAlive(ep) + + // someone woke us up. + if mysg != gp.waiting { + throw("G waiting list is corrupted") + } + gp.waiting = nil + gp.activeStackChans = false + closed := !mysg.success + gp.param = nil + if mysg.releasetime > 0 { + blockevent(mysg.releasetime-t0, 2) + } + mysg.c = nil + releaseSudog(mysg) + if closed { + if c.closed == 0 { + throw("chansend: spurious wakeup") + } + panic(plainError("send on closed channel")) + } + return true +} + +// send processes a send operation on an empty channel c. +// The value ep sent by the sender is copied to the receiver sg. +// The receiver is then woken up to go on its merry way. +// Channel c must be empty and locked. send unlocks c with unlockf. +// sg must already be dequeued from c. +// ep must be non-nil and point to the heap or the caller's stack. +func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { + if raceenabled { + if c.dataqsiz == 0 { + racesync(c, sg) + } else { + // Pretend we go through the buffer, even though + // we copy directly. Note that we need to increment + // the head/tail locations only when raceenabled. + racenotify(c, c.recvx, nil) + racenotify(c, c.recvx, sg) + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.sendx = c.recvx // c.sendx = (c.sendx+1) % c.dataqsiz + } + } + if sg.elem != nil { + sendDirect(c.elemtype, sg, ep) + sg.elem = nil + } + gp := sg.g + unlockf() + gp.param = unsafe.Pointer(sg) + sg.success = true + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp, skip+1) +} + +// Sends and receives on unbuffered or empty-buffered channels are the +// only operations where one running goroutine writes to the stack of +// another running goroutine. The GC assumes that stack writes only +// happen when the goroutine is running and are only done by that +// goroutine. Using a write barrier is sufficient to make up for +// violating that assumption, but the write barrier has to work. +// typedmemmove will call bulkBarrierPreWrite, but the target bytes +// are not in the heap, so that will not help. We arrange to call +// memmove and typeBitsBulkBarrier instead. + +func sendDirect(t *_type, sg *sudog, src unsafe.Pointer) { + // src is on our stack, dst is a slot on another stack. + + // Once we read sg.elem out of sg, it will no longer + // be updated if the destination's stack gets copied (shrunk). + // So make sure that no preemption points can happen between read & use. + dst := sg.elem + typeBitsBulkBarrier(t, uintptr(dst), uintptr(src), t.size) + // No need for cgo write barrier checks because dst is always + // Go memory. + memmove(dst, src, t.size) +} + +func recvDirect(t *_type, sg *sudog, dst unsafe.Pointer) { + // dst is on our stack or the heap, src is on another stack. + // The channel is locked, so src will not move during this + // operation. 
+ src := sg.elem + typeBitsBulkBarrier(t, uintptr(dst), uintptr(src), t.size) + memmove(dst, src, t.size) +} + +func closechan(c *hchan) { + if c == nil { + panic(plainError("close of nil channel")) + } + + lock(&c.lock) + if c.closed != 0 { + unlock(&c.lock) + panic(plainError("close of closed channel")) + } + + if raceenabled { + callerpc := getcallerpc() + racewritepc(c.raceaddr(), callerpc, abi.FuncPCABIInternal(closechan)) + racerelease(c.raceaddr()) + } + + c.closed = 1 + + var glist gList + + // release all readers + for { + sg := c.recvq.dequeue() + if sg == nil { + break + } + if sg.elem != nil { + typedmemclr(c.elemtype, sg.elem) + sg.elem = nil + } + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + gp := sg.g + gp.param = unsafe.Pointer(sg) + sg.success = false + if raceenabled { + raceacquireg(gp, c.raceaddr()) + } + glist.push(gp) + } + + // release all writers (they will panic) + for { + sg := c.sendq.dequeue() + if sg == nil { + break + } + sg.elem = nil + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + gp := sg.g + gp.param = unsafe.Pointer(sg) + sg.success = false + if raceenabled { + raceacquireg(gp, c.raceaddr()) + } + glist.push(gp) + } + unlock(&c.lock) + + // Ready all Gs now that we've dropped the channel lock. + for !glist.empty() { + gp := glist.pop() + gp.schedlink = 0 + goready(gp, 3) + } +} + +// empty reports whether a read from c would block (that is, the channel is +// empty). It uses a single atomic read of mutable state. +func empty(c *hchan) bool { + // c.dataqsiz is immutable. + if c.dataqsiz == 0 { + return atomic.Loadp(unsafe.Pointer(&c.sendq.first)) == nil + } + return atomic.Loaduint(&c.qcount) == 0 +} + +// entry points for <- c from compiled code +//go:nosplit +func chanrecv1(c *hchan, elem unsafe.Pointer) { + chanrecv(c, elem, true) +} + +//go:nosplit +func chanrecv2(c *hchan, elem unsafe.Pointer) (received bool) { + _, received = chanrecv(c, elem, true) + return +} + +// chanrecv receives on channel c and writes the received data to ep. +// ep may be nil, in which case received data is ignored. +// If block == false and no elements are available, returns (false, false). +// Otherwise, if c is closed, zeros *ep and returns (true, false). +// Otherwise, fills in *ep with an element and returns (true, true). +// A non-nil ep must point to the heap or the caller's stack. +func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) { + // raceenabled: don't need to check ep, as it is always on the stack + // or is new memory allocated by reflect. + + if debugChan { + print("chanrecv: chan=", c, "\n") + } + + if c == nil { + if !block { + return + } + gopark(nil, nil, waitReasonChanReceiveNilChan, traceEvGoStop, 2) + throw("unreachable") + } + + // Fast path: check for failed non-blocking operation without acquiring the lock. + if !block && empty(c) { + // After observing that the channel is not ready for receiving, we observe whether the + // channel is closed. + // + // Reordering of these checks could lead to incorrect behavior when racing with a close. + // For example, if the channel was open and not empty, was closed, and then drained, + // reordered reads could incorrectly indicate "open and empty". To prevent reordering, + // we use atomic loads for both checks, and rely on emptying and closing to happen in + // separate critical sections under the same lock. This assumption fails when closing + // an unbuffered channel with a blocked send, but that is an error condition anyway. 
+ if atomic.Load(&c.closed) == 0 { + // Because a channel cannot be reopened, the later observation of the channel + // being not closed implies that it was also not closed at the moment of the + // first observation. We behave as if we observed the channel at that moment + // and report that the receive cannot proceed. + return + } + // The channel is irreversibly closed. Re-check whether the channel has any pending data + // to receive, which could have arrived between the empty and closed checks above. + // Sequential consistency is also required here, when racing with such a send. + if empty(c) { + // The channel is irreversibly closed and empty. + if raceenabled { + raceacquire(c.raceaddr()) + } + if ep != nil { + typedmemclr(c.elemtype, ep) + } + return true, false + } + } + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + } + + lock(&c.lock) + + if c.closed != 0 && c.qcount == 0 { + if raceenabled { + raceacquire(c.raceaddr()) + } + unlock(&c.lock) + if ep != nil { + typedmemclr(c.elemtype, ep) + } + return true, false + } + + if sg := c.sendq.dequeue(); sg != nil { + // Found a waiting sender. If buffer is size 0, receive value + // directly from sender. Otherwise, receive from head of queue + // and add sender's value to the tail of the queue (both map to + // the same buffer slot because the queue is full). + recv(c, sg, ep, func() { unlock(&c.lock) }, 3) + return true, true + } + + if c.qcount > 0 { + // Receive directly from queue + qp := chanbuf(c, c.recvx) + if raceenabled { + racenotify(c, c.recvx, nil) + } + if ep != nil { + typedmemmove(c.elemtype, ep, qp) + } + typedmemclr(c.elemtype, qp) + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.qcount-- + unlock(&c.lock) + return true, true + } + + if !block { + unlock(&c.lock) + return false, false + } + + // no sender available: block on this channel. + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + // No stack splits between assigning elem and enqueuing mysg + // on gp.waiting where copystack can find it. + mysg.elem = ep + mysg.waitlink = nil + gp.waiting = mysg + mysg.g = gp + mysg.isSelect = false + mysg.c = c + gp.param = nil + c.recvq.enqueue(mysg) + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. + atomic.Store8(&gp.parkingOnChan, 1) + gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2) + + // someone woke us up + if mysg != gp.waiting { + throw("G waiting list is corrupted") + } + gp.waiting = nil + gp.activeStackChans = false + if mysg.releasetime > 0 { + blockevent(mysg.releasetime-t0, 2) + } + success := mysg.success + gp.param = nil + mysg.c = nil + releaseSudog(mysg) + return true, success +} + +// recv processes a receive operation on a full channel c. +// There are 2 parts: +// 1) The value sent by the sender sg is put into the channel +// and the sender is woken up to go on its merry way. +// 2) The value received by the receiver (the current G) is +// written to ep. +// For synchronous channels, both values are the same. +// For asynchronous channels, the receiver gets its data from +// the channel buffer and the sender's data is put in the +// channel buffer. +// Channel c must be full and locked. recv unlocks c with unlockf. +// sg must already be dequeued from c. 
+// A non-nil ep must point to the heap or the caller's stack. +func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { + if c.dataqsiz == 0 { + if raceenabled { + racesync(c, sg) + } + if ep != nil { + // copy data from sender + recvDirect(c.elemtype, sg, ep) + } + } else { + // Queue is full. Take the item at the + // head of the queue. Make the sender enqueue + // its item at the tail of the queue. Since the + // queue is full, those are both the same slot. + qp := chanbuf(c, c.recvx) + if raceenabled { + racenotify(c, c.recvx, nil) + racenotify(c, c.recvx, sg) + } + // copy data from queue to receiver + if ep != nil { + typedmemmove(c.elemtype, ep, qp) + } + // copy data from sender to queue + typedmemmove(c.elemtype, qp, sg.elem) + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.sendx = c.recvx // c.sendx = (c.sendx+1) % c.dataqsiz + } + sg.elem = nil + gp := sg.g + unlockf() + gp.param = unsafe.Pointer(sg) + sg.success = true + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp, skip+1) +} + +func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool { + // There are unlocked sudogs that point into gp's stack. Stack + // copying must lock the channels of those sudogs. + // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on the + // channel lock. + gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. The moment we unlock chanLock + // we risk gp getting readied by a channel operation and + // so gp could continue running before everything before + // the unlock is visible (even to gp itself). + unlock((*mutex)(chanLock)) + return true +} + +// compiler implements +// +// select { +// case c <- v: +// ... foo +// default: +// ... bar +// } +// +// as +// +// if selectnbsend(c, v) { +// ... foo +// } else { +// ... bar +// } +// +func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) { + return chansend(c, elem, false, getcallerpc()) +} + +// compiler implements +// +// select { +// case v, ok = <-c: +// ... foo +// default: +// ... bar +// } +// +// as +// +// if selected, ok = selectnbrecv(&v, c); selected { +// ... foo +// } else { +// ... 
bar +// } +// +func selectnbrecv(elem unsafe.Pointer, c *hchan) (selected, received bool) { + return chanrecv(c, elem, false) +} + +//go:linkname reflect_chansend reflect.chansend +func reflect_chansend(c *hchan, elem unsafe.Pointer, nb bool) (selected bool) { + return chansend(c, elem, !nb, getcallerpc()) +} + +//go:linkname reflect_chanrecv reflect.chanrecv +func reflect_chanrecv(c *hchan, nb bool, elem unsafe.Pointer) (selected bool, received bool) { + return chanrecv(c, elem, !nb) +} + +//go:linkname reflect_chanlen reflect.chanlen +func reflect_chanlen(c *hchan) int { + if c == nil { + return 0 + } + return int(c.qcount) +} + +//go:linkname reflectlite_chanlen internal/reflectlite.chanlen +func reflectlite_chanlen(c *hchan) int { + if c == nil { + return 0 + } + return int(c.qcount) +} + +//go:linkname reflect_chancap reflect.chancap +func reflect_chancap(c *hchan) int { + if c == nil { + return 0 + } + return int(c.dataqsiz) +} + +//go:linkname reflect_chanclose reflect.chanclose +func reflect_chanclose(c *hchan) { + closechan(c) +} + +func (q *waitq) enqueue(sgp *sudog) { + sgp.next = nil + x := q.last + if x == nil { + sgp.prev = nil + q.first = sgp + q.last = sgp + return + } + sgp.prev = x + x.next = sgp + q.last = sgp +} + +func (q *waitq) dequeue() *sudog { + for { + sgp := q.first + if sgp == nil { + return nil + } + y := sgp.next + if y == nil { + q.first = nil + q.last = nil + } else { + y.prev = nil + q.first = y + sgp.next = nil // mark as removed (see dequeueSudog) + } + + // if a goroutine was put on this queue because of a + // select, there is a small window between the goroutine + // being woken up by a different case and it grabbing the + // channel locks. Once it has the lock + // it removes itself from the queue, so we won't see it after that. + // We use a flag in the G struct to tell us when someone + // else has won the race to signal this goroutine but the goroutine + // hasn't removed itself from the queue yet. + if sgp.isSelect && !atomic.Cas(&sgp.g.selectDone, 0, 1) { + continue + } + + return sgp + } +} + +func (c *hchan) raceaddr() unsafe.Pointer { + // Treat read-like and write-like operations on the channel to + // happen at this address. Avoid using the address of qcount + // or dataqsiz, because the len() and cap() builtins read + // those addresses, and we don't want them racing with + // operations like close(). + return unsafe.Pointer(&c.buf) +} + +func racesync(c *hchan, sg *sudog) { + racerelease(chanbuf(c, 0)) + raceacquireg(sg.g, chanbuf(c, 0)) + racereleaseg(sg.g, chanbuf(c, 0)) + raceacquire(chanbuf(c, 0)) +} + +// Notify the race detector of a send or receive involving buffer entry idx +// and a channel c or its communicating partner sg. +// This function handles the special case of c.elemsize==0. +func racenotify(c *hchan, idx uint, sg *sudog) { + // We could have passed the unsafe.Pointer corresponding to entry idx + // instead of idx itself. However, in a future version of this function, + // we can use idx to better handle the case of elemsize==0. + // A future improvement to the detector is to call TSan with c and idx: + // this way, Go will continue to not allocating buffer entries for channels + // of elemsize==0, yet the race detector can be made to handle multiple + // sync objects underneath the hood (one sync object per idx) + qp := chanbuf(c, idx) + // When elemsize==0, we don't allocate a full buffer for the channel. + // Instead of individual buffer entries, the race detector uses the + // c.buf as the only buffer entry. 
This simplification prevents us from + // following the memory model's happens-before rules (rules that are + // implemented in racereleaseacquire). Instead, we accumulate happens-before + // information in the synchronization object associated with c.buf. + if c.elemsize == 0 { + if sg == nil { + raceacquire(qp) + racerelease(qp) + } else { + raceacquireg(sg.g, qp) + racereleaseg(sg.g, qp) + } + } else { + if sg == nil { + racereleaseacquire(qp) + } else { + racereleaseacquireg(sg.g, qp) + } + } +} diff --git a/contrib/go/_std_1.18/src/runtime/checkptr.go b/contrib/go/_std_1.18/src/runtime/checkptr.go new file mode 100644 index 0000000000..2d4afd5cf6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/checkptr.go @@ -0,0 +1,109 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) { + // nil pointer is always suitably aligned (#47430). + if p == nil { + return + } + + // Check that (*[n]elem)(p) is appropriately aligned. + // Note that we allow unaligned pointers if the types they point to contain + // no pointers themselves. See issue 37298. + // TODO(mdempsky): What about fieldAlign? + if elem.ptrdata != 0 && uintptr(p)&(uintptr(elem.align)-1) != 0 { + throw("checkptr: misaligned pointer conversion") + } + + // Check that (*[n]elem)(p) doesn't straddle multiple heap objects. + // TODO(mdempsky): Fix #46938 so we don't need to worry about overflow here. + if checkptrStraddles(p, n*elem.size) { + throw("checkptr: converted pointer straddles multiple allocations") + } +} + +// checkptrStraddles reports whether the first size-bytes of memory +// addressed by ptr is known to straddle more than one Go allocation. +func checkptrStraddles(ptr unsafe.Pointer, size uintptr) bool { + if size <= 1 { + return false + } + + // Check that add(ptr, size-1) won't overflow. This avoids the risk + // of producing an illegal pointer value (assuming ptr is legal). + if uintptr(ptr) >= -(size - 1) { + return true + } + end := add(ptr, size-1) + + // TODO(mdempsky): Detect when [ptr, end] contains Go allocations, + // but neither ptr nor end point into one themselves. + + return checkptrBase(ptr) != checkptrBase(end) +} + +func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) { + if 0 < uintptr(p) && uintptr(p) < minLegalPointer { + throw("checkptr: pointer arithmetic computed bad pointer value") + } + + // Check that if the computed pointer p points into a heap + // object, then one of the original pointers must have pointed + // into the same object. + base := checkptrBase(p) + if base == 0 { + return + } + + for _, original := range originals { + if base == checkptrBase(original) { + return + } + } + + throw("checkptr: pointer arithmetic result points to invalid allocation") +} + +// checkptrBase returns the base address for the allocation containing +// the address p. +// +// Importantly, if p1 and p2 point into the same variable, then +// checkptrBase(p1) == checkptrBase(p2). However, the converse/inverse +// is not necessarily true as allocations can have trailing padding, +// and multiple variables may be packed into a single allocation. 
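+// Illustrative sketch (not part of the upstream file): checkptr
+// instrumentation (enabled by default under the race detector) uses the
+// checks above to flag conversions whose target type spans more memory
+// than the underlying allocation, e.g.
+//
+//	p := unsafe.Pointer(new(uint64)) // 8-byte heap allocation
+//	_ = (*[2]uint64)(p)              // may throw "checkptr: converted pointer straddles multiple allocations"
+//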
+func checkptrBase(p unsafe.Pointer) uintptr { + // stack + if gp := getg(); gp.stack.lo <= uintptr(p) && uintptr(p) < gp.stack.hi { + // TODO(mdempsky): Walk the stack to identify the + // specific stack frame or even stack object that p + // points into. + // + // In the mean time, use "1" as a pseudo-address to + // represent the stack. This is an invalid address on + // all platforms, so it's guaranteed to be distinct + // from any of the addresses we might return below. + return 1 + } + + // heap (must check after stack because of #35068) + if base, _, _ := findObject(uintptr(p), 0, 0); base != 0 { + return base + } + + // data or bss + for _, datap := range activeModules() { + if datap.data <= uintptr(p) && uintptr(p) < datap.edata { + return datap.data + } + if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss { + return datap.bss + } + } + + return 0 +} diff --git a/contrib/go/_std_1.18/src/runtime/compiler.go b/contrib/go/_std_1.18/src/runtime/compiler.go new file mode 100644 index 0000000000..1ebc62dea1 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/compiler.go @@ -0,0 +1,13 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Compiler is the name of the compiler toolchain that built the +// running binary. Known toolchains are: +// +// gc Also known as cmd/compile. +// gccgo The gccgo front end, part of the GCC compiler suite. +// +const Compiler = "gc" diff --git a/contrib/go/_std_1.18/src/runtime/complex.go b/contrib/go/_std_1.18/src/runtime/complex.go new file mode 100644 index 0000000000..07c596fc0b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/complex.go @@ -0,0 +1,61 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// inf2one returns a signed 1 if f is an infinity and a signed 0 otherwise. +// The sign of the result is the sign of f. +func inf2one(f float64) float64 { + g := 0.0 + if isInf(f) { + g = 1.0 + } + return copysign(g, f) +} + +func complex128div(n complex128, m complex128) complex128 { + var e, f float64 // complex(e, f) = n/m + + // Algorithm for robust complex division as described in + // Robert L. Smith: Algorithm 116: Complex division. Commun. ACM 5(8): 435 (1962). + if abs(real(m)) >= abs(imag(m)) { + ratio := imag(m) / real(m) + denom := real(m) + ratio*imag(m) + e = (real(n) + imag(n)*ratio) / denom + f = (imag(n) - real(n)*ratio) / denom + } else { + ratio := real(m) / imag(m) + denom := imag(m) + ratio*real(m) + e = (real(n)*ratio + imag(n)) / denom + f = (imag(n)*ratio - real(n)) / denom + } + + if isNaN(e) && isNaN(f) { + // Correct final result to infinities and zeros if applicable. + // Matches C99: ISO/IEC 9899:1999 - G.5.1 Multiplicative operators. 
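+		// For illustration (not part of the upstream file): a zero divisor
+		// with a finite, non-NaN dividend is corrected to an infinite
+		// result rather than NaN, e.g.
+		//
+		//	complex(1, 1) / complex(0, 0) // -> (+Inf+Infi), not (NaN+NaNi)
+		//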
+ + a, b := real(n), imag(n) + c, d := real(m), imag(m) + + switch { + case m == 0 && (!isNaN(a) || !isNaN(b)): + e = copysign(inf, c) * a + f = copysign(inf, c) * b + + case (isInf(a) || isInf(b)) && isFinite(c) && isFinite(d): + a = inf2one(a) + b = inf2one(b) + e = inf * (a*c + b*d) + f = inf * (b*c - a*d) + + case (isInf(c) || isInf(d)) && isFinite(a) && isFinite(b): + c = inf2one(c) + d = inf2one(d) + e = 0 * (a*c + b*d) + f = 0 * (b*c - a*d) + } + } + + return complex(e, f) +} diff --git a/contrib/go/_std_1.18/src/runtime/cpuflags.go b/contrib/go/_std_1.18/src/runtime/cpuflags.go new file mode 100644 index 0000000000..bbe93c5bea --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cpuflags.go @@ -0,0 +1,34 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX) + offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS) + offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP) + + offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA) + + offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA) +) + +var ( + // Set in runtime.cpuinit. + // TODO: deprecate these; use internal/cpu directly. + x86HasPOPCNT bool + x86HasSSE41 bool + x86HasFMA bool + + armHasVFPv4 bool + + arm64HasATOMICS bool +) diff --git a/contrib/go/_std_1.18/src/runtime/cpuflags_amd64.go b/contrib/go/_std_1.18/src/runtime/cpuflags_amd64.go new file mode 100644 index 0000000000..8cca4bca8f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cpuflags_amd64.go @@ -0,0 +1,24 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" +) + +var useAVXmemmove bool + +func init() { + // Let's remove stepping and reserved fields + processor := processorVersionInfo & 0x0FFF3FF0 + + isIntelBridgeFamily := isIntel && + processor == 0x206A0 || + processor == 0x206D0 || + processor == 0x306A0 || + processor == 0x306E0 + + useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily +} diff --git a/contrib/go/_std_1.18/src/runtime/cpuprof.go b/contrib/go/_std_1.18/src/runtime/cpuprof.go new file mode 100644 index 0000000000..48cef46fe9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cpuprof.go @@ -0,0 +1,218 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// CPU profiling. +// +// The signal handler for the profiling clock tick adds a new stack trace +// to a log of recent traces. The log is read by a user goroutine that +// turns it into formatted profile data. If the reader does not keep up +// with the log, those writes will be recorded as a count of lost records. +// The actual profile buffer is in profbuf.go. + +package runtime + +import ( + "internal/abi" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +const maxCPUProfStack = 64 + +type cpuProfile struct { + lock mutex + on bool // profiling is on + log *profBuf // profile events written here + + // extra holds extra stacks accumulated in addNonGo + // corresponding to profiling signals arriving on + // non-Go-created threads. 
Those stacks are written + // to log the next time a normal Go thread gets the + // signal handler. + // Assuming the stacks are 2 words each (we don't get + // a full traceback from those threads), plus one word + // size for framing, 100 Hz profiling would generate + // 300 words per second. + // Hopefully a normal Go thread will get the profiling + // signal at least once every few seconds. + extra [1000]uintptr + numExtra int + lostExtra uint64 // count of frames lost because extra is full + lostAtomic uint64 // count of frames lost because of being in atomic64 on mips/arm; updated racily +} + +var cpuprof cpuProfile + +// SetCPUProfileRate sets the CPU profiling rate to hz samples per second. +// If hz <= 0, SetCPUProfileRate turns off profiling. +// If the profiler is on, the rate cannot be changed without first turning it off. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.cpuprofile flag instead of calling +// SetCPUProfileRate directly. +func SetCPUProfileRate(hz int) { + // Clamp hz to something reasonable. + if hz < 0 { + hz = 0 + } + if hz > 1000000 { + hz = 1000000 + } + + lock(&cpuprof.lock) + if hz > 0 { + if cpuprof.on || cpuprof.log != nil { + print("runtime: cannot set cpu profile rate until previous profile has finished.\n") + unlock(&cpuprof.lock) + return + } + + cpuprof.on = true + cpuprof.log = newProfBuf(1, 1<<17, 1<<14) + hdr := [1]uint64{uint64(hz)} + cpuprof.log.write(nil, nanotime(), hdr[:], nil) + setcpuprofilerate(int32(hz)) + } else if cpuprof.on { + setcpuprofilerate(0) + cpuprof.on = false + cpuprof.addExtra() + cpuprof.log.close() + } + unlock(&cpuprof.lock) +} + +// add adds the stack trace to the profile. +// It is called from signal handlers and other limited environments +// and cannot allocate memory or acquire locks that might be +// held at the time of the signal, nor can it use substantial amounts +// of stack. +//go:nowritebarrierrec +func (p *cpuProfile) add(tagPtr *unsafe.Pointer, stk []uintptr) { + // Simple cas-lock to coordinate with setcpuprofilerate. + for !atomic.Cas(&prof.signalLock, 0, 1) { + osyield() + } + + if prof.hz != 0 { // implies cpuprof.log != nil + if p.numExtra > 0 || p.lostExtra > 0 || p.lostAtomic > 0 { + p.addExtra() + } + hdr := [1]uint64{1} + // Note: write "knows" that the argument is &gp.labels, + // because otherwise its write barrier behavior may not + // be correct. See the long comment there before + // changing the argument here. + cpuprof.log.write(tagPtr, nanotime(), hdr[:], stk) + } + + atomic.Store(&prof.signalLock, 0) +} + +// addNonGo adds the non-Go stack trace to the profile. +// It is called from a non-Go thread, so we cannot use much stack at all, +// nor do anything that needs a g or an m. +// In particular, we can't call cpuprof.log.write. +// Instead, we copy the stack into cpuprof.extra, +// which will be drained the next time a Go thread +// gets the signal handling event. +//go:nosplit +//go:nowritebarrierrec +func (p *cpuProfile) addNonGo(stk []uintptr) { + // Simple cas-lock to coordinate with SetCPUProfileRate. + // (Other calls to add or addNonGo should be blocked out + // by the fact that only one SIGPROF can be handled by the + // process at a time. If not, this lock will serialize those too.) 
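+	// Descriptive note (not part of the upstream file): the cas-lock is
+	// the runtime's minimal signal-safe spin lock: acquire with
+	// atomic.Cas(&prof.signalLock, 0, 1), yielding while contended, and
+	// release with atomic.Store(&prof.signalLock, 0), mirroring add above.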
+ for !atomic.Cas(&prof.signalLock, 0, 1) { + osyield() + } + + if cpuprof.numExtra+1+len(stk) < len(cpuprof.extra) { + i := cpuprof.numExtra + cpuprof.extra[i] = uintptr(1 + len(stk)) + copy(cpuprof.extra[i+1:], stk) + cpuprof.numExtra += 1 + len(stk) + } else { + cpuprof.lostExtra++ + } + + atomic.Store(&prof.signalLock, 0) +} + +// addExtra adds the "extra" profiling events, +// queued by addNonGo, to the profile log. +// addExtra is called either from a signal handler on a Go thread +// or from an ordinary goroutine; either way it can use stack +// and has a g. The world may be stopped, though. +func (p *cpuProfile) addExtra() { + // Copy accumulated non-Go profile events. + hdr := [1]uint64{1} + for i := 0; i < p.numExtra; { + p.log.write(nil, 0, hdr[:], p.extra[i+1:i+int(p.extra[i])]) + i += int(p.extra[i]) + } + p.numExtra = 0 + + // Report any lost events. + if p.lostExtra > 0 { + hdr := [1]uint64{p.lostExtra} + lostStk := [2]uintptr{ + abi.FuncPCABIInternal(_LostExternalCode) + sys.PCQuantum, + abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum, + } + p.log.write(nil, 0, hdr[:], lostStk[:]) + p.lostExtra = 0 + } + + if p.lostAtomic > 0 { + hdr := [1]uint64{p.lostAtomic} + lostStk := [2]uintptr{ + abi.FuncPCABIInternal(_LostSIGPROFDuringAtomic64) + sys.PCQuantum, + abi.FuncPCABIInternal(_System) + sys.PCQuantum, + } + p.log.write(nil, 0, hdr[:], lostStk[:]) + p.lostAtomic = 0 + } + +} + +// CPUProfile panics. +// It formerly provided raw access to chunks of +// a pprof-format profile generated by the runtime. +// The details of generating that format have changed, +// so this functionality has been removed. +// +// Deprecated: Use the runtime/pprof package, +// or the handlers in the net/http/pprof package, +// or the testing package's -test.cpuprofile flag instead. +func CPUProfile() []byte { + panic("CPUProfile no longer available") +} + +//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond +func runtime_pprof_runtime_cyclesPerSecond() int64 { + return tickspersecond() +} + +// readProfile, provided to runtime/pprof, returns the next chunk of +// binary CPU profiling stack trace data, blocking until data is available. +// If profiling is turned off and all the profile data accumulated while it was +// on has been returned, readProfile returns eof=true. +// The caller must save the returned data and tags before calling readProfile again. +// The returned data contains a whole number of records, and tags contains +// exactly one entry per record. +// +//go:linkname runtime_pprof_readProfile runtime/pprof.readProfile +func runtime_pprof_readProfile() ([]uint64, []unsafe.Pointer, bool) { + lock(&cpuprof.lock) + log := cpuprof.log + unlock(&cpuprof.lock) + data, tags, eof := log.read(profBufBlocking) + if len(data) == 0 && eof { + lock(&cpuprof.lock) + cpuprof.log = nil + unlock(&cpuprof.lock) + } + return data, tags, eof +} diff --git a/contrib/go/_std_1.18/src/runtime/cputicks.go b/contrib/go/_std_1.18/src/runtime/cputicks.go new file mode 100644 index 0000000000..2cf3240333 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/cputicks.go @@ -0,0 +1,11 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !arm && !arm64 && !mips64 && !mips64le && !mips && !mipsle && !wasm + +package runtime + +// careful: cputicks is not guaranteed to be monotonic! 
In particular, we have +// noticed drift between cpus on certain os/arch combinations. See issue 8976. +func cputicks() int64 diff --git a/contrib/go/_std_1.18/src/runtime/debug.go b/contrib/go/_std_1.18/src/runtime/debug.go new file mode 100644 index 0000000000..2703a0ce01 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/debug.go @@ -0,0 +1,116 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// GOMAXPROCS sets the maximum number of CPUs that can be executing +// simultaneously and returns the previous setting. It defaults to +// the value of runtime.NumCPU. If n < 1, it does not change the current setting. +// This call will go away when the scheduler improves. +func GOMAXPROCS(n int) int { + if GOARCH == "wasm" && n > 1 { + n = 1 // WebAssembly has no threads yet, so only one CPU is possible. + } + + lock(&sched.lock) + ret := int(gomaxprocs) + unlock(&sched.lock) + if n <= 0 || n == ret { + return ret + } + + stopTheWorldGC("GOMAXPROCS") + + // newprocs will be processed by startTheWorld + newprocs = int32(n) + + startTheWorldGC() + return ret +} + +// NumCPU returns the number of logical CPUs usable by the current process. +// +// The set of available CPUs is checked by querying the operating system +// at process startup. Changes to operating system CPU allocation after +// process startup are not reflected. +func NumCPU() int { + return int(ncpu) +} + +// NumCgoCall returns the number of cgo calls made by the current process. +func NumCgoCall() int64 { + var n = int64(atomic.Load64(&ncgocall)) + for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink { + n += int64(mp.ncgocall) + } + return n +} + +// NumGoroutine returns the number of goroutines that currently exist. +func NumGoroutine() int { + return int(gcount()) +} + +//go:linkname debug_modinfo runtime/debug.modinfo +func debug_modinfo() string { + return modinfo +} + +// mayMoreStackPreempt is a maymorestack hook that forces a preemption +// at every possible cooperative preemption point. +// +// This is valuable to apply to the runtime, which can be sensitive to +// preemption points. To apply this to all preemption points in the +// runtime and runtime-like code, use the following in bash or zsh: +// +// X=(-{gc,asm}flags={runtime/...,reflect,sync}=-d=maymorestack=runtime.mayMoreStackPreempt) GOFLAGS=${X[@]} +// +// This must be deeply nosplit because it is called from a function +// prologue before the stack is set up and because the compiler will +// call it from any splittable prologue (leading to infinite +// recursion). +// +// Ideally it should also use very little stack because the linker +// doesn't currently account for this in nosplit stack depth checking. +// +//go:nosplit +// +// Ensure mayMoreStackPreempt can be called for all ABIs. +// +//go:linkname mayMoreStackPreempt +func mayMoreStackPreempt() { + // Don't do anything on the g0 or gsignal stack. + g := getg() + if g == g.m.g0 || g == g.m.gsignal { + return + } + // Force a preemption, unless the stack is already poisoned. + if g.stackguard0 < stackPoisonMin { + g.stackguard0 = stackPreempt + } +} + +// mayMoreStackMove is a maymorestack hook that forces stack movement +// at every possible point. +// +// See mayMoreStackPreempt. 
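+// As an illustrative note (not part of the upstream file), this hook is
+// applied the same way as mayMoreStackPreempt, substituting
+// -d=maymorestack=runtime.mayMoreStackMove in the -gcflags/-asmflags
+// invocation shown above.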
+// +//go:nosplit +//go:linkname mayMoreStackMove +func mayMoreStackMove() { + // Don't do anything on the g0 or gsignal stack. + g := getg() + if g == g.m.g0 || g == g.m.gsignal { + return + } + // Force stack movement, unless the stack is already poisoned. + if g.stackguard0 < stackPoisonMin { + g.stackguard0 = stackForceMove + } +} diff --git a/contrib/go/_std_1.18/src/runtime/debugcall.go b/contrib/go/_std_1.18/src/runtime/debugcall.go new file mode 100644 index 0000000000..2f164e7fd7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/debugcall.go @@ -0,0 +1,253 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm64 + +package runtime + +import "unsafe" + +const ( + debugCallSystemStack = "executing on Go runtime stack" + debugCallUnknownFunc = "call from unknown function" + debugCallRuntime = "call from within the Go runtime" + debugCallUnsafePoint = "call not at safe point" +) + +func debugCallV2() +func debugCallPanicked(val any) + +// debugCallCheck checks whether it is safe to inject a debugger +// function call with return PC pc. If not, it returns a string +// explaining why. +// +//go:nosplit +func debugCallCheck(pc uintptr) string { + // No user calls from the system stack. + if getg() != getg().m.curg { + return debugCallSystemStack + } + if sp := getcallersp(); !(getg().stack.lo < sp && sp <= getg().stack.hi) { + // Fast syscalls (nanotime) and racecall switch to the + // g0 stack without switching g. We can't safely make + // a call in this state. (We can't even safely + // systemstack.) + return debugCallSystemStack + } + + // Switch to the system stack to avoid overflowing the user + // stack. + var ret string + systemstack(func() { + f := findfunc(pc) + if !f.valid() { + ret = debugCallUnknownFunc + return + } + + name := funcname(f) + + switch name { + case "debugCall32", + "debugCall64", + "debugCall128", + "debugCall256", + "debugCall512", + "debugCall1024", + "debugCall2048", + "debugCall4096", + "debugCall8192", + "debugCall16384", + "debugCall32768", + "debugCall65536": + // These functions are allowed so that the debugger can initiate multiple function calls. + // See: https://golang.org/cl/161137/ + return + } + + // Disallow calls from the runtime. We could + // potentially make this condition tighter (e.g., not + // when locks are held), but there are enough tightly + // coded sequences (e.g., defer handling) that it's + // better to play it safe. + if pfx := "runtime."; len(name) > len(pfx) && name[:len(pfx)] == pfx { + ret = debugCallRuntime + return + } + + // Check that this isn't an unsafe-point. + if pc != f.entry() { + pc-- + } + up := pcdatavalue(f, _PCDATA_UnsafePoint, pc, nil) + if up != _PCDATA_UnsafePointSafe { + // Not at a safe point. + ret = debugCallUnsafePoint + } + }) + return ret +} + +// debugCallWrap starts a new goroutine to run a debug call and blocks +// the calling goroutine. On the goroutine, it prepares to recover +// panics from the debug call, and then calls the call dispatching +// function at PC dispatch. +// +// This must be deeply nosplit because there are untyped values on the +// stack from debugCallV2. +// +//go:nosplit +func debugCallWrap(dispatch uintptr) { + var lockedm bool + var lockedExt uint32 + callerpc := getcallerpc() + gp := getg() + + // Create a new goroutine to execute the call on. Run this on + // the system stack to avoid growing our stack. 
+ systemstack(func() { + // TODO(mknyszek): It would be nice to wrap these arguments in an allocated + // closure and start the goroutine with that closure, but the compiler disallows + // implicit closure allocation in the runtime. + fn := debugCallWrap1 + newg := newproc1(*(**funcval)(unsafe.Pointer(&fn)), gp, callerpc) + args := &debugCallWrapArgs{ + dispatch: dispatch, + callingG: gp, + } + newg.param = unsafe.Pointer(args) + + // If the current G is locked, then transfer that + // locked-ness to the new goroutine. + if gp.lockedm != 0 { + // Save lock state to restore later. + mp := gp.m + if mp != gp.lockedm.ptr() { + throw("inconsistent lockedm") + } + + lockedm = true + lockedExt = mp.lockedExt + + // Transfer external lock count to internal so + // it can't be unlocked from the debug call. + mp.lockedInt++ + mp.lockedExt = 0 + + mp.lockedg.set(newg) + newg.lockedm.set(mp) + gp.lockedm = 0 + } + + // Mark the calling goroutine as being at an async + // safe-point, since it has a few conservative frames + // at the bottom of the stack. This also prevents + // stack shrinks. + gp.asyncSafePoint = true + + // Stash newg away so we can execute it below (mcall's + // closure can't capture anything). + gp.schedlink.set(newg) + }) + + // Switch to the new goroutine. + mcall(func(gp *g) { + // Get newg. + newg := gp.schedlink.ptr() + gp.schedlink = 0 + + // Park the calling goroutine. + gp.waitreason = waitReasonDebugCall + if trace.enabled { + traceGoPark(traceEvGoBlock, 1) + } + casgstatus(gp, _Grunning, _Gwaiting) + dropg() + + // Directly execute the new goroutine. The debug + // protocol will continue on the new goroutine, so + // it's important we not just let the scheduler do + // this or it may resume a different goroutine. + execute(newg, true) + }) + + // We'll resume here when the call returns. + + // Restore locked state. + if lockedm { + mp := gp.m + mp.lockedExt = lockedExt + mp.lockedInt-- + mp.lockedg.set(gp) + gp.lockedm.set(mp) + } + + gp.asyncSafePoint = false +} + +type debugCallWrapArgs struct { + dispatch uintptr + callingG *g +} + +// debugCallWrap1 is the continuation of debugCallWrap on the callee +// goroutine. +func debugCallWrap1() { + gp := getg() + args := (*debugCallWrapArgs)(gp.param) + dispatch, callingG := args.dispatch, args.callingG + gp.param = nil + + // Dispatch call and trap panics. + debugCallWrap2(dispatch) + + // Resume the caller goroutine. + getg().schedlink.set(callingG) + mcall(func(gp *g) { + callingG := gp.schedlink.ptr() + gp.schedlink = 0 + + // Unlock this goroutine from the M if necessary. The + // calling G will relock. + if gp.lockedm != 0 { + gp.lockedm = 0 + gp.m.lockedg = 0 + } + + // Switch back to the calling goroutine. At some point + // the scheduler will schedule us again and we'll + // finish exiting. + if trace.enabled { + traceGoSched() + } + casgstatus(gp, _Grunning, _Grunnable) + dropg() + lock(&sched.lock) + globrunqput(gp) + unlock(&sched.lock) + + if trace.enabled { + traceGoUnpark(callingG, 0) + } + casgstatus(callingG, _Gwaiting, _Grunnable) + execute(callingG, true) + }) +} + +func debugCallWrap2(dispatch uintptr) { + // Call the dispatch function and trap panics. 
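+	// Descriptive note (not part of the upstream file): the lines below
+	// manufacture a func() value whose code pointer is the raw dispatch
+	// PC by aliasing a funcval, so the debugger-supplied entry point can
+	// be invoked as an ordinary Go call; the deferred recover reports any
+	// panic via debugCallPanicked instead of letting it unwind further.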
+ var dispatchF func() + dispatchFV := funcval{dispatch} + *(*unsafe.Pointer)(unsafe.Pointer(&dispatchF)) = noescape(unsafe.Pointer(&dispatchFV)) + + var ok bool + defer func() { + if !ok { + err := recover() + debugCallPanicked(err) + } + }() + dispatchF() + ok = true +} diff --git a/contrib/go/_std_1.18/src/runtime/debuglog.go b/contrib/go/_std_1.18/src/runtime/debuglog.go new file mode 100644 index 0000000000..75b91c4216 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/debuglog.go @@ -0,0 +1,820 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file provides an internal debug logging facility. The debug +// log is a lightweight, in-memory, per-M ring buffer. By default, the +// runtime prints the debug log on panic. +// +// To print something to the debug log, call dlog to obtain a dlogger +// and use the methods on that to add values. The values will be +// space-separated in the output (much like println). +// +// This facility can be enabled by passing -tags debuglog when +// building. Without this tag, dlog calls compile to nothing. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// debugLogBytes is the size of each per-M ring buffer. This is +// allocated off-heap to avoid blowing up the M and hence the GC'd +// heap size. +const debugLogBytes = 16 << 10 + +// debugLogStringLimit is the maximum number of bytes in a string. +// Above this, the string will be truncated with "..(n more bytes).." +const debugLogStringLimit = debugLogBytes / 8 + +// dlog returns a debug logger. The caller can use methods on the +// returned logger to add values, which will be space-separated in the +// final output, much like println. The caller must call end() to +// finish the message. +// +// dlog can be used from highly-constrained corners of the runtime: it +// is safe to use in the signal handler, from within the write +// barrier, from within the stack implementation, and in places that +// must be recursively nosplit. +// +// This will be compiled away if built without the debuglog build tag. +// However, argument construction may not be. If any of the arguments +// are not literals or trivial expressions, consider protecting the +// call with "if dlogEnabled". +// +//go:nosplit +//go:nowritebarrierrec +func dlog() *dlogger { + if !dlogEnabled { + return nil + } + + // Get the time. + tick, nano := uint64(cputicks()), uint64(nanotime()) + + // Try to get a cached logger. + l := getCachedDlogger() + + // If we couldn't get a cached logger, try to get one from the + // global pool. + if l == nil { + allp := (*uintptr)(unsafe.Pointer(&allDloggers)) + all := (*dlogger)(unsafe.Pointer(atomic.Loaduintptr(allp))) + for l1 := all; l1 != nil; l1 = l1.allLink { + if atomic.Load(&l1.owned) == 0 && atomic.Cas(&l1.owned, 0, 1) { + l = l1 + break + } + } + } + + // If that failed, allocate a new logger. + if l == nil { + l = (*dlogger)(sysAlloc(unsafe.Sizeof(dlogger{}), nil)) + if l == nil { + throw("failed to allocate debug log") + } + l.w.r.data = &l.w.data + l.owned = 1 + + // Prepend to allDloggers list. + headp := (*uintptr)(unsafe.Pointer(&allDloggers)) + for { + head := atomic.Loaduintptr(headp) + l.allLink = (*dlogger)(unsafe.Pointer(head)) + if atomic.Casuintptr(headp, head, uintptr(unsafe.Pointer(l))) { + break + } + } + } + + // If the time delta is getting too high, write a new sync + // packet. 
We set the limit so we don't write more than 6 + // bytes of delta in the record header. + const deltaLimit = 1<<(3*7) - 1 // ~2ms between sync packets + if tick-l.w.tick > deltaLimit || nano-l.w.nano > deltaLimit { + l.w.writeSync(tick, nano) + } + + // Reserve space for framing header. + l.w.ensure(debugLogHeaderSize) + l.w.write += debugLogHeaderSize + + // Write record header. + l.w.uvarint(tick - l.w.tick) + l.w.uvarint(nano - l.w.nano) + gp := getg() + if gp != nil && gp.m != nil && gp.m.p != 0 { + l.w.varint(int64(gp.m.p.ptr().id)) + } else { + l.w.varint(-1) + } + + return l +} + +// A dlogger writes to the debug log. +// +// To obtain a dlogger, call dlog(). When done with the dlogger, call +// end(). +// +//go:notinheap +type dlogger struct { + w debugLogWriter + + // allLink is the next dlogger in the allDloggers list. + allLink *dlogger + + // owned indicates that this dlogger is owned by an M. This is + // accessed atomically. + owned uint32 +} + +// allDloggers is a list of all dloggers, linked through +// dlogger.allLink. This is accessed atomically. This is prepend only, +// so it doesn't need to protect against ABA races. +var allDloggers *dlogger + +//go:nosplit +func (l *dlogger) end() { + if !dlogEnabled { + return + } + + // Fill in framing header. + size := l.w.write - l.w.r.end + if !l.w.writeFrameAt(l.w.r.end, size) { + throw("record too large") + } + + // Commit the record. + l.w.r.end = l.w.write + + // Attempt to return this logger to the cache. + if putCachedDlogger(l) { + return + } + + // Return the logger to the global pool. + atomic.Store(&l.owned, 0) +} + +const ( + debugLogUnknown = 1 + iota + debugLogBoolTrue + debugLogBoolFalse + debugLogInt + debugLogUint + debugLogHex + debugLogPtr + debugLogString + debugLogConstString + debugLogStringOverflow + + debugLogPC + debugLogTraceback +) + +//go:nosplit +func (l *dlogger) b(x bool) *dlogger { + if !dlogEnabled { + return l + } + if x { + l.w.byte(debugLogBoolTrue) + } else { + l.w.byte(debugLogBoolFalse) + } + return l +} + +//go:nosplit +func (l *dlogger) i(x int) *dlogger { + return l.i64(int64(x)) +} + +//go:nosplit +func (l *dlogger) i8(x int8) *dlogger { + return l.i64(int64(x)) +} + +//go:nosplit +func (l *dlogger) i16(x int16) *dlogger { + return l.i64(int64(x)) +} + +//go:nosplit +func (l *dlogger) i32(x int32) *dlogger { + return l.i64(int64(x)) +} + +//go:nosplit +func (l *dlogger) i64(x int64) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogInt) + l.w.varint(x) + return l +} + +//go:nosplit +func (l *dlogger) u(x uint) *dlogger { + return l.u64(uint64(x)) +} + +//go:nosplit +func (l *dlogger) uptr(x uintptr) *dlogger { + return l.u64(uint64(x)) +} + +//go:nosplit +func (l *dlogger) u8(x uint8) *dlogger { + return l.u64(uint64(x)) +} + +//go:nosplit +func (l *dlogger) u16(x uint16) *dlogger { + return l.u64(uint64(x)) +} + +//go:nosplit +func (l *dlogger) u32(x uint32) *dlogger { + return l.u64(uint64(x)) +} + +//go:nosplit +func (l *dlogger) u64(x uint64) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogUint) + l.w.uvarint(x) + return l +} + +//go:nosplit +func (l *dlogger) hex(x uint64) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogHex) + l.w.uvarint(x) + return l +} + +//go:nosplit +func (l *dlogger) p(x any) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogPtr) + if x == nil { + l.w.uvarint(0) + } else { + v := efaceOf(&x) + switch v._type.kind & kindMask { + case kindChan, kindFunc, kindMap, kindPtr, kindUnsafePointer: + 
l.w.uvarint(uint64(uintptr(v.data))) + default: + throw("not a pointer type") + } + } + return l +} + +//go:nosplit +func (l *dlogger) s(x string) *dlogger { + if !dlogEnabled { + return l + } + str := stringStructOf(&x) + datap := &firstmoduledata + if len(x) > 4 && datap.etext <= uintptr(str.str) && uintptr(str.str) < datap.end { + // String constants are in the rodata section, which + // isn't recorded in moduledata. But it has to be + // somewhere between etext and end. + l.w.byte(debugLogConstString) + l.w.uvarint(uint64(str.len)) + l.w.uvarint(uint64(uintptr(str.str) - datap.etext)) + } else { + l.w.byte(debugLogString) + var b []byte + bb := (*slice)(unsafe.Pointer(&b)) + bb.array = str.str + bb.len, bb.cap = str.len, str.len + if len(b) > debugLogStringLimit { + b = b[:debugLogStringLimit] + } + l.w.uvarint(uint64(len(b))) + l.w.bytes(b) + if len(b) != len(x) { + l.w.byte(debugLogStringOverflow) + l.w.uvarint(uint64(len(x) - len(b))) + } + } + return l +} + +//go:nosplit +func (l *dlogger) pc(x uintptr) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogPC) + l.w.uvarint(uint64(x)) + return l +} + +//go:nosplit +func (l *dlogger) traceback(x []uintptr) *dlogger { + if !dlogEnabled { + return l + } + l.w.byte(debugLogTraceback) + l.w.uvarint(uint64(len(x))) + for _, pc := range x { + l.w.uvarint(uint64(pc)) + } + return l +} + +// A debugLogWriter is a ring buffer of binary debug log records. +// +// A log record consists of a 2-byte framing header and a sequence of +// fields. The framing header gives the size of the record as a little +// endian 16-bit value. Each field starts with a byte indicating its +// type, followed by type-specific data. If the size in the framing +// header is 0, it's a sync record consisting of two little endian +// 64-bit values giving a new time base. +// +// Because this is a ring buffer, new records will eventually +// overwrite old records. Hence, it maintains a reader that consumes +// the log as it gets overwritten. That reader state is where an +// actual log reader would start. +// +//go:notinheap +type debugLogWriter struct { + write uint64 + data debugLogBuf + + // tick and nano are the time bases from the most recently + // written sync record. + tick, nano uint64 + + // r is a reader that consumes records as they get overwritten + // by the writer. It also acts as the initial reader state + // when printing the log. + r debugLogReader + + // buf is a scratch buffer for encoding. This is here to + // reduce stack usage. + buf [10]byte +} + +//go:notinheap +type debugLogBuf [debugLogBytes]byte + +const ( + // debugLogHeaderSize is the number of bytes in the framing + // header of every dlog record. + debugLogHeaderSize = 2 + + // debugLogSyncSize is the number of bytes in a sync record. + debugLogSyncSize = debugLogHeaderSize + 2*8 +) + +//go:nosplit +func (l *debugLogWriter) ensure(n uint64) { + for l.write+n >= l.r.begin+uint64(len(l.data)) { + // Consume record at begin. + if l.r.skip() == ^uint64(0) { + // Wrapped around within a record. + // + // TODO(austin): It would be better to just + // eat the whole buffer at this point, but we + // have to communicate that to the reader + // somehow. 
+ throw("record wrapped around") + } + } +} + +//go:nosplit +func (l *debugLogWriter) writeFrameAt(pos, size uint64) bool { + l.data[pos%uint64(len(l.data))] = uint8(size) + l.data[(pos+1)%uint64(len(l.data))] = uint8(size >> 8) + return size <= 0xFFFF +} + +//go:nosplit +func (l *debugLogWriter) writeSync(tick, nano uint64) { + l.tick, l.nano = tick, nano + l.ensure(debugLogHeaderSize) + l.writeFrameAt(l.write, 0) + l.write += debugLogHeaderSize + l.writeUint64LE(tick) + l.writeUint64LE(nano) + l.r.end = l.write +} + +//go:nosplit +func (l *debugLogWriter) writeUint64LE(x uint64) { + var b [8]byte + b[0] = byte(x) + b[1] = byte(x >> 8) + b[2] = byte(x >> 16) + b[3] = byte(x >> 24) + b[4] = byte(x >> 32) + b[5] = byte(x >> 40) + b[6] = byte(x >> 48) + b[7] = byte(x >> 56) + l.bytes(b[:]) +} + +//go:nosplit +func (l *debugLogWriter) byte(x byte) { + l.ensure(1) + pos := l.write + l.write++ + l.data[pos%uint64(len(l.data))] = x +} + +//go:nosplit +func (l *debugLogWriter) bytes(x []byte) { + l.ensure(uint64(len(x))) + pos := l.write + l.write += uint64(len(x)) + for len(x) > 0 { + n := copy(l.data[pos%uint64(len(l.data)):], x) + pos += uint64(n) + x = x[n:] + } +} + +//go:nosplit +func (l *debugLogWriter) varint(x int64) { + var u uint64 + if x < 0 { + u = (^uint64(x) << 1) | 1 // complement i, bit 0 is 1 + } else { + u = (uint64(x) << 1) // do not complement i, bit 0 is 0 + } + l.uvarint(u) +} + +//go:nosplit +func (l *debugLogWriter) uvarint(u uint64) { + i := 0 + for u >= 0x80 { + l.buf[i] = byte(u) | 0x80 + u >>= 7 + i++ + } + l.buf[i] = byte(u) + i++ + l.bytes(l.buf[:i]) +} + +type debugLogReader struct { + data *debugLogBuf + + // begin and end are the positions in the log of the beginning + // and end of the log data, modulo len(data). + begin, end uint64 + + // tick and nano are the current time base at begin. + tick, nano uint64 +} + +//go:nosplit +func (r *debugLogReader) skip() uint64 { + // Read size at pos. + if r.begin+debugLogHeaderSize > r.end { + return ^uint64(0) + } + size := uint64(r.readUint16LEAt(r.begin)) + if size == 0 { + // Sync packet. + r.tick = r.readUint64LEAt(r.begin + debugLogHeaderSize) + r.nano = r.readUint64LEAt(r.begin + debugLogHeaderSize + 8) + size = debugLogSyncSize + } + if r.begin+size > r.end { + return ^uint64(0) + } + r.begin += size + return size +} + +//go:nosplit +func (r *debugLogReader) readUint16LEAt(pos uint64) uint16 { + return uint16(r.data[pos%uint64(len(r.data))]) | + uint16(r.data[(pos+1)%uint64(len(r.data))])<<8 +} + +//go:nosplit +func (r *debugLogReader) readUint64LEAt(pos uint64) uint64 { + var b [8]byte + for i := range b { + b[i] = r.data[pos%uint64(len(r.data))] + pos++ + } + return uint64(b[0]) | uint64(b[1])<<8 | + uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | + uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func (r *debugLogReader) peek() (tick uint64) { + // Consume any sync records. + size := uint64(0) + for size == 0 { + if r.begin+debugLogHeaderSize > r.end { + return ^uint64(0) + } + size = uint64(r.readUint16LEAt(r.begin)) + if size != 0 { + break + } + if r.begin+debugLogSyncSize > r.end { + return ^uint64(0) + } + // Sync packet. + r.tick = r.readUint64LEAt(r.begin + debugLogHeaderSize) + r.nano = r.readUint64LEAt(r.begin + debugLogHeaderSize + 8) + r.begin += debugLogSyncSize + } + + // Peek tick delta. 
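+	// Illustrative note (not part of the upstream file): the bytes decoded
+	// here follow the record layout described on debugLogWriter:
+	//
+	//	[2-byte LE size][uvarint tick delta][uvarint nano delta][varint P][fields...]
+	//	size == 0 marks a sync record: [8-byte LE tick][8-byte LE nano]
+	//
+	// so peek only needs to skip the framing header and read the first
+	// uvarint to learn the record's tick.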
+ if r.begin+size > r.end { + return ^uint64(0) + } + pos := r.begin + debugLogHeaderSize + var u uint64 + for i := uint(0); ; i += 7 { + b := r.data[pos%uint64(len(r.data))] + pos++ + u |= uint64(b&^0x80) << i + if b&0x80 == 0 { + break + } + } + if pos > r.begin+size { + return ^uint64(0) + } + return r.tick + u +} + +func (r *debugLogReader) header() (end, tick, nano uint64, p int) { + // Read size. We've already skipped sync packets and checked + // bounds in peek. + size := uint64(r.readUint16LEAt(r.begin)) + end = r.begin + size + r.begin += debugLogHeaderSize + + // Read tick, nano, and p. + tick = r.uvarint() + r.tick + nano = r.uvarint() + r.nano + p = int(r.varint()) + + return +} + +func (r *debugLogReader) uvarint() uint64 { + var u uint64 + for i := uint(0); ; i += 7 { + b := r.data[r.begin%uint64(len(r.data))] + r.begin++ + u |= uint64(b&^0x80) << i + if b&0x80 == 0 { + break + } + } + return u +} + +func (r *debugLogReader) varint() int64 { + u := r.uvarint() + var v int64 + if u&1 == 0 { + v = int64(u >> 1) + } else { + v = ^int64(u >> 1) + } + return v +} + +func (r *debugLogReader) printVal() bool { + typ := r.data[r.begin%uint64(len(r.data))] + r.begin++ + + switch typ { + default: + print("<unknown field type ", hex(typ), " pos ", r.begin-1, " end ", r.end, ">\n") + return false + + case debugLogUnknown: + print("<unknown kind>") + + case debugLogBoolTrue: + print(true) + + case debugLogBoolFalse: + print(false) + + case debugLogInt: + print(r.varint()) + + case debugLogUint: + print(r.uvarint()) + + case debugLogHex, debugLogPtr: + print(hex(r.uvarint())) + + case debugLogString: + sl := r.uvarint() + if r.begin+sl > r.end { + r.begin = r.end + print("<string length corrupted>") + break + } + for sl > 0 { + b := r.data[r.begin%uint64(len(r.data)):] + if uint64(len(b)) > sl { + b = b[:sl] + } + r.begin += uint64(len(b)) + sl -= uint64(len(b)) + gwrite(b) + } + + case debugLogConstString: + len, ptr := int(r.uvarint()), uintptr(r.uvarint()) + ptr += firstmoduledata.etext + str := stringStruct{ + str: unsafe.Pointer(ptr), + len: len, + } + s := *(*string)(unsafe.Pointer(&str)) + print(s) + + case debugLogStringOverflow: + print("..(", r.uvarint(), " more bytes)..") + + case debugLogPC: + printDebugLogPC(uintptr(r.uvarint()), false) + + case debugLogTraceback: + n := int(r.uvarint()) + for i := 0; i < n; i++ { + print("\n\t") + // gentraceback PCs are always return PCs. + // Convert them to call PCs. + // + // TODO(austin): Expand inlined frames. + printDebugLogPC(uintptr(r.uvarint()), true) + } + } + + return true +} + +// printDebugLog prints the debug log. +func printDebugLog() { + if !dlogEnabled { + return + } + + // This function should not panic or throw since it is used in + // the fatal panic path and this may deadlock. + + printlock() + + // Get the list of all debug logs. + allp := (*uintptr)(unsafe.Pointer(&allDloggers)) + all := (*dlogger)(unsafe.Pointer(atomic.Loaduintptr(allp))) + + // Count the logs. + n := 0 + for l := all; l != nil; l = l.allLink { + n++ + } + if n == 0 { + printunlock() + return + } + + // Prepare read state for all logs. 
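+	// Descriptive note (not part of the upstream file): the rest of this
+	// function is a simple k-way merge: each per-M log contributes the
+	// tick of its next record, and the loop below repeatedly prints the
+	// record with the smallest tick until every log is drained.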
+ type readState struct { + debugLogReader + first bool + lost uint64 + nextTick uint64 + } + state1 := sysAlloc(unsafe.Sizeof(readState{})*uintptr(n), nil) + if state1 == nil { + println("failed to allocate read state for", n, "logs") + printunlock() + return + } + state := (*[1 << 20]readState)(state1)[:n] + { + l := all + for i := range state { + s := &state[i] + s.debugLogReader = l.w.r + s.first = true + s.lost = l.w.r.begin + s.nextTick = s.peek() + l = l.allLink + } + } + + // Print records. + for { + // Find the next record. + var best struct { + tick uint64 + i int + } + best.tick = ^uint64(0) + for i := range state { + if state[i].nextTick < best.tick { + best.tick = state[i].nextTick + best.i = i + } + } + if best.tick == ^uint64(0) { + break + } + + // Print record. + s := &state[best.i] + if s.first { + print(">> begin log ", best.i) + if s.lost != 0 { + print("; lost first ", s.lost>>10, "KB") + } + print(" <<\n") + s.first = false + } + + end, _, nano, p := s.header() + oldEnd := s.end + s.end = end + + print("[") + var tmpbuf [21]byte + pnano := int64(nano) - runtimeInitTime + if pnano < 0 { + // Logged before runtimeInitTime was set. + pnano = 0 + } + print(string(itoaDiv(tmpbuf[:], uint64(pnano), 9))) + print(" P ", p, "] ") + + for i := 0; s.begin < s.end; i++ { + if i > 0 { + print(" ") + } + if !s.printVal() { + // Abort this P log. + print("<aborting P log>") + end = oldEnd + break + } + } + println() + + // Move on to the next record. + s.begin = end + s.end = oldEnd + s.nextTick = s.peek() + } + + printunlock() +} + +// printDebugLogPC prints a single symbolized PC. If returnPC is true, +// pc is a return PC that must first be converted to a call PC. +func printDebugLogPC(pc uintptr, returnPC bool) { + fn := findfunc(pc) + if returnPC && (!fn.valid() || pc > fn.entry()) { + // TODO(austin): Don't back up if the previous frame + // was a sigpanic. + pc-- + } + + print(hex(pc)) + if !fn.valid() { + print(" [unknown PC]") + } else { + name := funcname(fn) + file, line := funcline(fn, pc) + print(" [", name, "+", hex(pc-fn.entry()), + " ", file, ":", line, "]") + } +} diff --git a/contrib/go/_std_1.18/src/runtime/debuglog_off.go b/contrib/go/_std_1.18/src/runtime/debuglog_off.go new file mode 100644 index 0000000000..fa3be39c70 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/debuglog_off.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
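+
+// Note (not part of the upstream file): this file is compiled when the
+// "debuglog" build tag is absent, leaving dlogEnabled false so that dlog
+// calls compile away; the facility is enabled with, e.g.,
+//
+//	go build -tags debuglog ./...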
+ +//go:build !debuglog + +package runtime + +const dlogEnabled = false + +type dlogPerM struct{} + +func getCachedDlogger() *dlogger { + return nil +} + +func putCachedDlogger(l *dlogger) bool { + return false +} diff --git a/contrib/go/_std_1.18/src/runtime/defs_darwin_amd64.go b/contrib/go/_std_1.18/src/runtime/defs_darwin_amd64.go new file mode 100644 index 0000000000..cbc26bfcff --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/defs_darwin_amd64.go @@ -0,0 +1,372 @@ +// created by cgo -cdefs and then converted to Go +// cgo -cdefs defs_darwin.go + +package runtime + +import "unsafe" + +const ( + _EINTR = 0x4 + _EFAULT = 0xe + _EAGAIN = 0x23 + _ETIMEDOUT = 0x3c + + _PROT_NONE = 0x0 + _PROT_READ = 0x1 + _PROT_WRITE = 0x2 + _PROT_EXEC = 0x4 + + _MAP_ANON = 0x1000 + _MAP_PRIVATE = 0x2 + _MAP_FIXED = 0x10 + + _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x5 + _MADV_FREE_REUSABLE = 0x7 + _MADV_FREE_REUSE = 0x8 + + _SA_SIGINFO = 0x40 + _SA_RESTART = 0x2 + _SA_ONSTACK = 0x1 + _SA_USERTRAMP = 0x100 + _SA_64REGSET = 0x200 + + _SIGHUP = 0x1 + _SIGINT = 0x2 + _SIGQUIT = 0x3 + _SIGILL = 0x4 + _SIGTRAP = 0x5 + _SIGABRT = 0x6 + _SIGEMT = 0x7 + _SIGFPE = 0x8 + _SIGKILL = 0x9 + _SIGBUS = 0xa + _SIGSEGV = 0xb + _SIGSYS = 0xc + _SIGPIPE = 0xd + _SIGALRM = 0xe + _SIGTERM = 0xf + _SIGURG = 0x10 + _SIGSTOP = 0x11 + _SIGTSTP = 0x12 + _SIGCONT = 0x13 + _SIGCHLD = 0x14 + _SIGTTIN = 0x15 + _SIGTTOU = 0x16 + _SIGIO = 0x17 + _SIGXCPU = 0x18 + _SIGXFSZ = 0x19 + _SIGVTALRM = 0x1a + _SIGPROF = 0x1b + _SIGWINCH = 0x1c + _SIGINFO = 0x1d + _SIGUSR1 = 0x1e + _SIGUSR2 = 0x1f + + _FPE_INTDIV = 0x7 + _FPE_INTOVF = 0x8 + _FPE_FLTDIV = 0x1 + _FPE_FLTOVF = 0x2 + _FPE_FLTUND = 0x3 + _FPE_FLTRES = 0x4 + _FPE_FLTINV = 0x5 + _FPE_FLTSUB = 0x6 + + _BUS_ADRALN = 0x1 + _BUS_ADRERR = 0x2 + _BUS_OBJERR = 0x3 + + _SEGV_MAPERR = 0x1 + _SEGV_ACCERR = 0x2 + + _ITIMER_REAL = 0x0 + _ITIMER_VIRTUAL = 0x1 + _ITIMER_PROF = 0x2 + + _EV_ADD = 0x1 + _EV_DELETE = 0x2 + _EV_CLEAR = 0x20 + _EV_RECEIPT = 0x40 + _EV_ERROR = 0x4000 + _EV_EOF = 0x8000 + _EVFILT_READ = -0x1 + _EVFILT_WRITE = -0x2 + + _PTHREAD_CREATE_DETACHED = 0x2 + + _F_SETFD = 0x2 + _F_GETFL = 0x3 + _F_SETFL = 0x4 + _FD_CLOEXEC = 0x1 + + _O_NONBLOCK = 4 +) + +type stackt struct { + ss_sp *byte + ss_size uintptr + ss_flags int32 + pad_cgo_0 [4]byte +} + +type sigactiont struct { + __sigaction_u [8]byte + sa_tramp unsafe.Pointer + sa_mask uint32 + sa_flags int32 +} + +type usigactiont struct { + __sigaction_u [8]byte + sa_mask uint32 + sa_flags int32 +} + +type siginfo struct { + si_signo int32 + si_errno int32 + si_code int32 + si_pid int32 + si_uid uint32 + si_status int32 + si_addr uint64 + si_value [8]byte + si_band int64 + __pad [7]uint64 +} + +type timeval struct { + tv_sec int64 + tv_usec int32 + pad_cgo_0 [4]byte +} + +func (tv *timeval) set_usec(x int32) { + tv.tv_usec = x +} + +type itimerval struct { + it_interval timeval + it_value timeval +} + +type timespec struct { + tv_sec int64 + tv_nsec int64 +} + +//go:nosplit +func (ts *timespec) setNsec(ns int64) { + ts.tv_sec = ns / 1e9 + ts.tv_nsec = ns % 1e9 +} + +type fpcontrol struct { + pad_cgo_0 [2]byte +} + +type fpstatus struct { + pad_cgo_0 [2]byte +} + +type regmmst struct { + mmst_reg [10]int8 + mmst_rsrv [6]int8 +} + +type regxmm struct { + xmm_reg [16]int8 +} + +type regs64 struct { + rax uint64 + rbx uint64 + rcx uint64 + rdx uint64 + rdi uint64 + rsi uint64 + rbp uint64 + rsp uint64 + r8 uint64 + r9 uint64 + r10 uint64 + r11 uint64 + r12 uint64 + r13 uint64 + r14 uint64 + r15 uint64 + rip uint64 + rflags uint64 + cs uint64 
+ fs uint64 + gs uint64 +} + +type floatstate64 struct { + fpu_reserved [2]int32 + fpu_fcw fpcontrol + fpu_fsw fpstatus + fpu_ftw uint8 + fpu_rsrv1 uint8 + fpu_fop uint16 + fpu_ip uint32 + fpu_cs uint16 + fpu_rsrv2 uint16 + fpu_dp uint32 + fpu_ds uint16 + fpu_rsrv3 uint16 + fpu_mxcsr uint32 + fpu_mxcsrmask uint32 + fpu_stmm0 regmmst + fpu_stmm1 regmmst + fpu_stmm2 regmmst + fpu_stmm3 regmmst + fpu_stmm4 regmmst + fpu_stmm5 regmmst + fpu_stmm6 regmmst + fpu_stmm7 regmmst + fpu_xmm0 regxmm + fpu_xmm1 regxmm + fpu_xmm2 regxmm + fpu_xmm3 regxmm + fpu_xmm4 regxmm + fpu_xmm5 regxmm + fpu_xmm6 regxmm + fpu_xmm7 regxmm + fpu_xmm8 regxmm + fpu_xmm9 regxmm + fpu_xmm10 regxmm + fpu_xmm11 regxmm + fpu_xmm12 regxmm + fpu_xmm13 regxmm + fpu_xmm14 regxmm + fpu_xmm15 regxmm + fpu_rsrv4 [96]int8 + fpu_reserved1 int32 +} + +type exceptionstate64 struct { + trapno uint16 + cpu uint16 + err uint32 + faultvaddr uint64 +} + +type mcontext64 struct { + es exceptionstate64 + ss regs64 + fs floatstate64 + pad_cgo_0 [4]byte +} + +type regs32 struct { + eax uint32 + ebx uint32 + ecx uint32 + edx uint32 + edi uint32 + esi uint32 + ebp uint32 + esp uint32 + ss uint32 + eflags uint32 + eip uint32 + cs uint32 + ds uint32 + es uint32 + fs uint32 + gs uint32 +} + +type floatstate32 struct { + fpu_reserved [2]int32 + fpu_fcw fpcontrol + fpu_fsw fpstatus + fpu_ftw uint8 + fpu_rsrv1 uint8 + fpu_fop uint16 + fpu_ip uint32 + fpu_cs uint16 + fpu_rsrv2 uint16 + fpu_dp uint32 + fpu_ds uint16 + fpu_rsrv3 uint16 + fpu_mxcsr uint32 + fpu_mxcsrmask uint32 + fpu_stmm0 regmmst + fpu_stmm1 regmmst + fpu_stmm2 regmmst + fpu_stmm3 regmmst + fpu_stmm4 regmmst + fpu_stmm5 regmmst + fpu_stmm6 regmmst + fpu_stmm7 regmmst + fpu_xmm0 regxmm + fpu_xmm1 regxmm + fpu_xmm2 regxmm + fpu_xmm3 regxmm + fpu_xmm4 regxmm + fpu_xmm5 regxmm + fpu_xmm6 regxmm + fpu_xmm7 regxmm + fpu_rsrv4 [224]int8 + fpu_reserved1 int32 +} + +type exceptionstate32 struct { + trapno uint16 + cpu uint16 + err uint32 + faultvaddr uint32 +} + +type mcontext32 struct { + es exceptionstate32 + ss regs32 + fs floatstate32 +} + +type ucontext struct { + uc_onstack int32 + uc_sigmask uint32 + uc_stack stackt + uc_link *ucontext + uc_mcsize uint64 + uc_mcontext *mcontext64 +} + +type keventt struct { + ident uint64 + filter int16 + flags uint16 + fflags uint32 + data int64 + udata *byte +} + +type pthread uintptr +type pthreadattr struct { + X__sig int64 + X__opaque [56]int8 +} +type pthreadmutex struct { + X__sig int64 + X__opaque [56]int8 +} +type pthreadmutexattr struct { + X__sig int64 + X__opaque [8]int8 +} +type pthreadcond struct { + X__sig int64 + X__opaque [40]int8 +} +type pthreadcondattr struct { + X__sig int64 + X__opaque [8]int8 +} + +type machTimebaseInfo struct { + numer uint32 + denom uint32 +} diff --git a/contrib/go/_std_1.18/src/runtime/defs_linux_amd64.go b/contrib/go/_std_1.18/src/runtime/defs_linux_amd64.go new file mode 100644 index 0000000000..47fb468621 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/defs_linux_amd64.go @@ -0,0 +1,302 @@ +// created by cgo -cdefs and then converted to Go +// cgo -cdefs defs_linux.go defs1_linux.go + +package runtime + +import "unsafe" + +const ( + _EINTR = 0x4 + _EAGAIN = 0xb + _ENOMEM = 0xc + _ENOSYS = 0x26 + + _PROT_NONE = 0x0 + _PROT_READ = 0x1 + _PROT_WRITE = 0x2 + _PROT_EXEC = 0x4 + + _MAP_ANON = 0x20 + _MAP_PRIVATE = 0x2 + _MAP_FIXED = 0x10 + + _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 + _MADV_HUGEPAGE = 0xe + _MADV_NOHUGEPAGE = 0xf + + _SA_RESTART = 0x10000000 + _SA_ONSTACK = 0x8000000 + _SA_RESTORER = 0x4000000 + 
_SA_SIGINFO = 0x4 + + _SI_KERNEL = 0x80 + _SI_TIMER = -0x2 + + _SIGHUP = 0x1 + _SIGINT = 0x2 + _SIGQUIT = 0x3 + _SIGILL = 0x4 + _SIGTRAP = 0x5 + _SIGABRT = 0x6 + _SIGBUS = 0x7 + _SIGFPE = 0x8 + _SIGKILL = 0x9 + _SIGUSR1 = 0xa + _SIGSEGV = 0xb + _SIGUSR2 = 0xc + _SIGPIPE = 0xd + _SIGALRM = 0xe + _SIGSTKFLT = 0x10 + _SIGCHLD = 0x11 + _SIGCONT = 0x12 + _SIGSTOP = 0x13 + _SIGTSTP = 0x14 + _SIGTTIN = 0x15 + _SIGTTOU = 0x16 + _SIGURG = 0x17 + _SIGXCPU = 0x18 + _SIGXFSZ = 0x19 + _SIGVTALRM = 0x1a + _SIGPROF = 0x1b + _SIGWINCH = 0x1c + _SIGIO = 0x1d + _SIGPWR = 0x1e + _SIGSYS = 0x1f + + _SIGRTMIN = 0x20 + + _FPE_INTDIV = 0x1 + _FPE_INTOVF = 0x2 + _FPE_FLTDIV = 0x3 + _FPE_FLTOVF = 0x4 + _FPE_FLTUND = 0x5 + _FPE_FLTRES = 0x6 + _FPE_FLTINV = 0x7 + _FPE_FLTSUB = 0x8 + + _BUS_ADRALN = 0x1 + _BUS_ADRERR = 0x2 + _BUS_OBJERR = 0x3 + + _SEGV_MAPERR = 0x1 + _SEGV_ACCERR = 0x2 + + _ITIMER_REAL = 0x0 + _ITIMER_VIRTUAL = 0x1 + _ITIMER_PROF = 0x2 + + _CLOCK_THREAD_CPUTIME_ID = 0x3 + + _SIGEV_THREAD_ID = 0x4 + + _EPOLLIN = 0x1 + _EPOLLOUT = 0x4 + _EPOLLERR = 0x8 + _EPOLLHUP = 0x10 + _EPOLLRDHUP = 0x2000 + _EPOLLET = 0x80000000 + _EPOLL_CLOEXEC = 0x80000 + _EPOLL_CTL_ADD = 0x1 + _EPOLL_CTL_DEL = 0x2 + _EPOLL_CTL_MOD = 0x3 + + _AF_UNIX = 0x1 + _SOCK_DGRAM = 0x2 +) + +type timespec struct { + tv_sec int64 + tv_nsec int64 +} + +//go:nosplit +func (ts *timespec) setNsec(ns int64) { + ts.tv_sec = ns / 1e9 + ts.tv_nsec = ns % 1e9 +} + +type timeval struct { + tv_sec int64 + tv_usec int64 +} + +func (tv *timeval) set_usec(x int32) { + tv.tv_usec = int64(x) +} + +type sigactiont struct { + sa_handler uintptr + sa_flags uint64 + sa_restorer uintptr + sa_mask uint64 +} + +type siginfoFields struct { + si_signo int32 + si_errno int32 + si_code int32 + // below here is a union; si_addr is the only field we use + si_addr uint64 +} + +type siginfo struct { + siginfoFields + + // Pad struct to the max size in the kernel. + _ [_si_max_size - unsafe.Sizeof(siginfoFields{})]byte +} + +type itimerspec struct { + it_interval timespec + it_value timespec +} + +type itimerval struct { + it_interval timeval + it_value timeval +} + +type sigeventFields struct { + value uintptr + signo int32 + notify int32 + // below here is a union; sigev_notify_thread_id is the only field we use + sigev_notify_thread_id int32 +} + +type sigevent struct { + sigeventFields + + // Pad struct to the max size in the kernel. 
+ _ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte +} + +type epollevent struct { + events uint32 + data [8]byte // unaligned uintptr +} + +// created by cgo -cdefs and then converted to Go +// cgo -cdefs defs_linux.go defs1_linux.go + +const ( + _O_RDONLY = 0x0 + _O_NONBLOCK = 0x800 + _O_CLOEXEC = 0x80000 +) + +type usigset struct { + __val [16]uint64 +} + +type fpxreg struct { + significand [4]uint16 + exponent uint16 + padding [3]uint16 +} + +type xmmreg struct { + element [4]uint32 +} + +type fpstate struct { + cwd uint16 + swd uint16 + ftw uint16 + fop uint16 + rip uint64 + rdp uint64 + mxcsr uint32 + mxcr_mask uint32 + _st [8]fpxreg + _xmm [16]xmmreg + padding [24]uint32 +} + +type fpxreg1 struct { + significand [4]uint16 + exponent uint16 + padding [3]uint16 +} + +type xmmreg1 struct { + element [4]uint32 +} + +type fpstate1 struct { + cwd uint16 + swd uint16 + ftw uint16 + fop uint16 + rip uint64 + rdp uint64 + mxcsr uint32 + mxcr_mask uint32 + _st [8]fpxreg1 + _xmm [16]xmmreg1 + padding [24]uint32 +} + +type fpreg1 struct { + significand [4]uint16 + exponent uint16 +} + +type stackt struct { + ss_sp *byte + ss_flags int32 + pad_cgo_0 [4]byte + ss_size uintptr +} + +type mcontext struct { + gregs [23]uint64 + fpregs *fpstate + __reserved1 [8]uint64 +} + +type ucontext struct { + uc_flags uint64 + uc_link *ucontext + uc_stack stackt + uc_mcontext mcontext + uc_sigmask usigset + __fpregs_mem fpstate +} + +type sigcontext struct { + r8 uint64 + r9 uint64 + r10 uint64 + r11 uint64 + r12 uint64 + r13 uint64 + r14 uint64 + r15 uint64 + rdi uint64 + rsi uint64 + rbp uint64 + rbx uint64 + rdx uint64 + rax uint64 + rcx uint64 + rsp uint64 + rip uint64 + eflags uint64 + cs uint16 + gs uint16 + fs uint16 + __pad0 uint16 + err uint64 + trapno uint64 + oldmask uint64 + cr2 uint64 + fpstate *fpstate1 + __reserved1 [8]uint64 +} + +type sockaddr_un struct { + family uint16 + path [108]byte +} diff --git a/contrib/go/_std_1.18/src/runtime/duff_amd64.s b/contrib/go/_std_1.18/src/runtime/duff_amd64.s new file mode 100644 index 0000000000..df010f5853 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/duff_amd64.s @@ -0,0 +1,427 @@ +// Code generated by mkduff.go; DO NOT EDIT. +// Run go generate from src/runtime to update. +// See mkduff.go for comments. 
+ +#include "textflag.h" + +TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT, $0-0 + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + MOVUPS X15,(DI) + MOVUPS X15,16(DI) + MOVUPS X15,32(DI) + MOVUPS X15,48(DI) + LEAQ 64(DI),DI + + RET + +TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT, $0-0 + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + 
ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + MOVUPS (SI), X0 + ADDQ $16, SI + MOVUPS X0, (DI) + ADDQ $16, DI + + RET diff --git a/contrib/go/_std_1.18/src/runtime/env_posix.go b/contrib/go/_std_1.18/src/runtime/env_posix.go new file mode 100644 index 0000000000..44086c1d63 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/env_posix.go @@ -0,0 +1,76 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris || windows || plan9 + +package runtime + +import "unsafe" + +func gogetenv(key string) string { + env := environ() + if env == nil { + throw("getenv before env init") + } + for _, s := range env { + if len(s) > len(key) && s[len(key)] == '=' && envKeyEqual(s[:len(key)], key) { + return s[len(key)+1:] + } + } + return "" +} + +// envKeyEqual reports whether a == b, with ASCII-only case insensitivity +// on Windows. The two strings must have the same length. 
+func envKeyEqual(a, b string) bool { + if GOOS == "windows" { // case insensitive + for i := 0; i < len(a); i++ { + ca, cb := a[i], b[i] + if ca == cb || lowerASCII(ca) == lowerASCII(cb) { + continue + } + return false + } + return true + } + return a == b +} + +func lowerASCII(c byte) byte { + if 'A' <= c && c <= 'Z' { + return c + ('a' - 'A') + } + return c +} + +var _cgo_setenv unsafe.Pointer // pointer to C function +var _cgo_unsetenv unsafe.Pointer // pointer to C function + +// Update the C environment if cgo is loaded. +// Called from syscall.Setenv. +//go:linkname syscall_setenv_c syscall.setenv_c +func syscall_setenv_c(k string, v string) { + if _cgo_setenv == nil { + return + } + arg := [2]unsafe.Pointer{cstring(k), cstring(v)} + asmcgocall(_cgo_setenv, unsafe.Pointer(&arg)) +} + +// Update the C environment if cgo is loaded. +// Called from syscall.unsetenv. +//go:linkname syscall_unsetenv_c syscall.unsetenv_c +func syscall_unsetenv_c(k string) { + if _cgo_unsetenv == nil { + return + } + arg := [1]unsafe.Pointer{cstring(k)} + asmcgocall(_cgo_unsetenv, unsafe.Pointer(&arg)) +} + +func cstring(s string) unsafe.Pointer { + p := make([]byte, len(s)+1) + copy(p, s) + return unsafe.Pointer(&p[0]) +} diff --git a/contrib/go/_std_1.18/src/runtime/error.go b/contrib/go/_std_1.18/src/runtime/error.go new file mode 100644 index 0000000000..43114f092e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/error.go @@ -0,0 +1,329 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "internal/bytealg" + +// The Error interface identifies a run time error. +type Error interface { + error + + // RuntimeError is a no-op function but + // serves to distinguish types that are run time + // errors from ordinary errors: a type is a + // run time error if it has a RuntimeError method. + RuntimeError() +} + +// A TypeAssertionError explains a failed type assertion. +type TypeAssertionError struct { + _interface *_type + concrete *_type + asserted *_type + missingMethod string // one method needed by Interface, missing from Concrete +} + +func (*TypeAssertionError) RuntimeError() {} + +func (e *TypeAssertionError) Error() string { + inter := "interface" + if e._interface != nil { + inter = e._interface.string() + } + as := e.asserted.string() + if e.concrete == nil { + return "interface conversion: " + inter + " is nil, not " + as + } + cs := e.concrete.string() + if e.missingMethod == "" { + msg := "interface conversion: " + inter + " is " + cs + ", not " + as + if cs == as { + // provide slightly clearer error message + if e.concrete.pkgpath() != e.asserted.pkgpath() { + msg += " (types from different packages)" + } else { + msg += " (types from different scopes)" + } + } + return msg + } + return "interface conversion: " + cs + " is not " + as + + ": missing method " + e.missingMethod +} + +//go:nosplit +// itoa converts val to a decimal representation. The result is +// written somewhere within buf and the location of the result is returned. +// buf must be at least 20 bytes. +func itoa(buf []byte, val uint64) []byte { + i := len(buf) - 1 + for val >= 10 { + buf[i] = byte(val%10 + '0') + i-- + val /= 10 + } + buf[i] = byte(val + '0') + return buf[i:] +} + +// An errorString represents a runtime error described by a single string. 
+type errorString string + +func (e errorString) RuntimeError() {} + +func (e errorString) Error() string { + return "runtime error: " + string(e) +} + +type errorAddressString struct { + msg string // error message + addr uintptr // memory address where the error occurred +} + +func (e errorAddressString) RuntimeError() {} + +func (e errorAddressString) Error() string { + return "runtime error: " + e.msg +} + +// Addr returns the memory address where a fault occurred. +// The address provided is best-effort. +// The veracity of the result may depend on the platform. +// Errors providing this method will only be returned as +// a result of using runtime/debug.SetPanicOnFault. +func (e errorAddressString) Addr() uintptr { + return e.addr +} + +// plainError represents a runtime error described a string without +// the prefix "runtime error: " after invoking errorString.Error(). +// See Issue #14965. +type plainError string + +func (e plainError) RuntimeError() {} + +func (e plainError) Error() string { + return string(e) +} + +// A boundsError represents an indexing or slicing operation gone wrong. +type boundsError struct { + x int64 + y int + // Values in an index or slice expression can be signed or unsigned. + // That means we'd need 65 bits to encode all possible indexes, from -2^63 to 2^64-1. + // Instead, we keep track of whether x should be interpreted as signed or unsigned. + // y is known to be nonnegative and to fit in an int. + signed bool + code boundsErrorCode +} + +type boundsErrorCode uint8 + +const ( + boundsIndex boundsErrorCode = iota // s[x], 0 <= x < len(s) failed + + boundsSliceAlen // s[?:x], 0 <= x <= len(s) failed + boundsSliceAcap // s[?:x], 0 <= x <= cap(s) failed + boundsSliceB // s[x:y], 0 <= x <= y failed (but boundsSliceA didn't happen) + + boundsSlice3Alen // s[?:?:x], 0 <= x <= len(s) failed + boundsSlice3Acap // s[?:?:x], 0 <= x <= cap(s) failed + boundsSlice3B // s[?:x:y], 0 <= x <= y failed (but boundsSlice3A didn't happen) + boundsSlice3C // s[x:y:?], 0 <= x <= y failed (but boundsSlice3A/B didn't happen) + + boundsConvert // (*[x]T)(s), 0 <= x <= len(s) failed + // Note: in the above, len(s) and cap(s) are stored in y +) + +// boundsErrorFmts provide error text for various out-of-bounds panics. +// Note: if you change these strings, you should adjust the size of the buffer +// in boundsError.Error below as well. +var boundsErrorFmts = [...]string{ + boundsIndex: "index out of range [%x] with length %y", + boundsSliceAlen: "slice bounds out of range [:%x] with length %y", + boundsSliceAcap: "slice bounds out of range [:%x] with capacity %y", + boundsSliceB: "slice bounds out of range [%x:%y]", + boundsSlice3Alen: "slice bounds out of range [::%x] with length %y", + boundsSlice3Acap: "slice bounds out of range [::%x] with capacity %y", + boundsSlice3B: "slice bounds out of range [:%x:%y]", + boundsSlice3C: "slice bounds out of range [%x:%y:]", + boundsConvert: "cannot convert slice with length %y to pointer to array with length %x", +} + +// boundsNegErrorFmts are overriding formats if x is negative. In this case there's no need to report y. 
+var boundsNegErrorFmts = [...]string{ + boundsIndex: "index out of range [%x]", + boundsSliceAlen: "slice bounds out of range [:%x]", + boundsSliceAcap: "slice bounds out of range [:%x]", + boundsSliceB: "slice bounds out of range [%x:]", + boundsSlice3Alen: "slice bounds out of range [::%x]", + boundsSlice3Acap: "slice bounds out of range [::%x]", + boundsSlice3B: "slice bounds out of range [:%x:]", + boundsSlice3C: "slice bounds out of range [%x::]", +} + +func (e boundsError) RuntimeError() {} + +func appendIntStr(b []byte, v int64, signed bool) []byte { + if signed && v < 0 { + b = append(b, '-') + v = -v + } + var buf [20]byte + b = append(b, itoa(buf[:], uint64(v))...) + return b +} + +func (e boundsError) Error() string { + fmt := boundsErrorFmts[e.code] + if e.signed && e.x < 0 { + fmt = boundsNegErrorFmts[e.code] + } + // max message length is 99: "runtime error: slice bounds out of range [::%x] with capacity %y" + // x can be at most 20 characters. y can be at most 19. + b := make([]byte, 0, 100) + b = append(b, "runtime error: "...) + for i := 0; i < len(fmt); i++ { + c := fmt[i] + if c != '%' { + b = append(b, c) + continue + } + i++ + switch fmt[i] { + case 'x': + b = appendIntStr(b, e.x, e.signed) + case 'y': + b = appendIntStr(b, int64(e.y), true) + } + } + return string(b) +} + +type stringer interface { + String() string +} + +// printany prints an argument passed to panic. +// If panic is called with a value that has a String or Error method, +// it has already been converted into a string by preprintpanics. +func printany(i any) { + switch v := i.(type) { + case nil: + print("nil") + case bool: + print(v) + case int: + print(v) + case int8: + print(v) + case int16: + print(v) + case int32: + print(v) + case int64: + print(v) + case uint: + print(v) + case uint8: + print(v) + case uint16: + print(v) + case uint32: + print(v) + case uint64: + print(v) + case uintptr: + print(v) + case float32: + print(v) + case float64: + print(v) + case complex64: + print(v) + case complex128: + print(v) + case string: + print(v) + default: + printanycustomtype(i) + } +} + +func printanycustomtype(i any) { + eface := efaceOf(&i) + typestring := eface._type.string() + + switch eface._type.kind { + case kindString: + print(typestring, `("`, *(*string)(eface.data), `")`) + case kindBool: + print(typestring, "(", *(*bool)(eface.data), ")") + case kindInt: + print(typestring, "(", *(*int)(eface.data), ")") + case kindInt8: + print(typestring, "(", *(*int8)(eface.data), ")") + case kindInt16: + print(typestring, "(", *(*int16)(eface.data), ")") + case kindInt32: + print(typestring, "(", *(*int32)(eface.data), ")") + case kindInt64: + print(typestring, "(", *(*int64)(eface.data), ")") + case kindUint: + print(typestring, "(", *(*uint)(eface.data), ")") + case kindUint8: + print(typestring, "(", *(*uint8)(eface.data), ")") + case kindUint16: + print(typestring, "(", *(*uint16)(eface.data), ")") + case kindUint32: + print(typestring, "(", *(*uint32)(eface.data), ")") + case kindUint64: + print(typestring, "(", *(*uint64)(eface.data), ")") + case kindUintptr: + print(typestring, "(", *(*uintptr)(eface.data), ")") + case kindFloat32: + print(typestring, "(", *(*float32)(eface.data), ")") + case kindFloat64: + print(typestring, "(", *(*float64)(eface.data), ")") + case kindComplex64: + print(typestring, *(*complex64)(eface.data)) + case kindComplex128: + print(typestring, *(*complex128)(eface.data)) + default: + print("(", typestring, ") ", eface.data) + } +} + +// panicwrap generates a panic for a 
call to a wrapped value method +// with a nil pointer receiver. +// +// It is called from the generated wrapper code. +func panicwrap() { + pc := getcallerpc() + name := funcname(findfunc(pc)) + // name is something like "main.(*T).F". + // We want to extract pkg ("main"), typ ("T"), and meth ("F"). + // Do it by finding the parens. + i := bytealg.IndexByteString(name, '(') + if i < 0 { + throw("panicwrap: no ( in " + name) + } + pkg := name[:i-1] + if i+2 >= len(name) || name[i-1:i+2] != ".(*" { + throw("panicwrap: unexpected string after package name: " + name) + } + name = name[i+2:] + i = bytealg.IndexByteString(name, ')') + if i < 0 { + throw("panicwrap: no ) in " + name) + } + if i+2 >= len(name) || name[i:i+2] != ")." { + throw("panicwrap: unexpected string after type name: " + name) + } + typ := name[:i] + meth := name[i+2:] + panic(plainError("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer")) +} diff --git a/contrib/go/_std_1.18/src/runtime/extern.go b/contrib/go/_std_1.18/src/runtime/extern.go new file mode 100644 index 0000000000..f1f6ea5123 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/extern.go @@ -0,0 +1,275 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package runtime contains operations that interact with Go's runtime system, +such as functions to control goroutines. It also includes the low-level type information +used by the reflect package; see reflect's documentation for the programmable +interface to the run-time type system. + +Environment Variables + +The following environment variables ($name or %name%, depending on the host +operating system) control the run-time behavior of Go programs. The meanings +and use may change from release to release. + +The GOGC variable sets the initial garbage collection target percentage. +A collection is triggered when the ratio of freshly allocated data to live data +remaining after the previous collection reaches this percentage. The default +is GOGC=100. Setting GOGC=off disables the garbage collector entirely. +The runtime/debug package's SetGCPercent function allows changing this +percentage at run time. See https://golang.org/pkg/runtime/debug/#SetGCPercent. + +The GODEBUG variable controls debugging variables within the runtime. +It is a comma-separated list of name=val pairs setting these named variables: + + allocfreetrace: setting allocfreetrace=1 causes every allocation to be + profiled and a stack trace printed on each object's allocation and free. + + clobberfree: setting clobberfree=1 causes the garbage collector to + clobber the memory content of an object with bad content when it frees + the object. + + cgocheck: setting cgocheck=0 disables all checks for packages + using cgo to incorrectly pass Go pointers to non-Go code. + Setting cgocheck=1 (the default) enables relatively cheap + checks that may miss some errors. Setting cgocheck=2 enables + expensive checks that should not miss any errors, but will + cause your program to run slower. + + efence: setting efence=1 causes the allocator to run in a mode + where each object is allocated on a unique page and addresses are + never recycled. + + gccheckmark: setting gccheckmark=1 enables verification of the + garbage collector's concurrent mark phase by performing a + second mark pass while the world is stopped. 
If the second + pass finds a reachable object that was not found by concurrent + mark, the garbage collector will panic. + + gcpacertrace: setting gcpacertrace=1 causes the garbage collector to + print information about the internal state of the concurrent pacer. + + gcshrinkstackoff: setting gcshrinkstackoff=1 disables moving goroutines + onto smaller stacks. In this mode, a goroutine's stack can only grow. + + gcstoptheworld: setting gcstoptheworld=1 disables concurrent garbage collection, + making every garbage collection a stop-the-world event. Setting gcstoptheworld=2 + also disables concurrent sweeping after the garbage collection finishes. + + gctrace: setting gctrace=1 causes the garbage collector to emit a single line to standard + error at each collection, summarizing the amount of memory collected and the + length of the pause. The format of this line is subject to change. + Currently, it is: + gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # P + where the fields are as follows: + gc # the GC number, incremented at each GC + @#s time in seconds since program start + #% percentage of time spent in GC since program start + #+...+# wall-clock/CPU times for the phases of the GC + #->#-># MB heap size at GC start, at GC end, and live heap + # MB goal goal heap size + # P number of processors used + The phases are stop-the-world (STW) sweep termination, concurrent + mark and scan, and STW mark termination. The CPU times + for mark/scan are broken down in to assist time (GC performed in + line with allocation), background GC time, and idle GC time. + If the line ends with "(forced)", this GC was forced by a + runtime.GC() call. + + harddecommit: setting harddecommit=1 causes memory that is returned to the OS to + also have protections removed on it. This is the only mode of operation on Windows, + but is helpful in debugging scavenger-related issues on other platforms. Currently, + only supported on Linux. + + inittrace: setting inittrace=1 causes the runtime to emit a single line to standard + error for each package with init work, summarizing the execution time and memory + allocation. No information is printed for inits executed as part of plugin loading + and for packages without both user defined and compiler generated init work. + The format of this line is subject to change. Currently, it is: + init # @#ms, # ms clock, # bytes, # allocs + where the fields are as follows: + init # the package name + @# ms time in milliseconds when the init started since program start + # clock wall-clock time for package initialization work + # bytes memory allocated on the heap + # allocs number of heap allocations + + madvdontneed: setting madvdontneed=0 will use MADV_FREE + instead of MADV_DONTNEED on Linux when returning memory to the + kernel. This is more efficient, but means RSS numbers will + drop only when the OS is under memory pressure. + + memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate. + When set to 0 memory profiling is disabled. Refer to the description of + MemProfileRate for the default value. + + invalidptr: invalidptr=1 (the default) causes the garbage collector and stack + copier to crash the program if an invalid pointer value (for example, 1) + is found in a pointer-typed location. Setting invalidptr=0 disables this check. + This should only be used as a temporary workaround to diagnose buggy code. + The real fix is to not store integers in pointer-typed locations. 
+ + sbrk: setting sbrk=1 replaces the memory allocator and garbage collector + with a trivial allocator that obtains memory from the operating system and + never reclaims any memory. + + scavtrace: setting scavtrace=1 causes the runtime to emit a single line to standard + error, roughly once per GC cycle, summarizing the amount of work done by the + scavenger as well as the total amount of memory returned to the operating system + and an estimate of physical memory utilization. The format of this line is subject + to change, but currently it is: + scav # # KiB work, # KiB total, #% util + where the fields are as follows: + scav # the scavenge cycle number + # KiB work the amount of memory returned to the OS since the last line + # KiB total the total amount of memory returned to the OS + #% util the fraction of all unscavenged memory which is in-use + If the line ends with "(forced)", then scavenging was forced by a + debug.FreeOSMemory() call. + + scheddetail: setting schedtrace=X and scheddetail=1 causes the scheduler to emit + detailed multiline info every X milliseconds, describing state of the scheduler, + processors, threads and goroutines. + + schedtrace: setting schedtrace=X causes the scheduler to emit a single line to standard + error every X milliseconds, summarizing the scheduler state. + + tracebackancestors: setting tracebackancestors=N extends tracebacks with the stacks at + which goroutines were created, where N limits the number of ancestor goroutines to + report. This also extends the information returned by runtime.Stack. Ancestor's goroutine + IDs will refer to the ID of the goroutine at the time of creation; it's possible for this + ID to be reused for another goroutine. Setting N to 0 will report no ancestry information. + + asyncpreemptoff: asyncpreemptoff=1 disables signal-based + asynchronous goroutine preemption. This makes some loops + non-preemptible for long periods, which may delay GC and + goroutine scheduling. This is useful for debugging GC issues + because it also disables the conservative stack scanning used + for asynchronously preempted goroutines. + +The net and net/http packages also refer to debugging variables in GODEBUG. +See the documentation for those packages for details. + +The GOMAXPROCS variable limits the number of operating system threads that +can execute user-level Go code simultaneously. There is no limit to the number of threads +that can be blocked in system calls on behalf of Go code; those do not count against +the GOMAXPROCS limit. This package's GOMAXPROCS function queries and changes +the limit. + +The GORACE variable configures the race detector, for programs built using -race. +See https://golang.org/doc/articles/race_detector.html for details. + +The GOTRACEBACK variable controls the amount of output generated when a Go +program fails due to an unrecovered panic or an unexpected runtime condition. +By default, a failure prints a stack trace for the current goroutine, +eliding functions internal to the run-time system, and then exits with exit code 2. +The failure prints stack traces for all goroutines if there is no current goroutine +or the failure is internal to the run-time. +GOTRACEBACK=none omits the goroutine stack traces entirely. +GOTRACEBACK=single (the default) behaves as described above. +GOTRACEBACK=all adds stack traces for all user-created goroutines. +GOTRACEBACK=system is like ``all'' but adds stack frames for run-time functions +and shows goroutines created internally by the run-time. 
+GOTRACEBACK=crash is like ``system'' but crashes in an operating system-specific +manner instead of exiting. For example, on Unix systems, the crash raises +SIGABRT to trigger a core dump. +For historical reasons, the GOTRACEBACK settings 0, 1, and 2 are synonyms for +none, all, and system, respectively. +The runtime/debug package's SetTraceback function allows increasing the +amount of output at run time, but it cannot reduce the amount below that +specified by the environment variable. +See https://golang.org/pkg/runtime/debug/#SetTraceback. + +The GOARCH, GOOS, GOPATH, and GOROOT environment variables complete +the set of Go environment variables. They influence the building of Go programs +(see https://golang.org/cmd/go and https://golang.org/pkg/go/build). +GOARCH, GOOS, and GOROOT are recorded at compile time and made available by +constants or functions in this package, but they do not influence the execution +of the run-time system. +*/ +package runtime + +import ( + "internal/goarch" + "internal/goos" +) + +// Caller reports file and line number information about function invocations on +// the calling goroutine's stack. The argument skip is the number of stack frames +// to ascend, with 0 identifying the caller of Caller. (For historical reasons the +// meaning of skip differs between Caller and Callers.) The return values report the +// program counter, file name, and line number within the file of the corresponding +// call. The boolean ok is false if it was not possible to recover the information. +func Caller(skip int) (pc uintptr, file string, line int, ok bool) { + rpc := make([]uintptr, 1) + n := callers(skip+1, rpc[:]) + if n < 1 { + return + } + frame, _ := CallersFrames(rpc).Next() + return frame.PC, frame.File, frame.Line, frame.PC != 0 +} + +// Callers fills the slice pc with the return program counters of function invocations +// on the calling goroutine's stack. The argument skip is the number of stack frames +// to skip before recording in pc, with 0 identifying the frame for Callers itself and +// 1 identifying the caller of Callers. +// It returns the number of entries written to pc. +// +// To translate these PCs into symbolic information such as function +// names and line numbers, use CallersFrames. CallersFrames accounts +// for inlined functions and adjusts the return program counters into +// call program counters. Iterating over the returned slice of PCs +// directly is discouraged, as is using FuncForPC on any of the +// returned PCs, since these cannot account for inlining or return +// program counter adjustment. +func Callers(skip int, pc []uintptr) int { + // runtime.callers uses pc.array==nil as a signal + // to print a stack trace. Pick off 0-length pc here + // so that we don't let a nil pc slice get to it. + if len(pc) == 0 { + return 0 + } + return callers(skip, pc) +} + +var defaultGOROOT string // set by cmd/link + +// GOROOT returns the root of the Go tree. It uses the +// GOROOT environment variable, if set at process start, +// or else the root used during the Go build. +func GOROOT() string { + s := gogetenv("GOROOT") + if s != "" { + return s + } + return defaultGOROOT +} + +// buildVersion is the Go tree's version string at build time. +// +// If any GOEXPERIMENTs are set to non-default values, it will include +// "X:<GOEXPERIMENT>". +// +// This is set by the linker. +// +// This is accessed by "go version <binary>". +var buildVersion string + +// Version returns the Go tree's version string. 
+// It is either the commit hash and date at the time of the build or, +// when possible, a release tag like "go1.3". +func Version() string { + return buildVersion +} + +// GOOS is the running program's operating system target: +// one of darwin, freebsd, linux, and so on. +// To view possible combinations of GOOS and GOARCH, run "go tool dist list". +const GOOS string = goos.GOOS + +// GOARCH is the running program's architecture target: +// one of 386, amd64, arm, s390x, and so on. +const GOARCH string = goarch.GOARCH diff --git a/contrib/go/_std_1.18/src/runtime/fastlog2.go b/contrib/go/_std_1.18/src/runtime/fastlog2.go new file mode 100644 index 0000000000..1f251bfaab --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/fastlog2.go @@ -0,0 +1,27 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// fastlog2 implements a fast approximation to the base 2 log of a +// float64. This is used to compute a geometric distribution for heap +// sampling, without introducing dependencies into package math. This +// uses a very rough approximation using the float64 exponent and the +// first 25 bits of the mantissa. The top 5 bits of the mantissa are +// used to load limits from a table of constants and the rest are used +// to scale linearly between them. +func fastlog2(x float64) float64 { + const fastlogScaleBits = 20 + const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits) + + xBits := float64bits(x) + // Extract the exponent from the IEEE float64, and index a constant + // table with the first 10 bits from the mantissa. + xExp := int64((xBits>>52)&0x7FF) - 1023 + xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits) + xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits) + + low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1] + return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio +} diff --git a/contrib/go/_std_1.18/src/runtime/fastlog2table.go b/contrib/go/_std_1.18/src/runtime/fastlog2table.go new file mode 100644 index 0000000000..6ba4a7d3f2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/fastlog2table.go @@ -0,0 +1,43 @@ +// Code generated by mkfastlog2table.go; DO NOT EDIT. +// Run go generate from src/runtime to update. +// See mkfastlog2table.go for comments. + +package runtime + +const fastlogNumBits = 5 + +var fastlog2Table = [1<<fastlogNumBits + 1]float64{ + 0, + 0.0443941193584535, + 0.08746284125033943, + 0.12928301694496647, + 0.16992500144231248, + 0.2094533656289499, + 0.24792751344358555, + 0.28540221886224837, + 0.3219280948873623, + 0.3575520046180837, + 0.39231742277876036, + 0.4262647547020979, + 0.4594316186372973, + 0.4918530963296748, + 0.5235619560570128, + 0.5545888516776374, + 0.5849625007211563, + 0.6147098441152082, + 0.6438561897747247, + 0.6724253419714956, + 0.7004397181410922, + 0.7279204545631992, + 0.7548875021634686, + 0.7813597135246596, + 0.8073549220576042, + 0.8328900141647417, + 0.8579809951275721, + 0.8826430493618412, + 0.9068905956085185, + 0.9307373375628862, + 0.9541963103868752, + 0.9772799234999164, + 1, +} diff --git a/contrib/go/_std_1.18/src/runtime/float.go b/contrib/go/_std_1.18/src/runtime/float.go new file mode 100644 index 0000000000..459e58dd7e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/float.go @@ -0,0 +1,53 @@ +// Copyright 2017 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +var inf = float64frombits(0x7FF0000000000000) + +// isNaN reports whether f is an IEEE 754 ``not-a-number'' value. +func isNaN(f float64) (is bool) { + // IEEE 754 says that only NaNs satisfy f != f. + return f != f +} + +// isFinite reports whether f is neither NaN nor an infinity. +func isFinite(f float64) bool { + return !isNaN(f - f) +} + +// isInf reports whether f is an infinity. +func isInf(f float64) bool { + return !isNaN(f) && !isFinite(f) +} + +// Abs returns the absolute value of x. +// +// Special cases are: +// Abs(±Inf) = +Inf +// Abs(NaN) = NaN +func abs(x float64) float64 { + const sign = 1 << 63 + return float64frombits(float64bits(x) &^ sign) +} + +// copysign returns a value with the magnitude +// of x and the sign of y. +func copysign(x, y float64) float64 { + const sign = 1 << 63 + return float64frombits(float64bits(x)&^sign | float64bits(y)&sign) +} + +// Float64bits returns the IEEE 754 binary representation of f. +func float64bits(f float64) uint64 { + return *(*uint64)(unsafe.Pointer(&f)) +} + +// Float64frombits returns the floating point number corresponding +// the IEEE 754 binary representation b. +func float64frombits(b uint64) float64 { + return *(*float64)(unsafe.Pointer(&b)) +} diff --git a/contrib/go/_std_1.18/src/runtime/funcdata.h b/contrib/go/_std_1.18/src/runtime/funcdata.h new file mode 100644 index 0000000000..2e2bb30446 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/funcdata.h @@ -0,0 +1,56 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file defines the IDs for PCDATA and FUNCDATA instructions +// in Go binaries. It is included by assembly sources, so it must +// be written using #defines. +// +// These must agree with symtab.go and ../cmd/internal/objabi/funcdata.go. + +#define PCDATA_UnsafePoint 0 +#define PCDATA_StackMapIndex 1 +#define PCDATA_InlTreeIndex 2 +#define PCDATA_ArgLiveIndex 3 + +#define FUNCDATA_ArgsPointerMaps 0 /* garbage collector blocks */ +#define FUNCDATA_LocalsPointerMaps 1 +#define FUNCDATA_StackObjects 2 +#define FUNCDATA_InlTree 3 +#define FUNCDATA_OpenCodedDeferInfo 4 /* info for func with open-coded defers */ +#define FUNCDATA_ArgInfo 5 +#define FUNCDATA_ArgLiveInfo 6 +#define FUNCDATA_WrapInfo 7 + +// Pseudo-assembly statements. + +// GO_ARGS, GO_RESULTS_INITIALIZED, and NO_LOCAL_POINTERS are macros +// that communicate to the runtime information about the location and liveness +// of pointers in an assembly function's arguments, results, and stack frame. +// This communication is only required in assembly functions that make calls +// to other functions that might be preempted or grow the stack. +// NOSPLIT functions that make no calls do not need to use these macros. + +// GO_ARGS indicates that the Go prototype for this assembly function +// defines the pointer map for the function's arguments. +// GO_ARGS should be the first instruction in a function that uses it. +// It can be omitted if there are no arguments at all. +// GO_ARGS is inserted implicitly by the linker for any function whose +// name starts with a middle-dot and that also has a Go prototype; it +// is therefore usually not necessary to write explicitly. 
+#define GO_ARGS FUNCDATA $FUNCDATA_ArgsPointerMaps, go_args_stackmap(SB) + +// GO_RESULTS_INITIALIZED indicates that the assembly function +// has initialized the stack space for its results and that those results +// should be considered live for the remainder of the function. +#define GO_RESULTS_INITIALIZED PCDATA $PCDATA_StackMapIndex, $1 + +// NO_LOCAL_POINTERS indicates that the assembly function stores +// no pointers to heap objects in its local stack variables. +#define NO_LOCAL_POINTERS FUNCDATA $FUNCDATA_LocalsPointerMaps, no_pointers_stackmap(SB) + +// ArgsSizeUnknown is set in Func.argsize to mark all functions +// whose argument size is unknown (C vararg functions, and +// assembly code without an explicit specification). +// This value is generated by the compiler, assembler, or linker. +#define ArgsSizeUnknown 0x80000000 diff --git a/contrib/go/_std_1.18/src/runtime/go_tls.h b/contrib/go/_std_1.18/src/runtime/go_tls.h new file mode 100644 index 0000000000..a47e798d9d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/go_tls.h @@ -0,0 +1,17 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifdef GOARCH_arm +#define LR R14 +#endif + +#ifdef GOARCH_amd64 +#define get_tls(r) MOVQ TLS, r +#define g(r) 0(r)(TLS*1) +#endif + +#ifdef GOARCH_386 +#define get_tls(r) MOVL TLS, r +#define g(r) 0(r)(TLS*1) +#endif diff --git a/contrib/go/_std_1.18/src/runtime/hash64.go b/contrib/go/_std_1.18/src/runtime/hash64.go new file mode 100644 index 0000000000..f773eb929c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/hash64.go @@ -0,0 +1,92 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Hashing algorithm inspired by +// wyhash: https://github.com/wangyi-fudan/wyhash + +//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm + +package runtime + +import ( + "runtime/internal/math" + "unsafe" +) + +const ( + m1 = 0xa0761d6478bd642f + m2 = 0xe7037ed1a0b428db + m3 = 0x8ebc6af09c88c6e3 + m4 = 0x589965cc75374cc3 + m5 = 0x1d8e4e27c47d124f +) + +func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr { + var a, b uintptr + seed ^= hashkey[0] ^ m1 + switch { + case s == 0: + return seed + case s < 4: + a = uintptr(*(*byte)(p)) + a |= uintptr(*(*byte)(add(p, s>>1))) << 8 + a |= uintptr(*(*byte)(add(p, s-1))) << 16 + case s == 4: + a = r4(p) + b = a + case s < 8: + a = r4(p) + b = r4(add(p, s-4)) + case s == 8: + a = r8(p) + b = a + case s <= 16: + a = r8(p) + b = r8(add(p, s-8)) + default: + l := s + if l > 48 { + seed1 := seed + seed2 := seed + for ; l > 48; l -= 48 { + seed = mix(r8(p)^m2, r8(add(p, 8))^seed) + seed1 = mix(r8(add(p, 16))^m3, r8(add(p, 24))^seed1) + seed2 = mix(r8(add(p, 32))^m4, r8(add(p, 40))^seed2) + p = add(p, 48) + } + seed ^= seed1 ^ seed2 + } + for ; l > 16; l -= 16 { + seed = mix(r8(p)^m2, r8(add(p, 8))^seed) + p = add(p, 16) + } + a = r8(add(p, l-16)) + b = r8(add(p, l-8)) + } + + return mix(m5^s, mix(a^m2, b^seed)) +} + +func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr { + a := r4(p) + return mix(m5^4, mix(a^m2, a^seed^hashkey[0]^m1)) +} + +func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr { + a := r8(p) + return mix(m5^8, mix(a^m2, a^seed^hashkey[0]^m1)) +} + +func mix(a, b uintptr) uintptr { + hi, lo := math.Mul64(uint64(a), uint64(b)) + return uintptr(hi ^ lo) +} + +func r4(p unsafe.Pointer) uintptr { + return uintptr(readUnaligned32(p)) +} + +func r8(p unsafe.Pointer) uintptr { + return uintptr(readUnaligned64(p)) +} diff --git a/contrib/go/_std_1.18/src/runtime/heapdump.go b/contrib/go/_std_1.18/src/runtime/heapdump.go new file mode 100644 index 0000000000..871637a09e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/heapdump.go @@ -0,0 +1,757 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Implementation of runtime/debug.WriteHeapDump. Writes all +// objects in the heap plus additional info (roots, threads, +// finalizers, etc.) to a file. + +// The format of the dumped file is described at +// https://golang.org/s/go15heapdump. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +//go:linkname runtime_debug_WriteHeapDump runtime/debug.WriteHeapDump +func runtime_debug_WriteHeapDump(fd uintptr) { + stopTheWorld("write heap dump") + + // Keep m on this G's stack instead of the system stack. + // Both readmemstats_m and writeheapdump_m have pretty large + // peak stack depths and we risk blowing the system stack. + // This is safe because the world is stopped, so we don't + // need to worry about anyone shrinking and therefore moving + // our stack. + var m MemStats + systemstack(func() { + // Call readmemstats_m here instead of deeper in + // writeheapdump_m because we might blow the system stack + // otherwise. 
+ readmemstats_m(&m) + writeheapdump_m(fd, &m) + }) + + startTheWorld() +} + +const ( + fieldKindEol = 0 + fieldKindPtr = 1 + fieldKindIface = 2 + fieldKindEface = 3 + tagEOF = 0 + tagObject = 1 + tagOtherRoot = 2 + tagType = 3 + tagGoroutine = 4 + tagStackFrame = 5 + tagParams = 6 + tagFinalizer = 7 + tagItab = 8 + tagOSThread = 9 + tagMemStats = 10 + tagQueuedFinalizer = 11 + tagData = 12 + tagBSS = 13 + tagDefer = 14 + tagPanic = 15 + tagMemProf = 16 + tagAllocSample = 17 +) + +var dumpfd uintptr // fd to write the dump to. +var tmpbuf []byte + +// buffer of pending write data +const ( + bufSize = 4096 +) + +var buf [bufSize]byte +var nbuf uintptr + +func dwrite(data unsafe.Pointer, len uintptr) { + if len == 0 { + return + } + if nbuf+len <= bufSize { + copy(buf[nbuf:], (*[bufSize]byte)(data)[:len]) + nbuf += len + return + } + + write(dumpfd, unsafe.Pointer(&buf), int32(nbuf)) + if len >= bufSize { + write(dumpfd, data, int32(len)) + nbuf = 0 + } else { + copy(buf[:], (*[bufSize]byte)(data)[:len]) + nbuf = len + } +} + +func dwritebyte(b byte) { + dwrite(unsafe.Pointer(&b), 1) +} + +func flush() { + write(dumpfd, unsafe.Pointer(&buf), int32(nbuf)) + nbuf = 0 +} + +// Cache of types that have been serialized already. +// We use a type's hash field to pick a bucket. +// Inside a bucket, we keep a list of types that +// have been serialized so far, most recently used first. +// Note: when a bucket overflows we may end up +// serializing a type more than once. That's ok. +const ( + typeCacheBuckets = 256 + typeCacheAssoc = 4 +) + +type typeCacheBucket struct { + t [typeCacheAssoc]*_type +} + +var typecache [typeCacheBuckets]typeCacheBucket + +// dump a uint64 in a varint format parseable by encoding/binary +func dumpint(v uint64) { + var buf [10]byte + var n int + for v >= 0x80 { + buf[n] = byte(v | 0x80) + n++ + v >>= 7 + } + buf[n] = byte(v) + n++ + dwrite(unsafe.Pointer(&buf), uintptr(n)) +} + +func dumpbool(b bool) { + if b { + dumpint(1) + } else { + dumpint(0) + } +} + +// dump varint uint64 length followed by memory contents +func dumpmemrange(data unsafe.Pointer, len uintptr) { + dumpint(uint64(len)) + dwrite(data, len) +} + +func dumpslice(b []byte) { + dumpint(uint64(len(b))) + if len(b) > 0 { + dwrite(unsafe.Pointer(&b[0]), uintptr(len(b))) + } +} + +func dumpstr(s string) { + sp := stringStructOf(&s) + dumpmemrange(sp.str, uintptr(sp.len)) +} + +// dump information for a type +func dumptype(t *_type) { + if t == nil { + return + } + + // If we've definitely serialized the type before, + // no need to do it again. + b := &typecache[t.hash&(typeCacheBuckets-1)] + if t == b.t[0] { + return + } + for i := 1; i < typeCacheAssoc; i++ { + if t == b.t[i] { + // Move-to-front + for j := i; j > 0; j-- { + b.t[j] = b.t[j-1] + } + b.t[0] = t + return + } + } + + // Might not have been dumped yet. Dump it and + // remember we did so. 
+ for j := typeCacheAssoc - 1; j > 0; j-- { + b.t[j] = b.t[j-1] + } + b.t[0] = t + + // dump the type + dumpint(tagType) + dumpint(uint64(uintptr(unsafe.Pointer(t)))) + dumpint(uint64(t.size)) + if x := t.uncommon(); x == nil || t.nameOff(x.pkgpath).name() == "" { + dumpstr(t.string()) + } else { + pkgpathstr := t.nameOff(x.pkgpath).name() + pkgpath := stringStructOf(&pkgpathstr) + namestr := t.name() + name := stringStructOf(&namestr) + dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len))) + dwrite(pkgpath.str, uintptr(pkgpath.len)) + dwritebyte('.') + dwrite(name.str, uintptr(name.len)) + } + dumpbool(t.kind&kindDirectIface == 0 || t.ptrdata != 0) +} + +// dump an object +func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) { + dumpint(tagObject) + dumpint(uint64(uintptr(obj))) + dumpmemrange(obj, size) + dumpfields(bv) +} + +func dumpotherroot(description string, to unsafe.Pointer) { + dumpint(tagOtherRoot) + dumpstr(description) + dumpint(uint64(uintptr(to))) +} + +func dumpfinalizer(obj unsafe.Pointer, fn *funcval, fint *_type, ot *ptrtype) { + dumpint(tagFinalizer) + dumpint(uint64(uintptr(obj))) + dumpint(uint64(uintptr(unsafe.Pointer(fn)))) + dumpint(uint64(uintptr(unsafe.Pointer(fn.fn)))) + dumpint(uint64(uintptr(unsafe.Pointer(fint)))) + dumpint(uint64(uintptr(unsafe.Pointer(ot)))) +} + +type childInfo struct { + // Information passed up from the callee frame about + // the layout of the outargs region. + argoff uintptr // where the arguments start in the frame + arglen uintptr // size of args region + args bitvector // if args.n >= 0, pointer map of args region + sp *uint8 // callee sp + depth uintptr // depth in call stack (0 == most recent) +} + +// dump kinds & offsets of interesting fields in bv +func dumpbv(cbv *bitvector, offset uintptr) { + for i := uintptr(0); i < uintptr(cbv.n); i++ { + if cbv.ptrbit(i) == 1 { + dumpint(fieldKindPtr) + dumpint(uint64(offset + i*goarch.PtrSize)) + } + } +} + +func dumpframe(s *stkframe, arg unsafe.Pointer) bool { + child := (*childInfo)(arg) + f := s.fn + + // Figure out what we can about our stack map + pc := s.pc + pcdata := int32(-1) // Use the entry map at function entry + if pc != f.entry() { + pc-- + pcdata = pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil) + } + if pcdata == -1 { + // We do not have a valid pcdata value but there might be a + // stackmap for this function. It is likely that we are looking + // at the function prologue, assume so and hope for the best. + pcdata = 0 + } + stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) + + var bv bitvector + if stkmap != nil && stkmap.n > 0 { + bv = stackmapdata(stkmap, pcdata) + } else { + bv.n = -1 + } + + // Dump main body of stack frame. 
+ dumpint(tagStackFrame) + dumpint(uint64(s.sp)) // lowest address in frame + dumpint(uint64(child.depth)) // # of frames deep on the stack + dumpint(uint64(uintptr(unsafe.Pointer(child.sp)))) // sp of child, or 0 if bottom of stack + dumpmemrange(unsafe.Pointer(s.sp), s.fp-s.sp) // frame contents + dumpint(uint64(f.entry())) + dumpint(uint64(s.pc)) + dumpint(uint64(s.continpc)) + name := funcname(f) + if name == "" { + name = "unknown function" + } + dumpstr(name) + + // Dump fields in the outargs section + if child.args.n >= 0 { + dumpbv(&child.args, child.argoff) + } else { + // conservative - everything might be a pointer + for off := child.argoff; off < child.argoff+child.arglen; off += goarch.PtrSize { + dumpint(fieldKindPtr) + dumpint(uint64(off)) + } + } + + // Dump fields in the local vars section + if stkmap == nil { + // No locals information, dump everything. + for off := child.arglen; off < s.varp-s.sp; off += goarch.PtrSize { + dumpint(fieldKindPtr) + dumpint(uint64(off)) + } + } else if stkmap.n < 0 { + // Locals size information, dump just the locals. + size := uintptr(-stkmap.n) + for off := s.varp - size - s.sp; off < s.varp-s.sp; off += goarch.PtrSize { + dumpint(fieldKindPtr) + dumpint(uint64(off)) + } + } else if stkmap.n > 0 { + // Locals bitmap information, scan just the pointers in + // locals. + dumpbv(&bv, s.varp-uintptr(bv.n)*goarch.PtrSize-s.sp) + } + dumpint(fieldKindEol) + + // Record arg info for parent. + child.argoff = s.argp - s.fp + child.arglen = s.arglen + child.sp = (*uint8)(unsafe.Pointer(s.sp)) + child.depth++ + stkmap = (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps)) + if stkmap != nil { + child.args = stackmapdata(stkmap, pcdata) + } else { + child.args.n = -1 + } + return true +} + +func dumpgoroutine(gp *g) { + var sp, pc, lr uintptr + if gp.syscallsp != 0 { + sp = gp.syscallsp + pc = gp.syscallpc + lr = 0 + } else { + sp = gp.sched.sp + pc = gp.sched.pc + lr = gp.sched.lr + } + + dumpint(tagGoroutine) + dumpint(uint64(uintptr(unsafe.Pointer(gp)))) + dumpint(uint64(sp)) + dumpint(uint64(gp.goid)) + dumpint(uint64(gp.gopc)) + dumpint(uint64(readgstatus(gp))) + dumpbool(isSystemGoroutine(gp, false)) + dumpbool(false) // isbackground + dumpint(uint64(gp.waitsince)) + dumpstr(gp.waitreason.String()) + dumpint(uint64(uintptr(gp.sched.ctxt))) + dumpint(uint64(uintptr(unsafe.Pointer(gp.m)))) + dumpint(uint64(uintptr(unsafe.Pointer(gp._defer)))) + dumpint(uint64(uintptr(unsafe.Pointer(gp._panic)))) + + // dump stack + var child childInfo + child.args.n = -1 + child.arglen = 0 + child.sp = nil + child.depth = 0 + gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, dumpframe, noescape(unsafe.Pointer(&child)), 0) + + // dump defer & panic records + for d := gp._defer; d != nil; d = d.link { + dumpint(tagDefer) + dumpint(uint64(uintptr(unsafe.Pointer(d)))) + dumpint(uint64(uintptr(unsafe.Pointer(gp)))) + dumpint(uint64(d.sp)) + dumpint(uint64(d.pc)) + fn := *(**funcval)(unsafe.Pointer(&d.fn)) + dumpint(uint64(uintptr(unsafe.Pointer(fn)))) + if d.fn == nil { + // d.fn can be nil for open-coded defers + dumpint(uint64(0)) + } else { + dumpint(uint64(uintptr(unsafe.Pointer(fn.fn)))) + } + dumpint(uint64(uintptr(unsafe.Pointer(d.link)))) + } + for p := gp._panic; p != nil; p = p.link { + dumpint(tagPanic) + dumpint(uint64(uintptr(unsafe.Pointer(p)))) + dumpint(uint64(uintptr(unsafe.Pointer(gp)))) + eface := efaceOf(&p.arg) + dumpint(uint64(uintptr(unsafe.Pointer(eface._type)))) + dumpint(uint64(uintptr(unsafe.Pointer(eface.data)))) + dumpint(0) // was p->defer, 
no longer recorded + dumpint(uint64(uintptr(unsafe.Pointer(p.link)))) + } +} + +func dumpgs() { + assertWorldStopped() + + // goroutines & stacks + forEachG(func(gp *g) { + status := readgstatus(gp) // The world is stopped so gp will not be in a scan state. + switch status { + default: + print("runtime: unexpected G.status ", hex(status), "\n") + throw("dumpgs in STW - bad status") + case _Gdead: + // ok + case _Grunnable, + _Gsyscall, + _Gwaiting: + dumpgoroutine(gp) + } + }) +} + +func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, ot *ptrtype) { + dumpint(tagQueuedFinalizer) + dumpint(uint64(uintptr(obj))) + dumpint(uint64(uintptr(unsafe.Pointer(fn)))) + dumpint(uint64(uintptr(unsafe.Pointer(fn.fn)))) + dumpint(uint64(uintptr(unsafe.Pointer(fint)))) + dumpint(uint64(uintptr(unsafe.Pointer(ot)))) +} + +func dumproots() { + // To protect mheap_.allspans. + assertWorldStopped() + + // TODO(mwhudson): dump datamask etc from all objects + // data segment + dumpint(tagData) + dumpint(uint64(firstmoduledata.data)) + dumpmemrange(unsafe.Pointer(firstmoduledata.data), firstmoduledata.edata-firstmoduledata.data) + dumpfields(firstmoduledata.gcdatamask) + + // bss segment + dumpint(tagBSS) + dumpint(uint64(firstmoduledata.bss)) + dumpmemrange(unsafe.Pointer(firstmoduledata.bss), firstmoduledata.ebss-firstmoduledata.bss) + dumpfields(firstmoduledata.gcbssmask) + + // mspan.types + for _, s := range mheap_.allspans { + if s.state.get() == mSpanInUse { + // Finalizers + for sp := s.specials; sp != nil; sp = sp.next { + if sp.kind != _KindSpecialFinalizer { + continue + } + spf := (*specialfinalizer)(unsafe.Pointer(sp)) + p := unsafe.Pointer(s.base() + uintptr(spf.special.offset)) + dumpfinalizer(p, spf.fn, spf.fint, spf.ot) + } + } + } + + // Finalizer queue + iterate_finq(finq_callback) +} + +// Bit vector of free marks. +// Needs to be as big as the largest number of objects per span. +var freemark [_PageSize / 8]bool + +func dumpobjs() { + // To protect mheap_.allspans. 
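+ // The world is stopped, so neither the span list nor the objects
+ // inside the spans can change while they are walked below.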
+ assertWorldStopped() + + for _, s := range mheap_.allspans { + if s.state.get() != mSpanInUse { + continue + } + p := s.base() + size := s.elemsize + n := (s.npages << _PageShift) / size + if n > uintptr(len(freemark)) { + throw("freemark array doesn't have enough entries") + } + + for freeIndex := uintptr(0); freeIndex < s.nelems; freeIndex++ { + if s.isFree(freeIndex) { + freemark[freeIndex] = true + } + } + + for j := uintptr(0); j < n; j, p = j+1, p+size { + if freemark[j] { + freemark[j] = false + continue + } + dumpobj(unsafe.Pointer(p), size, makeheapobjbv(p, size)) + } + } +} + +func dumpparams() { + dumpint(tagParams) + x := uintptr(1) + if *(*byte)(unsafe.Pointer(&x)) == 1 { + dumpbool(false) // little-endian ptrs + } else { + dumpbool(true) // big-endian ptrs + } + dumpint(goarch.PtrSize) + var arenaStart, arenaEnd uintptr + for i1 := range mheap_.arenas { + if mheap_.arenas[i1] == nil { + continue + } + for i, ha := range mheap_.arenas[i1] { + if ha == nil { + continue + } + base := arenaBase(arenaIdx(i1)<<arenaL1Shift | arenaIdx(i)) + if arenaStart == 0 || base < arenaStart { + arenaStart = base + } + if base+heapArenaBytes > arenaEnd { + arenaEnd = base + heapArenaBytes + } + } + } + dumpint(uint64(arenaStart)) + dumpint(uint64(arenaEnd)) + dumpstr(goarch.GOARCH) + dumpstr(buildVersion) + dumpint(uint64(ncpu)) +} + +func itab_callback(tab *itab) { + t := tab._type + dumptype(t) + dumpint(tagItab) + dumpint(uint64(uintptr(unsafe.Pointer(tab)))) + dumpint(uint64(uintptr(unsafe.Pointer(t)))) +} + +func dumpitabs() { + iterate_itabs(itab_callback) +} + +func dumpms() { + for mp := allm; mp != nil; mp = mp.alllink { + dumpint(tagOSThread) + dumpint(uint64(uintptr(unsafe.Pointer(mp)))) + dumpint(uint64(mp.id)) + dumpint(mp.procid) + } +} + +//go:systemstack +func dumpmemstats(m *MemStats) { + assertWorldStopped() + + // These ints should be identical to the exported + // MemStats structure and should be ordered the same + // way too. + dumpint(tagMemStats) + dumpint(m.Alloc) + dumpint(m.TotalAlloc) + dumpint(m.Sys) + dumpint(m.Lookups) + dumpint(m.Mallocs) + dumpint(m.Frees) + dumpint(m.HeapAlloc) + dumpint(m.HeapSys) + dumpint(m.HeapIdle) + dumpint(m.HeapInuse) + dumpint(m.HeapReleased) + dumpint(m.HeapObjects) + dumpint(m.StackInuse) + dumpint(m.StackSys) + dumpint(m.MSpanInuse) + dumpint(m.MSpanSys) + dumpint(m.MCacheInuse) + dumpint(m.MCacheSys) + dumpint(m.BuckHashSys) + dumpint(m.GCSys) + dumpint(m.OtherSys) + dumpint(m.NextGC) + dumpint(m.LastGC) + dumpint(m.PauseTotalNs) + for i := 0; i < 256; i++ { + dumpint(m.PauseNs[i]) + } + dumpint(uint64(m.NumGC)) +} + +func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) { + stk := (*[100000]uintptr)(unsafe.Pointer(pstk)) + dumpint(tagMemProf) + dumpint(uint64(uintptr(unsafe.Pointer(b)))) + dumpint(uint64(size)) + dumpint(uint64(nstk)) + for i := uintptr(0); i < nstk; i++ { + pc := stk[i] + f := findfunc(pc) + if !f.valid() { + var buf [64]byte + n := len(buf) + n-- + buf[n] = ')' + if pc == 0 { + n-- + buf[n] = '0' + } else { + for pc > 0 { + n-- + buf[n] = "0123456789abcdef"[pc&15] + pc >>= 4 + } + } + n-- + buf[n] = 'x' + n-- + buf[n] = '0' + n-- + buf[n] = '(' + dumpslice(buf[n:]) + dumpstr("?") + dumpint(0) + } else { + dumpstr(funcname(f)) + if i > 0 && pc > f.entry() { + pc-- + } + file, line := funcline(f, pc) + dumpstr(file) + dumpint(uint64(line)) + } + } + dumpint(uint64(allocs)) + dumpint(uint64(frees)) +} + +func dumpmemprof() { + // To protect mheap_.allspans. 
+ assertWorldStopped() + + iterate_memprof(dumpmemprof_callback) + for _, s := range mheap_.allspans { + if s.state.get() != mSpanInUse { + continue + } + for sp := s.specials; sp != nil; sp = sp.next { + if sp.kind != _KindSpecialProfile { + continue + } + spp := (*specialprofile)(unsafe.Pointer(sp)) + p := s.base() + uintptr(spp.special.offset) + dumpint(tagAllocSample) + dumpint(uint64(p)) + dumpint(uint64(uintptr(unsafe.Pointer(spp.b)))) + } + } +} + +var dumphdr = []byte("go1.7 heap dump\n") + +func mdump(m *MemStats) { + assertWorldStopped() + + // make sure we're done sweeping + for _, s := range mheap_.allspans { + if s.state.get() == mSpanInUse { + s.ensureSwept() + } + } + memclrNoHeapPointers(unsafe.Pointer(&typecache), unsafe.Sizeof(typecache)) + dwrite(unsafe.Pointer(&dumphdr[0]), uintptr(len(dumphdr))) + dumpparams() + dumpitabs() + dumpobjs() + dumpgs() + dumpms() + dumproots() + dumpmemstats(m) + dumpmemprof() + dumpint(tagEOF) + flush() +} + +func writeheapdump_m(fd uintptr, m *MemStats) { + assertWorldStopped() + + _g_ := getg() + casgstatus(_g_.m.curg, _Grunning, _Gwaiting) + _g_.waitreason = waitReasonDumpingHeap + + // Update stats so we can dump them. + // As a side effect, flushes all the mcaches so the mspan.freelist + // lists contain all the free objects. + updatememstats() + + // Set dump file. + dumpfd = fd + + // Call dump routine. + mdump(m) + + // Reset dump file. + dumpfd = 0 + if tmpbuf != nil { + sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys) + tmpbuf = nil + } + + casgstatus(_g_.m.curg, _Gwaiting, _Grunning) +} + +// dumpint() the kind & offset of each field in an object. +func dumpfields(bv bitvector) { + dumpbv(&bv, 0) + dumpint(fieldKindEol) +} + +func makeheapobjbv(p uintptr, size uintptr) bitvector { + // Extend the temp buffer if necessary. + nptr := size / goarch.PtrSize + if uintptr(len(tmpbuf)) < nptr/8+1 { + if tmpbuf != nil { + sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys) + } + n := nptr/8 + 1 + p := sysAlloc(n, &memstats.other_sys) + if p == nil { + throw("heapdump: out of memory") + } + tmpbuf = (*[1 << 30]byte)(p)[:n] + } + // Convert heap bitmap to pointer bitmap. + for i := uintptr(0); i < nptr/8+1; i++ { + tmpbuf[i] = 0 + } + i := uintptr(0) + hbits := heapBitsForAddr(p) + for ; i < nptr; i++ { + if !hbits.morePointers() { + break // end of object + } + if hbits.isPointer() { + tmpbuf[i/8] |= 1 << (i % 8) + } + hbits = hbits.next() + } + return bitvector{int32(i), &tmpbuf[0]} +} diff --git a/contrib/go/_std_1.18/src/runtime/histogram.go b/contrib/go/_std_1.18/src/runtime/histogram.go new file mode 100644 index 0000000000..cd7e29a8c8 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/histogram.go @@ -0,0 +1,170 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +const ( + // For the time histogram type, we use an HDR histogram. + // Values are placed in super-buckets based solely on the most + // significant set bit. Thus, super-buckets are power-of-2 sized. + // Values are then placed into sub-buckets based on the value of + // the next timeHistSubBucketBits most significant bits. Thus, + // sub-buckets are linear within a super-bucket. + // + // Therefore, the number of sub-buckets (timeHistNumSubBuckets) + // defines the error. 
This error may be computed as + // 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets + // per super-bucket the error is approximately 6%. + // + // The number of super-buckets (timeHistNumSuperBuckets), on the + // other hand, defines the range. To reserve room for sub-buckets, + // bit timeHistSubBucketBits is the first bit considered for + // super-buckets, so super-bucket indices are adjusted accordingly. + // + // As an example, consider 45 super-buckets with 16 sub-buckets. + // + // 00110 + // ^---- + // │ ^ + // │ └---- Lowest 4 bits -> sub-bucket 6 + // └------- Bit 4 unset -> super-bucket 0 + // + // 10110 + // ^---- + // │ ^ + // │ └---- Next 4 bits -> sub-bucket 6 + // └------- Bit 4 set -> super-bucket 1 + // 100010 + // ^----^ + // │ ^ └-- Lower bits ignored + // │ └---- Next 4 bits -> sub-bucket 1 + // └------- Bit 5 set -> super-bucket 2 + // + // Following this pattern, super-bucket 44 will have the bit 47 set. We don't + // have any buckets for higher values, so the highest sub-bucket will + // contain values of 2^48-1 nanoseconds or approx. 3 days. This range is + // more than enough to handle durations produced by the runtime. + timeHistSubBucketBits = 4 + timeHistNumSubBuckets = 1 << timeHistSubBucketBits + timeHistNumSuperBuckets = 45 + timeHistTotalBuckets = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1 +) + +// timeHistogram represents a distribution of durations in +// nanoseconds. +// +// The accuracy and range of the histogram is defined by the +// timeHistSubBucketBits and timeHistNumSuperBuckets constants. +// +// It is an HDR histogram with exponentially-distributed +// buckets and linearly distributed sub-buckets. +// +// Counts in the histogram are updated atomically, so it is safe +// for concurrent use. It is also safe to read all the values +// atomically. +type timeHistogram struct { + counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64 + + // underflow counts all the times we got a negative duration + // sample. Because of how time works on some platforms, it's + // possible to measure negative durations. We could ignore them, + // but we record them anyway because it's better to have some + // signal that it's happening than just missing samples. + underflow uint64 +} + +// record adds the given duration to the distribution. +// +// Disallow preemptions and stack growths because this function +// may run in sensitive locations. +//go:nosplit +func (h *timeHistogram) record(duration int64) { + if duration < 0 { + atomic.Xadd64(&h.underflow, 1) + return + } + // The index of the exponential bucket is just the index + // of the highest set bit adjusted for how many bits we + // use for the subbucket. Note that it's timeHistSubBucketsBits-1 + // because we use the 0th bucket to hold values < timeHistNumSubBuckets. + var superBucket, subBucket uint + if duration >= timeHistNumSubBuckets { + // At this point, we know the duration value will always be + // at least timeHistSubBucketsBits long. + superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits + if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) { + // The bucket index we got is larger than what we support, so + // include this count in the highest bucket, which extends to + // infinity. + superBucket = timeHistNumSuperBuckets - 1 + subBucket = timeHistNumSubBuckets - 1 + } else { + // The linear subbucket index is just the timeHistSubBucketsBits + // bits after the top bit. 
To extract that value, shift down + // the duration such that we leave the top bit and the next bits + // intact, then extract the index. + subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets) + } + } else { + subBucket = uint(duration) + } + atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1) +} + +const ( + fInf = 0x7FF0000000000000 + fNegInf = 0xFFF0000000000000 +) + +func float64Inf() float64 { + inf := uint64(fInf) + return *(*float64)(unsafe.Pointer(&inf)) +} + +func float64NegInf() float64 { + inf := uint64(fNegInf) + return *(*float64)(unsafe.Pointer(&inf)) +} + +// timeHistogramMetricsBuckets generates a slice of boundaries for +// the timeHistogram. These boundaries are represented in seconds, +// not nanoseconds like the timeHistogram represents durations. +func timeHistogramMetricsBuckets() []float64 { + b := make([]float64, timeHistTotalBuckets+1) + b[0] = float64NegInf() + // Super-bucket 0 has no bits above timeHistSubBucketBits + // set, so just iterate over each bucket and assign the + // incrementing bucket. + for i := 0; i < timeHistNumSubBuckets; i++ { + bucketNanos := uint64(i) + b[i+1] = float64(bucketNanos) / 1e9 + } + // Generate the rest of the super-buckets. It's easier to reason + // about if we cut out the 0'th bucket, so subtract one since + // we just handled that bucket. + for i := 0; i < timeHistNumSuperBuckets-1; i++ { + for j := 0; j < timeHistNumSubBuckets; j++ { + // Set the super-bucket bit. + bucketNanos := uint64(1) << (i + timeHistSubBucketBits) + // Set the sub-bucket bits. + bucketNanos |= uint64(j) << i + // The index for this bucket is going to be the (i+1)'th super bucket + // (note that we're starting from zero, but handled the first super-bucket + // earlier, so we need to compensate), and the j'th sub bucket. + // Add 1 because we left space for -Inf. + bucketIndex := (i+1)*timeHistNumSubBuckets + j + 1 + // Convert nanoseconds to seconds via a division. + // These values will all be exactly representable by a float64. + b[bucketIndex] = float64(bucketNanos) / 1e9 + } + } + b[len(b)-1] = float64Inf() + return b +} diff --git a/contrib/go/_std_1.18/src/runtime/iface.go b/contrib/go/_std_1.18/src/runtime/iface.go new file mode 100644 index 0000000000..a4d56dd33b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/iface.go @@ -0,0 +1,533 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +const itabInitSize = 512 + +var ( + itabLock mutex // lock for accessing itab table + itabTable = &itabTableInit // pointer to current table + itabTableInit = itabTableType{size: itabInitSize} // starter table +) + +// Note: change the formula in the mallocgc call in itabAdd if you change these fields. +type itabTableType struct { + size uintptr // length of entries array. Always a power of 2. + count uintptr // current number of filled entries. + entries [itabInitSize]*itab // really [size] large +} + +func itabHashFunc(inter *interfacetype, typ *_type) uintptr { + // compiler has provided some good hash codes for us. 
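+ // XOR-ing the two hashes spreads interface/type pairs across the
+ // table; callers mask the result down to the current table size.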
+ return uintptr(inter.typ.hash ^ typ.hash) +} + +func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { + if len(inter.mhdr) == 0 { + throw("internal error - misuse of itab") + } + + // easy case + if typ.tflag&tflagUncommon == 0 { + if canfail { + return nil + } + name := inter.typ.nameOff(inter.mhdr[0].name) + panic(&TypeAssertionError{nil, typ, &inter.typ, name.name()}) + } + + var m *itab + + // First, look in the existing table to see if we can find the itab we need. + // This is by far the most common case, so do it without locks. + // Use atomic to ensure we see any previous writes done by the thread + // that updates the itabTable field (with atomic.Storep in itabAdd). + t := (*itabTableType)(atomic.Loadp(unsafe.Pointer(&itabTable))) + if m = t.find(inter, typ); m != nil { + goto finish + } + + // Not found. Grab the lock and try again. + lock(&itabLock) + if m = itabTable.find(inter, typ); m != nil { + unlock(&itabLock) + goto finish + } + + // Entry doesn't exist yet. Make a new entry & add it. + m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*goarch.PtrSize, 0, &memstats.other_sys)) + m.inter = inter + m._type = typ + // The hash is used in type switches. However, compiler statically generates itab's + // for all interface/type pairs used in switches (which are added to itabTable + // in itabsinit). The dynamically-generated itab's never participate in type switches, + // and thus the hash is irrelevant. + // Note: m.hash is _not_ the hash used for the runtime itabTable hash table. + m.hash = 0 + m.init() + itabAdd(m) + unlock(&itabLock) +finish: + if m.fun[0] != 0 { + return m + } + if canfail { + return nil + } + // this can only happen if the conversion + // was already done once using the , ok form + // and we have a cached negative result. + // The cached result doesn't record which + // interface function was missing, so initialize + // the itab again to get the missing function name. + panic(&TypeAssertionError{concrete: typ, asserted: &inter.typ, missingMethod: m.init()}) +} + +// find finds the given interface/type pair in t. +// Returns nil if the given interface/type pair isn't present. +func (t *itabTableType) find(inter *interfacetype, typ *_type) *itab { + // Implemented using quadratic probing. + // Probe sequence is h(i) = h0 + i*(i+1)/2 mod 2^k. + // We're guaranteed to hit all table entries using this probe sequence. + mask := t.size - 1 + h := itabHashFunc(inter, typ) & mask + for i := uintptr(1); ; i++ { + p := (**itab)(add(unsafe.Pointer(&t.entries), h*goarch.PtrSize)) + // Use atomic read here so if we see m != nil, we also see + // the initializations of the fields of m. + // m := *p + m := (*itab)(atomic.Loadp(unsafe.Pointer(p))) + if m == nil { + return nil + } + if m.inter == inter && m._type == typ { + return m + } + h += i + h &= mask + } +} + +// itabAdd adds the given itab to the itab hash table. +// itabLock must be held. +func itabAdd(m *itab) { + // Bugs can lead to calling this while mallocing is set, + // typically because this is called while panicing. + // Crash reliably, rather than only when we need to grow + // the hash table. + if getg().m.mallocing != 0 { + throw("malloc deadlock") + } + + t := itabTable + if t.count >= 3*(t.size/4) { // 75% load factor + // Grow hash table. + // t2 = new(itabTableType) + some additional entries + // We lie and tell malloc we want pointer-free memory because + // all the pointed-to values are not in the heap. 
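+ // The allocation below is two header words (size and count) plus
+ // 2*t.size entry slots, i.e. room for a table twice the current
+ // size.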
+ t2 := (*itabTableType)(mallocgc((2+2*t.size)*goarch.PtrSize, nil, true)) + t2.size = t.size * 2 + + // Copy over entries. + // Note: while copying, other threads may look for an itab and + // fail to find it. That's ok, they will then try to get the itab lock + // and as a consequence wait until this copying is complete. + iterate_itabs(t2.add) + if t2.count != t.count { + throw("mismatched count during itab table copy") + } + // Publish new hash table. Use an atomic write: see comment in getitab. + atomicstorep(unsafe.Pointer(&itabTable), unsafe.Pointer(t2)) + // Adopt the new table as our own. + t = itabTable + // Note: the old table can be GC'ed here. + } + t.add(m) +} + +// add adds the given itab to itab table t. +// itabLock must be held. +func (t *itabTableType) add(m *itab) { + // See comment in find about the probe sequence. + // Insert new itab in the first empty spot in the probe sequence. + mask := t.size - 1 + h := itabHashFunc(m.inter, m._type) & mask + for i := uintptr(1); ; i++ { + p := (**itab)(add(unsafe.Pointer(&t.entries), h*goarch.PtrSize)) + m2 := *p + if m2 == m { + // A given itab may be used in more than one module + // and thanks to the way global symbol resolution works, the + // pointed-to itab may already have been inserted into the + // global 'hash'. + return + } + if m2 == nil { + // Use atomic write here so if a reader sees m, it also + // sees the correctly initialized fields of m. + // NoWB is ok because m is not in heap memory. + // *p = m + atomic.StorepNoWB(unsafe.Pointer(p), unsafe.Pointer(m)) + t.count++ + return + } + h += i + h &= mask + } +} + +// init fills in the m.fun array with all the code pointers for +// the m.inter/m._type pair. If the type does not implement the interface, +// it sets m.fun[0] to 0 and returns the name of an interface function that is missing. +// It is ok to call this multiple times on the same m, even concurrently. +func (m *itab) init() string { + inter := m.inter + typ := m._type + x := typ.uncommon() + + // both inter and typ have method sorted by name, + // and interface names are unique, + // so can iterate over both in lock step; + // the loop is O(ni+nt) not O(ni*nt). + ni := len(inter.mhdr) + nt := int(x.mcount) + xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt] + j := 0 + methods := (*[1 << 16]unsafe.Pointer)(unsafe.Pointer(&m.fun[0]))[:ni:ni] + var fun0 unsafe.Pointer +imethods: + for k := 0; k < ni; k++ { + i := &inter.mhdr[k] + itype := inter.typ.typeOff(i.ityp) + name := inter.typ.nameOff(i.name) + iname := name.name() + ipkg := name.pkgPath() + if ipkg == "" { + ipkg = inter.pkgpath.name() + } + for ; j < nt; j++ { + t := &xmhdr[j] + tname := typ.nameOff(t.name) + if typ.typeOff(t.mtyp) == itype && tname.name() == iname { + pkgPath := tname.pkgPath() + if pkgPath == "" { + pkgPath = typ.nameOff(x.pkgpath).name() + } + if tname.isExported() || pkgPath == ipkg { + if m != nil { + ifn := typ.textOff(t.ifn) + if k == 0 { + fun0 = ifn // we'll set m.fun[0] at the end + } else { + methods[k] = ifn + } + } + continue imethods + } + } + } + // didn't find method + m.fun[0] = 0 + return iname + } + m.fun[0] = uintptr(fun0) + return "" +} + +func itabsinit() { + lockInit(&itabLock, lockRankItab) + lock(&itabLock) + for _, md := range activeModules() { + for _, i := range md.itablinks { + itabAdd(i) + } + } + unlock(&itabLock) +} + +// panicdottypeE is called when doing an e.(T) conversion and the conversion fails. +// have = the dynamic type we have. 
+// want = the static type we're trying to convert to. +// iface = the static type we're converting from. +func panicdottypeE(have, want, iface *_type) { + panic(&TypeAssertionError{iface, have, want, ""}) +} + +// panicdottypeI is called when doing an i.(T) conversion and the conversion fails. +// Same args as panicdottypeE, but "have" is the dynamic itab we have. +func panicdottypeI(have *itab, want, iface *_type) { + var t *_type + if have != nil { + t = have._type + } + panicdottypeE(t, want, iface) +} + +// panicnildottype is called when doing a i.(T) conversion and the interface i is nil. +// want = the static type we're trying to convert to. +func panicnildottype(want *_type) { + panic(&TypeAssertionError{nil, nil, want, ""}) + // TODO: Add the static type we're converting from as well. + // It might generate a better error message. + // Just to match other nil conversion errors, we don't for now. +} + +// The specialized convTx routines need a type descriptor to use when calling mallocgc. +// We don't need the type to be exact, just to have the correct size, alignment, and pointer-ness. +// However, when debugging, it'd be nice to have some indication in mallocgc where the types came from, +// so we use named types here. +// We then construct interface values of these types, +// and then extract the type word to use as needed. +type ( + uint16InterfacePtr uint16 + uint32InterfacePtr uint32 + uint64InterfacePtr uint64 + stringInterfacePtr string + sliceInterfacePtr []byte +) + +var ( + uint16Eface any = uint16InterfacePtr(0) + uint32Eface any = uint32InterfacePtr(0) + uint64Eface any = uint64InterfacePtr(0) + stringEface any = stringInterfacePtr("") + sliceEface any = sliceInterfacePtr(nil) + + uint16Type *_type = efaceOf(&uint16Eface)._type + uint32Type *_type = efaceOf(&uint32Eface)._type + uint64Type *_type = efaceOf(&uint64Eface)._type + stringType *_type = efaceOf(&stringEface)._type + sliceType *_type = efaceOf(&sliceEface)._type +) + +// The conv and assert functions below do very similar things. +// The convXXX functions are guaranteed by the compiler to succeed. +// The assertXXX functions may fail (either panicking or returning false, +// depending on whether they are 1-result or 2-result). +// The convXXX functions succeed on a nil input, whereas the assertXXX +// functions fail on a nil input. + +// convT converts a value of type t, which is pointed to by v, to a pointer that can +// be used as the second word of an interface value. +func convT(t *_type, v unsafe.Pointer) unsafe.Pointer { + if raceenabled { + raceReadObjectPC(t, v, getcallerpc(), abi.FuncPCABIInternal(convT)) + } + if msanenabled { + msanread(v, t.size) + } + if asanenabled { + asanread(v, t.size) + } + x := mallocgc(t.size, t, true) + typedmemmove(t, x, v) + return x +} +func convTnoptr(t *_type, v unsafe.Pointer) unsafe.Pointer { + // TODO: maybe take size instead of type? 
+ if raceenabled { + raceReadObjectPC(t, v, getcallerpc(), abi.FuncPCABIInternal(convTnoptr)) + } + if msanenabled { + msanread(v, t.size) + } + if asanenabled { + asanread(v, t.size) + } + + x := mallocgc(t.size, t, false) + memmove(x, v, t.size) + return x +} + +func convT16(val uint16) (x unsafe.Pointer) { + if val < uint16(len(staticuint64s)) { + x = unsafe.Pointer(&staticuint64s[val]) + if goarch.BigEndian { + x = add(x, 6) + } + } else { + x = mallocgc(2, uint16Type, false) + *(*uint16)(x) = val + } + return +} + +func convT32(val uint32) (x unsafe.Pointer) { + if val < uint32(len(staticuint64s)) { + x = unsafe.Pointer(&staticuint64s[val]) + if goarch.BigEndian { + x = add(x, 4) + } + } else { + x = mallocgc(4, uint32Type, false) + *(*uint32)(x) = val + } + return +} + +func convT64(val uint64) (x unsafe.Pointer) { + if val < uint64(len(staticuint64s)) { + x = unsafe.Pointer(&staticuint64s[val]) + } else { + x = mallocgc(8, uint64Type, false) + *(*uint64)(x) = val + } + return +} + +func convTstring(val string) (x unsafe.Pointer) { + if val == "" { + x = unsafe.Pointer(&zeroVal[0]) + } else { + x = mallocgc(unsafe.Sizeof(val), stringType, true) + *(*string)(x) = val + } + return +} + +func convTslice(val []byte) (x unsafe.Pointer) { + // Note: this must work for any element type, not just byte. + if (*slice)(unsafe.Pointer(&val)).array == nil { + x = unsafe.Pointer(&zeroVal[0]) + } else { + x = mallocgc(unsafe.Sizeof(val), sliceType, true) + *(*[]byte)(x) = val + } + return +} + +// convI2I returns the new itab to be used for the destination value +// when converting a value with itab src to the dst interface. +func convI2I(dst *interfacetype, src *itab) *itab { + if src == nil { + return nil + } + if src.inter == dst { + return src + } + return getitab(dst, src._type, false) +} + +func assertI2I(inter *interfacetype, tab *itab) *itab { + if tab == nil { + // explicit conversions require non-nil interface value. + panic(&TypeAssertionError{nil, nil, &inter.typ, ""}) + } + if tab.inter == inter { + return tab + } + return getitab(inter, tab._type, false) +} + +func assertI2I2(inter *interfacetype, i iface) (r iface) { + tab := i.tab + if tab == nil { + return + } + if tab.inter != inter { + tab = getitab(inter, tab._type, true) + if tab == nil { + return + } + } + r.tab = tab + r.data = i.data + return +} + +func assertE2I(inter *interfacetype, t *_type) *itab { + if t == nil { + // explicit conversions require non-nil interface value. + panic(&TypeAssertionError{nil, nil, &inter.typ, ""}) + } + return getitab(inter, t, false) +} + +func assertE2I2(inter *interfacetype, e eface) (r iface) { + t := e._type + if t == nil { + return + } + tab := getitab(inter, t, true) + if tab == nil { + return + } + r.tab = tab + r.data = e.data + return +} + +//go:linkname reflect_ifaceE2I reflect.ifaceE2I +func reflect_ifaceE2I(inter *interfacetype, e eface, dst *iface) { + *dst = iface{assertE2I(inter, e._type), e.data} +} + +//go:linkname reflectlite_ifaceE2I internal/reflectlite.ifaceE2I +func reflectlite_ifaceE2I(inter *interfacetype, e eface, dst *iface) { + *dst = iface{assertE2I(inter, e._type), e.data} +} + +func iterate_itabs(fn func(*itab)) { + // Note: only runs during stop the world or with itabLock held, + // so no other locks/atomics needed. + t := itabTable + for i := uintptr(0); i < t.size; i++ { + m := *(**itab)(add(unsafe.Pointer(&t.entries), i*goarch.PtrSize)) + if m != nil { + fn(m) + } + } +} + +// staticuint64s is used to avoid allocating in convTx for small integer values. 
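+// For example, convT64(7) can return &staticuint64s[7] instead of
+// calling mallocgc; convT32 and convT16 do the same, except that on
+// big-endian systems they first advance the pointer past the unused
+// high-order bytes of the uint64 slot.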
+var staticuint64s = [...]uint64{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, +} + +// The linker redirects a reference of a method that it determined +// unreachable to a reference to this function, so it will throw if +// ever called. +func unreachableMethod() { + throw("unreachable method called. linker bug?") +} diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.go new file mode 100644 index 0000000000..e36eb83a11 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.go @@ -0,0 +1,116 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package atomic + +import "unsafe" + +// Export some functions via linkname to assembly in sync/atomic. 
+//go:linkname Load +//go:linkname Loadp +//go:linkname Load64 + +//go:nosplit +//go:noinline +func Load(ptr *uint32) uint32 { + return *ptr +} + +//go:nosplit +//go:noinline +func Loadp(ptr unsafe.Pointer) unsafe.Pointer { + return *(*unsafe.Pointer)(ptr) +} + +//go:nosplit +//go:noinline +func Load64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcq(ptr *uint32) uint32 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcq64(ptr *uint64) uint64 { + return *ptr +} + +//go:nosplit +//go:noinline +func LoadAcquintptr(ptr *uintptr) uintptr { + return *ptr +} + +//go:noescape +func Xadd(ptr *uint32, delta int32) uint32 + +//go:noescape +func Xadd64(ptr *uint64, delta int64) uint64 + +//go:noescape +func Xadduintptr(ptr *uintptr, delta uintptr) uintptr + +//go:noescape +func Xchg(ptr *uint32, new uint32) uint32 + +//go:noescape +func Xchg64(ptr *uint64, new uint64) uint64 + +//go:noescape +func Xchguintptr(ptr *uintptr, new uintptr) uintptr + +//go:nosplit +//go:noinline +func Load8(ptr *uint8) uint8 { + return *ptr +} + +//go:noescape +func And8(ptr *uint8, val uint8) + +//go:noescape +func Or8(ptr *uint8, val uint8) + +//go:noescape +func And(ptr *uint32, val uint32) + +//go:noescape +func Or(ptr *uint32, val uint32) + +// NOTE: Do not add atomicxor8 (XOR is not idempotent). + +//go:noescape +func Cas64(ptr *uint64, old, new uint64) bool + +//go:noescape +func CasRel(ptr *uint32, old, new uint32) bool + +//go:noescape +func Store(ptr *uint32, val uint32) + +//go:noescape +func Store8(ptr *uint8, val uint8) + +//go:noescape +func Store64(ptr *uint64, val uint64) + +//go:noescape +func StoreRel(ptr *uint32, val uint32) + +//go:noescape +func StoreRel64(ptr *uint64, val uint64) + +//go:noescape +func StoreReluintptr(ptr *uintptr, val uintptr) + +// StorepNoWB performs *ptr = val atomically and without a write +// barrier. +// +// NO go:noescape annotation; see atomic_pointer.go. +func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.s b/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.s new file mode 100644 index 0000000000..d21514b36b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/atomic_amd64.s @@ -0,0 +1,225 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Note: some of these functions are semantically inlined +// by the compiler (in src/cmd/compile/internal/gc/ssa.go). 
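+//
+// Note also that the Store routines below use XCHG rather than a plain
+// MOV: on x86, XCHG with a memory operand implies the LOCK prefix, so
+// it doubles as the full barrier needed for a sequentially consistent
+// store.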
+ +#include "textflag.h" + +TEXT ·Loaduintptr(SB), NOSPLIT, $0-16 + JMP ·Load64(SB) + +TEXT ·Loaduint(SB), NOSPLIT, $0-16 + JMP ·Load64(SB) + +TEXT ·Loadint32(SB), NOSPLIT, $0-12 + JMP ·Load(SB) + +TEXT ·Loadint64(SB), NOSPLIT, $0-16 + JMP ·Load64(SB) + +// bool Cas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT ·Cas(SB),NOSPLIT,$0-17 + MOVQ ptr+0(FP), BX + MOVL old+8(FP), AX + MOVL new+12(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + SETEQ ret+16(FP) + RET + +// bool ·Cas64(uint64 *val, uint64 old, uint64 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else { +// return 0; +// } +TEXT ·Cas64(SB), NOSPLIT, $0-25 + MOVQ ptr+0(FP), BX + MOVQ old+8(FP), AX + MOVQ new+16(FP), CX + LOCK + CMPXCHGQ CX, 0(BX) + SETEQ ret+24(FP) + RET + +// bool Casp1(void **val, void *old, void *new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT ·Casp1(SB), NOSPLIT, $0-25 + MOVQ ptr+0(FP), BX + MOVQ old+8(FP), AX + MOVQ new+16(FP), CX + LOCK + CMPXCHGQ CX, 0(BX) + SETEQ ret+24(FP) + RET + +TEXT ·Casint32(SB), NOSPLIT, $0-17 + JMP ·Cas(SB) + +TEXT ·Casint64(SB), NOSPLIT, $0-25 + JMP ·Cas64(SB) + +TEXT ·Casuintptr(SB), NOSPLIT, $0-25 + JMP ·Cas64(SB) + +TEXT ·CasRel(SB), NOSPLIT, $0-17 + JMP ·Cas(SB) + +// uint32 Xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT ·Xadd(SB), NOSPLIT, $0-20 + MOVQ ptr+0(FP), BX + MOVL delta+8(FP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + MOVL AX, ret+16(FP) + RET + +// uint64 Xadd64(uint64 volatile *val, int64 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT ·Xadd64(SB), NOSPLIT, $0-24 + MOVQ ptr+0(FP), BX + MOVQ delta+8(FP), AX + MOVQ AX, CX + LOCK + XADDQ AX, 0(BX) + ADDQ CX, AX + MOVQ AX, ret+16(FP) + RET + +TEXT ·Xaddint32(SB), NOSPLIT, $0-20 + JMP ·Xadd(SB) + +TEXT ·Xaddint64(SB), NOSPLIT, $0-24 + JMP ·Xadd64(SB) + +TEXT ·Xadduintptr(SB), NOSPLIT, $0-24 + JMP ·Xadd64(SB) + +// uint32 Xchg(ptr *uint32, new uint32) +// Atomically: +// old := *ptr; +// *ptr = new; +// return old; +TEXT ·Xchg(SB), NOSPLIT, $0-20 + MOVQ ptr+0(FP), BX + MOVL new+8(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+16(FP) + RET + +// uint64 Xchg64(ptr *uint64, new uint64) +// Atomically: +// old := *ptr; +// *ptr = new; +// return old; +TEXT ·Xchg64(SB), NOSPLIT, $0-24 + MOVQ ptr+0(FP), BX + MOVQ new+8(FP), AX + XCHGQ AX, 0(BX) + MOVQ AX, ret+16(FP) + RET + +TEXT ·Xchgint32(SB), NOSPLIT, $0-20 + JMP ·Xchg(SB) + +TEXT ·Xchgint64(SB), NOSPLIT, $0-24 + JMP ·Xchg64(SB) + +TEXT ·Xchguintptr(SB), NOSPLIT, $0-24 + JMP ·Xchg64(SB) + +TEXT ·StorepNoWB(SB), NOSPLIT, $0-16 + MOVQ ptr+0(FP), BX + MOVQ val+8(FP), AX + XCHGQ AX, 0(BX) + RET + +TEXT ·Store(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), BX + MOVL val+8(FP), AX + XCHGL AX, 0(BX) + RET + +TEXT ·Store8(SB), NOSPLIT, $0-9 + MOVQ ptr+0(FP), BX + MOVB val+8(FP), AX + XCHGB AX, 0(BX) + RET + +TEXT ·Store64(SB), NOSPLIT, $0-16 + MOVQ ptr+0(FP), BX + MOVQ val+8(FP), AX + XCHGQ AX, 0(BX) + RET + +TEXT ·Storeint32(SB), NOSPLIT, $0-12 + JMP ·Store(SB) + +TEXT ·Storeint64(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·Storeuintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·StoreRel(SB), NOSPLIT, $0-12 + JMP ·Store(SB) + +TEXT ·StoreRel64(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16 + JMP ·Store64(SB) + +// void ·Or8(byte volatile*, byte); +TEXT ·Or8(SB), NOSPLIT, $0-9 + MOVQ ptr+0(FP), AX + MOVB 
val+8(FP), BX + LOCK + ORB BX, (AX) + RET + +// void ·And8(byte volatile*, byte); +TEXT ·And8(SB), NOSPLIT, $0-9 + MOVQ ptr+0(FP), AX + MOVB val+8(FP), BX + LOCK + ANDB BX, (AX) + RET + +// func Or(addr *uint32, v uint32) +TEXT ·Or(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ORL BX, (AX) + RET + +// func And(addr *uint32, v uint32) +TEXT ·And(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), AX + MOVL val+8(FP), BX + LOCK + ANDL BX, (AX) + RET diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/doc.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/doc.go new file mode 100644 index 0000000000..08e6b6ce0b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/doc.go @@ -0,0 +1,18 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package atomic provides atomic operations, independent of sync/atomic, +to the runtime. + +On most platforms, the compiler is aware of the functions defined +in this package, and they're replaced with platform-specific intrinsics. +On other platforms, generic implementations are made available. + +Unless otherwise noted, operations defined in this package are sequentially +consistent across threads with respect to the values they manipulate. More +specifically, operations that happen in a specific order on one thread, +will always be observed to happen in exactly that order by another thread. +*/ +package atomic diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/stubs.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/stubs.go new file mode 100644 index 0000000000..7df8d9c863 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/stubs.go @@ -0,0 +1,59 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !wasm + +package atomic + +import "unsafe" + +//go:noescape +func Cas(ptr *uint32, old, new uint32) bool + +// NO go:noescape annotation; see atomic_pointer.go. +func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool + +//go:noescape +func Casint32(ptr *int32, old, new int32) bool + +//go:noescape +func Casint64(ptr *int64, old, new int64) bool + +//go:noescape +func Casuintptr(ptr *uintptr, old, new uintptr) bool + +//go:noescape +func Storeint32(ptr *int32, new int32) + +//go:noescape +func Storeint64(ptr *int64, new int64) + +//go:noescape +func Storeuintptr(ptr *uintptr, new uintptr) + +//go:noescape +func Loaduintptr(ptr *uintptr) uintptr + +//go:noescape +func Loaduint(ptr *uint) uint + +// TODO(matloob): Should these functions have the go:noescape annotation? + +//go:noescape +func Loadint32(ptr *int32) int32 + +//go:noescape +func Loadint64(ptr *int64) int64 + +//go:noescape +func Xaddint32(ptr *int32, delta int32) int32 + +//go:noescape +func Xaddint64(ptr *int64, delta int64) int64 + +//go:noescape +func Xchgint32(ptr *int32, new int32) int32 + +//go:noescape +func Xchgint64(ptr *int64, new int64) int64 diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/types.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/types.go new file mode 100644 index 0000000000..1a240d7c91 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/types.go @@ -0,0 +1,395 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package atomic + +import "unsafe" + +// Int32 is an atomically accessed int32 value. +// +// An Int32 must not be copied. +type Int32 struct { + noCopy noCopy + value int32 +} + +// Load accesses and returns the value atomically. +func (i *Int32) Load() int32 { + return Loadint32(&i.value) +} + +// Store updates the value atomically. +func (i *Int32) Store(value int32) { + Storeint32(&i.value, value) +} + +// CompareAndSwap atomically compares i's value with old, +// and if they're equal, swaps i's value with new. +// +// Returns true if the operation succeeded. +func (i *Int32) CompareAndSwap(old, new int32) bool { + return Casint32(&i.value, old, new) +} + +// Swap replaces i's value with new, returning +// i's value before the replacement. +func (i *Int32) Swap(new int32) int32 { + return Xchgint32(&i.value, new) +} + +// Add adds delta to i atomically, returning +// the new updated value. +// +// This operation wraps around in the usual +// two's-complement way. +func (i *Int32) Add(delta int32) int32 { + return Xaddint32(&i.value, delta) +} + +// Int64 is an atomically accessed int64 value. +// +// An Int64 must not be copied. +type Int64 struct { + noCopy noCopy + value int64 +} + +// Load accesses and returns the value atomically. +func (i *Int64) Load() int64 { + return Loadint64(&i.value) +} + +// Store updates the value atomically. +func (i *Int64) Store(value int64) { + Storeint64(&i.value, value) +} + +// CompareAndSwap atomically compares i's value with old, +// and if they're equal, swaps i's value with new. +// +// Returns true if the operation succeeded. +func (i *Int64) CompareAndSwap(old, new int64) bool { + return Casint64(&i.value, old, new) +} + +// Swap replaces i's value with new, returning +// i's value before the replacement. +func (i *Int64) Swap(new int64) int64 { + return Xchgint64(&i.value, new) +} + +// Add adds delta to i atomically, returning +// the new updated value. +// +// This operation wraps around in the usual +// two's-complement way. +func (i *Int64) Add(delta int64) int64 { + return Xaddint64(&i.value, delta) +} + +// Uint8 is an atomically accessed uint8 value. +// +// A Uint8 must not be copied. +type Uint8 struct { + noCopy noCopy + value uint8 +} + +// Load accesses and returns the value atomically. +func (u *Uint8) Load() uint8 { + return Load8(&u.value) +} + +// Store updates the value atomically. +func (u *Uint8) Store(value uint8) { + Store8(&u.value, value) +} + +// And takes value and performs a bit-wise +// "and" operation with the value of u, storing +// the result into u. +// +// The full process is performed atomically. +func (u *Uint8) And(value uint8) { + And8(&u.value, value) +} + +// Or takes value and performs a bit-wise +// "or" operation with the value of u, storing +// the result into u. +// +// The full process is performed atomically. +func (u *Uint8) Or(value uint8) { + Or8(&u.value, value) +} + +// Uint32 is an atomically accessed uint32 value. +// +// A Uint32 must not be copied. +type Uint32 struct { + noCopy noCopy + value uint32 +} + +// Load accesses and returns the value atomically. +func (u *Uint32) Load() uint32 { + return Load(&u.value) +} + +// LoadAcquire is a partially unsynchronized version +// of Load that relaxes ordering constraints. Other threads +// may observe operations that precede this operation to +// occur after it, but no operation that occurs after it +// on this thread can be observed to occur before it. +// +// WARNING: Use sparingly and with great care. 
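+//
+// As a sketch of the usual pairing (the names here are hypothetical):
+// a writer fills in its data and then calls flag.StoreRelease(1); a
+// reader that sees flag.LoadAcquire() == 1 can rely on also seeing the
+// writes made before that store.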
+func (u *Uint32) LoadAcquire() uint32 { + return LoadAcq(&u.value) +} + +// Store updates the value atomically. +func (u *Uint32) Store(value uint32) { + Store(&u.value, value) +} + +// StoreRelease is a partially unsynchronized version +// of Store that relaxes ordering constraints. Other threads +// may observe operations that occur after this operation to +// precede it, but no operation that precedes it +// on this thread can be observed to occur after it. +// +// WARNING: Use sparingly and with great care. +func (u *Uint32) StoreRelease(value uint32) { + StoreRel(&u.value, value) +} + +// CompareAndSwap atomically compares u's value with old, +// and if they're equal, swaps u's value with new. +// +// Returns true if the operation succeeded. +func (u *Uint32) CompareAndSwap(old, new uint32) bool { + return Cas(&u.value, old, new) +} + +// CompareAndSwapRelease is a partially unsynchronized version +// of Cas that relaxes ordering constraints. Other threads +// may observe operations that occur after this operation to +// precede it, but no operation that precedes it +// on this thread can be observed to occur after it. +// +// Returns true if the operation succeeded. +// +// WARNING: Use sparingly and with great care. +func (u *Uint32) CompareAndSwapRelease(old, new uint32) bool { + return CasRel(&u.value, old, new) +} + +// Swap replaces u's value with new, returning +// u's value before the replacement. +func (u *Uint32) Swap(value uint32) uint32 { + return Xchg(&u.value, value) +} + +// And takes value and performs a bit-wise +// "and" operation with the value of u, storing +// the result into u. +// +// The full process is performed atomically. +func (u *Uint32) And(value uint32) { + And(&u.value, value) +} + +// Or takes value and performs a bit-wise +// "or" operation with the value of u, storing +// the result into u. +// +// The full process is performed atomically. +func (u *Uint32) Or(value uint32) { + Or(&u.value, value) +} + +// Add adds delta to u atomically, returning +// the new updated value. +// +// This operation wraps around in the usual +// two's-complement way. +func (u *Uint32) Add(delta int32) uint32 { + return Xadd(&u.value, delta) +} + +// Uint64 is an atomically accessed uint64 value. +// +// A Uint64 must not be copied. +type Uint64 struct { + noCopy noCopy + value uint64 +} + +// Load accesses and returns the value atomically. +func (u *Uint64) Load() uint64 { + return Load64(&u.value) +} + +// Store updates the value atomically. +func (u *Uint64) Store(value uint64) { + Store64(&u.value, value) +} + +// CompareAndSwap atomically compares u's value with old, +// and if they're equal, swaps u's value with new. +// +// Returns true if the operation succeeded. +func (u *Uint64) CompareAndSwap(old, new uint64) bool { + return Cas64(&u.value, old, new) +} + +// Swap replaces u's value with new, returning +// u's value before the replacement. +func (u *Uint64) Swap(value uint64) uint64 { + return Xchg64(&u.value, value) +} + +// Add adds delta to u atomically, returning +// the new updated value. +// +// This operation wraps around in the usual +// two's-complement way. +func (u *Uint64) Add(delta int64) uint64 { + return Xadd64(&u.value, delta) +} + +// Uintptr is an atomically accessed uintptr value. +// +// A Uintptr must not be copied. +type Uintptr struct { + noCopy noCopy + value uintptr +} + +// Load accesses and returns the value atomically. 
+func (u *Uintptr) Load() uintptr { + return Loaduintptr(&u.value) +} + +// LoadAcquire is a partially unsynchronized version +// of Load that relaxes ordering constraints. Other threads +// may observe operations that precede this operation to +// occur after it, but no operation that occurs after it +// on this thread can be observed to occur before it. +// +// WARNING: Use sparingly and with great care. +func (u *Uintptr) LoadAcquire() uintptr { + return LoadAcquintptr(&u.value) +} + +// Store updates the value atomically. +func (u *Uintptr) Store(value uintptr) { + Storeuintptr(&u.value, value) +} + +// StoreRelease is a partially unsynchronized version +// of Store that relaxes ordering constraints. Other threads +// may observe operations that occur after this operation to +// precede it, but no operation that precedes it +// on this thread can be observed to occur after it. +// +// WARNING: Use sparingly and with great care. +func (u *Uintptr) StoreRelease(value uintptr) { + StoreReluintptr(&u.value, value) +} + +// CompareAndSwap atomically compares u's value with old, +// and if they're equal, swaps u's value with new. +// +// Returns true if the operation succeeded. +func (u *Uintptr) CompareAndSwap(old, new uintptr) bool { + return Casuintptr(&u.value, old, new) +} + +// Swap replaces u's value with new, returning +// u's value before the replacement. +func (u *Uintptr) Swap(value uintptr) uintptr { + return Xchguintptr(&u.value, value) +} + +// Add adds delta to u atomically, returning +// the new updated value. +// +// This operation wraps around in the usual +// two's-complement way. +func (u *Uintptr) Add(delta uintptr) uintptr { + return Xadduintptr(&u.value, delta) +} + +// Float64 is an atomically accessed float64 value. +// +// A Float64 must not be copied. +type Float64 struct { + u Uint64 +} + +// Load accesses and returns the value atomically. +func (f *Float64) Load() float64 { + r := f.u.Load() + return *(*float64)(unsafe.Pointer(&r)) +} + +// Store updates the value atomically. +func (f *Float64) Store(value float64) { + f.u.Store(*(*uint64)(unsafe.Pointer(&value))) +} + +// UnsafePointer is an atomically accessed unsafe.Pointer value. +// +// Note that because of the atomicity guarantees, stores to values +// of this type never trigger a write barrier, and the relevant +// methods are suffixed with "NoWB" to indicate that explicitly. +// As a result, this type should be used carefully, and sparingly, +// mostly with values that do not live in the Go heap anyway. +// +// An UnsafePointer must not be copied. +type UnsafePointer struct { + noCopy noCopy + value unsafe.Pointer +} + +// Load accesses and returns the value atomically. +func (u *UnsafePointer) Load() unsafe.Pointer { + return Loadp(unsafe.Pointer(&u.value)) +} + +// StoreNoWB updates the value atomically. +// +// WARNING: As the name implies this operation does *not* +// perform a write barrier on value, and so this operation may +// hide pointers from the GC. Use with care and sparingly. +// It is safe to use with values not found in the Go heap. +func (u *UnsafePointer) StoreNoWB(value unsafe.Pointer) { + StorepNoWB(unsafe.Pointer(&u.value), value) +} + +// CompareAndSwapNoWB atomically (with respect to other methods) +// compares u's value with old, and if they're equal, +// swaps u's value with new. +// +// Returns true if the operation succeeded. 
+// +// WARNING: As the name implies this operation does *not* +// perform a write barrier on value, and so this operation may +// hide pointers from the GC. Use with care and sparingly. +// It is safe to use with values not found in the Go heap. +func (u *UnsafePointer) CompareAndSwapNoWB(old, new unsafe.Pointer) bool { + return Casp1(&u.value, old, new) +} + +// noCopy may be embedded into structs which must not be copied +// after the first use. +// +// See https://golang.org/issues/8005#issuecomment-190753527 +// for details. +type noCopy struct{} + +// Lock is a no-op used by -copylocks checker from `go vet`. +func (*noCopy) Lock() {} +func (*noCopy) Unlock() {} diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/types_64bit.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/types_64bit.go new file mode 100644 index 0000000000..43c1ba2709 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/types_64bit.go @@ -0,0 +1,29 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm + +package atomic + +// LoadAcquire is a partially unsynchronized version +// of Load that relaxes ordering constraints. Other threads +// may observe operations that precede this operation to +// occur after it, but no operation that occurs after it +// on this thread can be observed to occur before it. +// +// WARNING: Use sparingly and with great care. +func (u *Uint64) LoadAcquire() uint64 { + return LoadAcq64(&u.value) +} + +// StoreRelease is a partially unsynchronized version +// of Store that relaxes ordering constraints. Other threads +// may observe operations that occur after this operation to +// precede it, but no operation that precedes it +// on this thread can be observed to occur after it. +// +// WARNING: Use sparingly and with great care. +func (u *Uint64) StoreRelease(value uint64) { + StoreRel64(&u.value, value) +} diff --git a/contrib/go/_std_1.18/src/runtime/internal/atomic/unaligned.go b/contrib/go/_std_1.18/src/runtime/internal/atomic/unaligned.go new file mode 100644 index 0000000000..a859de4144 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/atomic/unaligned.go @@ -0,0 +1,9 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package atomic + +func panicUnaligned() { + panic("unaligned 64-bit atomic operation") +} diff --git a/contrib/go/_std_1.18/src/runtime/internal/math/math.go b/contrib/go/_std_1.18/src/runtime/internal/math/math.go new file mode 100644 index 0000000000..c3fac366be --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/math/math.go @@ -0,0 +1,40 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math + +import "internal/goarch" + +const MaxUintptr = ^uintptr(0) + +// MulUintptr returns a * b and whether the multiplication overflowed. +// On supported platforms this is an intrinsic lowered by the compiler. 
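+//
+// The fast path below relies on both operands fitting in half a word:
+// if a|b < 1<<(4*goarch.PtrSize), then a*b fits in a uintptr and cannot
+// overflow. Otherwise (with a != 0 on that path) a single division
+// checks whether b exceeds MaxUintptr/a.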
+func MulUintptr(a, b uintptr) (uintptr, bool) { + if a|b < 1<<(4*goarch.PtrSize) || a == 0 { + return a * b, false + } + overflow := b > MaxUintptr/a + return a * b, overflow +} + +// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y +// with the product bits' upper half returned in hi and the lower +// half returned in lo. +// This is a copy from math/bits.Mul64 +// On supported platforms this is an intrinsic lowered by the compiler. +func Mul64(x, y uint64) (hi, lo uint64) { + const mask32 = 1<<32 - 1 + x0 := x & mask32 + x1 := x >> 32 + y0 := y & mask32 + y1 := y >> 32 + w0 := x0 * y0 + t := x1*y0 + w0>>32 + w1 := t & mask32 + w2 := t >> 32 + w1 += x0 * y1 + hi = x1*y1 + w2 + w1>>32 + lo = x * y + return +} diff --git a/contrib/go/_std_1.18/src/runtime/internal/sys/consts.go b/contrib/go/_std_1.18/src/runtime/internal/sys/consts.go new file mode 100644 index 0000000000..fffcf81d1f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/sys/consts.go @@ -0,0 +1,34 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sys + +import ( + "internal/goarch" + "internal/goos" +) + +// AIX requires a larger stack for syscalls. +const StackGuardMultiplier = StackGuardMultiplierDefault*(1-goos.IsAix) + 2*goos.IsAix + +// DefaultPhysPageSize is the default physical page size. +const DefaultPhysPageSize = goarch.DefaultPhysPageSize + +// PCQuantum is the minimal unit for a program counter (1 on x86, 4 on most other systems). +// The various PC tables record PC deltas pre-divided by PCQuantum. +const PCQuantum = goarch.PCQuantum + +// Int64Align is the required alignment for a 64-bit integer (4 on 32-bit systems, 8 on 64-bit). +const Int64Align = goarch.PtrSize + +// MinFrameSize is the size of the system-reserved words at the bottom +// of a frame (just above the architectural stack pointer). +// It is zero on x86 and PtrSize on most non-x86 (LR-based) systems. +// On PowerPC it is larger, to cover three more reserved words: +// the compiler word, the link editor word, and the TOC save word. +const MinFrameSize = goarch.MinFrameSize + +// StackAlign is the required alignment of the SP register. +// The stack must be at least word aligned, but some architectures require more. +const StackAlign = goarch.StackAlign diff --git a/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics.go b/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics.go new file mode 100644 index 0000000000..5af49011e9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics.go @@ -0,0 +1,91 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !386 + +// TODO finish intrinsifying 386, deadcode the assembly, remove build tags, merge w/ intrinsics_common +// TODO replace all uses of CtzXX with TrailingZerosXX; they are the same. 
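Mul64 above hand-rolls a 64x64 to 128-bit multiply out of four 32x32 products so the runtime does not have to import math/bits, and MulUintptr's fast path simply observes that when both operands fit in half a word the product cannot overflow. A small standalone cross-check of the same long-multiplication scheme against the public math/bits.Mul64:

package main

import (
	"fmt"
	"math/bits"
)

// mul64 repeats the 32x32 long-multiplication scheme used by Mul64 above.
func mul64(x, y uint64) (hi, lo uint64) {
	const mask32 = 1<<32 - 1
	x0, x1 := x&mask32, x>>32
	y0, y1 := y&mask32, y>>32
	w0 := x0 * y0
	t := x1*y0 + w0>>32
	w1, w2 := t&mask32, t>>32
	w1 += x0 * y1
	hi = x1*y1 + w2 + w1>>32
	lo = x * y
	return
}

func main() {
	x, y := uint64(0xdeadbeefcafebabe), uint64(0x123456789abcdef0)
	hi1, lo1 := mul64(x, y)
	hi2, lo2 := bits.Mul64(x, y)
	fmt.Println(hi1 == hi2, lo1 == lo2) // true true
}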
+ +package sys + +// Using techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf + +const deBruijn64ctz = 0x0218a392cd3d5dbf + +var deBruijnIdx64ctz = [64]byte{ + 0, 1, 2, 7, 3, 13, 8, 19, + 4, 25, 14, 28, 9, 34, 20, 40, + 5, 17, 26, 38, 15, 46, 29, 48, + 10, 31, 35, 54, 21, 50, 41, 57, + 63, 6, 12, 18, 24, 27, 33, 39, + 16, 37, 45, 47, 30, 53, 49, 56, + 62, 11, 23, 32, 36, 44, 52, 55, + 61, 22, 43, 51, 60, 42, 59, 58, +} + +const deBruijn32ctz = 0x04653adf + +var deBruijnIdx32ctz = [32]byte{ + 0, 1, 2, 6, 3, 11, 7, 16, + 4, 14, 12, 21, 8, 23, 17, 26, + 31, 5, 10, 15, 13, 20, 22, 25, + 30, 9, 19, 24, 29, 18, 28, 27, +} + +// Ctz64 counts trailing (low-order) zeroes, +// and if all are zero, then 64. +func Ctz64(x uint64) int { + x &= -x // isolate low-order bit + y := x * deBruijn64ctz >> 58 // extract part of deBruijn sequence + i := int(deBruijnIdx64ctz[y]) // convert to bit index + z := int((x - 1) >> 57 & 64) // adjustment if zero + return i + z +} + +// Ctz32 counts trailing (low-order) zeroes, +// and if all are zero, then 32. +func Ctz32(x uint32) int { + x &= -x // isolate low-order bit + y := x * deBruijn32ctz >> 27 // extract part of deBruijn sequence + i := int(deBruijnIdx32ctz[y]) // convert to bit index + z := int((x - 1) >> 26 & 32) // adjustment if zero + return i + z +} + +// Ctz8 returns the number of trailing zero bits in x; the result is 8 for x == 0. +func Ctz8(x uint8) int { + return int(ntz8tab[x]) +} + +// Bswap64 returns its input with byte order reversed +// 0x0102030405060708 -> 0x0807060504030201 +func Bswap64(x uint64) uint64 { + c8 := uint64(0x00ff00ff00ff00ff) + a := x >> 8 & c8 + b := (x & c8) << 8 + x = a | b + c16 := uint64(0x0000ffff0000ffff) + a = x >> 16 & c16 + b = (x & c16) << 16 + x = a | b + c32 := uint64(0x00000000ffffffff) + a = x >> 32 & c32 + b = (x & c32) << 32 + x = a | b + return x +} + +// Bswap32 returns its input with byte order reversed +// 0x01020304 -> 0x04030201 +func Bswap32(x uint32) uint32 { + c8 := uint32(0x00ff00ff) + a := x >> 8 & c8 + b := (x & c8) << 8 + x = a | b + c16 := uint32(0x0000ffff) + a = x >> 16 & c16 + b = (x & c16) << 16 + x = a | b + return x +} diff --git a/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics_common.go b/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics_common.go new file mode 100644 index 0000000000..48d9759ca9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/sys/intrinsics_common.go @@ -0,0 +1,158 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sys + +// Copied from math/bits to avoid dependence. 
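Ctz64 and Ctz32 above use the classic de Bruijn trick: isolate the lowest set bit, multiply by a de Bruijn constant so the bit's position lands in the top bits, then map that value through a small table; the extra (x-1)>>57&64 term only kicks in when x is zero. A standalone cross-check against the public math/bits, with the constant and table copied from the code above:

package main

import (
	"fmt"
	"math/bits"
)

const deBruijn64ctz = 0x0218a392cd3d5dbf

var deBruijnIdx64ctz = [64]byte{
	0, 1, 2, 7, 3, 13, 8, 19,
	4, 25, 14, 28, 9, 34, 20, 40,
	5, 17, 26, 38, 15, 46, 29, 48,
	10, 31, 35, 54, 21, 50, 41, 57,
	63, 6, 12, 18, 24, 27, 33, 39,
	16, 37, 45, 47, 30, 53, 49, 56,
	62, 11, 23, 32, 36, 44, 52, 55,
	61, 22, 43, 51, 60, 42, 59, 58,
}

// ctz64 repeats the lookup performed by Ctz64 above.
func ctz64(x uint64) int {
	x &= -x                                         // isolate the lowest set bit
	i := int(deBruijnIdx64ctz[x*deBruijn64ctz>>58]) // de Bruijn index -> bit index
	z := int((x - 1) >> 57 & 64)                    // 64 if x was zero, 0 otherwise
	return i + z
}

func main() {
	for _, v := range []uint64{0, 1, 8, 12345, 1 << 63} {
		fmt.Println(ctz64(v) == bits.TrailingZeros64(v)) // true for every v
	}
}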
+ +var len8tab = [256]uint8{ + 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, +} + +var ntz8tab = [256]uint8{ + 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, +} + +// len64 returns the minimum number of bits required to represent x; the result is 0 for x == 0. 
+func Len64(x uint64) (n int) { + if x >= 1<<32 { + x >>= 32 + n = 32 + } + if x >= 1<<16 { + x >>= 16 + n += 16 + } + if x >= 1<<8 { + x >>= 8 + n += 8 + } + return n + int(len8tab[x]) +} + +// --- OnesCount --- + +const m0 = 0x5555555555555555 // 01010101 ... +const m1 = 0x3333333333333333 // 00110011 ... +const m2 = 0x0f0f0f0f0f0f0f0f // 00001111 ... + +// OnesCount64 returns the number of one bits ("population count") in x. +func OnesCount64(x uint64) int { + // Implementation: Parallel summing of adjacent bits. + // See "Hacker's Delight", Chap. 5: Counting Bits. + // The following pattern shows the general approach: + // + // x = x>>1&(m0&m) + x&(m0&m) + // x = x>>2&(m1&m) + x&(m1&m) + // x = x>>4&(m2&m) + x&(m2&m) + // x = x>>8&(m3&m) + x&(m3&m) + // x = x>>16&(m4&m) + x&(m4&m) + // x = x>>32&(m5&m) + x&(m5&m) + // return int(x) + // + // Masking (& operations) can be left away when there's no + // danger that a field's sum will carry over into the next + // field: Since the result cannot be > 64, 8 bits is enough + // and we can ignore the masks for the shifts by 8 and up. + // Per "Hacker's Delight", the first line can be simplified + // more, but it saves at best one instruction, so we leave + // it alone for clarity. + const m = 1<<64 - 1 + x = x>>1&(m0&m) + x&(m0&m) + x = x>>2&(m1&m) + x&(m1&m) + x = (x>>4 + x) & (m2 & m) + x += x >> 8 + x += x >> 16 + x += x >> 32 + return int(x) & (1<<7 - 1) +} + +var deBruijn64tab = [64]byte{ + 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, + 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, + 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, + 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, +} + +const deBruijn64 = 0x03f79d71b4ca8b09 + +// TrailingZeros64 returns the number of trailing zero bits in x; the result is 64 for x == 0. +func TrailingZeros64(x uint64) int { + if x == 0 { + return 64 + } + // If popcount is fast, replace code below with return popcount(^x & (x - 1)). + // + // x & -x leaves only the right-most bit set in the word. Let k be the + // index of that bit. Since only a single bit is set, the value is two + // to the power of k. Multiplying by a power of two is equivalent to + // left shifting, in this case by k bits. The de Bruijn (64 bit) constant + // is such that all six bit, consecutive substrings are distinct. + // Therefore, if we have a left shifted version of this constant we can + // find by how many bits it was shifted by looking at which six bit + // substring ended up at the top of the word. + // (Knuth, volume 4, section 7.3.1) + return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)]) +} + +// LeadingZeros64 returns the number of leading zero bits in x; the result is 64 for x == 0. +func LeadingZeros64(x uint64) int { return 64 - Len64(x) } + +// LeadingZeros8 returns the number of leading zero bits in x; the result is 8 for x == 0. +func LeadingZeros8(x uint8) int { return 8 - Len8(x) } + +// TrailingZeros8 returns the number of trailing zero bits in x; the result is 8 for x == 0. +func TrailingZeros8(x uint8) int { + return int(ntz8tab[x]) +} + +// Len8 returns the minimum number of bits required to represent x; the result is 0 for x == 0. 
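OnesCount64 above is the standard parallel population count: adjacent 1-bit fields are summed into 2-bit fields, then into 4-bit fields, after which whole bytes can be added without masking because the total can never exceed 64. A quick cross-check of the same steps against the public math/bits:

package main

import (
	"fmt"
	"math/bits"
)

// onesCount64 repeats the parallel bit-summing used by OnesCount64 above.
func onesCount64(x uint64) int {
	const m0 = 0x5555555555555555
	const m1 = 0x3333333333333333
	const m2 = 0x0f0f0f0f0f0f0f0f
	x = x>>1&m0 + x&m0
	x = x>>2&m1 + x&m1
	x = (x>>4 + x) & m2
	x += x >> 8
	x += x >> 16
	x += x >> 32
	return int(x) & (1<<7 - 1)
}

func main() {
	for _, v := range []uint64{0, 1, 0x8000000000000001, ^uint64(0)} {
		fmt.Println(onesCount64(v) == bits.OnesCount64(v)) // true for every v
	}
}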
+func Len8(x uint8) int { + return int(len8tab[x]) +} + +// Prefetch prefetches data from memory addr to cache +// +// AMD64: Produce PREFETCHT0 instruction +// +// ARM64: Produce PRFM instruction with PLDL1KEEP option +func Prefetch(addr uintptr) {} + +// PrefetchStreamed prefetches data from memory addr, with a hint that this data is being streamed. +// That is, it is likely to be accessed very soon, but only once. If possible, this will avoid polluting the cache. +// +// AMD64: Produce PREFETCHNTA instruction +// +// ARM64: Produce PRFM instruction with PLDL1STRM option +func PrefetchStreamed(addr uintptr) {} diff --git a/contrib/go/_std_1.18/src/runtime/internal/sys/sys.go b/contrib/go/_std_1.18/src/runtime/internal/sys/sys.go new file mode 100644 index 0000000000..694101d36f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/sys/sys.go @@ -0,0 +1,7 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// package sys contains system- and configuration- and architecture-specific +// constants used by the runtime. +package sys diff --git a/contrib/go/_std_1.18/src/runtime/internal/sys/zversion.go b/contrib/go/_std_1.18/src/runtime/internal/sys/zversion.go new file mode 100644 index 0000000000..b058a3db70 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/sys/zversion.go @@ -0,0 +1,5 @@ +// Code generated by go tool dist; DO NOT EDIT. + +package sys + +const StackGuardMultiplierDefault = 1 diff --git a/contrib/go/_std_1.18/src/runtime/internal/syscall/asm_linux_amd64.s b/contrib/go/_std_1.18/src/runtime/internal/syscall/asm_linux_amd64.s new file mode 100644 index 0000000000..961d9bd640 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/syscall/asm_linux_amd64.s @@ -0,0 +1,33 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr) +// +// Syscall # in AX, args in DI SI DX R10 R8 R9, return in AX DX. +// +// Note that this differs from "standard" ABI convention, which would pass 4th +// arg in CX, not R10. +TEXT ·Syscall6(SB),NOSPLIT,$0-80 + MOVQ num+0(FP), AX // syscall entry + MOVQ a1+8(FP), DI + MOVQ a2+16(FP), SI + MOVQ a3+24(FP), DX + MOVQ a4+32(FP), R10 + MOVQ a5+40(FP), R8 + MOVQ a6+48(FP), R9 + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS ok + MOVQ $-1, r1+56(FP) + MOVQ $0, r2+64(FP) + NEGQ AX + MOVQ AX, errno+72(FP) + RET +ok: + MOVQ AX, r1+56(FP) + MOVQ DX, r2+64(FP) + MOVQ $0, errno+72(FP) + RET diff --git a/contrib/go/_std_1.18/src/runtime/internal/syscall/syscall_linux.go b/contrib/go/_std_1.18/src/runtime/internal/syscall/syscall_linux.go new file mode 100644 index 0000000000..06d5f21e7c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/internal/syscall/syscall_linux.go @@ -0,0 +1,12 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syscall provides the syscall primitives required for the runtime. +package syscall + +// TODO(https://go.dev/issue/51087): This package is incomplete and currently +// only contains very minimal support for Linux. + +// Syscall6 calls system call number 'num' with arguments a1-6. 
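The Linux amd64 assembly for Syscall6 above relies on the kernel's convention of reporting failure by returning a small negative value (-errno) in AX; the CMPQ against $0xfffffffffffff001 is what detects that range and turns it into a positive errno result. The public syscall package decodes raw results the same way, which can be observed from ordinary user code (a linux-only sketch):

//go:build linux

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// A call that succeeds: getpid cannot fail, so errno comes back as 0.
	pid, _, errno := syscall.Syscall(syscall.SYS_GETPID, 0, 0, 0)
	fmt.Println(pid > 0, errno == 0) // true true

	// A call that fails: closing an invalid descriptor makes the kernel
	// return -EBADF, which is handed back as errno rather than as a huge r1.
	_, _, errno = syscall.Syscall(syscall.SYS_CLOSE, ^uintptr(0), 0, 0)
	fmt.Println(errno == syscall.EBADF) // true
}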
+func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr) diff --git a/contrib/go/_std_1.18/src/runtime/lfstack.go b/contrib/go/_std_1.18/src/runtime/lfstack.go new file mode 100644 index 0000000000..406561a275 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lfstack.go @@ -0,0 +1,67 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lock-free stack. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// lfstack is the head of a lock-free stack. +// +// The zero value of lfstack is an empty list. +// +// This stack is intrusive. Nodes must embed lfnode as the first field. +// +// The stack does not keep GC-visible pointers to nodes, so the caller +// is responsible for ensuring the nodes are not garbage collected +// (typically by allocating them from manually-managed memory). +type lfstack uint64 + +func (head *lfstack) push(node *lfnode) { + node.pushcnt++ + new := lfstackPack(node, node.pushcnt) + if node1 := lfstackUnpack(new); node1 != node { + print("runtime: lfstack.push invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n") + throw("lfstack.push") + } + for { + old := atomic.Load64((*uint64)(head)) + node.next = old + if atomic.Cas64((*uint64)(head), old, new) { + break + } + } +} + +func (head *lfstack) pop() unsafe.Pointer { + for { + old := atomic.Load64((*uint64)(head)) + if old == 0 { + return nil + } + node := lfstackUnpack(old) + next := atomic.Load64(&node.next) + if atomic.Cas64((*uint64)(head), old, next) { + return unsafe.Pointer(node) + } + } +} + +func (head *lfstack) empty() bool { + return atomic.Load64((*uint64)(head)) == 0 +} + +// lfnodeValidate panics if node is not a valid address for use with +// lfstack.push. This only needs to be called when node is allocated. +func lfnodeValidate(node *lfnode) { + if lfstackUnpack(lfstackPack(node, ^uintptr(0))) != node { + printlock() + println("runtime: bad lfnode address", hex(uintptr(unsafe.Pointer(node)))) + throw("bad lfnode address") + } +} diff --git a/contrib/go/_std_1.18/src/runtime/lfstack_64bit.go b/contrib/go/_std_1.18/src/runtime/lfstack_64bit.go new file mode 100644 index 0000000000..3f0e480897 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lfstack_64bit.go @@ -0,0 +1,58 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm + +package runtime + +import "unsafe" + +const ( + // addrBits is the number of bits needed to represent a virtual address. + // + // See heapAddrBits for a table of address space sizes on + // various architectures. 48 bits is enough for all + // architectures except s390x. + // + // On AMD64, virtual addresses are 48-bit (or 57-bit) numbers sign extended to 64. + // We shift the address left 16 to eliminate the sign extended part and make + // room in the bottom for the count. + // + // On s390x, virtual addresses are 64-bit. There's not much we + // can do about this, so we just hope that the kernel doesn't + // get to really high addresses and panic if it does. + addrBits = 48 + + // In addition to the 16 bits taken from the top, we can take 3 from the + // bottom, because node must be pointer-aligned, giving a total of 19 bits + // of count. 
+ cntBits = 64 - addrBits + 3 + + // On AIX, 64-bit addresses are split into 36-bit segment number and 28-bit + // offset in segment. Segment numbers in the range 0x0A0000000-0x0AFFFFFFF(LSA) + // are available for mmap. + // We assume all lfnode addresses are from memory allocated with mmap. + // We use one bit to distinguish between the two ranges. + aixAddrBits = 57 + aixCntBits = 64 - aixAddrBits + 3 +) + +func lfstackPack(node *lfnode, cnt uintptr) uint64 { + if GOARCH == "ppc64" && GOOS == "aix" { + return uint64(uintptr(unsafe.Pointer(node)))<<(64-aixAddrBits) | uint64(cnt&(1<<aixCntBits-1)) + } + return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1)) +} + +func lfstackUnpack(val uint64) *lfnode { + if GOARCH == "amd64" { + // amd64 systems can place the stack above the VA hole, so we need to sign extend + // val before unpacking. + return (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> cntBits << 3))) + } + if GOARCH == "ppc64" && GOOS == "aix" { + return (*lfnode)(unsafe.Pointer(uintptr((val >> aixCntBits << 3) | 0xa<<56))) + } + return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3))) +} diff --git a/contrib/go/_std_1.18/src/runtime/lock_futex.go b/contrib/go/_std_1.18/src/runtime/lock_futex.go new file mode 100644 index 0000000000..575df7a1d5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lock_futex.go @@ -0,0 +1,245 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build dragonfly || freebsd || linux + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// This implementation depends on OS-specific implementations of +// +// futexsleep(addr *uint32, val uint32, ns int64) +// Atomically, +// if *addr == val { sleep } +// Might be woken up spuriously; that's allowed. +// Don't sleep longer than ns; ns < 0 means forever. +// +// futexwakeup(addr *uint32, cnt uint32) +// If any procs are sleeping on addr, wake up at most cnt. + +const ( + mutex_unlocked = 0 + mutex_locked = 1 + mutex_sleeping = 2 + + active_spin = 4 + active_spin_cnt = 30 + passive_spin = 1 +) + +// Possible lock states are mutex_unlocked, mutex_locked and mutex_sleeping. +// mutex_sleeping means that there is presumably at least one sleeping thread. +// Note that there can be spinning threads during all states - they do not +// affect mutex's state. + +// We use the uintptr mutex.key and note.key as a uint32. +//go:nosplit +func key32(p *uintptr) *uint32 { + return (*uint32)(unsafe.Pointer(p)) +} + +func lock(l *mutex) { + lockWithRank(l, getLockRank(l)) +} + +func lock2(l *mutex) { + gp := getg() + + if gp.m.locks < 0 { + throw("runtime·lock: lock count") + } + gp.m.locks++ + + // Speculative grab for lock. + v := atomic.Xchg(key32(&l.key), mutex_locked) + if v == mutex_unlocked { + return + } + + // wait is either MUTEX_LOCKED or MUTEX_SLEEPING + // depending on whether there is a thread sleeping + // on this mutex. If we ever change l->key from + // MUTEX_SLEEPING to some other value, we must be + // careful to change it back to MUTEX_SLEEPING before + // returning, to ensure that the sleeping thread gets + // its wakeup call. + wait := v + + // On uniprocessors, no point spinning. + // On multiprocessors, spin for ACTIVE_SPIN attempts. + spin := 0 + if ncpu > 1 { + spin = active_spin + } + for { + // Try for lock, spinning. 
+ for i := 0; i < spin; i++ { + for l.key == mutex_unlocked { + if atomic.Cas(key32(&l.key), mutex_unlocked, wait) { + return + } + } + procyield(active_spin_cnt) + } + + // Try for lock, rescheduling. + for i := 0; i < passive_spin; i++ { + for l.key == mutex_unlocked { + if atomic.Cas(key32(&l.key), mutex_unlocked, wait) { + return + } + } + osyield() + } + + // Sleep. + v = atomic.Xchg(key32(&l.key), mutex_sleeping) + if v == mutex_unlocked { + return + } + wait = mutex_sleeping + futexsleep(key32(&l.key), mutex_sleeping, -1) + } +} + +func unlock(l *mutex) { + unlockWithRank(l) +} + +func unlock2(l *mutex) { + v := atomic.Xchg(key32(&l.key), mutex_unlocked) + if v == mutex_unlocked { + throw("unlock of unlocked lock") + } + if v == mutex_sleeping { + futexwakeup(key32(&l.key), 1) + } + + gp := getg() + gp.m.locks-- + if gp.m.locks < 0 { + throw("runtime·unlock: lock count") + } + if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack + gp.stackguard0 = stackPreempt + } +} + +// One-time notifications. +func noteclear(n *note) { + n.key = 0 +} + +func notewakeup(n *note) { + old := atomic.Xchg(key32(&n.key), 1) + if old != 0 { + print("notewakeup - double wakeup (", old, ")\n") + throw("notewakeup - double wakeup") + } + futexwakeup(key32(&n.key), 1) +} + +func notesleep(n *note) { + gp := getg() + if gp != gp.m.g0 { + throw("notesleep not on g0") + } + ns := int64(-1) + if *cgo_yield != nil { + // Sleep for an arbitrary-but-moderate interval to poll libc interceptors. + ns = 10e6 + } + for atomic.Load(key32(&n.key)) == 0 { + gp.m.blocked = true + futexsleep(key32(&n.key), 0, ns) + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + gp.m.blocked = false + } +} + +// May run with m.p==nil if called from notetsleep, so write barriers +// are not allowed. +// +//go:nosplit +//go:nowritebarrier +func notetsleep_internal(n *note, ns int64) bool { + gp := getg() + + if ns < 0 { + if *cgo_yield != nil { + // Sleep for an arbitrary-but-moderate interval to poll libc interceptors. 
+ ns = 10e6 + } + for atomic.Load(key32(&n.key)) == 0 { + gp.m.blocked = true + futexsleep(key32(&n.key), 0, ns) + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + gp.m.blocked = false + } + return true + } + + if atomic.Load(key32(&n.key)) != 0 { + return true + } + + deadline := nanotime() + ns + for { + if *cgo_yield != nil && ns > 10e6 { + ns = 10e6 + } + gp.m.blocked = true + futexsleep(key32(&n.key), 0, ns) + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + gp.m.blocked = false + if atomic.Load(key32(&n.key)) != 0 { + break + } + now := nanotime() + if now >= deadline { + break + } + ns = deadline - now + } + return atomic.Load(key32(&n.key)) != 0 +} + +func notetsleep(n *note, ns int64) bool { + gp := getg() + if gp != gp.m.g0 && gp.m.preemptoff != "" { + throw("notetsleep not on g0") + } + + return notetsleep_internal(n, ns) +} + +// same as runtime·notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +func notetsleepg(n *note, ns int64) bool { + gp := getg() + if gp == gp.m.g0 { + throw("notetsleepg on g0") + } + + entersyscallblock() + ok := notetsleep_internal(n, ns) + exitsyscall() + return ok +} + +func beforeIdle(int64, int64) (*g, bool) { + return nil, false +} + +func checkTimeouts() {} diff --git a/contrib/go/_std_1.18/src/runtime/lock_sema.go b/contrib/go/_std_1.18/src/runtime/lock_sema.go new file mode 100644 index 0000000000..db36df1f37 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lock_sema.go @@ -0,0 +1,304 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin || netbsd || openbsd || plan9 || solaris || windows + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// This implementation depends on OS-specific implementations of +// +// func semacreate(mp *m) +// Create a semaphore for mp, if it does not already have one. +// +// func semasleep(ns int64) int32 +// If ns < 0, acquire m's semaphore and return 0. +// If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds. +// Return 0 if the semaphore was acquired, -1 if interrupted or timed out. +// +// func semawakeup(mp *m) +// Wake up mp, which is or will soon be sleeping on its semaphore. +// +const ( + locked uintptr = 1 + + active_spin = 4 + active_spin_cnt = 30 + passive_spin = 1 +) + +func lock(l *mutex) { + lockWithRank(l, getLockRank(l)) +} + +func lock2(l *mutex) { + gp := getg() + if gp.m.locks < 0 { + throw("runtime·lock: lock count") + } + gp.m.locks++ + + // Speculative grab for lock. + if atomic.Casuintptr(&l.key, 0, locked) { + return + } + semacreate(gp.m) + + // On uniprocessor's, no point spinning. + // On multiprocessors, spin for ACTIVE_SPIN attempts. + spin := 0 + if ncpu > 1 { + spin = active_spin + } +Loop: + for i := 0; ; i++ { + v := atomic.Loaduintptr(&l.key) + if v&locked == 0 { + // Unlocked. Try to lock. + if atomic.Casuintptr(&l.key, v, v|locked) { + return + } + i = 0 + } + if i < spin { + procyield(active_spin_cnt) + } else if i < spin+passive_spin { + osyield() + } else { + // Someone else has it. + // l->waitm points to a linked list of M's waiting + // for this lock, chained through m->nextwaitm. + // Queue this M. 
+ for { + gp.m.nextwaitm = muintptr(v &^ locked) + if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) { + break + } + v = atomic.Loaduintptr(&l.key) + if v&locked == 0 { + continue Loop + } + } + if v&locked != 0 { + // Queued. Wait. + semasleep(-1) + i = 0 + } + } + } +} + +func unlock(l *mutex) { + unlockWithRank(l) +} + +//go:nowritebarrier +// We might not be holding a p in this code. +func unlock2(l *mutex) { + gp := getg() + var mp *m + for { + v := atomic.Loaduintptr(&l.key) + if v == locked { + if atomic.Casuintptr(&l.key, locked, 0) { + break + } + } else { + // Other M's are waiting for the lock. + // Dequeue an M. + mp = muintptr(v &^ locked).ptr() + if atomic.Casuintptr(&l.key, v, uintptr(mp.nextwaitm)) { + // Dequeued an M. Wake it. + semawakeup(mp) + break + } + } + } + gp.m.locks-- + if gp.m.locks < 0 { + throw("runtime·unlock: lock count") + } + if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack + gp.stackguard0 = stackPreempt + } +} + +// One-time notifications. +func noteclear(n *note) { + if GOOS == "aix" { + // On AIX, semaphores might not synchronize the memory in some + // rare cases. See issue #30189. + atomic.Storeuintptr(&n.key, 0) + } else { + n.key = 0 + } +} + +func notewakeup(n *note) { + var v uintptr + for { + v = atomic.Loaduintptr(&n.key) + if atomic.Casuintptr(&n.key, v, locked) { + break + } + } + + // Successfully set waitm to locked. + // What was it before? + switch { + case v == 0: + // Nothing was waiting. Done. + case v == locked: + // Two notewakeups! Not allowed. + throw("notewakeup - double wakeup") + default: + // Must be the waiting m. Wake it up. + semawakeup((*m)(unsafe.Pointer(v))) + } +} + +func notesleep(n *note) { + gp := getg() + if gp != gp.m.g0 { + throw("notesleep not on g0") + } + semacreate(gp.m) + if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) { + // Must be locked (got wakeup). + if n.key != locked { + throw("notesleep - waitm out of sync") + } + return + } + // Queued. Sleep. + gp.m.blocked = true + if *cgo_yield == nil { + semasleep(-1) + } else { + // Sleep for an arbitrary-but-moderate interval to poll libc interceptors. + const ns = 10e6 + for atomic.Loaduintptr(&n.key) == 0 { + semasleep(ns) + asmcgocall(*cgo_yield, nil) + } + } + gp.m.blocked = false +} + +//go:nosplit +func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool { + // gp and deadline are logically local variables, but they are written + // as parameters so that the stack space they require is charged + // to the caller. + // This reduces the nosplit footprint of notetsleep_internal. + gp = getg() + + // Register for wakeup on n->waitm. + if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) { + // Must be locked (got wakeup). + if n.key != locked { + throw("notetsleep - waitm out of sync") + } + return true + } + if ns < 0 { + // Queued. Sleep. + gp.m.blocked = true + if *cgo_yield == nil { + semasleep(-1) + } else { + // Sleep in arbitrary-but-moderate intervals to poll libc interceptors. + const ns = 10e6 + for semasleep(ns) < 0 { + asmcgocall(*cgo_yield, nil) + } + } + gp.m.blocked = false + return true + } + + deadline = nanotime() + ns + for { + // Registered. Sleep. + gp.m.blocked = true + if *cgo_yield != nil && ns > 10e6 { + ns = 10e6 + } + if semasleep(ns) >= 0 { + gp.m.blocked = false + // Acquired semaphore, semawakeup unregistered us. + // Done. 
+ return true + } + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + gp.m.blocked = false + // Interrupted or timed out. Still registered. Semaphore not acquired. + ns = deadline - nanotime() + if ns <= 0 { + break + } + // Deadline hasn't arrived. Keep sleeping. + } + + // Deadline arrived. Still registered. Semaphore not acquired. + // Want to give up and return, but have to unregister first, + // so that any notewakeup racing with the return does not + // try to grant us the semaphore when we don't expect it. + for { + v := atomic.Loaduintptr(&n.key) + switch v { + case uintptr(unsafe.Pointer(gp.m)): + // No wakeup yet; unregister if possible. + if atomic.Casuintptr(&n.key, v, 0) { + return false + } + case locked: + // Wakeup happened so semaphore is available. + // Grab it to avoid getting out of sync. + gp.m.blocked = true + if semasleep(-1) < 0 { + throw("runtime: unable to acquire - semaphore out of sync") + } + gp.m.blocked = false + return true + default: + throw("runtime: unexpected waitm - semaphore out of sync") + } + } +} + +func notetsleep(n *note, ns int64) bool { + gp := getg() + if gp != gp.m.g0 { + throw("notetsleep not on g0") + } + semacreate(gp.m) + return notetsleep_internal(n, ns, nil, 0) +} + +// same as runtime·notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +func notetsleepg(n *note, ns int64) bool { + gp := getg() + if gp == gp.m.g0 { + throw("notetsleepg on g0") + } + semacreate(gp.m) + entersyscallblock() + ok := notetsleep_internal(n, ns, nil, 0) + exitsyscall() + return ok +} + +func beforeIdle(int64, int64) (*g, bool) { + return nil, false +} + +func checkTimeouts() {} diff --git a/contrib/go/_std_1.18/src/runtime/lockrank.go b/contrib/go/_std_1.18/src/runtime/lockrank.go new file mode 100644 index 0000000000..4a16bc0ddb --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lockrank.go @@ -0,0 +1,251 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file records the static ranks of the locks in the runtime. If a lock +// is not given a rank, then it is assumed to be a leaf lock, which means no other +// lock can be acquired while it is held. Therefore, leaf locks do not need to be +// given an explicit rank. We list all of the architecture-independent leaf locks +// for documentation purposes, but don't list any of the architecture-dependent +// locks (which are all leaf locks). debugLock is ignored for ranking, since it is used +// when printing out lock ranking errors. +// +// lockInit(l *mutex, rank int) is used to set the rank of lock before it is used. +// If there is no clear place to initialize a lock, then the rank of a lock can be +// specified during the lock call itself via lockWithrank(l *mutex, rank int). +// +// Besides the static lock ranking (which is a total ordering of the locks), we +// also represent and enforce the actual partial order among the locks in the +// arcs[] array below. That is, if it is possible that lock B can be acquired when +// lock A is the previous acquired lock that is still held, then there should be +// an entry for A in arcs[B][]. We will currently fail not only if the total order +// (the lock ranking) is violated, but also if there is a missing entry in the +// partial order. + +package runtime + +type lockRank int + +// Constants representing the lock rank of the architecture-independent locks in +// the runtime. 
Locks with lower rank must be taken before locks with higher +// rank. +const ( + lockRankDummy lockRank = iota + + // Locks held above sched + lockRankSysmon + lockRankScavenge + lockRankForcegc + lockRankSweepWaiters + lockRankAssistQueue + lockRankCpuprof + lockRankSweep + + lockRankPollDesc + lockRankSched + lockRankDeadlock + lockRankAllg + lockRankAllp + + lockRankTimers // Multiple timers locked simultaneously in destroy() + lockRankItab + lockRankReflectOffs + lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs() + lockRankTraceBuf + lockRankFin + lockRankNotifyList + lockRankTraceStrings + lockRankMspanSpecial + lockRankProf + lockRankGcBitsArenas + lockRankRoot + lockRankTrace + lockRankTraceStackTab + lockRankNetpollInit + + lockRankRwmutexW + lockRankRwmutexR + + lockRankSpanSetSpine + lockRankGscan + lockRankStackpool + lockRankStackLarge + lockRankDefer + lockRankSudog + + // Memory-related non-leaf locks + lockRankWbufSpans + lockRankMheap + lockRankMheapSpecial + + // Memory-related leaf locks + lockRankGlobalAlloc + lockRankPageAllocScav + + // Other leaf locks + lockRankGFree + // Generally, hchan must be acquired before gscan. But in one specific + // case (in syncadjustsudogs from markroot after the g has been suspended + // by suspendG), we allow gscan to be acquired, and then an hchan lock. To + // allow this case, we get this lockRankHchanLeaf rank in + // syncadjustsudogs(), rather than lockRankHchan. By using this special + // rank, we don't allow any further locks to be acquired other than more + // hchan locks. + lockRankHchanLeaf + lockRankPanic + + // Leaf locks with no dependencies, so these constants are not actually used anywhere. + // There are other architecture-dependent leaf locks as well. + lockRankNewmHandoff + lockRankDebugPtrmask + lockRankFaketimeState + lockRankTicks + lockRankRaceFini + lockRankPollCache + lockRankDebug +) + +// lockRankLeafRank is the rank of lock that does not have a declared rank, and hence is +// a leaf lock. 
+const lockRankLeafRank lockRank = 1000 + +// lockNames gives the names associated with each of the above ranks +var lockNames = []string{ + lockRankDummy: "", + + lockRankSysmon: "sysmon", + lockRankScavenge: "scavenge", + lockRankForcegc: "forcegc", + lockRankSweepWaiters: "sweepWaiters", + lockRankAssistQueue: "assistQueue", + lockRankCpuprof: "cpuprof", + lockRankSweep: "sweep", + + lockRankPollDesc: "pollDesc", + lockRankSched: "sched", + lockRankDeadlock: "deadlock", + lockRankAllg: "allg", + lockRankAllp: "allp", + + lockRankTimers: "timers", + lockRankItab: "itab", + lockRankReflectOffs: "reflectOffs", + + lockRankHchan: "hchan", + lockRankTraceBuf: "traceBuf", + lockRankFin: "fin", + lockRankNotifyList: "notifyList", + lockRankTraceStrings: "traceStrings", + lockRankMspanSpecial: "mspanSpecial", + lockRankProf: "prof", + lockRankGcBitsArenas: "gcBitsArenas", + lockRankRoot: "root", + lockRankTrace: "trace", + lockRankTraceStackTab: "traceStackTab", + lockRankNetpollInit: "netpollInit", + + lockRankRwmutexW: "rwmutexW", + lockRankRwmutexR: "rwmutexR", + + lockRankSpanSetSpine: "spanSetSpine", + lockRankGscan: "gscan", + lockRankStackpool: "stackpool", + lockRankStackLarge: "stackLarge", + lockRankDefer: "defer", + lockRankSudog: "sudog", + + lockRankWbufSpans: "wbufSpans", + lockRankMheap: "mheap", + lockRankMheapSpecial: "mheapSpecial", + + lockRankGlobalAlloc: "globalAlloc.mutex", + lockRankPageAllocScav: "pageAlloc.scav.lock", + + lockRankGFree: "gFree", + lockRankHchanLeaf: "hchanLeaf", + lockRankPanic: "panic", + + lockRankNewmHandoff: "newmHandoff.lock", + lockRankDebugPtrmask: "debugPtrmask.lock", + lockRankFaketimeState: "faketimeState.lock", + lockRankTicks: "ticks.lock", + lockRankRaceFini: "raceFiniLock", + lockRankPollCache: "pollCache.lock", + lockRankDebug: "debugLock", +} + +func (rank lockRank) String() string { + if rank == 0 { + return "UNKNOWN" + } + if rank == lockRankLeafRank { + return "LEAF" + } + return lockNames[rank] +} + +// lockPartialOrder is a partial order among the various lock types, listing the +// immediate ordering that has actually been observed in the runtime. Each entry +// (which corresponds to a particular lock rank) specifies the list of locks +// that can already be held immediately "above" it. +// +// So, for example, the lockRankSched entry shows that all the locks preceding +// it in rank can actually be held. The allp lock shows that only the sysmon or +// sched lock can be held immediately above it when it is acquired. 
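The lockPartialOrder table described above records, for each rank, the set of ranks that may legitimately still be held when a lock of that rank is acquired. A toy checker in the same spirit, with two made-up ranks standing in for the real table (an illustration of the idea, not the runtime's actual mechanism):

package main

import "fmt"

type rank int

const (
	rankSched rank = iota
	rankAllg
)

// partialOrder lists, for each rank, the ranks that may be held when a lock
// of that rank is taken, mirroring the shape of lockPartialOrder.
var partialOrder = map[rank][]rank{
	rankSched: {},
	rankAllg:  {rankSched}, // allg may be taken while sched is held
}

func checkAcquire(held, next rank) error {
	for _, allowed := range partialOrder[next] {
		if allowed == held {
			return nil
		}
	}
	return fmt.Errorf("lock ordering violation: rank %d acquired while holding rank %d", next, held)
}

func main() {
	fmt.Println(checkAcquire(rankSched, rankAllg)) // <nil>
	fmt.Println(checkAcquire(rankAllg, rankSched)) // lock ordering violation: ...
}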
+var lockPartialOrder [][]lockRank = [][]lockRank{ + lockRankDummy: {}, + lockRankSysmon: {}, + lockRankScavenge: {lockRankSysmon}, + lockRankForcegc: {lockRankSysmon}, + lockRankSweepWaiters: {}, + lockRankAssistQueue: {}, + lockRankCpuprof: {}, + lockRankSweep: {}, + lockRankPollDesc: {}, + lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc}, + lockRankDeadlock: {lockRankDeadlock}, + lockRankAllg: {lockRankSysmon, lockRankSched}, + lockRankAllp: {lockRankSysmon, lockRankSched}, + lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankPollDesc, lockRankSched, lockRankAllp, lockRankTimers}, + lockRankItab: {}, + lockRankReflectOffs: {lockRankItab}, + lockRankHchan: {lockRankScavenge, lockRankSweep, lockRankHchan}, + lockRankTraceBuf: {lockRankSysmon, lockRankScavenge}, + lockRankFin: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf}, + lockRankNotifyList: {}, + lockRankTraceStrings: {lockRankTraceBuf}, + lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankRoot: {}, + lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot}, + lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankRoot, lockRankTrace}, + lockRankNetpollInit: {lockRankTimers}, + + lockRankRwmutexW: {}, + lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW}, + + lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine}, + lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, 
lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan}, + lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan}, + lockRankDefer: {}, + lockRankSudog: {lockRankHchan, lockRankNotifyList}, + lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, + lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans}, + lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, + lockRankPageAllocScav: {lockRankMheap}, + + lockRankGFree: {lockRankSched}, + lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf}, + lockRankPanic: {lockRankDeadlock}, // plus any other lock held on throw. + + lockRankNewmHandoff: {}, + lockRankDebugPtrmask: {}, + lockRankFaketimeState: {}, + lockRankTicks: {}, + lockRankRaceFini: {}, + lockRankPollCache: {}, + lockRankDebug: {}, +} diff --git a/contrib/go/_std_1.18/src/runtime/lockrank_off.go b/contrib/go/_std_1.18/src/runtime/lockrank_off.go new file mode 100644 index 0000000000..daa45b542d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/lockrank_off.go @@ -0,0 +1,64 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !goexperiment.staticlockranking + +package runtime + +// // lockRankStruct is embedded in mutex, but is empty when staticklockranking is +// disabled (the default) +type lockRankStruct struct { +} + +func lockInit(l *mutex, rank lockRank) { +} + +func getLockRank(l *mutex) lockRank { + return 0 +} + +func lockWithRank(l *mutex, rank lockRank) { + lock2(l) +} + +// This function may be called in nosplit context and thus must be nosplit. +//go:nosplit +func acquireLockRank(rank lockRank) { +} + +func unlockWithRank(l *mutex) { + unlock2(l) +} + +// This function may be called in nosplit context and thus must be nosplit. 
+//go:nosplit +func releaseLockRank(rank lockRank) { +} + +func lockWithRankMayAcquire(l *mutex, rank lockRank) { +} + +//go:nosplit +func assertLockHeld(l *mutex) { +} + +//go:nosplit +func assertRankHeld(r lockRank) { +} + +//go:nosplit +func worldStopped() { +} + +//go:nosplit +func worldStarted() { +} + +//go:nosplit +func assertWorldStopped() { +} + +//go:nosplit +func assertWorldStoppedOrLockHeld(l *mutex) { +} diff --git a/contrib/go/_std_1.18/src/runtime/malloc.go b/contrib/go/_std_1.18/src/runtime/malloc.go new file mode 100644 index 0000000000..6ed6ceade2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/malloc.go @@ -0,0 +1,1564 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Memory allocator. +// +// This was originally based on tcmalloc, but has diverged quite a bit. +// http://goog-perftools.sourceforge.net/doc/tcmalloc.html + +// The main allocator works in runs of pages. +// Small allocation sizes (up to and including 32 kB) are +// rounded to one of about 70 size classes, each of which +// has its own free set of objects of exactly that size. +// Any free page of memory can be split into a set of objects +// of one size class, which are then managed using a free bitmap. +// +// The allocator's data structures are: +// +// fixalloc: a free-list allocator for fixed-size off-heap objects, +// used to manage storage used by the allocator. +// mheap: the malloc heap, managed at page (8192-byte) granularity. +// mspan: a run of in-use pages managed by the mheap. +// mcentral: collects all spans of a given size class. +// mcache: a per-P cache of mspans with free space. +// mstats: allocation statistics. +// +// Allocating a small object proceeds up a hierarchy of caches: +// +// 1. Round the size up to one of the small size classes +// and look in the corresponding mspan in this P's mcache. +// Scan the mspan's free bitmap to find a free slot. +// If there is a free slot, allocate it. +// This can all be done without acquiring a lock. +// +// 2. If the mspan has no free slots, obtain a new mspan +// from the mcentral's list of mspans of the required size +// class that have free space. +// Obtaining a whole span amortizes the cost of locking +// the mcentral. +// +// 3. If the mcentral's mspan list is empty, obtain a run +// of pages from the mheap to use for the mspan. +// +// 4. If the mheap is empty or has no page runs large enough, +// allocate a new group of pages (at least 1MB) from the +// operating system. Allocating a large run of pages +// amortizes the cost of talking to the operating system. +// +// Sweeping an mspan and freeing objects on it proceeds up a similar +// hierarchy: +// +// 1. If the mspan is being swept in response to allocation, it +// is returned to the mcache to satisfy the allocation. +// +// 2. Otherwise, if the mspan still has allocated objects in it, +// it is placed on the mcentral free list for the mspan's size +// class. +// +// 3. Otherwise, if all objects in the mspan are free, the mspan's +// pages are returned to the mheap and the mspan is now dead. +// +// Allocating and freeing a large object uses the mheap +// directly, bypassing the mcache and mcentral. +// +// If mspan.needzero is false, then free object slots in the mspan are +// already zeroed. Otherwise if needzero is true, objects are zeroed as +// they are allocated. There are various benefits to delaying zeroing +// this way: +// +// 1. 
Stack frame allocation can avoid zeroing altogether. +// +// 2. It exhibits better temporal locality, since the program is +// probably about to write to the memory. +// +// 3. We don't zero pages that never get reused. + +// Virtual memory layout +// +// The heap consists of a set of arenas, which are 64MB on 64-bit and +// 4MB on 32-bit (heapArenaBytes). Each arena's start address is also +// aligned to the arena size. +// +// Each arena has an associated heapArena object that stores the +// metadata for that arena: the heap bitmap for all words in the arena +// and the span map for all pages in the arena. heapArena objects are +// themselves allocated off-heap. +// +// Since arenas are aligned, the address space can be viewed as a +// series of arena frames. The arena map (mheap_.arenas) maps from +// arena frame number to *heapArena, or nil for parts of the address +// space not backed by the Go heap. The arena map is structured as a +// two-level array consisting of a "L1" arena map and many "L2" arena +// maps; however, since arenas are large, on many architectures, the +// arena map consists of a single, large L2 map. +// +// The arena map covers the entire possible address space, allowing +// the Go heap to use any part of the address space. The allocator +// attempts to keep arenas contiguous so that large spans (and hence +// large objects) can cross arenas. + +package runtime + +import ( + "internal/goarch" + "internal/goos" + "runtime/internal/atomic" + "runtime/internal/math" + "runtime/internal/sys" + "unsafe" +) + +const ( + debugMalloc = false + + maxTinySize = _TinySize + tinySizeClass = _TinySizeClass + maxSmallSize = _MaxSmallSize + + pageShift = _PageShift + pageSize = _PageSize + pageMask = _PageMask + // By construction, single page spans of the smallest object class + // have the most objects per span. + maxObjsPerSpan = pageSize / 8 + + concurrentSweep = _ConcurrentSweep + + _PageSize = 1 << _PageShift + _PageMask = _PageSize - 1 + + // _64bit = 1 on 64-bit systems, 0 on 32-bit systems + _64bit = 1 << (^uintptr(0) >> 63) / 2 + + // Tiny allocator parameters, see "Tiny allocator" comment in malloc.go. + _TinySize = 16 + _TinySizeClass = int8(2) + + _FixAllocChunk = 16 << 10 // Chunk size for FixAlloc + + // Per-P, per order stack segment cache size. + _StackCacheSize = 32 * 1024 + + // Number of orders that get caching. Order 0 is FixedStack + // and each successive order is twice as large. + // We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks + // will be allocated directly. + // Since FixedStack is different on different systems, we + // must vary NumStackOrders to keep the same maximum cached size. + // OS | FixedStack | NumStackOrders + // -----------------+------------+--------------- + // linux/darwin/bsd | 2KB | 4 + // windows/32 | 4KB | 3 + // windows/64 | 8KB | 2 + // plan9 | 4KB | 3 + _NumStackOrders = 4 - goarch.PtrSize/4*goos.IsWindows - 1*goos.IsPlan9 + + // heapAddrBits is the number of bits in a heap address. On + // amd64, addresses are sign-extended beyond heapAddrBits. On + // other arches, they are zero-extended. + // + // On most 64-bit platforms, we limit this to 48 bits based on a + // combination of hardware and OS limitations. + // + // amd64 hardware limits addresses to 48 bits, sign-extended + // to 64 bits. Addresses where the top 16 bits are not either + // all 0 or all 1 are "non-canonical" and invalid. 
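Several of the constants in this file (_NumStackOrders above, heapAddrBits and logHeapArenaBytes below) are built with the same branch-free trick: goos.IsXxx and goarch.IsXxx are 0/1 integers, so multiplying an alternative by the right flag selects it at compile time without build tags. A toy version with hypothetical flag values (the names only stand in for the real internal/goos and internal/goarch constants):

package main

import "fmt"

// Hypothetical 0/1 flags standing in for goos.IsWindows, goos.IsPlan9 and
// goarch.PtrSize on a 64-bit, non-Windows, non-Plan 9 target.
const (
	isWindows = 0
	isPlan9   = 0
	ptrSize   = 8
)

// Each term contributes only when its flag is 1, mirroring how
// _NumStackOrders is computed above.
const numStackOrders = 4 - ptrSize/4*isWindows - 1*isPlan9

func main() {
	fmt.Println(numStackOrders) // 4
}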
Because of + // these "negative" addresses, we offset addresses by 1<<47 + // (arenaBaseOffset) on amd64 before computing indexes into + // the heap arenas index. In 2017, amd64 hardware added + // support for 57 bit addresses; however, currently only Linux + // supports this extension and the kernel will never choose an + // address above 1<<47 unless mmap is called with a hint + // address above 1<<47 (which we never do). + // + // arm64 hardware (as of ARMv8) limits user addresses to 48 + // bits, in the range [0, 1<<48). + // + // ppc64, mips64, and s390x support arbitrary 64 bit addresses + // in hardware. On Linux, Go leans on stricter OS limits. Based + // on Linux's processor.h, the user address space is limited as + // follows on 64-bit architectures: + // + // Architecture Name Maximum Value (exclusive) + // --------------------------------------------------------------------- + // amd64 TASK_SIZE_MAX 0x007ffffffff000 (47 bit addresses) + // arm64 TASK_SIZE_64 0x01000000000000 (48 bit addresses) + // ppc64{,le} TASK_SIZE_USER64 0x00400000000000 (46 bit addresses) + // mips64{,le} TASK_SIZE64 0x00010000000000 (40 bit addresses) + // s390x TASK_SIZE 1<<64 (64 bit addresses) + // + // These limits may increase over time, but are currently at + // most 48 bits except on s390x. On all architectures, Linux + // starts placing mmap'd regions at addresses that are + // significantly below 48 bits, so even if it's possible to + // exceed Go's 48 bit limit, it's extremely unlikely in + // practice. + // + // On 32-bit platforms, we accept the full 32-bit address + // space because doing so is cheap. + // mips32 only has access to the low 2GB of virtual memory, so + // we further limit it to 31 bits. + // + // On ios/arm64, although 64-bit pointers are presumably + // available, pointers are truncated to 33 bits in iOS <14. + // Furthermore, only the top 4 GiB of the address space are + // actually available to the application. In iOS >=14, more + // of the address space is available, and the OS can now + // provide addresses outside of those 33 bits. Pick 40 bits + // as a reasonable balance between address space usage by the + // page allocator, and flexibility for what mmap'd regions + // we'll accept for the heap. We can't just move to the full + // 48 bits because this uses too much address space for older + // iOS versions. + // TODO(mknyszek): Once iOS <14 is deprecated, promote ios/arm64 + // to a 48-bit address space like every other arm64 platform. + // + // WebAssembly currently has a limit of 4GB linear memory. + heapAddrBits = (_64bit*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64))*48 + (1-_64bit+goarch.IsWasm)*(32-(goarch.IsMips+goarch.IsMipsle)) + 40*goos.IsIos*goarch.IsArm64 + + // maxAlloc is the maximum size of an allocation. On 64-bit, + // it's theoretically possible to allocate 1<<heapAddrBits bytes. On + // 32-bit, however, this is one less than 1<<32 because the + // number of bytes in the address space doesn't actually fit + // in a uintptr. 
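The comment above explains that amd64 heap addresses are 48-bit values sign-extended to 64 bits, and that the runtime therefore offsets them by 1<<47 (arenaBaseOffset) before indexing the arena map, so the usable range becomes contiguous and zero-based. A hedged sketch of that remapping, assuming a 64-bit uintptr and the 64MB arena size used on linux/amd64 (the real index computation lives elsewhere in the runtime):

package main

import "fmt"

const (
	addrOffset     = uintptr(1) << 47 // the 1<<47 offset described above
	heapArenaBytes = 64 << 20         // 64MB arenas, as on linux/amd64
)

// arenaFrame shows the remapping: adding 1<<47 (mod 2^64) turns the
// sign-extended range [-1<<47, 1<<47) into the contiguous range [0, 1<<48),
// which then divides evenly into arena-sized frames.
func arenaFrame(p uintptr) uintptr {
	return (p + addrOffset) / heapArenaBytes
}

func main() {
	fmt.Println(arenaFrame(0xffff_8000_0000_0000)) // lowest "negative" address -> frame 0
	fmt.Println(arenaFrame(0))                     // address 0 -> frame 1<<21
	fmt.Println(arenaFrame(0x0000_7fff_ffff_ffff)) // highest address -> frame 1<<22 - 1
}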
+ maxAlloc = (1 << heapAddrBits) - (1-_64bit)*1 + + // The number of bits in a heap address, the size of heap + // arenas, and the L1 and L2 arena map sizes are related by + // + // (1 << addr bits) = arena size * L1 entries * L2 entries + // + // Currently, we balance these as follows: + // + // Platform Addr bits Arena size L1 entries L2 entries + // -------------- --------- ---------- ---------- ----------- + // */64-bit 48 64MB 1 4M (32MB) + // windows/64-bit 48 4MB 64 1M (8MB) + // ios/arm64 33 4MB 1 2048 (8KB) + // */32-bit 32 4MB 1 1024 (4KB) + // */mips(le) 31 4MB 1 512 (2KB) + + // heapArenaBytes is the size of a heap arena. The heap + // consists of mappings of size heapArenaBytes, aligned to + // heapArenaBytes. The initial heap mapping is one arena. + // + // This is currently 64MB on 64-bit non-Windows and 4MB on + // 32-bit and on Windows. We use smaller arenas on Windows + // because all committed memory is charged to the process, + // even if it's not touched. Hence, for processes with small + // heaps, the mapped arena space needs to be commensurate. + // This is particularly important with the race detector, + // since it significantly amplifies the cost of committed + // memory. + heapArenaBytes = 1 << logHeapArenaBytes + + // logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, + // prefer using heapArenaBytes where possible (we need the + // constant to compute some other constants). + logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64 + + // heapArenaBitmapBytes is the size of each heap arena's bitmap. + heapArenaBitmapBytes = heapArenaBytes / (goarch.PtrSize * 8 / 2) + + pagesPerArena = heapArenaBytes / pageSize + + // arenaL1Bits is the number of bits of the arena number + // covered by the first level arena map. + // + // This number should be small, since the first level arena + // map requires PtrSize*(1<<arenaL1Bits) of space in the + // binary's BSS. It can be zero, in which case the first level + // index is effectively unused. There is a performance benefit + // to this, since the generated code can be more efficient, + // but comes at the cost of having a large L2 mapping. + // + // We use the L1 map on 64-bit Windows because the arena size + // is small, but the address space is still 48 bits, and + // there's a high cost to having a large L2. + arenaL1Bits = 6 * (_64bit * goos.IsWindows) + + // arenaL2Bits is the number of bits of the arena number + // covered by the second level arena index. + // + // The size of each arena map allocation is proportional to + // 1<<arenaL2Bits, so it's important that this not be too + // large. 48 bits leads to 32MB arena index allocations, which + // is about the practical threshold. + arenaL2Bits = heapAddrBits - logHeapArenaBytes - arenaL1Bits + + // arenaL1Shift is the number of bits to shift an arena frame + // number by to compute an index into the first level arena map. + arenaL1Shift = arenaL2Bits + + // arenaBits is the total bits in a combined arena map index. + // This is split between the index into the L1 arena map and + // the L2 arena map. + arenaBits = arenaL1Bits + arenaL2Bits + + // arenaBaseOffset is the pointer value that corresponds to + // index 0 in the heap arena map. + // + // On amd64, the address space is 48 bits, sign extended to 64 + // bits. 
This offset lets us handle "negative" addresses (or + // high addresses if viewed as unsigned). + // + // On aix/ppc64, this offset allows to keep the heapAddrBits to + // 48. Otherwise, it would be 60 in order to handle mmap addresses + // (in range 0x0a00000000000000 - 0x0afffffffffffff). But in this + // case, the memory reserved in (s *pageAlloc).init for chunks + // is causing important slowdowns. + // + // On other platforms, the user address space is contiguous + // and starts at 0, so no offset is necessary. + arenaBaseOffset = 0xffff800000000000*goarch.IsAmd64 + 0x0a00000000000000*goos.IsAix + // A typed version of this constant that will make it into DWARF (for viewcore). + arenaBaseOffsetUintptr = uintptr(arenaBaseOffset) + + // Max number of threads to run garbage collection. + // 2, 3, and 4 are all plausible maximums depending + // on the hardware details of the machine. The garbage + // collector scales well to 32 cpus. + _MaxGcproc = 32 + + // minLegalPointer is the smallest possible legal pointer. + // This is the smallest possible architectural page size, + // since we assume that the first page is never mapped. + // + // This should agree with minZeroPage in the compiler. + minLegalPointer uintptr = 4096 +) + +// physPageSize is the size in bytes of the OS's physical pages. +// Mapping and unmapping operations must be done at multiples of +// physPageSize. +// +// This must be set by the OS init code (typically in osinit) before +// mallocinit. +var physPageSize uintptr + +// physHugePageSize is the size in bytes of the OS's default physical huge +// page size whose allocation is opaque to the application. It is assumed +// and verified to be a power of two. +// +// If set, this must be set by the OS init code (typically in osinit) before +// mallocinit. However, setting it at all is optional, and leaving the default +// value is always safe (though potentially less efficient). +// +// Since physHugePageSize is always assumed to be a power of two, +// physHugePageShift is defined as physHugePageSize == 1 << physHugePageShift. +// The purpose of physHugePageShift is to avoid doing divisions in +// performance critical functions. +var ( + physHugePageSize uintptr + physHugePageShift uint +) + +// OS memory management abstraction layer +// +// Regions of the address space managed by the runtime may be in one of four +// states at any given time: +// 1) None - Unreserved and unmapped, the default state of any region. +// 2) Reserved - Owned by the runtime, but accessing it would cause a fault. +// Does not count against the process' memory footprint. +// 3) Prepared - Reserved, intended not to be backed by physical memory (though +// an OS may implement this lazily). Can transition efficiently to +// Ready. Accessing memory in such a region is undefined (may +// fault, may give back unexpected zeroes, etc.). +// 4) Ready - may be accessed safely. +// +// This set of states is more than is strictly necessary to support all the +// currently supported platforms. One could get by with just None, Reserved, and +// Ready. However, the Prepared state gives us flexibility for performance +// purposes. For example, on POSIX-y operating systems, Reserved is usually a +// private anonymous mmap'd region with PROT_NONE set, and to transition +// to Ready would require setting PROT_READ|PROT_WRITE. However the +// underspecification of Prepared lets us use just MADV_FREE to transition from +// Ready to Prepared. 
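A minimal sketch of the offsetting arithmetic described above for amd64: adding 1<<47 (the same thing, modulo 2^64, as subtracting arenaBaseOffset = 0xffff800000000000) folds the sign-extended "negative" half of the address space under the positive half, so every valid address lands in a contiguous [0, 1<<48) range that a flat arena index can cover. The concrete addresses below are made up.

package main

import "fmt"

// Illustrative only: folding amd64's sign-extended 48-bit address space into
// one contiguous range by offsetting with 1<<47 (modular arithmetic on uint64).
func main() {
    const offset uint64 = 1 << 47

    addrs := []uint64{
        0x0000000000001000, // low "positive" address
        0x00007fffffffffff, // top of the lower half
        0xffff800000000000, // bottom of the sign-extended ("negative") half
        0xffffffffffffffff, // top of the address space
    }
    for _, a := range addrs {
        folded := a + offset // wraps modulo 2^64 for the upper half
        fmt.Printf("addr %#016x -> offset %#016x (fits in 48 bits: %v)\n",
            a, folded, folded < 1<<48)
    }
}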
Thus with the Prepared state we can set the permission +// bits just once early on, we can efficiently tell the OS that it's free to +// take pages away from us when we don't strictly need them. +// +// For each OS there is a common set of helpers defined that transition +// memory regions between these states. The helpers are as follows: +// +// sysAlloc transitions an OS-chosen region of memory from None to Ready. +// More specifically, it obtains a large chunk of zeroed memory from the +// operating system, typically on the order of a hundred kilobytes +// or a megabyte. This memory is always immediately available for use. +// +// sysFree transitions a memory region from any state to None. Therefore, it +// returns memory unconditionally. It is used if an out-of-memory error has been +// detected midway through an allocation or to carve out an aligned section of +// the address space. It is okay if sysFree is a no-op only if sysReserve always +// returns a memory region aligned to the heap allocator's alignment +// restrictions. +// +// sysReserve transitions a memory region from None to Reserved. It reserves +// address space in such a way that it would cause a fatal fault upon access +// (either via permissions or not committing the memory). Such a reservation is +// thus never backed by physical memory. +// If the pointer passed to it is non-nil, the caller wants the +// reservation there, but sysReserve can still choose another +// location if that one is unavailable. +// NOTE: sysReserve returns OS-aligned memory, but the heap allocator +// may use larger alignment, so the caller must be careful to realign the +// memory obtained by sysReserve. +// +// sysMap transitions a memory region from Reserved to Prepared. It ensures the +// memory region can be efficiently transitioned to Ready. +// +// sysUsed transitions a memory region from Prepared to Ready. It notifies the +// operating system that the memory region is needed and ensures that the region +// may be safely accessed. This is typically a no-op on systems that don't have +// an explicit commit step and hard over-commit limits, but is critical on +// Windows, for example. +// +// sysUnused transitions a memory region from Ready to Prepared. It notifies the +// operating system that the physical pages backing this memory region are no +// longer needed and can be reused for other purposes. The contents of a +// sysUnused memory region are considered forfeit and the region must not be +// accessed again until sysUsed is called. +// +// sysFault transitions a memory region from Ready or Prepared to Reserved. It +// marks a region such that it will always fault if accessed. Used only for +// debugging the runtime. + +func mallocinit() { + if class_to_size[_TinySizeClass] != _TinySize { + throw("bad TinySizeClass") + } + + if heapArenaBitmapBytes&(heapArenaBitmapBytes-1) != 0 { + // heapBits expects modular arithmetic on bitmap + // addresses to work. + throw("heapArenaBitmapBytes not a power of 2") + } + + // Copy class sizes out for statistics table. + for i := range class_to_size { + memstats.by_size[i].size = uint32(class_to_size[i]) + } + + // Check physPageSize. + if physPageSize == 0 { + // The OS init code failed to fetch the physical page size. 
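The four region states and the helper transitions described above can be summarized as a small state machine. This is only a toy model of those transitions, not the platform-specific implementations; sysAlloc, which goes straight from None to Ready, is noted in a comment rather than stepped through.

package main

import "fmt"

// Toy model of the region states and helper transitions described above.
type regionState int

const (
    stateNone regionState = iota
    stateReserved
    statePrepared
    stateReady
)

func (s regionState) String() string {
    return [...]string{"None", "Reserved", "Prepared", "Ready"}[s]
}

func main() {
    // sysAlloc is the shortcut None -> Ready and is not shown here.
    s := stateNone
    step := func(name string, to regionState) {
        fmt.Printf("%-10s %-8s -> %s\n", name, s, to)
        s = to
    }
    step("sysReserve", stateReserved) // None -> Reserved
    step("sysMap", statePrepared)     // Reserved -> Prepared
    step("sysUsed", stateReady)       // Prepared -> Ready
    step("sysUnused", statePrepared)  // Ready -> Prepared
    step("sysFault", stateReserved)   // Ready or Prepared -> Reserved
    step("sysFree", stateNone)        // any state -> None
}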
+ throw("failed to get system page size") + } + if physPageSize > maxPhysPageSize { + print("system page size (", physPageSize, ") is larger than maximum page size (", maxPhysPageSize, ")\n") + throw("bad system page size") + } + if physPageSize < minPhysPageSize { + print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n") + throw("bad system page size") + } + if physPageSize&(physPageSize-1) != 0 { + print("system page size (", physPageSize, ") must be a power of 2\n") + throw("bad system page size") + } + if physHugePageSize&(physHugePageSize-1) != 0 { + print("system huge page size (", physHugePageSize, ") must be a power of 2\n") + throw("bad system huge page size") + } + if physHugePageSize > maxPhysHugePageSize { + // physHugePageSize is greater than the maximum supported huge page size. + // Don't throw here, like in the other cases, since a system configured + // in this way isn't wrong, we just don't have the code to support them. + // Instead, silently set the huge page size to zero. + physHugePageSize = 0 + } + if physHugePageSize != 0 { + // Since physHugePageSize is a power of 2, it suffices to increase + // physHugePageShift until 1<<physHugePageShift == physHugePageSize. + for 1<<physHugePageShift != physHugePageSize { + physHugePageShift++ + } + } + if pagesPerArena%pagesPerSpanRoot != 0 { + print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerSpanRoot (", pagesPerSpanRoot, ")\n") + throw("bad pagesPerSpanRoot") + } + if pagesPerArena%pagesPerReclaimerChunk != 0 { + print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n") + throw("bad pagesPerReclaimerChunk") + } + + // Initialize the heap. + mheap_.init() + mcache0 = allocmcache() + lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas) + lockInit(&proflock, lockRankProf) + lockInit(&globalAlloc.mutex, lockRankGlobalAlloc) + + // Create initial arena growth hints. + if goarch.PtrSize == 8 { + // On a 64-bit machine, we pick the following hints + // because: + // + // 1. Starting from the middle of the address space + // makes it easier to grow out a contiguous range + // without running in to some other mapping. + // + // 2. This makes Go heap addresses more easily + // recognizable when debugging. + // + // 3. Stack scanning in gccgo is still conservative, + // so it's important that addresses be distinguishable + // from other data. + // + // Starting at 0x00c0 means that the valid memory addresses + // will begin 0x00c0, 0x00c1, ... + // In little-endian, that's c0 00, c1 00, ... None of those are valid + // UTF-8 sequences, and they are otherwise as far away from + // ff (likely a common byte) as possible. If that fails, we try other 0xXXc0 + // addresses. An earlier attempt to use 0x11f8 caused out of memory errors + // on OS X during thread allocations. 0x00c0 causes conflicts with + // AddressSanitizer which reserves all memory up to 0x0100. + // These choices reduce the odds of a conservative garbage collector + // not collecting memory because some non-pointer block of memory + // had a bit pattern that matched a memory address. + // + // However, on arm64, we ignore all this advice above and slam the + // allocation at 0x40 << 32 because when using 4k pages with 3-level + // translation buffers, the user address space is limited to 39 bits + // On ios/arm64, the address space is even smaller. + // + // On AIX, mmaps starts at 0x0A00000000000000 for 64-bit. + // processes. 
+ for i := 0x7f; i >= 0; i-- { + var p uintptr + switch { + case raceenabled: + // The TSAN runtime requires the heap + // to be in the range [0x00c000000000, + // 0x00e000000000). + p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32) + if p >= uintptrMask&0x00e000000000 { + continue + } + case GOARCH == "arm64" && GOOS == "ios": + p = uintptr(i)<<40 | uintptrMask&(0x0013<<28) + case GOARCH == "arm64": + p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) + case GOOS == "aix": + if i == 0 { + // We don't use addresses directly after 0x0A00000000000000 + // to avoid collisions with others mmaps done by non-go programs. + continue + } + p = uintptr(i)<<40 | uintptrMask&(0xa0<<52) + default: + p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32) + } + hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc()) + hint.addr = p + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + } + } else { + // On a 32-bit machine, we're much more concerned + // about keeping the usable heap contiguous. + // Hence: + // + // 1. We reserve space for all heapArenas up front so + // they don't get interleaved with the heap. They're + // ~258MB, so this isn't too bad. (We could reserve a + // smaller amount of space up front if this is a + // problem.) + // + // 2. We hint the heap to start right above the end of + // the binary so we have the best chance of keeping it + // contiguous. + // + // 3. We try to stake out a reasonably large initial + // heap reservation. + + const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{}) + meta := uintptr(sysReserve(nil, arenaMetaSize)) + if meta != 0 { + mheap_.heapArenaAlloc.init(meta, arenaMetaSize, true) + } + + // We want to start the arena low, but if we're linked + // against C code, it's possible global constructors + // have called malloc and adjusted the process' brk. + // Query the brk so we can avoid trying to map the + // region over it (which will cause the kernel to put + // the region somewhere else, likely at a high + // address). + procBrk := sbrk0() + + // If we ask for the end of the data segment but the + // operating system requires a little more space + // before we can start allocating, it will give out a + // slightly higher pointer. Except QEMU, which is + // buggy, as usual: it won't adjust the pointer + // upward. So adjust it upward a little bit ourselves: + // 1/4 MB to get away from the running binary image. + p := firstmoduledata.end + if p < procBrk { + p = procBrk + } + if mheap_.heapArenaAlloc.next <= p && p < mheap_.heapArenaAlloc.end { + p = mheap_.heapArenaAlloc.end + } + p = alignUp(p+(256<<10), heapArenaBytes) + // Because we're worried about fragmentation on + // 32-bit, we try to make a large initial reservation. + arenaSizes := []uintptr{ + 512 << 20, + 256 << 20, + 128 << 20, + } + for _, arenaSize := range arenaSizes { + a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes) + if a != nil { + mheap_.arena.init(uintptr(a), size, false) + p = mheap_.arena.end // For hint below + break + } + } + hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc()) + hint.addr = p + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + } +} + +// sysAlloc allocates heap arena space for at least n bytes. The +// returned pointer is always heapArenaBytes-aligned and backed by +// h.arenas metadata. The returned size is always a multiple of +// heapArenaBytes. sysAlloc returns nil on failure. +// There is no corresponding free function. +// +// sysAlloc returns a memory region in the Reserved state. 
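For the default case of the hint loop above (no race detector, not arm64, not AIX), the hint addresses work out to the familiar 0x00c0-prefixed values. A small sketch that prints a few of them, purely for illustration:

package main

import "fmt"

// Default-case hint addresses from the loop above: uintptr(i)<<40 | 0x00c0<<32.
// The real loop walks i from 0x7f down to 0 and records every hint.
func main() {
    for _, i := range []uint64{0x7f, 0x7e, 0x01, 0x00} {
        p := i<<40 | 0x00c0<<32
        fmt.Printf("i=%#02x -> hint %#016x\n", i, p)
    }
}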
This region must +// be transitioned to Prepared and then Ready before use. +// +// h must be locked. +func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) { + assertLockHeld(&h.lock) + + n = alignUp(n, heapArenaBytes) + + // First, try the arena pre-reservation. + v = h.arena.alloc(n, heapArenaBytes, &memstats.heap_sys) + if v != nil { + size = n + goto mapped + } + + // Try to grow the heap at a hint address. + for h.arenaHints != nil { + hint := h.arenaHints + p := hint.addr + if hint.down { + p -= n + } + if p+n < p { + // We can't use this, so don't ask. + v = nil + } else if arenaIndex(p+n-1) >= 1<<arenaBits { + // Outside addressable heap. Can't use. + v = nil + } else { + v = sysReserve(unsafe.Pointer(p), n) + } + if p == uintptr(v) { + // Success. Update the hint. + if !hint.down { + p += n + } + hint.addr = p + size = n + break + } + // Failed. Discard this hint and try the next. + // + // TODO: This would be cleaner if sysReserve could be + // told to only return the requested address. In + // particular, this is already how Windows behaves, so + // it would simplify things there. + if v != nil { + sysFree(v, n, nil) + } + h.arenaHints = hint.next + h.arenaHintAlloc.free(unsafe.Pointer(hint)) + } + + if size == 0 { + if raceenabled { + // The race detector assumes the heap lives in + // [0x00c000000000, 0x00e000000000), but we + // just ran out of hints in this region. Give + // a nice failure. + throw("too many address space collisions for -race mode") + } + + // All of the hints failed, so we'll take any + // (sufficiently aligned) address the kernel will give + // us. + v, size = sysReserveAligned(nil, n, heapArenaBytes) + if v == nil { + return nil, 0 + } + + // Create new hints for extending this region. + hint := (*arenaHint)(h.arenaHintAlloc.alloc()) + hint.addr, hint.down = uintptr(v), true + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + hint = (*arenaHint)(h.arenaHintAlloc.alloc()) + hint.addr = uintptr(v) + size + hint.next, mheap_.arenaHints = mheap_.arenaHints, hint + } + + // Check for bad pointers or pointers we can't use. + { + var bad string + p := uintptr(v) + if p+size < p { + bad = "region exceeds uintptr range" + } else if arenaIndex(p) >= 1<<arenaBits { + bad = "base outside usable address space" + } else if arenaIndex(p+size-1) >= 1<<arenaBits { + bad = "end outside usable address space" + } + if bad != "" { + // This should be impossible on most architectures, + // but it would be really confusing to debug. + print("runtime: memory allocated by OS [", hex(p), ", ", hex(p+size), ") not in usable address space: ", bad, "\n") + throw("memory reservation exceeds address space limit") + } + } + + if uintptr(v)&(heapArenaBytes-1) != 0 { + throw("misrounded allocation in sysAlloc") + } + +mapped: + // Create arena metadata. + for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ { + l2 := h.arenas[ri.l1()] + if l2 == nil { + // Allocate an L2 arena map. 
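A sketch of how an address turns into an L1/L2 pair for the arena map, using the windows/64-bit shape from the earlier table (48 address bits, 4 MB arenas, arenaL1Bits = 6). The split of the combined index into top bits for L1 and low bits for L2 follows the arenaL1Shift = arenaL2Bits relationship described earlier; the code below is an illustration, ignores arenaBaseOffset for simplicity, and is not the runtime's arenaIndex.

package main

import "fmt"

// Illustrative only: splitting a combined arena index into L1 and L2 parts
// for a windows/64-bit-shaped arena map (arenaBaseOffset ignored here).
func main() {
    const (
        logHeapArenaBytes = 22 // 4 MB arenas
        heapAddrBits      = 48
        arenaL1Bits       = 6
        arenaL2Bits       = heapAddrBits - logHeapArenaBytes - arenaL1Bits
        arenaL1Shift      = arenaL2Bits
    )
    addr := uintptr(0x00c000000000)
    idx := addr >> logHeapArenaBytes
    l1 := idx >> arenaL1Shift
    l2 := idx & (1<<arenaL2Bits - 1)
    fmt.Printf("addr %#x -> arena index %#x (L1 entry %d, L2 entry %#x)\n", addr, idx, l1, l2)
}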
+ l2 = (*[1 << arenaL2Bits]*heapArena)(persistentalloc(unsafe.Sizeof(*l2), goarch.PtrSize, nil)) + if l2 == nil { + throw("out of memory allocating heap arena map") + } + atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri.l1()]), unsafe.Pointer(l2)) + } + + if l2[ri.l2()] != nil { + throw("arena already initialized") + } + var r *heapArena + r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys)) + if r == nil { + r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys)) + if r == nil { + throw("out of memory allocating heap arena metadata") + } + } + + // Add the arena to the arenas list. + if len(h.allArenas) == cap(h.allArenas) { + size := 2 * uintptr(cap(h.allArenas)) * goarch.PtrSize + if size == 0 { + size = physPageSize + } + newArray := (*notInHeap)(persistentalloc(size, goarch.PtrSize, &memstats.gcMiscSys)) + if newArray == nil { + throw("out of memory allocating allArenas") + } + oldSlice := h.allArenas + *(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / goarch.PtrSize)} + copy(h.allArenas, oldSlice) + // Do not free the old backing array because + // there may be concurrent readers. Since we + // double the array each time, this can lead + // to at most 2x waste. + } + h.allArenas = h.allArenas[:len(h.allArenas)+1] + h.allArenas[len(h.allArenas)-1] = ri + + // Store atomically just in case an object from the + // new heap arena becomes visible before the heap lock + // is released (which shouldn't happen, but there's + // little downside to this). + atomic.StorepNoWB(unsafe.Pointer(&l2[ri.l2()]), unsafe.Pointer(r)) + } + + // Tell the race detector about the new heap memory. + if raceenabled { + racemapshadow(v, size) + } + + return +} + +// sysReserveAligned is like sysReserve, but the returned pointer is +// aligned to align bytes. It may reserve either n or n+align bytes, +// so it returns the size that was reserved. +func sysReserveAligned(v unsafe.Pointer, size, align uintptr) (unsafe.Pointer, uintptr) { + // Since the alignment is rather large in uses of this + // function, we're not likely to get it by chance, so we ask + // for a larger region and remove the parts we don't need. + retries := 0 +retry: + p := uintptr(sysReserve(v, size+align)) + switch { + case p == 0: + return nil, 0 + case p&(align-1) == 0: + // We got lucky and got an aligned region, so we can + // use the whole thing. + return unsafe.Pointer(p), size + align + case GOOS == "windows": + // On Windows we can't release pieces of a + // reservation, so we release the whole thing and + // re-reserve the aligned sub-region. This may race, + // so we may have to try again. + sysFree(unsafe.Pointer(p), size+align, nil) + p = alignUp(p, align) + p2 := sysReserve(unsafe.Pointer(p), size) + if p != uintptr(p2) { + // Must have raced. Try again. + sysFree(p2, size, nil) + if retries++; retries == 100 { + throw("failed to allocate aligned heap memory; too many retries") + } + goto retry + } + // Success. + return p2, size + default: + // Trim off the unaligned parts. + pAligned := alignUp(p, align) + sysFree(unsafe.Pointer(p), pAligned-p, nil) + end := pAligned + size + endLen := (p + size + align) - end + if endLen > 0 { + sysFree(unsafe.Pointer(end), endLen, nil) + } + return unsafe.Pointer(pAligned), size + } +} + +// base address for all 0-byte allocations +var zerobase uintptr + +// nextFreeFast returns the next free object if one is quickly available. +// Otherwise it returns 0. 
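The default branch of sysReserveAligned above over-reserves by align bytes and then trims. The arithmetic can be sketched without any real mappings; alignUp here mirrors the usual round-up-to-a-power-of-two formula, the addresses are made up, and a 64-bit platform is assumed.

package main

import "fmt"

// Over-reserve-and-trim arithmetic from sysReserveAligned above, as pure
// arithmetic with made-up values.
func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

func main() {
    const (
        size  = uintptr(512 << 20) // requested reservation
        align = uintptr(64 << 20)  // heapArenaBytes-style alignment
    )
    p := uintptr(0x7f1234560000) // pretend the OS returned this
    pAligned := alignUp(p, align)
    head := pAligned - p                           // released before the aligned region
    tail := (p + size + align) - (pAligned + size) // released after it
    fmt.Printf("reserved [%#x, %#x)\n", p, p+size+align)
    fmt.Printf("kept     [%#x, %#x) head=%#x tail=%#x\n", pAligned, pAligned+size, head, tail)
}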
+func nextFreeFast(s *mspan) gclinkptr { + theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache? + if theBit < 64 { + result := s.freeindex + uintptr(theBit) + if result < s.nelems { + freeidx := result + 1 + if freeidx%64 == 0 && freeidx != s.nelems { + return 0 + } + s.allocCache >>= uint(theBit + 1) + s.freeindex = freeidx + s.allocCount++ + return gclinkptr(result*s.elemsize + s.base()) + } + } + return 0 +} + +// nextFree returns the next free object from the cached span if one is available. +// Otherwise it refills the cache with a span with an available object and +// returns that object along with a flag indicating that this was a heavy +// weight allocation. If it is a heavy weight allocation the caller must +// determine whether a new GC cycle needs to be started or if the GC is active +// whether this goroutine needs to assist the GC. +// +// Must run in a non-preemptible context since otherwise the owner of +// c could change. +func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) { + s = c.alloc[spc] + shouldhelpgc = false + freeIndex := s.nextFreeIndex() + if freeIndex == s.nelems { + // The span is full. + if uintptr(s.allocCount) != s.nelems { + println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) + throw("s.allocCount != s.nelems && freeIndex == s.nelems") + } + c.refill(spc) + shouldhelpgc = true + s = c.alloc[spc] + + freeIndex = s.nextFreeIndex() + } + + if freeIndex >= s.nelems { + throw("freeIndex is not valid") + } + + v = gclinkptr(freeIndex*s.elemsize + s.base()) + s.allocCount++ + if uintptr(s.allocCount) > s.nelems { + println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems) + throw("s.allocCount > s.nelems") + } + return +} + +// Allocate an object of size bytes. +// Small objects are allocated from the per-P cache's free lists. +// Large objects (> 32 kB) are allocated straight from the heap. +func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { + if gcphase == _GCmarktermination { + throw("mallocgc called with gcphase == _GCmarktermination") + } + + if size == 0 { + return unsafe.Pointer(&zerobase) + } + userSize := size + if asanenabled { + // Refer to ASAN runtime library, the malloc() function allocates extra memory, + // the redzone, around the user requested memory region. And the redzones are marked + // as unaddressable. We perform the same operations in Go to detect the overflows or + // underflows. + size += computeRZlog(size) + } + + if debug.malloc { + if debug.sbrk != 0 { + align := uintptr(16) + if typ != nil { + // TODO(austin): This should be just + // align = uintptr(typ.align) + // but that's only 4 on 32-bit platforms, + // even if there's a uint64 field in typ (see #599). + // This causes 64-bit atomic accesses to panic. + // Hence, we use stricter alignment that matches + // the normal allocator better. + if size&7 == 0 { + align = 8 + } else if size&3 == 0 { + align = 4 + } else if size&1 == 0 { + align = 2 + } else { + align = 1 + } + } + return persistentalloc(size, align, &memstats.other_sys) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single goroutine. + inittrace.allocs += 1 + } + } + + // assistG is the G to charge for this allocation, or nil if + // GC is not currently active. + var assistG *g + if gcBlackenEnabled != 0 { + // Charge the current user G for this allocation. 
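The allocCache scan in nextFreeFast above is a count-trailing-zeros trick over a bitmap of free slots. A toy version using math/bits (the portable counterpart of the runtime's sys.Ctz64) with a made-up cache value:

package main

import (
    "fmt"
    "math/bits"
)

// Toy version of the allocCache scan: a set bit means "slot is free", and
// counting trailing zeros finds the next free slot without a loop.
func main() {
    var allocCache uint64 = 0b10110000 // slots 4, 5, 7 free (relative to freeindex)
    freeindex := uint64(16)

    for i := 0; i < 4; i++ {
        theBit := bits.TrailingZeros64(allocCache)
        if theBit == 64 {
            fmt.Println("cache exhausted; would fall back to the slow path")
            return
        }
        result := freeindex + uint64(theBit)
        // Consume the slot the same way the fast path does: shift the cache
        // past the bit we used and advance freeindex.
        allocCache >>= uint(theBit + 1)
        freeindex = result + 1
        fmt.Printf("allocated slot %d\n", result)
    }
}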
+ assistG = getg() + if assistG.m.curg != nil { + assistG = assistG.m.curg + } + // Charge the allocation against the G. We'll account + // for internal fragmentation at the end of mallocgc. + assistG.gcAssistBytes -= int64(size) + + if assistG.gcAssistBytes < 0 { + // This G is in debt. Assist the GC to correct + // this before allocating. This must happen + // before disabling preemption. + gcAssistAlloc(assistG) + } + } + + // Set mp.mallocing to keep from being preempted by GC. + mp := acquirem() + if mp.mallocing != 0 { + throw("malloc deadlock") + } + if mp.gsignal == getg() { + throw("malloc during signal") + } + mp.mallocing = 1 + + shouldhelpgc := false + dataSize := userSize + c := getMCache(mp) + if c == nil { + throw("mallocgc called without a P or outside bootstrapping") + } + var span *mspan + var x unsafe.Pointer + noscan := typ == nil || typ.ptrdata == 0 + // In some cases block zeroing can profitably (for latency reduction purposes) + // be delayed till preemption is possible; delayedZeroing tracks that state. + delayedZeroing := false + if size <= maxSmallSize { + if noscan && size < maxTinySize { + // Tiny allocator. + // + // Tiny allocator combines several tiny allocation requests + // into a single memory block. The resulting memory block + // is freed when all subobjects are unreachable. The subobjects + // must be noscan (don't have pointers), this ensures that + // the amount of potentially wasted memory is bounded. + // + // Size of the memory block used for combining (maxTinySize) is tunable. + // Current setting is 16 bytes, which relates to 2x worst case memory + // wastage (when all but one subobjects are unreachable). + // 8 bytes would result in no wastage at all, but provides less + // opportunities for combining. + // 32 bytes provides more opportunities for combining, + // but can lead to 4x worst case wastage. + // The best case winning is 8x regardless of block size. + // + // Objects obtained from tiny allocator must not be freed explicitly. + // So when an object will be freed explicitly, we ensure that + // its size >= maxTinySize. + // + // SetFinalizer has a special case for objects potentially coming + // from tiny allocator, it such case it allows to set finalizers + // for an inner byte of a memory block. + // + // The main targets of tiny allocator are small strings and + // standalone escaping variables. On a json benchmark + // the allocator reduces number of allocations by ~12% and + // reduces heap size by ~20%. + off := c.tinyoffset + // Align tiny pointer for required (conservative) alignment. + if size&7 == 0 { + off = alignUp(off, 8) + } else if goarch.PtrSize == 4 && size == 12 { + // Conservatively align 12-byte objects to 8 bytes on 32-bit + // systems so that objects whose first field is a 64-bit + // value is aligned to 8 bytes and does not cause a fault on + // atomic access. See issue 37262. + // TODO(mknyszek): Remove this workaround if/when issue 36606 + // is resolved. + off = alignUp(off, 8) + } else if size&3 == 0 { + off = alignUp(off, 4) + } else if size&1 == 0 { + off = alignUp(off, 2) + } + if off+size <= maxTinySize && c.tiny != 0 { + // The object fits into existing tiny block. + x = unsafe.Pointer(c.tiny + off) + c.tinyoffset = off + size + c.tinyAllocs++ + mp.mallocing = 0 + releasem(mp) + return x + } + // Allocate a new maxTinySize block. 
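A sketch of the tiny-allocator bookkeeping just shown: round the current offset up to the allocation's natural alignment and serve it from the existing 16-byte block if it still fits, otherwise start a new block. No real memory is involved, and the fit check is simplified (the real code also checks c.tiny != 0).

package main

import "fmt"

// Tiny-allocator offset bookkeeping, as arithmetic only.
func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

const maxTinySize = 16

func main() {
    var tinyoffset uintptr // offset into the current tiny block
    for _, size := range []uintptr{5, 3, 8, 4} {
        off := tinyoffset
        switch {
        case size&7 == 0:
            off = alignUp(off, 8)
        case size&3 == 0:
            off = alignUp(off, 4)
        case size&1 == 0:
            off = alignUp(off, 2)
        }
        if off+size <= maxTinySize {
            fmt.Printf("size %2d: served at offset %2d of the current block\n", size, off)
            tinyoffset = off + size
        } else {
            fmt.Printf("size %2d: does not fit, a new 16-byte block would be allocated\n", size)
            tinyoffset = size // new block, this object at offset 0
        }
    }
}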
+ span = c.alloc[tinySpanClass] + v := nextFreeFast(span) + if v == 0 { + v, span, shouldhelpgc = c.nextFree(tinySpanClass) + } + x = unsafe.Pointer(v) + (*[2]uint64)(x)[0] = 0 + (*[2]uint64)(x)[1] = 0 + // See if we need to replace the existing tiny block with the new one + // based on amount of remaining free space. + if !raceenabled && (size < c.tinyoffset || c.tiny == 0) { + // Note: disabled when race detector is on, see comment near end of this function. + c.tiny = uintptr(x) + c.tinyoffset = size + } + size = maxTinySize + } else { + var sizeclass uint8 + if size <= smallSizeMax-8 { + sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)] + } else { + sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)] + } + size = uintptr(class_to_size[sizeclass]) + spc := makeSpanClass(sizeclass, noscan) + span = c.alloc[spc] + v := nextFreeFast(span) + if v == 0 { + v, span, shouldhelpgc = c.nextFree(spc) + } + x = unsafe.Pointer(v) + if needzero && span.needzero != 0 { + memclrNoHeapPointers(unsafe.Pointer(v), size) + } + } + } else { + shouldhelpgc = true + // For large allocations, keep track of zeroed state so that + // bulk zeroing can be happen later in a preemptible context. + span = c.allocLarge(size, noscan) + span.freeindex = 1 + span.allocCount = 1 + size = span.elemsize + x = unsafe.Pointer(span.base()) + if needzero && span.needzero != 0 { + if noscan { + delayedZeroing = true + } else { + memclrNoHeapPointers(x, size) + // We've in theory cleared almost the whole span here, + // and could take the extra step of actually clearing + // the whole thing. However, don't. Any GC bits for the + // uncleared parts will be zero, and it's just going to + // be needzero = 1 once freed anyway. + } + } + } + + var scanSize uintptr + if !noscan { + heapBitsSetType(uintptr(x), size, dataSize, typ) + if dataSize > typ.size { + // Array allocation. If there are any + // pointers, GC has to scan to the last + // element. + if typ.ptrdata != 0 { + scanSize = dataSize - typ.size + typ.ptrdata + } + } else { + scanSize = typ.ptrdata + } + c.scanAlloc += scanSize + } + + // Ensure that the stores above that initialize x to + // type-safe memory and set the heap bits occur before + // the caller can make x observable to the garbage + // collector. Otherwise, on weakly ordered machines, + // the garbage collector could follow a pointer to x, + // but see uninitialized memory or stale heap bits. + publicationBarrier() + + // Allocate black during GC. + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + if gcphase != _GCoff { + gcmarknewobject(span, uintptr(x), size, scanSize) + } + + if raceenabled { + racemalloc(x, size) + } + + if msanenabled { + msanmalloc(x, size) + } + + if asanenabled { + // We should only read/write the memory with the size asked by the user. + // The rest of the allocated memory should be poisoned, so that we can report + // errors when accessing poisoned memory. + // The allocated memory is larger than required userSize, it will also include + // redzone and some other padding bytes. 
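The scanSize computation above deserves a worked example: for an array allocation, the GC only needs to scan up to the pointer-bearing prefix of the last element, so the scan size is dataSize - typ.size + typ.ptrdata. The element sizes below are made up.

package main

import "fmt"

// Worked example of scanSize for an array allocation, with made-up sizes.
func main() {
    const (
        elemSize    = 32 // typ.size: bytes per element
        elemPtrdata = 8  // typ.ptrdata: prefix of an element that can hold pointers
        n           = 4  // elements in the array allocation
    )
    dataSize := n * elemSize
    scanSize := dataSize - elemSize + elemPtrdata
    fmt.Printf("dataSize=%d, scan only the first %d bytes\n", dataSize, scanSize)
}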
+ rzBeg := unsafe.Add(x, userSize) + asanpoison(rzBeg, size-userSize) + asanunpoison(x, userSize) + } + + if rate := MemProfileRate; rate > 0 { + // Note cache c only valid while m acquired; see #47302 + if rate != 1 && size < c.nextSample { + c.nextSample -= size + } else { + profilealloc(mp, x, size) + } + } + mp.mallocing = 0 + releasem(mp) + + // Pointerfree data can be zeroed late in a context where preemption can occur. + // x will keep the memory alive. + if delayedZeroing { + if !noscan { + throw("delayed zeroing on data that may contain pointers") + } + memclrNoHeapPointersChunked(size, x) // This is a possible preemption point: see #47302 + } + + if debug.malloc { + if debug.allocfreetrace != 0 { + tracealloc(x, size, typ) + } + + if inittrace.active && inittrace.id == getg().goid { + // Init functions are executed sequentially in a single goroutine. + inittrace.bytes += uint64(size) + } + } + + if assistG != nil { + // Account for internal fragmentation in the assist + // debt now that we know it. + assistG.gcAssistBytes -= int64(size - dataSize) + } + + if shouldhelpgc { + if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { + gcStart(t) + } + } + + if raceenabled && noscan && dataSize < maxTinySize { + // Pad tinysize allocations so they are aligned with the end + // of the tinyalloc region. This ensures that any arithmetic + // that goes off the top end of the object will be detectable + // by checkptr (issue 38872). + // Note that we disable tinyalloc when raceenabled for this to work. + // TODO: This padding is only performed when the race detector + // is enabled. It would be nice to enable it if any package + // was compiled with checkptr, but there's no easy way to + // detect that (especially at compile time). + // TODO: enable this padding for all allocations, not just + // tinyalloc ones. It's tricky because of pointer maps. + // Maybe just all noscan objects? + x = add(x, size-dataSize) + } + + return x +} + +// memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers +// on chunks of the buffer to be zeroed, with opportunities for preemption +// along the way. memclrNoHeapPointers contains no safepoints and also +// cannot be preemptively scheduled, so this provides a still-efficient +// block copy that can also be preempted on a reasonable granularity. +// +// Use this with care; if the data being cleared is tagged to contain +// pointers, this allows the GC to run before it is all cleared. +func memclrNoHeapPointersChunked(size uintptr, x unsafe.Pointer) { + v := uintptr(x) + // got this from benchmarking. 128k is too small, 512k is too large. 
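The MemProfileRate branch above is a countdown sampler: nextSample is the number of bytes left until the next sampled allocation, small allocations decrement it, and the allocation that crosses it gets profiled. A simplified sketch (the real code also special-cases rate == 1 and draws new countdowns from an exponential distribution):

package main

import "fmt"

// Simplified countdown sampler in the shape of the MemProfileRate check above.
// The fixed reset value stands in for exponentially distributed draws.
func main() {
    nextSample := 1000 // bytes until the next sample (illustrative)
    for i, size := range []int{256, 256, 256, 512, 64} {
        if size < nextSample {
            nextSample -= size
            fmt.Printf("alloc %d (%3d bytes): not sampled, %3d bytes to go\n", i, size, nextSample)
            continue
        }
        fmt.Printf("alloc %d (%3d bytes): sampled\n", i, size)
        nextSample = 1000 // would be reset by profilealloc -> nextSample()
    }
}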
+ const chunkBytes = 256 * 1024 + vsize := v + size + for voff := v; voff < vsize; voff = voff + chunkBytes { + if getg().preempt { + // may hold locks, e.g., profiling + goschedguarded() + } + // clear min(avail, lump) bytes + n := vsize - voff + if n > chunkBytes { + n = chunkBytes + } + memclrNoHeapPointers(unsafe.Pointer(voff), n) + } +} + +// implementation of new builtin +// compiler (both frontend and SSA backend) knows the signature +// of this function +func newobject(typ *_type) unsafe.Pointer { + return mallocgc(typ.size, typ, true) +} + +//go:linkname reflect_unsafe_New reflect.unsafe_New +func reflect_unsafe_New(typ *_type) unsafe.Pointer { + return mallocgc(typ.size, typ, true) +} + +//go:linkname reflectlite_unsafe_New internal/reflectlite.unsafe_New +func reflectlite_unsafe_New(typ *_type) unsafe.Pointer { + return mallocgc(typ.size, typ, true) +} + +// newarray allocates an array of n elements of type typ. +func newarray(typ *_type, n int) unsafe.Pointer { + if n == 1 { + return mallocgc(typ.size, typ, true) + } + mem, overflow := math.MulUintptr(typ.size, uintptr(n)) + if overflow || mem > maxAlloc || n < 0 { + panic(plainError("runtime: allocation size out of range")) + } + return mallocgc(mem, typ, true) +} + +//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray +func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer { + return newarray(typ, n) +} + +func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { + c := getMCache(mp) + if c == nil { + throw("profilealloc called without a P or outside bootstrapping") + } + c.nextSample = nextSample() + mProf_Malloc(x, size) +} + +// nextSample returns the next sampling point for heap profiling. The goal is +// to sample allocations on average every MemProfileRate bytes, but with a +// completely random distribution over the allocation timeline; this +// corresponds to a Poisson process with parameter MemProfileRate. In Poisson +// processes, the distance between two samples follows the exponential +// distribution (exp(MemProfileRate)), so the best return value is a random +// number taken from an exponential distribution whose mean is MemProfileRate. +func nextSample() uintptr { + if MemProfileRate == 1 { + // Callers assign our return value to + // mcache.next_sample, but next_sample is not used + // when the rate is 1. So avoid the math below and + // just return something. + return 0 + } + if GOOS == "plan9" { + // Plan 9 doesn't support floating point in note handler. + if g := getg(); g == g.m.gsignal { + return nextSampleNoFP() + } + } + + return uintptr(fastexprand(MemProfileRate)) +} + +// fastexprand returns a random number from an exponential distribution with +// the specified mean. +func fastexprand(mean int) int32 { + // Avoid overflow. Maximum possible step is + // -ln(1/(1<<randomBitCount)) * mean, approximately 20 * mean. + switch { + case mean > 0x7000000: + mean = 0x7000000 + case mean == 0: + return 0 + } + + // Take a random sample of the exponential distribution exp(-mean*x). 
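newarray above guards the n*size multiplication with an overflow check before allocating. A sketch of the same guard using math/bits.Mul64 in place of the runtime's internal math.MulUintptr; the maxAlloc stand-in value here is assumed for illustration only.

package main

import (
    "fmt"
    "math/bits"
)

// Overflow-checked array-allocation sizing in the shape of newarray above.
func main() {
    const maxAlloc = uint64(1) << 48 // stand-in for the real constant
    elemSize := uint64(1 << 20)      // 1 MiB elements
    for _, n := range []uint64{16, 1 << 30, 1 << 50} {
        hi, mem := bits.Mul64(elemSize, n)
        if hi != 0 || mem > maxAlloc {
            fmt.Printf("n=%d: allocation size out of range\n", n)
            continue
        }
        fmt.Printf("n=%d: %d bytes ok\n", n, mem)
    }
}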
+ // The probability distribution function is mean*exp(-mean*x), so the CDF is + // p = 1 - exp(-mean*x), so + // q = 1 - p == exp(-mean*x) + // log_e(q) = -mean*x + // -log_e(q)/mean = x + // x = -log_e(q) * mean + // x = log_2(q) * (-log_e(2)) * mean ; Using log_2 for efficiency + const randomBitCount = 26 + q := fastrandn(1<<randomBitCount) + 1 + qlog := fastlog2(float64(q)) - randomBitCount + if qlog > 0 { + qlog = 0 + } + const minusLog2 = -0.6931471805599453 // -ln(2) + return int32(qlog*(minusLog2*float64(mean))) + 1 +} + +// nextSampleNoFP is similar to nextSample, but uses older, +// simpler code to avoid floating point. +func nextSampleNoFP() uintptr { + // Set first allocation sample size. + rate := MemProfileRate + if rate > 0x3fffffff { // make 2*rate not overflow + rate = 0x3fffffff + } + if rate != 0 { + return uintptr(fastrandn(uint32(2 * rate))) + } + return 0 +} + +type persistentAlloc struct { + base *notInHeap + off uintptr +} + +var globalAlloc struct { + mutex + persistentAlloc +} + +// persistentChunkSize is the number of bytes we allocate when we grow +// a persistentAlloc. +const persistentChunkSize = 256 << 10 + +// persistentChunks is a list of all the persistent chunks we have +// allocated. The list is maintained through the first word in the +// persistent chunk. This is updated atomically. +var persistentChunks *notInHeap + +// Wrapper around sysAlloc that can allocate small chunks. +// There is no associated free operation. +// Intended for things like function/type/debug-related persistent data. +// If align is 0, uses default align (currently 8). +// The returned memory will be zeroed. +// +// Consider marking persistentalloc'd types go:notinheap. +func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { + var p *notInHeap + systemstack(func() { + p = persistentalloc1(size, align, sysStat) + }) + return unsafe.Pointer(p) +} + +// Must run on system stack because stack growth can (re)invoke it. +// See issue 9174. +//go:systemstack +func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap { + const ( + maxBlock = 64 << 10 // VM reservation granularity is 64K on windows + ) + + if size == 0 { + throw("persistentalloc: size == 0") + } + if align != 0 { + if align&(align-1) != 0 { + throw("persistentalloc: align is not a power of 2") + } + if align > _PageSize { + throw("persistentalloc: align is too large") + } + } else { + align = 8 + } + + if size >= maxBlock { + return (*notInHeap)(sysAlloc(size, sysStat)) + } + + mp := acquirem() + var persistent *persistentAlloc + if mp != nil && mp.p != 0 { + persistent = &mp.p.ptr().palloc + } else { + lock(&globalAlloc.mutex) + persistent = &globalAlloc.persistentAlloc + } + persistent.off = alignUp(persistent.off, align) + if persistent.off+size > persistentChunkSize || persistent.base == nil { + persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys)) + if persistent.base == nil { + if persistent == &globalAlloc.persistentAlloc { + unlock(&globalAlloc.mutex) + } + throw("runtime: cannot allocate memory") + } + + // Add the new chunk to the persistentChunks list. 
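The derivation above can be reproduced with the standard library in place of fastrand and fastlog2: draw q uniformly from [1, 1<<26], treat it as a uniform sample of (0, 1] after scaling, and convert it to an exponentially distributed byte distance. Averaging many draws should land near the requested mean; the mean used below is only an example value.

package main

import (
    "fmt"
    "math"
    "math/rand"
)

// Sketch of the exponential-sampling math described above, using math.Log2
// and math/rand instead of the runtime's fastlog2 and fastrand.
func sampleDistance(mean int) int32 {
    const randomBitCount = 26
    q := rand.Int63n(1<<randomBitCount) + 1
    qlog := math.Log2(float64(q)) - randomBitCount
    if qlog > 0 {
        qlog = 0
    }
    const minusLog2 = -0.6931471805599453 // -ln(2)
    return int32(qlog*(minusLog2*float64(mean))) + 1
}

func main() {
    const mean = 512 * 1024 // example sampling rate in bytes
    const n = 100000
    sum := 0.0
    for i := 0; i < n; i++ {
        sum += float64(sampleDistance(mean))
    }
    fmt.Printf("observed mean distance: %.0f bytes (target %d)\n", sum/n, mean)
}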
+ for { + chunks := uintptr(unsafe.Pointer(persistentChunks)) + *(*uintptr)(unsafe.Pointer(persistent.base)) = chunks + if atomic.Casuintptr((*uintptr)(unsafe.Pointer(&persistentChunks)), chunks, uintptr(unsafe.Pointer(persistent.base))) { + break + } + } + persistent.off = alignUp(goarch.PtrSize, align) + } + p := persistent.base.add(persistent.off) + persistent.off += size + releasem(mp) + if persistent == &globalAlloc.persistentAlloc { + unlock(&globalAlloc.mutex) + } + + if sysStat != &memstats.other_sys { + sysStat.add(int64(size)) + memstats.other_sys.add(-int64(size)) + } + return p +} + +// inPersistentAlloc reports whether p points to memory allocated by +// persistentalloc. This must be nosplit because it is called by the +// cgo checker code, which is called by the write barrier code. +//go:nosplit +func inPersistentAlloc(p uintptr) bool { + chunk := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&persistentChunks))) + for chunk != 0 { + if p >= chunk && p < chunk+persistentChunkSize { + return true + } + chunk = *(*uintptr)(unsafe.Pointer(chunk)) + } + return false +} + +// linearAlloc is a simple linear allocator that pre-reserves a region +// of memory and then optionally maps that region into the Ready state +// as needed. +// +// The caller is responsible for locking. +type linearAlloc struct { + next uintptr // next free byte + mapped uintptr // one byte past end of mapped space + end uintptr // end of reserved space + + mapMemory bool // transition memory from Reserved to Ready if true +} + +func (l *linearAlloc) init(base, size uintptr, mapMemory bool) { + if base+size < base { + // Chop off the last byte. The runtime isn't prepared + // to deal with situations where the bounds could overflow. + // Leave that memory reserved, though, so we don't map it + // later. + size -= 1 + } + l.next, l.mapped = base, base + l.end = base + size + l.mapMemory = mapMemory +} + +func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { + p := alignUp(l.next, align) + if p+size > l.end { + return nil + } + l.next = p + size + if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped { + if l.mapMemory { + // Transition from Reserved to Prepared to Ready. + sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat) + sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped) + } + l.mapped = pEnd + } + return unsafe.Pointer(p) +} + +// notInHeap is off-heap memory allocated by a lower-level allocator +// like sysAlloc or persistentAlloc. +// +// In general, it's better to use real types marked as go:notinheap, +// but this serves as a generic type for situations where that isn't +// possible (like in the allocators). +// +// TODO: Use this as the return type of sysAlloc, persistentAlloc, etc? +// +//go:notinheap +type notInHeap struct{} + +func (p *notInHeap) add(bytes uintptr) *notInHeap { + return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes)) +} + +// computeRZlog computes the size of the redzone. +// Refer to the implementation of the compiler-rt. 
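The loop just above is a lock-free push onto an intrusive singly linked list: the new chunk's first word is pointed at the old head and a compare-and-swap installs the new head, retrying on contention. A sketch of the same pattern with sync/atomic and a plain struct standing in for raw chunk memory:

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
    "unsafe"
)

// Lock-free list push in the shape of the persistentChunks update above.
type chunk struct {
    next *chunk
    id   int
}

var head unsafe.Pointer // *chunk

func push(c *chunk) {
    for {
        old := atomic.LoadPointer(&head)
        c.next = (*chunk)(old)
        if atomic.CompareAndSwapPointer(&head, old, unsafe.Pointer(c)) {
            return
        }
    }
}

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 8; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            push(&chunk{id: i})
        }(i)
    }
    wg.Wait()
    n := 0
    for c := (*chunk)(atomic.LoadPointer(&head)); c != nil; c = c.next {
        n++
    }
    fmt.Println("chunks on the list:", n) // 8
}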
+func computeRZlog(userSize uintptr) uintptr { + switch { + case userSize <= (64 - 16): + return 16 << 0 + case userSize <= (128 - 32): + return 16 << 1 + case userSize <= (512 - 64): + return 16 << 2 + case userSize <= (4096 - 128): + return 16 << 3 + case userSize <= (1<<14)-256: + return 16 << 4 + case userSize <= (1<<15)-512: + return 16 << 5 + case userSize <= (1<<16)-1024: + return 16 << 6 + default: + return 16 << 7 + } +} diff --git a/contrib/go/_std_1.18/src/runtime/map.go b/contrib/go/_std_1.18/src/runtime/map.go new file mode 100644 index 0000000000..e91b25eaec --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/map.go @@ -0,0 +1,1416 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go's map type. +// +// A map is just a hash table. The data is arranged +// into an array of buckets. Each bucket contains up to +// 8 key/elem pairs. The low-order bits of the hash are +// used to select a bucket. Each bucket contains a few +// high-order bits of each hash to distinguish the entries +// within a single bucket. +// +// If more than 8 keys hash to a bucket, we chain on +// extra buckets. +// +// When the hashtable grows, we allocate a new array +// of buckets twice as big. Buckets are incrementally +// copied from the old bucket array to the new bucket array. +// +// Map iterators walk through the array of buckets and +// return the keys in walk order (bucket #, then overflow +// chain order, then bucket index). To maintain iteration +// semantics, we never move keys within their bucket (if +// we did, keys might be returned 0 or 2 times). When +// growing the table, iterators remain iterating through the +// old table and must check the new table if the bucket +// they are iterating through has been moved ("evacuated") +// to the new table. + +// Picking loadFactor: too large and we have lots of overflow +// buckets, too small and we waste a lot of space. I wrote +// a simple program to check some stats for different loads: +// (64-bit, 8 byte keys and elems) +// loadFactor %overflow bytes/entry hitprobe missprobe +// 4.00 2.13 20.77 3.00 4.00 +// 4.50 4.05 17.30 3.25 4.50 +// 5.00 6.85 14.77 3.50 5.00 +// 5.50 10.55 12.94 3.75 5.50 +// 6.00 15.27 11.67 4.00 6.00 +// 6.50 20.90 10.79 4.25 6.50 +// 7.00 27.14 10.15 4.50 7.00 +// 7.50 34.03 9.73 4.75 7.50 +// 8.00 41.10 9.40 5.00 8.00 +// +// %overflow = percentage of buckets which have an overflow bucket +// bytes/entry = overhead bytes used per key/elem pair +// hitprobe = # of entries to check when looking up a present key +// missprobe = # of entries to check when looking up an absent key +// +// Keep in mind this data is for maximally loaded tables, i.e. just +// before the table grows. Typical tables will be somewhat less loaded. + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/math" + "unsafe" +) + +const ( + // Maximum number of key/elem pairs a bucket can hold. + bucketCntBits = 3 + bucketCnt = 1 << bucketCntBits + + // Maximum average load of a bucket that triggers growth is 6.5. + // Represent as loadFactorNum/loadFactorDen, to allow integer math. + loadFactorNum = 13 + loadFactorDen = 2 + + // Maximum key or elem size to keep inline (instead of mallocing per element). + // Must fit in a uint8. 
+ // Fast versions cannot handle big elems - the cutoff size for + // fast versions in cmd/compile/internal/gc/walk.go must be at most this elem. + maxKeySize = 128 + maxElemSize = 128 + + // data offset should be the size of the bmap struct, but needs to be + // aligned correctly. For amd64p32 this means 64-bit alignment + // even though pointers are 32 bit. + dataOffset = unsafe.Offsetof(struct { + b bmap + v int64 + }{}.v) + + // Possible tophash values. We reserve a few possibilities for special marks. + // Each bucket (including its overflow buckets, if any) will have either all or none of its + // entries in the evacuated* states (except during the evacuate() method, which only happens + // during map writes and thus no one else can observe the map during that time). + emptyRest = 0 // this cell is empty, and there are no more non-empty cells at higher indexes or overflows. + emptyOne = 1 // this cell is empty + evacuatedX = 2 // key/elem is valid. Entry has been evacuated to first half of larger table. + evacuatedY = 3 // same as above, but evacuated to second half of larger table. + evacuatedEmpty = 4 // cell is empty, bucket is evacuated. + minTopHash = 5 // minimum tophash for a normal filled cell. + + // flags + iterator = 1 // there may be an iterator using buckets + oldIterator = 2 // there may be an iterator using oldbuckets + hashWriting = 4 // a goroutine is writing to the map + sameSizeGrow = 8 // the current map growth is to a new map of the same size + + // sentinel bucket ID for iterator checks + noCheck = 1<<(8*goarch.PtrSize) - 1 +) + +// isEmpty reports whether the given tophash array entry represents an empty bucket entry. +func isEmpty(x uint8) bool { + return x <= emptyOne +} + +// A header for a Go map. +type hmap struct { + // Note: the format of the hmap is also encoded in cmd/compile/internal/reflectdata/reflect.go. + // Make sure this stays in sync with the compiler's definition. + count int // # live cells == size of map. Must be first (used by len() builtin) + flags uint8 + B uint8 // log_2 of # of buckets (can hold up to loadFactor * 2^B items) + noverflow uint16 // approximate number of overflow buckets; see incrnoverflow for details + hash0 uint32 // hash seed + + buckets unsafe.Pointer // array of 2^B Buckets. may be nil if count==0. + oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing + nevacuate uintptr // progress counter for evacuation (buckets less than this have been evacuated) + + extra *mapextra // optional fields +} + +// mapextra holds fields that are not present on all maps. +type mapextra struct { + // If both key and elem do not contain pointers and are inline, then we mark bucket + // type as containing no pointers. This avoids scanning such maps. + // However, bmap.overflow is a pointer. In order to keep overflow buckets + // alive, we store pointers to all overflow buckets in hmap.extra.overflow and hmap.extra.oldoverflow. + // overflow and oldoverflow are only used if key and elem do not contain pointers. + // overflow contains overflow buckets for hmap.buckets. + // oldoverflow contains overflow buckets for hmap.oldbuckets. + // The indirection allows to store a pointer to the slice in hiter. + overflow *[]*bmap + oldoverflow *[]*bmap + + // nextOverflow holds a pointer to a free overflow bucket. + nextOverflow *bmap +} + +// A bucket for a Go map. +type bmap struct { + // tophash generally contains the top byte of the hash value + // for each key in this bucket. 
If tophash[0] < minTopHash, + // tophash[0] is a bucket evacuation state instead. + tophash [bucketCnt]uint8 + // Followed by bucketCnt keys and then bucketCnt elems. + // NOTE: packing all the keys together and then all the elems together makes the + // code a bit more complicated than alternating key/elem/key/elem/... but it allows + // us to eliminate padding which would be needed for, e.g., map[int64]int8. + // Followed by an overflow pointer. +} + +// A hash iteration structure. +// If you modify hiter, also change cmd/compile/internal/reflectdata/reflect.go +// and reflect/value.go to match the layout of this structure. +type hiter struct { + key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/walk/range.go). + elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/walk/range.go). + t *maptype + h *hmap + buckets unsafe.Pointer // bucket ptr at hash_iter initialization time + bptr *bmap // current bucket + overflow *[]*bmap // keeps overflow buckets of hmap.buckets alive + oldoverflow *[]*bmap // keeps overflow buckets of hmap.oldbuckets alive + startBucket uintptr // bucket iteration started at + offset uint8 // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1) + wrapped bool // already wrapped around from end of bucket array to beginning + B uint8 + i uint8 + bucket uintptr + checkBucket uintptr +} + +// bucketShift returns 1<<b, optimized for code generation. +func bucketShift(b uint8) uintptr { + // Masking the shift amount allows overflow checks to be elided. + return uintptr(1) << (b & (goarch.PtrSize*8 - 1)) +} + +// bucketMask returns 1<<b - 1, optimized for code generation. +func bucketMask(b uint8) uintptr { + return bucketShift(b) - 1 +} + +// tophash calculates the tophash value for hash. +func tophash(hash uintptr) uint8 { + top := uint8(hash >> (goarch.PtrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + return top +} + +func evacuated(b *bmap) bool { + h := b.tophash[0] + return h > emptyOne && h < minTopHash +} + +func (b *bmap) overflow(t *maptype) *bmap { + return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-goarch.PtrSize)) +} + +func (b *bmap) setoverflow(t *maptype, ovf *bmap) { + *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-goarch.PtrSize)) = ovf +} + +func (b *bmap) keys() unsafe.Pointer { + return add(unsafe.Pointer(b), dataOffset) +} + +// incrnoverflow increments h.noverflow. +// noverflow counts the number of overflow buckets. +// This is used to trigger same-size map growth. +// See also tooManyOverflowBuckets. +// To keep hmap small, noverflow is a uint16. +// When there are few buckets, noverflow is an exact count. +// When there are many buckets, noverflow is an approximate count. +func (h *hmap) incrnoverflow() { + // We trigger same-size map growth if there are + // as many overflow buckets as buckets. + // We need to be able to count to 1<<h.B. + if h.B < 16 { + h.noverflow++ + return + } + // Increment with probability 1/(1<<(h.B-15)). + // When we reach 1<<15 - 1, we will have approximately + // as many overflow buckets as buckets. + mask := uint32(1)<<(h.B-15) - 1 + // Example: if h.B == 18, then mask == 7, + // and fastrand & 7 == 0 with probability 1/8. + if fastrand()&mask == 0 { + h.noverflow++ + } +} + +func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap { + var ovf *bmap + if h.extra != nil && h.extra.nextOverflow != nil { + // We have preallocated overflow buckets available. 
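A sketch of the bucket-selection helpers above: the low-order B bits of the hash choose the bucket, and the top byte, bumped past the reserved tophash values, is what gets stored for cheap comparisons. A 64-bit platform is assumed and the hash values are made up.

package main

import "fmt"

// Bucket index and tophash computation, mirroring bucketMask and tophash above.
const (
    ptrSize    = 8 // 64-bit platform assumed
    minTopHash = 5
)

func bucketMask(b uint8) uintptr { return uintptr(1)<<(b&(ptrSize*8-1)) - 1 }

func tophash(hash uintptr) uint8 {
    top := uint8(hash >> (ptrSize*8 - 8))
    if top < minTopHash {
        top += minTopHash
    }
    return top
}

func main() {
    const B = 5 // 2^5 = 32 buckets
    for _, hash := range []uintptr{0x02a9f3c47d1e8b60, 0x0000000000000071} {
        fmt.Printf("hash %#016x -> bucket %2d, tophash %#02x\n",
            hash, hash&bucketMask(B), tophash(hash))
    }
}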
+ // See makeBucketArray for more details. + ovf = h.extra.nextOverflow + if ovf.overflow(t) == nil { + // We're not at the end of the preallocated overflow buckets. Bump the pointer. + h.extra.nextOverflow = (*bmap)(add(unsafe.Pointer(ovf), uintptr(t.bucketsize))) + } else { + // This is the last preallocated overflow bucket. + // Reset the overflow pointer on this bucket, + // which was set to a non-nil sentinel value. + ovf.setoverflow(t, nil) + h.extra.nextOverflow = nil + } + } else { + ovf = (*bmap)(newobject(t.bucket)) + } + h.incrnoverflow() + if t.bucket.ptrdata == 0 { + h.createOverflow() + *h.extra.overflow = append(*h.extra.overflow, ovf) + } + b.setoverflow(t, ovf) + return ovf +} + +func (h *hmap) createOverflow() { + if h.extra == nil { + h.extra = new(mapextra) + } + if h.extra.overflow == nil { + h.extra.overflow = new([]*bmap) + } +} + +func makemap64(t *maptype, hint int64, h *hmap) *hmap { + if int64(int(hint)) != hint { + hint = 0 + } + return makemap(t, int(hint), h) +} + +// makemap_small implements Go map creation for make(map[k]v) and +// make(map[k]v, hint) when hint is known to be at most bucketCnt +// at compile time and the map needs to be allocated on the heap. +func makemap_small() *hmap { + h := new(hmap) + h.hash0 = fastrand() + return h +} + +// makemap implements Go map creation for make(map[k]v, hint). +// If the compiler has determined that the map or the first bucket +// can be created on the stack, h and/or bucket may be non-nil. +// If h != nil, the map can be created directly in h. +// If h.buckets != nil, bucket pointed to can be used as the first bucket. +func makemap(t *maptype, hint int, h *hmap) *hmap { + mem, overflow := math.MulUintptr(uintptr(hint), t.bucket.size) + if overflow || mem > maxAlloc { + hint = 0 + } + + // initialize Hmap + if h == nil { + h = new(hmap) + } + h.hash0 = fastrand() + + // Find the size parameter B which will hold the requested # of elements. + // For hint < 0 overLoadFactor returns false since hint < bucketCnt. + B := uint8(0) + for overLoadFactor(hint, B) { + B++ + } + h.B = B + + // allocate initial hash table + // if B == 0, the buckets field is allocated lazily later (in mapassign) + // If hint is large zeroing this memory could take a while. + if h.B != 0 { + var nextOverflow *bmap + h.buckets, nextOverflow = makeBucketArray(t, h.B, nil) + if nextOverflow != nil { + h.extra = new(mapextra) + h.extra.nextOverflow = nextOverflow + } + } + + return h +} + +// makeBucketArray initializes a backing array for map buckets. +// 1<<b is the minimum number of buckets to allocate. +// dirtyalloc should either be nil or a bucket array previously +// allocated by makeBucketArray with the same t and b parameters. +// If dirtyalloc is nil a new backing array will be alloced and +// otherwise dirtyalloc will be cleared and reused as backing array. +func makeBucketArray(t *maptype, b uint8, dirtyalloc unsafe.Pointer) (buckets unsafe.Pointer, nextOverflow *bmap) { + base := bucketShift(b) + nbuckets := base + // For small b, overflow buckets are unlikely. + // Avoid the overhead of the calculation. + if b >= 4 { + // Add on the estimated number of overflow buckets + // required to insert the median number of elements + // used with this value of b. 
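makemap above sizes the table by growing B until the hint no longer exceeds the 6.5 average load factor, expressed as loadFactorNum/loadFactorDen = 13/2. overLoadFactor itself is not part of this hunk, so the version below is a reconstruction of that documented behavior rather than the runtime's exact code.

package main

import "fmt"

// Reconstruction of the B-selection loop in makemap, based on the documented
// 6.5 load factor and bucketCnt constants above.
const (
    bucketCnt     = 8
    loadFactorNum = 13
    loadFactorDen = 2
)

func overLoadFactor(count int, B uint8) bool {
    return count > bucketCnt && uintptr(count) > loadFactorNum*((uintptr(1)<<B)/loadFactorDen)
}

func main() {
    for _, hint := range []int{0, 8, 9, 52, 53, 1000} {
        B := uint8(0)
        for overLoadFactor(hint, B) {
            B++
        }
        fmt.Printf("hint %4d -> B=%d (%d buckets)\n", hint, B, 1<<B)
    }
}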
+ nbuckets += bucketShift(b - 4) + sz := t.bucket.size * nbuckets + up := roundupsize(sz) + if up != sz { + nbuckets = up / t.bucket.size + } + } + + if dirtyalloc == nil { + buckets = newarray(t.bucket, int(nbuckets)) + } else { + // dirtyalloc was previously generated by + // the above newarray(t.bucket, int(nbuckets)) + // but may not be empty. + buckets = dirtyalloc + size := t.bucket.size * nbuckets + if t.bucket.ptrdata != 0 { + memclrHasPointers(buckets, size) + } else { + memclrNoHeapPointers(buckets, size) + } + } + + if base != nbuckets { + // We preallocated some overflow buckets. + // To keep the overhead of tracking these overflow buckets to a minimum, + // we use the convention that if a preallocated overflow bucket's overflow + // pointer is nil, then there are more available by bumping the pointer. + // We need a safe non-nil pointer for the last overflow bucket; just use buckets. + nextOverflow = (*bmap)(add(buckets, base*uintptr(t.bucketsize))) + last := (*bmap)(add(buckets, (nbuckets-1)*uintptr(t.bucketsize))) + last.setoverflow(t, (*bmap)(buckets)) + } + return buckets, nextOverflow +} + +// mapaccess1 returns a pointer to h[key]. Never returns nil, instead +// it will return a reference to the zero object for the elem type if +// the key is not in the map. +// NOTE: The returned pointer may keep the whole map live, so don't +// hold onto it for very long. +func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(mapaccess1) + racereadpc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if msanenabled && h != nil { + msanread(key, t.key.size) + } + if asanenabled && h != nil { + asanread(key, t.key.size) + } + if h == nil || h.count == 0 { + if t.hashMightPanic() { + t.hasher(key, 0) // see issue 23734 + } + return unsafe.Pointer(&zeroVal[0]) + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + hash := t.hasher(key, uintptr(h.hash0)) + m := bucketMask(h.B) + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. 
+ m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := tophash(hash) +bucketloop: + for ; b != nil; b = b.overflow(t) { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey() { + k = *((*unsafe.Pointer)(k)) + } + if t.key.equal(key, k) { + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + if t.indirectelem() { + e = *((*unsafe.Pointer)(e)) + } + return e + } + } + } + return unsafe.Pointer(&zeroVal[0]) +} + +func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(mapaccess2) + racereadpc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if msanenabled && h != nil { + msanread(key, t.key.size) + } + if asanenabled && h != nil { + asanread(key, t.key.size) + } + if h == nil || h.count == 0 { + if t.hashMightPanic() { + t.hasher(key, 0) // see issue 23734 + } + return unsafe.Pointer(&zeroVal[0]), false + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + hash := t.hasher(key, uintptr(h.hash0)) + m := bucketMask(h.B) + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := tophash(hash) +bucketloop: + for ; b != nil; b = b.overflow(t) { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey() { + k = *((*unsafe.Pointer)(k)) + } + if t.key.equal(key, k) { + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + if t.indirectelem() { + e = *((*unsafe.Pointer)(e)) + } + return e, true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +// returns both key and elem. Used by map iterator +func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe.Pointer) { + if h == nil || h.count == 0 { + return nil, nil + } + hash := t.hasher(key, uintptr(h.hash0)) + m := bucketMask(h.B) + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. 
+ m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := tophash(hash) +bucketloop: + for ; b != nil; b = b.overflow(t) { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey() { + k = *((*unsafe.Pointer)(k)) + } + if t.key.equal(key, k) { + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + if t.indirectelem() { + e = *((*unsafe.Pointer)(e)) + } + return k, e + } + } + } + return nil, nil +} + +func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer { + e := mapaccess1(t, h, key) + if e == unsafe.Pointer(&zeroVal[0]) { + return zero + } + return e +} + +func mapaccess2_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) (unsafe.Pointer, bool) { + e := mapaccess1(t, h, key) + if e == unsafe.Pointer(&zeroVal[0]) { + return zero, false + } + return e, true +} + +// Like mapaccess, but allocates a slot for the key if it is not present in the map. +func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(mapassign) + racewritepc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if msanenabled { + msanread(key, t.key.size) + } + if asanenabled { + asanread(key, t.key.size) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.hasher(key, uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher, since t.hasher may panic, + // in which case we have not actually done a write. + h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) + + var inserti *uint8 + var insertk unsafe.Pointer + var elem unsafe.Pointer +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if isEmpty(b.tophash[i]) && inserti == nil { + inserti = &b.tophash[i] + insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey() { + k = *((*unsafe.Pointer)(k)) + } + if !t.key.equal(key, k) { + continue + } + // already have a mapping for key. Update it. + if t.needkeyupdate() { + typedmemmove(t.key, k, key) + } + elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. 
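// (Editorial note: reaching this point means no free slot was found anywhere
// in the bucket chain. Before allocating an overflow bucket, the two growth
// triggers are consulted: exceeding the load factor doubles the bucket array,
// while an excess of overflow buckets starts a same-size grow that repacks
// entries. Either way the work is incremental — each later write evacuates at
// most two old buckets via growWork — so no single insert pays for the whole
// rehash.)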
+ if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if inserti == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. + newb := h.newoverflow(t, b) + inserti = &newb.tophash[0] + insertk = add(unsafe.Pointer(newb), dataOffset) + elem = add(insertk, bucketCnt*uintptr(t.keysize)) + } + + // store new key/elem at insert position + if t.indirectkey() { + kmem := newobject(t.key) + *(*unsafe.Pointer)(insertk) = kmem + insertk = kmem + } + if t.indirectelem() { + vmem := newobject(t.elem) + *(*unsafe.Pointer)(elem) = vmem + } + typedmemmove(t.key, insertk, key) + *inserti = top + h.count++ + +done: + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + if t.indirectelem() { + elem = *((*unsafe.Pointer)(elem)) + } + return elem +} + +func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { + if raceenabled && h != nil { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(mapdelete) + racewritepc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if msanenabled && h != nil { + msanread(key, t.key.size) + } + if asanenabled && h != nil { + asanread(key, t.key.size) + } + if h == nil || h.count == 0 { + if t.hashMightPanic() { + t.hasher(key, 0) // see issue 23734 + } + return + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + + hash := t.hasher(key, uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher, since t.hasher may panic, + // in which case we have not actually done a write (delete). + h.flags ^= hashWriting + + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break search + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + k2 := k + if t.indirectkey() { + k2 = *((*unsafe.Pointer)(k2)) + } + if !t.key.equal(key, k2) { + continue + } + // Only clear key if there are pointers in it. + if t.indirectkey() { + *(*unsafe.Pointer)(k) = nil + } else if t.key.ptrdata != 0 { + memclrHasPointers(k, t.key.size) + } + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize)) + if t.indirectelem() { + *(*unsafe.Pointer)(e) = nil + } else if t.elem.ptrdata != 0 { + memclrHasPointers(e, t.elem.size) + } else { + memclrNoHeapPointers(e, t.elem.size) + } + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + // It would be nice to make this a separate function, but + // for loops are not currently inlineable. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. 
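// (Editorial note: buckets are chained only forward through their overflow
// pointers, so stepping back to the previous bucket means re-walking the
// chain from bOrig, as below. Chains are expected to be short, so this
// occasional extra walk during delete is cheap.)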
+ c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: + h.count-- + // Reset the hash seed to make it more difficult for attackers to + // repeatedly trigger hash collisions. See issue 25237. + if h.count == 0 { + h.hash0 = fastrand() + } + break search + } + } + + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting +} + +// mapiterinit initializes the hiter struct used for ranging over maps. +// The hiter struct pointed to by 'it' is allocated on the stack +// by the compilers order pass or on the heap by reflect_mapiterinit. +// Both need to have zeroed hiter since the struct contains pointers. +func mapiterinit(t *maptype, h *hmap, it *hiter) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapiterinit)) + } + + it.t = t + if h == nil || h.count == 0 { + return + } + + if unsafe.Sizeof(hiter{})/goarch.PtrSize != 12 { + throw("hash_iter size incorrect") // see cmd/compile/internal/reflectdata/reflect.go + } + it.h = h + + // grab snapshot of bucket state + it.B = h.B + it.buckets = h.buckets + if t.bucket.ptrdata == 0 { + // Allocate the current slice and remember pointers to both current and old. + // This preserves all relevant overflow buckets alive even if + // the table grows and/or overflow buckets are added to the table + // while we are iterating. + h.createOverflow() + it.overflow = h.extra.overflow + it.oldoverflow = h.extra.oldoverflow + } + + // decide where to start + r := uintptr(fastrand()) + if h.B > 31-bucketCntBits { + r += uintptr(fastrand()) << 31 + } + it.startBucket = r & bucketMask(h.B) + it.offset = uint8(r >> h.B & (bucketCnt - 1)) + + // iterator state + it.bucket = it.startBucket + + // Remember we have an iterator. + // Can run concurrently with another mapiterinit(). + if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator { + atomic.Or8(&h.flags, iterator|oldIterator) + } + + mapiternext(it) +} + +func mapiternext(it *hiter) { + h := it.h + if raceenabled { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapiternext)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map iteration and map write") + } + t := it.t + bucket := it.bucket + b := it.bptr + i := it.i + checkBucket := it.checkBucket + +next: + if b == nil { + if bucket == it.startBucket && it.wrapped { + // end of iteration + it.key = nil + it.elem = nil + return + } + if h.growing() && it.B == h.B { + // Iterator was started in the middle of a grow, and the grow isn't done yet. + // If the bucket we're looking at hasn't been filled in yet (i.e. the old + // bucket hasn't been evacuated) then we need to iterate through the old + // bucket and only return the ones that will be migrated to this bucket. 
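// (Editorial note: when growing to a larger table, old bucket N splits into
// new buckets N and N+newbit. An iterator that reaches a not-yet-evacuated
// old bucket therefore walks it directly, but must skip the entries destined
// for the other half, deciding by re-hashing the key — or, for keys that are
// not equal to themselves, by the low bit of their tophash, as the code
// below does.)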
+ oldbucket := bucket & it.h.oldbucketmask() + b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + if !evacuated(b) { + checkBucket = bucket + } else { + b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize))) + checkBucket = noCheck + } + } else { + b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize))) + checkBucket = noCheck + } + bucket++ + if bucket == bucketShift(it.B) { + bucket = 0 + it.wrapped = true + } + i = 0 + } + for ; i < bucketCnt; i++ { + offi := (i + it.offset) & (bucketCnt - 1) + if isEmpty(b.tophash[offi]) || b.tophash[offi] == evacuatedEmpty { + // TODO: emptyRest is hard to use here, as we start iterating + // in the middle of a bucket. It's feasible, just tricky. + continue + } + k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize)) + if t.indirectkey() { + k = *((*unsafe.Pointer)(k)) + } + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.elemsize)) + if checkBucket != noCheck && !h.sameSizeGrow() { + // Special case: iterator was started during a grow to a larger size + // and the grow is not done yet. We're working on a bucket whose + // oldbucket has not been evacuated yet. Or at least, it wasn't + // evacuated when we started the bucket. So we're iterating + // through the oldbucket, skipping any keys that will go + // to the other new bucket (each oldbucket expands to two + // buckets during a grow). + if t.reflexivekey() || t.key.equal(k, k) { + // If the item in the oldbucket is not destined for + // the current new bucket in the iteration, skip it. + hash := t.hasher(k, uintptr(h.hash0)) + if hash&bucketMask(it.B) != checkBucket { + continue + } + } else { + // Hash isn't repeatable if k != k (NaNs). We need a + // repeatable and randomish choice of which direction + // to send NaNs during evacuation. We'll use the low + // bit of tophash to decide which way NaNs go. + // NOTE: this case is why we need two evacuate tophash + // values, evacuatedX and evacuatedY, that differ in + // their low bit. + if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { + continue + } + } + } + if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) || + !(t.reflexivekey() || t.key.equal(k, k)) { + // This is the golden data, we can return it. + // OR + // key!=key, so the entry can't be deleted or updated, so we can just return it. + // That's lucky for us because when key!=key we can't look it up successfully. + it.key = k + if t.indirectelem() { + e = *((*unsafe.Pointer)(e)) + } + it.elem = e + } else { + // The hash table has grown since the iterator was started. + // The golden data for this key is now somewhere else. + // Check the current hash table for the data. + // This code handles the case where the key + // has been deleted, updated, or deleted and reinserted. + // NOTE: we need to regrab the key as it has potentially been + // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). + rk, re := mapaccessK(t, h, k) + if rk == nil { + continue // key has been deleted + } + it.key = rk + it.elem = re + } + it.bucket = bucket + if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 + it.bptr = b + } + it.i = i + 1 + it.checkBucket = checkBucket + return + } + b = b.overflow(t) + i = 0 + goto next +} + +// mapclear deletes all keys from a map. 
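// (Editorial sketch, not part of the vendored source: mapclear below backs
// the map-clearing idiom. In recent gc releases a range-and-delete loop over
// the same map is expected to be lowered to a single mapclear call instead of
// one mapdelete per key; the program here illustrates the idiom, not a
// guaranteed lowering.)
//
//	package main
//
//	import "fmt"
//
//	func main() {
//		m := map[string]int{"a": 1, "b": 2, "c": 3}
//		for k := range m {
//			delete(m, k) // the whole loop may compile to one mapclear call
//		}
//		fmt.Println(len(m)) // 0
//	}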
+func mapclear(t *maptype, h *hmap) { + if raceenabled && h != nil { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(mapclear) + racewritepc(unsafe.Pointer(h), callerpc, pc) + } + + if h == nil || h.count == 0 { + return + } + + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + + h.flags ^= hashWriting + + h.flags &^= sameSizeGrow + h.oldbuckets = nil + h.nevacuate = 0 + h.noverflow = 0 + h.count = 0 + + // Reset the hash seed to make it more difficult for attackers to + // repeatedly trigger hash collisions. See issue 25237. + h.hash0 = fastrand() + + // Keep the mapextra allocation but clear any extra information. + if h.extra != nil { + *h.extra = mapextra{} + } + + // makeBucketArray clears the memory pointed to by h.buckets + // and recovers any overflow buckets by generating them + // as if h.buckets was newly alloced. + _, nextOverflow := makeBucketArray(t, h.B, h.buckets) + if nextOverflow != nil { + // If overflow buckets are created then h.extra + // will have been allocated during initial bucket creation. + h.extra.nextOverflow = nextOverflow + } + + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting +} + +func hashGrow(t *maptype, h *hmap) { + // If we've hit the load factor, get bigger. + // Otherwise, there are too many overflow buckets, + // so keep the same number of buckets and "grow" laterally. + bigger := uint8(1) + if !overLoadFactor(h.count+1, h.B) { + bigger = 0 + h.flags |= sameSizeGrow + } + oldbuckets := h.buckets + newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger, nil) + + flags := h.flags &^ (iterator | oldIterator) + if h.flags&iterator != 0 { + flags |= oldIterator + } + // commit the grow (atomic wrt gc) + h.B += bigger + h.flags = flags + h.oldbuckets = oldbuckets + h.buckets = newbuckets + h.nevacuate = 0 + h.noverflow = 0 + + if h.extra != nil && h.extra.overflow != nil { + // Promote current overflow buckets to the old generation. + if h.extra.oldoverflow != nil { + throw("oldoverflow is not nil") + } + h.extra.oldoverflow = h.extra.overflow + h.extra.overflow = nil + } + if nextOverflow != nil { + if h.extra == nil { + h.extra = new(mapextra) + } + h.extra.nextOverflow = nextOverflow + } + + // the actual copying of the hash table data is done incrementally + // by growWork() and evacuate(). +} + +// overLoadFactor reports whether count items placed in 1<<B buckets is over loadFactor. +func overLoadFactor(count int, B uint8) bool { + return count > bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen) +} + +// tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<<B buckets. +// Note that most of these overflow buckets must be in sparse use; +// if use was dense, then we'd have already triggered regular map growth. +func tooManyOverflowBuckets(noverflow uint16, B uint8) bool { + // If the threshold is too low, we do extraneous work. + // If the threshold is too high, maps that grow and shrink can hold on to lots of unused memory. + // "too many" means (approximately) as many overflow buckets as regular buckets. + // See incrnoverflow for more details. + if B > 15 { + B = 15 + } + // The compiler doesn't see here that B < 16; mask B to generate shorter shift code. + return noverflow >= uint16(1)<<(B&15) +} + +// growing reports whether h is growing. The growth may be to the same size or bigger. 
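// (Editorial note on the triggers above: with loadFactorNum/loadFactorDen =
// 13/2 the map doubles once the average load would exceed 6.5 entries per
// bucket — e.g. with B = 5 (32 buckets) the threshold is 13*(32/2) = 208, so
// the 209th insert starts a grow — while tooManyOverflowBuckets forces a
// same-size grow once there are roughly as many overflow buckets as regular
// buckets.)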
+func (h *hmap) growing() bool { + return h.oldbuckets != nil +} + +// sameSizeGrow reports whether the current growth is to a map of the same size. +func (h *hmap) sameSizeGrow() bool { + return h.flags&sameSizeGrow != 0 +} + +// noldbuckets calculates the number of buckets prior to the current map growth. +func (h *hmap) noldbuckets() uintptr { + oldB := h.B + if !h.sameSizeGrow() { + oldB-- + } + return bucketShift(oldB) +} + +// oldbucketmask provides a mask that can be applied to calculate n % noldbuckets(). +func (h *hmap) oldbucketmask() uintptr { + return h.noldbuckets() - 1 +} + +func growWork(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate(t, h, h.nevacuate) + } +} + +func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool { + b := (*bmap)(add(h.oldbuckets, bucket*uintptr(t.bucketsize))) + return evacuated(b) +} + +// evacDst is an evacuation destination. +type evacDst struct { + b *bmap // current destination bucket + i int // key/elem index into b + k unsafe.Pointer // pointer to current key storage + e unsafe.Pointer // pointer to current elem storage +} + +func evacuate(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.e = add(x.k, bucketCnt*uintptr(t.keysize)) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.e = add(y.k, bucketCnt*uintptr(t.keysize)) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + e := add(k, bucketCnt*uintptr(t.keysize)) + for i := 0; i < bucketCnt; i, k, e = i+1, add(k, uintptr(t.keysize)), add(e, uintptr(t.elemsize)) { + top := b.tophash[i] + if isEmpty(top) { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + k2 := k + if t.indirectkey() { + k2 = *((*unsafe.Pointer)(k2)) + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/elem to bucket x or bucket y). + hash := t.hasher(k2, uintptr(h.hash0)) + if h.flags&iterator != 0 && !t.reflexivekey() && !t.key.equal(k2, k2) { + // If key != key (NaNs), then the hash could be (and probably + // will be) entirely different from the old hash. Moreover, + // it isn't reproducible. Reproducibility is required in the + // presence of iterators, as our evacuation decision must + // match whatever decision the iterator made. + // Fortunately, we have the freedom to send these keys either + // way. Also, tophash is meaningless for these kinds of keys. + // We let the low bit of tophash drive the evacuation decision. + // We recompute a new random tophash for the next level so + // these keys will get evenly distributed across all buckets + // after multiple grows. 
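// (Editorial sketch, not part of the vendored source: the NaN handling above
// is user-visible. Because NaN != NaN, every insert under a NaN key creates a
// distinct entry and no lookup can ever find one again, which is why
// evacuation relies on the tophash bit instead of re-hashing such keys.)
//
//	package main
//
//	import (
//		"fmt"
//		"math"
//	)
//
//	func main() {
//		m := map[float64]int{}
//		nan := math.NaN()
//		m[nan] = 1
//		m[nan] = 2          // a second, distinct entry: NaN != NaN
//		fmt.Println(len(m)) // 2
//		_, ok := m[nan]
//		fmt.Println(ok)     // false: a NaN key can never be looked up again
//	}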
+ useY = top & 1 + top = tophash(hash) + } else { + if hash&newbit != 0 { + useY = 1 + } + } + } + + if evacuatedX+1 != evacuatedY || evacuatedX^1 != evacuatedY { + throw("bad evacuatedN") + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.e = add(dst.k, bucketCnt*uintptr(t.keysize)) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + if t.indirectkey() { + *(*unsafe.Pointer)(dst.k) = k2 // copy pointer + } else { + typedmemmove(t.key, dst.k, k) // copy elem + } + if t.indirectelem() { + *(*unsafe.Pointer)(dst.e) = *(*unsafe.Pointer)(e) + } else { + typedmemmove(t.elem, dst.e, e) + } + dst.i++ + // These updates might push these pointers past the end of the + // key or elem arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, uintptr(t.keysize)) + dst.e = add(dst.e, uintptr(t.elemsize)) + } + } + // Unlink the overflow buckets & clear key/elem to help GC. + if h.flags&oldIterator == 0 && t.bucket.ptrdata != 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) { + h.nevacuate++ + // Experiments suggest that 1024 is overkill by at least an order of magnitude. + // Put it in there as a safeguard anyway, to ensure O(1) behavior. + stop := h.nevacuate + 1024 + if stop > newbit { + stop = newbit + } + for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { + h.nevacuate++ + } + if h.nevacuate == newbit { // newbit == # of oldbuckets + // Growing is all done. Free old main bucket array. + h.oldbuckets = nil + // Can discard old overflow buckets as well. + // If they are still referenced by an iterator, + // then the iterator holds a pointers to the slice. + if h.extra != nil { + h.extra.oldoverflow = nil + } + h.flags &^= sameSizeGrow + } +} + +// Reflect stubs. Called from ../reflect/asm_*.s + +//go:linkname reflect_makemap reflect.makemap +func reflect_makemap(t *maptype, cap int) *hmap { + // Check invariants and reflects math. 
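// (Editorial note: the checks below cross-check the map layout computed by
// the compiler's reflectdata code. Keys or elements larger than
// maxKeySize/maxElemSize — 128 bytes in this version of the runtime — are
// stored indirectly, so keysize/elemsize must then equal the pointer size;
// the remaining checks verify bucket alignment and padding invariants.)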
+ if t.key.equal == nil { + throw("runtime.reflect_makemap: unsupported map key type") + } + if t.key.size > maxKeySize && (!t.indirectkey() || t.keysize != uint8(goarch.PtrSize)) || + t.key.size <= maxKeySize && (t.indirectkey() || t.keysize != uint8(t.key.size)) { + throw("key size wrong") + } + if t.elem.size > maxElemSize && (!t.indirectelem() || t.elemsize != uint8(goarch.PtrSize)) || + t.elem.size <= maxElemSize && (t.indirectelem() || t.elemsize != uint8(t.elem.size)) { + throw("elem size wrong") + } + if t.key.align > bucketCnt { + throw("key align too big") + } + if t.elem.align > bucketCnt { + throw("elem align too big") + } + if t.key.size%uintptr(t.key.align) != 0 { + throw("key size not a multiple of key align") + } + if t.elem.size%uintptr(t.elem.align) != 0 { + throw("elem size not a multiple of elem align") + } + if bucketCnt < 8 { + throw("bucketsize too small for proper alignment") + } + if dataOffset%uintptr(t.key.align) != 0 { + throw("need padding in bucket (key)") + } + if dataOffset%uintptr(t.elem.align) != 0 { + throw("need padding in bucket (elem)") + } + + return makemap(t, cap, nil) +} + +//go:linkname reflect_mapaccess reflect.mapaccess +func reflect_mapaccess(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + elem, ok := mapaccess2(t, h, key) + if !ok { + // reflect wants nil for a missing element + elem = nil + } + return elem +} + +//go:linkname reflect_mapaccess_faststr reflect.mapaccess_faststr +func reflect_mapaccess_faststr(t *maptype, h *hmap, key string) unsafe.Pointer { + elem, ok := mapaccess2_faststr(t, h, key) + if !ok { + // reflect wants nil for a missing element + elem = nil + } + return elem +} + +//go:linkname reflect_mapassign reflect.mapassign +func reflect_mapassign(t *maptype, h *hmap, key unsafe.Pointer, elem unsafe.Pointer) { + p := mapassign(t, h, key) + typedmemmove(t.elem, p, elem) +} + +//go:linkname reflect_mapassign_faststr reflect.mapassign_faststr +func reflect_mapassign_faststr(t *maptype, h *hmap, key string, elem unsafe.Pointer) { + p := mapassign_faststr(t, h, key) + typedmemmove(t.elem, p, elem) +} + +//go:linkname reflect_mapdelete reflect.mapdelete +func reflect_mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { + mapdelete(t, h, key) +} + +//go:linkname reflect_mapdelete_faststr reflect.mapdelete_faststr +func reflect_mapdelete_faststr(t *maptype, h *hmap, key string) { + mapdelete_faststr(t, h, key) +} + +//go:linkname reflect_mapiterinit reflect.mapiterinit +func reflect_mapiterinit(t *maptype, h *hmap, it *hiter) { + mapiterinit(t, h, it) +} + +//go:linkname reflect_mapiternext reflect.mapiternext +func reflect_mapiternext(it *hiter) { + mapiternext(it) +} + +//go:linkname reflect_mapiterkey reflect.mapiterkey +func reflect_mapiterkey(it *hiter) unsafe.Pointer { + return it.key +} + +//go:linkname reflect_mapiterelem reflect.mapiterelem +func reflect_mapiterelem(it *hiter) unsafe.Pointer { + return it.elem +} + +//go:linkname reflect_maplen reflect.maplen +func reflect_maplen(h *hmap) int { + if h == nil { + return 0 + } + if raceenabled { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(reflect_maplen)) + } + return h.count +} + +//go:linkname reflectlite_maplen internal/reflectlite.maplen +func reflectlite_maplen(h *hmap) int { + if h == nil { + return 0 + } + if raceenabled { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(reflect_maplen)) + } + return h.count +} + +const maxZero = 1024 // must match value in 
reflect/value.go:maxZero cmd/compile/internal/gc/walk.go:zeroValSize +var zeroVal [maxZero]byte diff --git a/contrib/go/_std_1.18/src/runtime/map_fast32.go b/contrib/go/_std_1.18/src/runtime/map_fast32.go new file mode 100644 index 0000000000..e80caeef55 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/map_fast32.go @@ -0,0 +1,462 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "unsafe" +) + +func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess1_fast32)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]) + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + m := bucketMask(h.B) + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && !isEmpty(b.tophash[i]) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.elemsize)) + } + } + } + return unsafe.Pointer(&zeroVal[0]) +} + +func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess2_fast32)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]), false + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + m := bucketMask(h.B) + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && !isEmpty(b.tophash[i]) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.elemsize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_fast32)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapassign. 
+ h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if isEmpty(b.tophash[i]) { + if insertb == nil { + inserti = i + insertb = b + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { + continue + } + inserti = i + insertb = b + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*uint32)(insertk) = key + + h.count++ + +done: + elem := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.elemsize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return elem +} + +func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_fast32)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapassign. + h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if isEmpty(b.tophash[i]) { + if insertb == nil { + inserti = i + insertb = b + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { + continue + } + inserti = i + insertb = b + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. 
+ if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key + + h.count++ + +done: + elem := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.elemsize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return elem +} + +func mapdelete_fast32(t *maptype, h *hmap, key uint32) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapdelete_fast32)) + } + if h == nil || h.count == 0 { + return + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapdelete + h.flags ^= hashWriting + + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if key != *(*uint32)(k) || isEmpty(b.tophash[i]) { + continue + } + // Only clear key if there are pointers in it. + // This can only happen if pointers are 32 bit + // wide as 64 bit pointers do not fit into a 32 bit key. + if goarch.PtrSize == 4 && t.key.ptrdata != 0 { + // The key must be a pointer as we checked pointers are + // 32 bits wide and the key is 32 bits wide also. + *(*unsafe.Pointer)(k) = nil + } + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.elemsize)) + if t.elem.ptrdata != 0 { + memclrHasPointers(e, t.elem.size) + } else { + memclrNoHeapPointers(e, t.elem.size) + } + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. + c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: + h.count-- + // Reset the hash seed to make it more difficult for attackers to + // repeatedly trigger hash collisions. See issue 25237. 
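// (Editorial note: the seed is only replaced once the map is empty. Live
// entries were placed according to the old seed, so it cannot change while
// any remain; an empty map has nothing to rehash, making a fresh fastrand()
// seed free and future collision patterns unpredictable to an attacker.)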
+ if h.count == 0 { + h.hash0 = fastrand() + } + break search + } + } + + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting +} + +func growWork_fast32(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast32(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast32(t, h, h.nevacuate) + } +} + +func evacuate_fast32(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.e = add(x.k, bucketCnt*4) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.e = add(y.k, bucketCnt*4) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + e := add(k, bucketCnt*4) + for i := 0; i < bucketCnt; i, k, e = i+1, add(k, 4), add(e, uintptr(t.elemsize)) { + top := b.tophash[i] + if isEmpty(top) { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/elem to bucket x or bucket y). + hash := t.hasher(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.e = add(dst.k, bucketCnt*4) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if goarch.PtrSize == 4 && t.key.ptrdata != 0 && writeBarrier.enabled { + // Write with a write barrier. + *(*unsafe.Pointer)(dst.k) = *(*unsafe.Pointer)(k) + } else { + *(*uint32)(dst.k) = *(*uint32)(k) + } + + typedmemmove(t.elem, dst.e, e) + dst.i++ + // These updates might push these pointers past the end of the + // key or elem arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 4) + dst.e = add(dst.e, uintptr(t.elemsize)) + } + } + // Unlink the overflow buckets & clear key/elem to help GC. + if h.flags&oldIterator == 0 && t.bucket.ptrdata != 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. 
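// (Editorial note: clearing starts at dataOffset so the tophash array — which
// now records the evacuation state of each slot — survives, while the keys,
// elems and trailing overflow pointer are zeroed so the GC can reclaim
// whatever they referenced.)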
+ ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/contrib/go/_std_1.18/src/runtime/map_fast64.go b/contrib/go/_std_1.18/src/runtime/map_fast64.go new file mode 100644 index 0000000000..69d8872885 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/map_fast64.go @@ -0,0 +1,470 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "unsafe" +) + +func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess1_fast64)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]) + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + m := bucketMask(h.B) + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && !isEmpty(b.tophash[i]) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.elemsize)) + } + } + } + return unsafe.Pointer(&zeroVal[0]) +} + +func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess2_fast64)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]), false + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + m := bucketMask(h.B) + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. 
+ m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && !isEmpty(b.tophash[i]) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.elemsize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_fast64)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapassign. + h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast64(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if isEmpty(b.tophash[i]) { + if insertb == nil { + insertb = b + inserti = i + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + insertb = b + inserti = i + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*uint64)(insertk) = key + + h.count++ + +done: + elem := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.elemsize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return elem +} + +func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_fast64)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapassign. 
+ h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast64(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if isEmpty(b.tophash[i]) { + if insertb == nil { + insertb = b + inserti = i + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + insertb = b + inserti = i + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key + + h.count++ + +done: + elem := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.elemsize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return elem +} + +func mapdelete_fast64(t *maptype, h *hmap, key uint64) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapdelete_fast64)) + } + if h == nil || h.count == 0 { + return + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + + hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapdelete + h.flags ^= hashWriting + + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast64(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if key != *(*uint64)(k) || isEmpty(b.tophash[i]) { + continue + } + // Only clear key if there are pointers in it. + if t.key.ptrdata != 0 { + if goarch.PtrSize == 8 { + *(*unsafe.Pointer)(k) = nil + } else { + // There are three ways to squeeze at one ore more 32 bit pointers into 64 bits. + // Just call memclrHasPointers instead of trying to handle all cases here. + memclrHasPointers(k, 8) + } + } + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.elemsize)) + if t.elem.ptrdata != 0 { + memclrHasPointers(e, t.elem.size) + } else { + memclrNoHeapPointers(e, t.elem.size) + } + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. 
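// (Editorial note: emptyOne marks a single vacated slot, while emptyRest
// promises that this slot and every later slot in this bucket and in all
// following overflow buckets are empty, letting lookups stop early. The code
// below first checks that everything after this slot is already emptyRest,
// then walks backwards promoting the trailing run of emptyOne markers.)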
+ if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. + c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: + h.count-- + // Reset the hash seed to make it more difficult for attackers to + // repeatedly trigger hash collisions. See issue 25237. + if h.count == 0 { + h.hash0 = fastrand() + } + break search + } + } + + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting +} + +func growWork_fast64(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast64(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast64(t, h, h.nevacuate) + } +} + +func evacuate_fast64(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.e = add(x.k, bucketCnt*8) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.e = add(y.k, bucketCnt*8) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + e := add(k, bucketCnt*8) + for i := 0; i < bucketCnt; i, k, e = i+1, add(k, 8), add(e, uintptr(t.elemsize)) { + top := b.tophash[i] + if isEmpty(top) { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/elem to bucket x or bucket y). + hash := t.hasher(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.e = add(dst.k, bucketCnt*8) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if t.key.ptrdata != 0 && writeBarrier.enabled { + if goarch.PtrSize == 8 { + // Write with a write barrier. + *(*unsafe.Pointer)(dst.k) = *(*unsafe.Pointer)(k) + } else { + // There are three ways to squeeze at least one 32 bit pointer into 64 bits. + // Give up and call typedmemmove. + typedmemmove(t.key, dst.k, k) + } + } else { + *(*uint64)(dst.k) = *(*uint64)(k) + } + + typedmemmove(t.elem, dst.e, e) + dst.i++ + // These updates might push these pointers past the end of the + // key or elem arrays. 
That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 8) + dst.e = add(dst.e, uintptr(t.elemsize)) + } + } + // Unlink the overflow buckets & clear key/elem to help GC. + if h.flags&oldIterator == 0 && t.bucket.ptrdata != 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/contrib/go/_std_1.18/src/runtime/map_faststr.go b/contrib/go/_std_1.18/src/runtime/map_faststr.go new file mode 100644 index 0000000000..4dca882c63 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/map_faststr.go @@ -0,0 +1,485 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "unsafe" +) + +func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess1_faststr)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]) + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + key := stringStructOf(&ky) + if h.B == 0 { + // One-bucket table. + b := (*bmap)(h.buckets) + if key.len < 32 { + // short key, doing lots of comparisons is ok + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)) + } + } + return unsafe.Pointer(&zeroVal[0]) + } + // long key, try not to do more comparisons than necessary + keymaybe := uintptr(bucketCnt) + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } + continue + } + if k.str == key.str { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)) + } + // check first 4 bytes + if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) { + continue + } + // check last 4 bytes + if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) { + continue + } + if keymaybe != bucketCnt { + // Two keys are potential matches. Use hash to distinguish them. 
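// (Editorial note: the long-key path above avoids a full memequal per slot —
// candidates are filtered by length and by their first and last four bytes,
// and the first survivor is only remembered in keymaybe. A second candidate
// means the cheap filters are no longer decisive, so the code falls back to
// the hashed lookup at dohash below.)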
+ goto dohash + } + keymaybe = i + } + if keymaybe != bucketCnt { + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*goarch.PtrSize)) + if memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+keymaybe*uintptr(t.elemsize)) + } + } + return unsafe.Pointer(&zeroVal[0]) + } +dohash: + hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + m := bucketMask(h.B) + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)) + } + } + } + return unsafe.Pointer(&zeroVal[0]) +} + +func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racereadpc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapaccess2_faststr)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(&zeroVal[0]), false + } + if h.flags&hashWriting != 0 { + throw("concurrent map read and map write") + } + key := stringStructOf(&ky) + if h.B == 0 { + // One-bucket table. + b := (*bmap)(h.buckets) + if key.len < 32 { + // short key, doing lots of comparisons is ok + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)), true + } + } + return unsafe.Pointer(&zeroVal[0]), false + } + // long key, try not to do more comparisons than necessary + keymaybe := uintptr(bucketCnt) + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } + continue + } + if k.str == key.str { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)), true + } + // check first 4 bytes + if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) { + continue + } + // check last 4 bytes + if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) { + continue + } + if keymaybe != bucketCnt { + // Two keys are potential matches. Use hash to distinguish them. 
+ goto dohash + } + keymaybe = i + } + if keymaybe != bucketCnt { + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*goarch.PtrSize)) + if memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+keymaybe*uintptr(t.elemsize)), true + } + } + return unsafe.Pointer(&zeroVal[0]), false + } +dohash: + hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + m := bucketMask(h.B) + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + if !h.sameSizeGrow() { + // There used to be half as many buckets; mask down one more power of two. + m >>= 1 + } + oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_faststr)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + key := stringStructOf(&s) + hash := t.hasher(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapassign. + h.flags ^= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + +bucketloop: + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if isEmpty(b.tophash[i]) && insertb == nil { + insertb = b + inserti = i + } + if b.tophash[i] == emptyRest { + break bucketloop + } + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*goarch.PtrSize)) + if k.len != key.len { + continue + } + if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) { + continue + } + // already have a mapping for key. Update it. + inserti = i + insertb = b + // Overwrite existing key, so it can be garbage collected. + // The size is already guaranteed to be set correctly. + k.str = key.str + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // The current bucket and all the overflow buckets connected to it are full, allocate a new one. 
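Before an overflow bucket is chained on, mapassign_faststr first asks whether the whole table should grow instead (the overLoadFactor / tooManyOverflowBuckets check above). A rough sketch of the load-factor half of that test, with the constants written out locally (the runtime targets an average of about 6.5 entries per 8-slot bucket); treat the exact numbers as illustrative:

package main

import "fmt"

// Stand-ins for the constants in map.go: 8 entries per bucket and a target
// average of 6.5 entries per bucket, written as 13/2 to stay in integers.
const (
	bucketCnt     = 8
	loadFactorNum = 13
	loadFactorDen = 2
)

// overLoadFactor reports whether placing count items into 2^B buckets would
// push the average occupancy past the 6.5 target.
func overLoadFactor(count int, B uint8) bool {
	return count > bucketCnt && uint64(count) > loadFactorNum*(uint64(1)<<B)/loadFactorDen
}

func main() {
	fmt.Println(overLoadFactor(52, 3)) // false: 52 does not exceed 13*8/2 = 52
	fmt.Println(overLoadFactor(53, 3)) // true: time to double to 2^4 buckets
}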
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = top // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*2*goarch.PtrSize) + // store new key at insert position + *((*stringStruct)(insertk)) = *key + h.count++ + +done: + elem := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*2*goarch.PtrSize+inserti*uintptr(t.elemsize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return elem +} + +func mapdelete_faststr(t *maptype, h *hmap, ky string) { + if raceenabled && h != nil { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapdelete_faststr)) + } + if h == nil || h.count == 0 { + return + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + + key := stringStructOf(&ky) + hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + + // Set hashWriting after calling t.hasher for consistency with mapdelete + h.flags ^= hashWriting + + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*goarch.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) { + continue + } + // Clear key's pointer. + k.str = nil + e := add(unsafe.Pointer(b), dataOffset+bucketCnt*2*goarch.PtrSize+i*uintptr(t.elemsize)) + if t.elem.ptrdata != 0 { + memclrHasPointers(e, t.elem.size) + } else { + memclrNoHeapPointers(e, t.elem.size) + } + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. + c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: + h.count-- + // Reset the hash seed to make it more difficult for attackers to + // repeatedly trigger hash collisions. See issue 25237. + if h.count == 0 { + h.hash0 = fastrand() + } + break search + } + } + + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting +} + +func growWork_faststr(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_faststr(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_faststr(t, h, h.nevacuate) + } +} + +func evacuate_faststr(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. 
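When the table doubles, each old bucket's entries are split between an "x" destination (same index) and a "y" destination (index plus the old bucket count), and the choice depends only on the single hash bit that the larger bucket mask newly exposes. A small sketch of that decision, with invented names (evacDest, oldB) standing in for the evacuation code that follows:

package main

import "fmt"

// evacDest picks the destination bucket for one entry during a doubling grow.
// newbit is the old bucket count (1<<oldB); hash&newbit is exactly the bit
// the larger bucket mask newly exposes.
func evacDest(hash, oldbucket, newbit uintptr) uintptr {
	if hash&newbit != 0 {
		return oldbucket + newbit // the "y" (high) destination
	}
	return oldbucket // the "x" destination keeps the old index
}

func main() {
	const oldB = 3
	newbit := uintptr(1) << oldB // 8 old buckets
	hash := uintptr(0b1011)      // low 4 bits pick among 16 new buckets
	old := hash & (newbit - 1)   // this entry lived in old bucket 3
	fmt.Println(evacDest(hash, old, newbit)) // 11, i.e. 3+8: the "y" half
}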
+ var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.e = add(x.k, bucketCnt*2*goarch.PtrSize) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.e = add(y.k, bucketCnt*2*goarch.PtrSize) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + e := add(k, bucketCnt*2*goarch.PtrSize) + for i := 0; i < bucketCnt; i, k, e = i+1, add(k, 2*goarch.PtrSize), add(e, uintptr(t.elemsize)) { + top := b.tophash[i] + if isEmpty(top) { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/elem to bucket x or bucket y). + hash := t.hasher(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.e = add(dst.k, bucketCnt*2*goarch.PtrSize) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + *(*string)(dst.k) = *(*string)(k) + + typedmemmove(t.elem, dst.e, e) + dst.i++ + // These updates might push these pointers past the end of the + // key or elem arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 2*goarch.PtrSize) + dst.e = add(dst.e, uintptr(t.elemsize)) + } + } + // Unlink the overflow buckets & clear key/elem to help GC. + if h.flags&oldIterator == 0 && t.bucket.ptrdata != 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mbarrier.go b/contrib/go/_std_1.18/src/runtime/mbarrier.go new file mode 100644 index 0000000000..465c21f83f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mbarrier.go @@ -0,0 +1,343 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: write barriers. +// +// For the concurrent garbage collector, the Go compiler implements +// updates to pointer-valued fields that may be in heap objects by +// emitting calls to write barriers. The main write barrier for +// individual pointer writes is gcWriteBarrier and is implemented in +// assembly. This file contains write barrier entry points for bulk +// operations. See also mwbbuf.go. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "unsafe" +) + +// Go uses a hybrid barrier that combines a Yuasa-style deletion +// barrier—which shades the object whose reference is being +// overwritten—with Dijkstra insertion barrier—which shades the object +// whose reference is being written. 
The insertion part of the barrier +// is necessary while the calling goroutine's stack is grey. In +// pseudocode, the barrier is: +// +// writePointer(slot, ptr): +// shade(*slot) +// if current stack is grey: +// shade(ptr) +// *slot = ptr +// +// slot is the destination in Go code. +// ptr is the value that goes into the slot in Go code. +// +// Shade indicates that it has seen a white pointer by adding the referent +// to wbuf as well as marking it. +// +// The two shades and the condition work together to prevent a mutator +// from hiding an object from the garbage collector: +// +// 1. shade(*slot) prevents a mutator from hiding an object by moving +// the sole pointer to it from the heap to its stack. If it attempts +// to unlink an object from the heap, this will shade it. +// +// 2. shade(ptr) prevents a mutator from hiding an object by moving +// the sole pointer to it from its stack into a black object in the +// heap. If it attempts to install the pointer into a black object, +// this will shade it. +// +// 3. Once a goroutine's stack is black, the shade(ptr) becomes +// unnecessary. shade(ptr) prevents hiding an object by moving it from +// the stack to the heap, but this requires first having a pointer +// hidden on the stack. Immediately after a stack is scanned, it only +// points to shaded objects, so it's not hiding anything, and the +// shade(*slot) prevents it from hiding any other pointers on its +// stack. +// +// For a detailed description of this barrier and proof of +// correctness, see https://github.com/golang/proposal/blob/master/design/17503-eliminate-rescan.md +// +// +// +// Dealing with memory ordering: +// +// Both the Yuasa and Dijkstra barriers can be made conditional on the +// color of the object containing the slot. We chose not to make these +// conditional because the cost of ensuring that the object holding +// the slot doesn't concurrently change color without the mutator +// noticing seems prohibitive. +// +// Consider the following example where the mutator writes into +// a slot and then loads the slot's mark bit while the GC thread +// writes to the slot's mark bit and then as part of scanning reads +// the slot. +// +// Initially both [slot] and [slotmark] are 0 (nil) +// Mutator thread GC thread +// st [slot], ptr st [slotmark], 1 +// +// ld r1, [slotmark] ld r2, [slot] +// +// Without an expensive memory barrier between the st and the ld, the final +// result on most HW (including 386/amd64) can be r1==r2==0. This is a classic +// example of what can happen when loads are allowed to be reordered with older +// stores (avoiding such reorderings lies at the heart of the classic +// Peterson/Dekker algorithms for mutual exclusion). Rather than require memory +// barriers, which will slow down both the mutator and the GC, we always grey +// the ptr object regardless of the slot's color. +// +// Another place where we intentionally omit memory barriers is when +// accessing mheap_.arena_used to check if a pointer points into the +// heap. On relaxed memory machines, it's possible for a mutator to +// extend the size of the heap by updating arena_used, allocate an +// object from this new region, and publish a pointer to that object, +// but for tracing running on another processor to observe the pointer +// but use the old value of arena_used. In this case, tracing will not +// mark the object, even though it's reachable. 
However, the mutator +// is guaranteed to execute a write barrier when it publishes the +// pointer, so it will take care of marking the object. A general +// consequence of this is that the garbage collector may cache the +// value of mheap_.arena_used. (See issue #9984.) +// +// +// Stack writes: +// +// The compiler omits write barriers for writes to the current frame, +// but if a stack pointer has been passed down the call stack, the +// compiler will generate a write barrier for writes through that +// pointer (because it doesn't know it's not a heap pointer). +// +// One might be tempted to ignore the write barrier if slot points +// into to the stack. Don't do it! Mark termination only re-scans +// frames that have potentially been active since the concurrent scan, +// so it depends on write barriers to track changes to pointers in +// stack frames that have not been active. +// +// +// Global writes: +// +// The Go garbage collector requires write barriers when heap pointers +// are stored in globals. Many garbage collectors ignore writes to +// globals and instead pick up global -> heap pointers during +// termination. This increases pause time, so we instead rely on write +// barriers for writes to globals so that we don't have to rescan +// global during mark termination. +// +// +// Publication ordering: +// +// The write barrier is *pre-publication*, meaning that the write +// barrier happens prior to the *slot = ptr write that may make ptr +// reachable by some goroutine that currently cannot reach it. +// +// +// Signal handler pointer writes: +// +// In general, the signal handler cannot safely invoke the write +// barrier because it may run without a P or even during the write +// barrier. +// +// There is exactly one exception: profbuf.go omits a barrier during +// signal handler profile logging. That's safe only because of the +// deletion barrier. See profbuf.go for a detailed argument. If we +// remove the deletion barrier, we'll have to work out a new way to +// handle the profile logging. + +// typedmemmove copies a value of type t to dst from src. +// Must be nosplit, see #16026. +// +// TODO: Perfect for go:nosplitrec since we can't have a safe point +// anywhere in the bulk barrier or memmove. +// +//go:nosplit +func typedmemmove(typ *_type, dst, src unsafe.Pointer) { + if dst == src { + return + } + if writeBarrier.needed && typ.ptrdata != 0 { + bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.ptrdata) + } + // There's a race here: if some other goroutine can write to + // src, it may change some pointer in src after we've + // performed the write barrier but before we perform the + // memory copy. This safe because the write performed by that + // other goroutine must also be accompanied by a write + // barrier, so at worst we've unnecessarily greyed the old + // pointer that was in src. 
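The ordering the comment above relies on, barrier work first and the copy second, can be pictured with a toy deletion-plus-insertion pass over a slice of pointers; shade, greyQueue and typedCopy below are invented stand-ins for illustration only, not the runtime's wbBuf machinery:

package main

import "fmt"

type obj struct{ name string }

var greyQueue []*obj // invented stand-in for the GC's write-barrier buffer

func shade(p *obj) {
	if p != nil {
		greyQueue = append(greyQueue, p)
	}
}

// typedCopy shades before it copies: the pointer being overwritten (Yuasa,
// deletion) and the pointer being published (Dijkstra, insertion) are both
// reported to the collector before the destructive write. Assumes
// len(dst) >= len(src).
func typedCopy(dst, src []*obj) {
	for i := range src {
		shade(dst[i]) // old value, about to become unreachable through dst
		shade(src[i]) // new value, about to be published into dst
	}
	copy(dst, src) // the memmove-equivalent runs only after the barrier work
}

func main() {
	dst := []*obj{{name: "old"}}
	src := []*obj{{name: "new"}}
	typedCopy(dst, src)
	fmt.Println(len(greyQueue), dst[0].name) // 2 new
}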
+ memmove(dst, src, typ.size) + if writeBarrier.cgo { + cgoCheckMemmove(typ, dst, src, 0, typ.size) + } +} + +//go:linkname reflect_typedmemmove reflect.typedmemmove +func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) { + if raceenabled { + raceWriteObjectPC(typ, dst, getcallerpc(), abi.FuncPCABIInternal(reflect_typedmemmove)) + raceReadObjectPC(typ, src, getcallerpc(), abi.FuncPCABIInternal(reflect_typedmemmove)) + } + if msanenabled { + msanwrite(dst, typ.size) + msanread(src, typ.size) + } + if asanenabled { + asanwrite(dst, typ.size) + asanread(src, typ.size) + } + typedmemmove(typ, dst, src) +} + +//go:linkname reflectlite_typedmemmove internal/reflectlite.typedmemmove +func reflectlite_typedmemmove(typ *_type, dst, src unsafe.Pointer) { + reflect_typedmemmove(typ, dst, src) +} + +// typedmemmovepartial is like typedmemmove but assumes that +// dst and src point off bytes into the value and only copies size bytes. +// off must be a multiple of goarch.PtrSize. +//go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial +func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) { + if writeBarrier.needed && typ.ptrdata > off && size >= goarch.PtrSize { + if off&(goarch.PtrSize-1) != 0 { + panic("reflect: internal error: misaligned offset") + } + pwsize := alignDown(size, goarch.PtrSize) + if poff := typ.ptrdata - off; pwsize > poff { + pwsize = poff + } + bulkBarrierPreWrite(uintptr(dst), uintptr(src), pwsize) + } + + memmove(dst, src, size) + if writeBarrier.cgo { + cgoCheckMemmove(typ, dst, src, off, size) + } +} + +// reflectcallmove is invoked by reflectcall to copy the return values +// out of the stack and into the heap, invoking the necessary write +// barriers. dst, src, and size describe the return value area to +// copy. typ describes the entire frame (not just the return values). +// typ may be nil, which indicates write barriers are not needed. +// +// It must be nosplit and must only call nosplit functions because the +// stack map of reflectcall is wrong. +// +//go:nosplit +func reflectcallmove(typ *_type, dst, src unsafe.Pointer, size uintptr, regs *abi.RegArgs) { + if writeBarrier.needed && typ != nil && typ.ptrdata != 0 && size >= goarch.PtrSize { + bulkBarrierPreWrite(uintptr(dst), uintptr(src), size) + } + memmove(dst, src, size) + + // Move pointers returned in registers to a place where the GC can see them. + for i := range regs.Ints { + if regs.ReturnIsPtr.Get(i) { + regs.Ptrs[i] = unsafe.Pointer(regs.Ints[i]) + } + } +} + +//go:nosplit +func typedslicecopy(typ *_type, dstPtr unsafe.Pointer, dstLen int, srcPtr unsafe.Pointer, srcLen int) int { + n := dstLen + if n > srcLen { + n = srcLen + } + if n == 0 { + return 0 + } + + // The compiler emits calls to typedslicecopy before + // instrumentation runs, so unlike the other copying and + // assignment operations, it's not instrumented in the calling + // code and needs its own instrumentation. 
+ if raceenabled { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(slicecopy) + racewriterangepc(dstPtr, uintptr(n)*typ.size, callerpc, pc) + racereadrangepc(srcPtr, uintptr(n)*typ.size, callerpc, pc) + } + if msanenabled { + msanwrite(dstPtr, uintptr(n)*typ.size) + msanread(srcPtr, uintptr(n)*typ.size) + } + if asanenabled { + asanwrite(dstPtr, uintptr(n)*typ.size) + asanread(srcPtr, uintptr(n)*typ.size) + } + + if writeBarrier.cgo { + cgoCheckSliceCopy(typ, dstPtr, srcPtr, n) + } + + if dstPtr == srcPtr { + return n + } + + // Note: No point in checking typ.ptrdata here: + // compiler only emits calls to typedslicecopy for types with pointers, + // and growslice and reflect_typedslicecopy check for pointers + // before calling typedslicecopy. + size := uintptr(n) * typ.size + if writeBarrier.needed { + pwsize := size - typ.size + typ.ptrdata + bulkBarrierPreWrite(uintptr(dstPtr), uintptr(srcPtr), pwsize) + } + // See typedmemmove for a discussion of the race between the + // barrier and memmove. + memmove(dstPtr, srcPtr, size) + return n +} + +//go:linkname reflect_typedslicecopy reflect.typedslicecopy +func reflect_typedslicecopy(elemType *_type, dst, src slice) int { + if elemType.ptrdata == 0 { + return slicecopy(dst.array, dst.len, src.array, src.len, elemType.size) + } + return typedslicecopy(elemType, dst.array, dst.len, src.array, src.len) +} + +// typedmemclr clears the typed memory at ptr with type typ. The +// memory at ptr must already be initialized (and hence in type-safe +// state). If the memory is being initialized for the first time, see +// memclrNoHeapPointers. +// +// If the caller knows that typ has pointers, it can alternatively +// call memclrHasPointers. +// +//go:nosplit +func typedmemclr(typ *_type, ptr unsafe.Pointer) { + if writeBarrier.needed && typ.ptrdata != 0 { + bulkBarrierPreWrite(uintptr(ptr), 0, typ.ptrdata) + } + memclrNoHeapPointers(ptr, typ.size) +} + +//go:linkname reflect_typedmemclr reflect.typedmemclr +func reflect_typedmemclr(typ *_type, ptr unsafe.Pointer) { + typedmemclr(typ, ptr) +} + +//go:linkname reflect_typedmemclrpartial reflect.typedmemclrpartial +func reflect_typedmemclrpartial(typ *_type, ptr unsafe.Pointer, off, size uintptr) { + if writeBarrier.needed && typ.ptrdata != 0 { + bulkBarrierPreWrite(uintptr(ptr), 0, size) + } + memclrNoHeapPointers(ptr, size) +} + +// memclrHasPointers clears n bytes of typed memory starting at ptr. +// The caller must ensure that the type of the object at ptr has +// pointers, usually by checking typ.ptrdata. However, ptr +// does not have to point to the start of the allocation. +// +//go:nosplit +func memclrHasPointers(ptr unsafe.Pointer, n uintptr) { + bulkBarrierPreWrite(uintptr(ptr), 0, n) + memclrNoHeapPointers(ptr, n) +} diff --git a/contrib/go/_std_1.18/src/runtime/mbitmap.go b/contrib/go/_std_1.18/src/runtime/mbitmap.go new file mode 100644 index 0000000000..937968807b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mbitmap.go @@ -0,0 +1,2043 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: type and heap bitmaps. +// +// Stack, data, and bss bitmaps +// +// Stack frames and global variables in the data and bss sections are +// described by bitmaps with 1 bit per pointer-sized word. A "1" bit +// means the word is a live pointer to be visited by the GC (referred to +// as "pointer"). 
A "0" bit means the word should be ignored by GC +// (referred to as "scalar", though it could be a dead pointer value). +// +// Heap bitmap +// +// The heap bitmap comprises 2 bits for each pointer-sized word in the heap, +// stored in the heapArena metadata backing each heap arena. +// That is, if ha is the heapArena for the arena starting a start, +// then ha.bitmap[0] holds the 2-bit entries for the four words start +// through start+3*ptrSize, ha.bitmap[1] holds the entries for +// start+4*ptrSize through start+7*ptrSize, and so on. +// +// In each 2-bit entry, the lower bit is a pointer/scalar bit, just +// like in the stack/data bitmaps described above. The upper bit +// indicates scan/dead: a "1" value ("scan") indicates that there may +// be pointers in later words of the allocation, and a "0" value +// ("dead") indicates there are no more pointers in the allocation. If +// the upper bit is 0, the lower bit must also be 0, and this +// indicates scanning can ignore the rest of the allocation. +// +// The 2-bit entries are split when written into the byte, so that the top half +// of the byte contains 4 high (scan) bits and the bottom half contains 4 low +// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to +// keep the pointer bits contiguous, instead of having to space them out. +// +// The code makes use of the fact that the zero value for a heap +// bitmap means scalar/dead. This property must be preserved when +// modifying the encoding. +// +// The bitmap for noscan spans is not maintained. Code must ensure +// that an object is scannable before consulting its bitmap by +// checking either the noscan bit in the span or by consulting its +// type's information. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +const ( + bitPointer = 1 << 0 + bitScan = 1 << 4 + + heapBitsShift = 1 // shift offset between successive bitPointer or bitScan entries + wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte + + // all scan/pointer bits in a byte + bitScanAll = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift) + bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift) +) + +// addb returns the byte pointer p+n. +//go:nowritebarrier +//go:nosplit +func addb(p *byte, n uintptr) *byte { + // Note: wrote out full expression instead of calling add(p, n) + // to reduce the number of temporaries generated by the + // compiler for this trivial expression during inlining. + return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n)) +} + +// subtractb returns the byte pointer p-n. +//go:nowritebarrier +//go:nosplit +func subtractb(p *byte, n uintptr) *byte { + // Note: wrote out full expression instead of calling add(p, -n) + // to reduce the number of temporaries generated by the + // compiler for this trivial expression during inlining. + return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n)) +} + +// add1 returns the byte pointer p+1. +//go:nowritebarrier +//go:nosplit +func add1(p *byte) *byte { + // Note: wrote out full expression instead of calling addb(p, 1) + // to reduce the number of temporaries generated by the + // compiler for this trivial expression during inlining. + return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1)) +} + +// subtract1 returns the byte pointer p-1. 
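The sketch below unpacks one heap bitmap byte according to the split-nibble layout described in the file header above (pointer bits in the low nibble, scan bits in the high nibble, four words per byte); decodeHeapByte is an invented helper, not a runtime API:

package main

import "fmt"

const (
	bitPointer = 1 << 0 // low nibble: pointer/scalar, one bit per word
	bitScan    = 1 << 4 // high nibble: scan/dead, one bit per word
)

// decodeHeapByte unpacks the four 2-bit entries packed into one bitmap byte.
func decodeHeapByte(b uint8) (ptr, scan [4]bool) {
	for i := uint(0); i < 4; i++ {
		ptr[i] = (b>>i)&bitPointer != 0
		scan[i] = (b>>i)&bitScan != 0
	}
	return
}

func main() {
	// Word 0 is a pointer, word 1 is a scalar that is still worth scanning
	// past, and words 2 and 3 are "dead": scanning can stop there.
	b := uint8(bitPointer | bitScan | bitScan<<1)
	ptr, scan := decodeHeapByte(b)
	fmt.Println(ptr)  // [true false false false]
	fmt.Println(scan) // [true true false false]
}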
+//go:nowritebarrier +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit +func subtract1(p *byte) *byte { + // Note: wrote out full expression instead of calling subtractb(p, 1) + // to reduce the number of temporaries generated by the + // compiler for this trivial expression during inlining. + return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1)) +} + +// heapBits provides access to the bitmap bits for a single heap word. +// The methods on heapBits take value receivers so that the compiler +// can more easily inline calls to those methods and registerize the +// struct fields independently. +type heapBits struct { + bitp *uint8 + shift uint32 + arena uint32 // Index of heap arena containing bitp + last *uint8 // Last byte arena's bitmap +} + +// Make the compiler check that heapBits.arena is large enough to hold +// the maximum arena frame number. +var _ = heapBits{arena: (1<<heapAddrBits)/heapArenaBytes - 1} + +// markBits provides access to the mark bit for an object in the heap. +// bytep points to the byte holding the mark bit. +// mask is a byte with a single bit set that can be &ed with *bytep +// to see if the bit has been set. +// *m.byte&m.mask != 0 indicates the mark bit is set. +// index can be used along with span information to generate +// the address of the object in the heap. +// We maintain one set of mark bits for allocation and one for +// marking purposes. +type markBits struct { + bytep *uint8 + mask uint8 + index uintptr +} + +//go:nosplit +func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { + bytep, mask := s.allocBits.bitp(allocBitIndex) + return markBits{bytep, mask, allocBitIndex} +} + +// refillAllocCache takes 8 bytes s.allocBits starting at whichByte +// and negates them so that ctz (count trailing zeros) instructions +// can be used. It then places these 8 bytes into the cached 64 bit +// s.allocCache. +func (s *mspan) refillAllocCache(whichByte uintptr) { + bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte))) + aCache := uint64(0) + aCache |= uint64(bytes[0]) + aCache |= uint64(bytes[1]) << (1 * 8) + aCache |= uint64(bytes[2]) << (2 * 8) + aCache |= uint64(bytes[3]) << (3 * 8) + aCache |= uint64(bytes[4]) << (4 * 8) + aCache |= uint64(bytes[5]) << (5 * 8) + aCache |= uint64(bytes[6]) << (6 * 8) + aCache |= uint64(bytes[7]) << (7 * 8) + s.allocCache = ^aCache +} + +// nextFreeIndex returns the index of the next free object in s at +// or after s.freeindex. +// There are hardware instructions that can be used to make this +// faster if profiling warrants it. +func (s *mspan) nextFreeIndex() uintptr { + sfreeindex := s.freeindex + snelems := s.nelems + if sfreeindex == snelems { + return sfreeindex + } + if sfreeindex > snelems { + throw("s.freeindex > s.nelems") + } + + aCache := s.allocCache + + bitIndex := sys.Ctz64(aCache) + for bitIndex == 64 { + // Move index to start of next cached bits. + sfreeindex = (sfreeindex + 64) &^ (64 - 1) + if sfreeindex >= snelems { + s.freeindex = snelems + return snelems + } + whichByte := sfreeindex / 8 + // Refill s.allocCache with the next 64 alloc bits. + s.refillAllocCache(whichByte) + aCache = s.allocCache + bitIndex = sys.Ctz64(aCache) + // nothing available in cached bits + // grab the next 8 bytes and try again. 
+ } + result := sfreeindex + uintptr(bitIndex) + if result >= snelems { + s.freeindex = snelems + return snelems + } + + s.allocCache >>= uint(bitIndex + 1) + sfreeindex = result + 1 + + if sfreeindex%64 == 0 && sfreeindex != snelems { + // We just incremented s.freeindex so it isn't 0. + // As each 1 in s.allocCache was encountered and used for allocation + // it was shifted away. At this point s.allocCache contains all 0s. + // Refill s.allocCache so that it corresponds + // to the bits at s.allocBits starting at s.freeindex. + whichByte := sfreeindex / 8 + s.refillAllocCache(whichByte) + } + s.freeindex = sfreeindex + return result +} + +// isFree reports whether the index'th object in s is unallocated. +// +// The caller must ensure s.state is mSpanInUse, and there must have +// been no preemption points since ensuring this (which could allow a +// GC transition, which would allow the state to change). +func (s *mspan) isFree(index uintptr) bool { + if index < s.freeindex { + return false + } + bytep, mask := s.allocBits.bitp(index) + return *bytep&mask == 0 +} + +// divideByElemSize returns n/s.elemsize. +// n must be within [0, s.npages*_PageSize), +// or may be exactly s.npages*_PageSize +// if s.elemsize is from sizeclasses.go. +func (s *mspan) divideByElemSize(n uintptr) uintptr { + const doubleCheck = false + + // See explanation in mksizeclasses.go's computeDivMagic. + q := uintptr((uint64(n) * uint64(s.divMul)) >> 32) + + if doubleCheck && q != n/s.elemsize { + println(n, "/", s.elemsize, "should be", n/s.elemsize, "but got", q) + throw("bad magic division") + } + return q +} + +func (s *mspan) objIndex(p uintptr) uintptr { + return s.divideByElemSize(p - s.base()) +} + +func markBitsForAddr(p uintptr) markBits { + s := spanOf(p) + objIndex := s.objIndex(p) + return s.markBitsForIndex(objIndex) +} + +func (s *mspan) markBitsForIndex(objIndex uintptr) markBits { + bytep, mask := s.gcmarkBits.bitp(objIndex) + return markBits{bytep, mask, objIndex} +} + +func (s *mspan) markBitsForBase() markBits { + return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0} +} + +// isMarked reports whether mark bit m is set. +func (m markBits) isMarked() bool { + return *m.bytep&m.mask != 0 +} + +// setMarked sets the marked bit in the markbits, atomically. +func (m markBits) setMarked() { + // Might be racing with other updates, so use atomic update always. + // We used to be clever here and use a non-atomic update in certain + // cases, but it's not worth the risk. + atomic.Or8(m.bytep, m.mask) +} + +// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically. +func (m markBits) setMarkedNonAtomic() { + *m.bytep |= m.mask +} + +// clearMarked clears the marked bit in the markbits, atomically. +func (m markBits) clearMarked() { + // Might be racing with other updates, so use atomic update always. + // We used to be clever here and use a non-atomic update in certain + // cases, but it's not worth the risk. + atomic.And8(m.bytep, ^m.mask) +} + +// markBitsForSpan returns the markBits for the span base address base. +func markBitsForSpan(base uintptr) (mbits markBits) { + mbits = markBitsForAddr(base) + if mbits.mask != 1 { + throw("markBitsForSpan: unaligned start") + } + return mbits +} + +// advance advances the markBits to the next object in the span. 
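nextFreeIndex above leans on allocCache: 64 allocation bits are loaded at a time, inverted so that a set bit means "free", and a count-trailing-zeros instruction jumps straight to the next free slot. A simplified, single-word sketch of that trick (nextFree is an invented name; the real code also refills the cache from s.allocBits as it crosses 64-bit boundaries):

package main

import (
	"fmt"
	"math/bits"
)

// nextFree inverts the allocation bits so a 1 means "free", shifts away
// everything before 'from', and lets TrailingZeros64 find the first free
// slot in this 64-object window.
func nextFree(allocBits uint64, from uint) (uint, bool) {
	cache := ^allocBits >> from
	if cache == 0 {
		return 0, false // no free slot at or after 'from' in this word
	}
	return from + uint(bits.TrailingZeros64(cache)), true
}

func main() {
	// Objects 0, 1, 2 and 4 are allocated; 3 and everything from 5 up are free.
	var allocBits uint64 = 0b1_0111
	fmt.Println(nextFree(allocBits, 0)) // 3 true
	fmt.Println(nextFree(allocBits, 4)) // 5 true
}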
+func (m *markBits) advance() { + if m.mask == 1<<7 { + m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1)) + m.mask = 1 + } else { + m.mask = m.mask << 1 + } + m.index++ +} + +// heapBitsForAddr returns the heapBits for the address addr. +// The caller must ensure addr is in an allocated span. +// In particular, be careful not to point past the end of an object. +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit +func heapBitsForAddr(addr uintptr) (h heapBits) { + // 2 bits per word, 4 pairs per byte, and a mask is hard coded. + arena := arenaIndex(addr) + ha := mheap_.arenas[arena.l1()][arena.l2()] + // The compiler uses a load for nil checking ha, but in this + // case we'll almost never hit that cache line again, so it + // makes more sense to do a value check. + if ha == nil { + // addr is not in the heap. Return nil heapBits, which + // we expect to crash in the caller. + return + } + h.bitp = &ha.bitmap[(addr/(goarch.PtrSize*4))%heapArenaBitmapBytes] + h.shift = uint32((addr / goarch.PtrSize) & 3) + h.arena = uint32(arena) + h.last = &ha.bitmap[len(ha.bitmap)-1] + return +} + +// clobberdeadPtr is a special value that is used by the compiler to +// clobber dead stack slots, when -clobberdead flag is set. +const clobberdeadPtr = uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32)) + +// badPointer throws bad pointer in heap panic. +func badPointer(s *mspan, p, refBase, refOff uintptr) { + // Typically this indicates an incorrect use + // of unsafe or cgo to store a bad pointer in + // the Go heap. It may also indicate a runtime + // bug. + // + // TODO(austin): We could be more aggressive + // and detect pointers to unallocated objects + // in allocated spans. + printlock() + print("runtime: pointer ", hex(p)) + if s != nil { + state := s.state.get() + if state != mSpanInUse { + print(" to unallocated span") + } else { + print(" to unused region of span") + } + print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state) + } + print("\n") + if refBase != 0 { + print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n") + gcDumpObject("object", refBase, refOff) + } + getg().m.traceback = 2 + throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)") +} + +// findObject returns the base address for the heap object containing +// the address p, the object's span, and the index of the object in s. +// If p does not point into a heap object, it returns base == 0. +// +// If p points is an invalid heap pointer and debug.invalidptr != 0, +// findObject panics. +// +// refBase and refOff optionally give the base address of the object +// in which the pointer p was found and the byte offset at which it +// was found. These are used for error reporting. +// +// It is nosplit so it is safe for p to be a pointer to the current goroutine's stack. +// Since p is a uintptr, it would not be adjusted if the stack were to move. +//go:nosplit +func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex uintptr) { + s = spanOf(p) + // If s is nil, the virtual address has never been part of the heap. + // This pointer may be to some mmap'd region, so we allow it. + if s == nil { + if (GOARCH == "amd64" || GOARCH == "arm64") && p == clobberdeadPtr && debug.invalidptr != 0 { + // Crash if clobberdeadPtr is seen. Only on AMD64 and ARM64 for now, + // as they are the only platform where compiler's clobberdead mode is + // implemented. 
On these platforms clobberdeadPtr cannot be a valid address. + badPointer(s, p, refBase, refOff) + } + return + } + // If p is a bad pointer, it may not be in s's bounds. + // + // Check s.state to synchronize with span initialization + // before checking other fields. See also spanOfHeap. + if state := s.state.get(); state != mSpanInUse || p < s.base() || p >= s.limit { + // Pointers into stacks are also ok, the runtime manages these explicitly. + if state == mSpanManual { + return + } + // The following ensures that we are rigorous about what data + // structures hold valid pointers. + if debug.invalidptr != 0 { + badPointer(s, p, refBase, refOff) + } + return + } + + objIndex = s.objIndex(p) + base = s.base() + objIndex*s.elemsize + return +} + +// verifyNotInHeapPtr reports whether converting the not-in-heap pointer into a unsafe.Pointer is ok. +//go:linkname reflect_verifyNotInHeapPtr reflect.verifyNotInHeapPtr +func reflect_verifyNotInHeapPtr(p uintptr) bool { + // Conversion to a pointer is ok as long as findObject above does not call badPointer. + // Since we're already promised that p doesn't point into the heap, just disallow heap + // pointers and the special clobbered pointer. + return spanOf(p) == nil && p != clobberdeadPtr +} + +// next returns the heapBits describing the next pointer-sized word in memory. +// That is, if h describes address p, h.next() describes p+ptrSize. +// Note that next does not modify h. The caller must record the result. +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit +func (h heapBits) next() heapBits { + if h.shift < 3*heapBitsShift { + h.shift += heapBitsShift + } else if h.bitp != h.last { + h.bitp, h.shift = add1(h.bitp), 0 + } else { + // Move to the next arena. + return h.nextArena() + } + return h +} + +// nextArena advances h to the beginning of the next heap arena. +// +// This is a slow-path helper to next. gc's inliner knows that +// heapBits.next can be inlined even though it calls this. This is +// marked noinline so it doesn't get inlined into next and cause next +// to be too big to inline. +// +//go:nosplit +//go:noinline +func (h heapBits) nextArena() heapBits { + h.arena++ + ai := arenaIdx(h.arena) + l2 := mheap_.arenas[ai.l1()] + if l2 == nil { + // We just passed the end of the object, which + // was also the end of the heap. Poison h. It + // should never be dereferenced at this point. + return heapBits{} + } + ha := l2[ai.l2()] + if ha == nil { + return heapBits{} + } + h.bitp, h.shift = &ha.bitmap[0], 0 + h.last = &ha.bitmap[len(ha.bitmap)-1] + return h +} + +// forward returns the heapBits describing n pointer-sized words ahead of h in memory. +// That is, if h describes address p, h.forward(n) describes p+n*ptrSize. +// h.forward(1) is equivalent to h.next(), just slower. +// Note that forward does not modify h. The caller must record the result. +// bits returns the heap bits for the current word. +//go:nosplit +func (h heapBits) forward(n uintptr) heapBits { + n += uintptr(h.shift) / heapBitsShift + nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4 + h.shift = uint32(n%4) * heapBitsShift + if nbitp <= uintptr(unsafe.Pointer(h.last)) { + h.bitp = (*uint8)(unsafe.Pointer(nbitp)) + return h + } + + // We're in a new heap arena. 
+ past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1) + h.arena += 1 + uint32(past/heapArenaBitmapBytes) + ai := arenaIdx(h.arena) + if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil { + a := l2[ai.l2()] + h.bitp = &a.bitmap[past%heapArenaBitmapBytes] + h.last = &a.bitmap[len(a.bitmap)-1] + } else { + h.bitp, h.last = nil, nil + } + return h +} + +// forwardOrBoundary is like forward, but stops at boundaries between +// contiguous sections of the bitmap. It returns the number of words +// advanced over, which will be <= n. +func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) { + maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp))) + if n > maxn { + n = maxn + } + return h.forward(n), n +} + +// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer. +// The result includes in its higher bits the bits for subsequent words +// described by the same bitmap byte. +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit +func (h heapBits) bits() uint32 { + // The (shift & 31) eliminates a test and conditional branch + // from the generated code. + return uint32(*h.bitp) >> (h.shift & 31) +} + +// morePointers reports whether this word and all remaining words in this object +// are scalars. +// h must not describe the second word of the object. +func (h heapBits) morePointers() bool { + return h.bits()&bitScan != 0 +} + +// isPointer reports whether the heap bits describe a pointer word. +// +// nosplit because it is used during write barriers and must not be preempted. +//go:nosplit +func (h heapBits) isPointer() bool { + return h.bits()&bitPointer != 0 +} + +// bulkBarrierPreWrite executes a write barrier +// for every pointer slot in the memory range [src, src+size), +// using pointer/scalar information from [dst, dst+size). +// This executes the write barriers necessary before a memmove. +// src, dst, and size must be pointer-aligned. +// The range [dst, dst+size) must lie within a single object. +// It does not perform the actual writes. +// +// As a special case, src == 0 indicates that this is being used for a +// memclr. bulkBarrierPreWrite will pass 0 for the src of each write +// barrier. +// +// Callers should call bulkBarrierPreWrite immediately before +// calling memmove(dst, src, size). This function is marked nosplit +// to avoid being preempted; the GC must not stop the goroutine +// between the memmove and the execution of the barriers. +// The caller is also responsible for cgo pointer checks if this +// may be writing Go pointers into non-Go memory. +// +// The pointer bitmap is not maintained for allocations containing +// no pointers at all; any caller of bulkBarrierPreWrite must first +// make sure the underlying allocation contains pointers, usually +// by checking typ.ptrdata. +// +// Callers must perform cgo checks if writeBarrier.cgo. +// +//go:nosplit +func bulkBarrierPreWrite(dst, src, size uintptr) { + if (dst|src|size)&(goarch.PtrSize-1) != 0 { + throw("bulkBarrierPreWrite: unaligned arguments") + } + if !writeBarrier.needed { + return + } + if s := spanOf(dst); s == nil { + // If dst is a global, use the data or BSS bitmaps to + // execute write barriers. 
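For globals the barrier falls back to a 1-bit pointer bitmap walk (bulkBarrierBitmap, further below), which skips eight words at a time whenever a whole bitmap byte is zero. A standalone sketch of that walk that merely counts pointer words instead of enqueuing barriers; countPointerWords and its layout assumptions are illustrative only:

package main

import "fmt"

// countPointerWords walks a 1-bit pointer bitmap (one bit per word, LSB
// first): a bitmap byte whose value is zero lets the loop skip eight words
// at once. bits must cover at least nwords bits.
func countPointerWords(bits []byte, nwords int) int {
	count := 0
	mask := byte(1)
	byteIdx := 0
	for i := 0; i < nwords; i++ {
		if mask == 0 { // wrapped past bit 7: move to the next bitmap byte
			byteIdx++
			mask = 1
			if bits[byteIdx] == 0 {
				i += 7 // whole byte is scalar words; skip them
				mask = 0
				continue
			}
		}
		if bits[byteIdx]&mask != 0 {
			count++
		}
		mask <<= 1
	}
	return count
}

func main() {
	// Words 0 and 3 are pointers, words 8..15 are all scalars, word 17 is a pointer.
	bits := []byte{0b0000_1001, 0b0000_0000, 0b0000_0010}
	fmt.Println(countPointerWords(bits, 24)) // 3
}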
+ for _, datap := range activeModules() { + if datap.data <= dst && dst < datap.edata { + bulkBarrierBitmap(dst, src, size, dst-datap.data, datap.gcdatamask.bytedata) + return + } + } + for _, datap := range activeModules() { + if datap.bss <= dst && dst < datap.ebss { + bulkBarrierBitmap(dst, src, size, dst-datap.bss, datap.gcbssmask.bytedata) + return + } + } + return + } else if s.state.get() != mSpanInUse || dst < s.base() || s.limit <= dst { + // dst was heap memory at some point, but isn't now. + // It can't be a global. It must be either our stack, + // or in the case of direct channel sends, it could be + // another stack. Either way, no need for barriers. + // This will also catch if dst is in a freed span, + // though that should never have. + return + } + + buf := &getg().m.p.ptr().wbBuf + h := heapBitsForAddr(dst) + if src == 0 { + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } + } + h = h.next() + } + } else { + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } + } + h = h.next() + } + } +} + +// bulkBarrierPreWriteSrcOnly is like bulkBarrierPreWrite but +// does not execute write barriers for [dst, dst+size). +// +// In addition to the requirements of bulkBarrierPreWrite +// callers need to ensure [dst, dst+size) is zeroed. +// +// This is used for special cases where e.g. dst was just +// created and zeroed with malloc. +//go:nosplit +func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { + if (dst|src|size)&(goarch.PtrSize-1) != 0 { + throw("bulkBarrierPreWrite: unaligned arguments") + } + if !writeBarrier.needed { + return + } + buf := &getg().m.p.ptr().wbBuf + h := heapBitsForAddr(dst) + for i := uintptr(0); i < size; i += goarch.PtrSize { + if h.isPointer() { + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(0, *srcx) { + wbBufFlush(nil, 0) + } + } + h = h.next() + } +} + +// bulkBarrierBitmap executes write barriers for copying from [src, +// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is +// assumed to start maskOffset bytes into the data covered by the +// bitmap in bits (which may not be a multiple of 8). +// +// This is used by bulkBarrierPreWrite for writes to data and BSS. +// +//go:nosplit +func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { + word := maskOffset / goarch.PtrSize + bits = addb(bits, word/8) + mask := uint8(1) << (word % 8) + + buf := &getg().m.p.ptr().wbBuf + for i := uintptr(0); i < size; i += goarch.PtrSize { + if mask == 0 { + bits = addb(bits, 1) + if *bits == 0 { + // Skip 8 words. + i += 7 * goarch.PtrSize + continue + } + mask = 1 + } + if *bits&mask != 0 { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + if src == 0 { + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } + } else { + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } + } + } + mask <<= 1 + } +} + +// typeBitsBulkBarrier executes a write barrier for every +// pointer that would be copied from [src, src+size) to [dst, +// dst+size) by a memmove using the type bitmap to locate those +// pointer slots. +// +// The type typ must correspond exactly to [src, src+size) and [dst, dst+size). +// dst, src, and size must be pointer-aligned. 
+// The type typ must have a plain bitmap, not a GC program. +// The only use of this function is in channel sends, and the +// 64 kB channel element limit takes care of this for us. +// +// Must not be preempted because it typically runs right before memmove, +// and the GC must observe them as an atomic action. +// +// Callers must perform cgo checks if writeBarrier.cgo. +// +//go:nosplit +func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) { + if typ == nil { + throw("runtime: typeBitsBulkBarrier without type") + } + if typ.size != size { + println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size) + throw("runtime: invalid typeBitsBulkBarrier") + } + if typ.kind&kindGCProg != 0 { + println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog") + throw("runtime: invalid typeBitsBulkBarrier") + } + if !writeBarrier.needed { + return + } + ptrmask := typ.gcdata + buf := &getg().m.p.ptr().wbBuf + var bits uint32 + for i := uintptr(0); i < typ.ptrdata; i += goarch.PtrSize { + if i&(goarch.PtrSize*8-1) == 0 { + bits = uint32(*ptrmask) + ptrmask = addb(ptrmask, 1) + } else { + bits = bits >> 1 + } + if bits&1 != 0 { + dstx := (*uintptr)(unsafe.Pointer(dst + i)) + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } + } + } +} + +// The methods operating on spans all require that h has been returned +// by heapBitsForSpan and that size, n, total are the span layout description +// returned by the mspan's layout method. +// If total > size*n, it means that there is extra leftover memory in the span, +// usually due to rounding. +// +// TODO(rsc): Perhaps introduce a different heapBitsSpan type. + +// initSpan initializes the heap bitmap for a span. +// If this is a span of pointer-sized objects, it initializes all +// words to pointer/scan. +// Otherwise, it initializes all words to scalar/dead. +func (h heapBits) initSpan(s *mspan) { + // Clear bits corresponding to objects. + nw := (s.npages << _PageShift) / goarch.PtrSize + if nw%wordsPerBitmapByte != 0 { + throw("initSpan: unaligned length") + } + if h.shift != 0 { + throw("initSpan: unaligned base") + } + isPtrs := goarch.PtrSize == 8 && s.elemsize == goarch.PtrSize + for nw > 0 { + hNext, anw := h.forwardOrBoundary(nw) + nbyte := anw / wordsPerBitmapByte + if isPtrs { + bitp := h.bitp + for i := uintptr(0); i < nbyte; i++ { + *bitp = bitPointerAll | bitScanAll + bitp = add1(bitp) + } + } else { + memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte) + } + h = hNext + nw -= anw + } +} + +// countAlloc returns the number of objects allocated in span s by +// scanning the allocation bitmap. +func (s *mspan) countAlloc() int { + count := 0 + bytes := divRoundUp(s.nelems, 8) + // Iterate over each 8-byte chunk and count allocations + // with an intrinsic. Note that newMarkBits guarantees that + // gcmarkBits will be 8-byte aligned, so we don't have to + // worry about edge cases, irrelevant bits will simply be zero. + for i := uintptr(0); i < bytes; i += 8 { + // Extract 64 bits from the byte pointer and get a OnesCount. + // Note that the unsafe cast here doesn't preserve endianness, + // but that's OK. We only care about how many bits are 1, not + // about the order we discover them in. 
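countAlloc's loop below is plain population counting: reinterpret the mark bitmap eight bytes at a time and add up the one bits, with byte order irrelevant because only the count matters. A user-level equivalent using math/bits (countMarked is an invented helper, and it assumes the bitmap is padded to a multiple of eight bytes so stray padding bits are zero):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// countMarked totals the set bits in a mark bitmap, 64 bits at a time.
func countMarked(markBits []byte) int {
	count := 0
	for i := 0; i+8 <= len(markBits); i += 8 {
		word := binary.LittleEndian.Uint64(markBits[i : i+8])
		count += bits.OnesCount64(word)
	}
	return count
}

func main() {
	bitmap := make([]byte, 16)
	bitmap[0] = 0b1010_0001 // objects 0, 5 and 7 marked
	bitmap[9] = 0b0000_0100 // object 74 marked
	fmt.Println(countMarked(bitmap)) // 4
}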
+ mrkBits := *(*uint64)(unsafe.Pointer(s.gcmarkBits.bytep(i))) + count += sys.OnesCount64(mrkBits) + } + return count +} + +// heapBitsSetType records that the new allocation [x, x+size) +// holds in [x, x+dataSize) one or more values of type typ. +// (The number of values is given by dataSize / typ.size.) +// If dataSize < size, the fragment [x+dataSize, x+size) is +// recorded as non-pointer data. +// It is known that the type has pointers somewhere; +// malloc does not call heapBitsSetType when there are no pointers, +// because all free objects are marked as noscan during +// heapBitsSweepSpan. +// +// There can only be one allocation from a given span active at a time, +// and the bitmap for a span always falls on byte boundaries, +// so there are no write-write races for access to the heap bitmap. +// Hence, heapBitsSetType can access the bitmap without atomics. +// +// There can be read-write races between heapBitsSetType and things +// that read the heap bitmap like scanobject. However, since +// heapBitsSetType is only used for objects that have not yet been +// made reachable, readers will ignore bits being modified by this +// function. This does mean this function cannot transiently modify +// bits that belong to neighboring objects. Also, on weakly-ordered +// machines, callers must execute a store/store (publication) barrier +// between calling this function and making the object reachable. +func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { + const doubleCheck = false // slow but helpful; enable to test modifications to this code + + const ( + mask1 = bitPointer | bitScan // 00010001 + mask2 = bitPointer | bitScan | mask1<<heapBitsShift // 00110011 + mask3 = bitPointer | bitScan | mask2<<heapBitsShift // 01110111 + ) + + // dataSize is always size rounded up to the next malloc size class, + // except in the case of allocating a defer block, in which case + // size is sizeof(_defer{}) (at least 6 words) and dataSize may be + // arbitrarily larger. + // + // The checks for size == goarch.PtrSize and size == 2*goarch.PtrSize can therefore + // assume that dataSize == size without checking it explicitly. + + if goarch.PtrSize == 8 && size == goarch.PtrSize { + // It's one word and it has pointers, it must be a pointer. + // Since all allocated one-word objects are pointers + // (non-pointers are aggregated into tinySize allocations), + // initSpan sets the pointer bits for us. Nothing to do here. + if doubleCheck { + h := heapBitsForAddr(x) + if !h.isPointer() { + throw("heapBitsSetType: pointer bit missing") + } + if !h.morePointers() { + throw("heapBitsSetType: scan bit missing") + } + } + return + } + + h := heapBitsForAddr(x) + ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below) + + // 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits. + // Therefore, these objects share a heap bitmap byte with the objects next to them. + // These are called out as a special case primarily so the code below can assume all + // objects are at least 4 words long and that their bitmaps start either at the beginning + // of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively). + + if size == 2*goarch.PtrSize { + if typ.size == goarch.PtrSize { + // We're allocating a block big enough to hold two pointers. + // On 64-bit, that means the actual object must be two pointers, + // or else we'd have used the one-pointer-sized block. + // On 32-bit, however, this is the 8-byte block, the smallest one. 
+ // So it could be that we're allocating one pointer and this was + // just the smallest block available. Distinguish by checking dataSize. + // (In general the number of instances of typ being allocated is + // dataSize/typ.size.) + if goarch.PtrSize == 4 && dataSize == goarch.PtrSize { + // 1 pointer object. On 32-bit machines clear the bit for the + // unused second word. + *h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift + *h.bitp |= (bitPointer | bitScan) << h.shift + } else { + // 2-element array of pointer. + *h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift + } + return + } + // Otherwise typ.size must be 2*goarch.PtrSize, + // and typ.kind&kindGCProg == 0. + if doubleCheck { + if typ.size != 2*goarch.PtrSize || typ.kind&kindGCProg != 0 { + print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n") + throw("heapBitsSetType") + } + } + b := uint32(*ptrmask) + hb := b & 3 + hb |= bitScanAll & ((bitScan << (typ.ptrdata / goarch.PtrSize)) - 1) + // Clear the bits for this object so we can set the + // appropriate ones. + *h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift + *h.bitp |= uint8(hb << h.shift) + return + } else if size == 3*goarch.PtrSize { + b := uint8(*ptrmask) + if doubleCheck { + if b == 0 { + println("runtime: invalid type ", typ.string()) + throw("heapBitsSetType: called with non-pointer type") + } + if goarch.PtrSize != 8 { + throw("heapBitsSetType: unexpected 3 pointer wide size class on 32 bit") + } + if typ.kind&kindGCProg != 0 { + throw("heapBitsSetType: unexpected GC prog for 3 pointer wide size class") + } + if typ.size == 2*goarch.PtrSize { + print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, "\n") + throw("heapBitsSetType: inconsistent object sizes") + } + } + if typ.size == goarch.PtrSize { + // The type contains a pointer otherwise heapBitsSetType wouldn't have been called. + // Since the type is only 1 pointer wide and contains a pointer, its gcdata must be exactly 1. + if doubleCheck && *typ.gcdata != 1 { + print("runtime: heapBitsSetType size=", size, " typ.size=", typ.size, "but *typ.gcdata", *typ.gcdata, "\n") + throw("heapBitsSetType: unexpected gcdata for 1 pointer wide type size in 3 pointer wide size class") + } + // 3 element array of pointers. Unrolling ptrmask 3 times into p yields 00000111. + b = 7 + } + + hb := b & 7 + // Set bitScan bits for all pointers. + hb |= hb << wordsPerBitmapByte + // First bitScan bit is always set since the type contains pointers. + hb |= bitScan + // Second bitScan bit needs to also be set if the third bitScan bit is set. + hb |= hb & (bitScan << (2 * heapBitsShift)) >> 1 + + // For h.shift > 1 heap bits cross a byte boundary and need to be written part + // to h.bitp and part to the next h.bitp. + switch h.shift { + case 0: + *h.bitp &^= mask3 << 0 + *h.bitp |= hb << 0 + case 1: + *h.bitp &^= mask3 << 1 + *h.bitp |= hb << 1 + case 2: + *h.bitp &^= mask2 << 2 + *h.bitp |= (hb & mask2) << 2 + // Two words written to the first byte. + // Advance two words to get to the next byte. + h = h.next().next() + *h.bitp &^= mask1 + *h.bitp |= (hb >> 2) & mask1 + case 3: + *h.bitp &^= mask1 << 3 + *h.bitp |= (hb & mask1) << 3 + // One word written to the first byte. + // Advance one word to get to the next byte. 
+ h = h.next() + *h.bitp &^= mask2 + *h.bitp |= (hb >> 1) & mask2 + } + return + } + + // Copy from 1-bit ptrmask into 2-bit bitmap. + // The basic approach is to use a single uintptr as a bit buffer, + // alternating between reloading the buffer and writing bitmap bytes. + // In general, one load can supply two bitmap byte writes. + // This is a lot of lines of code, but it compiles into relatively few + // machine instructions. + + outOfPlace := false + if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrandn(2) == 0) { + // This object spans heap arenas, so the bitmap may be + // discontiguous. Unroll it into the object instead + // and then copy it out. + // + // In doubleCheck mode, we randomly do this anyway to + // stress test the bitmap copying path. + outOfPlace = true + h.bitp = (*uint8)(unsafe.Pointer(x)) + h.last = nil + } + + var ( + // Ptrmask input. + p *byte // last ptrmask byte read + b uintptr // ptrmask bits already loaded + nb uintptr // number of bits in b at next read + endp *byte // final ptrmask byte to read (then repeat) + endnb uintptr // number of valid bits in *endp + pbits uintptr // alternate source of bits + + // Heap bitmap output. + w uintptr // words processed + nw uintptr // number of words to process + hbitp *byte // next heap bitmap byte to write + hb uintptr // bits being prepared for *hbitp + ) + + hbitp = h.bitp + + // Handle GC program. Delayed until this part of the code + // so that we can use the same double-checking mechanism + // as the 1-bit case. Nothing above could have encountered + // GC programs: the cases were all too small. + if typ.kind&kindGCProg != 0 { + heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4)) + if doubleCheck { + // Double-check the heap bits written by GC program + // by running the GC program to create a 1-bit pointer mask + // and then jumping to the double-check code below. + // This doesn't catch bugs shared between the 1-bit and 4-bit + // GC program execution, but it does catch mistakes specific + // to just one of those and bugs in heapBitsSetTypeGCProg's + // implementation of arrays. + lock(&debugPtrmask.lock) + if debugPtrmask.data == nil { + debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys)) + } + ptrmask = debugPtrmask.data + runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1) + } + goto Phase4 + } + + // Note about sizes: + // + // typ.size is the number of words in the object, + // and typ.ptrdata is the number of words in the prefix + // of the object that contains pointers. That is, the final + // typ.size - typ.ptrdata words contain no pointers. + // This allows optimization of a common pattern where + // an object has a small header followed by a large scalar + // buffer. If we know the pointers are over, we don't have + // to scan the buffer's heap bitmap at all. + // The 1-bit ptrmasks are sized to contain only bits for + // the typ.ptrdata prefix, zero padded out to a full byte + // of bitmap. This code sets nw (below) so that heap bitmap + // bits are only written for the typ.ptrdata prefix; if there is + // more room in the allocated object, the next heap bitmap + // entry is a 00, indicating that there are no more pointers + // to scan. So only the ptrmask for the ptrdata bytes is needed. + // + // Replicated copies are not as nice: if there is an array of + // objects with scalar tails, all but the last tail does have to + // be initialized, because there is no way to say "skip forward". 
+ // However, because of the possibility of a repeated type with + // size not a multiple of 4 pointers (one heap bitmap byte), + // the code already must handle the last ptrmask byte specially + // by treating it as containing only the bits for endnb pointers, + // where endnb <= 4. We represent large scalar tails that must + // be expanded in the replication by setting endnb larger than 4. + // This will have the effect of reading many bits out of b, + // but once the real bits are shifted out, b will supply as many + // zero bits as we try to read, which is exactly what we need. + + p = ptrmask + if typ.size < dataSize { + // Filling in bits for an array of typ. + // Set up for repetition of ptrmask during main loop. + // Note that ptrmask describes only a prefix of + const maxBits = goarch.PtrSize*8 - 7 + if typ.ptrdata/goarch.PtrSize <= maxBits { + // Entire ptrmask fits in uintptr with room for a byte fragment. + // Load into pbits and never read from ptrmask again. + // This is especially important when the ptrmask has + // fewer than 8 bits in it; otherwise the reload in the middle + // of the Phase 2 loop would itself need to loop to gather + // at least 8 bits. + + // Accumulate ptrmask into b. + // ptrmask is sized to describe only typ.ptrdata, but we record + // it as describing typ.size bytes, since all the high bits are zero. + nb = typ.ptrdata / goarch.PtrSize + for i := uintptr(0); i < nb; i += 8 { + b |= uintptr(*p) << i + p = add1(p) + } + nb = typ.size / goarch.PtrSize + + // Replicate ptrmask to fill entire pbits uintptr. + // Doubling and truncating is fewer steps than + // iterating by nb each time. (nb could be 1.) + // Since we loaded typ.ptrdata/goarch.PtrSize bits + // but are pretending to have typ.size/goarch.PtrSize, + // there might be no replication necessary/possible. + pbits = b + endnb = nb + if nb+nb <= maxBits { + for endnb <= goarch.PtrSize*8 { + pbits |= pbits << endnb + endnb += endnb + } + // Truncate to a multiple of original ptrmask. + // Because nb+nb <= maxBits, nb fits in a byte. + // Byte division is cheaper than uintptr division. + endnb = uintptr(maxBits/byte(nb)) * nb + pbits &= 1<<endnb - 1 + b = pbits + nb = endnb + } + + // Clear p and endp as sentinel for using pbits. + // Checked during Phase 2 loop. + p = nil + endp = nil + } else { + // Ptrmask is larger. Read it multiple times. + n := (typ.ptrdata/goarch.PtrSize+7)/8 - 1 + endp = addb(ptrmask, n) + endnb = typ.size/goarch.PtrSize - n*8 + } + } + if p != nil { + b = uintptr(*p) + p = add1(p) + nb = 8 + } + + if typ.size == dataSize { + // Single entry: can stop once we reach the non-pointer data. + nw = typ.ptrdata / goarch.PtrSize + } else { + // Repeated instances of typ in an array. + // Have to process first N-1 entries in full, but can stop + // once we reach the non-pointer data in the final entry. + nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / goarch.PtrSize + } + if nw == 0 { + // No pointers! Caller was supposed to check. + println("runtime: invalid type ", typ.string()) + throw("heapBitsSetType: called with non-pointer type") + return + } + + // Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2). + // The leading byte is special because it contains the bits for word 1, + // which does not have the scan bit set. + // The leading half-byte is special because it's a half a byte, + // so we have to be careful with the bits already there. 
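+	// To make the 2-bit encoding concrete, consider a hypothetical 4-word
+	// object with h.shift == 0 and ptrmask 0b1001 (pointers in words 0 and 3):
+	// phases 1-3 below produce the single bitmap byte 0xf9, pointer bits 1001
+	// in the low nibble and scan bits 1111 in the high nibble.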
+ switch { + default: + throw("heapBitsSetType: unexpected shift") + + case h.shift == 0: + // Ptrmask and heap bitmap are aligned. + // + // This is a fast path for small objects. + // + // The first byte we write out covers the first four + // words of the object. The scan/dead bit on the first + // word must be set to scan since there are pointers + // somewhere in the object. + // In all following words, we set the scan/dead + // appropriately to indicate that the object continues + // to the next 2-bit entry in the bitmap. + // + // We set four bits at a time here, but if the object + // is fewer than four words, phase 3 will clear + // unnecessary bits. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + goto Phase3 + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + nb -= 4 + + case h.shift == 2: + // Ptrmask and heap bitmap are misaligned. + // + // On 32 bit architectures only the 6-word object that corresponds + // to a 24 bytes size class can start with h.shift of 2 here since + // all other non 16 byte aligned size classes have been handled by + // special code paths at the beginning of heapBitsSetType on 32 bit. + // + // Many size classes are only 16 byte aligned. On 64 bit architectures + // this results in a heap bitmap position starting with a h.shift of 2. + // + // The bits for the first two words are in a byte shared + // with another object, so we must be careful with the bits + // already there. + // + // We took care of 1-word, 2-word, and 3-word objects above, + // so this is at least a 6-word object. + hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift) + hb |= bitScan << (2 * heapBitsShift) + if nw > 1 { + hb |= bitScan << (3 * heapBitsShift) + } + b >>= 2 + nb -= 2 + *hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift)) + *hbitp |= uint8(hb) + hbitp = add1(hbitp) + if w += 2; w >= nw { + // We know that there is more data, because we handled 2-word and 3-word objects above. + // This must be at least a 6-word object. If we're out of pointer words, + // mark no scan in next bitmap byte and finish. + hb = 0 + w += 4 + goto Phase3 + } + } + + // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap. + // The loop computes the bits for that last write but does not execute the write; + // it leaves the bits in hb for processing by phase 3. + // To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to + // use in the first half of the loop right now, and then we only adjust nb explicitly + // if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop. + nb -= 4 + for { + // Emit bitmap byte. + // b has at least nb+4 bits, with one exception: + // if w+4 >= nw, then b has only nw-w bits, + // but we'll stop at the break and then truncate + // appropriately in Phase 3. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + break + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + + // Load more bits. b has nb right now. + if p != endp { + // Fast path: keep reading from ptrmask. + // nb unmodified: we just loaded 8 bits, + // and the next iteration will consume 8 bits, + // leaving us with the same nb the next time we're here. + if nb < 8 { + b |= uintptr(*p) << nb + p = add1(p) + } else { + // Reduce the number of bits in b. + // This is important if we skipped + // over a scalar tail, since nb could + // be larger than the bit width of b. 
+ nb -= 8 + } + } else if p == nil { + // Almost as fast path: track bit count and refill from pbits. + // For short repetitions. + if nb < 8 { + b |= pbits << nb + nb += endnb + } + nb -= 8 // for next iteration + } else { + // Slow path: reached end of ptrmask. + // Process final partial byte and rewind to start. + b |= uintptr(*p) << nb + nb += endnb + if nb < 8 { + b |= uintptr(*ptrmask) << nb + p = add1(ptrmask) + } else { + nb -= 8 + p = ptrmask + } + } + + // Emit bitmap byte. + hb = b & bitPointerAll + hb |= bitScanAll + if w += 4; w >= nw { + break + } + *hbitp = uint8(hb) + hbitp = add1(hbitp) + b >>= 4 + } + +Phase3: + // Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries. + if w > nw { + // Counting the 4 entries in hb not yet written to memory, + // there are more entries than possible pointer slots. + // Discard the excess entries (can't be more than 3). + mask := uintptr(1)<<(4-(w-nw)) - 1 + hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits + } + + // Change nw from counting possibly-pointer words to total words in allocation. + nw = size / goarch.PtrSize + + // Write whole bitmap bytes. + // The first is hb, the rest are zero. + if w <= nw { + *hbitp = uint8(hb) + hbitp = add1(hbitp) + hb = 0 // for possible final half-byte below + for w += 4; w <= nw; w += 4 { + *hbitp = 0 + hbitp = add1(hbitp) + } + } + + // Write final partial bitmap byte if any. + // We know w > nw, or else we'd still be in the loop above. + // It can be bigger only due to the 4 entries in hb that it counts. + // If w == nw+4 then there's nothing left to do: we wrote all nw entries + // and can discard the 4 sitting in hb. + // But if w == nw+2, we need to write first two in hb. + // The byte is shared with the next object, so be careful with + // existing bits. + if w == nw+2 { + *hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb) + } + +Phase4: + // Phase 4: Copy unrolled bitmap to per-arena bitmaps, if necessary. + if outOfPlace { + // TODO: We could probably make this faster by + // handling [x+dataSize, x+size) specially. + h := heapBitsForAddr(x) + // cnw is the number of heap words, or bit pairs + // remaining (like nw above). + cnw := size / goarch.PtrSize + src := (*uint8)(unsafe.Pointer(x)) + // We know the first and last byte of the bitmap are + // not the same, but it's still possible for small + // objects span arenas, so it may share bitmap bytes + // with neighboring objects. + // + // Handle the first byte specially if it's shared. See + // Phase 1 for why this is the only special case we need. + if doubleCheck { + if !(h.shift == 0 || h.shift == 2) { + print("x=", x, " size=", size, " cnw=", h.shift, "\n") + throw("bad start shift") + } + } + if h.shift == 2 { + *h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src + h = h.next().next() + cnw -= 2 + src = addb(src, 1) + } + // We're now byte aligned. Copy out to per-arena + // bitmaps until the last byte (which may again be + // partial). + for cnw >= 4 { + // This loop processes four words at a time, + // so round cnw down accordingly. + hNext, words := h.forwardOrBoundary(cnw / 4 * 4) + + // n is the number of bitmap bytes to copy. 
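+			// (For example, if forwardOrBoundary hands back words == 8,
+			// two bitmap bytes are copied and cnw drops by 8.)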
+ n := words / 4 + memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n) + cnw -= words + h = hNext + src = addb(src, n) + } + if doubleCheck && h.shift != 0 { + print("cnw=", cnw, " h.shift=", h.shift, "\n") + throw("bad shift after block copy") + } + // Handle the last byte if it's shared. + if cnw == 2 { + *h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | *src + src = addb(src, 1) + h = h.next().next() + } + if doubleCheck { + if uintptr(unsafe.Pointer(src)) > x+size { + throw("copy exceeded object size") + } + if !(cnw == 0 || cnw == 2) { + print("x=", x, " size=", size, " cnw=", cnw, "\n") + throw("bad number of remaining words") + } + // Set up hbitp so doubleCheck code below can check it. + hbitp = h.bitp + } + // Zero the object where we wrote the bitmap. + memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x) + } + + // Double check the whole bitmap. + if doubleCheck { + // x+size may not point to the heap, so back up one + // word and then advance it the way we do above. + end := heapBitsForAddr(x + size - goarch.PtrSize) + if outOfPlace { + // In out-of-place copying, we just advance + // using next. + end = end.next() + } else { + // Don't use next because that may advance to + // the next arena and the in-place logic + // doesn't do that. + end.shift += heapBitsShift + if end.shift == 4*heapBitsShift { + end.bitp, end.shift = add1(end.bitp), 0 + } + } + if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) { + println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size) + print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") + print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") + h0 := heapBitsForAddr(x) + print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") + print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n") + throw("bad heapBitsSetType") + } + + // Double-check that bits to be written were written correctly. + // Does not check that other bits were not written, unfortunately. + h := heapBitsForAddr(x) + nptr := typ.ptrdata / goarch.PtrSize + ndata := typ.size / goarch.PtrSize + count := dataSize / typ.size + totalptr := ((count-1)*typ.size + typ.ptrdata) / goarch.PtrSize + for i := uintptr(0); i < size/goarch.PtrSize; i++ { + j := i % ndata + var have, want uint8 + have = (*h.bitp >> h.shift) & (bitPointer | bitScan) + if i >= totalptr { + if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 { + // heapBitsSetTypeGCProg always fills + // in full nibbles of bitScan. 
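+					// (For example, with a GC program and totalptr == 5,
+					// the words at indexes 5, 6 and 7 still get want == bitScan.)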
+ want = bitScan + } + } else { + if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 { + want |= bitPointer + } + want |= bitScan + } + if have != want { + println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size) + print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") + print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n") + print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") + h0 := heapBitsForAddr(x) + print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") + print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n") + print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n") + println("at word", i, "offset", i*goarch.PtrSize, "have", hex(have), "want", hex(want)) + if typ.kind&kindGCProg != 0 { + println("GC program:") + dumpGCProg(addb(typ.gcdata, 4)) + } + throw("bad heapBitsSetType") + } + h = h.next() + } + if ptrmask == debugPtrmask.data { + unlock(&debugPtrmask.lock) + } + } +} + +var debugPtrmask struct { + lock mutex + data *byte +} + +// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program. +// progSize is the size of the memory described by the program. +// elemSize is the size of the element that the GC program describes (a prefix of). +// dataSize is the total size of the intended data, a multiple of elemSize. +// allocSize is the total size of the allocated memory. +// +// GC programs are only used for large allocations. +// heapBitsSetType requires that allocSize is a multiple of 4 words, +// so that the relevant bitmap bytes are not shared with surrounding +// objects. +func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) { + if goarch.PtrSize == 8 && allocSize%(4*goarch.PtrSize) != 0 { + // Alignment will be wrong. + throw("heapBitsSetTypeGCProg: small allocation") + } + var totalBits uintptr + if elemSize == dataSize { + totalBits = runGCProg(prog, nil, h.bitp, 2) + if totalBits*goarch.PtrSize != progSize { + println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize) + throw("heapBitsSetTypeGCProg: unexpected bit count") + } + } else { + count := dataSize / elemSize + + // Piece together program trailer to run after prog that does: + // literal(0) + // repeat(1, elemSize-progSize-1) // zeros to fill element size + // repeat(elemSize, count-1) // repeat that element for count + // This zero-pads the data remaining in the first element and then + // repeats that first element to fill the array. 
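+		// For example (hypothetical sizes, 8-byte pointers): for an element of
+		// 6 words whose program describes its first 4 words, allocated as an
+		// array of 3 elements, the trailer assembled below is
+		//	01 00 81 01 80 06 02 00
+		// that is, literal(0), repeat(1, 1) to pad the two trailing scalar
+		// words, repeat(6, 2) for the remaining two elements, then stop.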
+ var trailer [40]byte // 3 varints (max 10 each) + some bytes + i := 0 + if n := elemSize/goarch.PtrSize - progSize/goarch.PtrSize; n > 0 { + // literal(0) + trailer[i] = 0x01 + i++ + trailer[i] = 0 + i++ + if n > 1 { + // repeat(1, n-1) + trailer[i] = 0x81 + i++ + n-- + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + } + } + // repeat(elemSize/ptrSize, count-1) + trailer[i] = 0x80 + i++ + n := elemSize / goarch.PtrSize + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + n = count - 1 + for ; n >= 0x80; n >>= 7 { + trailer[i] = byte(n | 0x80) + i++ + } + trailer[i] = byte(n) + i++ + trailer[i] = 0 + i++ + + runGCProg(prog, &trailer[0], h.bitp, 2) + + // Even though we filled in the full array just now, + // record that we only filled in up to the ptrdata of the + // last element. This will cause the code below to + // memclr the dead section of the final array element, + // so that scanobject can stop early in the final element. + totalBits = (elemSize*(count-1) + progSize) / goarch.PtrSize + } + endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4)) + endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/goarch.PtrSize/wordsPerBitmapByte)) + memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg)) +} + +// progToPointerMask returns the 1-bit pointer mask output by the GC program prog. +// size the size of the region described by prog, in bytes. +// The resulting bitvector will have no more than size/goarch.PtrSize bits. +func progToPointerMask(prog *byte, size uintptr) bitvector { + n := (size/goarch.PtrSize + 7) / 8 + x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1] + x[len(x)-1] = 0xa1 // overflow check sentinel + n = runGCProg(prog, nil, &x[0], 1) + if x[len(x)-1] != 0xa1 { + throw("progToPointerMask: overflow") + } + return bitvector{int32(n), &x[0]} +} + +// Packed GC pointer bitmaps, aka GC programs. +// +// For large types containing arrays, the type information has a +// natural repetition that can be encoded to save space in the +// binary and in the memory representation of the type information. +// +// The encoding is a simple Lempel-Ziv style bytecode machine +// with the following instructions: +// +// 00000000: stop +// 0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes +// 10000000 n c: repeat the previous n bits c times; n, c are varints +// 1nnnnnnn c: repeat the previous n bits c times; c is a varint + +// runGCProg executes the GC program prog, and then trailer if non-nil, +// writing to dst with entries of the given size. +// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst. +// If size == 2, dst is the 2-bit heap bitmap, and writes move backward +// starting at dst (because the heap bitmap does). In this case, the caller guarantees +// that only whole bytes in dst need to be written. +// +// runGCProg returns the number of 1- or 2-bit entries written to memory. +func runGCProg(prog, trailer, dst *byte, size int) uintptr { + dstStart := dst + + // Bits waiting to be written to memory. + var bits uintptr + var nbits uintptr + + p := prog +Run: + for { + // Flush accumulated full bytes. + // The rest of the loop assumes that nbits <= 7. 
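+		// (In 1-bit mode each flushed byte consumes 8 buffered bits; in 2-bit
+		// mode the same 8 bits become two bitmap bytes of 4 entries each, with
+		// all scan bits set.)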
+ for ; nbits >= 8; nbits -= 8 { + if size == 1 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } else { + v := bits&bitPointerAll | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + v = bits&bitPointerAll | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } + } + + // Process one instruction. + inst := uintptr(*p) + p = add1(p) + n := inst & 0x7F + if inst&0x80 == 0 { + // Literal bits; n == 0 means end of program. + if n == 0 { + // Program is over; continue in trailer if present. + if trailer != nil { + p = trailer + trailer = nil + continue + } + break Run + } + nbyte := n / 8 + for i := uintptr(0); i < nbyte; i++ { + bits |= uintptr(*p) << nbits + p = add1(p) + if size == 1 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } else { + v := bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + v = bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } + } + if n %= 8; n > 0 { + bits |= uintptr(*p) << nbits + p = add1(p) + nbits += n + } + continue Run + } + + // Repeat. If n == 0, it is encoded in a varint in the next bytes. + if n == 0 { + for off := uint(0); ; off += 7 { + x := uintptr(*p) + p = add1(p) + n |= (x & 0x7F) << off + if x&0x80 == 0 { + break + } + } + } + + // Count is encoded in a varint in the next bytes. + c := uintptr(0) + for off := uint(0); ; off += 7 { + x := uintptr(*p) + p = add1(p) + c |= (x & 0x7F) << off + if x&0x80 == 0 { + break + } + } + c *= n // now total number of bits to copy + + // If the number of bits being repeated is small, load them + // into a register and use that register for the entire loop + // instead of repeatedly reading from memory. + // Handling fewer than 8 bits here makes the general loop simpler. + // The cutoff is goarch.PtrSize*8 - 7 to guarantee that when we add + // the pattern to a bit buffer holding at most 7 bits (a partial byte) + // it will not overflow. + src := dst + const maxBits = goarch.PtrSize*8 - 7 + if n <= maxBits { + // Start with bits in output buffer. + pattern := bits + npattern := nbits + + // If we need more bits, fetch them from memory. + if size == 1 { + src = subtract1(src) + for npattern < n { + pattern <<= 8 + pattern |= uintptr(*src) + src = subtract1(src) + npattern += 8 + } + } else { + src = subtract1(src) + for npattern < n { + pattern <<= 4 + pattern |= uintptr(*src) & 0xf + src = subtract1(src) + npattern += 4 + } + } + + // We started with the whole bit output buffer, + // and then we loaded bits from whole bytes. + // Either way, we might now have too many instead of too few. + // Discard the extra. + if npattern > n { + pattern >>= npattern - n + npattern = n + } + + // Replicate pattern to at most maxBits. + if npattern == 1 { + // One bit being repeated. + // If the bit is 1, make the pattern all 1s. + // If the bit is 0, the pattern is already all 0s, + // but we can claim that the number of bits + // in the word is equal to the number we need (c), + // because right shift of bits will zero fill. + if pattern == 1 { + pattern = 1<<maxBits - 1 + npattern = maxBits + } else { + npattern = c + } + } else { + b := pattern + nb := npattern + if nb+nb <= maxBits { + // Double pattern until the whole uintptr is filled. + for nb <= goarch.PtrSize*8 { + b |= b << nb + nb += nb + } + // Trim away incomplete copy of original pattern in high bits. + // TODO(rsc): Replace with table lookup or loop on systems without divide? 
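+					// (For example, on 64-bit maxBits is 57, so a hypothetical
+					// 5-bit pattern is replicated and trimmed to nb = 57/5*5 = 55 bits.)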
+ nb = maxBits / npattern * npattern + b &= 1<<nb - 1 + pattern = b + npattern = nb + } + } + + // Add pattern to bit buffer and flush bit buffer, c/npattern times. + // Since pattern contains >8 bits, there will be full bytes to flush + // on each iteration. + for ; c >= npattern; c -= npattern { + bits |= pattern << nbits + nbits += npattern + if size == 1 { + for nbits >= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + nbits -= 8 + } + } else { + for nbits >= 4 { + *dst = uint8(bits&0xf | bitScanAll) + dst = add1(dst) + bits >>= 4 + nbits -= 4 + } + } + } + + // Add final fragment to bit buffer. + if c > 0 { + pattern &= 1<<c - 1 + bits |= pattern << nbits + nbits += c + } + continue Run + } + + // Repeat; n too large to fit in a register. + // Since nbits <= 7, we know the first few bytes of repeated data + // are already written to memory. + off := n - nbits // n > nbits because n > maxBits and nbits <= 7 + if size == 1 { + // Leading src fragment. + src = subtractb(src, (off+7)/8) + if frag := off & 7; frag != 0 { + bits |= uintptr(*src) >> (8 - frag) << nbits + src = add1(src) + nbits += frag + c -= frag + } + // Main loop: load one byte, write another. + // The bits are rotating through the bit buffer. + for i := c / 8; i > 0; i-- { + bits |= uintptr(*src) << nbits + src = add1(src) + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } + // Final src fragment. + if c %= 8; c > 0 { + bits |= (uintptr(*src) & (1<<c - 1)) << nbits + nbits += c + } + } else { + // Leading src fragment. + src = subtractb(src, (off+3)/4) + if frag := off & 3; frag != 0 { + bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits + src = add1(src) + nbits += frag + c -= frag + } + // Main loop: load one byte, write another. + // The bits are rotating through the bit buffer. + for i := c / 4; i > 0; i-- { + bits |= (uintptr(*src) & 0xf) << nbits + src = add1(src) + *dst = uint8(bits&0xf | bitScanAll) + dst = add1(dst) + bits >>= 4 + } + // Final src fragment. + if c %= 4; c > 0 { + bits |= (uintptr(*src) & (1<<c - 1)) << nbits + nbits += c + } + } + } + + // Write any final bits out, using full-byte writes, even for the final byte. + var totalBits uintptr + if size == 1 { + totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits + nbits += -nbits & 7 + for ; nbits > 0; nbits -= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } + } else { + totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits + nbits += -nbits & 3 + for ; nbits > 0; nbits -= 4 { + v := bits&0xf | bitScanAll + *dst = uint8(v) + dst = add1(dst) + bits >>= 4 + } + } + return totalBits +} + +// materializeGCProg allocates space for the (1-bit) pointer bitmask +// for an object of size ptrdata. Then it fills that space with the +// pointer bitmask specified by the program prog. +// The bitmask starts at s.startAddr. +// The result must be deallocated with dematerializeGCProg. +func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { + // Each word of ptrdata needs one bit in the bitmap. + bitmapBytes := divRoundUp(ptrdata, 8*goarch.PtrSize) + // Compute the number of pages needed for bitmapBytes. 
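+	// (Illustrative sizes, assuming 8-byte pointers and the runtime's 8 KiB
+	// pages: 1 GiB of ptrdata needs a 16 MiB one-bit bitmap, i.e. 2048 pages.)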
+ pages := divRoundUp(bitmapBytes, pageSize) + s := mheap_.allocManual(pages, spanAllocPtrScalarBits) + runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) + return s +} +func dematerializeGCProg(s *mspan) { + mheap_.freeManual(s, spanAllocPtrScalarBits) +} + +func dumpGCProg(p *byte) { + nptr := 0 + for { + x := *p + p = add1(p) + if x == 0 { + print("\t", nptr, " end\n") + break + } + if x&0x80 == 0 { + print("\t", nptr, " lit ", x, ":") + n := int(x+7) / 8 + for i := 0; i < n; i++ { + print(" ", hex(*p)) + p = add1(p) + } + print("\n") + nptr += int(x) + } else { + nbit := int(x &^ 0x80) + if nbit == 0 { + for nb := uint(0); ; nb += 7 { + x := *p + p = add1(p) + nbit |= int(x&0x7f) << nb + if x&0x80 == 0 { + break + } + } + } + count := 0 + for nb := uint(0); ; nb += 7 { + x := *p + p = add1(p) + count |= int(x&0x7f) << nb + if x&0x80 == 0 { + break + } + } + print("\t", nptr, " repeat ", nbit, " × ", count, "\n") + nptr += nbit * count + } + } +} + +// Testing. + +func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool { + target := (*stkframe)(ctxt) + if frame.sp <= target.sp && target.sp < frame.varp { + *target = *frame + return false + } + return true +} + +// gcbits returns the GC type info for x, for testing. +// The result is the bitmap entries (0 or 1), one entry per byte. +//go:linkname reflect_gcbits reflect.gcbits +func reflect_gcbits(x any) []byte { + ret := getgcmask(x) + typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem + nptr := typ.ptrdata / goarch.PtrSize + for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 { + ret = ret[:len(ret)-1] + } + return ret +} + +// Returns GC type info for the pointer stored in ep for testing. +// If ep points to the stack, only static live information will be returned +// (i.e. not for objects which are only dynamically live stack objects). 
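+// For example (illustrative, assuming 8-byte pointers and a heap-allocated
+// value): a *struct{ p *int; x uintptr } argument yields the mask [1], since
+// the mask stops after the last pointer word, while *struct{ x uintptr; p *int }
+// yields [0 1].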
+func getgcmask(ep any) (mask []byte) { + e := *efaceOf(&ep) + p := e.data + t := e._type + // data or bss + for _, datap := range activeModules() { + // data + if datap.data <= uintptr(p) && uintptr(p) < datap.edata { + bitmap := datap.gcdatamask.bytedata + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + mask = make([]byte, n/goarch.PtrSize) + for i := uintptr(0); i < n; i += goarch.PtrSize { + off := (uintptr(p) + i - datap.data) / goarch.PtrSize + mask[i/goarch.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1 + } + return + } + + // bss + if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss { + bitmap := datap.gcbssmask.bytedata + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + mask = make([]byte, n/goarch.PtrSize) + for i := uintptr(0); i < n; i += goarch.PtrSize { + off := (uintptr(p) + i - datap.bss) / goarch.PtrSize + mask[i/goarch.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1 + } + return + } + } + + // heap + if base, s, _ := findObject(uintptr(p), 0, 0); base != 0 { + hbits := heapBitsForAddr(base) + n := s.elemsize + mask = make([]byte, n/goarch.PtrSize) + for i := uintptr(0); i < n; i += goarch.PtrSize { + if hbits.isPointer() { + mask[i/goarch.PtrSize] = 1 + } + if !hbits.morePointers() { + mask = mask[:i/goarch.PtrSize] + break + } + hbits = hbits.next() + } + return + } + + // stack + if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi { + var frame stkframe + frame.sp = uintptr(p) + _g_ := getg() + gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0) + if frame.fn.valid() { + locals, _, _ := getStackMap(&frame, nil, false) + if locals.n == 0 { + return + } + size := uintptr(locals.n) * goarch.PtrSize + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + mask = make([]byte, n/goarch.PtrSize) + for i := uintptr(0); i < n; i += goarch.PtrSize { + off := (uintptr(p) + i - frame.varp + size) / goarch.PtrSize + mask[i/goarch.PtrSize] = locals.ptrbit(off) + } + } + return + } + + // otherwise, not something the GC knows about. + // possibly read-only data, like malloc(0). + // must not have pointers + return +} diff --git a/contrib/go/_std_1.18/src/runtime/mcache.go b/contrib/go/_std_1.18/src/runtime/mcache.go new file mode 100644 index 0000000000..c58e05dd29 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mcache.go @@ -0,0 +1,298 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// Per-thread (in Go, per-P) cache for small objects. +// This includes a small object cache and local allocation stats. +// No locking needed because it is per-thread (per-P). +// +// mcaches are allocated from non-GC'd memory, so any heap pointers +// must be specially handled. +// +//go:notinheap +type mcache struct { + // The following members are accessed on every malloc, + // so they are grouped here for better caching. + nextSample uintptr // trigger heap sample after allocating this many bytes + scanAlloc uintptr // bytes of scannable heap allocated + + // Allocator cache for tiny objects w/o pointers. + // See "Tiny allocator" comment in malloc.go. + + // tiny points to the beginning of the current tiny block, or + // nil if there is no current tiny block. + // + // tiny is a heap pointer. 
Since mcache is in non-GC'd memory, + // we handle it by clearing it in releaseAll during mark + // termination. + // + // tinyAllocs is the number of tiny allocations performed + // by the P that owns this mcache. + tiny uintptr + tinyoffset uintptr + tinyAllocs uintptr + + // The rest is not accessed on every malloc. + + alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass + + stackcache [_NumStackOrders]stackfreelist + + // flushGen indicates the sweepgen during which this mcache + // was last flushed. If flushGen != mheap_.sweepgen, the spans + // in this mcache are stale and need to the flushed so they + // can be swept. This is done in acquirep. + flushGen uint32 +} + +// A gclink is a node in a linked list of blocks, like mlink, +// but it is opaque to the garbage collector. +// The GC does not trace the pointers during collection, +// and the compiler does not emit write barriers for assignments +// of gclinkptr values. Code should store references to gclinks +// as gclinkptr, not as *gclink. +type gclink struct { + next gclinkptr +} + +// A gclinkptr is a pointer to a gclink, but it is opaque +// to the garbage collector. +type gclinkptr uintptr + +// ptr returns the *gclink form of p. +// The result should be used for accessing fields, not stored +// in other data structures. +func (p gclinkptr) ptr() *gclink { + return (*gclink)(unsafe.Pointer(p)) +} + +type stackfreelist struct { + list gclinkptr // linked list of free stacks + size uintptr // total size of stacks in list +} + +// dummy mspan that contains no free objects. +var emptymspan mspan + +func allocmcache() *mcache { + var c *mcache + systemstack(func() { + lock(&mheap_.lock) + c = (*mcache)(mheap_.cachealloc.alloc()) + c.flushGen = mheap_.sweepgen + unlock(&mheap_.lock) + }) + for i := range c.alloc { + c.alloc[i] = &emptymspan + } + c.nextSample = nextSample() + return c +} + +// freemcache releases resources associated with this +// mcache and puts the object onto a free list. +// +// In some cases there is no way to simply release +// resources, such as statistics, so donate them to +// a different mcache (the recipient). +func freemcache(c *mcache) { + systemstack(func() { + c.releaseAll() + stackcache_clear(c) + + // NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate + // with the stealing of gcworkbufs during garbage collection to avoid + // a race where the workbuf is double-freed. + // gcworkbuffree(c.gcworkbuf) + + lock(&mheap_.lock) + mheap_.cachealloc.free(unsafe.Pointer(c)) + unlock(&mheap_.lock) + }) +} + +// getMCache is a convenience function which tries to obtain an mcache. +// +// Returns nil if we're not bootstrapping or we don't have a P. The caller's +// P must not change, so we must be in a non-preemptible state. +func getMCache(mp *m) *mcache { + // Grab the mcache, since that's where stats live. + pp := mp.p.ptr() + var c *mcache + if pp == nil { + // We will be called without a P while bootstrapping, + // in which case we use mcache0, which is set in mallocinit. + // mcache0 is cleared when bootstrapping is complete, + // by procresize. + c = mcache0 + } else { + c = pp.mcache + } + return c +} + +// refill acquires a new span of span class spc for c. This span will +// have at least one free object. The current span in c must be full. +// +// Must run in a non-preemptible context since otherwise the owner of +// c could change. +func (c *mcache) refill(spc spanClass) { + // Return the current cached span to the central lists. 
+ s := c.alloc[spc] + + if uintptr(s.allocCount) != s.nelems { + throw("refill of span with free space remaining") + } + if s != &emptymspan { + // Mark this span as no longer cached. + if s.sweepgen != mheap_.sweepgen+3 { + throw("bad sweepgen in refill") + } + mheap_.central[spc].mcentral.uncacheSpan(s) + } + + // Get a new cached span from the central lists. + s = mheap_.central[spc].mcentral.cacheSpan() + if s == nil { + throw("out of memory") + } + + if uintptr(s.allocCount) == s.nelems { + throw("span has no free space") + } + + // Indicate that this span is cached and prevent asynchronous + // sweeping in the next sweep phase. + s.sweepgen = mheap_.sweepgen + 3 + + // Assume all objects from this span will be allocated in the + // mcache. If it gets uncached, we'll adjust this. + stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.smallAllocCount[spc.sizeclass()], int64(s.nelems)-int64(s.allocCount)) + + // Flush tinyAllocs. + if spc == tinySpanClass { + atomic.Xadd64(&stats.tinyAllocCount, int64(c.tinyAllocs)) + c.tinyAllocs = 0 + } + memstats.heapStats.release() + + // Update heapLive with the same assumption. + // While we're here, flush scanAlloc, since we have to call + // revise anyway. + usedBytes := uintptr(s.allocCount) * s.elemsize + gcController.update(int64(s.npages*pageSize)-int64(usedBytes), int64(c.scanAlloc)) + c.scanAlloc = 0 + + c.alloc[spc] = s +} + +// allocLarge allocates a span for a large object. +func (c *mcache) allocLarge(size uintptr, noscan bool) *mspan { + if size+_PageSize < size { + throw("out of memory") + } + npages := size >> _PageShift + if size&_PageMask != 0 { + npages++ + } + + // Deduct credit for this span allocation and sweep if + // necessary. mHeap_Alloc will also sweep npages, so this only + // pays the debt down to npage pages. + deductSweepCredit(npages*_PageSize, npages) + + spc := makeSpanClass(0, noscan) + s := mheap_.alloc(npages, spc) + if s == nil { + throw("out of memory") + } + stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.largeAlloc, int64(npages*pageSize)) + atomic.Xadd64(&stats.largeAllocCount, 1) + memstats.heapStats.release() + + // Update heapLive. + gcController.update(int64(s.npages*pageSize), 0) + + // Put the large span in the mcentral swept list so that it's + // visible to the background sweeper. + mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) + s.limit = s.base() + size + heapBitsForAddr(s.base()).initSpan(s) + return s +} + +func (c *mcache) releaseAll() { + // Take this opportunity to flush scanAlloc. + scanAlloc := int64(c.scanAlloc) + c.scanAlloc = 0 + + sg := mheap_.sweepgen + dHeapLive := int64(0) + for i := range c.alloc { + s := c.alloc[i] + if s != &emptymspan { + // Adjust nsmallalloc in case the span wasn't fully allocated. + n := int64(s.nelems) - int64(s.allocCount) + stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.smallAllocCount[spanClass(i).sizeclass()], -n) + memstats.heapStats.release() + if s.sweepgen != sg+1 { + // refill conservatively counted unallocated slots in gcController.heapLive. + // Undo this. + // + // If this span was cached before sweep, then + // gcController.heapLive was totally recomputed since + // caching this span, so we don't do this for + // stale spans. + dHeapLive -= n * int64(s.elemsize) + } + // Release the span to the mcentral. + mheap_.central[i].mcentral.uncacheSpan(s) + c.alloc[i] = &emptymspan + } + } + // Clear tinyalloc pool. + c.tiny = 0 + c.tinyoffset = 0 + + // Flush tinyAllocs. 
+ stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.tinyAllocCount, int64(c.tinyAllocs)) + c.tinyAllocs = 0 + memstats.heapStats.release() + + // Updated heapScan and heapLive. + gcController.update(dHeapLive, scanAlloc) +} + +// prepareForSweep flushes c if the system has entered a new sweep phase +// since c was populated. This must happen between the sweep phase +// starting and the first allocation from c. +func (c *mcache) prepareForSweep() { + // Alternatively, instead of making sure we do this on every P + // between starting the world and allocating on that P, we + // could leave allocate-black on, allow allocation to continue + // as usual, use a ragged barrier at the beginning of sweep to + // ensure all cached spans are swept, and then disable + // allocate-black. However, with this approach it's difficult + // to avoid spilling mark bits into the *next* GC cycle. + sg := mheap_.sweepgen + if c.flushGen == sg { + return + } else if c.flushGen != sg-2 { + println("bad flushGen", c.flushGen, "in prepareForSweep; sweepgen", sg) + throw("bad flushGen") + } + c.releaseAll() + stackcache_clear(c) + atomic.Store(&c.flushGen, mheap_.sweepgen) // Synchronizes with gcStart +} diff --git a/contrib/go/_std_1.18/src/runtime/mcentral.go b/contrib/go/_std_1.18/src/runtime/mcentral.go new file mode 100644 index 0000000000..e4bdf35071 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mcentral.go @@ -0,0 +1,255 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Central free lists. +// +// See malloc.go for an overview. +// +// The mcentral doesn't actually contain the list of free objects; the mspan does. +// Each mcentral is two lists of mspans: those with free objects (c->nonempty) +// and those that are completely allocated (c->empty). + +package runtime + +import "runtime/internal/atomic" + +// Central list of free objects of a given size. +// +//go:notinheap +type mcentral struct { + spanclass spanClass + + // partial and full contain two mspan sets: one of swept in-use + // spans, and one of unswept in-use spans. These two trade + // roles on each GC cycle. The unswept set is drained either by + // allocation or by the background sweeper in every GC cycle, + // so only two roles are necessary. + // + // sweepgen is increased by 2 on each GC cycle, so the swept + // spans are in partial[sweepgen/2%2] and the unswept spans are in + // partial[1-sweepgen/2%2]. Sweeping pops spans from the + // unswept set and pushes spans that are still in-use on the + // swept set. Likewise, allocating an in-use span pushes it + // on the swept set. + // + // Some parts of the sweeper can sweep arbitrary spans, and hence + // can't remove them from the unswept set, but will add the span + // to the appropriate swept list. As a result, the parts of the + // sweeper and mcentral that do consume from the unswept list may + // encounter swept spans, and these should be ignored. + partial [2]spanSet // list of spans with a free object + full [2]spanSet // list of spans with no free objects +} + +// Initialize a single central free list. 
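+// The partialSwept/partialUnswept and fullSwept/fullUnswept accessors below
+// implement the indexing described above; for example, with sweepgen == 6 the
+// swept spans live in partial[1] and the unswept spans in partial[0], and once
+// sweepgen advances to 8 the two sets swap roles.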
+func (c *mcentral) init(spc spanClass) { + c.spanclass = spc + lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine) + lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine) + lockInit(&c.full[0].spineLock, lockRankSpanSetSpine) + lockInit(&c.full[1].spineLock, lockRankSpanSetSpine) +} + +// partialUnswept returns the spanSet which holds partially-filled +// unswept spans for this sweepgen. +func (c *mcentral) partialUnswept(sweepgen uint32) *spanSet { + return &c.partial[1-sweepgen/2%2] +} + +// partialSwept returns the spanSet which holds partially-filled +// swept spans for this sweepgen. +func (c *mcentral) partialSwept(sweepgen uint32) *spanSet { + return &c.partial[sweepgen/2%2] +} + +// fullUnswept returns the spanSet which holds unswept spans without any +// free slots for this sweepgen. +func (c *mcentral) fullUnswept(sweepgen uint32) *spanSet { + return &c.full[1-sweepgen/2%2] +} + +// fullSwept returns the spanSet which holds swept spans without any +// free slots for this sweepgen. +func (c *mcentral) fullSwept(sweepgen uint32) *spanSet { + return &c.full[sweepgen/2%2] +} + +// Allocate a span to use in an mcache. +func (c *mcentral) cacheSpan() *mspan { + // Deduct credit for this span allocation and sweep if necessary. + spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize + deductSweepCredit(spanBytes, 0) + + traceDone := false + if trace.enabled { + traceGCSweepStart() + } + + // If we sweep spanBudget spans without finding any free + // space, just allocate a fresh span. This limits the amount + // of time we can spend trying to find free space and + // amortizes the cost of small object sweeping over the + // benefit of having a full free span to allocate from. By + // setting this to 100, we limit the space overhead to 1%. + // + // TODO(austin,mknyszek): This still has bad worst-case + // throughput. For example, this could find just one free slot + // on the 100th swept span. That limits allocation latency, but + // still has very poor throughput. We could instead keep a + // running free-to-used budget and switch to fresh span + // allocation if the budget runs low. + spanBudget := 100 + + var s *mspan + var sl sweepLocker + + // Try partial swept spans first. + sg := mheap_.sweepgen + if s = c.partialSwept(sg).pop(); s != nil { + goto havespan + } + + sl = sweep.active.begin() + if sl.valid { + // Now try partial unswept spans. + for ; spanBudget >= 0; spanBudget-- { + s = c.partialUnswept(sg).pop() + if s == nil { + break + } + if s, ok := sl.tryAcquire(s); ok { + // We got ownership of the span, so let's sweep it and use it. + s.sweep(true) + sweep.active.end(sl) + goto havespan + } + // We failed to get ownership of the span, which means it's being or + // has been swept by an asynchronous sweeper that just couldn't remove it + // from the unswept list. That sweeper took ownership of the span and + // responsibility for either freeing it to the heap or putting it on the + // right swept list. Either way, we should just ignore it (and it's unsafe + // for us to do anything else). + } + // Now try full unswept spans, sweeping them and putting them into the + // right list if we fail to get a span. + for ; spanBudget >= 0; spanBudget-- { + s = c.fullUnswept(sg).pop() + if s == nil { + break + } + if s, ok := sl.tryAcquire(s); ok { + // We got ownership of the span, so let's sweep it. + s.sweep(true) + // Check if there's any free space. 
+ freeIndex := s.nextFreeIndex() + if freeIndex != s.nelems { + s.freeindex = freeIndex + sweep.active.end(sl) + goto havespan + } + // Add it to the swept list, because sweeping didn't give us any free space. + c.fullSwept(sg).push(s.mspan) + } + // See comment for partial unswept spans. + } + sweep.active.end(sl) + } + if trace.enabled { + traceGCSweepDone() + traceDone = true + } + + // We failed to get a span from the mcentral so get one from mheap. + s = c.grow() + if s == nil { + return nil + } + + // At this point s is a span that should have free slots. +havespan: + if trace.enabled && !traceDone { + traceGCSweepDone() + } + n := int(s.nelems) - int(s.allocCount) + if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { + throw("span has no free objects") + } + freeByteBase := s.freeindex &^ (64 - 1) + whichByte := freeByteBase / 8 + // Init alloc bits cache. + s.refillAllocCache(whichByte) + + // Adjust the allocCache so that s.freeindex corresponds to the low bit in + // s.allocCache. + s.allocCache >>= s.freeindex % 64 + + return s +} + +// Return span from an mcache. +// +// s must have a span class corresponding to this +// mcentral and it must not be empty. +func (c *mcentral) uncacheSpan(s *mspan) { + if s.allocCount == 0 { + throw("uncaching span but s.allocCount == 0") + } + + sg := mheap_.sweepgen + stale := s.sweepgen == sg+1 + + // Fix up sweepgen. + if stale { + // Span was cached before sweep began. It's our + // responsibility to sweep it. + // + // Set sweepgen to indicate it's not cached but needs + // sweeping and can't be allocated from. sweep will + // set s.sweepgen to indicate s is swept. + atomic.Store(&s.sweepgen, sg-1) + } else { + // Indicate that s is no longer cached. + atomic.Store(&s.sweepgen, sg) + } + + // Put the span in the appropriate place. + if stale { + // It's stale, so just sweep it. Sweeping will put it on + // the right list. + // + // We don't use a sweepLocker here. Stale cached spans + // aren't in the global sweep lists, so mark termination + // itself holds up sweep completion until all mcaches + // have been swept. + ss := sweepLocked{s} + ss.sweep(false) + } else { + if int(s.nelems)-int(s.allocCount) > 0 { + // Put it back on the partial swept list. + c.partialSwept(sg).push(s) + } else { + // There's no free space and it's not stale, so put it on the + // full swept list. + c.fullSwept(sg).push(s) + } + } +} + +// grow allocates a new empty span from the heap and initializes it for c's size class. +func (c *mcentral) grow() *mspan { + npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) + size := uintptr(class_to_size[c.spanclass.sizeclass()]) + + s := mheap_.alloc(npages, c.spanclass) + if s == nil { + return nil + } + + // Use division by multiplication and shifts to quickly compute: + // n := (npages << _PageShift) / size + n := s.divideByElemSize(npages << _PageShift) + s.limit = s.base() + size*n + heapBitsForAddr(s.base()).initSpan(s) + return s +} diff --git a/contrib/go/_std_1.18/src/runtime/mcheckmark.go b/contrib/go/_std_1.18/src/runtime/mcheckmark.go new file mode 100644 index 0000000000..1dd28585f1 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mcheckmark.go @@ -0,0 +1,102 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// GC checkmarks +// +// In a concurrent garbage collector, one worries about failing to mark +// a live object due to mutations without write barriers or bugs in the +// collector implementation. As a sanity check, the GC has a 'checkmark' +// mode that retraverses the object graph with the world stopped, to make +// sure that everything that should be marked is marked. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// A checkmarksMap stores the GC marks in "checkmarks" mode. It is a +// per-arena bitmap with a bit for every word in the arena. The mark +// is stored on the bit corresponding to the first word of the marked +// allocation. +// +//go:notinheap +type checkmarksMap [heapArenaBytes / goarch.PtrSize / 8]uint8 + +// If useCheckmark is true, marking of an object uses the checkmark +// bits instead of the standard mark bits. +var useCheckmark = false + +// startCheckmarks prepares for the checkmarks phase. +// +// The world must be stopped. +func startCheckmarks() { + assertWorldStopped() + + // Clear all checkmarks. + for _, ai := range mheap_.allArenas { + arena := mheap_.arenas[ai.l1()][ai.l2()] + bitmap := arena.checkmarks + + if bitmap == nil { + // Allocate bitmap on first use. + bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys)) + if bitmap == nil { + throw("out of memory allocating checkmarks bitmap") + } + arena.checkmarks = bitmap + } else { + // Otherwise clear the existing bitmap. + for i := range bitmap { + bitmap[i] = 0 + } + } + } + // Enable checkmarking. + useCheckmark = true +} + +// endCheckmarks ends the checkmarks phase. +func endCheckmarks() { + if gcMarkWorkAvailable(nil) { + throw("GC work not flushed") + } + useCheckmark = false +} + +// setCheckmark throws if marking object is a checkmarks violation, +// and otherwise sets obj's checkmark. It returns true if obj was +// already checkmarked. +func setCheckmark(obj, base, off uintptr, mbits markBits) bool { + if !mbits.isMarked() { + printlock() + print("runtime: checkmarks found unexpected unmarked object obj=", hex(obj), "\n") + print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n") + + // Dump the source (base) object + gcDumpObject("base", base, off) + + // Dump the object + gcDumpObject("obj", obj, ^uintptr(0)) + + getg().m.traceback = 2 + throw("checkmark found unmarked object") + } + + ai := arenaIndex(obj) + arena := mheap_.arenas[ai.l1()][ai.l2()] + arenaWord := (obj / heapArenaBytes / 8) % uintptr(len(arena.checkmarks)) + mask := byte(1 << ((obj / heapArenaBytes) % 8)) + bytep := &arena.checkmarks[arenaWord] + + if atomic.Load8(bytep)&mask != 0 { + // Already checkmarked. + return true + } + + atomic.Or8(bytep, mask) + return false +} diff --git a/contrib/go/_std_1.18/src/runtime/mem_darwin.go b/contrib/go/_std_1.18/src/runtime/mem_darwin.go new file mode 100644 index 0000000000..9f836c0818 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mem_darwin.go @@ -0,0 +1,72 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +// Don't split the stack as this function may be invoked without a valid G, +// which prevents us from allocating more stack. 
+//go:nosplit +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { + v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + return nil + } + sysStat.add(int64(n)) + return v +} + +func sysUnused(v unsafe.Pointer, n uintptr) { + // MADV_FREE_REUSABLE is like MADV_FREE except it also propagates + // accounting information about the process to task_info. + madvise(v, n, _MADV_FREE_REUSABLE) +} + +func sysUsed(v unsafe.Pointer, n uintptr) { + // MADV_FREE_REUSE is necessary to keep the kernel's accounting + // accurate. If called on any memory region that hasn't been + // MADV_FREE_REUSABLE'd, it's a no-op. + madvise(v, n, _MADV_FREE_REUSE) +} + +func sysHugePage(v unsafe.Pointer, n uintptr) { +} + +// Don't split the stack as this function may be invoked without a valid G, +// which prevents us from allocating more stack. +//go:nosplit +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) + munmap(v, n) +} + +func sysFault(v unsafe.Pointer, n uintptr) { + mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0) +} + +func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + return nil + } + return p +} + +const _ENOMEM = 12 + +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) + + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { + throw("runtime: out of memory") + } + if p != v || err != 0 { + print("runtime: mmap(", v, ", ", n, ") returned ", p, ", ", err, "\n") + throw("runtime: cannot map pages in arena address space") + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mem_linux.go b/contrib/go/_std_1.18/src/runtime/mem_linux.go new file mode 100644 index 0000000000..f8333014c2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mem_linux.go @@ -0,0 +1,195 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +const ( + _EACCES = 13 + _EINVAL = 22 +) + +// Don't split the stack as this method may be invoked without a valid G, which +// prevents us from allocating more stack. +//go:nosplit +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { + p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + if err == _EACCES { + print("runtime: mmap: access denied\n") + exit(2) + } + if err == _EAGAIN { + print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") + exit(2) + } + return nil + } + sysStat.add(int64(n)) + return p +} + +var adviseUnused = uint32(_MADV_FREE) + +func sysUnused(v unsafe.Pointer, n uintptr) { + // By default, Linux's "transparent huge page" support will + // merge pages into a huge page if there's even a single + // present regular page, undoing the effects of madvise(adviseUnused) + // below. On amd64, that means khugepaged can turn a single + // 4KB page to 2MB, bloating the process's RSS by as much as + // 512X. (See issue #8832 and Linux kernel bug + // https://bugzilla.kernel.org/show_bug.cgi?id=93111) + // + // To work around this, we explicitly disable transparent huge + // pages when we release pages of the heap. 
However, we have + // to do this carefully because changing this flag tends to + // split the VMA (memory mapping) containing v in to three + // VMAs in order to track the different values of the + // MADV_NOHUGEPAGE flag in the different regions. There's a + // default limit of 65530 VMAs per address space (sysctl + // vm.max_map_count), so we must be careful not to create too + // many VMAs (see issue #12233). + // + // Since huge pages are huge, there's little use in adjusting + // the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid + // exploding the number of VMAs by only adjusting the + // MADV_NOHUGEPAGE flag on a large granularity. This still + // gets most of the benefit of huge pages while keeping the + // number of VMAs under control. With hugePageSize = 2MB, even + // a pessimal heap can reach 128GB before running out of VMAs. + if physHugePageSize != 0 { + // If it's a large allocation, we want to leave huge + // pages enabled. Hence, we only adjust the huge page + // flag on the huge pages containing v and v+n-1, and + // only if those aren't aligned. + var head, tail uintptr + if uintptr(v)&(physHugePageSize-1) != 0 { + // Compute huge page containing v. + head = alignDown(uintptr(v), physHugePageSize) + } + if (uintptr(v)+n)&(physHugePageSize-1) != 0 { + // Compute huge page containing v+n-1. + tail = alignDown(uintptr(v)+n-1, physHugePageSize) + } + + // Note that madvise will return EINVAL if the flag is + // already set, which is quite likely. We ignore + // errors. + if head != 0 && head+physHugePageSize == tail { + // head and tail are different but adjacent, + // so do this in one call. + madvise(unsafe.Pointer(head), 2*physHugePageSize, _MADV_NOHUGEPAGE) + } else { + // Advise the huge pages containing v and v+n-1. + if head != 0 { + madvise(unsafe.Pointer(head), physHugePageSize, _MADV_NOHUGEPAGE) + } + if tail != 0 && tail != head { + madvise(unsafe.Pointer(tail), physHugePageSize, _MADV_NOHUGEPAGE) + } + } + } + + if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 { + // madvise will round this to any physical page + // *covered* by this range, so an unaligned madvise + // will release more memory than intended. + throw("unaligned sysUnused") + } + + var advise uint32 + if debug.madvdontneed != 0 { + advise = _MADV_DONTNEED + } else { + advise = atomic.Load(&adviseUnused) + } + if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 { + // MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is + // not supported. + atomic.Store(&adviseUnused, _MADV_DONTNEED) + madvise(v, n, _MADV_DONTNEED) + } + + if debug.harddecommit > 0 { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if p != v || err != 0 { + throw("runtime: cannot disable permissions in address space") + } + } +} + +func sysUsed(v unsafe.Pointer, n uintptr) { + if debug.harddecommit > 0 { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { + throw("runtime: out of memory") + } + if p != v || err != 0 { + throw("runtime: cannot remap pages in address space") + } + return + + // Don't do the sysHugePage optimization in hard decommit mode. + // We're breaking up pages everywhere, there's no point. + } + // Partially undo the NOHUGEPAGE marks from sysUnused + // for whole huge pages between v and v+n. This may + // leave huge pages off at the end points v and v+n + // even though allocations may cover these entire huge + // pages. 
We could detect this and undo NOHUGEPAGE on + // the end points as well, but it's probably not worth + // the cost because when neighboring allocations are + // freed sysUnused will just set NOHUGEPAGE again. + sysHugePage(v, n) +} + +func sysHugePage(v unsafe.Pointer, n uintptr) { + if physHugePageSize != 0 { + // Round v up to a huge page boundary. + beg := alignUp(uintptr(v), physHugePageSize) + // Round v+n down to a huge page boundary. + end := alignDown(uintptr(v)+n, physHugePageSize) + + if beg < end { + madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE) + } + } +} + +// Don't split the stack as this function may be invoked without a valid G, +// which prevents us from allocating more stack. +//go:nosplit +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) + munmap(v, n) +} + +func sysFault(v unsafe.Pointer, n uintptr) { + mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0) +} + +func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + return nil + } + return p +} + +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) + + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { + throw("runtime: out of memory") + } + if p != v || err != 0 { + print("runtime: mmap(", v, ", ", n, ") returned ", p, ", ", err, "\n") + throw("runtime: cannot map pages in arena address space") + } +} diff --git a/contrib/go/_std_1.18/src/runtime/memclr_amd64.s b/contrib/go/_std_1.18/src/runtime/memclr_amd64.s new file mode 100644 index 0000000000..700bbd7b9b --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/memclr_amd64.s @@ -0,0 +1,179 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !plan9 + +#include "go_asm.h" +#include "textflag.h" + +// See memclrNoHeapPointers Go doc for important implementation constraints. + +// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) +// ABIInternal for performance. +TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16 + // AX = ptr + // BX = n + MOVQ AX, DI // DI = ptr + XORQ AX, AX + + // MOVOU seems always faster than REP STOSQ. +tail: + // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing. + TESTQ BX, BX + JEQ _0 + CMPQ BX, $2 + JBE _1or2 + CMPQ BX, $4 + JBE _3or4 + CMPQ BX, $8 + JB _5through7 + JE _8 + CMPQ BX, $16 + JBE _9through16 + CMPQ BX, $32 + JBE _17through32 + CMPQ BX, $64 + JBE _33through64 + CMPQ BX, $128 + JBE _65through128 + CMPQ BX, $256 + JBE _129through256 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE loop_preheader_avx2 + // TODO: for really big clears, use MOVNTDQ, even without AVX2. + +loop: + MOVOU X15, 0(DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, 64(DI) + MOVOU X15, 80(DI) + MOVOU X15, 96(DI) + MOVOU X15, 112(DI) + MOVOU X15, 128(DI) + MOVOU X15, 144(DI) + MOVOU X15, 160(DI) + MOVOU X15, 176(DI) + MOVOU X15, 192(DI) + MOVOU X15, 208(DI) + MOVOU X15, 224(DI) + MOVOU X15, 240(DI) + SUBQ $256, BX + ADDQ $256, DI + CMPQ BX, $256 + JAE loop + JMP tail + +loop_preheader_avx2: + VPXOR Y0, Y0, Y0 + // For smaller sizes MOVNTDQ may be faster or slower depending on hardware. + // For larger sizes it is always faster, even on dual Xeons with 30M cache. + // TODO take into account actual LLC size. 
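The sysUnused and sysHugePage code above only toggles MADV_NOHUGEPAGE and MADV_HUGEPAGE on whole huge pages, which reduces to alignUp/alignDown arithmetic on the end points of the released range. A small sketch of just that arithmetic follows; the 2MB huge-page size is an assumption made for the example, whereas the runtime queries physHugePageSize at startup.

package main

import "fmt"

const hugePageSize = 2 << 20 // assumed 2MB huge pages for illustration

func alignUp(p, align uintptr) uintptr   { return (p + align - 1) &^ (align - 1) }
func alignDown(p, align uintptr) uintptr { return p &^ (align - 1) }

// interiorHugeRange returns the largest huge-page-aligned range fully
// contained in [v, v+n), or (0, 0) if there is none. sysHugePage above
// applies MADV_HUGEPAGE to exactly this interior range.
func interiorHugeRange(v, n uintptr) (beg, end uintptr) {
	beg = alignUp(v, hugePageSize)
	end = alignDown(v+n, hugePageSize)
	if beg >= end {
		return 0, 0
	}
	return beg, end
}

func main() {
	beg, end := interiorHugeRange(0x1f0000, 6<<20)
	fmt.Printf("beg=%#x end=%#x\n", beg, end) // beg=0x200000 end=0x600000
}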
E. g. glibc uses LLC size/2. + CMPQ BX, $0x2000000 + JAE loop_preheader_avx2_huge +loop_avx2: + VMOVDQU Y0, 0(DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y0, 64(DI) + VMOVDQU Y0, 96(DI) + SUBQ $128, BX + ADDQ $128, DI + CMPQ BX, $128 + JAE loop_avx2 + VMOVDQU Y0, -32(DI)(BX*1) + VMOVDQU Y0, -64(DI)(BX*1) + VMOVDQU Y0, -96(DI)(BX*1) + VMOVDQU Y0, -128(DI)(BX*1) + VZEROUPPER + RET +loop_preheader_avx2_huge: + // Align to 32 byte boundary + VMOVDQU Y0, 0(DI) + MOVQ DI, SI + ADDQ $32, DI + ANDQ $~31, DI + SUBQ DI, SI + ADDQ SI, BX +loop_avx2_huge: + VMOVNTDQ Y0, 0(DI) + VMOVNTDQ Y0, 32(DI) + VMOVNTDQ Y0, 64(DI) + VMOVNTDQ Y0, 96(DI) + SUBQ $128, BX + ADDQ $128, DI + CMPQ BX, $128 + JAE loop_avx2_huge + // In the description of MOVNTDQ in [1] + // "... fencing operation implemented with the SFENCE or MFENCE instruction + // should be used in conjunction with MOVNTDQ instructions..." + // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf + SFENCE + VMOVDQU Y0, -32(DI)(BX*1) + VMOVDQU Y0, -64(DI)(BX*1) + VMOVDQU Y0, -96(DI)(BX*1) + VMOVDQU Y0, -128(DI)(BX*1) + VZEROUPPER + RET + +_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +_0: + RET +_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +_5through7: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +_8: + // We need a separate case for 8 to make sure we clear pointers atomically. + MOVQ AX, (DI) + RET +_9through16: + MOVQ AX, (DI) + MOVQ AX, -8(DI)(BX*1) + RET +_17through32: + MOVOU X15, (DI) + MOVOU X15, -16(DI)(BX*1) + RET +_33through64: + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) + RET +_65through128: + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, -64(DI)(BX*1) + MOVOU X15, -48(DI)(BX*1) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) + RET +_129through256: + MOVOU X15, (DI) + MOVOU X15, 16(DI) + MOVOU X15, 32(DI) + MOVOU X15, 48(DI) + MOVOU X15, 64(DI) + MOVOU X15, 80(DI) + MOVOU X15, 96(DI) + MOVOU X15, 112(DI) + MOVOU X15, -128(DI)(BX*1) + MOVOU X15, -112(DI)(BX*1) + MOVOU X15, -96(DI)(BX*1) + MOVOU X15, -80(DI)(BX*1) + MOVOU X15, -64(DI)(BX*1) + MOVOU X15, -48(DI)(BX*1) + MOVOU X15, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) + RET diff --git a/contrib/go/_std_1.18/src/runtime/memmove_amd64.s b/contrib/go/_std_1.18/src/runtime/memmove_amd64.s new file mode 100644 index 0000000000..eeb5033fd9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/memmove_amd64.s @@ -0,0 +1,532 @@ +// Derived from Inferno's libkern/memmove-386.s (adapted for amd64) +// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +//go:build !plan9 + +#include "go_asm.h" +#include "textflag.h" + +// See memmove Go doc for important implementation constraints. + +// func memmove(to, from unsafe.Pointer, n uintptr) +// ABIInternal for performance. +TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24 + // AX = to + // BX = from + // CX = n + MOVQ AX, DI + MOVQ BX, SI + MOVQ CX, BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 2K. +tail: + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. + // + // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing. + TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JB move_3 + JBE move_4 + CMPQ BX, $8 + JB move_5through7 + JE move_8 + CMPQ BX, $16 + JBE move_9through16 + CMPQ BX, $32 + JBE move_17through32 + CMPQ BX, $64 + JBE move_33through64 + CMPQ BX, $128 + JBE move_65through128 + CMPQ BX, $256 + JBE move_129through256 + + TESTB $1, runtime·useAVXmemmove(SB) + JNZ avxUnaligned + +/* + * check and set for backwards + */ + CMPQ SI, DI + JLS back + +/* + * forward copy loop + */ +forward: + CMPQ BX, $2048 + JLS move_256through2048 + + // If REP MOVSB isn't fast, don't use it + CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB + JNE fwdBy8 + + // Check alignment + MOVL SI, AX + ORL DI, AX + TESTL $7, AX + JEQ fwdBy8 + + // Do 1 byte at a time + MOVQ BX, CX + REP; MOVSB + RET + +fwdBy8: + // Do 8 bytes at a time + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + REP; MOVSQ + JMP tail + +back: +/* + * check overlap + */ + MOVQ SI, CX + ADDQ BX, CX + CMPQ CX, DI + JLS forward +/* + * whole thing backwards has + * adjusted addresses + */ + ADDQ BX, DI + ADDQ BX, SI + STD + +/* + * copy + */ + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + + SUBQ $8, DI + SUBQ $8, SI + REP; MOVSQ + + CLD + ADDQ $8, DI + ADDQ $8, SI + SUBQ BX, DI + SUBQ BX, SI + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB -1(SI)(BX*1), CX + MOVB AX, (DI) + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_4: + MOVL (SI), AX + MOVL AX, (DI) + RET +move_3: + MOVW (SI), AX + MOVB 2(SI), CX + MOVW AX, (DI) + MOVB CX, 2(DI) + RET +move_5through7: + MOVL (SI), AX + MOVL -4(SI)(BX*1), CX + MOVL AX, (DI) + MOVL CX, -4(DI)(BX*1) + RET +move_8: + // We need a separate case for 8 to make sure we write pointers atomically. 
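The forward/backward split above exists because a forward copy over an overlapping range whose destination starts inside the source would overwrite bytes before they are read. The same decision expressed in plain Go for a single shared buffer, as an illustrative sketch only; the assembly additionally selects REP MOVS, SSE, or AVX paths by size and CPU features.

package main

import "fmt"

// moveBytes copies n bytes from offset src to offset dst within buf,
// choosing the copy direction the same way the assembly above does:
// forward unless the destination starts inside the source range, in
// which case it copies backward so bytes are read before being clobbered.
func moveBytes(buf []byte, dst, src, n int) {
	if dst <= src || dst >= src+n {
		for i := 0; i < n; i++ { // forward copy is safe
			buf[dst+i] = buf[src+i]
		}
		return
	}
	for i := n - 1; i >= 0; i-- { // overlapping, dst after src: copy backward
		buf[dst+i] = buf[src+i]
	}
}

func main() {
	b := []byte("abcdefgh")
	moveBytes(b, 2, 0, 6)  // overlapping move with dst > src
	fmt.Println(string(b)) // ababcdef
}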
+ MOVQ (SI), AX + MOVQ AX, (DI) + RET +move_9through16: + MOVQ (SI), AX + MOVQ -8(SI)(BX*1), CX + MOVQ AX, (DI) + MOVQ CX, -8(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(BX*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(BX*1), X2 + MOVOU -16(SI)(BX*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(BX*1) + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(BX*1), X4 + MOVOU -48(SI)(BX*1), X5 + MOVOU -32(SI)(BX*1), X6 + MOVOU -16(SI)(BX*1), X7 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, -64(DI)(BX*1) + MOVOU X5, -48(DI)(BX*1) + MOVOU X6, -32(DI)(BX*1) + MOVOU X7, -16(DI)(BX*1) + RET +move_129through256: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(BX*1), X8 + MOVOU -112(SI)(BX*1), X9 + MOVOU -96(SI)(BX*1), X10 + MOVOU -80(SI)(BX*1), X11 + MOVOU -64(SI)(BX*1), X12 + MOVOU -48(SI)(BX*1), X13 + MOVOU -32(SI)(BX*1), X14 + MOVOU -16(SI)(BX*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(BX*1) + MOVOU X9, -112(DI)(BX*1) + MOVOU X10, -96(DI)(BX*1) + MOVOU X11, -80(DI)(BX*1) + MOVOU X12, -64(DI)(BX*1) + MOVOU X13, -48(DI)(BX*1) + MOVOU X14, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) + // X15 must be zero on return + PXOR X15, X15 + RET +move_256through2048: + SUBQ $256, BX + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ BX, $256 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + JGE move_256through2048 + // X15 must be zero on return + PXOR X15, X15 + JMP tail + +avxUnaligned: + // There are two implementations of move algorithm. + // The first one for non-overlapped memory regions. It uses forward copying. + // The second one for overlapped regions. It uses backward copying + MOVQ DI, CX + SUBQ SI, CX + // Now CX contains distance between SRC and DEST + CMPQ CX, BX + // If the distance lesser than region length it means that regions are overlapped + JC copy_backward + + // Non-temporal copy would be better for big sizes. + CMPQ BX, $0x100000 + JAE gobble_big_data_fwd + + // Memory layout on the source side + // SI CX + // |<---------BX before correction--------->| + // | |<--BX corrected-->| | + // | | |<--- AX --->| + // |<-R11->| |<-128 bytes->| + // +----------------------------------------+ + // | Head | Body | Tail | + // +-------+------------------+-------------+ + // ^ ^ ^ + // | | | + // Save head into Y4 Save tail into X5..X12 + // | + // SI+R11, where R11 = ((DI & -32) + 32) - DI + // Algorithm: + // 1. Unaligned save of the tail's 128 bytes + // 2. Unaligned save of the head's 32 bytes + // 3. 
Destination-aligned copying of body (128 bytes per iteration) + // 4. Put head on the new place + // 5. Put the tail on the new place + // It can be important to satisfy processor's pipeline requirements for + // small sizes as the cost of unaligned memory region copying is + // comparable with the cost of main loop. So code is slightly messed there. + // There is more clean implementation of that algorithm for bigger sizes + // where the cost of unaligned part copying is negligible. + // You can see it after gobble_big_data_fwd label. + LEAQ (SI)(BX*1), CX + MOVQ DI, R10 + // CX points to the end of buffer so we need go back slightly. We will use negative offsets there. + MOVOU -0x80(CX), X5 + MOVOU -0x70(CX), X6 + MOVQ $0x80, AX + // Align destination address + ANDQ $-32, DI + ADDQ $32, DI + // Continue tail saving. + MOVOU -0x60(CX), X7 + MOVOU -0x50(CX), X8 + // Make R11 delta between aligned and unaligned destination addresses. + MOVQ DI, R11 + SUBQ R10, R11 + // Continue tail saving. + MOVOU -0x40(CX), X9 + MOVOU -0x30(CX), X10 + // Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying. + SUBQ R11, BX + // Continue tail saving. + MOVOU -0x20(CX), X11 + MOVOU -0x10(CX), X12 + // The tail will be put on its place after main body copying. + // It's time for the unaligned heading part. + VMOVDQU (SI), Y4 + // Adjust source address to point past head. + ADDQ R11, SI + SUBQ AX, BX + // Aligned memory copying there +gobble_128_loop: + VMOVDQU (SI), Y0 + VMOVDQU 0x20(SI), Y1 + VMOVDQU 0x40(SI), Y2 + VMOVDQU 0x60(SI), Y3 + ADDQ AX, SI + VMOVDQA Y0, (DI) + VMOVDQA Y1, 0x20(DI) + VMOVDQA Y2, 0x40(DI) + VMOVDQA Y3, 0x60(DI) + ADDQ AX, DI + SUBQ AX, BX + JA gobble_128_loop + // Now we can store unaligned parts. + ADDQ AX, BX + ADDQ DI, BX + VMOVDQU Y4, (R10) + VZEROUPPER + MOVOU X5, -0x80(BX) + MOVOU X6, -0x70(BX) + MOVOU X7, -0x60(BX) + MOVOU X8, -0x50(BX) + MOVOU X9, -0x40(BX) + MOVOU X10, -0x30(BX) + MOVOU X11, -0x20(BX) + MOVOU X12, -0x10(BX) + RET + +gobble_big_data_fwd: + // There is forward copying for big regions. + // It uses non-temporal mov instructions. + // Details of this algorithm are commented previously for small sizes. + LEAQ (SI)(BX*1), CX + MOVOU -0x80(SI)(BX*1), X5 + MOVOU -0x70(CX), X6 + MOVOU -0x60(CX), X7 + MOVOU -0x50(CX), X8 + MOVOU -0x40(CX), X9 + MOVOU -0x30(CX), X10 + MOVOU -0x20(CX), X11 + MOVOU -0x10(CX), X12 + VMOVDQU (SI), Y4 + MOVQ DI, R8 + ANDQ $-32, DI + ADDQ $32, DI + MOVQ DI, R10 + SUBQ R8, R10 + SUBQ R10, BX + ADDQ R10, SI + LEAQ (DI)(BX*1), CX + SUBQ $0x80, BX +gobble_mem_fwd_loop: + PREFETCHNTA 0x1C0(SI) + PREFETCHNTA 0x280(SI) + // Prefetch values were chosen empirically. + // Approach for prefetch usage as in 7.6.6 of [1] + // [1] 64-ia-32-architectures-optimization-manual.pdf + // https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + VMOVDQU (SI), Y0 + VMOVDQU 0x20(SI), Y1 + VMOVDQU 0x40(SI), Y2 + VMOVDQU 0x60(SI), Y3 + ADDQ $0x80, SI + VMOVNTDQ Y0, (DI) + VMOVNTDQ Y1, 0x20(DI) + VMOVNTDQ Y2, 0x40(DI) + VMOVNTDQ Y3, 0x60(DI) + ADDQ $0x80, DI + SUBQ $0x80, BX + JA gobble_mem_fwd_loop + // NT instructions don't follow the normal cache-coherency rules. + // We need SFENCE there to make copied data available timely. 
+ SFENCE + VMOVDQU Y4, (R8) + VZEROUPPER + MOVOU X5, -0x80(CX) + MOVOU X6, -0x70(CX) + MOVOU X7, -0x60(CX) + MOVOU X8, -0x50(CX) + MOVOU X9, -0x40(CX) + MOVOU X10, -0x30(CX) + MOVOU X11, -0x20(CX) + MOVOU X12, -0x10(CX) + RET + +copy_backward: + MOVQ DI, AX + // Backward copying is about the same as the forward one. + // Firstly we load unaligned tail in the beginning of region. + MOVOU (SI), X5 + MOVOU 0x10(SI), X6 + ADDQ BX, DI + MOVOU 0x20(SI), X7 + MOVOU 0x30(SI), X8 + LEAQ -0x20(DI), R10 + MOVQ DI, R11 + MOVOU 0x40(SI), X9 + MOVOU 0x50(SI), X10 + ANDQ $0x1F, R11 + MOVOU 0x60(SI), X11 + MOVOU 0x70(SI), X12 + XORQ R11, DI + // Let's point SI to the end of region + ADDQ BX, SI + // and load unaligned head into X4. + VMOVDQU -0x20(SI), Y4 + SUBQ R11, SI + SUBQ R11, BX + // If there is enough data for non-temporal moves go to special loop + CMPQ BX, $0x100000 + JA gobble_big_data_bwd + SUBQ $0x80, BX +gobble_mem_bwd_loop: + VMOVDQU -0x20(SI), Y0 + VMOVDQU -0x40(SI), Y1 + VMOVDQU -0x60(SI), Y2 + VMOVDQU -0x80(SI), Y3 + SUBQ $0x80, SI + VMOVDQA Y0, -0x20(DI) + VMOVDQA Y1, -0x40(DI) + VMOVDQA Y2, -0x60(DI) + VMOVDQA Y3, -0x80(DI) + SUBQ $0x80, DI + SUBQ $0x80, BX + JA gobble_mem_bwd_loop + // Let's store unaligned data + VMOVDQU Y4, (R10) + VZEROUPPER + MOVOU X5, (AX) + MOVOU X6, 0x10(AX) + MOVOU X7, 0x20(AX) + MOVOU X8, 0x30(AX) + MOVOU X9, 0x40(AX) + MOVOU X10, 0x50(AX) + MOVOU X11, 0x60(AX) + MOVOU X12, 0x70(AX) + RET + +gobble_big_data_bwd: + SUBQ $0x80, BX +gobble_big_mem_bwd_loop: + PREFETCHNTA -0x1C0(SI) + PREFETCHNTA -0x280(SI) + VMOVDQU -0x20(SI), Y0 + VMOVDQU -0x40(SI), Y1 + VMOVDQU -0x60(SI), Y2 + VMOVDQU -0x80(SI), Y3 + SUBQ $0x80, SI + VMOVNTDQ Y0, -0x20(DI) + VMOVNTDQ Y1, -0x40(DI) + VMOVNTDQ Y2, -0x60(DI) + VMOVNTDQ Y3, -0x80(DI) + SUBQ $0x80, DI + SUBQ $0x80, BX + JA gobble_big_mem_bwd_loop + SFENCE + VMOVDQU Y4, (R10) + VZEROUPPER + MOVOU X5, (AX) + MOVOU X6, 0x10(AX) + MOVOU X7, 0x20(AX) + MOVOU X8, 0x30(AX) + MOVOU X9, 0x40(AX) + MOVOU X10, 0x50(AX) + MOVOU X11, 0x60(AX) + MOVOU X12, 0x70(AX) + RET diff --git a/contrib/go/_std_1.18/src/runtime/metrics.go b/contrib/go/_std_1.18/src/runtime/metrics.go new file mode 100644 index 0000000000..922dd2f814 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/metrics.go @@ -0,0 +1,594 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Metrics implementation exported to runtime/metrics. + +import ( + "runtime/internal/atomic" + "unsafe" +) + +var ( + // metrics is a map of runtime/metrics keys to data used by the runtime + // to sample each metric's value. metricsInit indicates it has been + // initialized. + // + // These fields are protected by metricsSema which should be + // locked/unlocked with metricsLock() / metricsUnlock(). + metricsSema uint32 = 1 + metricsInit bool + metrics map[string]metricData + + sizeClassBuckets []float64 + timeHistBuckets []float64 +) + +type metricData struct { + // deps is the set of runtime statistics that this metric + // depends on. Before compute is called, the statAggregate + // which will be passed must ensure() these dependencies. + deps statDepSet + + // compute is a function that populates a metricValue + // given a populated statAggregate structure. + compute func(in *statAggregate, out *metricValue) +} + +func metricsLock() { + // Acquire the metricsSema but with handoff. 
Operations are typically + // expensive enough that queueing up goroutines and handing off between + // them will be noticeably better-behaved. + semacquire1(&metricsSema, true, 0, 0) + if raceenabled { + raceacquire(unsafe.Pointer(&metricsSema)) + } +} + +func metricsUnlock() { + if raceenabled { + racerelease(unsafe.Pointer(&metricsSema)) + } + semrelease(&metricsSema) +} + +// initMetrics initializes the metrics map if it hasn't been yet. +// +// metricsSema must be held. +func initMetrics() { + if metricsInit { + return + } + + sizeClassBuckets = make([]float64, _NumSizeClasses, _NumSizeClasses+1) + // Skip size class 0 which is a stand-in for large objects, but large + // objects are tracked separately (and they actually get placed in + // the last bucket, not the first). + sizeClassBuckets[0] = 1 // The smallest allocation is 1 byte in size. + for i := 1; i < _NumSizeClasses; i++ { + // Size classes have an inclusive upper-bound + // and exclusive lower bound (e.g. 48-byte size class is + // (32, 48]) whereas we want and inclusive lower-bound + // and exclusive upper-bound (e.g. 48-byte size class is + // [33, 49). We can achieve this by shifting all bucket + // boundaries up by 1. + // + // Also, a float64 can precisely represent integers with + // value up to 2^53 and size classes are relatively small + // (nowhere near 2^48 even) so this will give us exact + // boundaries. + sizeClassBuckets[i] = float64(class_to_size[i] + 1) + } + sizeClassBuckets = append(sizeClassBuckets, float64Inf()) + + timeHistBuckets = timeHistogramMetricsBuckets() + metrics = map[string]metricData{ + "/gc/cycles/automatic:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone - in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/forced:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesForced + }, + }, + "/gc/cycles/total:gc-cycles": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.gcCyclesDone + }, + }, + "/gc/heap/allocs-by-size:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeAllocCount) + // Cut off the first index which is ostensibly for size class 0, + // but large objects are tracked separately so it's actually unused. 
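initMetrics above turns the size-class table into histogram bucket boundaries by shifting each boundary up by one byte, so a class's (lo, hi] range becomes the [lo+1, hi+1) bucket that runtime/metrics expects, with a final +Inf bucket standing in for large objects. A sketch of that conversion with made-up size classes; the real table is class_to_size.

package main

import (
	"fmt"
	"math"
)

// bucketsFromSizeClasses shifts every size-class upper bound up by one
// byte so each class's (lo, hi] range becomes a [lo+1, hi+1) histogram
// bucket, then adds a +Inf bucket for large allocations.
func bucketsFromSizeClasses(classToSize []uint32) []float64 {
	buckets := make([]float64, 0, len(classToSize)+1)
	buckets = append(buckets, 1) // smallest possible allocation is 1 byte
	for _, size := range classToSize[1:] {
		buckets = append(buckets, float64(size+1))
	}
	return append(buckets, math.Inf(1))
}

func main() {
	// Made-up size classes; class 0 stands in for "large", as above.
	sizes := []uint32{0, 8, 16, 32, 48}
	fmt.Println(bucketsFromSizeClasses(sizes))
	// [1 9 17 33 49 +Inf]: the 48-byte class now covers [33, 49).
}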
+ for i, count := range in.heapStats.smallAllocCount[1:] { + hist.counts[i] = uint64(count) + } + }, + }, + "/gc/heap/allocs:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.totalAllocated + }, + }, + "/gc/heap/allocs:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.totalAllocs + }, + }, + "/gc/heap/frees-by-size:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(sizeClassBuckets) + hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeFreeCount) + // Cut off the first index which is ostensibly for size class 0, + // but large objects are tracked separately so it's actually unused. + for i, count := range in.heapStats.smallFreeCount[1:] { + hist.counts[i] = uint64(count) + } + }, + }, + "/gc/heap/frees:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.totalFreed + }, + }, + "/gc/heap/frees:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.totalFrees + }, + }, + "/gc/heap/goal:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.heapGoal + }, + }, + "/gc/heap/objects:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.numObjects + }, + }, + "/gc/heap/tiny/allocs:objects": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.tinyAllocCount) + }, + }, + "/gc/pauses:seconds": { + compute: func(_ *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(timeHistBuckets) + // The bottom-most bucket, containing negative values, is tracked + // as a separately as underflow, so fill that in manually and then + // iterate over the rest. 
+ hist.counts[0] = atomic.Load64(&memstats.gcPauseDist.underflow) + for i := range memstats.gcPauseDist.counts { + hist.counts[i+1] = atomic.Load64(&memstats.gcPauseDist.counts[i]) + } + }, + }, + "/memory/classes/heap/free:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed - in.heapStats.inHeap - + in.heapStats.inStacks - in.heapStats.inWorkBufs - + in.heapStats.inPtrScalarBits) + }, + }, + "/memory/classes/heap/objects:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.heapStats.inObjects + }, + }, + "/memory/classes/heap/released:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.released) + }, + }, + "/memory/classes/heap/stacks:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inStacks) + }, + }, + "/memory/classes/heap/unused:bytes": { + deps: makeStatDepSet(heapStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inHeap) - in.heapStats.inObjects + }, + }, + "/memory/classes/metadata/mcache/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheSys - in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mcache/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mCacheInUse + }, + }, + "/memory/classes/metadata/mspan/free:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanSys - in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/mspan/inuse:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.mSpanInUse + }, + }, + "/memory/classes/metadata/other:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.inWorkBufs+in.heapStats.inPtrScalarBits) + in.sysStats.gcMiscSys + }, + }, + "/memory/classes/os-stacks:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.stacksSys + }, + }, + "/memory/classes/other:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.otherSys + }, + }, + "/memory/classes/profiling/buckets:bytes": { + deps: makeStatDepSet(sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = in.sysStats.buckHashSys + }, + }, + "/memory/classes/total:bytes": { + deps: makeStatDepSet(heapStatsDep, sysStatsDep), + compute: func(in *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(in.heapStats.committed+in.heapStats.released) + + in.sysStats.stacksSys + in.sysStats.mSpanSys + + 
in.sysStats.mCacheSys + in.sysStats.buckHashSys + + in.sysStats.gcMiscSys + in.sysStats.otherSys + }, + }, + "/sched/goroutines:goroutines": { + compute: func(_ *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(gcount()) + }, + }, + "/sched/latencies:seconds": { + compute: func(_ *statAggregate, out *metricValue) { + hist := out.float64HistOrInit(timeHistBuckets) + hist.counts[0] = atomic.Load64(&sched.timeToRun.underflow) + for i := range sched.timeToRun.counts { + hist.counts[i+1] = atomic.Load64(&sched.timeToRun.counts[i]) + } + }, + }, + } + metricsInit = true +} + +// statDep is a dependency on a group of statistics +// that a metric might have. +type statDep uint + +const ( + heapStatsDep statDep = iota // corresponds to heapStatsAggregate + sysStatsDep // corresponds to sysStatsAggregate + numStatsDeps +) + +// statDepSet represents a set of statDeps. +// +// Under the hood, it's a bitmap. +type statDepSet [1]uint64 + +// makeStatDepSet creates a new statDepSet from a list of statDeps. +func makeStatDepSet(deps ...statDep) statDepSet { + var s statDepSet + for _, d := range deps { + s[d/64] |= 1 << (d % 64) + } + return s +} + +// differennce returns set difference of s from b as a new set. +func (s statDepSet) difference(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] &^ b[i] + } + return c +} + +// union returns the union of the two sets as a new set. +func (s statDepSet) union(b statDepSet) statDepSet { + var c statDepSet + for i := range s { + c[i] = s[i] | b[i] + } + return c +} + +// empty returns true if there are no dependencies in the set. +func (s *statDepSet) empty() bool { + for _, c := range s { + if c != 0 { + return false + } + } + return true +} + +// has returns true if the set contains a given statDep. +func (s *statDepSet) has(d statDep) bool { + return s[d/64]&(1<<(d%64)) != 0 +} + +// heapStatsAggregate represents memory stats obtained from the +// runtime. This set of stats is grouped together because they +// depend on each other in some way to make sense of the runtime's +// current heap memory use. They're also sharded across Ps, so it +// makes sense to grab them all at once. +type heapStatsAggregate struct { + heapStatsDelta + + // Derived from values in heapStatsDelta. + + // inObjects is the bytes of memory occupied by objects, + inObjects uint64 + + // numObjects is the number of live objects in the heap. + numObjects uint64 + + // totalAllocated is the total bytes of heap objects allocated + // over the lifetime of the program. + totalAllocated uint64 + + // totalFreed is the total bytes of heap objects freed + // over the lifetime of the program. + totalFreed uint64 + + // totalAllocs is the number of heap objects allocated over + // the lifetime of the program. + totalAllocs uint64 + + // totalFrees is the number of heap objects freed over + // the lifetime of the program. + totalFrees uint64 +} + +// compute populates the heapStatsAggregate with values from the runtime. +func (a *heapStatsAggregate) compute() { + memstats.heapStats.read(&a.heapStatsDelta) + + // Calculate derived stats. 
+ a.totalAllocs = a.largeAllocCount + a.totalFrees = a.largeFreeCount + a.totalAllocated = a.largeAlloc + a.totalFreed = a.largeFree + for i := range a.smallAllocCount { + na := a.smallAllocCount[i] + nf := a.smallFreeCount[i] + a.totalAllocs += na + a.totalFrees += nf + a.totalAllocated += na * uint64(class_to_size[i]) + a.totalFreed += nf * uint64(class_to_size[i]) + } + a.inObjects = a.totalAllocated - a.totalFreed + a.numObjects = a.totalAllocs - a.totalFrees +} + +// sysStatsAggregate represents system memory stats obtained +// from the runtime. This set of stats is grouped together because +// they're all relatively cheap to acquire and generally independent +// of one another and other runtime memory stats. The fact that they +// may be acquired at different times, especially with respect to +// heapStatsAggregate, means there could be some skew, but because of +// these stats are independent, there's no real consistency issue here. +type sysStatsAggregate struct { + stacksSys uint64 + mSpanSys uint64 + mSpanInUse uint64 + mCacheSys uint64 + mCacheInUse uint64 + buckHashSys uint64 + gcMiscSys uint64 + otherSys uint64 + heapGoal uint64 + gcCyclesDone uint64 + gcCyclesForced uint64 +} + +// compute populates the sysStatsAggregate with values from the runtime. +func (a *sysStatsAggregate) compute() { + a.stacksSys = memstats.stacks_sys.load() + a.buckHashSys = memstats.buckhash_sys.load() + a.gcMiscSys = memstats.gcMiscSys.load() + a.otherSys = memstats.other_sys.load() + a.heapGoal = atomic.Load64(&gcController.heapGoal) + a.gcCyclesDone = uint64(memstats.numgc) + a.gcCyclesForced = uint64(memstats.numforcedgc) + + systemstack(func() { + lock(&mheap_.lock) + a.mSpanSys = memstats.mspan_sys.load() + a.mSpanInUse = uint64(mheap_.spanalloc.inuse) + a.mCacheSys = memstats.mcache_sys.load() + a.mCacheInUse = uint64(mheap_.cachealloc.inuse) + unlock(&mheap_.lock) + }) +} + +// statAggregate is the main driver of the metrics implementation. +// +// It contains multiple aggregates of runtime statistics, as well +// as a set of these aggregates that it has populated. The aggergates +// are populated lazily by its ensure method. +type statAggregate struct { + ensured statDepSet + heapStats heapStatsAggregate + sysStats sysStatsAggregate +} + +// ensure populates statistics aggregates determined by deps if they +// haven't yet been populated. +func (a *statAggregate) ensure(deps *statDepSet) { + missing := deps.difference(a.ensured) + if missing.empty() { + return + } + for i := statDep(0); i < numStatsDeps; i++ { + if !missing.has(i) { + continue + } + switch i { + case heapStatsDep: + a.heapStats.compute() + case sysStatsDep: + a.sysStats.compute() + } + } + a.ensured = a.ensured.union(missing) +} + +// metricValidKind is a runtime copy of runtime/metrics.ValueKind and +// must be kept structurally identical to that type. +type metricKind int + +const ( + // These values must be kept identical to their corresponding Kind* values + // in the runtime/metrics package. + metricKindBad metricKind = iota + metricKindUint64 + metricKindFloat64 + metricKindFloat64Histogram +) + +// metricSample is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricSample struct { + name string + value metricValue +} + +// metricValue is a runtime copy of runtime/metrics.Sample and +// must be kept structurally identical to that type. +type metricValue struct { + kind metricKind + scalar uint64 // contains scalar values for scalar Kinds. 
+ pointer unsafe.Pointer // contains non-scalar values. +} + +// float64HistOrInit tries to pull out an existing float64Histogram +// from the value, but if none exists, then it allocates one with +// the given buckets. +func (v *metricValue) float64HistOrInit(buckets []float64) *metricFloat64Histogram { + var hist *metricFloat64Histogram + if v.kind == metricKindFloat64Histogram && v.pointer != nil { + hist = (*metricFloat64Histogram)(v.pointer) + } else { + v.kind = metricKindFloat64Histogram + hist = new(metricFloat64Histogram) + v.pointer = unsafe.Pointer(hist) + } + hist.buckets = buckets + if len(hist.counts) != len(hist.buckets)-1 { + hist.counts = make([]uint64, len(buckets)-1) + } + return hist +} + +// metricFloat64Histogram is a runtime copy of runtime/metrics.Float64Histogram +// and must be kept structurally identical to that type. +type metricFloat64Histogram struct { + counts []uint64 + buckets []float64 +} + +// agg is used by readMetrics, and is protected by metricsSema. +// +// Managed as a global variable because its pointer will be +// an argument to a dynamically-defined function, and we'd +// like to avoid it escaping to the heap. +var agg statAggregate + +// readMetrics is the implementation of runtime/metrics.Read. +// +//go:linkname readMetrics runtime/metrics.runtime_readMetrics +func readMetrics(samplesp unsafe.Pointer, len int, cap int) { + // Construct a slice from the args. + sl := slice{samplesp, len, cap} + samples := *(*[]metricSample)(unsafe.Pointer(&sl)) + + metricsLock() + + // Ensure the map is initialized. + initMetrics() + + // Clear agg defensively. + agg = statAggregate{} + + // Sample. + for i := range samples { + sample := &samples[i] + data, ok := metrics[sample.name] + if !ok { + sample.value.kind = metricKindBad + continue + } + // Ensure we have all the stats we need. + // agg is populated lazily. + agg.ensure(&data.deps) + + // Compute the value based on the stats we have. + data.compute(&agg, &sample.value) + } + + metricsUnlock() +} diff --git a/contrib/go/_std_1.18/src/runtime/mfinal.go b/contrib/go/_std_1.18/src/runtime/mfinal.go new file mode 100644 index 0000000000..10623e4d67 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mfinal.go @@ -0,0 +1,474 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: finalizers and block profiling. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// finblock is an array of finalizers to be executed. finblocks are +// arranged in a linked list for the finalizer queue. +// +// finblock is allocated from non-GC'd memory, so any heap pointers +// must be specially handled. GC currently assumes that the finalizer +// queue does not grow during marking (but it can shrink). +// +//go:notinheap +type finblock struct { + alllink *finblock + next *finblock + cnt uint32 + _ int32 + fin [(_FinBlockSize - 2*goarch.PtrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer +} + +var finlock mutex // protects the following variables +var fing *g // goroutine that runs finalizers +var finq *finblock // list of finalizers that are to be executed +var finc *finblock // cache of free blocks +var finptrmask [_FinBlockSize / goarch.PtrSize / 8]byte +var fingwait bool +var fingwake bool +var allfin *finblock // list of all blocks + +// NOTE: Layout known to queuefinalizer. 
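The metrics map and readMetrics above are the backing for the public runtime/metrics package. A small consumer of that public API, reading two of the metrics defined above, looks roughly like this; the choice of metrics and the output formatting are only for illustration.

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	// Both metric names appear in the metrics map above.
	samples := []metrics.Sample{
		{Name: "/gc/heap/goal:bytes"},
		{Name: "/gc/pauses:seconds"},
	}
	metrics.Read(samples)

	for _, s := range samples {
		switch s.Value.Kind() {
		case metrics.KindUint64:
			fmt.Println(s.Name, "=", s.Value.Uint64())
		case metrics.KindFloat64Histogram:
			h := s.Value.Float64Histogram()
			fmt.Println(s.Name, "histogram:", len(h.Counts), "counts,", len(h.Buckets), "boundaries")
		case metrics.KindBad:
			fmt.Println(s.Name, "is not supported by this runtime")
		}
	}
}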
+type finalizer struct { + fn *funcval // function to call (may be a heap pointer) + arg unsafe.Pointer // ptr to object (may be a heap pointer) + nret uintptr // bytes of return values from fn + fint *_type // type of first argument of fn + ot *ptrtype // type of ptr to object (may be a heap pointer) +} + +var finalizer1 = [...]byte{ + // Each Finalizer is 5 words, ptr ptr INT ptr ptr (INT = uintptr here) + // Each byte describes 8 words. + // Need 8 Finalizers described by 5 bytes before pattern repeats: + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // ptr ptr INT ptr ptr + // aka + // + // ptr ptr INT ptr ptr ptr ptr INT + // ptr ptr ptr ptr INT ptr ptr ptr + // ptr INT ptr ptr ptr ptr INT ptr + // ptr ptr ptr INT ptr ptr ptr ptr + // INT ptr ptr ptr ptr INT ptr ptr + // + // Assumptions about Finalizer layout checked below. + 1<<0 | 1<<1 | 0<<2 | 1<<3 | 1<<4 | 1<<5 | 1<<6 | 0<<7, + 1<<0 | 1<<1 | 1<<2 | 1<<3 | 0<<4 | 1<<5 | 1<<6 | 1<<7, + 1<<0 | 0<<1 | 1<<2 | 1<<3 | 1<<4 | 1<<5 | 0<<6 | 1<<7, + 1<<0 | 1<<1 | 1<<2 | 0<<3 | 1<<4 | 1<<5 | 1<<6 | 1<<7, + 0<<0 | 1<<1 | 1<<2 | 1<<3 | 1<<4 | 0<<5 | 1<<6 | 1<<7, +} + +func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) { + if gcphase != _GCoff { + // Currently we assume that the finalizer queue won't + // grow during marking so we don't have to rescan it + // during mark termination. If we ever need to lift + // this assumption, we can do it by adding the + // necessary barriers to queuefinalizer (which it may + // have automatically). + throw("queuefinalizer during GC") + } + + lock(&finlock) + if finq == nil || finq.cnt == uint32(len(finq.fin)) { + if finc == nil { + finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys)) + finc.alllink = allfin + allfin = finc + if finptrmask[0] == 0 { + // Build pointer mask for Finalizer array in block. + // Check assumptions made in finalizer1 array above. 
+ if (unsafe.Sizeof(finalizer{}) != 5*goarch.PtrSize || + unsafe.Offsetof(finalizer{}.fn) != 0 || + unsafe.Offsetof(finalizer{}.arg) != goarch.PtrSize || + unsafe.Offsetof(finalizer{}.nret) != 2*goarch.PtrSize || + unsafe.Offsetof(finalizer{}.fint) != 3*goarch.PtrSize || + unsafe.Offsetof(finalizer{}.ot) != 4*goarch.PtrSize) { + throw("finalizer out of sync") + } + for i := range finptrmask { + finptrmask[i] = finalizer1[i%len(finalizer1)] + } + } + } + block := finc + finc = block.next + block.next = finq + finq = block + } + f := &finq.fin[finq.cnt] + atomic.Xadd(&finq.cnt, +1) // Sync with markroots + f.fn = fn + f.nret = nret + f.fint = fint + f.ot = ot + f.arg = p + fingwake = true + unlock(&finlock) +} + +//go:nowritebarrier +func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrtype)) { + for fb := allfin; fb != nil; fb = fb.alllink { + for i := uint32(0); i < fb.cnt; i++ { + f := &fb.fin[i] + callback(f.fn, f.arg, f.nret, f.fint, f.ot) + } + } +} + +func wakefing() *g { + var res *g + lock(&finlock) + if fingwait && fingwake { + fingwait = false + fingwake = false + res = fing + } + unlock(&finlock) + return res +} + +var ( + fingCreate uint32 + fingRunning bool +) + +func createfing() { + // start the finalizer goroutine exactly once + if fingCreate == 0 && atomic.Cas(&fingCreate, 0, 1) { + go runfinq() + } +} + +// This is the goroutine that runs all of the finalizers +func runfinq() { + var ( + frame unsafe.Pointer + framecap uintptr + argRegs int + ) + + for { + lock(&finlock) + fb := finq + finq = nil + if fb == nil { + gp := getg() + fing = gp + fingwait = true + goparkunlock(&finlock, waitReasonFinalizerWait, traceEvGoBlock, 1) + continue + } + argRegs = intArgRegs + unlock(&finlock) + if raceenabled { + racefingo() + } + for fb != nil { + for i := fb.cnt; i > 0; i-- { + f := &fb.fin[i-1] + + var regs abi.RegArgs + // The args may be passed in registers or on stack. Even for + // the register case, we still need the spill slots. + // TODO: revisit if we remove spill slots. + // + // Unfortunately because we can have an arbitrary + // amount of returns and it would be complex to try and + // figure out how many of those can get passed in registers, + // just conservatively assume none of them do. + framesz := unsafe.Sizeof((any)(nil)) + f.nret + if framecap < framesz { + // The frame does not contain pointers interesting for GC, + // all not yet finalized objects are stored in finq. + // If we do not mark it as FlagNoScan, + // the last finalized object is not collected. + frame = mallocgc(framesz, nil, true) + framecap = framesz + } + + if f.fint == nil { + throw("missing type in runfinq") + } + r := frame + if argRegs > 0 { + r = unsafe.Pointer(®s.Ints) + } else { + // frame is effectively uninitialized + // memory. That means we have to clear + // it before writing to it to avoid + // confusing the write barrier. 
+ *(*[2]uintptr)(frame) = [2]uintptr{} + } + switch f.fint.kind & kindMask { + case kindPtr: + // direct use of pointer + *(*unsafe.Pointer)(r) = f.arg + case kindInterface: + ityp := (*interfacetype)(unsafe.Pointer(f.fint)) + // set up with empty interface + (*eface)(r)._type = &f.ot.typ + (*eface)(r).data = f.arg + if len(ityp.mhdr) != 0 { + // convert to interface with methods + // this conversion is guaranteed to succeed - we checked in SetFinalizer + (*iface)(r).tab = assertE2I(ityp, (*eface)(r)._type) + } + default: + throw("bad kind in runfinq") + } + fingRunning = true + reflectcall(nil, unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz), uint32(framesz), ®s) + fingRunning = false + + // Drop finalizer queue heap references + // before hiding them from markroot. + // This also ensures these will be + // clear if we reuse the finalizer. + f.fn = nil + f.arg = nil + f.ot = nil + atomic.Store(&fb.cnt, i-1) + } + next := fb.next + lock(&finlock) + fb.next = finc + finc = fb + unlock(&finlock) + fb = next + } + } +} + +// SetFinalizer sets the finalizer associated with obj to the provided +// finalizer function. When the garbage collector finds an unreachable block +// with an associated finalizer, it clears the association and runs +// finalizer(obj) in a separate goroutine. This makes obj reachable again, +// but now without an associated finalizer. Assuming that SetFinalizer +// is not called again, the next time the garbage collector sees +// that obj is unreachable, it will free obj. +// +// SetFinalizer(obj, nil) clears any finalizer associated with obj. +// +// The argument obj must be a pointer to an object allocated by calling +// new, by taking the address of a composite literal, or by taking the +// address of a local variable. +// The argument finalizer must be a function that takes a single argument +// to which obj's type can be assigned, and can have arbitrary ignored return +// values. If either of these is not true, SetFinalizer may abort the +// program. +// +// Finalizers are run in dependency order: if A points at B, both have +// finalizers, and they are otherwise unreachable, only the finalizer +// for A runs; once A is freed, the finalizer for B can run. +// If a cyclic structure includes a block with a finalizer, that +// cycle is not guaranteed to be garbage collected and the finalizer +// is not guaranteed to run, because there is no ordering that +// respects the dependencies. +// +// The finalizer is scheduled to run at some arbitrary time after the +// program can no longer reach the object to which obj points. +// There is no guarantee that finalizers will run before a program exits, +// so typically they are useful only for releasing non-memory resources +// associated with an object during a long-running program. +// For example, an os.File object could use a finalizer to close the +// associated operating system file descriptor when a program discards +// an os.File without calling Close, but it would be a mistake +// to depend on a finalizer to flush an in-memory I/O buffer such as a +// bufio.Writer, because the buffer would not be flushed at program exit. +// +// It is not guaranteed that a finalizer will run if the size of *obj is +// zero bytes. +// +// It is not guaranteed that a finalizer will run for objects allocated +// in initializers for package-level variables. Such objects may be +// linker-allocated, not heap-allocated. +// +// A finalizer may run as soon as an object becomes unreachable. 
+// In order to use finalizers correctly, the program must ensure that +// the object is reachable until it is no longer required. +// Objects stored in global variables, or that can be found by tracing +// pointers from a global variable, are reachable. For other objects, +// pass the object to a call of the KeepAlive function to mark the +// last point in the function where the object must be reachable. +// +// For example, if p points to a struct, such as os.File, that contains +// a file descriptor d, and p has a finalizer that closes that file +// descriptor, and if the last use of p in a function is a call to +// syscall.Write(p.d, buf, size), then p may be unreachable as soon as +// the program enters syscall.Write. The finalizer may run at that moment, +// closing p.d, causing syscall.Write to fail because it is writing to +// a closed file descriptor (or, worse, to an entirely different +// file descriptor opened by a different goroutine). To avoid this problem, +// call runtime.KeepAlive(p) after the call to syscall.Write. +// +// A single goroutine runs all finalizers for a program, sequentially. +// If a finalizer must run for a long time, it should do so by starting +// a new goroutine. +func SetFinalizer(obj any, finalizer any) { + if debug.sbrk != 0 { + // debug.sbrk never frees memory, so no finalizers run + // (and we don't have the data structures to record them). + return + } + e := efaceOf(&obj) + etyp := e._type + if etyp == nil { + throw("runtime.SetFinalizer: first argument is nil") + } + if etyp.kind&kindMask != kindPtr { + throw("runtime.SetFinalizer: first argument is " + etyp.string() + ", not pointer") + } + ot := (*ptrtype)(unsafe.Pointer(etyp)) + if ot.elem == nil { + throw("nil elem type!") + } + + // find the containing object + base, _, _ := findObject(uintptr(e.data), 0, 0) + + if base == 0 { + // 0-length objects are okay. + if e.data == unsafe.Pointer(&zerobase) { + return + } + + // Global initializers might be linker-allocated. + // var Foo = &Object{} + // func main() { + // runtime.SetFinalizer(Foo, nil) + // } + // The relevant segments are: noptrdata, data, bss, noptrbss. + // We cannot assume they are in any order or even contiguous, + // due to external linking. + for datap := &firstmoduledata; datap != nil; datap = datap.next { + if datap.noptrdata <= uintptr(e.data) && uintptr(e.data) < datap.enoptrdata || + datap.data <= uintptr(e.data) && uintptr(e.data) < datap.edata || + datap.bss <= uintptr(e.data) && uintptr(e.data) < datap.ebss || + datap.noptrbss <= uintptr(e.data) && uintptr(e.data) < datap.enoptrbss { + return + } + } + throw("runtime.SetFinalizer: pointer not in allocated block") + } + + if uintptr(e.data) != base { + // As an implementation detail we allow to set finalizers for an inner byte + // of an object if it could come from tiny alloc (see mallocgc for details). 
+ if ot.elem == nil || ot.elem.ptrdata != 0 || ot.elem.size >= maxTinySize { + throw("runtime.SetFinalizer: pointer not at beginning of allocated block") + } + } + + f := efaceOf(&finalizer) + ftyp := f._type + if ftyp == nil { + // switch to system stack and remove finalizer + systemstack(func() { + removefinalizer(e.data) + }) + return + } + + if ftyp.kind&kindMask != kindFunc { + throw("runtime.SetFinalizer: second argument is " + ftyp.string() + ", not a function") + } + ft := (*functype)(unsafe.Pointer(ftyp)) + if ft.dotdotdot() { + throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string() + " because dotdotdot") + } + if ft.inCount != 1 { + throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string()) + } + fint := ft.in()[0] + switch { + case fint == etyp: + // ok - same type + goto okarg + case fint.kind&kindMask == kindPtr: + if (fint.uncommon() == nil || etyp.uncommon() == nil) && (*ptrtype)(unsafe.Pointer(fint)).elem == ot.elem { + // ok - not same type, but both pointers, + // one or the other is unnamed, and same element type, so assignable. + goto okarg + } + case fint.kind&kindMask == kindInterface: + ityp := (*interfacetype)(unsafe.Pointer(fint)) + if len(ityp.mhdr) == 0 { + // ok - satisfies empty interface + goto okarg + } + if iface := assertE2I2(ityp, *efaceOf(&obj)); iface.tab != nil { + goto okarg + } + } + throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string()) +okarg: + // compute size needed for return parameters + nret := uintptr(0) + for _, t := range ft.out() { + nret = alignUp(nret, uintptr(t.align)) + uintptr(t.size) + } + nret = alignUp(nret, goarch.PtrSize) + + // make sure we have a finalizer goroutine + createfing() + + systemstack(func() { + if !addfinalizer(e.data, (*funcval)(f.data), nret, fint, ot) { + throw("runtime.SetFinalizer: finalizer already set") + } + }) +} + +// Mark KeepAlive as noinline so that it is easily detectable as an intrinsic. +//go:noinline + +// KeepAlive marks its argument as currently reachable. +// This ensures that the object is not freed, and its finalizer is not run, +// before the point in the program where KeepAlive is called. +// +// A very simplified example showing where KeepAlive is required: +// type File struct { d int } +// d, err := syscall.Open("/file/path", syscall.O_RDONLY, 0) +// // ... do something if err != nil ... +// p := &File{d} +// runtime.SetFinalizer(p, func(p *File) { syscall.Close(p.d) }) +// var buf [10]byte +// n, err := syscall.Read(p.d, buf[:]) +// // Ensure p is not finalized until Read returns. +// runtime.KeepAlive(p) +// // No more uses of p after this point. +// +// Without the KeepAlive call, the finalizer could run at the start of +// syscall.Read, closing the file descriptor before syscall.Read makes +// the actual system call. +// +// Note: KeepAlive should only be used to prevent finalizers from +// running prematurely. In particular, when used with unsafe.Pointer, +// the rules for valid uses of unsafe.Pointer still apply. +func KeepAlive(x any) { + // Introduce a use of x that the compiler can't eliminate. + // This makes sure x is alive on entry. We need x to be alive + // on entry for "defer runtime.KeepAlive(x)"; see issue 21402. 
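+ // cgoAlwaysFalse is a global that is always false, but the compiler
+ // cannot prove that, so it must assume the println(x) below may run.
+ // That conditional use is what keeps x live up to this point; the
+ // branch is never taken at run time.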
+ if cgoAlwaysFalse { + println(x) + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mfixalloc.go b/contrib/go/_std_1.18/src/runtime/mfixalloc.go new file mode 100644 index 0000000000..b701a09b40 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mfixalloc.go @@ -0,0 +1,108 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fixed-size object allocator. Returned memory is not zeroed. +// +// See malloc.go for overview. + +package runtime + +import "unsafe" + +// FixAlloc is a simple free-list allocator for fixed size objects. +// Malloc uses a FixAlloc wrapped around sysAlloc to manage its +// mcache and mspan objects. +// +// Memory returned by fixalloc.alloc is zeroed by default, but the +// caller may take responsibility for zeroing allocations by setting +// the zero flag to false. This is only safe if the memory never +// contains heap pointers. +// +// The caller is responsible for locking around FixAlloc calls. +// Callers can keep state in the object but the first word is +// smashed by freeing and reallocating. +// +// Consider marking fixalloc'd types go:notinheap. +type fixalloc struct { + size uintptr + first func(arg, p unsafe.Pointer) // called first time p is returned + arg unsafe.Pointer + list *mlink + chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers + nchunk uint32 // bytes remaining in current chunk + nalloc uint32 // size of new chunks in bytes + inuse uintptr // in-use bytes now + stat *sysMemStat + zero bool // zero allocations +} + +// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) +// Since assignments to mlink.next will result in a write barrier being performed +// this cannot be used by some of the internal GC structures. For example when +// the sweeper is placing an unmarked object on the free list it does not want the +// write barrier to be called since that could result in the object being reachable. +// +//go:notinheap +type mlink struct { + next *mlink +} + +// Initialize f to allocate objects of the given size, +// using the allocator to obtain chunks of memory. 
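+// A typical use looks roughly like mheap's span allocator setup (the
+// exact call-site arguments below are illustrative, not quoted from
+// mheap.go):
+//
+//	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+//	s := (*mspan)(h.spanalloc.alloc())
+//	// ... use s ...
+//	h.spanalloc.free(unsafe.Pointer(s))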
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) { + if size > _FixAllocChunk { + throw("runtime: fixalloc size too large") + } + if min := unsafe.Sizeof(mlink{}); size < min { + size = min + } + + f.size = size + f.first = first + f.arg = arg + f.list = nil + f.chunk = 0 + f.nchunk = 0 + f.nalloc = uint32(_FixAllocChunk / size * size) // Round _FixAllocChunk down to an exact multiple of size to eliminate tail waste + f.inuse = 0 + f.stat = stat + f.zero = true +} + +func (f *fixalloc) alloc() unsafe.Pointer { + if f.size == 0 { + print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n") + throw("runtime: internal error") + } + + if f.list != nil { + v := unsafe.Pointer(f.list) + f.list = f.list.next + f.inuse += f.size + if f.zero { + memclrNoHeapPointers(v, f.size) + } + return v + } + if uintptr(f.nchunk) < f.size { + f.chunk = uintptr(persistentalloc(uintptr(f.nalloc), 0, f.stat)) + f.nchunk = f.nalloc + } + + v := unsafe.Pointer(f.chunk) + if f.first != nil { + f.first(f.arg, v) + } + f.chunk = f.chunk + f.size + f.nchunk -= uint32(f.size) + f.inuse += f.size + return v +} + +func (f *fixalloc) free(p unsafe.Pointer) { + f.inuse -= f.size + v := (*mlink)(p) + v.next = f.list + f.list = v +} diff --git a/contrib/go/_std_1.18/src/runtime/mgc.go b/contrib/go/_std_1.18/src/runtime/mgc.go new file mode 100644 index 0000000000..44b96154e7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgc.go @@ -0,0 +1,1714 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector (GC). +// +// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple +// GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is +// non-generational and non-compacting. Allocation is done using size segregated per P allocation +// areas to minimize fragmentation while eliminating locks in the common case. +// +// The algorithm decomposes into several steps. +// This is a high level description of the algorithm being used. For an overview of GC a good +// place to start is Richard Jones' gchandbook.org. +// +// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see +// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978. +// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), +// 966-975. +// For journal quality proofs that these steps are complete, correct, and terminate see +// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world. +// Concurrency and Computation: Practice and Experience 15(3-5), 2003. +// +// 1. GC performs sweep termination. +// +// a. Stop the world. This causes all Ps to reach a GC safe-point. +// +// b. Sweep any unswept spans. There will only be unswept spans if +// this GC cycle was forced before the expected time. +// +// 2. GC performs the mark phase. +// +// a. Prepare for the mark phase by setting gcphase to _GCmark +// (from _GCoff), enabling the write barrier, enabling mutator +// assists, and enqueueing root mark jobs. No objects may be +// scanned until all Ps have enabled the write barrier, which is +// accomplished using STW. +// +// b. Start the world. 
From this point, GC work is done by mark +// workers started by the scheduler and by assists performed as +// part of allocation. The write barrier shades both the +// overwritten pointer and the new pointer value for any pointer +// writes (see mbarrier.go for details). Newly allocated objects +// are immediately marked black. +// +// c. GC performs root marking jobs. This includes scanning all +// stacks, shading all globals, and shading any heap pointers in +// off-heap runtime data structures. Scanning a stack stops a +// goroutine, shades any pointers found on its stack, and then +// resumes the goroutine. +// +// d. GC drains the work queue of grey objects, scanning each grey +// object to black and shading all pointers found in the object +// (which in turn may add those pointers to the work queue). +// +// e. Because GC work is spread across local caches, GC uses a +// distributed termination algorithm to detect when there are no +// more root marking jobs or grey objects (see gcMarkDone). At this +// point, GC transitions to mark termination. +// +// 3. GC performs mark termination. +// +// a. Stop the world. +// +// b. Set gcphase to _GCmarktermination, and disable workers and +// assists. +// +// c. Perform housekeeping like flushing mcaches. +// +// 4. GC performs the sweep phase. +// +// a. Prepare for the sweep phase by setting gcphase to _GCoff, +// setting up sweep state and disabling the write barrier. +// +// b. Start the world. From this point on, newly allocated objects +// are white, and allocating sweeps spans before use if necessary. +// +// c. GC does concurrent sweeping in the background and in response +// to allocation. See description below. +// +// 5. When sufficient allocation has taken place, replay the sequence +// starting with 1 above. See discussion of GC rate below. + +// Concurrent sweep. +// +// The sweep phase proceeds concurrently with normal program execution. +// The heap is swept span-by-span both lazily (when a goroutine needs another span) +// and concurrently in a background goroutine (this helps programs that are not CPU bound). +// At the end of STW mark termination all spans are marked as "needs sweeping". +// +// The background sweeper goroutine simply sweeps spans one-by-one. +// +// To avoid requesting more OS memory while there are unswept spans, when a +// goroutine needs another span, it first attempts to reclaim that much memory +// by sweeping. When a goroutine needs to allocate a new small-object span, it +// sweeps small-object spans for the same object size until it frees at least +// one object. When a goroutine needs to allocate large-object span from heap, +// it sweeps spans until it frees at least that many pages into heap. There is +// one case where this may not suffice: if a goroutine sweeps and frees two +// nonadjacent one-page spans to the heap, it will allocate a new two-page +// span, but there can still be other one-page unswept spans which could be +// combined into a two-page span. +// +// It's critical to ensure that no operations proceed on unswept spans (that would corrupt +// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache, +// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it. +// When a goroutine explicitly frees an object or sets a finalizer, it ensures that +// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish). +// The finalizer goroutine is kicked off only when all spans are swept. 
+// When the next GC starts, it sweeps all not-yet-swept spans (if any). + +// GC rate. +// Next GC is after we've allocated an extra amount of memory proportional to +// the amount already in use. The proportion is controlled by GOGC environment variable +// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M +// (this mark is tracked in gcController.heapGoal variable). This keeps the GC cost in +// linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant +// (and also the amount of extra memory used). + +// Oblets +// +// In order to prevent long pauses while scanning large objects and to +// improve parallelism, the garbage collector breaks up scan jobs for +// objects larger than maxObletBytes into "oblets" of at most +// maxObletBytes. When scanning encounters the beginning of a large +// object, it scans only the first oblet and enqueues the remaining +// oblets as new scan jobs. + +package runtime + +import ( + "internal/cpu" + "runtime/internal/atomic" + "unsafe" +) + +const ( + _DebugGC = 0 + _ConcurrentSweep = true + _FinBlockSize = 4 * 1024 + + // debugScanConservative enables debug logging for stack + // frames that are scanned conservatively. + debugScanConservative = false + + // sweepMinHeapDistance is a lower bound on the heap distance + // (in bytes) reserved for concurrent sweeping between GC + // cycles. + sweepMinHeapDistance = 1024 * 1024 +) + +func gcinit() { + if unsafe.Sizeof(workbuf{}) != _WorkbufSize { + throw("size of Workbuf is suboptimal") + } + // No sweep on the first cycle. + sweep.active.state.Store(sweepDrainedMask) + + // Initialize GC pacer state. + // Use the environment variable GOGC for the initial gcPercent value. + gcController.init(readGOGC()) + + work.startSema = 1 + work.markDoneSema = 1 + lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters) + lockInit(&work.assistQueue.lock, lockRankAssistQueue) + lockInit(&work.wbufSpans.lock, lockRankWbufSpans) +} + +// gcenable is called after the bulk of the runtime initialization, +// just before we're about to start letting user code run. +// It kicks off the background sweeper goroutine, the background +// scavenger goroutine, and enables GC. +func gcenable() { + // Kick off sweeping and scavenging. + c := make(chan int, 2) + go bgsweep(c) + go bgscavenge(c) + <-c + <-c + memstats.enablegc = true // now that runtime is initialized, GC is okay +} + +// Garbage collector phase. +// Indicates to write barrier and synchronization task to perform. +var gcphase uint32 + +// The compiler knows about this variable. +// If you change it, you must change builtin/runtime.go, too. +// If you change the first four bytes, you must also change the write +// barrier insertion code. +var writeBarrier struct { + enabled bool // compiler emits a check of this before calling write barrier + pad [3]byte // compiler uses 32-bit load for "enabled" field + needed bool // whether we need a write barrier for current GC phase + cgo bool // whether we need a write barrier for a cgo check + alignme uint64 // guarantee alignment so that compiler can use a 32 or 64-bit load +} + +// gcBlackenEnabled is 1 if mutator assists and background mark +// workers are allowed to blacken objects. This must only be set when +// gcphase == _GCmark. 
+var gcBlackenEnabled uint32 + +const ( + _GCoff = iota // GC not running; sweeping in background, write barrier disabled + _GCmark // GC marking roots and workbufs: allocate black, write barrier ENABLED + _GCmarktermination // GC mark termination: allocate black, P's help GC, write barrier ENABLED +) + +//go:nosplit +func setGCPhase(x uint32) { + atomic.Store(&gcphase, x) + writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination + writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo +} + +// gcMarkWorkerMode represents the mode that a concurrent mark worker +// should operate in. +// +// Concurrent marking happens through four different mechanisms. One +// is mutator assists, which happen in response to allocations and are +// not scheduled. The other three are variations in the per-P mark +// workers and are distinguished by gcMarkWorkerMode. +type gcMarkWorkerMode int + +const ( + // gcMarkWorkerNotWorker indicates that the next scheduled G is not + // starting work and the mode should be ignored. + gcMarkWorkerNotWorker gcMarkWorkerMode = iota + + // gcMarkWorkerDedicatedMode indicates that the P of a mark + // worker is dedicated to running that mark worker. The mark + // worker should run without preemption. + gcMarkWorkerDedicatedMode + + // gcMarkWorkerFractionalMode indicates that a P is currently + // running the "fractional" mark worker. The fractional worker + // is necessary when GOMAXPROCS*gcBackgroundUtilization is not + // an integer and using only dedicated workers would result in + // utilization too far from the target of gcBackgroundUtilization. + // The fractional worker should run until it is preempted and + // will be scheduled to pick up the fractional part of + // GOMAXPROCS*gcBackgroundUtilization. + gcMarkWorkerFractionalMode + + // gcMarkWorkerIdleMode indicates that a P is running the mark + // worker because it has nothing else to do. The idle worker + // should run until it is preempted and account its time + // against gcController.idleMarkTime. + gcMarkWorkerIdleMode +) + +// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes +// to use in execution traces. +var gcMarkWorkerModeStrings = [...]string{ + "Not worker", + "GC (dedicated)", + "GC (fractional)", + "GC (idle)", +} + +// pollFractionalWorkerExit reports whether a fractional mark worker +// should self-preempt. It assumes it is called from the fractional +// worker. +func pollFractionalWorkerExit() bool { + // This should be kept in sync with the fractional worker + // scheduler logic in findRunnableGCWorker. + now := nanotime() + delta := now - gcController.markStartTime + if delta <= 0 { + return true + } + p := getg().m.p.ptr() + selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime) + // Add some slack to the utilization goal so that the + // fractional worker isn't behind again the instant it exits. + return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal +} + +var work struct { + full lfstack // lock-free list of full blocks workbuf + empty lfstack // lock-free list of empty blocks workbuf + pad0 cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait + + wbufSpans struct { + lock mutex + // free is a list of spans dedicated to workbufs, but + // that don't currently contain any workbufs. + free mSpanList + // busy is a list of all spans containing workbufs on + // one of the workbuf lists. + busy mSpanList + } + + // Restore 64-bit alignment on 32-bit. 
+ _ uint32 + + // bytesMarked is the number of bytes marked this cycle. This + // includes bytes blackened in scanned objects, noscan objects + // that go straight to black, and permagrey objects scanned by + // markroot during the concurrent scan phase. This is updated + // atomically during the cycle. Updates may be batched + // arbitrarily, since the value is only read at the end of the + // cycle. + // + // Because of benign races during marking, this number may not + // be the exact number of marked bytes, but it should be very + // close. + // + // Put this field here because it needs 64-bit atomic access + // (and thus 8-byte alignment even on 32-bit architectures). + bytesMarked uint64 + + markrootNext uint32 // next markroot job + markrootJobs uint32 // number of markroot jobs + + nproc uint32 + tstart int64 + nwait uint32 + + // Number of roots of various root types. Set by gcMarkRootPrepare. + // + // nStackRoots == len(stackRoots), but we have nStackRoots for + // consistency. + nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int + + // Base indexes of each root type. Set by gcMarkRootPrepare. + baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32 + + // stackRoots is a snapshot of all of the Gs that existed + // before the beginning of concurrent marking. The backing + // store of this must not be modified because it might be + // shared with allgs. + stackRoots []*g + + // Each type of GC state transition is protected by a lock. + // Since multiple threads can simultaneously detect the state + // transition condition, any thread that detects a transition + // condition must acquire the appropriate transition lock, + // re-check the transition condition and return if it no + // longer holds or perform the transition if it does. + // Likewise, any transition must invalidate the transition + // condition before releasing the lock. This ensures that each + // transition is performed by exactly one thread and threads + // that need the transition to happen block until it has + // happened. + // + // startSema protects the transition from "off" to mark or + // mark termination. + startSema uint32 + // markDoneSema protects transitions from mark to mark termination. + markDoneSema uint32 + + bgMarkReady note // signal background mark worker has started + bgMarkDone uint32 // cas to 1 when at a background mark completion point + // Background mark completion signaling + + // mode is the concurrency mode of the current GC cycle. + mode gcMode + + // userForced indicates the current GC cycle was forced by an + // explicit user call. + userForced bool + + // totaltime is the CPU nanoseconds spent in GC since the + // program started if debug.gctrace > 0. + totaltime int64 + + // initialHeapLive is the value of gcController.heapLive at the + // beginning of this GC cycle. + initialHeapLive uint64 + + // assistQueue is a queue of assists that are blocked because + // there was neither enough credit to steal or enough work to + // do. + assistQueue struct { + lock mutex + q gQueue + } + + // sweepWaiters is a list of blocked goroutines to wake when + // we transition from mark termination to sweep. + sweepWaiters struct { + lock mutex + list gList + } + + // cycles is the number of completed GC cycles, where a GC + // cycle is sweep termination, mark, mark termination, and + // sweep. This differs from memstats.numgc, which is + // incremented at mark termination. + cycles uint32 + + // Timing/utilization stats for this cycle. 
+ stwprocs, maxprocs int32 + tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start + + pauseNS int64 // total STW time this cycle + pauseStart int64 // nanotime() of last STW + + // debug.gctrace heap sizes for this cycle. + heap0, heap1, heap2, heapGoal uint64 +} + +// GC runs a garbage collection and blocks the caller until the +// garbage collection is complete. It may also block the entire +// program. +func GC() { + // We consider a cycle to be: sweep termination, mark, mark + // termination, and sweep. This function shouldn't return + // until a full cycle has been completed, from beginning to + // end. Hence, we always want to finish up the current cycle + // and start a new one. That means: + // + // 1. In sweep termination, mark, or mark termination of cycle + // N, wait until mark termination N completes and transitions + // to sweep N. + // + // 2. In sweep N, help with sweep N. + // + // At this point we can begin a full cycle N+1. + // + // 3. Trigger cycle N+1 by starting sweep termination N+1. + // + // 4. Wait for mark termination N+1 to complete. + // + // 5. Help with sweep N+1 until it's done. + // + // This all has to be written to deal with the fact that the + // GC may move ahead on its own. For example, when we block + // until mark termination N, we may wake up in cycle N+2. + + // Wait until the current sweep termination, mark, and mark + // termination complete. + n := atomic.Load(&work.cycles) + gcWaitOnMark(n) + + // We're now in sweep N or later. Trigger GC cycle N+1, which + // will first finish sweep N if necessary and then enter sweep + // termination N+1. + gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1}) + + // Wait for mark termination N+1 to complete. + gcWaitOnMark(n + 1) + + // Finish sweep N+1 before returning. We do this both to + // complete the cycle and because runtime.GC() is often used + // as part of tests and benchmarks to get the system into a + // relatively stable and isolated state. + for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) { + sweep.nbgsweep++ + Gosched() + } + + // Callers may assume that the heap profile reflects the + // just-completed cycle when this returns (historically this + // happened because this was a STW GC), but right now the + // profile still reflects mark termination N, not N+1. + // + // As soon as all of the sweep frees from cycle N+1 are done, + // we can go ahead and publish the heap profile. + // + // First, wait for sweeping to finish. (We know there are no + // more spans on the sweep queue, but we may be concurrently + // sweeping spans, so we have to wait.) + for atomic.Load(&work.cycles) == n+1 && !isSweepDone() { + Gosched() + } + + // Now we're really done with sweeping, so we can publish the + // stable heap profile. Only do this if we haven't already hit + // another mark termination. + mp := acquirem() + cycle := atomic.Load(&work.cycles) + if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) { + mProf_PostSweep() + } + releasem(mp) +} + +// gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has +// already completed this mark phase, it returns immediately. +func gcWaitOnMark(n uint32) { + for { + // Disable phase transitions. + lock(&work.sweepWaiters.lock) + nMarks := atomic.Load(&work.cycles) + if gcphase != _GCmark { + // We've already completed this cycle's mark. + nMarks++ + } + if nMarks > n { + // We're done. 
+ unlock(&work.sweepWaiters.lock) + return + } + + // Wait until sweep termination, mark, and mark + // termination of cycle N complete. + work.sweepWaiters.list.push(getg()) + goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1) + } +} + +// gcMode indicates how concurrent a GC cycle should be. +type gcMode int + +const ( + gcBackgroundMode gcMode = iota // concurrent GC and sweep + gcForceMode // stop-the-world GC now, concurrent sweep + gcForceBlockMode // stop-the-world GC now and STW sweep (forced by user) +) + +// A gcTrigger is a predicate for starting a GC cycle. Specifically, +// it is an exit condition for the _GCoff phase. +type gcTrigger struct { + kind gcTriggerKind + now int64 // gcTriggerTime: current time + n uint32 // gcTriggerCycle: cycle number to start +} + +type gcTriggerKind int + +const ( + // gcTriggerHeap indicates that a cycle should be started when + // the heap size reaches the trigger heap size computed by the + // controller. + gcTriggerHeap gcTriggerKind = iota + + // gcTriggerTime indicates that a cycle should be started when + // it's been more than forcegcperiod nanoseconds since the + // previous GC cycle. + gcTriggerTime + + // gcTriggerCycle indicates that a cycle should be started if + // we have not yet started cycle number gcTrigger.n (relative + // to work.cycles). + gcTriggerCycle +) + +// test reports whether the trigger condition is satisfied, meaning +// that the exit condition for the _GCoff phase has been met. The exit +// condition should be tested when allocating. +func (t gcTrigger) test() bool { + if !memstats.enablegc || panicking != 0 || gcphase != _GCoff { + return false + } + switch t.kind { + case gcTriggerHeap: + // Non-atomic access to gcController.heapLive for performance. If + // we are going to trigger on this, this thread just + // atomically wrote gcController.heapLive anyway and we'll see our + // own write. + return gcController.heapLive >= gcController.trigger + case gcTriggerTime: + if gcController.gcPercent.Load() < 0 { + return false + } + lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime)) + return lastgc != 0 && t.now-lastgc > forcegcperiod + case gcTriggerCycle: + // t.n > work.cycles, but accounting for wraparound. + return int32(t.n-work.cycles) > 0 + } + return true +} + +// gcStart starts the GC. It transitions from _GCoff to _GCmark (if +// debug.gcstoptheworld == 0) or performs all of GC (if +// debug.gcstoptheworld != 0). +// +// This may return without performing this transition in some cases, +// such as when called on a system stack or with locks held. +func gcStart(trigger gcTrigger) { + // Since this is called from malloc and malloc is called in + // the guts of a number of libraries that might be holding + // locks, don't attempt to start GC in non-preemptible or + // potentially unstable situations. + mp := acquirem() + if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" { + releasem(mp) + return + } + releasem(mp) + mp = nil + + // Pick up the remaining unswept/not being swept spans concurrently + // + // This shouldn't happen if we're being invoked in background + // mode since proportional sweep should have just finished + // sweeping everything, but rounding errors, etc, may leave a + // few spans unswept. In forced mode, this is necessary since + // GC can be forced at any point in the sweeping cycle. + // + // We check the transition condition continuously here in case + // this G gets delayed in to the next GC cycle. 
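+ // sweepone sweeps one span and returns the number of pages returned
+ // to the heap, or ^uintptr(0) when there is nothing left to sweep, so
+ // this loop runs until either the trigger condition no longer holds
+ // or sweeping is finished.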
+ for trigger.test() && sweepone() != ^uintptr(0) { + sweep.nbgsweep++ + } + + // Perform GC initialization and the sweep termination + // transition. + semacquire(&work.startSema) + // Re-check transition condition under transition lock. + if !trigger.test() { + semrelease(&work.startSema) + return + } + + // For stats, check if this GC was forced by the user. + work.userForced = trigger.kind == gcTriggerCycle + + // In gcstoptheworld debug mode, upgrade the mode accordingly. + // We do this after re-checking the transition condition so + // that multiple goroutines that detect the heap trigger don't + // start multiple STW GCs. + mode := gcBackgroundMode + if debug.gcstoptheworld == 1 { + mode = gcForceMode + } else if debug.gcstoptheworld == 2 { + mode = gcForceBlockMode + } + + // Ok, we're doing it! Stop everybody else + semacquire(&gcsema) + semacquire(&worldsema) + + if trace.enabled { + traceGCStart() + } + + // Check that all Ps have finished deferred mcache flushes. + for _, p := range allp { + if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen { + println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen) + throw("p mcache not flushed") + } + } + + gcBgMarkStartWorkers() + + systemstack(gcResetMarkState) + + work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs + if work.stwprocs > ncpu { + // This is used to compute CPU time of the STW phases, + // so it can't be more than ncpu, even if GOMAXPROCS is. + work.stwprocs = ncpu + } + work.heap0 = atomic.Load64(&gcController.heapLive) + work.pauseNS = 0 + work.mode = mode + + now := nanotime() + work.tSweepTerm = now + work.pauseStart = now + if trace.enabled { + traceGCSTWStart(1) + } + systemstack(stopTheWorldWithSema) + // Finish sweep before we start concurrent scan. + systemstack(func() { + finishsweep_m() + }) + + // clearpools before we start the GC. If we wait they memory will not be + // reclaimed until the next GC cycle. + clearpools() + + work.cycles++ + + // Assists and workers can start the moment we start + // the world. + gcController.startCycle(now, int(gomaxprocs)) + work.heapGoal = gcController.heapGoal + + // In STW mode, disable scheduling of user Gs. This may also + // disable scheduling of this goroutine, so it may block as + // soon as we start the world again. + if mode != gcBackgroundMode { + schedEnableUser(false) + } + + // Enter concurrent mark phase and enable + // write barriers. + // + // Because the world is stopped, all Ps will + // observe that write barriers are enabled by + // the time we start the world and begin + // scanning. + // + // Write barriers must be enabled before assists are + // enabled because they must be enabled before + // any non-leaf heap objects are marked. Since + // allocations are blocked until assists can + // happen, we want enable assists as early as + // possible. + setGCPhase(_GCmark) + + gcBgMarkPrepare() // Must happen before assist enable. + gcMarkRootPrepare() + + // Mark all active tinyalloc blocks. Since we're + // allocating from these, they need to be black like + // other allocations. The alternative is to blacken + // the tiny block on every allocation from it, which + // would slow down the tiny allocator. + gcMarkTinyAllocs() + + // At this point all Ps have enabled the write + // barrier, thus maintaining the no white to + // black invariant. Enable mutator assists to + // put back-pressure on fast allocating + // mutators. 
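+ // Publishing gcBlackenEnabled = 1 is what allows mallocgc to charge
+ // mutator assists and lets the scheduler start running the background
+ // mark workers prepared above.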
+ atomic.Store(&gcBlackenEnabled, 1) + + // In STW mode, we could block the instant systemstack + // returns, so make sure we're not preemptible. + mp = acquirem() + + // Concurrent mark. + systemstack(func() { + now = startTheWorldWithSema(trace.enabled) + work.pauseNS += now - work.pauseStart + work.tMark = now + memstats.gcPauseDist.record(now - work.pauseStart) + }) + + // Release the world sema before Gosched() in STW mode + // because we will need to reacquire it later but before + // this goroutine becomes runnable again, and we could + // self-deadlock otherwise. + semrelease(&worldsema) + releasem(mp) + + // Make sure we block instead of returning to user code + // in STW mode. + if mode != gcBackgroundMode { + Gosched() + } + + semrelease(&work.startSema) +} + +// gcMarkDoneFlushed counts the number of P's with flushed work. +// +// Ideally this would be a captured local in gcMarkDone, but forEachP +// escapes its callback closure, so it can't capture anything. +// +// This is protected by markDoneSema. +var gcMarkDoneFlushed uint32 + +// gcMarkDone transitions the GC from mark to mark termination if all +// reachable objects have been marked (that is, there are no grey +// objects and can be no more in the future). Otherwise, it flushes +// all local work to the global queues where it can be discovered by +// other workers. +// +// This should be called when all local mark work has been drained and +// there are no remaining workers. Specifically, when +// +// work.nwait == work.nproc && !gcMarkWorkAvailable(p) +// +// The calling context must be preemptible. +// +// Flushing local work is important because idle Ps may have local +// work queued. This is the only way to make that work visible and +// drive GC to completion. +// +// It is explicitly okay to have write barriers in this function. If +// it does transition to mark termination, then all reachable objects +// have been marked, so the write barrier cannot shade any more +// objects. +func gcMarkDone() { + // Ensure only one thread is running the ragged barrier at a + // time. + semacquire(&work.markDoneSema) + +top: + // Re-check transition condition under transition lock. + // + // It's critical that this checks the global work queues are + // empty before performing the ragged barrier. Otherwise, + // there could be global work that a P could take after the P + // has passed the ragged barrier. + if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) { + semrelease(&work.markDoneSema) + return + } + + // forEachP needs worldsema to execute, and we'll need it to + // stop the world later, so acquire worldsema now. + semacquire(&worldsema) + + // Flush all local buffers and collect flushedWork flags. + gcMarkDoneFlushed = 0 + systemstack(func() { + gp := getg().m.curg + // Mark the user stack as preemptible so that it may be scanned. + // Otherwise, our attempt to force all P's to a safepoint could + // result in a deadlock as we attempt to preempt a worker that's + // trying to preempt us (e.g. for a stack scan). + casgstatus(gp, _Grunning, _Gwaiting) + forEachP(func(_p_ *p) { + // Flush the write barrier buffer, since this may add + // work to the gcWork. + wbBufFlush1(_p_) + + // Flush the gcWork, since this may create global work + // and set the flushedWork flag. + // + // TODO(austin): Break up these workbufs to + // better distribute work. + _p_.gcw.dispose() + // Collect the flushedWork flag. 
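+ // flushedWork is set by the gcWork whenever it flushes a non-empty
+ // work buffer to the global list (including during the dispose above),
+ // i.e. when this P may have made new work visible since the last
+ // termination check.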
+ if _p_.gcw.flushedWork { + atomic.Xadd(&gcMarkDoneFlushed, 1) + _p_.gcw.flushedWork = false + } + }) + casgstatus(gp, _Gwaiting, _Grunning) + }) + + if gcMarkDoneFlushed != 0 { + // More grey objects were discovered since the + // previous termination check, so there may be more + // work to do. Keep going. It's possible the + // transition condition became true again during the + // ragged barrier, so re-check it. + semrelease(&worldsema) + goto top + } + + // There was no global work, no local work, and no Ps + // communicated work since we took markDoneSema. Therefore + // there are no grey objects and no more objects can be + // shaded. Transition to mark termination. + now := nanotime() + work.tMarkTerm = now + work.pauseStart = now + getg().m.preemptoff = "gcing" + if trace.enabled { + traceGCSTWStart(0) + } + systemstack(stopTheWorldWithSema) + // The gcphase is _GCmark, it will transition to _GCmarktermination + // below. The important thing is that the wb remains active until + // all marking is complete. This includes writes made by the GC. + + // There is sometimes work left over when we enter mark termination due + // to write barriers performed after the completion barrier above. + // Detect this and resume concurrent mark. This is obviously + // unfortunate. + // + // See issue #27993 for details. + // + // Switch to the system stack to call wbBufFlush1, though in this case + // it doesn't matter because we're non-preemptible anyway. + restart := false + systemstack(func() { + for _, p := range allp { + wbBufFlush1(p) + if !p.gcw.empty() { + restart = true + break + } + } + }) + if restart { + getg().m.preemptoff = "" + systemstack(func() { + now := startTheWorldWithSema(true) + work.pauseNS += now - work.pauseStart + memstats.gcPauseDist.record(now - work.pauseStart) + }) + semrelease(&worldsema) + goto top + } + + // Disable assists and background workers. We must do + // this before waking blocked assists. + atomic.Store(&gcBlackenEnabled, 0) + + // Wake all blocked assists. These will run when we + // start the world again. + gcWakeAllAssists() + + // Likewise, release the transition lock. Blocked + // workers and assists will run when we start the + // world again. + semrelease(&work.markDoneSema) + + // In STW mode, re-enable user goroutines. These will be + // queued to run after we start the world. + schedEnableUser(true) + + // endCycle depends on all gcWork cache stats being flushed. + // The termination algorithm above ensured that up to + // allocations since the ragged barrier. + nextTriggerRatio := gcController.endCycle(now, int(gomaxprocs), work.userForced) + + // Perform mark termination. This will restart the world. + gcMarkTermination(nextTriggerRatio) +} + +// World must be stopped and mark assists and background workers must be +// disabled. +func gcMarkTermination(nextTriggerRatio float64) { + // Start marktermination (write barrier remains enabled for now). + setGCPhase(_GCmarktermination) + + work.heap1 = gcController.heapLive + startTime := nanotime() + + mp := acquirem() + mp.preemptoff = "gcing" + _g_ := getg() + _g_.m.traceback = 2 + gp := _g_.m.curg + casgstatus(gp, _Grunning, _Gwaiting) + gp.waitreason = waitReasonGarbageCollection + + // Run gc on the g0 stack. We do this so that the g stack + // we're currently running on will no longer change. Cuts + // the root set down a bit (g0 stacks are not scanned, and + // we don't need to scan gc's internal state). We also + // need to switch to g0 so we can shrink the stack. 
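+ // With the concurrent collector, essentially all marking has already
+ // happened by this point; gcMark below mainly verifies that no mark
+ // work remains and flushes per-P GC state (see gcMark later in this
+ // file).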
+ systemstack(func() { + gcMark(startTime) + // Must return immediately. + // The outer function's stack may have moved + // during gcMark (it shrinks stacks, including the + // outer function's stack), so we must not refer + // to any of its variables. Return back to the + // non-system stack to pick up the new addresses + // before continuing. + }) + + systemstack(func() { + work.heap2 = work.bytesMarked + if debug.gccheckmark > 0 { + // Run a full non-parallel, stop-the-world + // mark using checkmark bits, to check that we + // didn't forget to mark anything during the + // concurrent mark process. + startCheckmarks() + gcResetMarkState() + gcw := &getg().m.p.ptr().gcw + gcDrain(gcw, 0) + wbBufFlush1(getg().m.p.ptr()) + gcw.dispose() + endCheckmarks() + } + + // marking is complete so we can turn the write barrier off + setGCPhase(_GCoff) + gcSweep(work.mode) + }) + + _g_.m.traceback = 0 + casgstatus(gp, _Gwaiting, _Grunning) + + if trace.enabled { + traceGCDone() + } + + // all done + mp.preemptoff = "" + + if gcphase != _GCoff { + throw("gc done but gcphase != _GCoff") + } + + // Record heap_inuse for scavenger. + memstats.last_heap_inuse = memstats.heap_inuse + + // Update GC trigger and pacing for the next cycle. + gcController.commit(nextTriggerRatio) + gcPaceSweeper(gcController.trigger) + gcPaceScavenger(gcController.heapGoal, gcController.lastHeapGoal) + + // Update timing memstats + now := nanotime() + sec, nsec, _ := time_now() + unixNow := sec*1e9 + int64(nsec) + work.pauseNS += now - work.pauseStart + work.tEnd = now + memstats.gcPauseDist.record(now - work.pauseStart) + atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user + atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us + memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS) + memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow) + memstats.pause_total_ns += uint64(work.pauseNS) + + // Update work.totaltime. + sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm) + // We report idle marking time below, but omit it from the + // overall utilization here since it's "free". + markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime + markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm) + cycleCpu := sweepTermCpu + markCpu + markTermCpu + work.totaltime += cycleCpu + + // Compute overall GC CPU utilization. + totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs) + memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu) + + // Reset sweep state. + sweep.nbgsweep = 0 + sweep.npausesweep = 0 + + if work.userForced { + memstats.numforcedgc++ + } + + // Bump GC cycle count and wake goroutines waiting on sweep. + lock(&work.sweepWaiters.lock) + memstats.numgc++ + injectglist(&work.sweepWaiters.list) + unlock(&work.sweepWaiters.lock) + + // Finish the current heap profiling cycle and start a new + // heap profiling cycle. We do this before starting the world + // so events don't leak into the wrong cycle. + mProf_NextCycle() + + // There may be stale spans in mcaches that need to be swept. + // Those aren't tracked in any sweep lists, so we need to + // count them against sweep completion until we ensure all + // those spans have been forced out. 
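+ // begin() registers an outstanding sweep user, so sweeping is not
+ // reported as done until the matching sweep.active.end(sl) call below,
+ // after the mcaches have been flushed.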
+ sl := sweep.active.begin() + if !sl.valid { + throw("failed to set sweep barrier") + } + + systemstack(func() { startTheWorldWithSema(true) }) + + // Flush the heap profile so we can start a new cycle next GC. + // This is relatively expensive, so we don't do it with the + // world stopped. + mProf_Flush() + + // Prepare workbufs for freeing by the sweeper. We do this + // asynchronously because it can take non-trivial time. + prepareFreeWorkbufs() + + // Free stack spans. This must be done between GC cycles. + systemstack(freeStackSpans) + + // Ensure all mcaches are flushed. Each P will flush its own + // mcache before allocating, but idle Ps may not. Since this + // is necessary to sweep all spans, we need to ensure all + // mcaches are flushed before we start the next GC cycle. + systemstack(func() { + forEachP(func(_p_ *p) { + _p_.mcache.prepareForSweep() + }) + }) + // Now that we've swept stale spans in mcaches, they don't + // count against unswept spans. + sweep.active.end(sl) + + // Print gctrace before dropping worldsema. As soon as we drop + // worldsema another cycle could start and smash the stats + // we're trying to print. + if debug.gctrace > 0 { + util := int(memstats.gc_cpu_fraction * 100) + + var sbuf [24]byte + printlock() + print("gc ", memstats.numgc, + " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ", + util, "%: ") + prev := work.tSweepTerm + for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} { + if i != 0 { + print("+") + } + print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev)))) + prev = ns + } + print(" ms clock, ") + for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} { + if i == 2 || i == 3 { + // Separate mark time components with /. + print("/") + } else if i != 0 { + print("+") + } + print(string(fmtNSAsMS(sbuf[:], uint64(ns)))) + } + print(" ms cpu, ", + work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ", + work.heapGoal>>20, " MB goal, ", + gcController.stackScan>>20, " MB stacks, ", + gcController.globalsScan>>20, " MB globals, ", + work.maxprocs, " P") + if work.userForced { + print(" (forced)") + } + print("\n") + printunlock() + } + + semrelease(&worldsema) + semrelease(&gcsema) + // Careful: another GC cycle may start now. + + releasem(mp) + mp = nil + + // now that gc is done, kick off finalizer thread if needed + if !concurrentSweep { + // give the queued finalizers, if any, a chance to run + Gosched() + } +} + +// gcBgMarkStartWorkers prepares background mark worker goroutines. These +// goroutines will not run until the mark phase, but they must be started while +// the work is not stopped and from a regular G stack. The caller must hold +// worldsema. +func gcBgMarkStartWorkers() { + // Background marking is performed by per-P G's. Ensure that each P has + // a background GC G. + // + // Worker Gs don't exit if gomaxprocs is reduced. If it is raised + // again, we can reuse the old workers; no need to create new workers. + for gcBgMarkWorkerCount < gomaxprocs { + go gcBgMarkWorker() + + notetsleepg(&work.bgMarkReady, -1) + noteclear(&work.bgMarkReady) + // The worker is now guaranteed to be added to the pool before + // its P's next findRunnableGCWorker. + + gcBgMarkWorkerCount++ + } +} + +// gcBgMarkPrepare sets up state for background marking. +// Mutator assists must not yet be enabled. 
+func gcBgMarkPrepare() { + // Background marking will stop when the work queues are empty + // and there are no more workers (note that, since this is + // concurrent, this may be a transient state, but mark + // termination will clean it up). Between background workers + // and assists, we don't really know how many workers there + // will be, so we pretend to have an arbitrarily large number + // of workers, almost all of which are "waiting". While a + // worker is working it decrements nwait. If nproc == nwait, + // there are no workers. + work.nproc = ^uint32(0) + work.nwait = ^uint32(0) +} + +// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single +// gcBgMarkWorker goroutine. +type gcBgMarkWorkerNode struct { + // Unused workers are managed in a lock-free stack. This field must be first. + node lfnode + + // The g of this worker. + gp guintptr + + // Release this m on park. This is used to communicate with the unlock + // function, which cannot access the G's stack. It is unused outside of + // gcBgMarkWorker(). + m muintptr +} + +func gcBgMarkWorker() { + gp := getg() + + // We pass node to a gopark unlock function, so it can't be on + // the stack (see gopark). Prevent deadlock from recursively + // starting GC by disabling preemption. + gp.m.preemptoff = "GC worker init" + node := new(gcBgMarkWorkerNode) + gp.m.preemptoff = "" + + node.gp.set(gp) + + node.m.set(acquirem()) + notewakeup(&work.bgMarkReady) + // After this point, the background mark worker is generally scheduled + // cooperatively by gcController.findRunnableGCWorker. While performing + // work on the P, preemption is disabled because we are working on + // P-local work buffers. When the preempt flag is set, this puts itself + // into _Gwaiting to be woken up by gcController.findRunnableGCWorker + // at the appropriate time. + // + // When preemption is enabled (e.g., while in gcMarkDone), this worker + // may be preempted and schedule as a _Grunnable G from a runq. That is + // fine; it will eventually gopark again for further scheduling via + // findRunnableGCWorker. + // + // Since we disable preemption before notifying bgMarkReady, we + // guarantee that this G will be in the worker pool for the next + // findRunnableGCWorker. This isn't strictly necessary, but it reduces + // latency between _GCmark starting and the workers starting. + + for { + // Go to sleep until woken by + // gcController.findRunnableGCWorker. + gopark(func(g *g, nodep unsafe.Pointer) bool { + node := (*gcBgMarkWorkerNode)(nodep) + + if mp := node.m.ptr(); mp != nil { + // The worker G is no longer running; release + // the M. + // + // N.B. it is _safe_ to release the M as soon + // as we are no longer performing P-local mark + // work. + // + // However, since we cooperatively stop work + // when gp.preempt is set, if we releasem in + // the loop then the following call to gopark + // would immediately preempt the G. This is + // also safe, but inefficient: the G must + // schedule again only to enter gopark and park + // again. Thus, we defer the release until + // after parking the G. + releasem(mp) + } + + // Release this G to the pool. + gcBgMarkWorkerPool.push(&node.node) + // Note that at this point, the G may immediately be + // rescheduled and may be running. + return true + }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0) + + // Preemption must not occur here, or another G might see + // p.gcMarkWorkerMode. + + // Disable preemption so we can use the gcw. 
If the + // scheduler wants to preempt us, we'll stop draining, + // dispose the gcw, and then preempt. + node.m.set(acquirem()) + pp := gp.m.p.ptr() // P can't change with preemption disabled. + + if gcBlackenEnabled == 0 { + println("worker mode", pp.gcMarkWorkerMode) + throw("gcBgMarkWorker: blackening not enabled") + } + + if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker { + throw("gcBgMarkWorker: mode not set") + } + + startTime := nanotime() + pp.gcMarkWorkerStartTime = startTime + + decnwait := atomic.Xadd(&work.nwait, -1) + if decnwait == work.nproc { + println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc) + throw("work.nwait was > work.nproc") + } + + systemstack(func() { + // Mark our goroutine preemptible so its stack + // can be scanned. This lets two mark workers + // scan each other (otherwise, they would + // deadlock). We must not modify anything on + // the G stack. However, stack shrinking is + // disabled for mark workers, so it is safe to + // read from the G stack. + casgstatus(gp, _Grunning, _Gwaiting) + switch pp.gcMarkWorkerMode { + default: + throw("gcBgMarkWorker: unexpected gcMarkWorkerMode") + case gcMarkWorkerDedicatedMode: + gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) + if gp.preempt { + // We were preempted. This is + // a useful signal to kick + // everything out of the run + // queue so it can run + // somewhere else. + if drainQ, n := runqdrain(pp); n > 0 { + lock(&sched.lock) + globrunqputbatch(&drainQ, int32(n)) + unlock(&sched.lock) + } + } + // Go back to draining, this time + // without preemption. + gcDrain(&pp.gcw, gcDrainFlushBgCredit) + case gcMarkWorkerFractionalMode: + gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) + case gcMarkWorkerIdleMode: + gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) + } + casgstatus(gp, _Gwaiting, _Grunning) + }) + + // Account for time. + duration := nanotime() - startTime + gcController.logWorkTime(pp.gcMarkWorkerMode, duration) + if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode { + atomic.Xaddint64(&pp.gcFractionalMarkTime, duration) + } + + // Was this the last worker and did we run out + // of work? + incnwait := atomic.Xadd(&work.nwait, +1) + if incnwait > work.nproc { + println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode, + "work.nwait=", incnwait, "work.nproc=", work.nproc) + throw("work.nwait > work.nproc") + } + + // We'll releasem after this point and thus this P may run + // something else. We must clear the worker mode to avoid + // attributing the mode to a different (non-worker) G in + // traceGoStart. + pp.gcMarkWorkerMode = gcMarkWorkerNotWorker + + // If this worker reached a background mark completion + // point, signal the main GC goroutine. + if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { + // We don't need the P-local buffers here, allow + // preemption because we may schedule like a regular + // goroutine in gcMarkDone (block on locks, etc). + releasem(node.m.ptr()) + node.m.set(nil) + + gcMarkDone() + } + } +} + +// gcMarkWorkAvailable reports whether executing a mark worker +// on p is potentially useful. p may be nil, in which case it only +// checks the global sources of work. 
+func gcMarkWorkAvailable(p *p) bool { + if p != nil && !p.gcw.empty() { + return true + } + if !work.full.empty() { + return true // global work available + } + if work.markrootNext < work.markrootJobs { + return true // root scan work available + } + return false +} + +// gcMark runs the mark (or, for concurrent GC, mark termination) +// All gcWork caches must be empty. +// STW is in effect at this point. +func gcMark(startTime int64) { + if debug.allocfreetrace > 0 { + tracegc() + } + + if gcphase != _GCmarktermination { + throw("in gcMark expecting to see gcphase as _GCmarktermination") + } + work.tstart = startTime + + // Check that there's no marking work remaining. + if work.full != 0 || work.markrootNext < work.markrootJobs { + print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n") + panic("non-empty mark queue after concurrent mark") + } + + if debug.gccheckmark > 0 { + // This is expensive when there's a large number of + // Gs, so only do it if checkmark is also enabled. + gcMarkRootCheck() + } + if work.full != 0 { + throw("work.full != 0") + } + + // Drop allg snapshot. allgs may have grown, in which case + // this is the only reference to the old backing store and + // there's no need to keep it around. + work.stackRoots = nil + + // Clear out buffers and double-check that all gcWork caches + // are empty. This should be ensured by gcMarkDone before we + // enter mark termination. + // + // TODO: We could clear out buffers just before mark if this + // has a non-negligible impact on STW time. + for _, p := range allp { + // The write barrier may have buffered pointers since + // the gcMarkDone barrier. However, since the barrier + // ensured all reachable objects were marked, all of + // these must be pointers to black objects. Hence we + // can just discard the write barrier buffer. + if debug.gccheckmark > 0 { + // For debugging, flush the buffer and make + // sure it really was all marked. + wbBufFlush1(p) + } else { + p.wbBuf.reset() + } + + gcw := &p.gcw + if !gcw.empty() { + printlock() + print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork) + if gcw.wbuf1 == nil { + print(" wbuf1=<nil>") + } else { + print(" wbuf1.n=", gcw.wbuf1.nobj) + } + if gcw.wbuf2 == nil { + print(" wbuf2=<nil>") + } else { + print(" wbuf2.n=", gcw.wbuf2.nobj) + } + print("\n") + throw("P has cached GC work at end of mark termination") + } + // There may still be cached empty buffers, which we + // need to flush since we're going to free them. Also, + // there may be non-zero stats because we allocated + // black after the gcMarkDone barrier. + gcw.dispose() + } + + // Flush scanAlloc from each mcache since we're about to modify + // heapScan directly. If we were to flush this later, then scanAlloc + // might have incorrect information. + // + // Note that it's not important to retain this information; we know + // exactly what heapScan is at this point via scanWork. + for _, p := range allp { + c := p.mcache + if c == nil { + continue + } + c.scanAlloc = 0 + } + + // Reset controller state. + gcController.resetLive(work.bytesMarked) +} + +// gcSweep must be called on the system stack because it acquires the heap +// lock. See mheap for details. +// +// The world must be stopped. 
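+// In the usual concurrent configuration, gcSweep only resets the
+// per-cycle sweep state and wakes the background sweeper goroutine;
+// spans are swept eagerly here only when _ConcurrentSweep is disabled
+// or the user forced a fully stop-the-world collection.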
+// +//go:systemstack +func gcSweep(mode gcMode) { + assertWorldStopped() + + if gcphase != _GCoff { + throw("gcSweep being done but phase is not GCoff") + } + + lock(&mheap_.lock) + mheap_.sweepgen += 2 + sweep.active.reset() + mheap_.pagesSwept.Store(0) + mheap_.sweepArenas = mheap_.allArenas + mheap_.reclaimIndex.Store(0) + mheap_.reclaimCredit.Store(0) + unlock(&mheap_.lock) + + sweep.centralIndex.clear() + + if !_ConcurrentSweep || mode == gcForceBlockMode { + // Special case synchronous sweep. + // Record that no proportional sweeping has to happen. + lock(&mheap_.lock) + mheap_.sweepPagesPerByte = 0 + unlock(&mheap_.lock) + // Sweep all spans eagerly. + for sweepone() != ^uintptr(0) { + sweep.npausesweep++ + } + // Free workbufs eagerly. + prepareFreeWorkbufs() + for freeSomeWbufs(false) { + } + // All "free" events for this mark/sweep cycle have + // now happened, so we can make this profile cycle + // available immediately. + mProf_NextCycle() + mProf_Flush() + return + } + + // Background sweep. + lock(&sweep.lock) + if sweep.parked { + sweep.parked = false + ready(sweep.g, 0, true) + } + unlock(&sweep.lock) +} + +// gcResetMarkState resets global state prior to marking (concurrent +// or STW) and resets the stack scan state of all Gs. +// +// This is safe to do without the world stopped because any Gs created +// during or after this will start out in the reset state. +// +// gcResetMarkState must be called on the system stack because it acquires +// the heap lock. See mheap for details. +// +//go:systemstack +func gcResetMarkState() { + // This may be called during a concurrent phase, so lock to make sure + // allgs doesn't change. + forEachG(func(gp *g) { + gp.gcscandone = false // set to true in gcphasework + gp.gcAssistBytes = 0 + }) + + // Clear page marks. This is just 1MB per 64GB of heap, so the + // time here is pretty trivial. + lock(&mheap_.lock) + arenas := mheap_.allArenas + unlock(&mheap_.lock) + for _, ai := range arenas { + ha := mheap_.arenas[ai.l1()][ai.l2()] + for i := range ha.pageMarks { + ha.pageMarks[i] = 0 + } + } + + work.bytesMarked = 0 + work.initialHeapLive = atomic.Load64(&gcController.heapLive) +} + +// Hooks for other packages + +var poolcleanup func() + +//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup +func sync_runtime_registerPoolCleanup(f func()) { + poolcleanup = f +} + +func clearpools() { + // clear sync.Pools + if poolcleanup != nil { + poolcleanup() + } + + // Clear central sudog cache. + // Leave per-P caches alone, they have strictly bounded size. + // Disconnect cached list before dropping it on the floor, + // so that a dangling ref to one entry does not pin all of them. + lock(&sched.sudoglock) + var sg, sgnext *sudog + for sg = sched.sudogcache; sg != nil; sg = sgnext { + sgnext = sg.next + sg.next = nil + } + sched.sudogcache = nil + unlock(&sched.sudoglock) + + // Clear central defer pool. + // Leave per-P pools alone, they have strictly bounded size. + lock(&sched.deferlock) + // disconnect cached list before dropping it on the floor, + // so that a dangling ref to one entry does not pin all of them. + var d, dlink *_defer + for d = sched.deferpool; d != nil; d = dlink { + dlink = d.link + d.link = nil + } + sched.deferpool = nil + unlock(&sched.deferlock) +} + +// Timing + +// itoaDiv formats val/(10**dec) into buf. 
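clearpools above nils out the next/link pointers of the cached sudog and defer lists before dropping them, so that a single stale reference cannot keep the entire chain reachable. A tiny standalone illustration of that pattern follows; the cache and node types are invented for the example.

package main

import "fmt"

// A tiny cache in the style of sched.sudogcache/sched.deferpool: a singly
// linked free list that is periodically dropped. Breaking the next links
// before dropping the head means a stray reference to one node no longer
// pins every node behind it for the garbage collector.
type node struct {
	next *node
	buf  [64]byte
}

type cache struct {
	head *node
}

func (c *cache) put(n *node) {
	n.next = c.head
	c.head = n
}

// clear drops the cached list, disconnecting entries first (the pattern
// used by clearpools above).
func (c *cache) clear() {
	var next *node
	for n := c.head; n != nil; n = next {
		next = n.next
		n.next = nil // disconnect so one dangling ref doesn't retain the rest
	}
	c.head = nil
}

func main() {
	c := &cache{}
	for i := 0; i < 4; i++ {
		c.put(&node{})
	}
	leaked := c.head // imagine some stale reference survives
	c.clear()
	// leaked now retains only itself, not the other three nodes.
	fmt.Println(leaked.next == nil) // true
}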
+func itoaDiv(buf []byte, val uint64, dec int) []byte { + i := len(buf) - 1 + idec := i - dec + for val >= 10 || i >= idec { + buf[i] = byte(val%10 + '0') + i-- + if i == idec { + buf[i] = '.' + i-- + } + val /= 10 + } + buf[i] = byte(val + '0') + return buf[i:] +} + +// fmtNSAsMS nicely formats ns nanoseconds as milliseconds. +func fmtNSAsMS(buf []byte, ns uint64) []byte { + if ns >= 10e6 { + // Format as whole milliseconds. + return itoaDiv(buf, ns/1e6, 0) + } + // Format two digits of precision, with at most three decimal places. + x := ns / 1e3 + if x == 0 { + buf[0] = '0' + return buf[:1] + } + dec := 3 + for x >= 100 { + x /= 10 + dec-- + } + return itoaDiv(buf, x, dec) +} + +// Helpers for testing GC. + +// gcTestMoveStackOnNextCall causes the stack to be moved on a call +// immediately following the call to this. It may not work correctly +// if any other work appears after this call (such as returning). +// Typically the following call should be marked go:noinline so it +// performs a stack check. +// +// In rare cases this may not cause the stack to move, specifically if +// there's a preemption between this call and the next. +func gcTestMoveStackOnNextCall() { + gp := getg() + gp.stackguard0 = stackForceMove +} + +// gcTestIsReachable performs a GC and returns a bit set where bit i +// is set if ptrs[i] is reachable. +func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) { + // This takes the pointers as unsafe.Pointers in order to keep + // them live long enough for us to attach specials. After + // that, we drop our references to them. + + if len(ptrs) > 64 { + panic("too many pointers for uint64 mask") + } + + // Block GC while we attach specials and drop our references + // to ptrs. Otherwise, if a GC is in progress, it could mark + // them reachable via this function before we have a chance to + // drop them. + semacquire(&gcsema) + + // Create reachability specials for ptrs. + specials := make([]*specialReachable, len(ptrs)) + for i, p := range ptrs { + lock(&mheap_.speciallock) + s := (*specialReachable)(mheap_.specialReachableAlloc.alloc()) + unlock(&mheap_.speciallock) + s.special.kind = _KindSpecialReachable + if !addspecial(p, &s.special) { + throw("already have a reachable special (duplicate pointer?)") + } + specials[i] = s + // Make sure we don't retain ptrs. + ptrs[i] = nil + } + + semrelease(&gcsema) + + // Force a full GC and sweep. + GC() + + // Process specials. + for i, s := range specials { + if !s.done { + printlock() + println("runtime: object", i, "was not swept") + throw("IsReachable failed") + } + if s.reachable { + mask |= 1 << i + } + lock(&mheap_.speciallock) + mheap_.specialReachableAlloc.free(unsafe.Pointer(s)) + unlock(&mheap_.speciallock) + } + + return mask +} + +// gcTestPointerClass returns the category of what p points to, one of: +// "heap", "stack", "data", "bss", "other". This is useful for checking +// that a test is doing what it's intended to do. +// +// This is nosplit simply to avoid extra pointer shuffling that may +// complicate a test. 
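The timing helpers above are easy to exercise outside the runtime. The sketch below copies itoaDiv and fmtNSAsMS verbatim into an ordinary program and prints a few sample conversions, assuming only a caller-provided buffer that is large enough (the runtime's callers pass fixed-size arrays).

package main

import "fmt"

// Standalone copies of the runtime's itoaDiv/fmtNSAsMS helpers from the diff
// above, wrapped in a small demo so the formatting rules are easy to check:
// durations of at least 10ms print as whole milliseconds, shorter ones keep
// two significant digits with at most three decimal places.

func itoaDiv(buf []byte, val uint64, dec int) []byte {
	i := len(buf) - 1
	idec := i - dec
	for val >= 10 || i >= idec {
		buf[i] = byte(val%10 + '0')
		i--
		if i == idec {
			buf[i] = '.'
			i--
		}
		val /= 10
	}
	buf[i] = byte(val + '0')
	return buf[i:]
}

func fmtNSAsMS(buf []byte, ns uint64) []byte {
	if ns >= 10e6 {
		return itoaDiv(buf, ns/1e6, 0)
	}
	x := ns / 1e3
	if x == 0 {
		buf[0] = '0'
		return buf[:1]
	}
	dec := 3
	for x >= 100 {
		x /= 10
		dec--
	}
	return itoaDiv(buf, x, dec)
}

func main() {
	var buf [24]byte
	for _, ns := range []uint64{25_000_000, 1_500_000, 123_456, 999} {
		fmt.Printf("%d ns -> %s ms\n", ns, fmtNSAsMS(buf[:], ns))
	}
	// Prints: 25, 1.5, 0.12, 0
}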
+// +//go:nosplit +func gcTestPointerClass(p unsafe.Pointer) string { + p2 := uintptr(noescape(p)) + gp := getg() + if gp.stack.lo <= p2 && p2 < gp.stack.hi { + return "stack" + } + if base, _, _ := findObject(p2, 0, 0); base != 0 { + return "heap" + } + for _, datap := range activeModules() { + if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata { + return "data" + } + if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 <= datap.enoptrbss { + return "bss" + } + } + KeepAlive(p) + return "other" +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcmark.go b/contrib/go/_std_1.18/src/runtime/mgcmark.go new file mode 100644 index 0000000000..0bf044e314 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcmark.go @@ -0,0 +1,1591 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: marking and scanning + +package runtime + +import ( + "internal/goarch" + "internal/goexperiment" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +const ( + fixedRootFinalizers = iota + fixedRootFreeGStacks + fixedRootCount + + // rootBlockBytes is the number of bytes to scan per data or + // BSS root. + rootBlockBytes = 256 << 10 + + // maxObletBytes is the maximum bytes of an object to scan at + // once. Larger objects will be split up into "oblets" of at + // most this size. Since we can scan 1–2 MB/ms, 128 KB bounds + // scan preemption at ~100 µs. + // + // This must be > _MaxSmallSize so that the object base is the + // span base. + maxObletBytes = 128 << 10 + + // drainCheckThreshold specifies how many units of work to do + // between self-preemption checks in gcDrain. Assuming a scan + // rate of 1 MB/ms, this is ~100 µs. Lower values have higher + // overhead in the scan loop (the scheduler check may perform + // a syscall, so its overhead is nontrivial). Higher values + // make the system less responsive to incoming work. + drainCheckThreshold = 100000 + + // pagesPerSpanRoot indicates how many pages to scan from a span root + // at a time. Used by special root marking. + // + // Higher values improve throughput by increasing locality, but + // increase the minimum latency of a marking operation. + // + // Must be a multiple of the pageInUse bitmap element size and + // must also evenly divide pagesPerArena. + pagesPerSpanRoot = 512 +) + +// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and +// some miscellany) and initializes scanning-related state. +// +// The world must be stopped. +func gcMarkRootPrepare() { + assertWorldStopped() + + // Compute how many data and BSS root blocks there are. + nBlocks := func(bytes uintptr) int { + return int(divRoundUp(bytes, rootBlockBytes)) + } + + work.nDataRoots = 0 + work.nBSSRoots = 0 + + // Scan globals. + for _, datap := range activeModules() { + nDataRoots := nBlocks(datap.edata - datap.data) + if nDataRoots > work.nDataRoots { + work.nDataRoots = nDataRoots + } + } + + for _, datap := range activeModules() { + nBSSRoots := nBlocks(datap.ebss - datap.bss) + if nBSSRoots > work.nBSSRoots { + work.nBSSRoots = nBSSRoots + } + } + + // Scan span roots for finalizer specials. + // + // We depend on addfinalizer to mark objects that get + // finalizers after root marking. + // + // We're going to scan the whole heap (that was available at the time the + // mark phase started, i.e. markArenas) for in-use spans which have specials. 
+ // + // Break up the work into arenas, and further into chunks. + // + // Snapshot allArenas as markArenas. This snapshot is safe because allArenas + // is append-only. + mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)] + work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot) + + // Scan stacks. + // + // Gs may be created after this point, but it's okay that we + // ignore them because they begin life without any roots, so + // there's nothing to scan, and any roots they create during + // the concurrent phase will be caught by the write barrier. + work.stackRoots = allGsSnapshot() + work.nStackRoots = len(work.stackRoots) + + work.markrootNext = 0 + work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots) + + // Calculate base indexes of each root type + work.baseData = uint32(fixedRootCount) + work.baseBSS = work.baseData + uint32(work.nDataRoots) + work.baseSpans = work.baseBSS + uint32(work.nBSSRoots) + work.baseStacks = work.baseSpans + uint32(work.nSpanRoots) + work.baseEnd = work.baseStacks + uint32(work.nStackRoots) +} + +// gcMarkRootCheck checks that all roots have been scanned. It is +// purely for debugging. +func gcMarkRootCheck() { + if work.markrootNext < work.markrootJobs { + print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n") + throw("left over markroot jobs") + } + + // Check that stacks have been scanned. + // + // We only check the first nStackRoots Gs that we should have scanned. + // Since we don't care about newer Gs (see comment in + // gcMarkRootPrepare), no locking is required. + i := 0 + forEachGRace(func(gp *g) { + if i >= work.nStackRoots { + return + } + + if !gp.gcscandone { + println("gp", gp, "goid", gp.goid, + "status", readgstatus(gp), + "gcscandone", gp.gcscandone) + throw("scan missed a g") + } + + i++ + }) +} + +// ptrmask for an allocation containing a single pointer. +var oneptrmask = [...]uint8{1} + +// markroot scans the i'th root. +// +// Preemption must be disabled (because this uses a gcWork). +// +// Returns the amount of GC work credit produced by the operation. +// If flushBgCredit is true, then that credit is also flushed +// to the background credit pool. +// +// nowritebarrier is only advisory here. +// +//go:nowritebarrier +func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 { + // Note: if you add a case here, please also update heapdump.go:dumproots. + var workDone int64 + var workCounter *atomic.Int64 + switch { + case work.baseData <= i && i < work.baseBSS: + workCounter = &gcController.globalsScanWork + for _, datap := range activeModules() { + workDone += markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData)) + } + + case work.baseBSS <= i && i < work.baseSpans: + workCounter = &gcController.globalsScanWork + for _, datap := range activeModules() { + workDone += markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS)) + } + + case i == fixedRootFinalizers: + for fb := allfin; fb != nil; fb = fb.alllink { + cnt := uintptr(atomic.Load(&fb.cnt)) + scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw, nil) + } + + case i == fixedRootFreeGStacks: + // Switch to the system stack so we can call + // stackfree. 
+ systemstack(markrootFreeGStacks) + + case work.baseSpans <= i && i < work.baseStacks: + // mark mspan.specials + markrootSpans(gcw, int(i-work.baseSpans)) + + default: + // the rest is scanning goroutine stacks + workCounter = &gcController.stackScanWork + if i < work.baseStacks || work.baseEnd <= i { + printlock() + print("runtime: markroot index ", i, " not in stack roots range [", work.baseStacks, ", ", work.baseEnd, ")\n") + throw("markroot: bad index") + } + gp := work.stackRoots[i-work.baseStacks] + + // remember when we've first observed the G blocked + // needed only to output in traceback + status := readgstatus(gp) // We are not in a scan state + if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 { + gp.waitsince = work.tstart + } + + // scanstack must be done on the system stack in case + // we're trying to scan our own stack. + systemstack(func() { + // If this is a self-scan, put the user G in + // _Gwaiting to prevent self-deadlock. It may + // already be in _Gwaiting if this is a mark + // worker or we're in mark termination. + userG := getg().m.curg + selfScan := gp == userG && readgstatus(userG) == _Grunning + if selfScan { + casgstatus(userG, _Grunning, _Gwaiting) + userG.waitreason = waitReasonGarbageCollectionScan + } + + // TODO: suspendG blocks (and spins) until gp + // stops, which may take a while for + // running goroutines. Consider doing this in + // two phases where the first is non-blocking: + // we scan the stacks we can and ask running + // goroutines to scan themselves; and the + // second blocks. + stopped := suspendG(gp) + if stopped.dead { + gp.gcscandone = true + return + } + if gp.gcscandone { + throw("g already scanned") + } + workDone += scanstack(gp, gcw) + gp.gcscandone = true + resumeG(stopped) + + if selfScan { + casgstatus(userG, _Gwaiting, _Grunning) + } + }) + } + if goexperiment.PacerRedesign { + if workCounter != nil && workDone != 0 { + workCounter.Add(workDone) + if flushBgCredit { + gcFlushBgCredit(workDone) + } + } + } + return workDone +} + +// markrootBlock scans the shard'th shard of the block of memory [b0, +// b0+n0), with the given pointer mask. +// +// Returns the amount of work done. +// +//go:nowritebarrier +func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) int64 { + if rootBlockBytes%(8*goarch.PtrSize) != 0 { + // This is necessary to pick byte offsets in ptrmask0. + throw("rootBlockBytes must be a multiple of 8*ptrSize") + } + + // Note that if b0 is toward the end of the address space, + // then b0 + rootBlockBytes might wrap around. + // These tests are written to avoid any possible overflow. + off := uintptr(shard) * rootBlockBytes + if off >= n0 { + return 0 + } + b := b0 + off + ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*goarch.PtrSize)))) + n := uintptr(rootBlockBytes) + if off+n > n0 { + n = n0 - off + } + + // Scan this shard. + scanblock(b, n, ptrmask, gcw, nil) + return int64(n) +} + +// markrootFreeGStacks frees stacks of dead Gs. +// +// This does not free stacks of dead Gs cached on Ps, but having a few +// cached stacks around isn't a problem. +func markrootFreeGStacks() { + // Take list of dead Gs with stacks. + lock(&sched.gFree.lock) + list := sched.gFree.stack + sched.gFree.stack = gList{} + unlock(&sched.gFree.lock) + if list.empty() { + return + } + + // Free stacks. 
+ q := gQueue{list.head, list.head} + for gp := list.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { + stackfree(gp.stack) + gp.stack.lo = 0 + gp.stack.hi = 0 + // Manipulate the queue directly since the Gs are + // already all linked the right way. + q.tail.set(gp) + } + + // Put Gs back on the free list. + lock(&sched.gFree.lock) + sched.gFree.noStack.pushAll(q) + unlock(&sched.gFree.lock) +} + +// markrootSpans marks roots for one shard of markArenas. +// +//go:nowritebarrier +func markrootSpans(gcw *gcWork, shard int) { + // Objects with finalizers have two GC-related invariants: + // + // 1) Everything reachable from the object must be marked. + // This ensures that when we pass the object to its finalizer, + // everything the finalizer can reach will be retained. + // + // 2) Finalizer specials (which are not in the garbage + // collected heap) are roots. In practice, this means the fn + // field must be scanned. + sg := mheap_.sweepgen + + // Find the arena and page index into that arena for this shard. + ai := mheap_.markArenas[shard/(pagesPerArena/pagesPerSpanRoot)] + ha := mheap_.arenas[ai.l1()][ai.l2()] + arenaPage := uint(uintptr(shard) * pagesPerSpanRoot % pagesPerArena) + + // Construct slice of bitmap which we'll iterate over. + specialsbits := ha.pageSpecials[arenaPage/8:] + specialsbits = specialsbits[:pagesPerSpanRoot/8] + for i := range specialsbits { + // Find set bits, which correspond to spans with specials. + specials := atomic.Load8(&specialsbits[i]) + if specials == 0 { + continue + } + for j := uint(0); j < 8; j++ { + if specials&(1<<j) == 0 { + continue + } + // Find the span for this bit. + // + // This value is guaranteed to be non-nil because having + // specials implies that the span is in-use, and since we're + // currently marking we can be sure that we don't have to worry + // about the span being freed and re-used. + s := ha.spans[arenaPage+uint(i)*8+j] + + // The state must be mSpanInUse if the specials bit is set, so + // sanity check that. + if state := s.state.get(); state != mSpanInUse { + print("s.state = ", state, "\n") + throw("non in-use span found with specials bit set") + } + // Check that this span was swept (it may be cached or uncached). + if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) { + // sweepgen was updated (+2) during non-checkmark GC pass + print("sweep ", s.sweepgen, " ", sg, "\n") + throw("gc: unswept span") + } + + // Lock the specials to prevent a special from being + // removed from the list while we're traversing it. + lock(&s.speciallock) + for sp := s.specials; sp != nil; sp = sp.next { + if sp.kind != _KindSpecialFinalizer { + continue + } + // don't mark finalized object, but scan it so we + // retain everything it points to. + spf := (*specialfinalizer)(unsafe.Pointer(sp)) + // A finalizer can be set for an inner byte of an object, find object beginning. + p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize + + // Mark everything that can be reached from + // the object (but *not* the object itself or + // we'll never collect it). + scanobject(p, gcw) + + // The special itself is a root. + scanblock(uintptr(unsafe.Pointer(&spf.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + } + unlock(&s.speciallock) + } + } +} + +// gcAssistAlloc performs GC work to make gp's assist debt positive. +// gp must be the calling user goroutine. +// +// This must be called with preemption enabled. +func gcAssistAlloc(gp *g) { + // Don't assist in non-preemptible contexts. 
These are + // generally fragile and won't allow the assist to block. + if getg() == gp.m.g0 { + return + } + if mp := getg().m; mp.locks > 0 || mp.preemptoff != "" { + return + } + + traced := false +retry: + // Compute the amount of scan work we need to do to make the + // balance positive. When the required amount of work is low, + // we over-assist to build up credit for future allocations + // and amortize the cost of assisting. + assistWorkPerByte := gcController.assistWorkPerByte.Load() + assistBytesPerWork := gcController.assistBytesPerWork.Load() + debtBytes := -gp.gcAssistBytes + scanWork := int64(assistWorkPerByte * float64(debtBytes)) + if scanWork < gcOverAssistWork { + scanWork = gcOverAssistWork + debtBytes = int64(assistBytesPerWork * float64(scanWork)) + } + + // Steal as much credit as we can from the background GC's + // scan credit. This is racy and may drop the background + // credit below 0 if two mutators steal at the same time. This + // will just cause steals to fail until credit is accumulated + // again, so in the long run it doesn't really matter, but we + // do have to handle the negative credit case. + bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit) + stolen := int64(0) + if bgScanCredit > 0 { + if bgScanCredit < scanWork { + stolen = bgScanCredit + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen)) + } else { + stolen = scanWork + gp.gcAssistBytes += debtBytes + } + atomic.Xaddint64(&gcController.bgScanCredit, -stolen) + + scanWork -= stolen + + if scanWork == 0 { + // We were able to steal all of the credit we + // needed. + if traced { + traceGCMarkAssistDone() + } + return + } + } + + if trace.enabled && !traced { + traced = true + traceGCMarkAssistStart() + } + + // Perform assist work + systemstack(func() { + gcAssistAlloc1(gp, scanWork) + // The user stack may have moved, so this can't touch + // anything on it until it returns from systemstack. + }) + + completed := gp.param != nil + gp.param = nil + if completed { + gcMarkDone() + } + + if gp.gcAssistBytes < 0 { + // We were unable steal enough credit or perform + // enough work to pay off the assist debt. We need to + // do one of these before letting the mutator allocate + // more to prevent over-allocation. + // + // If this is because we were preempted, reschedule + // and try some more. + if gp.preempt { + Gosched() + goto retry + } + + // Add this G to an assist queue and park. When the GC + // has more background credit, it will satisfy queued + // assists before flushing to the global credit pool. + // + // Note that this does *not* get woken up when more + // work is added to the work list. The theory is that + // there wasn't enough work to do anyway, so we might + // as well let background marking take care of the + // work that is available. + if !gcParkAssist() { + goto retry + } + + // At this point either background GC has satisfied + // this G's assist debt, or the GC cycle is over. + } + if traced { + traceGCMarkAssistDone() + } +} + +// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system +// stack. This is a separate function to make it easier to see that +// we're not capturing anything from the user stack, since the user +// stack may move while we're in this function. +// +// gcAssistAlloc1 indicates whether this assist completed the mark +// phase by setting gp.param to non-nil. This can't be communicated on +// the stack since it may move. 
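The debt computation at the top of gcAssistAlloc is plain arithmetic and can be followed with concrete numbers. Below is a hedged, standalone walkthrough; the per-byte ratios and the goroutine's debt are invented inputs, while gcOverAssistWork matches the constant defined later in mgcpacer.go.

package main

import "fmt"

// A back-of-the-envelope sketch of the assist arithmetic in gcAssistAlloc:
// turn allocation debt into scan work, apply the over-assist floor, then
// steal as much as possible from the background scan credit.
func main() {
	const gcOverAssistWork = 64 << 10 // same value as the mgcpacer.go constant

	assistWorkPerByte := 0.5           // scan work owed per allocated byte (invented)
	assistBytesPerWork := 1 / assistWorkPerByte
	gcAssistBytes := int64(-200 << 10) // 200 KiB of allocation debt (invented)
	bgScanCredit := int64(80 << 10)    // credit banked by background workers (invented)

	debtBytes := -gcAssistBytes
	scanWork := int64(assistWorkPerByte * float64(debtBytes))
	if scanWork < gcOverAssistWork {
		// Over-assist: do at least this much so small allocations don't each
		// pay the fixed cost of entering the assist path.
		scanWork = gcOverAssistWork
		debtBytes = int64(assistBytesPerWork * float64(scanWork))
	}

	// Steal as much as possible from background credit before scanning.
	stolen := bgScanCredit
	if stolen > scanWork {
		stolen = scanWork
	}
	bgScanCredit -= stolen
	scanWork -= stolen

	fmt.Println("debt bytes:      ", debtBytes)
	fmt.Println("stolen credit:   ", stolen)
	fmt.Println("scan work to do: ", scanWork)
	fmt.Println("bg credit left:  ", bgScanCredit)
}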
+// +//go:systemstack +func gcAssistAlloc1(gp *g, scanWork int64) { + // Clear the flag indicating that this assist completed the + // mark phase. + gp.param = nil + + if atomic.Load(&gcBlackenEnabled) == 0 { + // The gcBlackenEnabled check in malloc races with the + // store that clears it but an atomic check in every malloc + // would be a performance hit. + // Instead we recheck it here on the non-preemptable system + // stack to determine if we should perform an assist. + + // GC is done, so ignore any remaining debt. + gp.gcAssistBytes = 0 + return + } + // Track time spent in this assist. Since we're on the + // system stack, this is non-preemptible, so we can + // just measure start and end time. + startTime := nanotime() + + decnwait := atomic.Xadd(&work.nwait, -1) + if decnwait == work.nproc { + println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc) + throw("nwait > work.nprocs") + } + + // gcDrainN requires the caller to be preemptible. + casgstatus(gp, _Grunning, _Gwaiting) + gp.waitreason = waitReasonGCAssistMarking + + // drain own cached work first in the hopes that it + // will be more cache friendly. + gcw := &getg().m.p.ptr().gcw + workDone := gcDrainN(gcw, scanWork) + + casgstatus(gp, _Gwaiting, _Grunning) + + // Record that we did this much scan work. + // + // Back out the number of bytes of assist credit that + // this scan work counts for. The "1+" is a poor man's + // round-up, to ensure this adds credit even if + // assistBytesPerWork is very low. + assistBytesPerWork := gcController.assistBytesPerWork.Load() + gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone)) + + // If this is the last worker and we ran out of work, + // signal a completion point. + incnwait := atomic.Xadd(&work.nwait, +1) + if incnwait > work.nproc { + println("runtime: work.nwait=", incnwait, + "work.nproc=", work.nproc) + throw("work.nwait > work.nproc") + } + + if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { + // This has reached a background completion point. Set + // gp.param to a non-nil value to indicate this. It + // doesn't matter what we set it to (it just has to be + // a valid pointer). + gp.param = unsafe.Pointer(gp) + } + duration := nanotime() - startTime + _p_ := gp.m.p.ptr() + _p_.gcAssistTime += duration + if _p_.gcAssistTime > gcAssistTimeSlack { + atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime) + _p_.gcAssistTime = 0 + } +} + +// gcWakeAllAssists wakes all currently blocked assists. This is used +// at the end of a GC cycle. gcBlackenEnabled must be false to prevent +// new assists from going to sleep after this point. +func gcWakeAllAssists() { + lock(&work.assistQueue.lock) + list := work.assistQueue.q.popList() + injectglist(&list) + unlock(&work.assistQueue.lock) +} + +// gcParkAssist puts the current goroutine on the assist queue and parks. +// +// gcParkAssist reports whether the assist is now satisfied. If it +// returns false, the caller must retry the assist. +func gcParkAssist() bool { + lock(&work.assistQueue.lock) + // If the GC cycle finished while we were getting the lock, + // exit the assist. The cycle can't finish while we hold the + // lock. + if atomic.Load(&gcBlackenEnabled) == 0 { + unlock(&work.assistQueue.lock) + return true + } + + gp := getg() + oldList := work.assistQueue.q + work.assistQueue.q.pushBack(gp) + + // Recheck for background credit now that this G is in + // the queue, but can still back out. 
This avoids a + // race in case background marking has flushed more + // credit since we checked above. + if atomic.Loadint64(&gcController.bgScanCredit) > 0 { + work.assistQueue.q = oldList + if oldList.tail != 0 { + oldList.tail.ptr().schedlink.set(nil) + } + unlock(&work.assistQueue.lock) + return false + } + // Park. + goparkunlock(&work.assistQueue.lock, waitReasonGCAssistWait, traceEvGoBlockGC, 2) + return true +} + +// gcFlushBgCredit flushes scanWork units of background scan work +// credit. This first satisfies blocked assists on the +// work.assistQueue and then flushes any remaining credit to +// gcController.bgScanCredit. +// +// Write barriers are disallowed because this is used by gcDrain after +// it has ensured that all work is drained and this must preserve that +// condition. +// +//go:nowritebarrierrec +func gcFlushBgCredit(scanWork int64) { + if work.assistQueue.q.empty() { + // Fast path; there are no blocked assists. There's a + // small window here where an assist may add itself to + // the blocked queue and park. If that happens, we'll + // just get it on the next flush. + atomic.Xaddint64(&gcController.bgScanCredit, scanWork) + return + } + + assistBytesPerWork := gcController.assistBytesPerWork.Load() + scanBytes := int64(float64(scanWork) * assistBytesPerWork) + + lock(&work.assistQueue.lock) + for !work.assistQueue.q.empty() && scanBytes > 0 { + gp := work.assistQueue.q.pop() + // Note that gp.gcAssistBytes is negative because gp + // is in debt. Think carefully about the signs below. + if scanBytes+gp.gcAssistBytes >= 0 { + // Satisfy this entire assist debt. + scanBytes += gp.gcAssistBytes + gp.gcAssistBytes = 0 + // It's important that we *not* put gp in + // runnext. Otherwise, it's possible for user + // code to exploit the GC worker's high + // scheduler priority to get itself always run + // before other goroutines and always in the + // fresh quantum started by GC. + ready(gp, 0, false) + } else { + // Partially satisfy this assist. + gp.gcAssistBytes += scanBytes + scanBytes = 0 + // As a heuristic, we move this assist to the + // back of the queue so that large assists + // can't clog up the assist queue and + // substantially delay small assists. + work.assistQueue.q.pushBack(gp) + break + } + } + + if scanBytes > 0 { + // Convert from scan bytes back to work. + assistWorkPerByte := gcController.assistWorkPerByte.Load() + scanWork = int64(float64(scanBytes) * assistWorkPerByte) + atomic.Xaddint64(&gcController.bgScanCredit, scanWork) + } + unlock(&work.assistQueue.lock) +} + +// scanstack scans gp's stack, greying all pointers found on the stack. +// +// For goexperiment.PacerRedesign: +// Returns the amount of scan work performed, but doesn't update +// gcController.stackScanWork or flush any credit. Any background credit produced +// by this function should be flushed by its caller. scanstack itself can't +// safely flush because it may result in trying to wake up a goroutine that +// was just scanned, resulting in a self-deadlock. +// +// scanstack will also shrink the stack if it is safe to do so. If it +// is not, it schedules a stack shrink for the next synchronous safe +// point. +// +// scanstack is marked go:systemstack because it must not be preempted +// while using a workbuf. 
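The queue-satisfaction loop in gcFlushBgCredit above can be sketched on its own: fully satisfied assists are made ready, a partially satisfied one is topped up and moved to the back of the queue, and whatever credit is left over goes to the global pool. The assist struct and slice-based queue below are stand-ins for the runtime's types, not its actual API.

package main

import "fmt"

// assist is a stand-in for a queued assist G: gcAssistBytes is negative
// while the goroutine is in allocation debt.
type assist struct {
	id            int
	gcAssistBytes int64
}

// flushCredit distributes scanBytes of credit across queued assists in FIFO
// order, mirroring the loop in gcFlushBgCredit: pay off what it can, move a
// partially paid assist to the back, and return any leftover for the global
// credit pool.
func flushCredit(queue []*assist, scanBytes int64) (stillQueued []*assist, globalCredit int64) {
	for len(queue) > 0 && scanBytes > 0 {
		a := queue[0]
		queue = queue[1:]
		if scanBytes+a.gcAssistBytes >= 0 {
			// Fully pay off this assist and wake it.
			scanBytes += a.gcAssistBytes
			a.gcAssistBytes = 0
			fmt.Println("ready: assist", a.id)
		} else {
			// Partially satisfy it and move it to the back of the queue.
			a.gcAssistBytes += scanBytes
			scanBytes = 0
			queue = append(queue, a)
			break
		}
	}
	return queue, scanBytes
}

func main() {
	queue := []*assist{
		{id: 1, gcAssistBytes: -100},
		{id: 2, gcAssistBytes: -300},
		{id: 3, gcAssistBytes: -50},
	}
	queue, leftover := flushCredit(queue, 250)
	for _, a := range queue {
		fmt.Println("still queued: assist", a.id, "debt", -a.gcAssistBytes)
	}
	fmt.Println("flushed to global credit:", leftover)
}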
+// +//go:nowritebarrier +//go:systemstack +func scanstack(gp *g, gcw *gcWork) int64 { + if readgstatus(gp)&_Gscan == 0 { + print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n") + throw("scanstack - bad status") + } + + switch readgstatus(gp) &^ _Gscan { + default: + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + throw("mark - bad status") + case _Gdead: + return 0 + case _Grunning: + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + throw("scanstack: goroutine not stopped") + case _Grunnable, _Gsyscall, _Gwaiting: + // ok + } + + if gp == getg() { + throw("can't scan our own stack") + } + + // stackSize is the amount of work we'll be reporting. + // + // We report the total stack size, more than we scan, + // because this number needs to line up with gcControllerState's + // stackScan and scannableStackSize fields. + // + // See the documentation on those fields for more information. + stackSize := gp.stack.hi - gp.stack.lo + + if isShrinkStackSafe(gp) { + // Shrink the stack if not much of it is being used. + shrinkstack(gp) + } else { + // Otherwise, shrink the stack at the next sync safe point. + gp.preemptShrink = true + } + + var state stackScanState + state.stack = gp.stack + + if stackTraceDebug { + println("stack trace goroutine", gp.goid) + } + + if debugScanConservative && gp.asyncSafePoint { + print("scanning async preempted goroutine ", gp.goid, " stack [", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n") + } + + // Scan the saved context register. This is effectively a live + // register that gets moved back and forth between the + // register and sched.ctxt without a write barrier. + if gp.sched.ctxt != nil { + scanblock(uintptr(unsafe.Pointer(&gp.sched.ctxt)), goarch.PtrSize, &oneptrmask[0], gcw, &state) + } + + // Scan the stack. Accumulate a list of stack objects. + scanframe := func(frame *stkframe, unused unsafe.Pointer) bool { + scanframeworker(frame, &state, gcw) + return true + } + gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0) + + // Find additional pointers that point into the stack from the heap. + // Currently this includes defers and panics. See also function copystack. + + // Find and trace other pointers in defer records. + for d := gp._defer; d != nil; d = d.link { + if d.fn != nil { + // Scan the func value, which could be a stack allocated closure. + // See issue 30453. + scanblock(uintptr(unsafe.Pointer(&d.fn)), goarch.PtrSize, &oneptrmask[0], gcw, &state) + } + if d.link != nil { + // The link field of a stack-allocated defer record might point + // to a heap-allocated defer record. Keep that heap record live. + scanblock(uintptr(unsafe.Pointer(&d.link)), goarch.PtrSize, &oneptrmask[0], gcw, &state) + } + // Retain defers records themselves. + // Defer records might not be reachable from the G through regular heap + // tracing because the defer linked list might weave between the stack and the heap. + if d.heap { + scanblock(uintptr(unsafe.Pointer(&d)), goarch.PtrSize, &oneptrmask[0], gcw, &state) + } + } + if gp._panic != nil { + // Panics are always stack allocated. + state.putPtr(uintptr(unsafe.Pointer(gp._panic)), false) + } + + // Find and scan all reachable stack objects. + // + // The state's pointer queue prioritizes precise pointers over + // conservative pointers so that we'll prefer scanning stack + // objects precisely. 
+ state.buildIndex() + for { + p, conservative := state.getPtr() + if p == 0 { + break + } + obj := state.findObject(p) + if obj == nil { + continue + } + r := obj.r + if r == nil { + // We've already scanned this object. + continue + } + obj.setRecord(nil) // Don't scan it again. + if stackTraceDebug { + printlock() + print(" live stkobj at", hex(state.stack.lo+uintptr(obj.off)), "of size", obj.size) + if conservative { + print(" (conservative)") + } + println() + printunlock() + } + gcdata := r.gcdata() + var s *mspan + if r.useGCProg() { + // This path is pretty unlikely, an object large enough + // to have a GC program allocated on the stack. + // We need some space to unpack the program into a straight + // bitmask, which we allocate/free here. + // TODO: it would be nice if there were a way to run a GC + // program without having to store all its bits. We'd have + // to change from a Lempel-Ziv style program to something else. + // Or we can forbid putting objects on stacks if they require + // a gc program (see issue 27447). + s = materializeGCProg(r.ptrdata(), gcdata) + gcdata = (*byte)(unsafe.Pointer(s.startAddr)) + } + + b := state.stack.lo + uintptr(obj.off) + if conservative { + scanConservative(b, r.ptrdata(), gcdata, gcw, &state) + } else { + scanblock(b, r.ptrdata(), gcdata, gcw, &state) + } + + if s != nil { + dematerializeGCProg(s) + } + } + + // Deallocate object buffers. + // (Pointer buffers were all deallocated in the loop above.) + for state.head != nil { + x := state.head + state.head = x.next + if stackTraceDebug { + for i := 0; i < x.nobj; i++ { + obj := &x.obj[i] + if obj.r == nil { // reachable + continue + } + println(" dead stkobj at", hex(gp.stack.lo+uintptr(obj.off)), "of size", obj.r.size) + // Note: not necessarily really dead - only reachable-from-ptr dead. + } + } + x.nobj = 0 + putempty((*workbuf)(unsafe.Pointer(x))) + } + if state.buf != nil || state.cbuf != nil || state.freeBuf != nil { + throw("remaining pointer buffers") + } + return int64(stackSize) +} + +// Scan a stack frame: local variables and function arguments/results. +//go:nowritebarrier +func scanframeworker(frame *stkframe, state *stackScanState, gcw *gcWork) { + if _DebugGC > 1 && frame.continpc != 0 { + print("scanframe ", funcname(frame.fn), "\n") + } + + isAsyncPreempt := frame.fn.valid() && frame.fn.funcID == funcID_asyncPreempt + isDebugCall := frame.fn.valid() && frame.fn.funcID == funcID_debugCallV2 + if state.conservative || isAsyncPreempt || isDebugCall { + if debugScanConservative { + println("conservatively scanning function", funcname(frame.fn), "at PC", hex(frame.continpc)) + } + + // Conservatively scan the frame. Unlike the precise + // case, this includes the outgoing argument space + // since we may have stopped while this function was + // setting up a call. + // + // TODO: We could narrow this down if the compiler + // produced a single map per function of stack slots + // and registers that ever contain a pointer. + if frame.varp != 0 { + size := frame.varp - frame.sp + if size > 0 { + scanConservative(frame.sp, size, nil, gcw, state) + } + } + + // Scan arguments to this frame. + if frame.arglen != 0 { + // TODO: We could pass the entry argument map + // to narrow this down further. + scanConservative(frame.argp, frame.arglen, nil, gcw, state) + } + + if isAsyncPreempt || isDebugCall { + // This function's frame contained the + // registers for the asynchronously stopped + // parent frame. Scan the parent + // conservatively. 
+ state.conservative = true + } else { + // We only wanted to scan those two frames + // conservatively. Clear the flag for future + // frames. + state.conservative = false + } + return + } + + locals, args, objs := getStackMap(frame, &state.cache, false) + + // Scan local variables if stack frame has been allocated. + if locals.n > 0 { + size := uintptr(locals.n) * goarch.PtrSize + scanblock(frame.varp-size, size, locals.bytedata, gcw, state) + } + + // Scan arguments. + if args.n > 0 { + scanblock(frame.argp, uintptr(args.n)*goarch.PtrSize, args.bytedata, gcw, state) + } + + // Add all stack objects to the stack object list. + if frame.varp != 0 { + // varp is 0 for defers, where there are no locals. + // In that case, there can't be a pointer to its args, either. + // (And all args would be scanned above anyway.) + for i := range objs { + obj := &objs[i] + off := obj.off + base := frame.varp // locals base pointer + if off >= 0 { + base = frame.argp // arguments and return values base pointer + } + ptr := base + uintptr(off) + if ptr < frame.sp { + // object hasn't been allocated in the frame yet. + continue + } + if stackTraceDebug { + println("stkobj at", hex(ptr), "of size", obj.size) + } + state.addObject(ptr, obj) + } + } +} + +type gcDrainFlags int + +const ( + gcDrainUntilPreempt gcDrainFlags = 1 << iota + gcDrainFlushBgCredit + gcDrainIdle + gcDrainFractional +) + +// gcDrain scans roots and objects in work buffers, blackening grey +// objects until it is unable to get more work. It may return before +// GC is done; it's the caller's responsibility to balance work from +// other Ps. +// +// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt +// is set. +// +// If flags&gcDrainIdle != 0, gcDrain returns when there is other work +// to do. +// +// If flags&gcDrainFractional != 0, gcDrain self-preempts when +// pollFractionalWorkerExit() returns true. This implies +// gcDrainNoBlock. +// +// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work +// credit to gcController.bgScanCredit every gcCreditSlack units of +// scan work. +// +// gcDrain will always return if there is a pending STW. +// +//go:nowritebarrier +func gcDrain(gcw *gcWork, flags gcDrainFlags) { + if !writeBarrier.needed { + throw("gcDrain phase incorrect") + } + + gp := getg().m.curg + preemptible := flags&gcDrainUntilPreempt != 0 + flushBgCredit := flags&gcDrainFlushBgCredit != 0 + idle := flags&gcDrainIdle != 0 + + initScanWork := gcw.heapScanWork + + // checkWork is the scan work before performing the next + // self-preempt check. + checkWork := int64(1<<63 - 1) + var check func() bool + if flags&(gcDrainIdle|gcDrainFractional) != 0 { + checkWork = initScanWork + drainCheckThreshold + if idle { + check = pollWork + } else if flags&gcDrainFractional != 0 { + check = pollFractionalWorkerExit + } + } + + // Drain root marking jobs. + if work.markrootNext < work.markrootJobs { + // Stop if we're preemptible or if someone wants to STW. + for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) { + job := atomic.Xadd(&work.markrootNext, +1) - 1 + if job >= work.markrootJobs { + break + } + markroot(gcw, job, flushBgCredit) + if check != nil && check() { + goto done + } + } + } + + // Drain heap marking jobs. + // Stop if we're preemptible or if someone wants to STW. + for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) { + // Try to keep work available on the global queue. 
We used to + // check if there were waiting workers, but it's better to + // just keep work available than to make workers wait. In the + // worst case, we'll do O(log(_WorkbufSize)) unnecessary + // balances. + if work.full == 0 { + gcw.balance() + } + + b := gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + if b == 0 { + // Flush the write barrier + // buffer; this may create + // more work. + wbBufFlush(nil, 0) + b = gcw.tryGet() + } + } + if b == 0 { + // Unable to get work. + break + } + scanobject(b, gcw) + + // Flush background scan work credit to the global + // account if we've accumulated enough locally so + // mutator assists can draw on it. + if gcw.heapScanWork >= gcCreditSlack { + gcController.heapScanWork.Add(gcw.heapScanWork) + if flushBgCredit { + gcFlushBgCredit(gcw.heapScanWork - initScanWork) + initScanWork = 0 + } + checkWork -= gcw.heapScanWork + gcw.heapScanWork = 0 + + if checkWork <= 0 { + checkWork += drainCheckThreshold + if check != nil && check() { + break + } + } + } + } + +done: + // Flush remaining scan work credit. + if gcw.heapScanWork > 0 { + gcController.heapScanWork.Add(gcw.heapScanWork) + if flushBgCredit { + gcFlushBgCredit(gcw.heapScanWork - initScanWork) + } + gcw.heapScanWork = 0 + } +} + +// gcDrainN blackens grey objects until it has performed roughly +// scanWork units of scan work or the G is preempted. This is +// best-effort, so it may perform less work if it fails to get a work +// buffer. Otherwise, it will perform at least n units of work, but +// may perform more because scanning is always done in whole object +// increments. It returns the amount of scan work performed. +// +// The caller goroutine must be in a preemptible state (e.g., +// _Gwaiting) to prevent deadlocks during stack scanning. As a +// consequence, this must be called on the system stack. +// +//go:nowritebarrier +//go:systemstack +func gcDrainN(gcw *gcWork, scanWork int64) int64 { + if !writeBarrier.needed { + throw("gcDrainN phase incorrect") + } + + // There may already be scan work on the gcw, which we don't + // want to claim was done by this call. + workFlushed := -gcw.heapScanWork + + gp := getg().m.curg + for !gp.preempt && workFlushed+gcw.heapScanWork < scanWork { + // See gcDrain comment. + if work.full == 0 { + gcw.balance() + } + + b := gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + if b == 0 { + // Flush the write barrier buffer; + // this may create more work. + wbBufFlush(nil, 0) + b = gcw.tryGet() + } + } + + if b == 0 { + // Try to do a root job. + if work.markrootNext < work.markrootJobs { + job := atomic.Xadd(&work.markrootNext, +1) - 1 + if job < work.markrootJobs { + work := markroot(gcw, job, false) + if goexperiment.PacerRedesign { + workFlushed += work + } + continue + } + } + // No heap or root jobs. + break + } + + scanobject(b, gcw) + + // Flush background scan work credit. + if gcw.heapScanWork >= gcCreditSlack { + gcController.heapScanWork.Add(gcw.heapScanWork) + workFlushed += gcw.heapScanWork + gcw.heapScanWork = 0 + } + } + + // Unlike gcDrain, there's no need to flush remaining work + // here because this never flushes to bgScanCredit and + // gcw.dispose will flush any remaining work to scanWork. + + return workFlushed + gcw.heapScanWork +} + +// scanblock scans b as scanobject would, but using an explicit +// pointer bitmap instead of the heap bitmap. +// +// This is used to scan non-heap roots, so it does not update +// gcw.bytesMarked or gcw.heapScanWork. 
+// +// If stk != nil, possible stack pointers are also reported to stk.putPtr. +//go:nowritebarrier +func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork, stk *stackScanState) { + // Use local copies of original parameters, so that a stack trace + // due to one of the throws below shows the original block + // base and extent. + b := b0 + n := n0 + + for i := uintptr(0); i < n; { + // Find bits for the next word. + bits := uint32(*addb(ptrmask, i/(goarch.PtrSize*8))) + if bits == 0 { + i += goarch.PtrSize * 8 + continue + } + for j := 0; j < 8 && i < n; j++ { + if bits&1 != 0 { + // Same work as in scanobject; see comments there. + p := *(*uintptr)(unsafe.Pointer(b + i)) + if p != 0 { + if obj, span, objIndex := findObject(p, b, i); obj != 0 { + greyobject(obj, b, i, span, gcw, objIndex) + } else if stk != nil && p >= stk.stack.lo && p < stk.stack.hi { + stk.putPtr(p, false) + } + } + } + bits >>= 1 + i += goarch.PtrSize + } + } +} + +// scanobject scans the object starting at b, adding pointers to gcw. +// b must point to the beginning of a heap object or an oblet. +// scanobject consults the GC bitmap for the pointer mask and the +// spans for the size of the object. +// +//go:nowritebarrier +func scanobject(b uintptr, gcw *gcWork) { + // Prefetch object before we scan it. + // + // This will overlap fetching the beginning of the object with initial + // setup before we start scanning the object. + sys.Prefetch(b) + + // Find the bits for b and the size of the object at b. + // + // b is either the beginning of an object, in which case this + // is the size of the object to scan, or it points to an + // oblet, in which case we compute the size to scan below. + hbits := heapBitsForAddr(b) + s := spanOfUnchecked(b) + n := s.elemsize + if n == 0 { + throw("scanobject n == 0") + } + + if n > maxObletBytes { + // Large object. Break into oblets for better + // parallelism and lower latency. + if b == s.base() { + // It's possible this is a noscan object (not + // from greyobject, but from other code + // paths), in which case we must *not* enqueue + // oblets since their bitmaps will be + // uninitialized. + if s.spanclass.noscan() { + // Bypass the whole scan. + gcw.bytesMarked += uint64(n) + return + } + + // Enqueue the other oblets to scan later. + // Some oblets may be in b's scalar tail, but + // these will be marked as "no more pointers", + // so we'll drop out immediately when we go to + // scan those. + for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes { + if !gcw.putFast(oblet) { + gcw.put(oblet) + } + } + } + + // Compute the size of the oblet. Since this object + // must be a large object, s.base() is the beginning + // of the object. + n = s.base() + s.elemsize - b + if n > maxObletBytes { + n = maxObletBytes + } + } + + var i uintptr + for i = 0; i < n; i, hbits = i+goarch.PtrSize, hbits.next() { + // Load bits once. See CL 22712 and issue 16973 for discussion. + bits := hbits.bits() + if bits&bitScan == 0 { + break // no more pointers in this object + } + if bits&bitPointer == 0 { + continue // not a pointer + } + + // Work here is duplicated in scanblock and above. + // If you make changes here, make changes there too. + obj := *(*uintptr)(unsafe.Pointer(b + i)) + + // At this point we have extracted the next potential pointer. + // Quickly filter out nil and pointers back to the current object. + if obj != 0 && obj-b >= n { + // Test if obj points into the Go heap and, if so, + // mark the object. 
+ // + // Note that it's possible for findObject to + // fail if obj points to a just-allocated heap + // object because of a race with growing the + // heap. In this case, we know the object was + // just allocated and hence will be marked by + // allocation itself. + if obj, span, objIndex := findObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, span, gcw, objIndex) + } + } + } + gcw.bytesMarked += uint64(n) + gcw.heapScanWork += int64(i) +} + +// scanConservative scans block [b, b+n) conservatively, treating any +// pointer-like value in the block as a pointer. +// +// If ptrmask != nil, only words that are marked in ptrmask are +// considered as potential pointers. +// +// If state != nil, it's assumed that [b, b+n) is a block in the stack +// and may contain pointers to stack objects. +func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackScanState) { + if debugScanConservative { + printlock() + print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n") + hexdumpWords(b, b+n, func(p uintptr) byte { + if ptrmask != nil { + word := (p - b) / goarch.PtrSize + bits := *addb(ptrmask, word/8) + if (bits>>(word%8))&1 == 0 { + return '$' + } + } + + val := *(*uintptr)(unsafe.Pointer(p)) + if state != nil && state.stack.lo <= val && val < state.stack.hi { + return '@' + } + + span := spanOfHeap(val) + if span == nil { + return ' ' + } + idx := span.objIndex(val) + if span.isFree(idx) { + return ' ' + } + return '*' + }) + printunlock() + } + + for i := uintptr(0); i < n; i += goarch.PtrSize { + if ptrmask != nil { + word := i / goarch.PtrSize + bits := *addb(ptrmask, word/8) + if bits == 0 { + // Skip 8 words (the loop increment will do the 8th) + // + // This must be the first time we've + // seen this word of ptrmask, so i + // must be 8-word-aligned, but check + // our reasoning just in case. + if i%(goarch.PtrSize*8) != 0 { + throw("misaligned mask") + } + i += goarch.PtrSize*8 - goarch.PtrSize + continue + } + if (bits>>(word%8))&1 == 0 { + continue + } + } + + val := *(*uintptr)(unsafe.Pointer(b + i)) + + // Check if val points into the stack. + if state != nil && state.stack.lo <= val && val < state.stack.hi { + // val may point to a stack object. This + // object may be dead from last cycle and + // hence may contain pointers to unallocated + // objects, but unlike heap objects we can't + // tell if it's already dead. Hence, if all + // pointers to this object are from + // conservative scanning, we have to scan it + // defensively, too. + state.putPtr(val, true) + continue + } + + // Check if val points to a heap span. + span := spanOfHeap(val) + if span == nil { + continue + } + + // Check if val points to an allocated object. + idx := span.objIndex(val) + if span.isFree(idx) { + continue + } + + // val points to an allocated object. Mark it. + obj := span.base() + idx*span.elemsize + greyobject(obj, b, i, span, gcw, idx) + } +} + +// Shade the object if it isn't already. +// The object is not nil and known to be in the heap. +// Preemption must be disabled. +//go:nowritebarrier +func shade(b uintptr) { + if obj, span, objIndex := findObject(b, 0, 0); obj != 0 { + gcw := &getg().m.p.ptr().gcw + greyobject(obj, 0, 0, span, gcw, objIndex) + } +} + +// obj is the start of an object with mark mbits. +// If it isn't already marked, mark it and enqueue into gcw. +// base and off are for debugging only and could be removed. +// +// See also wbBufFlush1, which partially duplicates this logic. 
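scanblock and scanConservative above both walk a one-bit-per-word pointer mask, skipping eight words at a time when a whole mask byte is zero. The following standalone sketch shows the same indexing over a plain slice of words; no real memory is scanned, and the block and mask are made-up data.

package main

import "fmt"

// maskedWords walks a one-bit-per-word pointer bitmap the way scanblock does:
// each byte of the mask covers eight pointer-sized words, a zero byte lets the
// scanner skip all eight at once, and set bits select the words treated as
// pointers.
func maskedWords(block []uintptr, ptrmask []byte) []uintptr {
	var out []uintptr
	for i := 0; i < len(block); {
		bits := ptrmask[i/8]
		if bits == 0 {
			i += 8 // no pointers in these eight words
			continue
		}
		for j := 0; j < 8 && i < len(block); j++ {
			if bits&1 != 0 {
				out = append(out, block[i])
			}
			bits >>= 1
			i++
		}
	}
	return out
}

func main() {
	block := []uintptr{0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xb0, 0xb1}
	// Bit k of mask byte i/8 describes word i: words 1 and 3 in the first
	// group of eight, word 8 in the second group.
	ptrmask := []byte{0b0000_1010, 0b0000_0001}
	fmt.Printf("%#x\n", maskedWords(block, ptrmask)) // [0xa1 0xa3 0xb0]
}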
+// +//go:nowritebarrierrec +func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintptr) { + // obj should be start of allocation, and so must be at least pointer-aligned. + if obj&(goarch.PtrSize-1) != 0 { + throw("greyobject: obj not pointer-aligned") + } + mbits := span.markBitsForIndex(objIndex) + + if useCheckmark { + if setCheckmark(obj, base, off, mbits) { + // Already marked. + return + } + } else { + if debug.gccheckmark > 0 && span.isFree(objIndex) { + print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n") + gcDumpObject("base", base, off) + gcDumpObject("obj", obj, ^uintptr(0)) + getg().m.traceback = 2 + throw("marking free object") + } + + // If marked we have nothing to do. + if mbits.isMarked() { + return + } + mbits.setMarked() + + // Mark span. + arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + + // If this is a noscan object, fast-track it to black + // instead of greying it. + if span.spanclass.noscan() { + gcw.bytesMarked += uint64(span.elemsize) + return + } + } + + // We're adding obj to P's local workbuf, so it's likely + // this object will be processed soon by the same P. + // Even if the workbuf gets flushed, there will likely still be + // some benefit on platforms with inclusive shared caches. + sys.Prefetch(obj) + // Queue the obj for scanning. + if !gcw.putFast(obj) { + gcw.put(obj) + } +} + +// gcDumpObject dumps the contents of obj for debugging and marks the +// field at byte offset off in obj. +func gcDumpObject(label string, obj, off uintptr) { + s := spanOf(obj) + print(label, "=", hex(obj)) + if s == nil { + print(" s=nil\n") + return + } + print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, " s.state=") + if state := s.state.get(); 0 <= state && int(state) < len(mSpanStateNames) { + print(mSpanStateNames[state], "\n") + } else { + print("unknown(", state, ")\n") + } + + skipped := false + size := s.elemsize + if s.state.get() == mSpanManual && size == 0 { + // We're printing something from a stack frame. We + // don't know how big it is, so just show up to an + // including off. + size = off + goarch.PtrSize + } + for i := uintptr(0); i < size; i += goarch.PtrSize { + // For big objects, just print the beginning (because + // that usually hints at the object's type) and the + // fields around off. + if !(i < 128*goarch.PtrSize || off-16*goarch.PtrSize < i && i < off+16*goarch.PtrSize) { + skipped = true + continue + } + if skipped { + print(" ...\n") + skipped = false + } + print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + i)))) + if i == off { + print(" <==") + } + print("\n") + } + if skipped { + print(" ...\n") + } +} + +// gcmarknewobject marks a newly allocated object black. obj must +// not contain any non-nil pointers. +// +// This is nosplit so it can manipulate a gcWork without preemption. +// +//go:nowritebarrier +//go:nosplit +func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) { + if useCheckmark { // The world should be stopped so this should not happen. + throw("gcmarknewobject called while doing checkmark") + } + + // Mark object. + objIndex := span.objIndex(obj) + span.markBitsForIndex(objIndex).setMarked() + + // Mark span. 
+ arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + + gcw := &getg().m.p.ptr().gcw + gcw.bytesMarked += uint64(size) + if !goexperiment.PacerRedesign { + // The old pacer counts newly allocated memory toward + // heapScanWork because heapScan is continuously updated + // throughout the GC cycle with newly allocated memory. However, + // these objects are never actually scanned, so we need + // to account for them in heapScanWork here, "faking" their work. + // Otherwise the pacer will think it's always behind, potentially + // by a large margin. + // + // The new pacer doesn't care about this because it ceases to updated + // heapScan once a GC cycle starts, effectively snapshotting it. + gcw.heapScanWork += int64(scanSize) + } +} + +// gcMarkTinyAllocs greys all active tiny alloc blocks. +// +// The world must be stopped. +func gcMarkTinyAllocs() { + assertWorldStopped() + + for _, p := range allp { + c := p.mcache + if c == nil || c.tiny == 0 { + continue + } + _, span, objIndex := findObject(c.tiny, 0, 0) + gcw := &p.gcw + greyobject(c.tiny, 0, 0, span, gcw, objIndex) + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcpacer.go b/contrib/go/_std_1.18/src/runtime/mgcpacer.go new file mode 100644 index 0000000000..d54dbc26c2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcpacer.go @@ -0,0 +1,1348 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "internal/goexperiment" + "runtime/internal/atomic" + "unsafe" +) + +const ( + // gcGoalUtilization is the goal CPU utilization for + // marking as a fraction of GOMAXPROCS. + gcGoalUtilization = goexperiment.PacerRedesignInt*gcBackgroundUtilization + + (1-goexperiment.PacerRedesignInt)*(gcBackgroundUtilization+0.05) + + // gcBackgroundUtilization is the fixed CPU utilization for background + // marking. It must be <= gcGoalUtilization. The difference between + // gcGoalUtilization and gcBackgroundUtilization will be made up by + // mark assists. The scheduler will aim to use within 50% of this + // goal. + // + // Setting this to < gcGoalUtilization avoids saturating the trigger + // feedback controller when there are no assists, which allows it to + // better control CPU and heap growth. However, the larger the gap, + // the more mutator assists are expected to happen, which impact + // mutator latency. + // + // If goexperiment.PacerRedesign, the trigger feedback controller + // is replaced with an estimate of the mark/cons ratio that doesn't + // have the same saturation issues, so this is set equal to + // gcGoalUtilization. + gcBackgroundUtilization = 0.25 + + // gcCreditSlack is the amount of scan work credit that can + // accumulate locally before updating gcController.heapScanWork and, + // optionally, gcController.bgScanCredit. Lower values give a more + // accurate assist ratio and make it more likely that assists will + // successfully steal background credit. Higher values reduce memory + // contention. + gcCreditSlack = 2000 + + // gcAssistTimeSlack is the nanoseconds of mutator assist time that + // can accumulate on a P before updating gcController.assistTime. + gcAssistTimeSlack = 5000 + + // gcOverAssistWork determines how many extra units of scan work a GC + // assist does when an assist happens. 
This amortizes the cost of an + // assist by pre-paying for this many bytes of future allocations. + gcOverAssistWork = 64 << 10 + + // defaultHeapMinimum is the value of heapMinimum for GOGC==100. + defaultHeapMinimum = (goexperiment.HeapMinimum512KiBInt)*(512<<10) + + (1-goexperiment.HeapMinimum512KiBInt)*(4<<20) + + // scannableStackSizeSlack is the bytes of stack space allocated or freed + // that can accumulate on a P before updating gcController.stackSize. + scannableStackSizeSlack = 8 << 10 +) + +func init() { + if offset := unsafe.Offsetof(gcController.heapLive); offset%8 != 0 { + println(offset) + throw("gcController.heapLive not aligned to 8 bytes") + } +} + +// gcController implements the GC pacing controller that determines +// when to trigger concurrent garbage collection and how much marking +// work to do in mutator assists and background marking. +// +// It uses a feedback control algorithm to adjust the gcController.trigger +// trigger based on the heap growth and GC CPU utilization each cycle. +// This algorithm optimizes for heap growth to match GOGC and for CPU +// utilization between assist and background marking to be 25% of +// GOMAXPROCS. The high-level design of this algorithm is documented +// at https://golang.org/s/go15gcpacing. +// +// All fields of gcController are used only during a single mark +// cycle. +var gcController gcControllerState + +type gcControllerState struct { + + // Initialized from GOGC. GOGC=off means no GC. + gcPercent atomic.Int32 + + _ uint32 // padding so following 64-bit values are 8-byte aligned + + // heapMinimum is the minimum heap size at which to trigger GC. + // For small heaps, this overrides the usual GOGC*live set rule. + // + // When there is a very small live set but a lot of allocation, simply + // collecting when the heap reaches GOGC*live results in many GC + // cycles and high total per-GC overhead. This minimum amortizes this + // per-GC overhead while keeping the heap reasonably small. + // + // During initialization this is set to 4MB*GOGC/100. In the case of + // GOGC==0, this will set heapMinimum to 0, resulting in constant + // collection even when the heap size is small, which is useful for + // debugging. + heapMinimum uint64 + + // triggerRatio is the heap growth ratio that triggers marking. + // + // E.g., if this is 0.6, then GC should start when the live + // heap has reached 1.6 times the heap size marked by the + // previous cycle. This should be ≤ GOGC/100 so the trigger + // heap size is less than the goal heap size. This is set + // during mark termination for the next cycle's trigger. + // + // Protected by mheap_.lock or a STW. + // + // Used if !goexperiment.PacerRedesign. + triggerRatio float64 + + // trigger is the heap size that triggers marking. + // + // When heapLive ≥ trigger, the mark phase will start. + // This is also the heap size by which proportional sweeping + // must be complete. + // + // This is computed from triggerRatio during mark termination + // for the next cycle's trigger. + // + // Protected by mheap_.lock or a STW. + trigger uint64 + + // consMark is the estimated per-CPU consMark ratio for the application. + // + // It represents the ratio between the application's allocation + // rate, as bytes allocated per CPU-time, and the GC's scan rate, + // as bytes scanned per CPU-time. + // The units of this ratio are (B / cpu-ns) / (B / cpu-ns). 
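As a back-of-the-envelope illustration of those units with assumed numbers, both rates come out in bytes per CPU-nanosecond, so the ratio itself is dimensionless:

package main

import "fmt"

func main() {
	var (
		cycleNS   = 10e6              // wall-clock length of the mark phase, ns (assumed)
		procs     = 4.0               // GOMAXPROCS (assumed)
		util      = 0.25              // fraction of CPU spent marking (assumed)
		allocated = float64(64 << 20) // bytes allocated by the mutator this cycle (assumed)
		scanned   = float64(32 << 20) // bytes of scan work completed this cycle (assumed)
	)
	allocRate := allocated / (cycleNS * procs * (1 - util)) // B per cpu-ns
	scanRate := scanned / (cycleNS * procs * util)          // B per cpu-ns
	fmt.Printf("cons/mark ≈ %.2f\n", allocRate/scanRate)
}
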
+ // + // At a high level, this value is computed as the bytes of memory + // allocated (cons) per unit of scan work completed (mark) in a GC + // cycle, divided by the CPU time spent on each activity. + // + // Updated at the end of each GC cycle, in endCycle. + // + // For goexperiment.PacerRedesign. + consMark float64 + + // consMarkController holds the state for the mark-cons ratio + // estimation over time. + // + // Its purpose is to smooth out noisiness in the computation of + // consMark; see consMark for details. + // + // For goexperiment.PacerRedesign. + consMarkController piController + + _ uint32 // Padding for atomics on 32-bit platforms. + + // heapGoal is the goal heapLive for when next GC ends. + // Set to ^uint64(0) if disabled. + // + // Read and written atomically, unless the world is stopped. + heapGoal uint64 + + // lastHeapGoal is the value of heapGoal for the previous GC. + // Note that this is distinct from the last value heapGoal had, + // because it could change if e.g. gcPercent changes. + // + // Read and written with the world stopped or with mheap_.lock held. + lastHeapGoal uint64 + + // heapLive is the number of bytes considered live by the GC. + // That is: retained by the most recent GC plus allocated + // since then. heapLive ≤ memstats.heapAlloc, since heapAlloc includes + // unmarked objects that have not yet been swept (and hence goes up as we + // allocate and down as we sweep) while heapLive excludes these + // objects (and hence only goes up between GCs). + // + // This is updated atomically without locking. To reduce + // contention, this is updated only when obtaining a span from + // an mcentral and at this point it counts all of the + // unallocated slots in that span (which will be allocated + // before that mcache obtains another span from that + // mcentral). Hence, it slightly overestimates the "true" live + // heap size. It's better to overestimate than to + // underestimate because 1) this triggers the GC earlier than + // necessary rather than potentially too late and 2) this + // leads to a conservative GC rate rather than a GC rate that + // is potentially too low. + // + // Reads should likewise be atomic (or during STW). + // + // Whenever this is updated, call traceHeapAlloc() and + // this gcControllerState's revise() method. + heapLive uint64 + + // heapScan is the number of bytes of "scannable" heap. This + // is the live heap (as counted by heapLive), but omitting + // no-scan objects and no-scan tails of objects. + // + // For !goexperiment.PacerRedesign: Whenever this is updated, + // call this gcControllerState's revise() method. It is read + // and written atomically or with the world stopped. + // + // For goexperiment.PacerRedesign: This value is fixed at the + // start of a GC cycle, so during a GC cycle it is safe to + // read without atomics, and it represents the maximum scannable + // heap. + heapScan uint64 + + // lastHeapScan is the number of bytes of heap that were scanned + // last GC cycle. It is the same as heapMarked, but only + // includes the "scannable" parts of objects. + // + // Updated when the world is stopped. + lastHeapScan uint64 + + // stackScan is a snapshot of scannableStackSize taken at each GC + // STW pause and is used in pacing decisions. + // + // Updated only while the world is stopped. + stackScan uint64 + + // scannableStackSize is the amount of allocated goroutine stack space in + // use by goroutines. 
+ // + // This number tracks allocated goroutine stack space rather than used + // goroutine stack space (i.e. what is actually scanned) because used + // goroutine stack space is much harder to measure cheaply. By using + // allocated space, we make an overestimate; this is OK, it's better + // to conservatively overcount than undercount. + // + // Read and updated atomically. + scannableStackSize uint64 + + // globalsScan is the total amount of global variable space + // that is scannable. + // + // Read and updated atomically. + globalsScan uint64 + + // heapMarked is the number of bytes marked by the previous + // GC. After mark termination, heapLive == heapMarked, but + // unlike heapLive, heapMarked does not change until the + // next mark termination. + heapMarked uint64 + + // heapScanWork is the total heap scan work performed this cycle. + // stackScanWork is the total stack scan work performed this cycle. + // globalsScanWork is the total globals scan work performed this cycle. + // + // These are updated atomically during the cycle. Updates occur in + // bounded batches, since they are both written and read + // throughout the cycle. At the end of the cycle, heapScanWork is how + // much of the retained heap is scannable. + // + // Currently these are measured in bytes. For most uses, this is an + // opaque unit of work, but for estimation the definition is important. + // + // Note that stackScanWork includes all allocated space, not just the + // size of the stack itself, mirroring stackSize. + // + // For !goexperiment.PacerRedesign, stackScanWork and globalsScanWork + // are always zero. + heapScanWork atomic.Int64 + stackScanWork atomic.Int64 + globalsScanWork atomic.Int64 + + // bgScanCredit is the scan work credit accumulated by the + // concurrent background scan. This credit is accumulated by + // the background scan and stolen by mutator assists. This is + // updated atomically. Updates occur in bounded batches, since + // it is both written and read throughout the cycle. + bgScanCredit int64 + + // assistTime is the nanoseconds spent in mutator assists + // during this cycle. This is updated atomically. Updates + // occur in bounded batches, since it is both written and read + // throughout the cycle. + assistTime int64 + + // dedicatedMarkTime is the nanoseconds spent in dedicated + // mark workers during this cycle. This is updated atomically + // at the end of the concurrent mark phase. + dedicatedMarkTime int64 + + // fractionalMarkTime is the nanoseconds spent in the + // fractional mark worker during this cycle. This is updated + // atomically throughout the cycle and will be up-to-date if + // the fractional mark worker is not currently running. + fractionalMarkTime int64 + + // idleMarkTime is the nanoseconds spent in idle marking + // during this cycle. This is updated atomically throughout + // the cycle. + idleMarkTime int64 + + // markStartTime is the absolute start time in nanoseconds + // that assists and background mark workers started. + markStartTime int64 + + // dedicatedMarkWorkersNeeded is the number of dedicated mark + // workers that need to be started. This is computed at the + // beginning of each cycle and decremented atomically as + // dedicated mark workers get started. + dedicatedMarkWorkersNeeded int64 + + // assistWorkPerByte is the ratio of scan work to allocated + // bytes that should be performed by mutator assists. This is + // computed at the beginning of each cycle and updated every + // time heapScan is updated. 
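A rough sketch of how the pair of ratios stored below is meant to be consumed by an assist, with hypothetical standalone names rather than the runtime's assist machinery: one direction converts bytes allocated into scan-work debt, the other converts stolen background credit back into covered allocation.

package main

import "fmt"

func main() {
	// Assumed snapshot of the two ratios published by revise().
	assistWorkPerByte := 0.5 // scan bytes owed per byte allocated
	assistBytesPerWork := 1 / assistWorkPerByte

	allocated := int64(64 << 10) // bytes this goroutine just allocated
	debt := int64(float64(allocated) * assistWorkPerByte)

	stolen := int64(8 << 10) // background scan credit stolen, in scan bytes
	debt -= stolen

	fmt.Println("scan work still owed (bytes):", debt)
	fmt.Println("allocation covered by stolen credit (bytes):",
		int64(float64(stolen)*assistBytesPerWork))
}
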
+ assistWorkPerByte atomic.Float64 + + // assistBytesPerWork is 1/assistWorkPerByte. + // + // Note that because this is read and written independently + // from assistWorkPerByte users may notice a skew between + // the two values, and such a state should be safe. + assistBytesPerWork atomic.Float64 + + // fractionalUtilizationGoal is the fraction of wall clock + // time that should be spent in the fractional mark worker on + // each P that isn't running a dedicated worker. + // + // For example, if the utilization goal is 25% and there are + // no dedicated workers, this will be 0.25. If the goal is + // 25%, there is one dedicated worker, and GOMAXPROCS is 5, + // this will be 0.05 to make up the missing 5%. + // + // If this is zero, no fractional workers are needed. + fractionalUtilizationGoal float64 + + // test indicates that this is a test-only copy of gcControllerState. + test bool + + _ cpu.CacheLinePad +} + +func (c *gcControllerState) init(gcPercent int32) { + c.heapMinimum = defaultHeapMinimum + + if goexperiment.PacerRedesign { + c.consMarkController = piController{ + // Tuned first via the Ziegler-Nichols process in simulation, + // then the integral time was manually tuned against real-world + // applications to deal with noisiness in the measured cons/mark + // ratio. + kp: 0.9, + ti: 4.0, + + // Set a high reset time in GC cycles. + // This is inversely proportional to the rate at which we + // accumulate error from clipping. By making this very high + // we make the accumulation slow. In general, clipping is + // OK in our situation, hence the choice. + // + // Tune this if we get unintended effects from clipping for + // a long time. + tt: 1000, + min: -1000, + max: 1000, + } + } else { + // Set a reasonable initial GC trigger. + c.triggerRatio = 7 / 8.0 + + // Fake a heapMarked value so it looks like a trigger at + // heapMinimum is the appropriate growth from heapMarked. + // This will go into computing the initial GC goal. + c.heapMarked = uint64(float64(c.heapMinimum) / (1 + c.triggerRatio)) + } + + // This will also compute and set the GC trigger and goal. + c.setGCPercent(gcPercent) +} + +// startCycle resets the GC controller's state and computes estimates +// for a new GC cycle. The caller must hold worldsema and the world +// must be stopped. +func (c *gcControllerState) startCycle(markStartTime int64, procs int) { + c.heapScanWork.Store(0) + c.stackScanWork.Store(0) + c.globalsScanWork.Store(0) + c.bgScanCredit = 0 + c.assistTime = 0 + c.dedicatedMarkTime = 0 + c.fractionalMarkTime = 0 + c.idleMarkTime = 0 + c.markStartTime = markStartTime + c.stackScan = atomic.Load64(&c.scannableStackSize) + + // Ensure that the heap goal is at least a little larger than + // the current live heap size. This may not be the case if GC + // start is delayed or if the allocation that pushed gcController.heapLive + // over trigger is large or if the trigger is really close to + // GOGC. Assist is proportional to this distance, so enforce a + // minimum distance, even if it means going over the GOGC goal + // by a tiny bit. + if goexperiment.PacerRedesign { + if c.heapGoal < c.heapLive+64<<10 { + c.heapGoal = c.heapLive + 64<<10 + } + } else { + if c.heapGoal < c.heapLive+1<<20 { + c.heapGoal = c.heapLive + 1<<20 + } + } + + // Compute the background mark utilization goal. In general, + // this may not come out exactly. We round the number of + // dedicated workers so that the utilization is closest to + // 25%. 
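To make that rounding concrete, here is a standalone sketch of the dedicated/fractional split computed below, evaluated for a few GOMAXPROCS values (same 25% goal and 30% error bound, written as assumed local constants):

package main

import "fmt"

func workerSplit(procs int) (dedicated int64, fractional float64) {
	const bgUtil = 0.25
	goal := float64(procs) * bgUtil
	dedicated = int64(goal + 0.5)
	utilError := float64(dedicated)/goal - 1
	const maxUtilError = 0.3
	if utilError < -maxUtilError || utilError > maxUtilError {
		if float64(dedicated) > goal {
			dedicated-- // too many dedicated workers; drop one
		}
		fractional = (goal - float64(dedicated)) / float64(procs)
	}
	return
}

func main() {
	for _, p := range []int{1, 2, 3, 4, 6, 8} {
		d, f := workerSplit(p)
		fmt.Printf("GOMAXPROCS=%d: dedicated=%d fractional=%.3f\n", p, d, f)
	}
}
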
For small GOMAXPROCS, this would introduce too much + // error, so we add fractional workers in that case. + totalUtilizationGoal := float64(procs) * gcBackgroundUtilization + c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5) + utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1 + const maxUtilError = 0.3 + if utilError < -maxUtilError || utilError > maxUtilError { + // Rounding put us more than 30% off our goal. With + // gcBackgroundUtilization of 25%, this happens for + // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional + // workers to compensate. + if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal { + // Too many dedicated workers. + c.dedicatedMarkWorkersNeeded-- + } + c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(procs) + } else { + c.fractionalUtilizationGoal = 0 + } + + // In STW mode, we just want dedicated workers. + if debug.gcstoptheworld > 0 { + c.dedicatedMarkWorkersNeeded = int64(procs) + c.fractionalUtilizationGoal = 0 + } + + // Clear per-P state + for _, p := range allp { + p.gcAssistTime = 0 + p.gcFractionalMarkTime = 0 + } + + // Compute initial values for controls that are updated + // throughout the cycle. + c.revise() + + if debug.gcpacertrace > 0 { + assistRatio := c.assistWorkPerByte.Load() + print("pacer: assist ratio=", assistRatio, + " (scan ", gcController.heapScan>>20, " MB in ", + work.initialHeapLive>>20, "->", + c.heapGoal>>20, " MB)", + " workers=", c.dedicatedMarkWorkersNeeded, + "+", c.fractionalUtilizationGoal, "\n") + } +} + +// revise updates the assist ratio during the GC cycle to account for +// improved estimates. This should be called whenever gcController.heapScan, +// gcController.heapLive, or gcController.heapGoal is updated. It is safe to +// call concurrently, but it may race with other calls to revise. +// +// The result of this race is that the two assist ratio values may not line +// up or may be stale. In practice this is OK because the assist ratio +// moves slowly throughout a GC cycle, and the assist ratio is a best-effort +// heuristic anyway. Furthermore, no part of the heuristic depends on +// the two assist ratio values being exact reciprocals of one another, since +// the two values are used to convert values from different sources. +// +// The worst case result of this raciness is that we may miss a larger shift +// in the ratio (say, if we decide to pace more aggressively against the +// hard heap goal) but even this "hard goal" is best-effort (see #40460). +// The dedicated GC should ensure we don't exceed the hard goal by too much +// in the rare case we do exceed it. +// +// It should only be called when gcBlackenEnabled != 0 (because this +// is when assists are enabled and the necessary statistics are +// available). +func (c *gcControllerState) revise() { + gcPercent := c.gcPercent.Load() + if gcPercent < 0 { + // If GC is disabled but we're running a forced GC, + // act like GOGC is huge for the below calculations. + gcPercent = 100000 + } + live := atomic.Load64(&c.heapLive) + scan := atomic.Load64(&c.heapScan) + work := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load() + + // Assume we're under the soft goal. Pace GC to complete at + // heapGoal assuming the heap is in steady-state. 
+ heapGoal := int64(atomic.Load64(&c.heapGoal)) + + var scanWorkExpected int64 + if goexperiment.PacerRedesign { + // The expected scan work is computed as the amount of bytes scanned last + // GC cycle, plus our estimate of stacks and globals work for this cycle. + scanWorkExpected = int64(c.lastHeapScan + c.stackScan + c.globalsScan) + + // maxScanWork is a worst-case estimate of the amount of scan work that + // needs to be performed in this GC cycle. Specifically, it represents + // the case where *all* scannable memory turns out to be live. + maxScanWork := int64(scan + c.stackScan + c.globalsScan) + if work > scanWorkExpected { + // We've already done more scan work than expected. Because our expectation + // is based on a steady-state scannable heap size, we assume this means our + // heap is growing. Compute a new heap goal that takes our existing runway + // computed for scanWorkExpected and extrapolates it to maxScanWork, the worst-case + // scan work. This keeps our assist ratio stable if the heap continues to grow. + // + // The effect of this mechanism is that assists stay flat in the face of heap + // growths. It's OK to use more memory this cycle to scan all the live heap, + // because the next GC cycle is inevitably going to use *at least* that much + // memory anyway. + extHeapGoal := int64(float64(heapGoal-int64(c.trigger))/float64(scanWorkExpected)*float64(maxScanWork)) + int64(c.trigger) + scanWorkExpected = maxScanWork + + // hardGoal is a hard limit on the amount that we're willing to push back the + // heap goal, and that's twice the heap goal (i.e. if GOGC=100 and the heap and/or + // stacks and/or globals grow to twice their size, this limits the current GC cycle's + // growth to 4x the original live heap's size). + // + // This maintains the invariant that we use no more memory than the next GC cycle + // will anyway. + hardGoal := int64((1.0 + float64(gcPercent)/100.0) * float64(heapGoal)) + if extHeapGoal > hardGoal { + extHeapGoal = hardGoal + } + heapGoal = extHeapGoal + } + if int64(live) > heapGoal { + // We're already past our heap goal, even the extrapolated one. + // Leave ourselves some extra runway, so in the worst case we + // finish by that point. + const maxOvershoot = 1.1 + heapGoal = int64(float64(heapGoal) * maxOvershoot) + + // Compute the upper bound on the scan work remaining. + scanWorkExpected = maxScanWork + } + } else { + // Compute the expected scan work remaining. + // + // This is estimated based on the expected + // steady-state scannable heap. For example, with + // GOGC=100, only half of the scannable heap is + // expected to be live, so that's what we target. + // + // (This is a float calculation to avoid overflowing on + // 100*heapScan.) + scanWorkExpected = int64(float64(scan) * 100 / float64(100+gcPercent)) + if int64(live) > heapGoal || work > scanWorkExpected { + // We're past the soft goal, or we've already done more scan + // work than we expected. Pace GC so that in the worst case it + // will complete by the hard goal. + const maxOvershoot = 1.1 + heapGoal = int64(float64(heapGoal) * maxOvershoot) + + // Compute the upper bound on the scan work remaining. + scanWorkExpected = int64(scan) + } + } + + // Compute the remaining scan work estimate. + // + // Note that we currently count allocations during GC as both + // scannable heap (heapScan) and scan work completed + // (scanWork), so allocation will change this difference + // slowly in the soft regime and not at all in the hard + // regime. 
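A compact numeric sketch of the pre-redesign path through this function with assumed inputs: expected scan work is the steady-state fraction of the scannable heap, and the assist ratio is the remaining work divided by the remaining heap runway.

package main

import "fmt"

func main() {
	var (
		gcPercent = 100.0
		live      = float64(96 << 20)  // gcController.heapLive (assumed)
		scan      = float64(80 << 20)  // gcController.heapScan (assumed)
		workDone  = float64(10 << 20)  // scan work completed so far (assumed)
		heapGoal  = float64(128 << 20) // gcController.heapGoal (assumed)
	)
	// With GOGC=100, only half of the scannable heap is expected to be live.
	scanWorkExpected := scan * 100 / (100 + gcPercent)

	// live is below the goal and workDone below expectations, so the
	// hard-goal overshoot branch above does not kick in here.
	scanWorkRemaining := scanWorkExpected - workDone
	if scanWorkRemaining < 1000 {
		scanWorkRemaining = 1000
	}
	heapRemaining := heapGoal - live

	assistWorkPerByte := scanWorkRemaining / heapRemaining
	fmt.Printf("assist ratio ≈ %.2f scan bytes per allocated byte\n", assistWorkPerByte)
}
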
+ scanWorkRemaining := scanWorkExpected - work + if scanWorkRemaining < 1000 { + // We set a somewhat arbitrary lower bound on + // remaining scan work since if we aim a little high, + // we can miss by a little. + // + // We *do* need to enforce that this is at least 1, + // since marking is racy and double-scanning objects + // may legitimately make the remaining scan work + // negative, even in the hard goal regime. + scanWorkRemaining = 1000 + } + + // Compute the heap distance remaining. + heapRemaining := heapGoal - int64(live) + if heapRemaining <= 0 { + // This shouldn't happen, but if it does, avoid + // dividing by zero or setting the assist negative. + heapRemaining = 1 + } + + // Compute the mutator assist ratio so by the time the mutator + // allocates the remaining heap bytes up to heapGoal, it will + // have done (or stolen) the remaining amount of scan work. + // Note that the assist ratio values are updated atomically + // but not together. This means there may be some degree of + // skew between the two values. This is generally OK as the + // values shift relatively slowly over the course of a GC + // cycle. + assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining) + assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining) + c.assistWorkPerByte.Store(assistWorkPerByte) + c.assistBytesPerWork.Store(assistBytesPerWork) +} + +// endCycle computes the trigger ratio (!goexperiment.PacerRedesign) +// or the consMark estimate (goexperiment.PacerRedesign) for the next cycle. +// Returns the trigger ratio if application, or 0 (goexperiment.PacerRedesign). +// userForced indicates whether the current GC cycle was forced +// by the application. +func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) float64 { + // Record last heap goal for the scavenger. + // We'll be updating the heap goal soon. + gcController.lastHeapGoal = gcController.heapGoal + + // Compute the duration of time for which assists were turned on. + assistDuration := now - c.markStartTime + + // Assume background mark hit its utilization goal. + utilization := gcBackgroundUtilization + // Add assist utilization; avoid divide by zero. + if assistDuration > 0 { + utilization += float64(c.assistTime) / float64(assistDuration*int64(procs)) + } + + if goexperiment.PacerRedesign { + if c.heapLive <= c.trigger { + // Shouldn't happen, but let's be very safe about this in case the + // GC is somehow extremely short. + // + // In this case though, the only reasonable value for c.heapLive-c.trigger + // would be 0, which isn't really all that useful, i.e. the GC was so short + // that it didn't matter. + // + // Ignore this case and don't update anything. + return 0 + } + idleUtilization := 0.0 + if assistDuration > 0 { + idleUtilization = float64(c.idleMarkTime) / float64(assistDuration*int64(procs)) + } + // Determine the cons/mark ratio. + // + // The units we want for the numerator and denominator are both B / cpu-ns. + // We get this by taking the bytes allocated or scanned, and divide by the amount of + // CPU time it took for those operations. For allocations, that CPU time is + // + // assistDuration * procs * (1 - utilization) + // + // Where utilization includes just background GC workers and assists. It does *not* + // include idle GC work time, because in theory the mutator is free to take that at + // any point. 
+ // + // For scanning, that CPU time is + // + // assistDuration * procs * (utilization + idleUtilization) + // + // In this case, we *include* idle utilization, because that is additional CPU time that the + // the GC had available to it. + // + // In effect, idle GC time is sort of double-counted here, but it's very weird compared + // to other kinds of GC work, because of how fluid it is. Namely, because the mutator is + // *always* free to take it. + // + // So this calculation is really: + // (heapLive-trigger) / (assistDuration * procs * (1-utilization)) / + // (scanWork) / (assistDuration * procs * (utilization+idleUtilization) + // + // Note that because we only care about the ratio, assistDuration and procs cancel out. + scanWork := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load() + currentConsMark := (float64(c.heapLive-c.trigger) * (utilization + idleUtilization)) / + (float64(scanWork) * (1 - utilization)) + + // Update cons/mark controller. The time period for this is 1 GC cycle. + // + // This use of a PI controller might seem strange. So, here's an explanation: + // + // currentConsMark represents the consMark we *should've* had to be perfectly + // on-target for this cycle. Given that we assume the next GC will be like this + // one in the steady-state, it stands to reason that we should just pick that + // as our next consMark. In practice, however, currentConsMark is too noisy: + // we're going to be wildly off-target in each GC cycle if we do that. + // + // What we do instead is make a long-term assumption: there is some steady-state + // consMark value, but it's obscured by noise. By constantly shooting for this + // noisy-but-perfect consMark value, the controller will bounce around a bit, + // but its average behavior, in aggregate, should be less noisy and closer to + // the true long-term consMark value, provided its tuned to be slightly overdamped. + var ok bool + oldConsMark := c.consMark + c.consMark, ok = c.consMarkController.next(c.consMark, currentConsMark, 1.0) + if !ok { + // The error spiraled out of control. This is incredibly unlikely seeing + // as this controller is essentially just a smoothing function, but it might + // mean that something went very wrong with how currentConsMark was calculated. + // Just reset consMark and keep going. + c.consMark = 0 + } + + if debug.gcpacertrace > 0 { + printlock() + goal := gcGoalUtilization * 100 + print("pacer: ", int(utilization*100), "% CPU (", int(goal), " exp.) for ") + print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.stackScan+c.globalsScan, " B exp.) ") + print("in ", c.trigger, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(c.heapGoal), ", cons/mark ", oldConsMark, ")") + if !ok { + print("[controller reset]") + } + println() + printunlock() + } + return 0 + } + + // !goexperiment.PacerRedesign below. + + if userForced { + // Forced GC means this cycle didn't start at the + // trigger, so where it finished isn't good + // information about how to adjust the trigger. + // Just leave it where it is. + return c.triggerRatio + } + + // Proportional response gain for the trigger controller. Must + // be in [0, 1]. Lower values smooth out transient effects but + // take longer to respond to phase changes. Higher values + // react to phase changes quickly, but are more affected by + // transient changes. Values near 1 may be unstable. + const triggerGain = 0.5 + + // Compute next cycle trigger ratio. 
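A numeric sketch of that damped update with assumed cycle measurements, mirroring the error term computed just below:

package main

import "fmt"

func main() {
	var (
		triggerGain       = 0.5
		goalUtilization   = 0.30  // pre-redesign goal utilization (0.25 + 0.05)
		triggerRatio      = 0.875 // trigger ratio used this cycle (assumed)
		goalGrowthRatio   = 1.0   // effective GOGC/100 (assumed)
		actualGrowthRatio = 1.2   // heapLive/heapMarked - 1 this cycle (assumed)
		utilization       = 0.35  // measured GC CPU utilization (assumed)
	)
	triggerError := goalGrowthRatio - triggerRatio -
		utilization/goalUtilization*(actualGrowthRatio-triggerRatio)
	next := triggerRatio + triggerGain*triggerError
	// The heap grew past the goal at above-goal utilization, so the error is
	// negative and the next cycle triggers earlier.
	fmt.Printf("trigger error = %+.3f, next trigger ratio = %.3f\n", triggerError, next)
}
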
First, this computes the + // "error" for this cycle; that is, how far off the trigger + // was from what it should have been, accounting for both heap + // growth and GC CPU utilization. We compute the actual heap + // growth during this cycle and scale that by how far off from + // the goal CPU utilization we were (to estimate the heap + // growth if we had the desired CPU utilization). The + // difference between this estimate and the GOGC-based goal + // heap growth is the error. + goalGrowthRatio := c.effectiveGrowthRatio() + actualGrowthRatio := float64(c.heapLive)/float64(c.heapMarked) - 1 + triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio) + + // Finally, we adjust the trigger for next time by this error, + // damped by the proportional gain. + triggerRatio := c.triggerRatio + triggerGain*triggerError + + if debug.gcpacertrace > 0 { + // Print controller state in terms of the design + // document. + H_m_prev := c.heapMarked + h_t := c.triggerRatio + H_T := c.trigger + h_a := actualGrowthRatio + H_a := c.heapLive + h_g := goalGrowthRatio + H_g := int64(float64(H_m_prev) * (1 + h_g)) + u_a := utilization + u_g := gcGoalUtilization + W_a := c.heapScanWork.Load() + print("pacer: H_m_prev=", H_m_prev, + " h_t=", h_t, " H_T=", H_T, + " h_a=", h_a, " H_a=", H_a, + " h_g=", h_g, " H_g=", H_g, + " u_a=", u_a, " u_g=", u_g, + " W_a=", W_a, + " goalΔ=", goalGrowthRatio-h_t, + " actualΔ=", h_a-h_t, + " u_a/u_g=", u_a/u_g, + "\n") + } + + return triggerRatio +} + +// enlistWorker encourages another dedicated mark worker to start on +// another P if there are spare worker slots. It is used by putfull +// when more work is made available. +// +//go:nowritebarrier +func (c *gcControllerState) enlistWorker() { + // If there are idle Ps, wake one so it will run an idle worker. + // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112. + // + // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { + // wakep() + // return + // } + + // There are no idle Ps. If we need more dedicated workers, + // try to preempt a running P so it will switch to a worker. + if c.dedicatedMarkWorkersNeeded <= 0 { + return + } + // Pick a random other P to preempt. + if gomaxprocs <= 1 { + return + } + gp := getg() + if gp == nil || gp.m == nil || gp.m.p == 0 { + return + } + myID := gp.m.p.ptr().id + for tries := 0; tries < 5; tries++ { + id := int32(fastrandn(uint32(gomaxprocs - 1))) + if id >= myID { + id++ + } + p := allp[id] + if p.status != _Prunning { + continue + } + if preemptone(p) { + return + } + } +} + +// findRunnableGCWorker returns a background mark worker for _p_ if it +// should be run. This must only be called when gcBlackenEnabled != 0. +func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { + if gcBlackenEnabled == 0 { + throw("gcControllerState.findRunnable: blackening not enabled") + } + + if !gcMarkWorkAvailable(_p_) { + // No work to be done right now. This can happen at + // the end of the mark phase when there are still + // assists tapering off. Don't bother running a worker + // now because it'll just return immediately. + return nil + } + + // Grab a worker before we commit to running below. + node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node == nil { + // There is at least one worker per P, so normally there are + // enough workers to run on all Ps, if necessary. 
However, once + // a worker enters gcMarkDone it may park without rejoining the + // pool, thus freeing a P with no corresponding worker. + // gcMarkDone never depends on another worker doing work, so it + // is safe to simply do nothing here. + // + // If gcMarkDone bails out without completing the mark phase, + // it will always do so with queued global work. Thus, that P + // will be immediately eligible to re-run the worker G it was + // just using, ensuring work can complete. + return nil + } + + decIfPositive := func(ptr *int64) bool { + for { + v := atomic.Loadint64(ptr) + if v <= 0 { + return false + } + + if atomic.Casint64(ptr, v, v-1) { + return true + } + } + } + + if decIfPositive(&c.dedicatedMarkWorkersNeeded) { + // This P is now dedicated to marking until the end of + // the concurrent mark phase. + _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode + } else if c.fractionalUtilizationGoal == 0 { + // No need for fractional workers. + gcBgMarkWorkerPool.push(&node.node) + return nil + } else { + // Is this P behind on the fractional utilization + // goal? + // + // This should be kept in sync with pollFractionalWorkerExit. + delta := nanotime() - c.markStartTime + if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal { + // Nope. No need to run a fractional worker. + gcBgMarkWorkerPool.push(&node.node) + return nil + } + // Run a fractional worker. + _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode + } + + // Run the background mark worker. + gp := node.gp.ptr() + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp +} + +// resetLive sets up the controller state for the next mark phase after the end +// of the previous one. Must be called after endCycle and before commit, before +// the world is started. +// +// The world must be stopped. +func (c *gcControllerState) resetLive(bytesMarked uint64) { + c.heapMarked = bytesMarked + c.heapLive = bytesMarked + c.heapScan = uint64(c.heapScanWork.Load()) + c.lastHeapScan = uint64(c.heapScanWork.Load()) + + // heapLive was updated, so emit a trace event. + if trace.enabled { + traceHeapAlloc() + } +} + +// logWorkTime updates mark work accounting in the controller by a duration of +// work in nanoseconds. +// +// Safe to execute at any time. +func (c *gcControllerState) logWorkTime(mode gcMarkWorkerMode, duration int64) { + switch mode { + case gcMarkWorkerDedicatedMode: + atomic.Xaddint64(&c.dedicatedMarkTime, duration) + atomic.Xaddint64(&c.dedicatedMarkWorkersNeeded, 1) + case gcMarkWorkerFractionalMode: + atomic.Xaddint64(&c.fractionalMarkTime, duration) + case gcMarkWorkerIdleMode: + atomic.Xaddint64(&c.idleMarkTime, duration) + default: + throw("logWorkTime: unknown mark worker mode") + } +} + +func (c *gcControllerState) update(dHeapLive, dHeapScan int64) { + if dHeapLive != 0 { + atomic.Xadd64(&gcController.heapLive, dHeapLive) + if trace.enabled { + // gcController.heapLive changed. + traceHeapAlloc() + } + } + // Only update heapScan in the new pacer redesign if we're not + // currently in a GC. + if !goexperiment.PacerRedesign || gcBlackenEnabled == 0 { + if dHeapScan != 0 { + atomic.Xadd64(&gcController.heapScan, dHeapScan) + } + } + if gcBlackenEnabled != 0 { + // gcController.heapLive and heapScan changed. 
+ c.revise() + } +} + +func (c *gcControllerState) addScannableStack(pp *p, amount int64) { + if pp == nil { + atomic.Xadd64(&c.scannableStackSize, amount) + return + } + pp.scannableStackSizeDelta += amount + if pp.scannableStackSizeDelta >= scannableStackSizeSlack || pp.scannableStackSizeDelta <= -scannableStackSizeSlack { + atomic.Xadd64(&c.scannableStackSize, pp.scannableStackSizeDelta) + pp.scannableStackSizeDelta = 0 + } +} + +func (c *gcControllerState) addGlobals(amount int64) { + atomic.Xadd64(&c.globalsScan, amount) +} + +// commit recomputes all pacing parameters from scratch, namely +// absolute trigger, the heap goal, mark pacing, and sweep pacing. +// +// If goexperiment.PacerRedesign is true, triggerRatio is ignored. +// +// This can be called any time. If GC is the in the middle of a +// concurrent phase, it will adjust the pacing of that phase. +// +// This depends on gcPercent, gcController.heapMarked, and +// gcController.heapLive. These must be up to date. +// +// mheap_.lock must be held or the world must be stopped. +func (c *gcControllerState) commit(triggerRatio float64) { + if !c.test { + assertWorldStoppedOrLockHeld(&mheap_.lock) + } + + if !goexperiment.PacerRedesign { + c.oldCommit(triggerRatio) + return + } + + // Compute the next GC goal, which is when the allocated heap + // has grown by GOGC/100 over where it started the last cycle, + // plus additional runway for non-heap sources of GC work. + goal := ^uint64(0) + if gcPercent := c.gcPercent.Load(); gcPercent >= 0 { + goal = c.heapMarked + (c.heapMarked+atomic.Load64(&c.stackScan)+atomic.Load64(&c.globalsScan))*uint64(gcPercent)/100 + } + + // Don't trigger below the minimum heap size. + minTrigger := c.heapMinimum + if !isSweepDone() { + // Concurrent sweep happens in the heap growth + // from gcController.heapLive to trigger, so ensure + // that concurrent sweep has some heap growth + // in which to perform sweeping before we + // start the next GC cycle. + sweepMin := atomic.Load64(&c.heapLive) + sweepMinHeapDistance + if sweepMin > minTrigger { + minTrigger = sweepMin + } + } + + // If we let the trigger go too low, then if the application + // is allocating very rapidly we might end up in a situation + // where we're allocating black during a nearly always-on GC. + // The result of this is a growing heap and ultimately an + // increase in RSS. By capping us at a point >0, we're essentially + // saying that we're OK using more CPU during the GC to prevent + // this growth in RSS. + // + // The current constant was chosen empirically: given a sufficiently + // fast/scalable allocator with 48 Ps that could drive the trigger ratio + // to <0.05, this constant causes applications to retain the same peak + // RSS compared to not having this allocator. + if triggerBound := uint64(0.7*float64(goal-c.heapMarked)) + c.heapMarked; minTrigger < triggerBound { + minTrigger = triggerBound + } + + // For small heaps, set the max trigger point at 95% of the heap goal. + // This ensures we always have *some* headroom when the GC actually starts. + // For larger heaps, set the max trigger point at the goal, minus the + // minimum heap size. + // This choice follows from the fact that the minimum heap size is chosen + // to reflect the costs of a GC with no work to do. With a large heap but + // very little scan work to perform, this gives us exactly as much runway + // as we would need, in the worst case. 
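The runway-based trigger chosen further down can be sanity-checked with assumed numbers; this sketch omits the clamping into [minTrigger, maxTrigger] that the real function performs afterwards.

package main

import "fmt"

func main() {
	var (
		goalUtilization = 0.25
		consMark        = 0.3               // estimated cons/mark ratio (assumed)
		expectedScan    = float64(48 << 20) // lastHeapScan + stackScan + globalsScan (assumed)
		heapGoal        = float64(128 << 20)
	)
	// Allocation the mutator can do while the GC finishes its expected scan
	// work, given the desired 25%/75% CPU split between GC and mutator.
	runway := consMark * (1 - goalUtilization) / goalUtilization * expectedScan

	trigger := heapGoal - runway
	if trigger < 0 {
		trigger = 0
	}
	fmt.Printf("runway ≈ %.0f MiB, trigger ≈ %.0f MiB (goal %.0f MiB)\n",
		runway/(1<<20), trigger/(1<<20), heapGoal/(1<<20))
}
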
+ maxRunway := uint64(0.95 * float64(goal-c.heapMarked)) + if largeHeapMaxRunway := goal - c.heapMinimum; goal > c.heapMinimum && maxRunway < largeHeapMaxRunway { + maxRunway = largeHeapMaxRunway + } + maxTrigger := maxRunway + c.heapMarked + if maxTrigger < minTrigger { + maxTrigger = minTrigger + } + + // Compute the trigger by using our estimate of the cons/mark ratio. + // + // The idea is to take our expected scan work, and multiply it by + // the cons/mark ratio to determine how long it'll take to complete + // that scan work in terms of bytes allocated. This gives us our GC's + // runway. + // + // However, the cons/mark ratio is a ratio of rates per CPU-second, but + // here we care about the relative rates for some division of CPU + // resources among the mutator and the GC. + // + // To summarize, we have B / cpu-ns, and we want B / ns. We get that + // by multiplying by our desired division of CPU resources. We choose + // to express CPU resources as GOMAPROCS*fraction. Note that because + // we're working with a ratio here, we can omit the number of CPU cores, + // because they'll appear in the numerator and denominator and cancel out. + // As a result, this is basically just "weighing" the cons/mark ratio by + // our desired division of resources. + // + // Furthermore, by setting the trigger so that CPU resources are divided + // this way, assuming that the cons/mark ratio is correct, we make that + // division a reality. + var trigger uint64 + runway := uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.stackScan+c.globalsScan)) + if runway > goal { + trigger = minTrigger + } else { + trigger = goal - runway + } + if trigger < minTrigger { + trigger = minTrigger + } + if trigger > maxTrigger { + trigger = maxTrigger + } + if trigger > goal { + goal = trigger + } + + // Commit to the trigger and goal. + c.trigger = trigger + atomic.Store64(&c.heapGoal, goal) + if trace.enabled { + traceHeapGoal() + } + + // Update mark pacing. + if gcphase != _GCoff { + c.revise() + } +} + +// oldCommit sets the trigger ratio and updates everything +// derived from it: the absolute trigger, the heap goal, mark pacing, +// and sweep pacing. +// +// This can be called any time. If GC is the in the middle of a +// concurrent phase, it will adjust the pacing of that phase. +// +// This depends on gcPercent, gcController.heapMarked, and +// gcController.heapLive. These must be up to date. +// +// For !goexperiment.PacerRedesign. +func (c *gcControllerState) oldCommit(triggerRatio float64) { + gcPercent := c.gcPercent.Load() + + // Compute the next GC goal, which is when the allocated heap + // has grown by GOGC/100 over the heap marked by the last + // cycle. + goal := ^uint64(0) + if gcPercent >= 0 { + goal = c.heapMarked + c.heapMarked*uint64(gcPercent)/100 + } + + // Set the trigger ratio, capped to reasonable bounds. + if gcPercent >= 0 { + scalingFactor := float64(gcPercent) / 100 + // Ensure there's always a little margin so that the + // mutator assist ratio isn't infinity. + maxTriggerRatio := 0.95 * scalingFactor + if triggerRatio > maxTriggerRatio { + triggerRatio = maxTriggerRatio + } + + // If we let triggerRatio go too low, then if the application + // is allocating very rapidly we might end up in a situation + // where we're allocating black during a nearly always-on GC. + // The result of this is a growing heap and ultimately an + // increase in RSS. 
By capping us at a point >0, we're essentially + // saying that we're OK using more CPU during the GC to prevent + // this growth in RSS. + // + // The current constant was chosen empirically: given a sufficiently + // fast/scalable allocator with 48 Ps that could drive the trigger ratio + // to <0.05, this constant causes applications to retain the same peak + // RSS compared to not having this allocator. + minTriggerRatio := 0.6 * scalingFactor + if triggerRatio < minTriggerRatio { + triggerRatio = minTriggerRatio + } + } else if triggerRatio < 0 { + // gcPercent < 0, so just make sure we're not getting a negative + // triggerRatio. This case isn't expected to happen in practice, + // and doesn't really matter because if gcPercent < 0 then we won't + // ever consume triggerRatio further on in this function, but let's + // just be defensive here; the triggerRatio being negative is almost + // certainly undesirable. + triggerRatio = 0 + } + c.triggerRatio = triggerRatio + + // Compute the absolute GC trigger from the trigger ratio. + // + // We trigger the next GC cycle when the allocated heap has + // grown by the trigger ratio over the marked heap size. + trigger := ^uint64(0) + if gcPercent >= 0 { + trigger = uint64(float64(c.heapMarked) * (1 + triggerRatio)) + // Don't trigger below the minimum heap size. + minTrigger := c.heapMinimum + if !isSweepDone() { + // Concurrent sweep happens in the heap growth + // from gcController.heapLive to trigger, so ensure + // that concurrent sweep has some heap growth + // in which to perform sweeping before we + // start the next GC cycle. + sweepMin := atomic.Load64(&c.heapLive) + sweepMinHeapDistance + if sweepMin > minTrigger { + minTrigger = sweepMin + } + } + if trigger < minTrigger { + trigger = minTrigger + } + if int64(trigger) < 0 { + print("runtime: heapGoal=", c.heapGoal, " heapMarked=", c.heapMarked, " gcController.heapLive=", c.heapLive, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n") + throw("trigger underflow") + } + if trigger > goal { + // The trigger ratio is always less than GOGC/100, but + // other bounds on the trigger may have raised it. + // Push up the goal, too. + goal = trigger + } + } + + // Commit to the trigger and goal. + c.trigger = trigger + atomic.Store64(&c.heapGoal, goal) + if trace.enabled { + traceHeapGoal() + } + + // Update mark pacing. + if gcphase != _GCoff { + c.revise() + } +} + +// effectiveGrowthRatio returns the current effective heap growth +// ratio (GOGC/100) based on heapMarked from the previous GC and +// heapGoal for the current GC. +// +// This may differ from gcPercent/100 because of various upper and +// lower bounds on gcPercent. For example, if the heap is smaller than +// heapMinimum, this can be higher than gcPercent/100. +// +// mheap_.lock must be held or the world must be stopped. +func (c *gcControllerState) effectiveGrowthRatio() float64 { + if !c.test { + assertWorldStoppedOrLockHeld(&mheap_.lock) + } + + egogc := float64(atomic.Load64(&c.heapGoal)-c.heapMarked) / float64(c.heapMarked) + if egogc < 0 { + // Shouldn't happen, but just in case. + egogc = 0 + } + return egogc +} + +// setGCPercent updates gcPercent and all related pacer state. +// Returns the old value of gcPercent. +// +// Calls gcControllerState.commit. +// +// The world must be stopped, or mheap_.lock must be held. 
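This is the internal half of runtime/debug.SetGCPercent, which is linked in just below; a minimal usage sketch of the exported API:

package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// Collect when the heap grows 50% over the live heap of the last cycle.
	old := debug.SetGCPercent(50)
	fmt.Println("previous GOGC:", old)

	// -1 is equivalent to GOGC=off: it disables the GC; the wrapper shown
	// next also waits for any in-flight mark phase to finish.
	debug.SetGCPercent(-1)

	debug.SetGCPercent(old) // restore
}
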
+func (c *gcControllerState) setGCPercent(in int32) int32 { + if !c.test { + assertWorldStoppedOrLockHeld(&mheap_.lock) + } + + out := c.gcPercent.Load() + if in < 0 { + in = -1 + } + c.heapMinimum = defaultHeapMinimum * uint64(in) / 100 + c.gcPercent.Store(in) + // Update pacing in response to gcPercent change. + c.commit(c.triggerRatio) + + return out +} + +//go:linkname setGCPercent runtime/debug.setGCPercent +func setGCPercent(in int32) (out int32) { + // Run on the system stack since we grab the heap lock. + systemstack(func() { + lock(&mheap_.lock) + out = gcController.setGCPercent(in) + gcPaceSweeper(gcController.trigger) + gcPaceScavenger(gcController.heapGoal, gcController.lastHeapGoal) + unlock(&mheap_.lock) + }) + + // If we just disabled GC, wait for any concurrent GC mark to + // finish so we always return with no GC running. + if in < 0 { + gcWaitOnMark(atomic.Load(&work.cycles)) + } + + return out +} + +func readGOGC() int32 { + p := gogetenv("GOGC") + if p == "off" { + return -1 + } + if n, ok := atoi32(p); ok { + return n + } + return 100 +} + +type piController struct { + kp float64 // Proportional constant. + ti float64 // Integral time constant. + tt float64 // Reset time. + + min, max float64 // Output boundaries. + + // PI controller state. + + errIntegral float64 // Integral of the error from t=0 to now. + + // Error flags. + errOverflow bool // Set if errIntegral ever overflowed. + inputOverflow bool // Set if an operation with the input overflowed. +} + +// next provides a new sample to the controller. +// +// input is the sample, setpoint is the desired point, and period is how much +// time (in whatever unit makes the most sense) has passed since the last sample. +// +// Returns a new value for the variable it's controlling, and whether the operation +// completed successfully. One reason this might fail is if error has been growing +// in an unbounded manner, to the point of overflow. +// +// In the specific case of an error overflow occurs, the errOverflow field will be +// set and the rest of the controller's internal state will be fully reset. +func (c *piController) next(input, setpoint, period float64) (float64, bool) { + // Compute the raw output value. + prop := c.kp * (setpoint - input) + rawOutput := prop + c.errIntegral + + // Clamp rawOutput into output. + output := rawOutput + if isInf(output) || isNaN(output) { + // The input had a large enough magnitude that either it was already + // overflowed, or some operation with it overflowed. + // Set a flag and reset. That's the safest thing to do. + c.reset() + c.inputOverflow = true + return c.min, false + } + if output < c.min { + output = c.min + } else if output > c.max { + output = c.max + } + + // Update the controller's state. + if c.ti != 0 && c.tt != 0 { + c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput) + if isInf(c.errIntegral) || isNaN(c.errIntegral) { + // So much error has accumulated that we managed to overflow. + // The assumptions around the controller have likely broken down. + // Set a flag and reset. That's the safest thing to do. + c.reset() + c.errOverflow = true + return c.min, false + } + } + return output, true +} + +// reset resets the controller state, except for controller error flags. 
+func (c *piController) reset() { + c.errIntegral = 0 +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcscavenge.go b/contrib/go/_std_1.18/src/runtime/mgcscavenge.go new file mode 100644 index 0000000000..5f50378adf --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcscavenge.go @@ -0,0 +1,1008 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Scavenging free pages. +// +// This file implements scavenging (the release of physical pages backing mapped +// memory) of free and unused pages in the heap as a way to deal with page-level +// fragmentation and reduce the RSS of Go applications. +// +// Scavenging in Go happens on two fronts: there's the background +// (asynchronous) scavenger and the heap-growth (synchronous) scavenger. +// +// The former happens on a goroutine much like the background sweeper which is +// soft-capped at using scavengePercent of the mutator's time, based on +// order-of-magnitude estimates of the costs of scavenging. The background +// scavenger's primary goal is to bring the estimated heap RSS of the +// application down to a goal. +// +// That goal is defined as: +// (retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * last_heap_inuse +// +// Essentially, we wish to have the application's RSS track the heap goal, but +// the heap goal is defined in terms of bytes of objects, rather than pages like +// RSS. As a result, we need to take into account for fragmentation internal to +// spans. heapGoal / lastHeapGoal defines the ratio between the current heap goal +// and the last heap goal, which tells us by how much the heap is growing and +// shrinking. We estimate what the heap will grow to in terms of pages by taking +// this ratio and multiplying it by heap_inuse at the end of the last GC, which +// allows us to account for this additional fragmentation. Note that this +// procedure makes the assumption that the degree of fragmentation won't change +// dramatically over the next GC cycle. Overestimating the amount of +// fragmentation simply results in higher memory use, which will be accounted +// for by the next pacing up date. Underestimating the fragmentation however +// could lead to performance degradation. Handling this case is not within the +// scope of the scavenger. Situations where the amount of fragmentation balloons +// over the course of a single GC cycle should be considered pathologies, +// flagged as bugs, and fixed appropriately. +// +// An additional factor of retainExtraPercent is added as a buffer to help ensure +// that there's more unscavenged memory to allocate out of, since each allocation +// out of scavenged memory incurs a potentially expensive page fault. +// +// The goal is updated after each GC and the scavenger's pacing parameters +// (which live in mheap_) are updated to match. The pacing parameters work much +// like the background sweeping parameters. The parameters define a line whose +// horizontal axis is time and vertical axis is estimated heap RSS, and the +// scavenger attempts to stay below that line at all times. +// +// The synchronous heap-growth scavenging happens whenever the heap grows in +// size, for some definition of heap-growth. 
The intuition behind this is that +// the application had to grow the heap because existing fragments were +// not sufficiently large to satisfy a page-level memory allocation, so we +// scavenge those fragments eagerly to offset the growth in RSS that results. + +package runtime + +import ( + "internal/goos" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +const ( + // The background scavenger is paced according to these parameters. + // + // scavengePercent represents the portion of mutator time we're willing + // to spend on scavenging in percent. + scavengePercent = 1 // 1% + + // retainExtraPercent represents the amount of memory over the heap goal + // that the scavenger should keep as a buffer space for the allocator. + // + // The purpose of maintaining this overhead is to have a greater pool of + // unscavenged memory available for allocation (since using scavenged memory + // incurs an additional cost), to account for heap fragmentation and + // the ever-changing layout of the heap. + retainExtraPercent = 10 + + // maxPagesPerPhysPage is the maximum number of supported runtime pages per + // physical page, based on maxPhysPageSize. + maxPagesPerPhysPage = maxPhysPageSize / pageSize + + // scavengeCostRatio is the approximate ratio between the costs of using previously + // scavenged memory and scavenging memory. + // + // For most systems the cost of scavenging greatly outweighs the costs + // associated with using scavenged memory, making this constant 0. On other systems + // (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial. + // + // This ratio is used as part of multiplicative factor to help the scavenger account + // for the additional costs of using scavenged memory in its pacing. + scavengeCostRatio = 0.7 * (goos.IsDarwin + goos.IsIos) + + // scavengeReservationShards determines the amount of memory the scavenger + // should reserve for scavenging at a time. Specifically, the amount of + // memory reserved is (heap size in bytes) / scavengeReservationShards. + scavengeReservationShards = 64 +) + +// heapRetained returns an estimate of the current heap RSS. +func heapRetained() uint64 { + return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released) +} + +// gcPaceScavenger updates the scavenger's pacing, particularly +// its rate and RSS goal. For this, it requires the current heapGoal, +// and the heapGoal for the previous GC cycle. +// +// The RSS goal is based on the current heap goal with a small overhead +// to accommodate non-determinism in the allocator. +// +// The pacing is based on scavengePageRate, which applies to both regular and +// huge pages. See that constant for more information. +// +// Must be called whenever GC pacing is updated. +// +// mheap_.lock must be held or the world must be stopped. +func gcPaceScavenger(heapGoal, lastHeapGoal uint64) { + assertWorldStoppedOrLockHeld(&mheap_.lock) + + // If we're called before the first GC completed, disable scavenging. + // We never scavenge before the 2nd GC cycle anyway (we don't have enough + // information about the heap yet) so this is fine, and avoids a fault + // or garbage data later. + if lastHeapGoal == 0 { + atomic.Store64(&mheap_.scavengeGoal, ^uint64(0)) + return + } + // Compute our scavenging goal. + goalRatio := float64(heapGoal) / float64(lastHeapGoal) + retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) + // Add retainExtraPercent overhead to retainedGoal. 
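A quick numeric check of the goal computation above, using assumed values for the last cycle's statistics:

package main

import "fmt"

func main() {
	var (
		lastHeapInuse = uint64(200 << 20) // heap_inuse at the end of the last GC (assumed)
		heapGoal      = uint64(220 << 20)
		lastHeapGoal  = uint64(200 << 20)
		physPageSize  = uint64(4096)
	)
	const retainExtraPercent = 10

	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	retainedGoal := uint64(float64(lastHeapInuse) * goalRatio)

	// Add the retainExtraPercent buffer via an integer division by 10,
	// i.e. +10%, matching the expression used above.
	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))

	// Round up to a physical page boundary.
	retainedGoal = (retainedGoal + physPageSize - 1) &^ (physPageSize - 1)

	fmt.Printf("scavenge goal ≈ %d MiB\n", retainedGoal>>20)
}
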
This calculation + // looks strange but the purpose is to arrive at an integer division + // (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8) + // that also avoids the overflow from a multiplication. + retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0)) + // Align it to a physical page boundary to make the following calculations + // a bit more exact. + retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1) + + // Represents where we are now in the heap's contribution to RSS in bytes. + // + // Guaranteed to always be a multiple of physPageSize on systems where + // physPageSize <= pageSize since we map heap_sys at a rate larger than + // any physPageSize and released memory in multiples of the physPageSize. + // + // However, certain functions recategorize heap_sys as other stats (e.g. + // stack_sys) and this happens in multiples of pageSize, so on systems + // where physPageSize > pageSize the calculations below will not be exact. + // Generally this is OK since we'll be off by at most one regular + // physical page. + retainedNow := heapRetained() + + // If we're already below our goal, or within one page of our goal, then disable + // the background scavenger. We disable the background scavenger if there's + // less than one physical page of work to do because it's not worth it. + if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) { + atomic.Store64(&mheap_.scavengeGoal, ^uint64(0)) + return + } + atomic.Store64(&mheap_.scavengeGoal, retainedGoal) +} + +// Sleep/wait state of the background scavenger. +var scavenge struct { + lock mutex + g *g + parked bool + timer *timer + sysmonWake uint32 // Set atomically. + printControllerReset bool // Whether the scavenger is in cooldown. +} + +// readyForScavenger signals sysmon to wake the scavenger because +// there may be new work to do. +// +// There may be a significant delay between when this function runs +// and when the scavenger is kicked awake, but it may be safely invoked +// in contexts where wakeScavenger is unsafe to call directly. +func readyForScavenger() { + atomic.Store(&scavenge.sysmonWake, 1) +} + +// wakeScavenger immediately unparks the scavenger if necessary. +// +// May run without a P, but it may allocate, so it must not be called +// on any allocation path. +// +// mheap_.lock, scavenge.lock, and sched.lock must not be held. +func wakeScavenger() { + lock(&scavenge.lock) + if scavenge.parked { + // Notify sysmon that it shouldn't bother waking up the scavenger. + atomic.Store(&scavenge.sysmonWake, 0) + + // Try to stop the timer but we don't really care if we succeed. + // It's possible that either a timer was never started, or that + // we're racing with it. + // In the case that we're racing with there's the low chance that + // we experience a spurious wake-up of the scavenger, but that's + // totally safe. + stopTimer(scavenge.timer) + + // Unpark the goroutine and tell it that there may have been a pacing + // change. Note that we skip the scheduler's runnext slot because we + // want to avoid having the scavenger interfere with the fair + // scheduling of user goroutines. In effect, this schedules the + // scavenger at a "lower priority" but that's OK because it'll + // catch up on the work it missed when it does get scheduled. + scavenge.parked = false + + // Ready the goroutine by injecting it. We use injectglist instead + // of ready or goready in order to allow us to run this function + // without a P. 
injectglist also avoids placing the goroutine in + // the current P's runnext slot, which is desirable to prevent + // the scavenger from interfering with user goroutine scheduling + // too much. + var list gList + list.push(scavenge.g) + injectglist(&list) + } + unlock(&scavenge.lock) +} + +// scavengeSleep attempts to put the scavenger to sleep for ns. +// +// Note that this function should only be called by the scavenger. +// +// The scavenger may be woken up earlier by a pacing change, and it may not go +// to sleep at all if there's a pending pacing change. +// +// Returns the amount of time actually slept. +func scavengeSleep(ns int64) int64 { + lock(&scavenge.lock) + + // Set the timer. + // + // This must happen here instead of inside gopark + // because we can't close over any variables without + // failing escape analysis. + start := nanotime() + resetTimer(scavenge.timer, start+ns) + + // Mark ourself as asleep and go to sleep. + scavenge.parked = true + goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2) + + // Return how long we actually slept for. + return nanotime() - start +} + +// Background scavenger. +// +// The background scavenger maintains the RSS of the application below +// the line described by the proportional scavenging statistics in +// the mheap struct. +func bgscavenge(c chan int) { + scavenge.g = getg() + + lockInit(&scavenge.lock, lockRankScavenge) + lock(&scavenge.lock) + scavenge.parked = true + + scavenge.timer = new(timer) + scavenge.timer.f = func(_ any, _ uintptr) { + wakeScavenger() + } + + c <- 1 + goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1) + + // idealFraction is the ideal % of overall application CPU time that we + // spend scavenging. + idealFraction := float64(scavengePercent) / 100.0 + + // Input: fraction of CPU time used. + // Setpoint: idealFraction. + // Output: ratio of critical time to sleep time (determines sleep time). + // + // The output of this controller is somewhat indirect to what we actually + // want to achieve: how much time to sleep for. The reason for this definition + // is to ensure that the controller's outputs have a direct relationship with + // its inputs (as opposed to an inverse relationship), making it somewhat + // easier to reason about for tuning purposes. + critSleepController := piController{ + // Tuned loosely via Ziegler-Nichols process. + kp: 0.3375, + ti: 3.2e6, + tt: 1e9, // 1 second reset time. + + // These ranges seem wide, but we want to give the controller plenty of + // room to hunt for the optimal value. + min: 0.001, // 1:1000 + max: 1000.0, // 1000:1 + } + // It doesn't really matter what value we start at, but we can't be zero, because + // that'll cause divide-by-zero issues. Pick something conservative which we'll + // also use as a fallback. + const startingCritSleepRatio = 0.001 + critSleepRatio := startingCritSleepRatio + // Duration left in nanoseconds during which we avoid using the controller and + // we hold critSleepRatio at a conservative value. Used if the controller's + // assumptions fail to hold. + controllerCooldown := int64(0) + for { + released := uintptr(0) + crit := float64(0) + + // Spend at least 1 ms scavenging, otherwise the corresponding + // sleep time to maintain our desired utilization is too low to + // be reliable. + const minCritTime = 1e6 + for crit < minCritTime { + // If background scavenging is disabled or if there's no work to do just park. 
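+			// (Editor's note, not part of the upstream file: gcPaceScavenger above
+			// stores ^uint64(0) into mheap_.scavengeGoal to mean "scavenging
+			// disabled", so the single comparison below covers both the disabled
+			// case and the goal-already-met case.)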
+ retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal) + if retained <= goal { + break + } + + // scavengeQuantum is the amount of memory we try to scavenge + // in one go. A smaller value means the scavenger is more responsive + // to the scheduler in case of e.g. preemption. A larger value means + // that the overheads of scavenging are better amortized, so better + // scavenging throughput. + // + // The current value is chosen assuming a cost of ~10µs/physical page + // (this is somewhat pessimistic), which implies a worst-case latency of + // about 160µs for 4 KiB physical pages. The current value is biased + // toward latency over throughput. + const scavengeQuantum = 64 << 10 + + // Accumulate the amount of time spent scavenging. + start := nanotime() + r := mheap_.pages.scavenge(scavengeQuantum) + atomic.Xadduintptr(&mheap_.pages.scav.released, r) + end := nanotime() + + // On some platforms we may see end >= start if the time it takes to scavenge + // memory is less than the minimum granularity of its clock (e.g. Windows) or + // due to clock bugs. + // + // In this case, just assume scavenging takes 10 µs per regular physical page + // (determined empirically), and conservatively ignore the impact of huge pages + // on timing. + const approxCritNSPerPhysicalPage = 10e3 + if end <= start { + crit += approxCritNSPerPhysicalPage * float64(r/physPageSize) + } else { + crit += float64(end - start) + } + released += r + + // When using fake time just do one loop. + if faketime != 0 { + break + } + } + + if released == 0 { + lock(&scavenge.lock) + scavenge.parked = true + goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1) + continue + } + + if released < physPageSize { + // If this happens, it means that we may have attempted to release part + // of a physical page, but the likely effect of that is that it released + // the whole physical page, some of which may have still been in-use. + // This could lead to memory corruption. Throw. + throw("released less than one physical page of memory") + } + + if crit < minCritTime { + // This means there wasn't enough work to actually fill up minCritTime. + // That's fine; we shouldn't try to do anything with this information + // because it's going result in a short enough sleep request that things + // will get messy. Just assume we did at least this much work. + // All this means is that we'll sleep longer than we otherwise would have. + crit = minCritTime + } + + // Multiply the critical time by 1 + the ratio of the costs of using + // scavenged memory vs. scavenging memory. This forces us to pay down + // the cost of reusing this memory eagerly by sleeping for a longer period + // of time and scavenging less frequently. More concretely, we avoid situations + // where we end up scavenging so often that we hurt allocation performance + // because of the additional overheads of using scavenged memory. + crit *= 1 + scavengeCostRatio + + // Go to sleep based on how much time we spent doing work. + slept := scavengeSleep(int64(crit / critSleepRatio)) + + // Stop here if we're cooling down from the controller. + if controllerCooldown > 0 { + // crit and slept aren't exact measures of time, but it's OK to be a bit + // sloppy here. We're just hoping we're avoiding some transient bad behavior. + t := slept + int64(crit) + if t > controllerCooldown { + controllerCooldown = 0 + } else { + controllerCooldown -= t + } + continue + } + + // Calculate the CPU time spent. 
+ // + // This may be slightly inaccurate with respect to GOMAXPROCS, but we're + // recomputing this often enough relative to GOMAXPROCS changes in general + // (it only changes when the world is stopped, and not during a GC) that + // that small inaccuracy is in the noise. + cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs)) + + // Update the critSleepRatio, adjusting until we reach our ideal fraction. + var ok bool + critSleepRatio, ok = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit) + if !ok { + // The core assumption of the controller, that we can get a proportional + // response, broke down. This may be transient, so temporarily switch to + // sleeping a fixed, conservative amount. + critSleepRatio = startingCritSleepRatio + controllerCooldown = 5e9 // 5 seconds. + + // Signal the scav trace printer to output this. + lock(&scavenge.lock) + scavenge.printControllerReset = true + unlock(&scavenge.lock) + } + } +} + +// scavenge scavenges nbytes worth of free pages, starting with the +// highest address first. Successive calls continue from where it left +// off until the heap is exhausted. Call scavengeStartGen to bring it +// back to the top of the heap. +// +// Returns the amount of memory scavenged in bytes. +func (p *pageAlloc) scavenge(nbytes uintptr) uintptr { + var ( + addrs addrRange + gen uint32 + ) + released := uintptr(0) + for released < nbytes { + if addrs.size() == 0 { + if addrs, gen = p.scavengeReserve(); addrs.size() == 0 { + break + } + } + systemstack(func() { + r, a := p.scavengeOne(addrs, nbytes-released) + released += r + addrs = a + }) + } + // Only unreserve the space which hasn't been scavenged or searched + // to ensure we always make progress. + p.scavengeUnreserve(addrs, gen) + return released +} + +// printScavTrace prints a scavenge trace line to standard error. +// +// released should be the amount of memory released since the last time this +// was called, and forced indicates whether the scavenge was forced by the +// application. +// +// scavenge.lock must be held. +func printScavTrace(gen uint32, released uintptr, forced bool) { + assertLockHeld(&scavenge.lock) + + printlock() + print("scav ", gen, " ", + released>>10, " KiB work, ", + atomic.Load64(&memstats.heap_released)>>10, " KiB total, ", + (atomic.Load64(&memstats.heap_inuse)*100)/heapRetained(), "% util", + ) + if forced { + print(" (forced)") + } else if scavenge.printControllerReset { + print(" [controller reset]") + scavenge.printControllerReset = false + } + println() + printunlock() +} + +// scavengeStartGen starts a new scavenge generation, resetting +// the scavenger's search space to the full in-use address space. +// +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) scavengeStartGen() { + assertLockHeld(p.mheapLock) + + lock(&p.scav.lock) + if debug.scavtrace > 0 { + printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false) + } + p.inUse.cloneInto(&p.scav.inUse) + + // Pick the new starting address for the scavenger cycle. + var startAddr offAddr + if p.scav.scavLWM.lessThan(p.scav.freeHWM) { + // The "free" high watermark exceeds the "scavenged" low watermark, + // so there are free scavengable pages in parts of the address space + // that the scavenger already searched, the high watermark being the + // highest one. Pick that as our new starting point to ensure we + // see those pages. 
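+		// (Editor's note, not part of the upstream file: freeHWM records the
+		// highest address freed back to the page allocator since the last
+		// generation and scavLWM the lowest address scavenged within it; both
+		// are reset to minOffAddr and maxOffAddr at the end of this function.)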
+ startAddr = p.scav.freeHWM + } else { + // The "free" high watermark does not exceed the "scavenged" low + // watermark. This means the allocator didn't free any memory in + // the range we scavenged last cycle, so we might as well continue + // scavenging from where we were. + startAddr = p.scav.scavLWM + } + p.scav.inUse.removeGreaterEqual(startAddr.addr()) + + // reservationBytes may be zero if p.inUse.totalBytes is small, or if + // scavengeReservationShards is large. This case is fine as the scavenger + // will simply be turned off, but it does mean that scavengeReservationShards, + // in concert with pallocChunkBytes, dictates the minimum heap size at which + // the scavenger triggers. In practice this minimum is generally less than an + // arena in size, so virtually every heap has the scavenger on. + p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards + p.scav.gen++ + atomic.Storeuintptr(&p.scav.released, 0) + p.scav.freeHWM = minOffAddr + p.scav.scavLWM = maxOffAddr + unlock(&p.scav.lock) +} + +// scavengeReserve reserves a contiguous range of the address space +// for scavenging. The maximum amount of space it reserves is proportional +// to the size of the heap. The ranges are reserved from the high addresses +// first. +// +// Returns the reserved range and the scavenge generation number for it. +func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { + lock(&p.scav.lock) + gen := p.scav.gen + + // Start by reserving the minimum. + r := p.scav.inUse.removeLast(p.scav.reservationBytes) + + // Return early if the size is zero; we don't want to use + // the bogus address below. + if r.size() == 0 { + unlock(&p.scav.lock) + return r, gen + } + + // The scavenger requires that base be aligned to a + // palloc chunk because that's the unit of operation for + // the scavenger, so align down, potentially extending + // the range. + newBase := alignDown(r.base.addr(), pallocChunkBytes) + + // Remove from inUse however much extra we just pulled out. + p.scav.inUse.removeGreaterEqual(newBase) + unlock(&p.scav.lock) + + r.base = offAddr{newBase} + return r, gen +} + +// scavengeUnreserve returns an unscavenged portion of a range that was +// previously reserved with scavengeReserve. +func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { + if r.size() == 0 { + return + } + if r.base.addr()%pallocChunkBytes != 0 { + throw("unreserving unaligned region") + } + lock(&p.scav.lock) + if gen == p.scav.gen { + p.scav.inUse.add(r) + } + unlock(&p.scav.lock) +} + +// scavengeOne walks over address range work until it finds +// a contiguous run of pages to scavenge. It will try to scavenge +// at most max bytes at once, but may scavenge more to avoid +// breaking huge pages. Once it scavenges some memory it returns +// how much it scavenged in bytes. +// +// Returns the number of bytes scavenged and the part of work +// which was not yet searched. +// +// work's base address must be aligned to pallocChunkBytes. +// +// Must run on the systemstack because it acquires p.mheapLock. +// +//go:systemstack +func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) { + // Defensively check if we've received an empty address range. + // If so, just return. + if work.size() == 0 { + // Nothing to do. + return 0, work + } + // Check the prerequisites of work. + if work.base.addr()%pallocChunkBytes != 0 { + throw("scavengeOne called with unaligned work region") + } + // Calculate the maximum number of pages to scavenge. 
+ // + // This should be alignUp(max, pageSize) / pageSize but max can and will + // be ^uintptr(0), so we need to be very careful not to overflow here. + // Rather than use alignUp, calculate the number of pages rounded down + // first, then add back one if necessary. + maxPages := max / pageSize + if max%pageSize != 0 { + maxPages++ + } + + // Calculate the minimum number of pages we can scavenge. + // + // Because we can only scavenge whole physical pages, we must + // ensure that we scavenge at least minPages each time, aligned + // to minPages*pageSize. + minPages := physPageSize / pageSize + if minPages < 1 { + minPages = 1 + } + + // Fast path: check the chunk containing the top-most address in work. + if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 { + return r, w + } else { + work = w + } + + // findCandidate finds the next scavenge candidate in work optimistically. + // + // Returns the candidate chunk index and true on success, and false on failure. + // + // The heap need not be locked. + findCandidate := func(work addrRange) (chunkIdx, bool) { + // Iterate over this work's chunks. + for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- { + // If this chunk is totally in-use or has no unscavenged pages, don't bother + // doing a more sophisticated check. + // + // Note we're accessing the summary and the chunks without a lock, but + // that's fine. We're being optimistic anyway. + + // Check quickly if there are enough free pages at all. + if p.summary[len(p.summary)-1][i].max() < uint(minPages) { + continue + } + + // Run over the chunk looking harder for a candidate. Again, we could + // race with a lot of different pieces of code, but we're just being + // optimistic. Make sure we load the l2 pointer atomically though, to + // avoid races with heap growth. It may or may not be possible to also + // see a nil pointer in this case if we do race with heap growth, but + // just defensively ignore the nils. This operation is optimistic anyway. + l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()]))) + if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) { + return i, true + } + } + return 0, false + } + + // Slow path: iterate optimistically over the in-use address space + // looking for any free and unscavenged page. If we think we see something, + // lock and verify it! + for work.size() != 0 { + + // Search for the candidate. + candidateChunkIdx, ok := findCandidate(work) + if !ok { + // We didn't find a candidate, so we're done. + work.limit = work.base + break + } + + // Lock, so we can verify what we found. + lock(p.mheapLock) + + // Find, verify, and scavenge if we can. + chunk := p.chunkOf(candidateChunkIdx) + base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) + if npages > 0 { + work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)} + unlock(p.mheapLock) + return uintptr(npages) * pageSize, work + } + unlock(p.mheapLock) + + // We were fooled, so let's continue from where we left off. + work.limit = offAddr{chunkBase(candidateChunkIdx)} + } + return 0, work +} + +// scavengeOneFast is the fast path for scavengeOne, which just checks the top +// chunk of work for some pages to scavenge. +// +// Must run on the system stack because it acquires the heap lock. 
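+// Editor's note (not part of the upstream file): unlike the slow path in
+// scavengeOne, this fast path does not search optimistically first; it takes
+// the heap lock up front and only inspects the summary and chunk containing
+// work.limit-1.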
+// +//go:systemstack +func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) { + maxAddr := work.limit.addr() - 1 + maxChunk := chunkIndex(maxAddr) + + lock(p.mheapLock) + if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { + // We only bother looking for a candidate if there at least + // minPages free pages at all. + base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) + + // If we found something, scavenge it and return! + if npages != 0 { + work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} + unlock(p.mheapLock) + return uintptr(npages) * pageSize, work + } + } + unlock(p.mheapLock) + + // Update the limit to reflect the fact that we checked maxChunk already. + work.limit = offAddr{chunkBase(maxChunk)} + return 0, work +} + +// scavengeRangeLocked scavenges the given region of memory. +// The region of memory is described by its chunk index (ci), +// the starting page index of the region relative to that +// chunk (base), and the length of the region in pages (npages). +// +// Returns the base address of the scavenged region. +// +// p.mheapLock must be held. Unlocks p.mheapLock but reacquires +// it before returning. Must be run on the systemstack as a result. +// +//go:systemstack +func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { + assertLockHeld(p.mheapLock) + + // Compute the full address for the start of the range. + addr := chunkBase(ci) + uintptr(base)*pageSize + + // Mark the range we're about to scavenge as allocated, because + // we don't want any allocating goroutines to grab it while + // the scavenging is in progress. + if scav := p.allocRange(addr, uintptr(npages)); scav != 0 { + throw("double scavenge") + } + + // With that done, it's safe to unlock. + unlock(p.mheapLock) + + // Update the scavenge low watermark. + lock(&p.scav.lock) + if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) { + p.scav.scavLWM = oAddr + } + unlock(&p.scav.lock) + + if !p.test { + // Only perform the actual scavenging if we're not in a test. + // It's dangerous to do so otherwise. + sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) + + // Update global accounting only when not in test, otherwise + // the runtime's accounting will be wrong. + nbytes := int64(npages) * pageSize + atomic.Xadd64(&memstats.heap_released, nbytes) + + // Update consistent accounting too. + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, -nbytes) + atomic.Xaddint64(&stats.released, nbytes) + memstats.heapStats.release() + } + + // Relock the heap, because now we need to make these pages + // available allocation. Free them back to the page allocator. + lock(p.mheapLock) + p.free(addr, uintptr(npages), true) + + // Mark the range as scavenged. + p.chunkOf(ci).scavenged.setRange(base, npages) + return addr +} + +// fillAligned returns x but with all zeroes in m-aligned +// groups of m bits set to 1 if any bit in the group is non-zero. +// +// For example, fillAligned(0x0100a3, 8) == 0xff00ff. +// +// Note that if m == 1, this is a no-op. +// +// m must be a power of 2 <= maxPagesPerPhysPage. +func fillAligned(x uint64, m uint) uint64 { + apply := func(x uint64, c uint64) uint64 { + // The technique used it here is derived from + // https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + // and extended for more than just bytes (like nibbles + // and uint16s) by using an appropriate constant. 
+ // + // To summarize the technique, quoting from that page: + // "[It] works by first zeroing the high bits of the [8] + // bytes in the word. Subsequently, it adds a number that + // will result in an overflow to the high bit of a byte if + // any of the low bits were initially set. Next the high + // bits of the original word are ORed with these values; + // thus, the high bit of a byte is set iff any bit in the + // byte was set. Finally, we determine if any of these high + // bits are zero by ORing with ones everywhere except the + // high bits and inverting the result." + return ^((((x & c) + c) | x) | c) + } + // Transform x to contain a 1 bit at the top of each m-aligned + // group of m zero bits. + switch m { + case 1: + return x + case 2: + x = apply(x, 0x5555555555555555) + case 4: + x = apply(x, 0x7777777777777777) + case 8: + x = apply(x, 0x7f7f7f7f7f7f7f7f) + case 16: + x = apply(x, 0x7fff7fff7fff7fff) + case 32: + x = apply(x, 0x7fffffff7fffffff) + case 64: // == maxPagesPerPhysPage + x = apply(x, 0x7fffffffffffffff) + default: + throw("bad m value") + } + // Now, the top bit of each m-aligned group in x is set + // that group was all zero in the original x. + + // From each group of m bits subtract 1. + // Because we know only the top bits of each + // m-aligned group are set, we know this will + // set each group to have all the bits set except + // the top bit, so just OR with the original + // result to set all the bits. + return ^((x - (x >> (m - 1))) | x) +} + +// hasScavengeCandidate returns true if there's any min-page-aligned groups of +// min pages of free-and-unscavenged memory in the region represented by this +// pallocData. +// +// min must be a non-zero power of 2 <= maxPagesPerPhysPage. +func (m *pallocData) hasScavengeCandidate(min uintptr) bool { + if min&(min-1) != 0 || min == 0 { + print("runtime: min = ", min, "\n") + throw("min must be a non-zero power of 2") + } else if min > maxPagesPerPhysPage { + print("runtime: min = ", min, "\n") + throw("min too large") + } + + // The goal of this search is to see if the chunk contains any free and unscavenged memory. + for i := len(m.scavenged) - 1; i >= 0; i-- { + // 1s are scavenged OR non-free => 0s are unscavenged AND free + // + // TODO(mknyszek): Consider splitting up fillAligned into two + // functions, since here we technically could get by with just + // the first half of its computation. It'll save a few instructions + // but adds some additional code complexity. + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + + // Quickly skip over chunks of non-free or scavenged pages. + if x != ^uint64(0) { + return true + } + } + return false +} + +// findScavengeCandidate returns a start index and a size for this pallocData +// segment which represents a contiguous region of free and unscavenged memory. +// +// searchIdx indicates the page index within this chunk to start the search, but +// note that findScavengeCandidate searches backwards through the pallocData. As a +// a result, it will return the highest scavenge candidate in address order. +// +// min indicates a hard minimum size and alignment for runs of pages. That is, +// findScavengeCandidate will not return a region smaller than min pages in size, +// or that is min pages or greater in size but not aligned to min. min must be +// a non-zero power of 2 <= maxPagesPerPhysPage. +// +// max is a hint for how big of a region is desired. 
If max >= pallocChunkPages, then +// findScavengeCandidate effectively returns entire free and unscavenged regions. +// If max < pallocChunkPages, it may truncate the returned region such that size is +// max. However, findScavengeCandidate may still return a larger region if, for +// example, it chooses to preserve huge pages, or if max is not aligned to min (it +// will round up). That is, even if max is small, the returned size is not guaranteed +// to be equal to max. max is allowed to be less than min, in which case it is as if +// max == min. +func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) { + if min&(min-1) != 0 || min == 0 { + print("runtime: min = ", min, "\n") + throw("min must be a non-zero power of 2") + } else if min > maxPagesPerPhysPage { + print("runtime: min = ", min, "\n") + throw("min too large") + } + // max may not be min-aligned, so we might accidentally truncate to + // a max value which causes us to return a non-min-aligned value. + // To prevent this, align max up to a multiple of min (which is always + // a power of 2). This also prevents max from ever being less than + // min, unless it's zero, so handle that explicitly. + if max == 0 { + max = min + } else { + max = alignUp(max, min) + } + + i := int(searchIdx / 64) + // Start by quickly skipping over blocks of non-free or scavenged pages. + for ; i >= 0; i-- { + // 1s are scavenged OR non-free => 0s are unscavenged AND free + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + if x != ^uint64(0) { + break + } + } + if i < 0 { + // Failed to find any free/unscavenged pages. + return 0, 0 + } + // We have something in the 64-bit chunk at i, but it could + // extend further. Loop until we find the extent of it. + + // 1s are scavenged OR non-free => 0s are unscavenged AND free + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + z1 := uint(sys.LeadingZeros64(^x)) + run, end := uint(0), uint(i)*64+(64-z1) + if x<<z1 != 0 { + // After shifting out z1 bits, we still have 1s, + // so the run ends inside this word. + run = uint(sys.LeadingZeros64(x << z1)) + } else { + // After shifting out z1 bits, we have no more 1s. + // This means the run extends to the bottom of the + // word so it may extend into further words. + run = 64 - z1 + for j := i - 1; j >= 0; j-- { + x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min)) + run += uint(sys.LeadingZeros64(x)) + if x != 0 { + // The run stopped in this word. + break + } + } + } + + // Split the run we found if it's larger than max but hold on to + // our original length, since we may need it later. + size := run + if size > uint(max) { + size = uint(max) + } + start := end - size + + // Each huge page is guaranteed to fit in a single palloc chunk. + // + // TODO(mknyszek): Support larger huge page sizes. + // TODO(mknyszek): Consider taking pages-per-huge-page as a parameter + // so we can write tests for this. + if physHugePageSize > pageSize && physHugePageSize > physPageSize { + // We have huge pages, so let's ensure we don't break one by scavenging + // over a huge page boundary. If the range [start, start+size) overlaps with + // a free-and-unscavenged huge page, we want to grow the region we scavenge + // to include that huge page. + + // Compute the huge page boundary above our candidate. 
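+		// (Editor's note, not part of the upstream file; illustrative numbers
+		// only: assuming 2 MiB huge pages and the runtime's 8 KiB pages,
+		// pagesPerHugePage is 256, so a candidate with start == 700 yields
+		// hugePageAbove == 768 and, further below, hugePageBelow == 512.)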
+ pagesPerHugePage := uintptr(physHugePageSize / pageSize) + hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage)) + + // If that boundary is within our current candidate, then we may be breaking + // a huge page. + if hugePageAbove <= end { + // Compute the huge page boundary below our candidate. + hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage)) + + if hugePageBelow >= end-run { + // We're in danger of breaking apart a huge page since start+size crosses + // a huge page boundary and rounding down start to the nearest huge + // page boundary is included in the full run we found. Include the entire + // huge page in the bound by rounding down to the huge page size. + size = size + (start - hugePageBelow) + start = hugePageBelow + } + } + } + return start, size +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcstack.go b/contrib/go/_std_1.18/src/runtime/mgcstack.go new file mode 100644 index 0000000000..49dc54e165 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcstack.go @@ -0,0 +1,352 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: stack objects and stack tracing +// See the design doc at https://docs.google.com/document/d/1un-Jn47yByHL7I0aVIP_uVCMxjdM5mpelJhiKlIqxkE/edit?usp=sharing +// Also see issue 22350. + +// Stack tracing solves the problem of determining which parts of the +// stack are live and should be scanned. It runs as part of scanning +// a single goroutine stack. +// +// Normally determining which parts of the stack are live is easy to +// do statically, as user code has explicit references (reads and +// writes) to stack variables. The compiler can do a simple dataflow +// analysis to determine liveness of stack variables at every point in +// the code. See cmd/compile/internal/gc/plive.go for that analysis. +// +// However, when we take the address of a stack variable, determining +// whether that variable is still live is less clear. We can still +// look for static accesses, but accesses through a pointer to the +// variable are difficult in general to track statically. That pointer +// can be passed among functions on the stack, conditionally retained, +// etc. +// +// Instead, we will track pointers to stack variables dynamically. +// All pointers to stack-allocated variables will themselves be on the +// stack somewhere (or in associated locations, like defer records), so +// we can find them all efficiently. +// +// Stack tracing is organized as a mini garbage collection tracing +// pass. The objects in this garbage collection are all the variables +// on the stack whose address is taken, and which themselves contain a +// pointer. We call these variables "stack objects". +// +// We begin by determining all the stack objects on the stack and all +// the statically live pointers that may point into the stack. We then +// process each pointer to see if it points to a stack object. If it +// does, we scan that stack object. It may contain pointers into the +// heap, in which case those pointers are passed to the main garbage +// collection. It may also contain pointers into the stack, in which +// case we add them to our set of stack pointers. +// +// Once we're done processing all the pointers (including the ones we +// added during processing), we've found all the stack objects that +// are live. Any dead stack objects are not scanned and their contents +// will not keep heap objects live. 
Unlike the main garbage +// collection, we can't sweep the dead stack objects; they live on in +// a moribund state until the stack frame that contains them is +// popped. +// +// A stack can look like this: +// +// +----------+ +// | foo() | +// | +------+ | +// | | A | | <---\ +// | +------+ | | +// | | | +// | +------+ | | +// | | B | | | +// | +------+ | | +// | | | +// +----------+ | +// | bar() | | +// | +------+ | | +// | | C | | <-\ | +// | +----|-+ | | | +// | | | | | +// | +----v-+ | | | +// | | D ---------/ +// | +------+ | | +// | | | +// +----------+ | +// | baz() | | +// | +------+ | | +// | | E -------/ +// | +------+ | +// | ^ | +// | F: --/ | +// | | +// +----------+ +// +// foo() calls bar() calls baz(). Each has a frame on the stack. +// foo() has stack objects A and B. +// bar() has stack objects C and D, with C pointing to D and D pointing to A. +// baz() has a stack object E pointing to C, and a local variable F pointing to E. +// +// Starting from the pointer in local variable F, we will eventually +// scan all of E, C, D, and A (in that order). B is never scanned +// because there is no live pointer to it. If B is also statically +// dead (meaning that foo() never accesses B again after it calls +// bar()), then B's pointers into the heap are not considered live. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +const stackTraceDebug = false + +// Buffer for pointers found during stack tracing. +// Must be smaller than or equal to workbuf. +// +//go:notinheap +type stackWorkBuf struct { + stackWorkBufHdr + obj [(_WorkbufSize - unsafe.Sizeof(stackWorkBufHdr{})) / goarch.PtrSize]uintptr +} + +// Header declaration must come after the buf declaration above, because of issue #14620. +// +//go:notinheap +type stackWorkBufHdr struct { + workbufhdr + next *stackWorkBuf // linked list of workbufs + // Note: we could theoretically repurpose lfnode.next as this next pointer. + // It would save 1 word, but that probably isn't worth busting open + // the lfnode API. +} + +// Buffer for stack objects found on a goroutine stack. +// Must be smaller than or equal to workbuf. +// +//go:notinheap +type stackObjectBuf struct { + stackObjectBufHdr + obj [(_WorkbufSize - unsafe.Sizeof(stackObjectBufHdr{})) / unsafe.Sizeof(stackObject{})]stackObject +} + +//go:notinheap +type stackObjectBufHdr struct { + workbufhdr + next *stackObjectBuf +} + +func init() { + if unsafe.Sizeof(stackWorkBuf{}) > unsafe.Sizeof(workbuf{}) { + panic("stackWorkBuf too big") + } + if unsafe.Sizeof(stackObjectBuf{}) > unsafe.Sizeof(workbuf{}) { + panic("stackObjectBuf too big") + } +} + +// A stackObject represents a variable on the stack that has had +// its address taken. +// +//go:notinheap +type stackObject struct { + off uint32 // offset above stack.lo + size uint32 // size of object + r *stackObjectRecord // info of the object (for ptr/nonptr bits). nil if object has been scanned. + left *stackObject // objects with lower addresses + right *stackObject // objects with higher addresses +} + +// obj.r = r, but with no write barrier. +//go:nowritebarrier +func (obj *stackObject) setRecord(r *stackObjectRecord) { + // Types of stack objects are always in read-only memory, not the heap. + // So not using a write barrier is ok. + *(*uintptr)(unsafe.Pointer(&obj.r)) = uintptr(unsafe.Pointer(r)) +} + +// A stackScanState keeps track of the state used during the GC walk +// of a goroutine. 
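+// Editor's note (not part of the upstream file): a rough usage sketch, with
+// state being a *stackScanState; the ordering follows the methods defined
+// below rather than any single caller in the runtime:
+//
+//	state.addObject(addr, r)      // record each addressable stack object
+//	state.putPtr(p, conservative) // record a candidate pointer into the stack
+//	state.buildIndex()            // build the address-lookup tree
+//	for p, _ := state.getPtr(); p != 0; p, _ = state.getPtr() {
+//		if obj := state.findObject(p); obj != nil {
+//			// scan obj, feeding any stack pointers it holds back via putPtr
+//		}
+//	}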
+type stackScanState struct { + cache pcvalueCache + + // stack limits + stack stack + + // conservative indicates that the next frame must be scanned conservatively. + // This applies only to the innermost frame at an async safe-point. + conservative bool + + // buf contains the set of possible pointers to stack objects. + // Organized as a LIFO linked list of buffers. + // All buffers except possibly the head buffer are full. + buf *stackWorkBuf + freeBuf *stackWorkBuf // keep around one free buffer for allocation hysteresis + + // cbuf contains conservative pointers to stack objects. If + // all pointers to a stack object are obtained via + // conservative scanning, then the stack object may be dead + // and may contain dead pointers, so it must be scanned + // defensively. + cbuf *stackWorkBuf + + // list of stack objects + // Objects are in increasing address order. + head *stackObjectBuf + tail *stackObjectBuf + nobjs int + + // root of binary tree for fast object lookup by address + // Initialized by buildIndex. + root *stackObject +} + +// Add p as a potential pointer to a stack object. +// p must be a stack address. +func (s *stackScanState) putPtr(p uintptr, conservative bool) { + if p < s.stack.lo || p >= s.stack.hi { + throw("address not a stack address") + } + head := &s.buf + if conservative { + head = &s.cbuf + } + buf := *head + if buf == nil { + // Initial setup. + buf = (*stackWorkBuf)(unsafe.Pointer(getempty())) + buf.nobj = 0 + buf.next = nil + *head = buf + } else if buf.nobj == len(buf.obj) { + if s.freeBuf != nil { + buf = s.freeBuf + s.freeBuf = nil + } else { + buf = (*stackWorkBuf)(unsafe.Pointer(getempty())) + } + buf.nobj = 0 + buf.next = *head + *head = buf + } + buf.obj[buf.nobj] = p + buf.nobj++ +} + +// Remove and return a potential pointer to a stack object. +// Returns 0 if there are no more pointers available. +// +// This prefers non-conservative pointers so we scan stack objects +// precisely if there are any non-conservative pointers to them. +func (s *stackScanState) getPtr() (p uintptr, conservative bool) { + for _, head := range []**stackWorkBuf{&s.buf, &s.cbuf} { + buf := *head + if buf == nil { + // Never had any data. + continue + } + if buf.nobj == 0 { + if s.freeBuf != nil { + // Free old freeBuf. + putempty((*workbuf)(unsafe.Pointer(s.freeBuf))) + } + // Move buf to the freeBuf. + s.freeBuf = buf + buf = buf.next + *head = buf + if buf == nil { + // No more data in this list. + continue + } + } + buf.nobj-- + return buf.obj[buf.nobj], head == &s.cbuf + } + // No more data in either list. + if s.freeBuf != nil { + putempty((*workbuf)(unsafe.Pointer(s.freeBuf))) + s.freeBuf = nil + } + return 0, false +} + +// addObject adds a stack object at addr of type typ to the set of stack objects. +func (s *stackScanState) addObject(addr uintptr, r *stackObjectRecord) { + x := s.tail + if x == nil { + // initial setup + x = (*stackObjectBuf)(unsafe.Pointer(getempty())) + x.next = nil + s.head = x + s.tail = x + } + if x.nobj > 0 && uint32(addr-s.stack.lo) < x.obj[x.nobj-1].off+x.obj[x.nobj-1].size { + throw("objects added out of order or overlapping") + } + if x.nobj == len(x.obj) { + // full buffer - allocate a new buffer, add to end of linked list + y := (*stackObjectBuf)(unsafe.Pointer(getempty())) + y.next = nil + x.next = y + s.tail = y + x = y + } + obj := &x.obj[x.nobj] + x.nobj++ + obj.off = uint32(addr - s.stack.lo) + obj.size = uint32(r.size) + obj.setRecord(r) + // obj.left and obj.right will be initialized by buildIndex before use. 
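+	// (Editor's note, not part of the upstream file: nobjs counts objects
+	// across the whole buffer list; buildIndex hands it to binarySearchTree
+	// below so the lookup tree comes out balanced.)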
+ s.nobjs++ +} + +// buildIndex initializes s.root to a binary search tree. +// It should be called after all addObject calls but before +// any call of findObject. +func (s *stackScanState) buildIndex() { + s.root, _, _ = binarySearchTree(s.head, 0, s.nobjs) +} + +// Build a binary search tree with the n objects in the list +// x.obj[idx], x.obj[idx+1], ..., x.next.obj[0], ... +// Returns the root of that tree, and the buf+idx of the nth object after x.obj[idx]. +// (The first object that was not included in the binary search tree.) +// If n == 0, returns nil, x. +func binarySearchTree(x *stackObjectBuf, idx int, n int) (root *stackObject, restBuf *stackObjectBuf, restIdx int) { + if n == 0 { + return nil, x, idx + } + var left, right *stackObject + left, x, idx = binarySearchTree(x, idx, n/2) + root = &x.obj[idx] + idx++ + if idx == len(x.obj) { + x = x.next + idx = 0 + } + right, x, idx = binarySearchTree(x, idx, n-n/2-1) + root.left = left + root.right = right + return root, x, idx +} + +// findObject returns the stack object containing address a, if any. +// Must have called buildIndex previously. +func (s *stackScanState) findObject(a uintptr) *stackObject { + off := uint32(a - s.stack.lo) + obj := s.root + for { + if obj == nil { + return nil + } + if off < obj.off { + obj = obj.left + continue + } + if off >= obj.off+obj.size { + obj = obj.right + continue + } + return obj + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcsweep.go b/contrib/go/_std_1.18/src/runtime/mgcsweep.go new file mode 100644 index 0000000000..0d58f8e0b5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcsweep.go @@ -0,0 +1,878 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector: sweeping + +// The sweeper consists of two different algorithms: +// +// * The object reclaimer finds and frees unmarked slots in spans. It +// can free a whole span if none of the objects are marked, but that +// isn't its goal. This can be driven either synchronously by +// mcentral.cacheSpan for mcentral spans, or asynchronously by +// sweepone, which looks at all the mcentral lists. +// +// * The span reclaimer looks for spans that contain no marked objects +// and frees whole spans. This is a separate algorithm because +// freeing whole spans is the hardest task for the object reclaimer, +// but is critical when allocating new spans. The entry point for +// this is mheap_.reclaim and it's driven by a sequential scan of +// the page marks bitmap in the heap arenas. +// +// Both algorithms ultimately call mspan.sweep, which sweeps a single +// heap span. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +var sweep sweepdata + +// State of background sweep. +type sweepdata struct { + lock mutex + g *g + parked bool + started bool + + nbgsweep uint32 + npausesweep uint32 + + // active tracks outstanding sweepers and the sweep + // termination condition. + active activeSweep + + // centralIndex is the current unswept span class. + // It represents an index into the mcentral span + // sets. Accessed and updated via its load and + // update methods. Not protected by a lock. + // + // Reset at mark termination. + // Used by mheap.nextSpanForSweep. + centralIndex sweepClass +} + +// sweepClass is a spanClass and one bit to represent whether we're currently +// sweeping partial or full spans. 
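+// Editor's note (not part of the upstream file): the low bit selects the
+// partial/full unswept list and the remaining bits are the spanClass,
+// matching split below. For example:
+//
+//	sweepClass(4).split() // spanClass(2), full == true
+//	sweepClass(5).split() // spanClass(2), full == false (partial)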
+type sweepClass uint32 + +const ( + numSweepClasses = numSpanClasses * 2 + sweepClassDone sweepClass = sweepClass(^uint32(0)) +) + +func (s *sweepClass) load() sweepClass { + return sweepClass(atomic.Load((*uint32)(s))) +} + +func (s *sweepClass) update(sNew sweepClass) { + // Only update *s if its current value is less than sNew, + // since *s increases monotonically. + sOld := s.load() + for sOld < sNew && !atomic.Cas((*uint32)(s), uint32(sOld), uint32(sNew)) { + sOld = s.load() + } + // TODO(mknyszek): This isn't the only place we have + // an atomic monotonically increasing counter. It would + // be nice to have an "atomic max" which is just implemented + // as the above on most architectures. Some architectures + // like RISC-V however have native support for an atomic max. +} + +func (s *sweepClass) clear() { + atomic.Store((*uint32)(s), 0) +} + +// split returns the underlying span class as well as +// whether we're interested in the full or partial +// unswept lists for that class, indicated as a boolean +// (true means "full"). +func (s sweepClass) split() (spc spanClass, full bool) { + return spanClass(s >> 1), s&1 == 0 +} + +// nextSpanForSweep finds and pops the next span for sweeping from the +// central sweep buffers. It returns ownership of the span to the caller. +// Returns nil if no such span exists. +func (h *mheap) nextSpanForSweep() *mspan { + sg := h.sweepgen + for sc := sweep.centralIndex.load(); sc < numSweepClasses; sc++ { + spc, full := sc.split() + c := &h.central[spc].mcentral + var s *mspan + if full { + s = c.fullUnswept(sg).pop() + } else { + s = c.partialUnswept(sg).pop() + } + if s != nil { + // Write down that we found something so future sweepers + // can start from here. + sweep.centralIndex.update(sc) + return s + } + } + // Write down that we found nothing. + sweep.centralIndex.update(sweepClassDone) + return nil +} + +const sweepDrainedMask = 1 << 31 + +// activeSweep is a type that captures whether sweeping +// is done, and whether there are any outstanding sweepers. +// +// Every potential sweeper must call begin() before they look +// for work, and end() after they've finished sweeping. +type activeSweep struct { + // state is divided into two parts. + // + // The top bit (masked by sweepDrainedMask) is a boolean + // value indicating whether all the sweep work has been + // drained from the queue. + // + // The rest of the bits are a counter, indicating the + // number of outstanding concurrent sweepers. + state atomic.Uint32 +} + +// begin registers a new sweeper. Returns a sweepLocker +// for acquiring spans for sweeping. Any outstanding sweeper blocks +// sweep termination. +// +// If the sweepLocker is invalid, the caller can be sure that all +// outstanding sweep work has been drained, so there is nothing left +// to sweep. Note that there may be sweepers currently running, so +// this does not indicate that all sweeping has completed. +// +// Even if the sweepLocker is invalid, its sweepGen is always valid. +func (a *activeSweep) begin() sweepLocker { + for { + state := a.state.Load() + if state&sweepDrainedMask != 0 { + return sweepLocker{mheap_.sweepgen, false} + } + if a.state.CompareAndSwap(state, state+1) { + return sweepLocker{mheap_.sweepgen, true} + } + } +} + +// end deregisters a sweeper. Must be called once for each time +// begin is called if the sweepLocker is valid. 
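+// Editor's note (not part of the upstream file): state packs the drained flag
+// into the top bit and the count of live sweepers into the remaining bits, so
+// isDone (state == sweepDrainedMask) means drained with no sweepers left. A
+// typical caller pairs the two methods the way sweepone below does:
+//
+//	sl := sweep.active.begin()
+//	if !sl.valid {
+//		return // all sweep work already drained
+//	}
+//	// ... acquire and sweep spans via sl.tryAcquire ...
+//	sweep.active.end(sl)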
+func (a *activeSweep) end(sl sweepLocker) { + if sl.sweepGen != mheap_.sweepgen { + throw("sweeper left outstanding across sweep generations") + } + for { + state := a.state.Load() + if (state&^sweepDrainedMask)-1 >= sweepDrainedMask { + throw("mismatched begin/end of activeSweep") + } + if a.state.CompareAndSwap(state, state-1) { + if state != sweepDrainedMask { + return + } + if debug.gcpacertrace > 0 { + print("pacer: sweep done at heap size ", gcController.heapLive>>20, "MB; allocated ", (gcController.heapLive-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept.Load(), " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n") + } + return + } + } +} + +// markDrained marks the active sweep cycle as having drained +// all remaining work. This is safe to be called concurrently +// with all other methods of activeSweep, though may race. +// +// Returns true if this call was the one that actually performed +// the mark. +func (a *activeSweep) markDrained() bool { + for { + state := a.state.Load() + if state&sweepDrainedMask != 0 { + return false + } + if a.state.CompareAndSwap(state, state|sweepDrainedMask) { + return true + } + } +} + +// sweepers returns the current number of active sweepers. +func (a *activeSweep) sweepers() uint32 { + return a.state.Load() &^ sweepDrainedMask +} + +// isDone returns true if all sweep work has been drained and no more +// outstanding sweepers exist. That is, when the sweep phase is +// completely done. +func (a *activeSweep) isDone() bool { + return a.state.Load() == sweepDrainedMask +} + +// reset sets up the activeSweep for the next sweep cycle. +// +// The world must be stopped. +func (a *activeSweep) reset() { + assertWorldStopped() + a.state.Store(0) +} + +// finishsweep_m ensures that all spans are swept. +// +// The world must be stopped. This ensures there are no sweeps in +// progress. +// +//go:nowritebarrier +func finishsweep_m() { + assertWorldStopped() + + // Sweeping must be complete before marking commences, so + // sweep any unswept spans. If this is a concurrent GC, there + // shouldn't be any spans left to sweep, so this should finish + // instantly. If GC was forced before the concurrent sweep + // finished, there may be spans to sweep. + for sweepone() != ^uintptr(0) { + sweep.npausesweep++ + } + + // Make sure there aren't any outstanding sweepers left. + // At this point, with the world stopped, it means one of two + // things. Either we were able to preempt a sweeper, or that + // a sweeper didn't call sweep.active.end when it should have. + // Both cases indicate a bug, so throw. + if sweep.active.sweepers() != 0 { + throw("active sweepers found at start of mark phase") + } + + // Reset all the unswept buffers, which should be empty. + // Do this in sweep termination as opposed to mark termination + // so that we can catch unswept spans and reclaim blocks as + // soon as possible. + sg := mheap_.sweepgen + for i := range mheap_.central { + c := &mheap_.central[i].mcentral + c.partialUnswept(sg).reset() + c.fullUnswept(sg).reset() + } + + // Sweeping is done, so if the scavenger isn't already awake, + // wake it up. There's definitely work for it to do at this + // point. 
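+	// (Editor's note, not part of the upstream file: wakeScavenger is defined
+	// in mgcscavenge.go earlier in this change; it unparks the scavenger
+	// goroutine directly rather than going through readyForScavenger/sysmon.)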
+ wakeScavenger() + + nextMarkBitArenaEpoch() +} + +func bgsweep(c chan int) { + sweep.g = getg() + + lockInit(&sweep.lock, lockRankSweep) + lock(&sweep.lock) + sweep.parked = true + c <- 1 + goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1) + + for { + for sweepone() != ^uintptr(0) { + sweep.nbgsweep++ + Gosched() + } + for freeSomeWbufs(true) { + Gosched() + } + lock(&sweep.lock) + if !isSweepDone() { + // This can happen if a GC runs between + // gosweepone returning ^0 above + // and the lock being acquired. + unlock(&sweep.lock) + continue + } + sweep.parked = true + goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1) + } +} + +// sweepLocker acquires sweep ownership of spans. +type sweepLocker struct { + // sweepGen is the sweep generation of the heap. + sweepGen uint32 + valid bool +} + +// sweepLocked represents sweep ownership of a span. +type sweepLocked struct { + *mspan +} + +// tryAcquire attempts to acquire sweep ownership of span s. If it +// successfully acquires ownership, it blocks sweep completion. +func (l *sweepLocker) tryAcquire(s *mspan) (sweepLocked, bool) { + if !l.valid { + throw("use of invalid sweepLocker") + } + // Check before attempting to CAS. + if atomic.Load(&s.sweepgen) != l.sweepGen-2 { + return sweepLocked{}, false + } + // Attempt to acquire sweep ownership of s. + if !atomic.Cas(&s.sweepgen, l.sweepGen-2, l.sweepGen-1) { + return sweepLocked{}, false + } + return sweepLocked{s}, true +} + +// sweepone sweeps some unswept heap span and returns the number of pages returned +// to the heap, or ^uintptr(0) if there was nothing to sweep. +func sweepone() uintptr { + gp := getg() + + // Increment locks to ensure that the goroutine is not preempted + // in the middle of sweep thus leaving the span in an inconsistent state for next GC + gp.m.locks++ + + // TODO(austin): sweepone is almost always called in a loop; + // lift the sweepLocker into its callers. + sl := sweep.active.begin() + if !sl.valid { + gp.m.locks-- + return ^uintptr(0) + } + + // Find a span to sweep. + npages := ^uintptr(0) + var noMoreWork bool + for { + s := mheap_.nextSpanForSweep() + if s == nil { + noMoreWork = sweep.active.markDrained() + break + } + if state := s.state.get(); state != mSpanInUse { + // This can happen if direct sweeping already + // swept this span, but in that case the sweep + // generation should always be up-to-date. + if !(s.sweepgen == sl.sweepGen || s.sweepgen == sl.sweepGen+3) { + print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sl.sweepGen, "\n") + throw("non in-use span in unswept list") + } + continue + } + if s, ok := sl.tryAcquire(s); ok { + // Sweep the span we found. + npages = s.npages + if s.sweep(false) { + // Whole span was freed. Count it toward the + // page reclaimer credit since these pages can + // now be used for span allocation. + mheap_.reclaimCredit.Add(npages) + } else { + // Span is still in-use, so this returned no + // pages to the heap and the span needs to + // move to the swept in-use list. + npages = 0 + } + break + } + } + sweep.active.end(sl) + + if noMoreWork { + // The sweep list is empty. There may still be + // concurrent sweeps running, but we're at least very + // close to done sweeping. + + // Move the scavenge gen forward (signalling + // that there's new work to do) and wake the scavenger. 
+ // + // The scavenger is signaled by the last sweeper because once + // sweeping is done, we will definitely have useful work for + // the scavenger to do, since the scavenger only runs over the + // heap once per GC cycle. This update is not done during sweep + // termination because in some cases there may be a long delay + // between sweep done and sweep termination (e.g. not enough + // allocations to trigger a GC) which would be nice to fill in + // with scavenging work. + systemstack(func() { + lock(&mheap_.lock) + mheap_.pages.scavengeStartGen() + unlock(&mheap_.lock) + }) + // Since we might sweep in an allocation path, it's not possible + // for us to wake the scavenger directly via wakeScavenger, since + // it could allocate. Ask sysmon to do it for us instead. + readyForScavenger() + } + + gp.m.locks-- + return npages +} + +// isSweepDone reports whether all spans are swept. +// +// Note that this condition may transition from false to true at any +// time as the sweeper runs. It may transition from true to false if a +// GC runs; to prevent that the caller must be non-preemptible or must +// somehow block GC progress. +func isSweepDone() bool { + return sweep.active.isDone() +} + +// Returns only when span s has been swept. +//go:nowritebarrier +func (s *mspan) ensureSwept() { + // Caller must disable preemption. + // Otherwise when this function returns the span can become unswept again + // (if GC is triggered on another goroutine). + _g_ := getg() + if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { + throw("mspan.ensureSwept: m is not locked") + } + + // If this operation fails, then that means that there are + // no more spans to be swept. In this case, either s has already + // been swept, or is about to be acquired for sweeping and swept. + sl := sweep.active.begin() + if sl.valid { + // The caller must be sure that the span is a mSpanInUse span. + if s, ok := sl.tryAcquire(s); ok { + s.sweep(false) + sweep.active.end(sl) + return + } + sweep.active.end(sl) + } + + // Unfortunately we can't sweep the span ourselves. Somebody else + // got to it first. We don't have efficient means to wait, but that's + // OK, it will be swept fairly soon. + for { + spangen := atomic.Load(&s.sweepgen) + if spangen == sl.sweepGen || spangen == sl.sweepGen+3 { + break + } + osyield() + } +} + +// Sweep frees or collects finalizers for blocks not marked in the mark phase. +// It clears the mark bits in preparation for the next GC round. +// Returns true if the span was returned to heap. +// If preserve=true, don't return it to heap nor relink in mcentral lists; +// caller takes care of it. +func (sl *sweepLocked) sweep(preserve bool) bool { + // It's critical that we enter this function with preemption disabled, + // GC must not start while we are in the middle of this function. + _g_ := getg() + if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { + throw("mspan.sweep: m is not locked") + } + + s := sl.mspan + if !preserve { + // We'll release ownership of this span. Nil it out to + // prevent the caller from accidentally using it. 
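+		// (Editor's note, not part of the upstream file: after the assignment
+		// below, any stale use of sl.mspan fails loudly on a nil dereference
+		// instead of silently racing with the span's next owner.)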
+ sl.mspan = nil + } + + sweepgen := mheap_.sweepgen + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state") + } + + if trace.enabled { + traceGCSweepSpan(s.npages * _PageSize) + } + + mheap_.pagesSwept.Add(int64(s.npages)) + + spc := s.spanclass + size := s.elemsize + + // The allocBits indicate which unmarked objects don't need to be + // processed since they were free at the end of the last GC cycle + // and were not allocated since then. + // If the allocBits index is >= s.freeindex and the bit + // is not marked then the object remains unallocated + // since the last GC. + // This situation is analogous to being on a freelist. + + // Unlink & free special records for any objects we're about to free. + // Two complications here: + // 1. An object can have both finalizer and profile special records. + // In such case we need to queue finalizer for execution, + // mark the object as live and preserve the profile special. + // 2. A tiny object can have several finalizers setup for different offsets. + // If such object is not marked, we need to queue all finalizers at once. + // Both 1 and 2 are possible at the same time. + hadSpecials := s.specials != nil + siter := newSpecialsIter(s) + for siter.valid() { + // A finalizer can be set for an inner byte of an object, find object beginning. + objIndex := uintptr(siter.s.offset) / size + p := s.base() + objIndex*size + mbits := s.markBitsForIndex(objIndex) + if !mbits.isMarked() { + // This object is not marked and has at least one special record. + // Pass 1: see if it has at least one finalizer. + hasFin := false + endOffset := p - s.base() + size + for tmp := siter.s; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next { + if tmp.kind == _KindSpecialFinalizer { + // Stop freeing of object if it has a finalizer. + mbits.setMarkedNonAtomic() + hasFin = true + break + } + } + // Pass 2: queue all finalizers _or_ handle profile record. + for siter.valid() && uintptr(siter.s.offset) < endOffset { + // Find the exact byte for which the special was setup + // (as opposed to object beginning). + special := siter.s + p := s.base() + uintptr(special.offset) + if special.kind == _KindSpecialFinalizer || !hasFin { + siter.unlinkAndNext() + freeSpecial(special, unsafe.Pointer(p), size) + } else { + // The object has finalizers, so we're keeping it alive. + // All other specials only apply when an object is freed, + // so just keep the special record. + siter.next() + } + } + } else { + // object is still live + if siter.s.kind == _KindSpecialReachable { + special := siter.unlinkAndNext() + (*specialReachable)(unsafe.Pointer(special)).reachable = true + freeSpecial(special, unsafe.Pointer(p), size) + } else { + // keep special record + siter.next() + } + } + } + if hadSpecials && s.specials == nil { + spanHasNoSpecials(s) + } + + if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled || asanenabled { + // Find all newly freed objects. This doesn't have to + // efficient; allocfreetrace has massive overhead. 
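+		// (Editor's note, not part of the upstream file: "newly freed" below
+		// means the mark bit is clear while the object was allocated going into
+		// this cycle, i.e. abits.index < s.freeindex or its alloc bit is set.)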
+ mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) { + x := s.base() + i*s.elemsize + if debug.allocfreetrace != 0 { + tracefree(unsafe.Pointer(x), size) + } + if debug.clobberfree != 0 { + clobberfree(unsafe.Pointer(x), size) + } + if raceenabled { + racefree(unsafe.Pointer(x), size) + } + if msanenabled { + msanfree(unsafe.Pointer(x), size) + } + if asanenabled { + asanpoison(unsafe.Pointer(x), size) + } + } + mbits.advance() + abits.advance() + } + } + + // Check for zombie objects. + if s.freeindex < s.nelems { + // Everything < freeindex is allocated and hence + // cannot be zombies. + // + // Check the first bitmap byte, where we have to be + // careful with freeindex. + obj := s.freeindex + if (*s.gcmarkBits.bytep(obj / 8)&^*s.allocBits.bytep(obj / 8))>>(obj%8) != 0 { + s.reportZombies() + } + // Check remaining bytes. + for i := obj/8 + 1; i < divRoundUp(s.nelems, 8); i++ { + if *s.gcmarkBits.bytep(i)&^*s.allocBits.bytep(i) != 0 { + s.reportZombies() + } + } + } + + // Count the number of free objects in this span. + nalloc := uint16(s.countAlloc()) + nfreed := s.allocCount - nalloc + if nalloc > s.allocCount { + // The zombie check above should have caught this in + // more detail. + print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n") + throw("sweep increased allocation count") + } + + s.allocCount = nalloc + s.freeindex = 0 // reset allocation index to start of span. + if trace.enabled { + getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize + } + + // gcmarkBits becomes the allocBits. + // get a fresh cleared gcmarkBits in preparation for next GC + s.allocBits = s.gcmarkBits + s.gcmarkBits = newMarkBits(s.nelems) + + // Initialize alloc bits cache. + s.refillAllocCache(0) + + // The span must be in our exclusive ownership until we update sweepgen, + // check for potential races. + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state after sweep") + } + if s.sweepgen == sweepgen+1 || s.sweepgen == sweepgen+3 { + throw("swept cached span") + } + + // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, + // because of the potential for a concurrent free/SetFinalizer. + // + // But we need to set it before we make the span available for allocation + // (return it to heap or mcentral), because allocation code assumes that a + // span is already swept if available for allocation. + // + // Serialization point. + // At this point the mark bits are cleared and allocation ready + // to go so release the span. + atomic.Store(&s.sweepgen, sweepgen) + + if spc.sizeclass() != 0 { + // Handle spans for small objects. + if nfreed > 0 { + // Only mark the span as needing zeroing if we've freed any + // objects, because a fresh span that had been allocated into, + // wasn't totally filled, but then swept, still has all of its + // free slots zeroed. + s.needzero = 1 + stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.smallFreeCount[spc.sizeclass()], int64(nfreed)) + memstats.heapStats.release() + } + if !preserve { + // The caller may not have removed this span from whatever + // unswept set its on but taken ownership of the span for + // sweeping by updating sweepgen. 
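The zombie check just above reduces to a single bitwise identity: an object is a zombie exactly when its gcmark bit is set while its alloc bit is clear. A minimal standalone sketch of that byte-level test, using plain byte slices in place of the runtime's gcBits (the names and sizes here are illustrative only, not runtime types):

package main

import "fmt"

// hasZombies reports whether any object at index >= freeindex is
// marked (gcmark bit set) but not allocated (alloc bit clear).
// gcmark and alloc are one-bit-per-object bitmaps, mirroring the
// shape of gcmarkBits and allocBits.
func hasZombies(gcmark, alloc []byte, freeindex, nelems uintptr) bool {
	// First byte: discard bits below freeindex, which are all allocated.
	first := freeindex / 8
	if (gcmark[first]&^alloc[first])>>(freeindex%8) != 0 {
		return true
	}
	// Remaining whole bytes, rounding nelems up as divRoundUp does.
	for i := first + 1; i < (nelems+7)/8; i++ {
		if gcmark[i]&^alloc[i] != 0 {
			return true
		}
	}
	return false
}

func main() {
	gcmark := []byte{0b0000_0100, 0}
	alloc := []byte{0b0000_0011, 0}
	// Object 2 is marked but was never allocated: a zombie.
	fmt.Println(hasZombies(gcmark, alloc, 2, 16)) // true
}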
If this span still is in + // an unswept set, then the mcentral will pop it off the + // set, check its sweepgen, and ignore it. + if nalloc == 0 { + // Free totally free span directly back to the heap. + mheap_.freeSpan(s) + return true + } + // Return span back to the right mcentral list. + if uintptr(nalloc) == s.nelems { + mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s) + } else { + mheap_.central[spc].mcentral.partialSwept(sweepgen).push(s) + } + } + } else if !preserve { + // Handle spans for large objects. + if nfreed != 0 { + // Free large object span to heap. + + // NOTE(rsc,dvyukov): The original implementation of efence + // in CL 22060046 used sysFree instead of sysFault, so that + // the operating system would eventually give the memory + // back to us again, so that an efence program could run + // longer without running out of memory. Unfortunately, + // calling sysFree here without any kind of adjustment of the + // heap data structures means that when the memory does + // come back to us, we have the wrong metadata for it, either in + // the mspan structures or in the garbage collection bitmap. + // Using sysFault here means that the program will run out of + // memory fairly quickly in efence mode, but at least it won't + // have mysterious crashes due to confused memory reuse. + // It should be possible to switch back to sysFree if we also + // implement and then call some kind of mheap.deleteSpan. + if debug.efence > 0 { + s.limit = 0 // prevent mlookup from finding this span + sysFault(unsafe.Pointer(s.base()), size) + } else { + mheap_.freeSpan(s) + } + stats := memstats.heapStats.acquire() + atomic.Xadd64(&stats.largeFreeCount, 1) + atomic.Xadd64(&stats.largeFree, int64(size)) + memstats.heapStats.release() + return true + } + + // Add a large span directly onto the full+swept list. + mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s) + } + return false +} + +// reportZombies reports any marked but free objects in s and throws. +// +// This generally means one of the following: +// +// 1. User code converted a pointer to a uintptr and then back +// unsafely, and a GC ran while the uintptr was the only reference to +// an object. +// +// 2. User code (or a compiler bug) constructed a bad pointer that +// points to a free slot, often a past-the-end pointer. +// +// 3. The GC two cycles ago missed a pointer and freed a live object, +// but it was still live in the last cycle, so this GC cycle found a +// pointer to that object and marked it. +func (s *mspan) reportZombies() { + printlock() + print("runtime: marked free object in span ", s, ", elemsize=", s.elemsize, " freeindex=", s.freeindex, " (bad use of unsafe.Pointer? try -d=checkptr)\n") + mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + addr := s.base() + i*s.elemsize + print(hex(addr)) + alloc := i < s.freeindex || abits.isMarked() + if alloc { + print(" alloc") + } else { + print(" free ") + } + if mbits.isMarked() { + print(" marked ") + } else { + print(" unmarked") + } + zombie := mbits.isMarked() && !alloc + if zombie { + print(" zombie") + } + print("\n") + if zombie { + length := s.elemsize + if length > 1024 { + length = 1024 + } + hexdumpWords(addr, addr+length, nil) + } + mbits.advance() + abits.advance() + } + throw("found pointer to free object") +} + +// deductSweepCredit deducts sweep credit for allocating a span of +// size spanBytes. 
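deductSweepCredit, whose definition continues below, and gcPaceSweeper further down both work against the same line in (heap bytes allocated, pages swept) space. As a rough illustration of that arithmetic only, here is the pacing check pulled out into a standalone function; the field names imitate the runtime's but this is a sketch, not the runtime's code, and callerSweepPages is omitted:

package main

import "fmt"

// sweepPacer mirrors the shape of the proportional-sweep state: a line
// with slope sweepPagesPerByte through the basis point
// (sweepHeapLiveBasis, pagesSweptBasis). All values are illustrative.
type sweepPacer struct {
	sweepPagesPerByte  float64
	sweepHeapLiveBasis uint64
	pagesSweptBasis    uint64
}

// pagesBehind reports how many more pages must be swept so that an
// allocation of spanBytes at heap size heapLive stays "in the black".
// A positive result means the caller still owes that much sweeping.
func (p *sweepPacer) pagesBehind(heapLive, spanBytes, pagesSwept uint64) int64 {
	newHeapLive := heapLive - p.sweepHeapLiveBasis + spanBytes
	pagesTarget := int64(p.sweepPagesPerByte * float64(newHeapLive))
	return pagesTarget - int64(pagesSwept-p.pagesSweptBasis)
}

func main() {
	p := &sweepPacer{sweepPagesPerByte: 0.001, sweepHeapLiveBasis: 1 << 20, pagesSweptBasis: 100}
	// Allocating 64 KiB with a 2 MiB live heap and 600 pages swept so far:
	fmt.Println(p.pagesBehind(2<<20, 64<<10, 600)) // 614 pages of sweep debt
}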
This must be performed *before* the span is +// allocated to ensure the system has enough credit. If necessary, it +// performs sweeping to prevent going in to debt. If the caller will +// also sweep pages (e.g., for a large allocation), it can pass a +// non-zero callerSweepPages to leave that many pages unswept. +// +// deductSweepCredit makes a worst-case assumption that all spanBytes +// bytes of the ultimately allocated span will be available for object +// allocation. +// +// deductSweepCredit is the core of the "proportional sweep" system. +// It uses statistics gathered by the garbage collector to perform +// enough sweeping so that all pages are swept during the concurrent +// sweep phase between GC cycles. +// +// mheap_ must NOT be locked. +func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) { + if mheap_.sweepPagesPerByte == 0 { + // Proportional sweep is done or disabled. + return + } + + if trace.enabled { + traceGCSweepStart() + } + +retry: + sweptBasis := mheap_.pagesSweptBasis.Load() + + // Fix debt if necessary. + newHeapLive := uintptr(atomic.Load64(&gcController.heapLive)-mheap_.sweepHeapLiveBasis) + spanBytes + pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages) + for pagesTarget > int64(mheap_.pagesSwept.Load()-sweptBasis) { + if sweepone() == ^uintptr(0) { + mheap_.sweepPagesPerByte = 0 + break + } + if mheap_.pagesSweptBasis.Load() != sweptBasis { + // Sweep pacing changed. Recompute debt. + goto retry + } + } + + if trace.enabled { + traceGCSweepDone() + } +} + +// clobberfree sets the memory content at x to bad content, for debugging +// purposes. +func clobberfree(x unsafe.Pointer, size uintptr) { + // size (span.elemsize) is always a multiple of 4. + for i := uintptr(0); i < size; i += 4 { + *(*uint32)(add(x, i)) = 0xdeadbeef + } +} + +// gcPaceSweeper updates the sweeper's pacing parameters. +// +// Must be called whenever the GC's pacing is updated. +// +// The world must be stopped, or mheap_.lock must be held. +func gcPaceSweeper(trigger uint64) { + assertWorldStoppedOrLockHeld(&mheap_.lock) + + // Update sweep pacing. + if isSweepDone() { + mheap_.sweepPagesPerByte = 0 + } else { + // Concurrent sweep needs to sweep all of the in-use + // pages by the time the allocated heap reaches the GC + // trigger. Compute the ratio of in-use pages to sweep + // per byte allocated, accounting for the fact that + // some might already be swept. + heapLiveBasis := atomic.Load64(&gcController.heapLive) + heapDistance := int64(trigger) - int64(heapLiveBasis) + // Add a little margin so rounding errors and + // concurrent sweep are less likely to leave pages + // unswept when GC starts. + heapDistance -= 1024 * 1024 + if heapDistance < _PageSize { + // Avoid setting the sweep ratio extremely high + heapDistance = _PageSize + } + pagesSwept := mheap_.pagesSwept.Load() + pagesInUse := mheap_.pagesInUse.Load() + sweepDistancePages := int64(pagesInUse) - int64(pagesSwept) + if sweepDistancePages <= 0 { + mheap_.sweepPagesPerByte = 0 + } else { + mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance) + mheap_.sweepHeapLiveBasis = heapLiveBasis + // Write pagesSweptBasis last, since this + // signals concurrent sweeps to recompute + // their debt. 
+ mheap_.pagesSweptBasis.Store(pagesSwept) + } + } +} diff --git a/contrib/go/_std_1.18/src/runtime/mgcwork.go b/contrib/go/_std_1.18/src/runtime/mgcwork.go new file mode 100644 index 0000000000..9c3f7fd223 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mgcwork.go @@ -0,0 +1,483 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +const ( + _WorkbufSize = 2048 // in bytes; larger values result in less contention + + // workbufAlloc is the number of bytes to allocate at a time + // for new workbufs. This must be a multiple of pageSize and + // should be a multiple of _WorkbufSize. + // + // Larger values reduce workbuf allocation overhead. Smaller + // values reduce heap fragmentation. + workbufAlloc = 32 << 10 +) + +func init() { + if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 { + throw("bad workbufAlloc") + } +} + +// Garbage collector work pool abstraction. +// +// This implements a producer/consumer model for pointers to grey +// objects. A grey object is one that is marked and on a work +// queue. A black object is marked and not on a work queue. +// +// Write barriers, root discovery, stack scanning, and object scanning +// produce pointers to grey objects. Scanning consumes pointers to +// grey objects, thus blackening them, and then scans them, +// potentially producing new pointers to grey objects. + +// A gcWork provides the interface to produce and consume work for the +// garbage collector. +// +// A gcWork can be used on the stack as follows: +// +// (preemption must be disabled) +// gcw := &getg().m.p.ptr().gcw +// .. call gcw.put() to produce and gcw.tryGet() to consume .. +// +// It's important that any use of gcWork during the mark phase prevent +// the garbage collector from transitioning to mark termination since +// gcWork may locally hold GC work buffers. This can be done by +// disabling preemption (systemstack or acquirem). +type gcWork struct { + // wbuf1 and wbuf2 are the primary and secondary work buffers. + // + // This can be thought of as a stack of both work buffers' + // pointers concatenated. When we pop the last pointer, we + // shift the stack up by one work buffer by bringing in a new + // full buffer and discarding an empty one. When we fill both + // buffers, we shift the stack down by one work buffer by + // bringing in a new empty buffer and discarding a full one. + // This way we have one buffer's worth of hysteresis, which + // amortizes the cost of getting or putting a work buffer over + // at least one buffer of work and reduces contention on the + // global work lists. + // + // wbuf1 is always the buffer we're currently pushing to and + // popping from and wbuf2 is the buffer that will be discarded + // next. + // + // Invariant: Both wbuf1 and wbuf2 are nil or neither are. + wbuf1, wbuf2 *workbuf + + // Bytes marked (blackened) on this gcWork. This is aggregated + // into work.bytesMarked by dispose. + bytesMarked uint64 + + // Heap scan work performed on this gcWork. This is aggregated into + // gcController by dispose and may also be flushed by callers. + // Other types of scan work are flushed immediately. + heapScanWork int64 + + // flushedWork indicates that a non-empty work buffer was + // flushed to the global work list since the last gcMarkDone + // termination check. 
Specifically, this indicates that this + // gcWork may have communicated work to another gcWork. + flushedWork bool +} + +// Most of the methods of gcWork are go:nowritebarrierrec because the +// write barrier itself can invoke gcWork methods but the methods are +// not generally re-entrant. Hence, if a gcWork method invoked the +// write barrier while the gcWork was in an inconsistent state, and +// the write barrier in turn invoked a gcWork method, it could +// permanently corrupt the gcWork. + +func (w *gcWork) init() { + w.wbuf1 = getempty() + wbuf2 := trygetfull() + if wbuf2 == nil { + wbuf2 = getempty() + } + w.wbuf2 = wbuf2 +} + +// put enqueues a pointer for the garbage collector to trace. +// obj must point to the beginning of a heap object or an oblet. +//go:nowritebarrierrec +func (w *gcWork) put(obj uintptr) { + flushed := false + wbuf := w.wbuf1 + // Record that this may acquire the wbufSpans or heap lock to + // allocate a workbuf. + lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans) + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + // wbuf is empty at this point. + } else if wbuf.nobj == len(wbuf.obj) { + w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1 + wbuf = w.wbuf1 + if wbuf.nobj == len(wbuf.obj) { + putfull(wbuf) + w.flushedWork = true + wbuf = getempty() + w.wbuf1 = wbuf + flushed = true + } + } + + wbuf.obj[wbuf.nobj] = obj + wbuf.nobj++ + + // If we put a buffer on full, let the GC controller know so + // it can encourage more workers to run. We delay this until + // the end of put so that w is in a consistent state, since + // enlistWorker may itself manipulate w. + if flushed && gcphase == _GCmark { + gcController.enlistWorker() + } +} + +// putFast does a put and reports whether it can be done quickly +// otherwise it returns false and the caller needs to call put. +//go:nowritebarrierrec +func (w *gcWork) putFast(obj uintptr) bool { + wbuf := w.wbuf1 + if wbuf == nil { + return false + } else if wbuf.nobj == len(wbuf.obj) { + return false + } + + wbuf.obj[wbuf.nobj] = obj + wbuf.nobj++ + return true +} + +// putBatch performs a put on every pointer in obj. See put for +// constraints on these pointers. +// +//go:nowritebarrierrec +func (w *gcWork) putBatch(obj []uintptr) { + if len(obj) == 0 { + return + } + + flushed := false + wbuf := w.wbuf1 + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + } + + for len(obj) > 0 { + for wbuf.nobj == len(wbuf.obj) { + putfull(wbuf) + w.flushedWork = true + w.wbuf1, w.wbuf2 = w.wbuf2, getempty() + wbuf = w.wbuf1 + flushed = true + } + n := copy(wbuf.obj[wbuf.nobj:], obj) + wbuf.nobj += n + obj = obj[n:] + } + + if flushed && gcphase == _GCmark { + gcController.enlistWorker() + } +} + +// tryGet dequeues a pointer for the garbage collector to trace. +// +// If there are no pointers remaining in this gcWork or in the global +// queue, tryGet returns 0. Note that there may still be pointers in +// other gcWork instances or other caches. +//go:nowritebarrierrec +func (w *gcWork) tryGet() uintptr { + wbuf := w.wbuf1 + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + // wbuf is empty at this point. + } + if wbuf.nobj == 0 { + w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1 + wbuf = w.wbuf1 + if wbuf.nobj == 0 { + owbuf := wbuf + wbuf = trygetfull() + if wbuf == nil { + return 0 + } + putempty(owbuf) + w.wbuf1 = wbuf + } + } + + wbuf.nobj-- + return wbuf.obj[wbuf.nobj] +} + +// tryGetFast dequeues a pointer for the garbage collector to trace +// if one is readily available. 
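The wbuf1/wbuf2 pair above buys one buffer of hysteresis: put pushes into the primary buffer and only touches the global lists once both buffers are full, and tryGet mirrors that on the consuming side. The same swap-then-spill pattern over plain slices, stripped of all runtime types (spill and refill stand in for putfull and trygetfull; everything here is illustrative):

package main

import "fmt"

// twoBuf keeps a primary and a secondary work buffer of capacity max.
type twoBuf struct {
	buf1, buf2 []uintptr
	max        int
	spill      func([]uintptr)  // hand a full buffer to the global list
	refill     func() []uintptr // fetch work from the global list, may return nil
}

func (w *twoBuf) put(p uintptr) {
	if len(w.buf1) == w.max {
		// Primary full: swap, and if the new primary is also full,
		// spill it and start fresh. One buffer of hysteresis.
		w.buf1, w.buf2 = w.buf2, w.buf1
		if len(w.buf1) == w.max {
			w.spill(w.buf1)
			w.buf1 = make([]uintptr, 0, w.max)
		}
	}
	w.buf1 = append(w.buf1, p)
}

func (w *twoBuf) tryGet() (uintptr, bool) {
	if len(w.buf1) == 0 {
		w.buf1, w.buf2 = w.buf2, w.buf1
		if len(w.buf1) == 0 {
			w.buf1 = w.refill()
			if len(w.buf1) == 0 {
				return 0, false
			}
		}
	}
	p := w.buf1[len(w.buf1)-1]
	w.buf1 = w.buf1[:len(w.buf1)-1]
	return p, true
}

func main() {
	var global [][]uintptr
	w := &twoBuf{
		max:    2,
		spill:  func(b []uintptr) { global = append(global, b) },
		refill: func() []uintptr { return nil },
	}
	for i := uintptr(1); i <= 5; i++ {
		w.put(i)
	}
	v, ok := w.tryGet()
	fmt.Println(v, ok, len(global)) // 5 true 1
}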
Otherwise it returns 0 and +// the caller is expected to call tryGet(). +//go:nowritebarrierrec +func (w *gcWork) tryGetFast() uintptr { + wbuf := w.wbuf1 + if wbuf == nil { + return 0 + } + if wbuf.nobj == 0 { + return 0 + } + + wbuf.nobj-- + return wbuf.obj[wbuf.nobj] +} + +// dispose returns any cached pointers to the global queue. +// The buffers are being put on the full queue so that the +// write barriers will not simply reacquire them before the +// GC can inspect them. This helps reduce the mutator's +// ability to hide pointers during the concurrent mark phase. +// +//go:nowritebarrierrec +func (w *gcWork) dispose() { + if wbuf := w.wbuf1; wbuf != nil { + if wbuf.nobj == 0 { + putempty(wbuf) + } else { + putfull(wbuf) + w.flushedWork = true + } + w.wbuf1 = nil + + wbuf = w.wbuf2 + if wbuf.nobj == 0 { + putempty(wbuf) + } else { + putfull(wbuf) + w.flushedWork = true + } + w.wbuf2 = nil + } + if w.bytesMarked != 0 { + // dispose happens relatively infrequently. If this + // atomic becomes a problem, we should first try to + // dispose less and if necessary aggregate in a per-P + // counter. + atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked)) + w.bytesMarked = 0 + } + if w.heapScanWork != 0 { + gcController.heapScanWork.Add(w.heapScanWork) + w.heapScanWork = 0 + } +} + +// balance moves some work that's cached in this gcWork back on the +// global queue. +//go:nowritebarrierrec +func (w *gcWork) balance() { + if w.wbuf1 == nil { + return + } + if wbuf := w.wbuf2; wbuf.nobj != 0 { + putfull(wbuf) + w.flushedWork = true + w.wbuf2 = getempty() + } else if wbuf := w.wbuf1; wbuf.nobj > 4 { + w.wbuf1 = handoff(wbuf) + w.flushedWork = true // handoff did putfull + } else { + return + } + // We flushed a buffer to the full list, so wake a worker. + if gcphase == _GCmark { + gcController.enlistWorker() + } +} + +// empty reports whether w has no mark work available. +//go:nowritebarrierrec +func (w *gcWork) empty() bool { + return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) +} + +// Internally, the GC work pool is kept in arrays in work buffers. +// The gcWork interface caches a work buffer until full (or empty) to +// avoid contending on the global work buffer lists. + +type workbufhdr struct { + node lfnode // must be first + nobj int +} + +//go:notinheap +type workbuf struct { + workbufhdr + // account for the above fields + obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / goarch.PtrSize]uintptr +} + +// workbuf factory routines. These funcs are used to manage the +// workbufs. +// If the GC asks for some work these are the only routines that +// make wbufs available to the GC. + +func (b *workbuf) checknonempty() { + if b.nobj == 0 { + throw("workbuf is empty") + } +} + +func (b *workbuf) checkempty() { + if b.nobj != 0 { + throw("workbuf is not empty") + } +} + +// getempty pops an empty work buffer off the work.empty list, +// allocating new buffers if none are available. +//go:nowritebarrier +func getempty() *workbuf { + var b *workbuf + if work.empty != 0 { + b = (*workbuf)(work.empty.pop()) + if b != nil { + b.checkempty() + } + } + // Record that this may acquire the wbufSpans or heap lock to + // allocate a workbuf. + lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans) + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) + if b == nil { + // Allocate more workbufs. 
+ var s *mspan + if work.wbufSpans.free.first != nil { + lock(&work.wbufSpans.lock) + s = work.wbufSpans.free.first + if s != nil { + work.wbufSpans.free.remove(s) + work.wbufSpans.busy.insert(s) + } + unlock(&work.wbufSpans.lock) + } + if s == nil { + systemstack(func() { + s = mheap_.allocManual(workbufAlloc/pageSize, spanAllocWorkBuf) + }) + if s == nil { + throw("out of memory") + } + // Record the new span in the busy list. + lock(&work.wbufSpans.lock) + work.wbufSpans.busy.insert(s) + unlock(&work.wbufSpans.lock) + } + // Slice up the span into new workbufs. Return one and + // put the rest on the empty list. + for i := uintptr(0); i+_WorkbufSize <= workbufAlloc; i += _WorkbufSize { + newb := (*workbuf)(unsafe.Pointer(s.base() + i)) + newb.nobj = 0 + lfnodeValidate(&newb.node) + if i == 0 { + b = newb + } else { + putempty(newb) + } + } + } + return b +} + +// putempty puts a workbuf onto the work.empty list. +// Upon entry this goroutine owns b. The lfstack.push relinquishes ownership. +//go:nowritebarrier +func putempty(b *workbuf) { + b.checkempty() + work.empty.push(&b.node) +} + +// putfull puts the workbuf on the work.full list for the GC. +// putfull accepts partially full buffers so the GC can avoid competing +// with the mutators for ownership of partially full buffers. +//go:nowritebarrier +func putfull(b *workbuf) { + b.checknonempty() + work.full.push(&b.node) +} + +// trygetfull tries to get a full or partially empty workbuffer. +// If one is not immediately available return nil +//go:nowritebarrier +func trygetfull() *workbuf { + b := (*workbuf)(work.full.pop()) + if b != nil { + b.checknonempty() + return b + } + return b +} + +//go:nowritebarrier +func handoff(b *workbuf) *workbuf { + // Make new buffer with half of b's pointers. + b1 := getempty() + n := b.nobj / 2 + b.nobj -= n + b1.nobj = n + memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0])) + + // Put b on full list - let first half of b get stolen. + putfull(b) + return b1 +} + +// prepareFreeWorkbufs moves busy workbuf spans to free list so they +// can be freed to the heap. This must only be called when all +// workbufs are on the empty list. +func prepareFreeWorkbufs() { + lock(&work.wbufSpans.lock) + if work.full != 0 { + throw("cannot free workbufs when work.full != 0") + } + // Since all workbufs are on the empty list, we don't care + // which ones are in which spans. We can wipe the entire empty + // list and move all workbuf spans to the free list. + work.empty = 0 + work.wbufSpans.free.takeAll(&work.wbufSpans.busy) + unlock(&work.wbufSpans.lock) +} + +// freeSomeWbufs frees some workbufs back to the heap and returns +// true if it should be called again to free more. +func freeSomeWbufs(preemptible bool) bool { + const batchSize = 64 // ~1–2 µs per span. 
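handoff above balances work by keeping the newer half of a full buffer locally and publishing the older half for other workers to steal. The same split sketched with slices; publish stands in for putfull and is an assumption of this sketch:

package main

import "fmt"

// handoff keeps the newest half of b for local use and publishes the
// older half to a shared queue, mirroring workbuf handoff.
func handoff(b []uintptr, publish func([]uintptr)) []uintptr {
	n := len(b) / 2
	keep := make([]uintptr, n)
	copy(keep, b[len(b)-n:]) // newest half stays with this worker
	publish(b[:len(b)-n])    // older half goes to the shared full list
	return keep
}

func main() {
	var shared [][]uintptr
	local := []uintptr{1, 2, 3, 4, 5}
	local = handoff(local, func(half []uintptr) { shared = append(shared, half) })
	fmt.Println(local, shared) // [4 5] [[1 2 3]]
}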
+ lock(&work.wbufSpans.lock) + if gcphase != _GCoff || work.wbufSpans.free.isEmpty() { + unlock(&work.wbufSpans.lock) + return false + } + systemstack(func() { + gp := getg().m.curg + for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ { + span := work.wbufSpans.free.first + if span == nil { + break + } + work.wbufSpans.free.remove(span) + mheap_.freeManual(span, spanAllocWorkBuf) + } + }) + more := !work.wbufSpans.free.isEmpty() + unlock(&work.wbufSpans.lock) + return more +} diff --git a/contrib/go/_std_1.18/src/runtime/mheap.go b/contrib/go/_std_1.18/src/runtime/mheap.go new file mode 100644 index 0000000000..ecbd0a3a49 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mheap.go @@ -0,0 +1,2119 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Page heap. +// +// See malloc.go for overview. + +package runtime + +import ( + "internal/cpu" + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +const ( + // minPhysPageSize is a lower-bound on the physical page size. The + // true physical page size may be larger than this. In contrast, + // sys.PhysPageSize is an upper-bound on the physical page size. + minPhysPageSize = 4096 + + // maxPhysPageSize is the maximum page size the runtime supports. + maxPhysPageSize = 512 << 10 + + // maxPhysHugePageSize sets an upper-bound on the maximum huge page size + // that the runtime supports. + maxPhysHugePageSize = pallocChunkBytes + + // pagesPerReclaimerChunk indicates how many pages to scan from the + // pageInUse bitmap at a time. Used by the page reclaimer. + // + // Higher values reduce contention on scanning indexes (such as + // h.reclaimIndex), but increase the minimum latency of the + // operation. + // + // The time required to scan this many pages can vary a lot depending + // on how many spans are actually freed. Experimentally, it can + // scan for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only + // free spans at ~32 MB/ms. Using 512 pages bounds this at + // roughly 100µs. + // + // Must be a multiple of the pageInUse bitmap element size and + // must also evenly divide pagesPerArena. + pagesPerReclaimerChunk = 512 + + // physPageAlignedStacks indicates whether stack allocations must be + // physical page aligned. This is a requirement for MAP_STACK on + // OpenBSD. + physPageAlignedStacks = GOOS == "openbsd" +) + +// Main malloc heap. +// The heap itself is the "free" and "scav" treaps, +// but all the other global data is here too. +// +// mheap must not be heap-allocated because it contains mSpanLists, +// which must not be heap-allocated. +// +//go:notinheap +type mheap struct { + // lock must only be acquired on the system stack, otherwise a g + // could self-deadlock if its stack grows with the lock held. + lock mutex + pages pageAlloc // page allocation data structure + + sweepgen uint32 // sweep generation, see comment in mspan; written during STW + + // allspans is a slice of all mspans ever created. Each mspan + // appears exactly once. + // + // The memory for allspans is manually managed and can be + // reallocated and move as the heap grows. + // + // In general, allspans is protected by mheap_.lock, which + // prevents concurrent access as well as freeing the backing + // store. Accesses during STW might not hold the lock, but + // must ensure that allocation cannot happen around the + // access (since that may free the backing store). 
+ allspans []*mspan // all spans out there + + // _ uint32 // align uint64 fields on 32-bit for atomics + + // Proportional sweep + // + // These parameters represent a linear function from gcController.heapLive + // to page sweep count. The proportional sweep system works to + // stay in the black by keeping the current page sweep count + // above this line at the current gcController.heapLive. + // + // The line has slope sweepPagesPerByte and passes through a + // basis point at (sweepHeapLiveBasis, pagesSweptBasis). At + // any given time, the system is at (gcController.heapLive, + // pagesSwept) in this space. + // + // It is important that the line pass through a point we + // control rather than simply starting at a 0,0 origin + // because that lets us adjust sweep pacing at any time while + // accounting for current progress. If we could only adjust + // the slope, it would create a discontinuity in debt if any + // progress has already been made. + pagesInUse atomic.Uint64 // pages of spans in stats mSpanInUse + pagesSwept atomic.Uint64 // pages swept this cycle + pagesSweptBasis atomic.Uint64 // pagesSwept to use as the origin of the sweep ratio + sweepHeapLiveBasis uint64 // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without + sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without + // TODO(austin): pagesInUse should be a uintptr, but the 386 + // compiler can't 8-byte align fields. + + // scavengeGoal is the amount of total retained heap memory (measured by + // heapRetained) that the runtime will try to maintain by returning memory + // to the OS. + // + // Accessed atomically. + scavengeGoal uint64 + + // Page reclaimer state + + // reclaimIndex is the page index in allArenas of next page to + // reclaim. Specifically, it refers to page (i % + // pagesPerArena) of arena allArenas[i / pagesPerArena]. + // + // If this is >= 1<<63, the page reclaimer is done scanning + // the page marks. + reclaimIndex atomic.Uint64 + + // reclaimCredit is spare credit for extra pages swept. Since + // the page reclaimer works in large chunks, it may reclaim + // more than requested. Any spare pages released go to this + // credit pool. + reclaimCredit atomic.Uintptr + + // arenas is the heap arena map. It points to the metadata for + // the heap for every arena frame of the entire usable virtual + // address space. + // + // Use arenaIndex to compute indexes into this array. + // + // For regions of the address space that are not backed by the + // Go heap, the arena map contains nil. + // + // Modifications are protected by mheap_.lock. Reads can be + // performed without locking; however, a given entry can + // transition from nil to non-nil at any time when the lock + // isn't held. (Entries never transitions back to nil.) + // + // In general, this is a two-level mapping consisting of an L1 + // map and possibly many L2 maps. This saves space when there + // are a huge number of arena frames. However, on many + // platforms (even 64-bit), arenaL1Bits is 0, making this + // effectively a single-level map. In this case, arenas[0] + // will never be nil. + arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*heapArena + + // heapArenaAlloc is pre-reserved space for allocating heapArena + // objects. This is only used on 32-bit, where we pre-reserve + // this space to avoid interleaving it with the heap itself. 
+ heapArenaAlloc linearAlloc + + // arenaHints is a list of addresses at which to attempt to + // add more heap arenas. This is initially populated with a + // set of general hint addresses, and grown with the bounds of + // actual heap arena ranges. + arenaHints *arenaHint + + // arena is a pre-reserved space for allocating heap arenas + // (the actual arenas). This is only used on 32-bit. + arena linearAlloc + + // allArenas is the arenaIndex of every mapped arena. This can + // be used to iterate through the address space. + // + // Access is protected by mheap_.lock. However, since this is + // append-only and old backing arrays are never freed, it is + // safe to acquire mheap_.lock, copy the slice header, and + // then release mheap_.lock. + allArenas []arenaIdx + + // sweepArenas is a snapshot of allArenas taken at the + // beginning of the sweep cycle. This can be read safely by + // simply blocking GC (by disabling preemption). + sweepArenas []arenaIdx + + // markArenas is a snapshot of allArenas taken at the beginning + // of the mark cycle. Because allArenas is append-only, neither + // this slice nor its contents will change during the mark, so + // it can be read safely. + markArenas []arenaIdx + + // curArena is the arena that the heap is currently growing + // into. This should always be physPageSize-aligned. + curArena struct { + base, end uintptr + } + + _ uint32 // ensure 64-bit alignment of central + + // central free lists for small size classes. + // the padding makes sure that the mcentrals are + // spaced CacheLinePadSize bytes apart, so that each mcentral.lock + // gets its own cache line. + // central is indexed by spanClass. + central [numSpanClasses]struct { + mcentral mcentral + pad [cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize]byte + } + + spanalloc fixalloc // allocator for span* + cachealloc fixalloc // allocator for mcache* + specialfinalizeralloc fixalloc // allocator for specialfinalizer* + specialprofilealloc fixalloc // allocator for specialprofile* + specialReachableAlloc fixalloc // allocator for specialReachable + speciallock mutex // lock for special record allocators. + arenaHintAlloc fixalloc // allocator for arenaHints + + unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF +} + +var mheap_ mheap + +// A heapArena stores metadata for a heap arena. heapArenas are stored +// outside of the Go heap and accessed via the mheap_.arenas index. +// +//go:notinheap +type heapArena struct { + // bitmap stores the pointer/scalar bitmap for the words in + // this arena. See mbitmap.go for a description. Use the + // heapBits type to access this. + bitmap [heapArenaBitmapBytes]byte + + // spans maps from virtual address page ID within this arena to *mspan. + // For allocated spans, their pages map to the span itself. + // For free spans, only the lowest and highest pages map to the span itself. + // Internal pages map to an arbitrary span. + // For pages that have never been allocated, spans entries are nil. + // + // Modifications are protected by mheap.lock. Reads can be + // performed without locking, but ONLY from indexes that are + // known to contain in-use or stack spans. This means there + // must not be a safe-point between establishing that an + // address is live and looking it up in the spans array. + spans [pagesPerArena]*mspan + + // pageInUse is a bitmap that indicates which spans are in + // state mSpanInUse. 
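The central array earlier in this struct pads each mcentral out to a cache-line boundary so neighbouring locks never share a line. The same array-length arithmetic works for any struct; a standalone sketch, with a fixed 64-byte line assumed in place of cpu.CacheLinePadSize:

package main

import (
	"fmt"
	"unsafe"
)

const cacheLine = 64 // assumed cache-line size; the runtime uses cpu.CacheLinePadSize

type shard struct { // stand-in for mcentral
	lock int64
	n    uint64
}

// padded rounds every element up to a whole number of cache lines so
// that adjacent shards never share a line.
type padded struct {
	s shard
	_ [cacheLine - unsafe.Sizeof(shard{})%cacheLine]byte
}

func main() {
	var shards [4]padded
	fmt.Println(unsafe.Sizeof(shards[0]), unsafe.Sizeof(shards)) // 64 256
}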
This bitmap is indexed by page number, + // but only the bit corresponding to the first page in each + // span is used. + // + // Reads and writes are atomic. + pageInUse [pagesPerArena / 8]uint8 + + // pageMarks is a bitmap that indicates which spans have any + // marked objects on them. Like pageInUse, only the bit + // corresponding to the first page in each span is used. + // + // Writes are done atomically during marking. Reads are + // non-atomic and lock-free since they only occur during + // sweeping (and hence never race with writes). + // + // This is used to quickly find whole spans that can be freed. + // + // TODO(austin): It would be nice if this was uint64 for + // faster scanning, but we don't have 64-bit atomic bit + // operations. + pageMarks [pagesPerArena / 8]uint8 + + // pageSpecials is a bitmap that indicates which spans have + // specials (finalizers or other). Like pageInUse, only the bit + // corresponding to the first page in each span is used. + // + // Writes are done atomically whenever a special is added to + // a span and whenever the last special is removed from a span. + // Reads are done atomically to find spans containing specials + // during marking. + pageSpecials [pagesPerArena / 8]uint8 + + // checkmarks stores the debug.gccheckmark state. It is only + // used if debug.gccheckmark > 0. + checkmarks *checkmarksMap + + // zeroedBase marks the first byte of the first page in this + // arena which hasn't been used yet and is therefore already + // zero. zeroedBase is relative to the arena base. + // Increases monotonically until it hits heapArenaBytes. + // + // This field is sufficient to determine if an allocation + // needs to be zeroed because the page allocator follows an + // address-ordered first-fit policy. + // + // Read atomically and written with an atomic CAS. + zeroedBase uintptr +} + +// arenaHint is a hint for where to grow the heap arenas. See +// mheap_.arenaHints. +// +//go:notinheap +type arenaHint struct { + addr uintptr + down bool + next *arenaHint +} + +// An mspan is a run of pages. +// +// When a mspan is in the heap free treap, state == mSpanFree +// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span. +// If the mspan is in the heap scav treap, then in addition to the +// above scavenged == true. scavenged == false in all other cases. +// +// When a mspan is allocated, state == mSpanInUse or mSpanManual +// and heapmap(i) == span for all s->start <= i < s->start+s->npages. + +// Every mspan is in one doubly-linked list, either in the mheap's +// busy list or one of the mcentral's span lists. + +// An mspan representing actual memory has state mSpanInUse, +// mSpanManual, or mSpanFree. Transitions between these states are +// constrained as follows: +// +// * A span may transition from free to in-use or manual during any GC +// phase. +// +// * During sweeping (gcphase == _GCoff), a span may transition from +// in-use to free (as a result of sweeping) or manual to free (as a +// result of stacks being freed). +// +// * During GC (gcphase != _GCoff), a span *must not* transition from +// manual or in-use to free. Because concurrent GC may read a pointer +// and then look up its span, the span state must be monotonic. +// +// Setting mspan.state to mSpanInUse or mSpanManual must be done +// atomically and only after all other span fields are valid. +// Likewise, if inspecting a span is contingent on it being +// mSpanInUse, the state should be loaded atomically and checked +// before depending on other fields. 
This allows the garbage collector +// to safely deal with potentially invalid pointers, since resolving +// such pointers may race with a span being allocated. +type mSpanState uint8 + +const ( + mSpanDead mSpanState = iota + mSpanInUse // allocated for garbage collected heap + mSpanManual // allocated for manual management (e.g., stack allocator) +) + +// mSpanStateNames are the names of the span states, indexed by +// mSpanState. +var mSpanStateNames = []string{ + "mSpanDead", + "mSpanInUse", + "mSpanManual", + "mSpanFree", +} + +// mSpanStateBox holds an mSpanState and provides atomic operations on +// it. This is a separate type to disallow accidental comparison or +// assignment with mSpanState. +type mSpanStateBox struct { + s mSpanState +} + +func (b *mSpanStateBox) set(s mSpanState) { + atomic.Store8((*uint8)(&b.s), uint8(s)) +} + +func (b *mSpanStateBox) get() mSpanState { + return mSpanState(atomic.Load8((*uint8)(&b.s))) +} + +// mSpanList heads a linked list of spans. +// +//go:notinheap +type mSpanList struct { + first *mspan // first span in list, or nil if none + last *mspan // last span in list, or nil if none +} + +//go:notinheap +type mspan struct { + next *mspan // next span in list, or nil if none + prev *mspan // previous span in list, or nil if none + list *mSpanList // For debugging. TODO: Remove. + + startAddr uintptr // address of first byte of span aka s.base() + npages uintptr // number of pages in span + + manualFreeList gclinkptr // list of free objects in mSpanManual spans + + // freeindex is the slot index between 0 and nelems at which to begin scanning + // for the next free object in this span. + // Each allocation scans allocBits starting at freeindex until it encounters a 0 + // indicating a free object. freeindex is then adjusted so that subsequent scans begin + // just past the newly discovered free object. + // + // If freeindex == nelem, this span has no free objects. + // + // allocBits is a bitmap of objects in this span. + // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 + // then object n is free; + // otherwise, object n is allocated. Bits starting at nelem are + // undefined and should never be referenced. + // + // Object n starts at address n*elemsize + (start << pageShift). + freeindex uintptr + // TODO: Look up nelems from sizeclass and remove this field if it + // helps performance. + nelems uintptr // number of object in the span. + + // Cache of the allocBits at freeindex. allocCache is shifted + // such that the lowest bit corresponds to the bit freeindex. + // allocCache holds the complement of allocBits, thus allowing + // ctz (count trailing zero) to use it directly. + // allocCache may contain bits beyond s.nelems; the caller must ignore + // these. + allocCache uint64 + + // allocBits and gcmarkBits hold pointers to a span's mark and + // allocation bits. The pointers are 8 byte aligned. + // There are three arenas where this data is held. + // free: Dirty arenas that are no longer accessed + // and can be reused. + // next: Holds information to be used in the next GC cycle. + // current: Information being used during this GC cycle. + // previous: Information being used during the last GC cycle. + // A new GC cycle starts with the call to finishsweep_m. + // finishsweep_m moves the previous arena to the free arena, + // the current arena to the previous arena, and + // the next arena to the current arena. 
+ // The next arena is populated as the spans request + // memory to hold gcmarkBits for the next GC cycle as well + // as allocBits for newly allocated spans. + // + // The pointer arithmetic is done "by hand" instead of using + // arrays to avoid bounds checks along critical performance + // paths. + // The sweep will free the old allocBits and set allocBits to the + // gcmarkBits. The gcmarkBits are replaced with a fresh zeroed + // out memory. + allocBits *gcBits + gcmarkBits *gcBits + + // sweep generation: + // if sweepgen == h->sweepgen - 2, the span needs sweeping + // if sweepgen == h->sweepgen - 1, the span is currently being swept + // if sweepgen == h->sweepgen, the span is swept and ready to use + // if sweepgen == h->sweepgen + 1, the span was cached before sweep began and is still cached, and needs sweeping + // if sweepgen == h->sweepgen + 3, the span was swept and then cached and is still cached + // h->sweepgen is incremented by 2 after every GC + + sweepgen uint32 + divMul uint32 // for divide by elemsize + allocCount uint16 // number of allocated objects + spanclass spanClass // size class and noscan (uint8) + state mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods) + needzero uint8 // needs to be zeroed before allocation + elemsize uintptr // computed from sizeclass or from npages + limit uintptr // end of data in span + speciallock mutex // guards specials list + specials *special // linked list of special records sorted by offset. +} + +func (s *mspan) base() uintptr { + return s.startAddr +} + +func (s *mspan) layout() (size, n, total uintptr) { + total = s.npages << _PageShift + size = s.elemsize + if size > 0 { + n = total / size + } + return +} + +// recordspan adds a newly allocated span to h.allspans. +// +// This only happens the first time a span is allocated from +// mheap.spanalloc (it is not called when a span is reused). +// +// Write barriers are disallowed here because it can be called from +// gcWork when allocating new workbufs. However, because it's an +// indirect call from the fixalloc initializer, the compiler can't see +// this. +// +// The heap lock must be held. +// +//go:nowritebarrierrec +func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { + h := (*mheap)(vh) + s := (*mspan)(p) + + assertLockHeld(&h.lock) + + if len(h.allspans) >= cap(h.allspans) { + n := 64 * 1024 / goarch.PtrSize + if n < cap(h.allspans)*3/2 { + n = cap(h.allspans) * 3 / 2 + } + var new []*mspan + sp := (*slice)(unsafe.Pointer(&new)) + sp.array = sysAlloc(uintptr(n)*goarch.PtrSize, &memstats.other_sys) + if sp.array == nil { + throw("runtime: cannot allocate memory") + } + sp.len = len(h.allspans) + sp.cap = n + if len(h.allspans) > 0 { + copy(new, h.allspans) + } + oldAllspans := h.allspans + *(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new)) + if len(oldAllspans) != 0 { + sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys) + } + } + h.allspans = h.allspans[:len(h.allspans)+1] + h.allspans[len(h.allspans)-1] = s +} + +// A spanClass represents the size class and noscan-ness of a span. +// +// Each size class has a noscan spanClass and a scan spanClass. The +// noscan spanClass contains only noscan objects, which do not contain +// pointers and thus do not need to be scanned by the garbage +// collector. 
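Given the sweepgen encoding in the comment above, with h.sweepgen advancing by two per GC, a span's sweep state is just an offset from the heap's generation. A small classifier for that offset; the runtime never materializes such a function, this is purely illustrative:

package main

import "fmt"

// sweepState names the relationship between a span's sweepgen and the
// heap's sweepgen, following the table in the mspan comment.
func sweepState(spanGen, heapGen uint32) string {
	switch spanGen {
	case heapGen - 2:
		return "needs sweeping"
	case heapGen - 1:
		return "being swept"
	case heapGen:
		return "swept, ready to use"
	case heapGen + 1:
		return "cached before sweep began, still needs sweeping"
	case heapGen + 3:
		return "swept, then cached"
	default:
		return "invalid"
	}
}

func main() {
	const heapGen uint32 = 10
	for _, g := range []uint32{8, 9, 10, 11, 13} {
		fmt.Println(g, "->", sweepState(g, heapGen))
	}
}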
+type spanClass uint8 + +const ( + numSpanClasses = _NumSizeClasses << 1 + tinySpanClass = spanClass(tinySizeClass<<1 | 1) +) + +func makeSpanClass(sizeclass uint8, noscan bool) spanClass { + return spanClass(sizeclass<<1) | spanClass(bool2int(noscan)) +} + +func (sc spanClass) sizeclass() int8 { + return int8(sc >> 1) +} + +func (sc spanClass) noscan() bool { + return sc&1 != 0 +} + +// arenaIndex returns the index into mheap_.arenas of the arena +// containing metadata for p. This index combines of an index into the +// L1 map and an index into the L2 map and should be used as +// mheap_.arenas[ai.l1()][ai.l2()]. +// +// If p is outside the range of valid heap addresses, either l1() or +// l2() will be out of bounds. +// +// It is nosplit because it's called by spanOf and several other +// nosplit functions. +// +//go:nosplit +func arenaIndex(p uintptr) arenaIdx { + return arenaIdx((p - arenaBaseOffset) / heapArenaBytes) +} + +// arenaBase returns the low address of the region covered by heap +// arena i. +func arenaBase(i arenaIdx) uintptr { + return uintptr(i)*heapArenaBytes + arenaBaseOffset +} + +type arenaIdx uint + +func (i arenaIdx) l1() uint { + if arenaL1Bits == 0 { + // Let the compiler optimize this away if there's no + // L1 map. + return 0 + } else { + return uint(i) >> arenaL1Shift + } +} + +func (i arenaIdx) l2() uint { + if arenaL1Bits == 0 { + return uint(i) + } else { + return uint(i) & (1<<arenaL2Bits - 1) + } +} + +// inheap reports whether b is a pointer into a (potentially dead) heap object. +// It returns false for pointers into mSpanManual spans. +// Non-preemptible because it is used by write barriers. +//go:nowritebarrier +//go:nosplit +func inheap(b uintptr) bool { + return spanOfHeap(b) != nil +} + +// inHeapOrStack is a variant of inheap that returns true for pointers +// into any allocated heap span. +// +//go:nowritebarrier +//go:nosplit +func inHeapOrStack(b uintptr) bool { + s := spanOf(b) + if s == nil || b < s.base() { + return false + } + switch s.state.get() { + case mSpanInUse, mSpanManual: + return b < s.limit + default: + return false + } +} + +// spanOf returns the span of p. If p does not point into the heap +// arena or no span has ever contained p, spanOf returns nil. +// +// If p does not point to allocated memory, this may return a non-nil +// span that does *not* contain p. If this is a possibility, the +// caller should either call spanOfHeap or check the span bounds +// explicitly. +// +// Must be nosplit because it has callers that are nosplit. +// +//go:nosplit +func spanOf(p uintptr) *mspan { + // This function looks big, but we use a lot of constant + // folding around arenaL1Bits to get it under the inlining + // budget. Also, many of the checks here are safety checks + // that Go needs to do anyway, so the generated code is quite + // short. + ri := arenaIndex(p) + if arenaL1Bits == 0 { + // If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can. + if ri.l2() >= uint(len(mheap_.arenas[0])) { + return nil + } + } else { + // If there's an L1, then ri.l1() can be out of bounds but ri.l2() can't. + if ri.l1() >= uint(len(mheap_.arenas)) { + return nil + } + } + l2 := mheap_.arenas[ri.l1()] + if arenaL1Bits != 0 && l2 == nil { // Should never happen if there's no L1. + return nil + } + ha := l2[ri.l2()] + if ha == nil { + return nil + } + return ha.spans[(p/pageSize)%pagesPerArena] +} + +// spanOfUnchecked is equivalent to spanOf, but the caller must ensure +// that p points into an allocated heap arena. 
+// +// Must be nosplit because it has callers that are nosplit. +// +//go:nosplit +func spanOfUnchecked(p uintptr) *mspan { + ai := arenaIndex(p) + return mheap_.arenas[ai.l1()][ai.l2()].spans[(p/pageSize)%pagesPerArena] +} + +// spanOfHeap is like spanOf, but returns nil if p does not point to a +// heap object. +// +// Must be nosplit because it has callers that are nosplit. +// +//go:nosplit +func spanOfHeap(p uintptr) *mspan { + s := spanOf(p) + // s is nil if it's never been allocated. Otherwise, we check + // its state first because we don't trust this pointer, so we + // have to synchronize with span initialization. Then, it's + // still possible we picked up a stale span pointer, so we + // have to check the span's bounds. + if s == nil || s.state.get() != mSpanInUse || p < s.base() || p >= s.limit { + return nil + } + return s +} + +// pageIndexOf returns the arena, page index, and page mask for pointer p. +// The caller must ensure p is in the heap. +func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) { + ai := arenaIndex(p) + arena = mheap_.arenas[ai.l1()][ai.l2()] + pageIdx = ((p / pageSize) / 8) % uintptr(len(arena.pageInUse)) + pageMask = byte(1 << ((p / pageSize) % 8)) + return +} + +// Initialize the heap. +func (h *mheap) init() { + lockInit(&h.lock, lockRankMheap) + lockInit(&h.speciallock, lockRankMheapSpecial) + + h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys) + h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys) + h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys) + h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys) + h.specialReachableAlloc.init(unsafe.Sizeof(specialReachable{}), nil, nil, &memstats.other_sys) + h.arenaHintAlloc.init(unsafe.Sizeof(arenaHint{}), nil, nil, &memstats.other_sys) + + // Don't zero mspan allocations. Background sweeping can + // inspect a span concurrently with allocating it, so it's + // important that the span's sweepgen survive across freeing + // and re-allocating a span to prevent background sweeping + // from improperly cas'ing it from 0. + // + // This is safe because mspan contains no heap pointers. + h.spanalloc.zero = false + + // h->mapcache needs no init + + for i := range h.central { + h.central[i].mcentral.init(spanClass(i)) + } + + h.pages.init(&h.lock, &memstats.gcMiscSys) +} + +// reclaim sweeps and reclaims at least npage pages into the heap. +// It is called before allocating npage pages to keep growth in check. +// +// reclaim implements the page-reclaimer half of the sweeper. +// +// h.lock must NOT be held. +func (h *mheap) reclaim(npage uintptr) { + // TODO(austin): Half of the time spent freeing spans is in + // locking/unlocking the heap (even with low contention). We + // could make the slow path here several times faster by + // batching heap frees. + + // Bail early if there's no more reclaim work. + if h.reclaimIndex.Load() >= 1<<63 { + return + } + + // Disable preemption so the GC can't start while we're + // sweeping, so we can read h.sweepArenas, and so + // traceGCSweepStart/Done pair on the P. + mp := acquirem() + + if trace.enabled { + traceGCSweepStart() + } + + arenas := h.sweepArenas + locked := false + for npage > 0 { + // Pull from accumulated credit first. + if credit := h.reclaimCredit.Load(); credit > 0 { + take := credit + if take > npage { + // Take only what we need. 
+ take = npage + } + if h.reclaimCredit.CompareAndSwap(credit, credit-take) { + npage -= take + } + continue + } + + // Claim a chunk of work. + idx := uintptr(h.reclaimIndex.Add(pagesPerReclaimerChunk) - pagesPerReclaimerChunk) + if idx/pagesPerArena >= uintptr(len(arenas)) { + // Page reclaiming is done. + h.reclaimIndex.Store(1 << 63) + break + } + + if !locked { + // Lock the heap for reclaimChunk. + lock(&h.lock) + locked = true + } + + // Scan this chunk. + nfound := h.reclaimChunk(arenas, idx, pagesPerReclaimerChunk) + if nfound <= npage { + npage -= nfound + } else { + // Put spare pages toward global credit. + h.reclaimCredit.Add(nfound - npage) + npage = 0 + } + } + if locked { + unlock(&h.lock) + } + + if trace.enabled { + traceGCSweepDone() + } + releasem(mp) +} + +// reclaimChunk sweeps unmarked spans that start at page indexes [pageIdx, pageIdx+n). +// It returns the number of pages returned to the heap. +// +// h.lock must be held and the caller must be non-preemptible. Note: h.lock may be +// temporarily unlocked and re-locked in order to do sweeping or if tracing is +// enabled. +func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { + // The heap lock must be held because this accesses the + // heapArena.spans arrays using potentially non-live pointers. + // In particular, if a span were freed and merged concurrently + // with this probing heapArena.spans, it would be possible to + // observe arbitrary, stale span pointers. + assertLockHeld(&h.lock) + + n0 := n + var nFreed uintptr + sl := sweep.active.begin() + if !sl.valid { + return 0 + } + for n > 0 { + ai := arenas[pageIdx/pagesPerArena] + ha := h.arenas[ai.l1()][ai.l2()] + + // Get a chunk of the bitmap to work on. + arenaPage := uint(pageIdx % pagesPerArena) + inUse := ha.pageInUse[arenaPage/8:] + marked := ha.pageMarks[arenaPage/8:] + if uintptr(len(inUse)) > n/8 { + inUse = inUse[:n/8] + marked = marked[:n/8] + } + + // Scan this bitmap chunk for spans that are in-use + // but have no marked objects on them. + for i := range inUse { + inUseUnmarked := atomic.Load8(&inUse[i]) &^ marked[i] + if inUseUnmarked == 0 { + continue + } + + for j := uint(0); j < 8; j++ { + if inUseUnmarked&(1<<j) != 0 { + s := ha.spans[arenaPage+uint(i)*8+j] + if s, ok := sl.tryAcquire(s); ok { + npages := s.npages + unlock(&h.lock) + if s.sweep(false) { + nFreed += npages + } + lock(&h.lock) + // Reload inUse. It's possible nearby + // spans were freed when we dropped the + // lock and we don't want to get stale + // pointers from the spans array. + inUseUnmarked = atomic.Load8(&inUse[i]) &^ marked[i] + } + } + } + } + + // Advance. + pageIdx += uintptr(len(inUse) * 8) + n -= uintptr(len(inUse) * 8) + } + sweep.active.end(sl) + if trace.enabled { + unlock(&h.lock) + // Account for pages scanned but not reclaimed. + traceGCSweepSpan((n0 - nFreed) * pageSize) + lock(&h.lock) + } + + assertLockHeld(&h.lock) // Must be locked on return. + return nFreed +} + +// spanAllocType represents the type of allocation to make, or +// the type of allocation to be freed. +type spanAllocType uint8 + +const ( + spanAllocHeap spanAllocType = iota // heap span + spanAllocStack // stack span + spanAllocPtrScalarBits // unrolled GC prog bitmap span + spanAllocWorkBuf // work buf span +) + +// manual returns true if the span allocation is manually managed. +func (s spanAllocType) manual() bool { + return s != spanAllocHeap +} + +// alloc allocates a new span of npage pages from the GC'd heap. 
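The reclaim credit loop above drains reclaimCredit with a compare-and-swap so concurrent reclaimers can never double-spend pages. The same take-only-what-you-need pattern with sync/atomic on an ordinary counter (illustrative; the runtime uses its internal atomic.Uintptr):

package main

import (
	"fmt"
	"sync/atomic"
)

// takeCredit atomically deducts up to want units from pool and returns
// how much it actually took. A failed CAS retries against the freshly
// observed value, like the reclaimCredit loop.
func takeCredit(pool *uint64, want uint64) uint64 {
	for {
		credit := atomic.LoadUint64(pool)
		if credit == 0 {
			return 0
		}
		take := credit
		if take > want {
			take = want // take only what we need
		}
		if atomic.CompareAndSwapUint64(pool, credit, credit-take) {
			return take
		}
	}
}

func main() {
	pool := uint64(100)
	fmt.Println(takeCredit(&pool, 30), atomic.LoadUint64(&pool)) // 30 70
}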
+// +// spanclass indicates the span's size class and scannability. +// +// Returns a span that has been fully initialized. span.needzero indicates +// whether the span has been zeroed. Note that it may not be. +func (h *mheap) alloc(npages uintptr, spanclass spanClass) *mspan { + // Don't do any operations that lock the heap on the G stack. + // It might trigger stack growth, and the stack growth code needs + // to be able to allocate heap. + var s *mspan + systemstack(func() { + // To prevent excessive heap growth, before allocating n pages + // we need to sweep and reclaim at least n pages. + if !isSweepDone() { + h.reclaim(npages) + } + s = h.allocSpan(npages, spanAllocHeap, spanclass) + }) + return s +} + +// allocManual allocates a manually-managed span of npage pages. +// allocManual returns nil if allocation fails. +// +// allocManual adds the bytes used to *stat, which should be a +// memstats in-use field. Unlike allocations in the GC'd heap, the +// allocation does *not* count toward heap_inuse or heap_sys. +// +// The memory backing the returned span may not be zeroed if +// span.needzero is set. +// +// allocManual must be called on the system stack because it may +// acquire the heap lock via allocSpan. See mheap for details. +// +// If new code is written to call allocManual, do NOT use an +// existing spanAllocType value and instead declare a new one. +// +//go:systemstack +func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan { + if !typ.manual() { + throw("manual span allocation called with non-manually-managed type") + } + return h.allocSpan(npages, typ, 0) +} + +// setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize)) +// is s. +func (h *mheap) setSpans(base, npage uintptr, s *mspan) { + p := base / pageSize + ai := arenaIndex(base) + ha := h.arenas[ai.l1()][ai.l2()] + for n := uintptr(0); n < npage; n++ { + i := (p + n) % pagesPerArena + if i == 0 { + ai = arenaIndex(base + n*pageSize) + ha = h.arenas[ai.l1()][ai.l2()] + } + ha.spans[i] = s + } +} + +// allocNeedsZero checks if the region of address space [base, base+npage*pageSize), +// assumed to be allocated, needs to be zeroed, updating heap arena metadata for +// future allocations. +// +// This must be called each time pages are allocated from the heap, even if the page +// allocator can otherwise prove the memory it's allocating is already zero because +// they're fresh from the operating system. It updates heapArena metadata that is +// critical for future page allocations. +// +// There are no locking constraints on this method. +func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) { + for npage > 0 { + ai := arenaIndex(base) + ha := h.arenas[ai.l1()][ai.l2()] + + zeroedBase := atomic.Loaduintptr(&ha.zeroedBase) + arenaBase := base % heapArenaBytes + if arenaBase < zeroedBase { + // We extended into the non-zeroed part of the + // arena, so this region needs to be zeroed before use. + // + // zeroedBase is monotonically increasing, so if we see this now then + // we can be sure we need to zero this memory region. + // + // We still need to update zeroedBase for this arena, and + // potentially more arenas. + needZero = true + } + // We may observe arenaBase > zeroedBase if we're racing with one or more + // allocations which are acquiring memory directly before us in the address + // space. But, because we know no one else is acquiring *this* memory, it's + // still safe to not zero. 
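allocNeedsZero's test above comes down to a watermark comparison: offsets at or beyond zeroedBase have never been handed out and are therefore still zero, while anything below it may be dirty. A single-arena, single-threaded sketch of that decision (the real code advances the watermark with an atomic CAS, as the continuation below shows; arenaBytes here is a toy size):

package main

import "fmt"

const arenaBytes = 1 << 16 // toy arena size, not the runtime's heapArenaBytes

// needsZero reports whether [base, base+size) inside one toy arena must
// be zeroed before use, and returns the advanced zero watermark.
func needsZero(zeroedBase, base, size uintptr) (needZero bool, newZeroedBase uintptr) {
	limit := base + size
	if limit > arenaBytes {
		limit = arenaBytes
	}
	if base < zeroedBase {
		needZero = true // extending into memory that was used before
	}
	if limit > zeroedBase {
		zeroedBase = limit // the watermark only ever moves forward
	}
	return needZero, zeroedBase
}

func main() {
	fmt.Println(needsZero(4096, 8192, 4096))  // false 12288: untouched memory
	fmt.Println(needsZero(12288, 8192, 4096)) // true 12288: reused memory
}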
+ + // Compute how far into the arena we extend into, capped + // at heapArenaBytes. + arenaLimit := arenaBase + npage*pageSize + if arenaLimit > heapArenaBytes { + arenaLimit = heapArenaBytes + } + // Increase ha.zeroedBase so it's >= arenaLimit. + // We may be racing with other updates. + for arenaLimit > zeroedBase { + if atomic.Casuintptr(&ha.zeroedBase, zeroedBase, arenaLimit) { + break + } + zeroedBase = atomic.Loaduintptr(&ha.zeroedBase) + // Double check basic conditions of zeroedBase. + if zeroedBase <= arenaLimit && zeroedBase > arenaBase { + // The zeroedBase moved into the space we were trying to + // claim. That's very bad, and indicates someone allocated + // the same region we did. + throw("potentially overlapping in-use allocations detected") + } + } + + // Move base forward and subtract from npage to move into + // the next arena, or finish. + base += arenaLimit - arenaBase + npage -= (arenaLimit - arenaBase) / pageSize + } + return +} + +// tryAllocMSpan attempts to allocate an mspan object from +// the P-local cache, but may fail. +// +// h.lock need not be held. +// +// This caller must ensure that its P won't change underneath +// it during this function. Currently to ensure that we enforce +// that the function is run on the system stack, because that's +// the only place it is used now. In the future, this requirement +// may be relaxed if its use is necessary elsewhere. +// +//go:systemstack +func (h *mheap) tryAllocMSpan() *mspan { + pp := getg().m.p.ptr() + // If we don't have a p or the cache is empty, we can't do + // anything here. + if pp == nil || pp.mspancache.len == 0 { + return nil + } + // Pull off the last entry in the cache. + s := pp.mspancache.buf[pp.mspancache.len-1] + pp.mspancache.len-- + return s +} + +// allocMSpanLocked allocates an mspan object. +// +// h.lock must be held. +// +// allocMSpanLocked must be called on the system stack because +// its caller holds the heap lock. See mheap for details. +// Running on the system stack also ensures that we won't +// switch Ps during this function. See tryAllocMSpan for details. +// +//go:systemstack +func (h *mheap) allocMSpanLocked() *mspan { + assertLockHeld(&h.lock) + + pp := getg().m.p.ptr() + if pp == nil { + // We don't have a p so just do the normal thing. + return (*mspan)(h.spanalloc.alloc()) + } + // Refill the cache if necessary. + if pp.mspancache.len == 0 { + const refillCount = len(pp.mspancache.buf) / 2 + for i := 0; i < refillCount; i++ { + pp.mspancache.buf[i] = (*mspan)(h.spanalloc.alloc()) + } + pp.mspancache.len = refillCount + } + // Pull off the last entry in the cache. + s := pp.mspancache.buf[pp.mspancache.len-1] + pp.mspancache.len-- + return s +} + +// freeMSpanLocked free an mspan object. +// +// h.lock must be held. +// +// freeMSpanLocked must be called on the system stack because +// its caller holds the heap lock. See mheap for details. +// Running on the system stack also ensures that we won't +// switch Ps during this function. See tryAllocMSpan for details. +// +//go:systemstack +func (h *mheap) freeMSpanLocked(s *mspan) { + assertLockHeld(&h.lock) + + pp := getg().m.p.ptr() + // First try to free the mspan directly to the cache. + if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) { + pp.mspancache.buf[pp.mspancache.len] = s + pp.mspancache.len++ + return + } + // Failing that (or if we don't have a p), just free it to + // the heap. + h.spanalloc.free(unsafe.Pointer(s)) +} + +// allocSpan allocates an mspan which owns npages worth of memory. 
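+//
+// For small requests (fewer than pageCachePages/4 pages) allocSpan first tries
+// the per-P page cache and the per-P mspan cache, so the common case does not
+// take the heap lock at all.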
+// +// If typ.manual() == false, allocSpan allocates a heap span of class spanclass +// and updates heap accounting. If manual == true, allocSpan allocates a +// manually-managed span (spanclass is ignored), and the caller is +// responsible for any accounting related to its use of the span. Either +// way, allocSpan will atomically add the bytes in the newly allocated +// span to *sysStat. +// +// The returned span is fully initialized. +// +// h.lock must not be held. +// +// allocSpan must be called on the system stack both because it acquires +// the heap lock and because it must block GC transitions. +// +//go:systemstack +func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) { + // Function-global state. + gp := getg() + base, scav := uintptr(0), uintptr(0) + growth := uintptr(0) + + // On some platforms we need to provide physical page aligned stack + // allocations. Where the page size is less than the physical page + // size, we already manage to do this by default. + needPhysPageAlign := physPageAlignedStacks && typ == spanAllocStack && pageSize < physPageSize + + // If the allocation is small enough, try the page cache! + // The page cache does not support aligned allocations, so we cannot use + // it if we need to provide a physical page aligned stack allocation. + pp := gp.m.p.ptr() + if !needPhysPageAlign && pp != nil && npages < pageCachePages/4 { + c := &pp.pcache + + // If the cache is empty, refill it. + if c.empty() { + lock(&h.lock) + *c = h.pages.allocToCache() + unlock(&h.lock) + } + + // Try to allocate from the cache. + base, scav = c.alloc(npages) + if base != 0 { + s = h.tryAllocMSpan() + if s != nil { + goto HaveSpan + } + // We have a base but no mspan, so we need + // to lock the heap. + } + } + + // For one reason or another, we couldn't get the + // whole job done without the heap lock. + lock(&h.lock) + + if needPhysPageAlign { + // Overallocate by a physical page to allow for later alignment. + npages += physPageSize / pageSize + } + + if base == 0 { + // Try to acquire a base address. + base, scav = h.pages.alloc(npages) + if base == 0 { + var ok bool + growth, ok = h.grow(npages) + if !ok { + unlock(&h.lock) + return nil + } + base, scav = h.pages.alloc(npages) + if base == 0 { + throw("grew heap, but no adequate free space found") + } + } + } + if s == nil { + // We failed to get an mspan earlier, so grab + // one now that we have the heap lock. + s = h.allocMSpanLocked() + } + + if needPhysPageAlign { + allocBase, allocPages := base, npages + base = alignUp(allocBase, physPageSize) + npages -= physPageSize / pageSize + + // Return memory around the aligned allocation. + spaceBefore := base - allocBase + if spaceBefore > 0 { + h.pages.free(allocBase, spaceBefore/pageSize, false) + } + spaceAfter := (allocPages-npages)*pageSize - spaceBefore + if spaceAfter > 0 { + h.pages.free(base+npages*pageSize, spaceAfter/pageSize, false) + } + } + + unlock(&h.lock) + + if growth > 0 { + // We just caused a heap growth, so scavenge down what will soon be used. + // By scavenging inline we deal with the failure to allocate out of + // memory fragments by scavenging the memory fragments that are least + // likely to be re-used. + scavengeGoal := atomic.Load64(&h.scavengeGoal) + if retained := heapRetained(); retained+uint64(growth) > scavengeGoal { + // The scavenging algorithm requires the heap lock to be dropped so it + // can acquire it only sparingly. 
This is a potentially expensive operation + // so it frees up other goroutines to allocate in the meanwhile. In fact, + // they can make use of the growth we just created. + todo := growth + if overage := uintptr(retained + uint64(growth) - scavengeGoal); todo > overage { + todo = overage + } + h.pages.scavenge(todo) + } + } + +HaveSpan: + // At this point, both s != nil and base != 0, and the heap + // lock is no longer held. Initialize the span. + s.init(base, npages) + if h.allocNeedsZero(base, npages) { + s.needzero = 1 + } + nbytes := npages * pageSize + if typ.manual() { + s.manualFreeList = 0 + s.nelems = 0 + s.limit = s.base() + s.npages*pageSize + s.state.set(mSpanManual) + } else { + // We must set span properties before the span is published anywhere + // since we're not holding the heap lock. + s.spanclass = spanclass + if sizeclass := spanclass.sizeclass(); sizeclass == 0 { + s.elemsize = nbytes + s.nelems = 1 + s.divMul = 0 + } else { + s.elemsize = uintptr(class_to_size[sizeclass]) + s.nelems = nbytes / s.elemsize + s.divMul = class_to_divmagic[sizeclass] + } + + // Initialize mark and allocation structures. + s.freeindex = 0 + s.allocCache = ^uint64(0) // all 1s indicating all free. + s.gcmarkBits = newMarkBits(s.nelems) + s.allocBits = newAllocBits(s.nelems) + + // It's safe to access h.sweepgen without the heap lock because it's + // only ever updated with the world stopped and we run on the + // systemstack which blocks a STW transition. + atomic.Store(&s.sweepgen, h.sweepgen) + + // Now that the span is filled in, set its state. This + // is a publication barrier for the other fields in + // the span. While valid pointers into this span + // should never be visible until the span is returned, + // if the garbage collector finds an invalid pointer, + // access to the span may race with initialization of + // the span. We resolve this race by atomically + // setting the state after the span is fully + // initialized, and atomically checking the state in + // any situation where a pointer is suspect. + s.state.set(mSpanInUse) + } + + // Commit and account for any scavenged memory that the span now owns. + if scav != 0 { + // sysUsed all the pages that are actually available + // in the span since some of them might be scavenged. + sysUsed(unsafe.Pointer(base), nbytes) + atomic.Xadd64(&memstats.heap_released, -int64(scav)) + } + // Update stats. + if typ == spanAllocHeap { + atomic.Xadd64(&memstats.heap_inuse, int64(nbytes)) + } + if typ.manual() { + // Manually managed memory doesn't count toward heap_sys. + memstats.heap_sys.add(-int64(nbytes)) + } + // Update consistent stats. + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, int64(scav)) + atomic.Xaddint64(&stats.released, -int64(scav)) + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes)) + } + memstats.heapStats.release() + + // Publish the span in various locations. + + // This is safe to call without the lock held because the slots + // related to this span will only ever be read or modified by + // this thread until pointers into the span are published (and + // we execute a publication barrier at the end of this function + // before that happens) or pageInUse is updated. 
+ h.setSpans(s.base(), npages, s) + + if !typ.manual() { + // Mark in-use span in arena page bitmap. + // + // This publishes the span to the page sweeper, so + // it's imperative that the span be completely initialized + // prior to this line. + arena, pageIdx, pageMask := pageIndexOf(s.base()) + atomic.Or8(&arena.pageInUse[pageIdx], pageMask) + + // Update related page sweeper stats. + h.pagesInUse.Add(int64(npages)) + } + + // Make sure the newly allocated span will be observed + // by the GC before pointers into the span are published. + publicationBarrier() + + return s +} + +// Try to add at least npage pages of memory to the heap, +// returning how much the heap grew by and whether it worked. +// +// h.lock must be held. +func (h *mheap) grow(npage uintptr) (uintptr, bool) { + assertLockHeld(&h.lock) + + // We must grow the heap in whole palloc chunks. + // We call sysMap below but note that because we + // round up to pallocChunkPages which is on the order + // of MiB (generally >= to the huge page size) we + // won't be calling it too much. + ask := alignUp(npage, pallocChunkPages) * pageSize + + totalGrowth := uintptr(0) + // This may overflow because ask could be very large + // and is otherwise unrelated to h.curArena.base. + end := h.curArena.base + ask + nBase := alignUp(end, physPageSize) + if nBase > h.curArena.end || /* overflow */ end < h.curArena.base { + // Not enough room in the current arena. Allocate more + // arena space. This may not be contiguous with the + // current arena, so we have to request the full ask. + av, asize := h.sysAlloc(ask) + if av == nil { + print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n") + return 0, false + } + + if uintptr(av) == h.curArena.end { + // The new space is contiguous with the old + // space, so just extend the current space. + h.curArena.end = uintptr(av) + asize + } else { + // The new space is discontiguous. Track what + // remains of the current space and switch to + // the new space. This should be rare. + if size := h.curArena.end - h.curArena.base; size != 0 { + // Transition this space from Reserved to Prepared and mark it + // as released since we'll be able to start using it after updating + // the page allocator and releasing the lock at any time. + sysMap(unsafe.Pointer(h.curArena.base), size, &memstats.heap_sys) + // Update stats. + atomic.Xadd64(&memstats.heap_released, int64(size)) + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.released, int64(size)) + memstats.heapStats.release() + // Update the page allocator's structures to make this + // space ready for allocation. + h.pages.grow(h.curArena.base, size) + totalGrowth += size + } + // Switch to the new space. + h.curArena.base = uintptr(av) + h.curArena.end = uintptr(av) + asize + } + + // Recalculate nBase. + // We know this won't overflow, because sysAlloc returned + // a valid region starting at h.curArena.base which is at + // least ask bytes in size. + nBase = alignUp(h.curArena.base+ask, physPageSize) + } + + // Grow into the current arena. + v := h.curArena.base + h.curArena.base = nBase + + // Transition the space we're going to use from Reserved to Prepared. + sysMap(unsafe.Pointer(v), nBase-v, &memstats.heap_sys) + + // The memory just allocated counts as both released + // and idle, even though it's not yet backed by spans. + // + // The allocation is always aligned to the heap arena + // size which is always > physPageSize, so its safe to + // just add directly to heap_released. 
+ atomic.Xadd64(&memstats.heap_released, int64(nBase-v)) + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.released, int64(nBase-v)) + memstats.heapStats.release() + + // Update the page allocator's structures to make this + // space ready for allocation. + h.pages.grow(v, nBase-v) + totalGrowth += nBase - v + return totalGrowth, true +} + +// Free the span back into the heap. +func (h *mheap) freeSpan(s *mspan) { + systemstack(func() { + lock(&h.lock) + if msanenabled { + // Tell msan that this entire span is no longer in use. + base := unsafe.Pointer(s.base()) + bytes := s.npages << _PageShift + msanfree(base, bytes) + } + if asanenabled { + // Tell asan that this entire span is no longer in use. + base := unsafe.Pointer(s.base()) + bytes := s.npages << _PageShift + asanpoison(base, bytes) + } + h.freeSpanLocked(s, spanAllocHeap) + unlock(&h.lock) + }) +} + +// freeManual frees a manually-managed span returned by allocManual. +// typ must be the same as the spanAllocType passed to the allocManual that +// allocated s. +// +// This must only be called when gcphase == _GCoff. See mSpanState for +// an explanation. +// +// freeManual must be called on the system stack because it acquires +// the heap lock. See mheap for details. +// +//go:systemstack +func (h *mheap) freeManual(s *mspan, typ spanAllocType) { + s.needzero = 1 + lock(&h.lock) + h.freeSpanLocked(s, typ) + unlock(&h.lock) +} + +func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { + assertLockHeld(&h.lock) + + switch s.state.get() { + case mSpanManual: + if s.allocCount != 0 { + throw("mheap.freeSpanLocked - invalid stack free") + } + case mSpanInUse: + if s.allocCount != 0 || s.sweepgen != h.sweepgen { + print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") + throw("mheap.freeSpanLocked - invalid free") + } + h.pagesInUse.Add(-int64(s.npages)) + + // Clear in-use bit in arena page bitmap. + arena, pageIdx, pageMask := pageIndexOf(s.base()) + atomic.And8(&arena.pageInUse[pageIdx], ^pageMask) + default: + throw("mheap.freeSpanLocked - invalid span state") + } + + // Update stats. + // + // Mirrors the code in allocSpan. + nbytes := s.npages * pageSize + if typ == spanAllocHeap { + atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes)) + } + if typ.manual() { + // Manually managed memory doesn't count toward heap_sys, so add it back. + memstats.heap_sys.add(int64(nbytes)) + } + // Update consistent stats. + stats := memstats.heapStats.acquire() + switch typ { + case spanAllocHeap: + atomic.Xaddint64(&stats.inHeap, -int64(nbytes)) + case spanAllocStack: + atomic.Xaddint64(&stats.inStacks, -int64(nbytes)) + case spanAllocPtrScalarBits: + atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes)) + case spanAllocWorkBuf: + atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes)) + } + memstats.heapStats.release() + + // Mark the space as free. + h.pages.free(s.base(), s.npages, false) + + // Free the span structure. We no longer have a use for it. + s.state.set(mSpanDead) + h.freeMSpanLocked(s) +} + +// scavengeAll acquires the heap lock (blocking any additional +// manipulation of the page allocator) and iterates over the whole +// heap, scavenging every free page available. +func (h *mheap) scavengeAll() { + // Disallow malloc or panic while holding the heap lock. We do + // this here because this is a non-mallocgc entry-point to + // the mheap API. 
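+ // scavengeAll backs runtime/debug.FreeOSMemory via the linkname below.
+ // A user-level sketch of reaching this path:
+ //
+ //	import "runtime/debug"
+ //
+ //	debug.FreeOSMemory() // force a GC, then return as much memory to the OS as possible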
+ gp := getg() + gp.m.mallocing++ + + lock(&h.lock) + // Start a new scavenge generation so we have a chance to walk + // over the whole heap. + h.pages.scavengeStartGen() + unlock(&h.lock) + + released := h.pages.scavenge(^uintptr(0)) + + lock(&h.pages.scav.lock) + gen := h.pages.scav.gen + unlock(&h.pages.scav.lock) + + gp.m.mallocing-- + + if debug.scavtrace > 0 { + printScavTrace(gen, released, true) + } +} + +//go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory +func runtime_debug_freeOSMemory() { + GC() + systemstack(func() { mheap_.scavengeAll() }) +} + +// Initialize a new span with the given start and npages. +func (span *mspan) init(base uintptr, npages uintptr) { + // span is *not* zeroed. + span.next = nil + span.prev = nil + span.list = nil + span.startAddr = base + span.npages = npages + span.allocCount = 0 + span.spanclass = 0 + span.elemsize = 0 + span.speciallock.key = 0 + span.specials = nil + span.needzero = 0 + span.freeindex = 0 + span.allocBits = nil + span.gcmarkBits = nil + span.state.set(mSpanDead) + lockInit(&span.speciallock, lockRankMspanSpecial) +} + +func (span *mspan) inList() bool { + return span.list != nil +} + +// Initialize an empty doubly-linked list. +func (list *mSpanList) init() { + list.first = nil + list.last = nil +} + +func (list *mSpanList) remove(span *mspan) { + if span.list != list { + print("runtime: failed mSpanList.remove span.npages=", span.npages, + " span=", span, " prev=", span.prev, " span.list=", span.list, " list=", list, "\n") + throw("mSpanList.remove") + } + if list.first == span { + list.first = span.next + } else { + span.prev.next = span.next + } + if list.last == span { + list.last = span.prev + } else { + span.next.prev = span.prev + } + span.next = nil + span.prev = nil + span.list = nil +} + +func (list *mSpanList) isEmpty() bool { + return list.first == nil +} + +func (list *mSpanList) insert(span *mspan) { + if span.next != nil || span.prev != nil || span.list != nil { + println("runtime: failed mSpanList.insert", span, span.next, span.prev, span.list) + throw("mSpanList.insert") + } + span.next = list.first + if list.first != nil { + // The list contains at least one span; link it in. + // The last span in the list doesn't change. + list.first.prev = span + } else { + // The list contains no spans, so this is also the last span. + list.last = span + } + list.first = span + span.list = list +} + +func (list *mSpanList) insertBack(span *mspan) { + if span.next != nil || span.prev != nil || span.list != nil { + println("runtime: failed mSpanList.insertBack", span, span.next, span.prev, span.list) + throw("mSpanList.insertBack") + } + span.prev = list.last + if list.last != nil { + // The list contains at least one span. + list.last.next = span + } else { + // The list contains no spans, so this is also the first span. + list.first = span + } + list.last = span + span.list = list +} + +// takeAll removes all spans from other and inserts them at the front +// of list. +func (list *mSpanList) takeAll(other *mSpanList) { + if other.isEmpty() { + return + } + + // Reparent everything in other to list. + for s := other.first; s != nil; s = s.next { + s.list = list + } + + // Concatenate the lists. + if list.isEmpty() { + *list = *other + } else { + // Neither list is empty. Put other before list. 
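+ // Illustration of the splice below:
+ //   before: other = o1 ... oN, list = l1 ... lM
+ //   after:  list  = o1 ... oN l1 ... lM  (other is emptied afterwards)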
+ other.last.next = list.first + list.first.prev = other.last + list.first = other.first + } + + other.first, other.last = nil, nil +} + +const ( + _KindSpecialFinalizer = 1 + _KindSpecialProfile = 2 + // _KindSpecialReachable is a special used for tracking + // reachability during testing. + _KindSpecialReachable = 3 + // Note: The finalizer special must be first because if we're freeing + // an object, a finalizer special will cause the freeing operation + // to abort, and we want to keep the other special records around + // if that happens. +) + +//go:notinheap +type special struct { + next *special // linked list in span + offset uint16 // span offset of object + kind byte // kind of special +} + +// spanHasSpecials marks a span as having specials in the arena bitmap. +func spanHasSpecials(s *mspan) { + arenaPage := (s.base() / pageSize) % pagesPerArena + ai := arenaIndex(s.base()) + ha := mheap_.arenas[ai.l1()][ai.l2()] + atomic.Or8(&ha.pageSpecials[arenaPage/8], uint8(1)<<(arenaPage%8)) +} + +// spanHasNoSpecials marks a span as having no specials in the arena bitmap. +func spanHasNoSpecials(s *mspan) { + arenaPage := (s.base() / pageSize) % pagesPerArena + ai := arenaIndex(s.base()) + ha := mheap_.arenas[ai.l1()][ai.l2()] + atomic.And8(&ha.pageSpecials[arenaPage/8], ^(uint8(1) << (arenaPage % 8))) +} + +// Adds the special record s to the list of special records for +// the object p. All fields of s should be filled in except for +// offset & next, which this routine will fill in. +// Returns true if the special was successfully added, false otherwise. +// (The add will fail only if a record with the same p and s->kind +// already exists.) +func addspecial(p unsafe.Pointer, s *special) bool { + span := spanOfHeap(uintptr(p)) + if span == nil { + throw("addspecial on invalid pointer") + } + + // Ensure that the span is swept. + // Sweeping accesses the specials list w/o locks, so we have + // to synchronize with it. And it's just much safer. + mp := acquirem() + span.ensureSwept() + + offset := uintptr(p) - span.base() + kind := s.kind + + lock(&span.speciallock) + + // Find splice point, check for existing record. + t := &span.specials + for { + x := *t + if x == nil { + break + } + if offset == uintptr(x.offset) && kind == x.kind { + unlock(&span.speciallock) + releasem(mp) + return false // already exists + } + if offset < uintptr(x.offset) || (offset == uintptr(x.offset) && kind < x.kind) { + break + } + t = &x.next + } + + // Splice in record, fill in offset. + s.offset = uint16(offset) + s.next = *t + *t = s + spanHasSpecials(span) + unlock(&span.speciallock) + releasem(mp) + + return true +} + +// Removes the Special record of the given kind for the object p. +// Returns the record if the record existed, nil otherwise. +// The caller must FixAlloc_Free the result. +func removespecial(p unsafe.Pointer, kind uint8) *special { + span := spanOfHeap(uintptr(p)) + if span == nil { + throw("removespecial on invalid pointer") + } + + // Ensure that the span is swept. + // Sweeping accesses the specials list w/o locks, so we have + // to synchronize with it. And it's just much safer. + mp := acquirem() + span.ensureSwept() + + offset := uintptr(p) - span.base() + + var result *special + lock(&span.speciallock) + t := &span.specials + for { + s := *t + if s == nil { + break + } + // This function is used for finalizers only, so we don't check for + // "interior" specials (p must be exactly equal to s->offset). 
+ if offset == uintptr(s.offset) && kind == s.kind { + *t = s.next + result = s + break + } + t = &s.next + } + if span.specials == nil { + spanHasNoSpecials(span) + } + unlock(&span.speciallock) + releasem(mp) + return result +} + +// The described object has a finalizer set for it. +// +// specialfinalizer is allocated from non-GC'd memory, so any heap +// pointers must be specially handled. +// +//go:notinheap +type specialfinalizer struct { + special special + fn *funcval // May be a heap pointer. + nret uintptr + fint *_type // May be a heap pointer, but always live. + ot *ptrtype // May be a heap pointer, but always live. +} + +// Adds a finalizer to the object p. Returns true if it succeeded. +func addfinalizer(p unsafe.Pointer, f *funcval, nret uintptr, fint *_type, ot *ptrtype) bool { + lock(&mheap_.speciallock) + s := (*specialfinalizer)(mheap_.specialfinalizeralloc.alloc()) + unlock(&mheap_.speciallock) + s.special.kind = _KindSpecialFinalizer + s.fn = f + s.nret = nret + s.fint = fint + s.ot = ot + if addspecial(p, &s.special) { + // This is responsible for maintaining the same + // GC-related invariants as markrootSpans in any + // situation where it's possible that markrootSpans + // has already run but mark termination hasn't yet. + if gcphase != _GCoff { + base, _, _ := findObject(uintptr(p), 0, 0) + mp := acquirem() + gcw := &mp.p.ptr().gcw + // Mark everything reachable from the object + // so it's retained for the finalizer. + scanobject(base, gcw) + // Mark the finalizer itself, since the + // special isn't part of the GC'd heap. + scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + releasem(mp) + } + return true + } + + // There was an old finalizer + lock(&mheap_.speciallock) + mheap_.specialfinalizeralloc.free(unsafe.Pointer(s)) + unlock(&mheap_.speciallock) + return false +} + +// Removes the finalizer (if any) from the object p. +func removefinalizer(p unsafe.Pointer) { + s := (*specialfinalizer)(unsafe.Pointer(removespecial(p, _KindSpecialFinalizer))) + if s == nil { + return // there wasn't a finalizer to remove + } + lock(&mheap_.speciallock) + mheap_.specialfinalizeralloc.free(unsafe.Pointer(s)) + unlock(&mheap_.speciallock) +} + +// The described object is being heap profiled. +// +//go:notinheap +type specialprofile struct { + special special + b *bucket +} + +// Set the heap profile bucket associated with addr to b. +func setprofilebucket(p unsafe.Pointer, b *bucket) { + lock(&mheap_.speciallock) + s := (*specialprofile)(mheap_.specialprofilealloc.alloc()) + unlock(&mheap_.speciallock) + s.special.kind = _KindSpecialProfile + s.b = b + if !addspecial(p, &s.special) { + throw("setprofilebucket: profile already set") + } +} + +// specialReachable tracks whether an object is reachable on the next +// GC cycle. This is used by testing. +type specialReachable struct { + special special + done bool + reachable bool +} + +// specialsIter helps iterate over specials lists. +type specialsIter struct { + pprev **special + s *special +} + +func newSpecialsIter(span *mspan) specialsIter { + return specialsIter{&span.specials, span.specials} +} + +func (i *specialsIter) valid() bool { + return i.s != nil +} + +func (i *specialsIter) next() { + i.pprev = &i.s.next + i.s = *i.pprev +} + +// unlinkAndNext removes the current special from the list and moves +// the iterator to the next special. It returns the unlinked special. 
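+//
+// A rough sketch of how the iterator is typically used during sweeping
+// (shouldFree, p, and size are placeholders here, not real runtime names):
+//
+//	siter := newSpecialsIter(span)
+//	for siter.valid() {
+//		if shouldFree(siter.s) {
+//			special := siter.unlinkAndNext()
+//			freeSpecial(special, p, size)
+//		} else {
+//			siter.next()
+//		}
+//	}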
+func (i *specialsIter) unlinkAndNext() *special { + cur := i.s + i.s = cur.next + *i.pprev = i.s + return cur +} + +// freeSpecial performs any cleanup on special s and deallocates it. +// s must already be unlinked from the specials list. +func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { + switch s.kind { + case _KindSpecialFinalizer: + sf := (*specialfinalizer)(unsafe.Pointer(s)) + queuefinalizer(p, sf.fn, sf.nret, sf.fint, sf.ot) + lock(&mheap_.speciallock) + mheap_.specialfinalizeralloc.free(unsafe.Pointer(sf)) + unlock(&mheap_.speciallock) + case _KindSpecialProfile: + sp := (*specialprofile)(unsafe.Pointer(s)) + mProf_Free(sp.b, size) + lock(&mheap_.speciallock) + mheap_.specialprofilealloc.free(unsafe.Pointer(sp)) + unlock(&mheap_.speciallock) + case _KindSpecialReachable: + sp := (*specialReachable)(unsafe.Pointer(s)) + sp.done = true + // The creator frees these. + default: + throw("bad special kind") + panic("not reached") + } +} + +// gcBits is an alloc/mark bitmap. This is always used as *gcBits. +// +//go:notinheap +type gcBits uint8 + +// bytep returns a pointer to the n'th byte of b. +func (b *gcBits) bytep(n uintptr) *uint8 { + return addb((*uint8)(b), n) +} + +// bitp returns a pointer to the byte containing bit n and a mask for +// selecting that bit from *bytep. +func (b *gcBits) bitp(n uintptr) (bytep *uint8, mask uint8) { + return b.bytep(n / 8), 1 << (n % 8) +} + +const gcBitsChunkBytes = uintptr(64 << 10) +const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{}) + +type gcBitsHeader struct { + free uintptr // free is the index into bits of the next free byte. + next uintptr // *gcBits triggers recursive type bug. (issue 14620) +} + +//go:notinheap +type gcBitsArena struct { + // gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand. + free uintptr // free is the index into bits of the next free byte; read/write atomically + next *gcBitsArena + bits [gcBitsChunkBytes - gcBitsHeaderBytes]gcBits +} + +var gcBitsArenas struct { + lock mutex + free *gcBitsArena + next *gcBitsArena // Read atomically. Write atomically under lock. + current *gcBitsArena + previous *gcBitsArena +} + +// tryAlloc allocates from b or returns nil if b does not have enough room. +// This is safe to call concurrently. +func (b *gcBitsArena) tryAlloc(bytes uintptr) *gcBits { + if b == nil || atomic.Loaduintptr(&b.free)+bytes > uintptr(len(b.bits)) { + return nil + } + // Try to allocate from this block. + end := atomic.Xadduintptr(&b.free, bytes) + if end > uintptr(len(b.bits)) { + return nil + } + // There was enough room. + start := end - bytes + return &b.bits[start] +} + +// newMarkBits returns a pointer to 8 byte aligned bytes +// to be used for a span's mark bits. +func newMarkBits(nelems uintptr) *gcBits { + blocksNeeded := uintptr((nelems + 63) / 64) + bytesNeeded := blocksNeeded * 8 + + // Try directly allocating from the current head arena. + head := (*gcBitsArena)(atomic.Loadp(unsafe.Pointer(&gcBitsArenas.next))) + if p := head.tryAlloc(bytesNeeded); p != nil { + return p + } + + // There's not enough room in the head arena. We may need to + // allocate a new arena. + lock(&gcBitsArenas.lock) + // Try the head arena again, since it may have changed. Now + // that we hold the lock, the list head can't change, but its + // free position still can. + if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil { + unlock(&gcBitsArenas.lock) + return p + } + + // Allocate a new arena. This may temporarily drop the lock. 
+ fresh := newArenaMayUnlock() + // If newArenaMayUnlock dropped the lock, another thread may + // have put a fresh arena on the "next" list. Try allocating + // from next again. + if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil { + // Put fresh back on the free list. + // TODO: Mark it "already zeroed" + fresh.next = gcBitsArenas.free + gcBitsArenas.free = fresh + unlock(&gcBitsArenas.lock) + return p + } + + // Allocate from the fresh arena. We haven't linked it in yet, so + // this cannot race and is guaranteed to succeed. + p := fresh.tryAlloc(bytesNeeded) + if p == nil { + throw("markBits overflow") + } + + // Add the fresh arena to the "next" list. + fresh.next = gcBitsArenas.next + atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), unsafe.Pointer(fresh)) + + unlock(&gcBitsArenas.lock) + return p +} + +// newAllocBits returns a pointer to 8 byte aligned bytes +// to be used for this span's alloc bits. +// newAllocBits is used to provide newly initialized spans +// allocation bits. For spans not being initialized the +// mark bits are repurposed as allocation bits when +// the span is swept. +func newAllocBits(nelems uintptr) *gcBits { + return newMarkBits(nelems) +} + +// nextMarkBitArenaEpoch establishes a new epoch for the arenas +// holding the mark bits. The arenas are named relative to the +// current GC cycle which is demarcated by the call to finishweep_m. +// +// All current spans have been swept. +// During that sweep each span allocated room for its gcmarkBits in +// gcBitsArenas.next block. gcBitsArenas.next becomes the gcBitsArenas.current +// where the GC will mark objects and after each span is swept these bits +// will be used to allocate objects. +// gcBitsArenas.current becomes gcBitsArenas.previous where the span's +// gcAllocBits live until all the spans have been swept during this GC cycle. +// The span's sweep extinguishes all the references to gcBitsArenas.previous +// by pointing gcAllocBits into the gcBitsArenas.current. +// The gcBitsArenas.previous is released to the gcBitsArenas.free list. +func nextMarkBitArenaEpoch() { + lock(&gcBitsArenas.lock) + if gcBitsArenas.previous != nil { + if gcBitsArenas.free == nil { + gcBitsArenas.free = gcBitsArenas.previous + } else { + // Find end of previous arenas. + last := gcBitsArenas.previous + for last = gcBitsArenas.previous; last.next != nil; last = last.next { + } + last.next = gcBitsArenas.free + gcBitsArenas.free = gcBitsArenas.previous + } + } + gcBitsArenas.previous = gcBitsArenas.current + gcBitsArenas.current = gcBitsArenas.next + atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), nil) // newMarkBits calls newArena when needed + unlock(&gcBitsArenas.lock) +} + +// newArenaMayUnlock allocates and zeroes a gcBits arena. +// The caller must hold gcBitsArena.lock. This may temporarily release it. +func newArenaMayUnlock() *gcBitsArena { + var result *gcBitsArena + if gcBitsArenas.free == nil { + unlock(&gcBitsArenas.lock) + result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys)) + if result == nil { + throw("runtime: cannot allocate memory") + } + lock(&gcBitsArenas.lock) + } else { + result = gcBitsArenas.free + gcBitsArenas.free = gcBitsArenas.free.next + memclrNoHeapPointers(unsafe.Pointer(result), gcBitsChunkBytes) + } + result.next = nil + // If result.bits is not 8 byte aligned adjust index so + // that &result.bits[result.free] is 8 byte aligned. 
+ if uintptr(unsafe.Offsetof(gcBitsArena{}.bits))&7 == 0 { + result.free = 0 + } else { + result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7) + } + return result +} diff --git a/contrib/go/_std_1.18/src/runtime/mpagealloc.go b/contrib/go/_std_1.18/src/runtime/mpagealloc.go new file mode 100644 index 0000000000..2725e3b7c7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mpagealloc.go @@ -0,0 +1,1026 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Page allocator. +// +// The page allocator manages mapped pages (defined by pageSize, NOT +// physPageSize) for allocation and re-use. It is embedded into mheap. +// +// Pages are managed using a bitmap that is sharded into chunks. +// In the bitmap, 1 means in-use, and 0 means free. The bitmap spans the +// process's address space. Chunks are managed in a sparse-array-style structure +// similar to mheap.arenas, since the bitmap may be large on some systems. +// +// The bitmap is efficiently searched by using a radix tree in combination +// with fast bit-wise intrinsics. Allocation is performed using an address-ordered +// first-fit approach. +// +// Each entry in the radix tree is a summary that describes three properties of +// a particular region of the address space: the number of contiguous free pages +// at the start and end of the region it represents, and the maximum number of +// contiguous free pages found anywhere in that region. +// +// Each level of the radix tree is stored as one contiguous array, which represents +// a different granularity of subdivision of the processes' address space. Thus, this +// radix tree is actually implicit in these large arrays, as opposed to having explicit +// dynamically-allocated pointer-based node structures. Naturally, these arrays may be +// quite large for system with large address spaces, so in these cases they are mapped +// into memory as needed. The leaf summaries of the tree correspond to a bitmap chunk. +// +// The root level (referred to as L0 and index 0 in pageAlloc.summary) has each +// summary represent the largest section of address space (16 GiB on 64-bit systems), +// with each subsequent level representing successively smaller subsections until we +// reach the finest granularity at the leaves, a chunk. +// +// More specifically, each summary in each level (except for leaf summaries) +// represents some number of entries in the following level. For example, each +// summary in the root level may represent a 16 GiB region of address space, +// and in the next level there could be 8 corresponding entries which represent 2 +// GiB subsections of that 16 GiB region, each of which could correspond to 8 +// entries in the next level which each represent 256 MiB regions, and so on. +// +// Thus, this design only scales to heaps so large, but can always be extended to +// larger heaps by simply adding levels to the radix tree, which mostly costs +// additional virtual address space. The choice of managing large arrays also means +// that a large amount of virtual address space may be reserved by the runtime. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +const ( + // The size of a bitmap chunk, i.e. the amount of bits (that is, pages) to consider + // in the bitmap at once. 
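+ // With the runtime's 8 KiB pages (pageShift = 13) this works out to
+ // 512 pages, i.e. 4 MiB of address space per chunk:
+ //
+ //	pallocChunkPages    = 1 << 9       = 512 pages
+ //	pallocChunkBytes    = 512 * 8 KiB  = 4 MiB
+ //	logPallocChunkBytes = 9 + 13       = 22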
+ pallocChunkPages = 1 << logPallocChunkPages
+ pallocChunkBytes = pallocChunkPages * pageSize
+ logPallocChunkPages = 9
+ logPallocChunkBytes = logPallocChunkPages + pageShift
+
+ // The number of radix bits for each level.
+ //
+ // The value of 3 is chosen such that the block of summaries we need to scan at
+ // each level fits in 64 bytes (2^3 summaries * 8 bytes per summary), which is
+ // close to the L1 cache line width on many systems. Also, a value of 3 fits 4 tree
+ // levels perfectly into the 21-bit pallocBits summary field at the root level.
+ //
+ // The following equation explains how each of the constants relate:
+ // summaryL0Bits + (summaryLevels-1)*summaryLevelBits + logPallocChunkBytes = heapAddrBits
+ //
+ // summaryLevels is an architecture-dependent value defined in mpagealloc_*.go.
+ summaryLevelBits = 3
+ summaryL0Bits = heapAddrBits - logPallocChunkBytes - (summaryLevels-1)*summaryLevelBits
+
+ // pallocChunksL2Bits is the number of bits of the chunk index number
+ // covered by the second level of the chunks map.
+ //
+ // See (*pageAlloc).chunks for more details. Update the documentation
+ // there should this change.
+ pallocChunksL2Bits = heapAddrBits - logPallocChunkBytes - pallocChunksL1Bits
+ pallocChunksL1Shift = pallocChunksL2Bits
+)
+
+// Maximum searchAddr value, which indicates that the heap has no free space.
+//
+// We alias maxOffAddr just to make it clear that this is the maximum address
+// for the page allocator's search space. See maxOffAddr for details.
+var maxSearchAddr = maxOffAddr
+
+// Global chunk index.
+//
+// Represents an index into the leaf level of the radix tree.
+// Similar to arenaIndex, except instead of arenas, it divides the address
+// space into chunks.
+type chunkIdx uint
+
+// chunkIndex returns the global index of the palloc chunk containing the
+// pointer p.
+func chunkIndex(p uintptr) chunkIdx {
+ return chunkIdx((p - arenaBaseOffset) / pallocChunkBytes)
+}
+
+// chunkBase returns the base address of the palloc chunk at index ci.
+func chunkBase(ci chunkIdx) uintptr {
+ return uintptr(ci)*pallocChunkBytes + arenaBaseOffset
+}
+
+// chunkPageIndex computes the index of the page that contains p,
+// relative to the chunk which contains p.
+func chunkPageIndex(p uintptr) uint {
+ return uint(p % pallocChunkBytes / pageSize)
+}
+
+// l1 returns the index into the first level of (*pageAlloc).chunks.
+func (i chunkIdx) l1() uint {
+ if pallocChunksL1Bits == 0 {
+ // Let the compiler optimize this away if there's no
+ // L1 map.
+ return 0
+ } else {
+ return uint(i) >> pallocChunksL1Shift
+ }
+}
+
+// l2 returns the index into the second level of (*pageAlloc).chunks.
+func (i chunkIdx) l2() uint {
+ if pallocChunksL1Bits == 0 {
+ return uint(i)
+ } else {
+ return uint(i) & (1<<pallocChunksL2Bits - 1)
+ }
+}
+
+// offAddrToLevelIndex converts an address in the offset address space
+// to the index into summary[level] containing addr.
+func offAddrToLevelIndex(level int, addr offAddr) int {
+ return int((addr.a - arenaBaseOffset) >> levelShift[level])
+}
+
+// levelIndexToOffAddr converts an index into summary[level] into
+// the corresponding address in the offset address space.
+func levelIndexToOffAddr(level, idx int) offAddr {
+ return offAddr{(uintptr(idx) << levelShift[level]) + arenaBaseOffset}
+}
+
+// addrsToSummaryRange converts base and limit pointers into a range
+// of entries for the given summary level.
+//
+// The returned range is inclusive on the lower bound and exclusive on
+// the upper bound.
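+//
+// For example, if base and limit-1 both fall within the same summary entry i
+// at the given level, the result is (i, i+1).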
+func addrsToSummaryRange(level int, base, limit uintptr) (lo int, hi int) { + // This is slightly more nuanced than just a shift for the exclusive + // upper-bound. Note that the exclusive upper bound may be within a + // summary at this level, meaning if we just do the obvious computation + // hi will end up being an inclusive upper bound. Unfortunately, just + // adding 1 to that is too broad since we might be on the very edge + // of a summary's max page count boundary for this level + // (1 << levelLogPages[level]). So, make limit an inclusive upper bound + // then shift, then add 1, so we get an exclusive upper bound at the end. + lo = int((base - arenaBaseOffset) >> levelShift[level]) + hi = int(((limit-1)-arenaBaseOffset)>>levelShift[level]) + 1 + return +} + +// blockAlignSummaryRange aligns indices into the given level to that +// level's block width (1 << levelBits[level]). It assumes lo is inclusive +// and hi is exclusive, and so aligns them down and up respectively. +func blockAlignSummaryRange(level int, lo, hi int) (int, int) { + e := uintptr(1) << levelBits[level] + return int(alignDown(uintptr(lo), e)), int(alignUp(uintptr(hi), e)) +} + +type pageAlloc struct { + // Radix tree of summaries. + // + // Each slice's cap represents the whole memory reservation. + // Each slice's len reflects the allocator's maximum known + // mapped heap address for that level. + // + // The backing store of each summary level is reserved in init + // and may or may not be committed in grow (small address spaces + // may commit all the memory in init). + // + // The purpose of keeping len <= cap is to enforce bounds checks + // on the top end of the slice so that instead of an unknown + // runtime segmentation fault, we get a much friendlier out-of-bounds + // error. + // + // To iterate over a summary level, use inUse to determine which ranges + // are currently available. Otherwise one might try to access + // memory which is only Reserved which may result in a hard fault. + // + // We may still get segmentation faults < len since some of that + // memory may not be committed yet. + summary [summaryLevels][]pallocSum + + // chunks is a slice of bitmap chunks. + // + // The total size of chunks is quite large on most 64-bit platforms + // (O(GiB) or more) if flattened, so rather than making one large mapping + // (which has problems on some platforms, even when PROT_NONE) we use a + // two-level sparse array approach similar to the arena index in mheap. + // + // To find the chunk containing a memory address `a`, do: + // chunkOf(chunkIndex(a)) + // + // Below is a table describing the configuration for chunks for various + // heapAddrBits supported by the runtime. + // + // heapAddrBits | L1 Bits | L2 Bits | L2 Entry Size + // ------------------------------------------------ + // 32 | 0 | 10 | 128 KiB + // 33 (iOS) | 0 | 11 | 256 KiB + // 48 | 13 | 13 | 1 MiB + // + // There's no reason to use the L1 part of chunks on 32-bit, the + // address space is small so the L2 is small. For platforms with a + // 48-bit address space, we pick the L1 such that the L2 is 1 MiB + // in size, which is a good balance between low granularity without + // making the impact on BSS too high (note the L1 is stored directly + // in pageAlloc). + // + // To iterate over the bitmap, use inUse to determine which ranges + // are currently available. Otherwise one might iterate over unused + // ranges. + // + // Protected by mheapLock. 
+ // + // TODO(mknyszek): Consider changing the definition of the bitmap + // such that 1 means free and 0 means in-use so that summaries and + // the bitmaps align better on zero-values. + chunks [1 << pallocChunksL1Bits]*[1 << pallocChunksL2Bits]pallocData + + // The address to start an allocation search with. It must never + // point to any memory that is not contained in inUse, i.e. + // inUse.contains(searchAddr.addr()) must always be true. The one + // exception to this rule is that it may take on the value of + // maxOffAddr to indicate that the heap is exhausted. + // + // We guarantee that all valid heap addresses below this value + // are allocated and not worth searching. + searchAddr offAddr + + // start and end represent the chunk indices + // which pageAlloc knows about. It assumes + // chunks in the range [start, end) are + // currently ready to use. + start, end chunkIdx + + // inUse is a slice of ranges of address space which are + // known by the page allocator to be currently in-use (passed + // to grow). + // + // This field is currently unused on 32-bit architectures but + // is harmless to track. We care much more about having a + // contiguous heap in these cases and take additional measures + // to ensure that, so in nearly all cases this should have just + // 1 element. + // + // All access is protected by the mheapLock. + inUse addrRanges + + // scav stores the scavenger state. + scav struct { + lock mutex + + // inUse is a slice of ranges of address space which have not + // yet been looked at by the scavenger. + // + // Protected by lock. + inUse addrRanges + + // gen is the scavenge generation number. + // + // Protected by lock. + gen uint32 + + // reservationBytes is how large of a reservation should be made + // in bytes of address space for each scavenge iteration. + // + // Protected by lock. + reservationBytes uintptr + + // released is the amount of memory released this generation. + // + // Updated atomically. + released uintptr + + // scavLWM is the lowest (offset) address that the scavenger reached this + // scavenge generation. + // + // Protected by lock. + scavLWM offAddr + + // freeHWM is the highest (offset) address of a page that was freed to + // the page allocator this scavenge generation. + // + // Protected by mheapLock. + freeHWM offAddr + } + + // mheap_.lock. This level of indirection makes it possible + // to test pageAlloc indepedently of the runtime allocator. + mheapLock *mutex + + // sysStat is the runtime memstat to update when new system + // memory is committed by the pageAlloc for allocation metadata. + sysStat *sysMemStat + + // Whether or not this struct is being used in tests. + test bool +} + +func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) { + if levelLogPages[0] > logMaxPackedValue { + // We can't represent 1<<levelLogPages[0] pages, the maximum number + // of pages we need to represent at the root level, in a summary, which + // is a big problem. Throw. + print("runtime: root level max pages = ", 1<<levelLogPages[0], "\n") + print("runtime: summary max pages = ", maxPackedValue, "\n") + throw("root level max pages doesn't fit in summary") + } + p.sysStat = sysStat + + // Initialize p.inUse. + p.inUse.init(sysStat) + + // System-dependent initialization. + p.sysInit() + + // Start with the searchAddr in a state indicating there's no free memory. + p.searchAddr = maxSearchAddr + + // Set the mheapLock. + p.mheapLock = mheapLock + + // Initialize scavenge tracking state. 
+ p.scav.scavLWM = maxSearchAddr +} + +// tryChunkOf returns the bitmap data for the given chunk. +// +// Returns nil if the chunk data has not been mapped. +func (p *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData { + l2 := p.chunks[ci.l1()] + if l2 == nil { + return nil + } + return &l2[ci.l2()] +} + +// chunkOf returns the chunk at the given chunk index. +// +// The chunk index must be valid or this method may throw. +func (p *pageAlloc) chunkOf(ci chunkIdx) *pallocData { + return &p.chunks[ci.l1()][ci.l2()] +} + +// grow sets up the metadata for the address range [base, base+size). +// It may allocate metadata, in which case *p.sysStat will be updated. +// +// p.mheapLock must be held. +func (p *pageAlloc) grow(base, size uintptr) { + assertLockHeld(p.mheapLock) + + // Round up to chunks, since we can't deal with increments smaller + // than chunks. Also, sysGrow expects aligned values. + limit := alignUp(base+size, pallocChunkBytes) + base = alignDown(base, pallocChunkBytes) + + // Grow the summary levels in a system-dependent manner. + // We just update a bunch of additional metadata here. + p.sysGrow(base, limit) + + // Update p.start and p.end. + // If no growth happened yet, start == 0. This is generally + // safe since the zero page is unmapped. + firstGrowth := p.start == 0 + start, end := chunkIndex(base), chunkIndex(limit) + if firstGrowth || start < p.start { + p.start = start + } + if end > p.end { + p.end = end + } + // Note that [base, limit) will never overlap with any existing + // range inUse because grow only ever adds never-used memory + // regions to the page allocator. + p.inUse.add(makeAddrRange(base, limit)) + + // A grow operation is a lot like a free operation, so if our + // chunk ends up below p.searchAddr, update p.searchAddr to the + // new address, just like in free. + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b + } + + // Add entries into chunks, which is sparse, if needed. Then, + // initialize the bitmap. + // + // Newly-grown memory is always considered scavenged. + // Set all the bits in the scavenged bitmaps high. + for c := chunkIndex(base); c < chunkIndex(limit); c++ { + if p.chunks[c.l1()] == nil { + // Create the necessary l2 entry. + // + // Store it atomically to avoid races with readers which + // don't acquire the heap lock. + r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat) + if r == nil { + throw("pageAlloc: out of memory") + } + atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r) + } + p.chunkOf(c).scavenged.setRange(0, pallocChunkPages) + } + + // Update summaries accordingly. The grow acts like a free, so + // we need to ensure this newly-free memory is visible in the + // summaries. + p.update(base, size/pageSize, true, false) +} + +// update updates heap metadata. It must be called each time the bitmap +// is updated. +// +// If contig is true, update does some optimizations assuming that there was +// a contiguous allocation or free between addr and addr+npages. alloc indicates +// whether the operation performed was an allocation or a free. +// +// p.mheapLock must be held. +func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) { + assertLockHeld(p.mheapLock) + + // base, limit, start, and end are inclusive. + limit := base + npages*pageSize - 1 + sc, ec := chunkIndex(base), chunkIndex(limit) + + // Handle updating the lowest level first. + if sc == ec { + // Fast path: the allocation doesn't span more than one chunk, + // so update this one and if the summary didn't change, return. 
+ x := p.summary[len(p.summary)-1][sc] + y := p.chunkOf(sc).summarize() + if x == y { + return + } + p.summary[len(p.summary)-1][sc] = y + } else if contig { + // Slow contiguous path: the allocation spans more than one chunk + // and at least one summary is guaranteed to change. + summary := p.summary[len(p.summary)-1] + + // Update the summary for chunk sc. + summary[sc] = p.chunkOf(sc).summarize() + + // Update the summaries for chunks in between, which are + // either totally allocated or freed. + whole := p.summary[len(p.summary)-1][sc+1 : ec] + if alloc { + // Should optimize into a memclr. + for i := range whole { + whole[i] = 0 + } + } else { + for i := range whole { + whole[i] = freeChunkSum + } + } + + // Update the summary for chunk ec. + summary[ec] = p.chunkOf(ec).summarize() + } else { + // Slow general path: the allocation spans more than one chunk + // and at least one summary is guaranteed to change. + // + // We can't assume a contiguous allocation happened, so walk over + // every chunk in the range and manually recompute the summary. + summary := p.summary[len(p.summary)-1] + for c := sc; c <= ec; c++ { + summary[c] = p.chunkOf(c).summarize() + } + } + + // Walk up the radix tree and update the summaries appropriately. + changed := true + for l := len(p.summary) - 2; l >= 0 && changed; l-- { + // Update summaries at level l from summaries at level l+1. + changed = false + + // "Constants" for the previous level which we + // need to compute the summary from that level. + logEntriesPerBlock := levelBits[l+1] + logMaxPages := levelLogPages[l+1] + + // lo and hi describe all the parts of the level we need to look at. + lo, hi := addrsToSummaryRange(l, base, limit+1) + + // Iterate over each block, updating the corresponding summary in the less-granular level. + for i := lo; i < hi; i++ { + children := p.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock] + sum := mergeSummaries(children, logMaxPages) + old := p.summary[l][i] + if old != sum { + changed = true + p.summary[l][i] = sum + } + } + } +} + +// allocRange marks the range of memory [base, base+npages*pageSize) as +// allocated. It also updates the summaries to reflect the newly-updated +// bitmap. +// +// Returns the amount of scavenged memory in bytes present in the +// allocated range. +// +// p.mheapLock must be held. +func (p *pageAlloc) allocRange(base, npages uintptr) uintptr { + assertLockHeld(p.mheapLock) + + limit := base + npages*pageSize - 1 + sc, ec := chunkIndex(base), chunkIndex(limit) + si, ei := chunkPageIndex(base), chunkPageIndex(limit) + + scav := uint(0) + if sc == ec { + // The range doesn't cross any chunk boundaries. + chunk := p.chunkOf(sc) + scav += chunk.scavenged.popcntRange(si, ei+1-si) + chunk.allocRange(si, ei+1-si) + } else { + // The range crosses at least one chunk boundary. + chunk := p.chunkOf(sc) + scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si) + chunk.allocRange(si, pallocChunkPages-si) + for c := sc + 1; c < ec; c++ { + chunk := p.chunkOf(c) + scav += chunk.scavenged.popcntRange(0, pallocChunkPages) + chunk.allocAll() + } + chunk = p.chunkOf(ec) + scav += chunk.scavenged.popcntRange(0, ei+1) + chunk.allocRange(0, ei+1) + } + p.update(base, npages, true, true) + return uintptr(scav) * pageSize +} + +// findMappedAddr returns the smallest mapped offAddr that is +// >= addr. That is, if addr refers to mapped memory, then it is +// returned. If addr is higher than any mapped region, then +// it returns maxOffAddr. +// +// p.mheapLock must be held. 
+func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr { + assertLockHeld(p.mheapLock) + + // If we're not in a test, validate first by checking mheap_.arenas. + // This is a fast path which is only safe to use outside of testing. + ai := arenaIndex(addr.addr()) + if p.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil { + vAddr, ok := p.inUse.findAddrGreaterEqual(addr.addr()) + if ok { + return offAddr{vAddr} + } else { + // The candidate search address is greater than any + // known address, which means we definitely have no + // free memory left. + return maxOffAddr + } + } + return addr +} + +// find searches for the first (address-ordered) contiguous free region of +// npages in size and returns a base address for that region. +// +// It uses p.searchAddr to prune its search and assumes that no palloc chunks +// below chunkIndex(p.searchAddr) contain any free memory at all. +// +// find also computes and returns a candidate p.searchAddr, which may or +// may not prune more of the address space than p.searchAddr already does. +// This candidate is always a valid p.searchAddr. +// +// find represents the slow path and the full radix tree search. +// +// Returns a base address of 0 on failure, in which case the candidate +// searchAddr returned is invalid and must be ignored. +// +// p.mheapLock must be held. +func (p *pageAlloc) find(npages uintptr) (uintptr, offAddr) { + assertLockHeld(p.mheapLock) + + // Search algorithm. + // + // This algorithm walks each level l of the radix tree from the root level + // to the leaf level. It iterates over at most 1 << levelBits[l] of entries + // in a given level in the radix tree, and uses the summary information to + // find either: + // 1) That a given subtree contains a large enough contiguous region, at + // which point it continues iterating on the next level, or + // 2) That there are enough contiguous boundary-crossing bits to satisfy + // the allocation, at which point it knows exactly where to start + // allocating from. + // + // i tracks the index into the current level l's structure for the + // contiguous 1 << levelBits[l] entries we're actually interested in. + // + // NOTE: Technically this search could allocate a region which crosses + // the arenaBaseOffset boundary, which when arenaBaseOffset != 0, is + // a discontinuity. However, the only way this could happen is if the + // page at the zero address is mapped, and this is impossible on + // every system we support where arenaBaseOffset != 0. So, the + // discontinuity is already encoded in the fact that the OS will never + // map the zero page for us, and this function doesn't try to handle + // this case in any way. + + // i is the beginning of the block of entries we're searching at the + // current level. + i := 0 + + // firstFree is the region of address space that we are certain to + // find the first free page in the heap. base and bound are the inclusive + // bounds of this window, and both are addresses in the linearized, contiguous + // view of the address space (with arenaBaseOffset pre-added). At each level, + // this window is narrowed as we find the memory region containing the + // first free page of memory. To begin with, the range reflects the + // full process address space. + // + // firstFree is updated by calling foundFree each time free space in the + // heap is discovered. + // + // At the end of the search, base.addr() is the best new + // searchAddr we could deduce in this search. 
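+ // For reference, on a 64-bit system with heapAddrBits == 48 the levels
+ // work out to (levelBits, levelShift, levelLogPages):
+ //
+ //	level 0: 14, 34, 21  (each entry covers 16 GiB)
+ //	level 1:  3, 31, 18  (2 GiB)
+ //	level 2:  3, 28, 15  (256 MiB)
+ //	level 3:  3, 25, 12  (32 MiB)
+ //	level 4:  3, 22,  9  (a single 4 MiB chunk)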
+	firstFree := struct {
+		base, bound offAddr
+	}{
+		base:  minOffAddr,
+		bound: maxOffAddr,
+	}
+	// foundFree takes the given address range [addr, addr+size) and
+	// updates firstFree if it is a narrower range. The input range must
+	// either be fully contained within firstFree or not overlap with it
+	// at all.
+	//
+	// This way, we'll record the first summary we find with any free
+	// pages on the root level and narrow that down if we descend into
+	// that summary. But as soon as we need to iterate beyond that summary
+	// in a level to find a large enough range, we'll stop narrowing.
+	foundFree := func(addr offAddr, size uintptr) {
+		if firstFree.base.lessEqual(addr) && addr.add(size-1).lessEqual(firstFree.bound) {
+			// This range fits within the current firstFree window, so narrow
+			// down the firstFree window to the base and bound of this range.
+			firstFree.base = addr
+			firstFree.bound = addr.add(size - 1)
+		} else if !(addr.add(size-1).lessThan(firstFree.base) || firstFree.bound.lessThan(addr)) {
+			// This range only partially overlaps with the firstFree range,
+			// so throw.
+			print("runtime: addr = ", hex(addr.addr()), ", size = ", size, "\n")
+			print("runtime: base = ", hex(firstFree.base.addr()), ", bound = ", hex(firstFree.bound.addr()), "\n")
+			throw("range partially overlaps")
+		}
+	}
+
+	// lastSum is the summary which we saw on the previous level that made us
+	// move on to the next level. Used to print additional information in the
+	// case of a catastrophic failure.
+	// lastSumIdx is that summary's index in the previous level.
+	lastSum := packPallocSum(0, 0, 0)
+	lastSumIdx := -1
+
+nextLevel:
+	for l := 0; l < len(p.summary); l++ {
+		// For the root level, entriesPerBlock is the whole level.
+		entriesPerBlock := 1 << levelBits[l]
+		logMaxPages := levelLogPages[l]
+
+		// We've moved into a new level, so let's update i to our new
+		// starting index. This is a no-op for level 0.
+		i <<= levelBits[l]
+
+		// Slice out the block of entries we care about.
+		entries := p.summary[l][i : i+entriesPerBlock]
+
+		// Determine j0, the first index we should start iterating from.
+		// The searchAddr may help us eliminate iterations if we followed the
+		// searchAddr on the previous level or we're on the root level, in which
+		// case the searchAddr should be the same as i after levelShift.
+		j0 := 0
+		if searchIdx := offAddrToLevelIndex(l, p.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
+			j0 = searchIdx & (entriesPerBlock - 1)
+		}
+
+		// Run over the level entries looking for
+		// a contiguous run of at least npages either
+		// within an entry or across entries.
+		//
+		// base contains the page index (relative to
+		// the first entry's first page) of the currently
+		// considered run of consecutive pages.
+		//
+		// size contains the size of the currently considered
+		// run of consecutive pages.
+		var base, size uint
+		for j := j0; j < len(entries); j++ {
+			sum := entries[j]
+			if sum == 0 {
+				// A full entry means we broke any streak and
+				// that we should skip it altogether.
+				size = 0
+				continue
+			}
+
+			// We've encountered a non-zero summary which means
+			// free memory, so update firstFree.
+			foundFree(levelIndexToOffAddr(l, i+j), (uintptr(1)<<logMaxPages)*pageSize)
+
+			s := sum.start()
+			if size+s >= uint(npages) {
+				// If size == 0 we don't have a run yet,
+				// which means base isn't valid. So, set
+				// base to the first page in this block.
+				if size == 0 {
+					base = uint(j) << logMaxPages
+				}
+				// We hit npages; we're done!
+ size += s + break + } + if sum.max() >= uint(npages) { + // The entry itself contains npages contiguous + // free pages, so continue on the next level + // to find that run. + i += j + lastSumIdx = i + lastSum = sum + continue nextLevel + } + if size == 0 || s < 1<<logMaxPages { + // We either don't have a current run started, or this entry + // isn't totally free (meaning we can't continue the current + // one), so try to begin a new run by setting size and base + // based on sum.end. + size = sum.end() + base = uint(j+1)<<logMaxPages - size + continue + } + // The entry is completely free, so continue the run. + size += 1 << logMaxPages + } + if size >= uint(npages) { + // We found a sufficiently large run of free pages straddling + // some boundary, so compute the address and return it. + addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr() + return addr, p.findMappedAddr(firstFree.base) + } + if l == 0 { + // We're at level zero, so that means we've exhausted our search. + return 0, maxSearchAddr + } + + // We're not at level zero, and we exhausted the level we were looking in. + // This means that either our calculations were wrong or the level above + // lied to us. In either case, dump some useful state and throw. + print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n") + print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n") + print("runtime: p.searchAddr = ", hex(p.searchAddr.addr()), ", i = ", i, "\n") + print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n") + for j := 0; j < len(entries); j++ { + sum := entries[j] + print("runtime: summary[", l, "][", i+j, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") + } + throw("bad summary data") + } + + // Since we've gotten to this point, that means we haven't found a + // sufficiently-sized free region straddling some boundary (chunk or larger). + // This means the last summary we inspected must have had a large enough "max" + // value, so look inside the chunk to find a suitable run. + // + // After iterating over all levels, i must contain a chunk index which + // is what the final level represents. + ci := chunkIdx(i) + j, searchIdx := p.chunkOf(ci).find(npages, 0) + if j == ^uint(0) { + // We couldn't find any space in this chunk despite the summaries telling + // us it should be there. There's likely a bug, so dump some state and throw. + sum := p.summary[len(p.summary)-1][i] + print("runtime: summary[", len(p.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") + print("runtime: npages = ", npages, "\n") + throw("bad summary data") + } + + // Compute the address at which the free space starts. + addr := chunkBase(ci) + uintptr(j)*pageSize + + // Since we actually searched the chunk, we may have + // found an even narrower free window. + searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize + foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr) + return addr, p.findMappedAddr(firstFree.base) +} + +// alloc allocates npages worth of memory from the page heap, returning the base +// address for the allocation and the amount of scavenged memory in bytes +// contained in the region [base address, base address + npages*pageSize). +// +// Returns a 0 base address on failure, in which case other returned values +// should be ignored. +// +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. 
+// +//go:systemstack +func (p *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { + assertLockHeld(p.mheapLock) + + // If the searchAddr refers to a region which has a higher address than + // any known chunk, then we know we're out of memory. + if chunkIndex(p.searchAddr.addr()) >= p.end { + return 0, 0 + } + + // If npages has a chance of fitting in the chunk where the searchAddr is, + // search it directly. + searchAddr := minOffAddr + if pallocChunkPages-chunkPageIndex(p.searchAddr.addr()) >= uint(npages) { + // npages is guaranteed to be no greater than pallocChunkPages here. + i := chunkIndex(p.searchAddr.addr()) + if max := p.summary[len(p.summary)-1][i].max(); max >= uint(npages) { + j, searchIdx := p.chunkOf(i).find(npages, chunkPageIndex(p.searchAddr.addr())) + if j == ^uint(0) { + print("runtime: max = ", max, ", npages = ", npages, "\n") + print("runtime: searchIdx = ", chunkPageIndex(p.searchAddr.addr()), ", p.searchAddr = ", hex(p.searchAddr.addr()), "\n") + throw("bad summary data") + } + addr = chunkBase(i) + uintptr(j)*pageSize + searchAddr = offAddr{chunkBase(i) + uintptr(searchIdx)*pageSize} + goto Found + } + } + // We failed to use a searchAddr for one reason or another, so try + // the slow path. + addr, searchAddr = p.find(npages) + if addr == 0 { + if npages == 1 { + // We failed to find a single free page, the smallest unit + // of allocation. This means we know the heap is completely + // exhausted. Otherwise, the heap still might have free + // space in it, just not enough contiguous space to + // accommodate npages. + p.searchAddr = maxSearchAddr + } + return 0, 0 + } +Found: + // Go ahead and actually mark the bits now that we have an address. + scav = p.allocRange(addr, npages) + + // If we found a higher searchAddr, we know that all the + // heap memory before that searchAddr in an offset address space is + // allocated, so bump p.searchAddr up to the new one. + if p.searchAddr.lessThan(searchAddr) { + p.searchAddr = searchAddr + } + return addr, scav +} + +// free returns npages worth of memory starting at base back to the page heap. +// +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) free(base, npages uintptr, scavenged bool) { + assertLockHeld(p.mheapLock) + + // If we're freeing pages below the p.searchAddr, update searchAddr. + if b := (offAddr{base}); b.lessThan(p.searchAddr) { + p.searchAddr = b + } + limit := base + npages*pageSize - 1 + if !scavenged { + // Update the free high watermark for the scavenger. + if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { + p.scav.freeHWM = offLimit + } + } + if npages == 1 { + // Fast path: we're clearing a single bit, and we know exactly + // where it is, so mark it directly. + i := chunkIndex(base) + p.chunkOf(i).free1(chunkPageIndex(base)) + } else { + // Slow path: we're clearing more bits so we may need to iterate. + sc, ec := chunkIndex(base), chunkIndex(limit) + si, ei := chunkPageIndex(base), chunkPageIndex(limit) + + if sc == ec { + // The range doesn't cross any chunk boundaries. + p.chunkOf(sc).free(si, ei+1-si) + } else { + // The range crosses at least one chunk boundary. 
+ p.chunkOf(sc).free(si, pallocChunkPages-si) + for c := sc + 1; c < ec; c++ { + p.chunkOf(c).freeAll() + } + p.chunkOf(ec).free(0, ei+1) + } + } + p.update(base, npages, true, false) +} + +const ( + pallocSumBytes = unsafe.Sizeof(pallocSum(0)) + + // maxPackedValue is the maximum value that any of the three fields in + // the pallocSum may take on. + maxPackedValue = 1 << logMaxPackedValue + logMaxPackedValue = logPallocChunkPages + (summaryLevels-1)*summaryLevelBits + + freeChunkSum = pallocSum(uint64(pallocChunkPages) | + uint64(pallocChunkPages<<logMaxPackedValue) | + uint64(pallocChunkPages<<(2*logMaxPackedValue))) +) + +// pallocSum is a packed summary type which packs three numbers: start, max, +// and end into a single 8-byte value. Each of these values are a summary of +// a bitmap and are thus counts, each of which may have a maximum value of +// 2^21 - 1, or all three may be equal to 2^21. The latter case is represented +// by just setting the 64th bit. +type pallocSum uint64 + +// packPallocSum takes a start, max, and end value and produces a pallocSum. +func packPallocSum(start, max, end uint) pallocSum { + if max == maxPackedValue { + return pallocSum(uint64(1 << 63)) + } + return pallocSum((uint64(start) & (maxPackedValue - 1)) | + ((uint64(max) & (maxPackedValue - 1)) << logMaxPackedValue) | + ((uint64(end) & (maxPackedValue - 1)) << (2 * logMaxPackedValue))) +} + +// start extracts the start value from a packed sum. +func (p pallocSum) start() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint(uint64(p) & (maxPackedValue - 1)) +} + +// max extracts the max value from a packed sum. +func (p pallocSum) max() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1)) +} + +// end extracts the end value from a packed sum. +func (p pallocSum) end() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1)) +} + +// unpack unpacks all three values from the summary. +func (p pallocSum) unpack() (uint, uint, uint) { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue, maxPackedValue, maxPackedValue + } + return uint(uint64(p) & (maxPackedValue - 1)), + uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1)), + uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1)) +} + +// mergeSummaries merges consecutive summaries which may each represent at +// most 1 << logMaxPagesPerSum pages each together into one. +func mergeSummaries(sums []pallocSum, logMaxPagesPerSum uint) pallocSum { + // Merge the summaries in sums into one. + // + // We do this by keeping a running summary representing the merged + // summaries of sums[:i] in start, max, and end. + start, max, end := sums[0].unpack() + for i := 1; i < len(sums); i++ { + // Merge in sums[i]. + si, mi, ei := sums[i].unpack() + + // Merge in sums[i].start only if the running summary is + // completely free, otherwise this summary's start + // plays no role in the combined sum. + if start == uint(i)<<logMaxPagesPerSum { + start += si + } + + // Recompute the max value of the running sum by looking + // across the boundary between the running sum and sums[i] + // and at the max sums[i], taking the greatest of those two + // and the max of the running sum. + if end+si > max { + max = end + si + } + if mi > max { + max = mi + } + + // Merge in end by checking if this new summary is totally + // free. 
If it is, then we want to extend the running sum's + // end by the new summary. If not, then we have some alloc'd + // pages in there and we just want to take the end value in + // sums[i]. + if ei == 1<<logMaxPagesPerSum { + end += 1 << logMaxPagesPerSum + } else { + end = ei + } + } + return packPallocSum(start, max, end) +} diff --git a/contrib/go/_std_1.18/src/runtime/mpagealloc_64bit.go b/contrib/go/_std_1.18/src/runtime/mpagealloc_64bit.go new file mode 100644 index 0000000000..1bacfbe0fa --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mpagealloc_64bit.go @@ -0,0 +1,178 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x + +package runtime + +import "unsafe" + +const ( + // The number of levels in the radix tree. + summaryLevels = 5 + + // Constants for testing. + pageAlloc32Bit = 0 + pageAlloc64Bit = 1 + + // Number of bits needed to represent all indices into the L1 of the + // chunks map. + // + // See (*pageAlloc).chunks for more details. Update the documentation + // there should this number change. + pallocChunksL1Bits = 13 +) + +// levelBits is the number of bits in the radix for a given level in the super summary +// structure. +// +// The sum of all the entries of levelBits should equal heapAddrBits. +var levelBits = [summaryLevels]uint{ + summaryL0Bits, + summaryLevelBits, + summaryLevelBits, + summaryLevelBits, + summaryLevelBits, +} + +// levelShift is the number of bits to shift to acquire the radix for a given level +// in the super summary structure. +// +// With levelShift, one can compute the index of the summary at level l related to a +// pointer p by doing: +// p >> levelShift[l] +var levelShift = [summaryLevels]uint{ + heapAddrBits - summaryL0Bits, + heapAddrBits - summaryL0Bits - 1*summaryLevelBits, + heapAddrBits - summaryL0Bits - 2*summaryLevelBits, + heapAddrBits - summaryL0Bits - 3*summaryLevelBits, + heapAddrBits - summaryL0Bits - 4*summaryLevelBits, +} + +// levelLogPages is log2 the maximum number of runtime pages in the address space +// a summary in the given level represents. +// +// The leaf level always represents exactly log2 of 1 chunk's worth of pages. +var levelLogPages = [summaryLevels]uint{ + logPallocChunkPages + 4*summaryLevelBits, + logPallocChunkPages + 3*summaryLevelBits, + logPallocChunkPages + 2*summaryLevelBits, + logPallocChunkPages + 1*summaryLevelBits, + logPallocChunkPages, +} + +// sysInit performs architecture-dependent initialization of fields +// in pageAlloc. pageAlloc should be uninitialized except for sysStat +// if any runtime statistic should be updated. +func (p *pageAlloc) sysInit() { + // Reserve memory for each level. This will get mapped in + // as R/W by setArenas. + for l, shift := range levelShift { + entries := 1 << (heapAddrBits - shift) + + // Reserve b bytes of memory anywhere in the address space. + b := alignUp(uintptr(entries)*pallocSumBytes, physPageSize) + r := sysReserve(nil, b) + if r == nil { + throw("failed to reserve page summary memory") + } + + // Put this reservation into a slice. + sl := notInHeapSlice{(*notInHeap)(r), 0, entries} + p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + } +} + +// sysGrow performs architecture-dependent operations on heap +// growth for the page allocator, such as mapping in new memory +// for summaries. 
+// It also updates the length of the slices in
+// p.summary.
+//
+// base is the base of the newly-added heap memory and limit is
+// the first address past the end of the newly-added heap memory.
+// Both must be aligned to pallocChunkBytes.
+//
+// The caller must update p.start and p.end after calling sysGrow.
+func (p *pageAlloc) sysGrow(base, limit uintptr) {
+	if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
+		print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
+		throw("sysGrow bounds not aligned to pallocChunkBytes")
+	}
+
+	// addrRangeToSummaryRange converts a range of addresses into a range
+	// of summary indices which must be mapped to support those addresses
+	// in the summary range.
+	addrRangeToSummaryRange := func(level int, r addrRange) (int, int) {
+		sumIdxBase, sumIdxLimit := addrsToSummaryRange(level, r.base.addr(), r.limit.addr())
+		return blockAlignSummaryRange(level, sumIdxBase, sumIdxLimit)
+	}
+
+	// summaryRangeToSumAddrRange converts a range of indices in any
+	// level of p.summary into page-aligned addresses which cover that
+	// range of indices.
+	summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange {
+		baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize)
+		limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize)
+		base := unsafe.Pointer(&p.summary[level][0])
+		return addrRange{
+			offAddr{uintptr(add(base, baseOffset))},
+			offAddr{uintptr(add(base, limitOffset))},
+		}
+	}
+
+	// addrRangeToSumAddrRange is a convenience function that converts
+	// an address range r to the address range of the given summary level
+	// that stores the summaries for r.
+	addrRangeToSumAddrRange := func(level int, r addrRange) addrRange {
+		sumIdxBase, sumIdxLimit := addrRangeToSummaryRange(level, r)
+		return summaryRangeToSumAddrRange(level, sumIdxBase, sumIdxLimit)
+	}
+
+	// Find the first inUse index which is strictly greater than base.
+	//
+	// Because this function will never be asked to remap the same memory
+	// twice, this index is effectively the index at which we would insert
+	// this new growth, and base will never overlap/be contained within
+	// any existing range.
+	//
+	// This will be used to look at what memory in the summary array is already
+	// mapped before and after this new range.
+	inUseIndex := p.inUse.findSucc(base)
+
+	// Walk up the radix tree and map summaries in as needed.
+	for l := range p.summary {
+		// Figure out what part of the summary array this new address space needs.
+		needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit))
+
+		// Update the summary slices with a new upper-bound. This ensures
+		// we get tight bounds checks on at least the top bound.
+		//
+		// We must do this regardless of whether we map new memory.
+		if needIdxLimit > len(p.summary[l]) {
+			p.summary[l] = p.summary[l][:needIdxLimit]
+		}
+
+		// Compute the needed address range in the summary array for level l.
+		need := summaryRangeToSumAddrRange(l, needIdxBase, needIdxLimit)
+
+		// Prune need down to what needs to be newly mapped. Some parts of it may
+		// already be mapped by what inUse describes due to page alignment requirements
+		// for mapping. prune's invariants are guaranteed by the fact that this
+		// function will never be asked to remap the same memory twice.
+		if inUseIndex > 0 {
+			need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1]))
+		}
+		if inUseIndex < len(p.inUse.ranges) {
+			need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex]))
+		}
+		// It's possible that after our pruning above, there's nothing new to map.
+		if need.size() == 0 {
+			continue
+		}
+
+		// Map and commit need.
+		sysMap(unsafe.Pointer(need.base.addr()), need.size(), p.sysStat)
+		sysUsed(unsafe.Pointer(need.base.addr()), need.size())
+	}
+}
diff --git a/contrib/go/_std_1.18/src/runtime/mpagecache.go b/contrib/go/_std_1.18/src/runtime/mpagecache.go
new file mode 100644
index 0000000000..7206e2dbdb
--- /dev/null
+++ b/contrib/go/_std_1.18/src/runtime/mpagecache.go
@@ -0,0 +1,177 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const pageCachePages = 8 * unsafe.Sizeof(pageCache{}.cache)
+
+// pageCache represents a per-p cache of pages the allocator can
+// allocate from without a lock. More specifically, it represents
+// a pageCachePages*pageSize chunk of memory with 0 or more free
+// pages in it.
+type pageCache struct {
+	base  uintptr // base address of the chunk
+	cache uint64  // 64-bit bitmap representing free pages (1 means free)
+	scav  uint64  // 64-bit bitmap representing scavenged pages (1 means scavenged)
+}
+
+// empty returns true if the pageCache has no free pages, and false
+// otherwise.
+func (c *pageCache) empty() bool {
+	return c.cache == 0
+}
+
+// alloc allocates npages from the page cache and is the main entry
+// point for allocation.
+//
+// Returns a base address and the amount of scavenged memory in the
+// allocated region in bytes.
+//
+// Returns a base address of zero on failure, in which case the
+// amount of scavenged memory should be ignored.
+func (c *pageCache) alloc(npages uintptr) (uintptr, uintptr) {
+	if c.cache == 0 {
+		return 0, 0
+	}
+	if npages == 1 {
+		i := uintptr(sys.TrailingZeros64(c.cache))
+		scav := (c.scav >> i) & 1
+		c.cache &^= 1 << i // clear free bit to mark in-use
+		c.scav &^= 1 << i  // clear bit to mark unscavenged
+		return c.base + i*pageSize, uintptr(scav) * pageSize
+	}
+	return c.allocN(npages)
+}
+
+// allocN is a helper which attempts to allocate npages worth of pages
+// from the cache. It represents the general case for allocating from
+// the page cache.
+//
+// Returns a base address and the amount of scavenged memory in the
+// allocated region in bytes.
+func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) {
+	i := findBitRange64(c.cache, uint(npages))
+	if i >= 64 {
+		return 0, 0
+	}
+	mask := ((uint64(1) << npages) - 1) << i
+	scav := sys.OnesCount64(c.scav & mask)
+	c.cache &^= mask // mark in-use bits
+	c.scav &^= mask  // clear scavenged bits
+	return c.base + uintptr(i*pageSize), uintptr(scav) * pageSize
+}
+
+// flush empties out unallocated free pages in the given cache
+// into p. Then, it clears the cache, such that empty returns
+// true.
+//
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
+func (c *pageCache) flush(p *pageAlloc) {
+	assertLockHeld(p.mheapLock)
+
+	if c.empty() {
+		return
+	}
+	ci := chunkIndex(c.base)
+	pi := chunkPageIndex(c.base)
+
+	// This method is called very infrequently, so just do the
+	// slower, safer thing by iterating over each bit individually.
+ for i := uint(0); i < 64; i++ { + if c.cache&(1<<i) != 0 { + p.chunkOf(ci).free1(pi + i) + } + if c.scav&(1<<i) != 0 { + p.chunkOf(ci).scavenged.setRange(pi+i, 1) + } + } + // Since this is a lot like a free, we need to make sure + // we update the searchAddr just like free does. + if b := (offAddr{c.base}); b.lessThan(p.searchAddr) { + p.searchAddr = b + } + p.update(c.base, pageCachePages, false, false) + *c = pageCache{} +} + +// allocToCache acquires a pageCachePages-aligned chunk of free pages which +// may not be contiguous, and returns a pageCache structure which owns the +// chunk. +// +// p.mheapLock must be held. +// +// Must run on the system stack because p.mheapLock must be held. +// +//go:systemstack +func (p *pageAlloc) allocToCache() pageCache { + assertLockHeld(p.mheapLock) + + // If the searchAddr refers to a region which has a higher address than + // any known chunk, then we know we're out of memory. + if chunkIndex(p.searchAddr.addr()) >= p.end { + return pageCache{} + } + c := pageCache{} + ci := chunkIndex(p.searchAddr.addr()) // chunk index + var chunk *pallocData + if p.summary[len(p.summary)-1][ci] != 0 { + // Fast path: there's free pages at or near the searchAddr address. + chunk = p.chunkOf(ci) + j, _ := chunk.find(1, chunkPageIndex(p.searchAddr.addr())) + if j == ^uint(0) { + throw("bad summary data") + } + c = pageCache{ + base: chunkBase(ci) + alignDown(uintptr(j), 64)*pageSize, + cache: ^chunk.pages64(j), + scav: chunk.scavenged.block64(j), + } + } else { + // Slow path: the searchAddr address had nothing there, so go find + // the first free page the slow way. + addr, _ := p.find(1) + if addr == 0 { + // We failed to find adequate free space, so mark the searchAddr as OoM + // and return an empty pageCache. + p.searchAddr = maxSearchAddr + return pageCache{} + } + ci := chunkIndex(addr) + chunk = p.chunkOf(ci) + c = pageCache{ + base: alignDown(addr, 64*pageSize), + cache: ^chunk.pages64(chunkPageIndex(addr)), + scav: chunk.scavenged.block64(chunkPageIndex(addr)), + } + } + + // Set the page bits as allocated and clear the scavenged bits, but + // be careful to only set and clear the relevant bits. + cpi := chunkPageIndex(c.base) + chunk.allocPages64(cpi, c.cache) + chunk.scavenged.clearBlock64(cpi, c.cache&c.scav /* free and scavenged */) + + // Update as an allocation, but note that it's not contiguous. + p.update(c.base, pageCachePages, false, true) + + // Set the search address to the last page represented by the cache. + // Since all of the pages in this block are going to the cache, and we + // searched for the first free page, we can confidently start at the + // next page. + // + // However, p.searchAddr is not allowed to point into unmapped heap memory + // unless it is maxSearchAddr, so make it the last page as opposed to + // the page after. + p.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} + return c +} diff --git a/contrib/go/_std_1.18/src/runtime/mpallocbits.go b/contrib/go/_std_1.18/src/runtime/mpallocbits.go new file mode 100644 index 0000000000..f63164becd --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mpallocbits.go @@ -0,0 +1,446 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/sys" +) + +// pageBits is a bitmap representing one bit per page in a palloc chunk. 
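The word-and-bit arithmetic pageBits relies on (word i/64, bit i%64, popcount over a masked range) can be sketched outside the runtime. A minimal, self-contained illustration follows; toyBits is a hypothetical stand-in for pageBits and is not a runtime type.

package main

import (
	"fmt"
	"math/bits"
)

// toyBits is a hypothetical 128-page bitmap: one bit per page, like pageBits.
type toyBits [128 / 64]uint64

func (b *toyBits) set(i uint)   { b[i/64] |= 1 << (i % 64) }
func (b *toyBits) clear(i uint) { b[i/64] &^= 1 << (i % 64) }

// popcntRange counts set bits in [i, i+n) for ranges that stay within one
// 64-bit word; the real pageBits.popcntRange also handles ranges that
// cross word boundaries.
func (b *toyBits) popcntRange(i, n uint) uint {
	mask := ((uint64(1) << n) - 1) << (i % 64)
	return uint(bits.OnesCount64(b[i/64] & mask))
}

func main() {
	var b toyBits
	b.set(3)
	b.set(4)
	b.set(70)
	fmt.Println(b.popcntRange(0, 10))  // 2
	b.clear(4)
	fmt.Println(b.popcntRange(0, 10))  // 1
	fmt.Println(b.popcntRange(64, 10)) // 1 (bit 70 lives in the second word)
}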
+type pageBits [pallocChunkPages / 64]uint64 + +// get returns the value of the i'th bit in the bitmap. +func (b *pageBits) get(i uint) uint { + return uint((b[i/64] >> (i % 64)) & 1) +} + +// block64 returns the 64-bit aligned block of bits containing the i'th bit. +func (b *pageBits) block64(i uint) uint64 { + return b[i/64] +} + +// set sets bit i of pageBits. +func (b *pageBits) set(i uint) { + b[i/64] |= 1 << (i % 64) +} + +// setRange sets bits in the range [i, i+n). +func (b *pageBits) setRange(i, n uint) { + _ = b[i/64] + if n == 1 { + // Fast path for the n == 1 case. + b.set(i) + return + } + // Set bits [i, j]. + j := i + n - 1 + if i/64 == j/64 { + b[i/64] |= ((uint64(1) << n) - 1) << (i % 64) + return + } + _ = b[j/64] + // Set leading bits. + b[i/64] |= ^uint64(0) << (i % 64) + for k := i/64 + 1; k < j/64; k++ { + b[k] = ^uint64(0) + } + // Set trailing bits. + b[j/64] |= (uint64(1) << (j%64 + 1)) - 1 +} + +// setAll sets all the bits of b. +func (b *pageBits) setAll() { + for i := range b { + b[i] = ^uint64(0) + } +} + +// setBlock64 sets the 64-bit aligned block of bits containing the i'th bit that +// are set in v. +func (b *pageBits) setBlock64(i uint, v uint64) { + b[i/64] |= v +} + +// clear clears bit i of pageBits. +func (b *pageBits) clear(i uint) { + b[i/64] &^= 1 << (i % 64) +} + +// clearRange clears bits in the range [i, i+n). +func (b *pageBits) clearRange(i, n uint) { + _ = b[i/64] + if n == 1 { + // Fast path for the n == 1 case. + b.clear(i) + return + } + // Clear bits [i, j]. + j := i + n - 1 + if i/64 == j/64 { + b[i/64] &^= ((uint64(1) << n) - 1) << (i % 64) + return + } + _ = b[j/64] + // Clear leading bits. + b[i/64] &^= ^uint64(0) << (i % 64) + for k := i/64 + 1; k < j/64; k++ { + b[k] = 0 + } + // Clear trailing bits. + b[j/64] &^= (uint64(1) << (j%64 + 1)) - 1 +} + +// clearAll frees all the bits of b. +func (b *pageBits) clearAll() { + for i := range b { + b[i] = 0 + } +} + +// clearBlock64 clears the 64-bit aligned block of bits containing the i'th bit that +// are set in v. +func (b *pageBits) clearBlock64(i uint, v uint64) { + b[i/64] &^= v +} + +// popcntRange counts the number of set bits in the +// range [i, i+n). +func (b *pageBits) popcntRange(i, n uint) (s uint) { + if n == 1 { + return uint((b[i/64] >> (i % 64)) & 1) + } + _ = b[i/64] + j := i + n - 1 + if i/64 == j/64 { + return uint(sys.OnesCount64((b[i/64] >> (i % 64)) & ((1 << n) - 1))) + } + _ = b[j/64] + s += uint(sys.OnesCount64(b[i/64] >> (i % 64))) + for k := i/64 + 1; k < j/64; k++ { + s += uint(sys.OnesCount64(b[k])) + } + s += uint(sys.OnesCount64(b[j/64] & ((1 << (j%64 + 1)) - 1))) + return +} + +// pallocBits is a bitmap that tracks page allocations for at most one +// palloc chunk. +// +// The precise representation is an implementation detail, but for the +// sake of documentation, 0s are free pages and 1s are allocated pages. +type pallocBits pageBits + +// summarize returns a packed summary of the bitmap in pallocBits. 
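summarize packs three numbers for a chunk: the count of leading free pages (start), the longest run of free pages anywhere (max), and the count of trailing free pages (end). For intuition, here is a hedged brute-force sketch of those quantities over a small hypothetical bitmap; the function below computes the same thing with much faster word-at-a-time bit tricks.

package main

import "fmt"

// summarizeNaive returns, for a bitmap where false means a free page:
// start: number of leading free pages,
// max:   longest run of free pages anywhere,
// end:   number of trailing free pages.
func summarizeNaive(pages []bool) (start, max, end uint) {
	cur := uint(0)
	seenAlloc := false
	for _, allocated := range pages {
		if allocated {
			seenAlloc = true
			cur = 0
			continue
		}
		cur++
		if !seenAlloc {
			start = cur
		}
		if cur > max {
			max = cur
		}
	}
	end = cur
	return
}

func main() {
	// free, free, alloc, free, free, free, alloc, free
	pages := []bool{false, false, true, false, false, false, true, false}
	fmt.Println(summarizeNaive(pages)) // 2 3 1
}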
+func (b *pallocBits) summarize() pallocSum {
+	var start, max, cur uint
+	const notSetYet = ^uint(0) // sentinel for start value
+	start = notSetYet
+	for i := 0; i < len(b); i++ {
+		x := b[i]
+		if x == 0 {
+			cur += 64
+			continue
+		}
+		t := uint(sys.TrailingZeros64(x))
+		l := uint(sys.LeadingZeros64(x))
+
+		// Finish any region spanning the uint64s
+		cur += t
+		if start == notSetYet {
+			start = cur
+		}
+		if cur > max {
+			max = cur
+		}
+		// Final region that might span to next uint64
+		cur = l
+	}
+	if start == notSetYet {
+		// Made it all the way through without finding a single 1 bit.
+		const n = uint(64 * len(b))
+		return packPallocSum(n, n, n)
+	}
+	if cur > max {
+		max = cur
+	}
+	if max >= 64-2 {
+		// There is no way an internal run of zeros could beat max.
+		return packPallocSum(start, max, cur)
+	}
+	// Now look inside each uint64 for runs of zeros.
+	// All uint64s must be nonzero, or we would have aborted above.
+outer:
+	for i := 0; i < len(b); i++ {
+		x := b[i]
+
+		// Look inside this uint64. We have a pattern like
+		//   000000 1xxxxx1 000000
+		// We need to look inside the 1xxxxx1 for any contiguous
+		// region of zeros.
+
+		// We already know the trailing zeros are no larger than max. Remove them.
+		x >>= sys.TrailingZeros64(x) & 63
+		if x&(x+1) == 0 { // no more zeros (except at the top).
+			continue
+		}
+
+		// Strategy: shrink all runs of zeros by max. If any runs of zero
+		// remain, then we've identified a larger maximum zero run.
+		p := max     // number of zeros we still need to shrink by.
+		k := uint(1) // current minimum length of runs of ones in x.
+		for {
+			// Shrink all runs of zeros by p places (except the top zeros).
+			for p > 0 {
+				if p <= k {
+					// Shift p ones down into the top of each run of zeros.
+					x |= x >> (p & 63)
+					if x&(x+1) == 0 { // no more zeros (except at the top).
+						continue outer
+					}
+					break
+				}
+				// Shift k ones down into the top of each run of zeros.
+				x |= x >> (k & 63)
+				if x&(x+1) == 0 { // no more zeros (except at the top).
+					continue outer
+				}
+				p -= k
+				// We've just doubled the minimum length of 1-runs.
+				// This allows us to shift farther in the next iteration.
+				k *= 2
+			}
+
+			// The length of the lowest-order zero run is an increment to our maximum.
+			j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
+			x >>= j & 63                       // remove trailing ones
+			j = uint(sys.TrailingZeros64(x))   // count contiguous trailing zeros
+			x >>= j & 63                       // remove zeros
+			max += j                           // we have a new maximum!
+			if x&(x+1) == 0 {                  // no more zeros (except at the top).
+				continue outer
+			}
+			p = j // remove j more zeros from each zero run.
+		}
+	}
+	return packPallocSum(start, max, cur)
+}
+
+// find searches for npages contiguous free pages in pallocBits and returns
+// the index where that run starts, as well as the index of the first free page
+// it found in the search. searchIdx represents the first known free page and
+// where to begin the next search from.
+//
+// If find fails to find any free space, it returns an index of ^uint(0) and
+// the new searchIdx should be ignored.
+//
+// Note that if npages == 1, the two returned values will always be identical.
+func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) {
+	if npages == 1 {
+		addr := b.find1(searchIdx)
+		return addr, addr
+	} else if npages <= 64 {
+		return b.findSmallN(npages, searchIdx)
+	}
+	return b.findLargeN(npages, searchIdx)
+}
+
+// find1 is a helper for find which searches for a single free page
+// in the pallocBits and returns the index.
+//
+// See find for an explanation of the searchIdx parameter.
+func (b *pallocBits) find1(searchIdx uint) uint {
+	_ = b[0] // lift nil check out of loop
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		x := b[i]
+		if ^x == 0 {
+			continue
+		}
+		return i*64 + uint(sys.TrailingZeros64(^x))
+	}
+	return ^uint(0)
+}
+
+// findSmallN is a helper for find which searches for npages contiguous free pages
+// in this pallocBits and returns the index where that run of contiguous pages
+// starts as well as the index of the first free page it finds in its search.
+//
+// See find for an explanation of the searchIdx parameter.
+//
+// Returns a ^uint(0) index on failure and the new searchIdx should be ignored.
+//
+// findSmallN assumes npages <= 64, where any such contiguous run of pages
+// crosses at most one aligned 64-bit boundary in the bits.
+func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) {
+	end, newSearchIdx := uint(0), ^uint(0)
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		bi := b[i]
+		if ^bi == 0 {
+			end = 0
+			continue
+		}
+		// First see if we can pack our allocation in the trailing
+		// zeros plus the end of the last 64 bits.
+		if newSearchIdx == ^uint(0) {
+			// The new searchIdx is going to be at these 64 bits after any
+			// 1s we find, so count trailing 1s.
+			newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi))
+		}
+		start := uint(sys.TrailingZeros64(bi))
+		if end+start >= uint(npages) {
+			return i*64 - end, newSearchIdx
+		}
+		// Next, check the interior of the 64-bit chunk.
+		j := findBitRange64(^bi, uint(npages))
+		if j < 64 {
+			return i*64 + j, newSearchIdx
+		}
+		end = uint(sys.LeadingZeros64(bi))
+	}
+	return ^uint(0), newSearchIdx
+}
+
+// findLargeN is a helper for find which searches for npages contiguous free pages
+// in this pallocBits and returns the index where that run starts, as well as the
+// index of the first free page it found in its search.
+//
+// See alloc for an explanation of the searchIdx parameter.
+//
+// Returns a ^uint(0) index on failure and the new searchIdx should be ignored.
+//
+// findLargeN assumes npages > 64, where any such run of free pages
+// crosses at least one aligned 64-bit boundary in the bits.
+func (b *pallocBits) findLargeN(npages uintptr, searchIdx uint) (uint, uint) {
+	start, size, newSearchIdx := ^uint(0), uint(0), ^uint(0)
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		x := b[i]
+		if x == ^uint64(0) {
+			size = 0
+			continue
+		}
+		if newSearchIdx == ^uint(0) {
+			// The new searchIdx is going to be at these 64 bits after any
+			// 1s we find, so count trailing 1s.
+			newSearchIdx = i*64 + uint(sys.TrailingZeros64(^x))
+		}
+		if size == 0 {
+			size = uint(sys.LeadingZeros64(x))
+			start = i*64 + 64 - size
+			continue
+		}
+		s := uint(sys.TrailingZeros64(x))
+		if s+size >= uint(npages) {
+			size += s
+			return start, newSearchIdx
+		}
+		if s < 64 {
+			size = uint(sys.LeadingZeros64(x))
+			start = i*64 + 64 - size
+			continue
+		}
+		size += 64
+	}
+	if size < uint(npages) {
+		return ^uint(0), newSearchIdx
+	}
+	return start, newSearchIdx
+}
+
+// allocRange allocates the range [i, i+n).
+func (b *pallocBits) allocRange(i, n uint) {
+	(*pageBits)(b).setRange(i, n)
+}
+
+// allocAll allocates all the bits of b.
+func (b *pallocBits) allocAll() {
+	(*pageBits)(b).setAll()
+}
+
+// free1 frees a single page in the pallocBits at i.
+func (b *pallocBits) free1(i uint) {
+	(*pageBits)(b).clear(i)
+}
+
+// free frees the range [i, i+n) of pages in the pallocBits.
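findSmallN above defers to findBitRange64 (defined later in this file), which locates n consecutive set bits by repeatedly ANDing the word with shifted copies of itself. A standalone copy of that shrinking trick, for illustration only (findRun is not a runtime identifier):

package main

import (
	"fmt"
	"math/bits"
)

// findRun returns the lowest bit index at which c has n consecutive 1 bits,
// or 64 if there is no such run. n must be > 0. It shaves n-1 bits off every
// run of 1s and then takes the first surviving 1 bit.
func findRun(c uint64, n uint) uint {
	p := n - 1 // how many bits we still need to shave off each run
	k := uint(1)
	for p > 0 {
		if p <= k {
			c &= c >> (p & 63)
			break
		}
		c &= c >> (k & 63)
		if c == 0 {
			return 64
		}
		p -= k
		k *= 2 // surviving runs shrank, so we can shift farther next time
	}
	return uint(bits.TrailingZeros64(c)) // 64 if c became 0
}

func main() {
	c := uint64(0b1111_0011_0110) // runs of 1s of length 2, 2, and 4
	fmt.Println(findRun(c, 3))    // 8: the length-4 run starts at bit 8
	fmt.Println(findRun(c, 5))    // 64: no run of 5 exists
}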
+func (b *pallocBits) free(i, n uint) { + (*pageBits)(b).clearRange(i, n) +} + +// freeAll frees all the bits of b. +func (b *pallocBits) freeAll() { + (*pageBits)(b).clearAll() +} + +// pages64 returns a 64-bit bitmap representing a block of 64 pages aligned +// to 64 pages. The returned block of pages is the one containing the i'th +// page in this pallocBits. Each bit represents whether the page is in-use. +func (b *pallocBits) pages64(i uint) uint64 { + return (*pageBits)(b).block64(i) +} + +// allocPages64 allocates a 64-bit block of 64 pages aligned to 64 pages according +// to the bits set in alloc. The block set is the one containing the i'th page. +func (b *pallocBits) allocPages64(i uint, alloc uint64) { + (*pageBits)(b).setBlock64(i, alloc) +} + +// findBitRange64 returns the bit index of the first set of +// n consecutive 1 bits. If no consecutive set of 1 bits of +// size n may be found in c, then it returns an integer >= 64. +// n must be > 0. +func findBitRange64(c uint64, n uint) uint { + // This implementation is based on shrinking the length of + // runs of contiguous 1 bits. We remove the top n-1 1 bits + // from each run of 1s, then look for the first remaining 1 bit. + p := n - 1 // number of 1s we want to remove. + k := uint(1) // current minimum width of runs of 0 in c. + for p > 0 { + if p <= k { + // Shift p 0s down into the top of each run of 1s. + c &= c >> (p & 63) + break + } + // Shift k 0s down into the top of each run of 1s. + c &= c >> (k & 63) + if c == 0 { + return 64 + } + p -= k + // We've just doubled the minimum length of 0-runs. + // This allows us to shift farther in the next iteration. + k *= 2 + } + // Find first remaining 1. + // Since we shrunk from the top down, the first 1 is in + // its correct original position. + return uint(sys.TrailingZeros64(c)) +} + +// pallocData encapsulates pallocBits and a bitmap for +// whether or not a given page is scavenged in a single +// structure. It's effectively a pallocBits with +// additional functionality. +// +// Update the comment on (*pageAlloc).chunks should this +// structure change. +type pallocData struct { + pallocBits + scavenged pageBits +} + +// allocRange sets bits [i, i+n) in the bitmap to 1 and +// updates the scavenged bits appropriately. +func (m *pallocData) allocRange(i, n uint) { + // Clear the scavenged bits when we alloc the range. + m.pallocBits.allocRange(i, n) + m.scavenged.clearRange(i, n) +} + +// allocAll sets every bit in the bitmap to 1 and updates +// the scavenged bits appropriately. +func (m *pallocData) allocAll() { + // Clear the scavenged bits when we alloc the range. + m.pallocBits.allocAll() + m.scavenged.clearAll() +} diff --git a/contrib/go/_std_1.18/src/runtime/mprof.go b/contrib/go/_std_1.18/src/runtime/mprof.go new file mode 100644 index 0000000000..569c17f0a7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mprof.go @@ -0,0 +1,937 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Malloc profiling. +// Patterned after tcmalloc's algorithms; shorter code. + +package runtime + +import ( + "internal/abi" + "runtime/internal/atomic" + "unsafe" +) + +// NOTE(rsc): Everything here could use cas if contention became an issue. +var proflock mutex + +// All memory allocations are local and do not escape outside of the profiler. +// The profiler is forbidden from referring to garbage-collected memory. 
+ +const ( + // profile types + memProfile bucketType = 1 + iota + blockProfile + mutexProfile + + // size of bucket hash table + buckHashSize = 179999 + + // max depth of stack to record in bucket + maxStack = 32 +) + +type bucketType int + +// A bucket holds per-call-stack profiling information. +// The representation is a bit sleazy, inherited from C. +// This struct defines the bucket header. It is followed in +// memory by the stack words and then the actual record +// data, either a memRecord or a blockRecord. +// +// Per-call-stack profiling information. +// Lookup by hashing call stack into a linked-list hash table. +// +// No heap pointers. +// +//go:notinheap +type bucket struct { + next *bucket + allnext *bucket + typ bucketType // memBucket or blockBucket (includes mutexProfile) + hash uintptr + size uintptr + nstk uintptr +} + +// A memRecord is the bucket data for a bucket of type memProfile, +// part of the memory profile. +type memRecord struct { + // The following complex 3-stage scheme of stats accumulation + // is required to obtain a consistent picture of mallocs and frees + // for some point in time. + // The problem is that mallocs come in real time, while frees + // come only after a GC during concurrent sweeping. So if we would + // naively count them, we would get a skew toward mallocs. + // + // Hence, we delay information to get consistent snapshots as + // of mark termination. Allocations count toward the next mark + // termination's snapshot, while sweep frees count toward the + // previous mark termination's snapshot: + // + // MT MT MT MT + // .·| .·| .·| .·| + // .·˙ | .·˙ | .·˙ | .·˙ | + // .·˙ | .·˙ | .·˙ | .·˙ | + // .·˙ |.·˙ |.·˙ |.·˙ | + // + // alloc → ▲ ← free + // ┠┅┅┅┅┅┅┅┅┅┅┅P + // C+2 → C+1 → C + // + // alloc → ▲ ← free + // ┠┅┅┅┅┅┅┅┅┅┅┅P + // C+2 → C+1 → C + // + // Since we can't publish a consistent snapshot until all of + // the sweep frees are accounted for, we wait until the next + // mark termination ("MT" above) to publish the previous mark + // termination's snapshot ("P" above). To do this, allocation + // and free events are accounted to *future* heap profile + // cycles ("C+n" above) and we only publish a cycle once all + // of the events from that cycle must be done. Specifically: + // + // Mallocs are accounted to cycle C+2. + // Explicit frees are accounted to cycle C+2. + // GC frees (done during sweeping) are accounted to cycle C+1. + // + // After mark termination, we increment the global heap + // profile cycle counter and accumulate the stats from cycle C + // into the active profile. + + // active is the currently published profile. A profiling + // cycle can be accumulated into active once its complete. + active memRecordCycle + + // future records the profile events we're counting for cycles + // that have not yet been published. This is ring buffer + // indexed by the global heap profile cycle C and stores + // cycles C, C+1, and C+2. Unlike active, these counts are + // only for a single cycle; they are not cumulative across + // cycles. + // + // We store cycle C here because there's a window between when + // C becomes the active cycle and when we've flushed it to + // active. + future [3]memRecordCycle +} + +// memRecordCycle +type memRecordCycle struct { + allocs, frees uintptr + alloc_bytes, free_bytes uintptr +} + +// add accumulates b into a. It does not zero b. 
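The three-cycle accounting described above can be reduced to a toy model: allocations are recorded in the future slot for cycle C+2, sweep frees in C+1, and a flush publishes slot C into the active total. Everything below (toyRecord, the simplified counters) is illustrative and not runtime code.

package main

import "fmt"

// toyRecord mimics the idea of memRecord: a published total plus a small
// ring of per-cycle deltas indexed by a global cycle counter.
type toyRecord struct {
	active int    // published allocs-minus-frees balance
	future [3]int // deltas for cycles C, C+1, C+2
}

var cycle uint32

func malloc(r *toyRecord)    { r.future[(cycle+2)%3]++ } // counts toward C+2
func sweepFree(r *toyRecord) { r.future[(cycle+1)%3]-- } // counts toward C+1

// flush publishes cycle C into the active total and clears its slot.
func flush(r *toyRecord) {
	r.active += r.future[cycle%3]
	r.future[cycle%3] = 0
}

func main() {
	var r toyRecord
	malloc(&r)            // recorded two cycles out
	cycle++               // mark termination: advance the cycle...
	flush(&r)             // ...then publish the now-complete slot
	fmt.Println(r.active) // 0: the malloc is not published yet
	cycle++
	flush(&r)
	fmt.Println(r.active) // 1: now it is
	sweepFree(&r)         // the object is swept free one cycle later
	cycle++
	flush(&r)
	fmt.Println(r.active) // 0: the free has caught up
}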
+func (a *memRecordCycle) add(b *memRecordCycle) { + a.allocs += b.allocs + a.frees += b.frees + a.alloc_bytes += b.alloc_bytes + a.free_bytes += b.free_bytes +} + +// A blockRecord is the bucket data for a bucket of type blockProfile, +// which is used in blocking and mutex profiles. +type blockRecord struct { + count float64 + cycles int64 +} + +var ( + mbuckets *bucket // memory profile buckets + bbuckets *bucket // blocking profile buckets + xbuckets *bucket // mutex profile buckets + buckhash *[buckHashSize]*bucket + bucketmem uintptr + + mProf struct { + // All fields in mProf are protected by proflock. + + // cycle is the global heap profile cycle. This wraps + // at mProfCycleWrap. + cycle uint32 + // flushed indicates that future[cycle] in all buckets + // has been flushed to the active profile. + flushed bool + } +) + +const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24) + +// newBucket allocates a bucket with the given type and number of stack entries. +func newBucket(typ bucketType, nstk int) *bucket { + size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0)) + switch typ { + default: + throw("invalid profile bucket type") + case memProfile: + size += unsafe.Sizeof(memRecord{}) + case blockProfile, mutexProfile: + size += unsafe.Sizeof(blockRecord{}) + } + + b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys)) + bucketmem += size + b.typ = typ + b.nstk = uintptr(nstk) + return b +} + +// stk returns the slice in b holding the stack. +func (b *bucket) stk() []uintptr { + stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b))) + return stk[:b.nstk:b.nstk] +} + +// mp returns the memRecord associated with the memProfile bucket b. +func (b *bucket) mp() *memRecord { + if b.typ != memProfile { + throw("bad use of bucket.mp") + } + data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) + return (*memRecord)(data) +} + +// bp returns the blockRecord associated with the blockProfile bucket b. +func (b *bucket) bp() *blockRecord { + if b.typ != blockProfile && b.typ != mutexProfile { + throw("bad use of bucket.bp") + } + data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) + return (*blockRecord)(data) +} + +// Return the bucket for stk[0:nstk], allocating new bucket if needed. +func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket { + if buckhash == nil { + buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys)) + if buckhash == nil { + throw("runtime: cannot allocate memory") + } + } + + // Hash stack. + var h uintptr + for _, pc := range stk { + h += pc + h += h << 10 + h ^= h >> 6 + } + // hash in size + h += size + h += h << 10 + h ^= h >> 6 + // finalize + h += h << 3 + h ^= h >> 11 + + i := int(h % buckHashSize) + for b := buckhash[i]; b != nil; b = b.next { + if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) { + return b + } + } + + if !alloc { + return nil + } + + // Create new bucket. 
+ b := newBucket(typ, len(stk)) + copy(b.stk(), stk) + b.hash = h + b.size = size + b.next = buckhash[i] + buckhash[i] = b + if typ == memProfile { + b.allnext = mbuckets + mbuckets = b + } else if typ == mutexProfile { + b.allnext = xbuckets + xbuckets = b + } else { + b.allnext = bbuckets + bbuckets = b + } + return b +} + +func eqslice(x, y []uintptr) bool { + if len(x) != len(y) { + return false + } + for i, xi := range x { + if xi != y[i] { + return false + } + } + return true +} + +// mProf_NextCycle publishes the next heap profile cycle and creates a +// fresh heap profile cycle. This operation is fast and can be done +// during STW. The caller must call mProf_Flush before calling +// mProf_NextCycle again. +// +// This is called by mark termination during STW so allocations and +// frees after the world is started again count towards a new heap +// profiling cycle. +func mProf_NextCycle() { + lock(&proflock) + // We explicitly wrap mProf.cycle rather than depending on + // uint wraparound because the memRecord.future ring does not + // itself wrap at a power of two. + mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap + mProf.flushed = false + unlock(&proflock) +} + +// mProf_Flush flushes the events from the current heap profiling +// cycle into the active profile. After this it is safe to start a new +// heap profiling cycle with mProf_NextCycle. +// +// This is called by GC after mark termination starts the world. In +// contrast with mProf_NextCycle, this is somewhat expensive, but safe +// to do concurrently. +func mProf_Flush() { + lock(&proflock) + if !mProf.flushed { + mProf_FlushLocked() + mProf.flushed = true + } + unlock(&proflock) +} + +func mProf_FlushLocked() { + c := mProf.cycle + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + + // Flush cycle C into the published profile and clear + // it for reuse. + mpc := &mp.future[c%uint32(len(mp.future))] + mp.active.add(mpc) + *mpc = memRecordCycle{} + } +} + +// mProf_PostSweep records that all sweep frees for this GC cycle have +// completed. This has the effect of publishing the heap profile +// snapshot as of the last mark termination without advancing the heap +// profile cycle. +func mProf_PostSweep() { + lock(&proflock) + // Flush cycle C+1 to the active profile so everything as of + // the last mark termination becomes visible. *Don't* advance + // the cycle, since we're still accumulating allocs in cycle + // C+2, which have to become C+1 in the next mark termination + // and so on. + c := mProf.cycle + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + mpc := &mp.future[(c+1)%uint32(len(mp.future))] + mp.active.add(mpc) + *mpc = memRecordCycle{} + } + unlock(&proflock) +} + +// Called by malloc to record a profiled block. +func mProf_Malloc(p unsafe.Pointer, size uintptr) { + var stk [maxStack]uintptr + nstk := callers(4, stk[:]) + lock(&proflock) + b := stkbucket(memProfile, size, stk[:nstk], true) + c := mProf.cycle + mp := b.mp() + mpc := &mp.future[(c+2)%uint32(len(mp.future))] + mpc.allocs++ + mpc.alloc_bytes += size + unlock(&proflock) + + // Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock. + // This reduces potential contention and chances of deadlocks. + // Since the object must be alive during call to mProf_Malloc, + // it's fine to do this non-atomically. + systemstack(func() { + setprofilebucket(p, b) + }) +} + +// Called when freeing a profiled block. 
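stkbucket above keys its hash table on a simple additive shift-and-xor mix of the stack PCs plus the record size. The same mixing, lifted into a standalone sketch purely for illustration (hashStack is not a runtime identifier):

package main

import "fmt"

// hashStack mirrors the mixing in stkbucket: fold in each PC, then the
// size, then finalize with a couple of avalanche steps.
func hashStack(stk []uintptr, size uintptr) uintptr {
	var h uintptr
	for _, pc := range stk {
		h += pc
		h += h << 10
		h ^= h >> 6
	}
	h += size
	h += h << 10
	h ^= h >> 6
	h += h << 3
	h ^= h >> 11
	return h
}

func main() {
	stk := []uintptr{0x401000, 0x402000}
	// Mixing in the size means the same stack with a different allocation
	// size lands in a different bucket chain (with overwhelming probability).
	fmt.Printf("%#x\n", hashStack(stk, 64))
	fmt.Printf("%#x\n", hashStack(stk, 128))
}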
+func mProf_Free(b *bucket, size uintptr) { + lock(&proflock) + c := mProf.cycle + mp := b.mp() + mpc := &mp.future[(c+1)%uint32(len(mp.future))] + mpc.frees++ + mpc.free_bytes += size + unlock(&proflock) +} + +var blockprofilerate uint64 // in CPU ticks + +// SetBlockProfileRate controls the fraction of goroutine blocking events +// that are reported in the blocking profile. The profiler aims to sample +// an average of one blocking event per rate nanoseconds spent blocked. +// +// To include every blocking event in the profile, pass rate = 1. +// To turn off profiling entirely, pass rate <= 0. +func SetBlockProfileRate(rate int) { + var r int64 + if rate <= 0 { + r = 0 // disable profiling + } else if rate == 1 { + r = 1 // profile everything + } else { + // convert ns to cycles, use float64 to prevent overflow during multiplication + r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000)) + if r == 0 { + r = 1 + } + } + + atomic.Store64(&blockprofilerate, uint64(r)) +} + +func blockevent(cycles int64, skip int) { + if cycles <= 0 { + cycles = 1 + } + + rate := int64(atomic.Load64(&blockprofilerate)) + if blocksampled(cycles, rate) { + saveblockevent(cycles, rate, skip+1, blockProfile) + } +} + +// blocksampled returns true for all events where cycles >= rate. Shorter +// events have a cycles/rate random chance of returning true. +func blocksampled(cycles, rate int64) bool { + if rate <= 0 || (rate > cycles && int64(fastrand())%rate > cycles) { + return false + } + return true +} + +func saveblockevent(cycles, rate int64, skip int, which bucketType) { + gp := getg() + var nstk int + var stk [maxStack]uintptr + if gp.m.curg == nil || gp.m.curg == gp { + nstk = callers(skip, stk[:]) + } else { + nstk = gcallers(gp.m.curg, skip, stk[:]) + } + lock(&proflock) + b := stkbucket(which, 0, stk[:nstk], true) + + if which == blockProfile && cycles < rate { + // Remove sampling bias, see discussion on http://golang.org/cl/299991. + b.bp().count += float64(rate) / float64(cycles) + b.bp().cycles += rate + } else { + b.bp().count++ + b.bp().cycles += cycles + } + unlock(&proflock) +} + +var mutexprofilerate uint64 // fraction sampled + +// SetMutexProfileFraction controls the fraction of mutex contention events +// that are reported in the mutex profile. On average 1/rate events are +// reported. The previous rate is returned. +// +// To turn off profiling entirely, pass rate 0. +// To just read the current rate, pass rate < 0. +// (For n>1 the details of sampling may change.) +func SetMutexProfileFraction(rate int) int { + if rate < 0 { + return int(mutexprofilerate) + } + old := mutexprofilerate + atomic.Store64(&mutexprofilerate, uint64(rate)) + return int(old) +} + +//go:linkname mutexevent sync.event +func mutexevent(cycles int64, skip int) { + if cycles < 0 { + cycles = 0 + } + rate := int64(atomic.Load64(&mutexprofilerate)) + // TODO(pjw): measure impact of always calling fastrand vs using something + // like malloc.go:nextSample() + if rate > 0 && int64(fastrand())%rate == 0 { + saveblockevent(cycles, rate, skip+1, mutexProfile) + } +} + +// Go interface to profile data. + +// A StackRecord describes a single execution stack. +type StackRecord struct { + Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry +} + +// Stack returns the stack trace associated with the record, +// a prefix of r.Stack0. 
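Stack0 is a fixed-size array terminated by the first zero entry, which is why Stack below trims it. A minimal sketch of the two-call pattern shared by the profile accessors in this file, shown with the exported ThreadCreateProfile (defined further down):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// First call with nil asks only for the number of records.
	n, _ := runtime.ThreadCreateProfile(nil)

	// Second call with enough room actually copies the records.
	p := make([]runtime.StackRecord, n)
	n, ok := runtime.ThreadCreateProfile(p)
	if !ok {
		return // raced with thread creation; a real caller would retry with a larger slice
	}
	for _, r := range p[:n] {
		// Stack trims Stack0 at the first zero entry.
		fmt.Println(len(r.Stack()), "frames")
	}
}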
+func (r *StackRecord) Stack() []uintptr { + for i, v := range r.Stack0 { + if v == 0 { + return r.Stack0[0:i] + } + } + return r.Stack0[0:] +} + +// MemProfileRate controls the fraction of memory allocations +// that are recorded and reported in the memory profile. +// The profiler aims to sample an average of +// one allocation per MemProfileRate bytes allocated. +// +// To include every allocated block in the profile, set MemProfileRate to 1. +// To turn off profiling entirely, set MemProfileRate to 0. +// +// The tools that process the memory profiles assume that the +// profile rate is constant across the lifetime of the program +// and equal to the current value. Programs that change the +// memory profiling rate should do so just once, as early as +// possible in the execution of the program (for example, +// at the beginning of main). +var MemProfileRate int = defaultMemProfileRate(512 * 1024) + +// defaultMemProfileRate returns 0 if disableMemoryProfiling is set. +// It exists primarily for the godoc rendering of MemProfileRate +// above. +func defaultMemProfileRate(v int) int { + if disableMemoryProfiling { + return 0 + } + return v +} + +// disableMemoryProfiling is set by the linker if runtime.MemProfile +// is not used and the link type guarantees nobody else could use it +// elsewhere. +var disableMemoryProfiling bool + +// A MemProfileRecord describes the live objects allocated +// by a particular call sequence (stack trace). +type MemProfileRecord struct { + AllocBytes, FreeBytes int64 // number of bytes allocated, freed + AllocObjects, FreeObjects int64 // number of objects allocated, freed + Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry +} + +// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes). +func (r *MemProfileRecord) InUseBytes() int64 { return r.AllocBytes - r.FreeBytes } + +// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects). +func (r *MemProfileRecord) InUseObjects() int64 { + return r.AllocObjects - r.FreeObjects +} + +// Stack returns the stack trace associated with the record, +// a prefix of r.Stack0. +func (r *MemProfileRecord) Stack() []uintptr { + for i, v := range r.Stack0 { + if v == 0 { + return r.Stack0[0:i] + } + } + return r.Stack0[0:] +} + +// MemProfile returns a profile of memory allocated and freed per allocation +// site. +// +// MemProfile returns n, the number of records in the current memory profile. +// If len(p) >= n, MemProfile copies the profile into p and returns n, true. +// If len(p) < n, MemProfile does not change p and returns n, false. +// +// If inuseZero is true, the profile includes allocation records +// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes. +// These are sites where memory was allocated, but it has all +// been released back to the runtime. +// +// The returned profile may be up to two garbage collection cycles old. +// This is to avoid skewing the profile toward allocations; because +// allocations happen in real time but frees are delayed until the garbage +// collector performs sweeping, the profile only accounts for allocations +// that have had a chance to be freed by the garbage collector. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.memprofile flag instead +// of calling MemProfile directly. 
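A minimal usage sketch of the two-call pattern the doc comment above describes; most programs should use runtime/pprof instead, and the headroom of 50 records chosen here is arbitrary.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Sample every allocation; per the doc comment above, set this once,
	// as early in the program as possible.
	runtime.MemProfileRate = 1

	var p []runtime.MemProfileRecord
	n, ok := runtime.MemProfile(nil, true)
	for !ok {
		// Allocate a little headroom in case more record sites appear
		// between the two calls, then retry.
		p = make([]runtime.MemProfileRecord, n+50)
		n, ok = runtime.MemProfile(p, true)
	}
	p = p[:n]

	var inUse int64
	for i := range p {
		inUse += p[i].InUseBytes()
	}
	fmt.Println(len(p), "sites,", inUse, "bytes in use")
}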
+func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) { + lock(&proflock) + // If we're between mProf_NextCycle and mProf_Flush, take care + // of flushing to the active profile so we only have to look + // at the active profile below. + mProf_FlushLocked() + clear := true + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { + n++ + } + if mp.active.allocs != 0 || mp.active.frees != 0 { + clear = false + } + } + if clear { + // Absolutely no data, suggesting that a garbage collection + // has not yet happened. In order to allow profiling when + // garbage collection is disabled from the beginning of execution, + // accumulate all of the cycles, and recount buckets. + n = 0 + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + for c := range mp.future { + mp.active.add(&mp.future[c]) + mp.future[c] = memRecordCycle{} + } + if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { + n++ + } + } + } + if n <= len(p) { + ok = true + idx := 0 + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { + record(&p[idx], b) + idx++ + } + } + } + unlock(&proflock) + return +} + +// Write b's data to r. +func record(r *MemProfileRecord, b *bucket) { + mp := b.mp() + r.AllocBytes = int64(mp.active.alloc_bytes) + r.FreeBytes = int64(mp.active.free_bytes) + r.AllocObjects = int64(mp.active.allocs) + r.FreeObjects = int64(mp.active.frees) + if raceenabled { + racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(), abi.FuncPCABIInternal(MemProfile)) + } + if msanenabled { + msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) + } + if asanenabled { + asanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) + } + copy(r.Stack0[:], b.stk()) + for i := int(b.nstk); i < len(r.Stack0); i++ { + r.Stack0[i] = 0 + } +} + +func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) { + lock(&proflock) + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees) + } + unlock(&proflock) +} + +// BlockProfileRecord describes blocking events originated +// at a particular call sequence (stack trace). +type BlockProfileRecord struct { + Count int64 + Cycles int64 + StackRecord +} + +// BlockProfile returns n, the number of records in the current blocking profile. +// If len(p) >= n, BlockProfile copies the profile into p and returns n, true. +// If len(p) < n, BlockProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.blockprofile flag instead +// of calling BlockProfile directly. +func BlockProfile(p []BlockProfileRecord) (n int, ok bool) { + lock(&proflock) + for b := bbuckets; b != nil; b = b.allnext { + n++ + } + if n <= len(p) { + ok = true + for b := bbuckets; b != nil; b = b.allnext { + bp := b.bp() + r := &p[0] + r.Count = int64(bp.count) + // Prevent callers from having to worry about division by zero errors. + // See discussion on http://golang.org/cl/299991. 
+ if r.Count == 0 { + r.Count = 1 + } + r.Cycles = bp.cycles + if raceenabled { + racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(), abi.FuncPCABIInternal(BlockProfile)) + } + if msanenabled { + msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) + } + if asanenabled { + asanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) + } + i := copy(r.Stack0[:], b.stk()) + for ; i < len(r.Stack0); i++ { + r.Stack0[i] = 0 + } + p = p[1:] + } + } + unlock(&proflock) + return +} + +// MutexProfile returns n, the number of records in the current mutex profile. +// If len(p) >= n, MutexProfile copies the profile into p and returns n, true. +// Otherwise, MutexProfile does not change p, and returns n, false. +// +// Most clients should use the runtime/pprof package +// instead of calling MutexProfile directly. +func MutexProfile(p []BlockProfileRecord) (n int, ok bool) { + lock(&proflock) + for b := xbuckets; b != nil; b = b.allnext { + n++ + } + if n <= len(p) { + ok = true + for b := xbuckets; b != nil; b = b.allnext { + bp := b.bp() + r := &p[0] + r.Count = int64(bp.count) + r.Cycles = bp.cycles + i := copy(r.Stack0[:], b.stk()) + for ; i < len(r.Stack0); i++ { + r.Stack0[i] = 0 + } + p = p[1:] + } + } + unlock(&proflock) + return +} + +// ThreadCreateProfile returns n, the number of records in the thread creation profile. +// If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true. +// If len(p) < n, ThreadCreateProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package instead +// of calling ThreadCreateProfile directly. +func ThreadCreateProfile(p []StackRecord) (n int, ok bool) { + first := (*m)(atomic.Loadp(unsafe.Pointer(&allm))) + for mp := first; mp != nil; mp = mp.alllink { + n++ + } + if n <= len(p) { + ok = true + i := 0 + for mp := first; mp != nil; mp = mp.alllink { + p[i].Stack0 = mp.createstack + i++ + } + } + return +} + +//go:linkname runtime_goroutineProfileWithLabels runtime/pprof.runtime_goroutineProfileWithLabels +func runtime_goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) { + return goroutineProfileWithLabels(p, labels) +} + +// labels may be nil. If labels is non-nil, it must have the same length as p. +func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) { + if labels != nil && len(labels) != len(p) { + labels = nil + } + gp := getg() + + isOK := func(gp1 *g) bool { + // Checking isSystemGoroutine here makes GoroutineProfile + // consistent with both NumGoroutine and Stack. + return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1, false) + } + + stopTheWorld("profile") + + // World is stopped, no locking required. + n = 1 + forEachGRace(func(gp1 *g) { + if isOK(gp1) { + n++ + } + }) + + if n <= len(p) { + ok = true + r, lbl := p, labels + + // Save current goroutine. + sp := getcallersp() + pc := getcallerpc() + systemstack(func() { + saveg(pc, sp, gp, &r[0]) + }) + r = r[1:] + + // If we have a place to put our goroutine labelmap, insert it there. + if labels != nil { + lbl[0] = gp.labels + lbl = lbl[1:] + } + + // Save other goroutines. + forEachGRace(func(gp1 *g) { + if !isOK(gp1) { + return + } + + if len(r) == 0 { + // Should be impossible, but better to return a + // truncated profile than to crash the entire process. + return + } + // saveg calls gentraceback, which may call cgo traceback functions. 
+ // The world is stopped, so it cannot use cgocall (which will be + // blocked at exitsyscall). Do it on the system stack so it won't + // call into the schedular (see traceback.go:cgoContextPCs). + systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &r[0]) }) + if labels != nil { + lbl[0] = gp1.labels + lbl = lbl[1:] + } + r = r[1:] + }) + } + + startTheWorld() + return n, ok +} + +// GoroutineProfile returns n, the number of records in the active goroutine stack profile. +// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true. +// If len(p) < n, GoroutineProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package instead +// of calling GoroutineProfile directly. +func GoroutineProfile(p []StackRecord) (n int, ok bool) { + + return goroutineProfileWithLabels(p, nil) +} + +func saveg(pc, sp uintptr, gp *g, r *StackRecord) { + n := gentraceback(pc, sp, 0, gp, 0, &r.Stack0[0], len(r.Stack0), nil, nil, 0) + if n < len(r.Stack0) { + r.Stack0[n] = 0 + } +} + +// Stack formats a stack trace of the calling goroutine into buf +// and returns the number of bytes written to buf. +// If all is true, Stack formats stack traces of all other goroutines +// into buf after the trace for the current goroutine. +func Stack(buf []byte, all bool) int { + if all { + stopTheWorld("stack trace") + } + + n := 0 + if len(buf) > 0 { + gp := getg() + sp := getcallersp() + pc := getcallerpc() + systemstack(func() { + g0 := getg() + // Force traceback=1 to override GOTRACEBACK setting, + // so that Stack's results are consistent. + // GOTRACEBACK is only about crash dumps. + g0.m.traceback = 1 + g0.writebuf = buf[0:0:len(buf)] + goroutineheader(gp) + traceback(pc, sp, 0, gp) + if all { + tracebackothers(gp) + } + g0.m.traceback = 0 + n = len(g0.writebuf) + g0.writebuf = nil + }) + } + + if all { + startTheWorld() + } + return n +} + +// Tracing of alloc/free/gc. + +var tracelock mutex + +func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + if typ == nil { + print("tracealloc(", p, ", ", hex(size), ")\n") + } else { + print("tracealloc(", p, ", ", hex(size), ", ", typ.string(), ")\n") + } + if gp.m.curg == nil || gp == gp.m.curg { + goroutineheader(gp) + pc := getcallerpc() + sp := getcallersp() + systemstack(func() { + traceback(pc, sp, 0, gp) + }) + } else { + goroutineheader(gp.m.curg) + traceback(^uintptr(0), ^uintptr(0), 0, gp.m.curg) + } + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} + +func tracefree(p unsafe.Pointer, size uintptr) { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + print("tracefree(", p, ", ", hex(size), ")\n") + goroutineheader(gp) + pc := getcallerpc() + sp := getcallersp() + systemstack(func() { + traceback(pc, sp, 0, gp) + }) + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} + +func tracegc() { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + print("tracegc()\n") + // running on m->g0 stack; show all non-g0 goroutines + tracebackothers(gp) + print("end tracegc\n") + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} diff --git a/contrib/go/_std_1.18/src/runtime/mranges.go b/contrib/go/_std_1.18/src/runtime/mranges.go new file mode 100644 index 0000000000..e0be1e134e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mranges.go @@ -0,0 +1,372 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
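// Illustrative sketch (not part of the vendored runtime source): the client
// side of the profiling code above (mprof.go). Stack dumps all goroutines into
// a caller-supplied buffer, and the block/mutex profiles are usually written
// out through runtime/pprof rather than by calling BlockProfile/MutexProfile.
package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

func main() {
	// All-goroutine stack dump; a larger buffer may be needed for big programs.
	buf := make([]byte, 1<<20)
	n := runtime.Stack(buf, true)
	os.Stderr.Write(buf[:n])

	// Text-form block and mutex profiles (debug=1 includes symbolized stacks).
	pprof.Lookup("block").WriteTo(os.Stderr, 1)
	pprof.Lookup("mutex").WriteTo(os.Stderr, 1)
}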
+ +// Address range data structure. +// +// This file contains an implementation of a data structure which +// manages ordered address ranges. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// addrRange represents a region of address space. +// +// An addrRange must never span a gap in the address space. +type addrRange struct { + // base and limit together represent the region of address space + // [base, limit). That is, base is inclusive, limit is exclusive. + // These are address over an offset view of the address space on + // platforms with a segmented address space, that is, on platforms + // where arenaBaseOffset != 0. + base, limit offAddr +} + +// makeAddrRange creates a new address range from two virtual addresses. +// +// Throws if the base and limit are not in the same memory segment. +func makeAddrRange(base, limit uintptr) addrRange { + r := addrRange{offAddr{base}, offAddr{limit}} + if (base-arenaBaseOffset >= base) != (limit-arenaBaseOffset >= limit) { + throw("addr range base and limit are not in the same memory segment") + } + return r +} + +// size returns the size of the range represented in bytes. +func (a addrRange) size() uintptr { + if !a.base.lessThan(a.limit) { + return 0 + } + // Subtraction is safe because limit and base must be in the same + // segment of the address space. + return a.limit.diff(a.base) +} + +// contains returns whether or not the range contains a given address. +func (a addrRange) contains(addr uintptr) bool { + return a.base.lessEqual(offAddr{addr}) && (offAddr{addr}).lessThan(a.limit) +} + +// subtract takes the addrRange toPrune and cuts out any overlap with +// from, then returns the new range. subtract assumes that a and b +// either don't overlap at all, only overlap on one side, or are equal. +// If b is strictly contained in a, thus forcing a split, it will throw. +func (a addrRange) subtract(b addrRange) addrRange { + if b.base.lessEqual(a.base) && a.limit.lessEqual(b.limit) { + return addrRange{} + } else if a.base.lessThan(b.base) && b.limit.lessThan(a.limit) { + throw("bad prune") + } else if b.limit.lessThan(a.limit) && a.base.lessThan(b.limit) { + a.base = b.limit + } else if a.base.lessThan(b.base) && b.base.lessThan(a.limit) { + a.limit = b.base + } + return a +} + +// removeGreaterEqual removes all addresses in a greater than or equal +// to addr and returns the new range. +func (a addrRange) removeGreaterEqual(addr uintptr) addrRange { + if (offAddr{addr}).lessEqual(a.base) { + return addrRange{} + } + if a.limit.lessEqual(offAddr{addr}) { + return a + } + return makeAddrRange(a.base.addr(), addr) +} + +var ( + // minOffAddr is the minimum address in the offset space, and + // it corresponds to the virtual address arenaBaseOffset. + minOffAddr = offAddr{arenaBaseOffset} + + // maxOffAddr is the maximum address in the offset address + // space. It corresponds to the highest virtual address representable + // by the page alloc chunk and heap arena maps. + maxOffAddr = offAddr{(((1 << heapAddrBits) - 1) + arenaBaseOffset) & uintptrMask} +) + +// offAddr represents an address in a contiguous view +// of the address space on systems where the address space is +// segmented. On other systems, it's just a normal address. +type offAddr struct { + // a is just the virtual address, but should never be used + // directly. Call addr() to get this value instead. + a uintptr +} + +// add adds a uintptr offset to the offAddr. 
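// Simplified sketch (not part of the vendored runtime source): the half-open
// [base, limit) interval semantics used by addrRange above, with plain
// uintptrs and without the arenaBaseOffset/offAddr translation. The type name
// is hypothetical.
package main

type simpleRange struct{ base, limit uintptr }

func (r simpleRange) size() uintptr {
	if r.limit <= r.base {
		return 0
	}
	return r.limit - r.base
}

func (r simpleRange) contains(addr uintptr) bool {
	// base is inclusive, limit is exclusive.
	return r.base <= addr && addr < r.limit
}

func main() {
	r := simpleRange{base: 0x1000, limit: 0x3000}
	_ = r.contains(0x2fff) // true
	_ = r.contains(0x3000) // false: the limit itself lies outside the range
	_ = r.size()           // 0x2000 bytes
}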
+func (l offAddr) add(bytes uintptr) offAddr { + return offAddr{a: l.a + bytes} +} + +// sub subtracts a uintptr offset from the offAddr. +func (l offAddr) sub(bytes uintptr) offAddr { + return offAddr{a: l.a - bytes} +} + +// diff returns the amount of bytes in between the +// two offAddrs. +func (l1 offAddr) diff(l2 offAddr) uintptr { + return l1.a - l2.a +} + +// lessThan returns true if l1 is less than l2 in the offset +// address space. +func (l1 offAddr) lessThan(l2 offAddr) bool { + return (l1.a - arenaBaseOffset) < (l2.a - arenaBaseOffset) +} + +// lessEqual returns true if l1 is less than or equal to l2 in +// the offset address space. +func (l1 offAddr) lessEqual(l2 offAddr) bool { + return (l1.a - arenaBaseOffset) <= (l2.a - arenaBaseOffset) +} + +// equal returns true if the two offAddr values are equal. +func (l1 offAddr) equal(l2 offAddr) bool { + // No need to compare in the offset space, it + // means the same thing. + return l1 == l2 +} + +// addr returns the virtual address for this offset address. +func (l offAddr) addr() uintptr { + return l.a +} + +// addrRanges is a data structure holding a collection of ranges of +// address space. +// +// The ranges are coalesced eagerly to reduce the +// number ranges it holds. +// +// The slice backing store for this field is persistentalloc'd +// and thus there is no way to free it. +// +// addrRanges is not thread-safe. +type addrRanges struct { + // ranges is a slice of ranges sorted by base. + ranges []addrRange + + // totalBytes is the total amount of address space in bytes counted by + // this addrRanges. + totalBytes uintptr + + // sysStat is the stat to track allocations by this type + sysStat *sysMemStat +} + +func (a *addrRanges) init(sysStat *sysMemStat) { + ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges)) + ranges.len = 0 + ranges.cap = 16 + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), goarch.PtrSize, sysStat)) + a.sysStat = sysStat + a.totalBytes = 0 +} + +// findSucc returns the first index in a such that addr is +// less than the base of the addrRange at that index. +func (a *addrRanges) findSucc(addr uintptr) int { + base := offAddr{addr} + + // Narrow down the search space via a binary search + // for large addrRanges until we have at most iterMax + // candidates left. + const iterMax = 8 + bot, top := 0, len(a.ranges) + for top-bot > iterMax { + i := ((top - bot) / 2) + bot + if a.ranges[i].contains(base.addr()) { + // a.ranges[i] contains base, so + // its successor is the next index. + return i + 1 + } + if base.lessThan(a.ranges[i].base) { + // In this case i might actually be + // the successor, but we can't be sure + // until we check the ones before it. + top = i + } else { + // In this case we know base is + // greater than or equal to a.ranges[i].limit-1, + // so i is definitely not the successor. + // We already checked i, so pick the next + // one. + bot = i + 1 + } + } + // There are top-bot candidates left, so + // iterate over them and find the first that + // base is strictly less than. + for i := bot; i < top; i++ { + if base.lessThan(a.ranges[i].base) { + return i + } + } + return top +} + +// findAddrGreaterEqual returns the smallest address represented by a +// that is >= addr. Thus, if the address is represented by a, +// then it returns addr. The second return value indicates whether +// such an address exists for addr in a. That is, if addr is larger than +// any address known to a, the second return value will be false. 
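// Simplified sketch (not part of the vendored runtime source): findSucc's
// contract above ("first index whose base is greater than addr") expressed
// with the standard library's binary search over an ordinary sorted slice.
// simpleRange is the hypothetical type from the previous sketch.
package main

import (
	"fmt"
	"sort"
)

type simpleRange struct{ base, limit uintptr }

// successor returns the first index i such that addr < ranges[i].base, or
// len(ranges) if there is no such index. The ranges must be sorted by base
// and non-overlapping, as in addrRanges.
func successor(ranges []simpleRange, addr uintptr) int {
	return sort.Search(len(ranges), func(i int) bool {
		return addr < ranges[i].base
	})
}

func main() {
	rs := []simpleRange{{0x1000, 0x2000}, {0x4000, 0x5000}}
	fmt.Println(successor(rs, 0x1800)) // 1: 0x1800 lies inside rs[0]
	fmt.Println(successor(rs, 0x3000)) // 1: 0x3000 is in the gap before rs[1]
	fmt.Println(successor(rs, 0x6000)) // 2: past every range
}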
+func (a *addrRanges) findAddrGreaterEqual(addr uintptr) (uintptr, bool) { + i := a.findSucc(addr) + if i == 0 { + return a.ranges[0].base.addr(), true + } + if a.ranges[i-1].contains(addr) { + return addr, true + } + if i < len(a.ranges) { + return a.ranges[i].base.addr(), true + } + return 0, false +} + +// contains returns true if a covers the address addr. +func (a *addrRanges) contains(addr uintptr) bool { + i := a.findSucc(addr) + if i == 0 { + return false + } + return a.ranges[i-1].contains(addr) +} + +// add inserts a new address range to a. +// +// r must not overlap with any address range in a and r.size() must be > 0. +func (a *addrRanges) add(r addrRange) { + // The copies in this function are potentially expensive, but this data + // structure is meant to represent the Go heap. At worst, copying this + // would take ~160µs assuming a conservative copying rate of 25 GiB/s (the + // copy will almost never trigger a page fault) for a 1 TiB heap with 4 MiB + // arenas which is completely discontiguous. ~160µs is still a lot, but in + // practice most platforms have 64 MiB arenas (which cuts this by a factor + // of 16) and Go heaps are usually mostly contiguous, so the chance that + // an addrRanges even grows to that size is extremely low. + + // An empty range has no effect on the set of addresses represented + // by a, but passing a zero-sized range is almost always a bug. + if r.size() == 0 { + print("runtime: range = {", hex(r.base.addr()), ", ", hex(r.limit.addr()), "}\n") + throw("attempted to add zero-sized address range") + } + // Because we assume r is not currently represented in a, + // findSucc gives us our insertion index. + i := a.findSucc(r.base.addr()) + coalescesDown := i > 0 && a.ranges[i-1].limit.equal(r.base) + coalescesUp := i < len(a.ranges) && r.limit.equal(a.ranges[i].base) + if coalescesUp && coalescesDown { + // We have neighbors and they both border us. + // Merge a.ranges[i-1], r, and a.ranges[i] together into a.ranges[i-1]. + a.ranges[i-1].limit = a.ranges[i].limit + + // Delete a.ranges[i]. + copy(a.ranges[i:], a.ranges[i+1:]) + a.ranges = a.ranges[:len(a.ranges)-1] + } else if coalescesDown { + // We have a neighbor at a lower address only and it borders us. + // Merge the new space into a.ranges[i-1]. + a.ranges[i-1].limit = r.limit + } else if coalescesUp { + // We have a neighbor at a higher address only and it borders us. + // Merge the new space into a.ranges[i]. + a.ranges[i].base = r.base + } else { + // We may or may not have neighbors which don't border us. + // Add the new range. + if len(a.ranges)+1 > cap(a.ranges) { + // Grow the array. Note that this leaks the old array, but since + // we're doubling we have at most 2x waste. For a 1 TiB heap and + // 4 MiB arenas which are all discontiguous (both very conservative + // assumptions), this would waste at most 4 MiB of memory. + oldRanges := a.ranges + ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges)) + ranges.len = len(oldRanges) + 1 + ranges.cap = cap(oldRanges) * 2 + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), goarch.PtrSize, a.sysStat)) + + // Copy in the old array, but make space for the new range. 
+ copy(a.ranges[:i], oldRanges[:i]) + copy(a.ranges[i+1:], oldRanges[i:]) + } else { + a.ranges = a.ranges[:len(a.ranges)+1] + copy(a.ranges[i+1:], a.ranges[i:]) + } + a.ranges[i] = r + } + a.totalBytes += r.size() +} + +// removeLast removes and returns the highest-addressed contiguous range +// of a, or the last nBytes of that range, whichever is smaller. If a is +// empty, it returns an empty range. +func (a *addrRanges) removeLast(nBytes uintptr) addrRange { + if len(a.ranges) == 0 { + return addrRange{} + } + r := a.ranges[len(a.ranges)-1] + size := r.size() + if size > nBytes { + newEnd := r.limit.sub(nBytes) + a.ranges[len(a.ranges)-1].limit = newEnd + a.totalBytes -= nBytes + return addrRange{newEnd, r.limit} + } + a.ranges = a.ranges[:len(a.ranges)-1] + a.totalBytes -= size + return r +} + +// removeGreaterEqual removes the ranges of a which are above addr, and additionally +// splits any range containing addr. +func (a *addrRanges) removeGreaterEqual(addr uintptr) { + pivot := a.findSucc(addr) + if pivot == 0 { + // addr is before all ranges in a. + a.totalBytes = 0 + a.ranges = a.ranges[:0] + return + } + removed := uintptr(0) + for _, r := range a.ranges[pivot:] { + removed += r.size() + } + if r := a.ranges[pivot-1]; r.contains(addr) { + removed += r.size() + r = r.removeGreaterEqual(addr) + if r.size() == 0 { + pivot-- + } else { + removed -= r.size() + a.ranges[pivot-1] = r + } + } + a.ranges = a.ranges[:pivot] + a.totalBytes -= removed +} + +// cloneInto makes a deep clone of a's state into b, re-using +// b's ranges if able. +func (a *addrRanges) cloneInto(b *addrRanges) { + if len(a.ranges) > cap(b.ranges) { + // Grow the array. + ranges := (*notInHeapSlice)(unsafe.Pointer(&b.ranges)) + ranges.len = 0 + ranges.cap = cap(a.ranges) + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), goarch.PtrSize, b.sysStat)) + } + b.ranges = b.ranges[:len(a.ranges)] + b.totalBytes = a.totalBytes + copy(b.ranges, a.ranges) +} diff --git a/contrib/go/_std_1.18/src/runtime/msan0.go b/contrib/go/_std_1.18/src/runtime/msan0.go new file mode 100644 index 0000000000..2f5fd2d982 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/msan0.go @@ -0,0 +1,23 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !msan + +// Dummy MSan support API, used when not built with -msan. + +package runtime + +import ( + "unsafe" +) + +const msanenabled = false + +// Because msanenabled is false, none of these functions should be called. + +func msanread(addr unsafe.Pointer, sz uintptr) { throw("msan") } +func msanwrite(addr unsafe.Pointer, sz uintptr) { throw("msan") } +func msanmalloc(addr unsafe.Pointer, sz uintptr) { throw("msan") } +func msanfree(addr unsafe.Pointer, sz uintptr) { throw("msan") } +func msanmove(dst, src unsafe.Pointer, sz uintptr) { throw("msan") } diff --git a/contrib/go/_std_1.18/src/runtime/msize.go b/contrib/go/_std_1.18/src/runtime/msize.go new file mode 100644 index 0000000000..c56aa5a7b2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/msize.go @@ -0,0 +1,25 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Malloc small size classes. +// +// See malloc.go for overview. +// See also mksizeclasses.go for how we decide what size classes to use. 
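// Simplified sketch (not part of the vendored runtime source): the
// large-object path of the roundupsize function below rounds a request up to
// a whole number of pages, guarding against overflow. The page size constant
// here is an assumption for illustration; small requests instead go through
// the size-class tables, which are not reproduced.
package main

import "fmt"

const pageSize = 8 << 10 // assumed 8 KiB runtime page

func roundUpLarge(size uintptr) uintptr {
	if size+pageSize < size {
		return size // rounding would overflow; leave the size unchanged
	}
	return (size + pageSize - 1) &^ (pageSize - 1) // align up to a page boundary
}

func main() {
	fmt.Println(roundUpLarge(96 << 10))   // 98304: already a multiple of the page size
	fmt.Println(roundUpLarge(96<<10 + 1)) // 106496: rounded up to the next page boundary
}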
+ +package runtime + +// Returns size of the memory block that mallocgc will allocate if you ask for the size. +func roundupsize(size uintptr) uintptr { + if size < _MaxSmallSize { + if size <= smallSizeMax-8 { + return uintptr(class_to_size[size_to_class8[divRoundUp(size, smallSizeDiv)]]) + } else { + return uintptr(class_to_size[size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]]) + } + } + if size+_PageSize < size { + return size + } + return alignUp(size, _PageSize) +} diff --git a/contrib/go/_std_1.18/src/runtime/mspanset.go b/contrib/go/_std_1.18/src/runtime/mspanset.go new file mode 100644 index 0000000000..29f14910cc --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mspanset.go @@ -0,0 +1,354 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// A spanSet is a set of *mspans. +// +// spanSet is safe for concurrent push and pop operations. +type spanSet struct { + // A spanSet is a two-level data structure consisting of a + // growable spine that points to fixed-sized blocks. The spine + // can be accessed without locks, but adding a block or + // growing it requires taking the spine lock. + // + // Because each mspan covers at least 8K of heap and takes at + // most 8 bytes in the spanSet, the growth of the spine is + // quite limited. + // + // The spine and all blocks are allocated off-heap, which + // allows this to be used in the memory manager and avoids the + // need for write barriers on all of these. spanSetBlocks are + // managed in a pool, though never freed back to the operating + // system. We never release spine memory because there could be + // concurrent lock-free access and we're likely to reuse it + // anyway. (In principle, we could do this during STW.) + + spineLock mutex + spine unsafe.Pointer // *[N]*spanSetBlock, accessed atomically + spineLen uintptr // Spine array length, accessed atomically + spineCap uintptr // Spine array cap, accessed under lock + + // index is the head and tail of the spanSet in a single field. + // The head and the tail both represent an index into the logical + // concatenation of all blocks, with the head always behind or + // equal to the tail (indicating an empty set). This field is + // always accessed atomically. + // + // The head and the tail are only 32 bits wide, which means we + // can only support up to 2^32 pushes before a reset. If every + // span in the heap were stored in this set, and each span were + // the minimum size (1 runtime page, 8 KiB), then roughly the + // smallest heap which would be unrepresentable is 32 TiB in size. + index headTailIndex +} + +const ( + spanSetBlockEntries = 512 // 4KB on 64-bit + spanSetInitSpineCap = 256 // Enough for 1GB heap on 64-bit +) + +type spanSetBlock struct { + // Free spanSetBlocks are managed via a lock-free stack. + lfnode + + // popped is the number of pop operations that have occurred on + // this block. This number is used to help determine when a block + // may be safely recycled. + popped uint32 + + // spans is the set of spans in this block. + spans [spanSetBlockEntries]*mspan +} + +// push adds span s to buffer b. push is safe to call concurrently +// with other push and pop operations. +func (b *spanSet) push(s *mspan) { + // Obtain our slot. 
+ cursor := uintptr(b.index.incTail().tail() - 1) + top, bottom := cursor/spanSetBlockEntries, cursor%spanSetBlockEntries + + // Do we need to add a block? + spineLen := atomic.Loaduintptr(&b.spineLen) + var block *spanSetBlock +retry: + if top < spineLen { + spine := atomic.Loadp(unsafe.Pointer(&b.spine)) + blockp := add(spine, goarch.PtrSize*top) + block = (*spanSetBlock)(atomic.Loadp(blockp)) + } else { + // Add a new block to the spine, potentially growing + // the spine. + lock(&b.spineLock) + // spineLen cannot change until we release the lock, + // but may have changed while we were waiting. + spineLen = atomic.Loaduintptr(&b.spineLen) + if top < spineLen { + unlock(&b.spineLock) + goto retry + } + + if spineLen == b.spineCap { + // Grow the spine. + newCap := b.spineCap * 2 + if newCap == 0 { + newCap = spanSetInitSpineCap + } + newSpine := persistentalloc(newCap*goarch.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys) + if b.spineCap != 0 { + // Blocks are allocated off-heap, so + // no write barriers. + memmove(newSpine, b.spine, b.spineCap*goarch.PtrSize) + } + // Spine is allocated off-heap, so no write barrier. + atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine) + b.spineCap = newCap + // We can't immediately free the old spine + // since a concurrent push with a lower index + // could still be reading from it. We let it + // leak because even a 1TB heap would waste + // less than 2MB of memory on old spines. If + // this is a problem, we could free old spines + // during STW. + } + + // Allocate a new block from the pool. + block = spanSetBlockPool.alloc() + + // Add it to the spine. + blockp := add(b.spine, goarch.PtrSize*top) + // Blocks are allocated off-heap, so no write barrier. + atomic.StorepNoWB(blockp, unsafe.Pointer(block)) + atomic.Storeuintptr(&b.spineLen, spineLen+1) + unlock(&b.spineLock) + } + + // We have a block. Insert the span atomically, since there may be + // concurrent readers via the block API. + atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s)) +} + +// pop removes and returns a span from buffer b, or nil if b is empty. +// pop is safe to call concurrently with other pop and push operations. +func (b *spanSet) pop() *mspan { + var head, tail uint32 +claimLoop: + for { + headtail := b.index.load() + head, tail = headtail.split() + if head >= tail { + // The buf is empty, as far as we can tell. + return nil + } + // Check if the head position we want to claim is actually + // backed by a block. + spineLen := atomic.Loaduintptr(&b.spineLen) + if spineLen <= uintptr(head)/spanSetBlockEntries { + // We're racing with a spine growth and the allocation of + // a new block (and maybe a new spine!), and trying to grab + // the span at the index which is currently being pushed. + // Instead of spinning, let's just notify the caller that + // there's nothing currently here. Spinning on this is + // almost definitely not worth it. + return nil + } + // Try to claim the current head by CASing in an updated head. + // This may fail transiently due to a push which modifies the + // tail, so keep trying while the head isn't changing. + want := head + for want == head { + if b.index.cas(headtail, makeHeadTailIndex(want+1, tail)) { + break claimLoop + } + headtail = b.index.load() + head, tail = headtail.split() + } + // We failed to claim the spot we were after and the head changed, + // meaning a popper got ahead of us. Try again from the top because + // the buf may not be empty. 
+ } + top, bottom := head/spanSetBlockEntries, head%spanSetBlockEntries + + // We may be reading a stale spine pointer, but because the length + // grows monotonically and we've already verified it, we'll definitely + // be reading from a valid block. + spine := atomic.Loadp(unsafe.Pointer(&b.spine)) + blockp := add(spine, goarch.PtrSize*uintptr(top)) + + // Given that the spine length is correct, we know we will never + // see a nil block here, since the length is always updated after + // the block is set. + block := (*spanSetBlock)(atomic.Loadp(blockp)) + s := (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom]))) + for s == nil { + // We raced with the span actually being set, but given that we + // know a block for this span exists, the race window here is + // extremely small. Try again. + s = (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom]))) + } + // Clear the pointer. This isn't strictly necessary, but defensively + // avoids accidentally re-using blocks which could lead to memory + // corruption. This way, we'll get a nil pointer access instead. + atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), nil) + + // Increase the popped count. If we are the last possible popper + // in the block (note that bottom need not equal spanSetBlockEntries-1 + // due to races) then it's our resposibility to free the block. + // + // If we increment popped to spanSetBlockEntries, we can be sure that + // we're the last popper for this block, and it's thus safe to free it. + // Every other popper must have crossed this barrier (and thus finished + // popping its corresponding mspan) by the time we get here. Because + // we're the last popper, we also don't have to worry about concurrent + // pushers (there can't be any). Note that we may not be the popper + // which claimed the last slot in the block, we're just the last one + // to finish popping. + if atomic.Xadd(&block.popped, 1) == spanSetBlockEntries { + // Clear the block's pointer. + atomic.StorepNoWB(blockp, nil) + + // Return the block to the block pool. + spanSetBlockPool.free(block) + } + return s +} + +// reset resets a spanSet which is empty. It will also clean up +// any left over blocks. +// +// Throws if the buf is not empty. +// +// reset may not be called concurrently with any other operations +// on the span set. +func (b *spanSet) reset() { + head, tail := b.index.load().split() + if head < tail { + print("head = ", head, ", tail = ", tail, "\n") + throw("attempt to clear non-empty span set") + } + top := head / spanSetBlockEntries + if uintptr(top) < b.spineLen { + // If the head catches up to the tail and the set is empty, + // we may not clean up the block containing the head and tail + // since it may be pushed into again. In order to avoid leaking + // memory since we're going to reset the head and tail, clean + // up such a block now, if it exists. + blockp := (**spanSetBlock)(add(b.spine, goarch.PtrSize*uintptr(top))) + block := *blockp + if block != nil { + // Sanity check the popped value. + if block.popped == 0 { + // popped should never be zero because that means we have + // pushed at least one value but not yet popped if this + // block pointer is not nil. + throw("span set block with unpopped elements found in reset") + } + if block.popped == spanSetBlockEntries { + // popped should also never be equal to spanSetBlockEntries + // because the last popper should have made the block pointer + // in this slot nil. 
+ throw("fully empty unfreed span set block found in reset") + } + + // Clear the pointer to the block. + atomic.StorepNoWB(unsafe.Pointer(blockp), nil) + + // Return the block to the block pool. + spanSetBlockPool.free(block) + } + } + b.index.reset() + atomic.Storeuintptr(&b.spineLen, 0) +} + +// spanSetBlockPool is a global pool of spanSetBlocks. +var spanSetBlockPool spanSetBlockAlloc + +// spanSetBlockAlloc represents a concurrent pool of spanSetBlocks. +type spanSetBlockAlloc struct { + stack lfstack +} + +// alloc tries to grab a spanSetBlock out of the pool, and if it fails +// persistentallocs a new one and returns it. +func (p *spanSetBlockAlloc) alloc() *spanSetBlock { + if s := (*spanSetBlock)(p.stack.pop()); s != nil { + return s + } + return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys)) +} + +// free returns a spanSetBlock back to the pool. +func (p *spanSetBlockAlloc) free(block *spanSetBlock) { + atomic.Store(&block.popped, 0) + p.stack.push(&block.lfnode) +} + +// haidTailIndex represents a combined 32-bit head and 32-bit tail +// of a queue into a single 64-bit value. +type headTailIndex uint64 + +// makeHeadTailIndex creates a headTailIndex value from a separate +// head and tail. +func makeHeadTailIndex(head, tail uint32) headTailIndex { + return headTailIndex(uint64(head)<<32 | uint64(tail)) +} + +// head returns the head of a headTailIndex value. +func (h headTailIndex) head() uint32 { + return uint32(h >> 32) +} + +// tail returns the tail of a headTailIndex value. +func (h headTailIndex) tail() uint32 { + return uint32(h) +} + +// split splits the headTailIndex value into its parts. +func (h headTailIndex) split() (head uint32, tail uint32) { + return h.head(), h.tail() +} + +// load atomically reads a headTailIndex value. +func (h *headTailIndex) load() headTailIndex { + return headTailIndex(atomic.Load64((*uint64)(h))) +} + +// cas atomically compares-and-swaps a headTailIndex value. +func (h *headTailIndex) cas(old, new headTailIndex) bool { + return atomic.Cas64((*uint64)(h), uint64(old), uint64(new)) +} + +// incHead atomically increments the head of a headTailIndex. +func (h *headTailIndex) incHead() headTailIndex { + return headTailIndex(atomic.Xadd64((*uint64)(h), (1 << 32))) +} + +// decHead atomically decrements the head of a headTailIndex. +func (h *headTailIndex) decHead() headTailIndex { + return headTailIndex(atomic.Xadd64((*uint64)(h), -(1 << 32))) +} + +// incTail atomically increments the tail of a headTailIndex. +func (h *headTailIndex) incTail() headTailIndex { + ht := headTailIndex(atomic.Xadd64((*uint64)(h), +1)) + // Check for overflow. + if ht.tail() == 0 { + print("runtime: head = ", ht.head(), ", tail = ", ht.tail(), "\n") + throw("headTailIndex overflow") + } + return ht +} + +// reset clears the headTailIndex to (0, 0). +func (h *headTailIndex) reset() { + atomic.Store64((*uint64)(h), 0) +} diff --git a/contrib/go/_std_1.18/src/runtime/mstats.go b/contrib/go/_std_1.18/src/runtime/mstats.go new file mode 100644 index 0000000000..2c1ec79bf8 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mstats.go @@ -0,0 +1,928 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Memory statistics + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// Statistics. +// +// For detailed descriptions see the documentation for MemStats. 
+// Fields that differ from MemStats are further documented here. +// +// Many of these fields are updated on the fly, while others are only +// updated when updatememstats is called. +type mstats struct { + // General statistics. + alloc uint64 // bytes allocated and not yet freed + total_alloc uint64 // bytes allocated (even if freed) + sys uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate) + nlookup uint64 // number of pointer lookups (unused) + nmalloc uint64 // number of mallocs + nfree uint64 // number of frees + + // Statistics about malloc heap. + // Updated atomically, or with the world stopped. + // + // Like MemStats, heap_sys and heap_inuse do not count memory + // in manually-managed spans. + heap_sys sysMemStat // virtual address space obtained from system for GC'd heap + heap_inuse uint64 // bytes in mSpanInUse spans + heap_released uint64 // bytes released to the os + + // heap_objects is not used by the runtime directly and instead + // computed on the fly by updatememstats. + heap_objects uint64 // total number of allocated objects + + // Statistics about stacks. + stacks_inuse uint64 // bytes in manually-managed stack spans; computed by updatememstats + stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys + + // Statistics about allocation of low-level fixed-size structures. + // Protected by FixAlloc locks. + mspan_inuse uint64 // mspan structures + mspan_sys sysMemStat + mcache_inuse uint64 // mcache structures + mcache_sys sysMemStat + buckhash_sys sysMemStat // profiling bucket hash table + + // Statistics about GC overhead. + gcWorkBufInUse uint64 // computed by updatememstats + gcProgPtrScalarBitsInUse uint64 // computed by updatememstats + gcMiscSys sysMemStat // updated atomically or during STW + + // Miscellaneous statistics. + other_sys sysMemStat // updated atomically or during STW + + // Statistics about the garbage collector. + + // Protected by mheap or stopping the world during GC. + last_gc_unix uint64 // last gc (in unix time) + pause_total_ns uint64 + pause_ns [256]uint64 // circular buffer of recent gc pause lengths + pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970) + numgc uint32 + numforcedgc uint32 // number of user-forced GCs + gc_cpu_fraction float64 // fraction of CPU time used by GC + enablegc bool + debuggc bool + + // Statistics about allocation size classes. + + by_size [_NumSizeClasses]struct { + size uint32 + nmalloc uint64 + nfree uint64 + } + + // Add an uint32 for even number of size classes to align below fields + // to 64 bits for atomic operations on 32 bit platforms. + _ [1 - _NumSizeClasses%2]uint32 + + last_gc_nanotime uint64 // last gc (monotonic time) + last_heap_inuse uint64 // heap_inuse at mark termination of the previous GC + + // heapStats is a set of statistics + heapStats consistentHeapStats + + // _ uint32 // ensure gcPauseDist is aligned + + // gcPauseDist represents the distribution of all GC-related + // application pauses in the runtime. + // + // Each individual pause is counted separately, unlike pause_ns. + gcPauseDist timeHistogram +} + +var memstats mstats + +// A MemStats records statistics about the memory allocator. +type MemStats struct { + // General statistics. + + // Alloc is bytes of allocated heap objects. + // + // This is the same as HeapAlloc (see below). + Alloc uint64 + + // TotalAlloc is cumulative bytes allocated for heap objects. 
+ // + // TotalAlloc increases as heap objects are allocated, but + // unlike Alloc and HeapAlloc, it does not decrease when + // objects are freed. + TotalAlloc uint64 + + // Sys is the total bytes of memory obtained from the OS. + // + // Sys is the sum of the XSys fields below. Sys measures the + // virtual address space reserved by the Go runtime for the + // heap, stacks, and other internal data structures. It's + // likely that not all of the virtual address space is backed + // by physical memory at any given moment, though in general + // it all was at some point. + Sys uint64 + + // Lookups is the number of pointer lookups performed by the + // runtime. + // + // This is primarily useful for debugging runtime internals. + Lookups uint64 + + // Mallocs is the cumulative count of heap objects allocated. + // The number of live objects is Mallocs - Frees. + Mallocs uint64 + + // Frees is the cumulative count of heap objects freed. + Frees uint64 + + // Heap memory statistics. + // + // Interpreting the heap statistics requires some knowledge of + // how Go organizes memory. Go divides the virtual address + // space of the heap into "spans", which are contiguous + // regions of memory 8K or larger. A span may be in one of + // three states: + // + // An "idle" span contains no objects or other data. The + // physical memory backing an idle span can be released back + // to the OS (but the virtual address space never is), or it + // can be converted into an "in use" or "stack" span. + // + // An "in use" span contains at least one heap object and may + // have free space available to allocate more heap objects. + // + // A "stack" span is used for goroutine stacks. Stack spans + // are not considered part of the heap. A span can change + // between heap and stack memory; it is never used for both + // simultaneously. + + // HeapAlloc is bytes of allocated heap objects. + // + // "Allocated" heap objects include all reachable objects, as + // well as unreachable objects that the garbage collector has + // not yet freed. Specifically, HeapAlloc increases as heap + // objects are allocated and decreases as the heap is swept + // and unreachable objects are freed. Sweeping occurs + // incrementally between GC cycles, so these two processes + // occur simultaneously, and as a result HeapAlloc tends to + // change smoothly (in contrast with the sawtooth that is + // typical of stop-the-world garbage collectors). + HeapAlloc uint64 + + // HeapSys is bytes of heap memory obtained from the OS. + // + // HeapSys measures the amount of virtual address space + // reserved for the heap. This includes virtual address space + // that has been reserved but not yet used, which consumes no + // physical memory, but tends to be small, as well as virtual + // address space for which the physical memory has been + // returned to the OS after it became unused (see HeapReleased + // for a measure of the latter). + // + // HeapSys estimates the largest size the heap has had. + HeapSys uint64 + + // HeapIdle is bytes in idle (unused) spans. + // + // Idle spans have no objects in them. These spans could be + // (and may already have been) returned to the OS, or they can + // be reused for heap allocations, or they can be reused as + // stack memory. + // + // HeapIdle minus HeapReleased estimates the amount of memory + // that could be returned to the OS, but is being retained by + // the runtime so it can grow the heap without requesting more + // memory from the OS. 
If this difference is significantly + // larger than the heap size, it indicates there was a recent + // transient spike in live heap size. + HeapIdle uint64 + + // HeapInuse is bytes in in-use spans. + // + // In-use spans have at least one object in them. These spans + // can only be used for other objects of roughly the same + // size. + // + // HeapInuse minus HeapAlloc estimates the amount of memory + // that has been dedicated to particular size classes, but is + // not currently being used. This is an upper bound on + // fragmentation, but in general this memory can be reused + // efficiently. + HeapInuse uint64 + + // HeapReleased is bytes of physical memory returned to the OS. + // + // This counts heap memory from idle spans that was returned + // to the OS and has not yet been reacquired for the heap. + HeapReleased uint64 + + // HeapObjects is the number of allocated heap objects. + // + // Like HeapAlloc, this increases as objects are allocated and + // decreases as the heap is swept and unreachable objects are + // freed. + HeapObjects uint64 + + // Stack memory statistics. + // + // Stacks are not considered part of the heap, but the runtime + // can reuse a span of heap memory for stack memory, and + // vice-versa. + + // StackInuse is bytes in stack spans. + // + // In-use stack spans have at least one stack in them. These + // spans can only be used for other stacks of the same size. + // + // There is no StackIdle because unused stack spans are + // returned to the heap (and hence counted toward HeapIdle). + StackInuse uint64 + + // StackSys is bytes of stack memory obtained from the OS. + // + // StackSys is StackInuse, plus any memory obtained directly + // from the OS for OS thread stacks (which should be minimal). + StackSys uint64 + + // Off-heap memory statistics. + // + // The following statistics measure runtime-internal + // structures that are not allocated from heap memory (usually + // because they are part of implementing the heap). Unlike + // heap or stack memory, any memory allocated to these + // structures is dedicated to these structures. + // + // These are primarily useful for debugging runtime memory + // overheads. + + // MSpanInuse is bytes of allocated mspan structures. + MSpanInuse uint64 + + // MSpanSys is bytes of memory obtained from the OS for mspan + // structures. + MSpanSys uint64 + + // MCacheInuse is bytes of allocated mcache structures. + MCacheInuse uint64 + + // MCacheSys is bytes of memory obtained from the OS for + // mcache structures. + MCacheSys uint64 + + // BuckHashSys is bytes of memory in profiling bucket hash tables. + BuckHashSys uint64 + + // GCSys is bytes of memory in garbage collection metadata. + GCSys uint64 + + // OtherSys is bytes of memory in miscellaneous off-heap + // runtime allocations. + OtherSys uint64 + + // Garbage collector statistics. + + // NextGC is the target heap size of the next GC cycle. + // + // The garbage collector's goal is to keep HeapAlloc ≤ NextGC. + // At the end of each GC cycle, the target for the next cycle + // is computed based on the amount of reachable data and the + // value of GOGC. + NextGC uint64 + + // LastGC is the time the last garbage collection finished, as + // nanoseconds since 1970 (the UNIX epoch). + LastGC uint64 + + // PauseTotalNs is the cumulative nanoseconds in GC + // stop-the-world pauses since the program started. + // + // During a stop-the-world pause, all goroutines are paused + // and only the garbage collector can run. 
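// Illustrative sketch (not part of the vendored runtime source): deriving the
// quantities that the heap and stack field documentation above describes from
// a MemStats snapshot supplied by the caller.
package main

import (
	"fmt"
	"runtime"
)

func heapSummary(ms *runtime.MemStats) (retained, returnable, fragBound uint64) {
	retained = ms.HeapSys - ms.HeapReleased    // heap address space not yet returned to the OS
	returnable = ms.HeapIdle - ms.HeapReleased // idle memory that could still be returned to the OS
	fragBound = ms.HeapInuse - ms.HeapAlloc    // upper bound on fragmentation inside in-use spans
	return
}

func main() {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	fmt.Println(heapSummary(&ms))
}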
+ PauseTotalNs uint64 + + // PauseNs is a circular buffer of recent GC stop-the-world + // pause times in nanoseconds. + // + // The most recent pause is at PauseNs[(NumGC+255)%256]. In + // general, PauseNs[N%256] records the time paused in the most + // recent N%256th GC cycle. There may be multiple pauses per + // GC cycle; this is the sum of all pauses during a cycle. + PauseNs [256]uint64 + + // PauseEnd is a circular buffer of recent GC pause end times, + // as nanoseconds since 1970 (the UNIX epoch). + // + // This buffer is filled the same way as PauseNs. There may be + // multiple pauses per GC cycle; this records the end of the + // last pause in a cycle. + PauseEnd [256]uint64 + + // NumGC is the number of completed GC cycles. + NumGC uint32 + + // NumForcedGC is the number of GC cycles that were forced by + // the application calling the GC function. + NumForcedGC uint32 + + // GCCPUFraction is the fraction of this program's available + // CPU time used by the GC since the program started. + // + // GCCPUFraction is expressed as a number between 0 and 1, + // where 0 means GC has consumed none of this program's CPU. A + // program's available CPU time is defined as the integral of + // GOMAXPROCS since the program started. That is, if + // GOMAXPROCS is 2 and a program has been running for 10 + // seconds, its "available CPU" is 20 seconds. GCCPUFraction + // does not include CPU time used for write barrier activity. + // + // This is the same as the fraction of CPU reported by + // GODEBUG=gctrace=1. + GCCPUFraction float64 + + // EnableGC indicates that GC is enabled. It is always true, + // even if GOGC=off. + EnableGC bool + + // DebugGC is currently unused. + DebugGC bool + + // BySize reports per-size class allocation statistics. + // + // BySize[N] gives statistics for allocations of size S where + // BySize[N-1].Size < S ≤ BySize[N].Size. + // + // This does not report allocations larger than BySize[60].Size. + BySize [61]struct { + // Size is the maximum byte size of an object in this + // size class. + Size uint32 + + // Mallocs is the cumulative count of heap objects + // allocated in this size class. The cumulative bytes + // of allocation is Size*Mallocs. The number of live + // objects in this size class is Mallocs - Frees. + Mallocs uint64 + + // Frees is the cumulative count of heap objects freed + // in this size class. + Frees uint64 + } +} + +func init() { + if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 { + println(offset) + throw("memstats.heapStats not aligned to 8 bytes") + } + if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 { + println(offset) + throw("memstats.gcPauseDist not aligned to 8 bytes") + } + // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g. + // [3]heapStatsDelta) to be 8-byte aligned. + if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 { + println(size) + throw("heapStatsDelta not a multiple of 8 bytes in size") + } +} + +// ReadMemStats populates m with memory allocator statistics. +// +// The returned memory allocator statistics are up to date as of the +// call to ReadMemStats. This is in contrast with a heap profile, +// which is a snapshot as of the most recently completed garbage +// collection cycle. 
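// Illustrative sketch (not part of the vendored runtime source): calling
// ReadMemStats, whose contract is documented above, and using the documented
// indexing of the PauseNs ring buffer and the BySize table.
package main

import (
	"fmt"
	"runtime"
)

func main() {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)

	fmt.Printf("live heap: %d B, next GC target: %d B, GC cycles: %d\n",
		ms.HeapAlloc, ms.NextGC, ms.NumGC)
	if ms.NumGC > 0 {
		// The most recent pause lives at PauseNs[(NumGC+255)%256].
		fmt.Printf("last GC pause: %d ns\n", ms.PauseNs[(ms.NumGC+255)%256])
	}
	for _, c := range ms.BySize {
		if live := c.Mallocs - c.Frees; live > 0 {
			fmt.Printf("size class %d B: %d live objects\n", c.Size, live)
		}
	}
}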
+func ReadMemStats(m *MemStats) { + stopTheWorld("read mem stats") + + systemstack(func() { + readmemstats_m(m) + }) + + startTheWorld() +} + +func readmemstats_m(stats *MemStats) { + updatememstats() + + stats.Alloc = memstats.alloc + stats.TotalAlloc = memstats.total_alloc + stats.Sys = memstats.sys + stats.Mallocs = memstats.nmalloc + stats.Frees = memstats.nfree + stats.HeapAlloc = memstats.alloc + stats.HeapSys = memstats.heap_sys.load() + // By definition, HeapIdle is memory that was mapped + // for the heap but is not currently used to hold heap + // objects. It also specifically is memory that can be + // used for other purposes, like stacks, but this memory + // is subtracted out of HeapSys before it makes that + // transition. Put another way: + // + // heap_sys = bytes allocated from the OS for the heap - bytes ultimately used for non-heap purposes + // heap_idle = bytes allocated from the OS for the heap - bytes ultimately used for any purpose + // + // or + // + // heap_sys = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse + // heap_idle = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse - heap_inuse + // + // => heap_idle = heap_sys - heap_inuse + stats.HeapIdle = memstats.heap_sys.load() - memstats.heap_inuse + stats.HeapInuse = memstats.heap_inuse + stats.HeapReleased = memstats.heap_released + stats.HeapObjects = memstats.heap_objects + stats.StackInuse = memstats.stacks_inuse + // memstats.stacks_sys is only memory mapped directly for OS stacks. + // Add in heap-allocated stack memory for user consumption. + stats.StackSys = memstats.stacks_inuse + memstats.stacks_sys.load() + stats.MSpanInuse = memstats.mspan_inuse + stats.MSpanSys = memstats.mspan_sys.load() + stats.MCacheInuse = memstats.mcache_inuse + stats.MCacheSys = memstats.mcache_sys.load() + stats.BuckHashSys = memstats.buckhash_sys.load() + // MemStats defines GCSys as an aggregate of all memory related + // to the memory management system, but we track this memory + // at a more granular level in the runtime. + stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse + stats.OtherSys = memstats.other_sys.load() + stats.NextGC = gcController.heapGoal + stats.LastGC = memstats.last_gc_unix + stats.PauseTotalNs = memstats.pause_total_ns + stats.PauseNs = memstats.pause_ns + stats.PauseEnd = memstats.pause_end + stats.NumGC = memstats.numgc + stats.NumForcedGC = memstats.numforcedgc + stats.GCCPUFraction = memstats.gc_cpu_fraction + stats.EnableGC = true + + // Handle BySize. Copy N values, where N is + // the minimum of the lengths of the two arrays. + // Unfortunately copy() won't work here because + // the arrays have different structs. + // + // TODO(mknyszek): Consider renaming the fields + // of by_size's elements to align so we can use + // the copy built-in. + bySizeLen := len(stats.BySize) + if l := len(memstats.by_size); l < bySizeLen { + bySizeLen = l + } + for i := 0; i < bySizeLen; i++ { + stats.BySize[i].Size = memstats.by_size[i].size + stats.BySize[i].Mallocs = memstats.by_size[i].nmalloc + stats.BySize[i].Frees = memstats.by_size[i].nfree + } +} + +//go:linkname readGCStats runtime/debug.readGCStats +func readGCStats(pauses *[]uint64) { + systemstack(func() { + readGCStats_m(pauses) + }) +} + +// readGCStats_m must be called on the system stack because it acquires the heap +// lock. See mheap for details. 
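// Illustrative sketch (not part of the vendored runtime source): readGCStats
// above is linknamed into runtime/debug, and debug.ReadGCStats is the public
// way to obtain the same pause history.
package main

import (
	"fmt"
	"runtime/debug"
	"time"
)

func main() {
	var s debug.GCStats
	// Asking for 5 quantiles yields the minimum, 25th, 50th, 75th percentile,
	// and maximum pause durations.
	s.PauseQuantiles = make([]time.Duration, 5)
	debug.ReadGCStats(&s)
	fmt.Println("GC cycles:", s.NumGC)
	fmt.Println("total pause:", s.PauseTotal, "last GC at:", s.LastGC)
	fmt.Println("pause quantiles:", s.PauseQuantiles)
}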
+//go:systemstack +func readGCStats_m(pauses *[]uint64) { + p := *pauses + // Calling code in runtime/debug should make the slice large enough. + if cap(p) < len(memstats.pause_ns)+3 { + throw("short slice passed to readGCStats") + } + + // Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns. + lock(&mheap_.lock) + + n := memstats.numgc + if n > uint32(len(memstats.pause_ns)) { + n = uint32(len(memstats.pause_ns)) + } + + // The pause buffer is circular. The most recent pause is at + // pause_ns[(numgc-1)%len(pause_ns)], and then backward + // from there to go back farther in time. We deliver the times + // most recent first (in p[0]). + p = p[:cap(p)] + for i := uint32(0); i < n; i++ { + j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns)) + p[i] = memstats.pause_ns[j] + p[n+i] = memstats.pause_end[j] + } + + p[n+n] = memstats.last_gc_unix + p[n+n+1] = uint64(memstats.numgc) + p[n+n+2] = memstats.pause_total_ns + unlock(&mheap_.lock) + *pauses = p[:n+n+3] +} + +// Updates the memstats structure. +// +// The world must be stopped. +// +//go:nowritebarrier +func updatememstats() { + assertWorldStopped() + + // Flush mcaches to mcentral before doing anything else. + // + // Flushing to the mcentral may in general cause stats to + // change as mcentral data structures are manipulated. + systemstack(flushallmcaches) + + memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse) + memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse) + memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() + + memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() + + memstats.other_sys.load() + + // Calculate memory allocator stats. + // During program execution we only count number of frees and amount of freed memory. + // Current number of alive objects in the heap and amount of alive heap memory + // are calculated by scanning all spans. + // Total number of mallocs is calculated as number of frees plus number of alive objects. + // Similarly, total amount of allocated memory is calculated as amount of freed memory + // plus amount of alive heap memory. + memstats.alloc = 0 + memstats.total_alloc = 0 + memstats.nmalloc = 0 + memstats.nfree = 0 + for i := 0; i < len(memstats.by_size); i++ { + memstats.by_size[i].nmalloc = 0 + memstats.by_size[i].nfree = 0 + } + // Collect consistent stats, which are the source-of-truth in the some cases. + var consStats heapStatsDelta + memstats.heapStats.unsafeRead(&consStats) + + // Collect large allocation stats. + totalAlloc := consStats.largeAlloc + memstats.nmalloc += consStats.largeAllocCount + totalFree := consStats.largeFree + memstats.nfree += consStats.largeFreeCount + + // Collect per-sizeclass stats. + for i := 0; i < _NumSizeClasses; i++ { + // Malloc stats. + a := consStats.smallAllocCount[i] + totalAlloc += a * uint64(class_to_size[i]) + memstats.nmalloc += a + memstats.by_size[i].nmalloc = a + + // Free stats. + f := consStats.smallFreeCount[i] + totalFree += f * uint64(class_to_size[i]) + memstats.nfree += f + memstats.by_size[i].nfree = f + } + + // Account for tiny allocations. + memstats.nfree += consStats.tinyAllocCount + memstats.nmalloc += consStats.tinyAllocCount + + // Calculate derived stats. 
+ memstats.total_alloc = totalAlloc + memstats.alloc = totalAlloc - totalFree + memstats.heap_objects = memstats.nmalloc - memstats.nfree + + memstats.stacks_inuse = uint64(consStats.inStacks) + memstats.gcWorkBufInUse = uint64(consStats.inWorkBufs) + memstats.gcProgPtrScalarBitsInUse = uint64(consStats.inPtrScalarBits) + + // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory. + memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse + + // The world is stopped, so the consistent stats (after aggregation) + // should be identical to some combination of memstats. In particular: + // + // * heap_inuse == inHeap + // * heap_released == released + // * heap_sys - heap_released == committed - inStacks - inWorkBufs - inPtrScalarBits + // + // Check if that's actually true. + // + // TODO(mknyszek): Maybe don't throw here. It would be bad if a + // bug in otherwise benign accounting caused the whole application + // to crash. + if memstats.heap_inuse != uint64(consStats.inHeap) { + print("runtime: heap_inuse=", memstats.heap_inuse, "\n") + print("runtime: consistent value=", consStats.inHeap, "\n") + throw("heap_inuse and consistent stats are not equal") + } + if memstats.heap_released != uint64(consStats.released) { + print("runtime: heap_released=", memstats.heap_released, "\n") + print("runtime: consistent value=", consStats.released, "\n") + throw("heap_released and consistent stats are not equal") + } + globalRetained := memstats.heap_sys.load() - memstats.heap_released + consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits) + if globalRetained != consRetained { + print("runtime: global value=", globalRetained, "\n") + print("runtime: consistent value=", consRetained, "\n") + throw("measures of the retained heap are not equal") + } +} + +// flushmcache flushes the mcache of allp[i]. +// +// The world must be stopped. +// +//go:nowritebarrier +func flushmcache(i int) { + assertWorldStopped() + + p := allp[i] + c := p.mcache + if c == nil { + return + } + c.releaseAll() + stackcache_clear(c) +} + +// flushallmcaches flushes the mcaches of all Ps. +// +// The world must be stopped. +// +//go:nowritebarrier +func flushallmcaches() { + assertWorldStopped() + + for i := 0; i < int(gomaxprocs); i++ { + flushmcache(i) + } +} + +// sysMemStat represents a global system statistic that is managed atomically. +// +// This type must structurally be a uint64 so that mstats aligns with MemStats. +type sysMemStat uint64 + +// load atomically reads the value of the stat. +// +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. +//go:nosplit +func (s *sysMemStat) load() uint64 { + return atomic.Load64((*uint64)(s)) +} + +// add atomically adds the sysMemStat by n. +// +// Must be nosplit as it is called in runtime initialization, e.g. newosproc0. +//go:nosplit +func (s *sysMemStat) add(n int64) { + if s == nil { + return + } + val := atomic.Xadd64((*uint64)(s), n) + if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) { + print("runtime: val=", val, " n=", n, "\n") + throw("sysMemStat overflow") + } +} + +// heapStatsDelta contains deltas of various runtime memory statistics +// that need to be updated together in order for them to be kept +// consistent with one another. +type heapStatsDelta struct { + // Memory stats. 
+ committed int64 // byte delta of memory committed + released int64 // byte delta of released memory generated + inHeap int64 // byte delta of memory placed in the heap + inStacks int64 // byte delta of memory reserved for stacks + inWorkBufs int64 // byte delta of memory reserved for work bufs + inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits + + // Allocator stats. + // + // These are all uint64 because they're cumulative, and could quickly wrap + // around otherwise. + tinyAllocCount uint64 // number of tiny allocations + largeAlloc uint64 // bytes allocated for large objects + largeAllocCount uint64 // number of large object allocations + smallAllocCount [_NumSizeClasses]uint64 // number of allocs for small objects + largeFree uint64 // bytes freed for large objects (>maxSmallSize) + largeFreeCount uint64 // number of frees for large objects (>maxSmallSize) + smallFreeCount [_NumSizeClasses]uint64 // number of frees for small objects (<=maxSmallSize) + + // NOTE: This struct must be a multiple of 8 bytes in size because it + // is stored in an array. If it's not, atomic accesses to the above + // fields may be unaligned and fail on 32-bit platforms. +} + +// merge adds in the deltas from b into a. +func (a *heapStatsDelta) merge(b *heapStatsDelta) { + a.committed += b.committed + a.released += b.released + a.inHeap += b.inHeap + a.inStacks += b.inStacks + a.inWorkBufs += b.inWorkBufs + a.inPtrScalarBits += b.inPtrScalarBits + + a.tinyAllocCount += b.tinyAllocCount + a.largeAlloc += b.largeAlloc + a.largeAllocCount += b.largeAllocCount + for i := range b.smallAllocCount { + a.smallAllocCount[i] += b.smallAllocCount[i] + } + a.largeFree += b.largeFree + a.largeFreeCount += b.largeFreeCount + for i := range b.smallFreeCount { + a.smallFreeCount[i] += b.smallFreeCount[i] + } +} + +// consistentHeapStats represents a set of various memory statistics +// whose updates must be viewed completely to get a consistent +// state of the world. +// +// To write updates to memory stats use the acquire and release +// methods. To obtain a consistent global snapshot of these statistics, +// use read. +type consistentHeapStats struct { + // stats is a ring buffer of heapStatsDelta values. + // Writers always atomically update the delta at index gen. + // + // Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...) + // and synchronizing with writers by observing each P's + // statsSeq field. If the reader observes a P not writing, + // it can be sure that it will pick up the new gen value the + // next time it writes. + // + // The reader then takes responsibility by clearing space + // in the ring buffer for the next reader to rotate gen to + // that space (i.e. it merges in values from index (gen-2) mod 3 + // to index (gen-1) mod 3, then clears the former). + // + // Note that this means only one reader can be reading at a time. + // There is no way for readers to synchronize. + // + // This process is why we need a ring buffer of size 3 instead + // of 2: one is for the writers, one contains the most recent + // data, and the last one is clear so writers can begin writing + // to it the moment gen is updated. + stats [3]heapStatsDelta + + // gen represents the current index into which writers + // are writing, and can take on the value of 0, 1, or 2. + // This value is updated atomically. + gen uint32 + + // noPLock is intended to provide mutual exclusion for updating + // stats when no P is available. 
It does not block other writers + // with a P, only other writers without a P and the reader. Because + // stats are usually updated when a P is available, contention on + // this lock should be minimal. + noPLock mutex +} + +// acquire returns a heapStatsDelta to be updated. In effect, +// it acquires the shard for writing. release must be called +// as soon as the relevant deltas are updated. +// +// The returned heapStatsDelta must be updated atomically. +// +// The caller's P must not change between acquire and +// release. This also means that the caller should not +// acquire a P or release its P in between. A P also must +// not acquire a given consistentHeapStats if it hasn't +// yet released it. +// +// nosplit because a stack growth in this function could +// lead to a stack allocation that could reenter the +// function. +// +//go:nosplit +func (m *consistentHeapStats) acquire() *heapStatsDelta { + if pp := getg().m.p.ptr(); pp != nil { + seq := atomic.Xadd(&pp.statsSeq, 1) + if seq%2 == 0 { + // Should have been incremented to odd. + print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } + } else { + lock(&m.noPLock) + } + gen := atomic.Load(&m.gen) % 3 + return &m.stats[gen] +} + +// release indicates that the writer is done modifying +// the delta. The value returned by the corresponding +// acquire must no longer be accessed or modified after +// release is called. +// +// The caller's P must not change between acquire and +// release. This also means that the caller should not +// acquire a P or release its P in between. +// +// nosplit because a stack growth in this function could +// lead to a stack allocation that causes another acquire +// before this operation has completed. +// +//go:nosplit +func (m *consistentHeapStats) release() { + if pp := getg().m.p.ptr(); pp != nil { + seq := atomic.Xadd(&pp.statsSeq, 1) + if seq%2 != 0 { + // Should have been incremented to even. + print("runtime: seq=", seq, "\n") + throw("bad sequence number") + } + } else { + unlock(&m.noPLock) + } +} + +// unsafeRead aggregates the delta for this shard into out. +// +// Unsafe because it does so without any synchronization. The +// world must be stopped. +func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) { + assertWorldStopped() + + for i := range m.stats { + out.merge(&m.stats[i]) + } +} + +// unsafeClear clears the shard. +// +// Unsafe because the world must be stopped and values should +// be donated elsewhere before clearing. +func (m *consistentHeapStats) unsafeClear() { + assertWorldStopped() + + for i := range m.stats { + m.stats[i] = heapStatsDelta{} + } +} + +// read takes a globally consistent snapshot of m +// and puts the aggregated value in out. Even though out is a +// heapStatsDelta, the resulting values should be complete and +// valid statistic values. +// +// Not safe to call concurrently. The world must be stopped +// or metricsSema must be held. +func (m *consistentHeapStats) read(out *heapStatsDelta) { + // Getting preempted after this point is not safe because + // we read allp. We need to make sure a STW can't happen + // so it doesn't change out from under us. + mp := acquirem() + + // Get the current generation. We can be confident that this + // will not change since read is serialized and is the only + // one that modifies currGen. + currGen := atomic.Load(&m.gen) + prevGen := currGen - 1 + if currGen == 0 { + prevGen = 2 + } + + // Prevent writers without a P from writing while we update gen. 
+ lock(&m.noPLock) + + // Rotate gen, effectively taking a snapshot of the state of + // these statistics at the point of the exchange by moving + // writers to the next set of deltas. + // + // This exchange is safe to do because we won't race + // with anyone else trying to update this value. + atomic.Xchg(&m.gen, (currGen+1)%3) + + // Allow P-less writers to continue. They'll be writing to the + // next generation now. + unlock(&m.noPLock) + + for _, p := range allp { + // Spin until there are no more writers. + for atomic.Load(&p.statsSeq)%2 != 0 { + } + } + + // At this point we've observed that each sequence + // number is even, so any future writers will observe + // the new gen value. That means it's safe to read from + // the other deltas in the stats buffer. + + // Perform our responsibilities and free up + // stats[prevGen] for the next time we want to take + // a snapshot. + m.stats[currGen].merge(&m.stats[prevGen]) + m.stats[prevGen] = heapStatsDelta{} + + // Finally, copy out the complete delta. + *out = m.stats[currGen] + + releasem(mp) +} diff --git a/contrib/go/_std_1.18/src/runtime/mwbbuf.go b/contrib/go/_std_1.18/src/runtime/mwbbuf.go new file mode 100644 index 0000000000..78d9382620 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/mwbbuf.go @@ -0,0 +1,290 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This implements the write barrier buffer. The write barrier itself +// is gcWriteBarrier and is implemented in assembly. +// +// See mbarrier.go for algorithmic details on the write barrier. This +// file deals only with the buffer. +// +// The write barrier has a fast path and a slow path. The fast path +// simply enqueues to a per-P write barrier buffer. It's written in +// assembly and doesn't clobber any general purpose registers, so it +// doesn't have the usual overheads of a Go call. +// +// When the buffer fills up, the write barrier invokes the slow path +// (wbBufFlush) to flush the buffer to the GC work queues. In this +// path, since the compiler didn't spill registers, we spill *all* +// registers and disallow any GC safe points that could observe the +// stack frame (since we don't know the types of the spilled +// registers). + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// testSmallBuf forces a small write barrier buffer to stress write +// barrier flushing. +const testSmallBuf = false + +// wbBuf is a per-P buffer of pointers queued by the write barrier. +// This buffer is flushed to the GC workbufs when it fills up and on +// various GC transitions. +// +// This is closely related to a "sequential store buffer" (SSB), +// except that SSBs are usually used for maintaining remembered sets, +// while this is used for marking. +type wbBuf struct { + // next points to the next slot in buf. It must not be a + // pointer type because it can point past the end of buf and + // must be updated without write barriers. + // + // This is a pointer rather than an index to optimize the + // write barrier assembly. + next uintptr + + // end points to just past the end of buf. It must not be a + // pointer type because it points past the end of buf and must + // be updated without write barriers. + end uintptr + + // buf stores a series of pointers to execute write barriers + // on. 
This must be a multiple of wbBufEntryPointers because + // the write barrier only checks for overflow once per entry. + buf [wbBufEntryPointers * wbBufEntries]uintptr +} + +const ( + // wbBufEntries is the number of write barriers between + // flushes of the write barrier buffer. + // + // This trades latency for throughput amortization. Higher + // values amortize flushing overhead more, but increase the + // latency of flushing. Higher values also increase the cache + // footprint of the buffer. + // + // TODO: What is the latency cost of this? Tune this value. + wbBufEntries = 256 + + // wbBufEntryPointers is the number of pointers added to the + // buffer by each write barrier. + wbBufEntryPointers = 2 +) + +// reset empties b by resetting its next and end pointers. +func (b *wbBuf) reset() { + start := uintptr(unsafe.Pointer(&b.buf[0])) + b.next = start + if writeBarrier.cgo { + // Effectively disable the buffer by forcing a flush + // on every barrier. + b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) + } else if testSmallBuf { + // For testing, allow two barriers in the buffer. If + // we only did one, then barriers of non-heap pointers + // would be no-ops. This lets us combine a buffered + // barrier with a flush at a later time. + b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers])) + } else { + b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0]) + } + + if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 { + throw("bad write barrier buffer bounds") + } +} + +// discard resets b's next pointer, but not its end pointer. +// +// This must be nosplit because it's called by wbBufFlush. +// +//go:nosplit +func (b *wbBuf) discard() { + b.next = uintptr(unsafe.Pointer(&b.buf[0])) +} + +// empty reports whether b contains no pointers. +func (b *wbBuf) empty() bool { + return b.next == uintptr(unsafe.Pointer(&b.buf[0])) +} + +// putFast adds old and new to the write barrier buffer and returns +// false if a flush is necessary. Callers should use this as: +// +// buf := &getg().m.p.ptr().wbBuf +// if !buf.putFast(old, new) { +// wbBufFlush(...) +// } +// ... actual memory write ... +// +// The arguments to wbBufFlush depend on whether the caller is doing +// its own cgo pointer checks. If it is, then this can be +// wbBufFlush(nil, 0). Otherwise, it must pass the slot address and +// new. +// +// The caller must ensure there are no preemption points during the +// above sequence. There must be no preemption points while buf is in +// use because it is a per-P resource. There must be no preemption +// points between the buffer put and the write to memory because this +// could allow a GC phase change, which could result in missed write +// barriers. +// +// putFast must be nowritebarrierrec to because write barriers here would +// corrupt the write barrier buffer. It (and everything it calls, if +// it called anything) has to be nosplit to avoid scheduling on to a +// different P and a different buffer. +// +//go:nowritebarrierrec +//go:nosplit +func (b *wbBuf) putFast(old, new uintptr) bool { + p := (*[2]uintptr)(unsafe.Pointer(b.next)) + p[0] = old + p[1] = new + b.next += 2 * goarch.PtrSize + return b.next != b.end +} + +// wbBufFlush flushes the current P's write barrier buffer to the GC +// workbufs. It is passed the slot and value of the write barrier that +// caused the flush so that it can implement cgocheck. +// +// This must not have write barriers because it is part of the write +// barrier implementation. 
+// +// This and everything it calls must be nosplit because 1) the stack +// contains untyped slots from gcWriteBarrier and 2) there must not be +// a GC safe point between the write barrier test in the caller and +// flushing the buffer. +// +// TODO: A "go:nosplitrec" annotation would be perfect for this. +// +//go:nowritebarrierrec +//go:nosplit +func wbBufFlush(dst *uintptr, src uintptr) { + // Note: Every possible return from this function must reset + // the buffer's next pointer to prevent buffer overflow. + + // This *must not* modify its arguments because this + // function's argument slots do double duty in gcWriteBarrier + // as register spill slots. Currently, not modifying the + // arguments is sufficient to keep the spill slots unmodified + // (which seems unlikely to change since it costs little and + // helps with debugging). + + if getg().m.dying > 0 { + // We're going down. Not much point in write barriers + // and this way we can allow write barriers in the + // panic path. + getg().m.p.ptr().wbBuf.discard() + return + } + + if writeBarrier.cgo && dst != nil { + // This must be called from the stack that did the + // write. It's nosplit all the way down. + cgoCheckWriteBarrier(dst, src) + if !writeBarrier.needed { + // We were only called for cgocheck. + getg().m.p.ptr().wbBuf.discard() + return + } + } + + // Switch to the system stack so we don't have to worry about + // the untyped stack slots or safe points. + systemstack(func() { + wbBufFlush1(getg().m.p.ptr()) + }) +} + +// wbBufFlush1 flushes p's write barrier buffer to the GC work queue. +// +// This must not have write barriers because it is part of the write +// barrier implementation, so this may lead to infinite loops or +// buffer corruption. +// +// This must be non-preemptible because it uses the P's workbuf. +// +//go:nowritebarrierrec +//go:systemstack +func wbBufFlush1(_p_ *p) { + // Get the buffered pointers. + start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0])) + n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0]) + ptrs := _p_.wbBuf.buf[:n] + + // Poison the buffer to make extra sure nothing is enqueued + // while we're processing the buffer. + _p_.wbBuf.next = 0 + + if useCheckmark { + // Slow path for checkmark mode. + for _, ptr := range ptrs { + shade(ptr) + } + _p_.wbBuf.reset() + return + } + + // Mark all of the pointers in the buffer and record only the + // pointers we greyed. We use the buffer itself to temporarily + // record greyed pointers. + // + // TODO: Should scanobject/scanblock just stuff pointers into + // the wbBuf? Then this would become the sole greying path. + // + // TODO: We could avoid shading any of the "new" pointers in + // the buffer if the stack has been shaded, or even avoid + // putting them in the buffer at all (which would double its + // capacity). This is slightly complicated with the buffer; we + // could track whether any un-shaded goroutine has used the + // buffer, or just track globally whether there are any + // un-shaded stacks and flush after each stack scan. + gcw := &_p_.gcw + pos := 0 + for _, ptr := range ptrs { + if ptr < minLegalPointer { + // nil pointers are very common, especially + // for the "old" values. Filter out these and + // other "obvious" non-heap pointers ASAP. + // + // TODO: Should we filter out nils in the fast + // path to reduce the rate of flushes? 
+ continue + } + obj, span, objIndex := findObject(ptr, 0, 0) + if obj == 0 { + continue + } + // TODO: Consider making two passes where the first + // just prefetches the mark bits. + mbits := span.markBitsForIndex(objIndex) + if mbits.isMarked() { + continue + } + mbits.setMarked() + + // Mark span. + arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + + if span.spanclass.noscan() { + gcw.bytesMarked += uint64(span.elemsize) + continue + } + ptrs[pos] = obj + pos++ + } + + // Enqueue the greyed objects. + gcw.putBatch(ptrs[:pos]) + + _p_.wbBuf.reset() +} diff --git a/contrib/go/_std_1.18/src/runtime/nbpipe_pipe.go b/contrib/go/_std_1.18/src/runtime/nbpipe_pipe.go new file mode 100644 index 0000000000..408e1ec410 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/nbpipe_pipe.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin + +package runtime + +func nonblockingPipe() (r, w int32, errno int32) { + r, w, errno = pipe() + if errno != 0 { + return -1, -1, errno + } + closeonexec(r) + setNonblock(r) + closeonexec(w) + setNonblock(w) + return r, w, errno +} diff --git a/contrib/go/_std_1.18/src/runtime/nbpipe_pipe2.go b/contrib/go/_std_1.18/src/runtime/nbpipe_pipe2.go new file mode 100644 index 0000000000..6a555bcd99 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/nbpipe_pipe2.go @@ -0,0 +1,22 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build dragonfly || freebsd || linux || netbsd || openbsd || solaris + +package runtime + +func nonblockingPipe() (r, w int32, errno int32) { + r, w, errno = pipe2(_O_NONBLOCK | _O_CLOEXEC) + if errno == -_ENOSYS { + r, w, errno = pipe() + if errno != 0 { + return -1, -1, errno + } + closeonexec(r) + setNonblock(r) + closeonexec(w) + setNonblock(w) + } + return r, w, errno +} diff --git a/contrib/go/_std_1.18/src/runtime/netpoll.go b/contrib/go/_std_1.18/src/runtime/netpoll.go new file mode 100644 index 0000000000..bb3dd35317 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/netpoll.go @@ -0,0 +1,652 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin || dragonfly || freebsd || (js && wasm) || linux || netbsd || openbsd || solaris || windows + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// Integrated network poller (platform-independent part). +// A particular implementation (epoll/kqueue/port/AIX/Windows) +// must define the following functions: +// +// func netpollinit() +// Initialize the poller. Only called once. +// +// func netpollopen(fd uintptr, pd *pollDesc) int32 +// Arm edge-triggered notifications for fd. The pd argument is to pass +// back to netpollready when fd is ready. Return an errno value. +// +// func netpollclose(fd uintptr) int32 +// Disable notifications for fd. Return an errno value. +// +// func netpoll(delta int64) gList +// Poll the network. If delta < 0, block indefinitely. If delta == 0, +// poll without blocking. If delta > 0, block for up to delta nanoseconds. +// Return a list of goroutines built by calling netpollready. 
+// +// func netpollBreak() +// Wake up the network poller, assumed to be blocked in netpoll. +// +// func netpollIsPollDescriptor(fd uintptr) bool +// Reports whether fd is a file descriptor used by the poller. + +// Error codes returned by runtime_pollReset and runtime_pollWait. +// These must match the values in internal/poll/fd_poll_runtime.go. +const ( + pollNoError = 0 // no error + pollErrClosing = 1 // descriptor is closed + pollErrTimeout = 2 // I/O timeout + pollErrNotPollable = 3 // general error polling descriptor +) + +// pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer +// goroutines respectively. The semaphore can be in the following states: +// pdReady - io readiness notification is pending; +// a goroutine consumes the notification by changing the state to nil. +// pdWait - a goroutine prepares to park on the semaphore, but not yet parked; +// the goroutine commits to park by changing the state to G pointer, +// or, alternatively, concurrent io notification changes the state to pdReady, +// or, alternatively, concurrent timeout/close changes the state to nil. +// G pointer - the goroutine is blocked on the semaphore; +// io notification or timeout/close changes the state to pdReady or nil respectively +// and unparks the goroutine. +// nil - none of the above. +const ( + pdReady uintptr = 1 + pdWait uintptr = 2 +) + +const pollBlockSize = 4 * 1024 + +// Network poller descriptor. +// +// No heap pointers. +// +//go:notinheap +type pollDesc struct { + link *pollDesc // in pollcache, protected by pollcache.lock + fd uintptr // constant for pollDesc usage lifetime + + // atomicInfo holds bits from closing, rd, and wd, + // which are only ever written while holding the lock, + // summarized for use by netpollcheckerr, + // which cannot acquire the lock. + // After writing these fields under lock in a way that + // might change the summary, code must call publishInfo + // before releasing the lock. + // Code that changes fields and then calls netpollunblock + // (while still holding the lock) must call publishInfo + // before calling netpollunblock, because publishInfo is what + // stops netpollblock from blocking anew + // (by changing the result of netpollcheckerr). + // atomicInfo also holds the eventErr bit, + // recording whether a poll event on the fd got an error; + // atomicInfo is the only source of truth for that bit. + atomicInfo atomic.Uint32 // atomic pollInfo + + // rg, wg are accessed atomically and hold g pointers. + // (Using atomic.Uintptr here is similar to using guintptr elsewhere.) + rg atomic.Uintptr // pdReady, pdWait, G waiting for read or nil + wg atomic.Uintptr // pdReady, pdWait, G waiting for write or nil + + lock mutex // protects the following fields + closing bool + user uint32 // user settable cookie + rseq uintptr // protects from stale read timers + rt timer // read deadline timer (set if rt.f != nil) + rd int64 // read deadline (a nanotime in the future, -1 when expired) + wseq uintptr // protects from stale write timers + wt timer // write deadline timer + wd int64 // write deadline (a nanotime in the future, -1 when expired) + self *pollDesc // storage for indirect interface. See (*pollDesc).makeArg. +} + +// pollInfo is the bits needed by netpollcheckerr, stored atomically, +// mostly duplicating state that is manipulated under lock in pollDesc. +// The one exception is the pollEventErr bit, which is maintained only +// in the pollInfo. 
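// The publishInfo and setEventErr methods below use a common lock-free
// idiom: rebuild every info bit from state held under pd.lock while
// preserving the single bit (pollEventErr) that a different writer owns,
// retrying with compare-and-swap. A reduced sketch of the same idiom; the
// names info, keepBit and publish are illustrative, not runtime APIs:
//
//	import "sync/atomic"
//
//	var info uint32
//
//	const keepBit = 1 << 1 // bit maintained elsewhere, cf. pollEventErr
//
//	func publish(newBits uint32) {
//		for {
//			old := atomic.LoadUint32(&info)
//			if atomic.CompareAndSwapUint32(&info, old, (old&keepBit)|newBits) {
//				return
//			}
//		}
//	}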
+type pollInfo uint32 + +const ( + pollClosing = 1 << iota + pollEventErr + pollExpiredReadDeadline + pollExpiredWriteDeadline +) + +func (i pollInfo) closing() bool { return i&pollClosing != 0 } +func (i pollInfo) eventErr() bool { return i&pollEventErr != 0 } +func (i pollInfo) expiredReadDeadline() bool { return i&pollExpiredReadDeadline != 0 } +func (i pollInfo) expiredWriteDeadline() bool { return i&pollExpiredWriteDeadline != 0 } + +// info returns the pollInfo corresponding to pd. +func (pd *pollDesc) info() pollInfo { + return pollInfo(pd.atomicInfo.Load()) +} + +// publishInfo updates pd.atomicInfo (returned by pd.info) +// using the other values in pd. +// It must be called while holding pd.lock, +// and it must be called after changing anything +// that might affect the info bits. +// In practice this means after changing closing +// or changing rd or wd from < 0 to >= 0. +func (pd *pollDesc) publishInfo() { + var info uint32 + if pd.closing { + info |= pollClosing + } + if pd.rd < 0 { + info |= pollExpiredReadDeadline + } + if pd.wd < 0 { + info |= pollExpiredWriteDeadline + } + + // Set all of x except the pollEventErr bit. + x := pd.atomicInfo.Load() + for !pd.atomicInfo.CompareAndSwap(x, (x&pollEventErr)|info) { + x = pd.atomicInfo.Load() + } +} + +// setEventErr sets the result of pd.info().eventErr() to b. +func (pd *pollDesc) setEventErr(b bool) { + x := pd.atomicInfo.Load() + for (x&pollEventErr != 0) != b && !pd.atomicInfo.CompareAndSwap(x, x^pollEventErr) { + x = pd.atomicInfo.Load() + } +} + +type pollCache struct { + lock mutex + first *pollDesc + // PollDesc objects must be type-stable, + // because we can get ready notification from epoll/kqueue + // after the descriptor is closed/reused. + // Stale notifications are detected using seq variable, + // seq is incremented when deadlines are changed or descriptor is reused. +} + +var ( + netpollInitLock mutex + netpollInited uint32 + + pollcache pollCache + netpollWaiters uint32 +) + +//go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit +func poll_runtime_pollServerInit() { + netpollGenericInit() +} + +func netpollGenericInit() { + if atomic.Load(&netpollInited) == 0 { + lockInit(&netpollInitLock, lockRankNetpollInit) + lock(&netpollInitLock) + if netpollInited == 0 { + netpollinit() + atomic.Store(&netpollInited, 1) + } + unlock(&netpollInitLock) + } +} + +func netpollinited() bool { + return atomic.Load(&netpollInited) != 0 +} + +//go:linkname poll_runtime_isPollServerDescriptor internal/poll.runtime_isPollServerDescriptor + +// poll_runtime_isPollServerDescriptor reports whether fd is a +// descriptor being used by netpoll. 
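// netpollGenericInit above hand-rolls double-checked locking (atomic load,
// lock, re-check) because the runtime cannot depend on package sync. In
// ordinary Go code the same once-only initialization is expressed with
// sync.Once; pollInit and doInit below are illustrative names only:
//
//	import "sync"
//
//	var pollInitOnce sync.Once
//
//	func doInit() { /* one-time setup, playing the role of netpollinit */ }
//
//	func pollInit() {
//		pollInitOnce.Do(doInit)
//	}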
+func poll_runtime_isPollServerDescriptor(fd uintptr) bool { + return netpollIsPollDescriptor(fd) +} + +//go:linkname poll_runtime_pollOpen internal/poll.runtime_pollOpen +func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) { + pd := pollcache.alloc() + lock(&pd.lock) + wg := pd.wg.Load() + if wg != 0 && wg != pdReady { + throw("runtime: blocked write on free polldesc") + } + rg := pd.rg.Load() + if rg != 0 && rg != pdReady { + throw("runtime: blocked read on free polldesc") + } + pd.fd = fd + pd.closing = false + pd.setEventErr(false) + pd.rseq++ + pd.rg.Store(0) + pd.rd = 0 + pd.wseq++ + pd.wg.Store(0) + pd.wd = 0 + pd.self = pd + pd.publishInfo() + unlock(&pd.lock) + + errno := netpollopen(fd, pd) + if errno != 0 { + pollcache.free(pd) + return nil, int(errno) + } + return pd, 0 +} + +//go:linkname poll_runtime_pollClose internal/poll.runtime_pollClose +func poll_runtime_pollClose(pd *pollDesc) { + if !pd.closing { + throw("runtime: close polldesc w/o unblock") + } + wg := pd.wg.Load() + if wg != 0 && wg != pdReady { + throw("runtime: blocked write on closing polldesc") + } + rg := pd.rg.Load() + if rg != 0 && rg != pdReady { + throw("runtime: blocked read on closing polldesc") + } + netpollclose(pd.fd) + pollcache.free(pd) +} + +func (c *pollCache) free(pd *pollDesc) { + lock(&c.lock) + pd.link = c.first + c.first = pd + unlock(&c.lock) +} + +// poll_runtime_pollReset, which is internal/poll.runtime_pollReset, +// prepares a descriptor for polling in mode, which is 'r' or 'w'. +// This returns an error code; the codes are defined above. +//go:linkname poll_runtime_pollReset internal/poll.runtime_pollReset +func poll_runtime_pollReset(pd *pollDesc, mode int) int { + errcode := netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode + } + if mode == 'r' { + pd.rg.Store(0) + } else if mode == 'w' { + pd.wg.Store(0) + } + return pollNoError +} + +// poll_runtime_pollWait, which is internal/poll.runtime_pollWait, +// waits for a descriptor to be ready for reading or writing, +// according to mode, which is 'r' or 'w'. +// This returns an error code; the codes are defined above. +//go:linkname poll_runtime_pollWait internal/poll.runtime_pollWait +func poll_runtime_pollWait(pd *pollDesc, mode int) int { + errcode := netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode + } + // As for now only Solaris, illumos, and AIX use level-triggered IO. + if GOOS == "solaris" || GOOS == "illumos" || GOOS == "aix" { + netpollarm(pd, mode) + } + for !netpollblock(pd, int32(mode), false) { + errcode = netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode + } + // Can happen if timeout has fired and unblocked us, + // but before we had a chance to run, timeout has been reset. + // Pretend it has not happened and retry. + } + return pollNoError +} + +//go:linkname poll_runtime_pollWaitCanceled internal/poll.runtime_pollWaitCanceled +func poll_runtime_pollWaitCanceled(pd *pollDesc, mode int) { + // This function is used only on windows after a failed attempt to cancel + // a pending async IO operation. Wait for ioready, ignore closing or timeouts. 
+ for !netpollblock(pd, int32(mode), true) { + } +} + +//go:linkname poll_runtime_pollSetDeadline internal/poll.runtime_pollSetDeadline +func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { + lock(&pd.lock) + if pd.closing { + unlock(&pd.lock) + return + } + rd0, wd0 := pd.rd, pd.wd + combo0 := rd0 > 0 && rd0 == wd0 + if d > 0 { + d += nanotime() + if d <= 0 { + // If the user has a deadline in the future, but the delay calculation + // overflows, then set the deadline to the maximum possible value. + d = 1<<63 - 1 + } + } + if mode == 'r' || mode == 'r'+'w' { + pd.rd = d + } + if mode == 'w' || mode == 'r'+'w' { + pd.wd = d + } + pd.publishInfo() + combo := pd.rd > 0 && pd.rd == pd.wd + rtf := netpollReadDeadline + if combo { + rtf = netpollDeadline + } + if pd.rt.f == nil { + if pd.rd > 0 { + pd.rt.f = rtf + // Copy current seq into the timer arg. + // Timer func will check the seq against current descriptor seq, + // if they differ the descriptor was reused or timers were reset. + pd.rt.arg = pd.makeArg() + pd.rt.seq = pd.rseq + resettimer(&pd.rt, pd.rd) + } + } else if pd.rd != rd0 || combo != combo0 { + pd.rseq++ // invalidate current timers + if pd.rd > 0 { + modtimer(&pd.rt, pd.rd, 0, rtf, pd.makeArg(), pd.rseq) + } else { + deltimer(&pd.rt) + pd.rt.f = nil + } + } + if pd.wt.f == nil { + if pd.wd > 0 && !combo { + pd.wt.f = netpollWriteDeadline + pd.wt.arg = pd.makeArg() + pd.wt.seq = pd.wseq + resettimer(&pd.wt, pd.wd) + } + } else if pd.wd != wd0 || combo != combo0 { + pd.wseq++ // invalidate current timers + if pd.wd > 0 && !combo { + modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd.makeArg(), pd.wseq) + } else { + deltimer(&pd.wt) + pd.wt.f = nil + } + } + // If we set the new deadline in the past, unblock currently pending IO if any. + // Note that pd.publishInfo has already been called, above, immediately after modifying rd and wd. + var rg, wg *g + if pd.rd < 0 { + rg = netpollunblock(pd, 'r', false) + } + if pd.wd < 0 { + wg = netpollunblock(pd, 'w', false) + } + unlock(&pd.lock) + if rg != nil { + netpollgoready(rg, 3) + } + if wg != nil { + netpollgoready(wg, 3) + } +} + +//go:linkname poll_runtime_pollUnblock internal/poll.runtime_pollUnblock +func poll_runtime_pollUnblock(pd *pollDesc) { + lock(&pd.lock) + if pd.closing { + throw("runtime: unblock on closing polldesc") + } + pd.closing = true + pd.rseq++ + pd.wseq++ + var rg, wg *g + pd.publishInfo() + rg = netpollunblock(pd, 'r', false) + wg = netpollunblock(pd, 'w', false) + if pd.rt.f != nil { + deltimer(&pd.rt) + pd.rt.f = nil + } + if pd.wt.f != nil { + deltimer(&pd.wt) + pd.wt.f = nil + } + unlock(&pd.lock) + if rg != nil { + netpollgoready(rg, 3) + } + if wg != nil { + netpollgoready(wg, 3) + } +} + +// netpollready is called by the platform-specific netpoll function. +// It declares that the fd associated with pd is ready for I/O. +// The toRun argument is used to build a list of goroutines to return +// from netpoll. The mode argument is 'r', 'w', or 'r'+'w' to indicate +// whether the fd is ready for reading or writing or both. +// +// This may run while the world is stopped, so write barriers are not allowed. 
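// poll_runtime_pollSetDeadline above turns a relative timeout into an
// absolute nanotime deadline and clamps on overflow (d += nanotime();
// if d <= 0 { d = 1<<63 - 1 }). The same saturating addition in isolation,
// where deadline is an illustrative helper rather than a runtime function:
//
//	import "math"
//
//	// deadline returns now+timeout (nanoseconds), saturating at the far
//	// future when the addition overflows int64.
//	func deadline(now, timeout int64) int64 {
//		d := now + timeout
//		if timeout > 0 && d <= 0 {
//			return math.MaxInt64 // 1<<63 - 1
//		}
//		return d
//	}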
+//go:nowritebarrier +func netpollready(toRun *gList, pd *pollDesc, mode int32) { + var rg, wg *g + if mode == 'r' || mode == 'r'+'w' { + rg = netpollunblock(pd, 'r', true) + } + if mode == 'w' || mode == 'r'+'w' { + wg = netpollunblock(pd, 'w', true) + } + if rg != nil { + toRun.push(rg) + } + if wg != nil { + toRun.push(wg) + } +} + +func netpollcheckerr(pd *pollDesc, mode int32) int { + info := pd.info() + if info.closing() { + return pollErrClosing + } + if (mode == 'r' && info.expiredReadDeadline()) || (mode == 'w' && info.expiredWriteDeadline()) { + return pollErrTimeout + } + // Report an event scanning error only on a read event. + // An error on a write event will be captured in a subsequent + // write call that is able to report a more specific error. + if mode == 'r' && info.eventErr() { + return pollErrNotPollable + } + return pollNoError +} + +func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool { + r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp))) + if r { + // Bump the count of goroutines waiting for the poller. + // The scheduler uses this to decide whether to block + // waiting for the poller if there is nothing else to do. + atomic.Xadd(&netpollWaiters, 1) + } + return r +} + +func netpollgoready(gp *g, traceskip int) { + atomic.Xadd(&netpollWaiters, -1) + goready(gp, traceskip+1) +} + +// returns true if IO is ready, or false if timedout or closed +// waitio - wait only for completed IO, ignore errors +// Concurrent calls to netpollblock in the same mode are forbidden, as pollDesc +// can hold only a single waiting goroutine for each mode. +func netpollblock(pd *pollDesc, mode int32, waitio bool) bool { + gpp := &pd.rg + if mode == 'w' { + gpp = &pd.wg + } + + // set the gpp semaphore to pdWait + for { + // Consume notification if already ready. + if gpp.CompareAndSwap(pdReady, 0) { + return true + } + if gpp.CompareAndSwap(0, pdWait) { + break + } + + // Double check that this isn't corrupt; otherwise we'd loop + // forever. + if v := gpp.Load(); v != pdReady && v != 0 { + throw("runtime: double wait") + } + } + + // need to recheck error states after setting gpp to pdWait + // this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl + // do the opposite: store to closing/rd/wd, publishInfo, load of rg/wg + if waitio || netpollcheckerr(pd, mode) == pollNoError { + gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5) + } + // be careful to not lose concurrent pdReady notification + old := gpp.Swap(0) + if old > pdWait { + throw("runtime: corrupted polldesc") + } + return old == pdReady +} + +func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g { + gpp := &pd.rg + if mode == 'w' { + gpp = &pd.wg + } + + for { + old := gpp.Load() + if old == pdReady { + return nil + } + if old == 0 && !ioready { + // Only set pdReady for ioready. runtime_pollWait + // will check for timeout/cancel before waiting. + return nil + } + var new uintptr + if ioready { + new = pdReady + } + if gpp.CompareAndSwap(old, new) { + if old == pdWait { + old = 0 + } + return (*g)(unsafe.Pointer(old)) + } + } +} + +func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) { + lock(&pd.lock) + // Seq arg is seq when the timer was set. + // If it's stale, ignore the timer event. + currentSeq := pd.rseq + if !read { + currentSeq = pd.wseq + } + if seq != currentSeq { + // The descriptor was reused or timers were reset. 
+ unlock(&pd.lock) + return + } + var rg *g + if read { + if pd.rd <= 0 || pd.rt.f == nil { + throw("runtime: inconsistent read deadline") + } + pd.rd = -1 + pd.publishInfo() + rg = netpollunblock(pd, 'r', false) + } + var wg *g + if write { + if pd.wd <= 0 || pd.wt.f == nil && !read { + throw("runtime: inconsistent write deadline") + } + pd.wd = -1 + pd.publishInfo() + wg = netpollunblock(pd, 'w', false) + } + unlock(&pd.lock) + if rg != nil { + netpollgoready(rg, 0) + } + if wg != nil { + netpollgoready(wg, 0) + } +} + +func netpollDeadline(arg any, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, true, true) +} + +func netpollReadDeadline(arg any, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, true, false) +} + +func netpollWriteDeadline(arg any, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, false, true) +} + +func (c *pollCache) alloc() *pollDesc { + lock(&c.lock) + if c.first == nil { + const pdSize = unsafe.Sizeof(pollDesc{}) + n := pollBlockSize / pdSize + if n == 0 { + n = 1 + } + // Must be in non-GC memory because can be referenced + // only from epoll/kqueue internals. + mem := persistentalloc(n*pdSize, 0, &memstats.other_sys) + for i := uintptr(0); i < n; i++ { + pd := (*pollDesc)(add(mem, i*pdSize)) + pd.link = c.first + c.first = pd + } + } + pd := c.first + c.first = pd.link + lockInit(&pd.lock, lockRankPollDesc) + unlock(&c.lock) + return pd +} + +// makeArg converts pd to an interface{}. +// makeArg does not do any allocation. Normally, such +// a conversion requires an allocation because pointers to +// go:notinheap types (which pollDesc is) must be stored +// in interfaces indirectly. See issue 42076. +func (pd *pollDesc) makeArg() (i any) { + x := (*eface)(unsafe.Pointer(&i)) + x._type = pdType + x.data = unsafe.Pointer(&pd.self) + return +} + +var ( + pdEface any = (*pollDesc)(nil) + pdType *_type = efaceOf(&pdEface)._type +) diff --git a/contrib/go/_std_1.18/src/runtime/netpoll_epoll.go b/contrib/go/_std_1.18/src/runtime/netpoll_epoll.go new file mode 100644 index 0000000000..b7d6199965 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/netpoll_epoll.go @@ -0,0 +1,176 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build linux + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +func epollcreate(size int32) int32 +func epollcreate1(flags int32) int32 + +//go:noescape +func epollctl(epfd, op, fd int32, ev *epollevent) int32 + +//go:noescape +func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32 +func closeonexec(fd int32) + +var ( + epfd int32 = -1 // epoll descriptor + + netpollBreakRd, netpollBreakWr uintptr // for netpollBreak + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak +) + +func netpollinit() { + epfd = epollcreate1(_EPOLL_CLOEXEC) + if epfd < 0 { + epfd = epollcreate(1024) + if epfd < 0 { + println("runtime: epollcreate failed with", -epfd) + throw("runtime: netpollinit failed") + } + closeonexec(epfd) + } + r, w, errno := nonblockingPipe() + if errno != 0 { + println("runtime: pipe failed with", -errno) + throw("runtime: pipe failed") + } + ev := epollevent{ + events: _EPOLLIN, + } + *(**uintptr)(unsafe.Pointer(&ev.data)) = &netpollBreakRd + errno = epollctl(epfd, _EPOLL_CTL_ADD, r, &ev) + if errno != 0 { + println("runtime: epollctl failed with", -errno) + throw("runtime: epollctl failed") + } + netpollBreakRd = uintptr(r) + netpollBreakWr = uintptr(w) +} + +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(epfd) || fd == netpollBreakRd || fd == netpollBreakWr +} + +func netpollopen(fd uintptr, pd *pollDesc) int32 { + var ev epollevent + ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLET + *(**pollDesc)(unsafe.Pointer(&ev.data)) = pd + return -epollctl(epfd, _EPOLL_CTL_ADD, int32(fd), &ev) +} + +func netpollclose(fd uintptr) int32 { + var ev epollevent + return -epollctl(epfd, _EPOLL_CTL_DEL, int32(fd), &ev) +} + +func netpollarm(pd *pollDesc, mode int) { + throw("runtime: unused") +} + +// netpollBreak interrupts an epollwait. +func netpollBreak() { + if atomic.Cas(&netpollWakeSig, 0, 1) { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 { + break + } + if n == -_EINTR { + continue + } + if n == -_EAGAIN { + return + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") + } + } +} + +// netpoll checks for ready network connections. +// Returns list of goroutines that become runnable. +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { + if epfd == -1 { + return gList{} + } + var waitms int32 + if delay < 0 { + waitms = -1 + } else if delay == 0 { + waitms = 0 + } else if delay < 1e6 { + waitms = 1 + } else if delay < 1e15 { + waitms = int32(delay / 1e6) + } else { + // An arbitrary cap on how long to wait for a timer. + // 1e9 ms == ~11.5 days. + waitms = 1e9 + } + var events [128]epollevent +retry: + n := epollwait(epfd, &events[0], int32(len(events)), waitms) + if n < 0 { + if n != -_EINTR { + println("runtime: epollwait on fd", epfd, "failed with", -n) + throw("runtime: netpoll failed") + } + // If a timed sleep was interrupted, just return to + // recalculate how long we should sleep now. 
+ if waitms > 0 { + return gList{} + } + goto retry + } + var toRun gList + for i := int32(0); i < n; i++ { + ev := &events[i] + if ev.events == 0 { + continue + } + + if *(**uintptr)(unsafe.Pointer(&ev.data)) == &netpollBreakRd { + if ev.events != _EPOLLIN { + println("runtime: netpoll: break fd ready for", ev.events) + throw("runtime: netpoll: break fd ready for something unexpected") + } + if delay != 0 { + // netpollBreak could be picked up by a + // nonblocking poll. Only read the byte + // if blocking. + var tmp [16]byte + read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + atomic.Store(&netpollWakeSig, 0) + } + continue + } + + var mode int32 + if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 { + mode += 'r' + } + if ev.events&(_EPOLLOUT|_EPOLLHUP|_EPOLLERR) != 0 { + mode += 'w' + } + if mode != 0 { + pd := *(**pollDesc)(unsafe.Pointer(&ev.data)) + pd.setEventErr(ev.events == _EPOLLERR) + netpollready(&toRun, pd, mode) + } + } + return toRun +} diff --git a/contrib/go/_std_1.18/src/runtime/netpoll_kqueue.go b/contrib/go/_std_1.18/src/runtime/netpoll_kqueue.go new file mode 100644 index 0000000000..1694753b6f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/netpoll_kqueue.go @@ -0,0 +1,187 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build darwin || dragonfly || freebsd || netbsd || openbsd + +package runtime + +// Integrated network poller (kqueue-based implementation). + +import ( + "runtime/internal/atomic" + "unsafe" +) + +var ( + kq int32 = -1 + + netpollBreakRd, netpollBreakWr uintptr // for netpollBreak + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak +) + +func netpollinit() { + kq = kqueue() + if kq < 0 { + println("runtime: kqueue failed with", -kq) + throw("runtime: netpollinit failed") + } + closeonexec(kq) + r, w, errno := nonblockingPipe() + if errno != 0 { + println("runtime: pipe failed with", -errno) + throw("runtime: pipe failed") + } + ev := keventt{ + filter: _EVFILT_READ, + flags: _EV_ADD, + } + *(*uintptr)(unsafe.Pointer(&ev.ident)) = uintptr(r) + n := kevent(kq, &ev, 1, nil, 0, nil) + if n < 0 { + println("runtime: kevent failed with", -n) + throw("runtime: kevent failed") + } + netpollBreakRd = uintptr(r) + netpollBreakWr = uintptr(w) +} + +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(kq) || fd == netpollBreakRd || fd == netpollBreakWr +} + +func netpollopen(fd uintptr, pd *pollDesc) int32 { + // Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR) + // for the whole fd lifetime. The notifications are automatically unregistered + // when fd is closed. + var ev [2]keventt + *(*uintptr)(unsafe.Pointer(&ev[0].ident)) = fd + ev[0].filter = _EVFILT_READ + ev[0].flags = _EV_ADD | _EV_CLEAR + ev[0].fflags = 0 + ev[0].data = 0 + ev[0].udata = (*byte)(unsafe.Pointer(pd)) + ev[1] = ev[0] + ev[1].filter = _EVFILT_WRITE + n := kevent(kq, &ev[0], 2, nil, 0, nil) + if n < 0 { + return -n + } + return 0 +} + +func netpollclose(fd uintptr) int32 { + // Don't need to unregister because calling close() + // on fd will remove any kevents that reference the descriptor. + return 0 +} + +func netpollarm(pd *pollDesc, mode int) { + throw("runtime: unused") +} + +// netpollBreak interrupts a kevent. 
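// Both netpollBreak implementations (the epoll one above and the kqueue one
// below) rely on the classic self-pipe wakeup: writing one byte to a
// non-blocking pipe unblocks the poller, and an atomic flag (netpollWakeSig)
// collapses concurrent wakeups into a single write. A reduced user-space
// sketch of the pattern; wakePending, wakeUp and the *os.File argument are
// illustrative, not runtime APIs:
//
//	import (
//		"os"
//		"sync/atomic"
//	)
//
//	var wakePending int32
//
//	func wakeUp(w *os.File) { // w: write end of a non-blocking pipe
//		if atomic.CompareAndSwapInt32(&wakePending, 0, 1) {
//			w.Write([]byte{0}) // the poller reads the byte and resets wakePending
//		}
//	}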
+func netpollBreak() { + if atomic.Cas(&netpollWakeSig, 0, 1) { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 || n == -_EAGAIN { + break + } + if n == -_EINTR { + continue + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") + } + } +} + +// netpoll checks for ready network connections. +// Returns list of goroutines that become runnable. +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { + if kq == -1 { + return gList{} + } + var tp *timespec + var ts timespec + if delay < 0 { + tp = nil + } else if delay == 0 { + tp = &ts + } else { + ts.setNsec(delay) + if ts.tv_sec > 1e6 { + // Darwin returns EINVAL if the sleep time is too long. + ts.tv_sec = 1e6 + } + tp = &ts + } + var events [64]keventt +retry: + n := kevent(kq, nil, 0, &events[0], int32(len(events)), tp) + if n < 0 { + if n != -_EINTR { + println("runtime: kevent on fd", kq, "failed with", -n) + throw("runtime: netpoll failed") + } + // If a timed sleep was interrupted, just return to + // recalculate how long we should sleep now. + if delay > 0 { + return gList{} + } + goto retry + } + var toRun gList + for i := 0; i < int(n); i++ { + ev := &events[i] + + if uintptr(ev.ident) == netpollBreakRd { + if ev.filter != _EVFILT_READ { + println("runtime: netpoll: break fd ready for", ev.filter) + throw("runtime: netpoll: break fd ready for something unexpected") + } + if delay != 0 { + // netpollBreak could be picked up by a + // nonblocking poll. Only read the byte + // if blocking. + var tmp [16]byte + read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + atomic.Store(&netpollWakeSig, 0) + } + continue + } + + var mode int32 + switch ev.filter { + case _EVFILT_READ: + mode += 'r' + + // On some systems when the read end of a pipe + // is closed the write end will not get a + // _EVFILT_WRITE event, but will get a + // _EVFILT_READ event with EV_EOF set. + // Note that setting 'w' here just means that we + // will wake up a goroutine waiting to write; + // that goroutine will try the write again, + // and the appropriate thing will happen based + // on what that write returns (success, EPIPE, EAGAIN). + if ev.flags&_EV_EOF != 0 { + mode += 'w' + } + case _EVFILT_WRITE: + mode += 'w' + } + if mode != 0 { + pd := (*pollDesc)(unsafe.Pointer(ev.udata)) + pd.setEventErr(ev.flags == _EV_ERROR) + netpollready(&toRun, pd, mode) + } + } + return toRun +} diff --git a/contrib/go/_std_1.18/src/runtime/os_darwin.go b/contrib/go/_std_1.18/src/runtime/os_darwin.go new file mode 100644 index 0000000000..9065b76375 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_darwin.go @@ -0,0 +1,470 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/abi" + "unsafe" +) + +type mOS struct { + initialized bool + mutex pthreadmutex + cond pthreadcond + count int +} + +func unimplemented(name string) { + println(name, "not implemented") + *(*int)(unsafe.Pointer(uintptr(1231))) = 1231 +} + +//go:nosplit +func semacreate(mp *m) { + if mp.initialized { + return + } + mp.initialized = true + if err := pthread_mutex_init(&mp.mutex, nil); err != 0 { + throw("pthread_mutex_init") + } + if err := pthread_cond_init(&mp.cond, nil); err != 0 { + throw("pthread_cond_init") + } +} + +//go:nosplit +func semasleep(ns int64) int32 { + var start int64 + if ns >= 0 { + start = nanotime() + } + mp := getg().m + pthread_mutex_lock(&mp.mutex) + for { + if mp.count > 0 { + mp.count-- + pthread_mutex_unlock(&mp.mutex) + return 0 + } + if ns >= 0 { + spent := nanotime() - start + if spent >= ns { + pthread_mutex_unlock(&mp.mutex) + return -1 + } + var t timespec + t.setNsec(ns - spent) + err := pthread_cond_timedwait_relative_np(&mp.cond, &mp.mutex, &t) + if err == _ETIMEDOUT { + pthread_mutex_unlock(&mp.mutex) + return -1 + } + } else { + pthread_cond_wait(&mp.cond, &mp.mutex) + } + } +} + +//go:nosplit +func semawakeup(mp *m) { + pthread_mutex_lock(&mp.mutex) + mp.count++ + if mp.count > 0 { + pthread_cond_signal(&mp.cond) + } + pthread_mutex_unlock(&mp.mutex) +} + +// The read and write file descriptors used by the sigNote functions. +var sigNoteRead, sigNoteWrite int32 + +// sigNoteSetup initializes an async-signal-safe note. +// +// The current implementation of notes on Darwin is not async-signal-safe, +// because the functions pthread_mutex_lock, pthread_cond_signal, and +// pthread_mutex_unlock, called by semawakeup, are not async-signal-safe. +// There is only one case where we need to wake up a note from a signal +// handler: the sigsend function. The signal handler code does not require +// all the features of notes: it does not need to do a timed wait. +// This is a separate implementation of notes, based on a pipe, that does +// not support timed waits but is async-signal-safe. +func sigNoteSetup(*note) { + if sigNoteRead != 0 || sigNoteWrite != 0 { + throw("duplicate sigNoteSetup") + } + var errno int32 + sigNoteRead, sigNoteWrite, errno = pipe() + if errno != 0 { + throw("pipe failed") + } + closeonexec(sigNoteRead) + closeonexec(sigNoteWrite) + + // Make the write end of the pipe non-blocking, so that if the pipe + // buffer is somehow full we will not block in the signal handler. + // Leave the read end of the pipe blocking so that we will block + // in sigNoteSleep. + setNonblock(sigNoteWrite) +} + +// sigNoteWakeup wakes up a thread sleeping on a note created by sigNoteSetup. +func sigNoteWakeup(*note) { + var b byte + write(uintptr(sigNoteWrite), unsafe.Pointer(&b), 1) +} + +// sigNoteSleep waits for a note created by sigNoteSetup to be woken. +func sigNoteSleep(*note) { + for { + var b byte + entersyscallblock() + n := read(sigNoteRead, unsafe.Pointer(&b), 1) + exitsyscall() + if n != -_EINTR { + return + } + } +} + +// BSD interface for threading. +func osinit() { + // pthread_create delayed until end of goenvs so that we + // can look at the environment first. 
+ + ncpu = getncpu() + physPageSize = getPageSize() +} + +func sysctlbynameInt32(name []byte) (int32, int32) { + out := int32(0) + nout := unsafe.Sizeof(out) + ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + return ret, out +} + +//go:linkname internal_cpu_getsysctlbyname internal/cpu.getsysctlbyname +func internal_cpu_getsysctlbyname(name []byte) (int32, int32) { + return sysctlbynameInt32(name) +} + +const ( + _CTL_HW = 6 + _HW_NCPU = 3 + _HW_PAGESIZE = 7 +) + +func getncpu() int32 { + // Use sysctl to fetch hw.ncpu. + mib := [2]uint32{_CTL_HW, _HW_NCPU} + out := uint32(0) + nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 && int32(out) > 0 { + return int32(out) + } + return 1 +} + +func getPageSize() uintptr { + // Use sysctl to fetch hw.pagesize. + mib := [2]uint32{_CTL_HW, _HW_PAGESIZE} + out := uint32(0) + nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 && int32(out) > 0 { + return uintptr(out) + } + return 0 +} + +var urandom_dev = []byte("/dev/urandom\x00") + +//go:nosplit +func getRandomData(r []byte) { + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + n := read(fd, unsafe.Pointer(&r[0]), int32(len(r))) + closefd(fd) + extendRandom(r, int(n)) +} + +func goenvs() { + goenvs_unix() +} + +// May run with m.p==nil, so write barriers are not allowed. +//go:nowritebarrierrec +func newosproc(mp *m) { + stk := unsafe.Pointer(mp.g0.stack.hi) + if false { + print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n") + } + + // Initialize an attribute object. + var attr pthreadattr + var err int32 + err = pthread_attr_init(&attr) + if err != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + + // Find out OS stack size for our own stack guard. + var stacksize uintptr + if pthread_attr_getstacksize(&attr, &stacksize) != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + mp.g0.stack.hi = stacksize // for mstart + + // Tell the pthread library we won't join with this thread. + if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + + // Finally, create the thread. It starts at mstart_stub, which does some low-level + // setup and then calls mstart. + var oset sigset + sigprocmask(_SIG_SETMASK, &sigset_all, &oset) + err = pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp)) + sigprocmask(_SIG_SETMASK, &oset, nil) + if err != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } +} + +// glue code to call mstart from pthread_create. +func mstart_stub() + +// newosproc0 is a version of newosproc that can be called before the runtime +// is initialized. +// +// This function is not safe to use after initialization as it does not pass an M as fnarg. +// +//go:nosplit +func newosproc0(stacksize uintptr, fn uintptr) { + // Initialize an attribute object. + var attr pthreadattr + var err int32 + err = pthread_attr_init(&attr) + if err != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + + // The caller passes in a suggested stack size, + // from when we allocated the stack and thread ourselves, + // without libpthread. 
Now that we're using libpthread, + // we use the OS default stack size instead of the suggestion. + // Find out that stack size for our own stack guard. + if pthread_attr_getstacksize(&attr, &stacksize) != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + g0.stack.hi = stacksize // for mstart + memstats.stacks_sys.add(int64(stacksize)) + + // Tell the pthread library we won't join with this thread. + if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } + + // Finally, create the thread. It starts at mstart_stub, which does some low-level + // setup and then calls mstart. + var oset sigset + sigprocmask(_SIG_SETMASK, &sigset_all, &oset) + err = pthread_create(&attr, fn, nil) + sigprocmask(_SIG_SETMASK, &oset, nil) + if err != 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } +} + +var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n") +var failthreadcreate = []byte("runtime: failed to create new OS thread\n") + +// Called to do synchronous initialization of Go code built with +// -buildmode=c-archive or -buildmode=c-shared. +// None of the Go runtime is initialized. +//go:nosplit +//go:nowritebarrierrec +func libpreinit() { + initsig(true) +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +func mpreinit(mp *m) { + mp.gsignal = malg(32 * 1024) // OS X wants >= 8K + mp.gsignal.m = mp + if GOOS == "darwin" && GOARCH == "arm64" { + // mlock the signal stack to work around a kernel bug where it may + // SIGILL when the signal stack is not faulted in while a signal + // arrives. See issue 42774. + mlock(unsafe.Pointer(mp.gsignal.stack.hi-physPageSize), physPageSize) + } +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, cannot allocate memory. +func minit() { + // iOS does not support alternate signal stack. + // The signal handler handles it directly. + if !(GOOS == "ios" && GOARCH == "arm64") { + minitSignalStack() + } + minitSignalMask() + getg().m.procid = uint64(pthread_self()) +} + +// Called from dropm to undo the effect of an minit. +//go:nosplit +func unminit() { + // iOS does not support alternate signal stack. + // See minit. + if !(GOOS == "ios" && GOARCH == "arm64") { + unminitSignals() + } +} + +// Called from exitm, but not from drop, to undo the effect of thread-owned +// resources in minit, semacreate, or elsewhere. Do not take locks after calling this. 
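
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): mpreinit above mlocks the top page of the signal stack on darwin/arm64
// to work around issue 42774. A user-space version of the same primitive using
// the syscall package; the buffer is hypothetical and merely page-sized.
package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	pageSize := os.Getpagesize()
	buf := make([]byte, pageSize)

	// Pin one page so it cannot be paged out, analogous to
	// mlock(stack.hi-physPageSize, physPageSize) in mpreinit.
	if err := syscall.Mlock(buf); err != nil {
		fmt.Println("mlock failed:", err)
		return
	}
	defer syscall.Munlock(buf)
	fmt.Println("locked", pageSize, "bytes")
}
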
+func mdestroy(mp *m) { +} + +//go:nosplit +func osyield_no_g() { + usleep_no_g(1) +} + +//go:nosplit +func osyield() { + usleep(1) +} + +const ( + _NSIG = 32 + _SI_USER = 0 /* empirically true, but not what headers say */ + _SIG_BLOCK = 1 + _SIG_UNBLOCK = 2 + _SIG_SETMASK = 3 + _SS_DISABLE = 4 +) + +//extern SigTabTT runtime·sigtab[]; + +type sigset uint32 + +var sigset_all = ^sigset(0) + +//go:nosplit +//go:nowritebarrierrec +func setsig(i uint32, fn uintptr) { + var sa usigactiont + sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTART + sa.sa_mask = ^uint32(0) + if fn == abi.FuncPCABIInternal(sighandler) { // abi.FuncPCABIInternal(sighandler) matches the callers in signal_unix.go + if iscgo { + fn = abi.FuncPCABI0(cgoSigtramp) + } else { + fn = abi.FuncPCABI0(sigtramp) + } + } + *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn + sigaction(i, &sa, nil) +} + +// sigtramp is the callback from libc when a signal is received. +// It is called with the C calling convention. +func sigtramp() +func cgoSigtramp() + +//go:nosplit +//go:nowritebarrierrec +func setsigstack(i uint32) { + var osa usigactiont + sigaction(i, nil, &osa) + handler := *(*uintptr)(unsafe.Pointer(&osa.__sigaction_u)) + if osa.sa_flags&_SA_ONSTACK != 0 { + return + } + var sa usigactiont + *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = handler + sa.sa_mask = osa.sa_mask + sa.sa_flags = osa.sa_flags | _SA_ONSTACK + sigaction(i, &sa, nil) +} + +//go:nosplit +//go:nowritebarrierrec +func getsig(i uint32) uintptr { + var sa usigactiont + sigaction(i, nil, &sa) + return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) +} + +// setSignaltstackSP sets the ss_sp field of a stackt. +//go:nosplit +func setSignalstackSP(s *stackt, sp uintptr) { + *(*uintptr)(unsafe.Pointer(&s.ss_sp)) = sp +} + +//go:nosplit +//go:nowritebarrierrec +func sigaddset(mask *sigset, i int) { + *mask |= 1 << (uint32(i) - 1) +} + +func sigdelset(mask *sigset, i int) { + *mask &^= 1 << (uint32(i) - 1) +} + +func setProcessCPUProfiler(hz int32) { + setProcessCPUProfilerTimer(hz) +} + +func setThreadCPUProfiler(hz int32) { + setThreadCPUProfilerHz(hz) +} + +//go:nosplit +func validSIGPROF(mp *m, c *sigctxt) bool { + return true +} + +//go:linkname executablePath os.executablePath +var executablePath string + +func sysargs(argc int32, argv **byte) { + // skip over argv, envv and the first string will be the path + n := argc + 1 + for argv_index(argv, n) != nil { + n++ + } + executablePath = gostringnocopy(argv_index(argv, n+1)) + + // strip "executable_path=" prefix if available, it's added after OS X 10.11. + const prefix = "executable_path=" + if len(executablePath) > len(prefix) && executablePath[:len(prefix)] == prefix { + executablePath = executablePath[len(prefix):] + } +} + +func signalM(mp *m, sig int) { + pthread_kill(pthread(mp.procid), uint32(sig)) +} + +// sigPerThreadSyscall is only used on linux, so we assign a bogus signal +// number. +const sigPerThreadSyscall = 1 << 31 + +//go:nosplit +func runPerThreadSyscall() { + throw("runPerThreadSyscall only valid on linux") +} diff --git a/contrib/go/_std_1.18/src/runtime/os_linux.go b/contrib/go/_std_1.18/src/runtime/os_linux.go new file mode 100644 index 0000000000..eb8aa076e9 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_linux.go @@ -0,0 +1,878 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/syscall" + "unsafe" +) + +// sigPerThreadSyscall is the same signal (SIGSETXID) used by glibc for +// per-thread syscalls on Linux. We use it for the same purpose in non-cgo +// binaries. +const sigPerThreadSyscall = _SIGRTMIN + 1 + +type mOS struct { + // profileTimer holds the ID of the POSIX interval timer for profiling CPU + // usage on this thread. + // + // It is valid when the profileTimerValid field is non-zero. A thread + // creates and manages its own timer, and these fields are read and written + // only by this thread. But because some of the reads on profileTimerValid + // are in signal handling code, access to that field uses atomic operations. + profileTimer int32 + profileTimerValid uint32 + + // needPerThreadSyscall indicates that a per-thread syscall is required + // for doAllThreadsSyscall. + needPerThreadSyscall atomic.Uint8 +} + +//go:noescape +func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32 + +// Linux futex. +// +// futexsleep(uint32 *addr, uint32 val) +// futexwakeup(uint32 *addr) +// +// Futexsleep atomically checks if *addr == val and if so, sleeps on addr. +// Futexwakeup wakes up threads sleeping on addr. +// Futexsleep is allowed to wake up spuriously. + +const ( + _FUTEX_PRIVATE_FLAG = 128 + _FUTEX_WAIT_PRIVATE = 0 | _FUTEX_PRIVATE_FLAG + _FUTEX_WAKE_PRIVATE = 1 | _FUTEX_PRIVATE_FLAG +) + +// Atomically, +// if(*addr == val) sleep +// Might be woken up spuriously; that's allowed. +// Don't sleep longer than ns; ns < 0 means forever. +//go:nosplit +func futexsleep(addr *uint32, val uint32, ns int64) { + // Some Linux kernels have a bug where futex of + // FUTEX_WAIT returns an internal error code + // as an errno. Libpthread ignores the return value + // here, and so can we: as it says a few lines up, + // spurious wakeups are allowed. + if ns < 0 { + futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, nil, nil, 0) + return + } + + var ts timespec + ts.setNsec(ns) + futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, unsafe.Pointer(&ts), nil, 0) +} + +// If any procs are sleeping on addr, wake up at most cnt. +//go:nosplit +func futexwakeup(addr *uint32, cnt uint32) { + ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE_PRIVATE, cnt, nil, nil, 0) + if ret >= 0 { + return + } + + // I don't know that futex wakeup can return + // EAGAIN or EINTR, but if it does, it would be + // safe to loop and call futex again. + systemstack(func() { + print("futexwakeup addr=", addr, " returned ", ret, "\n") + }) + + *(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006 +} + +func getproccount() int32 { + // This buffer is huge (8 kB) but we are on the system stack + // and there should be plenty of space (64 kB). + // Also this is a leaf, so we're not holding up the memory for long. + // See golang.org/issue/11823. + // The suggested behavior here is to keep trying with ever-larger + // buffers, but we don't have a dynamic memory allocator at the + // moment, so that's a bit tricky and seems like overkill. + const maxCPUs = 64 * 1024 + var buf [maxCPUs / 8]byte + r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0]) + if r < 0 { + return 1 + } + n := int32(0) + for _, v := range buf[:r] { + for v != 0 { + n += int32(v & 1) + v >>= 1 + } + } + if n == 0 { + n = 1 + } + return n +} + +// Clone, the Linux rfork. 
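
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): getproccount above counts usable CPUs by popcounting the affinity mask
// returned by sched_getaffinity. The same bit counting over a hypothetical mask,
// using math/bits instead of the hand-written shift-and-add loop.
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Pretend sched_getaffinity filled this buffer: CPUs 0-3 and 6 allowed.
	mask := []byte{0b0100_1111}

	n := 0
	for _, v := range mask {
		n += bits.OnesCount8(v) // equivalent to getproccount's inner loop
	}
	if n == 0 {
		n = 1 // getproccount never reports zero CPUs
	}
	fmt.Println("usable CPUs:", n)
}
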
+const ( + _CLONE_VM = 0x100 + _CLONE_FS = 0x200 + _CLONE_FILES = 0x400 + _CLONE_SIGHAND = 0x800 + _CLONE_PTRACE = 0x2000 + _CLONE_VFORK = 0x4000 + _CLONE_PARENT = 0x8000 + _CLONE_THREAD = 0x10000 + _CLONE_NEWNS = 0x20000 + _CLONE_SYSVSEM = 0x40000 + _CLONE_SETTLS = 0x80000 + _CLONE_PARENT_SETTID = 0x100000 + _CLONE_CHILD_CLEARTID = 0x200000 + _CLONE_UNTRACED = 0x800000 + _CLONE_CHILD_SETTID = 0x1000000 + _CLONE_STOPPED = 0x2000000 + _CLONE_NEWUTS = 0x4000000 + _CLONE_NEWIPC = 0x8000000 + + // As of QEMU 2.8.0 (5ea2fc84d), user emulation requires all six of these + // flags to be set when creating a thread; attempts to share the other + // five but leave SYSVSEM unshared will fail with -EINVAL. + // + // In non-QEMU environments CLONE_SYSVSEM is inconsequential as we do not + // use System V semaphores. + + cloneFlags = _CLONE_VM | /* share memory */ + _CLONE_FS | /* share cwd, etc */ + _CLONE_FILES | /* share fd table */ + _CLONE_SIGHAND | /* share sig handler table */ + _CLONE_SYSVSEM | /* share SysV semaphore undo lists (see issue #20763) */ + _CLONE_THREAD /* revisit - okay for now */ +) + +//go:noescape +func clone(flags int32, stk, mp, gp, fn unsafe.Pointer) int32 + +// May run with m.p==nil, so write barriers are not allowed. +//go:nowritebarrier +func newosproc(mp *m) { + stk := unsafe.Pointer(mp.g0.stack.hi) + /* + * note: strace gets confused if we use CLONE_PTRACE here. + */ + if false { + print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", abi.FuncPCABI0(clone), " id=", mp.id, " ostk=", &mp, "\n") + } + + // Disable signals during clone, so that the new thread starts + // with signals disabled. It will enable them in minit. + var oset sigset + sigprocmask(_SIG_SETMASK, &sigset_all, &oset) + ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(abi.FuncPCABI0(mstart))) + sigprocmask(_SIG_SETMASK, &oset, nil) + + if ret < 0 { + print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n") + if ret == -_EAGAIN { + println("runtime: may need to increase max user processes (ulimit -u)") + } + throw("newosproc") + } +} + +// Version of newosproc that doesn't require a valid G. 
+//go:nosplit +func newosproc0(stacksize uintptr, fn unsafe.Pointer) { + stack := sysAlloc(stacksize, &memstats.stacks_sys) + if stack == nil { + write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack))) + exit(1) + } + ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn) + if ret < 0 { + write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate))) + exit(1) + } +} + +var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n") +var failthreadcreate = []byte("runtime: failed to create new OS thread\n") + +const ( + _AT_NULL = 0 // End of vector + _AT_PAGESZ = 6 // System physical page size + _AT_HWCAP = 16 // hardware capability bit vector + _AT_RANDOM = 25 // introduced in 2.6.29 + _AT_HWCAP2 = 26 // hardware capability bit vector 2 +) + +var procAuxv = []byte("/proc/self/auxv\x00") + +var addrspace_vec [1]byte + +func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 + +func sysargs(argc int32, argv **byte) { + n := argc + 1 + + // skip over argv, envp to get to auxv + for argv_index(argv, n) != nil { + n++ + } + + // skip NULL separator + n++ + + // now argv+n is auxv + auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*goarch.PtrSize)) + if sysauxv(auxv[:]) != 0 { + return + } + // In some situations we don't get a loader-provided + // auxv, such as when loaded as a library on Android. + // Fall back to /proc/self/auxv. + fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) + if fd < 0 { + // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to + // try using mincore to detect the physical page size. + // mincore should return EINVAL when address is not a multiple of system page size. + const size = 256 << 10 // size of memory region to allocate + p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + return + } + var n uintptr + for n = 4 << 10; n < size; n <<= 1 { + err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) + if err == 0 { + physPageSize = n + break + } + } + if physPageSize == 0 { + physPageSize = size + } + munmap(p, size) + return + } + var buf [128]uintptr + n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) + closefd(fd) + if n < 0 { + return + } + // Make sure buf is terminated, even if we didn't read + // the whole file. + buf[len(buf)-2] = _AT_NULL + sysauxv(buf[:]) +} + +// startupRandomData holds random bytes initialized at startup. These come from +// the ELF AT_RANDOM auxiliary vector. +var startupRandomData []byte + +func sysauxv(auxv []uintptr) int { + var i int + for ; auxv[i] != _AT_NULL; i += 2 { + tag, val := auxv[i], auxv[i+1] + switch tag { + case _AT_RANDOM: + // The kernel provides a pointer to 16-bytes + // worth of random data. 
+ startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:] + + case _AT_PAGESZ: + physPageSize = val + } + + archauxv(tag, val) + vdsoauxv(tag, val) + } + return i / 2 +} + +var sysTHPSizePath = []byte("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size\x00") + +func getHugePageSize() uintptr { + var numbuf [20]byte + fd := open(&sysTHPSizePath[0], 0 /* O_RDONLY */, 0) + if fd < 0 { + return 0 + } + ptr := noescape(unsafe.Pointer(&numbuf[0])) + n := read(fd, ptr, int32(len(numbuf))) + closefd(fd) + if n <= 0 { + return 0 + } + n-- // remove trailing newline + v, ok := atoi(slicebytetostringtmp((*byte)(ptr), int(n))) + if !ok || v < 0 { + v = 0 + } + if v&(v-1) != 0 { + // v is not a power of 2 + return 0 + } + return uintptr(v) +} + +func osinit() { + ncpu = getproccount() + physHugePageSize = getHugePageSize() + if iscgo { + // #42494 glibc and musl reserve some signals for + // internal use and require they not be blocked by + // the rest of a normal C runtime. When the go runtime + // blocks...unblocks signals, temporarily, the blocked + // interval of time is generally very short. As such, + // these expectations of *libc code are mostly met by + // the combined go+cgo system of threads. However, + // when go causes a thread to exit, via a return from + // mstart(), the combined runtime can deadlock if + // these signals are blocked. Thus, don't block these + // signals when exiting threads. + // - glibc: SIGCANCEL (32), SIGSETXID (33) + // - musl: SIGTIMER (32), SIGCANCEL (33), SIGSYNCCALL (34) + sigdelset(&sigsetAllExiting, 32) + sigdelset(&sigsetAllExiting, 33) + sigdelset(&sigsetAllExiting, 34) + } + osArchInit() +} + +var urandom_dev = []byte("/dev/urandom\x00") + +func getRandomData(r []byte) { + if startupRandomData != nil { + n := copy(r, startupRandomData) + extendRandom(r, n) + return + } + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + n := read(fd, unsafe.Pointer(&r[0]), int32(len(r))) + closefd(fd) + extendRandom(r, int(n)) +} + +func goenvs() { + goenvs_unix() +} + +// Called to do synchronous initialization of Go code built with +// -buildmode=c-archive or -buildmode=c-shared. +// None of the Go runtime is initialized. +//go:nosplit +//go:nowritebarrierrec +func libpreinit() { + initsig(true) +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +func mpreinit(mp *m) { + mp.gsignal = malg(32 * 1024) // Linux wants >= 2K + mp.gsignal.m = mp +} + +func gettid() uint32 + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, cannot allocate memory. +func minit() { + minitSignals() + + // Cgo-created threads and the bootstrap m are missing a + // procid. We need this for asynchronous preemption and it's + // useful in debuggers. + getg().m.procid = uint64(gettid()) +} + +// Called from dropm to undo the effect of an minit. +//go:nosplit +func unminit() { + unminitSignals() +} + +// Called from exitm, but not from drop, to undo the effect of thread-owned +// resources in minit, semacreate, or elsewhere. Do not take locks after calling this. 
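
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): sysauxv above walks the auxiliary vector as AT_NULL-terminated
// (tag, value) pairs. The same walk over a hand-built slice; the tag constants
// mirror the ones declared above, and the fake auxv values are made up.
package main

import "fmt"

const (
	_AT_NULL   = 0 // End of vector
	_AT_PAGESZ = 6 // System physical page size
)

func main() {
	// A fake auxv: one AT_PAGESZ entry, then the terminator.
	auxv := []uintptr{_AT_PAGESZ, 4096, _AT_NULL, 0}

	var physPageSize uintptr
	for i := 0; auxv[i] != _AT_NULL; i += 2 {
		tag, val := auxv[i], auxv[i+1]
		switch tag {
		case _AT_PAGESZ:
			physPageSize = val
		}
	}
	fmt.Println("page size from auxv:", physPageSize)
}
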
+func mdestroy(mp *m) { +} + +//#ifdef GOARCH_386 +//#define sa_handler k_sa_handler +//#endif + +func sigreturn() +func sigtramp() // Called via C ABI +func cgoSigtramp() + +//go:noescape +func sigaltstack(new, old *stackt) + +//go:noescape +func setitimer(mode int32, new, old *itimerval) + +//go:noescape +func timer_create(clockid int32, sevp *sigevent, timerid *int32) int32 + +//go:noescape +func timer_settime(timerid int32, flags int32, new, old *itimerspec) int32 + +//go:noescape +func timer_delete(timerid int32) int32 + +//go:noescape +func rtsigprocmask(how int32, new, old *sigset, size int32) + +//go:nosplit +//go:nowritebarrierrec +func sigprocmask(how int32, new, old *sigset) { + rtsigprocmask(how, new, old, int32(unsafe.Sizeof(*new))) +} + +func raise(sig uint32) +func raiseproc(sig uint32) + +//go:noescape +func sched_getaffinity(pid, len uintptr, buf *byte) int32 +func osyield() + +//go:nosplit +func osyield_no_g() { + osyield() +} + +func pipe() (r, w int32, errno int32) +func pipe2(flags int32) (r, w int32, errno int32) +func setNonblock(fd int32) + +const ( + _si_max_size = 128 + _sigev_max_size = 64 +) + +//go:nosplit +//go:nowritebarrierrec +func setsig(i uint32, fn uintptr) { + var sa sigactiont + sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER | _SA_RESTART + sigfillset(&sa.sa_mask) + // Although Linux manpage says "sa_restorer element is obsolete and + // should not be used". x86_64 kernel requires it. Only use it on + // x86. + if GOARCH == "386" || GOARCH == "amd64" { + sa.sa_restorer = abi.FuncPCABI0(sigreturn) + } + if fn == abi.FuncPCABIInternal(sighandler) { // abi.FuncPCABIInternal(sighandler) matches the callers in signal_unix.go + if iscgo { + fn = abi.FuncPCABI0(cgoSigtramp) + } else { + fn = abi.FuncPCABI0(sigtramp) + } + } + sa.sa_handler = fn + sigaction(i, &sa, nil) +} + +//go:nosplit +//go:nowritebarrierrec +func setsigstack(i uint32) { + var sa sigactiont + sigaction(i, nil, &sa) + if sa.sa_flags&_SA_ONSTACK != 0 { + return + } + sa.sa_flags |= _SA_ONSTACK + sigaction(i, &sa, nil) +} + +//go:nosplit +//go:nowritebarrierrec +func getsig(i uint32) uintptr { + var sa sigactiont + sigaction(i, nil, &sa) + return sa.sa_handler +} + +// setSignaltstackSP sets the ss_sp field of a stackt. +//go:nosplit +func setSignalstackSP(s *stackt, sp uintptr) { + *(*uintptr)(unsafe.Pointer(&s.ss_sp)) = sp +} + +//go:nosplit +func (c *sigctxt) fixsigcode(sig uint32) { +} + +// sysSigaction calls the rt_sigaction system call. +//go:nosplit +func sysSigaction(sig uint32, new, old *sigactiont) { + if rt_sigaction(uintptr(sig), new, old, unsafe.Sizeof(sigactiont{}.sa_mask)) != 0 { + // Workaround for bugs in QEMU user mode emulation. + // + // QEMU turns calls to the sigaction system call into + // calls to the C library sigaction call; the C + // library call rejects attempts to call sigaction for + // SIGCANCEL (32) or SIGSETXID (33). + // + // QEMU rejects calling sigaction on SIGRTMAX (64). + // + // Just ignore the error in these case. There isn't + // anything we can do about it anyhow. + if sig != 32 && sig != 33 && sig != 64 { + // Use system stack to avoid split stack overflow on ppc64/ppc64le. + systemstack(func() { + throw("sigaction failed") + }) + } + } +} + +// rt_sigaction is implemented in assembly. +//go:noescape +func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32 + +func getpid() int +func tgkill(tgid, tid, sig int) + +// signalM sends a signal to mp. 
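
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): sigprocmask above passes a *sigset to rt_sigprocmask; on generic Linux
// that mask is two 32-bit words (see os_linux_generic.go later in this diff),
// with signal i stored in word (i-1)/32 at bit (i-1)&31. The same indexing as
// standalone add/delete/test helpers; sigismember is a hypothetical name.
package main

import "fmt"

type sigset [2]uint32

func sigaddset(mask *sigset, i int) { (*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31) }
func sigdelset(mask *sigset, i int) { (*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31) }
func sigismember(mask *sigset, i int) bool {
	return (*mask)[(i-1)/32]&(1<<((uint32(i)-1)&31)) != 0
}

func main() {
	var m sigset
	sigaddset(&m, 34) // a real-time signal: word 1, bit 1
	fmt.Println(sigismember(&m, 34), sigismember(&m, 2)) // true false
	sigdelset(&m, 34)
	fmt.Println(sigismember(&m, 34)) // false
}
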
+func signalM(mp *m, sig int) { + tgkill(getpid(), int(mp.procid), sig) +} + +// go118UseTimerCreateProfiler enables the per-thread CPU profiler. +const go118UseTimerCreateProfiler = true + +// validSIGPROF compares this signal delivery's code against the signal sources +// that the profiler uses, returning whether the delivery should be processed. +// To be processed, a signal delivery from a known profiling mechanism should +// correspond to the best profiling mechanism available to this thread. Signals +// from other sources are always considered valid. +// +//go:nosplit +func validSIGPROF(mp *m, c *sigctxt) bool { + code := int32(c.sigcode()) + setitimer := code == _SI_KERNEL + timer_create := code == _SI_TIMER + + if !(setitimer || timer_create) { + // The signal doesn't correspond to a profiling mechanism that the + // runtime enables itself. There's no reason to process it, but there's + // no reason to ignore it either. + return true + } + + if mp == nil { + // Since we don't have an M, we can't check if there's an active + // per-thread timer for this thread. We don't know how long this thread + // has been around, and if it happened to interact with the Go scheduler + // at a time when profiling was active (causing it to have a per-thread + // timer). But it may have never interacted with the Go scheduler, or + // never while profiling was active. To avoid double-counting, process + // only signals from setitimer. + // + // When a custom cgo traceback function has been registered (on + // platforms that support runtime.SetCgoTraceback), SIGPROF signals + // delivered to a thread that cannot find a matching M do this check in + // the assembly implementations of runtime.cgoSigtramp. + return setitimer + } + + // Having an M means the thread interacts with the Go scheduler, and we can + // check whether there's an active per-thread timer for this thread. + if atomic.Load(&mp.profileTimerValid) != 0 { + // If this M has its own per-thread CPU profiling interval timer, we + // should track the SIGPROF signals that come from that timer (for + // accurate reporting of its CPU usage; see issue 35057) and ignore any + // that it gets from the process-wide setitimer (to not over-count its + // CPU consumption). + return timer_create + } + + // No active per-thread timer means the only valid profiler is setitimer. + return setitimer +} + +func setProcessCPUProfiler(hz int32) { + setProcessCPUProfilerTimer(hz) +} + +func setThreadCPUProfiler(hz int32) { + mp := getg().m + mp.profilehz = hz + + if !go118UseTimerCreateProfiler { + return + } + + // destroy any active timer + if atomic.Load(&mp.profileTimerValid) != 0 { + timerid := mp.profileTimer + atomic.Store(&mp.profileTimerValid, 0) + mp.profileTimer = 0 + + ret := timer_delete(timerid) + if ret != 0 { + print("runtime: failed to disable profiling timer; timer_delete(", timerid, ") errno=", -ret, "\n") + throw("timer_delete") + } + } + + if hz == 0 { + // If the goal was to disable profiling for this thread, then the job's done. + return + } + + // The period of the timer should be 1/Hz. For every "1/Hz" of additional + // work, the user should expect one additional sample in the profile. 
+ // + // But to scale down to very small amounts of application work, to observe + // even CPU usage of "one tenth" of the requested period, set the initial + // timing delay in a different way: So that "one tenth" of a period of CPU + // spend shows up as a 10% chance of one sample (for an expected value of + // 0.1 samples), and so that "two and six tenths" periods of CPU spend show + // up as a 60% chance of 3 samples and a 40% chance of 2 samples (for an + // expected value of 2.6). Set the initial delay to a value in the unifom + // random distribution between 0 and the desired period. And because "0" + // means "disable timer", add 1 so the half-open interval [0,period) turns + // into (0,period]. + // + // Otherwise, this would show up as a bias away from short-lived threads and + // from threads that are only occasionally active: for example, when the + // garbage collector runs on a mostly-idle system, the additional threads it + // activates may do a couple milliseconds of GC-related work and nothing + // else in the few seconds that the profiler observes. + spec := new(itimerspec) + spec.it_value.setNsec(1 + int64(fastrandn(uint32(1e9/hz)))) + spec.it_interval.setNsec(1e9 / int64(hz)) + + var timerid int32 + var sevp sigevent + sevp.notify = _SIGEV_THREAD_ID + sevp.signo = _SIGPROF + sevp.sigev_notify_thread_id = int32(mp.procid) + ret := timer_create(_CLOCK_THREAD_CPUTIME_ID, &sevp, &timerid) + if ret != 0 { + // If we cannot create a timer for this M, leave profileTimerValid false + // to fall back to the process-wide setitimer profiler. + return + } + + ret = timer_settime(timerid, 0, spec, nil) + if ret != 0 { + print("runtime: failed to configure profiling timer; timer_settime(", timerid, + ", 0, {interval: {", + spec.it_interval.tv_sec, "s + ", spec.it_interval.tv_nsec, "ns} value: {", + spec.it_value.tv_sec, "s + ", spec.it_value.tv_nsec, "ns}}, nil) errno=", -ret, "\n") + throw("timer_settime") + } + + mp.profileTimer = timerid + atomic.Store(&mp.profileTimerValid, 1) +} + +// perThreadSyscallArgs contains the system call number, arguments, and +// expected return values for a system call to be executed on all threads. +type perThreadSyscallArgs struct { + trap uintptr + a1 uintptr + a2 uintptr + a3 uintptr + a4 uintptr + a5 uintptr + a6 uintptr + r1 uintptr + r2 uintptr +} + +// perThreadSyscall is the system call to execute for the ongoing +// doAllThreadsSyscall. +// +// perThreadSyscall may only be written while mp.needPerThreadSyscall == 0 on +// all Ms. +var perThreadSyscall perThreadSyscallArgs + +// syscall_runtime_doAllThreadsSyscall and executes a specified system call on +// all Ms. +// +// The system call is expected to succeed and return the same value on every +// thread. If any threads do not match, the runtime throws. +// +//go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall +//go:uintptrescapes +func syscall_runtime_doAllThreadsSyscall(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + if iscgo { + // In cgo, we are not aware of threads created in C, so this approach will not work. + panic("doAllThreadsSyscall not supported with cgo enabled") + } + + // STW to guarantee that user goroutines see an atomic change to thread + // state. Without STW, goroutines could migrate Ms while change is in + // progress and e.g., see state old -> new -> old -> new. + // + // N.B. Internally, this function does not depend on STW to + // successfully change every thread. 
It is only needed for user + // expectations, per above. + stopTheWorld("doAllThreadsSyscall") + + // This function depends on several properties: + // + // 1. All OS threads that already exist are associated with an M in + // allm. i.e., we won't miss any pre-existing threads. + // 2. All Ms listed in allm will eventually have an OS thread exist. + // i.e., they will set procid and be able to receive signals. + // 3. OS threads created after we read allm will clone from a thread + // that has executed the system call. i.e., they inherit the + // modified state. + // + // We achieve these through different mechanisms: + // + // 1. Addition of new Ms to allm in allocm happens before clone of its + // OS thread later in newm. + // 2. newm does acquirem to avoid being preempted, ensuring that new Ms + // created in allocm will eventually reach OS thread clone later in + // newm. + // 3. We take allocmLock for write here to prevent allocation of new Ms + // while this function runs. Per (1), this prevents clone of OS + // threads that are not yet in allm. + allocmLock.lock() + + // Disable preemption, preventing us from changing Ms, as we handle + // this M specially. + // + // N.B. STW and lock() above do this as well, this is added for extra + // clarity. + acquirem() + + // N.B. allocmLock also prevents concurrent execution of this function, + // serializing use of perThreadSyscall, mp.needPerThreadSyscall, and + // ensuring all threads execute system calls from multiple calls in the + // same order. + + r1, r2, errno := syscall.Syscall6(trap, a1, a2, a3, a4, a5, a6) + if GOARCH == "ppc64" || GOARCH == "ppc64le" { + // TODO(https://go.dev/issue/51192 ): ppc64 doesn't use r2. + r2 = 0 + } + if errno != 0 { + releasem(getg().m) + allocmLock.unlock() + startTheWorld() + return r1, r2, errno + } + + perThreadSyscall = perThreadSyscallArgs{ + trap: trap, + a1: a1, + a2: a2, + a3: a3, + a4: a4, + a5: a5, + a6: a6, + r1: r1, + r2: r2, + } + + // Wait for all threads to start. + // + // As described above, some Ms have been added to allm prior to + // allocmLock, but not yet completed OS clone and set procid. + // + // At minimum we must wait for a thread to set procid before we can + // send it a signal. + // + // We take this one step further and wait for all threads to start + // before sending any signals. This prevents system calls from getting + // applied twice: once in the parent and once in the child, like so: + // + // A B C + // add C to allm + // doAllThreadsSyscall + // allocmLock.lock() + // signal B + // <receive signal> + // execute syscall + // <signal return> + // clone C + // <thread start> + // set procid + // signal C + // <receive signal> + // execute syscall + // <signal return> + // + // In this case, thread C inherited the syscall-modified state from + // thread B and did not need to execute the syscall, but did anyway + // because doAllThreadsSyscall could not be sure whether it was + // required. + // + // Some system calls may not be idempotent, so we ensure each thread + // executes the system call exactly once. + for mp := allm; mp != nil; mp = mp.alllink { + for atomic.Load64(&mp.procid) == 0 { + // Thread is starting. + osyield() + } + } + + // Signal every other thread, where they will execute perThreadSyscall + // from the signal handler. + gp := getg() + tid := gp.m.procid + for mp := allm; mp != nil; mp = mp.alllink { + if atomic.Load64(&mp.procid) == tid { + // Our thread already performed the syscall. 
+ continue + } + mp.needPerThreadSyscall.Store(1) + signalM(mp, sigPerThreadSyscall) + } + + // Wait for all threads to complete. + for mp := allm; mp != nil; mp = mp.alllink { + if mp.procid == tid { + continue + } + for mp.needPerThreadSyscall.Load() != 0 { + osyield() + } + } + + perThreadSyscall = perThreadSyscallArgs{} + + releasem(getg().m) + allocmLock.unlock() + startTheWorld() + + return r1, r2, errno +} + +// runPerThreadSyscall runs perThreadSyscall for this M if required. +// +// This function throws if the system call returns with anything other than the +// expected values. +//go:nosplit +func runPerThreadSyscall() { + gp := getg() + if gp.m.needPerThreadSyscall.Load() == 0 { + return + } + + args := perThreadSyscall + r1, r2, errno := syscall.Syscall6(args.trap, args.a1, args.a2, args.a3, args.a4, args.a5, args.a6) + if GOARCH == "ppc64" || GOARCH == "ppc64le" { + // TODO(https://go.dev/issue/51192 ): ppc64 doesn't use r2. + r2 = 0 + } + if errno != 0 || r1 != args.r1 || r2 != args.r2 { + print("trap:", args.trap, ", a123456=[", args.a1, ",", args.a2, ",", args.a3, ",", args.a4, ",", args.a5, ",", args.a6, "]\n") + print("results: got {r1=", r1, ",r2=", r2, ",errno=", errno, "}, want {r1=", args.r1, ",r2=", args.r2, ",errno=0\n") + throw("AllThreadsSyscall6 results differ between threads; runtime corrupted") + } + + gp.m.needPerThreadSyscall.Store(0) +} diff --git a/contrib/go/_std_1.18/src/runtime/os_linux_generic.go b/contrib/go/_std_1.18/src/runtime/os_linux_generic.go new file mode 100644 index 0000000000..bed9e66e15 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_linux_generic.go @@ -0,0 +1,38 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !mips && !mipsle && !mips64 && !mips64le && !s390x && !ppc64 && linux + +package runtime + +const ( + _SS_DISABLE = 2 + _NSIG = 65 + _SI_USER = 0 + _SIG_BLOCK = 0 + _SIG_UNBLOCK = 1 + _SIG_SETMASK = 2 +) + +// It's hard to tease out exactly how big a Sigset is, but +// rt_sigprocmask crashes if we get it wrong, so if binaries +// are running, this is right. +type sigset [2]uint32 + +var sigset_all = sigset{^uint32(0), ^uint32(0)} + +//go:nosplit +//go:nowritebarrierrec +func sigaddset(mask *sigset, i int) { + (*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31) +} + +func sigdelset(mask *sigset, i int) { + (*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31) +} + +//go:nosplit +func sigfillset(mask *uint64) { + *mask = ^uint64(0) +} diff --git a/contrib/go/_std_1.18/src/runtime/os_linux_noauxv.go b/contrib/go/_std_1.18/src/runtime/os_linux_noauxv.go new file mode 100644 index 0000000000..7b84f713d6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_linux_noauxv.go @@ -0,0 +1,10 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && !arm && !arm64 && !mips && !mipsle && !mips64 && !mips64le && !s390x && !ppc64 && !ppc64le + +package runtime + +func archauxv(tag, val uintptr) { +} diff --git a/contrib/go/_std_1.18/src/runtime/os_linux_x86.go b/contrib/go/_std_1.18/src/runtime/os_linux_x86.go new file mode 100644 index 0000000000..c88f61fa2e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_linux_x86.go @@ -0,0 +1,9 @@ +// Copyright 2019 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && (386 || amd64) + +package runtime + +func osArchInit() {} diff --git a/contrib/go/_std_1.18/src/runtime/os_nonopenbsd.go b/contrib/go/_std_1.18/src/runtime/os_nonopenbsd.go new file mode 100644 index 0000000000..a5775961e8 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/os_nonopenbsd.go @@ -0,0 +1,17 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !openbsd + +package runtime + +// osStackAlloc performs OS-specific initialization before s is used +// as stack memory. +func osStackAlloc(s *mspan) { +} + +// osStackFree undoes the effect of osStackAlloc before s is returned +// to the heap. +func osStackFree(s *mspan) { +} diff --git a/contrib/go/_std_1.18/src/runtime/panic.go b/contrib/go/_std_1.18/src/runtime/panic.go new file mode 100644 index 0000000000..6600410cb6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/panic.go @@ -0,0 +1,1292 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// We have two different ways of doing defers. The older way involves creating a +// defer record at the time that a defer statement is executing and adding it to a +// defer chain. This chain is inspected by the deferreturn call at all function +// exits in order to run the appropriate defer calls. A cheaper way (which we call +// open-coded defers) is used for functions in which no defer statements occur in +// loops. In that case, we simply store the defer function/arg information into +// specific stack slots at the point of each defer statement, as well as setting a +// bit in a bitmask. At each function exit, we add inline code to directly make +// the appropriate defer calls based on the bitmask and fn/arg information stored +// on the stack. During panic/Goexit processing, the appropriate defer calls are +// made using extra funcdata info that indicates the exact stack slots that +// contain the bitmask and defer fn/args. + +// Check to make sure we can really generate a panic. If the panic +// was generated from the runtime, or from inside malloc, then convert +// to a throw of msg. +// pc should be the program counter of the compiler-generated code that +// triggered this panic. +func panicCheck1(pc uintptr, msg string) { + if goarch.IsWasm == 0 && hasPrefix(funcname(findfunc(pc)), "runtime.") { + // Note: wasm can't tail call, so we can't get the original caller's pc. + throw(msg) + } + // TODO: is this redundant? How could we be in malloc + // but not in the runtime? runtime/internal/*, maybe? + gp := getg() + if gp != nil && gp.m != nil && gp.m.mallocing != 0 { + throw(msg) + } +} + +// Same as above, but calling from the runtime is allowed. +// +// Using this function is necessary for any panic that may be +// generated by runtime.sigpanic, since those are always called by the +// runtime. +func panicCheck2(err string) { + // panic allocates, so to avoid recursive malloc, turn panics + // during malloc into throws. 
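
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): the comment at the top of panic.go above describes two defer
// strategies. A fixed set of defer statements outside any loop can typically be
// open-coded (recorded in stack slots plus a bit in a bitmask and invoked
// inline at function exit, subject to compiler limits such as a small maximum
// number of defers per function); a defer inside a loop forces a defer record
// onto the chain, because the number of pending defers is not known at compile
// time. Both functions below behave the same; only the strategy the compiler
// may choose differs.
package main

import "fmt"

// openCodable: a fixed set of defers, none inside a loop.
func openCodable() {
	defer fmt.Println("first deferred")
	defer fmt.Println("second deferred")
	fmt.Println("body")
}

// needsDeferRecords: the defer executes a variable number of times,
// so each iteration must push a defer record onto the chain.
func needsDeferRecords(n int) {
	for i := 0; i < n; i++ {
		defer fmt.Println("deferred from iteration", i)
	}
	fmt.Println("body")
}

func main() {
	openCodable()
	needsDeferRecords(3)
}
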
+ gp := getg() + if gp != nil && gp.m != nil && gp.m.mallocing != 0 { + throw(err) + } +} + +// Many of the following panic entry-points turn into throws when they +// happen in various runtime contexts. These should never happen in +// the runtime, and if they do, they indicate a serious issue and +// should not be caught by user code. +// +// The panic{Index,Slice,divide,shift} functions are called by +// code generated by the compiler for out of bounds index expressions, +// out of bounds slice expressions, division by zero, and shift by negative. +// The panicdivide (again), panicoverflow, panicfloat, and panicmem +// functions are called by the signal handler when a signal occurs +// indicating the respective problem. +// +// Since panic{Index,Slice,shift} are never called directly, and +// since the runtime package should never have an out of bounds slice +// or array reference or negative shift, if we see those functions called from the +// runtime package we turn the panic into a throw. That will dump the +// entire runtime stack for easier debugging. +// +// The entry points called by the signal handler will be called from +// runtime.sigpanic, so we can't disallow calls from the runtime to +// these (they always look like they're called from the runtime). +// Hence, for these, we just check for clearly bad runtime conditions. +// +// The panic{Index,Slice} functions are implemented in assembly and tail call +// to the goPanic{Index,Slice} functions below. This is done so we can use +// a space-minimal register calling convention. + +// failures in the comparisons for s[x], 0 <= x < y (y == len(s)) +func goPanicIndex(x int, y int) { + panicCheck1(getcallerpc(), "index out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsIndex}) +} +func goPanicIndexU(x uint, y int) { + panicCheck1(getcallerpc(), "index out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsIndex}) +} + +// failures in the comparisons for s[:x], 0 <= x <= y (y == len(s) or cap(s)) +func goPanicSliceAlen(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSliceAlen}) +} +func goPanicSliceAlenU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSliceAlen}) +} +func goPanicSliceAcap(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSliceAcap}) +} +func goPanicSliceAcapU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSliceAcap}) +} + +// failures in the comparisons for s[x:y], 0 <= x <= y +func goPanicSliceB(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSliceB}) +} +func goPanicSliceBU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSliceB}) +} + +// failures in the comparisons for s[::x], 0 <= x <= y (y == len(s) or cap(s)) +func goPanicSlice3Alen(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSlice3Alen}) +} +func goPanicSlice3AlenU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), 
signed: false, y: y, code: boundsSlice3Alen}) +} +func goPanicSlice3Acap(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSlice3Acap}) +} +func goPanicSlice3AcapU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSlice3Acap}) +} + +// failures in the comparisons for s[:x:y], 0 <= x <= y +func goPanicSlice3B(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSlice3B}) +} +func goPanicSlice3BU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSlice3B}) +} + +// failures in the comparisons for s[x:y:], 0 <= x <= y +func goPanicSlice3C(x int, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsSlice3C}) +} +func goPanicSlice3CU(x uint, y int) { + panicCheck1(getcallerpc(), "slice bounds out of range") + panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSlice3C}) +} + +// failures in the conversion (*[x]T)s, 0 <= x <= y, x == cap(s) +func goPanicSliceConvert(x int, y int) { + panicCheck1(getcallerpc(), "slice length too short to convert to pointer to array") + panic(boundsError{x: int64(x), signed: true, y: y, code: boundsConvert}) +} + +// Implemented in assembly, as they take arguments in registers. +// Declared here to mark them as ABIInternal. +func panicIndex(x int, y int) +func panicIndexU(x uint, y int) +func panicSliceAlen(x int, y int) +func panicSliceAlenU(x uint, y int) +func panicSliceAcap(x int, y int) +func panicSliceAcapU(x uint, y int) +func panicSliceB(x int, y int) +func panicSliceBU(x uint, y int) +func panicSlice3Alen(x int, y int) +func panicSlice3AlenU(x uint, y int) +func panicSlice3Acap(x int, y int) +func panicSlice3AcapU(x uint, y int) +func panicSlice3B(x int, y int) +func panicSlice3BU(x uint, y int) +func panicSlice3C(x int, y int) +func panicSlice3CU(x uint, y int) +func panicSliceConvert(x int, y int) + +var shiftError = error(errorString("negative shift amount")) + +func panicshift() { + panicCheck1(getcallerpc(), "negative shift amount") + panic(shiftError) +} + +var divideError = error(errorString("integer divide by zero")) + +func panicdivide() { + panicCheck2("integer divide by zero") + panic(divideError) +} + +var overflowError = error(errorString("integer overflow")) + +func panicoverflow() { + panicCheck2("integer overflow") + panic(overflowError) +} + +var floatError = error(errorString("floating point error")) + +func panicfloat() { + panicCheck2("floating point error") + panic(floatError) +} + +var memoryError = error(errorString("invalid memory address or nil pointer dereference")) + +func panicmem() { + panicCheck2("invalid memory address or nil pointer dereference") + panic(memoryError) +} + +func panicmemAddr(addr uintptr) { + panicCheck2("invalid memory address or nil pointer dereference") + panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr}) +} + +// Create a new deferred function fn, which has no arguments and results. +// The compiler turns a defer statement into a call to this. 
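
// Illustrative sketch, separate from the runtime source above (not part of this
// diff): the goPanicIndex / goPanicSlice* entry points above back the
// compiler's bounds checks. From user code the failure surfaces as a
// runtime.Error carrying the offending index and length, which a deferred
// recover can observe. readAt is a hypothetical helper.
package main

import (
	"fmt"
	"runtime"
)

func readAt(s []int, i int) (v int, err error) {
	defer func() {
		if r := recover(); r != nil {
			if re, ok := r.(runtime.Error); ok {
				err = re // e.g. "runtime error: index out of range [5] with length 3"
				return
			}
			panic(r) // not a runtime error; re-panic
		}
	}()
	return s[i], nil // an out-of-range i reaches goPanicIndex via the bounds check
}

func main() {
	_, err := readAt([]int{1, 2, 3}, 5)
	fmt.Println("recovered:", err)
}
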
+func deferproc(fn func()) { + gp := getg() + if gp.m.curg != gp { + // go code on the system stack can't defer + throw("defer on system stack") + } + + d := newdefer() + if d._panic != nil { + throw("deferproc: d.panic != nil after newdefer") + } + d.link = gp._defer + gp._defer = d + d.fn = fn + d.pc = getcallerpc() + // We must not be preempted between calling getcallersp and + // storing it to d.sp because getcallersp's result is a + // uintptr stack pointer. + d.sp = getcallersp() + + // deferproc returns 0 normally. + // a deferred func that stops a panic + // makes the deferproc return 1. + // the code the compiler generates always + // checks the return value and jumps to the + // end of the function if deferproc returns != 0. + return0() + // No code can go here - the C return register has + // been set and must not be clobbered. +} + +// deferprocStack queues a new deferred function with a defer record on the stack. +// The defer record must have its fn field initialized. +// All other fields can contain junk. +// Nosplit because of the uninitialized pointer fields on the stack. +// +//go:nosplit +func deferprocStack(d *_defer) { + gp := getg() + if gp.m.curg != gp { + // go code on the system stack can't defer + throw("defer on system stack") + } + // fn is already set. + // The other fields are junk on entry to deferprocStack and + // are initialized here. + d.started = false + d.heap = false + d.openDefer = false + d.sp = getcallersp() + d.pc = getcallerpc() + d.framepc = 0 + d.varp = 0 + // The lines below implement: + // d.panic = nil + // d.fd = nil + // d.link = gp._defer + // gp._defer = d + // But without write barriers. The first three are writes to + // the stack so they don't need a write barrier, and furthermore + // are to uninitialized memory, so they must not use a write barrier. + // The fourth write does not require a write barrier because we + // explicitly mark all the defer structures, so we don't need to + // keep track of pointers to them with a write barrier. + *(*uintptr)(unsafe.Pointer(&d._panic)) = 0 + *(*uintptr)(unsafe.Pointer(&d.fd)) = 0 + *(*uintptr)(unsafe.Pointer(&d.link)) = uintptr(unsafe.Pointer(gp._defer)) + *(*uintptr)(unsafe.Pointer(&gp._defer)) = uintptr(unsafe.Pointer(d)) + + return0() + // No code can go here - the C return register has + // been set and must not be clobbered. +} + +// Each P holds a pool for defers. + +// Allocate a Defer, usually using per-P pool. +// Each defer must be released with freedefer. The defer is not +// added to any defer chain yet. +func newdefer() *_defer { + var d *_defer + mp := acquirem() + pp := mp.p.ptr() + if len(pp.deferpool) == 0 && sched.deferpool != nil { + lock(&sched.deferlock) + for len(pp.deferpool) < cap(pp.deferpool)/2 && sched.deferpool != nil { + d := sched.deferpool + sched.deferpool = d.link + d.link = nil + pp.deferpool = append(pp.deferpool, d) + } + unlock(&sched.deferlock) + } + if n := len(pp.deferpool); n > 0 { + d = pp.deferpool[n-1] + pp.deferpool[n-1] = nil + pp.deferpool = pp.deferpool[:n-1] + } + releasem(mp) + mp, pp = nil, nil + + if d == nil { + // Allocate new defer. + d = new(_defer) + } + d.heap = true + return d +} + +// Free the given defer. +// The defer cannot be used after this call. +// +// This is nosplit because the incoming defer is in a perilous state. +// It's not on any defer list, so stack copying won't adjust stack +// pointers in it (namely, d.link). Hence, if we were to copy the +// stack, d could then contain a stale pointer. 
+// +//go:nosplit +func freedefer(d *_defer) { + d.link = nil + // After this point we can copy the stack. + + if d._panic != nil { + freedeferpanic() + } + if d.fn != nil { + freedeferfn() + } + if !d.heap { + return + } + + mp := acquirem() + pp := mp.p.ptr() + if len(pp.deferpool) == cap(pp.deferpool) { + // Transfer half of local cache to the central cache. + var first, last *_defer + for len(pp.deferpool) > cap(pp.deferpool)/2 { + n := len(pp.deferpool) + d := pp.deferpool[n-1] + pp.deferpool[n-1] = nil + pp.deferpool = pp.deferpool[:n-1] + if first == nil { + first = d + } else { + last.link = d + } + last = d + } + lock(&sched.deferlock) + last.link = sched.deferpool + sched.deferpool = first + unlock(&sched.deferlock) + } + + *d = _defer{} + + pp.deferpool = append(pp.deferpool, d) + + releasem(mp) + mp, pp = nil, nil +} + +// Separate function so that it can split stack. +// Windows otherwise runs out of stack space. +func freedeferpanic() { + // _panic must be cleared before d is unlinked from gp. + throw("freedefer with d._panic != nil") +} + +func freedeferfn() { + // fn must be cleared before d is unlinked from gp. + throw("freedefer with d.fn != nil") +} + +// deferreturn runs deferred functions for the caller's frame. +// The compiler inserts a call to this at the end of any +// function which calls defer. +func deferreturn() { + gp := getg() + for { + d := gp._defer + if d == nil { + return + } + sp := getcallersp() + if d.sp != sp { + return + } + if d.openDefer { + done := runOpenDeferFrame(gp, d) + if !done { + throw("unfinished open-coded defers in deferreturn") + } + gp._defer = d.link + freedefer(d) + // If this frame uses open defers, then this + // must be the only defer record for the + // frame, so we can just return. + return + } + + fn := d.fn + d.fn = nil + gp._defer = d.link + freedefer(d) + fn() + } +} + +// Goexit terminates the goroutine that calls it. No other goroutine is affected. +// Goexit runs all deferred calls before terminating the goroutine. Because Goexit +// is not a panic, any recover calls in those deferred functions will return nil. +// +// Calling Goexit from the main goroutine terminates that goroutine +// without func main returning. Since func main has not returned, +// the program continues execution of other goroutines. +// If all other goroutines exit, the program crashes. +func Goexit() { + // Run all deferred functions for the current goroutine. + // This code is similar to gopanic, see that implementation + // for detailed comments. + gp := getg() + + // Create a panic object for Goexit, so we can recognize when it might be + // bypassed by a recover(). + var p _panic + p.goexit = true + p.link = gp._panic + gp._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + + addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp())) + for { + d := gp._defer + if d == nil { + break + } + if d.started { + if d._panic != nil { + d._panic.aborted = true + d._panic = nil + } + if !d.openDefer { + d.fn = nil + gp._defer = d.link + freedefer(d) + continue + } + } + d.started = true + d._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + if d.openDefer { + done := runOpenDeferFrame(gp, d) + if !done { + // We should always run all defers in the frame, + // since there is no panic associated with this + // defer that can be recovered. 
+ throw("unfinished open-coded defers in Goexit") + } + if p.aborted { + // Since our current defer caused a panic and may + // have been already freed, just restart scanning + // for open-coded defers from this frame again. + addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp())) + } else { + addOneOpenDeferFrame(gp, 0, nil) + } + } else { + // Save the pc/sp in deferCallSave(), so we can "recover" back to this + // loop if necessary. + deferCallSave(&p, d.fn) + } + if p.aborted { + // We had a recursive panic in the defer d we started, and + // then did a recover in a defer that was further down the + // defer chain than d. In the case of an outstanding Goexit, + // we force the recover to return back to this loop. d will + // have already been freed if completed, so just continue + // immediately to the next defer on the chain. + p.aborted = false + continue + } + if gp._defer != d { + throw("bad defer entry in Goexit") + } + d._panic = nil + d.fn = nil + gp._defer = d.link + freedefer(d) + // Note: we ignore recovers here because Goexit isn't a panic + } + goexit1() +} + +// Call all Error and String methods before freezing the world. +// Used when crashing with panicking. +func preprintpanics(p *_panic) { + defer func() { + if recover() != nil { + throw("panic while printing panic value") + } + }() + for p != nil { + switch v := p.arg.(type) { + case error: + p.arg = v.Error() + case stringer: + p.arg = v.String() + } + p = p.link + } +} + +// Print all currently active panics. Used when crashing. +// Should only be called after preprintpanics. +func printpanics(p *_panic) { + if p.link != nil { + printpanics(p.link) + if !p.link.goexit { + print("\t") + } + } + if p.goexit { + return + } + print("panic: ") + printany(p.arg) + if p.recovered { + print(" [recovered]") + } + print("\n") +} + +// addOneOpenDeferFrame scans the stack (in gentraceback order, from inner frames to +// outer frames) for the first frame (if any) with open-coded defers. If it finds +// one, it adds a single entry to the defer chain for that frame. The entry added +// represents all the defers in the associated open defer frame, and is sorted in +// order with respect to any non-open-coded defers. +// +// addOneOpenDeferFrame stops (possibly without adding a new entry) if it encounters +// an in-progress open defer entry. An in-progress open defer entry means there has +// been a new panic because of a defer in the associated frame. addOneOpenDeferFrame +// does not add an open defer entry past a started entry, because that started entry +// still needs to finished, and addOneOpenDeferFrame will be called when that started +// entry is completed. The defer removal loop in gopanic() similarly stops at an +// in-progress defer entry. Together, addOneOpenDeferFrame and the defer removal loop +// ensure the invariant that there is no open defer entry further up the stack than +// an in-progress defer, and also that the defer removal loop is guaranteed to remove +// all not-in-progress open defer entries from the defer chain. +// +// If sp is non-nil, addOneOpenDeferFrame starts the stack scan from the frame +// specified by sp. If sp is nil, it uses the sp from the current defer record (which +// has just been finished). Hence, it continues the stack scan from the frame of the +// defer that just finished. It skips any frame that already has a (not-in-progress) +// open-coded _defer record in the defer chain. 
+// +// Note: All entries of the defer chain (including this new open-coded entry) have +// their pointers (including sp) adjusted properly if the stack moves while +// running deferred functions. Also, it is safe to pass in the sp arg (which is +// the direct result of calling getcallersp()), because all pointer variables +// (including arguments) are adjusted as needed during stack copies. +func addOneOpenDeferFrame(gp *g, pc uintptr, sp unsafe.Pointer) { + var prevDefer *_defer + if sp == nil { + prevDefer = gp._defer + pc = prevDefer.framepc + sp = unsafe.Pointer(prevDefer.sp) + } + systemstack(func() { + gentraceback(pc, uintptr(sp), 0, gp, 0, nil, 0x7fffffff, + func(frame *stkframe, unused unsafe.Pointer) bool { + if prevDefer != nil && prevDefer.sp == frame.sp { + // Skip the frame for the previous defer that + // we just finished (and was used to set + // where we restarted the stack scan) + return true + } + f := frame.fn + fd := funcdata(f, _FUNCDATA_OpenCodedDeferInfo) + if fd == nil { + return true + } + // Insert the open defer record in the + // chain, in order sorted by sp. + d := gp._defer + var prev *_defer + for d != nil { + dsp := d.sp + if frame.sp < dsp { + break + } + if frame.sp == dsp { + if !d.openDefer { + throw("duplicated defer entry") + } + // Don't add any record past an + // in-progress defer entry. We don't + // need it, and more importantly, we + // want to keep the invariant that + // there is no open defer entry + // passed an in-progress entry (see + // header comment). + if d.started { + return false + } + return true + } + prev = d + d = d.link + } + if frame.fn.deferreturn == 0 { + throw("missing deferreturn") + } + + d1 := newdefer() + d1.openDefer = true + d1._panic = nil + // These are the pc/sp to set after we've + // run a defer in this frame that did a + // recover. We return to a special + // deferreturn that runs any remaining + // defers and then returns from the + // function. + d1.pc = frame.fn.entry() + uintptr(frame.fn.deferreturn) + d1.varp = frame.varp + d1.fd = fd + // Save the SP/PC associated with current frame, + // so we can continue stack trace later if needed. + d1.framepc = frame.pc + d1.sp = frame.sp + d1.link = d + if prev == nil { + gp._defer = d1 + } else { + prev.link = d1 + } + // Stop stack scanning after adding one open defer record + return false + }, + nil, 0) + }) +} + +// readvarintUnsafe reads the uint32 in varint format starting at fd, and returns the +// uint32 and a pointer to the byte following the varint. +// +// There is a similar function runtime.readvarint, which takes a slice of bytes, +// rather than an unsafe pointer. These functions are duplicated, because one of +// the two use cases for the functions would get slower if the functions were +// combined. +func readvarintUnsafe(fd unsafe.Pointer) (uint32, unsafe.Pointer) { + var r uint32 + var shift int + for { + b := *(*uint8)((unsafe.Pointer(fd))) + fd = add(fd, unsafe.Sizeof(b)) + if b < 128 { + return r + uint32(b)<<shift, fd + } + r += ((uint32(b) &^ 128) << shift) + shift += 7 + if shift > 28 { + panic("Bad varint") + } + } +} + +// runOpenDeferFrame runs the active open-coded defers in the frame specified by +// d. It normally processes all active defers in the frame, but stops immediately +// if a defer does a successful recover. It returns true if there are no +// remaining defers to run in the frame. 
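
// Illustrative sketch, separate from the runtime source (not part of this
// diff): runOpenDeferFrame below drives open-coded defers off a per-frame
// "deferBits" byte: bit i set means the i'th defer in the frame is still
// pending, and the bit is cleared and written back before the deferred call so
// that a panic inside it does not run the same defer twice. A simulation of
// that bookkeeping with plain closures; the bit pattern is made up.
package main

import "fmt"

func main() {
	// Pretend the frame declared three defers; defers 0 and 2 executed
	// their defer statements, so their bits are set.
	closures := []func(){
		func() { fmt.Println("defer 0") },
		func() { fmt.Println("defer 1 (never armed)") },
		func() { fmt.Println("defer 2") },
	}
	var deferBits uint8 = 0b101

	// Run pending defers in reverse declaration order, as at function exit.
	for i := len(closures) - 1; i >= 0; i-- {
		if deferBits&(1<<i) == 0 {
			continue // this defer statement was never reached
		}
		deferBits &^= 1 << i // clear before the call, mirroring runOpenDeferFrame
		closures[i]()
	}
	fmt.Printf("deferBits after exit: %03b\n", deferBits)
}
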
+func runOpenDeferFrame(gp *g, d *_defer) bool { + done := true + fd := d.fd + + deferBitsOffset, fd := readvarintUnsafe(fd) + nDefers, fd := readvarintUnsafe(fd) + deferBits := *(*uint8)(unsafe.Pointer(d.varp - uintptr(deferBitsOffset))) + + for i := int(nDefers) - 1; i >= 0; i-- { + // read the funcdata info for this defer + var closureOffset uint32 + closureOffset, fd = readvarintUnsafe(fd) + if deferBits&(1<<i) == 0 { + continue + } + closure := *(*func())(unsafe.Pointer(d.varp - uintptr(closureOffset))) + d.fn = closure + deferBits = deferBits &^ (1 << i) + *(*uint8)(unsafe.Pointer(d.varp - uintptr(deferBitsOffset))) = deferBits + p := d._panic + // Call the defer. Note that this can change d.varp if + // the stack moves. + deferCallSave(p, d.fn) + if p != nil && p.aborted { + break + } + d.fn = nil + if d._panic != nil && d._panic.recovered { + done = deferBits == 0 + break + } + } + + return done +} + +// deferCallSave calls fn() after saving the caller's pc and sp in the +// panic record. This allows the runtime to return to the Goexit defer +// processing loop, in the unusual case where the Goexit may be +// bypassed by a successful recover. +// +// This is marked as a wrapper by the compiler so it doesn't appear in +// tracebacks. +func deferCallSave(p *_panic, fn func()) { + if p != nil { + p.argp = unsafe.Pointer(getargp()) + p.pc = getcallerpc() + p.sp = unsafe.Pointer(getcallersp()) + } + fn() + if p != nil { + p.pc = 0 + p.sp = unsafe.Pointer(nil) + } +} + +// The implementation of the predeclared function panic. +func gopanic(e any) { + gp := getg() + if gp.m.curg != gp { + print("panic: ") + printany(e) + print("\n") + throw("panic on system stack") + } + + if gp.m.mallocing != 0 { + print("panic: ") + printany(e) + print("\n") + throw("panic during malloc") + } + if gp.m.preemptoff != "" { + print("panic: ") + printany(e) + print("\n") + print("preempt off reason: ") + print(gp.m.preemptoff) + print("\n") + throw("panic during preemptoff") + } + if gp.m.locks != 0 { + print("panic: ") + printany(e) + print("\n") + throw("panic holding locks") + } + + var p _panic + p.arg = e + p.link = gp._panic + gp._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + + atomic.Xadd(&runningPanicDefers, 1) + + // By calculating getcallerpc/getcallersp here, we avoid scanning the + // gopanic frame (stack scanning is slow...) + addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp())) + + for { + d := gp._defer + if d == nil { + break + } + + // If defer was started by earlier panic or Goexit (and, since we're back here, that triggered a new panic), + // take defer off list. An earlier panic will not continue running, but we will make sure below that an + // earlier Goexit does continue running. + if d.started { + if d._panic != nil { + d._panic.aborted = true + } + d._panic = nil + if !d.openDefer { + // For open-coded defers, we need to process the + // defer again, in case there are any other defers + // to call in the frame (not including the defer + // call that caused the panic). + d.fn = nil + gp._defer = d.link + freedefer(d) + continue + } + } + + // Mark defer as started, but keep on list, so that traceback + // can find and update the defer's argument frame if stack growth + // or a garbage collection happens before executing d.fn. + d.started = true + + // Record the panic that is running the defer. + // If there is a new panic during the deferred call, that panic + // will find d in the list and will mark d._panic (this panic) aborted. 
+ d._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + + done := true + if d.openDefer { + done = runOpenDeferFrame(gp, d) + if done && !d._panic.recovered { + addOneOpenDeferFrame(gp, 0, nil) + } + } else { + p.argp = unsafe.Pointer(getargp()) + d.fn() + } + p.argp = nil + + // Deferred function did not panic. Remove d. + if gp._defer != d { + throw("bad defer entry in panic") + } + d._panic = nil + + // trigger shrinkage to test stack copy. See stack_test.go:TestStackPanic + //GC() + + pc := d.pc + sp := unsafe.Pointer(d.sp) // must be pointer so it gets adjusted during stack copy + if done { + d.fn = nil + gp._defer = d.link + freedefer(d) + } + if p.recovered { + gp._panic = p.link + if gp._panic != nil && gp._panic.goexit && gp._panic.aborted { + // A normal recover would bypass/abort the Goexit. Instead, + // we return to the processing loop of the Goexit. + gp.sigcode0 = uintptr(gp._panic.sp) + gp.sigcode1 = uintptr(gp._panic.pc) + mcall(recovery) + throw("bypassed recovery failed") // mcall should not return + } + atomic.Xadd(&runningPanicDefers, -1) + + // After a recover, remove any remaining non-started, + // open-coded defer entries, since the corresponding defers + // will be executed normally (inline). Any such entry will + // become stale once we run the corresponding defers inline + // and exit the associated stack frame. We only remove up to + // the first started (in-progress) open defer entry, not + // including the current frame, since any higher entries will + // be from a higher panic in progress, and will still be + // needed. + d := gp._defer + var prev *_defer + if !done { + // Skip our current frame, if not done. It is + // needed to complete any remaining defers in + // deferreturn() + prev = d + d = d.link + } + for d != nil { + if d.started { + // This defer is started but we + // are in the middle of a + // defer-panic-recover inside of + // it, so don't remove it or any + // further defer entries + break + } + if d.openDefer { + if prev == nil { + gp._defer = d.link + } else { + prev.link = d.link + } + newd := d.link + freedefer(d) + d = newd + } else { + prev = d + d = d.link + } + } + + gp._panic = p.link + // Aborted panics are marked but remain on the g.panic list. + // Remove them from the list. + for gp._panic != nil && gp._panic.aborted { + gp._panic = gp._panic.link + } + if gp._panic == nil { // must be done with signal + gp.sig = 0 + } + // Pass information about recovering frame to recovery. + gp.sigcode0 = uintptr(sp) + gp.sigcode1 = pc + mcall(recovery) + throw("recovery failed") // mcall should not return + } + } + + // ran out of deferred calls - old-school panic now + // Because it is unsafe to call arbitrary user code after freezing + // the world, we call preprintpanics to invoke all necessary Error + // and String methods to prepare the panic strings before startpanic. + preprintpanics(gp._panic) + + fatalpanic(gp._panic) // should not return + *(*int)(nil) = 0 // not reached +} + +// getargp returns the location where the caller +// writes outgoing function call arguments. +//go:nosplit +//go:noinline +func getargp() uintptr { + return getcallersp() + sys.MinFrameSize +} + +// The implementation of the predeclared function recover. +// Cannot split the stack because it needs to reliably +// find the stack segment of its caller. +// +// TODO(rsc): Once we commit to CopyStackAlways, +// this doesn't need to be nosplit. 
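gorecover below compares the caller's argument pointer with the panic's p.argp; that check is why recover only takes effect when called directly by the deferred function itself. A user-level sketch of the rule (names are illustrative):

package main

import "fmt"

// indirectRecover calls recover one level deeper than the deferred function,
// so the argp comparison fails and it returns nil.
func indirectRecover() any { return recover() }

func demo() (err any) {
	defer func() {
		// Called directly by the deferred function: argp matches the
		// panic's argp, so the panic is recovered.
		err = recover()
	}()
	defer func() {
		// Runs first (defers are LIFO) but recovers nothing, because
		// recover is not called directly by this deferred function.
		if r := indirectRecover(); r != nil {
			fmt.Println("never reached")
		}
	}()
	panic("boom")
}

func main() {
	fmt.Println("recovered:", demo()) // recovered: boom
}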
+//go:nosplit +func gorecover(argp uintptr) any { + // Must be in a function running as part of a deferred call during the panic. + // Must be called from the topmost function of the call + // (the function used in the defer statement). + // p.argp is the argument pointer of that topmost deferred function call. + // Compare against argp reported by caller. + // If they match, the caller is the one who can recover. + gp := getg() + p := gp._panic + if p != nil && !p.goexit && !p.recovered && argp == uintptr(p.argp) { + p.recovered = true + return p.arg + } + return nil +} + +//go:linkname sync_throw sync.throw +func sync_throw(s string) { + throw(s) +} + +//go:nosplit +func throw(s string) { + // Everything throw does should be recursively nosplit so it + // can be called even when it's unsafe to grow the stack. + systemstack(func() { + print("fatal error: ", s, "\n") + }) + gp := getg() + if gp.m.throwing == 0 { + gp.m.throwing = 1 + } + fatalthrow() + *(*int)(nil) = 0 // not reached +} + +// runningPanicDefers is non-zero while running deferred functions for panic. +// runningPanicDefers is incremented and decremented atomically. +// This is used to try hard to get a panic stack trace out when exiting. +var runningPanicDefers uint32 + +// panicking is non-zero when crashing the program for an unrecovered panic. +// panicking is incremented and decremented atomically. +var panicking uint32 + +// paniclk is held while printing the panic information and stack trace, +// so that two concurrent panics don't overlap their output. +var paniclk mutex + +// Unwind the stack after a deferred function calls recover +// after a panic. Then arrange to continue running as though +// the caller of the deferred function returned normally. +func recovery(gp *g) { + // Info about defer passed in G struct. + sp := gp.sigcode0 + pc := gp.sigcode1 + + // d's arguments need to be in the stack. + if sp != 0 && (sp < gp.stack.lo || gp.stack.hi < sp) { + print("recover: ", hex(sp), " not in [", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n") + throw("bad recovery") + } + + // Make the deferproc for this d return again, + // this time returning 1. The calling function will + // jump to the standard return epilogue. + gp.sched.sp = sp + gp.sched.pc = pc + gp.sched.lr = 0 + gp.sched.ret = 1 + gogo(&gp.sched) +} + +// fatalthrow implements an unrecoverable runtime throw. It freezes the +// system, prints stack traces starting from its caller, and terminates the +// process. +// +//go:nosplit +func fatalthrow() { + pc := getcallerpc() + sp := getcallersp() + gp := getg() + // Switch to the system stack to avoid any stack growth, which + // may make things worse if the runtime is in a bad state. + systemstack(func() { + startpanic_m() + + if dopanic_m(gp, pc, sp) { + // crash uses a decent amount of nosplit stack and we're already + // low on stack in throw, so crash on the system stack (unlike + // fatalpanic). + crash() + } + + exit(2) + }) + + *(*int)(nil) = 0 // not reached +} + +// fatalpanic implements an unrecoverable panic. It is like fatalthrow, except +// that if msgs != nil, fatalpanic also prints panic messages and decrements +// runningPanicDefers once main is blocked from exiting. +// +//go:nosplit +func fatalpanic(msgs *_panic) { + pc := getcallerpc() + sp := getcallersp() + gp := getg() + var docrash bool + // Switch to the system stack to avoid any stack growth, which + // may make things worse if the runtime is in a bad state. 
+ systemstack(func() { + if startpanic_m() && msgs != nil { + // There were panic messages and startpanic_m + // says it's okay to try to print them. + + // startpanic_m set panicking, which will + // block main from exiting, so now OK to + // decrement runningPanicDefers. + atomic.Xadd(&runningPanicDefers, -1) + + printpanics(msgs) + } + + docrash = dopanic_m(gp, pc, sp) + }) + + if docrash { + // By crashing outside the above systemstack call, debuggers + // will not be confused when generating a backtrace. + // Function crash is marked nosplit to avoid stack growth. + crash() + } + + systemstack(func() { + exit(2) + }) + + *(*int)(nil) = 0 // not reached +} + +// startpanic_m prepares for an unrecoverable panic. +// +// It returns true if panic messages should be printed, or false if +// the runtime is in bad shape and should just print stacks. +// +// It must not have write barriers even though the write barrier +// explicitly ignores writes once dying > 0. Write barriers still +// assume that g.m.p != nil, and this function may not have P +// in some contexts (e.g. a panic in a signal handler for a signal +// sent to an M with no P). +// +//go:nowritebarrierrec +func startpanic_m() bool { + _g_ := getg() + if mheap_.cachealloc.size == 0 { // very early + print("runtime: panic before malloc heap initialized\n") + } + // Disallow malloc during an unrecoverable panic. A panic + // could happen in a signal handler, or in a throw, or inside + // malloc itself. We want to catch if an allocation ever does + // happen (even if we're not in one of these situations). + _g_.m.mallocing++ + + // If we're dying because of a bad lock count, set it to a + // good lock count so we don't recursively panic below. + if _g_.m.locks < 0 { + _g_.m.locks = 1 + } + + switch _g_.m.dying { + case 0: + // Setting dying >0 has the side-effect of disabling this G's writebuf. + _g_.m.dying = 1 + atomic.Xadd(&panicking, 1) + lock(&paniclk) + if debug.schedtrace > 0 || debug.scheddetail > 0 { + schedtrace(true) + } + freezetheworld() + return true + case 1: + // Something failed while panicking. + // Just print a stack trace and exit. + _g_.m.dying = 2 + print("panic during panic\n") + return false + case 2: + // This is a genuine bug in the runtime, we couldn't even + // print the stack trace successfully. + _g_.m.dying = 3 + print("stack trace unavailable\n") + exit(4) + fallthrough + default: + // Can't even print! Just exit. + exit(5) + return false // Need to return something. + } +} + +var didothers bool +var deadlock mutex + +func dopanic_m(gp *g, pc, sp uintptr) bool { + if gp.sig != 0 { + signame := signame(gp.sig) + if signame != "" { + print("[signal ", signame) + } else { + print("[signal ", hex(gp.sig)) + } + print(" code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n") + } + + level, all, docrash := gotraceback() + _g_ := getg() + if level > 0 { + if gp != gp.m.curg { + all = true + } + if gp != gp.m.g0 { + print("\n") + goroutineheader(gp) + traceback(pc, sp, 0, gp) + } else if level >= 2 || _g_.m.throwing > 0 { + print("\nruntime stack:\n") + traceback(pc, sp, 0, gp) + } + if !didothers && all { + didothers = true + tracebackothers(gp) + } + } + unlock(&paniclk) + + if atomic.Xadd(&panicking, -1) != 0 { + // Some other m is panicking too. + // Let it print what it needs to print. + // Wait forever without chewing up cpu. + // It will exit when it's done. 
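// The two back-to-back lock(&deadlock) calls below are how "wait forever"
// is implemented: runtime mutexes are not re-entrant, so the second lock
// never succeeds and this M stays blocked while the other panicking M
// finishes its output and exits the process.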
+ lock(&deadlock) + lock(&deadlock) + } + + printDebugLog() + + return docrash +} + +// canpanic returns false if a signal should throw instead of +// panicking. +// +//go:nosplit +func canpanic(gp *g) bool { + // Note that g is m->gsignal, different from gp. + // Note also that g->m can change at preemption, so m can go stale + // if this function ever makes a function call. + _g_ := getg() + mp := _g_.m + + // Is it okay for gp to panic instead of crashing the program? + // Yes, as long as it is running Go code, not runtime code, + // and not stuck in a system call. + if gp == nil || gp != mp.curg { + return false + } + if mp.locks != 0 || mp.mallocing != 0 || mp.throwing != 0 || mp.preemptoff != "" || mp.dying != 0 { + return false + } + status := readgstatus(gp) + if status&^_Gscan != _Grunning || gp.syscallsp != 0 { + return false + } + if GOOS == "windows" && mp.libcallsp != 0 { + return false + } + return true +} + +// shouldPushSigpanic reports whether pc should be used as sigpanic's +// return PC (pushing a frame for the call). Otherwise, it should be +// left alone so that LR is used as sigpanic's return PC, effectively +// replacing the top-most frame with sigpanic. This is used by +// preparePanic. +func shouldPushSigpanic(gp *g, pc, lr uintptr) bool { + if pc == 0 { + // Probably a call to a nil func. The old LR is more + // useful in the stack trace. Not pushing the frame + // will make the trace look like a call to sigpanic + // instead. (Otherwise the trace will end at sigpanic + // and we won't get to see who faulted.) + return false + } + // If we don't recognize the PC as code, but we do recognize + // the link register as code, then this assumes the panic was + // caused by a call to non-code. In this case, we want to + // ignore this call to make unwinding show the context. + // + // If we running C code, we're not going to recognize pc as a + // Go function, so just assume it's good. Otherwise, traceback + // may try to read a stale LR that looks like a Go code + // pointer and wander into the woods. + if gp.m.incgo || findfunc(pc).valid() { + // This wasn't a bad call, so use PC as sigpanic's + // return PC. + return true + } + if findfunc(lr).valid() { + // This was a bad call, but the LR is good, so use the + // LR as sigpanic's return PC. + return false + } + // Neither the PC or LR is good. Hopefully pushing a frame + // will work. + return true +} + +// isAbortPC reports whether pc is the program counter at which +// runtime.abort raises a signal. +// +// It is nosplit because it's part of the isgoexception +// implementation. +// +//go:nosplit +func isAbortPC(pc uintptr) bool { + f := findfunc(pc) + if !f.valid() { + return false + } + return f.funcID == funcID_abort +} diff --git a/contrib/go/_std_1.18/src/runtime/plugin.go b/contrib/go/_std_1.18/src/runtime/plugin.go new file mode 100644 index 0000000000..a61dcc3b5d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/plugin.go @@ -0,0 +1,137 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import "unsafe" + +//go:linkname plugin_lastmoduleinit plugin.lastmoduleinit +func plugin_lastmoduleinit() (path string, syms map[string]any, errstr string) { + var md *moduledata + for pmd := firstmoduledata.next; pmd != nil; pmd = pmd.next { + if pmd.bad { + md = nil // we only want the last module + continue + } + md = pmd + } + if md == nil { + throw("runtime: no plugin module data") + } + if md.pluginpath == "" { + throw("runtime: plugin has empty pluginpath") + } + if md.typemap != nil { + return "", nil, "plugin already loaded" + } + + for _, pmd := range activeModules() { + if pmd.pluginpath == md.pluginpath { + md.bad = true + return "", nil, "plugin already loaded" + } + + if inRange(pmd.text, pmd.etext, md.text, md.etext) || + inRange(pmd.bss, pmd.ebss, md.bss, md.ebss) || + inRange(pmd.data, pmd.edata, md.data, md.edata) || + inRange(pmd.types, pmd.etypes, md.types, md.etypes) { + println("plugin: new module data overlaps with previous moduledata") + println("\tpmd.text-etext=", hex(pmd.text), "-", hex(pmd.etext)) + println("\tpmd.bss-ebss=", hex(pmd.bss), "-", hex(pmd.ebss)) + println("\tpmd.data-edata=", hex(pmd.data), "-", hex(pmd.edata)) + println("\tpmd.types-etypes=", hex(pmd.types), "-", hex(pmd.etypes)) + println("\tmd.text-etext=", hex(md.text), "-", hex(md.etext)) + println("\tmd.bss-ebss=", hex(md.bss), "-", hex(md.ebss)) + println("\tmd.data-edata=", hex(md.data), "-", hex(md.edata)) + println("\tmd.types-etypes=", hex(md.types), "-", hex(md.etypes)) + throw("plugin: new module data overlaps with previous moduledata") + } + } + for _, pkghash := range md.pkghashes { + if pkghash.linktimehash != *pkghash.runtimehash { + md.bad = true + return "", nil, "plugin was built with a different version of package " + pkghash.modulename + } + } + + // Initialize the freshly loaded module. + modulesinit() + typelinksinit() + + pluginftabverify(md) + moduledataverify1(md) + + lock(&itabLock) + for _, i := range md.itablinks { + itabAdd(i) + } + unlock(&itabLock) + + // Build a map of symbol names to symbols. Here in the runtime + // we fill out the first word of the interface, the type. We + // pass these zero value interfaces to the plugin package, + // where the symbol value is filled in (usually via cgo). + // + // Because functions are handled specially in the plugin package, + // function symbol names are prefixed here with '.' to avoid + // a dependency on the reflect package. + syms = make(map[string]any, len(md.ptab)) + for _, ptab := range md.ptab { + symName := resolveNameOff(unsafe.Pointer(md.types), ptab.name) + t := (*_type)(unsafe.Pointer(md.types)).typeOff(ptab.typ) + var val any + valp := (*[2]unsafe.Pointer)(unsafe.Pointer(&val)) + (*valp)[0] = unsafe.Pointer(t) + + name := symName.name() + if t.kind&kindMask == kindFunc { + name = "." + name + } + syms[name] = val + } + return md.pluginpath, syms, "" +} + +func pluginftabverify(md *moduledata) { + badtable := false + for i := 0; i < len(md.ftab); i++ { + entry := md.textAddr(md.ftab[i].entryoff) + if md.minpc <= entry && entry <= md.maxpc { + continue + } + + f := funcInfo{(*_func)(unsafe.Pointer(&md.pclntable[md.ftab[i].funcoff])), md} + name := funcname(f) + + // A common bug is f.entry has a relocation to a duplicate + // function symbol, meaning if we search for its PC we get + // a valid entry with a name that is useful for debugging. 
+ name2 := "none" + entry2 := uintptr(0) + f2 := findfunc(entry) + if f2.valid() { + name2 = funcname(f2) + entry2 = f2.entry() + } + badtable = true + println("ftab entry", hex(entry), "/", hex(entry2), ": ", + name, "/", name2, "outside pc range:[", hex(md.minpc), ",", hex(md.maxpc), "], modulename=", md.modulename, ", pluginpath=", md.pluginpath) + } + if badtable { + throw("runtime: plugin has bad symbol table") + } +} + +// inRange reports whether v0 or v1 are in the range [r0, r1]. +func inRange(r0, r1, v0, v1 uintptr) bool { + return (v0 >= r0 && v0 <= r1) || (v1 >= r0 && v1 <= r1) +} + +// A ptabEntry is generated by the compiler for each exported function +// and global variable in the main package of a plugin. It is used to +// initialize the plugin module's symbol map. +type ptabEntry struct { + name nameOff + typ typeOff +} diff --git a/contrib/go/_std_1.18/src/runtime/preempt.go b/contrib/go/_std_1.18/src/runtime/preempt.go new file mode 100644 index 0000000000..da24f5042c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/preempt.go @@ -0,0 +1,453 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Goroutine preemption +// +// A goroutine can be preempted at any safe-point. Currently, there +// are a few categories of safe-points: +// +// 1. A blocked safe-point occurs for the duration that a goroutine is +// descheduled, blocked on synchronization, or in a system call. +// +// 2. Synchronous safe-points occur when a running goroutine checks +// for a preemption request. +// +// 3. Asynchronous safe-points occur at any instruction in user code +// where the goroutine can be safely paused and a conservative +// stack and register scan can find stack roots. The runtime can +// stop a goroutine at an async safe-point using a signal. +// +// At both blocked and synchronous safe-points, a goroutine's CPU +// state is minimal and the garbage collector has complete information +// about its entire stack. This makes it possible to deschedule a +// goroutine with minimal space, and to precisely scan a goroutine's +// stack. +// +// Synchronous safe-points are implemented by overloading the stack +// bound check in function prologues. To preempt a goroutine at the +// next synchronous safe-point, the runtime poisons the goroutine's +// stack bound to a value that will cause the next stack bound check +// to fail and enter the stack growth implementation, which will +// detect that it was actually a preemption and redirect to preemption +// handling. +// +// Preemption at asynchronous safe-points is implemented by suspending +// the thread using an OS mechanism (e.g., signals) and inspecting its +// state to determine if the goroutine was at an asynchronous +// safe-point. Since the thread suspension itself is generally +// asynchronous, it also checks if the running goroutine wants to be +// preempted, since this could have changed. If all conditions are +// satisfied, it adjusts the signal context to make it look like the +// signaled thread just called asyncPreempt and resumes the thread. +// asyncPreempt spills all registers and enters the scheduler. +// +// (An alternative would be to preempt in the signal handler itself. +// This would let the OS save and restore the register state and the +// runtime would only need to know how to extract potentially +// pointer-containing registers from the signal context. 
However, this +// would consume an M for every preempted G, and the scheduler itself +// is not designed to run from a signal handler, as it tends to +// allocate memory and start threads in the preemption path.) + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/atomic" +) + +type suspendGState struct { + g *g + + // dead indicates the goroutine was not suspended because it + // is dead. This goroutine could be reused after the dead + // state was observed, so the caller must not assume that it + // remains dead. + dead bool + + // stopped indicates that this suspendG transitioned the G to + // _Gwaiting via g.preemptStop and thus is responsible for + // readying it when done. + stopped bool +} + +// suspendG suspends goroutine gp at a safe-point and returns the +// state of the suspended goroutine. The caller gets read access to +// the goroutine until it calls resumeG. +// +// It is safe for multiple callers to attempt to suspend the same +// goroutine at the same time. The goroutine may execute between +// subsequent successful suspend operations. The current +// implementation grants exclusive access to the goroutine, and hence +// multiple callers will serialize. However, the intent is to grant +// shared read access, so please don't depend on exclusive access. +// +// This must be called from the system stack and the user goroutine on +// the current M (if any) must be in a preemptible state. This +// prevents deadlocks where two goroutines attempt to suspend each +// other and both are in non-preemptible states. There are other ways +// to resolve this deadlock, but this seems simplest. +// +// TODO(austin): What if we instead required this to be called from a +// user goroutine? Then we could deschedule the goroutine while +// waiting instead of blocking the thread. If two goroutines tried to +// suspend each other, one of them would win and the other wouldn't +// complete the suspend until it was resumed. We would have to be +// careful that they couldn't actually queue up suspend for each other +// and then both be suspended. This would also avoid the need for a +// kernel context switch in the synchronous case because we could just +// directly schedule the waiter. The context switch is unavoidable in +// the signal case. +// +//go:systemstack +func suspendG(gp *g) suspendGState { + if mp := getg().m; mp.curg != nil && readgstatus(mp.curg) == _Grunning { + // Since we're on the system stack of this M, the user + // G is stuck at an unsafe point. If another goroutine + // were to try to preempt m.curg, it could deadlock. + throw("suspendG from non-preemptible goroutine") + } + + // See https://golang.org/cl/21503 for justification of the yield delay. + const yieldDelay = 10 * 1000 + var nextYield int64 + + // Drive the goroutine to a preemption point. + stopped := false + var asyncM *m + var asyncGen uint32 + var nextPreemptM int64 + for i := 0; ; i++ { + switch s := readgstatus(gp); s { + default: + if s&_Gscan != 0 { + // Someone else is suspending it. Wait + // for them to finish. + // + // TODO: It would be nicer if we could + // coalesce suspends. + break + } + + dumpgstatus(gp) + throw("invalid g status") + + case _Gdead: + // Nothing to suspend. + // + // preemptStop may need to be cleared, but + // doing that here could race with goroutine + // reuse. Instead, goexit0 clears it. + return suspendGState{dead: true} + + case _Gcopystack: + // The stack is being copied. We need to wait + // until this is done. 
+ + case _Gpreempted: + // We (or someone else) suspended the G. Claim + // ownership of it by transitioning it to + // _Gwaiting. + if !casGFromPreempted(gp, _Gpreempted, _Gwaiting) { + break + } + + // We stopped the G, so we have to ready it later. + stopped = true + + s = _Gwaiting + fallthrough + + case _Grunnable, _Gsyscall, _Gwaiting: + // Claim goroutine by setting scan bit. + // This may race with execution or readying of gp. + // The scan bit keeps it from transition state. + if !castogscanstatus(gp, s, s|_Gscan) { + break + } + + // Clear the preemption request. It's safe to + // reset the stack guard because we hold the + // _Gscan bit and thus own the stack. + gp.preemptStop = false + gp.preempt = false + gp.stackguard0 = gp.stack.lo + _StackGuard + + // The goroutine was already at a safe-point + // and we've now locked that in. + // + // TODO: It would be much better if we didn't + // leave it in _Gscan, but instead gently + // prevented its scheduling until resumption. + // Maybe we only use this to bump a suspended + // count and the scheduler skips suspended + // goroutines? That wouldn't be enough for + // {_Gsyscall,_Gwaiting} -> _Grunning. Maybe + // for all those transitions we need to check + // suspended and deschedule? + return suspendGState{g: gp, stopped: stopped} + + case _Grunning: + // Optimization: if there is already a pending preemption request + // (from the previous loop iteration), don't bother with the atomics. + if gp.preemptStop && gp.preempt && gp.stackguard0 == stackPreempt && asyncM == gp.m && atomic.Load(&asyncM.preemptGen) == asyncGen { + break + } + + // Temporarily block state transitions. + if !castogscanstatus(gp, _Grunning, _Gscanrunning) { + break + } + + // Request synchronous preemption. + gp.preemptStop = true + gp.preempt = true + gp.stackguard0 = stackPreempt + + // Prepare for asynchronous preemption. + asyncM2 := gp.m + asyncGen2 := atomic.Load(&asyncM2.preemptGen) + needAsync := asyncM != asyncM2 || asyncGen != asyncGen2 + asyncM = asyncM2 + asyncGen = asyncGen2 + + casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) + + // Send asynchronous preemption. We do this + // after CASing the G back to _Grunning + // because preemptM may be synchronous and we + // don't want to catch the G just spinning on + // its status. + if preemptMSupported && debug.asyncpreemptoff == 0 && needAsync { + // Rate limit preemptM calls. This is + // particularly important on Windows + // where preemptM is actually + // synchronous and the spin loop here + // can lead to live-lock. + now := nanotime() + if now >= nextPreemptM { + nextPreemptM = now + yieldDelay/2 + preemptM(asyncM) + } + } + } + + // TODO: Don't busy wait. This loop should really only + // be a simple read/decide/CAS loop that only fails if + // there's an active race. Once the CAS succeeds, we + // should queue up the preemption (which will require + // it to be reliable in the _Grunning case, not + // best-effort) and then sleep until we're notified + // that the goroutine is suspended. + if i == 0 { + nextYield = nanotime() + yieldDelay + } + if nanotime() < nextYield { + procyield(10) + } else { + osyield() + nextYield = nanotime() + yieldDelay/2 + } + } +} + +// resumeG undoes the effects of suspendG, allowing the suspended +// goroutine to continue from its current safe-point. +func resumeG(state suspendGState) { + if state.dead { + // We didn't actually stop anything. 
+ return + } + + gp := state.g + switch s := readgstatus(gp); s { + default: + dumpgstatus(gp) + throw("unexpected g status") + + case _Grunnable | _Gscan, + _Gwaiting | _Gscan, + _Gsyscall | _Gscan: + casfrom_Gscanstatus(gp, s, s&^_Gscan) + } + + if state.stopped { + // We stopped it, so we need to re-schedule it. + ready(gp, 0, true) + } +} + +// canPreemptM reports whether mp is in a state that is safe to preempt. +// +// It is nosplit because it has nosplit callers. +// +//go:nosplit +func canPreemptM(mp *m) bool { + return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning +} + +//go:generate go run mkpreempt.go + +// asyncPreempt saves all user registers and calls asyncPreempt2. +// +// When stack scanning encounters an asyncPreempt frame, it scans that +// frame and its parent frame conservatively. +// +// asyncPreempt is implemented in assembly. +func asyncPreempt() + +//go:nosplit +func asyncPreempt2() { + gp := getg() + gp.asyncSafePoint = true + if gp.preemptStop { + mcall(preemptPark) + } else { + mcall(gopreempt_m) + } + gp.asyncSafePoint = false +} + +// asyncPreemptStack is the bytes of stack space required to inject an +// asyncPreempt call. +var asyncPreemptStack = ^uintptr(0) + +func init() { + f := findfunc(abi.FuncPCABI0(asyncPreempt)) + total := funcMaxSPDelta(f) + f = findfunc(abi.FuncPCABIInternal(asyncPreempt2)) + total += funcMaxSPDelta(f) + // Add some overhead for return PCs, etc. + asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize + if asyncPreemptStack > _StackLimit { + // We need more than the nosplit limit. This isn't + // unsafe, but it may limit asynchronous preemption. + // + // This may be a problem if we start using more + // registers. In that case, we should store registers + // in a context object. If we pre-allocate one per P, + // asyncPreempt can spill just a few registers to the + // stack, then grab its context object and spill into + // it. When it enters the runtime, it would allocate a + // new context for the P. + print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n") + throw("async stack too large") + } +} + +// wantAsyncPreempt returns whether an asynchronous preemption is +// queued for gp. +func wantAsyncPreempt(gp *g) bool { + // Check both the G and the P. + return (gp.preempt || gp.m.p != 0 && gp.m.p.ptr().preempt) && readgstatus(gp)&^_Gscan == _Grunning +} + +// isAsyncSafePoint reports whether gp at instruction PC is an +// asynchronous safe point. This indicates that: +// +// 1. It's safe to suspend gp and conservatively scan its stack and +// registers. There are no potentially hidden pointer values and it's +// not in the middle of an atomic sequence like a write barrier. +// +// 2. gp has enough stack space to inject the asyncPreempt call. +// +// 3. It's generally safe to interact with the runtime, even if we're +// in a signal handler stopped here. For example, there are no runtime +// locks held, so acquiring a runtime lock won't self-deadlock. +// +// In some cases the PC is safe for asynchronous preemption but it +// also needs to adjust the resumption PC. The new PC is returned in +// the second result. +func isAsyncSafePoint(gp *g, pc, sp, lr uintptr) (bool, uintptr) { + mp := gp.m + + // Only user Gs can have safe-points. We check this first + // because it's extremely common that we'll catch mp in the + // scheduler processing this G preemption. + if mp.curg != gp { + return false, 0 + } + + // Check M state. 
+ if mp.p == 0 || !canPreemptM(mp) { + return false, 0 + } + + // Check stack space. + if sp < gp.stack.lo || sp-gp.stack.lo < asyncPreemptStack { + return false, 0 + } + + // Check if PC is an unsafe-point. + f := findfunc(pc) + if !f.valid() { + // Not Go code. + return false, 0 + } + if (GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "mips64" || GOARCH == "mips64le") && lr == pc+8 && funcspdelta(f, pc, nil) == 0 { + // We probably stopped at a half-executed CALL instruction, + // where the LR is updated but the PC has not. If we preempt + // here we'll see a seemingly self-recursive call, which is in + // fact not. + // This is normally ok, as we use the return address saved on + // stack for unwinding, not the LR value. But if this is a + // call to morestack, we haven't created the frame, and we'll + // use the LR for unwinding, which will be bad. + return false, 0 + } + up, startpc := pcdatavalue2(f, _PCDATA_UnsafePoint, pc) + if up == _PCDATA_UnsafePointUnsafe { + // Unsafe-point marked by compiler. This includes + // atomic sequences (e.g., write barrier) and nosplit + // functions (except at calls). + return false, 0 + } + if fd := funcdata(f, _FUNCDATA_LocalsPointerMaps); fd == nil || f.flag&funcFlag_ASM != 0 { + // This is assembly code. Don't assume it's well-formed. + // TODO: Empirically we still need the fd == nil check. Why? + // + // TODO: Are there cases that are safe but don't have a + // locals pointer map, like empty frame functions? + // It might be possible to preempt any assembly functions + // except the ones that have funcFlag_SPWRITE set in f.flag. + return false, 0 + } + name := funcname(f) + if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { + inltree := (*[1 << 20]inlinedCall)(inldata) + ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil) + if ix >= 0 { + name = funcnameFromNameoff(f, inltree[ix].func_) + } + } + if hasPrefix(name, "runtime.") || + hasPrefix(name, "runtime/internal/") || + hasPrefix(name, "reflect.") { + // For now we never async preempt the runtime or + // anything closely tied to the runtime. Known issues + // include: various points in the scheduler ("don't + // preempt between here and here"), much of the defer + // implementation (untyped info on stack), bulk write + // barriers (write barrier check), + // reflect.{makeFuncStub,methodValueCall}. + // + // TODO(austin): We should improve this, or opt things + // in incrementally. + return false, 0 + } + switch up { + case _PCDATA_Restart1, _PCDATA_Restart2: + // Restartable instruction sequence. Back off PC to + // the start PC. + if startpc == 0 || startpc > pc || pc-startpc > 20 { + throw("bad restart PC") + } + return true, startpc + case _PCDATA_RestartAtEntry: + // Restart from the function entry at resumption. + return true, f.entry() + } + return true, pc +} diff --git a/contrib/go/_std_1.18/src/runtime/preempt_amd64.s b/contrib/go/_std_1.18/src/runtime/preempt_amd64.s new file mode 100644 index 0000000000..31f7c8b66f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/preempt_amd64.s @@ -0,0 +1,84 @@ +// Code generated by mkpreempt.go; DO NOT EDIT. 
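isAsyncSafePoint above decides where a goroutine may be stopped, and the generated assembly below spills the registers before calling asyncPreempt2. The user-visible effect is that even a call-free loop can be suspended. A sketch of that effect; GOMAXPROCS(1) is set only to make it easy to observe, and on releases without asynchronous preemption (before Go 1.14) or with GODEBUG=asyncpreemptoff=1 this program can hang in runtime.GC:

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	runtime.GOMAXPROCS(1) // one P, so main only runs again if the loop is preempted

	go func() {
		// A tight loop with no calls has no synchronous safe-points in
		// its body; only asynchronous (signal-based) preemption can
		// suspend it.
		for i := 0; ; i++ {
			_ = i * i
		}
	}()

	time.Sleep(10 * time.Millisecond)
	start := time.Now()
	runtime.GC() // requires stopping the world, i.e. suspending the loop
	fmt.Println("GC finished in", time.Since(start))
}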
+ +#include "go_asm.h" +#include "textflag.h" + +TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 + PUSHQ BP + MOVQ SP, BP + // Save flags before clobbering them + PUSHFQ + // obj doesn't understand ADD/SUB on SP, but does understand ADJSP + ADJSP $368 + // But vet doesn't know ADJSP, so suppress vet stack checking + NOP SP + MOVQ AX, 0(SP) + MOVQ CX, 8(SP) + MOVQ DX, 16(SP) + MOVQ BX, 24(SP) + MOVQ SI, 32(SP) + MOVQ DI, 40(SP) + MOVQ R8, 48(SP) + MOVQ R9, 56(SP) + MOVQ R10, 64(SP) + MOVQ R11, 72(SP) + MOVQ R12, 80(SP) + MOVQ R13, 88(SP) + MOVQ R14, 96(SP) + MOVQ R15, 104(SP) + #ifdef GOOS_darwin + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0 + JE 2(PC) + VZEROUPPER + #endif + MOVUPS X0, 112(SP) + MOVUPS X1, 128(SP) + MOVUPS X2, 144(SP) + MOVUPS X3, 160(SP) + MOVUPS X4, 176(SP) + MOVUPS X5, 192(SP) + MOVUPS X6, 208(SP) + MOVUPS X7, 224(SP) + MOVUPS X8, 240(SP) + MOVUPS X9, 256(SP) + MOVUPS X10, 272(SP) + MOVUPS X11, 288(SP) + MOVUPS X12, 304(SP) + MOVUPS X13, 320(SP) + MOVUPS X14, 336(SP) + MOVUPS X15, 352(SP) + CALL ·asyncPreempt2(SB) + MOVUPS 352(SP), X15 + MOVUPS 336(SP), X14 + MOVUPS 320(SP), X13 + MOVUPS 304(SP), X12 + MOVUPS 288(SP), X11 + MOVUPS 272(SP), X10 + MOVUPS 256(SP), X9 + MOVUPS 240(SP), X8 + MOVUPS 224(SP), X7 + MOVUPS 208(SP), X6 + MOVUPS 192(SP), X5 + MOVUPS 176(SP), X4 + MOVUPS 160(SP), X3 + MOVUPS 144(SP), X2 + MOVUPS 128(SP), X1 + MOVUPS 112(SP), X0 + MOVQ 104(SP), R15 + MOVQ 96(SP), R14 + MOVQ 88(SP), R13 + MOVQ 80(SP), R12 + MOVQ 72(SP), R11 + MOVQ 64(SP), R10 + MOVQ 56(SP), R9 + MOVQ 48(SP), R8 + MOVQ 40(SP), DI + MOVQ 32(SP), SI + MOVQ 24(SP), BX + MOVQ 16(SP), DX + MOVQ 8(SP), CX + MOVQ 0(SP), AX + ADJSP $-368 + POPFQ + POPQ BP + RET diff --git a/contrib/go/_std_1.18/src/runtime/preempt_nonwindows.go b/contrib/go/_std_1.18/src/runtime/preempt_nonwindows.go new file mode 100644 index 0000000000..d6a2408cb7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/preempt_nonwindows.go @@ -0,0 +1,13 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !windows + +package runtime + +//go:nosplit +func osPreemptExtEnter(mp *m) {} + +//go:nosplit +func osPreemptExtExit(mp *m) {} diff --git a/contrib/go/_std_1.18/src/runtime/print.go b/contrib/go/_std_1.18/src/runtime/print.go new file mode 100644 index 0000000000..b2a642bb86 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/print.go @@ -0,0 +1,302 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// The compiler knows that a print of a value of this type +// should use printhex instead of printuint (decimal). +type hex uint64 + +func bytes(s string) (ret []byte) { + rp := (*slice)(unsafe.Pointer(&ret)) + sp := stringStructOf(&s) + rp.array = sp.str + rp.len = sp.len + rp.cap = sp.len + return +} + +var ( + // printBacklog is a circular buffer of messages written with the builtin + // print* functions, for use in postmortem analysis of core dumps. + printBacklog [512]byte + printBacklogIndex int +) + +// recordForPanic maintains a circular buffer of messages written by the +// runtime leading up to a process crash, allowing the messages to be +// extracted from a core dump. 
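For intuition, the circular-buffer behavior can be modeled in a few lines. This sketch is illustrative only; recordForPanic below additionally writes under printlock and stops recording once a crash is already underway:

package main

import "fmt"

// backlog is a tiny model of a circular text buffer: writes wrap around and
// overwrite the oldest bytes, so the most recent output is always retained.
type backlog struct {
	buf [16]byte // small for the example; printBacklog above is 512 bytes
	idx int
}

func (l *backlog) record(b []byte) {
	for i := 0; i < len(b); {
		n := copy(l.buf[l.idx:], b[i:])
		i += n
		l.idx = (l.idx + n) % len(l.buf)
	}
}

func main() {
	var l backlog
	l.record([]byte("hello, "))
	l.record([]byte("circular world"))
	fmt.Printf("%q at index %d\n", string(l.buf[:]), l.idx)
}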
+// +// The text written during a process crash (following "panic" or "fatal +// error") is not saved, since the goroutine stacks will generally be readable +// from the runtime datastructures in the core file. +func recordForPanic(b []byte) { + printlock() + + if atomic.Load(&panicking) == 0 { + // Not actively crashing: maintain circular buffer of print output. + for i := 0; i < len(b); { + n := copy(printBacklog[printBacklogIndex:], b[i:]) + i += n + printBacklogIndex += n + printBacklogIndex %= len(printBacklog) + } + } + + printunlock() +} + +var debuglock mutex + +// The compiler emits calls to printlock and printunlock around +// the multiple calls that implement a single Go print or println +// statement. Some of the print helpers (printslice, for example) +// call print recursively. There is also the problem of a crash +// happening during the print routines and needing to acquire +// the print lock to print information about the crash. +// For both these reasons, let a thread acquire the printlock 'recursively'. + +func printlock() { + mp := getg().m + mp.locks++ // do not reschedule between printlock++ and lock(&debuglock). + mp.printlock++ + if mp.printlock == 1 { + lock(&debuglock) + } + mp.locks-- // now we know debuglock is held and holding up mp.locks for us. +} + +func printunlock() { + mp := getg().m + mp.printlock-- + if mp.printlock == 0 { + unlock(&debuglock) + } +} + +// write to goroutine-local buffer if diverting output, +// or else standard error. +func gwrite(b []byte) { + if len(b) == 0 { + return + } + recordForPanic(b) + gp := getg() + // Don't use the writebuf if gp.m is dying. We want anything + // written through gwrite to appear in the terminal rather + // than be written to in some buffer, if we're in a panicking state. + // Note that we can't just clear writebuf in the gp.m.dying case + // because a panic isn't allowed to have any write barriers. + if gp == nil || gp.writebuf == nil || gp.m.dying > 0 { + writeErr(b) + return + } + + n := copy(gp.writebuf[len(gp.writebuf):cap(gp.writebuf)], b) + gp.writebuf = gp.writebuf[:len(gp.writebuf)+n] +} + +func printsp() { + printstring(" ") +} + +func printnl() { + printstring("\n") +} + +func printbool(v bool) { + if v { + printstring("true") + } else { + printstring("false") + } +} + +func printfloat(v float64) { + switch { + case v != v: + printstring("NaN") + return + case v+v == v && v > 0: + printstring("+Inf") + return + case v+v == v && v < 0: + printstring("-Inf") + return + } + + const n = 7 // digits printed + var buf [n + 7]byte + buf[0] = '+' + e := 0 // exp + if v == 0 { + if 1/v < 0 { + buf[0] = '-' + } + } else { + if v < 0 { + v = -v + buf[0] = '-' + } + + // normalize + for v >= 10 { + e++ + v /= 10 + } + for v < 1 { + e-- + v *= 10 + } + + // round + h := 5.0 + for i := 0; i < n; i++ { + h /= 10 + } + v += h + if v >= 10 { + e++ + v /= 10 + } + } + + // format +d.dddd+edd + for i := 0; i < n; i++ { + s := int(v) + buf[i+2] = byte(s + '0') + v -= float64(s) + v *= 10 + } + buf[1] = buf[2] + buf[2] = '.' 
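// Worked example for the digit code above and the exponent code just below:
// printing 1234.5 normalizes it to roughly 1.2345 with e = 3, adds the
// rounding term h = 5e-7, and extracts the seven digits 1 2 3 4 5 0 0; the
// two lines above then insert the decimal point, giving "+1.234500". The
// following lines append 'e', the exponent sign, and three exponent digits,
// so the final output is "+1.234500e+003".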
+ + buf[n+2] = 'e' + buf[n+3] = '+' + if e < 0 { + e = -e + buf[n+3] = '-' + } + + buf[n+4] = byte(e/100) + '0' + buf[n+5] = byte(e/10)%10 + '0' + buf[n+6] = byte(e%10) + '0' + gwrite(buf[:]) +} + +func printcomplex(c complex128) { + print("(", real(c), imag(c), "i)") +} + +func printuint(v uint64) { + var buf [100]byte + i := len(buf) + for i--; i > 0; i-- { + buf[i] = byte(v%10 + '0') + if v < 10 { + break + } + v /= 10 + } + gwrite(buf[i:]) +} + +func printint(v int64) { + if v < 0 { + printstring("-") + v = -v + } + printuint(uint64(v)) +} + +var minhexdigits = 0 // protected by printlock + +func printhex(v uint64) { + const dig = "0123456789abcdef" + var buf [100]byte + i := len(buf) + for i--; i > 0; i-- { + buf[i] = dig[v%16] + if v < 16 && len(buf)-i >= minhexdigits { + break + } + v /= 16 + } + i-- + buf[i] = 'x' + i-- + buf[i] = '0' + gwrite(buf[i:]) +} + +func printpointer(p unsafe.Pointer) { + printhex(uint64(uintptr(p))) +} +func printuintptr(p uintptr) { + printhex(uint64(p)) +} + +func printstring(s string) { + gwrite(bytes(s)) +} + +func printslice(s []byte) { + sp := (*slice)(unsafe.Pointer(&s)) + print("[", len(s), "/", cap(s), "]") + printpointer(sp.array) +} + +func printeface(e eface) { + print("(", e._type, ",", e.data, ")") +} + +func printiface(i iface) { + print("(", i.tab, ",", i.data, ")") +} + +// hexdumpWords prints a word-oriented hex dump of [p, end). +// +// If mark != nil, it will be called with each printed word's address +// and should return a character mark to appear just before that +// word's value. It can return 0 to indicate no mark. +func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { + printlock() + var markbuf [1]byte + markbuf[0] = ' ' + minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2) + for i := uintptr(0); p+i < end; i += goarch.PtrSize { + if i%16 == 0 { + if i != 0 { + println() + } + print(hex(p+i), ": ") + } + + if mark != nil { + markbuf[0] = mark(p + i) + if markbuf[0] == 0 { + markbuf[0] = ' ' + } + } + gwrite(markbuf[:]) + val := *(*uintptr)(unsafe.Pointer(p + i)) + print(hex(val)) + print(" ") + + // Can we symbolize val? + fn := findfunc(val) + if fn.valid() { + print("<", funcname(fn), "+", hex(val-fn.entry()), "> ") + } + } + minhexdigits = 0 + println() + printunlock() +} diff --git a/contrib/go/_std_1.18/src/runtime/proc.go b/contrib/go/_std_1.18/src/runtime/proc.go new file mode 100644 index 0000000000..b997a467ba --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/proc.go @@ -0,0 +1,6244 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/cpu" + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// set using cmd/go/internal/modload.ModInfoProg +var modinfo string + +// Goroutine scheduler +// The scheduler's job is to distribute ready-to-run goroutines over worker threads. +// +// The main concepts are: +// G - goroutine. +// M - worker thread, or machine. +// P - processor, a resource that is required to execute Go code. +// M must have an associated P to execute Go code, however it can be +// blocked or in a syscall w/o an associated P. +// +// Design doc at https://golang.org/s/go11sched. + +// Worker thread parking/unparking. 
+// We need to balance between keeping enough running worker threads to utilize +// available hardware parallelism and parking excessive running worker threads +// to conserve CPU resources and power. This is not simple for two reasons: +// (1) scheduler state is intentionally distributed (in particular, per-P work +// queues), so it is not possible to compute global predicates on fast paths; +// (2) for optimal thread management we would need to know the future (don't park +// a worker thread when a new goroutine will be readied in near future). +// +// Three rejected approaches that would work badly: +// 1. Centralize all scheduler state (would inhibit scalability). +// 2. Direct goroutine handoff. That is, when we ready a new goroutine and there +// is a spare P, unpark a thread and handoff it the thread and the goroutine. +// This would lead to thread state thrashing, as the thread that readied the +// goroutine can be out of work the very next moment, we will need to park it. +// Also, it would destroy locality of computation as we want to preserve +// dependent goroutines on the same thread; and introduce additional latency. +// 3. Unpark an additional thread whenever we ready a goroutine and there is an +// idle P, but don't do handoff. This would lead to excessive thread parking/ +// unparking as the additional threads will instantly park without discovering +// any work to do. +// +// The current approach: +// +// This approach applies to three primary sources of potential work: readying a +// goroutine, new/modified-earlier timers, and idle-priority GC. See below for +// additional details. +// +// We unpark an additional thread when we submit work if (this is wakep()): +// 1. There is an idle P, and +// 2. There are no "spinning" worker threads. +// +// A worker thread is considered spinning if it is out of local work and did +// not find work in the global run queue or netpoller; the spinning state is +// denoted in m.spinning and in sched.nmspinning. Threads unparked this way are +// also considered spinning; we don't do goroutine handoff so such threads are +// out of work initially. Spinning threads spin on looking for work in per-P +// run queues and timer heaps or from the GC before parking. If a spinning +// thread finds work it takes itself out of the spinning state and proceeds to +// execution. If it does not find work it takes itself out of the spinning +// state and then parks. +// +// If there is at least one spinning thread (sched.nmspinning>1), we don't +// unpark new threads when submitting work. To compensate for that, if the last +// spinning thread finds work and stops spinning, it must unpark a new spinning +// thread. This approach smooths out unjustified spikes of thread unparking, +// but at the same time guarantees eventual maximal CPU parallelism +// utilization. +// +// The main implementation complication is that we need to be very careful +// during spinning->non-spinning thread transition. This transition can race +// with submission of new work, and either one part or another needs to unpark +// another worker thread. If they both fail to do that, we can end up with +// semi-persistent CPU underutilization. +// +// The general pattern for submission is: +// 1. Submit work to the local run queue, timer heap, or GC state. +// 2. #StoreLoad-style memory barrier. +// 3. Check sched.nmspinning. +// +// The general pattern for spinning->non-spinning transition is: +// 1. Decrement nmspinning. +// 2. #StoreLoad-style memory barrier. +// 3. 
Check all per-P work queues and GC for new work. +// +// Note that all this complexity does not apply to global run queue as we are +// not sloppy about thread unparking when submitting to global queue. Also see +// comments for nmspinning manipulation. +// +// How these different sources of work behave varies, though it doesn't affect +// the synchronization approach: +// * Ready goroutine: this is an obvious source of work; the goroutine is +// immediately ready and must run on some thread eventually. +// * New/modified-earlier timer: The current timer implementation (see time.go) +// uses netpoll in a thread with no work available to wait for the soonest +// timer. If there is no thread waiting, we want a new spinning thread to go +// wait. +// * Idle-priority GC: The GC wakes a stopped idle thread to contribute to +// background GC work (note: currently disabled per golang.org/issue/19112). +// Also see golang.org/issue/44313, as this should be extended to all GC +// workers. + +var ( + m0 m + g0 g + mcache0 *mcache + raceprocctx0 uintptr +) + +//go:linkname runtime_inittask runtime..inittask +var runtime_inittask initTask + +//go:linkname main_inittask main..inittask +var main_inittask initTask + +// main_init_done is a signal used by cgocallbackg that initialization +// has been completed. It is made before _cgo_notify_runtime_init_done, +// so all cgo calls can rely on it existing. When main_init is complete, +// it is closed, meaning cgocallbackg can reliably receive from it. +var main_init_done chan bool + +//go:linkname main_main main.main +func main_main() + +// mainStarted indicates that the main M has started. +var mainStarted bool + +// runtimeInitTime is the nanotime() at which the runtime started. +var runtimeInitTime int64 + +// Value to use for signal mask for newly created M's. +var initSigmask sigset + +// The main goroutine. +func main() { + g := getg() + + // Racectx of m0->g0 is used only as the parent of the main goroutine. + // It must not be used for anything else. + g.m.g0.racectx = 0 + + // Max stack size is 1 GB on 64-bit, 250 MB on 32-bit. + // Using decimal instead of binary GB and MB because + // they look nicer in the stack overflow failure message. + if goarch.PtrSize == 8 { + maxstacksize = 1000000000 + } else { + maxstacksize = 250000000 + } + + // An upper limit for max stack size. Used to avoid random crashes + // after calling SetMaxStack and trying to allocate a stack that is too big, + // since stackalloc works with 32-bit sizes. + maxstackceiling = 2 * maxstacksize + + // Allow newproc to start new Ms. + mainStarted = true + + if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon + systemstack(func() { + newm(sysmon, nil, -1) + }) + } + + // Lock the main goroutine onto this, the main OS thread, + // during initialization. Most programs won't care, but a few + // do require certain calls to be made by the main thread. + // Those can arrange for main.main to run in the main thread + // by calling runtime.LockOSThread during initialization + // to preserve the lock. + lockOSThread() + + if g.m != &m0 { + throw("runtime.main not on m0") + } + + // Record when the world started. + // Must be before doInit for tracing init. + runtimeInitTime = nanotime() + if runtimeInitTime == 0 { + throw("nanotime returning zero") + } + + if debug.inittrace != 0 { + inittrace.id = getg().goid + inittrace.active = true + } + + doInit(&runtime_inittask) // Must be before defer. + + // Defer unlock so that runtime.Goexit during init does the unlock too. 
+ needUnlock := true + defer func() { + if needUnlock { + unlockOSThread() + } + }() + + gcenable() + + main_init_done = make(chan bool) + if iscgo { + if _cgo_thread_start == nil { + throw("_cgo_thread_start missing") + } + if GOOS != "windows" { + if _cgo_setenv == nil { + throw("_cgo_setenv missing") + } + if _cgo_unsetenv == nil { + throw("_cgo_unsetenv missing") + } + } + if _cgo_notify_runtime_init_done == nil { + throw("_cgo_notify_runtime_init_done missing") + } + // Start the template thread in case we enter Go from + // a C-created thread and need to create a new thread. + startTemplateThread() + cgocall(_cgo_notify_runtime_init_done, nil) + } + + doInit(&main_inittask) + + // Disable init tracing after main init done to avoid overhead + // of collecting statistics in malloc and newproc + inittrace.active = false + + close(main_init_done) + + needUnlock = false + unlockOSThread() + + if isarchive || islibrary { + // A program compiled with -buildmode=c-archive or c-shared + // has a main, but it is not executed. + return + } + fn := main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime + fn() + if raceenabled { + racefini() + } + + // Make racy client program work: if panicking on + // another goroutine at the same time as main returns, + // let the other goroutine finish printing the panic trace. + // Once it does, it will exit. See issues 3934 and 20018. + if atomic.Load(&runningPanicDefers) != 0 { + // Running deferred functions should not take long. + for c := 0; c < 1000; c++ { + if atomic.Load(&runningPanicDefers) == 0 { + break + } + Gosched() + } + } + if atomic.Load(&panicking) != 0 { + gopark(nil, nil, waitReasonPanicWait, traceEvGoStop, 1) + } + + exit(0) + for { + var x *int32 + *x = 0 + } +} + +// os_beforeExit is called from os.Exit(0). +//go:linkname os_beforeExit os.runtime_beforeExit +func os_beforeExit() { + if raceenabled { + racefini() + } +} + +// start forcegc helper goroutine +func init() { + go forcegchelper() +} + +func forcegchelper() { + forcegc.g = getg() + lockInit(&forcegc.lock, lockRankForcegc) + for { + lock(&forcegc.lock) + if forcegc.idle != 0 { + throw("forcegc: phase error") + } + atomic.Store(&forcegc.idle, 1) + goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1) + // this goroutine is explicitly resumed by sysmon + if debug.gctrace > 0 { + println("GC forced") + } + // Time-triggered, fully concurrent. + gcStart(gcTrigger{kind: gcTriggerTime, now: nanotime()}) + } +} + +//go:nosplit + +// Gosched yields the processor, allowing other goroutines to run. It does not +// suspend the current goroutine, so execution resumes automatically. +func Gosched() { + checkTimeouts() + mcall(gosched_m) +} + +// goschedguarded yields the processor like gosched, but also checks +// for forbidden states and opts out of the yield in those cases. +//go:nosplit +func goschedguarded() { + mcall(goschedguarded_m) +} + +// Puts the current goroutine into a waiting state and calls unlockf on the +// system stack. +// +// If unlockf returns false, the goroutine is resumed. +// +// unlockf must not access this G's stack, as it may be moved between +// the call to gopark and the call to unlockf. +// +// Note that because unlockf is called after putting the G into a waiting +// state, the G may have already been readied by the time unlockf is called +// unless there is external synchronization preventing the G from being +// readied. 
If unlockf returns false, it must guarantee that the G cannot be +// externally readied. +// +// Reason explains why the goroutine has been parked. It is displayed in stack +// traces and heap dumps. Reasons should be unique and descriptive. Do not +// re-use reasons, add new ones. +func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int) { + if reason != waitReasonSleep { + checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy + } + mp := acquirem() + gp := mp.curg + status := readgstatus(gp) + if status != _Grunning && status != _Gscanrunning { + throw("gopark: bad g status") + } + mp.waitlock = lock + mp.waitunlockf = unlockf + gp.waitreason = reason + mp.waittraceev = traceEv + mp.waittraceskip = traceskip + releasem(mp) + // can't do anything that might move the G between Ms here. + mcall(park_m) +} + +// Puts the current goroutine into a waiting state and unlocks the lock. +// The goroutine can be made runnable again by calling goready(gp). +func goparkunlock(lock *mutex, reason waitReason, traceEv byte, traceskip int) { + gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceEv, traceskip) +} + +func goready(gp *g, traceskip int) { + systemstack(func() { + ready(gp, traceskip, true) + }) +} + +//go:nosplit +func acquireSudog() *sudog { + // Delicate dance: the semaphore implementation calls + // acquireSudog, acquireSudog calls new(sudog), + // new calls malloc, malloc can call the garbage collector, + // and the garbage collector calls the semaphore implementation + // in stopTheWorld. + // Break the cycle by doing acquirem/releasem around new(sudog). + // The acquirem/releasem increments m.locks during new(sudog), + // which keeps the garbage collector from being invoked. + mp := acquirem() + pp := mp.p.ptr() + if len(pp.sudogcache) == 0 { + lock(&sched.sudoglock) + // First, try to grab a batch from central cache. + for len(pp.sudogcache) < cap(pp.sudogcache)/2 && sched.sudogcache != nil { + s := sched.sudogcache + sched.sudogcache = s.next + s.next = nil + pp.sudogcache = append(pp.sudogcache, s) + } + unlock(&sched.sudoglock) + // If the central cache is empty, allocate a new one. + if len(pp.sudogcache) == 0 { + pp.sudogcache = append(pp.sudogcache, new(sudog)) + } + } + n := len(pp.sudogcache) + s := pp.sudogcache[n-1] + pp.sudogcache[n-1] = nil + pp.sudogcache = pp.sudogcache[:n-1] + if s.elem != nil { + throw("acquireSudog: found s.elem != nil in cache") + } + releasem(mp) + return s +} + +//go:nosplit +func releaseSudog(s *sudog) { + if s.elem != nil { + throw("runtime: sudog with non-nil elem") + } + if s.isSelect { + throw("runtime: sudog with non-false isSelect") + } + if s.next != nil { + throw("runtime: sudog with non-nil next") + } + if s.prev != nil { + throw("runtime: sudog with non-nil prev") + } + if s.waitlink != nil { + throw("runtime: sudog with non-nil waitlink") + } + if s.c != nil { + throw("runtime: sudog with non-nil c") + } + gp := getg() + if gp.param != nil { + throw("runtime: releaseSudog with non-nil gp.param") + } + mp := acquirem() // avoid rescheduling to another P + pp := mp.p.ptr() + if len(pp.sudogcache) == cap(pp.sudogcache) { + // Transfer half of local cache to the central cache. 
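gopark and goready are internal, but the park/ready cycle they implement is what user code observes whenever a goroutine blocks on a channel and is later woken by a sender. A hedged, user-level analogue (the wait-reason mapping is conceptual, not an API):

package main

import "fmt"

func main() {
	ch := make(chan int) // unbuffered

	go func() {
		// The send makes the blocked receiver runnable again
		// (conceptually, goready on its g).
		ch <- 42
	}()

	// The receive parks this goroutine until a sender arrives
	// (conceptually, gopark with a channel-receive wait reason).
	fmt.Println(<-ch)
}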
+ var first, last *sudog + for len(pp.sudogcache) > cap(pp.sudogcache)/2 { + n := len(pp.sudogcache) + p := pp.sudogcache[n-1] + pp.sudogcache[n-1] = nil + pp.sudogcache = pp.sudogcache[:n-1] + if first == nil { + first = p + } else { + last.next = p + } + last = p + } + lock(&sched.sudoglock) + last.next = sched.sudogcache + sched.sudogcache = first + unlock(&sched.sudoglock) + } + pp.sudogcache = append(pp.sudogcache, s) + releasem(mp) +} + +// called from assembly +func badmcall(fn func(*g)) { + throw("runtime: mcall called on m->g0 stack") +} + +func badmcall2(fn func(*g)) { + throw("runtime: mcall function returned") +} + +func badreflectcall() { + panic(plainError("arg size to reflect.call more than 1GB")) +} + +var badmorestackg0Msg = "fatal: morestack on g0\n" + +//go:nosplit +//go:nowritebarrierrec +func badmorestackg0() { + sp := stringStructOf(&badmorestackg0Msg) + write(2, sp.str, int32(sp.len)) +} + +var badmorestackgsignalMsg = "fatal: morestack on gsignal\n" + +//go:nosplit +//go:nowritebarrierrec +func badmorestackgsignal() { + sp := stringStructOf(&badmorestackgsignalMsg) + write(2, sp.str, int32(sp.len)) +} + +//go:nosplit +func badctxt() { + throw("ctxt != 0") +} + +func lockedOSThread() bool { + gp := getg() + return gp.lockedm != 0 && gp.m.lockedg != 0 +} + +var ( + // allgs contains all Gs ever created (including dead Gs), and thus + // never shrinks. + // + // Access via the slice is protected by allglock or stop-the-world. + // Readers that cannot take the lock may (carefully!) use the atomic + // variables below. + allglock mutex + allgs []*g + + // allglen and allgptr are atomic variables that contain len(allgs) and + // &allgs[0] respectively. Proper ordering depends on totally-ordered + // loads and stores. Writes are protected by allglock. + // + // allgptr is updated before allglen. Readers should read allglen + // before allgptr to ensure that allglen is always <= len(allgptr). New + // Gs appended during the race can be missed. For a consistent view of + // all Gs, allglock must be held. + // + // allgptr copies should always be stored as a concrete type or + // unsafe.Pointer, not uintptr, to ensure that GC can still reach it + // even if it points to a stale array. + allglen uintptr + allgptr **g +) + +func allgadd(gp *g) { + if readgstatus(gp) == _Gidle { + throw("allgadd: bad status Gidle") + } + + lock(&allglock) + allgs = append(allgs, gp) + if &allgs[0] != allgptr { + atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0])) + } + atomic.Storeuintptr(&allglen, uintptr(len(allgs))) + unlock(&allglock) +} + +// allGsSnapshot returns a snapshot of the slice of all Gs. +// +// The world must be stopped or allglock must be held. +func allGsSnapshot() []*g { + assertWorldStoppedOrLockHeld(&allglock) + + // Because the world is stopped or allglock is held, allgadd + // cannot happen concurrently with this. allgs grows + // monotonically and existing entries never change, so we can + // simply return a copy of the slice header. For added safety, + // we trim everything past len because that can still change. + return allgs[:len(allgs):len(allgs)] +} + +// atomicAllG returns &allgs[0] and len(allgs) for use with atomicAllGIndex. +func atomicAllG() (**g, uintptr) { + length := atomic.Loaduintptr(&allglen) + ptr := (**g)(atomic.Loadp(unsafe.Pointer(&allgptr))) + return ptr, length +} + +// atomicAllGIndex returns ptr[i] with the allgptr returned from atomicAllG. 
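acquireSudog and releaseSudog above implement a two-level free list: a small per-P cache in front of a mutex-protected central list, refilling or spilling half at a time. A simplified generic sketch of that caching pattern (the item/central/local types below are made up, not the runtime's):

package freelist

import "sync"

type item struct{ next *item }

// central is the shared pool, protected by a mutex
// (like sched.sudogcache under sched.sudoglock).
type central struct {
	mu   sync.Mutex
	head *item
}

// local is a fixed-capacity per-worker cache in front of the central pool
// (like a P's sudogcache).
type local struct {
	buf []*item
	c   *central
}

// newLocal creates a cache of the given capacity backed by c.
func newLocal(c *central, capacity int) *local {
	return &local{buf: make([]*item, 0, capacity), c: c}
}

// get prefers the local cache, refilling up to half its capacity from the
// central pool before falling back to allocating a fresh item.
func (l *local) get() *item {
	if len(l.buf) == 0 {
		l.c.mu.Lock()
		for len(l.buf) < cap(l.buf)/2 && l.c.head != nil {
			it := l.c.head
			l.c.head = it.next
			it.next = nil
			l.buf = append(l.buf, it)
		}
		l.c.mu.Unlock()
		if len(l.buf) == 0 {
			l.buf = append(l.buf, new(item))
		}
	}
	it := l.buf[len(l.buf)-1]
	l.buf = l.buf[:len(l.buf)-1]
	return it
}

// put returns an item to the local cache, spilling half of the cache back
// to the central pool when it is full.
func (l *local) put(it *item) {
	if len(l.buf) == cap(l.buf) && cap(l.buf) > 0 {
		var first, last *item
		for len(l.buf) > cap(l.buf)/2 {
			n := len(l.buf) - 1
			p := l.buf[n]
			l.buf = l.buf[:n]
			if first == nil {
				first = p
			} else {
				last.next = p
			}
			last = p
		}
		l.c.mu.Lock()
		last.next = l.c.head
		l.c.head = first
		l.c.mu.Unlock()
	}
	l.buf = append(l.buf, it)
}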
+func atomicAllGIndex(ptr **g, i uintptr) *g { + return *(**g)(add(unsafe.Pointer(ptr), i*goarch.PtrSize)) +} + +// forEachG calls fn on every G from allgs. +// +// forEachG takes a lock to exclude concurrent addition of new Gs. +func forEachG(fn func(gp *g)) { + lock(&allglock) + for _, gp := range allgs { + fn(gp) + } + unlock(&allglock) +} + +// forEachGRace calls fn on every G from allgs. +// +// forEachGRace avoids locking, but does not exclude addition of new Gs during +// execution, which may be missed. +func forEachGRace(fn func(gp *g)) { + ptr, length := atomicAllG() + for i := uintptr(0); i < length; i++ { + gp := atomicAllGIndex(ptr, i) + fn(gp) + } + return +} + +const ( + // Number of goroutine ids to grab from sched.goidgen to local per-P cache at once. + // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number. + _GoidCacheBatch = 16 +) + +// cpuinit extracts the environment variable GODEBUG from the environment on +// Unix-like operating systems and calls internal/cpu.Initialize. +func cpuinit() { + const prefix = "GODEBUG=" + var env string + + switch GOOS { + case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux": + cpu.DebugOptions = true + + // Similar to goenv_unix but extracts the environment value for + // GODEBUG directly. + // TODO(moehrmann): remove when general goenvs() can be called before cpuinit() + n := int32(0) + for argv_index(argv, argc+1+n) != nil { + n++ + } + + for i := int32(0); i < n; i++ { + p := argv_index(argv, argc+1+i) + s := *(*string)(unsafe.Pointer(&stringStruct{unsafe.Pointer(p), findnull(p)})) + + if hasPrefix(s, prefix) { + env = gostring(p)[len(prefix):] + break + } + } + } + + cpu.Initialize(env) + + // Support cpu feature variables are used in code generated by the compiler + // to guard execution of instructions that can not be assumed to be always supported. + switch GOARCH { + case "386", "amd64": + x86HasPOPCNT = cpu.X86.HasPOPCNT + x86HasSSE41 = cpu.X86.HasSSE41 + x86HasFMA = cpu.X86.HasFMA + + case "arm": + armHasVFPv4 = cpu.ARM.HasVFPv4 + + case "arm64": + arm64HasATOMICS = cpu.ARM64.HasATOMICS + } +} + +// The bootstrap sequence is: +// +// call osinit +// call schedinit +// make & queue new G +// call runtime·mstart +// +// The new G calls runtime·main. +func schedinit() { + lockInit(&sched.lock, lockRankSched) + lockInit(&sched.sysmonlock, lockRankSysmon) + lockInit(&sched.deferlock, lockRankDefer) + lockInit(&sched.sudoglock, lockRankSudog) + lockInit(&deadlock, lockRankDeadlock) + lockInit(&paniclk, lockRankPanic) + lockInit(&allglock, lockRankAllg) + lockInit(&allpLock, lockRankAllp) + lockInit(&reflectOffs.lock, lockRankReflectOffs) + lockInit(&finlock, lockRankFin) + lockInit(&trace.bufLock, lockRankTraceBuf) + lockInit(&trace.stringsLock, lockRankTraceStrings) + lockInit(&trace.lock, lockRankTrace) + lockInit(&cpuprof.lock, lockRankCpuprof) + lockInit(&trace.stackTab.lock, lockRankTraceStackTab) + // Enforce that this lock is always a leaf lock. + // All of this lock's critical sections should be + // extremely short. + lockInit(&memstats.heapStats.noPLock, lockRankLeafRank) + + // raceinit must be the first call to race detector. + // In particular, it must be done before mallocinit below calls racemapshadow. + _g_ := getg() + if raceenabled { + _g_.racectx, raceprocctx0 = raceinit() + } + + sched.maxmcount = 10000 + + // The world starts stopped. 
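cpuinit has to fish GODEBUG out of the raw argv/environ block because the normal environment setup has not run yet. For comparison, ordinary Go code can read and split the same comma-separated key=value form with the standard library; a small sketch (parseGodebug is a hypothetical helper, not a runtime API):

package main

import (
	"fmt"
	"os"
	"strings"
)

// parseGodebug splits a GODEBUG value such as "inittrace=1,gctrace=1"
// into a map of option name to value.
func parseGodebug(s string) map[string]string {
	opts := make(map[string]string)
	for _, field := range strings.Split(s, ",") {
		if k, v, ok := strings.Cut(field, "="); ok {
			opts[k] = v
		}
	}
	return opts
}

func main() {
	fmt.Println(parseGodebug(os.Getenv("GODEBUG")))
}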
+ worldStopped() + + moduledataverify() + stackinit() + mallocinit() + cpuinit() // must run before alginit + alginit() // maps, hash, fastrand must not be used before this call + fastrandinit() // must run before mcommoninit + mcommoninit(_g_.m, -1) + modulesinit() // provides activeModules + typelinksinit() // uses maps, activeModules + itabsinit() // uses activeModules + stkobjinit() // must run before GC starts + + sigsave(&_g_.m.sigmask) + initSigmask = _g_.m.sigmask + + if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 { + println(offset) + throw("sched.timeToRun not aligned to 8 bytes") + } + + goargs() + goenvs() + parsedebugvars() + gcinit() + + lock(&sched.lock) + sched.lastpoll = uint64(nanotime()) + procs := ncpu + if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 { + procs = n + } + if procresize(procs) != nil { + throw("unknown runnable goroutine during bootstrap") + } + unlock(&sched.lock) + + // World is effectively started now, as P's can run. + worldStarted() + + // For cgocheck > 1, we turn on the write barrier at all times + // and check all pointer writes. We can't do this until after + // procresize because the write barrier needs a P. + if debug.cgocheck > 1 { + writeBarrier.cgo = true + writeBarrier.enabled = true + for _, p := range allp { + p.wbBuf.reset() + } + } + + if buildVersion == "" { + // Condition should never trigger. This code just serves + // to ensure runtime·buildVersion is kept in the resulting binary. + buildVersion = "unknown" + } + if len(modinfo) == 1 { + // Condition should never trigger. This code just serves + // to ensure runtime·modinfo is kept in the resulting binary. + modinfo = "" + } +} + +func dumpgstatus(gp *g) { + _g_ := getg() + print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n") +} + +// sched.lock must be held. +func checkmcount() { + assertLockHeld(&sched.lock) + + if mcount() > sched.maxmcount { + print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") + throw("thread exhaustion") + } +} + +// mReserveID returns the next ID to use for a new m. This new m is immediately +// considered 'running' by checkdead. +// +// sched.lock must be held. +func mReserveID() int64 { + assertLockHeld(&sched.lock) + + if sched.mnext+1 < sched.mnext { + throw("runtime: thread ID overflow") + } + id := sched.mnext + sched.mnext++ + checkmcount() + return id +} + +// Pre-allocated ID may be passed as 'id', or omitted by passing -1. +func mcommoninit(mp *m, id int64) { + _g_ := getg() + + // g0 stack won't make sense for user (and is not necessary unwindable). + if _g_ != _g_.m.g0 { + callers(1, mp.createstack[:]) + } + + lock(&sched.lock) + + if id >= 0 { + mp.id = id + } else { + mp.id = mReserveID() + } + + lo := uint32(int64Hash(uint64(mp.id), fastrandseed)) + hi := uint32(int64Hash(uint64(cputicks()), ^fastrandseed)) + if lo|hi == 0 { + hi = 1 + } + // Same behavior as for 1.17. + // TODO: Simplify ths. + if goarch.BigEndian { + mp.fastrand = uint64(lo)<<32 | uint64(hi) + } else { + mp.fastrand = uint64(hi)<<32 | uint64(lo) + } + + mpreinit(mp) + if mp.gsignal != nil { + mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard + } + + // Add to allm so garbage collector doesn't free g->m + // when it is just in a register or thread-local storage. + mp.alllink = allm + + // NumCgoCall() iterates over allm w/o schedlock, + // so we need to publish it safely. 
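schedinit above reads GOMAXPROCS from the environment exactly once, at bootstrap. After startup the same knob is exposed through runtime.GOMAXPROCS, which resizes the set of Ps via the newprocs/procresize path applied in startTheWorldWithSema further below. A short usage sketch:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// A non-positive argument only reports the current setting.
	fmt.Println("GOMAXPROCS =", runtime.GOMAXPROCS(0))

	// A positive argument changes the setting and returns the previous
	// value; the runtime stops the world and resizes its Ps to match.
	prev := runtime.GOMAXPROCS(2)
	fmt.Println("previous GOMAXPROCS =", prev)
}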
+ atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp)) + unlock(&sched.lock) + + // Allocate memory to hold a cgo traceback if the cgo call crashes. + if iscgo || GOOS == "solaris" || GOOS == "illumos" || GOOS == "windows" { + mp.cgoCallers = new(cgoCallers) + } +} + +var fastrandseed uintptr + +func fastrandinit() { + s := (*[unsafe.Sizeof(fastrandseed)]byte)(unsafe.Pointer(&fastrandseed))[:] + getRandomData(s) +} + +// Mark gp ready to run. +func ready(gp *g, traceskip int, next bool) { + if trace.enabled { + traceGoUnpark(gp, traceskip) + } + + status := readgstatus(gp) + + // Mark runnable. + _g_ := getg() + mp := acquirem() // disable preemption because it can be holding p in a local var + if status&^_Gscan != _Gwaiting { + dumpgstatus(gp) + throw("bad g->status in ready") + } + + // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq + casgstatus(gp, _Gwaiting, _Grunnable) + runqput(_g_.m.p.ptr(), gp, next) + wakep() + releasem(mp) +} + +// freezeStopWait is a large value that freezetheworld sets +// sched.stopwait to in order to request that all Gs permanently stop. +const freezeStopWait = 0x7fffffff + +// freezing is set to non-zero if the runtime is trying to freeze the +// world. +var freezing uint32 + +// Similar to stopTheWorld but best-effort and can be called several times. +// There is no reverse operation, used during crashing. +// This function must not lock any mutexes. +func freezetheworld() { + atomic.Store(&freezing, 1) + // stopwait and preemption requests can be lost + // due to races with concurrently executing threads, + // so try several times + for i := 0; i < 5; i++ { + // this should tell the scheduler to not start any new goroutines + sched.stopwait = freezeStopWait + atomic.Store(&sched.gcwaiting, 1) + // this should stop running goroutines + if !preemptall() { + break // no running goroutines + } + usleep(1000) + } + // to be sure + usleep(1000) + preemptall() + usleep(1000) +} + +// All reads and writes of g's status go through readgstatus, casgstatus +// castogscanstatus, casfrom_Gscanstatus. +//go:nosplit +func readgstatus(gp *g) uint32 { + return atomic.Load(&gp.atomicstatus) +} + +// The Gscanstatuses are acting like locks and this releases them. +// If it proves to be a performance hit we should be able to make these +// simple atomic stores but for now we are going to throw if +// we see an inconsistent state. +func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { + success := false + + // Check that transition is valid. + switch oldval { + default: + print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n") + dumpgstatus(gp) + throw("casfrom_Gscanstatus:top gp->status is not in scan state") + case _Gscanrunnable, + _Gscanwaiting, + _Gscanrunning, + _Gscansyscall, + _Gscanpreempted: + if newval == oldval&^_Gscan { + success = atomic.Cas(&gp.atomicstatus, oldval, newval) + } + } + if !success { + print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n") + dumpgstatus(gp) + throw("casfrom_Gscanstatus: gp->status is not in scan state") + } + releaseLockRank(lockRankGscan) +} + +// This will return false if the gp is not in the expected status and the cas fails. +// This acts like a lock acquire while the casfromgstatus acts like a lock release. 
+func castogscanstatus(gp *g, oldval, newval uint32) bool { + switch oldval { + case _Grunnable, + _Grunning, + _Gwaiting, + _Gsyscall: + if newval == oldval|_Gscan { + r := atomic.Cas(&gp.atomicstatus, oldval, newval) + if r { + acquireLockRank(lockRankGscan) + } + return r + + } + } + print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n") + throw("castogscanstatus") + panic("not reached") +} + +// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus +// and casfrom_Gscanstatus instead. +// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that +// put it in the Gscan state is finished. +//go:nosplit +func casgstatus(gp *g, oldval, newval uint32) { + if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval { + systemstack(func() { + print("runtime: casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n") + throw("casgstatus: bad incoming values") + }) + } + + acquireLockRank(lockRankGscan) + releaseLockRank(lockRankGscan) + + // See https://golang.org/cl/21503 for justification of the yield delay. + const yieldDelay = 5 * 1000 + var nextYield int64 + + // loop if gp->atomicstatus is in a scan state giving + // GC time to finish and change the state to oldval. + for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ { + if oldval == _Gwaiting && gp.atomicstatus == _Grunnable { + throw("casgstatus: waiting for Gwaiting but is Grunnable") + } + if i == 0 { + nextYield = nanotime() + yieldDelay + } + if nanotime() < nextYield { + for x := 0; x < 10 && gp.atomicstatus != oldval; x++ { + procyield(1) + } + } else { + osyield() + nextYield = nanotime() + yieldDelay/2 + } + } + + // Handle tracking for scheduling latencies. + if oldval == _Grunning { + // Track every 8th time a goroutine transitions out of running. + if gp.trackingSeq%gTrackingPeriod == 0 { + gp.tracking = true + } + gp.trackingSeq++ + } + if gp.tracking { + if oldval == _Grunnable { + // We transitioned out of runnable, so measure how much + // time we spent in this state and add it to + // runnableTime. + now := nanotime() + gp.runnableTime += now - gp.runnableStamp + gp.runnableStamp = 0 + } + if newval == _Grunnable { + // We just transitioned into runnable, so record what + // time that happened. + now := nanotime() + gp.runnableStamp = now + } else if newval == _Grunning { + // We're transitioning into running, so turn off + // tracking and record how much time we spent in + // runnable. + gp.tracking = false + sched.timeToRun.record(gp.runnableTime) + gp.runnableTime = 0 + } + } +} + +// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable. +// Returns old status. Cannot call casgstatus directly, because we are racing with an +// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus, +// it might have become Grunnable by the time we get to the cas. If we called casgstatus, +// it would loop waiting for the status to go back to Gwaiting, which it never will. +//go:nosplit +func casgcopystack(gp *g) uint32 { + for { + oldstatus := readgstatus(gp) &^ _Gscan + if oldstatus != _Gwaiting && oldstatus != _Grunnable { + throw("copystack: bad status, not Gwaiting or Grunnable") + } + if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) { + return oldstatus + } + } +} + +// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted. 
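casgstatus above retries a failed CAS with a short procyield spin and then falls back to osyield once the wait drags on. The same spin-then-yield shape can be expressed with public APIs; a generic sketch (the states and helper are made up, and runtime.Gosched stands in for osyield):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

const (
	stateIdle uint32 = iota
	stateBusy
)

// casWithBackoff keeps trying to move *state from old to new, spinning
// briefly and then yielding the processor between attempts so it does not
// burn a CPU while another goroutine holds the state.
func casWithBackoff(state *uint32, old, new uint32) {
	for i := 0; !atomic.CompareAndSwapUint32(state, old, new); i++ {
		if i < 10 {
			continue // brief spin: the holder may finish very soon
		}
		runtime.Gosched() // longer wait: yield, like the osyield fallback
	}
}

func main() {
	var state uint32 = stateIdle
	casWithBackoff(&state, stateIdle, stateBusy)
	fmt.Println("state =", atomic.LoadUint32(&state))
}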
+// +// TODO(austin): This is the only status operation that both changes +// the status and locks the _Gscan bit. Rethink this. +func casGToPreemptScan(gp *g, old, new uint32) { + if old != _Grunning || new != _Gscan|_Gpreempted { + throw("bad g transition") + } + acquireLockRank(lockRankGscan) + for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) { + } +} + +// casGFromPreempted attempts to transition gp from _Gpreempted to +// _Gwaiting. If successful, the caller is responsible for +// re-scheduling gp. +func casGFromPreempted(gp *g, old, new uint32) bool { + if old != _Gpreempted || new != _Gwaiting { + throw("bad g transition") + } + return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting) +} + +// stopTheWorld stops all P's from executing goroutines, interrupting +// all goroutines at GC safe points and records reason as the reason +// for the stop. On return, only the current goroutine's P is running. +// stopTheWorld must not be called from a system stack and the caller +// must not hold worldsema. The caller must call startTheWorld when +// other P's should resume execution. +// +// stopTheWorld is safe for multiple goroutines to call at the +// same time. Each will execute its own stop, and the stops will +// be serialized. +// +// This is also used by routines that do stack dumps. If the system is +// in panic or being exited, this may not reliably stop all +// goroutines. +func stopTheWorld(reason string) { + semacquire(&worldsema) + gp := getg() + gp.m.preemptoff = reason + systemstack(func() { + // Mark the goroutine which called stopTheWorld preemptible so its + // stack may be scanned. + // This lets a mark worker scan us while we try to stop the world + // since otherwise we could get in a mutual preemption deadlock. + // We must not modify anything on the G stack because a stack shrink + // may occur. A stack shrink is otherwise OK though because in order + // to return from this function (and to leave the system stack) we + // must have preempted all goroutines, including any attempting + // to scan our stack, in which case, any stack shrinking will + // have already completed by the time we exit. + casgstatus(gp, _Grunning, _Gwaiting) + stopTheWorldWithSema() + casgstatus(gp, _Gwaiting, _Grunning) + }) +} + +// startTheWorld undoes the effects of stopTheWorld. +func startTheWorld() { + systemstack(func() { startTheWorldWithSema(false) }) + + // worldsema must be held over startTheWorldWithSema to ensure + // gomaxprocs cannot change while worldsema is held. + // + // Release worldsema with direct handoff to the next waiter, but + // acquirem so that semrelease1 doesn't try to yield our time. + // + // Otherwise if e.g. ReadMemStats is being called in a loop, + // it might stomp on other attempts to stop the world, such as + // for starting or ending GC. The operation this blocks is + // so heavy-weight that we should just try to be as fair as + // possible here. + // + // We don't want to just allow us to get preempted between now + // and releasing the semaphore because then we keep everyone + // (including, for example, GCs) waiting longer. + mp := acquirem() + mp.preemptoff = "" + semrelease1(&worldsema, true, 0) + releasem(mp) +} + +// stopTheWorldGC has the same effect as stopTheWorld, but blocks +// until the GC is not running. It also blocks a GC from starting +// until startTheWorldGC is called. +func stopTheWorldGC(reason string) { + semacquire(&gcsema) + stopTheWorld(reason) +} + +// startTheWorldGC undoes the effects of stopTheWorldGC. 
+func startTheWorldGC() { + startTheWorld() + semrelease(&gcsema) +} + +// Holding worldsema grants an M the right to try to stop the world. +var worldsema uint32 = 1 + +// Holding gcsema grants the M the right to block a GC, and blocks +// until the current GC is done. In particular, it prevents gomaxprocs +// from changing concurrently. +// +// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle +// being changed/enabled during a GC, remove this. +var gcsema uint32 = 1 + +// stopTheWorldWithSema is the core implementation of stopTheWorld. +// The caller is responsible for acquiring worldsema and disabling +// preemption first and then should stopTheWorldWithSema on the system +// stack: +// +// semacquire(&worldsema, 0) +// m.preemptoff = "reason" +// systemstack(stopTheWorldWithSema) +// +// When finished, the caller must either call startTheWorld or undo +// these three operations separately: +// +// m.preemptoff = "" +// systemstack(startTheWorldWithSema) +// semrelease(&worldsema) +// +// It is allowed to acquire worldsema once and then execute multiple +// startTheWorldWithSema/stopTheWorldWithSema pairs. +// Other P's are able to execute between successive calls to +// startTheWorldWithSema and stopTheWorldWithSema. +// Holding worldsema causes any other goroutines invoking +// stopTheWorld to block. +func stopTheWorldWithSema() { + _g_ := getg() + + // If we hold a lock, then we won't be able to stop another M + // that is blocked trying to acquire the lock. + if _g_.m.locks > 0 { + throw("stopTheWorld: holding locks") + } + + lock(&sched.lock) + sched.stopwait = gomaxprocs + atomic.Store(&sched.gcwaiting, 1) + preemptall() + // stop current P + _g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic. + sched.stopwait-- + // try to retake all P's in Psyscall status + for _, p := range allp { + s := p.status + if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) { + if trace.enabled { + traceGoSysBlock(p) + traceProcStop(p) + } + p.syscalltick++ + sched.stopwait-- + } + } + // stop idle P's + for { + p := pidleget() + if p == nil { + break + } + p.status = _Pgcstop + sched.stopwait-- + } + wait := sched.stopwait > 0 + unlock(&sched.lock) + + // wait for remaining P's to stop voluntarily + if wait { + for { + // wait for 100us, then try to re-preempt in case of any races + if notetsleep(&sched.stopnote, 100*1000) { + noteclear(&sched.stopnote) + break + } + preemptall() + } + } + + // sanity checks + bad := "" + if sched.stopwait != 0 { + bad = "stopTheWorld: not stopped (stopwait != 0)" + } else { + for _, p := range allp { + if p.status != _Pgcstop { + bad = "stopTheWorld: not stopped (status != _Pgcstop)" + } + } + } + if atomic.Load(&freezing) != 0 { + // Some other thread is panicking. This can cause the + // sanity checks above to fail if the panic happens in + // the signal handler on a stopped thread. Either way, + // we should halt this thread. 
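One user-visible caller of this stop-the-world machinery is runtime.ReadMemStats, which the startTheWorld comment above mentions: it stops the world for a consistent snapshot and restarts it before returning. A brief usage sketch:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var ms runtime.MemStats

	// Briefly stops the world (the stopTheWorld/startTheWorld pair above)
	// so the statistics form a consistent snapshot.
	runtime.ReadMemStats(&ms)

	fmt.Printf("heap in use: %d bytes, GC cycles: %d\n", ms.HeapInuse, ms.NumGC)
}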
+ lock(&deadlock) + lock(&deadlock) + } + if bad != "" { + throw(bad) + } + + worldStopped() +} + +func startTheWorldWithSema(emitTraceEvent bool) int64 { + assertWorldStopped() + + mp := acquirem() // disable preemption because it can be holding p in a local var + if netpollinited() { + list := netpoll(0) // non-blocking + injectglist(&list) + } + lock(&sched.lock) + + procs := gomaxprocs + if newprocs != 0 { + procs = newprocs + newprocs = 0 + } + p1 := procresize(procs) + sched.gcwaiting = 0 + if sched.sysmonwait != 0 { + sched.sysmonwait = 0 + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + + worldStarted() + + for p1 != nil { + p := p1 + p1 = p1.link.ptr() + if p.m != 0 { + mp := p.m.ptr() + p.m = 0 + if mp.nextp != 0 { + throw("startTheWorld: inconsistent mp->nextp") + } + mp.nextp.set(p) + notewakeup(&mp.park) + } else { + // Start M to run P. Do not start another M below. + newm(nil, p, -1) + } + } + + // Capture start-the-world time before doing clean-up tasks. + startTime := nanotime() + if emitTraceEvent { + traceGCSTWDone() + } + + // Wakeup an additional proc in case we have excessive runnable goroutines + // in local queues or in the global queue. If we don't, the proc will park itself. + // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. + wakep() + + releasem(mp) + + return startTime +} + +// usesLibcall indicates whether this runtime performs system calls +// via libcall. +func usesLibcall() bool { + switch GOOS { + case "aix", "darwin", "illumos", "ios", "solaris", "windows": + return true + case "openbsd": + return GOARCH == "386" || GOARCH == "amd64" || GOARCH == "arm" || GOARCH == "arm64" + } + return false +} + +// mStackIsSystemAllocated indicates whether this runtime starts on a +// system-allocated stack. +func mStackIsSystemAllocated() bool { + switch GOOS { + case "aix", "darwin", "plan9", "illumos", "ios", "solaris", "windows": + return true + case "openbsd": + switch GOARCH { + case "386", "amd64", "arm", "arm64": + return true + } + } + return false +} + +// mstart is the entry-point for new Ms. +// It is written in assembly, uses ABI0, is marked TOPFRAME, and calls mstart0. +func mstart() + +// mstart0 is the Go entry-point for new Ms. +// This must not split the stack because we may not even have stack +// bounds set up yet. +// +// May run during STW (because it doesn't have a P yet), so write +// barriers are not allowed. +// +//go:nosplit +//go:nowritebarrierrec +func mstart0() { + _g_ := getg() + + osStack := _g_.stack.lo == 0 + if osStack { + // Initialize stack bounds from system stack. + // Cgo may have left stack size in stack.hi. + // minit may update the stack bounds. + // + // Note: these bounds may not be very accurate. + // We set hi to &size, but there are things above + // it. The 1024 is supposed to compensate this, + // but is somewhat arbitrary. + size := _g_.stack.hi + if size == 0 { + size = 8192 * sys.StackGuardMultiplier + } + _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size))) + _g_.stack.lo = _g_.stack.hi - size + 1024 + } + // Initialize stack guard so that we can start calling regular + // Go code. + _g_.stackguard0 = _g_.stack.lo + _StackGuard + // This is the g0, so we can also call go:systemstack + // functions, which check stackguard1. + _g_.stackguard1 = _g_.stackguard0 + mstart1() + + // Exit this thread. 
+ if mStackIsSystemAllocated() { + // Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate + // the stack, but put it in _g_.stack before mstart, + // so the logic above hasn't set osStack yet. + osStack = true + } + mexit(osStack) +} + +// The go:noinline is to guarantee the getcallerpc/getcallersp below are safe, +// so that we can set up g0.sched to return to the call of mstart1 above. +//go:noinline +func mstart1() { + _g_ := getg() + + if _g_ != _g_.m.g0 { + throw("bad runtime·mstart") + } + + // Set up m.g0.sched as a label returning to just + // after the mstart1 call in mstart0 above, for use by goexit0 and mcall. + // We're never coming back to mstart1 after we call schedule, + // so other calls can reuse the current frame. + // And goexit0 does a gogo that needs to return from mstart1 + // and let mstart0 exit the thread. + _g_.sched.g = guintptr(unsafe.Pointer(_g_)) + _g_.sched.pc = getcallerpc() + _g_.sched.sp = getcallersp() + + asminit() + minit() + + // Install signal handlers; after minit so that minit can + // prepare the thread to be able to handle the signals. + if _g_.m == &m0 { + mstartm0() + } + + if fn := _g_.m.mstartfn; fn != nil { + fn() + } + + if _g_.m != &m0 { + acquirep(_g_.m.nextp.ptr()) + _g_.m.nextp = 0 + } + schedule() +} + +// mstartm0 implements part of mstart1 that only runs on the m0. +// +// Write barriers are allowed here because we know the GC can't be +// running yet, so they'll be no-ops. +// +//go:yeswritebarrierrec +func mstartm0() { + // Create an extra M for callbacks on threads not created by Go. + // An extra M is also needed on Windows for callbacks created by + // syscall.NewCallback. See issue #6751 for details. + if (iscgo || GOOS == "windows") && !cgoHasExtraM { + cgoHasExtraM = true + newextram() + } + initsig(false) +} + +// mPark causes a thread to park itself, returning once woken. +//go:nosplit +func mPark() { + gp := getg() + notesleep(&gp.m.park) + noteclear(&gp.m.park) +} + +// mexit tears down and exits the current thread. +// +// Don't call this directly to exit the thread, since it must run at +// the top of the thread stack. Instead, use gogo(&_g_.m.g0.sched) to +// unwind the stack to the point that exits the thread. +// +// It is entered with m.p != nil, so write barriers are allowed. It +// will release the P before exiting. +// +//go:yeswritebarrierrec +func mexit(osStack bool) { + g := getg() + m := g.m + + if m == &m0 { + // This is the main thread. Just wedge it. + // + // On Linux, exiting the main thread puts the process + // into a non-waitable zombie state. On Plan 9, + // exiting the main thread unblocks wait even though + // other threads are still running. On Solaris we can + // neither exitThread nor return from mstart. Other + // bad things probably happen on other platforms. + // + // We could try to clean up this M more before wedging + // it, but that complicates signal handling. + handoffp(releasep()) + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + mPark() + throw("locked m0 woke up") + } + + sigblock(true) + unminit() + + // Free the gsignal stack. + if m.gsignal != nil { + stackfree(m.gsignal.stack) + // On some platforms, when calling into VDSO (e.g. nanotime) + // we store our g on the gsignal stack, if there is one. + // Now the stack is freed, unlink it from the m, so we + // won't write to it when calling VDSO code. + m.gsignal = nil + } + + // Remove m from allm. 
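mexit above is also the path taken when a goroutine that called runtime.LockOSThread returns without unlocking: the documented behavior is that its OS thread is terminated rather than reused. A small sketch of that case:

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	go func() {
		runtime.LockOSThread()
		fmt.Println("locked goroutine exiting without UnlockOSThread")
		// Returning here terminates the underlying OS thread (it is not
		// returned to the pool), per the LockOSThread documentation.
	}()

	time.Sleep(100 * time.Millisecond)
}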
+ lock(&sched.lock) + for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { + if *pprev == m { + *pprev = m.alllink + goto found + } + } + throw("m not found in allm") +found: + if !osStack { + // Delay reaping m until it's done with the stack. + // + // If this is using an OS stack, the OS will free it + // so there's no need for reaping. + atomic.Store(&m.freeWait, 1) + // Put m on the free list, though it will not be reaped until + // freeWait is 0. Note that the free list must not be linked + // through alllink because some functions walk allm without + // locking, so may be using alllink. + m.freelink = sched.freem + sched.freem = m + } + unlock(&sched.lock) + + atomic.Xadd64(&ncgocall, int64(m.ncgocall)) + + // Release the P. + handoffp(releasep()) + // After this point we must not have write barriers. + + // Invoke the deadlock detector. This must happen after + // handoffp because it may have started a new M to take our + // P's work. + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + + if GOOS == "darwin" || GOOS == "ios" { + // Make sure pendingPreemptSignals is correct when an M exits. + // For #41702. + if atomic.Load(&m.signalPending) != 0 { + atomic.Xadd(&pendingPreemptSignals, -1) + } + } + + // Destroy all allocated resources. After this is called, we may no + // longer take any locks. + mdestroy(m) + + if osStack { + // Return from mstart and let the system thread + // library free the g0 stack and terminate the thread. + return + } + + // mstart is the thread's entry point, so there's nothing to + // return to. Exit the thread directly. exitThread will clear + // m.freeWait when it's done with the stack and the m can be + // reaped. + exitThread(&m.freeWait) +} + +// forEachP calls fn(p) for every P p when p reaches a GC safe point. +// If a P is currently executing code, this will bring the P to a GC +// safe point and execute fn on that P. If the P is not executing code +// (it is idle or in a syscall), this will call fn(p) directly while +// preventing the P from exiting its state. This does not ensure that +// fn will run on every CPU executing Go code, but it acts as a global +// memory barrier. GC uses this as a "ragged barrier." +// +// The caller must hold worldsema. +// +//go:systemstack +func forEachP(fn func(*p)) { + mp := acquirem() + _p_ := getg().m.p.ptr() + + lock(&sched.lock) + if sched.safePointWait != 0 { + throw("forEachP: sched.safePointWait != 0") + } + sched.safePointWait = gomaxprocs - 1 + sched.safePointFn = fn + + // Ask all Ps to run the safe point function. + for _, p := range allp { + if p != _p_ { + atomic.Store(&p.runSafePointFn, 1) + } + } + preemptall() + + // Any P entering _Pidle or _Psyscall from now on will observe + // p.runSafePointFn == 1 and will call runSafePointFn when + // changing its status to _Pidle/_Psyscall. + + // Run safe point function for all idle Ps. sched.pidle will + // not change because we hold sched.lock. + for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() { + if atomic.Cas(&p.runSafePointFn, 1, 0) { + fn(p) + sched.safePointWait-- + } + } + + wait := sched.safePointWait > 0 + unlock(&sched.lock) + + // Run fn for the current P. + fn(_p_) + + // Force Ps currently in _Psyscall into _Pidle and hand them + // off to induce safe point function execution. 
+ for _, p := range allp { + s := p.status + if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) { + if trace.enabled { + traceGoSysBlock(p) + traceProcStop(p) + } + p.syscalltick++ + handoffp(p) + } + } + + // Wait for remaining Ps to run fn. + if wait { + for { + // Wait for 100us, then try to re-preempt in + // case of any races. + // + // Requires system stack. + if notetsleep(&sched.safePointNote, 100*1000) { + noteclear(&sched.safePointNote) + break + } + preemptall() + } + } + if sched.safePointWait != 0 { + throw("forEachP: not done") + } + for _, p := range allp { + if p.runSafePointFn != 0 { + throw("forEachP: P did not run fn") + } + } + + lock(&sched.lock) + sched.safePointFn = nil + unlock(&sched.lock) + releasem(mp) +} + +// runSafePointFn runs the safe point function, if any, for this P. +// This should be called like +// +// if getg().m.p.runSafePointFn != 0 { +// runSafePointFn() +// } +// +// runSafePointFn must be checked on any transition in to _Pidle or +// _Psyscall to avoid a race where forEachP sees that the P is running +// just before the P goes into _Pidle/_Psyscall and neither forEachP +// nor the P run the safe-point function. +func runSafePointFn() { + p := getg().m.p.ptr() + // Resolve the race between forEachP running the safe-point + // function on this P's behalf and this P running the + // safe-point function directly. + if !atomic.Cas(&p.runSafePointFn, 1, 0) { + return + } + sched.safePointFn(p) + lock(&sched.lock) + sched.safePointWait-- + if sched.safePointWait == 0 { + notewakeup(&sched.safePointNote) + } + unlock(&sched.lock) +} + +// When running with cgo, we call _cgo_thread_start +// to start threads for us so that we can play nicely with +// foreign code. +var cgoThreadStart unsafe.Pointer + +type cgothreadstart struct { + g guintptr + tls *uint64 + fn unsafe.Pointer +} + +// Allocate a new m unassociated with any thread. +// Can use p for allocation context if needed. +// fn is recorded as the new m's m.mstartfn. +// id is optional pre-allocated m ID. Omit by passing -1. +// +// This function is allowed to have write barriers even if the caller +// isn't because it borrows _p_. +// +//go:yeswritebarrierrec +func allocm(_p_ *p, fn func(), id int64) *m { + allocmLock.rlock() + + // The caller owns _p_, but we may borrow (i.e., acquirep) it. We must + // disable preemption to ensure it is not stolen, which would make the + // caller lose ownership. + acquirem() + + _g_ := getg() + if _g_.m.p == 0 { + acquirep(_p_) // temporarily borrow p for mallocs in this function + } + + // Release the free M list. We need to do this somewhere and + // this may free up a stack we can use. + if sched.freem != nil { + lock(&sched.lock) + var newList *m + for freem := sched.freem; freem != nil; { + if freem.freeWait != 0 { + next := freem.freelink + freem.freelink = newList + newList = freem + freem = next + continue + } + // stackfree must be on the system stack, but allocm is + // reachable off the system stack transitively from + // startm. + systemstack(func() { + stackfree(freem.g0.stack) + }) + freem = freem.freelink + } + sched.freem = newList + unlock(&sched.lock) + } + + mp := new(m) + mp.mstartfn = fn + mcommoninit(mp, id) + + // In case of cgo or Solaris or illumos or Darwin, pthread_create will make us a stack. + // Windows and Plan 9 will layout sched stack on OS stack. 
+ if iscgo || mStackIsSystemAllocated() { + mp.g0 = malg(-1) + } else { + mp.g0 = malg(8192 * sys.StackGuardMultiplier) + } + mp.g0.m = mp + + if _p_ == _g_.m.p.ptr() { + releasep() + } + + releasem(_g_.m) + allocmLock.runlock() + return mp +} + +// needm is called when a cgo callback happens on a +// thread without an m (a thread not created by Go). +// In this case, needm is expected to find an m to use +// and return with m, g initialized correctly. +// Since m and g are not set now (likely nil, but see below) +// needm is limited in what routines it can call. In particular +// it can only call nosplit functions (textflag 7) and cannot +// do any scheduling that requires an m. +// +// In order to avoid needing heavy lifting here, we adopt +// the following strategy: there is a stack of available m's +// that can be stolen. Using compare-and-swap +// to pop from the stack has ABA races, so we simulate +// a lock by doing an exchange (via Casuintptr) to steal the stack +// head and replace the top pointer with MLOCKED (1). +// This serves as a simple spin lock that we can use even +// without an m. The thread that locks the stack in this way +// unlocks the stack by storing a valid stack head pointer. +// +// In order to make sure that there is always an m structure +// available to be stolen, we maintain the invariant that there +// is always one more than needed. At the beginning of the +// program (if cgo is in use) the list is seeded with a single m. +// If needm finds that it has taken the last m off the list, its job +// is - once it has installed its own m so that it can do things like +// allocate memory - to create a spare m and put it on the list. +// +// Each of these extra m's also has a g0 and a curg that are +// pressed into service as the scheduling stack and current +// goroutine for the duration of the cgo callback. +// +// When the callback is done with the m, it calls dropm to +// put the m back on the list. +//go:nosplit +func needm() { + if (iscgo || GOOS == "windows") && !cgoHasExtraM { + // Can happen if C/C++ code calls Go from a global ctor. + // Can also happen on Windows if a global ctor uses a + // callback created by syscall.NewCallback. See issue #6751 + // for details. + // + // Can not throw, because scheduler is not initialized yet. + write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback))) + exit(1) + } + + // Save and block signals before getting an M. + // The signal handler may call needm itself, + // and we must avoid a deadlock. Also, once g is installed, + // any incoming signals will try to execute, + // but we won't have the sigaltstack settings and other data + // set up appropriately until the end of minit, which will + // unblock the signals. This is the same dance as when + // starting a new m to run Go code via newosproc. + var sigmask sigset + sigsave(&sigmask) + sigblock(false) + + // Lock extra list, take head, unlock popped list. + // nilokay=false is safe here because of the invariant above, + // that the extra list always contains or will soon contain + // at least one m. + mp := lockextra(false) + + // Set needextram when we've just emptied the list, + // so that the eventual call into cgocallbackg will + // allocate a new m for the extra list. We delay the + // allocation until then so that it can be done + // after exitsyscall makes sure it is okay to be + // running at all (that is, there's no garbage collection + // running right now). 
+ mp.needextram = mp.schedlink == 0 + extraMCount-- + unlockextra(mp.schedlink.ptr()) + + // Store the original signal mask for use by minit. + mp.sigmask = sigmask + + // Install TLS on some platforms (previously setg + // would do this if necessary). + osSetupTLS(mp) + + // Install g (= m->g0) and set the stack bounds + // to match the current stack. We don't actually know + // how big the stack is, like we don't know how big any + // scheduling stack is, but we assume there's at least 32 kB, + // which is more than enough for us. + setg(mp.g0) + _g_ := getg() + _g_.stack.hi = getcallersp() + 1024 + _g_.stack.lo = getcallersp() - 32*1024 + _g_.stackguard0 = _g_.stack.lo + _StackGuard + + // Initialize this thread to use the m. + asminit() + minit() + + // mp.curg is now a real goroutine. + casgstatus(mp.curg, _Gdead, _Gsyscall) + atomic.Xadd(&sched.ngsys, -1) +} + +var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n") + +// newextram allocates m's and puts them on the extra list. +// It is called with a working local m, so that it can do things +// like call schedlock and allocate. +func newextram() { + c := atomic.Xchg(&extraMWaiters, 0) + if c > 0 { + for i := uint32(0); i < c; i++ { + oneNewExtraM() + } + } else { + // Make sure there is at least one extra M. + mp := lockextra(true) + unlockextra(mp) + if mp == nil { + oneNewExtraM() + } + } +} + +// oneNewExtraM allocates an m and puts it on the extra list. +func oneNewExtraM() { + // Create extra goroutine locked to extra m. + // The goroutine is the context in which the cgo callback will run. + // The sched.pc will never be returned to, but setting it to + // goexit makes clear to the traceback routines where + // the goroutine stack ends. + mp := allocm(nil, nil, -1) + gp := malg(4096) + gp.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum + gp.sched.sp = gp.stack.hi + gp.sched.sp -= 4 * goarch.PtrSize // extra space in case of reads slightly beyond frame + gp.sched.lr = 0 + gp.sched.g = guintptr(unsafe.Pointer(gp)) + gp.syscallpc = gp.sched.pc + gp.syscallsp = gp.sched.sp + gp.stktopsp = gp.sched.sp + // malg returns status as _Gidle. Change to _Gdead before + // adding to allg where GC can see it. We use _Gdead to hide + // this from tracebacks and stack scans since it isn't a + // "real" goroutine until needm grabs it. + casgstatus(gp, _Gidle, _Gdead) + gp.m = mp + mp.curg = gp + mp.lockedInt++ + mp.lockedg.set(gp) + gp.lockedm.set(mp) + gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1)) + if raceenabled { + gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum) + } + // put on allg for garbage collector + allgadd(gp) + + // gp is now on the allg list, but we don't want it to be + // counted by gcount. It would be more "proper" to increment + // sched.ngfree, but that requires locking. Incrementing ngsys + // has the same effect. + atomic.Xadd(&sched.ngsys, +1) + + // Add m to the extra list. + mnext := lockextra(true) + mp.schedlink.set(mnext) + extraMCount++ + unlockextra(mp) +} + +// dropm is called when a cgo callback has called needm but is now +// done with the callback and returning back into the non-Go thread. +// It puts the current m back onto the extra list. +// +// The main expense here is the call to signalstack to release the +// m's signal stack, and then the call to needm on the next callback +// from this thread. 
It is tempting to try to save the m for next time, +// which would eliminate both these costs, but there might not be +// a next time: the current thread (which Go does not control) might exit. +// If we saved the m for that thread, there would be an m leak each time +// such a thread exited. Instead, we acquire and release an m on each +// call. These should typically not be scheduling operations, just a few +// atomics, so the cost should be small. +// +// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread +// variable using pthread_key_create. Unlike the pthread keys we already use +// on OS X, this dummy key would never be read by Go code. It would exist +// only so that we could register at thread-exit-time destructor. +// That destructor would put the m back onto the extra list. +// This is purely a performance optimization. The current version, +// in which dropm happens on each cgo call, is still correct too. +// We may have to keep the current version on systems with cgo +// but without pthreads, like Windows. +func dropm() { + // Clear m and g, and return m to the extra list. + // After the call to setg we can only call nosplit functions + // with no pointer manipulation. + mp := getg().m + + // Return mp.curg to dead state. + casgstatus(mp.curg, _Gsyscall, _Gdead) + mp.curg.preemptStop = false + atomic.Xadd(&sched.ngsys, +1) + + // Block signals before unminit. + // Unminit unregisters the signal handling stack (but needs g on some systems). + // Setg(nil) clears g, which is the signal handler's cue not to run Go handlers. + // It's important not to try to handle a signal between those two steps. + sigmask := mp.sigmask + sigblock(false) + unminit() + + mnext := lockextra(true) + extraMCount++ + mp.schedlink.set(mnext) + + setg(nil) + + // Commit the release of mp. + unlockextra(mp) + + msigrestore(sigmask) +} + +// A helper function for EnsureDropM. +func getm() uintptr { + return uintptr(unsafe.Pointer(getg().m)) +} + +var extram uintptr +var extraMCount uint32 // Protected by lockextra +var extraMWaiters uint32 + +// lockextra locks the extra list and returns the list head. +// The caller must unlock the list by storing a new list head +// to extram. If nilokay is true, then lockextra will +// return a nil list head if that's what it finds. If nilokay is false, +// lockextra will keep waiting until the list head is no longer nil. +//go:nosplit +func lockextra(nilokay bool) *m { + const locked = 1 + + incr := false + for { + old := atomic.Loaduintptr(&extram) + if old == locked { + osyield_no_g() + continue + } + if old == 0 && !nilokay { + if !incr { + // Add 1 to the number of threads + // waiting for an M. + // This is cleared by newextram. + atomic.Xadd(&extraMWaiters, 1) + incr = true + } + usleep_no_g(1) + continue + } + if atomic.Casuintptr(&extram, old, locked) { + return (*m)(unsafe.Pointer(old)) + } + osyield_no_g() + continue + } +} + +//go:nosplit +func unlockextra(mp *m) { + atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp))) +} + +var ( + // allocmLock is locked for read when creating new Ms in allocm and their + // addition to allm. Thus acquiring this lock for write blocks the + // creation of new Ms. + allocmLock rwmutex + + // execLock serializes exec and clone to avoid bugs or unspecified + // behaviour around exec'ing while creating/destroying threads. See + // issue #19546. + execLock rwmutex +) + +// newmHandoff contains a list of m structures that need new OS threads. 
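The extra-M list used by needm/dropm above is guarded by a tiny CAS-based spin lock (exchange the list head with a sentinel). A generic, user-level sketch of the same spin-then-yield lock shape (spinLock is made up, and runtime.Gosched plays the role of the osyield/usleep fallback):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// spinLock is a minimal CAS-based lock: 0 means unlocked, 1 means locked.
type spinLock uint32

func (l *spinLock) lock() {
	for !atomic.CompareAndSwapUint32((*uint32)(l), 0, 1) {
		runtime.Gosched() // yield instead of busy-waiting on the CAS
	}
}

func (l *spinLock) unlock() {
	atomic.StoreUint32((*uint32)(l), 0)
}

func main() {
	var (
		l       spinLock
		counter int
		done    = make(chan struct{})
	)
	for i := 0; i < 4; i++ {
		go func() {
			for j := 0; j < 1000; j++ {
				l.lock()
				counter++
				l.unlock()
			}
			done <- struct{}{}
		}()
	}
	for i := 0; i < 4; i++ {
		<-done
	}
	fmt.Println("counter =", counter) // always 4000
}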
+// This is used by newm in situations where newm itself can't safely +// start an OS thread. +var newmHandoff struct { + lock mutex + + // newm points to a list of M structures that need new OS + // threads. The list is linked through m.schedlink. + newm muintptr + + // waiting indicates that wake needs to be notified when an m + // is put on the list. + waiting bool + wake note + + // haveTemplateThread indicates that the templateThread has + // been started. This is not protected by lock. Use cas to set + // to 1. + haveTemplateThread uint32 +} + +// Create a new m. It will start off with a call to fn, or else the scheduler. +// fn needs to be static and not a heap allocated closure. +// May run with m.p==nil, so write barriers are not allowed. +// +// id is optional pre-allocated m ID. Omit by passing -1. +//go:nowritebarrierrec +func newm(fn func(), _p_ *p, id int64) { + // allocm adds a new M to allm, but they do not start until created by + // the OS in newm1 or the template thread. + // + // doAllThreadsSyscall requires that every M in allm will eventually + // start and be signal-able, even with a STW. + // + // Disable preemption here until we start the thread to ensure that + // newm is not preempted between allocm and starting the new thread, + // ensuring that anything added to allm is guaranteed to eventually + // start. + acquirem() + + mp := allocm(_p_, fn, id) + mp.nextp.set(_p_) + mp.sigmask = initSigmask + if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { + // We're on a locked M or a thread that may have been + // started by C. The kernel state of this thread may + // be strange (the user may have locked it for that + // purpose). We don't want to clone that into another + // thread. Instead, ask a known-good thread to create + // the thread for us. + // + // This is disabled on Plan 9. See golang.org/issue/22227. + // + // TODO: This may be unnecessary on Windows, which + // doesn't model thread creation off fork. + lock(&newmHandoff.lock) + if newmHandoff.haveTemplateThread == 0 { + throw("on a locked thread with no template thread") + } + mp.schedlink = newmHandoff.newm + newmHandoff.newm.set(mp) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + // The M has not started yet, but the template thread does not + // participate in STW, so it will always process queued Ms and + // it is safe to releasem. + releasem(getg().m) + return + } + newm1(mp) + releasem(getg().m) +} + +func newm1(mp *m) { + if iscgo { + var ts cgothreadstart + if _cgo_thread_start == nil { + throw("_cgo_thread_start missing") + } + ts.g.set(mp.g0) + ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0])) + ts.fn = unsafe.Pointer(abi.FuncPCABI0(mstart)) + if msanenabled { + msanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts)) + } + if asanenabled { + asanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts)) + } + execLock.rlock() // Prevent process clone. + asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts)) + execLock.runlock() + return + } + execLock.rlock() // Prevent process clone. + newosproc(mp) + execLock.runlock() +} + +// startTemplateThread starts the template thread if it is not already +// running. +// +// The calling thread must itself be in a known-good state. +func startTemplateThread() { + if GOARCH == "wasm" { // no threads on wasm yet + return + } + + // Disable preemption to guarantee that the template thread will be + // created before a park once haveTemplateThread is set. 
+ mp := acquirem() + if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) { + releasem(mp) + return + } + newm(templateThread, nil, -1) + releasem(mp) +} + +// templateThread is a thread in a known-good state that exists solely +// to start new threads in known-good states when the calling thread +// may not be in a good state. +// +// Many programs never need this, so templateThread is started lazily +// when we first enter a state that might lead to running on a thread +// in an unknown state. +// +// templateThread runs on an M without a P, so it must not have write +// barriers. +// +//go:nowritebarrierrec +func templateThread() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + + for { + lock(&newmHandoff.lock) + for newmHandoff.newm != 0 { + newm := newmHandoff.newm.ptr() + newmHandoff.newm = 0 + unlock(&newmHandoff.lock) + for newm != nil { + next := newm.schedlink.ptr() + newm.schedlink = 0 + newm1(newm) + newm = next + } + lock(&newmHandoff.lock) + } + newmHandoff.waiting = true + noteclear(&newmHandoff.wake) + unlock(&newmHandoff.lock) + notesleep(&newmHandoff.wake) + } +} + +// Stops execution of the current m until new work is available. +// Returns with acquired P. +func stopm() { + _g_ := getg() + + if _g_.m.locks != 0 { + throw("stopm holding locks") + } + if _g_.m.p != 0 { + throw("stopm holding p") + } + if _g_.m.spinning { + throw("stopm spinning") + } + + lock(&sched.lock) + mput(_g_.m) + unlock(&sched.lock) + mPark() + acquirep(_g_.m.nextp.ptr()) + _g_.m.nextp = 0 +} + +func mspinning() { + // startm's caller incremented nmspinning. Set the new M's spinning. + getg().m.spinning = true +} + +// Schedules some M to run the p (creates an M if necessary). +// If p==nil, tries to get an idle P, if no idle P's does nothing. +// May run with m.p==nil, so write barriers are not allowed. +// If spinning is set, the caller has incremented nmspinning and startm will +// either decrement nmspinning or set m.spinning in the newly started M. +// +// Callers passing a non-nil P must call from a non-preemptible context. See +// comment on acquirem below. +// +// Must not have write barriers because this may be called without a P. +//go:nowritebarrierrec +func startm(_p_ *p, spinning bool) { + // Disable preemption. + // + // Every owned P must have an owner that will eventually stop it in the + // event of a GC stop request. startm takes transient ownership of a P + // (either from argument or pidleget below) and transfers ownership to + // a started M, which will be responsible for performing the stop. + // + // Preemption must be disabled during this transient ownership, + // otherwise the P this is running on may enter GC stop while still + // holding the transient P, leaving that P in limbo and deadlocking the + // STW. + // + // Callers passing a non-nil P must already be in non-preemptible + // context, otherwise such preemption could occur on function entry to + // startm. Callers passing a nil P may be preemptible, so we must + // disable preemption before acquiring a P from pidleget below. + mp := acquirem() + lock(&sched.lock) + if _p_ == nil { + _p_ = pidleget() + if _p_ == nil { + unlock(&sched.lock) + if spinning { + // The caller incremented nmspinning, but there are no idle Ps, + // so it's okay to just undo the increment and give up. 
+ if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 { + throw("startm: negative nmspinning") + } + } + releasem(mp) + return + } + } + nmp := mget() + if nmp == nil { + // No M is available, we must drop sched.lock and call newm. + // However, we already own a P to assign to the M. + // + // Once sched.lock is released, another G (e.g., in a syscall), + // could find no idle P while checkdead finds a runnable G but + // no running M's because this new M hasn't started yet, thus + // throwing in an apparent deadlock. + // + // Avoid this situation by pre-allocating the ID for the new M, + // thus marking it as 'running' before we drop sched.lock. This + // new M will eventually run the scheduler to execute any + // queued G's. + id := mReserveID() + unlock(&sched.lock) + + var fn func() + if spinning { + // The caller incremented nmspinning, so set m.spinning in the new M. + fn = mspinning + } + newm(fn, _p_, id) + // Ownership transfer of _p_ committed by start in newm. + // Preemption is now safe. + releasem(mp) + return + } + unlock(&sched.lock) + if nmp.spinning { + throw("startm: m is spinning") + } + if nmp.nextp != 0 { + throw("startm: m has p") + } + if spinning && !runqempty(_p_) { + throw("startm: p has runnable gs") + } + // The caller incremented nmspinning, so set m.spinning in the new M. + nmp.spinning = spinning + nmp.nextp.set(_p_) + notewakeup(&nmp.park) + // Ownership transfer of _p_ committed by wakeup. Preemption is now + // safe. + releasem(mp) +} + +// Hands off P from syscall or locked M. +// Always runs without a P, so write barriers are not allowed. +//go:nowritebarrierrec +func handoffp(_p_ *p) { + // handoffp must start an M in any situation where + // findrunnable would return a G to run on _p_. + + // if it has local work, start it straight away + if !runqempty(_p_) || sched.runqsize != 0 { + startm(_p_, false) + return + } + // if it has GC work, start it straight away + if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) { + startm(_p_, false) + return + } + // no local work, check that there are no spinning/idle M's, + // otherwise our help is not required + if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic + startm(_p_, true) + return + } + lock(&sched.lock) + if sched.gcwaiting != 0 { + _p_.status = _Pgcstop + sched.stopwait-- + if sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + unlock(&sched.lock) + return + } + if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) { + sched.safePointFn(_p_) + sched.safePointWait-- + if sched.safePointWait == 0 { + notewakeup(&sched.safePointNote) + } + } + if sched.runqsize != 0 { + unlock(&sched.lock) + startm(_p_, false) + return + } + // If this is the last running P and nobody is polling network, + // need to wakeup another M to poll network. + if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 { + unlock(&sched.lock) + startm(_p_, false) + return + } + + // The scheduler lock cannot be held when calling wakeNetPoller below + // because wakeNetPoller may call wakep which may call startm. + when := nobarrierWakeTime(_p_) + pidleput(_p_) + unlock(&sched.lock) + + if when != 0 { + wakeNetPoller(when) + } +} + +// Tries to add one more P to execute G's. +// Called when a G is made runnable (newproc, ready). 
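+// The guard is deliberately cheap and conservative: wake nothing unless an
+// idle P exists, and let at most one additional M start spinning. A rough
+// sketch of the same shape in ordinary Go (names are illustrative, not
+// runtime APIs):
+//
+//	if atomic.LoadUint32(&idlePs) == 0 {
+//		return // no P to run new work on
+//	}
+//	if !atomic.CompareAndSwapUint32(&spinners, 0, 1) {
+//		return // an M is already out looking for work
+//	}
+//	startWorker(true) // spinning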
+func wakep() { + if atomic.Load(&sched.npidle) == 0 { + return + } + // be conservative about spinning threads + if atomic.Load(&sched.nmspinning) != 0 || !atomic.Cas(&sched.nmspinning, 0, 1) { + return + } + startm(nil, true) +} + +// Stops execution of the current m that is locked to a g until the g is runnable again. +// Returns with acquired P. +func stoplockedm() { + _g_ := getg() + + if _g_.m.lockedg == 0 || _g_.m.lockedg.ptr().lockedm.ptr() != _g_.m { + throw("stoplockedm: inconsistent locking") + } + if _g_.m.p != 0 { + // Schedule another M to run this p. + _p_ := releasep() + handoffp(_p_) + } + incidlelocked(1) + // Wait until another thread schedules lockedg again. + mPark() + status := readgstatus(_g_.m.lockedg.ptr()) + if status&^_Gscan != _Grunnable { + print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n") + dumpgstatus(_g_.m.lockedg.ptr()) + throw("stoplockedm: not runnable") + } + acquirep(_g_.m.nextp.ptr()) + _g_.m.nextp = 0 +} + +// Schedules the locked m to run the locked gp. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func startlockedm(gp *g) { + _g_ := getg() + + mp := gp.lockedm.ptr() + if mp == _g_.m { + throw("startlockedm: locked to me") + } + if mp.nextp != 0 { + throw("startlockedm: m has p") + } + // directly handoff current P to the locked m + incidlelocked(-1) + _p_ := releasep() + mp.nextp.set(_p_) + notewakeup(&mp.park) + stopm() +} + +// Stops the current m for stopTheWorld. +// Returns when the world is restarted. +func gcstopm() { + _g_ := getg() + + if sched.gcwaiting == 0 { + throw("gcstopm: not waiting for gc") + } + if _g_.m.spinning { + _g_.m.spinning = false + // OK to just drop nmspinning here, + // startTheWorld will unpark threads as necessary. + if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 { + throw("gcstopm: negative nmspinning") + } + } + _p_ := releasep() + lock(&sched.lock) + _p_.status = _Pgcstop + sched.stopwait-- + if sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + unlock(&sched.lock) + stopm() +} + +// Schedules gp to run on the current M. +// If inheritTime is true, gp inherits the remaining time in the +// current time slice. Otherwise, it starts a new time slice. +// Never returns. +// +// Write barriers are allowed because this is called immediately after +// acquiring a P in several places. +// +//go:yeswritebarrierrec +func execute(gp *g, inheritTime bool) { + _g_ := getg() + + // Assign gp.m before entering _Grunning so running Gs have an + // M. + _g_.m.curg = gp + gp.m = _g_.m + casgstatus(gp, _Grunnable, _Grunning) + gp.waitsince = 0 + gp.preempt = false + gp.stackguard0 = gp.stack.lo + _StackGuard + if !inheritTime { + _g_.m.p.ptr().schedtick++ + } + + // Check whether the profiler needs to be turned on or off. + hz := sched.profilehz + if _g_.m.profilehz != hz { + setThreadCPUProfiler(hz) + } + + if trace.enabled { + // GoSysExit has to happen when we have a P, but before GoStart. + // So we emit it here. + if gp.syscallsp != 0 && gp.sysblocktraced { + traceGoSysExit(gp.sysexitticks) + } + traceGoStart() + } + + gogo(&gp.sched) +} + +// Finds a runnable goroutine to execute. +// Tries to steal from other P's, get g from local or global queue, poll network. +func findrunnable() (gp *g, inheritTime bool) { + _g_ := getg() + + // The conditions here and in handoffp must agree: if + // findrunnable would return a G to run, handoffp must start + // an M. 
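+	// In rough order, the code below tries progressively more expensive
+	// sources of work: expired timers, the local run queue, the global run
+	// queue, a non-blocking netpoll, stealing from other Ps, idle-time GC
+	// marking, and only then releasing the P and parking the M, with a
+	// careful recheck in between so concurrently submitted work is not lost.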
+ +top: + _p_ := _g_.m.p.ptr() + if sched.gcwaiting != 0 { + gcstopm() + goto top + } + if _p_.runSafePointFn != 0 { + runSafePointFn() + } + + now, pollUntil, _ := checkTimers(_p_, 0) + + if fingwait && fingwake { + if gp := wakefing(); gp != nil { + ready(gp, 0, true) + } + } + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + + // local runq + if gp, inheritTime := runqget(_p_); gp != nil { + return gp, inheritTime + } + + // global runq + if sched.runqsize != 0 { + lock(&sched.lock) + gp := globrunqget(_p_, 0) + unlock(&sched.lock) + if gp != nil { + return gp, false + } + } + + // Poll network. + // This netpoll is only an optimization before we resort to stealing. + // We can safely skip it if there are no waiters or a thread is blocked + // in netpoll already. If there is any kind of logical race with that + // blocked thread (e.g. it has already returned from netpoll, but does + // not set lastpoll yet), this thread will do blocking netpoll below + // anyway. + if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { + if list := netpoll(0); !list.empty() { // non-blocking + gp := list.pop() + injectglist(&list) + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + } + + // Spinning Ms: steal work from other Ps. + // + // Limit the number of spinning Ms to half the number of busy Ps. + // This is necessary to prevent excessive CPU consumption when + // GOMAXPROCS>>1 but the program parallelism is low. + procs := uint32(gomaxprocs) + if _g_.m.spinning || 2*atomic.Load(&sched.nmspinning) < procs-atomic.Load(&sched.npidle) { + if !_g_.m.spinning { + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + } + + gp, inheritTime, tnow, w, newWork := stealWork(now) + now = tnow + if gp != nil { + // Successfully stole. + return gp, inheritTime + } + if newWork { + // There may be new timer or GC work; restart to + // discover. + goto top + } + if w != 0 && (pollUntil == 0 || w < pollUntil) { + // Earlier timer to wait for. + pollUntil = w + } + } + + // We have nothing to do. + // + // If we're in the GC mark phase, can safely scan and blacken objects, + // and have work to do, run idle-time marking rather than give up the + // P. + if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) { + node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node != nil { + _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode + gp := node.gp.ptr() + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + } + + // wasm only: + // If a callback returned and no other goroutine is awake, + // then wake event handler goroutine which pauses execution + // until a callback was triggered. + gp, otherReady := beforeIdle(now, pollUntil) + if gp != nil { + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + if otherReady { + goto top + } + + // Before we drop our P, make a snapshot of the allp slice, + // which can change underfoot once we no longer block + // safe-points. We don't need to snapshot the contents because + // everything up to cap(allp) is immutable. + allpSnapshot := allp + // Also snapshot masks. Value changes are OK, but we can't allow + // len to change out from under us. 
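+	// A pMask is a bitmap over P ids read with atomic loads, so stale bits
+	// are possible and tolerated by the rechecks below. Conceptually
+	// (illustrative sketch of the accessor, assuming the []uint32 layout
+	// used for these masks):
+	//
+	//	func (p pMask) read(id uint32) bool {
+	//		word, mask := id/32, uint32(1)<<(id%32)
+	//		return (atomic.Load(&p[word]) & mask) != 0
+	//	}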
+ idlepMaskSnapshot := idlepMask + timerpMaskSnapshot := timerpMask + + // return P and block + lock(&sched.lock) + if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 { + unlock(&sched.lock) + goto top + } + if sched.runqsize != 0 { + gp := globrunqget(_p_, 0) + unlock(&sched.lock) + return gp, false + } + if releasep() != _p_ { + throw("findrunnable: wrong p") + } + pidleput(_p_) + unlock(&sched.lock) + + // Delicate dance: thread transitions from spinning to non-spinning + // state, potentially concurrently with submission of new work. We must + // drop nmspinning first and then check all sources again (with + // #StoreLoad memory barrier in between). If we do it the other way + // around, another thread can submit work after we've checked all + // sources but before we drop nmspinning; as a result nobody will + // unpark a thread to run the work. + // + // This applies to the following sources of work: + // + // * Goroutines added to a per-P run queue. + // * New/modified-earlier timers on a per-P timer heap. + // * Idle-priority GC work (barring golang.org/issue/19112). + // + // If we discover new work below, we need to restore m.spinning as a signal + // for resetspinning to unpark a new worker thread (because there can be more + // than one starving goroutine). However, if after discovering new work + // we also observe no idle Ps it is OK to skip unparking a new worker + // thread: the system is fully loaded so no spinning threads are required. + // Also see "Worker thread parking/unparking" comment at the top of the file. + wasSpinning := _g_.m.spinning + if _g_.m.spinning { + _g_.m.spinning = false + if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 { + throw("findrunnable: negative nmspinning") + } + + // Note the for correctness, only the last M transitioning from + // spinning to non-spinning must perform these rechecks to + // ensure no missed work. We are performing it on every M that + // transitions as a conservative change to monitor effects on + // latency. See golang.org/issue/43997. + + // Check all runqueues once again. + _p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot) + if _p_ != nil { + acquirep(_p_) + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + goto top + } + + // Check for idle-priority GC work again. + _p_, gp = checkIdleGCNoP() + if _p_ != nil { + acquirep(_p_) + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + + // Run the idle worker. + _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + + // Finally, check for timer creation or expiry concurrently with + // transitioning from spinning to non-spinning. + // + // Note that we cannot use checkTimers here because it calls + // adjusttimers which may need to allocate memory, and that isn't + // allowed when we don't have an active P. + pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil) + } + + // Poll network until next timer. + if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 { + atomic.Store64(&sched.pollUntil, uint64(pollUntil)) + if _g_.m.p != 0 { + throw("findrunnable: netpoll with p") + } + if _g_.m.spinning { + throw("findrunnable: netpoll with spinning") + } + delay := int64(-1) + if pollUntil != 0 { + if now == 0 { + now = nanotime() + } + delay = pollUntil - now + if delay < 0 { + delay = 0 + } + } + if faketime != 0 { + // When using fake time, just poll. 
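+			// (netpoll's delay follows the usual poll convention: negative
+			// blocks indefinitely, zero polls without blocking, and positive
+			// blocks for at most that many nanoseconds, here until the next
+			// timer is due.)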
+ delay = 0 + } + list := netpoll(delay) // block until new work is available + atomic.Store64(&sched.pollUntil, 0) + atomic.Store64(&sched.lastpoll, uint64(nanotime())) + if faketime != 0 && list.empty() { + // Using fake time and nothing is ready; stop M. + // When all M's stop, checkdead will call timejump. + stopm() + goto top + } + lock(&sched.lock) + _p_ = pidleget() + unlock(&sched.lock) + if _p_ == nil { + injectglist(&list) + } else { + acquirep(_p_) + if !list.empty() { + gp := list.pop() + injectglist(&list) + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + if wasSpinning { + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + } + goto top + } + } else if pollUntil != 0 && netpollinited() { + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > pollUntil { + netpollBreak() + } + } + stopm() + goto top +} + +// pollWork reports whether there is non-background work this P could +// be doing. This is a fairly lightweight check to be used for +// background work loops, like idle GC. It checks a subset of the +// conditions checked by the actual scheduler. +func pollWork() bool { + if sched.runqsize != 0 { + return true + } + p := getg().m.p.ptr() + if !runqempty(p) { + return true + } + if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 { + if list := netpoll(0); !list.empty() { + injectglist(&list) + return true + } + } + return false +} + +// stealWork attempts to steal a runnable goroutine or timer from any P. +// +// If newWork is true, new work may have been readied. +// +// If now is not 0 it is the current time. stealWork returns the passed time or +// the current time if now was passed as 0. +func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWork bool) { + pp := getg().m.p.ptr() + + ranTimer := false + + const stealTries = 4 + for i := 0; i < stealTries; i++ { + stealTimersOrRunNextG := i == stealTries-1 + + for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() { + if sched.gcwaiting != 0 { + // GC work may be available. + return nil, false, now, pollUntil, true + } + p2 := allp[enum.position()] + if pp == p2 { + continue + } + + // Steal timers from p2. This call to checkTimers is the only place + // where we might hold a lock on a different P's timers. We do this + // once on the last pass before checking runnext because stealing + // from the other P's runnext should be the last resort, so if there + // are timers to steal do that first. + // + // We only check timers on one of the stealing iterations because + // the time stored in now doesn't change in this loop and checking + // the timers for each P more than once with the same value of now + // is probably a waste of time. + // + // timerpMask tells us whether the P may have timers at all. If it + // can't, no need to check at all. + if stealTimersOrRunNextG && timerpMask.read(enum.position()) { + tnow, w, ran := checkTimers(p2, now) + now = tnow + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + if ran { + // Running the timers may have + // made an arbitrary number of G's + // ready and added them to this P's + // local run queue. That invalidates + // the assumption of runqsteal + // that it always has room to add + // stolen G's. So check now if there + // is a local G to run. 
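+					// (runqget is cheap: it tries pp.runnext first and then
+					// the lock-free local ring, so re-checking here after
+					// running timers costs almost nothing.)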
+ if gp, inheritTime := runqget(pp); gp != nil { + return gp, inheritTime, now, pollUntil, ranTimer + } + ranTimer = true + } + } + + // Don't bother to attempt to steal if p2 is idle. + if !idlepMask.read(enum.position()) { + if gp := runqsteal(pp, p2, stealTimersOrRunNextG); gp != nil { + return gp, false, now, pollUntil, ranTimer + } + } + } + } + + // No goroutines found to steal. Regardless, running a timer may have + // made some goroutine ready that we missed. Indicate the next timer to + // wait for. + return nil, false, now, pollUntil, ranTimer +} + +// Check all Ps for a runnable G to steal. +// +// On entry we have no P. If a G is available to steal and a P is available, +// the P is returned which the caller should acquire and attempt to steal the +// work to. +func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p { + for id, p2 := range allpSnapshot { + if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) { + lock(&sched.lock) + pp := pidleget() + unlock(&sched.lock) + if pp != nil { + return pp + } + + // Can't get a P, don't bother checking remaining Ps. + break + } + } + + return nil +} + +// Check all Ps for a timer expiring sooner than pollUntil. +// +// Returns updated pollUntil value. +func checkTimersNoP(allpSnapshot []*p, timerpMaskSnapshot pMask, pollUntil int64) int64 { + for id, p2 := range allpSnapshot { + if timerpMaskSnapshot.read(uint32(id)) { + w := nobarrierWakeTime(p2) + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + } + } + + return pollUntil +} + +// Check for idle-priority GC, without a P on entry. +// +// If some GC work, a P, and a worker G are all available, the P and G will be +// returned. The returned P has not been wired yet. +func checkIdleGCNoP() (*p, *g) { + // N.B. Since we have no P, gcBlackenEnabled may change at any time; we + // must check again after acquiring a P. + if atomic.Load(&gcBlackenEnabled) == 0 { + return nil, nil + } + if !gcMarkWorkAvailable(nil) { + return nil, nil + } + + // Work is available; we can start an idle GC worker only if there is + // an available P and available worker G. + // + // We can attempt to acquire these in either order, though both have + // synchronization concerns (see below). Workers are almost always + // available (see comment in findRunnableGCWorker for the one case + // there may be none). Since we're slightly less likely to find a P, + // check for that first. + // + // Synchronization: note that we must hold sched.lock until we are + // committed to keeping it. Otherwise we cannot put the unnecessary P + // back in sched.pidle without performing the full set of idle + // transition checks. + // + // If we were to check gcBgMarkWorkerPool first, we must somehow handle + // the assumption in gcControllerState.findRunnableGCWorker that an + // empty gcBgMarkWorkerPool is only possible if gcMarkDone is running. + lock(&sched.lock) + pp := pidleget() + if pp == nil { + unlock(&sched.lock) + return nil, nil + } + + // Now that we own a P, gcBlackenEnabled can't change (as it requires + // STW). 
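+	// Classic double-checked pattern: the cheap gcBlackenEnabled test above
+	// ran without owning anything, so it is repeated here now that holding
+	// a P pins the value; otherwise an idle worker could be started after
+	// marking has already been disabled.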
+ if gcBlackenEnabled == 0 { + pidleput(pp) + unlock(&sched.lock) + return nil, nil + } + + node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) + if node == nil { + pidleput(pp) + unlock(&sched.lock) + return nil, nil + } + + unlock(&sched.lock) + + return pp, node.gp.ptr() +} + +// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't +// going to wake up before the when argument; or it wakes an idle P to service +// timers and the network poller if there isn't one already. +func wakeNetPoller(when int64) { + if atomic.Load64(&sched.lastpoll) == 0 { + // In findrunnable we ensure that when polling the pollUntil + // field is either zero or the time to which the current + // poll is expected to run. This can have a spurious wakeup + // but should never miss a wakeup. + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > when { + netpollBreak() + } + } else { + // There are no threads in the network poller, try to get + // one there so it can handle new timers. + if GOOS != "plan9" { // Temporary workaround - see issue #42303. + wakep() + } + } +} + +func resetspinning() { + _g_ := getg() + if !_g_.m.spinning { + throw("resetspinning: not a spinning m") + } + _g_.m.spinning = false + nmspinning := atomic.Xadd(&sched.nmspinning, -1) + if int32(nmspinning) < 0 { + throw("findrunnable: negative nmspinning") + } + // M wakeup policy is deliberately somewhat conservative, so check if we + // need to wakeup another P here. See "Worker thread parking/unparking" + // comment at the top of the file for details. + wakep() +} + +// injectglist adds each runnable G on the list to some run queue, +// and clears glist. If there is no current P, they are added to the +// global queue, and up to npidle M's are started to run them. +// Otherwise, for each idle P, this adds a G to the global queue +// and starts an M. Any remaining G's are added to the current P's +// local run queue. +// This may temporarily acquire sched.lock. +// Can run concurrently with GC. +func injectglist(glist *gList) { + if glist.empty() { + return + } + if trace.enabled { + for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { + traceGoUnpark(gp, 0) + } + } + + // Mark all the goroutines as runnable before we put them + // on the run queues. + head := glist.head.ptr() + var tail *g + qsize := 0 + for gp := head; gp != nil; gp = gp.schedlink.ptr() { + tail = gp + qsize++ + casgstatus(gp, _Gwaiting, _Grunnable) + } + + // Turn the gList into a gQueue. + var q gQueue + q.head.set(head) + q.tail.set(tail) + *glist = gList{} + + startIdle := func(n int) { + for ; n != 0 && sched.npidle != 0; n-- { + startm(nil, false) + } + } + + pp := getg().m.p.ptr() + if pp == nil { + lock(&sched.lock) + globrunqputbatch(&q, int32(qsize)) + unlock(&sched.lock) + startIdle(qsize) + return + } + + npidle := int(atomic.Load(&sched.npidle)) + var globq gQueue + var n int + for n = 0; n < npidle && !q.empty(); n++ { + g := q.pop() + globq.pushBack(g) + } + if n > 0 { + lock(&sched.lock) + globrunqputbatch(&globq, int32(n)) + unlock(&sched.lock) + startIdle(n) + qsize -= n + } + + if !q.empty() { + runqputbatch(pp, &q, qsize) + } +} + +// One round of scheduler: find a runnable goroutine and execute it. +// Never returns. +func schedule() { + _g_ := getg() + + if _g_.m.locks != 0 { + throw("schedule: holding locks") + } + + if _g_.m.lockedg != 0 { + stoplockedm() + execute(_g_.m.lockedg.ptr(), false) // Never returns. 
+ } + + // We should not schedule away from a g that is executing a cgo call, + // since the cgo call is using the m's g0 stack. + if _g_.m.incgo { + throw("schedule: in cgo") + } + +top: + pp := _g_.m.p.ptr() + pp.preempt = false + + if sched.gcwaiting != 0 { + gcstopm() + goto top + } + if pp.runSafePointFn != 0 { + runSafePointFn() + } + + // Sanity check: if we are spinning, the run queue should be empty. + // Check this before calling checkTimers, as that might call + // goready to put a ready goroutine on the local run queue. + if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) { + throw("schedule: spinning with local work") + } + + checkTimers(pp, 0) + + var gp *g + var inheritTime bool + + // Normal goroutines will check for need to wakeP in ready, + // but GCworkers and tracereaders will not, so the check must + // be done here instead. + tryWakeP := false + if trace.enabled || trace.shutdown { + gp = traceReader() + if gp != nil { + casgstatus(gp, _Gwaiting, _Grunnable) + traceGoUnpark(gp, 0) + tryWakeP = true + } + } + if gp == nil && gcBlackenEnabled != 0 { + gp = gcController.findRunnableGCWorker(_g_.m.p.ptr()) + if gp != nil { + tryWakeP = true + } + } + if gp == nil { + // Check the global runnable queue once in a while to ensure fairness. + // Otherwise two goroutines can completely occupy the local runqueue + // by constantly respawning each other. + if _g_.m.p.ptr().schedtick%61 == 0 && sched.runqsize > 0 { + lock(&sched.lock) + gp = globrunqget(_g_.m.p.ptr(), 1) + unlock(&sched.lock) + } + } + if gp == nil { + gp, inheritTime = runqget(_g_.m.p.ptr()) + // We can see gp != nil here even if the M is spinning, + // if checkTimers added a local goroutine via goready. + } + if gp == nil { + gp, inheritTime = findrunnable() // blocks until work is available + } + + // This thread is going to run a goroutine and is not spinning anymore, + // so if it was marked as spinning we need to reset it now and potentially + // start a new spinning M. + if _g_.m.spinning { + resetspinning() + } + + if sched.disable.user && !schedEnabled(gp) { + // Scheduling of this goroutine is disabled. Put it on + // the list of pending runnable goroutines for when we + // re-enable user scheduling and look again. + lock(&sched.lock) + if schedEnabled(gp) { + // Something re-enabled scheduling while we + // were acquiring the lock. + unlock(&sched.lock) + } else { + sched.disable.runnable.pushBack(gp) + sched.disable.n++ + unlock(&sched.lock) + goto top + } + } + + // If about to schedule a not-normal goroutine (a GCworker or tracereader), + // wake a P if there is one. + if tryWakeP { + wakep() + } + if gp.lockedm != 0 { + // Hands off own p to the locked m, + // then blocks waiting for a new p. + startlockedm(gp) + goto top + } + + execute(gp, inheritTime) +} + +// dropg removes the association between m and the current goroutine m->curg (gp for short). +// Typically a caller sets gp's status away from Grunning and then +// immediately calls dropg to finish the job. The caller is also responsible +// for arranging that gp will be restarted using ready at an +// appropriate time. After calling dropg and arranging for gp to be +// readied later, the caller can do other work but eventually should +// call schedule to restart the scheduling of goroutines on this m. +func dropg() { + _g_ := getg() + + setMNoWB(&_g_.m.curg.m, nil) + setGNoWB(&_g_.m.curg, nil) +} + +// checkTimers runs any timers for the P that are ready. +// If now is not 0 it is the current time. 
+// It returns the passed time or the current time if now was passed as 0. +// and the time when the next timer should run or 0 if there is no next timer, +// and reports whether it ran any timers. +// If the time when the next timer should run is not 0, +// it is always larger than the returned time. +// We pass now in and out to avoid extra calls of nanotime. +//go:yeswritebarrierrec +func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { + // If it's not yet time for the first timer, or the first adjusted + // timer, then there is nothing to do. + next := int64(atomic.Load64(&pp.timer0When)) + nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest)) + if next == 0 || (nextAdj != 0 && nextAdj < next) { + next = nextAdj + } + + if next == 0 { + // No timers to run or adjust. + return now, 0, false + } + + if now == 0 { + now = nanotime() + } + if now < next { + // Next timer is not ready to run, but keep going + // if we would clear deleted timers. + // This corresponds to the condition below where + // we decide whether to call clearDeletedTimers. + if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) { + return now, next, false + } + } + + lock(&pp.timersLock) + + if len(pp.timers) > 0 { + adjusttimers(pp, now) + for len(pp.timers) > 0 { + // Note that runtimer may temporarily unlock + // pp.timersLock. + if tw := runtimer(pp, now); tw != 0 { + if tw > 0 { + pollUntil = tw + } + break + } + ran = true + } + } + + // If this is the local P, and there are a lot of deleted timers, + // clear them out. We only do this for the local P to reduce + // lock contention on timersLock. + if pp == getg().m.p.ptr() && int(atomic.Load(&pp.deletedTimers)) > len(pp.timers)/4 { + clearDeletedTimers(pp) + } + + unlock(&pp.timersLock) + + return now, pollUntil, ran +} + +func parkunlock_c(gp *g, lock unsafe.Pointer) bool { + unlock((*mutex)(lock)) + return true +} + +// park continuation on g0. +func park_m(gp *g) { + _g_ := getg() + + if trace.enabled { + traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip) + } + + casgstatus(gp, _Grunning, _Gwaiting) + dropg() + + if fn := _g_.m.waitunlockf; fn != nil { + ok := fn(gp, _g_.m.waitlock) + _g_.m.waitunlockf = nil + _g_.m.waitlock = nil + if !ok { + if trace.enabled { + traceGoUnpark(gp, 2) + } + casgstatus(gp, _Gwaiting, _Grunnable) + execute(gp, true) // Schedule it back, never returns. + } + } + schedule() +} + +func goschedImpl(gp *g) { + status := readgstatus(gp) + if status&^_Gscan != _Grunning { + dumpgstatus(gp) + throw("bad g status") + } + casgstatus(gp, _Grunning, _Grunnable) + dropg() + lock(&sched.lock) + globrunqput(gp) + unlock(&sched.lock) + + schedule() +} + +// Gosched continuation on g0. +func gosched_m(gp *g) { + if trace.enabled { + traceGoSched() + } + goschedImpl(gp) +} + +// goschedguarded is a forbidden-states-avoided version of gosched_m +func goschedguarded_m(gp *g) { + + if !canPreemptM(gp.m) { + gogo(&gp.sched) // never return + } + + if trace.enabled { + traceGoSched() + } + goschedImpl(gp) +} + +func gopreempt_m(gp *g) { + if trace.enabled { + traceGoPreempt() + } + goschedImpl(gp) +} + +// preemptPark parks gp and puts it in _Gpreempted. 
+// +//go:systemstack +func preemptPark(gp *g) { + if trace.enabled { + traceGoPark(traceEvGoBlock, 0) + } + status := readgstatus(gp) + if status&^_Gscan != _Grunning { + dumpgstatus(gp) + throw("bad g status") + } + gp.waitreason = waitReasonPreempted + + if gp.asyncSafePoint { + // Double-check that async preemption does not + // happen in SPWRITE assembly functions. + // isAsyncSafePoint must exclude this case. + f := findfunc(gp.sched.pc) + if !f.valid() { + throw("preempt at unknown pc") + } + if f.flag&funcFlag_SPWRITE != 0 { + println("runtime: unexpected SPWRITE function", funcname(f), "in async preempt") + throw("preempt SPWRITE") + } + } + + // Transition from _Grunning to _Gscan|_Gpreempted. We can't + // be in _Grunning when we dropg because then we'd be running + // without an M, but the moment we're in _Gpreempted, + // something could claim this G before we've fully cleaned it + // up. Hence, we set the scan bit to lock down further + // transitions until we can dropg. + casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted) + dropg() + casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted) + schedule() +} + +// goyield is like Gosched, but it: +// - emits a GoPreempt trace event instead of a GoSched trace event +// - puts the current G on the runq of the current P instead of the globrunq +func goyield() { + checkTimeouts() + mcall(goyield_m) +} + +func goyield_m(gp *g) { + if trace.enabled { + traceGoPreempt() + } + pp := gp.m.p.ptr() + casgstatus(gp, _Grunning, _Grunnable) + dropg() + runqput(pp, gp, false) + schedule() +} + +// Finishes execution of the current goroutine. +func goexit1() { + if raceenabled { + racegoend() + } + if trace.enabled { + traceGoEnd() + } + mcall(goexit0) +} + +// goexit continuation on g0. +func goexit0(gp *g) { + _g_ := getg() + _p_ := _g_.m.p.ptr() + + casgstatus(gp, _Grunning, _Gdead) + gcController.addScannableStack(_p_, -int64(gp.stack.hi-gp.stack.lo)) + if isSystemGoroutine(gp, false) { + atomic.Xadd(&sched.ngsys, -1) + } + gp.m = nil + locked := gp.lockedm != 0 + gp.lockedm = 0 + _g_.m.lockedg = 0 + gp.preemptStop = false + gp.paniconfault = false + gp._defer = nil // should be true already but just in case. + gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. + gp.writebuf = nil + gp.waitreason = 0 + gp.param = nil + gp.labels = nil + gp.timer = nil + + if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 { + // Flush assist credit to the global pool. This gives + // better information to pacing if the application is + // rapidly creating an exiting goroutines. + assistWorkPerByte := gcController.assistWorkPerByte.Load() + scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes)) + atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) + gp.gcAssistBytes = 0 + } + + dropg() + + if GOARCH == "wasm" { // no threads yet on wasm + gfput(_p_, gp) + schedule() // never returns + } + + if _g_.m.lockedInt != 0 { + print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n") + throw("internal lockOSThread error") + } + gfput(_p_, gp) + if locked { + // The goroutine may have locked this thread because + // it put it in an unusual kernel state. Kill it + // rather than returning it to the thread pool. + + // Return to mstart, which will release the P and exit + // the thread. + if GOOS != "plan9" { // See golang.org/issue/22227. + gogo(&_g_.m.g0.sched) + } else { + // Clear lockedExt on plan9 since we may end up re-using + // this thread. 
+ _g_.m.lockedExt = 0 + } + } + schedule() +} + +// save updates getg().sched to refer to pc and sp so that a following +// gogo will restore pc and sp. +// +// save must not have write barriers because invoking a write barrier +// can clobber getg().sched. +// +//go:nosplit +//go:nowritebarrierrec +func save(pc, sp uintptr) { + _g_ := getg() + + if _g_ == _g_.m.g0 || _g_ == _g_.m.gsignal { + // m.g0.sched is special and must describe the context + // for exiting the thread. mstart1 writes to it directly. + // m.gsignal.sched should not be used at all. + // This check makes sure save calls do not accidentally + // run in contexts where they'd write to system g's. + throw("save on system g not allowed") + } + + _g_.sched.pc = pc + _g_.sched.sp = sp + _g_.sched.lr = 0 + _g_.sched.ret = 0 + // We need to ensure ctxt is zero, but can't have a write + // barrier here. However, it should always already be zero. + // Assert that. + if _g_.sched.ctxt != nil { + badctxt() + } +} + +// The goroutine g is about to enter a system call. +// Record that it's not using the cpu anymore. +// This is called only from the go syscall library and cgocall, +// not from the low-level system calls used by the runtime. +// +// Entersyscall cannot split the stack: the save must +// make g->sched refer to the caller's stack segment, because +// entersyscall is going to return immediately after. +// +// Nothing entersyscall calls can split the stack either. +// We cannot safely move the stack during an active call to syscall, +// because we do not know which of the uintptr arguments are +// really pointers (back into the stack). +// In practice, this means that we make the fast path run through +// entersyscall doing no-split things, and the slow path has to use systemstack +// to run bigger things on the system stack. +// +// reentersyscall is the entry point used by cgo callbacks, where explicitly +// saved SP and PC are restored. This is needed when exitsyscall will be called +// from a function further up in the call stack than the parent, as g->syscallsp +// must always point to a valid stack frame. entersyscall below is the normal +// entry point for syscalls, which obtains the SP and PC from the caller. +// +// Syscall tracing: +// At the start of a syscall we emit traceGoSysCall to capture the stack trace. +// If the syscall does not block, that is it, we do not emit any other events. +// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock; +// when syscall returns we emit traceGoSysExit and when the goroutine starts running +// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart. +// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock, +// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick), +// whoever emits traceGoSysBlock increments p.syscalltick afterwards; +// and we wait for the increment before emitting traceGoSysExit. +// Note that the increment is done even if tracing is not enabled, +// because tracing can be enabled in the middle of syscall. We don't want the wait to hang. +// +//go:nosplit +func reentersyscall(pc, sp uintptr) { + _g_ := getg() + + // Disable preemption because during this function g is in Gsyscall status, + // but can have inconsistent g->sched, do not let GC observe it. + _g_.m.locks++ + + // Entersyscall must not call any function that might split/grow the stack. + // (See details in comment above.) 
+ // Catch calls that might, by replacing the stack guard with something that + // will trip any stack check and leaving a flag to tell newstack to die. + _g_.stackguard0 = stackPreempt + _g_.throwsplit = true + + // Leave SP around for GC and traceback. + save(pc, sp) + _g_.syscallsp = sp + _g_.syscallpc = pc + casgstatus(_g_, _Grunning, _Gsyscall) + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { + systemstack(func() { + print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + throw("entersyscall") + }) + } + + if trace.enabled { + systemstack(traceGoSysCall) + // systemstack itself clobbers g.sched.{pc,sp} and we might + // need them later when the G is genuinely blocked in a + // syscall + save(pc, sp) + } + + if atomic.Load(&sched.sysmonwait) != 0 { + systemstack(entersyscall_sysmon) + save(pc, sp) + } + + if _g_.m.p.ptr().runSafePointFn != 0 { + // runSafePointFn may stack split if run on this stack + systemstack(runSafePointFn) + save(pc, sp) + } + + _g_.m.syscalltick = _g_.m.p.ptr().syscalltick + _g_.sysblocktraced = true + pp := _g_.m.p.ptr() + pp.m = 0 + _g_.m.oldp.set(pp) + _g_.m.p = 0 + atomic.Store(&pp.status, _Psyscall) + if sched.gcwaiting != 0 { + systemstack(entersyscall_gcwait) + save(pc, sp) + } + + _g_.m.locks-- +} + +// Standard syscall entry used by the go syscall library and normal cgo calls. +// +// This is exported via linkname to assembly in the syscall package. +// +//go:nosplit +//go:linkname entersyscall +func entersyscall() { + reentersyscall(getcallerpc(), getcallersp()) +} + +func entersyscall_sysmon() { + lock(&sched.lock) + if atomic.Load(&sched.sysmonwait) != 0 { + atomic.Store(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) +} + +func entersyscall_gcwait() { + _g_ := getg() + _p_ := _g_.m.oldp.ptr() + + lock(&sched.lock) + if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) { + if trace.enabled { + traceGoSysBlock(_p_) + traceProcStop(_p_) + } + _p_.syscalltick++ + if sched.stopwait--; sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + } + unlock(&sched.lock) +} + +// The same as entersyscall(), but with a hint that the syscall is blocking. +//go:nosplit +func entersyscallblock() { + _g_ := getg() + + _g_.m.locks++ // see comment in entersyscall + _g_.throwsplit = true + _g_.stackguard0 = stackPreempt // see comment in entersyscall + _g_.m.syscalltick = _g_.m.p.ptr().syscalltick + _g_.sysblocktraced = true + _g_.m.p.ptr().syscalltick++ + + // Leave SP around for GC and traceback. + pc := getcallerpc() + sp := getcallersp() + save(pc, sp) + _g_.syscallsp = _g_.sched.sp + _g_.syscallpc = _g_.sched.pc + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { + sp1 := sp + sp2 := _g_.sched.sp + sp3 := _g_.syscallsp + systemstack(func() { + print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + throw("entersyscallblock") + }) + } + casgstatus(_g_, _Grunning, _Gsyscall) + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { + systemstack(func() { + print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + throw("entersyscallblock") + }) + } + + systemstack(entersyscallblock_handoff) + + // Resave for traceback during blocked call. 
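+	// (systemstack clobbers g.sched.pc/sp, so the values saved before the
+	// handoff are stale by now; without this resave a traceback of the
+	// blocked goroutine could start from the wrong frame.)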
+ save(getcallerpc(), getcallersp()) + + _g_.m.locks-- +} + +func entersyscallblock_handoff() { + if trace.enabled { + traceGoSysCall() + traceGoSysBlock(getg().m.p.ptr()) + } + handoffp(releasep()) +} + +// The goroutine g exited its system call. +// Arrange for it to run on a cpu again. +// This is called only from the go syscall library, not +// from the low-level system calls used by the runtime. +// +// Write barriers are not allowed because our P may have been stolen. +// +// This is exported via linkname to assembly in the syscall package. +// +//go:nosplit +//go:nowritebarrierrec +//go:linkname exitsyscall +func exitsyscall() { + _g_ := getg() + + _g_.m.locks++ // see comment in entersyscall + if getcallersp() > _g_.syscallsp { + throw("exitsyscall: syscall frame is no longer valid") + } + + _g_.waitsince = 0 + oldp := _g_.m.oldp.ptr() + _g_.m.oldp = 0 + if exitsyscallfast(oldp) { + if trace.enabled { + if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { + systemstack(traceGoStart) + } + } + // There's a cpu for us, so we can run. + _g_.m.p.ptr().syscalltick++ + // We need to cas the status and scan before resuming... + casgstatus(_g_, _Gsyscall, _Grunning) + + // Garbage collector isn't running (since we are), + // so okay to clear syscallsp. + _g_.syscallsp = 0 + _g_.m.locks-- + if _g_.preempt { + // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } else { + // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock + _g_.stackguard0 = _g_.stack.lo + _StackGuard + } + _g_.throwsplit = false + + if sched.disable.user && !schedEnabled(_g_) { + // Scheduling of this goroutine is disabled. + Gosched() + } + + return + } + + _g_.sysexitticks = 0 + if trace.enabled { + // Wait till traceGoSysBlock event is emitted. + // This ensures consistency of the trace (the goroutine is started after it is blocked). + for oldp != nil && oldp.syscalltick == _g_.m.syscalltick { + osyield() + } + // We can't trace syscall exit right now because we don't have a P. + // Tracing code can invoke write barriers that cannot run without a P. + // So instead we remember the syscall exit time and emit the event + // in execute when we have a P. + _g_.sysexitticks = cputicks() + } + + _g_.m.locks-- + + // Call the scheduler. + mcall(exitsyscall0) + + // Scheduler returned, so we're allowed to run now. + // Delete the syscallsp information that we left for + // the garbage collector during the system call. + // Must wait until now because until gosched returns + // we don't know for sure that the garbage collector + // is not running. + _g_.syscallsp = 0 + _g_.m.p.ptr().syscalltick++ + _g_.throwsplit = false +} + +//go:nosplit +func exitsyscallfast(oldp *p) bool { + _g_ := getg() + + // Freezetheworld sets stopwait but does not retake P's. + if sched.stopwait == freezeStopWait { + return false + } + + // Try to re-acquire the last P. + if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) { + // There's a cpu for us, so we can run. + wirep(oldp) + exitsyscallfast_reacquired() + return true + } + + // Try to get any other idle P. + if sched.pidle != 0 { + var ok bool + systemstack(func() { + ok = exitsyscallfast_pidle() + if ok && trace.enabled { + if oldp != nil { + // Wait till traceGoSysBlock event is emitted. + // This ensures consistency of the trace (the goroutine is started after it is blocked). 
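+					// (p.syscalltick is bumped by whoever emits
+					// traceGoSysBlock, so spinning until it changes
+					// guarantees the block event reaches the trace before
+					// the exit event emitted just below.)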
+ for oldp.syscalltick == _g_.m.syscalltick { + osyield() + } + } + traceGoSysExit(0) + } + }) + if ok { + return true + } + } + return false +} + +// exitsyscallfast_reacquired is the exitsyscall path on which this G +// has successfully reacquired the P it was running on before the +// syscall. +// +//go:nosplit +func exitsyscallfast_reacquired() { + _g_ := getg() + if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { + if trace.enabled { + // The p was retaken and then enter into syscall again (since _g_.m.syscalltick has changed). + // traceGoSysBlock for this syscall was already emitted, + // but here we effectively retake the p from the new syscall running on the same p. + systemstack(func() { + // Denote blocking of the new syscall. + traceGoSysBlock(_g_.m.p.ptr()) + // Denote completion of the current syscall. + traceGoSysExit(0) + }) + } + _g_.m.p.ptr().syscalltick++ + } +} + +func exitsyscallfast_pidle() bool { + lock(&sched.lock) + _p_ := pidleget() + if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 { + atomic.Store(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + return true + } + return false +} + +// exitsyscall slow path on g0. +// Failed to acquire P, enqueue gp as runnable. +// +// Called via mcall, so gp is the calling g from this M. +// +//go:nowritebarrierrec +func exitsyscall0(gp *g) { + casgstatus(gp, _Gsyscall, _Grunnable) + dropg() + lock(&sched.lock) + var _p_ *p + if schedEnabled(gp) { + _p_ = pidleget() + } + var locked bool + if _p_ == nil { + globrunqput(gp) + + // Below, we stoplockedm if gp is locked. globrunqput releases + // ownership of gp, so we must check if gp is locked prior to + // committing the release by unlocking sched.lock, otherwise we + // could race with another M transitioning gp from unlocked to + // locked. + locked = gp.lockedm != 0 + } else if atomic.Load(&sched.sysmonwait) != 0 { + atomic.Store(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + execute(gp, false) // Never returns. + } + if locked { + // Wait until another thread schedules gp and so m again. + // + // N.B. lockedm must be this M, as this g was running on this M + // before entersyscall. + stoplockedm() + execute(gp, false) // Never returns. + } + stopm() + schedule() // Never returns. +} + +// Called from syscall package before fork. +//go:linkname syscall_runtime_BeforeFork syscall.runtime_BeforeFork +//go:nosplit +func syscall_runtime_BeforeFork() { + gp := getg().m.curg + + // Block signals during a fork, so that the child does not run + // a signal handler before exec if a signal is sent to the process + // group. See issue #18600. + gp.m.locks++ + sigsave(&gp.m.sigmask) + sigblock(false) + + // This function is called before fork in syscall package. + // Code between fork and exec must not allocate memory nor even try to grow stack. + // Here we spoil g->_StackGuard to reliably detect any attempts to grow stack. + // runtime_AfterFork will undo this in parent process, but not in child. + gp.stackguard0 = stackFork +} + +// Called from syscall package after fork in parent. +//go:linkname syscall_runtime_AfterFork syscall.runtime_AfterFork +//go:nosplit +func syscall_runtime_AfterFork() { + gp := getg().m.curg + + // See the comments in beforefork. + gp.stackguard0 = gp.stack.lo + _StackGuard + + msigrestore(gp.m.sigmask) + + gp.m.locks-- +} + +// inForkedChild is true while manipulating signals in the child process. 
+// This is used to avoid calling libc functions in case we are using vfork. +var inForkedChild bool + +// Called from syscall package after fork in child. +// It resets non-sigignored signals to the default handler, and +// restores the signal mask in preparation for the exec. +// +// Because this might be called during a vfork, and therefore may be +// temporarily sharing address space with the parent process, this must +// not change any global variables or calling into C code that may do so. +// +//go:linkname syscall_runtime_AfterForkInChild syscall.runtime_AfterForkInChild +//go:nosplit +//go:nowritebarrierrec +func syscall_runtime_AfterForkInChild() { + // It's OK to change the global variable inForkedChild here + // because we are going to change it back. There is no race here, + // because if we are sharing address space with the parent process, + // then the parent process can not be running concurrently. + inForkedChild = true + + clearSignalHandlers() + + // When we are the child we are the only thread running, + // so we know that nothing else has changed gp.m.sigmask. + msigrestore(getg().m.sigmask) + + inForkedChild = false +} + +// pendingPreemptSignals is the number of preemption signals +// that have been sent but not received. This is only used on Darwin. +// For #41702. +var pendingPreemptSignals uint32 + +// Called from syscall package before Exec. +//go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec +func syscall_runtime_BeforeExec() { + // Prevent thread creation during exec. + execLock.lock() + + // On Darwin, wait for all pending preemption signals to + // be received. See issue #41702. + if GOOS == "darwin" || GOOS == "ios" { + for int32(atomic.Load(&pendingPreemptSignals)) > 0 { + osyield() + } + } +} + +// Called from syscall package after Exec. +//go:linkname syscall_runtime_AfterExec syscall.runtime_AfterExec +func syscall_runtime_AfterExec() { + execLock.unlock() +} + +// Allocate a new g, with a stack big enough for stacksize bytes. +func malg(stacksize int32) *g { + newg := new(g) + if stacksize >= 0 { + stacksize = round2(_StackSystem + stacksize) + systemstack(func() { + newg.stack = stackalloc(uint32(stacksize)) + }) + newg.stackguard0 = newg.stack.lo + _StackGuard + newg.stackguard1 = ^uintptr(0) + // Clear the bottom word of the stack. We record g + // there on gsignal stack during VDSO on ARM and ARM64. + *(*uintptr)(unsafe.Pointer(newg.stack.lo)) = 0 + } + return newg +} + +// Create a new g running fn. +// Put it on the queue of g's waiting to run. +// The compiler turns a go statement into a call to this. +func newproc(fn *funcval) { + gp := getg() + pc := getcallerpc() + systemstack(func() { + newg := newproc1(fn, gp, pc) + + _p_ := getg().m.p.ptr() + runqput(_p_, newg, true) + + if mainStarted { + wakep() + } + }) +} + +// Create a new g in state _Grunnable, starting at fn. callerpc is the +// address of the go statement that created this. The caller is responsible +// for adding the new g to the scheduler. +func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g { + _g_ := getg() + + if fn == nil { + _g_.m.throwing = -1 // do not dump full stacks + throw("go of nil func value") + } + acquirem() // disable preemption because it can be holding p in a local var + + _p_ := _g_.m.p.ptr() + newg := gfget(_p_) + if newg == nil { + newg = malg(_StackMin) + casgstatus(newg, _Gidle, _Gdead) + allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack. 
+ } + if newg.stack.hi == 0 { + throw("newproc1: newg missing stack") + } + + if readgstatus(newg) != _Gdead { + throw("newproc1: new g is not Gdead") + } + + totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame + totalSize = alignUp(totalSize, sys.StackAlign) + sp := newg.stack.hi - totalSize + spArg := sp + if usesLR { + // caller's LR + *(*uintptr)(unsafe.Pointer(sp)) = 0 + prepGoExitFrame(sp) + spArg += sys.MinFrameSize + } + + memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched)) + newg.sched.sp = sp + newg.stktopsp = sp + newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function + newg.sched.g = guintptr(unsafe.Pointer(newg)) + gostartcallfn(&newg.sched, fn) + newg.gopc = callerpc + newg.ancestors = saveAncestors(callergp) + newg.startpc = fn.fn + if isSystemGoroutine(newg, false) { + atomic.Xadd(&sched.ngsys, +1) + } else { + // Only user goroutines inherit pprof labels. + if _g_.m.curg != nil { + newg.labels = _g_.m.curg.labels + } + } + // Track initial transition? + newg.trackingSeq = uint8(fastrand()) + if newg.trackingSeq%gTrackingPeriod == 0 { + newg.tracking = true + } + casgstatus(newg, _Gdead, _Grunnable) + gcController.addScannableStack(_p_, int64(newg.stack.hi-newg.stack.lo)) + + if _p_.goidcache == _p_.goidcacheend { + // Sched.goidgen is the last allocated id, + // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch]. + // At startup sched.goidgen=0, so main goroutine receives goid=1. + _p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch) + _p_.goidcache -= _GoidCacheBatch - 1 + _p_.goidcacheend = _p_.goidcache + _GoidCacheBatch + } + newg.goid = int64(_p_.goidcache) + _p_.goidcache++ + if raceenabled { + newg.racectx = racegostart(callerpc) + } + if trace.enabled { + traceGoCreate(newg, newg.startpc) + } + releasem(_g_.m) + + return newg +} + +// saveAncestors copies previous ancestors of the given caller g and +// includes infor for the current caller into a new set of tracebacks for +// a g being created. +func saveAncestors(callergp *g) *[]ancestorInfo { + // Copy all prior info, except for the root goroutine (goid 0). + if debug.tracebackancestors <= 0 || callergp.goid == 0 { + return nil + } + var callerAncestors []ancestorInfo + if callergp.ancestors != nil { + callerAncestors = *callergp.ancestors + } + n := int32(len(callerAncestors)) + 1 + if n > debug.tracebackancestors { + n = debug.tracebackancestors + } + ancestors := make([]ancestorInfo, n) + copy(ancestors[1:], callerAncestors) + + var pcs [_TracebackMaxFrames]uintptr + npcs := gcallers(callergp, 0, pcs[:]) + ipcs := make([]uintptr, npcs) + copy(ipcs, pcs[:]) + ancestors[0] = ancestorInfo{ + pcs: ipcs, + goid: callergp.goid, + gopc: callergp.gopc, + } + + ancestorsp := new([]ancestorInfo) + *ancestorsp = ancestors + return ancestorsp +} + +// Put on gfree list. +// If local list is too long, transfer a batch to the global list. +func gfput(_p_ *p, gp *g) { + if readgstatus(gp) != _Gdead { + throw("gfput: bad status (not Gdead)") + } + + stksize := gp.stack.hi - gp.stack.lo + + if stksize != _FixedStack { + // non-standard stack size - free it. 
+ stackfree(gp.stack) + gp.stack.lo = 0 + gp.stack.hi = 0 + gp.stackguard0 = 0 + } + + _p_.gFree.push(gp) + _p_.gFree.n++ + if _p_.gFree.n >= 64 { + var ( + inc int32 + stackQ gQueue + noStackQ gQueue + ) + for _p_.gFree.n >= 32 { + gp = _p_.gFree.pop() + _p_.gFree.n-- + if gp.stack.lo == 0 { + noStackQ.push(gp) + } else { + stackQ.push(gp) + } + inc++ + } + lock(&sched.gFree.lock) + sched.gFree.noStack.pushAll(noStackQ) + sched.gFree.stack.pushAll(stackQ) + sched.gFree.n += inc + unlock(&sched.gFree.lock) + } +} + +// Get from gfree list. +// If local list is empty, grab a batch from global list. +func gfget(_p_ *p) *g { +retry: + if _p_.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) { + lock(&sched.gFree.lock) + // Move a batch of free Gs to the P. + for _p_.gFree.n < 32 { + // Prefer Gs with stacks. + gp := sched.gFree.stack.pop() + if gp == nil { + gp = sched.gFree.noStack.pop() + if gp == nil { + break + } + } + sched.gFree.n-- + _p_.gFree.push(gp) + _p_.gFree.n++ + } + unlock(&sched.gFree.lock) + goto retry + } + gp := _p_.gFree.pop() + if gp == nil { + return nil + } + _p_.gFree.n-- + if gp.stack.lo == 0 { + // Stack was deallocated in gfput. Allocate a new one. + systemstack(func() { + gp.stack = stackalloc(_FixedStack) + }) + gp.stackguard0 = gp.stack.lo + _StackGuard + } else { + if raceenabled { + racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) + } + if msanenabled { + msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) + } + if asanenabled { + asanunpoison(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) + } + } + return gp +} + +// Purge all cached G's from gfree list to the global list. +func gfpurge(_p_ *p) { + var ( + inc int32 + stackQ gQueue + noStackQ gQueue + ) + for !_p_.gFree.empty() { + gp := _p_.gFree.pop() + _p_.gFree.n-- + if gp.stack.lo == 0 { + noStackQ.push(gp) + } else { + stackQ.push(gp) + } + inc++ + } + lock(&sched.gFree.lock) + sched.gFree.noStack.pushAll(noStackQ) + sched.gFree.stack.pushAll(stackQ) + sched.gFree.n += inc + unlock(&sched.gFree.lock) +} + +// Breakpoint executes a breakpoint trap. +func Breakpoint() { + breakpoint() +} + +// dolockOSThread is called by LockOSThread and lockOSThread below +// after they modify m.locked. Do not allow preemption during this call, +// or else the m might be different in this function than in the caller. +//go:nosplit +func dolockOSThread() { + if GOARCH == "wasm" { + return // no threads on wasm yet + } + _g_ := getg() + _g_.m.lockedg.set(_g_) + _g_.lockedm.set(_g_.m) +} + +//go:nosplit + +// LockOSThread wires the calling goroutine to its current operating system thread. +// The calling goroutine will always execute in that thread, +// and no other goroutine will execute in it, +// until the calling goroutine has made as many calls to +// UnlockOSThread as to LockOSThread. +// If the calling goroutine exits without unlocking the thread, +// the thread will be terminated. +// +// All init functions are run on the startup thread. Calling LockOSThread +// from an init function will cause the main function to be invoked on +// that thread. +// +// A goroutine should call LockOSThread before calling OS services or +// non-Go library functions that depend on per-thread state. +func LockOSThread() { + if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" { + // If we need to start a new thread from the locked + // thread, we need the template thread. Start it now + // while we're in a known-good state. 
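+		// (startTemplateThread is idempotent: its CAS on haveTemplateThread
+		// makes later calls return immediately, so racing LockOSThread
+		// callers are harmless.)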
+ startTemplateThread() + } + _g_ := getg() + _g_.m.lockedExt++ + if _g_.m.lockedExt == 0 { + _g_.m.lockedExt-- + panic("LockOSThread nesting overflow") + } + dolockOSThread() +} + +//go:nosplit +func lockOSThread() { + getg().m.lockedInt++ + dolockOSThread() +} + +// dounlockOSThread is called by UnlockOSThread and unlockOSThread below +// after they update m->locked. Do not allow preemption during this call, +// or else the m might be in different in this function than in the caller. +//go:nosplit +func dounlockOSThread() { + if GOARCH == "wasm" { + return // no threads on wasm yet + } + _g_ := getg() + if _g_.m.lockedInt != 0 || _g_.m.lockedExt != 0 { + return + } + _g_.m.lockedg = 0 + _g_.lockedm = 0 +} + +//go:nosplit + +// UnlockOSThread undoes an earlier call to LockOSThread. +// If this drops the number of active LockOSThread calls on the +// calling goroutine to zero, it unwires the calling goroutine from +// its fixed operating system thread. +// If there are no active LockOSThread calls, this is a no-op. +// +// Before calling UnlockOSThread, the caller must ensure that the OS +// thread is suitable for running other goroutines. If the caller made +// any permanent changes to the state of the thread that would affect +// other goroutines, it should not call this function and thus leave +// the goroutine locked to the OS thread until the goroutine (and +// hence the thread) exits. +func UnlockOSThread() { + _g_ := getg() + if _g_.m.lockedExt == 0 { + return + } + _g_.m.lockedExt-- + dounlockOSThread() +} + +//go:nosplit +func unlockOSThread() { + _g_ := getg() + if _g_.m.lockedInt == 0 { + systemstack(badunlockosthread) + } + _g_.m.lockedInt-- + dounlockOSThread() +} + +func badunlockosthread() { + throw("runtime: internal error: misuse of lockOSThread/unlockOSThread") +} + +func gcount() int32 { + n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - int32(atomic.Load(&sched.ngsys)) + for _, _p_ := range allp { + n -= _p_.gFree.n + } + + // All these variables can be changed concurrently, so the result can be inconsistent. + // But at least the current goroutine is running. + if n < 1 { + n = 1 + } + return n +} + +func mcount() int32 { + return int32(sched.mnext - sched.nmfreed) +} + +var prof struct { + signalLock uint32 + hz int32 +} + +func _System() { _System() } +func _ExternalCode() { _ExternalCode() } +func _LostExternalCode() { _LostExternalCode() } +func _GC() { _GC() } +func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() } +func _VDSO() { _VDSO() } + +// Called if we receive a SIGPROF signal. +// Called by the signal handler, may run during STW. +//go:nowritebarrierrec +func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { + if prof.hz == 0 { + return + } + + // If mp.profilehz is 0, then profiling is not enabled for this thread. + // We must check this to avoid a deadlock between setcpuprofilerate + // and the call to cpuprof.add, below. + if mp != nil && mp.profilehz == 0 { + return + } + + // On mips{,le}/arm, 64bit atomics are emulated with spinlocks, in + // runtime/internal/atomic. If SIGPROF arrives while the program is inside + // the critical section, it creates a deadlock (when writing the sample). + // As a workaround, create a counter of SIGPROFs while in critical section + // to store the count, and pass it to sigprof.add() later when SIGPROF is + // received from somewhere else (with _LostSIGPROFDuringAtomic64 as pc). 
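Editorial aside, not part of the vendored source: as the LockOSThread/UnlockOSThread documentation above says, a goroutine that depends on per-thread OS state must wire itself to its thread. A typical, illustrative use (doThreadLocalWork stands in for any thread-sensitive call):

package main

import (
	"fmt"
	"runtime"
)

// doThreadLocalWork stands in for any operation that relies on
// per-thread state (thread-local storage, thread priorities, some
// C libraries, and so on).
func doThreadLocalWork() {
	fmt.Println("running on a wired OS thread")
}

func main() {
	done := make(chan struct{})
	go func() {
		// Wire this goroutine to its current OS thread for the
		// duration of the thread-sensitive work.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()
		doThreadLocalWork()
		close(done)
	}()
	<-done
}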
+ if GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "arm" { + if f := findfunc(pc); f.valid() { + if hasPrefix(funcname(f), "runtime/internal/atomic") { + cpuprof.lostAtomic++ + return + } + } + if GOARCH == "arm" && goarm < 7 && GOOS == "linux" && pc&0xffff0000 == 0xffff0000 { + // runtime/internal/atomic functions call into kernel + // helpers on arm < 7. See + // runtime/internal/atomic/sys_linux_arm.s. + cpuprof.lostAtomic++ + return + } + } + + // Profiling runs concurrently with GC, so it must not allocate. + // Set a trap in case the code does allocate. + // Note that on windows, one thread takes profiles of all the + // other threads, so mp is usually not getg().m. + // In fact mp may not even be stopped. + // See golang.org/issue/17165. + getg().m.mallocing++ + + var stk [maxCPUProfStack]uintptr + n := 0 + if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 { + cgoOff := 0 + // Check cgoCallersUse to make sure that we are not + // interrupting other code that is fiddling with + // cgoCallers. We are running in a signal handler + // with all signals blocked, so we don't have to worry + // about any other code interrupting us. + if atomic.Load(&mp.cgoCallersUse) == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 { + for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 { + cgoOff++ + } + copy(stk[:], mp.cgoCallers[:cgoOff]) + mp.cgoCallers[0] = 0 + } + + // Collect Go stack that leads to the cgo call. + n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0) + if n > 0 { + n += cgoOff + } + } else { + n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack) + } + + if n <= 0 { + // Normal traceback is impossible or has failed. + // See if it falls into several common cases. + n = 0 + if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 { + // Libcall, i.e. runtime syscall on windows. + // Collect Go stack that leads to the call. + n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0) + } + if n == 0 && mp != nil && mp.vdsoSP != 0 { + n = gentraceback(mp.vdsoPC, mp.vdsoSP, 0, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack) + } + if n == 0 { + // If all of the above has failed, account it against abstract "System" or "GC". + n = 2 + if inVDSOPage(pc) { + pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum + } else if pc > firstmoduledata.etext { + // "ExternalCode" is better than "etext". + pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum + } + stk[0] = pc + if mp.preemptoff != "" { + stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum + } else { + stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum + } + } + } + + if prof.hz != 0 { + // Note: it can happen on Windows that we interrupted a system thread + // with no g, so gp could nil. The other nil checks are done out of + // caution, but not expected to be nil in practice. + var tagPtr *unsafe.Pointer + if gp != nil && gp.m != nil && gp.m.curg != nil { + tagPtr = &gp.m.curg.labels + } + cpuprof.add(tagPtr, stk[:n]) + } + getg().m.mallocing-- +} + +// setcpuprofilerate sets the CPU profiling rate to hz times per second. +// If hz <= 0, setcpuprofilerate turns off CPU profiling. +func setcpuprofilerate(hz int32) { + // Force sane arguments. + if hz < 0 { + hz = 0 + } + + // Disable preemption, otherwise we can be rescheduled to another thread + // that has profiling enabled. 
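Editorial aside, not part of the vendored source: sigprof and setcpuprofilerate are internal; user code normally enables CPU profiling through runtime/pprof (or runtime.SetCPUProfileRate), which ultimately drives the machinery above. A minimal usage sketch:

package main

import (
	"log"
	"os"
	"runtime/pprof"
)

func main() {
	f, err := os.Create("cpu.pprof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Start sampling; on Unix-like systems the samples are delivered
	// via SIGPROF and handled by sigprof above until StopCPUProfile.
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	work()
}

func work() {
	s := 0
	for i := 0; i < 10000000; i++ {
		s += i
	}
	_ = s
}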
+ _g_ := getg() + _g_.m.locks++ + + // Stop profiler on this thread so that it is safe to lock prof. + // if a profiling signal came in while we had prof locked, + // it would deadlock. + setThreadCPUProfiler(0) + + for !atomic.Cas(&prof.signalLock, 0, 1) { + osyield() + } + if prof.hz != hz { + setProcessCPUProfiler(hz) + prof.hz = hz + } + atomic.Store(&prof.signalLock, 0) + + lock(&sched.lock) + sched.profilehz = hz + unlock(&sched.lock) + + if hz != 0 { + setThreadCPUProfiler(hz) + } + + _g_.m.locks-- +} + +// init initializes pp, which may be a freshly allocated p or a +// previously destroyed p, and transitions it to status _Pgcstop. +func (pp *p) init(id int32) { + pp.id = id + pp.status = _Pgcstop + pp.sudogcache = pp.sudogbuf[:0] + pp.deferpool = pp.deferpoolbuf[:0] + pp.wbBuf.reset() + if pp.mcache == nil { + if id == 0 { + if mcache0 == nil { + throw("missing mcache?") + } + // Use the bootstrap mcache0. Only one P will get + // mcache0: the one with ID 0. + pp.mcache = mcache0 + } else { + pp.mcache = allocmcache() + } + } + if raceenabled && pp.raceprocctx == 0 { + if id == 0 { + pp.raceprocctx = raceprocctx0 + raceprocctx0 = 0 // bootstrap + } else { + pp.raceprocctx = raceproccreate() + } + } + lockInit(&pp.timersLock, lockRankTimers) + + // This P may get timers when it starts running. Set the mask here + // since the P may not go through pidleget (notably P 0 on startup). + timerpMask.set(id) + // Similarly, we may not go through pidleget before this P starts + // running if it is P 0 on startup. + idlepMask.clear(id) +} + +// destroy releases all of the resources associated with pp and +// transitions it to status _Pdead. +// +// sched.lock must be held and the world must be stopped. +func (pp *p) destroy() { + assertLockHeld(&sched.lock) + assertWorldStopped() + + // Move all runnable goroutines to the global queue + for pp.runqhead != pp.runqtail { + // Pop from tail of local queue + pp.runqtail-- + gp := pp.runq[pp.runqtail%uint32(len(pp.runq))].ptr() + // Push onto head of global queue + globrunqputhead(gp) + } + if pp.runnext != 0 { + globrunqputhead(pp.runnext.ptr()) + pp.runnext = 0 + } + if len(pp.timers) > 0 { + plocal := getg().m.p.ptr() + // The world is stopped, but we acquire timersLock to + // protect against sysmon calling timeSleepUntil. + // This is the only case where we hold the timersLock of + // more than one P, so there are no deadlock concerns. + lock(&plocal.timersLock) + lock(&pp.timersLock) + moveTimers(plocal, pp.timers) + pp.timers = nil + pp.numTimers = 0 + pp.deletedTimers = 0 + atomic.Store64(&pp.timer0When, 0) + unlock(&pp.timersLock) + unlock(&plocal.timersLock) + } + // Flush p's write barrier buffer. + if gcphase != _GCoff { + wbBufFlush1(pp) + pp.gcw.dispose() + } + for i := range pp.sudogbuf { + pp.sudogbuf[i] = nil + } + pp.sudogcache = pp.sudogbuf[:0] + for j := range pp.deferpoolbuf { + pp.deferpoolbuf[j] = nil + } + pp.deferpool = pp.deferpoolbuf[:0] + systemstack(func() { + for i := 0; i < pp.mspancache.len; i++ { + // Safe to call since the world is stopped. + mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i])) + } + pp.mspancache.len = 0 + lock(&mheap_.lock) + pp.pcache.flush(&mheap_.pages) + unlock(&mheap_.lock) + }) + freemcache(pp.mcache) + pp.mcache = nil + gfpurge(pp) + traceProcFree(pp) + if raceenabled { + if pp.timerRaceCtx != 0 { + // The race detector code uses a callback to fetch + // the proc context, so arrange for that callback + // to see the right thing. 
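Editorial aside, not part of the vendored source: setcpuprofilerate above protects prof with a tiny spin lock (a Cas on prof.signalLock plus osyield) because it has to coexist with the signal handler. A user-space analogue of that pattern, shown only for illustration (ordinary Go code should prefer sync.Mutex):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// spinLock mimics the prof.signalLock pattern: spin on a CAS and
// yield while waiting.
type spinLock uint32

func (l *spinLock) lock() {
	for !atomic.CompareAndSwapUint32((*uint32)(l), 0, 1) {
		runtime.Gosched() // analogous to osyield in the runtime
	}
}

func (l *spinLock) unlock() {
	atomic.StoreUint32((*uint32)(l), 0)
}

func main() {
	var l spinLock
	l.lock()
	fmt.Println("critical section")
	l.unlock()
}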
+ // This hack only works because we are the only + // thread running. + mp := getg().m + phold := mp.p.ptr() + mp.p.set(pp) + + racectxend(pp.timerRaceCtx) + pp.timerRaceCtx = 0 + + mp.p.set(phold) + } + raceprocdestroy(pp.raceprocctx) + pp.raceprocctx = 0 + } + pp.gcAssistTime = 0 + pp.status = _Pdead +} + +// Change number of processors. +// +// sched.lock must be held, and the world must be stopped. +// +// gcworkbufs must not be being modified by either the GC or the write barrier +// code, so the GC must not be running if the number of Ps actually changes. +// +// Returns list of Ps with local work, they need to be scheduled by the caller. +func procresize(nprocs int32) *p { + assertLockHeld(&sched.lock) + assertWorldStopped() + + old := gomaxprocs + if old < 0 || nprocs <= 0 { + throw("procresize: invalid arg") + } + if trace.enabled { + traceGomaxprocs(nprocs) + } + + // update statistics + now := nanotime() + if sched.procresizetime != 0 { + sched.totaltime += int64(old) * (now - sched.procresizetime) + } + sched.procresizetime = now + + maskWords := (nprocs + 31) / 32 + + // Grow allp if necessary. + if nprocs > int32(len(allp)) { + // Synchronize with retake, which could be running + // concurrently since it doesn't run on a P. + lock(&allpLock) + if nprocs <= int32(cap(allp)) { + allp = allp[:nprocs] + } else { + nallp := make([]*p, nprocs) + // Copy everything up to allp's cap so we + // never lose old allocated Ps. + copy(nallp, allp[:cap(allp)]) + allp = nallp + } + + if maskWords <= int32(cap(idlepMask)) { + idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] + } else { + nidlepMask := make([]uint32, maskWords) + // No need to copy beyond len, old Ps are irrelevant. + copy(nidlepMask, idlepMask) + idlepMask = nidlepMask + + ntimerpMask := make([]uint32, maskWords) + copy(ntimerpMask, timerpMask) + timerpMask = ntimerpMask + } + unlock(&allpLock) + } + + // initialize new P's + for i := old; i < nprocs; i++ { + pp := allp[i] + if pp == nil { + pp = new(p) + } + pp.init(i) + atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp)) + } + + _g_ := getg() + if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs { + // continue to use the current P + _g_.m.p.ptr().status = _Prunning + _g_.m.p.ptr().mcache.prepareForSweep() + } else { + // release the current P and acquire allp[0]. + // + // We must do this before destroying our current P + // because p.destroy itself has write barriers, so we + // need to do that from a valid P. + if _g_.m.p != 0 { + if trace.enabled { + // Pretend that we were descheduled + // and then scheduled again to keep + // the trace sane. + traceGoSched() + traceProcStop(_g_.m.p.ptr()) + } + _g_.m.p.ptr().m = 0 + } + _g_.m.p = 0 + p := allp[0] + p.m = 0 + p.status = _Pidle + acquirep(p) + if trace.enabled { + traceGoStart() + } + } + + // g.m.p is now set, so we no longer need mcache0 for bootstrapping. + mcache0 = nil + + // release resources from unused P's + for i := nprocs; i < old; i++ { + p := allp[i] + p.destroy() + // can't free P itself because it can be referenced by an M in syscall + } + + // Trim allp. 
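Editorial aside, not part of the vendored source: procresize above is what runs, with the world stopped, whenever the number of Ps changes; from user code that change is requested through runtime.GOMAXPROCS. A small illustration:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// GOMAXPROCS(0) just queries the current setting.
	fmt.Println("current Ps:", runtime.GOMAXPROCS(0))

	// Changing the value stops the world and resizes the set of Ps
	// (the work done by procresize above), returning the old value.
	old := runtime.GOMAXPROCS(2)
	fmt.Println("previous Ps:", old)
}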
+ if int32(len(allp)) != nprocs { + lock(&allpLock) + allp = allp[:nprocs] + idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] + unlock(&allpLock) + } + + var runnablePs *p + for i := nprocs - 1; i >= 0; i-- { + p := allp[i] + if _g_.m.p.ptr() == p { + continue + } + p.status = _Pidle + if runqempty(p) { + pidleput(p) + } else { + p.m.set(mget()) + p.link.set(runnablePs) + runnablePs = p + } + } + stealOrder.reset(uint32(nprocs)) + var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 + atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) + return runnablePs +} + +// Associate p and the current m. +// +// This function is allowed to have write barriers even if the caller +// isn't because it immediately acquires _p_. +// +//go:yeswritebarrierrec +func acquirep(_p_ *p) { + // Do the part that isn't allowed to have write barriers. + wirep(_p_) + + // Have p; write barriers now allowed. + + // Perform deferred mcache flush before this P can allocate + // from a potentially stale mcache. + _p_.mcache.prepareForSweep() + + if trace.enabled { + traceProcStart() + } +} + +// wirep is the first step of acquirep, which actually associates the +// current M to _p_. This is broken out so we can disallow write +// barriers for this part, since we don't yet have a P. +// +//go:nowritebarrierrec +//go:nosplit +func wirep(_p_ *p) { + _g_ := getg() + + if _g_.m.p != 0 { + throw("wirep: already in go") + } + if _p_.m != 0 || _p_.status != _Pidle { + id := int64(0) + if _p_.m != 0 { + id = _p_.m.ptr().id + } + print("wirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n") + throw("wirep: invalid p state") + } + _g_.m.p.set(_p_) + _p_.m.set(_g_.m) + _p_.status = _Prunning +} + +// Disassociate p and the current m. +func releasep() *p { + _g_ := getg() + + if _g_.m.p == 0 { + throw("releasep: invalid arg") + } + _p_ := _g_.m.p.ptr() + if _p_.m.ptr() != _g_.m || _p_.status != _Prunning { + print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " p->status=", _p_.status, "\n") + throw("releasep: invalid p state") + } + if trace.enabled { + traceProcStop(_g_.m.p.ptr()) + } + _g_.m.p = 0 + _p_.m = 0 + _p_.status = _Pidle + return _p_ +} + +func incidlelocked(v int32) { + lock(&sched.lock) + sched.nmidlelocked += v + if v > 0 { + checkdead() + } + unlock(&sched.lock) +} + +// Check for deadlock situation. +// The check is based on number of running M's, if 0 -> deadlock. +// sched.lock must be held. +func checkdead() { + assertLockHeld(&sched.lock) + + // For -buildmode=c-shared or -buildmode=c-archive it's OK if + // there are no running goroutines. The calling program is + // assumed to be running. + if islibrary || isarchive { + return + } + + // If we are dying because of a signal caught on an already idle thread, + // freezetheworld will cause all running threads to block. + // And runtime will essentially enter into deadlock state, + // except that there is a thread that will call exit soon. + if panicking > 0 { + return + } + + // If we are not running under cgo, but we have an extra M then account + // for it. (It is possible to have an extra M on Windows without cgo to + // accommodate callbacks created by syscall.NewCallback. See issue #6751 + // for details.) 
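Editorial aside, not part of the vendored source: checkdead above is the code behind the familiar fatal error printed when every goroutine is blocked. For example, this program has no runnable goroutines once main blocks, so the runtime reports a deadlock instead of hanging:

package main

func main() {
	ch := make(chan int)
	// Nothing ever sends on ch and no other goroutine exists, so the
	// runtime's deadlock check fails the program with
	// "fatal error: all goroutines are asleep - deadlock!".
	<-ch
}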
+ var run0 int32 + if !iscgo && cgoHasExtraM { + mp := lockextra(true) + haveExtraM := extraMCount > 0 + unlockextra(mp) + if haveExtraM { + run0 = 1 + } + } + + run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys + if run > run0 { + return + } + if run < 0 { + print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n") + throw("checkdead: inconsistent counts") + } + + grunning := 0 + forEachG(func(gp *g) { + if isSystemGoroutine(gp, false) { + return + } + s := readgstatus(gp) + switch s &^ _Gscan { + case _Gwaiting, + _Gpreempted: + grunning++ + case _Grunnable, + _Grunning, + _Gsyscall: + print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n") + throw("checkdead: runnable g") + } + }) + if grunning == 0 { // possible if main goroutine calls runtime·Goexit() + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang + throw("no goroutines (main called runtime.Goexit) - deadlock!") + } + + // Maybe jump time forward for playground. + if faketime != 0 { + when, _p_ := timeSleepUntil() + if _p_ != nil { + faketime = when + for pp := &sched.pidle; *pp != 0; pp = &(*pp).ptr().link { + if (*pp).ptr() == _p_ { + *pp = _p_.link + break + } + } + mp := mget() + if mp == nil { + // There should always be a free M since + // nothing is running. + throw("checkdead: no m for timer") + } + mp.nextp.set(_p_) + notewakeup(&mp.park) + return + } + } + + // There are no goroutines running, so we can look at the P's. + for _, _p_ := range allp { + if len(_p_.timers) > 0 { + return + } + } + + getg().m.throwing = -1 // do not dump full stacks + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang + throw("all goroutines are asleep - deadlock!") +} + +// forcegcperiod is the maximum time in nanoseconds between garbage +// collections. If we go this long without a garbage collection, one +// is forced to run. +// +// This is a variable for testing purposes. It normally doesn't change. +var forcegcperiod int64 = 2 * 60 * 1e9 + +// needSysmonWorkaround is true if the workaround for +// golang.org/issue/42515 is needed on NetBSD. +var needSysmonWorkaround bool = false + +// Always runs without a P, so write barriers are not allowed. +// +//go:nowritebarrierrec +func sysmon() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + + lasttrace := int64(0) + idle := 0 // how many cycles in succession we had not wokeup somebody + delay := uint32(0) + + for { + if idle == 0 { // start with 20us sleep... + delay = 20 + } else if idle > 50 { // start doubling the sleep after 1ms... + delay *= 2 + } + if delay > 10*1000 { // up to 10ms + delay = 10 * 1000 + } + usleep(delay) + + // sysmon should not enter deep sleep if schedtrace is enabled so that + // it can print that information at the right time. + // + // It should also not enter deep sleep if there are any active P's so + // that it can retake P's from syscalls, preempt long running G's, and + // poll the network if all P's are busy for long stretches. + // + // It should wakeup from deep sleep if any P's become active either due + // to exiting a syscall or waking up due to a timer expiring so that it + // can resume performing those duties. If it wakes from a syscall it + // resets idle and delay as a bet that since it had retaken a P from a + // syscall before, it may need to do it again shortly after the + // application starts work again. 
It does not reset idle when waking + // from a timer to avoid adding system load to applications that spend + // most of their time sleeping. + now := nanotime() + if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { + lock(&sched.lock) + if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { + syscallWake := false + next, _ := timeSleepUntil() + if next > now { + atomic.Store(&sched.sysmonwait, 1) + unlock(&sched.lock) + // Make wake-up period small enough + // for the sampling to be correct. + sleep := forcegcperiod / 2 + if next-now < sleep { + sleep = next - now + } + shouldRelax := sleep >= osRelaxMinNS + if shouldRelax { + osRelax(true) + } + syscallWake = notetsleep(&sched.sysmonnote, sleep) + if shouldRelax { + osRelax(false) + } + lock(&sched.lock) + atomic.Store(&sched.sysmonwait, 0) + noteclear(&sched.sysmonnote) + } + if syscallWake { + idle = 0 + delay = 20 + } + } + unlock(&sched.lock) + } + + lock(&sched.sysmonlock) + // Update now in case we blocked on sysmonnote or spent a long time + // blocked on schedlock or sysmonlock above. + now = nanotime() + + // trigger libc interceptors if needed + if *cgo_yield != nil { + asmcgocall(*cgo_yield, nil) + } + // poll network if not polled for more than 10ms + lastpoll := int64(atomic.Load64(&sched.lastpoll)) + if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { + atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) + list := netpoll(0) // non-blocking - returns list of goroutines + if !list.empty() { + // Need to decrement number of idle locked M's + // (pretending that one more is running) before injectglist. + // Otherwise it can lead to the following situation: + // injectglist grabs all P's but before it starts M's to run the P's, + // another M returns from syscall, finishes running its G, + // observes that there is no work to do and no other running M's + // and reports deadlock. + incidlelocked(-1) + injectglist(&list) + incidlelocked(1) + } + } + if GOOS == "netbsd" && needSysmonWorkaround { + // netpoll is responsible for waiting for timer + // expiration, so we typically don't have to worry + // about starting an M to service timers. (Note that + // sleep for timeSleepUntil above simply ensures sysmon + // starts running again when that timer expiration may + // cause Go code to run again). + // + // However, netbsd has a kernel bug that sometimes + // misses netpollBreak wake-ups, which can lead to + // unbounded delays servicing timers. If we detect this + // overrun, then startm to get something to handle the + // timer. + // + // See issue 42515 and + // https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094. + if next, _ := timeSleepUntil(); next < now { + startm(nil, false) + } + } + if atomic.Load(&scavenge.sysmonWake) != 0 { + // Kick the scavenger awake if someone requested it. 
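Editorial aside, not part of the vendored source: sysmon above sleeps with an adaptive delay: 20µs while it keeps finding work, doubling once it has been idle for more than 50 iterations, and capped at 10ms. A standalone sketch of that backoff schedule (pollOnce is a hypothetical stand-in for sysmon's real duties):

package main

import (
	"fmt"
	"time"
)

// pollOnce stands in for one round of sysmon's work (retaking Ps,
// polling the network, and so on). It reports whether it did anything.
func pollOnce(i int) bool { return i%7 == 0 }

func main() {
	idle := 0
	delay := 20 * time.Microsecond
	for i := 0; i < 200; i++ {
		if idle == 0 {
			delay = 20 * time.Microsecond // busy: poll frequently
		} else if idle > 50 {
			delay *= 2 // long idle: back off exponentially
		}
		if delay > 10*time.Millisecond {
			delay = 10 * time.Millisecond // cap at 10ms, as sysmon does
		}
		time.Sleep(delay)
		if pollOnce(i) {
			idle = 0
		} else {
			idle++
		}
	}
	fmt.Println("final delay:", delay)
}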
+ wakeScavenger() + } + // retake P's blocked in syscalls + // and preempt long running G's + if retake(now) != 0 { + idle = 0 + } else { + idle++ + } + // check if we need to force a GC + if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 { + lock(&forcegc.lock) + forcegc.idle = 0 + var list gList + list.push(forcegc.g) + injectglist(&list) + unlock(&forcegc.lock) + } + if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now { + lasttrace = now + schedtrace(debug.scheddetail > 0) + } + unlock(&sched.sysmonlock) + } +} + +type sysmontick struct { + schedtick uint32 + schedwhen int64 + syscalltick uint32 + syscallwhen int64 +} + +// forcePreemptNS is the time slice given to a G before it is +// preempted. +const forcePreemptNS = 10 * 1000 * 1000 // 10ms + +func retake(now int64) uint32 { + n := 0 + // Prevent allp slice changes. This lock will be completely + // uncontended unless we're already stopping the world. + lock(&allpLock) + // We can't use a range loop over allp because we may + // temporarily drop the allpLock. Hence, we need to re-fetch + // allp each time around the loop. + for i := 0; i < len(allp); i++ { + _p_ := allp[i] + if _p_ == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. + continue + } + pd := &_p_.sysmontick + s := _p_.status + sysretake := false + if s == _Prunning || s == _Psyscall { + // Preempt G if it's running for too long. + t := int64(_p_.schedtick) + if int64(pd.schedtick) != t { + pd.schedtick = uint32(t) + pd.schedwhen = now + } else if pd.schedwhen+forcePreemptNS <= now { + preemptone(_p_) + // In case of syscall, preemptone() doesn't + // work, because there is no M wired to P. + sysretake = true + } + } + if s == _Psyscall { + // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). + t := int64(_p_.syscalltick) + if !sysretake && int64(pd.syscalltick) != t { + pd.syscalltick = uint32(t) + pd.syscallwhen = now + continue + } + // On the one hand we don't want to retake Ps if there is no other work to do, + // but on the other hand we want to retake them eventually + // because they can prevent the sysmon thread from deep sleep. + if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now { + continue + } + // Drop allpLock so we can take sched.lock. + unlock(&allpLock) + // Need to decrement number of idle locked M's + // (pretending that one more is running) before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + incidlelocked(-1) + if atomic.Cas(&_p_.status, s, _Pidle) { + if trace.enabled { + traceGoSysBlock(_p_) + traceProcStop(_p_) + } + n++ + _p_.syscalltick++ + handoffp(_p_) + } + incidlelocked(1) + lock(&allpLock) + } + } + unlock(&allpLock) + return uint32(n) +} + +// Tell all goroutines that they have been preempted and they should stop. +// This function is purely best-effort. It can fail to inform a goroutine if a +// processor just started running it. +// No locks need to be held. +// Returns true if preemption request was issued to at least one goroutine. +func preemptall() bool { + res := false + for _, _p_ := range allp { + if _p_.status != _Prunning { + continue + } + if preemptone(_p_) { + res = true + } + } + return res +} + +// Tell the goroutine running on processor P to stop. +// This function is purely best-effort. It can incorrectly fail to inform the +// goroutine. 
It can inform the wrong goroutine. Even if it informs the +// correct goroutine, that goroutine might ignore the request if it is +// simultaneously executing newstack. +// No lock needs to be held. +// Returns true if preemption request was issued. +// The actual preemption will happen at some point in the future +// and will be indicated by the gp->status no longer being +// Grunning +func preemptone(_p_ *p) bool { + mp := _p_.m.ptr() + if mp == nil || mp == getg().m { + return false + } + gp := mp.curg + if gp == nil || gp == mp.g0 { + return false + } + + gp.preempt = true + + // Every call in a goroutine checks for stack overflow by + // comparing the current stack pointer to gp->stackguard0. + // Setting gp->stackguard0 to StackPreempt folds + // preemption into the normal stack overflow check. + gp.stackguard0 = stackPreempt + + // Request an async preemption of this P. + if preemptMSupported && debug.asyncpreemptoff == 0 { + _p_.preempt = true + preemptM(mp) + } + + return true +} + +var starttime int64 + +func schedtrace(detailed bool) { + now := nanotime() + if starttime == 0 { + starttime = now + } + + lock(&sched.lock) + print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) + if detailed { + print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n") + } + // We must be careful while reading data from P's, M's and G's. + // Even if we hold schedlock, most data can be changed concurrently. + // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. + for i, _p_ := range allp { + mp := _p_.m.ptr() + h := atomic.Load(&_p_.runqhead) + t := atomic.Load(&_p_.runqtail) + if detailed { + id := int64(-1) + if mp != nil { + id = mp.id + } + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, " timerslen=", len(_p_.timers), "\n") + } else { + // In non-detailed mode format lengths of per-P run queues as: + // [len1 len2 len3 len4] + print(" ") + if i == 0 { + print("[") + } + print(t - h) + if i == len(allp)-1 { + print("]\n") + } + } + } + + if !detailed { + unlock(&sched.lock) + return + } + + for mp := allm; mp != nil; mp = mp.alllink { + _p_ := mp.p.ptr() + gp := mp.curg + lockedg := mp.lockedg.ptr() + id1 := int32(-1) + if _p_ != nil { + id1 = _p_.id + } + id2 := int64(-1) + if gp != nil { + id2 = gp.goid + } + id3 := int64(-1) + if lockedg != nil { + id3 = lockedg.goid + } + print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n") + } + + forEachG(func(gp *g) { + mp := gp.m + lockedm := gp.lockedm.ptr() + id1 := int64(-1) + if mp != nil { + id1 = mp.id + } + id2 := int64(-1) + if lockedm != nil { + id2 = lockedm.id + } + print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=", id1, " lockedm=", id2, "\n") + }) + unlock(&sched.lock) +} + +// schedEnableUser enables or disables the scheduling of user +// goroutines. +// +// This does not stop already running user goroutines, so the caller +// should first stop the world when disabling user goroutines. 
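Editorial aside, not part of the vendored source: schedtrace above produces the scheduler trace enabled by GODEBUG. No code changes are needed to see it; any program can be run with the environment variable set, for example:

package main

import "time"

// Build this (or any) program and run it as, for instance:
//
//	GODEBUG=schedtrace=1000 ./yourbinary               // one SCHED line per second
//	GODEBUG=schedtrace=1000,scheddetail=1 ./yourbinary // per-P/M/G detail
//
// The lines are printed by schedtrace above, called from sysmon.
func main() {
	time.Sleep(3 * time.Second)
}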
+func schedEnableUser(enable bool) { + lock(&sched.lock) + if sched.disable.user == !enable { + unlock(&sched.lock) + return + } + sched.disable.user = !enable + if enable { + n := sched.disable.n + sched.disable.n = 0 + globrunqputbatch(&sched.disable.runnable, n) + unlock(&sched.lock) + for ; n != 0 && sched.npidle != 0; n-- { + startm(nil, false) + } + } else { + unlock(&sched.lock) + } +} + +// schedEnabled reports whether gp should be scheduled. It returns +// false is scheduling of gp is disabled. +// +// sched.lock must be held. +func schedEnabled(gp *g) bool { + assertLockHeld(&sched.lock) + + if sched.disable.user { + return isSystemGoroutine(gp, true) + } + return true +} + +// Put mp on midle list. +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func mput(mp *m) { + assertLockHeld(&sched.lock) + + mp.schedlink = sched.midle + sched.midle.set(mp) + sched.nmidle++ + checkdead() +} + +// Try to get an m from midle list. +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func mget() *m { + assertLockHeld(&sched.lock) + + mp := sched.midle.ptr() + if mp != nil { + sched.midle = mp.schedlink + sched.nmidle-- + } + return mp +} + +// Put gp on the global runnable queue. +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func globrunqput(gp *g) { + assertLockHeld(&sched.lock) + + sched.runq.pushBack(gp) + sched.runqsize++ +} + +// Put gp at the head of the global runnable queue. +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func globrunqputhead(gp *g) { + assertLockHeld(&sched.lock) + + sched.runq.push(gp) + sched.runqsize++ +} + +// Put a batch of runnable goroutines on the global runnable queue. +// This clears *batch. +// sched.lock must be held. +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func globrunqputbatch(batch *gQueue, n int32) { + assertLockHeld(&sched.lock) + + sched.runq.pushBackAll(*batch) + sched.runqsize += n + *batch = gQueue{} +} + +// Try get a batch of G's from the global runnable queue. +// sched.lock must be held. +func globrunqget(_p_ *p, max int32) *g { + assertLockHeld(&sched.lock) + + if sched.runqsize == 0 { + return nil + } + + n := sched.runqsize/gomaxprocs + 1 + if n > sched.runqsize { + n = sched.runqsize + } + if max > 0 && n > max { + n = max + } + if n > int32(len(_p_.runq))/2 { + n = int32(len(_p_.runq)) / 2 + } + + sched.runqsize -= n + + gp := sched.runq.pop() + n-- + for ; n > 0; n-- { + gp1 := sched.runq.pop() + runqput(_p_, gp1, false) + } + return gp +} + +// pMask is an atomic bitstring with one bit per P. +type pMask []uint32 + +// read returns true if P id's bit is set. +func (p pMask) read(id uint32) bool { + word := id / 32 + mask := uint32(1) << (id % 32) + return (atomic.Load(&p[word]) & mask) != 0 +} + +// set sets P id's bit. +func (p pMask) set(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.Or(&p[word], mask) +} + +// clear clears P id's bit. +func (p pMask) clear(id int32) { + word := id / 32 + mask := uint32(1) << (id % 32) + atomic.And(&p[word], ^mask) +} + +// updateTimerPMask clears pp's timer mask if it has no timers on its heap. +// +// Ideally, the timer mask would be kept immediately consistent on any timer +// operations. 
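Editorial aside, not part of the vendored source: pMask above packs one bit per P into a []uint32 and updates it with atomic Or/And. The same shape can be sketched with the standard sync/atomic package, using CAS loops in place of the runtime-internal Or/And helpers:

package main

import (
	"fmt"
	"sync/atomic"
)

// bitmask is a pMask-like atomic bitstring with one bit per ID.
type bitmask []uint32

func (m bitmask) set(id uint32) {
	word, bit := id/32, uint32(1)<<(id%32)
	for {
		old := atomic.LoadUint32(&m[word])
		if atomic.CompareAndSwapUint32(&m[word], old, old|bit) {
			return
		}
	}
}

func (m bitmask) clear(id uint32) {
	word, bit := id/32, uint32(1)<<(id%32)
	for {
		old := atomic.LoadUint32(&m[word])
		if atomic.CompareAndSwapUint32(&m[word], old, old&^bit) {
			return
		}
	}
}

func (m bitmask) read(id uint32) bool {
	word, bit := id/32, uint32(1)<<(id%32)
	return atomic.LoadUint32(&m[word])&bit != 0
}

func main() {
	m := make(bitmask, (64+31)/32) // room for 64 IDs
	m.set(5)
	fmt.Println(m.read(5), m.read(6)) // true false
	m.clear(5)
	fmt.Println(m.read(5)) // false
}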
Unfortunately, updating a shared global data structure in the +// timer hot path adds too much overhead in applications frequently switching +// between no timers and some timers. +// +// As a compromise, the timer mask is updated only on pidleget / pidleput. A +// running P (returned by pidleget) may add a timer at any time, so its mask +// must be set. An idle P (passed to pidleput) cannot add new timers while +// idle, so if it has no timers at that time, its mask may be cleared. +// +// Thus, we get the following effects on timer-stealing in findrunnable: +// +// * Idle Ps with no timers when they go idle are never checked in findrunnable +// (for work- or timer-stealing; this is the ideal case). +// * Running Ps must always be checked. +// * Idle Ps whose timers are stolen must continue to be checked until they run +// again, even after timer expiration. +// +// When the P starts running again, the mask should be set, as a timer may be +// added at any time. +// +// TODO(prattmic): Additional targeted updates may improve the above cases. +// e.g., updating the mask when stealing a timer. +func updateTimerPMask(pp *p) { + if atomic.Load(&pp.numTimers) > 0 { + return + } + + // Looks like there are no timers, however another P may transiently + // decrement numTimers when handling a timerModified timer in + // checkTimers. We must take timersLock to serialize with these changes. + lock(&pp.timersLock) + if atomic.Load(&pp.numTimers) == 0 { + timerpMask.clear(pp.id) + } + unlock(&pp.timersLock) +} + +// pidleput puts p to on the _Pidle list. +// +// This releases ownership of p. Once sched.lock is released it is no longer +// safe to use p. +// +// sched.lock must be held. +// +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func pidleput(_p_ *p) { + assertLockHeld(&sched.lock) + + if !runqempty(_p_) { + throw("pidleput: P has non-empty run queue") + } + updateTimerPMask(_p_) // clear if there are no timers. + idlepMask.set(_p_.id) + _p_.link = sched.pidle + sched.pidle.set(_p_) + atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic +} + +// pidleget tries to get a p from the _Pidle list, acquiring ownership. +// +// sched.lock must be held. +// +// May run during STW, so write barriers are not allowed. +//go:nowritebarrierrec +func pidleget() *p { + assertLockHeld(&sched.lock) + + _p_ := sched.pidle.ptr() + if _p_ != nil { + // Timer may get added at any time now. + timerpMask.set(_p_.id) + idlepMask.clear(_p_.id) + sched.pidle = _p_.link + atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic + } + return _p_ +} + +// runqempty reports whether _p_ has no Gs on its local run queue. +// It never returns true spuriously. +func runqempty(_p_ *p) bool { + // Defend against a race where 1) _p_ has G1 in runqnext but runqhead == runqtail, + // 2) runqput on _p_ kicks G1 to the runq, 3) runqget on _p_ empties runqnext. + // Simply observing that runqhead == runqtail and then observing that runqnext == nil + // does not mean the queue is empty. + for { + head := atomic.Load(&_p_.runqhead) + tail := atomic.Load(&_p_.runqtail) + runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&_p_.runnext))) + if tail == atomic.Load(&_p_.runqtail) { + return head == tail && runnext == 0 + } + } +} + +// To shake out latent assumptions about scheduling order, +// we introduce some randomness into scheduling decisions +// when running with the race detector. 
+// The need for this was made obvious by changing the +// (deterministic) scheduling order in Go 1.5 and breaking +// many poorly-written tests. +// With the randomness here, as long as the tests pass +// consistently with -race, they shouldn't have latent scheduling +// assumptions. +const randomizeScheduler = raceenabled + +// runqput tries to put g on the local runnable queue. +// If next is false, runqput adds g to the tail of the runnable queue. +// If next is true, runqput puts g in the _p_.runnext slot. +// If the run queue is full, runnext puts g on the global queue. +// Executed only by the owner P. +func runqput(_p_ *p, gp *g, next bool) { + if randomizeScheduler && next && fastrandn(2) == 0 { + next = false + } + + if next { + retryNext: + oldnext := _p_.runnext + if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) { + goto retryNext + } + if oldnext == 0 { + return + } + // Kick the old runnext out to the regular run queue. + gp = oldnext.ptr() + } + +retry: + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers + t := _p_.runqtail + if t-h < uint32(len(_p_.runq)) { + _p_.runq[t%uint32(len(_p_.runq))].set(gp) + atomic.StoreRel(&_p_.runqtail, t+1) // store-release, makes the item available for consumption + return + } + if runqputslow(_p_, gp, h, t) { + return + } + // the queue is not full, now the put above must succeed + goto retry +} + +// Put g and a batch of work from local runnable queue on global queue. +// Executed only by the owner P. +func runqputslow(_p_ *p, gp *g, h, t uint32) bool { + var batch [len(_p_.runq)/2 + 1]*g + + // First, grab a batch from local queue. + n := t - h + n = n / 2 + if n != uint32(len(_p_.runq)/2) { + throw("runqputslow: queue is not full") + } + for i := uint32(0); i < n; i++ { + batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr() + } + if !atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume + return false + } + batch[n] = gp + + if randomizeScheduler { + for i := uint32(1); i <= n; i++ { + j := fastrandn(i + 1) + batch[i], batch[j] = batch[j], batch[i] + } + } + + // Link the goroutines. + for i := uint32(0); i < n; i++ { + batch[i].schedlink.set(batch[i+1]) + } + var q gQueue + q.head.set(batch[0]) + q.tail.set(batch[n]) + + // Now put the batch on global queue. + lock(&sched.lock) + globrunqputbatch(&q, int32(n+1)) + unlock(&sched.lock) + return true +} + +// runqputbatch tries to put all the G's on q on the local runnable queue. +// If the queue is full, they are put on the global queue; in that case +// this will temporarily acquire the scheduler lock. +// Executed only by the owner P. +func runqputbatch(pp *p, q *gQueue, qsize int) { + h := atomic.LoadAcq(&pp.runqhead) + t := pp.runqtail + n := uint32(0) + for !q.empty() && t-h < uint32(len(pp.runq)) { + gp := q.pop() + pp.runq[t%uint32(len(pp.runq))].set(gp) + t++ + n++ + } + qsize -= int(n) + + if randomizeScheduler { + off := func(o uint32) uint32 { + return (pp.runqtail + o) % uint32(len(pp.runq)) + } + for i := uint32(1); i < n; i++ { + j := fastrandn(i + 1) + pp.runq[off(i)], pp.runq[off(j)] = pp.runq[off(j)], pp.runq[off(i)] + } + } + + atomic.StoreRel(&pp.runqtail, t) + if !q.empty() { + lock(&sched.lock) + globrunqputbatch(q, int32(qsize)) + unlock(&sched.lock) + } +} + +// Get g from local runnable queue. +// If inheritTime is true, gp should inherit the remaining time in the +// current time slice. Otherwise, it should start a new time slice. +// Executed only by the owner P. 
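Editorial aside, not part of the vendored source: runqput and runqget above index a fixed-size ring with head/tail counters that only ever grow (slot = counter % len(ring)) and move half of the queue to the global run queue when the ring fills. Ignoring the atomics that make this safe against stealing Ps, the core indexing and spill-half idea looks roughly like this (localQueue is a hypothetical type; the runtime's ring holds 256 entries):

package main

import "fmt"

const ringSize = 8 // the runtime uses 256; kept small for illustration

// localQueue is a single-owner ring buffer with monotonically
// increasing head/tail counters, as in p.runq (atomics omitted).
type localQueue struct {
	ring       [ringSize]int
	head, tail uint32
	global     []int // stand-in for the global run queue
}

// put appends to the ring; when the ring is full it moves half of the
// queued items plus the new one to the global queue, as runqputslow does.
func (q *localQueue) put(x int) {
	if q.tail-q.head < ringSize {
		q.ring[q.tail%ringSize] = x
		q.tail++
		return
	}
	n := (q.tail - q.head) / 2
	for i := uint32(0); i < n; i++ {
		q.global = append(q.global, q.ring[q.head%ringSize])
		q.head++
	}
	q.global = append(q.global, x)
}

// get pops from the front of the ring.
func (q *localQueue) get() (int, bool) {
	if q.tail == q.head {
		return 0, false
	}
	x := q.ring[q.head%ringSize]
	q.head++
	return x, true
}

func main() {
	var q localQueue
	for i := 0; i < 12; i++ {
		q.put(i)
	}
	x, _ := q.get()
	fmt.Println(x, len(q.global)) // front of ring, number of items spilled
}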
+func runqget(_p_ *p) (gp *g, inheritTime bool) { + // If there's a runnext, it's the next G to run. + next := _p_.runnext + // If the runnext is non-0 and the CAS fails, it could only have been stolen by another P, + // because other Ps can race to set runnext to 0, but only the current P can set it to non-0. + // Hence, there's no need to retry this CAS if it falls. + if next != 0 && _p_.runnext.cas(next, 0) { + return next.ptr(), true + } + + for { + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := _p_.runqtail + if t == h { + return nil, false + } + gp := _p_.runq[h%uint32(len(_p_.runq))].ptr() + if atomic.CasRel(&_p_.runqhead, h, h+1) { // cas-release, commits consume + return gp, false + } + } +} + +// runqdrain drains the local runnable queue of _p_ and returns all goroutines in it. +// Executed only by the owner P. +func runqdrain(_p_ *p) (drainQ gQueue, n uint32) { + oldNext := _p_.runnext + if oldNext != 0 && _p_.runnext.cas(oldNext, 0) { + drainQ.pushBack(oldNext.ptr()) + n++ + } + +retry: + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := _p_.runqtail + qn := t - h + if qn == 0 { + return + } + if qn > uint32(len(_p_.runq)) { // read inconsistent h and t + goto retry + } + + if !atomic.CasRel(&_p_.runqhead, h, h+qn) { // cas-release, commits consume + goto retry + } + + // We've inverted the order in which it gets G's from the local P's runnable queue + // and then advances the head pointer because we don't want to mess up the statuses of G's + // while runqdrain() and runqsteal() are running in parallel. + // Thus we should advance the head pointer before draining the local P into a gQueue, + // so that we can update any gp.schedlink only after we take the full ownership of G, + // meanwhile, other P's can't access to all G's in local P's runnable queue and steal them. + // See https://groups.google.com/g/golang-dev/c/0pTKxEKhHSc/m/6Q85QjdVBQAJ for more details. + for i := uint32(0); i < qn; i++ { + gp := _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr() + drainQ.pushBack(gp) + n++ + } + return +} + +// Grabs a batch of goroutines from _p_'s runnable queue into batch. +// Batch is a ring buffer starting at batchHead. +// Returns number of grabbed goroutines. +// Can be executed by any P. +func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 { + for { + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := atomic.LoadAcq(&_p_.runqtail) // load-acquire, synchronize with the producer + n := t - h + n = n - n/2 + if n == 0 { + if stealRunNextG { + // Try to steal from _p_.runnext. + if next := _p_.runnext; next != 0 { + if _p_.status == _Prunning { + // Sleep to ensure that _p_ isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on _p_ ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give _p_ a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + if GOOS != "windows" { + usleep(3) + } else { + // On windows system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. 
+ osyield() + } + } + if !_p_.runnext.cas(next, 0) { + continue + } + batch[batchHead%uint32(len(batch))] = next + return 1 + } + } + return 0 + } + if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t + continue + } + for i := uint32(0); i < n; i++ { + g := _p_.runq[(h+i)%uint32(len(_p_.runq))] + batch[(batchHead+i)%uint32(len(batch))] = g + } + if atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume + return n + } + } +} + +// Steal half of elements from local runnable queue of p2 +// and put onto local runnable queue of p. +// Returns one of the stolen elements (or nil if failed). +func runqsteal(_p_, p2 *p, stealRunNextG bool) *g { + t := _p_.runqtail + n := runqgrab(p2, &_p_.runq, t, stealRunNextG) + if n == 0 { + return nil + } + n-- + gp := _p_.runq[(t+n)%uint32(len(_p_.runq))].ptr() + if n == 0 { + return gp + } + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers + if t-h+n >= uint32(len(_p_.runq)) { + throw("runqsteal: runq overflow") + } + atomic.StoreRel(&_p_.runqtail, t+n) // store-release, makes the item available for consumption + return gp +} + +// A gQueue is a dequeue of Gs linked through g.schedlink. A G can only +// be on one gQueue or gList at a time. +type gQueue struct { + head guintptr + tail guintptr +} + +// empty reports whether q is empty. +func (q *gQueue) empty() bool { + return q.head == 0 +} + +// push adds gp to the head of q. +func (q *gQueue) push(gp *g) { + gp.schedlink = q.head + q.head.set(gp) + if q.tail == 0 { + q.tail.set(gp) + } +} + +// pushBack adds gp to the tail of q. +func (q *gQueue) pushBack(gp *g) { + gp.schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink.set(gp) + } else { + q.head.set(gp) + } + q.tail.set(gp) +} + +// pushBackAll adds all Gs in q2 to the tail of q. After this q2 must +// not be used. +func (q *gQueue) pushBackAll(q2 gQueue) { + if q2.tail == 0 { + return + } + q2.tail.ptr().schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink = q2.head + } else { + q.head = q2.head + } + q.tail = q2.tail +} + +// pop removes and returns the head of queue q. It returns nil if +// q is empty. +func (q *gQueue) pop() *g { + gp := q.head.ptr() + if gp != nil { + q.head = gp.schedlink + if q.head == 0 { + q.tail = 0 + } + } + return gp +} + +// popList takes all Gs in q and returns them as a gList. +func (q *gQueue) popList() gList { + stack := gList{q.head} + *q = gQueue{} + return stack +} + +// A gList is a list of Gs linked through g.schedlink. A G can only be +// on one gQueue or gList at a time. +type gList struct { + head guintptr +} + +// empty reports whether l is empty. +func (l *gList) empty() bool { + return l.head == 0 +} + +// push adds gp to the head of l. +func (l *gList) push(gp *g) { + gp.schedlink = l.head + l.head.set(gp) +} + +// pushAll prepends all Gs in q to l. +func (l *gList) pushAll(q gQueue) { + if !q.empty() { + q.tail.ptr().schedlink = l.head + l.head = q.head + } +} + +// pop removes and returns the head of l. If l is empty, it returns nil. 
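Editorial aside, not part of the vendored source: gQueue and gList above are intrusive lists: the link pointer lives in the G itself (g.schedlink), so queue operations never allocate. A generic sketch of the same idea with a hypothetical node type:

package main

import "fmt"

// node carries its own link, like g.schedlink, so pushing and popping
// never allocate.
type node struct {
	next  *node
	value int
}

// queue is a FIFO of intrusively linked nodes, like gQueue.
type queue struct {
	head, tail *node
}

func (q *queue) pushBack(n *node) {
	n.next = nil
	if q.tail != nil {
		q.tail.next = n
	} else {
		q.head = n
	}
	q.tail = n
}

func (q *queue) pop() *node {
	n := q.head
	if n != nil {
		q.head = n.next
		if q.head == nil {
			q.tail = nil
		}
	}
	return n
}

func main() {
	var q queue
	for i := 1; i <= 3; i++ {
		q.pushBack(&node{value: i})
	}
	for n := q.pop(); n != nil; n = q.pop() {
		fmt.Println(n.value) // 1 2 3
	}
}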
+func (l *gList) pop() *g { + gp := l.head.ptr() + if gp != nil { + l.head = gp.schedlink + } + return gp +} + +//go:linkname setMaxThreads runtime/debug.setMaxThreads +func setMaxThreads(in int) (out int) { + lock(&sched.lock) + out = int(sched.maxmcount) + if in > 0x7fffffff { // MaxInt32 + sched.maxmcount = 0x7fffffff + } else { + sched.maxmcount = int32(in) + } + checkmcount() + unlock(&sched.lock) + return +} + +//go:nosplit +func procPin() int { + _g_ := getg() + mp := _g_.m + + mp.locks++ + return int(mp.p.ptr().id) +} + +//go:nosplit +func procUnpin() { + _g_ := getg() + _g_.m.locks-- +} + +//go:linkname sync_runtime_procPin sync.runtime_procPin +//go:nosplit +func sync_runtime_procPin() int { + return procPin() +} + +//go:linkname sync_runtime_procUnpin sync.runtime_procUnpin +//go:nosplit +func sync_runtime_procUnpin() { + procUnpin() +} + +//go:linkname sync_atomic_runtime_procPin sync/atomic.runtime_procPin +//go:nosplit +func sync_atomic_runtime_procPin() int { + return procPin() +} + +//go:linkname sync_atomic_runtime_procUnpin sync/atomic.runtime_procUnpin +//go:nosplit +func sync_atomic_runtime_procUnpin() { + procUnpin() +} + +// Active spinning for sync.Mutex. +//go:linkname sync_runtime_canSpin sync.runtime_canSpin +//go:nosplit +func sync_runtime_canSpin(i int) bool { + // sync.Mutex is cooperative, so we are conservative with spinning. + // Spin only few times and only if running on a multicore machine and + // GOMAXPROCS>1 and there is at least one other running P and local runq is empty. + // As opposed to runtime mutex we don't do passive spinning here, + // because there can be work on global runq or on other Ps. + if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 { + return false + } + if p := getg().m.p.ptr(); !runqempty(p) { + return false + } + return true +} + +//go:linkname sync_runtime_doSpin sync.runtime_doSpin +//go:nosplit +func sync_runtime_doSpin() { + procyield(active_spin_cnt) +} + +var stealOrder randomOrder + +// randomOrder/randomEnum are helper types for randomized work stealing. +// They allow to enumerate all Ps in different pseudo-random orders without repetitions. +// The algorithm is based on the fact that if we have X such that X and GOMAXPROCS +// are coprime, then a sequences of (i + X) % GOMAXPROCS gives the required enumeration. +type randomOrder struct { + count uint32 + coprimes []uint32 +} + +type randomEnum struct { + i uint32 + count uint32 + pos uint32 + inc uint32 +} + +func (ord *randomOrder) reset(count uint32) { + ord.count = count + ord.coprimes = ord.coprimes[:0] + for i := uint32(1); i <= count; i++ { + if gcd(i, count) == 1 { + ord.coprimes = append(ord.coprimes, i) + } + } +} + +func (ord *randomOrder) start(i uint32) randomEnum { + return randomEnum{ + count: ord.count, + pos: i % ord.count, + inc: ord.coprimes[i%uint32(len(ord.coprimes))], + } +} + +func (enum *randomEnum) done() bool { + return enum.i == enum.count +} + +func (enum *randomEnum) next() { + enum.i++ + enum.pos = (enum.pos + enum.inc) % enum.count +} + +func (enum *randomEnum) position() uint32 { + return enum.pos +} + +func gcd(a, b uint32) uint32 { + for b != 0 { + a, b = b, a%b + } + return a +} + +// An initTask represents the set of initializations that need to be done for a package. +// Keep in sync with ../../test/initempty.go:initTask +type initTask struct { + // TODO: pack the first 3 fields more tightly? 
+ state uintptr // 0 = uninitialized, 1 = in progress, 2 = done + ndeps uintptr + nfns uintptr + // followed by ndeps instances of an *initTask, one per package depended on + // followed by nfns pcs, one per init function to run +} + +// inittrace stores statistics for init functions which are +// updated by malloc and newproc when active is true. +var inittrace tracestat + +type tracestat struct { + active bool // init tracing activation status + id int64 // init goroutine id + allocs uint64 // heap allocations + bytes uint64 // heap allocated bytes +} + +func doInit(t *initTask) { + switch t.state { + case 2: // fully initialized + return + case 1: // initialization in progress + throw("recursive call during initialization - linker skew") + default: // not initialized yet + t.state = 1 // initialization in progress + + for i := uintptr(0); i < t.ndeps; i++ { + p := add(unsafe.Pointer(t), (3+i)*goarch.PtrSize) + t2 := *(**initTask)(p) + doInit(t2) + } + + if t.nfns == 0 { + t.state = 2 // initialization done + return + } + + var ( + start int64 + before tracestat + ) + + if inittrace.active { + start = nanotime() + // Load stats non-atomically since tracinit is updated only by this init goroutine. + before = inittrace + } + + firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*goarch.PtrSize) + for i := uintptr(0); i < t.nfns; i++ { + p := add(firstFunc, i*goarch.PtrSize) + f := *(*func())(unsafe.Pointer(&p)) + f() + } + + if inittrace.active { + end := nanotime() + // Load stats non-atomically since tracinit is updated only by this init goroutine. + after := inittrace + + f := *(*func())(unsafe.Pointer(&firstFunc)) + pkg := funcpkgpath(findfunc(abi.FuncPCABIInternal(f))) + + var sbuf [24]byte + print("init ", pkg, " @") + print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ") + print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ") + print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ") + print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs") + print("\n") + } + + t.state = 2 // initialization done + } +} diff --git a/contrib/go/_std_1.18/src/runtime/profbuf.go b/contrib/go/_std_1.18/src/runtime/profbuf.go new file mode 100644 index 0000000000..f40881aed5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/profbuf.go @@ -0,0 +1,561 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +// A profBuf is a lock-free buffer for profiling events, +// safe for concurrent use by one reader and one writer. +// The writer may be a signal handler running without a user g. +// The reader is assumed to be a user g. +// +// Each logged event corresponds to a fixed size header, a list of +// uintptrs (typically a stack), and exactly one unsafe.Pointer tag. +// The header and uintptrs are stored in the circular buffer data and the +// tag is stored in a circular buffer tags, running in parallel. +// In the circular buffer data, each event takes 2+hdrsize+len(stk) +// words: the value 2+hdrsize+len(stk), then the time of the event, then +// hdrsize words giving the fixed-size header, and then len(stk) words +// for the stack. +// +// The current effective offsets into the tags and data circular buffers +// for reading and writing are stored in the high 30 and low 32 bits of r and w. +// The bottom bits of the high 32 are additional flag bits in w, unused in r. 
+// "Effective" offsets means the total number of reads or writes, mod 2^length. +// The offset in the buffer is the effective offset mod the length of the buffer. +// To make wraparound mod 2^length match wraparound mod length of the buffer, +// the length of the buffer must be a power of two. +// +// If the reader catches up to the writer, a flag passed to read controls +// whether the read blocks until more data is available. A read returns a +// pointer to the buffer data itself; the caller is assumed to be done with +// that data at the next read. The read offset rNext tracks the next offset to +// be returned by read. By definition, r ≤ rNext ≤ w (before wraparound), +// and rNext is only used by the reader, so it can be accessed without atomics. +// +// If the writer gets ahead of the reader, so that the buffer fills, +// future writes are discarded and replaced in the output stream by an +// overflow entry, which has size 2+hdrsize+1, time set to the time of +// the first discarded write, a header of all zeroed words, and a "stack" +// containing one word, the number of discarded writes. +// +// Between the time the buffer fills and the buffer becomes empty enough +// to hold more data, the overflow entry is stored as a pending overflow +// entry in the fields overflow and overflowTime. The pending overflow +// entry can be turned into a real record by either the writer or the +// reader. If the writer is called to write a new record and finds that +// the output buffer has room for both the pending overflow entry and the +// new record, the writer emits the pending overflow entry and the new +// record into the buffer. If the reader is called to read data and finds +// that the output buffer is empty but that there is a pending overflow +// entry, the reader will return a synthesized record for the pending +// overflow entry. +// +// Only the writer can create or add to a pending overflow entry, but +// either the reader or the writer can clear the pending overflow entry. +// A pending overflow entry is indicated by the low 32 bits of 'overflow' +// holding the number of discarded writes, and overflowTime holding the +// time of the first discarded write. The high 32 bits of 'overflow' +// increment each time the low 32 bits transition from zero to non-zero +// or vice versa. This sequence number avoids ABA problems in the use of +// compare-and-swap to coordinate between reader and writer. +// The overflowTime is only written when the low 32 bits of overflow are +// zero, that is, only when there is no pending overflow entry, in +// preparation for creating a new one. The reader can therefore fetch and +// clear the entry atomically using +// +// for { +// overflow = load(&b.overflow) +// if uint32(overflow) == 0 { +// // no pending entry +// break +// } +// time = load(&b.overflowTime) +// if cas(&b.overflow, overflow, ((overflow>>32)+1)<<32) { +// // pending entry cleared +// break +// } +// } +// if uint32(overflow) > 0 { +// emit entry for uint32(overflow), time +// } +// +type profBuf struct { + // accessed atomically + r, w profAtomic + overflow uint64 + overflowTime uint64 + eof uint32 + + // immutable (excluding slice content) + hdrsize uintptr + data []uint64 + tags []unsafe.Pointer + + // owned by reader + rNext profIndex + overflowBuf []uint64 // for use by reader to return overflow record + wait note +} + +// A profAtomic is the atomically-accessed word holding a profIndex. 
+type profAtomic uint64 + +// A profIndex is the packet tag and data counts and flags bits, described above. +type profIndex uint64 + +const ( + profReaderSleeping profIndex = 1 << 32 // reader is sleeping and must be woken up + profWriteExtra profIndex = 1 << 33 // overflow or eof waiting +) + +func (x *profAtomic) load() profIndex { + return profIndex(atomic.Load64((*uint64)(x))) +} + +func (x *profAtomic) store(new profIndex) { + atomic.Store64((*uint64)(x), uint64(new)) +} + +func (x *profAtomic) cas(old, new profIndex) bool { + return atomic.Cas64((*uint64)(x), uint64(old), uint64(new)) +} + +func (x profIndex) dataCount() uint32 { + return uint32(x) +} + +func (x profIndex) tagCount() uint32 { + return uint32(x >> 34) +} + +// countSub subtracts two counts obtained from profIndex.dataCount or profIndex.tagCount, +// assuming that they are no more than 2^29 apart (guaranteed since they are never more than +// len(data) or len(tags) apart, respectively). +// tagCount wraps at 2^30, while dataCount wraps at 2^32. +// This function works for both. +func countSub(x, y uint32) int { + // x-y is 32-bit signed or 30-bit signed; sign-extend to 32 bits and convert to int. + return int(int32(x-y) << 2 >> 2) +} + +// addCountsAndClearFlags returns the packed form of "x + (data, tag) - all flags". +func (x profIndex) addCountsAndClearFlags(data, tag int) profIndex { + return profIndex((uint64(x)>>34+uint64(uint32(tag)<<2>>2))<<34 | uint64(uint32(x)+uint32(data))) +} + +// hasOverflow reports whether b has any overflow records pending. +func (b *profBuf) hasOverflow() bool { + return uint32(atomic.Load64(&b.overflow)) > 0 +} + +// takeOverflow consumes the pending overflow records, returning the overflow count +// and the time of the first overflow. +// When called by the reader, it is racing against incrementOverflow. +func (b *profBuf) takeOverflow() (count uint32, time uint64) { + overflow := atomic.Load64(&b.overflow) + time = atomic.Load64(&b.overflowTime) + for { + count = uint32(overflow) + if count == 0 { + time = 0 + break + } + // Increment generation, clear overflow count in low bits. + if atomic.Cas64(&b.overflow, overflow, ((overflow>>32)+1)<<32) { + break + } + overflow = atomic.Load64(&b.overflow) + time = atomic.Load64(&b.overflowTime) + } + return uint32(overflow), time +} + +// incrementOverflow records a single overflow at time now. +// It is racing against a possible takeOverflow in the reader. +func (b *profBuf) incrementOverflow(now int64) { + for { + overflow := atomic.Load64(&b.overflow) + + // Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot. + // We need to set overflowTime if we're incrementing b.overflow from 0. + if uint32(overflow) == 0 { + // Store overflowTime first so it's always available when overflow != 0. + atomic.Store64(&b.overflowTime, uint64(now)) + atomic.Store64(&b.overflow, (((overflow>>32)+1)<<32)+1) + break + } + // Otherwise we're racing to increment against reader + // who wants to set b.overflow to 0. + // Out of paranoia, leave 2³²-1 a sticky overflow value, + // to avoid wrapping around. Extremely unlikely. + if int32(overflow) == -1 { + break + } + if atomic.Cas64(&b.overflow, overflow, overflow+1) { + break + } + } +} + +// newProfBuf returns a new profiling buffer with room for +// a header of hdrsize words and a buffer of at least bufwords words. 
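+// Both rings are rounded up to powers of two so that wraparound of the 32-bit
+// counts matches wraparound of the ring offsets. For example (hypothetical
+// sizes only):
+//
+//	b := newProfBuf(2, 100, 10) // data ring rounded up to 128 words, tag ring to 16 entries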
+func newProfBuf(hdrsize, bufwords, tags int) *profBuf { + if min := 2 + hdrsize + 1; bufwords < min { + bufwords = min + } + + // Buffer sizes must be power of two, so that we don't have to + // worry about uint32 wraparound changing the effective position + // within the buffers. We store 30 bits of count; limiting to 28 + // gives us some room for intermediate calculations. + if bufwords >= 1<<28 || tags >= 1<<28 { + throw("newProfBuf: buffer too large") + } + var i int + for i = 1; i < bufwords; i <<= 1 { + } + bufwords = i + for i = 1; i < tags; i <<= 1 { + } + tags = i + + b := new(profBuf) + b.hdrsize = uintptr(hdrsize) + b.data = make([]uint64, bufwords) + b.tags = make([]unsafe.Pointer, tags) + b.overflowBuf = make([]uint64, 2+b.hdrsize+1) + return b +} + +// canWriteRecord reports whether the buffer has room +// for a single contiguous record with a stack of length nstk. +func (b *profBuf) canWriteRecord(nstk int) bool { + br := b.r.load() + bw := b.w.load() + + // room for tag? + if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 1 { + return false + } + + // room for data? + nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data) + want := 2 + int(b.hdrsize) + nstk + i := int(bw.dataCount() % uint32(len(b.data))) + if i+want > len(b.data) { + // Can't fit in trailing fragment of slice. + // Skip over that and start over at beginning of slice. + nd -= len(b.data) - i + } + return nd >= want +} + +// canWriteTwoRecords reports whether the buffer has room +// for two records with stack lengths nstk1, nstk2, in that order. +// Each record must be contiguous on its own, but the two +// records need not be contiguous (one can be at the end of the buffer +// and the other can wrap around and start at the beginning of the buffer). +func (b *profBuf) canWriteTwoRecords(nstk1, nstk2 int) bool { + br := b.r.load() + bw := b.w.load() + + // room for tag? + if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 2 { + return false + } + + // room for data? + nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data) + + // first record + want := 2 + int(b.hdrsize) + nstk1 + i := int(bw.dataCount() % uint32(len(b.data))) + if i+want > len(b.data) { + // Can't fit in trailing fragment of slice. + // Skip over that and start over at beginning of slice. + nd -= len(b.data) - i + i = 0 + } + i += want + nd -= want + + // second record + want = 2 + int(b.hdrsize) + nstk2 + if i+want > len(b.data) { + // Can't fit in trailing fragment of slice. + // Skip over that and start over at beginning of slice. + nd -= len(b.data) - i + i = 0 + } + return nd >= want +} + +// write writes an entry to the profiling buffer b. +// The entry begins with a fixed hdr, which must have +// length b.hdrsize, followed by a variable-sized stack +// and a single tag pointer *tagPtr (or nil if tagPtr is nil). +// No write barriers allowed because this might be called from a signal handler. +func (b *profBuf) write(tagPtr *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) { + if b == nil { + return + } + if len(hdr) > int(b.hdrsize) { + throw("misuse of profBuf.write") + } + + if hasOverflow := b.hasOverflow(); hasOverflow && b.canWriteTwoRecords(1, len(stk)) { + // Room for both an overflow record and the one being written. + // Write the overflow record if the reader hasn't gotten to it yet. + // Only racing against reader, not other writers. 
+ count, time := b.takeOverflow() + if count > 0 { + var stk [1]uintptr + stk[0] = uintptr(count) + b.write(nil, int64(time), nil, stk[:]) + } + } else if hasOverflow || !b.canWriteRecord(len(stk)) { + // Pending overflow without room to write overflow and new records + // or no overflow but also no room for new record. + b.incrementOverflow(now) + b.wakeupExtra() + return + } + + // There's room: write the record. + br := b.r.load() + bw := b.w.load() + + // Profiling tag + // + // The tag is a pointer, but we can't run a write barrier here. + // We have interrupted the OS-level execution of gp, but the + // runtime still sees gp as executing. In effect, we are running + // in place of the real gp. Since gp is the only goroutine that + // can overwrite gp.labels, the value of gp.labels is stable during + // this signal handler: it will still be reachable from gp when + // we finish executing. If a GC is in progress right now, it must + // keep gp.labels alive, because gp.labels is reachable from gp. + // If gp were to overwrite gp.labels, the deletion barrier would + // still shade that pointer, which would preserve it for the + // in-progress GC, so all is well. Any future GC will see the + // value we copied when scanning b.tags (heap-allocated). + // We arrange that the store here is always overwriting a nil, + // so there is no need for a deletion barrier on b.tags[wt]. + wt := int(bw.tagCount() % uint32(len(b.tags))) + if tagPtr != nil { + *(*uintptr)(unsafe.Pointer(&b.tags[wt])) = uintptr(unsafe.Pointer(*tagPtr)) + } + + // Main record. + // It has to fit in a contiguous section of the slice, so if it doesn't fit at the end, + // leave a rewind marker (0) and start over at the beginning of the slice. + wd := int(bw.dataCount() % uint32(len(b.data))) + nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data) + skip := 0 + if wd+2+int(b.hdrsize)+len(stk) > len(b.data) { + b.data[wd] = 0 + skip = len(b.data) - wd + nd -= skip + wd = 0 + } + data := b.data[wd:] + data[0] = uint64(2 + b.hdrsize + uintptr(len(stk))) // length + data[1] = uint64(now) // time stamp + // header, zero-padded + i := uintptr(copy(data[2:2+b.hdrsize], hdr)) + for ; i < b.hdrsize; i++ { + data[2+i] = 0 + } + for i, pc := range stk { + data[2+b.hdrsize+uintptr(i)] = uint64(pc) + } + + for { + // Commit write. + // Racing with reader setting flag bits in b.w, to avoid lost wakeups. + old := b.w.load() + new := old.addCountsAndClearFlags(skip+2+len(stk)+int(b.hdrsize), 1) + if !b.w.cas(old, new) { + continue + } + // If there was a reader, wake it up. + if old&profReaderSleeping != 0 { + notewakeup(&b.wait) + } + break + } +} + +// close signals that there will be no more writes on the buffer. +// Once all the data has been read from the buffer, reads will return eof=true. +func (b *profBuf) close() { + if atomic.Load(&b.eof) > 0 { + throw("runtime: profBuf already closed") + } + atomic.Store(&b.eof, 1) + b.wakeupExtra() +} + +// wakeupExtra must be called after setting one of the "extra" +// atomic fields b.overflow or b.eof. +// It records the change in b.w and wakes up the reader if needed. +func (b *profBuf) wakeupExtra() { + for { + old := b.w.load() + new := old | profWriteExtra + if !b.w.cas(old, new) { + continue + } + if old&profReaderSleeping != 0 { + notewakeup(&b.wait) + } + break + } +} + +// profBufReadMode specifies whether to block when no data is available to read. 
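+// A typical reader loop (an illustrative sketch only, with b a *profBuf) is:
+//
+//	for {
+//		data, tags, eof := b.read(profBufBlocking)
+//		if eof {
+//			break
+//		}
+//		// Consume data and tags here; they remain valid only until
+//		// the next call to read.
+//	}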
+type profBufReadMode int + +const ( + profBufBlocking profBufReadMode = iota + profBufNonBlocking +) + +var overflowTag [1]unsafe.Pointer // always nil + +func (b *profBuf) read(mode profBufReadMode) (data []uint64, tags []unsafe.Pointer, eof bool) { + if b == nil { + return nil, nil, true + } + + br := b.rNext + + // Commit previous read, returning that part of the ring to the writer. + // First clear tags that have now been read, both to avoid holding + // up the memory they point at for longer than necessary + // and so that b.write can assume it is always overwriting + // nil tag entries (see comment in b.write). + rPrev := b.r.load() + if rPrev != br { + ntag := countSub(br.tagCount(), rPrev.tagCount()) + ti := int(rPrev.tagCount() % uint32(len(b.tags))) + for i := 0; i < ntag; i++ { + b.tags[ti] = nil + if ti++; ti == len(b.tags) { + ti = 0 + } + } + b.r.store(br) + } + +Read: + bw := b.w.load() + numData := countSub(bw.dataCount(), br.dataCount()) + if numData == 0 { + if b.hasOverflow() { + // No data to read, but there is overflow to report. + // Racing with writer flushing b.overflow into a real record. + count, time := b.takeOverflow() + if count == 0 { + // Lost the race, go around again. + goto Read + } + // Won the race, report overflow. + dst := b.overflowBuf + dst[0] = uint64(2 + b.hdrsize + 1) + dst[1] = uint64(time) + for i := uintptr(0); i < b.hdrsize; i++ { + dst[2+i] = 0 + } + dst[2+b.hdrsize] = uint64(count) + return dst[:2+b.hdrsize+1], overflowTag[:1], false + } + if atomic.Load(&b.eof) > 0 { + // No data, no overflow, EOF set: done. + return nil, nil, true + } + if bw&profWriteExtra != 0 { + // Writer claims to have published extra information (overflow or eof). + // Attempt to clear notification and then check again. + // If we fail to clear the notification it means b.w changed, + // so we still need to check again. + b.w.cas(bw, bw&^profWriteExtra) + goto Read + } + + // Nothing to read right now. + // Return or sleep according to mode. + if mode == profBufNonBlocking { + return nil, nil, false + } + if !b.w.cas(bw, bw|profReaderSleeping) { + goto Read + } + // Committed to sleeping. + notetsleepg(&b.wait, -1) + noteclear(&b.wait) + goto Read + } + data = b.data[br.dataCount()%uint32(len(b.data)):] + if len(data) > numData { + data = data[:numData] + } else { + numData -= len(data) // available in case of wraparound + } + skip := 0 + if data[0] == 0 { + // Wraparound record. Go back to the beginning of the ring. + skip = len(data) + data = b.data + if len(data) > numData { + data = data[:numData] + } + } + + ntag := countSub(bw.tagCount(), br.tagCount()) + if ntag == 0 { + throw("runtime: malformed profBuf buffer - tag and data out of sync") + } + tags = b.tags[br.tagCount()%uint32(len(b.tags)):] + if len(tags) > ntag { + tags = tags[:ntag] + } + + // Count out whole data records until either data or tags is done. + // They are always in sync in the buffer, but due to an end-of-slice + // wraparound we might need to stop early and return the rest + // in the next call. + di := 0 + ti := 0 + for di < len(data) && data[di] != 0 && ti < len(tags) { + if uintptr(di)+uintptr(data[di]) > uintptr(len(data)) { + throw("runtime: malformed profBuf buffer - invalid size") + } + di += int(data[di]) + ti++ + } + + // Remember how much we returned, to commit read on next call. 
+ b.rNext = br.addCountsAndClearFlags(skip+di, ti) + + if raceenabled { + // Match racereleasemerge in runtime_setProfLabel, + // so that the setting of the labels in runtime_setProfLabel + // is treated as happening before any use of the labels + // by our caller. The synchronization on labelSync itself is a fiction + // for the race detector. The actual synchronization is handled + // by the fact that the signal handler only reads from the current + // goroutine and uses atomics to write the updated queue indices, + // and then the read-out from the signal handler buffer uses + // atomics to read those queue indices. + raceacquire(unsafe.Pointer(&labelSync)) + } + + return data[:di], tags[:ti], false +} diff --git a/contrib/go/_std_1.18/src/runtime/proflabel.go b/contrib/go/_std_1.18/src/runtime/proflabel.go new file mode 100644 index 0000000000..b2a161729e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/proflabel.go @@ -0,0 +1,40 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +var labelSync uintptr + +//go:linkname runtime_setProfLabel runtime/pprof.runtime_setProfLabel +func runtime_setProfLabel(labels unsafe.Pointer) { + // Introduce race edge for read-back via profile. + // This would more properly use &getg().labels as the sync address, + // but we do the read in a signal handler and can't call the race runtime then. + // + // This uses racereleasemerge rather than just racerelease so + // the acquire in profBuf.read synchronizes with *all* prior + // setProfLabel operations, not just the most recent one. This + // is important because profBuf.read will observe different + // labels set by different setProfLabel operations on + // different goroutines, so it needs to synchronize with all + // of them (this wouldn't be an issue if we could synchronize + // on &getg().labels since we would synchronize with each + // most-recent labels write separately.) + // + // racereleasemerge is like a full read-modify-write on + // labelSync, rather than just a store-release, so it carries + // a dependency on the previous racereleasemerge, which + // ultimately carries forward to the acquire in profBuf.read. + if raceenabled { + racereleasemerge(unsafe.Pointer(&labelSync)) + } + getg().labels = labels +} + +//go:linkname runtime_getProfLabel runtime/pprof.runtime_getProfLabel +func runtime_getProfLabel() unsafe.Pointer { + return getg().labels +} diff --git a/contrib/go/_std_1.18/src/runtime/race0.go b/contrib/go/_std_1.18/src/runtime/race0.go new file mode 100644 index 0000000000..f36d4387c7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/race0.go @@ -0,0 +1,44 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !race + +// Dummy race detection API, used when not built with -race. + +package runtime + +import ( + "unsafe" +) + +const raceenabled = false + +// Because raceenabled is false, none of these functions should be called. 
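+// Calls to them are always guarded by the constant, e.g. (as in proflabel.go
+// above):
+//
+//	if raceenabled {
+//		racereleasemerge(unsafe.Pointer(&labelSync))
+//	}
+//
+// so in non-race builds the guarded code is eliminated at compile time and
+// these stubs are never reached.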
+ +func raceReadObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") } +func raceWriteObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") } +func raceinit() (uintptr, uintptr) { throw("race"); return 0, 0 } +func racefini() { throw("race") } +func raceproccreate() uintptr { throw("race"); return 0 } +func raceprocdestroy(ctx uintptr) { throw("race") } +func racemapshadow(addr unsafe.Pointer, size uintptr) { throw("race") } +func racewritepc(addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") } +func racereadpc(addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") } +func racereadrangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { throw("race") } +func racewriterangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { throw("race") } +func raceacquire(addr unsafe.Pointer) { throw("race") } +func raceacquireg(gp *g, addr unsafe.Pointer) { throw("race") } +func raceacquirectx(racectx uintptr, addr unsafe.Pointer) { throw("race") } +func racerelease(addr unsafe.Pointer) { throw("race") } +func racereleaseg(gp *g, addr unsafe.Pointer) { throw("race") } +func racereleaseacquire(addr unsafe.Pointer) { throw("race") } +func racereleaseacquireg(gp *g, addr unsafe.Pointer) { throw("race") } +func racereleasemerge(addr unsafe.Pointer) { throw("race") } +func racereleasemergeg(gp *g, addr unsafe.Pointer) { throw("race") } +func racefingo() { throw("race") } +func racemalloc(p unsafe.Pointer, sz uintptr) { throw("race") } +func racefree(p unsafe.Pointer, sz uintptr) { throw("race") } +func racegostart(pc uintptr) uintptr { throw("race"); return 0 } +func racegoend() { throw("race") } +func racectxend(racectx uintptr) { throw("race") } diff --git a/contrib/go/_std_1.18/src/runtime/rdebug.go b/contrib/go/_std_1.18/src/runtime/rdebug.go new file mode 100644 index 0000000000..1b213f1934 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/rdebug.go @@ -0,0 +1,22 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import _ "unsafe" // for go:linkname + +//go:linkname setMaxStack runtime/debug.setMaxStack +func setMaxStack(in int) (out int) { + out = int(maxstacksize) + maxstacksize = uintptr(in) + return out +} + +//go:linkname setPanicOnFault runtime/debug.setPanicOnFault +func setPanicOnFault(new bool) (old bool) { + _g_ := getg() + old = _g_.paniconfault + _g_.paniconfault = new + return old +} diff --git a/contrib/go/_std_1.18/src/runtime/relax_stub.go b/contrib/go/_std_1.18/src/runtime/relax_stub.go new file mode 100644 index 0000000000..e507702fc1 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/relax_stub.go @@ -0,0 +1,17 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !windows + +package runtime + +// osRelaxMinNS is the number of nanoseconds of idleness to tolerate +// without performing an osRelax. Since osRelax may reduce the +// precision of timers, this should be enough larger than the relaxed +// timer precision to keep the timer error acceptable. +const osRelaxMinNS = 0 + +// osRelax is called by the scheduler when transitioning to and from +// all Ps being idle. 
+func osRelax(relax bool) {} diff --git a/contrib/go/_std_1.18/src/runtime/rt0_darwin_amd64.s b/contrib/go/_std_1.18/src/runtime/rt0_darwin_amd64.s new file mode 100644 index 0000000000..ed804d47c5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/rt0_darwin_amd64.s @@ -0,0 +1,13 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_darwin(SB),NOSPLIT,$-8 + JMP _rt0_amd64(SB) + +// When linking with -shared, this symbol is called when the shared library +// is loaded. +TEXT _rt0_amd64_darwin_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/contrib/go/_std_1.18/src/runtime/rt0_linux_amd64.s b/contrib/go/_std_1.18/src/runtime/rt0_linux_amd64.s new file mode 100644 index 0000000000..94ff7094d6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/rt0_linux_amd64.s @@ -0,0 +1,11 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_linux(SB),NOSPLIT,$-8 + JMP _rt0_amd64(SB) + +TEXT _rt0_amd64_linux_lib(SB),NOSPLIT,$0 + JMP _rt0_amd64_lib(SB) diff --git a/contrib/go/_std_1.18/src/runtime/runtime.go b/contrib/go/_std_1.18/src/runtime/runtime.go new file mode 100644 index 0000000000..33ecc260dd --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/runtime.go @@ -0,0 +1,65 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + _ "unsafe" // for go:linkname +) + +//go:generate go run wincallback.go +//go:generate go run mkduff.go +//go:generate go run mkfastlog2table.go + +var ticks struct { + lock mutex + pad uint32 // ensure 8-byte alignment of val on 386 + val uint64 +} + +// Note: Called by runtime/pprof in addition to runtime code. +func tickspersecond() int64 { + r := int64(atomic.Load64(&ticks.val)) + if r != 0 { + return r + } + lock(&ticks.lock) + r = int64(ticks.val) + if r == 0 { + t0 := nanotime() + c0 := cputicks() + usleep(100 * 1000) + t1 := nanotime() + c1 := cputicks() + if t1 == t0 { + t1++ + } + r = (c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0) + if r == 0 { + r++ + } + atomic.Store64(&ticks.val, uint64(r)) + } + unlock(&ticks.lock) + return r +} + +var envs []string +var argslice []string + +//go:linkname syscall_runtime_envs syscall.runtime_envs +func syscall_runtime_envs() []string { return append([]string{}, envs...) } + +//go:linkname syscall_Getpagesize syscall.Getpagesize +func syscall_Getpagesize() int { return int(physPageSize) } + +//go:linkname os_runtime_args os.runtime_args +func os_runtime_args() []string { return append([]string{}, argslice...) } + +//go:linkname syscall_Exit syscall.Exit +//go:nosplit +func syscall_Exit(code int) { + exit(int32(code)) +} diff --git a/contrib/go/_std_1.18/src/runtime/runtime1.go b/contrib/go/_std_1.18/src/runtime/runtime1.go new file mode 100644 index 0000000000..65e1e0eebc --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/runtime1.go @@ -0,0 +1,544 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import ( + "internal/bytealg" + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// Keep a cached value to make gotraceback fast, +// since we call it on every call to gentraceback. +// The cached value is a uint32 in which the low bits +// are the "crash" and "all" settings and the remaining +// bits are the traceback value (0 off, 1 on, 2 include system). +const ( + tracebackCrash = 1 << iota + tracebackAll + tracebackShift = iota +) + +var traceback_cache uint32 = 2 << tracebackShift +var traceback_env uint32 + +// gotraceback returns the current traceback settings. +// +// If level is 0, suppress all tracebacks. +// If level is 1, show tracebacks, but exclude runtime frames. +// If level is 2, show tracebacks including runtime frames. +// If all is set, print all goroutine stacks. Otherwise, print just the current goroutine. +// If crash is set, crash (core dump, etc) after tracebacking. +// +//go:nosplit +func gotraceback() (level int32, all, crash bool) { + _g_ := getg() + t := atomic.Load(&traceback_cache) + crash = t&tracebackCrash != 0 + all = _g_.m.throwing > 0 || t&tracebackAll != 0 + if _g_.m.traceback != 0 { + level = int32(_g_.m.traceback) + } else { + level = int32(t >> tracebackShift) + } + return +} + +var ( + argc int32 + argv **byte +) + +// nosplit for use in linux startup sysargs +//go:nosplit +func argv_index(argv **byte, i int32) *byte { + return *(**byte)(add(unsafe.Pointer(argv), uintptr(i)*goarch.PtrSize)) +} + +func args(c int32, v **byte) { + argc = c + argv = v + sysargs(c, v) +} + +func goargs() { + if GOOS == "windows" { + return + } + argslice = make([]string, argc) + for i := int32(0); i < argc; i++ { + argslice[i] = gostringnocopy(argv_index(argv, i)) + } +} + +func goenvs_unix() { + // TODO(austin): ppc64 in dynamic linking mode doesn't + // guarantee env[] will immediately follow argv. Might cause + // problems. + n := int32(0) + for argv_index(argv, argc+1+n) != nil { + n++ + } + + envs = make([]string, n) + for i := int32(0); i < n; i++ { + envs[i] = gostring(argv_index(argv, argc+1+i)) + } +} + +func environ() []string { + return envs +} + +// TODO: These should be locals in testAtomic64, but we don't 8-byte +// align stack variables on 386. 
+var test_z64, test_x64 uint64 + +func testAtomic64() { + test_z64 = 42 + test_x64 = 0 + if atomic.Cas64(&test_z64, test_x64, 1) { + throw("cas64 failed") + } + if test_x64 != 0 { + throw("cas64 failed") + } + test_x64 = 42 + if !atomic.Cas64(&test_z64, test_x64, 1) { + throw("cas64 failed") + } + if test_x64 != 42 || test_z64 != 1 { + throw("cas64 failed") + } + if atomic.Load64(&test_z64) != 1 { + throw("load64 failed") + } + atomic.Store64(&test_z64, (1<<40)+1) + if atomic.Load64(&test_z64) != (1<<40)+1 { + throw("store64 failed") + } + if atomic.Xadd64(&test_z64, (1<<40)+1) != (2<<40)+2 { + throw("xadd64 failed") + } + if atomic.Load64(&test_z64) != (2<<40)+2 { + throw("xadd64 failed") + } + if atomic.Xchg64(&test_z64, (3<<40)+3) != (2<<40)+2 { + throw("xchg64 failed") + } + if atomic.Load64(&test_z64) != (3<<40)+3 { + throw("xchg64 failed") + } +} + +func check() { + var ( + a int8 + b uint8 + c int16 + d uint16 + e int32 + f uint32 + g int64 + h uint64 + i, i1 float32 + j, j1 float64 + k unsafe.Pointer + l *uint16 + m [4]byte + ) + type x1t struct { + x uint8 + } + type y1t struct { + x1 x1t + y uint8 + } + var x1 x1t + var y1 y1t + + if unsafe.Sizeof(a) != 1 { + throw("bad a") + } + if unsafe.Sizeof(b) != 1 { + throw("bad b") + } + if unsafe.Sizeof(c) != 2 { + throw("bad c") + } + if unsafe.Sizeof(d) != 2 { + throw("bad d") + } + if unsafe.Sizeof(e) != 4 { + throw("bad e") + } + if unsafe.Sizeof(f) != 4 { + throw("bad f") + } + if unsafe.Sizeof(g) != 8 { + throw("bad g") + } + if unsafe.Sizeof(h) != 8 { + throw("bad h") + } + if unsafe.Sizeof(i) != 4 { + throw("bad i") + } + if unsafe.Sizeof(j) != 8 { + throw("bad j") + } + if unsafe.Sizeof(k) != goarch.PtrSize { + throw("bad k") + } + if unsafe.Sizeof(l) != goarch.PtrSize { + throw("bad l") + } + if unsafe.Sizeof(x1) != 1 { + throw("bad unsafe.Sizeof x1") + } + if unsafe.Offsetof(y1.y) != 1 { + throw("bad offsetof y1.y") + } + if unsafe.Sizeof(y1) != 2 { + throw("bad unsafe.Sizeof y1") + } + + if timediv(12345*1000000000+54321, 1000000000, &e) != 12345 || e != 54321 { + throw("bad timediv") + } + + var z uint32 + z = 1 + if !atomic.Cas(&z, 1, 2) { + throw("cas1") + } + if z != 2 { + throw("cas2") + } + + z = 4 + if atomic.Cas(&z, 5, 6) { + throw("cas3") + } + if z != 4 { + throw("cas4") + } + + z = 0xffffffff + if !atomic.Cas(&z, 0xffffffff, 0xfffffffe) { + throw("cas5") + } + if z != 0xfffffffe { + throw("cas6") + } + + m = [4]byte{1, 1, 1, 1} + atomic.Or8(&m[1], 0xf0) + if m[0] != 1 || m[1] != 0xf1 || m[2] != 1 || m[3] != 1 { + throw("atomicor8") + } + + m = [4]byte{0xff, 0xff, 0xff, 0xff} + atomic.And8(&m[1], 0x1) + if m[0] != 0xff || m[1] != 0x1 || m[2] != 0xff || m[3] != 0xff { + throw("atomicand8") + } + + *(*uint64)(unsafe.Pointer(&j)) = ^uint64(0) + if j == j { + throw("float64nan") + } + if !(j != j) { + throw("float64nan1") + } + + *(*uint64)(unsafe.Pointer(&j1)) = ^uint64(1) + if j == j1 { + throw("float64nan2") + } + if !(j != j1) { + throw("float64nan3") + } + + *(*uint32)(unsafe.Pointer(&i)) = ^uint32(0) + if i == i { + throw("float32nan") + } + if i == i { + throw("float32nan1") + } + + *(*uint32)(unsafe.Pointer(&i1)) = ^uint32(1) + if i == i1 { + throw("float32nan2") + } + if i == i1 { + throw("float32nan3") + } + + testAtomic64() + + if _FixedStack != round2(_FixedStack) { + throw("FixedStack is not power-of-2") + } + + if !checkASM() { + throw("assembly checks failed") + } +} + +type dbgVar struct { + name string + value *int32 +} + +// Holds variables parsed from GODEBUG env var, +// except for "memprofilerate" 
since there is an +// existing int var for that value, which may +// already have an initial value. +var debug struct { + cgocheck int32 + clobberfree int32 + efence int32 + gccheckmark int32 + gcpacertrace int32 + gcshrinkstackoff int32 + gcstoptheworld int32 + gctrace int32 + invalidptr int32 + madvdontneed int32 // for Linux; issue 28466 + scavtrace int32 + scheddetail int32 + schedtrace int32 + tracebackancestors int32 + asyncpreemptoff int32 + harddecommit int32 + + // debug.malloc is used as a combined debug check + // in the malloc function and should be set + // if any of the below debug options is != 0. + malloc bool + allocfreetrace int32 + inittrace int32 + sbrk int32 +} + +var dbgvars = []dbgVar{ + {"allocfreetrace", &debug.allocfreetrace}, + {"clobberfree", &debug.clobberfree}, + {"cgocheck", &debug.cgocheck}, + {"efence", &debug.efence}, + {"gccheckmark", &debug.gccheckmark}, + {"gcpacertrace", &debug.gcpacertrace}, + {"gcshrinkstackoff", &debug.gcshrinkstackoff}, + {"gcstoptheworld", &debug.gcstoptheworld}, + {"gctrace", &debug.gctrace}, + {"invalidptr", &debug.invalidptr}, + {"madvdontneed", &debug.madvdontneed}, + {"sbrk", &debug.sbrk}, + {"scavtrace", &debug.scavtrace}, + {"scheddetail", &debug.scheddetail}, + {"schedtrace", &debug.schedtrace}, + {"tracebackancestors", &debug.tracebackancestors}, + {"asyncpreemptoff", &debug.asyncpreemptoff}, + {"inittrace", &debug.inittrace}, + {"harddecommit", &debug.harddecommit}, +} + +func parsedebugvars() { + // defaults + debug.cgocheck = 1 + debug.invalidptr = 1 + if GOOS == "linux" { + // On Linux, MADV_FREE is faster than MADV_DONTNEED, + // but doesn't affect many of the statistics that + // MADV_DONTNEED does until the memory is actually + // reclaimed. This generally leads to poor user + // experience, like confusing stats in top and other + // monitoring tools; and bad integration with + // management systems that respond to memory usage. + // Hence, default to MADV_DONTNEED. + debug.madvdontneed = 1 + } + + for p := gogetenv("GODEBUG"); p != ""; { + field := "" + i := bytealg.IndexByteString(p, ',') + if i < 0 { + field, p = p, "" + } else { + field, p = p[:i], p[i+1:] + } + i = bytealg.IndexByteString(field, '=') + if i < 0 { + continue + } + key, value := field[:i], field[i+1:] + + // Update MemProfileRate directly here since it + // is int, not int32, and should only be updated + // if specified in GODEBUG. + if key == "memprofilerate" { + if n, ok := atoi(value); ok { + MemProfileRate = n + } + } else { + for _, v := range dbgvars { + if v.name == key { + if n, ok := atoi32(value); ok { + *v.value = n + } + } + } + } + } + + debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0 + + setTraceback(gogetenv("GOTRACEBACK")) + traceback_env = traceback_cache +} + +//go:linkname setTraceback runtime/debug.SetTraceback +func setTraceback(level string) { + var t uint32 + switch level { + case "none": + t = 0 + case "single", "": + t = 1 << tracebackShift + case "all": + t = 1<<tracebackShift | tracebackAll + case "system": + t = 2<<tracebackShift | tracebackAll + case "crash": + t = 2<<tracebackShift | tracebackAll | tracebackCrash + default: + t = tracebackAll + if n, ok := atoi(level); ok && n == int(uint32(n)) { + t |= uint32(n) << tracebackShift + } + } + // when C owns the process, simply exit'ing the process on fatal errors + // and panics is surprising. Be louder and abort instead. 
+ if islibrary || isarchive { + t |= tracebackCrash + } + + t |= traceback_env + + atomic.Store(&traceback_cache, t) +} + +// Poor mans 64-bit division. +// This is a very special function, do not use it if you are not sure what you are doing. +// int64 division is lowered into _divv() call on 386, which does not fit into nosplit functions. +// Handles overflow in a time-specific manner. +// This keeps us within no-split stack limits on 32-bit processors. +//go:nosplit +func timediv(v int64, div int32, rem *int32) int32 { + res := int32(0) + for bit := 30; bit >= 0; bit-- { + if v >= int64(div)<<uint(bit) { + v = v - (int64(div) << uint(bit)) + // Before this for loop, res was 0, thus all these + // power of 2 increments are now just bitsets. + res |= 1 << uint(bit) + } + } + if v >= int64(div) { + if rem != nil { + *rem = 0 + } + return 0x7fffffff + } + if rem != nil { + *rem = int32(v) + } + return res +} + +// Helpers for Go. Must be NOSPLIT, must only call NOSPLIT functions, and must not block. + +//go:nosplit +func acquirem() *m { + _g_ := getg() + _g_.m.locks++ + return _g_.m +} + +//go:nosplit +func releasem(mp *m) { + _g_ := getg() + mp.locks-- + if mp.locks == 0 && _g_.preempt { + // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } +} + +//go:linkname reflect_typelinks reflect.typelinks +func reflect_typelinks() ([]unsafe.Pointer, [][]int32) { + modules := activeModules() + sections := []unsafe.Pointer{unsafe.Pointer(modules[0].types)} + ret := [][]int32{modules[0].typelinks} + for _, md := range modules[1:] { + sections = append(sections, unsafe.Pointer(md.types)) + ret = append(ret, md.typelinks) + } + return sections, ret +} + +// reflect_resolveNameOff resolves a name offset from a base pointer. +//go:linkname reflect_resolveNameOff reflect.resolveNameOff +func reflect_resolveNameOff(ptrInModule unsafe.Pointer, off int32) unsafe.Pointer { + return unsafe.Pointer(resolveNameOff(ptrInModule, nameOff(off)).bytes) +} + +// reflect_resolveTypeOff resolves an *rtype offset from a base type. +//go:linkname reflect_resolveTypeOff reflect.resolveTypeOff +func reflect_resolveTypeOff(rtype unsafe.Pointer, off int32) unsafe.Pointer { + return unsafe.Pointer((*_type)(rtype).typeOff(typeOff(off))) +} + +// reflect_resolveTextOff resolves a function pointer offset from a base type. +//go:linkname reflect_resolveTextOff reflect.resolveTextOff +func reflect_resolveTextOff(rtype unsafe.Pointer, off int32) unsafe.Pointer { + return (*_type)(rtype).textOff(textOff(off)) + +} + +// reflectlite_resolveNameOff resolves a name offset from a base pointer. +//go:linkname reflectlite_resolveNameOff internal/reflectlite.resolveNameOff +func reflectlite_resolveNameOff(ptrInModule unsafe.Pointer, off int32) unsafe.Pointer { + return unsafe.Pointer(resolveNameOff(ptrInModule, nameOff(off)).bytes) +} + +// reflectlite_resolveTypeOff resolves an *rtype offset from a base type. +//go:linkname reflectlite_resolveTypeOff internal/reflectlite.resolveTypeOff +func reflectlite_resolveTypeOff(rtype unsafe.Pointer, off int32) unsafe.Pointer { + return unsafe.Pointer((*_type)(rtype).typeOff(typeOff(off))) +} + +// reflect_addReflectOff adds a pointer to the reflection offset lookup map. 
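+// Ids are handed out as -1, -2, ... in registration order, and registering the
+// same pointer again returns its existing id. For example (with hypothetical
+// pointers p1 and p2, registered in that order):
+//
+//	reflect_addReflectOff(p1) // returns -1
+//	reflect_addReflectOff(p2) // returns -2
+//	reflect_addReflectOff(p1) // returns -1 again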
+//go:linkname reflect_addReflectOff reflect.addReflectOff +func reflect_addReflectOff(ptr unsafe.Pointer) int32 { + reflectOffsLock() + if reflectOffs.m == nil { + reflectOffs.m = make(map[int32]unsafe.Pointer) + reflectOffs.minv = make(map[unsafe.Pointer]int32) + reflectOffs.next = -1 + } + id, found := reflectOffs.minv[ptr] + if !found { + id = reflectOffs.next + reflectOffs.next-- // use negative offsets as IDs to aid debugging + reflectOffs.m[id] = ptr + reflectOffs.minv[ptr] = id + } + reflectOffsUnlock() + return id +} diff --git a/contrib/go/_std_1.18/src/runtime/runtime2.go b/contrib/go/_std_1.18/src/runtime/runtime2.go new file mode 100644 index 0000000000..b40045e4a5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/runtime2.go @@ -0,0 +1,1134 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "unsafe" +) + +// defined constants +const ( + // G status + // + // Beyond indicating the general state of a G, the G status + // acts like a lock on the goroutine's stack (and hence its + // ability to execute user code). + // + // If you add to this list, add to the list + // of "okay during garbage collection" status + // in mgcmark.go too. + // + // TODO(austin): The _Gscan bit could be much lighter-weight. + // For example, we could choose not to run _Gscanrunnable + // goroutines found in the run queue, rather than CAS-looping + // until they become _Grunnable. And transitions like + // _Gscanwaiting -> _Gscanrunnable are actually okay because + // they don't affect stack ownership. + + // _Gidle means this goroutine was just allocated and has not + // yet been initialized. + _Gidle = iota // 0 + + // _Grunnable means this goroutine is on a run queue. It is + // not currently executing user code. The stack is not owned. + _Grunnable // 1 + + // _Grunning means this goroutine may execute user code. The + // stack is owned by this goroutine. It is not on a run queue. + // It is assigned an M and a P (g.m and g.m.p are valid). + _Grunning // 2 + + // _Gsyscall means this goroutine is executing a system call. + // It is not executing user code. The stack is owned by this + // goroutine. It is not on a run queue. It is assigned an M. + _Gsyscall // 3 + + // _Gwaiting means this goroutine is blocked in the runtime. + // It is not executing user code. It is not on a run queue, + // but should be recorded somewhere (e.g., a channel wait + // queue) so it can be ready()d when necessary. The stack is + // not owned *except* that a channel operation may read or + // write parts of the stack under the appropriate channel + // lock. Otherwise, it is not safe to access the stack after a + // goroutine enters _Gwaiting (e.g., it may get moved). + _Gwaiting // 4 + + // _Gmoribund_unused is currently unused, but hardcoded in gdb + // scripts. + _Gmoribund_unused // 5 + + // _Gdead means this goroutine is currently unused. It may be + // just exited, on a free list, or just being initialized. It + // is not executing user code. It may or may not have a stack + // allocated. The G and its stack (if any) are owned by the M + // that is exiting the G or that obtained the G from the free + // list. + _Gdead // 6 + + // _Genqueue_unused is currently unused. + _Genqueue_unused // 7 + + // _Gcopystack means this goroutine's stack is being moved. It + // is not executing user code and is not on a run queue. 
The + // stack is owned by the goroutine that put it in _Gcopystack. + _Gcopystack // 8 + + // _Gpreempted means this goroutine stopped itself for a + // suspendG preemption. It is like _Gwaiting, but nothing is + // yet responsible for ready()ing it. Some suspendG must CAS + // the status to _Gwaiting to take responsibility for + // ready()ing this G. + _Gpreempted // 9 + + // _Gscan combined with one of the above states other than + // _Grunning indicates that GC is scanning the stack. The + // goroutine is not executing user code and the stack is owned + // by the goroutine that set the _Gscan bit. + // + // _Gscanrunning is different: it is used to briefly block + // state transitions while GC signals the G to scan its own + // stack. This is otherwise like _Grunning. + // + // atomicstatus&~Gscan gives the state the goroutine will + // return to when the scan completes. + _Gscan = 0x1000 + _Gscanrunnable = _Gscan + _Grunnable // 0x1001 + _Gscanrunning = _Gscan + _Grunning // 0x1002 + _Gscansyscall = _Gscan + _Gsyscall // 0x1003 + _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 + _Gscanpreempted = _Gscan + _Gpreempted // 0x1009 +) + +const ( + // P status + + // _Pidle means a P is not being used to run user code or the + // scheduler. Typically, it's on the idle P list and available + // to the scheduler, but it may just be transitioning between + // other states. + // + // The P is owned by the idle list or by whatever is + // transitioning its state. Its run queue is empty. + _Pidle = iota + + // _Prunning means a P is owned by an M and is being used to + // run user code or the scheduler. Only the M that owns this P + // is allowed to change the P's status from _Prunning. The M + // may transition the P to _Pidle (if it has no more work to + // do), _Psyscall (when entering a syscall), or _Pgcstop (to + // halt for the GC). The M may also hand ownership of the P + // off directly to another M (e.g., to schedule a locked G). + _Prunning + + // _Psyscall means a P is not running user code. It has + // affinity to an M in a syscall but is not owned by it and + // may be stolen by another M. This is similar to _Pidle but + // uses lightweight transitions and maintains M affinity. + // + // Leaving _Psyscall must be done with a CAS, either to steal + // or retake the P. Note that there's an ABA hazard: even if + // an M successfully CASes its original P back to _Prunning + // after a syscall, it must understand the P may have been + // used by another M in the interim. + _Psyscall + + // _Pgcstop means a P is halted for STW and owned by the M + // that stopped the world. The M that stopped the world + // continues to use its P, even in _Pgcstop. Transitioning + // from _Prunning to _Pgcstop causes an M to release its P and + // park. + // + // The P retains its run queue and startTheWorld will restart + // the scheduler on Ps with non-empty run queues. + _Pgcstop + + // _Pdead means a P is no longer used (GOMAXPROCS shrank). We + // reuse Ps if GOMAXPROCS increases. A dead P is mostly + // stripped of its resources, though a few things remain + // (e.g., trace buffers). + _Pdead +) + +// Mutual exclusion locks. In the uncontended case, +// as fast as spin locks (just a few user-level instructions), +// but on the contention path they sleep in the kernel. +// A zeroed Mutex is unlocked (no need to initialize each lock). +// Initialization is helpful for static lock ranking, but not required. 
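+// A mutex is used through the runtime's lock and unlock helpers, for example
+// (as in tickspersecond in runtime.go above):
+//
+//	lock(&ticks.lock)
+//	// ... read or update the state protected by the lock ...
+//	unlock(&ticks.lock)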
+type mutex struct { + // Empty struct if lock ranking is disabled, otherwise includes the lock rank + lockRankStruct + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + key uintptr +} + +// sleep and wakeup on one-time events. +// before any calls to notesleep or notewakeup, +// must call noteclear to initialize the Note. +// then, exactly one thread can call notesleep +// and exactly one thread can call notewakeup (once). +// once notewakeup has been called, the notesleep +// will return. future notesleep will return immediately. +// subsequent noteclear must be called only after +// previous notesleep has returned, e.g. it's disallowed +// to call noteclear straight after notewakeup. +// +// notetsleep is like notesleep but wakes up after +// a given number of nanoseconds even if the event +// has not yet happened. if a goroutine uses notetsleep to +// wake up early, it must wait to call noteclear until it +// can be sure that no other goroutine is calling +// notewakeup. +// +// notesleep/notetsleep are generally called on g0, +// notetsleepg is similar to notetsleep but is called on user g. +type note struct { + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + key uintptr +} + +type funcval struct { + fn uintptr + // variable-size, fn-specific data here +} + +type iface struct { + tab *itab + data unsafe.Pointer +} + +type eface struct { + _type *_type + data unsafe.Pointer +} + +func efaceOf(ep *any) *eface { + return (*eface)(unsafe.Pointer(ep)) +} + +// The guintptr, muintptr, and puintptr are all used to bypass write barriers. +// It is particularly important to avoid write barriers when the current P has +// been released, because the GC thinks the world is stopped, and an +// unexpected write barrier would not be synchronized with the GC, +// which can lead to a half-executed write barrier that has marked the object +// but not queued it. If the GC skips the object and completes before the +// queuing can occur, it will incorrectly free the object. +// +// We tried using special assignment functions invoked only when not +// holding a running P, but then some updates to a particular memory +// word went through write barriers and some did not. This breaks the +// write barrier shadow checking mode, and it is also scary: better to have +// a word that is completely ignored by the GC than to have one for which +// only a few updates are ignored. +// +// Gs and Ps are always reachable via true pointers in the +// allgs and allp lists or (during allocation before they reach those lists) +// from stack variables. +// +// Ms are always reachable via true pointers either from allm or +// freem. Unlike Gs and Ps we do free Ms, so it's important that +// nothing ever hold an muintptr across a safe point. + +// A guintptr holds a goroutine pointer, but typed as a uintptr +// to bypass write barriers. It is used in the Gobuf goroutine state +// and in scheduling lists that are manipulated without a P. +// +// The Gobuf.g goroutine pointer is almost always updated by assembly code. +// In one of the few places it is updated by Go code - func save - it must be +// treated as a uintptr to avoid a write barrier being emitted at a bad time. 
+// Instead of figuring out how to emit the write barriers missing in the +// assembly manipulation, we change the type of the field to uintptr, +// so that it does not require write barriers at all. +// +// Goroutine structs are published in the allg list and never freed. +// That will keep the goroutine structs from being collected. +// There is never a time that Gobuf.g's contain the only references +// to a goroutine: the publishing of the goroutine in allg comes first. +// Goroutine pointers are also kept in non-GC-visible places like TLS, +// so I can't see them ever moving. If we did want to start moving data +// in the GC, we'd need to allocate the goroutine structs from an +// alternate arena. Using guintptr doesn't make that problem any worse. +// Note that pollDesc.rg, pollDesc.wg also store g in uintptr form, +// so they would need to be updated too if g's start moving. +type guintptr uintptr + +//go:nosplit +func (gp guintptr) ptr() *g { return (*g)(unsafe.Pointer(gp)) } + +//go:nosplit +func (gp *guintptr) set(g *g) { *gp = guintptr(unsafe.Pointer(g)) } + +//go:nosplit +func (gp *guintptr) cas(old, new guintptr) bool { + return atomic.Casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new)) +} + +// setGNoWB performs *gp = new without a write barrier. +// For times when it's impractical to use a guintptr. +//go:nosplit +//go:nowritebarrier +func setGNoWB(gp **g, new *g) { + (*guintptr)(unsafe.Pointer(gp)).set(new) +} + +type puintptr uintptr + +//go:nosplit +func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) } + +//go:nosplit +func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) } + +// muintptr is a *m that is not tracked by the garbage collector. +// +// Because we do free Ms, there are some additional constrains on +// muintptrs: +// +// 1. Never hold an muintptr locally across a safe point. +// +// 2. Any muintptr in the heap must be owned by the M itself so it can +// ensure it is not in use when the last true *m is released. +type muintptr uintptr + +//go:nosplit +func (mp muintptr) ptr() *m { return (*m)(unsafe.Pointer(mp)) } + +//go:nosplit +func (mp *muintptr) set(m *m) { *mp = muintptr(unsafe.Pointer(m)) } + +// setMNoWB performs *mp = new without a write barrier. +// For times when it's impractical to use an muintptr. +//go:nosplit +//go:nowritebarrier +func setMNoWB(mp **m, new *m) { + (*muintptr)(unsafe.Pointer(mp)).set(new) +} + +type gobuf struct { + // The offsets of sp, pc, and g are known to (hard-coded in) libmach. + // + // ctxt is unusual with respect to GC: it may be a + // heap-allocated funcval, so GC needs to track it, but it + // needs to be set and cleared from assembly, where it's + // difficult to have write barriers. However, ctxt is really a + // saved, live register, and we only ever exchange it between + // the real register and the gobuf. Hence, we treat it as a + // root during stack scanning, which means assembly that saves + // and restores it doesn't need write barriers. It's still + // typed as a pointer so that any other writes from Go get + // write barriers. + sp uintptr + pc uintptr + g guintptr + ctxt unsafe.Pointer + ret uintptr + lr uintptr + bp uintptr // for framepointer-enabled architectures +} + +// sudog represents a g in a wait list, such as for sending/receiving +// on a channel. +// +// sudog is necessary because the g ↔ synchronization object relation +// is many-to-many. 
A g can be on many wait lists, so there may be +// many sudogs for one g; and many gs may be waiting on the same +// synchronization object, so there may be many sudogs for one object. +// +// sudogs are allocated from a special pool. Use acquireSudog and +// releaseSudog to allocate and free them. +type sudog struct { + // The following fields are protected by the hchan.lock of the + // channel this sudog is blocking on. shrinkstack depends on + // this for sudogs involved in channel ops. + + g *g + + next *sudog + prev *sudog + elem unsafe.Pointer // data element (may point to stack) + + // The following fields are never accessed concurrently. + // For channels, waitlink is only accessed by g. + // For semaphores, all fields (including the ones above) + // are only accessed when holding a semaRoot lock. + + acquiretime int64 + releasetime int64 + ticket uint32 + + // isSelect indicates g is participating in a select, so + // g.selectDone must be CAS'd to win the wake-up race. + isSelect bool + + // success indicates whether communication over channel c + // succeeded. It is true if the goroutine was awoken because a + // value was delivered over channel c, and false if awoken + // because c was closed. + success bool + + parent *sudog // semaRoot binary tree + waitlink *sudog // g.waiting list or semaRoot + waittail *sudog // semaRoot + c *hchan // channel +} + +type libcall struct { + fn uintptr + n uintptr // number of parameters + args uintptr // parameters + r1 uintptr // return values + r2 uintptr + err uintptr // error number +} + +// Stack describes a Go execution stack. +// The bounds of the stack are exactly [lo, hi), +// with no implicit data structures on either side. +type stack struct { + lo uintptr + hi uintptr +} + +// heldLockInfo gives info on a held lock and the rank of that lock +type heldLockInfo struct { + lockAddr uintptr + rank lockRank +} + +type g struct { + // Stack parameters. + // stack describes the actual stack memory: [stack.lo, stack.hi). + // stackguard0 is the stack pointer compared in the Go stack growth prologue. + // It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption. + // stackguard1 is the stack pointer compared in the C stack growth prologue. + // It is stack.lo+StackGuard on g0 and gsignal stacks. + // It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash). + stack stack // offset known to runtime/cgo + stackguard0 uintptr // offset known to liblink + stackguard1 uintptr // offset known to liblink + + _panic *_panic // innermost panic - offset known to liblink + _defer *_defer // innermost defer + m *m // current m; offset known to arm liblink + sched gobuf + syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc + syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc + stktopsp uintptr // expected sp at top of stack, to check in traceback + // param is a generic pointer parameter field used to pass + // values in particular contexts where other storage for the + // parameter would be difficult to find. It is currently used + // in three ways: + // 1. When a channel operation wakes up a blocked goroutine, it sets param to + // point to the sudog of the completed blocking operation. + // 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed + // the GC cycle. It is unsafe to do so in any other way, because the goroutine's + // stack may have moved in the meantime. + // 3. 
By debugCallWrap to pass parameters to a new goroutine because allocating a + // closure in the runtime is forbidden. + param unsafe.Pointer + atomicstatus uint32 + stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus + goid int64 + schedlink guintptr + waitsince int64 // approx time when the g become blocked + waitreason waitReason // if status==Gwaiting + + preempt bool // preemption signal, duplicates stackguard0 = stackpreempt + preemptStop bool // transition to _Gpreempted on preemption; otherwise, just deschedule + preemptShrink bool // shrink stack at synchronous safe point + + // asyncSafePoint is set if g is stopped at an asynchronous + // safe point. This means there are frames on the stack + // without precise pointer information. + asyncSafePoint bool + + paniconfault bool // panic (instead of crash) on unexpected fault address + gcscandone bool // g has scanned stack; protected by _Gscan bit in status + throwsplit bool // must not split stack + // activeStackChans indicates that there are unlocked channels + // pointing into this goroutine's stack. If true, stack + // copying needs to acquire channel locks to protect these + // areas of the stack. + activeStackChans bool + // parkingOnChan indicates that the goroutine is about to + // park on a chansend or chanrecv. Used to signal an unsafe point + // for stack shrinking. It's a boolean value, but is updated atomically. + parkingOnChan uint8 + + raceignore int8 // ignore race detection events + sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine + tracking bool // whether we're tracking this G for sched latency statistics + trackingSeq uint8 // used to decide whether to track this G + runnableStamp int64 // timestamp of when the G last became runnable, only used when tracking + runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking + sysexitticks int64 // cputicks when syscall has returned (for tracing) + traceseq uint64 // trace event sequencer + tracelastp puintptr // last P emitted an event for this goroutine + lockedm muintptr + sig uint32 + writebuf []byte + sigcode0 uintptr + sigcode1 uintptr + sigpc uintptr + gopc uintptr // pc of go statement that created this goroutine + ancestors *[]ancestorInfo // ancestor information goroutine(s) that created this goroutine (only used if debug.tracebackancestors) + startpc uintptr // pc of goroutine function + racectx uintptr + waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order + cgoCtxt []uintptr // cgo traceback context + labels unsafe.Pointer // profiler labels + timer *timer // cached timer for time.Sleep + selectDone uint32 // are we participating in a select and did someone win the race? + + // Per-G GC state + + // gcAssistBytes is this G's GC assist credit in terms of + // bytes allocated. If this is positive, then the G has credit + // to allocate gcAssistBytes bytes without assisting. If this + // is negative, then the G must correct this by performing + // scan work. We track this in bytes to make it fast to update + // and check for debt in the malloc hot path. The assist ratio + // determines how this corresponds to scan work debt. + gcAssistBytes int64 +} + +// gTrackingPeriod is the number of transitions out of _Grunning between +// latency tracking runs. +const gTrackingPeriod = 8 + +const ( + // tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms, + // like Windows. 
+ tlsSlots = 6 + tlsSize = tlsSlots * goarch.PtrSize +) + +type m struct { + g0 *g // goroutine with scheduling stack + morebuf gobuf // gobuf arg to morestack + divmod uint32 // div/mod denominator for arm - known to liblink + _ uint32 // align next field to 8 bytes + + // Fields not known to debuggers. + procid uint64 // for debuggers, but offset not hard-coded + gsignal *g // signal-handling g + goSigStack gsignalStack // Go-allocated signal handling stack + sigmask sigset // storage for saved signal mask + tls [tlsSlots]uintptr // thread-local storage (for x86 extern register) + mstartfn func() + curg *g // current running goroutine + caughtsig guintptr // goroutine running during fatal signal + p puintptr // attached p for executing go code (nil if not executing go code) + nextp puintptr + oldp puintptr // the p that was attached before executing a syscall + id int64 + mallocing int32 + throwing int32 + preemptoff string // if != "", keep curg running on this m + locks int32 + dying int32 + profilehz int32 + spinning bool // m is out of work and is actively looking for work + blocked bool // m is blocked on a note + newSigstack bool // minit on C thread called sigaltstack + printlock int8 + incgo bool // m is executing a cgo call + freeWait uint32 // if == 0, safe to free g0 and delete m (atomic) + fastrand uint64 + needextram bool + traceback uint8 + ncgocall uint64 // number of cgo calls in total + ncgo int32 // number of cgo calls currently in progress + cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily + cgoCallers *cgoCallers // cgo traceback if crashing in cgo call + park note + alllink *m // on allm + schedlink muintptr + lockedg guintptr + createstack [32]uintptr // stack that created this thread. + lockedExt uint32 // tracking for external LockOSThread + lockedInt uint32 // tracking for internal lockOSThread + nextwaitm muintptr // next m waiting for lock + waitunlockf func(*g, unsafe.Pointer) bool + waitlock unsafe.Pointer + waittraceev byte + waittraceskip int + startingtrace bool + syscalltick uint32 + freelink *m // on sched.freem + + // these are here because they are too large to be on the stack + // of low-level NOSPLIT functions. + libcall libcall + libcallpc uintptr // for cpu profiler + libcallsp uintptr + libcallg guintptr + syscall libcall // stores syscall parameters on windows + + vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call) + vdsoPC uintptr // PC for traceback while in VDSO call + + // preemptGen counts the number of completed preemption + // signals. This is used to detect when a preemption is + // requested, but fails. Accessed atomically. + preemptGen uint32 + + // Whether this is a pending preemption signal on this M. + // Accessed atomically. + signalPending uint32 + + dlogPerM + + mOS + + // Up to 10 locks held by this m, maintained by the lock ranking code. + locksHeldLen int + locksHeld [10]heldLockInfo +} + +type p struct { + id int32 + status uint32 // one of pidle/prunning/... + link puintptr + schedtick uint32 // incremented on every scheduler call + syscalltick uint32 // incremented on every system call + sysmontick sysmontick // last tick observed by sysmon + m muintptr // back-link to associated m (nil if idle) + mcache *mcache + pcache pageCache + raceprocctx uintptr + + deferpool []*_defer // pool of available defer structs (see panic.go) + deferpoolbuf [32]*_defer + + // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen. 
+ goidcache uint64 + goidcacheend uint64 + + // Queue of runnable goroutines. Accessed without lock. + runqhead uint32 + runqtail uint32 + runq [256]guintptr + // runnext, if non-nil, is a runnable G that was ready'd by + // the current G and should be run next instead of what's in + // runq if there's time remaining in the running G's time + // slice. It will inherit the time left in the current time + // slice. If a set of goroutines is locked in a + // communicate-and-wait pattern, this schedules that set as a + // unit and eliminates the (potentially large) scheduling + // latency that otherwise arises from adding the ready'd + // goroutines to the end of the run queue. + // + // Note that while other P's may atomically CAS this to zero, + // only the owner P can CAS it to a valid G. + runnext guintptr + + // Available G's (status == Gdead) + gFree struct { + gList + n int32 + } + + sudogcache []*sudog + sudogbuf [128]*sudog + + // Cache of mspan objects from the heap. + mspancache struct { + // We need an explicit length here because this field is used + // in allocation codepaths where write barriers are not allowed, + // and eliminating the write barrier/keeping it eliminated from + // slice updates is tricky, moreso than just managing the length + // ourselves. + len int + buf [128]*mspan + } + + tracebuf traceBufPtr + + // traceSweep indicates the sweep events should be traced. + // This is used to defer the sweep start event until a span + // has actually been swept. + traceSweep bool + // traceSwept and traceReclaimed track the number of bytes + // swept and reclaimed by sweeping in the current sweep loop. + traceSwept, traceReclaimed uintptr + + palloc persistentAlloc // per-P to avoid mutex + + _ uint32 // Alignment for atomic fields below + + // The when field of the first entry on the timer heap. + // This is updated using atomic functions. + // This is 0 if the timer heap is empty. + timer0When uint64 + + // The earliest known nextwhen field of a timer with + // timerModifiedEarlier status. Because the timer may have been + // modified again, there need not be any timer with this value. + // This is updated using atomic functions. + // This is 0 if there are no timerModifiedEarlier timers. + timerModifiedEarliest uint64 + + // Per-P GC state + gcAssistTime int64 // Nanoseconds in assistAlloc + gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic) + + // gcMarkWorkerMode is the mode for the next mark worker to run in. + // That is, this is used to communicate with the worker goroutine + // selected for immediate execution by + // gcController.findRunnableGCWorker. When scheduling other goroutines, + // this field must be set to gcMarkWorkerNotWorker. + gcMarkWorkerMode gcMarkWorkerMode + // gcMarkWorkerStartTime is the nanotime() at which the most recent + // mark worker started. + gcMarkWorkerStartTime int64 + + // gcw is this P's GC work buffer cache. The work buffer is + // filled by write barriers, drained by mutator assists, and + // disposed on certain GC state transitions. + gcw gcWork + + // wbBuf is this P's GC write barrier buffer. + // + // TODO: Consider caching this in the running G. + wbBuf wbBuf + + runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point + + // statsSeq is a counter indicating whether this P is currently + // writing any stats. Its value is even when not, odd when it is. + statsSeq uint32 + + // Lock for timers. 
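// Illustrative sketch of the runqhead/runqtail scheme above: head and tail
// are monotonically increasing counters and the slot index is the counter
// modulo the power-of-two buffer size, so entries never need to move. This
// standalone model omits the atomics, the runnext slot, and work stealing;
// it only shows the data layout, not the runtime's implementation.
package main

import "fmt"

type ring struct {
	head, tail uint32
	buf        [8]int // the runtime uses [256]guintptr
}

func (r *ring) push(v int) bool {
	if r.tail-r.head == uint32(len(r.buf)) {
		return false // full; the runtime would spill half to the global queue
	}
	r.buf[r.tail%uint32(len(r.buf))] = v
	r.tail++
	return true
}

func (r *ring) pop() (int, bool) {
	if r.head == r.tail {
		return 0, false // empty
	}
	v := r.buf[r.head%uint32(len(r.buf))]
	r.head++
	return v, true
}

func main() {
	var r ring
	for i := 1; i <= 5; i++ {
		r.push(i)
	}
	for v, ok := r.pop(); ok; v, ok = r.pop() {
		fmt.Print(v, " ") // 1 2 3 4 5
	}
	fmt.Println()
}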
We normally access the timers while running + // on this P, but the scheduler can also do it from a different P. + timersLock mutex + + // Actions to take at some time. This is used to implement the + // standard library's time package. + // Must hold timersLock to access. + timers []*timer + + // Number of timers in P's heap. + // Modified using atomic instructions. + numTimers uint32 + + // Number of timerDeleted timers in P's heap. + // Modified using atomic instructions. + deletedTimers uint32 + + // Race context used while executing timer functions. + timerRaceCtx uintptr + + // scannableStackSizeDelta accumulates the amount of stack space held by + // live goroutines (i.e. those eligible for stack scanning). + // Flushed to gcController.scannableStackSize once scannableStackSizeSlack + // or -scannableStackSizeSlack is reached. + scannableStackSizeDelta int64 + + // preempt is set to indicate that this P should be enter the + // scheduler ASAP (regardless of what G is running on it). + preempt bool + + // Padding is no longer needed. False sharing is now not a worry because p is large enough + // that its size class is an integer multiple of the cache line size (for any of our architectures). +} + +type schedt struct { + // accessed atomically. keep at top to ensure alignment on 32-bit systems. + goidgen uint64 + lastpoll uint64 // time of last network poll, 0 if currently polling + pollUntil uint64 // time to which current poll is sleeping + + lock mutex + + // When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be + // sure to call checkdead(). + + midle muintptr // idle m's waiting for work + nmidle int32 // number of idle m's waiting for work + nmidlelocked int32 // number of locked m's waiting for work + mnext int64 // number of m's that have been created and next M ID + maxmcount int32 // maximum number of m's allowed (or die) + nmsys int32 // number of system m's not counted for deadlock + nmfreed int64 // cumulative number of freed m's + + ngsys uint32 // number of system goroutines; updated atomically + + pidle puintptr // idle p's + npidle uint32 + nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go. + + // Global runnable queue. + runq gQueue + runqsize int32 + + // disable controls selective disabling of the scheduler. + // + // Use schedEnableUser to control this. + // + // disable is protected by sched.lock. + disable struct { + // user disables scheduling of user goroutines. + user bool + runnable gQueue // pending runnable Gs + n int32 // length of runnable + } + + // Global cache of dead G's. + gFree struct { + lock mutex + stack gList // Gs with stacks + noStack gList // Gs without stacks + n int32 + } + + // Central cache of sudog structs. + sudoglock mutex + sudogcache *sudog + + // Central pool of available defer structs. + deferlock mutex + deferpool *_defer + + // freem is the list of m's waiting to be freed when their + // m.exited is set. Linked through m.freelink. + freem *m + + gcwaiting uint32 // gc is waiting to run + stopwait int32 + stopnote note + sysmonwait uint32 + sysmonnote note + + // safepointFn should be called on each P at the next GC + // safepoint if p.runSafePointFn is set. + safePointFn func(*p) + safePointWait int32 + safePointNote note + + profilehz int32 // cpu profiling rate + + procresizetime int64 // nanotime() of last change to gomaxprocs + totaltime int64 // ∫gomaxprocs dt up to procresizetime + + // sysmonlock protects sysmon's actions on the runtime. 
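// Illustrative sketch of how a running integral like totaltime above
// (∫gomaxprocs dt) can be maintained: each time the value changes, the old
// value times the elapsed interval is added to the total and the change
// time is recorded. gomaxprocsIntegral is a hypothetical standalone type,
// not the runtime's code.
package main

import "fmt"

type gomaxprocsIntegral struct {
	current    int64 // current gomaxprocs value
	lastChange int64 // time of the last change (role of procresizetime)
	total      int64 // ∫ value dt up to lastChange (role of totaltime)
}

// resize records a change of the value at time now.
func (g *gomaxprocsIntegral) resize(now, newValue int64) {
	g.total += g.current * (now - g.lastChange)
	g.lastChange = now
	g.current = newValue
}

func main() {
	g := gomaxprocsIntegral{current: 4, lastChange: 0}
	g.resize(10, 8)      // ran with 4 procs for 10 time units
	g.resize(15, 2)      // ran with 8 procs for 5 time units
	fmt.Println(g.total) // 4*10 + 8*5 = 80
}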
+ // + // Acquire and hold this mutex to block sysmon from interacting + // with the rest of the runtime. + sysmonlock mutex + + // timeToRun is a distribution of scheduling latencies, defined + // as the sum of time a G spends in the _Grunnable state before + // it transitions to _Grunning. + // + // timeToRun is protected by sched.lock. + timeToRun timeHistogram +} + +// Values for the flags field of a sigTabT. +const ( + _SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel + _SigKill // if signal.Notify doesn't take it, exit quietly + _SigThrow // if signal.Notify doesn't take it, exit loudly + _SigPanic // if the signal is from the kernel, panic + _SigDefault // if the signal isn't explicitly requested, don't monitor it + _SigGoExit // cause all runtime procs to exit (only used on Plan 9). + _SigSetStack // Don't explicitly install handler, but add SA_ONSTACK to existing libc handler + _SigUnblock // always unblock; see blockableSig + _SigIgn // _SIG_DFL action is to ignore the signal +) + +// Layout of in-memory per-function information prepared by linker +// See https://golang.org/s/go12symtab. +// Keep in sync with linker (../cmd/link/internal/ld/pcln.go:/pclntab) +// and with package debug/gosym and with symtab.go in package runtime. +type _func struct { + entryoff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart + nameoff int32 // function name + + args int32 // in/out args size + deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any. + + pcsp uint32 + pcfile uint32 + pcln uint32 + npcdata uint32 + cuOffset uint32 // runtime.cutab offset of this function's CU + funcID funcID // set for certain special runtime functions + flag funcFlag + _ [1]byte // pad + nfuncdata uint8 // must be last, must end on a uint32-aligned boundary +} + +// Pseudo-Func that is returned for PCs that occur in inlined code. +// A *Func can be either a *_func or a *funcinl, and they are distinguished +// by the first uintptr. +type funcinl struct { + ones uint32 // set to ^0 to distinguish from _func + entry uintptr // entry of the real (the "outermost") frame + name string + file string + line int +} + +// layout of Itab known to compilers +// allocated in non-garbage-collected memory +// Needs to be in sync with +// ../cmd/compile/internal/reflectdata/reflect.go:/^func.WriteTabs. +type itab struct { + inter *interfacetype + _type *_type + hash uint32 // copy of _type.hash. Used for type switches. + _ [4]byte + fun [1]uintptr // variable sized. fun[0]==0 means _type does not implement inter. +} + +// Lock-free stack node. +// Also known to export_test.go. +type lfnode struct { + next uint64 + pushcnt uintptr +} + +type forcegcstate struct { + lock mutex + g *g + idle uint32 +} + +// extendRandom extends the random numbers in r[:n] to the whole slice r. +// Treats n<0 as n==0. +func extendRandom(r []byte, n int) { + if n < 0 { + n = 0 + } + for n < len(r) { + // Extend random bits using hash function & time seed + w := n + if w > 16 { + w = 16 + } + h := memhash(unsafe.Pointer(&r[n-w]), uintptr(nanotime()), uintptr(w)) + for i := 0; i < goarch.PtrSize && n < len(r); i++ { + r[n] = byte(h) + n++ + h >>= 8 + } + } +} + +// A _defer holds an entry on the list of deferred calls. +// If you add a field here, add code to clear it in deferProcStack. +// This struct must match the code in cmd/compile/internal/ssagen/ssa.go:deferstruct +// and cmd/compile/internal/ssagen/ssa.go:(*state).call. 
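// Illustrative sketch of the 1<<iota flag constants above: each signal's
// sigtable entry combines several single-bit flags, and signal-handling
// code tests individual bits with a bitwise AND. The names below mirror the
// constants above, but this is a standalone model, not runtime code.
package main

import "fmt"

const (
	sigNotify = 1 << iota // 1
	sigKill               // 2
	sigThrow              // 4
	sigPanic              // 8
)

func main() {
	flags := sigNotify + sigKill      // e.g. a SIGINT-style table entry
	fmt.Println(flags&sigNotify != 0) // true: deliverable via signal.Notify
	fmt.Println(flags&sigPanic != 0)  // false: not a fault signal
}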
+// Some defers will be allocated on the stack and some on the heap. +// All defers are logically part of the stack, so write barriers to +// initialize them are not required. All defers must be manually scanned, +// and for heap defers, marked. +type _defer struct { + started bool + heap bool + // openDefer indicates that this _defer is for a frame with open-coded + // defers. We have only one defer record for the entire frame (which may + // currently have 0, 1, or more defers active). + openDefer bool + sp uintptr // sp at time of defer + pc uintptr // pc at time of defer + fn func() // can be nil for open-coded defers + _panic *_panic // panic that is running defer + link *_defer // next defer on G; can point to either heap or stack! + + // If openDefer is true, the fields below record values about the stack + // frame and associated function that has the open-coded defer(s). sp + // above will be the sp for the frame, and pc will be address of the + // deferreturn call in the function. + fd unsafe.Pointer // funcdata for the function associated with the frame + varp uintptr // value of varp for the stack frame + // framepc is the current pc associated with the stack frame. Together, + // with sp above (which is the sp associated with the stack frame), + // framepc/sp can be used as pc/sp pair to continue a stack trace via + // gentraceback(). + framepc uintptr +} + +// A _panic holds information about an active panic. +// +// A _panic value must only ever live on the stack. +// +// The argp and link fields are stack pointers, but don't need special +// handling during stack growth: because they are pointer-typed and +// _panic values only live on the stack, regular stack pointer +// adjustment takes care of them. +type _panic struct { + argp unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink + arg any // argument to panic + link *_panic // link to earlier panic + pc uintptr // where to return to in runtime if this panic is bypassed + sp unsafe.Pointer // where to return to in runtime if this panic is bypassed + recovered bool // whether this panic is over + aborted bool // the panic was aborted + goexit bool +} + +// stack traces +type stkframe struct { + fn funcInfo // function being run + pc uintptr // program counter within fn + continpc uintptr // program counter where execution can continue, or 0 if not + lr uintptr // program counter at caller aka link register + sp uintptr // stack pointer at pc + fp uintptr // stack pointer at caller aka frame pointer + varp uintptr // top of local variables + argp uintptr // pointer to function arguments + arglen uintptr // number of bytes at argp + argmap *bitvector // force use of this argmap +} + +// ancestorInfo records details of where a goroutine was started. +type ancestorInfo struct { + pcs []uintptr // pcs from the stack of this goroutine + goid int64 // goroutine id of this goroutine; original goroutine possibly dead + gopc uintptr // pc of go statement that created this goroutine +} + +const ( + _TraceRuntimeFrames = 1 << iota // include frames for internal runtime functions. + _TraceTrap // the initial PC, SP are from a trap, not a return PC from a call + _TraceJumpStack // if traceback is on a systemstack, resume trace at g that called into it +) + +// The maximum number of frames we print for a traceback +const _TracebackMaxFrames = 100 + +// A waitReason explains why a goroutine has been stopped. +// See gopark. Do not re-use waitReasons, add new ones. 
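// Usage-level illustration of the _defer/_panic records described above:
// every deferred call is represented by a _defer linked on the goroutine,
// and a running panic by a _panic; recover marks the active panic as
// recovered. This is an ordinary Go program, shown only to connect the
// structures to the language feature they implement.
package main

import "fmt"

func risky() (err error) {
	defer func() { // backed by a _defer record (stack, heap, or open-coded)
		if r := recover(); r != nil { // flips the active _panic's recovered flag
			err = fmt.Errorf("recovered: %v", r)
		}
	}()
	panic("boom") // creates a _panic record linked on the goroutine
}

func main() {
	fmt.Println(risky()) // recovered: boom
}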
+type waitReason uint8 + +const ( + waitReasonZero waitReason = iota // "" + waitReasonGCAssistMarking // "GC assist marking" + waitReasonIOWait // "IO wait" + waitReasonChanReceiveNilChan // "chan receive (nil chan)" + waitReasonChanSendNilChan // "chan send (nil chan)" + waitReasonDumpingHeap // "dumping heap" + waitReasonGarbageCollection // "garbage collection" + waitReasonGarbageCollectionScan // "garbage collection scan" + waitReasonPanicWait // "panicwait" + waitReasonSelect // "select" + waitReasonSelectNoCases // "select (no cases)" + waitReasonGCAssistWait // "GC assist wait" + waitReasonGCSweepWait // "GC sweep wait" + waitReasonGCScavengeWait // "GC scavenge wait" + waitReasonChanReceive // "chan receive" + waitReasonChanSend // "chan send" + waitReasonFinalizerWait // "finalizer wait" + waitReasonForceGCIdle // "force gc (idle)" + waitReasonSemacquire // "semacquire" + waitReasonSleep // "sleep" + waitReasonSyncCondWait // "sync.Cond.Wait" + waitReasonTimerGoroutineIdle // "timer goroutine (idle)" + waitReasonTraceReaderBlocked // "trace reader (blocked)" + waitReasonWaitForGCCycle // "wait for GC cycle" + waitReasonGCWorkerIdle // "GC worker (idle)" + waitReasonPreempted // "preempted" + waitReasonDebugCall // "debug call" +) + +var waitReasonStrings = [...]string{ + waitReasonZero: "", + waitReasonGCAssistMarking: "GC assist marking", + waitReasonIOWait: "IO wait", + waitReasonChanReceiveNilChan: "chan receive (nil chan)", + waitReasonChanSendNilChan: "chan send (nil chan)", + waitReasonDumpingHeap: "dumping heap", + waitReasonGarbageCollection: "garbage collection", + waitReasonGarbageCollectionScan: "garbage collection scan", + waitReasonPanicWait: "panicwait", + waitReasonSelect: "select", + waitReasonSelectNoCases: "select (no cases)", + waitReasonGCAssistWait: "GC assist wait", + waitReasonGCSweepWait: "GC sweep wait", + waitReasonGCScavengeWait: "GC scavenge wait", + waitReasonChanReceive: "chan receive", + waitReasonChanSend: "chan send", + waitReasonFinalizerWait: "finalizer wait", + waitReasonForceGCIdle: "force gc (idle)", + waitReasonSemacquire: "semacquire", + waitReasonSleep: "sleep", + waitReasonSyncCondWait: "sync.Cond.Wait", + waitReasonTimerGoroutineIdle: "timer goroutine (idle)", + waitReasonTraceReaderBlocked: "trace reader (blocked)", + waitReasonWaitForGCCycle: "wait for GC cycle", + waitReasonGCWorkerIdle: "GC worker (idle)", + waitReasonPreempted: "preempted", + waitReasonDebugCall: "debug call", +} + +func (w waitReason) String() string { + if w < 0 || w >= waitReason(len(waitReasonStrings)) { + return "unknown wait reason" + } + return waitReasonStrings[w] +} + +var ( + allm *m + gomaxprocs int32 + ncpu int32 + forcegc forcegcstate + sched schedt + newprocs int32 + + // allpLock protects P-less reads and size changes of allp, idlepMask, + // and timerpMask, and all writes to allp. + allpLock mutex + // len(allp) == gomaxprocs; may change at safe points, otherwise + // immutable. + allp []*p + // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must + // be atomic. Length may change at safe points. + // + // Each P must update only its own bit. In order to maintain + // consistency, a P going idle must the idle mask simultaneously with + // updates to the idle P list under the sched.lock, otherwise a racing + // pidleget may clear the mask before pidleput sets the mask, + // corrupting the bitmap. + // + // N.B., procresize takes ownership of all Ps in stopTheWorldWithSema. 
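// Illustrative sketch of the one-bit-per-P masks above (idlepMask,
// timerpMask): bit id%32 of word id/32 corresponds to P id. The runtime's
// pMask type uses atomic loads and stores; this standalone model with a
// hypothetical mask type shows only the indexing.
package main

import "fmt"

type mask []uint32

func (m mask) set(id int)       { m[id/32] |= 1 << (id % 32) }
func (m mask) clear(id int)     { m[id/32] &^= 1 << (id % 32) }
func (m mask) read(id int) bool { return m[id/32]&(1<<(id%32)) != 0 }

func main() {
	m := make(mask, (64+31)/32) // room for 64 Ps
	m.set(3)
	m.set(40)
	fmt.Println(m.read(3), m.read(40), m.read(5)) // true true false
	m.clear(3)
	fmt.Println(m.read(3)) // false
}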
+ idlepMask pMask + // Bitmask of Ps that may have a timer, one bit per P. Reads and writes + // must be atomic. Length may change at safe points. + timerpMask pMask + + // Pool of GC parked background workers. Entries are type + // *gcBgMarkWorkerNode. + gcBgMarkWorkerPool lfstack + + // Total number of gcBgMarkWorker goroutines. Protected by worldsema. + gcBgMarkWorkerCount int32 + + // Information about what cpu features are available. + // Packages outside the runtime should not use these + // as they are not an external api. + // Set on startup in asm_{386,amd64}.s + processorVersionInfo uint32 + isIntel bool + + goarm uint8 // set by cmd/link on arm systems +) + +// Set by the linker so the runtime can determine the buildmode. +var ( + islibrary bool // -buildmode=c-shared + isarchive bool // -buildmode=c-archive +) + +// Must agree with internal/buildcfg.Experiment.FramePointer. +const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64" diff --git a/contrib/go/_std_1.18/src/runtime/rwmutex.go b/contrib/go/_std_1.18/src/runtime/rwmutex.go new file mode 100644 index 0000000000..7713c3f1cc --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/rwmutex.go @@ -0,0 +1,125 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" +) + +// This is a copy of sync/rwmutex.go rewritten to work in the runtime. + +// A rwmutex is a reader/writer mutual exclusion lock. +// The lock can be held by an arbitrary number of readers or a single writer. +// This is a variant of sync.RWMutex, for the runtime package. +// Like mutex, rwmutex blocks the calling M. +// It does not interact with the goroutine scheduler. +type rwmutex struct { + rLock mutex // protects readers, readerPass, writer + readers muintptr // list of pending readers + readerPass uint32 // number of pending readers to skip readers list + + wLock mutex // serializes writers + writer muintptr // pending writer waiting for completing readers + + readerCount uint32 // number of pending readers + readerWait uint32 // number of departing readers +} + +const rwmutexMaxReaders = 1 << 30 + +// rlock locks rw for reading. +func (rw *rwmutex) rlock() { + // The reader must not be allowed to lose its P or else other + // things blocking on the lock may consume all of the Ps and + // deadlock (issue #20903). Alternatively, we could drop the P + // while sleeping. + acquirem() + if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 { + // A writer is pending. Park on the reader queue. + systemstack(func() { + lockWithRank(&rw.rLock, lockRankRwmutexR) + if rw.readerPass > 0 { + // Writer finished. + rw.readerPass -= 1 + unlock(&rw.rLock) + } else { + // Queue this reader to be woken by + // the writer. + m := getg().m + m.schedlink = rw.readers + rw.readers.set(m) + unlock(&rw.rLock) + notesleep(&m.park) + noteclear(&m.park) + } + }) + } +} + +// runlock undoes a single rlock call on rw. +func (rw *rwmutex) runlock() { + if r := int32(atomic.Xadd(&rw.readerCount, -1)); r < 0 { + if r+1 == 0 || r+1 == -rwmutexMaxReaders { + throw("runlock of unlocked rwmutex") + } + // A writer is pending. + if atomic.Xadd(&rw.readerWait, -1) == 0 { + // The last reader unblocks the writer. + lockWithRank(&rw.rLock, lockRankRwmutexR) + w := rw.writer.ptr() + if w != nil { + notewakeup(&w.park) + } + unlock(&rw.rLock) + } + } + releasem(getg().m) +} + +// lock locks rw for writing. 
+func (rw *rwmutex) lock() { + // Resolve competition with other writers and stick to our P. + lockWithRank(&rw.wLock, lockRankRwmutexW) + m := getg().m + // Announce that there is a pending writer. + r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders + // Wait for any active readers to complete. + lockWithRank(&rw.rLock, lockRankRwmutexR) + if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 { + // Wait for reader to wake us up. + systemstack(func() { + rw.writer.set(m) + unlock(&rw.rLock) + notesleep(&m.park) + noteclear(&m.park) + }) + } else { + unlock(&rw.rLock) + } +} + +// unlock unlocks rw for writing. +func (rw *rwmutex) unlock() { + // Announce to readers that there is no active writer. + r := int32(atomic.Xadd(&rw.readerCount, rwmutexMaxReaders)) + if r >= rwmutexMaxReaders { + throw("unlock of unlocked rwmutex") + } + // Unblock blocked readers. + lockWithRank(&rw.rLock, lockRankRwmutexR) + for rw.readers.ptr() != nil { + reader := rw.readers.ptr() + rw.readers = reader.schedlink + reader.schedlink.set(nil) + notewakeup(&reader.park) + r -= 1 + } + // If r > 0, there are pending readers that aren't on the + // queue. Tell them to skip waiting. + rw.readerPass += uint32(r) + unlock(&rw.rLock) + // Allow other writers to proceed. + unlock(&rw.wLock) +} diff --git a/contrib/go/_std_1.18/src/runtime/select.go b/contrib/go/_std_1.18/src/runtime/select.go new file mode 100644 index 0000000000..e18b2f14c0 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/select.go @@ -0,0 +1,633 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go select statements. + +import ( + "internal/abi" + "runtime/internal/atomic" + "unsafe" +) + +const debugSelect = false + +// Select case descriptor. +// Known to compiler. +// Changes here must also be made in src/cmd/compile/internal/walk/select.go's scasetype. +type scase struct { + c *hchan // chan + elem unsafe.Pointer // data element +} + +var ( + chansendpc = abi.FuncPCABIInternal(chansend) + chanrecvpc = abi.FuncPCABIInternal(chanrecv) +) + +func selectsetpc(pc *uintptr) { + *pc = getcallerpc() +} + +func sellock(scases []scase, lockorder []uint16) { + var c *hchan + for _, o := range lockorder { + c0 := scases[o].c + if c0 != c { + c = c0 + lock(&c.lock) + } + } +} + +func selunlock(scases []scase, lockorder []uint16) { + // We must be very careful here to not touch sel after we have unlocked + // the last lock, because sel can be freed right after the last unlock. + // Consider the following situation. + // First M calls runtime·park() in runtime·selectgo() passing the sel. + // Once runtime·park() has unlocked the last lock, another M makes + // the G that calls select runnable again and schedules it for execution. + // When the G runs on another M, it locks all the locks and frees sel. + // Now if the first M touches sel, it will access freed memory. + for i := len(lockorder) - 1; i >= 0; i-- { + c := scases[lockorder[i]].c + if i > 0 && c == scases[lockorder[i-1]].c { + continue // will unlock it on the next iteration + } + unlock(&c.lock) + } +} + +func selparkcommit(gp *g, _ unsafe.Pointer) bool { + // There are unlocked sudogs that point into gp's stack. Stack + // copying must lock the channels of those sudogs. 
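// The rwmutex in rwmutex.go above mirrors the exported sync.RWMutex but,
// per its own comment, blocks the calling M rather than the goroutine,
// which is what runtime-internal code needs. For comparison, a minimal use
// of the user-facing API it is modeled on:
package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.RWMutex
	data := map[string]int{}

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			mu.Lock() // writer: exclusive, like rwmutex.lock above
			data[fmt.Sprintf("k%d", i)] = i
			mu.Unlock()
		}(i)
	}
	wg.Wait()

	mu.RLock() // readers may hold the lock concurrently, like rwmutex.rlock
	fmt.Println(len(data)) // 4
	mu.RUnlock()
}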
+ // Set activeStackChans here instead of before we try parking + // because we could self-deadlock in stack growth on a + // channel lock. + gp.activeStackChans = true + // Mark that it's safe for stack shrinking to occur now, + // because any thread acquiring this G's stack for shrinking + // is guaranteed to observe activeStackChans after this store. + atomic.Store8(&gp.parkingOnChan, 0) + // Make sure we unlock after setting activeStackChans and + // unsetting parkingOnChan. The moment we unlock any of the + // channel locks we risk gp getting readied by a channel operation + // and so gp could continue running before everything before the + // unlock is visible (even to gp itself). + + // This must not access gp's stack (see gopark). In + // particular, it must not access the *hselect. That's okay, + // because by the time this is called, gp.waiting has all + // channels in lock order. + var lastc *hchan + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + if sg.c != lastc && lastc != nil { + // As soon as we unlock the channel, fields in + // any sudog with that channel may change, + // including c and waitlink. Since multiple + // sudogs may have the same channel, we unlock + // only after we've passed the last instance + // of a channel. + unlock(&lastc.lock) + } + lastc = sg.c + } + if lastc != nil { + unlock(&lastc.lock) + } + return true +} + +func block() { + gopark(nil, nil, waitReasonSelectNoCases, traceEvGoStop, 1) // forever +} + +// selectgo implements the select statement. +// +// cas0 points to an array of type [ncases]scase, and order0 points to +// an array of type [2*ncases]uint16 where ncases must be <= 65536. +// Both reside on the goroutine's stack (regardless of any escaping in +// selectgo). +// +// For race detector builds, pc0 points to an array of type +// [ncases]uintptr (also on the stack); for other builds, it's set to +// nil. +// +// selectgo returns the index of the chosen scase, which matches the +// ordinal position of its respective select{recv,send,default} call. +// Also, if the chosen scase was a receive operation, it reports whether +// a value was received. +func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, block bool) (int, bool) { + if debugSelect { + print("select: cas0=", cas0, "\n") + } + + // NOTE: In order to maintain a lean stack size, the number of scases + // is capped at 65536. + cas1 := (*[1 << 16]scase)(unsafe.Pointer(cas0)) + order1 := (*[1 << 17]uint16)(unsafe.Pointer(order0)) + + ncases := nsends + nrecvs + scases := cas1[:ncases:ncases] + pollorder := order1[:ncases:ncases] + lockorder := order1[ncases:][:ncases:ncases] + // NOTE: pollorder/lockorder's underlying array was not zero-initialized by compiler. + + // Even when raceenabled is true, there might be select + // statements in packages compiled without -race (e.g., + // ensureSigM in runtime/signal_unix.go). + var pcs []uintptr + if raceenabled && pc0 != nil { + pc1 := (*[1 << 16]uintptr)(unsafe.Pointer(pc0)) + pcs = pc1[:ncases:ncases] + } + casePC := func(casi int) uintptr { + if pcs == nil { + return 0 + } + return pcs[casi] + } + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + } + + // The compiler rewrites selects that statically have + // only 0 or 1 cases plus default into simpler constructs. + // The only way we can end up with such small sel.ncase + // values here is for a larger select in which most channels + // have been nilled out. 
The general code handles those + // cases correctly, and they are rare enough not to bother + // optimizing (and needing to test). + + // generate permuted order + norder := 0 + for i := range scases { + cas := &scases[i] + + // Omit cases without channels from the poll and lock orders. + if cas.c == nil { + cas.elem = nil // allow GC + continue + } + + j := fastrandn(uint32(norder + 1)) + pollorder[norder] = pollorder[j] + pollorder[j] = uint16(i) + norder++ + } + pollorder = pollorder[:norder] + lockorder = lockorder[:norder] + + // sort the cases by Hchan address to get the locking order. + // simple heap sort, to guarantee n log n time and constant stack footprint. + for i := range lockorder { + j := i + // Start with the pollorder to permute cases on the same channel. + c := scases[pollorder[i]].c + for j > 0 && scases[lockorder[(j-1)/2]].c.sortkey() < c.sortkey() { + k := (j - 1) / 2 + lockorder[j] = lockorder[k] + j = k + } + lockorder[j] = pollorder[i] + } + for i := len(lockorder) - 1; i >= 0; i-- { + o := lockorder[i] + c := scases[o].c + lockorder[i] = lockorder[0] + j := 0 + for { + k := j*2 + 1 + if k >= i { + break + } + if k+1 < i && scases[lockorder[k]].c.sortkey() < scases[lockorder[k+1]].c.sortkey() { + k++ + } + if c.sortkey() < scases[lockorder[k]].c.sortkey() { + lockorder[j] = lockorder[k] + j = k + continue + } + break + } + lockorder[j] = o + } + + if debugSelect { + for i := 0; i+1 < len(lockorder); i++ { + if scases[lockorder[i]].c.sortkey() > scases[lockorder[i+1]].c.sortkey() { + print("i=", i, " x=", lockorder[i], " y=", lockorder[i+1], "\n") + throw("select: broken sort") + } + } + } + + // lock all the channels involved in the select + sellock(scases, lockorder) + + var ( + gp *g + sg *sudog + c *hchan + k *scase + sglist *sudog + sgnext *sudog + qp unsafe.Pointer + nextp **sudog + ) + + // pass 1 - look for something already waiting + var casi int + var cas *scase + var caseSuccess bool + var caseReleaseTime int64 = -1 + var recvOK bool + for _, casei := range pollorder { + casi = int(casei) + cas = &scases[casi] + c = cas.c + + if casi >= nsends { + sg = c.sendq.dequeue() + if sg != nil { + goto recv + } + if c.qcount > 0 { + goto bufrecv + } + if c.closed != 0 { + goto rclose + } + } else { + if raceenabled { + racereadpc(c.raceaddr(), casePC(casi), chansendpc) + } + if c.closed != 0 { + goto sclose + } + sg = c.recvq.dequeue() + if sg != nil { + goto send + } + if c.qcount < c.dataqsiz { + goto bufsend + } + } + } + + if !block { + selunlock(scases, lockorder) + casi = -1 + goto retc + } + + // pass 2 - enqueue on all chans + gp = getg() + if gp.waiting != nil { + throw("gp.waiting != nil") + } + nextp = &gp.waiting + for _, casei := range lockorder { + casi = int(casei) + cas = &scases[casi] + c = cas.c + sg := acquireSudog() + sg.g = gp + sg.isSelect = true + // No stack splits between assigning elem and enqueuing + // sg on gp.waiting where copystack can find it. + sg.elem = cas.elem + sg.releasetime = 0 + if t0 != 0 { + sg.releasetime = -1 + } + sg.c = c + // Construct waiting list in lock order. + *nextp = sg + nextp = &sg.waitlink + + if casi < nsends { + c.sendq.enqueue(sg) + } else { + c.recvq.enqueue(sg) + } + } + + // wait for someone to wake us up + gp.param = nil + // Signal to anyone trying to shrink our stack that we're about + // to park on a channel. The window between when this G's status + // changes and when we set gp.activeStackChans is not safe for + // stack shrinking. 
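// Illustrative sketch of the poll-order generation above: inserting case i
// at a random position j <= i and moving the previous occupant to the end
// is the "inside-out" Fisher-Yates shuffle, so pollorder ends up as a
// uniformly random permutation of the case indices. Standalone model using
// math/rand in place of fastrandn.
package main

import (
	"fmt"
	"math/rand"
)

func shuffledOrder(n int) []uint16 {
	order := make([]uint16, n)
	for i := 0; i < n; i++ {
		j := rand.Intn(i + 1) // plays the role of fastrandn(uint32(norder+1))
		order[i] = order[j]
		order[j] = uint16(i)
	}
	return order
}

func main() {
	fmt.Println(shuffledOrder(8)) // e.g. [3 7 0 5 1 6 2 4]
}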
+ atomic.Store8(&gp.parkingOnChan, 1) + gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1) + gp.activeStackChans = false + + sellock(scases, lockorder) + + gp.selectDone = 0 + sg = (*sudog)(gp.param) + gp.param = nil + + // pass 3 - dequeue from unsuccessful chans + // otherwise they stack up on quiet channels + // record the successful case, if any. + // We singly-linked up the SudoGs in lock order. + casi = -1 + cas = nil + caseSuccess = false + sglist = gp.waiting + // Clear all elem before unlinking from gp.waiting. + for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { + sg1.isSelect = false + sg1.elem = nil + sg1.c = nil + } + gp.waiting = nil + + for _, casei := range lockorder { + k = &scases[casei] + if sg == sglist { + // sg has already been dequeued by the G that woke us up. + casi = int(casei) + cas = k + caseSuccess = sglist.success + if sglist.releasetime > 0 { + caseReleaseTime = sglist.releasetime + } + } else { + c = k.c + if int(casei) < nsends { + c.sendq.dequeueSudoG(sglist) + } else { + c.recvq.dequeueSudoG(sglist) + } + } + sgnext = sglist.waitlink + sglist.waitlink = nil + releaseSudog(sglist) + sglist = sgnext + } + + if cas == nil { + throw("selectgo: bad wakeup") + } + + c = cas.c + + if debugSelect { + print("wait-return: cas0=", cas0, " c=", c, " cas=", cas, " send=", casi < nsends, "\n") + } + + if casi < nsends { + if !caseSuccess { + goto sclose + } + } else { + recvOK = caseSuccess + } + + if raceenabled { + if casi < nsends { + raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc) + } else if cas.elem != nil { + raceWriteObjectPC(c.elemtype, cas.elem, casePC(casi), chanrecvpc) + } + } + if msanenabled { + if casi < nsends { + msanread(cas.elem, c.elemtype.size) + } else if cas.elem != nil { + msanwrite(cas.elem, c.elemtype.size) + } + } + if asanenabled { + if casi < nsends { + asanread(cas.elem, c.elemtype.size) + } else if cas.elem != nil { + asanwrite(cas.elem, c.elemtype.size) + } + } + + selunlock(scases, lockorder) + goto retc + +bufrecv: + // can receive from buffer + if raceenabled { + if cas.elem != nil { + raceWriteObjectPC(c.elemtype, cas.elem, casePC(casi), chanrecvpc) + } + racenotify(c, c.recvx, nil) + } + if msanenabled && cas.elem != nil { + msanwrite(cas.elem, c.elemtype.size) + } + if asanenabled && cas.elem != nil { + asanwrite(cas.elem, c.elemtype.size) + } + recvOK = true + qp = chanbuf(c, c.recvx) + if cas.elem != nil { + typedmemmove(c.elemtype, cas.elem, qp) + } + typedmemclr(c.elemtype, qp) + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.qcount-- + selunlock(scases, lockorder) + goto retc + +bufsend: + // can send to buffer + if raceenabled { + racenotify(c, c.sendx, nil) + raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc) + } + if msanenabled { + msanread(cas.elem, c.elemtype.size) + } + if asanenabled { + asanread(cas.elem, c.elemtype.size) + } + typedmemmove(c.elemtype, chanbuf(c, c.sendx), cas.elem) + c.sendx++ + if c.sendx == c.dataqsiz { + c.sendx = 0 + } + c.qcount++ + selunlock(scases, lockorder) + goto retc + +recv: + // can receive from sleeping sender (sg) + recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2) + if debugSelect { + print("syncrecv: cas0=", cas0, " c=", c, "\n") + } + recvOK = true + goto retc + +rclose: + // read at end of closed channel + selunlock(scases, lockorder) + recvOK = false + if cas.elem != nil { + typedmemclr(c.elemtype, cas.elem) + } + if raceenabled { + raceacquire(c.raceaddr()) + } + goto retc + +send: + // can send 
to a sleeping receiver (sg) + if raceenabled { + raceReadObjectPC(c.elemtype, cas.elem, casePC(casi), chansendpc) + } + if msanenabled { + msanread(cas.elem, c.elemtype.size) + } + if asanenabled { + asanread(cas.elem, c.elemtype.size) + } + send(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2) + if debugSelect { + print("syncsend: cas0=", cas0, " c=", c, "\n") + } + goto retc + +retc: + if caseReleaseTime > 0 { + blockevent(caseReleaseTime-t0, 1) + } + return casi, recvOK + +sclose: + // send on closed channel + selunlock(scases, lockorder) + panic(plainError("send on closed channel")) +} + +func (c *hchan) sortkey() uintptr { + return uintptr(unsafe.Pointer(c)) +} + +// A runtimeSelect is a single case passed to rselect. +// This must match ../reflect/value.go:/runtimeSelect +type runtimeSelect struct { + dir selectDir + typ unsafe.Pointer // channel type (not used here) + ch *hchan // channel + val unsafe.Pointer // ptr to data (SendDir) or ptr to receive buffer (RecvDir) +} + +// These values must match ../reflect/value.go:/SelectDir. +type selectDir int + +const ( + _ selectDir = iota + selectSend // case Chan <- Send + selectRecv // case <-Chan: + selectDefault // default +) + +//go:linkname reflect_rselect reflect.rselect +func reflect_rselect(cases []runtimeSelect) (int, bool) { + if len(cases) == 0 { + block() + } + sel := make([]scase, len(cases)) + orig := make([]int, len(cases)) + nsends, nrecvs := 0, 0 + dflt := -1 + for i, rc := range cases { + var j int + switch rc.dir { + case selectDefault: + dflt = i + continue + case selectSend: + j = nsends + nsends++ + case selectRecv: + nrecvs++ + j = len(cases) - nrecvs + } + + sel[j] = scase{c: rc.ch, elem: rc.val} + orig[j] = i + } + + // Only a default case. + if nsends+nrecvs == 0 { + return dflt, false + } + + // Compact sel and orig if necessary. + if nsends+nrecvs < len(cases) { + copy(sel[nsends:], sel[len(cases)-nrecvs:]) + copy(orig[nsends:], orig[len(cases)-nrecvs:]) + } + + order := make([]uint16, 2*(nsends+nrecvs)) + var pc0 *uintptr + if raceenabled { + pcs := make([]uintptr, nsends+nrecvs) + for i := range pcs { + selectsetpc(&pcs[i]) + } + pc0 = &pcs[0] + } + + chosen, recvOK := selectgo(&sel[0], &order[0], pc0, nsends, nrecvs, dflt == -1) + + // Translate chosen back to caller's ordering. + if chosen < 0 { + chosen = dflt + } else { + chosen = orig[chosen] + } + return chosen, recvOK +} + +func (q *waitq) dequeueSudoG(sgp *sudog) { + x := sgp.prev + y := sgp.next + if x != nil { + if y != nil { + // middle of queue + x.next = y + y.prev = x + sgp.next = nil + sgp.prev = nil + return + } + // end of queue + x.next = nil + q.last = x + sgp.prev = nil + return + } + if y != nil { + // start of queue + y.prev = nil + q.first = y + sgp.next = nil + return + } + + // x==y==nil. Either sgp is the only element in the queue, + // or it has already been removed. Use q.first to disambiguate. + if q.first == sgp { + q.first = nil + q.last = nil + } +} diff --git a/contrib/go/_std_1.18/src/runtime/sema.go b/contrib/go/_std_1.18/src/runtime/sema.go new file mode 100644 index 0000000000..f94c1aa891 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sema.go @@ -0,0 +1,617 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Semaphore implementation exposed to Go. +// Intended use is provide a sleep and wakeup +// primitive that can be used in the contended case +// of other synchronization primitives. 
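// The reflect_rselect hook above backs reflect.Select, which lets a program
// select over a set of channels chosen at run time. A small usage example
// of that exported API:
package main

import (
	"fmt"
	"reflect"
)

func main() {
	chans := []chan int{make(chan int, 1), make(chan int, 1)}
	chans[1] <- 42 // make the second case ready

	cases := make([]reflect.SelectCase, len(chans))
	for i, ch := range chans {
		cases[i] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(ch)}
	}
	// Add a default case so Select does not block (dflt != -1 in rselect).
	cases = append(cases, reflect.SelectCase{Dir: reflect.SelectDefault})

	chosen, v, ok := reflect.Select(cases)
	fmt.Println(chosen, v, ok) // 1 42 true
}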
+// Thus it targets the same goal as Linux's futex, +// but it has much simpler semantics. +// +// That is, don't think of these as semaphores. +// Think of them as a way to implement sleep and wakeup +// such that every sleep is paired with a single wakeup, +// even if, due to races, the wakeup happens before the sleep. +// +// See Mullender and Cox, ``Semaphores in Plan 9,'' +// https://swtch.com/semaphore.pdf + +package runtime + +import ( + "internal/cpu" + "runtime/internal/atomic" + "unsafe" +) + +// Asynchronous semaphore for sync.Mutex. + +// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem). +// Each of those sudog may in turn point (through s.waitlink) to a list +// of other sudogs waiting on the same address. +// The operations on the inner lists of sudogs with the same address +// are all O(1). The scanning of the top-level semaRoot list is O(log n), +// where n is the number of distinct addresses with goroutines blocked +// on them that hash to the given semaRoot. +// See golang.org/issue/17953 for a program that worked badly +// before we introduced the second level of list, and test/locklinear.go +// for a test that exercises this. +type semaRoot struct { + lock mutex + treap *sudog // root of balanced tree of unique waiters. + nwait uint32 // Number of waiters. Read w/o the lock. +} + +// Prime to not correlate with any user patterns. +const semTabSize = 251 + +var semtable [semTabSize]struct { + root semaRoot + pad [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte +} + +//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire +func sync_runtime_Semacquire(addr *uint32) { + semacquire1(addr, false, semaBlockProfile, 0) +} + +//go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire +func poll_runtime_Semacquire(addr *uint32) { + semacquire1(addr, false, semaBlockProfile, 0) +} + +//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease +func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) { + semrelease1(addr, handoff, skipframes) +} + +//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex +func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) { + semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes) +} + +//go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease +func poll_runtime_Semrelease(addr *uint32) { + semrelease(addr) +} + +func readyWithTime(s *sudog, traceskip int) { + if s.releasetime != 0 { + s.releasetime = cputicks() + } + goready(s.g, traceskip) +} + +type semaProfileFlags int + +const ( + semaBlockProfile semaProfileFlags = 1 << iota + semaMutexProfile +) + +// Called from runtime. +func semacquire(addr *uint32) { + semacquire1(addr, false, 0, 0) +} + +func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int) { + gp := getg() + if gp != gp.m.curg { + throw("semacquire not on the G stack") + } + + // Easy case. 
+ if cansemacquire(addr) { + return + } + + // Harder case: + // increment waiter count + // try cansemacquire one more time, return if succeeded + // enqueue itself as a waiter + // sleep + // (waiter descriptor is dequeued by signaler) + s := acquireSudog() + root := semroot(addr) + t0 := int64(0) + s.releasetime = 0 + s.acquiretime = 0 + s.ticket = 0 + if profile&semaBlockProfile != 0 && blockprofilerate > 0 { + t0 = cputicks() + s.releasetime = -1 + } + if profile&semaMutexProfile != 0 && mutexprofilerate > 0 { + if t0 == 0 { + t0 = cputicks() + } + s.acquiretime = t0 + } + for { + lockWithRank(&root.lock, lockRankRoot) + // Add ourselves to nwait to disable "easy case" in semrelease. + atomic.Xadd(&root.nwait, 1) + // Check cansemacquire to avoid missed wakeup. + if cansemacquire(addr) { + atomic.Xadd(&root.nwait, -1) + unlock(&root.lock) + break + } + // Any semrelease after the cansemacquire knows we're waiting + // (we set nwait above), so go to sleep. + root.queue(addr, s, lifo) + goparkunlock(&root.lock, waitReasonSemacquire, traceEvGoBlockSync, 4+skipframes) + if s.ticket != 0 || cansemacquire(addr) { + break + } + } + if s.releasetime > 0 { + blockevent(s.releasetime-t0, 3+skipframes) + } + releaseSudog(s) +} + +func semrelease(addr *uint32) { + semrelease1(addr, false, 0) +} + +func semrelease1(addr *uint32, handoff bool, skipframes int) { + root := semroot(addr) + atomic.Xadd(addr, 1) + + // Easy case: no waiters? + // This check must happen after the xadd, to avoid a missed wakeup + // (see loop in semacquire). + if atomic.Load(&root.nwait) == 0 { + return + } + + // Harder case: search for a waiter and wake it. + lockWithRank(&root.lock, lockRankRoot) + if atomic.Load(&root.nwait) == 0 { + // The count is already consumed by another goroutine, + // so no need to wake up another goroutine. + unlock(&root.lock) + return + } + s, t0 := root.dequeue(addr) + if s != nil { + atomic.Xadd(&root.nwait, -1) + } + unlock(&root.lock) + if s != nil { // May be slow or even yield, so unlock first + acquiretime := s.acquiretime + if acquiretime != 0 { + mutexevent(t0-acquiretime, 3+skipframes) + } + if s.ticket != 0 { + throw("corrupted semaphore ticket") + } + if handoff && cansemacquire(addr) { + s.ticket = 1 + } + readyWithTime(s, 5+skipframes) + if s.ticket == 1 && getg().m.locks == 0 { + // Direct G handoff + // readyWithTime has added the waiter G as runnext in the + // current P; we now call the scheduler so that we start running + // the waiter G immediately. + // Note that waiter inherits our time slice: this is desirable + // to avoid having a highly contended semaphore hog the P + // indefinitely. goyield is like Gosched, but it emits a + // "preempted" trace event instead and, more importantly, puts + // the current G on the local runq instead of the global one. + // We only do this in the starving regime (handoff=true), as in + // the non-starving case it is possible for a different waiter + // to acquire the semaphore while we are yielding/scheduling, + // and this would be wasteful. We wait instead to enter starving + // regime, and then we start to do direct handoffs of ticket and + // P. + // See issue 33747 for discussion. 
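// Illustrative sketch of the cansemacquire pattern used in the "easy case"
// above: atomically decrement the counter only if it is positive, retrying
// on CAS failure. Standalone model using sync/atomic; tryAcquire is a
// hypothetical name, not a runtime function.
package main

import (
	"fmt"
	"sync/atomic"
)

func tryAcquire(addr *uint32) bool {
	for {
		v := atomic.LoadUint32(addr)
		if v == 0 {
			return false // nothing to take; the caller must queue and sleep
		}
		if atomic.CompareAndSwapUint32(addr, v, v-1) {
			return true
		}
		// CAS lost a race with another acquirer or a releaser; reload and retry.
	}
}

func main() {
	sema := uint32(1)
	fmt.Println(tryAcquire(&sema)) // true: took the unit
	fmt.Println(tryAcquire(&sema)) // false: counter is now 0
}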
+ goyield() + } + } +} + +func semroot(addr *uint32) *semaRoot { + return &semtable[(uintptr(unsafe.Pointer(addr))>>3)%semTabSize].root +} + +func cansemacquire(addr *uint32) bool { + for { + v := atomic.Load(addr) + if v == 0 { + return false + } + if atomic.Cas(addr, v, v-1) { + return true + } + } +} + +// queue adds s to the blocked goroutines in semaRoot. +func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { + s.g = getg() + s.elem = unsafe.Pointer(addr) + s.next = nil + s.prev = nil + + var last *sudog + pt := &root.treap + for t := *pt; t != nil; t = *pt { + if t.elem == unsafe.Pointer(addr) { + // Already have addr in list. + if lifo { + // Substitute s in t's place in treap. + *pt = s + s.ticket = t.ticket + s.acquiretime = t.acquiretime + s.parent = t.parent + s.prev = t.prev + s.next = t.next + if s.prev != nil { + s.prev.parent = s + } + if s.next != nil { + s.next.parent = s + } + // Add t first in s's wait list. + s.waitlink = t + s.waittail = t.waittail + if s.waittail == nil { + s.waittail = t + } + t.parent = nil + t.prev = nil + t.next = nil + t.waittail = nil + } else { + // Add s to end of t's wait list. + if t.waittail == nil { + t.waitlink = s + } else { + t.waittail.waitlink = s + } + t.waittail = s + s.waitlink = nil + } + return + } + last = t + if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) { + pt = &t.prev + } else { + pt = &t.next + } + } + + // Add s as new leaf in tree of unique addrs. + // The balanced tree is a treap using ticket as the random heap priority. + // That is, it is a binary tree ordered according to the elem addresses, + // but then among the space of possible binary trees respecting those + // addresses, it is kept balanced on average by maintaining a heap ordering + // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket. + // https://en.wikipedia.org/wiki/Treap + // https://faculty.washington.edu/aragon/pubs/rst89.pdf + // + // s.ticket compared with zero in couple of places, therefore set lowest bit. + // It will not affect treap's quality noticeably. + s.ticket = fastrand() | 1 + s.parent = last + *pt = s + + // Rotate up into tree according to ticket (priority). + for s.parent != nil && s.parent.ticket > s.ticket { + if s.parent.prev == s { + root.rotateRight(s.parent) + } else { + if s.parent.next != s { + panic("semaRoot queue") + } + root.rotateLeft(s.parent) + } + } +} + +// dequeue searches for and finds the first goroutine +// in semaRoot blocked on addr. +// If the sudog was being profiled, dequeue returns the time +// at which it was woken up as now. Otherwise now is 0. +func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) { + ps := &root.treap + s := *ps + for ; s != nil; s = *ps { + if s.elem == unsafe.Pointer(addr) { + goto Found + } + if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) { + ps = &s.prev + } else { + ps = &s.next + } + } + return nil, 0 + +Found: + now = int64(0) + if s.acquiretime != 0 { + now = cputicks() + } + if t := s.waitlink; t != nil { + // Substitute t, also waiting on addr, for s in root tree of unique addrs. + *ps = t + t.ticket = s.ticket + t.parent = s.parent + t.prev = s.prev + if t.prev != nil { + t.prev.parent = t + } + t.next = s.next + if t.next != nil { + t.next.parent = t + } + if t.waitlink != nil { + t.waittail = s.waittail + } else { + t.waittail = nil + } + t.acquiretime = now + s.waitlink = nil + s.waittail = nil + } else { + // Rotate s down to be leaf of tree for removal, respecting priorities. 
+ for s.next != nil || s.prev != nil { + if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket { + root.rotateRight(s) + } else { + root.rotateLeft(s) + } + } + // Remove s, now a leaf. + if s.parent != nil { + if s.parent.prev == s { + s.parent.prev = nil + } else { + s.parent.next = nil + } + } else { + root.treap = nil + } + } + s.parent = nil + s.elem = nil + s.next = nil + s.prev = nil + s.ticket = 0 + return s, now +} + +// rotateLeft rotates the tree rooted at node x. +// turning (x a (y b c)) into (y (x a b) c). +func (root *semaRoot) rotateLeft(x *sudog) { + // p -> (x a (y b c)) + p := x.parent + y := x.next + b := y.prev + + y.prev = x + x.parent = y + x.next = b + if b != nil { + b.parent = x + } + + y.parent = p + if p == nil { + root.treap = y + } else if p.prev == x { + p.prev = y + } else { + if p.next != x { + throw("semaRoot rotateLeft") + } + p.next = y + } +} + +// rotateRight rotates the tree rooted at node y. +// turning (y (x a b) c) into (x a (y b c)). +func (root *semaRoot) rotateRight(y *sudog) { + // p -> (y (x a b) c) + p := y.parent + x := y.prev + b := x.next + + x.next = y + y.parent = x + y.prev = b + if b != nil { + b.parent = y + } + + x.parent = p + if p == nil { + root.treap = x + } else if p.prev == y { + p.prev = x + } else { + if p.next != y { + throw("semaRoot rotateRight") + } + p.next = x + } +} + +// notifyList is a ticket-based notification list used to implement sync.Cond. +// +// It must be kept in sync with the sync package. +type notifyList struct { + // wait is the ticket number of the next waiter. It is atomically + // incremented outside the lock. + wait uint32 + + // notify is the ticket number of the next waiter to be notified. It can + // be read outside the lock, but is only written to with lock held. + // + // Both wait & notify can wrap around, and such cases will be correctly + // handled as long as their "unwrapped" difference is bounded by 2^31. + // For this not to be the case, we'd need to have 2^31+ goroutines + // blocked on the same condvar, which is currently not possible. + notify uint32 + + // List of parked waiters. + lock mutex + head *sudog + tail *sudog +} + +// less checks if a < b, considering a & b running counts that may overflow the +// 32-bit range, and that their "unwrapped" difference is always less than 2^31. +func less(a, b uint32) bool { + return int32(a-b) < 0 +} + +// notifyListAdd adds the caller to a notify list such that it can receive +// notifications. The caller must eventually call notifyListWait to wait for +// such a notification, passing the returned ticket number. +//go:linkname notifyListAdd sync.runtime_notifyListAdd +func notifyListAdd(l *notifyList) uint32 { + // This may be called concurrently, for example, when called from + // sync.Cond.Wait while holding a RWMutex in read mode. + return atomic.Xadd(&l.wait, 1) - 1 +} + +// notifyListWait waits for a notification. If one has been sent since +// notifyListAdd was called, it returns immediately. Otherwise, it blocks. +//go:linkname notifyListWait sync.runtime_notifyListWait +func notifyListWait(l *notifyList, t uint32) { + lockWithRank(&l.lock, lockRankNotifyList) + + // Return right away if this ticket has already been notified. + if less(t, l.notify) { + unlock(&l.lock) + return + } + + // Enqueue itself. 
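// Illustrative check of the less() wraparound comparison above: because the
// comparison is made on the signed difference, a ticket counter that wraps
// past 2^32 still compares correctly as long as the true distance between
// the two values stays under 2^31.
package main

import "fmt"

func less(a, b uint32) bool { return int32(a-b) < 0 }

func main() {
	fmt.Println(less(1, 2))          // true: 1 is earlier than 2
	fmt.Println(less(0xFFFFFFFF, 3)) // true: the counter wrapped, so 0xFFFFFFFF is "earlier"
	fmt.Println(less(3, 0xFFFFFFFF)) // false
}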
+ s := acquireSudog() + s.g = getg() + s.ticket = t + s.releasetime = 0 + t0 := int64(0) + if blockprofilerate > 0 { + t0 = cputicks() + s.releasetime = -1 + } + if l.tail == nil { + l.head = s + } else { + l.tail.next = s + } + l.tail = s + goparkunlock(&l.lock, waitReasonSyncCondWait, traceEvGoBlockCond, 3) + if t0 != 0 { + blockevent(s.releasetime-t0, 2) + } + releaseSudog(s) +} + +// notifyListNotifyAll notifies all entries in the list. +//go:linkname notifyListNotifyAll sync.runtime_notifyListNotifyAll +func notifyListNotifyAll(l *notifyList) { + // Fast-path: if there are no new waiters since the last notification + // we don't need to acquire the lock. + if atomic.Load(&l.wait) == atomic.Load(&l.notify) { + return + } + + // Pull the list out into a local variable, waiters will be readied + // outside the lock. + lockWithRank(&l.lock, lockRankNotifyList) + s := l.head + l.head = nil + l.tail = nil + + // Update the next ticket to be notified. We can set it to the current + // value of wait because any previous waiters are already in the list + // or will notice that they have already been notified when trying to + // add themselves to the list. + atomic.Store(&l.notify, atomic.Load(&l.wait)) + unlock(&l.lock) + + // Go through the local list and ready all waiters. + for s != nil { + next := s.next + s.next = nil + readyWithTime(s, 4) + s = next + } +} + +// notifyListNotifyOne notifies one entry in the list. +//go:linkname notifyListNotifyOne sync.runtime_notifyListNotifyOne +func notifyListNotifyOne(l *notifyList) { + // Fast-path: if there are no new waiters since the last notification + // we don't need to acquire the lock at all. + if atomic.Load(&l.wait) == atomic.Load(&l.notify) { + return + } + + lockWithRank(&l.lock, lockRankNotifyList) + + // Re-check under the lock if we need to do anything. + t := l.notify + if t == atomic.Load(&l.wait) { + unlock(&l.lock) + return + } + + // Update the next notify ticket number. + atomic.Store(&l.notify, t+1) + + // Try to find the g that needs to be notified. + // If it hasn't made it to the list yet we won't find it, + // but it won't park itself once it sees the new notify number. + // + // This scan looks linear but essentially always stops quickly. + // Because g's queue separately from taking numbers, + // there may be minor reorderings in the list, but we + // expect the g we're looking for to be near the front. + // The g has others in front of it on the list only to the + // extent that it lost the race, so the iteration will not + // be too long. This applies even when the g is missing: + // it hasn't yet gotten to sleep and has lost the race to + // the (few) other g's that we find on the list. 
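// The notifyList functions above are the runtime side of sync.Cond:
// notifyListAdd and notifyListWait back Cond.Wait, while notifyListNotifyOne
// and notifyListNotifyAll back Cond.Signal and Cond.Broadcast (see the
// go:linkname directives). A minimal usage example of that exported API:
package main

import (
	"fmt"
	"sync"
)

func main() {
	var (
		mu    sync.Mutex
		cond  = sync.NewCond(&mu)
		ready bool
	)

	done := make(chan struct{})
	go func() {
		mu.Lock()
		for !ready { // re-check the condition after every wakeup
			cond.Wait() // notifyListAdd + notifyListWait under the hood
		}
		mu.Unlock()
		close(done)
	}()

	mu.Lock()
	ready = true
	mu.Unlock()
	cond.Signal() // notifyListNotifyOne
	<-done
	fmt.Println("woken")
}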
+ for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next { + if s.ticket == t { + n := s.next + if p != nil { + p.next = n + } else { + l.head = n + } + if n == nil { + l.tail = p + } + unlock(&l.lock) + s.next = nil + readyWithTime(s, 4) + return + } + } + unlock(&l.lock) +} + +//go:linkname notifyListCheck sync.runtime_notifyListCheck +func notifyListCheck(sz uintptr) { + if sz != unsafe.Sizeof(notifyList{}) { + print("runtime: bad notifyList size - sync=", sz, " runtime=", unsafe.Sizeof(notifyList{}), "\n") + throw("bad notifyList size") + } +} + +//go:linkname sync_nanotime sync.runtime_nanotime +func sync_nanotime() int64 { + return nanotime() +} diff --git a/contrib/go/_std_1.18/src/runtime/signal_amd64.go b/contrib/go/_std_1.18/src/runtime/signal_amd64.go new file mode 100644 index 0000000000..8ade208836 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/signal_amd64.go @@ -0,0 +1,87 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 && (darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris) + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "unsafe" +) + +func dumpregs(c *sigctxt) { + print("rax ", hex(c.rax()), "\n") + print("rbx ", hex(c.rbx()), "\n") + print("rcx ", hex(c.rcx()), "\n") + print("rdx ", hex(c.rdx()), "\n") + print("rdi ", hex(c.rdi()), "\n") + print("rsi ", hex(c.rsi()), "\n") + print("rbp ", hex(c.rbp()), "\n") + print("rsp ", hex(c.rsp()), "\n") + print("r8 ", hex(c.r8()), "\n") + print("r9 ", hex(c.r9()), "\n") + print("r10 ", hex(c.r10()), "\n") + print("r11 ", hex(c.r11()), "\n") + print("r12 ", hex(c.r12()), "\n") + print("r13 ", hex(c.r13()), "\n") + print("r14 ", hex(c.r14()), "\n") + print("r15 ", hex(c.r15()), "\n") + print("rip ", hex(c.rip()), "\n") + print("rflags ", hex(c.rflags()), "\n") + print("cs ", hex(c.cs()), "\n") + print("fs ", hex(c.fs()), "\n") + print("gs ", hex(c.gs()), "\n") +} + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) sigpc() uintptr { return uintptr(c.rip()) } + +func (c *sigctxt) setsigpc(x uint64) { c.set_rip(x) } +func (c *sigctxt) sigsp() uintptr { return uintptr(c.rsp()) } +func (c *sigctxt) siglr() uintptr { return 0 } +func (c *sigctxt) fault() uintptr { return uintptr(c.sigaddr()) } + +// preparePanic sets up the stack to look like a call to sigpanic. +func (c *sigctxt) preparePanic(sig uint32, gp *g) { + // Work around Leopard bug that doesn't set FPE_INTDIV. + // Look at instruction to see if it is a divide. + // Not necessary in Snow Leopard (si_code will be != 0). + if GOOS == "darwin" && sig == _SIGFPE && gp.sigcode0 == 0 { + pc := (*[4]byte)(unsafe.Pointer(gp.sigpc)) + i := 0 + if pc[i]&0xF0 == 0x40 { // 64-bit REX prefix + i++ + } else if pc[i] == 0x66 { // 16-bit instruction prefix + i++ + } + if pc[i] == 0xF6 || pc[i] == 0xF7 { + gp.sigcode0 = _FPE_INTDIV + } + } + + pc := uintptr(c.rip()) + sp := uintptr(c.rsp()) + + // In case we are panicking from external code, we need to initialize + // Go special registers. We inject sigpanic0 (instead of sigpanic), + // which takes care of that. + if shouldPushSigpanic(gp, pc, *(*uintptr)(unsafe.Pointer(sp))) { + c.pushCall(abi.FuncPCABI0(sigpanic0), pc) + } else { + // Not safe to push the call. Just clobber the frame. + c.set_rip(uint64(abi.FuncPCABI0(sigpanic0))) + } +} + +func (c *sigctxt) pushCall(targetPC, resumePC uintptr) { + // Make it look like we called target at resumePC. 
+ sp := uintptr(c.rsp()) + sp -= goarch.PtrSize + *(*uintptr)(unsafe.Pointer(sp)) = resumePC + c.set_rsp(uint64(sp)) + c.set_rip(uint64(targetPC)) +} diff --git a/contrib/go/_std_1.18/src/runtime/signal_darwin.go b/contrib/go/_std_1.18/src/runtime/signal_darwin.go new file mode 100644 index 0000000000..8090fb22a5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/signal_darwin.go @@ -0,0 +1,40 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +var sigtable = [...]sigTabT{ + /* 0 */ {0, "SIGNONE: no trap"}, + /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, + /* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"}, + /* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"}, + /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"}, + /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"}, + /* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"}, + /* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"}, + /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"}, + /* 9 */ {0, "SIGKILL: kill"}, + /* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"}, + /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"}, + /* 12 */ {_SigThrow, "SIGSYS: bad system call"}, + /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, + /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, + /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, + /* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, + /* 17 */ {0, "SIGSTOP: stop"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"}, + /* 20 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGIO: i/o now possible"}, + /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, + /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, + /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, + /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify + _SigIgn, "SIGINFO: status request from keyboard"}, + /* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, + /* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, +} diff --git a/contrib/go/_std_1.18/src/runtime/signal_darwin_amd64.go b/contrib/go/_std_1.18/src/runtime/signal_darwin_amd64.go new file mode 100644 index 0000000000..abc212ad51 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/signal_darwin_amd64.go @@ -0,0 +1,92 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
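The sigtable entries in the files above and below combine behaviour flags with "+". That works because each _Sig* flag is a distinct bit (1<<iota in the runtime), so addition and bitwise OR coincide and handlers can test entries with "&". A standalone sketch of the same scheme (separate from the diff; the constants here are local stand-ins, not the runtime's):

package main

import "fmt"

// Local stand-ins with the same shape as the runtime's _Sig* flags: distinct bits.
const (
	sigNotify = 1 << iota
	sigKill
	sigThrow
	sigPanic
	sigIgn
)

func main() {
	flags := sigNotify + sigKill            // e.g. the SIGHUP / SIGINT entries
	fmt.Println(flags == sigNotify|sigKill) // true: + equals | for distinct bits
	fmt.Println(flags&sigNotify != 0)       // true: deliverable via os/signal
	fmt.Println(flags&sigPanic != 0)        // false: not a faulting signal
}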
+ +package runtime + +import "unsafe" + +type sigctxt struct { + info *siginfo + ctxt unsafe.Pointer +} + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) regs() *regs64 { return &(*ucontext)(c.ctxt).uc_mcontext.ss } + +func (c *sigctxt) rax() uint64 { return c.regs().rax } +func (c *sigctxt) rbx() uint64 { return c.regs().rbx } +func (c *sigctxt) rcx() uint64 { return c.regs().rcx } +func (c *sigctxt) rdx() uint64 { return c.regs().rdx } +func (c *sigctxt) rdi() uint64 { return c.regs().rdi } +func (c *sigctxt) rsi() uint64 { return c.regs().rsi } +func (c *sigctxt) rbp() uint64 { return c.regs().rbp } +func (c *sigctxt) rsp() uint64 { return c.regs().rsp } +func (c *sigctxt) r8() uint64 { return c.regs().r8 } +func (c *sigctxt) r9() uint64 { return c.regs().r9 } +func (c *sigctxt) r10() uint64 { return c.regs().r10 } +func (c *sigctxt) r11() uint64 { return c.regs().r11 } +func (c *sigctxt) r12() uint64 { return c.regs().r12 } +func (c *sigctxt) r13() uint64 { return c.regs().r13 } +func (c *sigctxt) r14() uint64 { return c.regs().r14 } +func (c *sigctxt) r15() uint64 { return c.regs().r15 } + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) rip() uint64 { return c.regs().rip } + +func (c *sigctxt) rflags() uint64 { return c.regs().rflags } +func (c *sigctxt) cs() uint64 { return c.regs().cs } +func (c *sigctxt) fs() uint64 { return c.regs().fs } +func (c *sigctxt) gs() uint64 { return c.regs().gs } +func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) } +func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr } + +func (c *sigctxt) set_rip(x uint64) { c.regs().rip = x } +func (c *sigctxt) set_rsp(x uint64) { c.regs().rsp = x } +func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) } +func (c *sigctxt) set_sigaddr(x uint64) { c.info.si_addr = x } + +//go:nosplit +func (c *sigctxt) fixsigcode(sig uint32) { + switch sig { + case _SIGTRAP: + // OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs, + // leaving no way to distinguish a breakpoint-induced SIGTRAP + // from an asynchronous signal SIGTRAP. + // They all look breakpoint-induced by default. + // Try looking at the code to see if it's a breakpoint. + // The assumption is that we're very unlikely to get an + // asynchronous SIGTRAP at just the moment that the + // PC started to point at unmapped memory. + pc := uintptr(c.rip()) + // OS X will leave the pc just after the INT 3 instruction. + // INT 3 is usually 1 byte, but there is a 2-byte form. + code := (*[2]byte)(unsafe.Pointer(pc - 2)) + if code[1] != 0xCC && (code[0] != 0xCD || code[1] != 3) { + // SIGTRAP on something other than INT 3. + c.set_sigcode(_SI_USER) + } + + case _SIGSEGV: + // x86-64 has 48-bit virtual addresses. The top 16 bits must echo bit 47. + // The hardware delivers a different kind of fault for a malformed address + // than it does for an attempt to access a valid but unmapped address. + // OS X 10.9.2 mishandles the malformed address case, making it look like + // a user-generated signal (like someone ran kill -SEGV ourpid). + // We pass user-generated signals to os/signal, or else ignore them. + // Doing that here - and returning to the faulting code - results in an + // infinite loop. It appears the best we can do is rewrite what the kernel + // delivers into something more like the truth. 
The address used below + // has very little chance of being the one that caused the fault, but it is + // malformed, it is clearly not a real pointer, and if it does get printed + // in real life, people will probably search for it and find this code. + // There are no Google hits for b01dfacedebac1e or 0xb01dfacedebac1e + // as I type this comment. + if c.sigcode() == _SI_USER { + c.set_sigcode(_SI_USER + 1) + c.set_sigaddr(0xb01dfacedebac1e) + } + } +} diff --git a/contrib/go/_std_1.18/src/runtime/signal_linux_amd64.go b/contrib/go/_std_1.18/src/runtime/signal_linux_amd64.go new file mode 100644 index 0000000000..573b118397 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/signal_linux_amd64.go @@ -0,0 +1,56 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +type sigctxt struct { + info *siginfo + ctxt unsafe.Pointer +} + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) regs() *sigcontext { + return (*sigcontext)(unsafe.Pointer(&(*ucontext)(c.ctxt).uc_mcontext)) +} + +func (c *sigctxt) rax() uint64 { return c.regs().rax } +func (c *sigctxt) rbx() uint64 { return c.regs().rbx } +func (c *sigctxt) rcx() uint64 { return c.regs().rcx } +func (c *sigctxt) rdx() uint64 { return c.regs().rdx } +func (c *sigctxt) rdi() uint64 { return c.regs().rdi } +func (c *sigctxt) rsi() uint64 { return c.regs().rsi } +func (c *sigctxt) rbp() uint64 { return c.regs().rbp } +func (c *sigctxt) rsp() uint64 { return c.regs().rsp } +func (c *sigctxt) r8() uint64 { return c.regs().r8 } +func (c *sigctxt) r9() uint64 { return c.regs().r9 } +func (c *sigctxt) r10() uint64 { return c.regs().r10 } +func (c *sigctxt) r11() uint64 { return c.regs().r11 } +func (c *sigctxt) r12() uint64 { return c.regs().r12 } +func (c *sigctxt) r13() uint64 { return c.regs().r13 } +func (c *sigctxt) r14() uint64 { return c.regs().r14 } +func (c *sigctxt) r15() uint64 { return c.regs().r15 } + +//go:nosplit +//go:nowritebarrierrec +func (c *sigctxt) rip() uint64 { return c.regs().rip } + +func (c *sigctxt) rflags() uint64 { return c.regs().eflags } +func (c *sigctxt) cs() uint64 { return uint64(c.regs().cs) } +func (c *sigctxt) fs() uint64 { return uint64(c.regs().fs) } +func (c *sigctxt) gs() uint64 { return uint64(c.regs().gs) } +func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) } +func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr } + +func (c *sigctxt) set_rip(x uint64) { c.regs().rip = x } +func (c *sigctxt) set_rsp(x uint64) { c.regs().rsp = x } +func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) } +func (c *sigctxt) set_sigaddr(x uint64) { + *(*uintptr)(add(unsafe.Pointer(c.info), 2*goarch.PtrSize)) = uintptr(x) +} diff --git a/contrib/go/_std_1.18/src/runtime/signal_unix.go b/contrib/go/_std_1.18/src/runtime/signal_unix.go new file mode 100644 index 0000000000..5cb51d10ba --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/signal_unix.go @@ -0,0 +1,1320 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris + +package runtime + +import ( + "internal/abi" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// sigTabT is the type of an entry in the global sigtable array. 
+// sigtable is inherently system dependent, and appears in OS-specific files, +// but sigTabT is the same for all Unixy systems. +// The sigtable array is indexed by a system signal number to get the flags +// and printable name of each signal. +type sigTabT struct { + flags int32 + name string +} + +//go:linkname os_sigpipe os.sigpipe +func os_sigpipe() { + systemstack(sigpipe) +} + +func signame(sig uint32) string { + if sig >= uint32(len(sigtable)) { + return "" + } + return sigtable[sig].name +} + +const ( + _SIG_DFL uintptr = 0 + _SIG_IGN uintptr = 1 +) + +// sigPreempt is the signal used for non-cooperative preemption. +// +// There's no good way to choose this signal, but there are some +// heuristics: +// +// 1. It should be a signal that's passed-through by debuggers by +// default. On Linux, this is SIGALRM, SIGURG, SIGCHLD, SIGIO, +// SIGVTALRM, SIGPROF, and SIGWINCH, plus some glibc-internal signals. +// +// 2. It shouldn't be used internally by libc in mixed Go/C binaries +// because libc may assume it's the only thing that can handle these +// signals. For example SIGCANCEL or SIGSETXID. +// +// 3. It should be a signal that can happen spuriously without +// consequences. For example, SIGALRM is a bad choice because the +// signal handler can't tell if it was caused by the real process +// alarm or not (arguably this means the signal is broken, but I +// digress). SIGUSR1 and SIGUSR2 are also bad because those are often +// used in meaningful ways by applications. +// +// 4. We need to deal with platforms without real-time signals (like +// macOS), so those are out. +// +// We use SIGURG because it meets all of these criteria, is extremely +// unlikely to be used by an application for its "real" meaning (both +// because out-of-band data is basically unused and because SIGURG +// doesn't report which socket has the condition, making it pretty +// useless), and even if it is, the application has to be ready for +// spurious SIGURG. SIGIO wouldn't be a bad choice either, but is more +// likely to be used for real. +const sigPreempt = _SIGURG + +// Stores the signal handlers registered before Go installed its own. +// These signal handlers will be invoked in cases where Go doesn't want to +// handle a particular signal (e.g., signal occurred on a non-Go thread). +// See sigfwdgo for more information on when the signals are forwarded. +// +// This is read by the signal handler; accesses should use +// atomic.Loaduintptr and atomic.Storeuintptr. +var fwdSig [_NSIG]uintptr + +// handlingSig is indexed by signal number and is non-zero if we are +// currently handling the signal. Or, to put it another way, whether +// the signal handler is currently set to the Go signal handler or not. +// This is uint32 rather than bool so that we can use atomic instructions. +var handlingSig [_NSIG]uint32 + +// channels for synchronizing signal mask updates with the signal mask +// thread +var ( + disableSigChan chan uint32 + enableSigChan chan uint32 + maskUpdatedChan chan struct{} +) + +func init() { + // _NSIG is the number of signals on this operating system. + // sigtable should describe what to do for all the possible signals. + if len(sigtable) != _NSIG { + print("runtime: len(sigtable)=", len(sigtable), " _NSIG=", _NSIG, "\n") + throw("bad sigtable len") + } +} + +var signalsOK bool + +// Initialize signals. +// Called by libpreinit so runtime may not be initialized. 
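As the sigPreempt comment above notes, SIGURG doubles as the runtime's preemption signal, so a program that subscribes to it through os/signal has to tolerate spurious deliveries. A minimal Unix-only sketch of that (separate from the diff; it assumes nothing beyond os/signal and syscall):

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	ch := make(chan os.Signal, 16)
	signal.Notify(ch, syscall.SIGURG)

	// Spin for a while; the runtime may preempt this loop with SIGURG,
	// and such preemption signals can also show up on ch.
	n := 0
	for deadline := time.Now().Add(200 * time.Millisecond); time.Now().Before(deadline); {
		n++
	}
	_ = n

	select {
	case s := <-ch:
		fmt.Println("observed", s, "- possibly just a preemption signal")
	default:
		fmt.Println("no SIGURG observed this run")
	}
}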
+//go:nosplit +//go:nowritebarrierrec +func initsig(preinit bool) { + if !preinit { + // It's now OK for signal handlers to run. + signalsOK = true + } + + // For c-archive/c-shared this is called by libpreinit with + // preinit == true. + if (isarchive || islibrary) && !preinit { + return + } + + for i := uint32(0); i < _NSIG; i++ { + t := &sigtable[i] + if t.flags == 0 || t.flags&_SigDefault != 0 { + continue + } + + // We don't need to use atomic operations here because + // there shouldn't be any other goroutines running yet. + fwdSig[i] = getsig(i) + + if !sigInstallGoHandler(i) { + // Even if we are not installing a signal handler, + // set SA_ONSTACK if necessary. + if fwdSig[i] != _SIG_DFL && fwdSig[i] != _SIG_IGN { + setsigstack(i) + } else if fwdSig[i] == _SIG_IGN { + sigInitIgnored(i) + } + continue + } + + handlingSig[i] = 1 + setsig(i, abi.FuncPCABIInternal(sighandler)) + } +} + +//go:nosplit +//go:nowritebarrierrec +func sigInstallGoHandler(sig uint32) bool { + // For some signals, we respect an inherited SIG_IGN handler + // rather than insist on installing our own default handler. + // Even these signals can be fetched using the os/signal package. + switch sig { + case _SIGHUP, _SIGINT: + if atomic.Loaduintptr(&fwdSig[sig]) == _SIG_IGN { + return false + } + } + + if (GOOS == "linux" || GOOS == "android") && !iscgo && sig == sigPerThreadSyscall { + // sigPerThreadSyscall is the same signal used by glibc for + // per-thread syscalls on Linux. We use it for the same purpose + // in non-cgo binaries. + return true + } + + t := &sigtable[sig] + if t.flags&_SigSetStack != 0 { + return false + } + + // When built using c-archive or c-shared, only install signal + // handlers for synchronous signals and SIGPIPE and sigPreempt. + if (isarchive || islibrary) && t.flags&_SigPanic == 0 && sig != _SIGPIPE && sig != sigPreempt { + return false + } + + return true +} + +// sigenable enables the Go signal handler to catch the signal sig. +// It is only called while holding the os/signal.handlers lock, +// via os/signal.enableSignal and signal_enable. +func sigenable(sig uint32) { + if sig >= uint32(len(sigtable)) { + return + } + + // SIGPROF is handled specially for profiling. + if sig == _SIGPROF { + return + } + + t := &sigtable[sig] + if t.flags&_SigNotify != 0 { + ensureSigM() + enableSigChan <- sig + <-maskUpdatedChan + if atomic.Cas(&handlingSig[sig], 0, 1) { + atomic.Storeuintptr(&fwdSig[sig], getsig(sig)) + setsig(sig, abi.FuncPCABIInternal(sighandler)) + } + } +} + +// sigdisable disables the Go signal handler for the signal sig. +// It is only called while holding the os/signal.handlers lock, +// via os/signal.disableSignal and signal_disable. +func sigdisable(sig uint32) { + if sig >= uint32(len(sigtable)) { + return + } + + // SIGPROF is handled specially for profiling. + if sig == _SIGPROF { + return + } + + t := &sigtable[sig] + if t.flags&_SigNotify != 0 { + ensureSigM() + disableSigChan <- sig + <-maskUpdatedChan + + // If initsig does not install a signal handler for a + // signal, then to go back to the state before Notify + // we should remove the one we installed. + if !sigInstallGoHandler(sig) { + atomic.Store(&handlingSig[sig], 0) + setsig(sig, atomic.Loaduintptr(&fwdSig[sig])) + } + } +} + +// sigignore ignores the signal sig. +// It is only called while holding the os/signal.handlers lock, +// via os/signal.ignoreSignal and signal_ignore. 
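sigenable, sigdisable and sigignore are reached from the os/signal package through the runtime_* linknames defined in sigqueue.go further down in this diff. Roughly, the user-facing side looks like the following sketch (illustrative only, separate from the diff):

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, syscall.SIGINT, syscall.SIGTERM) // ends up in sigenable for each signal
	defer signal.Stop(c)                              // drops back through sigdisable once unused

	signal.Ignore(syscall.SIGHUP) // routed to sigignore: disposition becomes _SIG_IGN

	fmt.Println("pid", os.Getpid(), "- waiting for SIGINT or SIGTERM")
	fmt.Println("got", <-c)
}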
+func sigignore(sig uint32) { + if sig >= uint32(len(sigtable)) { + return + } + + // SIGPROF is handled specially for profiling. + if sig == _SIGPROF { + return + } + + t := &sigtable[sig] + if t.flags&_SigNotify != 0 { + atomic.Store(&handlingSig[sig], 0) + setsig(sig, _SIG_IGN) + } +} + +// clearSignalHandlers clears all signal handlers that are not ignored +// back to the default. This is called by the child after a fork, so that +// we can enable the signal mask for the exec without worrying about +// running a signal handler in the child. +//go:nosplit +//go:nowritebarrierrec +func clearSignalHandlers() { + for i := uint32(0); i < _NSIG; i++ { + if atomic.Load(&handlingSig[i]) != 0 { + setsig(i, _SIG_DFL) + } + } +} + +// setProcessCPUProfilerTimer is called when the profiling timer changes. +// It is called with prof.signalLock held. hz is the new timer, and is 0 if +// profiling is being disabled. Enable or disable the signal as +// required for -buildmode=c-archive. +func setProcessCPUProfilerTimer(hz int32) { + if hz != 0 { + // Enable the Go signal handler if not enabled. + if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) { + h := getsig(_SIGPROF) + // If no signal handler was installed before, then we record + // _SIG_IGN here. When we turn off profiling (below) we'll start + // ignoring SIGPROF signals. We do this, rather than change + // to SIG_DFL, because there may be a pending SIGPROF + // signal that has not yet been delivered to some other thread. + // If we change to SIG_DFL when turning off profiling, the + // program will crash when that SIGPROF is delivered. We assume + // that programs that use profiling don't want to crash on a + // stray SIGPROF. See issue 19320. + // We do the change here instead of when turning off profiling, + // because there we may race with a signal handler running + // concurrently, in particular, sigfwdgo may observe _SIG_DFL and + // die. See issue 43828. + if h == _SIG_DFL { + h = _SIG_IGN + } + atomic.Storeuintptr(&fwdSig[_SIGPROF], h) + setsig(_SIGPROF, abi.FuncPCABIInternal(sighandler)) + } + + var it itimerval + it.it_interval.tv_sec = 0 + it.it_interval.set_usec(1000000 / hz) + it.it_value = it.it_interval + setitimer(_ITIMER_PROF, &it, nil) + } else { + setitimer(_ITIMER_PROF, &itimerval{}, nil) + + // If the Go signal handler should be disabled by default, + // switch back to the signal handler that was installed + // when we enabled profiling. We don't try to handle the case + // of a program that changes the SIGPROF handler while Go + // profiling is enabled. + if !sigInstallGoHandler(_SIGPROF) { + if atomic.Cas(&handlingSig[_SIGPROF], 1, 0) { + h := atomic.Loaduintptr(&fwdSig[_SIGPROF]) + setsig(_SIGPROF, h) + } + } + } +} + +// setThreadCPUProfilerHz makes any thread-specific changes required to +// implement profiling at a rate of hz. +// No changes required on Unix systems when using setitimer. +func setThreadCPUProfilerHz(hz int32) { + getg().m.profilehz = hz +} + +func sigpipe() { + if signal_ignored(_SIGPIPE) || sigsend(_SIGPIPE) { + return + } + dieFromSignal(_SIGPIPE) +} + +// doSigPreempt handles a preemption signal on gp. +func doSigPreempt(gp *g, ctxt *sigctxt) { + // Check if this G wants to be preempted and is safe to + // preempt. + if wantAsyncPreempt(gp) { + if ok, newpc := isAsyncSafePoint(gp, ctxt.sigpc(), ctxt.sigsp(), ctxt.siglr()); ok { + // Adjust the PC and inject a call to asyncPreempt. + ctxt.pushCall(abi.FuncPCABI0(asyncPreempt), newpc) + } + } + + // Acknowledge the preemption. 
+ atomic.Xadd(&gp.m.preemptGen, 1) + atomic.Store(&gp.m.signalPending, 0) + + if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, -1) + } +} + +const preemptMSupported = true + +// preemptM sends a preemption request to mp. This request may be +// handled asynchronously and may be coalesced with other requests to +// the M. When the request is received, if the running G or P are +// marked for preemption and the goroutine is at an asynchronous +// safe-point, it will preempt the goroutine. It always atomically +// increments mp.preemptGen after handling a preemption request. +func preemptM(mp *m) { + // On Darwin, don't try to preempt threads during exec. + // Issue #41702. + if GOOS == "darwin" || GOOS == "ios" { + execLock.rlock() + } + + if atomic.Cas(&mp.signalPending, 0, 1) { + if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, 1) + } + + // If multiple threads are preempting the same M, it may send many + // signals to the same M such that it hardly make progress, causing + // live-lock problem. Apparently this could happen on darwin. See + // issue #37741. + // Only send a signal if there isn't already one pending. + signalM(mp, sigPreempt) + } + + if GOOS == "darwin" || GOOS == "ios" { + execLock.runlock() + } +} + +// sigFetchG fetches the value of G safely when running in a signal handler. +// On some architectures, the g value may be clobbered when running in a VDSO. +// See issue #32912. +// +//go:nosplit +func sigFetchG(c *sigctxt) *g { + switch GOARCH { + case "arm", "arm64", "ppc64", "ppc64le", "riscv64": + if !iscgo && inVDSOPage(c.sigpc()) { + // When using cgo, we save the g on TLS and load it from there + // in sigtramp. Just use that. + // Otherwise, before making a VDSO call we save the g to the + // bottom of the signal stack. Fetch from there. + // TODO: in efence mode, stack is sysAlloc'd, so this wouldn't + // work. + sp := getcallersp() + s := spanOf(sp) + if s != nil && s.state.get() == mSpanManual && s.base() < sp && sp < s.limit { + gp := *(**g)(unsafe.Pointer(s.base())) + return gp + } + return nil + } + } + return getg() +} + +// sigtrampgo is called from the signal handler function, sigtramp, +// written in assembly code. +// This is called by the signal handler, and the world may be stopped. +// +// It must be nosplit because getg() is still the G that was running +// (if any) when the signal was delivered, but it's (usually) called +// on the gsignal stack. Until this switches the G to gsignal, the +// stack bounds check won't work. +// +//go:nosplit +//go:nowritebarrierrec +func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) { + if sigfwdgo(sig, info, ctx) { + return + } + c := &sigctxt{info, ctx} + g := sigFetchG(c) + setg(g) + if g == nil { + if sig == _SIGPROF { + // Some platforms (Linux) have per-thread timers, which we use in + // combination with the process-wide timer. Avoid double-counting. + if validSIGPROF(nil, c) { + sigprofNonGoPC(c.sigpc()) + } + return + } + if sig == sigPreempt && preemptMSupported && debug.asyncpreemptoff == 0 { + // This is probably a signal from preemptM sent + // while executing Go code but received while + // executing non-Go code. + // We got past sigfwdgo, so we know that there is + // no non-Go signal handler for sigPreempt. + // The default behavior for sigPreempt is to ignore + // the signal, so badsignal will be a no-op anyway. 
+ if GOOS == "darwin" || GOOS == "ios" { + atomic.Xadd(&pendingPreemptSignals, -1) + } + return + } + c.fixsigcode(sig) + badsignal(uintptr(sig), c) + return + } + + setg(g.m.gsignal) + + // If some non-Go code called sigaltstack, adjust. + var gsignalStack gsignalStack + setStack := adjustSignalStack(sig, g.m, &gsignalStack) + if setStack { + g.m.gsignal.stktopsp = getcallersp() + } + + if g.stackguard0 == stackFork { + signalDuringFork(sig) + } + + c.fixsigcode(sig) + sighandler(sig, info, ctx, g) + setg(g) + if setStack { + restoreGsignalStack(&gsignalStack) + } +} + +// If the signal handler receives a SIGPROF signal on a non-Go thread, +// it tries to collect a traceback into sigprofCallers. +// sigprofCallersUse is set to non-zero while sigprofCallers holds a traceback. +var sigprofCallers cgoCallers +var sigprofCallersUse uint32 + +// sigprofNonGo is called if we receive a SIGPROF signal on a non-Go thread, +// and the signal handler collected a stack trace in sigprofCallers. +// When this is called, sigprofCallersUse will be non-zero. +// g is nil, and what we can do is very limited. +// +// It is called from the signal handling functions written in assembly code that +// are active for cgo programs, cgoSigtramp and sigprofNonGoWrapper, which have +// not verified that the SIGPROF delivery corresponds to the best available +// profiling source for this thread. +// +//go:nosplit +//go:nowritebarrierrec +func sigprofNonGo(sig uint32, info *siginfo, ctx unsafe.Pointer) { + if prof.hz != 0 { + c := &sigctxt{info, ctx} + // Some platforms (Linux) have per-thread timers, which we use in + // combination with the process-wide timer. Avoid double-counting. + if validSIGPROF(nil, c) { + n := 0 + for n < len(sigprofCallers) && sigprofCallers[n] != 0 { + n++ + } + cpuprof.addNonGo(sigprofCallers[:n]) + } + } + + atomic.Store(&sigprofCallersUse, 0) +} + +// sigprofNonGoPC is called when a profiling signal arrived on a +// non-Go thread and we have a single PC value, not a stack trace. +// g is nil, and what we can do is very limited. +//go:nosplit +//go:nowritebarrierrec +func sigprofNonGoPC(pc uintptr) { + if prof.hz != 0 { + stk := []uintptr{ + pc, + abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum, + } + cpuprof.addNonGo(stk) + } +} + +// adjustSignalStack adjusts the current stack guard based on the +// stack pointer that is actually in use while handling a signal. +// We do this in case some non-Go code called sigaltstack. +// This reports whether the stack was adjusted, and if so stores the old +// signal stack in *gsigstack. +//go:nosplit +func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool { + sp := uintptr(unsafe.Pointer(&sig)) + if sp >= mp.gsignal.stack.lo && sp < mp.gsignal.stack.hi { + return false + } + + var st stackt + sigaltstack(nil, &st) + stsp := uintptr(unsafe.Pointer(st.ss_sp)) + if st.ss_flags&_SS_DISABLE == 0 && sp >= stsp && sp < stsp+st.ss_size { + setGsignalStack(&st, gsigStack) + return true + } + + if sp >= mp.g0.stack.lo && sp < mp.g0.stack.hi { + // The signal was delivered on the g0 stack. + // This can happen when linked with C code + // using the thread sanitizer, which collects + // signals then delivers them itself by calling + // the signal handler directly when C code, + // including C code called via cgo, calls a + // TSAN-intercepted function such as malloc. + // + // We check this condition last as g0.stack.lo + // may be not very accurate (see mstart). 
+ st := stackt{ss_size: mp.g0.stack.hi - mp.g0.stack.lo} + setSignalstackSP(&st, mp.g0.stack.lo) + setGsignalStack(&st, gsigStack) + return true + } + + // sp is not within gsignal stack, g0 stack, or sigaltstack. Bad. + setg(nil) + needm() + if st.ss_flags&_SS_DISABLE != 0 { + noSignalStack(sig) + } else { + sigNotOnStack(sig) + } + dropm() + return false +} + +// crashing is the number of m's we have waited for when implementing +// GOTRACEBACK=crash when a signal is received. +var crashing int32 + +// testSigtrap and testSigusr1 are used by the runtime tests. If +// non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the +// normal behavior on this signal is suppressed. +var testSigtrap func(info *siginfo, ctxt *sigctxt, gp *g) bool +var testSigusr1 func(gp *g) bool + +// sighandler is invoked when a signal occurs. The global g will be +// set to a gsignal goroutine and we will be running on the alternate +// signal stack. The parameter g will be the value of the global g +// when the signal occurred. The sig, info, and ctxt parameters are +// from the system signal handler: they are the parameters passed when +// the SA is passed to the sigaction system call. +// +// The garbage collector may have stopped the world, so write barriers +// are not allowed. +// +//go:nowritebarrierrec +func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { + _g_ := getg() + c := &sigctxt{info, ctxt} + + if sig == _SIGPROF { + mp := _g_.m + // Some platforms (Linux) have per-thread timers, which we use in + // combination with the process-wide timer. Avoid double-counting. + if validSIGPROF(mp, c) { + sigprof(c.sigpc(), c.sigsp(), c.siglr(), gp, mp) + } + return + } + + if sig == _SIGTRAP && testSigtrap != nil && testSigtrap(info, (*sigctxt)(noescape(unsafe.Pointer(c))), gp) { + return + } + + if sig == _SIGUSR1 && testSigusr1 != nil && testSigusr1(gp) { + return + } + + if (GOOS == "linux" || GOOS == "android") && sig == sigPerThreadSyscall { + // sigPerThreadSyscall is the same signal used by glibc for + // per-thread syscalls on Linux. We use it for the same purpose + // in non-cgo binaries. Since this signal is not _SigNotify, + // there is nothing more to do once we run the syscall. + runPerThreadSyscall() + return + } + + if sig == sigPreempt && debug.asyncpreemptoff == 0 { + // Might be a preemption signal. + doSigPreempt(gp, c) + // Even if this was definitely a preemption signal, it + // may have been coalesced with another signal, so we + // still let it through to the application. + } + + flags := int32(_SigThrow) + if sig < uint32(len(sigtable)) { + flags = sigtable[sig].flags + } + if c.sigcode() != _SI_USER && flags&_SigPanic != 0 && gp.throwsplit { + // We can't safely sigpanic because it may grow the + // stack. Abort in the signal handler instead. + flags = _SigThrow + } + if isAbortPC(c.sigpc()) { + // On many architectures, the abort function just + // causes a memory fault. Don't turn that into a panic. + flags = _SigThrow + } + if c.sigcode() != _SI_USER && flags&_SigPanic != 0 { + // The signal is going to cause a panic. + // Arrange the stack so that it looks like the point + // where the signal occurred made a call to the + // function sigpanic. Then set the PC to sigpanic. + + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. 
+ gp.sig = sig + gp.sigcode0 = uintptr(c.sigcode()) + gp.sigcode1 = uintptr(c.fault()) + gp.sigpc = c.sigpc() + + c.preparePanic(sig, gp) + return + } + + if c.sigcode() == _SI_USER || flags&_SigNotify != 0 { + if sigsend(sig) { + return + } + } + + if c.sigcode() == _SI_USER && signal_ignored(sig) { + return + } + + if flags&_SigKill != 0 { + dieFromSignal(sig) + } + + // _SigThrow means that we should exit now. + // If we get here with _SigPanic, it means that the signal + // was sent to us by a program (c.sigcode() == _SI_USER); + // in that case, if we didn't handle it in sigsend, we exit now. + if flags&(_SigThrow|_SigPanic) == 0 { + return + } + + _g_.m.throwing = 1 + _g_.m.caughtsig.set(gp) + + if crashing == 0 { + startpanic_m() + } + + if sig < uint32(len(sigtable)) { + print(sigtable[sig].name, "\n") + } else { + print("Signal ", sig, "\n") + } + + print("PC=", hex(c.sigpc()), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") + if _g_.m.incgo && gp == _g_.m.g0 && _g_.m.curg != nil { + print("signal arrived during cgo execution\n") + // Switch to curg so that we get a traceback of the Go code + // leading up to the cgocall, which switched from curg to g0. + gp = _g_.m.curg + } + if sig == _SIGILL || sig == _SIGFPE { + // It would be nice to know how long the instruction is. + // Unfortunately, that's complicated to do in general (mostly for x86 + // and s930x, but other archs have non-standard instruction lengths also). + // Opt to print 16 bytes, which covers most instructions. + const maxN = 16 + n := uintptr(maxN) + // We have to be careful, though. If we're near the end of + // a page and the following page isn't mapped, we could + // segfault. So make sure we don't straddle a page (even though + // that could lead to printing an incomplete instruction). + // We're assuming here we can read at least the page containing the PC. + // I suppose it is possible that the page is mapped executable but not readable? + pc := c.sigpc() + if n > physPageSize-pc%physPageSize { + n = physPageSize - pc%physPageSize + } + print("instruction bytes:") + b := (*[maxN]byte)(unsafe.Pointer(pc)) + for i := uintptr(0); i < n; i++ { + print(" ", hex(b[i])) + } + println() + } + print("\n") + + level, _, docrash := gotraceback() + if level > 0 { + goroutineheader(gp) + tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp) + if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning { + // tracebackothers on original m skipped this one; trace it now. + goroutineheader(_g_.m.curg) + traceback(^uintptr(0), ^uintptr(0), 0, _g_.m.curg) + } else if crashing == 0 { + tracebackothers(gp) + print("\n") + } + dumpregs(c) + } + + if docrash { + crashing++ + if crashing < mcount()-int32(extraMCount) { + // There are other m's that need to dump their stacks. + // Relay SIGQUIT to the next m by sending it to the current process. + // All m's that have already received SIGQUIT have signal masks blocking + // receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet. + // When the last m receives the SIGQUIT, it will fall through to the call to + // crash below. Just in case the relaying gets botched, each m involved in + // the relay sleeps for 5 seconds and then does the crash/exit itself. + // In expected operation, the last m has received the SIGQUIT and run + // crash/exit and the process is gone, all long before any of the + // 5-second sleeps have finished. 
+ print("\n-----\n\n") + raiseproc(_SIGQUIT) + usleep(5 * 1000 * 1000) + } + crash() + } + + printDebugLog() + + exit(2) +} + +// sigpanic turns a synchronous signal into a run-time panic. +// If the signal handler sees a synchronous panic, it arranges the +// stack to look like the function where the signal occurred called +// sigpanic, sets the signal's PC value to sigpanic, and returns from +// the signal handler. The effect is that the program will act as +// though the function that got the signal simply called sigpanic +// instead. +// +// This must NOT be nosplit because the linker doesn't know where +// sigpanic calls can be injected. +// +// The signal handler must not inject a call to sigpanic if +// getg().throwsplit, since sigpanic may need to grow the stack. +// +// This is exported via linkname to assembly in runtime/cgo. +//go:linkname sigpanic +func sigpanic() { + g := getg() + if !canpanic(g) { + throw("unexpected signal during runtime execution") + } + + switch g.sig { + case _SIGBUS: + if g.sigcode0 == _BUS_ADRERR && g.sigcode1 < 0x1000 { + panicmem() + } + // Support runtime/debug.SetPanicOnFault. + if g.paniconfault { + panicmemAddr(g.sigcode1) + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + throw("fault") + case _SIGSEGV: + if (g.sigcode0 == 0 || g.sigcode0 == _SEGV_MAPERR || g.sigcode0 == _SEGV_ACCERR) && g.sigcode1 < 0x1000 { + panicmem() + } + // Support runtime/debug.SetPanicOnFault. + if g.paniconfault { + panicmemAddr(g.sigcode1) + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + throw("fault") + case _SIGFPE: + switch g.sigcode0 { + case _FPE_INTDIV: + panicdivide() + case _FPE_INTOVF: + panicoverflow() + } + panicfloat() + } + + if g.sig >= uint32(len(sigtable)) { + // can't happen: we looked up g.sig in sigtable to decide to call sigpanic + throw("unexpected signal value") + } + panic(errorString(sigtable[g.sig].name)) +} + +// dieFromSignal kills the program with a signal. +// This provides the expected exit status for the shell. +// This is only called with fatal signals expected to kill the process. +//go:nosplit +//go:nowritebarrierrec +func dieFromSignal(sig uint32) { + unblocksig(sig) + // Mark the signal as unhandled to ensure it is forwarded. + atomic.Store(&handlingSig[sig], 0) + raise(sig) + + // That should have killed us. On some systems, though, raise + // sends the signal to the whole process rather than to just + // the current thread, which means that the signal may not yet + // have been delivered. Give other threads a chance to run and + // pick up the signal. + osyield() + osyield() + osyield() + + // If that didn't work, try _SIG_DFL. + setsig(sig, _SIG_DFL) + raise(sig) + + osyield() + osyield() + osyield() + + // If we are still somehow running, just exit with the wrong status. + exit(2) +} + +// raisebadsignal is called when a signal is received on a non-Go +// thread, and the Go program does not want to handle it (that is, the +// program has not called os/signal.Notify for the signal). +func raisebadsignal(sig uint32, c *sigctxt) { + if sig == _SIGPROF { + // Ignore profiling signals that arrive on non-Go threads. + return + } + + var handler uintptr + if sig >= _NSIG { + handler = _SIG_DFL + } else { + handler = atomic.Loaduintptr(&fwdSig[sig]) + } + + // Reset the signal handler and raise the signal. + // We are currently running inside a signal handler, so the + // signal is blocked. 
We need to unblock it before raising the + // signal, or the signal we raise will be ignored until we return + // from the signal handler. We know that the signal was unblocked + // before entering the handler, or else we would not have received + // it. That means that we don't have to worry about blocking it + // again. + unblocksig(sig) + setsig(sig, handler) + + // If we're linked into a non-Go program we want to try to + // avoid modifying the original context in which the signal + // was raised. If the handler is the default, we know it + // is non-recoverable, so we don't have to worry about + // re-installing sighandler. At this point we can just + // return and the signal will be re-raised and caught by + // the default handler with the correct context. + // + // On FreeBSD, the libthr sigaction code prevents + // this from working so we fall through to raise. + if GOOS != "freebsd" && (isarchive || islibrary) && handler == _SIG_DFL && c.sigcode() != _SI_USER { + return + } + + raise(sig) + + // Give the signal a chance to be delivered. + // In almost all real cases the program is about to crash, + // so sleeping here is not a waste of time. + usleep(1000) + + // If the signal didn't cause the program to exit, restore the + // Go signal handler and carry on. + // + // We may receive another instance of the signal before we + // restore the Go handler, but that is not so bad: we know + // that the Go program has been ignoring the signal. + setsig(sig, abi.FuncPCABIInternal(sighandler)) +} + +//go:nosplit +func crash() { + // OS X core dumps are linear dumps of the mapped memory, + // from the first virtual byte to the last, with zeros in the gaps. + // Because of the way we arrange the address space on 64-bit systems, + // this means the OS X core file will be >128 GB and even on a zippy + // workstation can take OS X well over an hour to write (uninterruptible). + // Save users from making that mistake. + if GOOS == "darwin" && GOARCH == "amd64" { + return + } + + dieFromSignal(_SIGABRT) +} + +// ensureSigM starts one global, sleeping thread to make sure at least one thread +// is available to catch signals enabled for os/signal. +func ensureSigM() { + if maskUpdatedChan != nil { + return + } + maskUpdatedChan = make(chan struct{}) + disableSigChan = make(chan uint32) + enableSigChan = make(chan uint32) + go func() { + // Signal masks are per-thread, so make sure this goroutine stays on one + // thread. + LockOSThread() + defer UnlockOSThread() + // The sigBlocked mask contains the signals not active for os/signal, + // initially all signals except the essential. When signal.Notify()/Stop is called, + // sigenable/sigdisable in turn notify this thread to update its signal + // mask accordingly. + sigBlocked := sigset_all + for i := range sigtable { + if !blockableSig(uint32(i)) { + sigdelset(&sigBlocked, i) + } + } + sigprocmask(_SIG_SETMASK, &sigBlocked, nil) + for { + select { + case sig := <-enableSigChan: + if sig > 0 { + sigdelset(&sigBlocked, int(sig)) + } + case sig := <-disableSigChan: + if sig > 0 && blockableSig(sig) { + sigaddset(&sigBlocked, int(sig)) + } + } + sigprocmask(_SIG_SETMASK, &sigBlocked, nil) + maskUpdatedChan <- struct{}{} + } + }() +} + +// This is called when we receive a signal when there is no signal stack. +// This can only happen if non-Go code calls sigaltstack to disable the +// signal stack. 
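sigpanic above is the reason a synchronous fault in Go code surfaces as an ordinary, recoverable panic whose value implements runtime.Error. A minimal sketch of that user-visible behaviour (separate from the diff):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	defer func() {
		if r := recover(); r != nil {
			err, ok := r.(runtime.Error)
			fmt.Println("recovered:", err, "- runtime.Error?", ok)
		}
	}()

	var p *int
	fmt.Println(*p) // nil dereference: SIGSEGV -> sigpanic -> panicmem -> recoverable panic
	// (For faults at non-nil addresses, sigpanic also honours runtime/debug.SetPanicOnFault.)
}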
+func noSignalStack(sig uint32) { + println("signal", sig, "received on thread with no signal stack") + throw("non-Go code disabled sigaltstack") +} + +// This is called if we receive a signal when there is a signal stack +// but we are not on it. This can only happen if non-Go code called +// sigaction without setting the SS_ONSTACK flag. +func sigNotOnStack(sig uint32) { + println("signal", sig, "received but handler not on signal stack") + throw("non-Go code set up signal handler without SA_ONSTACK flag") +} + +// signalDuringFork is called if we receive a signal while doing a fork. +// We do not want signals at that time, as a signal sent to the process +// group may be delivered to the child process, causing confusion. +// This should never be called, because we block signals across the fork; +// this function is just a safety check. See issue 18600 for background. +func signalDuringFork(sig uint32) { + println("signal", sig, "received during fork") + throw("signal received during fork") +} + +var badginsignalMsg = "fatal: bad g in signal handler\n" + +// This runs on a foreign stack, without an m or a g. No stack split. +//go:nosplit +//go:norace +//go:nowritebarrierrec +func badsignal(sig uintptr, c *sigctxt) { + if !iscgo && !cgoHasExtraM { + // There is no extra M. needm will not be able to grab + // an M. Instead of hanging, just crash. + // Cannot call split-stack function as there is no G. + s := stringStructOf(&badginsignalMsg) + write(2, s.str, int32(s.len)) + exit(2) + *(*uintptr)(unsafe.Pointer(uintptr(123))) = 2 + } + needm() + if !sigsend(uint32(sig)) { + // A foreign thread received the signal sig, and the + // Go code does not want to handle it. + raisebadsignal(uint32(sig), c) + } + dropm() +} + +//go:noescape +func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer) + +// Determines if the signal should be handled by Go and if not, forwards the +// signal to the handler that was installed before Go's. Returns whether the +// signal was forwarded. +// This is called by the signal handler, and the world may be stopped. +//go:nosplit +//go:nowritebarrierrec +func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool { + if sig >= uint32(len(sigtable)) { + return false + } + fwdFn := atomic.Loaduintptr(&fwdSig[sig]) + flags := sigtable[sig].flags + + // If we aren't handling the signal, forward it. + if atomic.Load(&handlingSig[sig]) == 0 || !signalsOK { + // If the signal is ignored, doing nothing is the same as forwarding. + if fwdFn == _SIG_IGN || (fwdFn == _SIG_DFL && flags&_SigIgn != 0) { + return true + } + // We are not handling the signal and there is no other handler to forward to. + // Crash with the default behavior. + if fwdFn == _SIG_DFL { + setsig(sig, _SIG_DFL) + dieFromSignal(sig) + return false + } + + sigfwd(fwdFn, sig, info, ctx) + return true + } + + // This function and its caller sigtrampgo assumes SIGPIPE is delivered on the + // originating thread. This property does not hold on macOS (golang.org/issue/33384), + // so we have no choice but to ignore SIGPIPE. + if (GOOS == "darwin" || GOOS == "ios") && sig == _SIGPIPE { + return true + } + + // If there is no handler to forward to, no need to forward. + if fwdFn == _SIG_DFL { + return false + } + + c := &sigctxt{info, ctx} + // Only forward synchronous signals and SIGPIPE. + // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code + // is set to _SI_USER even for a SIGPIPE raised from a write to a closed socket + // or pipe. 
+ if (c.sigcode() == _SI_USER || flags&_SigPanic == 0) && sig != _SIGPIPE { + return false + } + // Determine if the signal occurred inside Go code. We test that: + // (1) we weren't in VDSO page, + // (2) we were in a goroutine (i.e., m.curg != nil), and + // (3) we weren't in CGO. + g := sigFetchG(c) + if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo { + return false + } + + // Signal not handled by Go, forward it. + if fwdFn != _SIG_IGN { + sigfwd(fwdFn, sig, info, ctx) + } + + return true +} + +// sigsave saves the current thread's signal mask into *p. +// This is used to preserve the non-Go signal mask when a non-Go +// thread calls a Go function. +// This is nosplit and nowritebarrierrec because it is called by needm +// which may be called on a non-Go thread with no g available. +//go:nosplit +//go:nowritebarrierrec +func sigsave(p *sigset) { + sigprocmask(_SIG_SETMASK, nil, p) +} + +// msigrestore sets the current thread's signal mask to sigmask. +// This is used to restore the non-Go signal mask when a non-Go thread +// calls a Go function. +// This is nosplit and nowritebarrierrec because it is called by dropm +// after g has been cleared. +//go:nosplit +//go:nowritebarrierrec +func msigrestore(sigmask sigset) { + sigprocmask(_SIG_SETMASK, &sigmask, nil) +} + +// sigsetAllExiting is used by sigblock(true) when a thread is +// exiting. sigset_all is defined in OS specific code, and per GOOS +// behavior may override this default for sigsetAllExiting: see +// osinit(). +var sigsetAllExiting = sigset_all + +// sigblock blocks signals in the current thread's signal mask. +// This is used to block signals while setting up and tearing down g +// when a non-Go thread calls a Go function. When a thread is exiting +// we use the sigsetAllExiting value, otherwise the OS specific +// definition of sigset_all is used. +// This is nosplit and nowritebarrierrec because it is called by needm +// which may be called on a non-Go thread with no g available. +//go:nosplit +//go:nowritebarrierrec +func sigblock(exiting bool) { + if exiting { + sigprocmask(_SIG_SETMASK, &sigsetAllExiting, nil) + return + } + sigprocmask(_SIG_SETMASK, &sigset_all, nil) +} + +// unblocksig removes sig from the current thread's signal mask. +// This is nosplit and nowritebarrierrec because it is called from +// dieFromSignal, which can be called by sigfwdgo while running in the +// signal handler, on the signal stack, with no g available. +//go:nosplit +//go:nowritebarrierrec +func unblocksig(sig uint32) { + var set sigset + sigaddset(&set, int(sig)) + sigprocmask(_SIG_UNBLOCK, &set, nil) +} + +// minitSignals is called when initializing a new m to set the +// thread's alternate signal stack and signal mask. +func minitSignals() { + minitSignalStack() + minitSignalMask() +} + +// minitSignalStack is called when initializing a new m to set the +// alternate signal stack. If the alternate signal stack is not set +// for the thread (the normal case) then set the alternate signal +// stack to the gsignal stack. If the alternate signal stack is set +// for the thread (the case when a non-Go thread sets the alternate +// signal stack and then calls a Go function) then set the gsignal +// stack to the alternate signal stack. We also set the alternate +// signal stack to the gsignal stack if cgo is not used (regardless +// of whether it is already set). Record which choice was made in +// newSigstack, so that it can be undone in unminit. 
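A user-visible consequence of the SIGPIPE handling above (sigpipe, dieFromSignal and the forwarding rules): with no Notify registered for SIGPIPE, a broken pipe on stdout or stderr kills the process, while a write to any other broken descriptor just fails with EPIPE. A small sketch of the latter case (separate from the diff):

package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

func main() {
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}
	r.Close() // close the read end so the pipe is broken

	_, err = w.Write([]byte("hello"))
	// The kernel raises SIGPIPE, but the descriptor is not 1 or 2, so the
	// program keeps running and the write simply reports EPIPE.
	fmt.Println("write error:", err, "- EPIPE?", errors.Is(err, syscall.EPIPE))
}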
+func minitSignalStack() { + _g_ := getg() + var st stackt + sigaltstack(nil, &st) + if st.ss_flags&_SS_DISABLE != 0 || !iscgo { + signalstack(&_g_.m.gsignal.stack) + _g_.m.newSigstack = true + } else { + setGsignalStack(&st, &_g_.m.goSigStack) + _g_.m.newSigstack = false + } +} + +// minitSignalMask is called when initializing a new m to set the +// thread's signal mask. When this is called all signals have been +// blocked for the thread. This starts with m.sigmask, which was set +// either from initSigmask for a newly created thread or by calling +// sigsave if this is a non-Go thread calling a Go function. It +// removes all essential signals from the mask, thus causing those +// signals to not be blocked. Then it sets the thread's signal mask. +// After this is called the thread can receive signals. +func minitSignalMask() { + nmask := getg().m.sigmask + for i := range sigtable { + if !blockableSig(uint32(i)) { + sigdelset(&nmask, i) + } + } + sigprocmask(_SIG_SETMASK, &nmask, nil) +} + +// unminitSignals is called from dropm, via unminit, to undo the +// effect of calling minit on a non-Go thread. +//go:nosplit +func unminitSignals() { + if getg().m.newSigstack { + st := stackt{ss_flags: _SS_DISABLE} + sigaltstack(&st, nil) + } else { + // We got the signal stack from someone else. Restore + // the Go-allocated stack in case this M gets reused + // for another thread (e.g., it's an extram). Also, on + // Android, libc allocates a signal stack for all + // threads, so it's important to restore the Go stack + // even on Go-created threads so we can free it. + restoreGsignalStack(&getg().m.goSigStack) + } +} + +// blockableSig reports whether sig may be blocked by the signal mask. +// We never want to block the signals marked _SigUnblock; +// these are the synchronous signals that turn into a Go panic. +// We never want to block the preemption signal if it is being used. +// In a Go program--not a c-archive/c-shared--we never want to block +// the signals marked _SigKill or _SigThrow, as otherwise it's possible +// for all running threads to block them and delay their delivery until +// we start a new thread. When linked into a C program we let the C code +// decide on the disposition of those signals. +func blockableSig(sig uint32) bool { + flags := sigtable[sig].flags + if flags&_SigUnblock != 0 { + return false + } + if sig == sigPreempt && preemptMSupported && debug.asyncpreemptoff == 0 { + return false + } + if isarchive || islibrary { + return true + } + return flags&(_SigKill|_SigThrow) == 0 +} + +// gsignalStack saves the fields of the gsignal stack changed by +// setGsignalStack. +type gsignalStack struct { + stack stack + stackguard0 uintptr + stackguard1 uintptr + stktopsp uintptr +} + +// setGsignalStack sets the gsignal stack of the current m to an +// alternate signal stack returned from the sigaltstack system call. +// It saves the old values in *old for use by restoreGsignalStack. +// This is used when handling a signal if non-Go code has set the +// alternate signal stack. 
+//go:nosplit +//go:nowritebarrierrec +func setGsignalStack(st *stackt, old *gsignalStack) { + g := getg() + if old != nil { + old.stack = g.m.gsignal.stack + old.stackguard0 = g.m.gsignal.stackguard0 + old.stackguard1 = g.m.gsignal.stackguard1 + old.stktopsp = g.m.gsignal.stktopsp + } + stsp := uintptr(unsafe.Pointer(st.ss_sp)) + g.m.gsignal.stack.lo = stsp + g.m.gsignal.stack.hi = stsp + st.ss_size + g.m.gsignal.stackguard0 = stsp + _StackGuard + g.m.gsignal.stackguard1 = stsp + _StackGuard +} + +// restoreGsignalStack restores the gsignal stack to the value it had +// before entering the signal handler. +//go:nosplit +//go:nowritebarrierrec +func restoreGsignalStack(st *gsignalStack) { + gp := getg().m.gsignal + gp.stack = st.stack + gp.stackguard0 = st.stackguard0 + gp.stackguard1 = st.stackguard1 + gp.stktopsp = st.stktopsp +} + +// signalstack sets the current thread's alternate signal stack to s. +//go:nosplit +func signalstack(s *stack) { + st := stackt{ss_size: s.hi - s.lo} + setSignalstackSP(&st, s.lo) + sigaltstack(&st, nil) +} + +// setsigsegv is used on darwin/arm64 to fake a segmentation fault. +// +// This is exported via linkname to assembly in runtime/cgo. +// +//go:nosplit +//go:linkname setsigsegv +func setsigsegv(pc uintptr) { + g := getg() + g.sig = _SIGSEGV + g.sigpc = pc + g.sigcode0 = _SEGV_MAPERR + g.sigcode1 = 0 // TODO: emulate si_addr +} diff --git a/contrib/go/_std_1.18/src/runtime/sigqueue.go b/contrib/go/_std_1.18/src/runtime/sigqueue.go new file mode 100644 index 0000000000..fdf99d94a2 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sigqueue.go @@ -0,0 +1,268 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file implements runtime support for signal handling. +// +// Most synchronization primitives are not available from +// the signal handler (it cannot block, allocate memory, or use locks) +// so the handler communicates with a processing goroutine +// via struct sig, below. +// +// sigsend is called by the signal handler to queue a new signal. +// signal_recv is called by the Go program to receive a newly queued signal. +// +// Synchronization between sigsend and signal_recv is based on the sig.state +// variable. It can be in three states: +// * sigReceiving means that signal_recv is blocked on sig.Note and there are +// no new pending signals. +// * sigSending means that sig.mask *may* contain new pending signals, +// signal_recv can't be blocked in this state. +// * sigIdle means that there are no new pending signals and signal_recv is not +// blocked. +// +// Transitions between states are done atomically with CAS. +// +// When signal_recv is unblocked, it resets sig.Note and rechecks sig.mask. +// If several sigsends and signal_recv execute concurrently, it can lead to +// unnecessary rechecks of sig.mask, but it cannot lead to missed signals +// nor deadlocks. + +//go:build !plan9 + +package runtime + +import ( + "runtime/internal/atomic" + _ "unsafe" // for go:linkname +) + +// sig handles communication between the signal handler and os/signal. +// Other than the inuse and recv fields, the fields are accessed atomically. +// +// The wanted and ignored fields are only written by one goroutine at +// a time; access is controlled by the handlers Mutex in os/signal. +// The fields are only read by that one goroutine and by the signal handler. 
+// We access them atomically to minimize the race between setting them +// in the goroutine calling os/signal and the signal handler, +// which may be running in a different thread. That race is unavoidable, +// as there is no connection between handling a signal and receiving one, +// but atomic instructions should minimize it. +var sig struct { + note note + mask [(_NSIG + 31) / 32]uint32 + wanted [(_NSIG + 31) / 32]uint32 + ignored [(_NSIG + 31) / 32]uint32 + recv [(_NSIG + 31) / 32]uint32 + state uint32 + delivering uint32 + inuse bool +} + +const ( + sigIdle = iota + sigReceiving + sigSending +) + +// sigsend delivers a signal from sighandler to the internal signal delivery queue. +// It reports whether the signal was sent. If not, the caller typically crashes the program. +// It runs from the signal handler, so it's limited in what it can do. +func sigsend(s uint32) bool { + bit := uint32(1) << uint(s&31) + if s >= uint32(32*len(sig.wanted)) { + return false + } + + atomic.Xadd(&sig.delivering, 1) + // We are running in the signal handler; defer is not available. + + if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 { + atomic.Xadd(&sig.delivering, -1) + return false + } + + // Add signal to outgoing queue. + for { + mask := sig.mask[s/32] + if mask&bit != 0 { + atomic.Xadd(&sig.delivering, -1) + return true // signal already in queue + } + if atomic.Cas(&sig.mask[s/32], mask, mask|bit) { + break + } + } + + // Notify receiver that queue has new bit. +Send: + for { + switch atomic.Load(&sig.state) { + default: + throw("sigsend: inconsistent state") + case sigIdle: + if atomic.Cas(&sig.state, sigIdle, sigSending) { + break Send + } + case sigSending: + // notification already pending + break Send + case sigReceiving: + if atomic.Cas(&sig.state, sigReceiving, sigIdle) { + if GOOS == "darwin" || GOOS == "ios" { + sigNoteWakeup(&sig.note) + break Send + } + notewakeup(&sig.note) + break Send + } + } + } + + atomic.Xadd(&sig.delivering, -1) + return true +} + +// Called to receive the next queued signal. +// Must only be called from a single goroutine at a time. +//go:linkname signal_recv os/signal.signal_recv +func signal_recv() uint32 { + for { + // Serve any signals from local copy. + for i := uint32(0); i < _NSIG; i++ { + if sig.recv[i/32]&(1<<(i&31)) != 0 { + sig.recv[i/32] &^= 1 << (i & 31) + return i + } + } + + // Wait for updates to be available from signal sender. + Receive: + for { + switch atomic.Load(&sig.state) { + default: + throw("signal_recv: inconsistent state") + case sigIdle: + if atomic.Cas(&sig.state, sigIdle, sigReceiving) { + if GOOS == "darwin" || GOOS == "ios" { + sigNoteSleep(&sig.note) + break Receive + } + notetsleepg(&sig.note, -1) + noteclear(&sig.note) + break Receive + } + case sigSending: + if atomic.Cas(&sig.state, sigSending, sigIdle) { + break Receive + } + } + } + + // Incorporate updates from sender into local copy. + for i := range sig.mask { + sig.recv[i] = atomic.Xchg(&sig.mask[i], 0) + } + } +} + +// signalWaitUntilIdle waits until the signal delivery mechanism is idle. +// This is used to ensure that we do not drop a signal notification due +// to a race between disabling a signal and receiving a signal. +// This assumes that signal delivery has already been disabled for +// the signal(s) in question, and here we are just waiting to make sure +// that all the signals have been delivered to the user channels +// by the os/signal package. 
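The sigIdle/sigSending/sigReceiving handoff above can be shown in isolation. A drastically simplified sketch (separate from the diff) using sync/atomic CAS, with a buffered channel standing in for the runtime note:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	idle uint32 = iota
	receiving
	sending
)

type queue struct {
	state uint32
	mask  uint32        // pending "signals", one bit each
	note  chan struct{} // stands in for the runtime note
}

func (q *queue) send(bit uint32) {
	for { // record the pending bit, as sigsend does with sig.mask
		old := atomic.LoadUint32(&q.mask)
		if old&bit != 0 {
			return // already pending
		}
		if atomic.CompareAndSwapUint32(&q.mask, old, old|bit) {
			break
		}
	}
	for { // hand the receiver a wakeup if it is asleep or about to sleep
		switch atomic.LoadUint32(&q.state) {
		case idle:
			if atomic.CompareAndSwapUint32(&q.state, idle, sending) {
				return
			}
		case sending:
			return // a wakeup is already pending
		case receiving:
			if atomic.CompareAndSwapUint32(&q.state, receiving, idle) {
				q.note <- struct{}{}
				return
			}
		}
	}
}

func (q *queue) recv() uint32 {
	for {
		if m := atomic.SwapUint32(&q.mask, 0); m != 0 {
			return m
		}
		switch atomic.LoadUint32(&q.state) {
		case idle:
			if atomic.CompareAndSwapUint32(&q.state, idle, receiving) {
				<-q.note // sleep until send delivers a wakeup
			}
		case sending:
			atomic.CompareAndSwapUint32(&q.state, sending, idle)
		}
	}
}

func main() {
	q := &queue{note: make(chan struct{}, 1)}
	go q.send(1 << 2)
	fmt.Printf("received pending mask %#x\n", q.recv())
}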
+//go:linkname signalWaitUntilIdle os/signal.signalWaitUntilIdle +func signalWaitUntilIdle() { + // Although the signals we care about have been removed from + // sig.wanted, it is possible that another thread has received + // a signal, has read from sig.wanted, is now updating sig.mask, + // and has not yet woken up the processor thread. We need to wait + // until all current signal deliveries have completed. + for atomic.Load(&sig.delivering) != 0 { + Gosched() + } + + // Although WaitUntilIdle seems like the right name for this + // function, the state we are looking for is sigReceiving, not + // sigIdle. The sigIdle state is really more like sigProcessing. + for atomic.Load(&sig.state) != sigReceiving { + Gosched() + } +} + +// Must only be called from a single goroutine at a time. +//go:linkname signal_enable os/signal.signal_enable +func signal_enable(s uint32) { + if !sig.inuse { + // This is the first call to signal_enable. Initialize. + sig.inuse = true // enable reception of signals; cannot disable + if GOOS == "darwin" || GOOS == "ios" { + sigNoteSetup(&sig.note) + } else { + noteclear(&sig.note) + } + } + + if s >= uint32(len(sig.wanted)*32) { + return + } + + w := sig.wanted[s/32] + w |= 1 << (s & 31) + atomic.Store(&sig.wanted[s/32], w) + + i := sig.ignored[s/32] + i &^= 1 << (s & 31) + atomic.Store(&sig.ignored[s/32], i) + + sigenable(s) +} + +// Must only be called from a single goroutine at a time. +//go:linkname signal_disable os/signal.signal_disable +func signal_disable(s uint32) { + if s >= uint32(len(sig.wanted)*32) { + return + } + sigdisable(s) + + w := sig.wanted[s/32] + w &^= 1 << (s & 31) + atomic.Store(&sig.wanted[s/32], w) +} + +// Must only be called from a single goroutine at a time. +//go:linkname signal_ignore os/signal.signal_ignore +func signal_ignore(s uint32) { + if s >= uint32(len(sig.wanted)*32) { + return + } + sigignore(s) + + w := sig.wanted[s/32] + w &^= 1 << (s & 31) + atomic.Store(&sig.wanted[s/32], w) + + i := sig.ignored[s/32] + i |= 1 << (s & 31) + atomic.Store(&sig.ignored[s/32], i) +} + +// sigInitIgnored marks the signal as already ignored. This is called at +// program start by initsig. In a shared library initsig is called by +// libpreinit, so the runtime may not be initialized yet. +//go:nosplit +func sigInitIgnored(s uint32) { + i := sig.ignored[s/32] + i |= 1 << (s & 31) + atomic.Store(&sig.ignored[s/32], i) +} + +// Checked by signal handlers. +//go:linkname signal_ignored os/signal.signal_ignored +func signal_ignored(s uint32) bool { + i := atomic.Load(&sig.ignored[s/32]) + return i&(1<<(s&31)) != 0 +} diff --git a/contrib/go/_std_1.18/src/runtime/sigqueue_note.go b/contrib/go/_std_1.18/src/runtime/sigqueue_note.go new file mode 100644 index 0000000000..fb1a517fa5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sigqueue_note.go @@ -0,0 +1,24 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The current implementation of notes on Darwin is not async-signal-safe, +// so on Darwin the sigqueue code uses different functions to wake up the +// signal_recv thread. This file holds the non-Darwin implementations of +// those functions. These functions will never be called. 
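+// (On Darwin the real implementations live in os_darwin.go and are built
+// on a pipe: sigNoteWakeup writes a byte that sigNoteSleep blocks reading,
+// which, unlike the pthread-based notes, is async-signal-safe.)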
+ +//go:build !darwin && !plan9 + +package runtime + +func sigNoteSetup(*note) { + throw("sigNoteSetup") +} + +func sigNoteSleep(*note) { + throw("sigNoteSleep") +} + +func sigNoteWakeup(*note) { + throw("sigNoteWakeup") +} diff --git a/contrib/go/_std_1.18/src/runtime/sigtab_linux_generic.go b/contrib/go/_std_1.18/src/runtime/sigtab_linux_generic.go new file mode 100644 index 0000000000..fe93bbafb5 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sigtab_linux_generic.go @@ -0,0 +1,75 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !mips && !mipsle && !mips64 && !mips64le && linux + +package runtime + +var sigtable = [...]sigTabT{ + /* 0 */ {0, "SIGNONE: no trap"}, + /* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"}, + /* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"}, + /* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"}, + /* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"}, + /* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"}, + /* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"}, + /* 7 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"}, + /* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"}, + /* 9 */ {0, "SIGKILL: kill"}, + /* 10 */ {_SigNotify, "SIGUSR1: user-defined signal 1"}, + /* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"}, + /* 12 */ {_SigNotify, "SIGUSR2: user-defined signal 2"}, + /* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"}, + /* 14 */ {_SigNotify, "SIGALRM: alarm clock"}, + /* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"}, + /* 16 */ {_SigThrow + _SigUnblock, "SIGSTKFLT: stack fault"}, + /* 17 */ {_SigNotify + _SigUnblock + _SigIgn, "SIGCHLD: child status has changed"}, + /* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue"}, + /* 19 */ {0, "SIGSTOP: stop, unblockable"}, + /* 20 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"}, + /* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"}, + /* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"}, + /* 23 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"}, + /* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"}, + /* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"}, + /* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"}, + /* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"}, + /* 28 */ {_SigNotify + _SigIgn, "SIGWINCH: window size change"}, + /* 29 */ {_SigNotify, "SIGIO: i/o now possible"}, + /* 30 */ {_SigNotify, "SIGPWR: power failure restart"}, + /* 31 */ {_SigThrow, "SIGSYS: bad system call"}, + /* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */ + /* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */ + /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */ + /* 35 */ {_SigNotify, "signal 35"}, + /* 36 */ {_SigNotify, "signal 36"}, + /* 37 */ {_SigNotify, "signal 37"}, + /* 38 */ {_SigNotify, "signal 38"}, + /* 39 */ {_SigNotify, "signal 39"}, + /* 40 */ {_SigNotify, "signal 40"}, + /* 41 */ {_SigNotify, "signal 41"}, + /* 42 */ {_SigNotify, "signal 42"}, + /* 43 */ {_SigNotify, "signal 43"}, + /* 44 */ {_SigNotify, "signal 44"}, + /* 45 */ {_SigNotify, "signal 45"}, + /* 46 */ {_SigNotify, "signal 46"}, + /* 47 */ {_SigNotify, "signal 47"}, + /* 48 */ {_SigNotify, 
"signal 48"}, + /* 49 */ {_SigNotify, "signal 49"}, + /* 50 */ {_SigNotify, "signal 50"}, + /* 51 */ {_SigNotify, "signal 51"}, + /* 52 */ {_SigNotify, "signal 52"}, + /* 53 */ {_SigNotify, "signal 53"}, + /* 54 */ {_SigNotify, "signal 54"}, + /* 55 */ {_SigNotify, "signal 55"}, + /* 56 */ {_SigNotify, "signal 56"}, + /* 57 */ {_SigNotify, "signal 57"}, + /* 58 */ {_SigNotify, "signal 58"}, + /* 59 */ {_SigNotify, "signal 59"}, + /* 60 */ {_SigNotify, "signal 60"}, + /* 61 */ {_SigNotify, "signal 61"}, + /* 62 */ {_SigNotify, "signal 62"}, + /* 63 */ {_SigNotify, "signal 63"}, + /* 64 */ {_SigNotify, "signal 64"}, +} diff --git a/contrib/go/_std_1.18/src/runtime/sizeclasses.go b/contrib/go/_std_1.18/src/runtime/sizeclasses.go new file mode 100644 index 0000000000..067871eaf3 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sizeclasses.go @@ -0,0 +1,97 @@ +// Code generated by mksizeclasses.go; DO NOT EDIT. +//go:generate go run mksizeclasses.go + +package runtime + +// class bytes/obj bytes/span objects tail waste max waste min align +// 1 8 8192 1024 0 87.50% 8 +// 2 16 8192 512 0 43.75% 16 +// 3 24 8192 341 8 29.24% 8 +// 4 32 8192 256 0 21.88% 32 +// 5 48 8192 170 32 31.52% 16 +// 6 64 8192 128 0 23.44% 64 +// 7 80 8192 102 32 19.07% 16 +// 8 96 8192 85 32 15.95% 32 +// 9 112 8192 73 16 13.56% 16 +// 10 128 8192 64 0 11.72% 128 +// 11 144 8192 56 128 11.82% 16 +// 12 160 8192 51 32 9.73% 32 +// 13 176 8192 46 96 9.59% 16 +// 14 192 8192 42 128 9.25% 64 +// 15 208 8192 39 80 8.12% 16 +// 16 224 8192 36 128 8.15% 32 +// 17 240 8192 34 32 6.62% 16 +// 18 256 8192 32 0 5.86% 256 +// 19 288 8192 28 128 12.16% 32 +// 20 320 8192 25 192 11.80% 64 +// 21 352 8192 23 96 9.88% 32 +// 22 384 8192 21 128 9.51% 128 +// 23 416 8192 19 288 10.71% 32 +// 24 448 8192 18 128 8.37% 64 +// 25 480 8192 17 32 6.82% 32 +// 26 512 8192 16 0 6.05% 512 +// 27 576 8192 14 128 12.33% 64 +// 28 640 8192 12 512 15.48% 128 +// 29 704 8192 11 448 13.93% 64 +// 30 768 8192 10 512 13.94% 256 +// 31 896 8192 9 128 15.52% 128 +// 32 1024 8192 8 0 12.40% 1024 +// 33 1152 8192 7 128 12.41% 128 +// 34 1280 8192 6 512 15.55% 256 +// 35 1408 16384 11 896 14.00% 128 +// 36 1536 8192 5 512 14.00% 512 +// 37 1792 16384 9 256 15.57% 256 +// 38 2048 8192 4 0 12.45% 2048 +// 39 2304 16384 7 256 12.46% 256 +// 40 2688 8192 3 128 15.59% 128 +// 41 3072 24576 8 0 12.47% 1024 +// 42 3200 16384 5 384 6.22% 128 +// 43 3456 24576 7 384 8.83% 128 +// 44 4096 8192 2 0 15.60% 4096 +// 45 4864 24576 5 256 16.65% 256 +// 46 5376 16384 3 256 10.92% 256 +// 47 6144 24576 4 0 12.48% 2048 +// 48 6528 32768 5 128 6.23% 128 +// 49 6784 40960 6 256 4.36% 128 +// 50 6912 49152 7 768 3.37% 256 +// 51 8192 8192 1 0 15.61% 8192 +// 52 9472 57344 6 512 14.28% 256 +// 53 9728 49152 5 512 3.64% 512 +// 54 10240 40960 4 0 4.99% 2048 +// 55 10880 32768 3 128 6.24% 128 +// 56 12288 24576 2 0 11.45% 4096 +// 57 13568 40960 3 256 9.99% 256 +// 58 14336 57344 4 0 5.35% 2048 +// 59 16384 16384 1 0 12.49% 8192 +// 60 18432 73728 4 0 11.11% 2048 +// 61 19072 57344 3 128 3.57% 128 +// 62 20480 40960 2 0 6.87% 4096 +// 63 21760 65536 3 256 6.25% 256 +// 64 24576 24576 1 0 11.45% 8192 +// 65 27264 81920 3 128 10.00% 128 +// 66 28672 57344 2 0 4.91% 4096 +// 67 32768 32768 1 0 12.50% 8192 + +// alignment bits min obj size +// 8 3 8 +// 16 4 32 +// 32 5 256 +// 64 6 512 +// 128 7 768 +// 4096 12 28672 +// 8192 13 32768 + +const ( + _MaxSmallSize = 32768 + smallSizeDiv = 8 + smallSizeMax = 1024 + largeSizeDiv = 128 + _NumSizeClasses = 68 + _PageShift = 13 +) + +var 
class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} +var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4} +var class_to_divmagic = [_NumSizeClasses]uint32{0, ^uint32(0)/8 + 1, ^uint32(0)/16 + 1, ^uint32(0)/24 + 1, ^uint32(0)/32 + 1, ^uint32(0)/48 + 1, ^uint32(0)/64 + 1, ^uint32(0)/80 + 1, ^uint32(0)/96 + 1, ^uint32(0)/112 + 1, ^uint32(0)/128 + 1, ^uint32(0)/144 + 1, ^uint32(0)/160 + 1, ^uint32(0)/176 + 1, ^uint32(0)/192 + 1, ^uint32(0)/208 + 1, ^uint32(0)/224 + 1, ^uint32(0)/240 + 1, ^uint32(0)/256 + 1, ^uint32(0)/288 + 1, ^uint32(0)/320 + 1, ^uint32(0)/352 + 1, ^uint32(0)/384 + 1, ^uint32(0)/416 + 1, ^uint32(0)/448 + 1, ^uint32(0)/480 + 1, ^uint32(0)/512 + 1, ^uint32(0)/576 + 1, ^uint32(0)/640 + 1, ^uint32(0)/704 + 1, ^uint32(0)/768 + 1, ^uint32(0)/896 + 1, ^uint32(0)/1024 + 1, ^uint32(0)/1152 + 1, ^uint32(0)/1280 + 1, ^uint32(0)/1408 + 1, ^uint32(0)/1536 + 1, ^uint32(0)/1792 + 1, ^uint32(0)/2048 + 1, ^uint32(0)/2304 + 1, ^uint32(0)/2688 + 1, ^uint32(0)/3072 + 1, ^uint32(0)/3200 + 1, ^uint32(0)/3456 + 1, ^uint32(0)/4096 + 1, ^uint32(0)/4864 + 1, ^uint32(0)/5376 + 1, ^uint32(0)/6144 + 1, ^uint32(0)/6528 + 1, ^uint32(0)/6784 + 1, ^uint32(0)/6912 + 1, ^uint32(0)/8192 + 1, ^uint32(0)/9472 + 1, ^uint32(0)/9728 + 1, ^uint32(0)/10240 + 1, ^uint32(0)/10880 + 1, ^uint32(0)/12288 + 1, ^uint32(0)/13568 + 1, ^uint32(0)/14336 + 1, ^uint32(0)/16384 + 1, ^uint32(0)/18432 + 1, ^uint32(0)/19072 + 1, ^uint32(0)/20480 + 1, ^uint32(0)/21760 + 1, ^uint32(0)/24576 + 1, ^uint32(0)/27264 + 1, ^uint32(0)/28672 + 1, ^uint32(0)/32768 + 1} +var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32} +var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 
65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67} diff --git a/contrib/go/_std_1.18/src/runtime/slice.go b/contrib/go/_std_1.18/src/runtime/slice.go new file mode 100644 index 0000000000..e0aeba604f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/slice.go @@ -0,0 +1,332 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "runtime/internal/math" + "runtime/internal/sys" + "unsafe" +) + +type slice struct { + array unsafe.Pointer + len int + cap int +} + +// A notInHeapSlice is a slice backed by go:notinheap memory. +type notInHeapSlice struct { + array *notInHeap + len int + cap int +} + +func panicmakeslicelen() { + panic(errorString("makeslice: len out of range")) +} + +func panicmakeslicecap() { + panic(errorString("makeslice: cap out of range")) +} + +// makeslicecopy allocates a slice of "tolen" elements of type "et", +// then copies "fromlen" elements of type "et" into that new allocation from "from". +func makeslicecopy(et *_type, tolen int, fromlen int, from unsafe.Pointer) unsafe.Pointer { + var tomem, copymem uintptr + if uintptr(tolen) > uintptr(fromlen) { + var overflow bool + tomem, overflow = math.MulUintptr(et.size, uintptr(tolen)) + if overflow || tomem > maxAlloc || tolen < 0 { + panicmakeslicelen() + } + copymem = et.size * uintptr(fromlen) + } else { + // fromlen is a known good length providing and equal or greater than tolen, + // thereby making tolen a good slice length too as from and to slices have the + // same element width. + tomem = et.size * uintptr(tolen) + copymem = tomem + } + + var to unsafe.Pointer + if et.ptrdata == 0 { + to = mallocgc(tomem, nil, false) + if copymem < tomem { + memclrNoHeapPointers(add(to, copymem), tomem-copymem) + } + } else { + // Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory. + to = mallocgc(tomem, et, true) + if copymem > 0 && writeBarrier.enabled { + // Only shade the pointers in old.array since we know the destination slice to + // only contains nil pointers because it has been cleared during alloc. + bulkBarrierPreWriteSrcOnly(uintptr(to), uintptr(from), copymem) + } + } + + if raceenabled { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(makeslicecopy) + racereadrangepc(from, copymem, callerpc, pc) + } + if msanenabled { + msanread(from, copymem) + } + if asanenabled { + asanread(from, copymem) + } + + memmove(to, from, copymem) + + return to +} + +func makeslice(et *_type, len, cap int) unsafe.Pointer { + mem, overflow := math.MulUintptr(et.size, uintptr(cap)) + if overflow || mem > maxAlloc || len < 0 || len > cap { + // NOTE: Produce a 'len out of range' error instead of a + // 'cap out of range' error when someone does make([]T, bignumber). + // 'cap out of range' is true too, but since the cap is only being + // supplied implicitly, saying len is clearer. + // See golang.org/issue/4085. 
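+ // For example, on a 64-bit system (where maxAlloc is far smaller than 1<<62):
+ //
+ //	make([]byte, 1<<62)    // panics: makeslice: len out of range
+ //	make([]byte, 1, 1<<62) // panics: makeslice: cap out of range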
+ mem, overflow := math.MulUintptr(et.size, uintptr(len)) + if overflow || mem > maxAlloc || len < 0 { + panicmakeslicelen() + } + panicmakeslicecap() + } + + return mallocgc(mem, et, true) +} + +func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer { + len := int(len64) + if int64(len) != len64 { + panicmakeslicelen() + } + + cap := int(cap64) + if int64(cap) != cap64 { + panicmakeslicecap() + } + + return makeslice(et, len, cap) +} + +func unsafeslice(et *_type, ptr unsafe.Pointer, len int) { + if len < 0 { + panicunsafeslicelen() + } + + mem, overflow := math.MulUintptr(et.size, uintptr(len)) + if overflow || mem > -uintptr(ptr) { + if ptr == nil { + panic(errorString("unsafe.Slice: ptr is nil and len is not zero")) + } + panicunsafeslicelen() + } +} + +func unsafeslice64(et *_type, ptr unsafe.Pointer, len64 int64) { + len := int(len64) + if int64(len) != len64 { + panicunsafeslicelen() + } + unsafeslice(et, ptr, len) +} + +func unsafeslicecheckptr(et *_type, ptr unsafe.Pointer, len64 int64) { + unsafeslice64(et, ptr, len64) + + // Check that underlying array doesn't straddle multiple heap objects. + // unsafeslice64 has already checked for overflow. + if checkptrStraddles(ptr, uintptr(len64)*et.size) { + throw("checkptr: unsafe.Slice result straddles multiple allocations") + } +} + +func panicunsafeslicelen() { + panic(errorString("unsafe.Slice: len out of range")) +} + +// growslice handles slice growth during append. +// It is passed the slice element type, the old slice, and the desired new minimum capacity, +// and it returns a new slice with at least that capacity, with the old data +// copied into it. +// The new slice's length is set to the old slice's length, +// NOT to the new requested capacity. +// This is for codegen convenience. The old slice's length is used immediately +// to calculate where to write new values during an append. +// TODO: When the old backend is gone, reconsider this decision. +// The SSA backend might prefer the new length or to return only ptr/cap and save stack space. +func growslice(et *_type, old slice, cap int) slice { + if raceenabled { + callerpc := getcallerpc() + racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, abi.FuncPCABIInternal(growslice)) + } + if msanenabled { + msanread(old.array, uintptr(old.len*int(et.size))) + } + if asanenabled { + asanread(old.array, uintptr(old.len*int(et.size))) + } + + if cap < old.cap { + panic(errorString("growslice: cap out of range")) + } + + if et.size == 0 { + // append should not create a slice with nil pointer but non-zero len. + // We assume that append doesn't need to preserve old.array in this case. + return slice{unsafe.Pointer(&zerobase), old.len, cap} + } + + newcap := old.cap + doublecap := newcap + newcap + if cap > doublecap { + newcap = cap + } else { + const threshold = 256 + if old.cap < threshold { + newcap = doublecap + } else { + // Check 0 < newcap to detect overflow + // and prevent an infinite loop. + for 0 < newcap && newcap < cap { + // Transition from growing 2x for small slices + // to growing 1.25x for large slices. This formula + // gives a smooth-ish transition between the two. + newcap += (newcap + 3*threshold) / 4 + } + // Set newcap to the requested cap when + // the newcap calculation overflowed. + if newcap <= 0 { + newcap = cap + } + } + } + + var overflow bool + var lenmem, newlenmem, capmem uintptr + // Specialize for common values of et.size. + // For 1 we don't need any division/multiplication. 
+ // For goarch.PtrSize, compiler will optimize division/multiplication into a shift by a constant. + // For powers of 2, use a variable shift. + switch { + case et.size == 1: + lenmem = uintptr(old.len) + newlenmem = uintptr(cap) + capmem = roundupsize(uintptr(newcap)) + overflow = uintptr(newcap) > maxAlloc + newcap = int(capmem) + case et.size == goarch.PtrSize: + lenmem = uintptr(old.len) * goarch.PtrSize + newlenmem = uintptr(cap) * goarch.PtrSize + capmem = roundupsize(uintptr(newcap) * goarch.PtrSize) + overflow = uintptr(newcap) > maxAlloc/goarch.PtrSize + newcap = int(capmem / goarch.PtrSize) + case isPowerOfTwo(et.size): + var shift uintptr + if goarch.PtrSize == 8 { + // Mask shift for better code generation. + shift = uintptr(sys.Ctz64(uint64(et.size))) & 63 + } else { + shift = uintptr(sys.Ctz32(uint32(et.size))) & 31 + } + lenmem = uintptr(old.len) << shift + newlenmem = uintptr(cap) << shift + capmem = roundupsize(uintptr(newcap) << shift) + overflow = uintptr(newcap) > (maxAlloc >> shift) + newcap = int(capmem >> shift) + default: + lenmem = uintptr(old.len) * et.size + newlenmem = uintptr(cap) * et.size + capmem, overflow = math.MulUintptr(et.size, uintptr(newcap)) + capmem = roundupsize(capmem) + newcap = int(capmem / et.size) + } + + // The check of overflow in addition to capmem > maxAlloc is needed + // to prevent an overflow which can be used to trigger a segfault + // on 32bit architectures with this example program: + // + // type T [1<<27 + 1]int64 + // + // var d T + // var s []T + // + // func main() { + // s = append(s, d, d, d, d) + // print(len(s), "\n") + // } + if overflow || capmem > maxAlloc { + panic(errorString("growslice: cap out of range")) + } + + var p unsafe.Pointer + if et.ptrdata == 0 { + p = mallocgc(capmem, nil, false) + // The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length). + // Only clear the part that will not be overwritten. + memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem) + } else { + // Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory. + p = mallocgc(capmem, et, true) + if lenmem > 0 && writeBarrier.enabled { + // Only shade the pointers in old.array since we know the destination slice p + // only contains nil pointers because it has been cleared during alloc. + bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem-et.size+et.ptrdata) + } + } + memmove(p, old.array, lenmem) + + return slice{p, old.len, newcap} +} + +func isPowerOfTwo(x uintptr) bool { + return x&(x-1) == 0 +} + +// slicecopy is used to copy from a string or slice of pointerless elements into a slice. +func slicecopy(toPtr unsafe.Pointer, toLen int, fromPtr unsafe.Pointer, fromLen int, width uintptr) int { + if fromLen == 0 || toLen == 0 { + return 0 + } + + n := fromLen + if toLen < n { + n = toLen + } + + if width == 0 { + return n + } + + size := uintptr(n) * width + if raceenabled { + callerpc := getcallerpc() + pc := abi.FuncPCABIInternal(slicecopy) + racereadrangepc(fromPtr, size, callerpc, pc) + racewriterangepc(toPtr, size, callerpc, pc) + } + if msanenabled { + msanread(fromPtr, size) + msanwrite(toPtr, size) + } + if asanenabled { + asanread(fromPtr, size) + asanwrite(toPtr, size) + } + + if size == 1 { // common case worth about 2x to do here + // TODO: is this still worth it with new memmove impl? 
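+ // This branch is reached when a single one-byte element is copied, e.g.:
+ //
+ //	var dst [1]byte
+ //	copy(dst[:], "x") // n == 1 and width == 1, so size == 1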
+ *(*byte)(toPtr) = *(*byte)(fromPtr) // known to be a byte pointer + } else { + memmove(toPtr, fromPtr, size) + } + return n +} diff --git a/contrib/go/_std_1.18/src/runtime/softfloat64.go b/contrib/go/_std_1.18/src/runtime/softfloat64.go new file mode 100644 index 0000000000..42ef009297 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/softfloat64.go @@ -0,0 +1,627 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Software IEEE754 64-bit floating point. +// Only referred to (and thus linked in) by softfloat targets +// and by tests in this directory. + +package runtime + +const ( + mantbits64 uint = 52 + expbits64 uint = 11 + bias64 = -1<<(expbits64-1) + 1 + + nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload + inf64 uint64 = (1<<expbits64 - 1) << mantbits64 + neg64 uint64 = 1 << (expbits64 + mantbits64) + + mantbits32 uint = 23 + expbits32 uint = 8 + bias32 = -1<<(expbits32-1) + 1 + + nan32 uint32 = (1<<expbits32-1)<<mantbits32 + 1<<(mantbits32-1) // quiet NaN, 0 payload + inf32 uint32 = (1<<expbits32 - 1) << mantbits32 + neg32 uint32 = 1 << (expbits32 + mantbits32) +) + +func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool) { + sign = f & (1 << (mantbits64 + expbits64)) + mant = f & (1<<mantbits64 - 1) + exp = int(f>>mantbits64) & (1<<expbits64 - 1) + + switch exp { + case 1<<expbits64 - 1: + if mant != 0 { + nan = true + return + } + inf = true + return + + case 0: + // denormalized + if mant != 0 { + exp += bias64 + 1 + for mant < 1<<mantbits64 { + mant <<= 1 + exp-- + } + } + + default: + // add implicit top bit + mant |= 1 << mantbits64 + exp += bias64 + } + return +} + +func funpack32(f uint32) (sign, mant uint32, exp int, inf, nan bool) { + sign = f & (1 << (mantbits32 + expbits32)) + mant = f & (1<<mantbits32 - 1) + exp = int(f>>mantbits32) & (1<<expbits32 - 1) + + switch exp { + case 1<<expbits32 - 1: + if mant != 0 { + nan = true + return + } + inf = true + return + + case 0: + // denormalized + if mant != 0 { + exp += bias32 + 1 + for mant < 1<<mantbits32 { + mant <<= 1 + exp-- + } + } + + default: + // add implicit top bit + mant |= 1 << mantbits32 + exp += bias32 + } + return +} + +func fpack64(sign, mant uint64, exp int, trunc uint64) uint64 { + mant0, exp0, trunc0 := mant, exp, trunc + if mant == 0 { + return sign + } + for mant < 1<<mantbits64 { + mant <<= 1 + exp-- + } + for mant >= 4<<mantbits64 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant >= 2<<mantbits64 { + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + if mant >= 4<<mantbits64 { + mant >>= 1 + exp++ + } + } + mant >>= 1 + exp++ + } + if exp >= 1<<expbits64-1+bias64 { + return sign ^ inf64 + } + if exp < bias64+1 { + if exp < bias64-int(mantbits64) { + return sign | 0 + } + // repeat expecting denormal + mant, exp, trunc = mant0, exp0, trunc0 + for exp < bias64 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + } + mant >>= 1 + exp++ + if mant < 1<<mantbits64 { + return sign | mant + } + } + return sign | uint64(exp-bias64)<<mantbits64 | mant&(1<<mantbits64-1) +} + +func fpack32(sign, mant uint32, exp int, trunc uint32) uint32 { + mant0, exp0, trunc0 := mant, exp, trunc + if mant == 0 { + return sign + } + for mant < 1<<mantbits32 { + mant <<= 1 + exp-- + } + for mant >= 4<<mantbits32 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant >= 2<<mantbits32 { + 
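+ // Round to nearest, ties to even: mant&1 is the bit about to be shifted
+ // out, trunc holds the "sticky" bits already shifted out, and mant&2 is
+ // the bit that becomes the low-order mantissa bit after the shift below.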
if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + if mant >= 4<<mantbits32 { + mant >>= 1 + exp++ + } + } + mant >>= 1 + exp++ + } + if exp >= 1<<expbits32-1+bias32 { + return sign ^ inf32 + } + if exp < bias32+1 { + if exp < bias32-int(mantbits32) { + return sign | 0 + } + // repeat expecting denormal + mant, exp, trunc = mant0, exp0, trunc0 + for exp < bias32 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + } + mant >>= 1 + exp++ + if mant < 1<<mantbits32 { + return sign | mant + } + } + return sign | uint32(exp-bias32)<<mantbits32 | mant&(1<<mantbits32-1) +} + +func fadd64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special cases. + switch { + case fn || gn: // NaN + x or x + NaN = NaN + return nan64 + + case fi && gi && fs != gs: // +Inf + -Inf or -Inf + +Inf = NaN + return nan64 + + case fi: // ±Inf + g = ±Inf + return f + + case gi: // f + ±Inf = ±Inf + return g + + case fm == 0 && gm == 0 && fs != 0 && gs != 0: // -0 + -0 = -0 + return f + + case fm == 0: // 0 + g = g but 0 + -0 = +0 + if gm == 0 { + g ^= gs + } + return g + + case gm == 0: // f + 0 = f + return f + + } + + if fe < ge || fe == ge && fm < gm { + f, g, fs, fm, fe, gs, gm, ge = g, f, gs, gm, ge, fs, fm, fe + } + + shift := uint(fe - ge) + fm <<= 2 + gm <<= 2 + trunc := gm & (1<<shift - 1) + gm >>= shift + if fs == gs { + fm += gm + } else { + fm -= gm + if trunc != 0 { + fm-- + } + } + if fm == 0 { + fs = 0 + } + return fpack64(fs, fm, fe-2, trunc) +} + +func fsub64(f, g uint64) uint64 { + return fadd64(f, fneg64(g)) +} + +func fneg64(f uint64) uint64 { + return f ^ (1 << (mantbits64 + expbits64)) +} + +func fmul64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special cases. + switch { + case fn || gn: // NaN * g or f * NaN = NaN + return nan64 + + case fi && gi: // Inf * Inf = Inf (with sign adjusted) + return f ^ gs + + case fi && gm == 0, fm == 0 && gi: // 0 * Inf = Inf * 0 = NaN + return nan64 + + case fm == 0: // 0 * x = 0 (with sign adjusted) + return f ^ gs + + case gm == 0: // x * 0 = 0 (with sign adjusted) + return g ^ fs + } + + // 53-bit * 53-bit = 107- or 108-bit + lo, hi := mullu(fm, gm) + shift := mantbits64 - 1 + trunc := lo & (1<<shift - 1) + mant := hi<<(64-shift) | lo>>shift + return fpack64(fs^gs, mant, fe+ge-1, trunc) +} + +func fdiv64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special cases. + switch { + case fn || gn: // NaN / g = f / NaN = NaN + return nan64 + + case fi && gi: // ±Inf / ±Inf = NaN + return nan64 + + case !fi && !gi && fm == 0 && gm == 0: // 0 / 0 = NaN + return nan64 + + case fi, !gi && gm == 0: // Inf / g = f / 0 = Inf + return fs ^ gs ^ inf64 + + case gi, fm == 0: // f / Inf = 0 / g = Inf + return fs ^ gs ^ 0 + } + _, _, _, _ = fi, fn, gi, gn + + // 53-bit<<54 / 53-bit = 53- or 54-bit. 
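+ // For example, dividing the bit pattern of 1.0 (0x3FF0000000000000) by
+ // that of 2.0 (0x4000000000000000) yields 0x3FE0000000000000, the bit
+ // pattern of 0.5, once the quotient below is repacked by fpack64.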
+ shift := mantbits64 + 2 + q, r := divlu(fm>>(64-shift), fm<<shift, gm) + return fpack64(fs^gs, q, fe-ge-2, r) +} + +func f64to32(f uint64) uint32 { + fs, fm, fe, fi, fn := funpack64(f) + if fn { + return nan32 + } + fs32 := uint32(fs >> 32) + if fi { + return fs32 ^ inf32 + } + const d = mantbits64 - mantbits32 - 1 + return fpack32(fs32, uint32(fm>>d), fe-1, uint32(fm&(1<<d-1))) +} + +func f32to64(f uint32) uint64 { + const d = mantbits64 - mantbits32 + fs, fm, fe, fi, fn := funpack32(f) + if fn { + return nan64 + } + fs64 := uint64(fs) << 32 + if fi { + return fs64 ^ inf64 + } + return fpack64(fs64, uint64(fm)<<d, fe, 0) +} + +func fcmp64(f, g uint64) (cmp int32, isnan bool) { + fs, fm, _, fi, fn := funpack64(f) + gs, gm, _, gi, gn := funpack64(g) + + switch { + case fn, gn: // flag NaN + return 0, true + + case !fi && !gi && fm == 0 && gm == 0: // ±0 == ±0 + return 0, false + + case fs > gs: // f < 0, g > 0 + return -1, false + + case fs < gs: // f > 0, g < 0 + return +1, false + + // Same sign, not NaN. + // Can compare encodings directly now. + // Reverse for sign. + case fs == 0 && f < g, fs != 0 && f > g: + return -1, false + + case fs == 0 && f > g, fs != 0 && f < g: + return +1, false + } + + // f == g + return 0, false +} + +func f64toint(f uint64) (val int64, ok bool) { + fs, fm, fe, fi, fn := funpack64(f) + + switch { + case fi, fn: // NaN + return 0, false + + case fe < -1: // f < 0.5 + return 0, false + + case fe > 63: // f >= 2^63 + if fs != 0 && fm == 0 { // f == -2^63 + return -1 << 63, true + } + if fs != 0 { + return 0, false + } + return 0, false + } + + for fe > int(mantbits64) { + fe-- + fm <<= 1 + } + for fe < int(mantbits64) { + fe++ + fm >>= 1 + } + val = int64(fm) + if fs != 0 { + val = -val + } + return val, true +} + +func fintto64(val int64) (f uint64) { + fs := uint64(val) & (1 << 63) + mant := uint64(val) + if fs != 0 { + mant = -mant + } + return fpack64(fs, mant, int(mantbits64), 0) +} +func fintto32(val int64) (f uint32) { + fs := uint64(val) & (1 << 63) + mant := uint64(val) + if fs != 0 { + mant = -mant + } + // Reduce mantissa size until it fits into a uint32. + // Keep track of the bits we throw away, and if any are + // nonzero or them into the lowest bit. + exp := int(mantbits32) + var trunc uint32 + for mant >= 1<<32 { + trunc |= uint32(mant) & 1 + mant >>= 1 + exp++ + } + + return fpack32(uint32(fs>>32), uint32(mant), exp, trunc) +} + +// 64x64 -> 128 multiply. +// adapted from hacker's delight. +func mullu(u, v uint64) (lo, hi uint64) { + const ( + s = 32 + mask = 1<<s - 1 + ) + u0 := u & mask + u1 := u >> s + v0 := v & mask + v1 := v >> s + w0 := u0 * v0 + t := u1*v0 + w0>>s + w1 := t & mask + w2 := t >> s + w1 += u0 * v1 + return u * v, u1*v1 + w2 + w1>>s +} + +// 128/64 -> 64 quotient, 64 remainder. 
+// adapted from hacker's delight +func divlu(u1, u0, v uint64) (q, r uint64) { + const b = 1 << 32 + + if u1 >= v { + return 1<<64 - 1, 1<<64 - 1 + } + + // s = nlz(v); v <<= s + s := uint(0) + for v&(1<<63) == 0 { + s++ + v <<= 1 + } + + vn1 := v >> 32 + vn0 := v & (1<<32 - 1) + un32 := u1<<s | u0>>(64-s) + un10 := u0 << s + un1 := un10 >> 32 + un0 := un10 & (1<<32 - 1) + q1 := un32 / vn1 + rhat := un32 - q1*vn1 + +again1: + if q1 >= b || q1*vn0 > b*rhat+un1 { + q1-- + rhat += vn1 + if rhat < b { + goto again1 + } + } + + un21 := un32*b + un1 - q1*v + q0 := un21 / vn1 + rhat = un21 - q0*vn1 + +again2: + if q0 >= b || q0*vn0 > b*rhat+un0 { + q0-- + rhat += vn1 + if rhat < b { + goto again2 + } + } + + return q1*b + q0, (un21*b + un0 - q0*v) >> s +} + +func fadd32(x, y uint32) uint32 { + return f64to32(fadd64(f32to64(x), f32to64(y))) +} + +func fmul32(x, y uint32) uint32 { + return f64to32(fmul64(f32to64(x), f32to64(y))) +} + +func fdiv32(x, y uint32) uint32 { + // TODO: are there double-rounding problems here? See issue 48807. + return f64to32(fdiv64(f32to64(x), f32to64(y))) +} + +func feq32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp == 0 && !nan +} + +func fgt32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp >= 1 && !nan +} + +func fge32(x, y uint32) bool { + cmp, nan := fcmp64(f32to64(x), f32to64(y)) + return cmp >= 0 && !nan +} + +func feq64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp == 0 && !nan +} + +func fgt64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp >= 1 && !nan +} + +func fge64(x, y uint64) bool { + cmp, nan := fcmp64(x, y) + return cmp >= 0 && !nan +} + +func fint32to32(x int32) uint32 { + return fintto32(int64(x)) +} + +func fint32to64(x int32) uint64 { + return fintto64(int64(x)) +} + +func fint64to32(x int64) uint32 { + return fintto32(x) +} + +func fint64to64(x int64) uint64 { + return fintto64(x) +} + +func f32toint32(x uint32) int32 { + val, _ := f64toint(f32to64(x)) + return int32(val) +} + +func f32toint64(x uint32) int64 { + val, _ := f64toint(f32to64(x)) + return val +} + +func f64toint32(x uint64) int32 { + val, _ := f64toint(x) + return int32(val) +} + +func f64toint64(x uint64) int64 { + val, _ := f64toint(x) + return val +} + +func f64touint64(x uint64) uint64 { + var m uint64 = 0x43e0000000000000 // float64 1<<63 + if fgt64(m, x) { + return uint64(f64toint64(x)) + } + y := fadd64(x, -m) + z := uint64(f64toint64(y)) + return z | (1 << 63) +} + +func f32touint64(x uint32) uint64 { + var m uint32 = 0x5f000000 // float32 1<<63 + if fgt32(m, x) { + return uint64(f32toint64(x)) + } + y := fadd32(x, -m) + z := uint64(f32toint64(y)) + return z | (1 << 63) +} + +func fuint64to64(x uint64) uint64 { + if int64(x) >= 0 { + return fint64to64(int64(x)) + } + // See ../cmd/compile/internal/ssagen/ssa.go:uint64Tofloat + y := x & 1 + z := x >> 1 + z = z | y + r := fint64to64(int64(z)) + return fadd64(r, r) +} + +func fuint64to32(x uint64) uint32 { + if int64(x) >= 0 { + return fint64to32(int64(x)) + } + // See ../cmd/compile/internal/ssagen/ssa.go:uint64Tofloat + y := x & 1 + z := x >> 1 + z = z | y + r := fint64to32(int64(z)) + return fadd32(r, r) +} diff --git a/contrib/go/_std_1.18/src/runtime/stack.go b/contrib/go/_std_1.18/src/runtime/stack.go new file mode 100644 index 0000000000..edc37d4878 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stack.go @@ -0,0 +1,1434 @@ +// Copyright 2013 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/cpu" + "internal/goarch" + "internal/goos" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +/* +Stack layout parameters. +Included both by runtime (compiled via 6c) and linkers (compiled via gcc). + +The per-goroutine g->stackguard is set to point StackGuard bytes +above the bottom of the stack. Each function compares its stack +pointer against g->stackguard to check for overflow. To cut one +instruction from the check sequence for functions with tiny frames, +the stack is allowed to protrude StackSmall bytes below the stack +guard. Functions with large frames don't bother with the check and +always call morestack. The sequences are (for amd64, others are +similar): + + guard = g->stackguard + frame = function's stack frame size + argsize = size of function arguments (call + return) + + stack frame size <= StackSmall: + CMPQ guard, SP + JHI 3(PC) + MOVQ m->morearg, $(argsize << 32) + CALL morestack(SB) + + stack frame size > StackSmall but < StackBig + LEAQ (frame-StackSmall)(SP), R0 + CMPQ guard, R0 + JHI 3(PC) + MOVQ m->morearg, $(argsize << 32) + CALL morestack(SB) + + stack frame size >= StackBig: + MOVQ m->morearg, $((argsize << 32) | frame) + CALL morestack(SB) + +The bottom StackGuard - StackSmall bytes are important: there has +to be enough room to execute functions that refuse to check for +stack overflow, either because they need to be adjacent to the +actual caller's frame (deferproc) or because they handle the imminent +stack overflow (morestack). + +For example, deferproc might call malloc, which does one of the +above checks (without allocating a full frame), which might trigger +a call to morestack. This sequence needs to fit in the bottom +section of the stack. On amd64, morestack's frame is 40 bytes, and +deferproc's frame is 56 bytes. That fits well within the +StackGuard - StackSmall bytes at the bottom. +The linkers explore all possible call traces involving non-splitting +functions to make sure that this limit cannot be violated. +*/ + +const ( + // StackSystem is a number of additional bytes to add + // to each stack below the usual guard area for OS-specific + // purposes like signal handling. Used on Windows, Plan 9, + // and iOS because they do not use a separate stack. + _StackSystem = goos.IsWindows*512*goarch.PtrSize + goos.IsPlan9*512 + goos.IsIos*goarch.IsArm64*1024 + + // The minimum size of stack used by Go code + _StackMin = 2048 + + // The minimum stack size to allocate. + // The hackery here rounds FixedStack0 up to a power of 2. + _FixedStack0 = _StackMin + _StackSystem + _FixedStack1 = _FixedStack0 - 1 + _FixedStack2 = _FixedStack1 | (_FixedStack1 >> 1) + _FixedStack3 = _FixedStack2 | (_FixedStack2 >> 2) + _FixedStack4 = _FixedStack3 | (_FixedStack3 >> 4) + _FixedStack5 = _FixedStack4 | (_FixedStack4 >> 8) + _FixedStack6 = _FixedStack5 | (_FixedStack5 >> 16) + _FixedStack = _FixedStack6 + 1 + + // Functions that need frames bigger than this use an extra + // instruction to do the stack split check, to avoid overflow + // in case SP - framesize wraps below zero. + // This value can be no bigger than the size of the unmapped + // space at zero. + _StackBig = 4096 + + // The stack guard is a pointer this many bytes above the + // bottom of the stack. 
+ // + // The guard leaves enough room for one _StackSmall frame plus + // a _StackLimit chain of NOSPLIT calls plus _StackSystem + // bytes for the OS. + _StackGuard = 928*sys.StackGuardMultiplier + _StackSystem + + // After a stack split check the SP is allowed to be this + // many bytes below the stack guard. This saves an instruction + // in the checking sequence for tiny frames. + _StackSmall = 128 + + // The maximum number of bytes that a chain of NOSPLIT + // functions can use. + _StackLimit = _StackGuard - _StackSystem - _StackSmall +) + +const ( + // stackDebug == 0: no logging + // == 1: logging of per-stack operations + // == 2: logging of per-frame operations + // == 3: logging of per-word updates + // == 4: logging of per-word reads + stackDebug = 0 + stackFromSystem = 0 // allocate stacks from system memory instead of the heap + stackFaultOnFree = 0 // old stacks are mapped noaccess to detect use after free + stackPoisonCopy = 0 // fill stack that should not be accessed with garbage, to detect bad dereferences during copy + stackNoCache = 0 // disable per-P small stack caches + + // check the BP links during traceback. + debugCheckBP = false +) + +const ( + uintptrMask = 1<<(8*goarch.PtrSize) - 1 + + // The values below can be stored to g.stackguard0 to force + // the next stack check to fail. + // These are all larger than any real SP. + + // Goroutine preemption request. + // 0xfffffade in hex. + stackPreempt = uintptrMask & -1314 + + // Thread is forking. Causes a split stack check failure. + // 0xfffffb2e in hex. + stackFork = uintptrMask & -1234 + + // Force a stack movement. Used for debugging. + // 0xfffffeed in hex. + stackForceMove = uintptrMask & -275 + + // stackPoisonMin is the lowest allowed stack poison value. + stackPoisonMin = uintptrMask & -4096 +) + +// Global pool of spans that have free stacks. +// Stacks are assigned an order according to size. +// order = log_2(size/FixedStack) +// There is a free list for each order. +var stackpool [_NumStackOrders]struct { + item stackpoolItem + _ [cpu.CacheLinePadSize - unsafe.Sizeof(stackpoolItem{})%cpu.CacheLinePadSize]byte +} + +//go:notinheap +type stackpoolItem struct { + mu mutex + span mSpanList +} + +// Global pool of large stack spans. +var stackLarge struct { + lock mutex + free [heapAddrBits - pageShift]mSpanList // free lists by log_2(s.npages) +} + +func stackinit() { + if _StackCacheSize&_PageMask != 0 { + throw("cache size must be a multiple of page size") + } + for i := range stackpool { + stackpool[i].item.span.init() + lockInit(&stackpool[i].item.mu, lockRankStackpool) + } + for i := range stackLarge.free { + stackLarge.free[i].init() + lockInit(&stackLarge.lock, lockRankStackLarge) + } +} + +// stacklog2 returns ⌊log_2(n)⌋. +func stacklog2(n uintptr) int { + log2 := 0 + for n > 1 { + n >>= 1 + log2++ + } + return log2 +} + +// Allocates a stack from the free pool. Must be called with +// stackpool[order].item.mu held. +func stackpoolalloc(order uint8) gclinkptr { + list := &stackpool[order].item.span + s := list.first + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) + if s == nil { + // no free stacks. Allocate another span worth. 
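+ // For example, on linux/amd64 _StackCacheSize is 32 KiB and _FixedStack
+ // is 2 KiB, so the span allocated below covers 4 pages and is carved
+ // into _StackCacheSize/(_FixedStack<<order) stacks: 16 for order 0,
+ // 4 for order 2.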
+ s = mheap_.allocManual(_StackCacheSize>>_PageShift, spanAllocStack) + if s == nil { + throw("out of memory") + } + if s.allocCount != 0 { + throw("bad allocCount") + } + if s.manualFreeList.ptr() != nil { + throw("bad manualFreeList") + } + osStackAlloc(s) + s.elemsize = _FixedStack << order + for i := uintptr(0); i < _StackCacheSize; i += s.elemsize { + x := gclinkptr(s.base() + i) + x.ptr().next = s.manualFreeList + s.manualFreeList = x + } + list.insert(s) + } + x := s.manualFreeList + if x.ptr() == nil { + throw("span has no free stacks") + } + s.manualFreeList = x.ptr().next + s.allocCount++ + if s.manualFreeList.ptr() == nil { + // all stacks in s are allocated. + list.remove(s) + } + return x +} + +// Adds stack x to the free pool. Must be called with stackpool[order].item.mu held. +func stackpoolfree(x gclinkptr, order uint8) { + s := spanOfUnchecked(uintptr(x)) + if s.state.get() != mSpanManual { + throw("freeing stack not in a stack span") + } + if s.manualFreeList.ptr() == nil { + // s will now have a free stack + stackpool[order].item.span.insert(s) + } + x.ptr().next = s.manualFreeList + s.manualFreeList = x + s.allocCount-- + if gcphase == _GCoff && s.allocCount == 0 { + // Span is completely free. Return it to the heap + // immediately if we're sweeping. + // + // If GC is active, we delay the free until the end of + // GC to avoid the following type of situation: + // + // 1) GC starts, scans a SudoG but does not yet mark the SudoG.elem pointer + // 2) The stack that pointer points to is copied + // 3) The old stack is freed + // 4) The containing span is marked free + // 5) GC attempts to mark the SudoG.elem pointer. The + // marking fails because the pointer looks like a + // pointer into a free span. + // + // By not freeing, we prevent step #4 until GC is done. + stackpool[order].item.span.remove(s) + s.manualFreeList = 0 + osStackFree(s) + mheap_.freeManual(s, spanAllocStack) + } +} + +// stackcacherefill/stackcacherelease implement a global pool of stack segments. +// The pool is required to prevent unlimited growth of per-thread caches. +// +//go:systemstack +func stackcacherefill(c *mcache, order uint8) { + if stackDebug >= 1 { + print("stackcacherefill order=", order, "\n") + } + + // Grab some stacks from the global cache. + // Grab half of the allowed capacity (to prevent thrashing). 
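+ // With _StackCacheSize at 32 KiB, one refill moves 16 KiB of stacks into
+ // the per-P cache: e.g. 8 order-0 (2 KiB) stacks or 2 order-2 (8 KiB)
+ // stacks.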
+ var list gclinkptr + var size uintptr + lock(&stackpool[order].item.mu) + for size < _StackCacheSize/2 { + x := stackpoolalloc(order) + x.ptr().next = list + list = x + size += _FixedStack << order + } + unlock(&stackpool[order].item.mu) + c.stackcache[order].list = list + c.stackcache[order].size = size +} + +//go:systemstack +func stackcacherelease(c *mcache, order uint8) { + if stackDebug >= 1 { + print("stackcacherelease order=", order, "\n") + } + x := c.stackcache[order].list + size := c.stackcache[order].size + lock(&stackpool[order].item.mu) + for size > _StackCacheSize/2 { + y := x.ptr().next + stackpoolfree(x, order) + x = y + size -= _FixedStack << order + } + unlock(&stackpool[order].item.mu) + c.stackcache[order].list = x + c.stackcache[order].size = size +} + +//go:systemstack +func stackcache_clear(c *mcache) { + if stackDebug >= 1 { + print("stackcache clear\n") + } + for order := uint8(0); order < _NumStackOrders; order++ { + lock(&stackpool[order].item.mu) + x := c.stackcache[order].list + for x.ptr() != nil { + y := x.ptr().next + stackpoolfree(x, order) + x = y + } + c.stackcache[order].list = 0 + c.stackcache[order].size = 0 + unlock(&stackpool[order].item.mu) + } +} + +// stackalloc allocates an n byte stack. +// +// stackalloc must run on the system stack because it uses per-P +// resources and must not split the stack. +// +//go:systemstack +func stackalloc(n uint32) stack { + // Stackalloc must be called on scheduler stack, so that we + // never try to grow the stack during the code that stackalloc runs. + // Doing so would cause a deadlock (issue 1547). + thisg := getg() + if thisg != thisg.m.g0 { + throw("stackalloc not on scheduler stack") + } + if n&(n-1) != 0 { + throw("stack size not a power of 2") + } + if stackDebug >= 1 { + print("stackalloc ", n, "\n") + } + + if debug.efence != 0 || stackFromSystem != 0 { + n = uint32(alignUp(uintptr(n), physPageSize)) + v := sysAlloc(uintptr(n), &memstats.stacks_sys) + if v == nil { + throw("out of memory (stackalloc)") + } + return stack{uintptr(v), uintptr(v) + uintptr(n)} + } + + // Small stacks are allocated with a fixed-size free-list allocator. + // If we need a stack of a bigger size, we fall back on allocating + // a dedicated span. + var v unsafe.Pointer + if n < _FixedStack<<_NumStackOrders && n < _StackCacheSize { + order := uint8(0) + n2 := n + for n2 > _FixedStack { + order++ + n2 >>= 1 + } + var x gclinkptr + if stackNoCache != 0 || thisg.m.p == 0 || thisg.m.preemptoff != "" { + // thisg.m.p == 0 can happen in the guts of exitsyscall + // or procresize. Just get a stack from the global pool. + // Also don't touch stackcache during gc + // as it's flushed concurrently. + lock(&stackpool[order].item.mu) + x = stackpoolalloc(order) + unlock(&stackpool[order].item.mu) + } else { + c := thisg.m.p.ptr().mcache + x = c.stackcache[order].list + if x.ptr() == nil { + stackcacherefill(c, order) + x = c.stackcache[order].list + } + c.stackcache[order].list = x.ptr().next + c.stackcache[order].size -= uintptr(n) + } + v = unsafe.Pointer(x) + } else { + var s *mspan + npage := uintptr(n) >> _PageShift + log2npage := stacklog2(npage) + + // Try to get a stack from the large stack cache. + lock(&stackLarge.lock) + if !stackLarge.free[log2npage].isEmpty() { + s = stackLarge.free[log2npage].first + stackLarge.free[log2npage].remove(s) + } + unlock(&stackLarge.lock) + + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) + + if s == nil { + // Allocate a new stack from the heap. 
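+ // For example, a 64 KiB stack on linux/amd64 is too large for the
+ // fixed-size pools (n >= _FixedStack<<_NumStackOrders == 32 KiB), so it
+ // reaches this point with npage == 8 and log2npage == 3 whenever the
+ // large-stack cache had no span of that size.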
+ s = mheap_.allocManual(npage, spanAllocStack) + if s == nil { + throw("out of memory") + } + osStackAlloc(s) + s.elemsize = uintptr(n) + } + v = unsafe.Pointer(s.base()) + } + + if raceenabled { + racemalloc(v, uintptr(n)) + } + if msanenabled { + msanmalloc(v, uintptr(n)) + } + if asanenabled { + asanunpoison(v, uintptr(n)) + } + if stackDebug >= 1 { + print(" allocated ", v, "\n") + } + return stack{uintptr(v), uintptr(v) + uintptr(n)} +} + +// stackfree frees an n byte stack allocation at stk. +// +// stackfree must run on the system stack because it uses per-P +// resources and must not split the stack. +// +//go:systemstack +func stackfree(stk stack) { + gp := getg() + v := unsafe.Pointer(stk.lo) + n := stk.hi - stk.lo + if n&(n-1) != 0 { + throw("stack not a power of 2") + } + if stk.lo+n < stk.hi { + throw("bad stack size") + } + if stackDebug >= 1 { + println("stackfree", v, n) + memclrNoHeapPointers(v, n) // for testing, clobber stack data + } + if debug.efence != 0 || stackFromSystem != 0 { + if debug.efence != 0 || stackFaultOnFree != 0 { + sysFault(v, n) + } else { + sysFree(v, n, &memstats.stacks_sys) + } + return + } + if msanenabled { + msanfree(v, n) + } + if asanenabled { + asanpoison(v, n) + } + if n < _FixedStack<<_NumStackOrders && n < _StackCacheSize { + order := uint8(0) + n2 := n + for n2 > _FixedStack { + order++ + n2 >>= 1 + } + x := gclinkptr(v) + if stackNoCache != 0 || gp.m.p == 0 || gp.m.preemptoff != "" { + lock(&stackpool[order].item.mu) + stackpoolfree(x, order) + unlock(&stackpool[order].item.mu) + } else { + c := gp.m.p.ptr().mcache + if c.stackcache[order].size >= _StackCacheSize { + stackcacherelease(c, order) + } + x.ptr().next = c.stackcache[order].list + c.stackcache[order].list = x + c.stackcache[order].size += n + } + } else { + s := spanOfUnchecked(uintptr(v)) + if s.state.get() != mSpanManual { + println(hex(s.base()), v) + throw("bad span state") + } + if gcphase == _GCoff { + // Free the stack immediately if we're + // sweeping. + osStackFree(s) + mheap_.freeManual(s, spanAllocStack) + } else { + // If the GC is running, we can't return a + // stack span to the heap because it could be + // reused as a heap span, and this state + // change would race with GC. Add it to the + // large stack cache instead. + log2npage := stacklog2(s.npages) + lock(&stackLarge.lock) + stackLarge.free[log2npage].insert(s) + unlock(&stackLarge.lock) + } + } +} + +var maxstacksize uintptr = 1 << 20 // enough until runtime.main sets it for real + +var maxstackceiling = maxstacksize + +var ptrnames = []string{ + 0: "scalar", + 1: "ptr", +} + +// Stack frame layout +// +// (x86) +// +------------------+ +// | args from caller | +// +------------------+ <- frame->argp +// | return address | +// +------------------+ +// | caller's BP (*) | (*) if framepointer_enabled && varp < sp +// +------------------+ <- frame->varp +// | locals | +// +------------------+ +// | args to callee | +// +------------------+ <- frame->sp +// +// (arm) +// +------------------+ +// | args from caller | +// +------------------+ <- frame->argp +// | caller's retaddr | +// +------------------+ <- frame->varp +// | locals | +// +------------------+ +// | args to callee | +// +------------------+ +// | return address | +// +------------------+ <- frame->sp + +type adjustinfo struct { + old stack + delta uintptr // ptr distance from old to new stack (newbase - oldbase) + cache pcvalueCache + + // sghi is the highest sudog.elem on the stack. 
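+ // (A sudog's elem field points to the stack slot that a blocked channel
+ // operation will fill or read, so it may point into the old stack; see
+ // syncadjustsudogs below.)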
+ sghi uintptr +} + +// Adjustpointer checks whether *vpp is in the old stack described by adjinfo. +// If so, it rewrites *vpp to point into the new stack. +func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) { + pp := (*uintptr)(vpp) + p := *pp + if stackDebug >= 4 { + print(" ", pp, ":", hex(p), "\n") + } + if adjinfo.old.lo <= p && p < adjinfo.old.hi { + *pp = p + adjinfo.delta + if stackDebug >= 3 { + print(" adjust ptr ", pp, ":", hex(p), " -> ", hex(*pp), "\n") + } + } +} + +// Information from the compiler about the layout of stack frames. +// Note: this type must agree with reflect.bitVector. +type bitvector struct { + n int32 // # of bits + bytedata *uint8 +} + +// ptrbit returns the i'th bit in bv. +// ptrbit is less efficient than iterating directly over bitvector bits, +// and should only be used in non-performance-critical code. +// See adjustpointers for an example of a high-efficiency walk of a bitvector. +func (bv *bitvector) ptrbit(i uintptr) uint8 { + b := *(addb(bv.bytedata, i/8)) + return (b >> (i % 8)) & 1 +} + +// bv describes the memory starting at address scanp. +// Adjust any pointers contained therein. +func adjustpointers(scanp unsafe.Pointer, bv *bitvector, adjinfo *adjustinfo, f funcInfo) { + minp := adjinfo.old.lo + maxp := adjinfo.old.hi + delta := adjinfo.delta + num := uintptr(bv.n) + // If this frame might contain channel receive slots, use CAS + // to adjust pointers. If the slot hasn't been received into + // yet, it may contain stack pointers and a concurrent send + // could race with adjusting those pointers. (The sent value + // itself can never contain stack pointers.) + useCAS := uintptr(scanp) < adjinfo.sghi + for i := uintptr(0); i < num; i += 8 { + if stackDebug >= 4 { + for j := uintptr(0); j < 8; j++ { + print(" ", add(scanp, (i+j)*goarch.PtrSize), ":", ptrnames[bv.ptrbit(i+j)], ":", hex(*(*uintptr)(add(scanp, (i+j)*goarch.PtrSize))), " # ", i, " ", *addb(bv.bytedata, i/8), "\n") + } + } + b := *(addb(bv.bytedata, i/8)) + for b != 0 { + j := uintptr(sys.Ctz8(b)) + b &= b - 1 + pp := (*uintptr)(add(scanp, (i+j)*goarch.PtrSize)) + retry: + p := *pp + if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 { + // Looks like a junk value in a pointer slot. + // Live analysis wrong? + getg().m.traceback = 2 + print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n") + throw("invalid pointer found on stack") + } + if minp <= p && p < maxp { + if stackDebug >= 3 { + print("adjust ptr ", hex(p), " ", funcname(f), "\n") + } + if useCAS { + ppu := (*unsafe.Pointer)(unsafe.Pointer(pp)) + if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) { + goto retry + } + } else { + *pp = p + delta + } + } + } + } +} + +// Note: the argument/return area is adjusted by the callee. +func adjustframe(frame *stkframe, arg unsafe.Pointer) bool { + adjinfo := (*adjustinfo)(arg) + if frame.continpc == 0 { + // Frame is dead. + return true + } + f := frame.fn + if stackDebug >= 2 { + print(" adjusting ", funcname(f), " frame=[", hex(frame.sp), ",", hex(frame.fp), "] pc=", hex(frame.pc), " continpc=", hex(frame.continpc), "\n") + } + if f.funcID == funcID_systemstack_switch { + // A special routine at the bottom of stack of a goroutine that does a systemstack call. + // We will allow it to be copied even though we don't + // have full GC info for it (because it is written in asm). 
+ return true + } + + locals, args, objs := getStackMap(frame, &adjinfo.cache, true) + + // Adjust local variables if stack frame has been allocated. + if locals.n > 0 { + size := uintptr(locals.n) * goarch.PtrSize + adjustpointers(unsafe.Pointer(frame.varp-size), &locals, adjinfo, f) + } + + // Adjust saved base pointer if there is one. + // TODO what about arm64 frame pointer adjustment? + if goarch.ArchFamily == goarch.AMD64 && frame.argp-frame.varp == 2*goarch.PtrSize { + if stackDebug >= 3 { + print(" saved bp\n") + } + if debugCheckBP { + // Frame pointers should always point to the next higher frame on + // the Go stack (or be nil, for the top frame on the stack). + bp := *(*uintptr)(unsafe.Pointer(frame.varp)) + if bp != 0 && (bp < adjinfo.old.lo || bp >= adjinfo.old.hi) { + println("runtime: found invalid frame pointer") + print("bp=", hex(bp), " min=", hex(adjinfo.old.lo), " max=", hex(adjinfo.old.hi), "\n") + throw("bad frame pointer") + } + } + adjustpointer(adjinfo, unsafe.Pointer(frame.varp)) + } + + // Adjust arguments. + if args.n > 0 { + if stackDebug >= 3 { + print(" args\n") + } + adjustpointers(unsafe.Pointer(frame.argp), &args, adjinfo, funcInfo{}) + } + + // Adjust pointers in all stack objects (whether they are live or not). + // See comments in mgcmark.go:scanframeworker. + if frame.varp != 0 { + for i := range objs { + obj := &objs[i] + off := obj.off + base := frame.varp // locals base pointer + if off >= 0 { + base = frame.argp // arguments and return values base pointer + } + p := base + uintptr(off) + if p < frame.sp { + // Object hasn't been allocated in the frame yet. + // (Happens when the stack bounds check fails and + // we call into morestack.) + continue + } + ptrdata := obj.ptrdata() + gcdata := obj.gcdata() + var s *mspan + if obj.useGCProg() { + // See comments in mgcmark.go:scanstack + s = materializeGCProg(ptrdata, gcdata) + gcdata = (*byte)(unsafe.Pointer(s.startAddr)) + } + for i := uintptr(0); i < ptrdata; i += goarch.PtrSize { + if *addb(gcdata, i/(8*goarch.PtrSize))>>(i/goarch.PtrSize&7)&1 != 0 { + adjustpointer(adjinfo, unsafe.Pointer(p+i)) + } + } + if s != nil { + dematerializeGCProg(s) + } + } + } + + return true +} + +func adjustctxt(gp *g, adjinfo *adjustinfo) { + adjustpointer(adjinfo, unsafe.Pointer(&gp.sched.ctxt)) + if !framepointer_enabled { + return + } + if debugCheckBP { + bp := gp.sched.bp + if bp != 0 && (bp < adjinfo.old.lo || bp >= adjinfo.old.hi) { + println("runtime: found invalid top frame pointer") + print("bp=", hex(bp), " min=", hex(adjinfo.old.lo), " max=", hex(adjinfo.old.hi), "\n") + throw("bad top frame pointer") + } + } + adjustpointer(adjinfo, unsafe.Pointer(&gp.sched.bp)) +} + +func adjustdefers(gp *g, adjinfo *adjustinfo) { + // Adjust pointers in the Defer structs. + // We need to do this first because we need to adjust the + // defer.link fields so we always work on the new stack. + adjustpointer(adjinfo, unsafe.Pointer(&gp._defer)) + for d := gp._defer; d != nil; d = d.link { + adjustpointer(adjinfo, unsafe.Pointer(&d.fn)) + adjustpointer(adjinfo, unsafe.Pointer(&d.sp)) + adjustpointer(adjinfo, unsafe.Pointer(&d._panic)) + adjustpointer(adjinfo, unsafe.Pointer(&d.link)) + adjustpointer(adjinfo, unsafe.Pointer(&d.varp)) + adjustpointer(adjinfo, unsafe.Pointer(&d.fd)) + } +} + +func adjustpanics(gp *g, adjinfo *adjustinfo) { + // Panics are on stack and already adjusted. + // Update pointer to head of list in G. 
+ adjustpointer(adjinfo, unsafe.Pointer(&gp._panic)) +} + +func adjustsudogs(gp *g, adjinfo *adjustinfo) { + // the data elements pointed to by a SudoG structure + // might be in the stack. + for s := gp.waiting; s != nil; s = s.waitlink { + adjustpointer(adjinfo, unsafe.Pointer(&s.elem)) + } +} + +func fillstack(stk stack, b byte) { + for p := stk.lo; p < stk.hi; p++ { + *(*byte)(unsafe.Pointer(p)) = b + } +} + +func findsghi(gp *g, stk stack) uintptr { + var sghi uintptr + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + p := uintptr(sg.elem) + uintptr(sg.c.elemsize) + if stk.lo <= p && p < stk.hi && p > sghi { + sghi = p + } + } + return sghi +} + +// syncadjustsudogs adjusts gp's sudogs and copies the part of gp's +// stack they refer to while synchronizing with concurrent channel +// operations. It returns the number of bytes of stack copied. +func syncadjustsudogs(gp *g, used uintptr, adjinfo *adjustinfo) uintptr { + if gp.waiting == nil { + return 0 + } + + // Lock channels to prevent concurrent send/receive. + var lastc *hchan + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + if sg.c != lastc { + // There is a ranking cycle here between gscan bit and + // hchan locks. Normally, we only allow acquiring hchan + // locks and then getting a gscan bit. In this case, we + // already have the gscan bit. We allow acquiring hchan + // locks here as a special case, since a deadlock can't + // happen because the G involved must already be + // suspended. So, we get a special hchan lock rank here + // that is lower than gscan, but doesn't allow acquiring + // any other locks other than hchan. + lockWithRank(&sg.c.lock, lockRankHchanLeaf) + } + lastc = sg.c + } + + // Adjust sudogs. + adjustsudogs(gp, adjinfo) + + // Copy the part of the stack the sudogs point in to + // while holding the lock to prevent races on + // send/receive slots. + var sgsize uintptr + if adjinfo.sghi != 0 { + oldBot := adjinfo.old.hi - used + newBot := oldBot + adjinfo.delta + sgsize = adjinfo.sghi - oldBot + memmove(unsafe.Pointer(newBot), unsafe.Pointer(oldBot), sgsize) + } + + // Unlock channels. + lastc = nil + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + if sg.c != lastc { + unlock(&sg.c.lock) + } + lastc = sg.c + } + + return sgsize +} + +// Copies gp's stack to a new stack of a different size. +// Caller must have changed gp status to Gcopystack. +func copystack(gp *g, newsize uintptr) { + if gp.syscallsp != 0 { + throw("stack growth not allowed in system call") + } + old := gp.stack + if old.lo == 0 { + throw("nil stackbase") + } + used := old.hi - gp.sched.sp + // Add just the difference to gcController.addScannableStack. + // g0 stacks never move, so this will never account for them. + // It's also fine if we have no P, addScannableStack can deal with + // that case. + gcController.addScannableStack(getg().m.p.ptr(), int64(newsize)-int64(old.hi-old.lo)) + + // allocate new stack + new := stackalloc(uint32(newsize)) + if stackPoisonCopy != 0 { + fillstack(new, 0xfd) + } + if stackDebug >= 1 { + print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]", " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n") + } + + // Compute adjustment. + var adjinfo adjustinfo + adjinfo.old = old + adjinfo.delta = new.hi - old.hi + + // Adjust sudogs, synchronizing with channel ops if necessary. 
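	// (Editorial illustration with made-up addresses, not part of the source:
	// growing a stack from old=[0x1000,0x3000) to new=[0xa000,0xe000) gives
	// delta = 0xe000-0x3000 = 0xb000, so a stack pointer 0x2ff0 sitting 0x10
	// below the old top is rewritten by adjustpointer to 0x2ff0+0xb000 =
	// 0xdff0, the same distance below the new top.)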
+ ncopy := used + if !gp.activeStackChans { + if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 { + // It's not safe for someone to shrink this stack while we're actively + // parking on a channel, but it is safe to grow since we do that + // ourselves and explicitly don't want to synchronize with channels + // since we could self-deadlock. + throw("racy sudog adjustment due to parking on channel") + } + adjustsudogs(gp, &adjinfo) + } else { + // sudogs may be pointing in to the stack and gp has + // released channel locks, so other goroutines could + // be writing to gp's stack. Find the highest such + // pointer so we can handle everything there and below + // carefully. (This shouldn't be far from the bottom + // of the stack, so there's little cost in handling + // everything below it carefully.) + adjinfo.sghi = findsghi(gp, old) + + // Synchronize with channel ops and copy the part of + // the stack they may interact with. + ncopy -= syncadjustsudogs(gp, used, &adjinfo) + } + + // Copy the stack (or the rest of it) to the new location + memmove(unsafe.Pointer(new.hi-ncopy), unsafe.Pointer(old.hi-ncopy), ncopy) + + // Adjust remaining structures that have pointers into stacks. + // We have to do most of these before we traceback the new + // stack because gentraceback uses them. + adjustctxt(gp, &adjinfo) + adjustdefers(gp, &adjinfo) + adjustpanics(gp, &adjinfo) + if adjinfo.sghi != 0 { + adjinfo.sghi += adjinfo.delta + } + + // Swap out old stack for new one + gp.stack = new + gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request + gp.sched.sp = new.hi - used + gp.stktopsp += adjinfo.delta + + // Adjust pointers in the new stack. + gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0) + + // free old stack + if stackPoisonCopy != 0 { + fillstack(old, 0xfc) + } + stackfree(old) +} + +// round x up to a power of 2. +func round2(x int32) int32 { + s := uint(0) + for 1<<s < x { + s++ + } + return 1 << s +} + +// Called from runtime·morestack when more stack is needed. +// Allocate larger stack and relocate to new stack. +// Stack growth is multiplicative, for constant amortized cost. +// +// g->atomicstatus will be Grunning or Gscanrunning upon entry. +// If the scheduler is trying to stop this g, then it will set preemptStop. +// +// This must be nowritebarrierrec because it can be called as part of +// stack growth from other nowritebarrierrec functions, but the +// compiler doesn't check this. +// +//go:nowritebarrierrec +func newstack() { + thisg := getg() + // TODO: double check all gp. shouldn't be getg(). + if thisg.m.morebuf.g.ptr().stackguard0 == stackFork { + throw("stack growth after fork") + } + if thisg.m.morebuf.g.ptr() != thisg.m.curg { + print("runtime: newstack called from g=", hex(thisg.m.morebuf.g), "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n") + morebuf := thisg.m.morebuf + traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g.ptr()) + throw("runtime: wrong goroutine in newstack") + } + + gp := thisg.m.curg + + if thisg.m.curg.throwsplit { + // Update syscallsp, syscallpc in case traceback uses them. 
+ morebuf := thisg.m.morebuf + gp.syscallsp = morebuf.sp + gp.syscallpc = morebuf.pc + pcname, pcoff := "(unknown)", uintptr(0) + f := findfunc(gp.sched.pc) + if f.valid() { + pcname = funcname(f) + pcoff = gp.sched.pc - f.entry() + } + print("runtime: newstack at ", pcname, "+", hex(pcoff), + " sp=", hex(gp.sched.sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n", + "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n", + "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n") + + thisg.m.traceback = 2 // Include runtime frames + traceback(morebuf.pc, morebuf.sp, morebuf.lr, gp) + throw("runtime: stack split at bad time") + } + + morebuf := thisg.m.morebuf + thisg.m.morebuf.pc = 0 + thisg.m.morebuf.lr = 0 + thisg.m.morebuf.sp = 0 + thisg.m.morebuf.g = 0 + + // NOTE: stackguard0 may change underfoot, if another thread + // is about to try to preempt gp. Read it just once and use that same + // value now and below. + stackguard0 := atomic.Loaduintptr(&gp.stackguard0) + + // Be conservative about where we preempt. + // We are interested in preempting user Go code, not runtime code. + // If we're holding locks, mallocing, or preemption is disabled, don't + // preempt. + // This check is very early in newstack so that even the status change + // from Grunning to Gwaiting and back doesn't happen in this case. + // That status change by itself can be viewed as a small preemption, + // because the GC might change Gwaiting to Gscanwaiting, and then + // this goroutine has to wait for the GC to finish before continuing. + // If the GC is in some way dependent on this goroutine (for example, + // it needs a lock held by the goroutine), that small preemption turns + // into a real deadlock. + preempt := stackguard0 == stackPreempt + if preempt { + if !canPreemptM(thisg.m) { + // Let the goroutine keep running for now. + // gp->preempt is set, so it will be preempted next time. + gp.stackguard0 = gp.stack.lo + _StackGuard + gogo(&gp.sched) // never return + } + } + + if gp.stack.lo == 0 { + throw("missing stack in newstack") + } + sp := gp.sched.sp + if goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.I386 || goarch.ArchFamily == goarch.WASM { + // The call to morestack cost a word. + sp -= goarch.PtrSize + } + if stackDebug >= 1 || sp < gp.stack.lo { + print("runtime: newstack sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n", + "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n", + "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n") + } + if sp < gp.stack.lo { + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->status=", hex(readgstatus(gp)), "\n ") + print("runtime: split stack overflow: ", hex(sp), " < ", hex(gp.stack.lo), "\n") + throw("runtime: split stack overflow") + } + + if preempt { + if gp == thisg.m.g0 { + throw("runtime: preempt g0") + } + if thisg.m.p == 0 && thisg.m.locks == 0 { + throw("runtime: g is running but p is not") + } + + if gp.preemptShrink { + // We're at a synchronous safe point now, so + // do the pending stack shrink. + gp.preemptShrink = false + shrinkstack(gp) + } + + if gp.preemptStop { + preemptPark(gp) // never returns + } + + // Act like goroutine called runtime.Gosched. + gopreempt_m(gp) // never return + } + + // Allocate a bigger segment and move the stack. 
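	// (Editorial illustration with made-up sizes, not part of the source:
	// with an 8 KiB stack of which 7 KiB is in use and a frame needing
	// roughly 3 KiB including the guard, the initial doubling below to
	// 16 KiB already leaves 16-7 = 9 KiB >= 3 KiB of headroom, so the
	// sizing loop never has to double again.)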
+ oldsize := gp.stack.hi - gp.stack.lo + newsize := oldsize * 2 + + // Make sure we grow at least as much as needed to fit the new frame. + // (This is just an optimization - the caller of morestack will + // recheck the bounds on return.) + if f := findfunc(gp.sched.pc); f.valid() { + max := uintptr(funcMaxSPDelta(f)) + needed := max + _StackGuard + used := gp.stack.hi - gp.sched.sp + for newsize-used < needed { + newsize *= 2 + } + } + + if stackguard0 == stackForceMove { + // Forced stack movement used for debugging. + // Don't double the stack (or we may quickly run out + // if this is done repeatedly). + newsize = oldsize + } + + if newsize > maxstacksize || newsize > maxstackceiling { + if maxstacksize < maxstackceiling { + print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n") + } else { + print("runtime: goroutine stack exceeds ", maxstackceiling, "-byte limit\n") + } + print("runtime: sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n") + throw("stack overflow") + } + + // The goroutine must be executing in order to call newstack, + // so it must be Grunning (or Gscanrunning). + casgstatus(gp, _Grunning, _Gcopystack) + + // The concurrent GC will not scan the stack while we are doing the copy since + // the gp is in a Gcopystack status. + copystack(gp, newsize) + if stackDebug >= 1 { + print("stack grow done\n") + } + casgstatus(gp, _Gcopystack, _Grunning) + gogo(&gp.sched) +} + +//go:nosplit +func nilfunc() { + *(*uint8)(nil) = 0 +} + +// adjust Gobuf as if it executed a call to fn +// and then stopped before the first instruction in fn. +func gostartcallfn(gobuf *gobuf, fv *funcval) { + var fn unsafe.Pointer + if fv != nil { + fn = unsafe.Pointer(fv.fn) + } else { + fn = unsafe.Pointer(abi.FuncPCABIInternal(nilfunc)) + } + gostartcall(gobuf, fn, unsafe.Pointer(fv)) +} + +// isShrinkStackSafe returns whether it's safe to attempt to shrink +// gp's stack. Shrinking the stack is only safe when we have precise +// pointer maps for all frames on the stack. +func isShrinkStackSafe(gp *g) bool { + // We can't copy the stack if we're in a syscall. + // The syscall might have pointers into the stack and + // often we don't have precise pointer maps for the innermost + // frames. + // + // We also can't copy the stack if we're at an asynchronous + // safe-point because we don't have precise pointer maps for + // all frames. + // + // We also can't *shrink* the stack in the window between the + // goroutine calling gopark to park on a channel and + // gp.activeStackChans being set. + return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0 +} + +// Maybe shrink the stack being used by gp. +// +// gp must be stopped and we must own its stack. It may be in +// _Grunning, but only if this is our own user G. +func shrinkstack(gp *g) { + if gp.stack.lo == 0 { + throw("missing stack in shrinkstack") + } + if s := readgstatus(gp); s&_Gscan == 0 { + // We don't own the stack via _Gscan. We could still + // own it if this is our own user G and we're on the + // system stack. + if !(gp == getg().m.curg && getg() != getg().m.curg && s == _Grunning) { + // We don't own the stack. + throw("bad status in shrinkstack") + } + } + if !isShrinkStackSafe(gp) { + throw("shrinkstack at bad time") + } + // Check for self-shrinks while in a libcall. These may have + // pointers into the stack disguised as uintptrs, but these + // code paths should all be nosplit. 
+ if gp == getg().m.curg && gp.m.libcallsp != 0 { + throw("shrinking stack in libcall") + } + + if debug.gcshrinkstackoff > 0 { + return + } + f := findfunc(gp.startpc) + if f.valid() && f.funcID == funcID_gcBgMarkWorker { + // We're not allowed to shrink the gcBgMarkWorker + // stack (see gcBgMarkWorker for explanation). + return + } + + oldsize := gp.stack.hi - gp.stack.lo + newsize := oldsize / 2 + // Don't shrink the allocation below the minimum-sized stack + // allocation. + if newsize < _FixedStack { + return + } + // Compute how much of the stack is currently in use and only + // shrink the stack if gp is using less than a quarter of its + // current stack. The currently used stack includes everything + // down to the SP plus the stack guard space that ensures + // there's room for nosplit functions. + avail := gp.stack.hi - gp.stack.lo + if used := gp.stack.hi - gp.sched.sp + _StackLimit; used >= avail/4 { + return + } + + if stackDebug > 0 { + print("shrinking stack ", oldsize, "->", newsize, "\n") + } + + copystack(gp, newsize) +} + +// freeStackSpans frees unused stack spans at the end of GC. +func freeStackSpans() { + // Scan stack pools for empty stack spans. + for order := range stackpool { + lock(&stackpool[order].item.mu) + list := &stackpool[order].item.span + for s := list.first; s != nil; { + next := s.next + if s.allocCount == 0 { + list.remove(s) + s.manualFreeList = 0 + osStackFree(s) + mheap_.freeManual(s, spanAllocStack) + } + s = next + } + unlock(&stackpool[order].item.mu) + } + + // Free large stack spans. + lock(&stackLarge.lock) + for i := range stackLarge.free { + for s := stackLarge.free[i].first; s != nil; { + next := s.next + stackLarge.free[i].remove(s) + osStackFree(s) + mheap_.freeManual(s, spanAllocStack) + s = next + } + } + unlock(&stackLarge.lock) +} + +// getStackMap returns the locals and arguments live pointer maps, and +// stack object list for frame. +func getStackMap(frame *stkframe, cache *pcvalueCache, debug bool) (locals, args bitvector, objs []stackObjectRecord) { + targetpc := frame.continpc + if targetpc == 0 { + // Frame is dead. Return empty bitvectors. + return + } + + f := frame.fn + pcdata := int32(-1) + if targetpc != f.entry() { + // Back up to the CALL. If we're at the function entry + // point, we want to use the entry map (-1), even if + // the first instruction of the function changes the + // stack map. + targetpc-- + pcdata = pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache) + } + if pcdata == -1 { + // We do not have a valid pcdata value but there might be a + // stackmap for this function. It is likely that we are looking + // at the function prologue, assume so and hope for the best. + pcdata = 0 + } + + // Local variables. + size := frame.varp - frame.sp + var minsize uintptr + switch goarch.ArchFamily { + case goarch.ARM64: + minsize = sys.StackAlign + default: + minsize = sys.MinFrameSize + } + if size > minsize { + stackid := pcdata + stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) + if stkmap == nil || stkmap.n <= 0 { + print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n") + throw("missing stackmap") + } + // If nbit == 0, there's no work to do. 
+ if stkmap.nbit > 0 { + if stackid < 0 || stackid >= stkmap.n { + // don't know where we are + print("runtime: pcdata is ", stackid, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n") + throw("bad symbol table") + } + locals = stackmapdata(stkmap, stackid) + if stackDebug >= 3 && debug { + print(" locals ", stackid, "/", stkmap.n, " ", locals.n, " words ", locals.bytedata, "\n") + } + } else if stackDebug >= 3 && debug { + print(" no locals to adjust\n") + } + } + + // Arguments. + if frame.arglen > 0 { + if frame.argmap != nil { + // argmap is set when the function is reflect.makeFuncStub or reflect.methodValueCall. + // In this case, arglen specifies how much of the args section is actually live. + // (It could be either all the args + results, or just the args.) + args = *frame.argmap + n := int32(frame.arglen / goarch.PtrSize) + if n < args.n { + args.n = n // Don't use more of the arguments than arglen. + } + } else { + stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps)) + if stackmap == nil || stackmap.n <= 0 { + print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n") + throw("missing stackmap") + } + if pcdata < 0 || pcdata >= stackmap.n { + // don't know where we are + print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " args stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n") + throw("bad symbol table") + } + if stackmap.nbit > 0 { + args = stackmapdata(stackmap, pcdata) + } + } + } + + // stack objects. + if (GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "ppc64" || GOARCH == "ppc64le") && unsafe.Sizeof(abi.RegArgs{}) > 0 && frame.argmap != nil { + // argmap is set when the function is reflect.makeFuncStub or reflect.methodValueCall. + // We don't actually use argmap in this case, but we need to fake the stack object + // record for these frames which contain an internal/abi.RegArgs at a hard-coded offset. + // This offset matches the assembly code on amd64 and arm64. + objs = methodValueCallFrameObjs[:] + } else { + p := funcdata(f, _FUNCDATA_StackObjects) + if p != nil { + n := *(*uintptr)(p) + p = add(p, goarch.PtrSize) + *(*slice)(unsafe.Pointer(&objs)) = slice{array: noescape(p), len: int(n), cap: int(n)} + // Note: the noescape above is needed to keep + // getStackMap from "leaking param content: + // frame". That leak propagates up to getgcmask, then + // GCMask, then verifyGCInfo, which converts the stack + // gcinfo tests into heap gcinfo tests :( + } + } + + return +} + +var methodValueCallFrameObjs [1]stackObjectRecord // initialized in stackobjectinit + +func stkobjinit() { + var abiRegArgsEface any = abi.RegArgs{} + abiRegArgsType := efaceOf(&abiRegArgsEface)._type + if abiRegArgsType.kind&kindGCProg != 0 { + throw("abiRegArgsType needs GC Prog, update methodValueCallFrameObjs") + } + // Set methodValueCallFrameObjs[0].gcdataoff so that + // stackObjectRecord.gcdata() will work correctly with it. + ptr := uintptr(unsafe.Pointer(&methodValueCallFrameObjs[0])) + var mod *moduledata + for datap := &firstmoduledata; datap != nil; datap = datap.next { + if datap.gofunc <= ptr && ptr < datap.end { + mod = datap + break + } + } + if mod == nil { + throw("methodValueCallFrameObjs is not in a module") + } + methodValueCallFrameObjs[0] = stackObjectRecord{ + off: -int32(alignUp(abiRegArgsType.size, 8)), // It's always the highest address local. 
+ size: int32(abiRegArgsType.size), + _ptrdata: int32(abiRegArgsType.ptrdata), + gcdataoff: uint32(uintptr(unsafe.Pointer(abiRegArgsType.gcdata)) - mod.rodata), + } +} + +// A stackObjectRecord is generated by the compiler for each stack object in a stack frame. +// This record must match the generator code in cmd/compile/internal/liveness/plive.go:emitStackObjects. +type stackObjectRecord struct { + // offset in frame + // if negative, offset from varp + // if non-negative, offset from argp + off int32 + size int32 + _ptrdata int32 // ptrdata, or -ptrdata is GC prog is used + gcdataoff uint32 // offset to gcdata from moduledata.rodata +} + +func (r *stackObjectRecord) useGCProg() bool { + return r._ptrdata < 0 +} + +func (r *stackObjectRecord) ptrdata() uintptr { + x := r._ptrdata + if x < 0 { + return uintptr(-x) + } + return uintptr(x) +} + +// gcdata returns pointer map or GC prog of the type. +func (r *stackObjectRecord) gcdata() *byte { + ptr := uintptr(unsafe.Pointer(r)) + var mod *moduledata + for datap := &firstmoduledata; datap != nil; datap = datap.next { + if datap.gofunc <= ptr && ptr < datap.end { + mod = datap + break + } + } + // If you get a panic here due to a nil mod, + // you may have made a copy of a stackObjectRecord. + // You must use the original pointer. + res := mod.rodata + uintptr(r.gcdataoff) + return (*byte)(unsafe.Pointer(res)) +} + +// This is exported as ABI0 via linkname so obj can call it. +// +//go:nosplit +//go:linkname morestackc +func morestackc() { + throw("attempt to execute system stack code on user stack") +} diff --git a/contrib/go/_std_1.18/src/runtime/string.go b/contrib/go/_std_1.18/src/runtime/string.go new file mode 100644 index 0000000000..980a9866e6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/string.go @@ -0,0 +1,495 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/bytealg" + "internal/goarch" + "unsafe" +) + +// The constant is known to the compiler. +// There is no fundamental theory behind this number. +const tmpStringBufSize = 32 + +type tmpBuf [tmpStringBufSize]byte + +// concatstrings implements a Go string concatenation x+y+z+... +// The operands are passed in the slice a. +// If buf != nil, the compiler has determined that the result does not +// escape the calling function, so the string data can be stored in buf +// if small enough. +func concatstrings(buf *tmpBuf, a []string) string { + idx := 0 + l := 0 + count := 0 + for i, x := range a { + n := len(x) + if n == 0 { + continue + } + if l+n < l { + throw("string concatenation too long") + } + l += n + count++ + idx = i + } + if count == 0 { + return "" + } + + // If there is just one string and either it is not on the stack + // or our result does not escape the calling frame (buf != nil), + // then we can return that string directly. 
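	// (Editorial illustration, not part of the source: for s = "" + t,
	// count is 1, so if t's bytes are not on the current stack -- or the
	// result does not escape (buf != nil) -- t is returned as-is, with no
	// allocation and no copy.)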
+ if count == 1 && (buf != nil || !stringDataOnStack(a[idx])) { + return a[idx] + } + s, b := rawstringtmp(buf, l) + for _, x := range a { + copy(b, x) + b = b[len(x):] + } + return s +} + +func concatstring2(buf *tmpBuf, a0, a1 string) string { + return concatstrings(buf, []string{a0, a1}) +} + +func concatstring3(buf *tmpBuf, a0, a1, a2 string) string { + return concatstrings(buf, []string{a0, a1, a2}) +} + +func concatstring4(buf *tmpBuf, a0, a1, a2, a3 string) string { + return concatstrings(buf, []string{a0, a1, a2, a3}) +} + +func concatstring5(buf *tmpBuf, a0, a1, a2, a3, a4 string) string { + return concatstrings(buf, []string{a0, a1, a2, a3, a4}) +} + +// slicebytetostring converts a byte slice to a string. +// It is inserted by the compiler into generated code. +// ptr is a pointer to the first element of the slice; +// n is the length of the slice. +// Buf is a fixed-size buffer for the result, +// it is not nil if the result does not escape. +func slicebytetostring(buf *tmpBuf, ptr *byte, n int) (str string) { + if n == 0 { + // Turns out to be a relatively common case. + // Consider that you want to parse out data between parens in "foo()bar", + // you find the indices and convert the subslice to string. + return "" + } + if raceenabled { + racereadrangepc(unsafe.Pointer(ptr), + uintptr(n), + getcallerpc(), + abi.FuncPCABIInternal(slicebytetostring)) + } + if msanenabled { + msanread(unsafe.Pointer(ptr), uintptr(n)) + } + if asanenabled { + asanread(unsafe.Pointer(ptr), uintptr(n)) + } + if n == 1 { + p := unsafe.Pointer(&staticuint64s[*ptr]) + if goarch.BigEndian { + p = add(p, 7) + } + stringStructOf(&str).str = p + stringStructOf(&str).len = 1 + return + } + + var p unsafe.Pointer + if buf != nil && n <= len(buf) { + p = unsafe.Pointer(buf) + } else { + p = mallocgc(uintptr(n), nil, false) + } + stringStructOf(&str).str = p + stringStructOf(&str).len = n + memmove(p, unsafe.Pointer(ptr), uintptr(n)) + return +} + +// stringDataOnStack reports whether the string's data is +// stored on the current goroutine's stack. +func stringDataOnStack(s string) bool { + ptr := uintptr(stringStructOf(&s).str) + stk := getg().stack + return stk.lo <= ptr && ptr < stk.hi +} + +func rawstringtmp(buf *tmpBuf, l int) (s string, b []byte) { + if buf != nil && l <= len(buf) { + b = buf[:l] + s = slicebytetostringtmp(&b[0], len(b)) + } else { + s, b = rawstring(l) + } + return +} + +// slicebytetostringtmp returns a "string" referring to the actual []byte bytes. +// +// Callers need to ensure that the returned string will not be used after +// the calling goroutine modifies the original slice or synchronizes with +// another goroutine. +// +// The function is only called when instrumenting +// and otherwise intrinsified by the compiler. +// +// Some internal compiler optimizations use this function. +// - Used for m[T1{... Tn{..., string(k), ...} ...}] and m[string(k)] +// where k is []byte, T1 to Tn is a nesting of struct and array literals. +// - Used for "<"+string(b)+">" concatenation where b is []byte. +// - Used for string(b)=="foo" comparison where b is []byte. 
+func slicebytetostringtmp(ptr *byte, n int) (str string) { + if raceenabled && n > 0 { + racereadrangepc(unsafe.Pointer(ptr), + uintptr(n), + getcallerpc(), + abi.FuncPCABIInternal(slicebytetostringtmp)) + } + if msanenabled && n > 0 { + msanread(unsafe.Pointer(ptr), uintptr(n)) + } + if asanenabled && n > 0 { + asanread(unsafe.Pointer(ptr), uintptr(n)) + } + stringStructOf(&str).str = unsafe.Pointer(ptr) + stringStructOf(&str).len = n + return +} + +func stringtoslicebyte(buf *tmpBuf, s string) []byte { + var b []byte + if buf != nil && len(s) <= len(buf) { + *buf = tmpBuf{} + b = buf[:len(s)] + } else { + b = rawbyteslice(len(s)) + } + copy(b, s) + return b +} + +func stringtoslicerune(buf *[tmpStringBufSize]rune, s string) []rune { + // two passes. + // unlike slicerunetostring, no race because strings are immutable. + n := 0 + for range s { + n++ + } + + var a []rune + if buf != nil && n <= len(buf) { + *buf = [tmpStringBufSize]rune{} + a = buf[:n] + } else { + a = rawruneslice(n) + } + + n = 0 + for _, r := range s { + a[n] = r + n++ + } + return a +} + +func slicerunetostring(buf *tmpBuf, a []rune) string { + if raceenabled && len(a) > 0 { + racereadrangepc(unsafe.Pointer(&a[0]), + uintptr(len(a))*unsafe.Sizeof(a[0]), + getcallerpc(), + abi.FuncPCABIInternal(slicerunetostring)) + } + if msanenabled && len(a) > 0 { + msanread(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0])) + } + if asanenabled && len(a) > 0 { + asanread(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0])) + } + var dum [4]byte + size1 := 0 + for _, r := range a { + size1 += encoderune(dum[:], r) + } + s, b := rawstringtmp(buf, size1+3) + size2 := 0 + for _, r := range a { + // check for race + if size2 >= size1 { + break + } + size2 += encoderune(b[size2:], r) + } + return s[:size2] +} + +type stringStruct struct { + str unsafe.Pointer + len int +} + +// Variant with *byte pointer type for DWARF debugging. +type stringStructDWARF struct { + str *byte + len int +} + +func stringStructOf(sp *string) *stringStruct { + return (*stringStruct)(unsafe.Pointer(sp)) +} + +func intstring(buf *[4]byte, v int64) (s string) { + var b []byte + if buf != nil { + b = buf[:] + s = slicebytetostringtmp(&b[0], len(b)) + } else { + s, b = rawstring(4) + } + if int64(rune(v)) != v { + v = runeError + } + n := encoderune(b, rune(v)) + return s[:n] +} + +// rawstring allocates storage for a new string. The returned +// string and byte slice both refer to the same storage. +// The storage is not zeroed. Callers should use +// b to set the string contents and then drop b. +func rawstring(size int) (s string, b []byte) { + p := mallocgc(uintptr(size), nil, false) + + stringStructOf(&s).str = p + stringStructOf(&s).len = size + + *(*slice)(unsafe.Pointer(&b)) = slice{p, size, size} + + return +} + +// rawbyteslice allocates a new byte slice. The byte slice is not zeroed. +func rawbyteslice(size int) (b []byte) { + cap := roundupsize(uintptr(size)) + p := mallocgc(cap, nil, false) + if cap != uintptr(size) { + memclrNoHeapPointers(add(p, uintptr(size)), cap-uintptr(size)) + } + + *(*slice)(unsafe.Pointer(&b)) = slice{p, size, int(cap)} + return +} + +// rawruneslice allocates a new rune slice. The rune slice is not zeroed. 
+func rawruneslice(size int) (b []rune) { + if uintptr(size) > maxAlloc/4 { + throw("out of memory") + } + mem := roundupsize(uintptr(size) * 4) + p := mallocgc(mem, nil, false) + if mem != uintptr(size)*4 { + memclrNoHeapPointers(add(p, uintptr(size)*4), mem-uintptr(size)*4) + } + + *(*slice)(unsafe.Pointer(&b)) = slice{p, size, int(mem / 4)} + return +} + +// used by cmd/cgo +func gobytes(p *byte, n int) (b []byte) { + if n == 0 { + return make([]byte, 0) + } + + if n < 0 || uintptr(n) > maxAlloc { + panic(errorString("gobytes: length out of range")) + } + + bp := mallocgc(uintptr(n), nil, false) + memmove(bp, unsafe.Pointer(p), uintptr(n)) + + *(*slice)(unsafe.Pointer(&b)) = slice{bp, n, n} + return +} + +// This is exported via linkname to assembly in syscall (for Plan9). +//go:linkname gostring +func gostring(p *byte) string { + l := findnull(p) + if l == 0 { + return "" + } + s, b := rawstring(l) + memmove(unsafe.Pointer(&b[0]), unsafe.Pointer(p), uintptr(l)) + return s +} + +func gostringn(p *byte, l int) string { + if l == 0 { + return "" + } + s, b := rawstring(l) + memmove(unsafe.Pointer(&b[0]), unsafe.Pointer(p), uintptr(l)) + return s +} + +func hasPrefix(s, prefix string) bool { + return len(s) >= len(prefix) && s[:len(prefix)] == prefix +} + +const ( + maxUint = ^uint(0) + maxInt = int(maxUint >> 1) +) + +// atoi parses an int from a string s. +// The bool result reports whether s is a number +// representable by a value of type int. +func atoi(s string) (int, bool) { + if s == "" { + return 0, false + } + + neg := false + if s[0] == '-' { + neg = true + s = s[1:] + } + + un := uint(0) + for i := 0; i < len(s); i++ { + c := s[i] + if c < '0' || c > '9' { + return 0, false + } + if un > maxUint/10 { + // overflow + return 0, false + } + un *= 10 + un1 := un + uint(c) - '0' + if un1 < un { + // overflow + return 0, false + } + un = un1 + } + + if !neg && un > uint(maxInt) { + return 0, false + } + if neg && un > uint(maxInt)+1 { + return 0, false + } + + n := int(un) + if neg { + n = -n + } + + return n, true +} + +// atoi32 is like atoi but for integers +// that fit into an int32. +func atoi32(s string) (int32, bool) { + if n, ok := atoi(s); n == int(int32(n)) { + return int32(n), ok + } + return 0, false +} + +//go:nosplit +func findnull(s *byte) int { + if s == nil { + return 0 + } + + // Avoid IndexByteString on Plan 9 because it uses SSE instructions + // on x86 machines, and those are classified as floating point instructions, + // which are illegal in a note handler. + if GOOS == "plan9" { + p := (*[maxAlloc/2 - 1]byte)(unsafe.Pointer(s)) + l := 0 + for p[l] != 0 { + l++ + } + return l + } + + // pageSize is the unit we scan at a time looking for NULL. + // It must be the minimum page size for any architecture Go + // runs on. It's okay (just a minor performance loss) if the + // actual system page size is larger than this value. + const pageSize = 4096 + + offset := 0 + ptr := unsafe.Pointer(s) + // IndexByteString uses wide reads, so we need to be careful + // with page boundaries. Call IndexByteString on + // [ptr, endOfPage) interval. + safeLen := int(pageSize - uintptr(ptr)%pageSize) + + for { + t := *(*string)(unsafe.Pointer(&stringStruct{ptr, safeLen})) + // Check one page at a time. 
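		// (Editorial illustration with made-up addresses, not part of the
		// source: with pageSize = 4096 and s at address 0x2ffe, the first
		// chunk is just the 2 bytes up to the page boundary at 0x3000, so
		// IndexByteString's wide reads never cross into a possibly unmapped
		// page; every later chunk is a full page.)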
+ if i := bytealg.IndexByteString(t, 0); i != -1 { + return offset + i + } + // Move to next page + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(safeLen)) + offset += safeLen + safeLen = pageSize + } +} + +func findnullw(s *uint16) int { + if s == nil { + return 0 + } + p := (*[maxAlloc/2/2 - 1]uint16)(unsafe.Pointer(s)) + l := 0 + for p[l] != 0 { + l++ + } + return l +} + +//go:nosplit +func gostringnocopy(str *byte) string { + ss := stringStruct{str: unsafe.Pointer(str), len: findnull(str)} + s := *(*string)(unsafe.Pointer(&ss)) + return s +} + +func gostringw(strw *uint16) string { + var buf [8]byte + str := (*[maxAlloc/2/2 - 1]uint16)(unsafe.Pointer(strw)) + n1 := 0 + for i := 0; str[i] != 0; i++ { + n1 += encoderune(buf[:], rune(str[i])) + } + s, b := rawstring(n1 + 4) + n2 := 0 + for i := 0; str[i] != 0; i++ { + // check for race + if n2 >= n1 { + break + } + n2 += encoderune(b[n2:], rune(str[i])) + } + b[n2] = 0 // for luck + return s[:n2] +} diff --git a/contrib/go/_std_1.18/src/runtime/stubs.go b/contrib/go/_std_1.18/src/runtime/stubs.go new file mode 100644 index 0000000000..ad78363bb6 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs.go @@ -0,0 +1,437 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "internal/goarch" + "internal/goexperiment" + "runtime/internal/math" + "unsafe" +) + +// Should be a built-in for unsafe.Pointer? +//go:nosplit +func add(p unsafe.Pointer, x uintptr) unsafe.Pointer { + return unsafe.Pointer(uintptr(p) + x) +} + +// getg returns the pointer to the current g. +// The compiler rewrites calls to this function into instructions +// that fetch the g directly (from TLS or from the dedicated register). +func getg() *g + +// mcall switches from the g to the g0 stack and invokes fn(g), +// where g is the goroutine that made the call. +// mcall saves g's current PC/SP in g->sched so that it can be restored later. +// It is up to fn to arrange for that later execution, typically by recording +// g in a data structure, causing something to call ready(g) later. +// mcall returns to the original goroutine g later, when g has been rescheduled. +// fn must not return at all; typically it ends by calling schedule, to let the m +// run other goroutines. +// +// mcall can only be called from g stacks (not g0, not gsignal). +// +// This must NOT be go:noescape: if fn is a stack-allocated closure, +// fn puts g on a run queue, and g executes before fn returns, the +// closure will be invalidated while it is still executing. +func mcall(fn func(*g)) + +// systemstack runs fn on a system stack. +// If systemstack is called from the per-OS-thread (g0) stack, or +// if systemstack is called from the signal handling (gsignal) stack, +// systemstack calls fn directly and returns. +// Otherwise, systemstack is being called from the limited stack +// of an ordinary goroutine. In this case, systemstack switches +// to the per-OS-thread stack, calls fn, and switches back. +// It is common to use a func literal as the argument, in order +// to share inputs and outputs with the code around the call +// to system stack: +// +// ... set up y ... +// systemstack(func() { +// x = bigcall(y) +// }) +// ... use x ... 
+// +//go:noescape +func systemstack(fn func()) + +var badsystemstackMsg = "fatal: systemstack called from unexpected goroutine" + +//go:nosplit +//go:nowritebarrierrec +func badsystemstack() { + sp := stringStructOf(&badsystemstackMsg) + write(2, sp.str, int32(sp.len)) +} + +// memclrNoHeapPointers clears n bytes starting at ptr. +// +// Usually you should use typedmemclr. memclrNoHeapPointers should be +// used only when the caller knows that *ptr contains no heap pointers +// because either: +// +// *ptr is initialized memory and its type is pointer-free, or +// +// *ptr is uninitialized memory (e.g., memory that's being reused +// for a new allocation) and hence contains only "junk". +// +// memclrNoHeapPointers ensures that if ptr is pointer-aligned, and n +// is a multiple of the pointer size, then any pointer-aligned, +// pointer-sized portion is cleared atomically. Despite the function +// name, this is necessary because this function is the underlying +// implementation of typedmemclr and memclrHasPointers. See the doc of +// memmove for more details. +// +// The (CPU-specific) implementations of this function are in memclr_*.s. +// +//go:noescape +func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) + +//go:linkname reflect_memclrNoHeapPointers reflect.memclrNoHeapPointers +func reflect_memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) { + memclrNoHeapPointers(ptr, n) +} + +// memmove copies n bytes from "from" to "to". +// +// memmove ensures that any pointer in "from" is written to "to" with +// an indivisible write, so that racy reads cannot observe a +// half-written pointer. This is necessary to prevent the garbage +// collector from observing invalid pointers, and differs from memmove +// in unmanaged languages. However, memmove is only required to do +// this if "from" and "to" may contain pointers, which can only be the +// case if "from", "to", and "n" are all be word-aligned. +// +// Implementations are in memmove_*.s. +// +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + +// Outside assembly calls memmove. Make sure it has ABI wrappers. +//go:linkname memmove + +//go:linkname reflect_memmove reflect.memmove +func reflect_memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} + +// exported value for testing +const hashLoad = float32(loadFactorNum) / float32(loadFactorDen) + +//go:nosplit +func fastrand() uint32 { + mp := getg().m + // Implement wyrand: https://github.com/wangyi-fudan/wyhash + // Only the platform that math.Mul64 can be lowered + // by the compiler should be in this list. + if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64| + goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le| + goarch.IsS390x|goarch.IsRiscv64 == 1 { + mp.fastrand += 0xa0761d6478bd642f + hi, lo := math.Mul64(mp.fastrand, mp.fastrand^0xe7037ed1a0b428db) + return uint32(hi ^ lo) + } + + // Implement xorshift64+: 2 32-bit xorshift sequences added together. + // Shift triplet [17,7,16] was calculated as indicated in Marsaglia's + // Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf + // This generator passes the SmallCrush suite, part of TestU01 framework: + // http://simul.iro.umontreal.ca/testu01/tu01.html + t := (*[2]uint32)(unsafe.Pointer(&mp.fastrand)) + s1, s0 := t[0], t[1] + s1 ^= s1 << 17 + s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16 + t[0], t[1] = s0, s1 + return s0 + s1 +} + +//go:nosplit +func fastrandn(n uint32) uint32 { + // This is similar to fastrand() % n, but faster. 
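	// (Editorial illustration, not part of the source: with n = 10 and a
	// draw r = 1<<31, the product 10*2^31 = 5*2^32, so the top 32 bits are
	// 5 -- i.e. r maps to floor(r*n/2^32), which always lands in [0, n).)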
+ // See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + return uint32(uint64(fastrand()) * uint64(n) >> 32) +} + +//go:linkname sync_fastrandn sync.fastrandn +func sync_fastrandn(n uint32) uint32 { return fastrandn(n) } + +//go:linkname net_fastrand net.fastrand +func net_fastrand() uint32 { return fastrand() } + +//go:linkname os_fastrand os.fastrand +func os_fastrand() uint32 { return fastrand() } + +// in internal/bytealg/equal_*.s +//go:noescape +func memequal(a, b unsafe.Pointer, size uintptr) bool + +// noescape hides a pointer from escape analysis. noescape is +// the identity function but escape analysis doesn't think the +// output depends on the input. noescape is inlined and currently +// compiles down to zero instructions. +// USE CAREFULLY! +//go:nosplit +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} + +// Not all cgocallback frames are actually cgocallback, +// so not all have these arguments. Mark them uintptr so that the GC +// does not misinterpret memory when the arguments are not present. +// cgocallback is not called from Go, only from crosscall2. +// This in turn calls cgocallbackg, which is where we'll find +// pointer-declared arguments. +func cgocallback(fn, frame, ctxt uintptr) + +func gogo(buf *gobuf) + +func asminit() +func setg(gg *g) +func breakpoint() + +// reflectcall calls fn with arguments described by stackArgs, stackArgsSize, +// frameSize, and regArgs. +// +// Arguments passed on the stack and space for return values passed on the stack +// must be laid out at the space pointed to by stackArgs (with total length +// stackArgsSize) according to the ABI. +// +// stackRetOffset must be some value <= stackArgsSize that indicates the +// offset within stackArgs where the return value space begins. +// +// frameSize is the total size of the argument frame at stackArgs and must +// therefore be >= stackArgsSize. It must include additional space for spilling +// register arguments for stack growth and preemption. +// +// TODO(mknyszek): Once we don't need the additional spill space, remove frameSize, +// since frameSize will be redundant with stackArgsSize. +// +// Arguments passed in registers must be laid out in regArgs according to the ABI. +// regArgs will hold any return values passed in registers after the call. +// +// reflectcall copies stack arguments from stackArgs to the goroutine stack, and +// then copies back stackArgsSize-stackRetOffset bytes back to the return space +// in stackArgs once fn has completed. It also "unspills" argument registers from +// regArgs before calling fn, and spills them back into regArgs immediately +// following the call to fn. If there are results being returned on the stack, +// the caller should pass the argument frame type as stackArgsType so that +// reflectcall can execute appropriate write barriers during the copy. +// +// reflectcall expects regArgs.ReturnIsPtr to be populated indicating which +// registers on the return path will contain Go pointers. It will then store +// these pointers in regArgs.Ptrs such that they are visible to the GC. +// +// Package reflect passes a frame type. In package runtime, there is only +// one call that copies results back, in callbackWrap in syscall_windows.go, and it +// does NOT pass a frame type, meaning there are no write barriers invoked. See that +// call site for justification. +// +// Package reflect accesses this symbol through a linkname. 
+// +// Arguments passed through to reflectcall do not escape. The type is used +// only in a very limited callee of reflectcall, the stackArgs are copied, and +// regArgs is only used in the reflectcall frame. +//go:noescape +func reflectcall(stackArgsType *_type, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) + +func procyield(cycles uint32) + +type neverCallThisFunction struct{} + +// goexit is the return stub at the top of every goroutine call stack. +// Each goroutine stack is constructed as if goexit called the +// goroutine's entry point function, so that when the entry point +// function returns, it will return to goexit, which will call goexit1 +// to perform the actual exit. +// +// This function must never be called directly. Call goexit1 instead. +// gentraceback assumes that goexit terminates the stack. A direct +// call on the stack will cause gentraceback to stop walking the stack +// prematurely and if there is leftover state it may panic. +func goexit(neverCallThisFunction) + +// publicationBarrier performs a store/store barrier (a "publication" +// or "export" barrier). Some form of synchronization is required +// between initializing an object and making that object accessible to +// another processor. Without synchronization, the initialization +// writes and the "publication" write may be reordered, allowing the +// other processor to follow the pointer and observe an uninitialized +// object. In general, higher-level synchronization should be used, +// such as locking or an atomic pointer write. publicationBarrier is +// for when those aren't an option, such as in the implementation of +// the memory manager. +// +// There's no corresponding barrier for the read side because the read +// side naturally has a data dependency order. All architectures that +// Go supports or seems likely to ever support automatically enforce +// data dependency ordering. +func publicationBarrier() + +// getcallerpc returns the program counter (PC) of its caller's caller. +// getcallersp returns the stack pointer (SP) of its caller's caller. +// The implementation may be a compiler intrinsic; there is not +// necessarily code implementing this on every platform. +// +// For example: +// +// func f(arg1, arg2, arg3 int) { +// pc := getcallerpc() +// sp := getcallersp() +// } +// +// These two lines find the PC and SP immediately following +// the call to f (where f will return). +// +// The call to getcallerpc and getcallersp must be done in the +// frame being asked about. +// +// The result of getcallersp is correct at the time of the return, +// but it may be invalidated by any subsequent call to a function +// that might relocate the stack in order to grow or shrink it. +// A general rule is that the result of getcallersp should be used +// immediately and can only be passed to nosplit functions. + +//go:noescape +func getcallerpc() uintptr + +//go:noescape +func getcallersp() uintptr // implemented as an intrinsic on all platforms + +// getclosureptr returns the pointer to the current closure. +// getclosureptr can only be used in an assignment statement +// at the entry of a function. Moreover, go:nosplit directive +// must be specified at the declaration of caller function, +// so that the function prolog does not clobber the closure register. 
+// for example: +// +// //go:nosplit +// func f(arg1, arg2, arg3 int) { +// dx := getclosureptr() +// } +// +// The compiler rewrites calls to this function into instructions that fetch the +// pointer from a well-known register (DX on x86 architecture, etc.) directly. +func getclosureptr() uintptr + +//go:noescape +func asmcgocall(fn, arg unsafe.Pointer) int32 + +func morestack() +func morestack_noctxt() +func rt0_go() + +// return0 is a stub used to return 0 from deferproc. +// It is called at the very end of deferproc to signal +// the calling Go function that it should not jump +// to deferreturn. +// in asm_*.s +func return0() + +// in asm_*.s +// not called directly; definitions here supply type information for traceback. +// These must have the same signature (arg pointer map) as reflectcall. +func call16(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call32(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call64(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call128(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call256(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call512(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call1024(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call2048(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call4096(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call8192(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call16384(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call32768(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call65536(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call131072(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call262144(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call524288(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call1048576(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call2097152(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call4194304(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call8388608(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call16777216(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call33554432(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call67108864(typ, fn, stackArgs unsafe.Pointer, 
stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call134217728(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call268435456(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call536870912(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) +func call1073741824(typ, fn, stackArgs unsafe.Pointer, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs) + +func systemstack_switch() + +// alignUp rounds n up to a multiple of a. a must be a power of 2. +func alignUp(n, a uintptr) uintptr { + return (n + a - 1) &^ (a - 1) +} + +// alignDown rounds n down to a multiple of a. a must be a power of 2. +func alignDown(n, a uintptr) uintptr { + return n &^ (a - 1) +} + +// divRoundUp returns ceil(n / a). +func divRoundUp(n, a uintptr) uintptr { + // a is generally a power of two. This will get inlined and + // the compiler will optimize the division. + return (n + a - 1) / a +} + +// checkASM reports whether assembly runtime checks have passed. +func checkASM() bool + +func memequal_varlen(a, b unsafe.Pointer) bool + +// bool2int returns 0 if x is false or 1 if x is true. +func bool2int(x bool) int { + // Avoid branches. In the SSA compiler, this compiles to + // exactly what you would want it to. + return int(uint8(*(*uint8)(unsafe.Pointer(&x)))) +} + +// abort crashes the runtime in situations where even throw might not +// work. In general it should do something a debugger will recognize +// (e.g., an INT3 on x86). A crash in abort is recognized by the +// signal handler, which will attempt to tear down the runtime +// immediately. +func abort() + +// Called from compiled code; declared for vet; do NOT call from Go. +func gcWriteBarrier() +func duffzero() +func duffcopy() + +// Called from linker-generated .initarray; declared for go vet; do NOT call from Go. +func addmoduledata() + +// Injected by the signal handler for panicking signals. +// Initializes any registers that have fixed meaning at calls but +// are scratch in bodies and calls sigpanic. +// On many platforms it just jumps to sigpanic. +func sigpanic0() + +// intArgRegs is used by the various register assignment +// algorithm implementations in the runtime. These include:. +// - Finalizers (mfinal.go) +// - Windows callbacks (syscall_windows.go) +// +// Both are stripped-down versions of the algorithm since they +// only have to deal with a subset of cases (finalizers only +// take a pointer or interface argument, Go Windows callbacks +// don't support floating point). +// +// It should be modified with care and are generally only +// modified when testing this package. +// +// It should never be set higher than its internal/abi +// constant counterparts, because the system relies on a +// structure that is at least large enough to hold the +// registers the system supports. +// +// Protected by finlock. +var intArgRegs = abi.IntArgRegs * (goexperiment.RegabiArgsInt | goarch.IsAmd64) diff --git a/contrib/go/_std_1.18/src/runtime/stubs2.go b/contrib/go/_std_1.18/src/runtime/stubs2.go new file mode 100644 index 0000000000..9aa965454d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs2.go @@ -0,0 +1,40 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
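// Editorial sketch, not part of the vendored runtime source: the
// alignUp/alignDown/divRoundUp helpers in stubs.go above rely on a being a
// power of two -- adding a-1 overshoots into the next multiple, and &^(a-1)
// clears the low bits. A minimal standalone check of those identities:

package main

import "fmt"

func alignUp(n, a uintptr) uintptr    { return (n + a - 1) &^ (a - 1) }
func alignDown(n, a uintptr) uintptr  { return n &^ (a - 1) }
func divRoundUp(n, a uintptr) uintptr { return (n + a - 1) / a }

func main() {
	fmt.Println(alignUp(13, 8), alignDown(13, 8), divRoundUp(13, 8)) // 16 8 2
	fmt.Println(alignUp(16, 8), alignDown(16, 8), divRoundUp(16, 8)) // 16 16 2
}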
+ +//go:build !aix && !darwin && !js && !openbsd && !plan9 && !solaris && !windows + +package runtime + +import "unsafe" + +// read calls the read system call. +// It returns a non-negative number of bytes written or a negative errno value. +func read(fd int32, p unsafe.Pointer, n int32) int32 + +func closefd(fd int32) int32 + +func exit(code int32) +func usleep(usec uint32) + +//go:nosplit +func usleep_no_g(usec uint32) { + usleep(usec) +} + +// write calls the write system call. +// It returns a non-negative number of bytes written or a negative errno value. +//go:noescape +func write1(fd uintptr, p unsafe.Pointer, n int32) int32 + +//go:noescape +func open(name *byte, mode, perm int32) int32 + +// return value is only set on linux to be used in osinit() +func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32 + +// exitThread terminates the current thread, writing *wait = 0 when +// the stack is safe to reclaim. +// +//go:noescape +func exitThread(wait *uint32) diff --git a/contrib/go/_std_1.18/src/runtime/stubs3.go b/contrib/go/_std_1.18/src/runtime/stubs3.go new file mode 100644 index 0000000000..891663b110 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs3.go @@ -0,0 +1,9 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !aix && !darwin && !freebsd && !openbsd && !plan9 && !solaris + +package runtime + +func nanotime1() int64 diff --git a/contrib/go/_std_1.18/src/runtime/stubs_amd64.go b/contrib/go/_std_1.18/src/runtime/stubs_amd64.go new file mode 100644 index 0000000000..687a506cdd --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs_amd64.go @@ -0,0 +1,49 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// Called from compiled code; declared for vet; do NOT call from Go. +func gcWriteBarrierCX() +func gcWriteBarrierDX() +func gcWriteBarrierBX() +func gcWriteBarrierBP() +func gcWriteBarrierSI() +func gcWriteBarrierR8() +func gcWriteBarrierR9() + +// stackcheck checks that SP is in range [g->stack.lo, g->stack.hi). +func stackcheck() + +// Called from assembly only; declared for go vet. +func settls() // argument in DI + +// Retpolines, used by -spectre=ret flag in cmd/asm, cmd/compile. +func retpolineAX() +func retpolineCX() +func retpolineDX() +func retpolineBX() +func retpolineBP() +func retpolineSI() +func retpolineDI() +func retpolineR8() +func retpolineR9() +func retpolineR10() +func retpolineR11() +func retpolineR12() +func retpolineR13() +func retpolineR14() +func retpolineR15() + +//go:noescape +func asmcgocall_no_g(fn, arg unsafe.Pointer) + +// Used by reflectcall and the reflect package. +// +// Spills/loads arguments in registers to/from an internal/abi.RegArgs +// respectively. Does not follow the Go ABI. +func spillArgs() +func unspillArgs() diff --git a/contrib/go/_std_1.18/src/runtime/stubs_linux.go b/contrib/go/_std_1.18/src/runtime/stubs_linux.go new file mode 100644 index 0000000000..06c14e2160 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs_linux.go @@ -0,0 +1,19 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build linux + +package runtime + +import "unsafe" + +func sbrk0() uintptr + +// Called from write_err_android.go only, but defined in sys_linux_*.s; +// declared here (instead of in write_err_android.go) for go vet on non-android builds. +// The return value is the raw syscall result, which may encode an error number. +//go:noescape +func access(name *byte, mode int32) int32 +func connect(fd int32, addr unsafe.Pointer, len int32) int32 +func socket(domain int32, typ int32, prot int32) int32 diff --git a/contrib/go/_std_1.18/src/runtime/stubs_nonlinux.go b/contrib/go/_std_1.18/src/runtime/stubs_nonlinux.go new file mode 100644 index 0000000000..1a06d7cc1d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/stubs_nonlinux.go @@ -0,0 +1,12 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux + +package runtime + +// sbrk0 returns the current process brk, or 0 if not implemented. +func sbrk0() uintptr { + return 0 +} diff --git a/contrib/go/_std_1.18/src/runtime/symtab.go b/contrib/go/_std_1.18/src/runtime/symtab.go new file mode 100644 index 0000000000..ee4db47314 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/symtab.go @@ -0,0 +1,1180 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// Frames may be used to get function/file/line information for a +// slice of PC values returned by Callers. +type Frames struct { + // callers is a slice of PCs that have not yet been expanded to frames. + callers []uintptr + + // frames is a slice of Frames that have yet to be returned. + frames []Frame + frameStore [2]Frame +} + +// Frame is the information returned by Frames for each call frame. +type Frame struct { + // PC is the program counter for the location in this frame. + // For a frame that calls another frame, this will be the + // program counter of a call instruction. Because of inlining, + // multiple frames may have the same PC value, but different + // symbolic information. + PC uintptr + + // Func is the Func value of this call frame. This may be nil + // for non-Go code or fully inlined functions. + Func *Func + + // Function is the package path-qualified function name of + // this call frame. If non-empty, this string uniquely + // identifies a single function in the program. + // This may be the empty string if not known. + // If Func is not nil then Function == Func.Name(). + Function string + + // File and Line are the file name and line number of the + // location in this frame. For non-leaf frames, this will be + // the location of a call. These may be the empty string and + // zero, respectively, if not known. + File string + Line int + + // Entry point program counter for the function; may be zero + // if not known. If Func is not nil then Entry == + // Func.Entry(). + Entry uintptr + + // The runtime's internal view of the function. This field + // is set (funcInfo.valid() returns true) only for Go functions, + // not for C functions. + funcInfo funcInfo +} + +// CallersFrames takes a slice of PC values returned by Callers and +// prepares to return function/file/line information. +// Do not change the slice until you are done with the Frames. 
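+//
+// As a usage sketch (from a caller's point of view, assuming the fmt package;
+// it mirrors the Frames example mentioned in Next below), the idiomatic
+// Callers / CallersFrames / Next loop is:
+//
+//	pc := make([]uintptr, 16)
+//	n := runtime.Callers(1, pc)
+//	frames := runtime.CallersFrames(pc[:n])
+//	for {
+//		frame, more := frames.Next()
+//		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
+//		if !more {
+//			break
+//		}
+//	}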
+func CallersFrames(callers []uintptr) *Frames { + f := &Frames{callers: callers} + f.frames = f.frameStore[:0] + return f +} + +// Next returns a Frame representing the next call frame in the slice +// of PC values. If it has already returned all call frames, Next +// returns a zero Frame. +// +// The more result indicates whether the next call to Next will return +// a valid Frame. It does not necessarily indicate whether this call +// returned one. +// +// See the Frames example for idiomatic usage. +func (ci *Frames) Next() (frame Frame, more bool) { + for len(ci.frames) < 2 { + // Find the next frame. + // We need to look for 2 frames so we know what + // to return for the "more" result. + if len(ci.callers) == 0 { + break + } + pc := ci.callers[0] + ci.callers = ci.callers[1:] + funcInfo := findfunc(pc) + if !funcInfo.valid() { + if cgoSymbolizer != nil { + // Pre-expand cgo frames. We could do this + // incrementally, too, but there's no way to + // avoid allocation in this case anyway. + ci.frames = append(ci.frames, expandCgoFrames(pc)...) + } + continue + } + f := funcInfo._Func() + entry := f.Entry() + if pc > entry { + // We store the pc of the start of the instruction following + // the instruction in question (the call or the inline mark). + // This is done for historical reasons, and to make FuncForPC + // work correctly for entries in the result of runtime.Callers. + pc-- + } + name := funcname(funcInfo) + if inldata := funcdata(funcInfo, _FUNCDATA_InlTree); inldata != nil { + inltree := (*[1 << 20]inlinedCall)(inldata) + // Non-strict as cgoTraceback may have added bogus PCs + // with a valid funcInfo but invalid PCDATA. + ix := pcdatavalue1(funcInfo, _PCDATA_InlTreeIndex, pc, nil, false) + if ix >= 0 { + // Note: entry is not modified. It always refers to a real frame, not an inlined one. + f = nil + name = funcnameFromNameoff(funcInfo, inltree[ix].func_) + // File/line is already correct. + // TODO: remove file/line from InlinedCall? + } + } + ci.frames = append(ci.frames, Frame{ + PC: pc, + Func: f, + Function: name, + Entry: entry, + funcInfo: funcInfo, + // Note: File,Line set below + }) + } + + // Pop one frame from the frame list. Keep the rest. + // Avoid allocation in the common case, which is 1 or 2 frames. + switch len(ci.frames) { + case 0: // In the rare case when there are no frames at all, we return Frame{}. + return + case 1: + frame = ci.frames[0] + ci.frames = ci.frameStore[:0] + case 2: + frame = ci.frames[0] + ci.frameStore[0] = ci.frames[1] + ci.frames = ci.frameStore[:1] + default: + frame = ci.frames[0] + ci.frames = ci.frames[1:] + } + more = len(ci.frames) > 0 + if frame.funcInfo.valid() { + // Compute file/line just before we need to return it, + // as it can be expensive. This avoids computing file/line + // for the Frame we find but don't return. See issue 32093. + file, line := funcline1(frame.funcInfo, frame.PC, false) + frame.File, frame.Line = file, int(line) + } + return +} + +// runtime_expandFinalInlineFrame expands the final pc in stk to include all +// "callers" if pc is inline. +// +//go:linkname runtime_expandFinalInlineFrame runtime/pprof.runtime_expandFinalInlineFrame +func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr { + if len(stk) == 0 { + return stk + } + pc := stk[len(stk)-1] + tracepc := pc - 1 + + f := findfunc(tracepc) + if !f.valid() { + // Not a Go function. + return stk + } + + inldata := funcdata(f, _FUNCDATA_InlTree) + if inldata == nil { + // Nothing inline in f. 
+ return stk + } + + // Treat the previous func as normal. We haven't actually checked, but + // since this pc was included in the stack, we know it shouldn't be + // elided. + lastFuncID := funcID_normal + + // Remove pc from stk; we'll re-add it below. + stk = stk[:len(stk)-1] + + // See inline expansion in gentraceback. + var cache pcvalueCache + inltree := (*[1 << 20]inlinedCall)(inldata) + for { + // Non-strict as cgoTraceback may have added bogus PCs + // with a valid funcInfo but invalid PCDATA. + ix := pcdatavalue1(f, _PCDATA_InlTreeIndex, tracepc, &cache, false) + if ix < 0 { + break + } + if inltree[ix].funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) { + // ignore wrappers + } else { + stk = append(stk, pc) + } + lastFuncID = inltree[ix].funcID + // Back up to an instruction in the "caller". + tracepc = f.entry() + uintptr(inltree[ix].parentPc) + pc = tracepc + 1 + } + + // N.B. we want to keep the last parentPC which is not inline. + stk = append(stk, pc) + + return stk +} + +// expandCgoFrames expands frame information for pc, known to be +// a non-Go function, using the cgoSymbolizer hook. expandCgoFrames +// returns nil if pc could not be expanded. +func expandCgoFrames(pc uintptr) []Frame { + arg := cgoSymbolizerArg{pc: pc} + callCgoSymbolizer(&arg) + + if arg.file == nil && arg.funcName == nil { + // No useful information from symbolizer. + return nil + } + + var frames []Frame + for { + frames = append(frames, Frame{ + PC: pc, + Func: nil, + Function: gostring(arg.funcName), + File: gostring(arg.file), + Line: int(arg.lineno), + Entry: arg.entry, + // funcInfo is zero, which implies !funcInfo.valid(). + // That ensures that we use the File/Line info given here. + }) + if arg.more == 0 { + break + } + callCgoSymbolizer(&arg) + } + + // No more frames for this PC. Tell the symbolizer we are done. + // We don't try to maintain a single cgoSymbolizerArg for the + // whole use of Frames, because there would be no good way to tell + // the symbolizer when we are done. + arg.pc = 0 + callCgoSymbolizer(&arg) + + return frames +} + +// NOTE: Func does not expose the actual unexported fields, because we return *Func +// values to users, and we want to keep them from being able to overwrite the data +// with (say) *f = Func{}. +// All code operating on a *Func must call raw() to get the *_func +// or funcInfo() to get the funcInfo instead. + +// A Func represents a Go function in the running binary. +type Func struct { + opaque struct{} // unexported field to disallow conversions +} + +func (f *Func) raw() *_func { + return (*_func)(unsafe.Pointer(f)) +} + +func (f *Func) funcInfo() funcInfo { + return f.raw().funcInfo() +} + +func (f *_func) funcInfo() funcInfo { + // Find the module containing fn. fn is located in the pclntable. + // The unsafe.Pointer to uintptr conversions and arithmetic + // are safe because we are working with module addresses. + ptr := uintptr(unsafe.Pointer(f)) + var mod *moduledata + for datap := &firstmoduledata; datap != nil; datap = datap.next { + if len(datap.pclntable) == 0 { + continue + } + base := uintptr(unsafe.Pointer(&datap.pclntable[0])) + if base <= ptr && ptr < base+uintptr(len(datap.pclntable)) { + mod = datap + break + } + } + return funcInfo{f, mod} +} + +// PCDATA and FUNCDATA table indexes. +// +// See funcdata.h and ../cmd/internal/objabi/funcdata.go. 
+const ( + _PCDATA_UnsafePoint = 0 + _PCDATA_StackMapIndex = 1 + _PCDATA_InlTreeIndex = 2 + _PCDATA_ArgLiveIndex = 3 + + _FUNCDATA_ArgsPointerMaps = 0 + _FUNCDATA_LocalsPointerMaps = 1 + _FUNCDATA_StackObjects = 2 + _FUNCDATA_InlTree = 3 + _FUNCDATA_OpenCodedDeferInfo = 4 + _FUNCDATA_ArgInfo = 5 + _FUNCDATA_ArgLiveInfo = 6 + _FUNCDATA_WrapInfo = 7 + + _ArgsSizeUnknown = -0x80000000 +) + +const ( + // PCDATA_UnsafePoint values. + _PCDATA_UnsafePointSafe = -1 // Safe for async preemption + _PCDATA_UnsafePointUnsafe = -2 // Unsafe for async preemption + + // _PCDATA_Restart1(2) apply on a sequence of instructions, within + // which if an async preemption happens, we should back off the PC + // to the start of the sequence when resume. + // We need two so we can distinguish the start/end of the sequence + // in case that two sequences are next to each other. + _PCDATA_Restart1 = -3 + _PCDATA_Restart2 = -4 + + // Like _PCDATA_RestartAtEntry, but back to function entry if async + // preempted. + _PCDATA_RestartAtEntry = -5 +) + +// A FuncID identifies particular functions that need to be treated +// specially by the runtime. +// Note that in some situations involving plugins, there may be multiple +// copies of a particular special runtime function. +// Note: this list must match the list in cmd/internal/objabi/funcid.go. +type funcID uint8 + +const ( + funcID_normal funcID = iota // not a special function + funcID_abort + funcID_asmcgocall + funcID_asyncPreempt + funcID_cgocallback + funcID_debugCallV2 + funcID_gcBgMarkWorker + funcID_goexit + funcID_gogo + funcID_gopanic + funcID_handleAsyncEvent + funcID_mcall + funcID_morestack + funcID_mstart + funcID_panicwrap + funcID_rt0_go + funcID_runfinq + funcID_runtime_main + funcID_sigpanic + funcID_systemstack + funcID_systemstack_switch + funcID_wrapper // any autogenerated code (hash/eq algorithms, method wrappers, etc.) +) + +// A FuncFlag holds bits about a function. +// This list must match the list in cmd/internal/objabi/funcid.go. +type funcFlag uint8 + +const ( + // TOPFRAME indicates a function that appears at the top of its stack. + // The traceback routine stop at such a function and consider that a + // successful, complete traversal of the stack. + // Examples of TOPFRAME functions include goexit, which appears + // at the top of a user goroutine stack, and mstart, which appears + // at the top of a system goroutine stack. + funcFlag_TOPFRAME funcFlag = 1 << iota + + // SPWRITE indicates a function that writes an arbitrary value to SP + // (any write other than adding or subtracting a constant amount). + // The traceback routines cannot encode such changes into the + // pcsp tables, so the function traceback cannot safely unwind past + // SPWRITE functions. Stopping at an SPWRITE function is considered + // to be an incomplete unwinding of the stack. In certain contexts + // (in particular garbage collector stack scans) that is a fatal error. + funcFlag_SPWRITE + + // ASM indicates that a function was implemented in assembly. + funcFlag_ASM +) + +// pcHeader holds data used by the pclntab lookups. 
+type pcHeader struct { + magic uint32 // 0xFFFFFFF0 + pad1, pad2 uint8 // 0,0 + minLC uint8 // min instruction size + ptrSize uint8 // size of a ptr in bytes + nfunc int // number of functions in the module + nfiles uint // number of entries in the file tab + textStart uintptr // base for function entry PC offsets in this module, equal to moduledata.text + funcnameOffset uintptr // offset to the funcnametab variable from pcHeader + cuOffset uintptr // offset to the cutab variable from pcHeader + filetabOffset uintptr // offset to the filetab variable from pcHeader + pctabOffset uintptr // offset to the pctab variable from pcHeader + pclnOffset uintptr // offset to the pclntab variable from pcHeader +} + +// moduledata records information about the layout of the executable +// image. It is written by the linker. Any changes here must be +// matched changes to the code in cmd/link/internal/ld/symtab.go:symtab. +// moduledata is stored in statically allocated non-pointer memory; +// none of the pointers here are visible to the garbage collector. +type moduledata struct { + pcHeader *pcHeader + funcnametab []byte + cutab []uint32 + filetab []byte + pctab []byte + pclntable []byte + ftab []functab + findfunctab uintptr + minpc, maxpc uintptr + + text, etext uintptr + noptrdata, enoptrdata uintptr + data, edata uintptr + bss, ebss uintptr + noptrbss, enoptrbss uintptr + end, gcdata, gcbss uintptr + types, etypes uintptr + rodata uintptr + gofunc uintptr // go.func.* + + textsectmap []textsect + typelinks []int32 // offsets from types + itablinks []*itab + + ptab []ptabEntry + + pluginpath string + pkghashes []modulehash + + modulename string + modulehashes []modulehash + + hasmain uint8 // 1 if module contains the main function, 0 otherwise + + gcdatamask, gcbssmask bitvector + + typemap map[typeOff]*_type // offset to *_rtype in previous module + + bad bool // module failed to load and should be ignored + + next *moduledata +} + +// A modulehash is used to compare the ABI of a new module or a +// package in a new module with the loaded program. +// +// For each shared library a module links against, the linker creates an entry in the +// moduledata.modulehashes slice containing the name of the module, the abi hash seen +// at link time and a pointer to the runtime abi hash. These are checked in +// moduledataverify1 below. +// +// For each loaded plugin, the pkghashes slice has a modulehash of the +// newly loaded package that can be used to check the plugin's version of +// a package against any previously loaded version of the package. +// This is done in plugin.lastmoduleinit. +type modulehash struct { + modulename string + linktimehash string + runtimehash *string +} + +// pinnedTypemaps are the map[typeOff]*_type from the moduledata objects. +// +// These typemap objects are allocated at run time on the heap, but the +// only direct reference to them is in the moduledata, created by the +// linker and marked SNOPTRDATA so it is ignored by the GC. +// +// To make sure the map isn't collected, we keep a second reference here. +var pinnedTypemaps []map[typeOff]*_type + +var firstmoduledata moduledata // linker symbol +var lastmoduledatap *moduledata // linker symbol +var modulesSlice *[]*moduledata // see activeModules + +// activeModules returns a slice of active modules. +// +// A module is active once its gcdatamask and gcbssmask have been +// assembled and it is usable by the GC. +// +// This is nosplit/nowritebarrier because it is called by the +// cgo pointer checking code. 
+//go:nosplit +//go:nowritebarrier +func activeModules() []*moduledata { + p := (*[]*moduledata)(atomic.Loadp(unsafe.Pointer(&modulesSlice))) + if p == nil { + return nil + } + return *p +} + +// modulesinit creates the active modules slice out of all loaded modules. +// +// When a module is first loaded by the dynamic linker, an .init_array +// function (written by cmd/link) is invoked to call addmoduledata, +// appending to the module to the linked list that starts with +// firstmoduledata. +// +// There are two times this can happen in the lifecycle of a Go +// program. First, if compiled with -linkshared, a number of modules +// built with -buildmode=shared can be loaded at program initialization. +// Second, a Go program can load a module while running that was built +// with -buildmode=plugin. +// +// After loading, this function is called which initializes the +// moduledata so it is usable by the GC and creates a new activeModules +// list. +// +// Only one goroutine may call modulesinit at a time. +func modulesinit() { + modules := new([]*moduledata) + for md := &firstmoduledata; md != nil; md = md.next { + if md.bad { + continue + } + *modules = append(*modules, md) + if md.gcdatamask == (bitvector{}) { + scanDataSize := md.edata - md.data + md.gcdatamask = progToPointerMask((*byte)(unsafe.Pointer(md.gcdata)), scanDataSize) + scanBSSSize := md.ebss - md.bss + md.gcbssmask = progToPointerMask((*byte)(unsafe.Pointer(md.gcbss)), scanBSSSize) + gcController.addGlobals(int64(scanDataSize + scanBSSSize)) + } + } + + // Modules appear in the moduledata linked list in the order they are + // loaded by the dynamic loader, with one exception: the + // firstmoduledata itself the module that contains the runtime. This + // is not always the first module (when using -buildmode=shared, it + // is typically libstd.so, the second module). The order matters for + // typelinksinit, so we swap the first module with whatever module + // contains the main function. + // + // See Issue #18729. + for i, md := range *modules { + if md.hasmain != 0 { + (*modules)[0] = md + (*modules)[i] = &firstmoduledata + break + } + } + + atomicstorep(unsafe.Pointer(&modulesSlice), unsafe.Pointer(modules)) +} + +type functab struct { + entryoff uint32 // relative to runtime.text + funcoff uint32 +} + +// Mapping information for secondary text sections + +type textsect struct { + vaddr uintptr // prelinked section vaddr + end uintptr // vaddr + section length + baseaddr uintptr // relocated section address +} + +const minfunc = 16 // minimum function size +const pcbucketsize = 256 * minfunc // size of bucket in the pc->func lookup table + +// findfunctab is an array of these structures. +// Each bucket represents 4096 bytes of the text segment. +// Each subbucket represents 256 bytes of the text segment. +// To find a function given a pc, locate the bucket and subbucket for +// that pc. Add together the idx and subbucket value to obtain a +// function index. Then scan the functab array starting at that +// index to find the target function. +// This table uses 20 bytes for every 4096 bytes of code, or ~0.5% overhead. +type findfuncbucket struct { + idx uint32 + subbuckets [16]byte +} + +func moduledataverify() { + for datap := &firstmoduledata; datap != nil; datap = datap.next { + moduledataverify1(datap) + } +} + +const debugPcln = false + +func moduledataverify1(datap *moduledata) { + // Check that the pclntab's format is valid. 
+ hdr := datap.pcHeader + if hdr.magic != 0xfffffff0 || hdr.pad1 != 0 || hdr.pad2 != 0 || + hdr.minLC != sys.PCQuantum || hdr.ptrSize != goarch.PtrSize || hdr.textStart != datap.text { + println("runtime: pcHeader: magic=", hex(hdr.magic), "pad1=", hdr.pad1, "pad2=", hdr.pad2, + "minLC=", hdr.minLC, "ptrSize=", hdr.ptrSize, "pcHeader.textStart=", hex(hdr.textStart), + "text=", hex(datap.text), "pluginpath=", datap.pluginpath) + throw("invalid function symbol table") + } + + // ftab is lookup table for function by program counter. + nftab := len(datap.ftab) - 1 + for i := 0; i < nftab; i++ { + // NOTE: ftab[nftab].entry is legal; it is the address beyond the final function. + if datap.ftab[i].entryoff > datap.ftab[i+1].entryoff { + f1 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff])), datap} + f2 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff])), datap} + f2name := "end" + if i+1 < nftab { + f2name = funcname(f2) + } + println("function symbol table not sorted by PC offset:", hex(datap.ftab[i].entryoff), funcname(f1), ">", hex(datap.ftab[i+1].entryoff), f2name, ", plugin:", datap.pluginpath) + for j := 0; j <= i; j++ { + println("\t", hex(datap.ftab[j].entryoff), funcname(funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[j].funcoff])), datap})) + } + if GOOS == "aix" && isarchive { + println("-Wl,-bnoobjreorder is mandatory on aix/ppc64 with c-archive") + } + throw("invalid runtime symbol table") + } + } + + min := datap.textAddr(datap.ftab[0].entryoff) + max := datap.textAddr(datap.ftab[nftab].entryoff) + if datap.minpc != min || datap.maxpc != max { + println("minpc=", hex(datap.minpc), "min=", hex(min), "maxpc=", hex(datap.maxpc), "max=", hex(max)) + throw("minpc or maxpc invalid") + } + + for _, modulehash := range datap.modulehashes { + if modulehash.linktimehash != *modulehash.runtimehash { + println("abi mismatch detected between", datap.modulename, "and", modulehash.modulename) + throw("abi mismatch") + } + } +} + +// textAddr returns md.text + off, with special handling for multiple text sections. +// off is a (virtual) offset computed at internal linking time, +// before the external linker adjusts the sections' base addresses. +// +// The text, or instruction stream is generated as one large buffer. +// The off (offset) for a function is its offset within this buffer. +// If the total text size gets too large, there can be issues on platforms like ppc64 +// if the target of calls are too far for the call instruction. +// To resolve the large text issue, the text is split into multiple text sections +// to allow the linker to generate long calls when necessary. +// When this happens, the vaddr for each text section is set to its offset within the text. +// Each function's offset is compared against the section vaddrs and ends to determine the containing section. +// Then the section relative offset is added to the section's +// relocated baseaddr to compute the function address. +// +// It is nosplit because it is part of the findfunc implementation. +//go:nosplit +func (md *moduledata) textAddr(off32 uint32) uintptr { + off := uintptr(off32) + res := md.text + off + if len(md.textsectmap) > 1 { + for i, sect := range md.textsectmap { + // For the last section, include the end address (etext), as it is included in the functab. 
+ if off >= sect.vaddr && off < sect.end || (i == len(md.textsectmap)-1 && off == sect.end) { + res = sect.baseaddr + off - sect.vaddr + break + } + } + if res > md.etext && GOARCH != "wasm" { // on wasm, functions do not live in the same address space as the linear memory + println("runtime: textAddr", hex(res), "out of range", hex(md.text), "-", hex(md.etext)) + throw("runtime: text offset out of range") + } + } + return res +} + +// textOff is the opposite of textAddr. It converts a PC to a (virtual) offset +// to md.text, and returns if the PC is in any Go text section. +// +// It is nosplit because it is part of the findfunc implementation. +//go:nosplit +func (md *moduledata) textOff(pc uintptr) (uint32, bool) { + res := uint32(pc - md.text) + if len(md.textsectmap) > 1 { + for i, sect := range md.textsectmap { + if sect.baseaddr > pc { + // pc is not in any section. + return 0, false + } + end := sect.baseaddr + (sect.end - sect.vaddr) + // For the last section, include the end address (etext), as it is included in the functab. + if i == len(md.textsectmap) { + end++ + } + if pc < end { + res = uint32(pc - sect.baseaddr + sect.vaddr) + break + } + } + } + return res, true +} + +// FuncForPC returns a *Func describing the function that contains the +// given program counter address, or else nil. +// +// If pc represents multiple functions because of inlining, it returns +// the *Func describing the innermost function, but with an entry of +// the outermost function. +func FuncForPC(pc uintptr) *Func { + f := findfunc(pc) + if !f.valid() { + return nil + } + if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { + // Note: strict=false so bad PCs (those between functions) don't crash the runtime. + // We just report the preceding function in that situation. See issue 29735. + // TODO: Perhaps we should report no function at all in that case. + // The runtime currently doesn't have function end info, alas. + if ix := pcdatavalue1(f, _PCDATA_InlTreeIndex, pc, nil, false); ix >= 0 { + inltree := (*[1 << 20]inlinedCall)(inldata) + name := funcnameFromNameoff(f, inltree[ix].func_) + file, line := funcline(f, pc) + fi := &funcinl{ + ones: ^uint32(0), + entry: f.entry(), // entry of the real (the outermost) function. + name: name, + file: file, + line: int(line), + } + return (*Func)(unsafe.Pointer(fi)) + } + } + return f._Func() +} + +// Name returns the name of the function. +func (f *Func) Name() string { + if f == nil { + return "" + } + fn := f.raw() + if fn.isInlined() { // inlined version + fi := (*funcinl)(unsafe.Pointer(fn)) + return fi.name + } + return funcname(f.funcInfo()) +} + +// Entry returns the entry address of the function. +func (f *Func) Entry() uintptr { + fn := f.raw() + if fn.isInlined() { // inlined version + fi := (*funcinl)(unsafe.Pointer(fn)) + return fi.entry + } + return fn.funcInfo().entry() +} + +// FileLine returns the file name and line number of the +// source code corresponding to the program counter pc. +// The result will not be accurate if pc is not a program +// counter within f. +func (f *Func) FileLine(pc uintptr) (file string, line int) { + fn := f.raw() + if fn.isInlined() { // inlined version + fi := (*funcinl)(unsafe.Pointer(fn)) + return fi.file, fi.line + } + // Pass strict=false here, because anyone can call this function, + // and they might just be wrong about targetpc belonging to f. + file, line32 := funcline1(f.funcInfo(), pc, false) + return file, int(line32) +} + +// findmoduledatap looks up the moduledata for a PC. 
+// +// It is nosplit because it's part of the isgoexception +// implementation. +// +//go:nosplit +func findmoduledatap(pc uintptr) *moduledata { + for datap := &firstmoduledata; datap != nil; datap = datap.next { + if datap.minpc <= pc && pc < datap.maxpc { + return datap + } + } + return nil +} + +type funcInfo struct { + *_func + datap *moduledata +} + +func (f funcInfo) valid() bool { + return f._func != nil +} + +func (f funcInfo) _Func() *Func { + return (*Func)(unsafe.Pointer(f._func)) +} + +// isInlined reports whether f should be re-interpreted as a *funcinl. +func (f *_func) isInlined() bool { + return f.entryoff == ^uint32(0) // see comment for funcinl.ones +} + +// entry returns the entry PC for f. +func (f funcInfo) entry() uintptr { + return f.datap.textAddr(f.entryoff) +} + +// findfunc looks up function metadata for a PC. +// +// It is nosplit because it's part of the isgoexception +// implementation. +// +//go:nosplit +func findfunc(pc uintptr) funcInfo { + datap := findmoduledatap(pc) + if datap == nil { + return funcInfo{} + } + const nsub = uintptr(len(findfuncbucket{}.subbuckets)) + + pcOff, ok := datap.textOff(pc) + if !ok { + return funcInfo{} + } + + x := uintptr(pcOff) + datap.text - datap.minpc // TODO: are datap.text and datap.minpc always equal? + b := x / pcbucketsize + i := x % pcbucketsize / (pcbucketsize / nsub) + + ffb := (*findfuncbucket)(add(unsafe.Pointer(datap.findfunctab), b*unsafe.Sizeof(findfuncbucket{}))) + idx := ffb.idx + uint32(ffb.subbuckets[i]) + + // Find the ftab entry. + for datap.ftab[idx+1].entryoff <= pcOff { + idx++ + } + + funcoff := datap.ftab[idx].funcoff + return funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[funcoff])), datap} +} + +type pcvalueCache struct { + entries [2][8]pcvalueCacheEnt +} + +type pcvalueCacheEnt struct { + // targetpc and off together are the key of this cache entry. + targetpc uintptr + off uint32 + // val is the value of this cached pcvalue entry. + val int32 +} + +// pcvalueCacheKey returns the outermost index in a pcvalueCache to use for targetpc. +// It must be very cheap to calculate. +// For now, align to goarch.PtrSize and reduce mod the number of entries. +// In practice, this appears to be fairly randomly and evenly distributed. +func pcvalueCacheKey(targetpc uintptr) uintptr { + return (targetpc / goarch.PtrSize) % uintptr(len(pcvalueCache{}.entries)) +} + +// Returns the PCData value, and the PC where this value starts. +// TODO: the start PC is returned only when cache is nil. +func pcvalue(f funcInfo, off uint32, targetpc uintptr, cache *pcvalueCache, strict bool) (int32, uintptr) { + if off == 0 { + return -1, 0 + } + + // Check the cache. This speeds up walks of deep stacks, which + // tend to have the same recursive functions over and over. + // + // This cache is small enough that full associativity is + // cheaper than doing the hashing for a less associative + // cache. + if cache != nil { + x := pcvalueCacheKey(targetpc) + for i := range cache.entries[x] { + // We check off first because we're more + // likely to have multiple entries with + // different offsets for the same targetpc + // than the other way around, so we'll usually + // fail in the first clause. 
+ ent := &cache.entries[x][i] + if ent.off == off && ent.targetpc == targetpc { + return ent.val, 0 + } + } + } + + if !f.valid() { + if strict && panicking == 0 { + println("runtime: no module data for", hex(f.entry())) + throw("no module data") + } + return -1, 0 + } + datap := f.datap + p := datap.pctab[off:] + pc := f.entry() + prevpc := pc + val := int32(-1) + for { + var ok bool + p, ok = step(p, &pc, &val, pc == f.entry()) + if !ok { + break + } + if targetpc < pc { + // Replace a random entry in the cache. Random + // replacement prevents a performance cliff if + // a recursive stack's cycle is slightly + // larger than the cache. + // Put the new element at the beginning, + // since it is the most likely to be newly used. + if cache != nil { + x := pcvalueCacheKey(targetpc) + e := &cache.entries[x] + ci := fastrandn(uint32(len(cache.entries[x]))) + e[ci] = e[0] + e[0] = pcvalueCacheEnt{ + targetpc: targetpc, + off: off, + val: val, + } + } + + return val, prevpc + } + prevpc = pc + } + + // If there was a table, it should have covered all program counters. + // If not, something is wrong. + if panicking != 0 || !strict { + return -1, 0 + } + + print("runtime: invalid pc-encoded table f=", funcname(f), " pc=", hex(pc), " targetpc=", hex(targetpc), " tab=", p, "\n") + + p = datap.pctab[off:] + pc = f.entry() + val = -1 + for { + var ok bool + p, ok = step(p, &pc, &val, pc == f.entry()) + if !ok { + break + } + print("\tvalue=", val, " until pc=", hex(pc), "\n") + } + + throw("invalid runtime symbol table") + return -1, 0 +} + +func cfuncname(f funcInfo) *byte { + if !f.valid() || f.nameoff == 0 { + return nil + } + return &f.datap.funcnametab[f.nameoff] +} + +func funcname(f funcInfo) string { + return gostringnocopy(cfuncname(f)) +} + +func funcpkgpath(f funcInfo) string { + name := funcname(f) + i := len(name) - 1 + for ; i > 0; i-- { + if name[i] == '/' { + break + } + } + for ; i < len(name); i++ { + if name[i] == '.' { + break + } + } + return name[:i] +} + +func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte { + if !f.valid() { + return nil + } + return &f.datap.funcnametab[nameoff] +} + +func funcnameFromNameoff(f funcInfo, nameoff int32) string { + return gostringnocopy(cfuncnameFromNameoff(f, nameoff)) +} + +func funcfile(f funcInfo, fileno int32) string { + datap := f.datap + if !f.valid() { + return "?" + } + // Make sure the cu index and file offset are valid + if fileoff := datap.cutab[f.cuOffset+uint32(fileno)]; fileoff != ^uint32(0) { + return gostringnocopy(&datap.filetab[fileoff]) + } + // pcln section is corrupt. + return "?" 
+} + +func funcline1(f funcInfo, targetpc uintptr, strict bool) (file string, line int32) { + datap := f.datap + if !f.valid() { + return "?", 0 + } + fileno, _ := pcvalue(f, f.pcfile, targetpc, nil, strict) + line, _ = pcvalue(f, f.pcln, targetpc, nil, strict) + if fileno == -1 || line == -1 || int(fileno) >= len(datap.filetab) { + // print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n") + return "?", 0 + } + file = funcfile(f, fileno) + return +} + +func funcline(f funcInfo, targetpc uintptr) (file string, line int32) { + return funcline1(f, targetpc, true) +} + +func funcspdelta(f funcInfo, targetpc uintptr, cache *pcvalueCache) int32 { + x, _ := pcvalue(f, f.pcsp, targetpc, cache, true) + if debugPcln && x&(goarch.PtrSize-1) != 0 { + print("invalid spdelta ", funcname(f), " ", hex(f.entry()), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n") + throw("bad spdelta") + } + return x +} + +// funcMaxSPDelta returns the maximum spdelta at any point in f. +func funcMaxSPDelta(f funcInfo) int32 { + datap := f.datap + p := datap.pctab[f.pcsp:] + pc := f.entry() + val := int32(-1) + max := int32(0) + for { + var ok bool + p, ok = step(p, &pc, &val, pc == f.entry()) + if !ok { + return max + } + if val > max { + max = val + } + } +} + +func pcdatastart(f funcInfo, table uint32) uint32 { + return *(*uint32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4)) +} + +func pcdatavalue(f funcInfo, table uint32, targetpc uintptr, cache *pcvalueCache) int32 { + if table >= f.npcdata { + return -1 + } + r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, true) + return r +} + +func pcdatavalue1(f funcInfo, table uint32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 { + if table >= f.npcdata { + return -1 + } + r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, strict) + return r +} + +// Like pcdatavalue, but also return the start PC of this PCData value. +// It doesn't take a cache. +func pcdatavalue2(f funcInfo, table uint32, targetpc uintptr) (int32, uintptr) { + if table >= f.npcdata { + return -1, 0 + } + return pcvalue(f, pcdatastart(f, table), targetpc, nil, true) +} + +// funcdata returns a pointer to the ith funcdata for f. +// funcdata should be kept in sync with cmd/link:writeFuncs. +func funcdata(f funcInfo, i uint8) unsafe.Pointer { + if i < 0 || i >= f.nfuncdata { + return nil + } + base := f.datap.gofunc // load gofunc address early so that we calculate during cache misses + p := uintptr(unsafe.Pointer(&f.nfuncdata)) + unsafe.Sizeof(f.nfuncdata) + uintptr(f.npcdata)*4 + uintptr(i)*4 + off := *(*uint32)(unsafe.Pointer(p)) + // Return off == ^uint32(0) ? 0 : f.datap.gofunc + uintptr(off), but without branches. + // The compiler calculates mask on most architectures using conditional assignment. + var mask uintptr + if off == ^uint32(0) { + mask = 1 + } + mask-- + raw := base + uintptr(off) + return unsafe.Pointer(raw & mask) +} + +// step advances to the next pc, value pair in the encoded table. +func step(p []byte, pc *uintptr, val *int32, first bool) (newp []byte, ok bool) { + // For both uvdelta and pcdelta, the common case (~70%) + // is that they are a single byte. If so, avoid calling readvarint. 
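+	// The value delta decoded below is zig-zag encoded (bit 0 carries the sign),
+	// so small negative deltas stay small; the pc delta is an unsigned varint
+	// that is scaled by sys.PCQuantum when applied.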
+ uvdelta := uint32(p[0]) + if uvdelta == 0 && !first { + return nil, false + } + n := uint32(1) + if uvdelta&0x80 != 0 { + n, uvdelta = readvarint(p) + } + *val += int32(-(uvdelta & 1) ^ (uvdelta >> 1)) + p = p[n:] + + pcdelta := uint32(p[0]) + n = 1 + if pcdelta&0x80 != 0 { + n, pcdelta = readvarint(p) + } + p = p[n:] + *pc += uintptr(pcdelta * sys.PCQuantum) + return p, true +} + +// readvarint reads a varint from p. +func readvarint(p []byte) (read uint32, val uint32) { + var v, shift, n uint32 + for { + b := p[n] + n++ + v |= uint32(b&0x7F) << (shift & 31) + if b&0x80 == 0 { + break + } + shift += 7 + } + return n, v +} + +type stackmap struct { + n int32 // number of bitmaps + nbit int32 // number of bits in each bitmap + bytedata [1]byte // bitmaps, each starting on a byte boundary +} + +//go:nowritebarrier +func stackmapdata(stkmap *stackmap, n int32) bitvector { + // Check this invariant only when stackDebug is on at all. + // The invariant is already checked by many of stackmapdata's callers, + // and disabling it by default allows stackmapdata to be inlined. + if stackDebug > 0 && (n < 0 || n >= stkmap.n) { + throw("stackmapdata: index out of range") + } + return bitvector{stkmap.nbit, addb(&stkmap.bytedata[0], uintptr(n*((stkmap.nbit+7)>>3)))} +} + +// inlinedCall is the encoding of entries in the FUNCDATA_InlTree table. +type inlinedCall struct { + parent int16 // index of parent in the inltree, or < 0 + funcID funcID // type of the called function + _ byte + file int32 // perCU file index for inlined call. See cmd/link:pcln.go + line int32 // line number of the call site + func_ int32 // offset into pclntab for name of called function + parentPc int32 // position of an instruction whose source position is the call site (offset from entry) +} diff --git a/contrib/go/_std_1.18/src/runtime/sys_darwin.go b/contrib/go/_std_1.18/src/runtime/sys_darwin.go new file mode 100644 index 0000000000..58b3a9171c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_darwin.go @@ -0,0 +1,536 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/abi" + "unsafe" +) + +// The X versions of syscall expect the libc call to return a 64-bit result. +// Otherwise (the non-X version) expects a 32-bit result. +// This distinction is required because an error is indicated by returning -1, +// and we need to know whether to check 32 or 64 bits of the result. +// (Some libc functions that return 32 bits put junk in the upper 32 bits of AX.) 
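The 32-versus-64-bit check matters because a 32-bit -1 with junk in the upper half of the register does not compare equal to a 64-bit -1. A small standalone illustration (the register value below is made up):

package main

import "fmt"

func main() {
	// Hypothetical register contents after a libc call that logically returned
	// the 32-bit error value -1 but left junk in the upper 32 bits.
	raw := uint64(0xdeadbeef_ffffffff)

	// Checking all 64 bits misses the error...
	fmt.Println(raw == ^uint64(0)) // false

	// ...while checking only the low 32 bits, as the non-X variants must, catches it.
	fmt.Println(int32(uint32(raw)) == -1) // true
}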
+ +//go:linkname syscall_syscall syscall.syscall +//go:nosplit +func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1, args.r2, args.err +} +func syscall() + +//go:linkname syscall_syscallX syscall.syscallX +//go:nosplit +func syscall_syscallX(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscallX)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1, args.r2, args.err +} +func syscallX() + +//go:linkname syscall_syscall6 syscall.syscall6 +//go:nosplit +func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, a4, a5, a6, r1, r2, err uintptr }{fn, a1, a2, a3, a4, a5, a6, r1, r2, err} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1, args.r2, args.err +} +func syscall6() + +//go:linkname syscall_syscall6X syscall.syscall6X +//go:nosplit +func syscall_syscall6X(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, a4, a5, a6, r1, r2, err uintptr }{fn, a1, a2, a3, a4, a5, a6, r1, r2, err} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6X)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1, args.r2, args.err +} +func syscall6X() + +//go:linkname syscall_syscallPtr syscall.syscallPtr +//go:nosplit +func syscall_syscallPtr(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscallPtr)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1, args.r2, args.err +} +func syscallPtr() + +//go:linkname syscall_rawSyscall syscall.rawSyscall +//go:nosplit +func syscall_rawSyscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, r1, r2, err uintptr }{fn, a1, a2, a3, r1, r2, err} + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall)), unsafe.Pointer(&args)) + return args.r1, args.r2, args.err +} + +//go:linkname syscall_rawSyscall6 syscall.rawSyscall6 +//go:nosplit +func syscall_rawSyscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + args := struct{ fn, a1, a2, a3, a4, a5, a6, r1, r2, err uintptr }{fn, a1, a2, a3, a4, a5, a6, r1, r2, err} + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall6)), unsafe.Pointer(&args)) + return args.r1, args.r2, args.err +} + +// syscallNoErr is used in crypto/x509 to call into Security.framework and CF. + +//go:linkname crypto_x509_syscall crypto/x509/internal/macos.syscall +//go:nosplit +func crypto_x509_syscall(fn, a1, a2, a3, a4, a5 uintptr, f1 float64) (r1 uintptr) { + args := struct { + fn, a1, a2, a3, a4, a5 uintptr + f1 float64 + r1 uintptr + }{fn, a1, a2, a3, a4, a5, f1, r1} + entersyscall() + libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall_x509)), unsafe.Pointer(&args)) + exitsyscall() + return args.r1 +} +func syscall_x509() + +// The *_trampoline functions convert from the Go calling convention to the C calling convention +// and then call the underlying libc function. They are defined in sys_darwin_$ARCH.s. 
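A minimal sketch of that wrapper shape, using made-up names (openArgs, fakeLibcCall, openSketch) in place of the real trampolines and libcCall: pack the arguments into one block, hand a single pointer to the trampoline, and keep Go-managed pointer arguments alive across the C call.

package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

type openArgs struct {
	name       string
	mode, perm int32
}

// fakeLibcCall stands in for the runtime's libcCall: the real assembly trampoline
// receives one pointer to the packed arguments and unpacks them into the C
// calling convention before calling the libc function.
func fakeLibcCall(args unsafe.Pointer) int32 {
	a := (*openArgs)(args)
	fmt.Printf("trampoline would call open(%q, %d, %d)\n", a.name, a.mode, a.perm)
	return 3 // pretend the call returned fd 3
}

// openSketch mirrors the shape of the wrappers below: pack the arguments,
// pass a single pointer, keep the arguments alive across the call.
func openSketch(name string, mode, perm int32) int32 {
	args := openArgs{name, mode, perm}
	ret := fakeLibcCall(unsafe.Pointer(&args))
	runtime.KeepAlive(&args)
	return ret
}

func main() {
	fmt.Println("fd =", openSketch("/tmp/example", 0, 0644))
}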
+ +//go:nosplit +//go:cgo_unsafe_args +func pthread_attr_init(attr *pthreadattr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_init_trampoline)), unsafe.Pointer(&attr)) + KeepAlive(attr) + return ret +} +func pthread_attr_init_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_attr_getstacksize(attr *pthreadattr, size *uintptr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_getstacksize_trampoline)), unsafe.Pointer(&attr)) + KeepAlive(attr) + KeepAlive(size) + return ret +} +func pthread_attr_getstacksize_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_attr_setdetachstate(attr *pthreadattr, state int) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_attr_setdetachstate_trampoline)), unsafe.Pointer(&attr)) + KeepAlive(attr) + return ret +} +func pthread_attr_setdetachstate_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_create(attr *pthreadattr, start uintptr, arg unsafe.Pointer) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_create_trampoline)), unsafe.Pointer(&attr)) + KeepAlive(attr) + KeepAlive(arg) // Just for consistency. Arg of course needs to be kept alive for the start function. + return ret +} +func pthread_create_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func raise(sig uint32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(raise_trampoline)), unsafe.Pointer(&sig)) +} +func raise_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_self() (t pthread) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_self_trampoline)), unsafe.Pointer(&t)) + return +} +func pthread_self_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_kill(t pthread, sig uint32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_kill_trampoline)), unsafe.Pointer(&t)) + return +} +func pthread_kill_trampoline() + +// mmap is used to do low-level memory allocation via mmap. Don't allow stack +// splits, since this function (used by sysAlloc) is called in a lot of low-level +// parts of the runtime and callers often assume it won't acquire any locks. +//go:nosplit +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) { + args := struct { + addr unsafe.Pointer + n uintptr + prot, flags, fd int32 + off uint32 + ret1 unsafe.Pointer + ret2 int + }{addr, n, prot, flags, fd, off, nil, 0} + libcCall(unsafe.Pointer(abi.FuncPCABI0(mmap_trampoline)), unsafe.Pointer(&args)) + return args.ret1, args.ret2 +} +func mmap_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func munmap(addr unsafe.Pointer, n uintptr) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(munmap_trampoline)), unsafe.Pointer(&addr)) + KeepAlive(addr) // Just for consistency. Hopefully addr is not a Go address. +} +func munmap_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func madvise(addr unsafe.Pointer, n uintptr, flags int32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(madvise_trampoline)), unsafe.Pointer(&addr)) + KeepAlive(addr) // Just for consistency. Hopefully addr is not a Go address. +} +func madvise_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func mlock(addr unsafe.Pointer, n uintptr) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(mlock_trampoline)), unsafe.Pointer(&addr)) + KeepAlive(addr) // Just for consistency. Hopefully addr is not a Go address. 
+} +func mlock_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func read(fd int32, p unsafe.Pointer, n int32) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(read_trampoline)), unsafe.Pointer(&fd)) + KeepAlive(p) + return ret +} +func read_trampoline() + +func pipe() (r, w int32, errno int32) { + var p [2]int32 + errno = libcCall(unsafe.Pointer(abi.FuncPCABI0(pipe_trampoline)), noescape(unsafe.Pointer(&p))) + return p[0], p[1], errno +} +func pipe_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func closefd(fd int32) int32 { + return libcCall(unsafe.Pointer(abi.FuncPCABI0(close_trampoline)), unsafe.Pointer(&fd)) +} +func close_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +// +// This is exported via linkname to assembly in runtime/cgo. +//go:linkname exit +func exit(code int32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(exit_trampoline)), unsafe.Pointer(&code)) +} +func exit_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func usleep(usec uint32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(usleep_trampoline)), unsafe.Pointer(&usec)) +} +func usleep_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func usleep_no_g(usec uint32) { + asmcgocall_no_g(unsafe.Pointer(abi.FuncPCABI0(usleep_trampoline)), unsafe.Pointer(&usec)) +} + +//go:nosplit +//go:cgo_unsafe_args +func write1(fd uintptr, p unsafe.Pointer, n int32) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(write_trampoline)), unsafe.Pointer(&fd)) + KeepAlive(p) + return ret +} +func write_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func open(name *byte, mode, perm int32) (ret int32) { + ret = libcCall(unsafe.Pointer(abi.FuncPCABI0(open_trampoline)), unsafe.Pointer(&name)) + KeepAlive(name) + return +} +func open_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func nanotime1() int64 { + var r struct { + t int64 // raw timer + numer, denom uint32 // conversion factors. nanoseconds = t * numer / denom. + } + libcCall(unsafe.Pointer(abi.FuncPCABI0(nanotime_trampoline)), unsafe.Pointer(&r)) + // Note: Apple seems unconcerned about overflow here. See + // https://developer.apple.com/library/content/qa/qa1398/_index.html + // Note also, numer == denom == 1 is common. + t := r.t + if r.numer != 1 { + t *= int64(r.numer) + } + if r.denom != 1 { + t /= int64(r.denom) + } + return t +} +func nanotime_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func walltime() (int64, int32) { + var t timespec + libcCall(unsafe.Pointer(abi.FuncPCABI0(walltime_trampoline)), unsafe.Pointer(&t)) + return t.tv_sec, int32(t.tv_nsec) +} +func walltime_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func sigaction(sig uint32, new *usigactiont, old *usigactiont) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(sigaction_trampoline)), unsafe.Pointer(&sig)) + KeepAlive(new) + KeepAlive(old) +} +func sigaction_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func sigprocmask(how uint32, new *sigset, old *sigset) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(sigprocmask_trampoline)), unsafe.Pointer(&how)) + KeepAlive(new) + KeepAlive(old) +} +func sigprocmask_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func sigaltstack(new *stackt, old *stackt) { + if new != nil && new.ss_flags&_SS_DISABLE != 0 && new.ss_size == 0 { + // Despite the fact that Darwin's sigaltstack man page says it ignores the size + // when SS_DISABLE is set, it doesn't. sigaltstack returns ENOMEM + // if we don't give it a reasonable size. 
+ // ref: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20140421/214296.html + new.ss_size = 32768 + } + libcCall(unsafe.Pointer(abi.FuncPCABI0(sigaltstack_trampoline)), unsafe.Pointer(&new)) + KeepAlive(new) + KeepAlive(old) +} +func sigaltstack_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func raiseproc(sig uint32) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(raiseproc_trampoline)), unsafe.Pointer(&sig)) +} +func raiseproc_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func setitimer(mode int32, new, old *itimerval) { + libcCall(unsafe.Pointer(abi.FuncPCABI0(setitimer_trampoline)), unsafe.Pointer(&mode)) + KeepAlive(new) + KeepAlive(old) +} +func setitimer_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func sysctl(mib *uint32, miblen uint32, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(sysctl_trampoline)), unsafe.Pointer(&mib)) + KeepAlive(mib) + KeepAlive(oldp) + KeepAlive(oldlenp) + KeepAlive(newp) + return ret +} +func sysctl_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func sysctlbyname(name *byte, oldp *byte, oldlenp *uintptr, newp *byte, newlen uintptr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(sysctlbyname_trampoline)), unsafe.Pointer(&name)) + KeepAlive(name) + KeepAlive(oldp) + KeepAlive(oldlenp) + KeepAlive(newp) + return ret +} +func sysctlbyname_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func fcntl(fd, cmd, arg int32) int32 { + return libcCall(unsafe.Pointer(abi.FuncPCABI0(fcntl_trampoline)), unsafe.Pointer(&fd)) +} +func fcntl_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func kqueue() int32 { + v := libcCall(unsafe.Pointer(abi.FuncPCABI0(kqueue_trampoline)), nil) + return v +} +func kqueue_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(kevent_trampoline)), unsafe.Pointer(&kq)) + KeepAlive(ch) + KeepAlive(ev) + KeepAlive(ts) + return ret +} +func kevent_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_mutex_init(m *pthreadmutex, attr *pthreadmutexattr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_init_trampoline)), unsafe.Pointer(&m)) + KeepAlive(m) + KeepAlive(attr) + return ret +} +func pthread_mutex_init_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_mutex_lock(m *pthreadmutex) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_lock_trampoline)), unsafe.Pointer(&m)) + KeepAlive(m) + return ret +} +func pthread_mutex_lock_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_mutex_unlock(m *pthreadmutex) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_mutex_unlock_trampoline)), unsafe.Pointer(&m)) + KeepAlive(m) + return ret +} +func pthread_mutex_unlock_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_cond_init(c *pthreadcond, attr *pthreadcondattr) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_init_trampoline)), unsafe.Pointer(&c)) + KeepAlive(c) + KeepAlive(attr) + return ret +} +func pthread_cond_init_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_cond_wait(c *pthreadcond, m *pthreadmutex) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_wait_trampoline)), unsafe.Pointer(&c)) + KeepAlive(c) + KeepAlive(m) + return ret +} +func pthread_cond_wait_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func 
pthread_cond_timedwait_relative_np(c *pthreadcond, m *pthreadmutex, t *timespec) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_timedwait_relative_np_trampoline)), unsafe.Pointer(&c)) + KeepAlive(c) + KeepAlive(m) + KeepAlive(t) + return ret +} +func pthread_cond_timedwait_relative_np_trampoline() + +//go:nosplit +//go:cgo_unsafe_args +func pthread_cond_signal(c *pthreadcond) int32 { + ret := libcCall(unsafe.Pointer(abi.FuncPCABI0(pthread_cond_signal_trampoline)), unsafe.Pointer(&c)) + KeepAlive(c) + return ret +} +func pthread_cond_signal_trampoline() + +// Not used on Darwin, but must be defined. +func exitThread(wait *uint32) { +} + +//go:nosplit +func closeonexec(fd int32) { + fcntl(fd, _F_SETFD, _FD_CLOEXEC) +} + +//go:nosplit +func setNonblock(fd int32) { + flags := fcntl(fd, _F_GETFL, 0) + fcntl(fd, _F_SETFL, flags|_O_NONBLOCK) +} + +// Tell the linker that the libc_* functions are to be found +// in a system library, with the libc_ prefix missing. + +//go:cgo_import_dynamic libc_pthread_attr_init pthread_attr_init "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_attr_getstacksize pthread_attr_getstacksize "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_attr_setdetachstate pthread_attr_setdetachstate "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_create pthread_create "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_self pthread_self "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_kill pthread_kill "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_exit _exit "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_raise raise "/usr/lib/libSystem.B.dylib" + +//go:cgo_import_dynamic libc_open open "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_close close "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_read read "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_write write "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pipe pipe "/usr/lib/libSystem.B.dylib" + +//go:cgo_import_dynamic libc_mmap mmap "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_munmap munmap "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_madvise madvise "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_mlock mlock "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_error __error "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_usleep usleep "/usr/lib/libSystem.B.dylib" + +//go:cgo_import_dynamic libc_mach_timebase_info mach_timebase_info "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_mach_absolute_time mach_absolute_time "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_clock_gettime clock_gettime "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sigaction sigaction "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_sigmask pthread_sigmask "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sigaltstack sigaltstack "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_getpid getpid "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_kill kill "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_setitimer setitimer "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_fcntl fcntl "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_kqueue kqueue 
"/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_kevent kevent "/usr/lib/libSystem.B.dylib" + +//go:cgo_import_dynamic libc_pthread_mutex_init pthread_mutex_init "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_mutex_lock pthread_mutex_lock "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_mutex_unlock pthread_mutex_unlock "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_cond_init pthread_cond_init "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_cond_wait pthread_cond_wait "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_cond_timedwait_relative_np pthread_cond_timedwait_relative_np "/usr/lib/libSystem.B.dylib" +//go:cgo_import_dynamic libc_pthread_cond_signal pthread_cond_signal "/usr/lib/libSystem.B.dylib" diff --git a/contrib/go/_std_1.18/src/runtime/sys_darwin_amd64.s b/contrib/go/_std_1.18/src/runtime/sys_darwin_amd64.s new file mode 100644 index 0000000000..db4715d2b7 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_darwin_amd64.s @@ -0,0 +1,859 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// System calls and other sys.stuff for AMD64, Darwin +// System calls are implemented in libSystem, this file contains +// trampolines that convert from Go to C calling convention. + +#include "go_asm.h" +#include "go_tls.h" +#include "textflag.h" +#include "cgo/abi_amd64.h" + +#define CLOCK_REALTIME 0 + +// Exit the entire program (like C exit) +TEXT runtime·exit_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 0(DI), DI // arg 1 exit status + CALL libc_exit(SB) + MOVL $0xf1, 0xf1 // crash + POPQ BP + RET + +TEXT runtime·open_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 8(DI), SI // arg 2 flags + MOVL 12(DI), DX // arg 3 mode + MOVQ 0(DI), DI // arg 1 pathname + XORL AX, AX // vararg: say "no float args" + CALL libc_open(SB) + POPQ BP + RET + +TEXT runtime·close_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 0(DI), DI // arg 1 fd + CALL libc_close(SB) + POPQ BP + RET + +TEXT runtime·read_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 buf + MOVL 16(DI), DX // arg 3 count + MOVL 0(DI), DI // arg 1 fd + CALL libc_read(SB) + TESTL AX, AX + JGE noerr + CALL libc_error(SB) + MOVL (AX), AX + NEGL AX // caller expects negative errno value +noerr: + POPQ BP + RET + +TEXT runtime·write_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 buf + MOVL 16(DI), DX // arg 3 count + MOVQ 0(DI), DI // arg 1 fd + CALL libc_write(SB) + TESTL AX, AX + JGE noerr + CALL libc_error(SB) + MOVL (AX), AX + NEGL AX // caller expects negative errno value +noerr: + POPQ BP + RET + +TEXT runtime·pipe_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + CALL libc_pipe(SB) // pointer already in DI + TESTL AX, AX + JEQ 3(PC) + CALL libc_error(SB) // return negative errno value + NEGL AX + POPQ BP + RET + +TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 new + MOVQ 16(DI), DX // arg 3 old + MOVL 0(DI), DI // arg 1 which + CALL libc_setitimer(SB) + POPQ BP + RET + +TEXT runtime·madvise_trampoline(SB), NOSPLIT, $0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 len + MOVL 16(DI), DX // arg 3 advice + MOVQ 0(DI), DI // arg 1 addr + CALL libc_madvise(SB) + // ignore failure - maybe pages are locked + POPQ BP + RET + +TEXT runtime·mlock_trampoline(SB), NOSPLIT, $0 + UNDEF 
// unimplemented + +GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size) + +TEXT runtime·nanotime_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ DI, BX + CALL libc_mach_absolute_time(SB) + MOVQ AX, 0(BX) + MOVL timebase<>+machTimebaseInfo_numer(SB), SI + MOVL timebase<>+machTimebaseInfo_denom(SB), DI // atomic read + TESTL DI, DI + JNE initialized + + SUBQ $(machTimebaseInfo__size+15)/16*16, SP + MOVQ SP, DI + CALL libc_mach_timebase_info(SB) + MOVL machTimebaseInfo_numer(SP), SI + MOVL machTimebaseInfo_denom(SP), DI + ADDQ $(machTimebaseInfo__size+15)/16*16, SP + + MOVL SI, timebase<>+machTimebaseInfo_numer(SB) + MOVL DI, AX + XCHGL AX, timebase<>+machTimebaseInfo_denom(SB) // atomic write + +initialized: + MOVL SI, 8(BX) + MOVL DI, 12(BX) + MOVQ BP, SP + POPQ BP + RET + +TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0 + PUSHQ BP // make a frame; keep stack aligned + MOVQ SP, BP + MOVQ DI, SI // arg 2 timespec + MOVL $CLOCK_REALTIME, DI // arg 1 clock_id + CALL libc_clock_gettime(SB) + POPQ BP + RET + +TEXT runtime·sigaction_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 new + MOVQ 16(DI), DX // arg 3 old + MOVL 0(DI), DI // arg 1 sig + CALL libc_sigaction(SB) + TESTL AX, AX + JEQ 2(PC) + MOVL $0xf1, 0xf1 // crash + POPQ BP + RET + +TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 new + MOVQ 16(DI), DX // arg 3 old + MOVL 0(DI), DI // arg 1 how + CALL libc_pthread_sigmask(SB) + TESTL AX, AX + JEQ 2(PC) + MOVL $0xf1, 0xf1 // crash + POPQ BP + RET + +TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 old + MOVQ 0(DI), DI // arg 1 new + CALL libc_sigaltstack(SB) + TESTQ AX, AX + JEQ 2(PC) + MOVL $0xf1, 0xf1 // crash + POPQ BP + RET + +TEXT runtime·raiseproc_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 0(DI), BX // signal + CALL libc_getpid(SB) + MOVL AX, DI // arg 1 pid + MOVL BX, SI // arg 2 signal + CALL libc_kill(SB) + POPQ BP + RET + +TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 + MOVQ fn+0(FP), AX + MOVL sig+8(FP), DI + MOVQ info+16(FP), SI + MOVQ ctx+24(FP), DX + PUSHQ BP + MOVQ SP, BP + ANDQ $~15, SP // alignment for x86_64 ABI + CALL AX + MOVQ BP, SP + POPQ BP + RET + +// This is the function registered during sigaction and is invoked when +// a signal is received. It just redirects to the Go function sigtrampgo. +// Called using C ABI. +TEXT runtime·sigtramp(SB),NOSPLIT,$0 + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + // Call into the Go signal handler + NOP SP // disable vet stack checking + ADJSP $24 + MOVL DI, 0(SP) // sig + MOVQ SI, 8(SP) // info + MOVQ DX, 16(SP) // ctx + CALL ·sigtrampgo(SB) + ADJSP $-24 + + POP_REGS_HOST_TO_ABI0() + RET + +// Called using C ABI. +TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT,$0 + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + // Call into the Go signal handler + NOP SP // disable vet stack checking + ADJSP $24 + MOVL DI, 0(SP) // sig + MOVQ SI, 8(SP) // info + MOVQ DX, 16(SP) // ctx + CALL ·sigprofNonGo(SB) + ADJSP $-24 + + POP_REGS_HOST_TO_ABI0() + RET + +// Used instead of sigtramp in programs that use cgo. +// Arguments from kernel are in DI, SI, DX. +TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 + // If no traceback function, do usual sigtramp. + MOVQ runtime·cgoTraceback(SB), AX + TESTQ AX, AX + JZ sigtramp + + // If no traceback support function, which means that + // runtime/cgo was not linked in, do usual sigtramp. 
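The nanotime trampoline above hands three values back through the block DI points at: the raw mach_absolute_time ticks at offset 0 and the timebase numer/denom pair at offsets 8 and 12. A minimal sketch of how such a triple converts to nanoseconds under standard mach_timebase_info semantics; the type and method names here are illustrative, not the runtime's own declarations:

package sketch

// machTime mirrors the result block nanotime_trampoline fills in:
// ticks at offset 0, numer at offset 8, denom at offset 12.
type machTime struct {
	ticks        uint64 // mach_absolute_time result
	numer, denom uint32 // timebase fraction
}

// nanoseconds applies the timebase conversion: ns = ticks * numer / denom.
func (m machTime) nanoseconds() int64 {
	return int64(m.ticks * uint64(m.numer) / uint64(m.denom))
}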
+ MOVQ _cgo_callers(SB), AX + TESTQ AX, AX + JZ sigtramp + + // Figure out if we are currently in a cgo call. + // If not, just do usual sigtramp. + get_tls(CX) + MOVQ g(CX),AX + TESTQ AX, AX + JZ sigtrampnog // g == nil + MOVQ g_m(AX), AX + TESTQ AX, AX + JZ sigtramp // g.m == nil + MOVL m_ncgo(AX), CX + TESTL CX, CX + JZ sigtramp // g.m.ncgo == 0 + MOVQ m_curg(AX), CX + TESTQ CX, CX + JZ sigtramp // g.m.curg == nil + MOVQ g_syscallsp(CX), CX + TESTQ CX, CX + JZ sigtramp // g.m.curg.syscallsp == 0 + MOVQ m_cgoCallers(AX), R8 + TESTQ R8, R8 + JZ sigtramp // g.m.cgoCallers == nil + MOVL m_cgoCallersUse(AX), CX + TESTL CX, CX + JNZ sigtramp // g.m.cgoCallersUse != 0 + + // Jump to a function in runtime/cgo. + // That function, written in C, will call the user's traceback + // function with proper unwind info, and will then call back here. + // The first three arguments, and the fifth, are already in registers. + // Set the two remaining arguments now. + MOVQ runtime·cgoTraceback(SB), CX + MOVQ $runtime·sigtramp(SB), R9 + MOVQ _cgo_callers(SB), AX + JMP AX + +sigtramp: + JMP runtime·sigtramp(SB) + +sigtrampnog: + // Signal arrived on a non-Go thread. If this is SIGPROF, get a + // stack trace. + CMPL DI, $27 // 27 == SIGPROF + JNZ sigtramp + + // Lock sigprofCallersUse. + MOVL $0, AX + MOVL $1, CX + MOVQ $runtime·sigprofCallersUse(SB), R11 + LOCK + CMPXCHGL CX, 0(R11) + JNZ sigtramp // Skip stack trace if already locked. + + // Jump to the traceback function in runtime/cgo. + // It will call back to sigprofNonGo, via sigprofNonGoWrapper, to convert + // the arguments to the Go calling convention. + // First three arguments to traceback function are in registers already. + MOVQ runtime·cgoTraceback(SB), CX + MOVQ $runtime·sigprofCallers(SB), R8 + MOVQ $runtime·sigprofNonGoWrapper<>(SB), R9 + MOVQ _cgo_callers(SB), AX + JMP AX + +TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0 + PUSHQ BP // make a frame; keep stack aligned + MOVQ SP, BP + MOVQ DI, BX + MOVQ 0(BX), DI // arg 1 addr + MOVQ 8(BX), SI // arg 2 len + MOVL 16(BX), DX // arg 3 prot + MOVL 20(BX), CX // arg 4 flags + MOVL 24(BX), R8 // arg 5 fid + MOVL 28(BX), R9 // arg 6 offset + CALL libc_mmap(SB) + XORL DX, DX + CMPQ AX, $-1 + JNE ok + CALL libc_error(SB) + MOVLQSX (AX), DX // errno + XORL AX, AX +ok: + MOVQ AX, 32(BX) + MOVQ DX, 40(BX) + POPQ BP + RET + +TEXT runtime·munmap_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 len + MOVQ 0(DI), DI // arg 1 addr + CALL libc_munmap(SB) + TESTQ AX, AX + JEQ 2(PC) + MOVL $0xf1, 0xf1 // crash + POPQ BP + RET + +TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 0(DI), DI // arg 1 usec + CALL libc_usleep(SB) + POPQ BP + RET + +TEXT runtime·settls(SB),NOSPLIT,$32 + // Nothing to do on Darwin, pthread already set thread-local storage up. 
+ RET + +TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 8(DI), SI // arg 2 miblen + MOVQ 16(DI), DX // arg 3 oldp + MOVQ 24(DI), CX // arg 4 oldlenp + MOVQ 32(DI), R8 // arg 5 newp + MOVQ 40(DI), R9 // arg 6 newlen + MOVQ 0(DI), DI // arg 1 mib + CALL libc_sysctl(SB) + POPQ BP + RET + +TEXT runtime·sysctlbyname_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 oldp + MOVQ 16(DI), DX // arg 3 oldlenp + MOVQ 24(DI), CX // arg 4 newp + MOVQ 32(DI), R8 // arg 5 newlen + MOVQ 0(DI), DI // arg 1 name + CALL libc_sysctlbyname(SB) + POPQ BP + RET + +TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + CALL libc_kqueue(SB) + POPQ BP + RET + +TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 keventt + MOVL 16(DI), DX // arg 3 nch + MOVQ 24(DI), CX // arg 4 ev + MOVL 32(DI), R8 // arg 5 nev + MOVQ 40(DI), R9 // arg 6 ts + MOVL 0(DI), DI // arg 1 kq + CALL libc_kevent(SB) + CMPL AX, $-1 + JNE ok + CALL libc_error(SB) + MOVLQSX (AX), AX // errno + NEGQ AX // caller wants it as a negative error code +ok: + POPQ BP + RET + +TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 4(DI), SI // arg 2 cmd + MOVL 8(DI), DX // arg 3 arg + MOVL 0(DI), DI // arg 1 fd + XORL AX, AX // vararg: say "no float args" + CALL libc_fcntl(SB) + POPQ BP + RET + +// mstart_stub is the first function executed on a new thread started by pthread_create. +// It just does some low-level setup and then calls mstart. +// Note: called with the C calling convention. +TEXT runtime·mstart_stub(SB),NOSPLIT,$0 + // DI points to the m. + // We are already on m's g0 stack. + + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + MOVQ m_g0(DI), DX // g + + // Initialize TLS entry. + // See cmd/link/internal/ld/sym.go:computeTLSOffset. + MOVQ DX, 0x30(GS) + + CALL runtime·mstart(SB) + + POP_REGS_HOST_TO_ABI0() + + // Go is all done with this OS thread. + // Tell pthread everything is ok (we never join with this thread, so + // the value here doesn't really matter). + XORL AX, AX + RET + +// These trampolines help convert from Go calling convention to C calling convention. +// They should be called with asmcgocall. +// A pointer to the arguments is passed in DI. +// A single int32 result is returned in AX. +// (For more results, make an args/results structure.) +TEXT runtime·pthread_attr_init_trampoline(SB),NOSPLIT,$0 + PUSHQ BP // make frame, keep stack 16-byte aligned. 
+ MOVQ SP, BP + MOVQ 0(DI), DI // arg 1 attr + CALL libc_pthread_attr_init(SB) + POPQ BP + RET + +TEXT runtime·pthread_attr_getstacksize_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 size + MOVQ 0(DI), DI // arg 1 attr + CALL libc_pthread_attr_getstacksize(SB) + POPQ BP + RET + +TEXT runtime·pthread_attr_setdetachstate_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 state + MOVQ 0(DI), DI // arg 1 attr + CALL libc_pthread_attr_setdetachstate(SB) + POPQ BP + RET + +TEXT runtime·pthread_create_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ 0(DI), SI // arg 2 attr + MOVQ 8(DI), DX // arg 3 start + MOVQ 16(DI), CX // arg 4 arg + MOVQ SP, DI // arg 1 &threadid (which we throw away) + CALL libc_pthread_create(SB) + MOVQ BP, SP + POPQ BP + RET + +TEXT runtime·raise_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVL 0(DI), DI // arg 1 signal + CALL libc_raise(SB) + POPQ BP + RET + +TEXT runtime·pthread_mutex_init_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 attr + MOVQ 0(DI), DI // arg 1 mutex + CALL libc_pthread_mutex_init(SB) + POPQ BP + RET + +TEXT runtime·pthread_mutex_lock_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 0(DI), DI // arg 1 mutex + CALL libc_pthread_mutex_lock(SB) + POPQ BP + RET + +TEXT runtime·pthread_mutex_unlock_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 0(DI), DI // arg 1 mutex + CALL libc_pthread_mutex_unlock(SB) + POPQ BP + RET + +TEXT runtime·pthread_cond_init_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 attr + MOVQ 0(DI), DI // arg 1 cond + CALL libc_pthread_cond_init(SB) + POPQ BP + RET + +TEXT runtime·pthread_cond_wait_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 mutex + MOVQ 0(DI), DI // arg 1 cond + CALL libc_pthread_cond_wait(SB) + POPQ BP + RET + +TEXT runtime·pthread_cond_timedwait_relative_np_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 mutex + MOVQ 16(DI), DX // arg 3 timeout + MOVQ 0(DI), DI // arg 1 cond + CALL libc_pthread_cond_timedwait_relative_np(SB) + POPQ BP + RET + +TEXT runtime·pthread_cond_signal_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 0(DI), DI // arg 1 cond + CALL libc_pthread_cond_signal(SB) + POPQ BP + RET + +TEXT runtime·pthread_self_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ DI, BX // BX is caller-save + CALL libc_pthread_self(SB) + MOVQ AX, 0(BX) // return value + POPQ BP + RET + +TEXT runtime·pthread_kill_trampoline(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + MOVQ 8(DI), SI // arg 2 sig + MOVQ 0(DI), DI // arg 1 thread + CALL libc_pthread_kill(SB) + POPQ BP + RET + +// syscall calls a function in libc on behalf of the syscall package. +// syscall takes a pointer to a struct like: +// struct { +// fn uintptr +// a1 uintptr +// a2 uintptr +// a3 uintptr +// r1 uintptr +// r2 uintptr +// err uintptr +// } +// syscall must be called on the g0 stack with the +// C calling convention (use libcCall). +// +// syscall expects a 32-bit result and tests for 32-bit -1 +// to decide there was an error. 
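As the comment above the pthread trampolines notes, each trampoline receives in DI a single pointer to a packed argument block and loads fields at fixed 8-byte offsets. A minimal sketch of the block pthread_create_trampoline above reads (offsets 0, 8 and 16); the struct and field names are illustrative, not the runtime's actual declarations:

package sketch

import "unsafe"

// pthreadCreateArgs mirrors the offsets pthread_create_trampoline dereferences:
// 0(DI) becomes C arg 2 (attr), 8(DI) becomes C arg 3 (start routine), and
// 16(DI) becomes C arg 4 (arg). C arg 1, the pthread_t out-pointer, is a
// throwaway stack slot supplied by the trampoline itself.
type pthreadCreateArgs struct {
	attr  unsafe.Pointer
	start uintptr
	arg   unsafe.Pointer
}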
+TEXT runtime·syscall(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), CX // fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL CX + + MOVQ (SP), DI + MOVQ AX, (4*8)(DI) // r1 + MOVQ DX, (5*8)(DI) // r2 + + // Standard libc functions return -1 on error + // and set errno. + CMPL AX, $-1 // Note: high 32 bits are junk + JNE ok + + // Get error code from libc. + CALL libc_error(SB) + MOVLQSX (AX), AX + MOVQ (SP), DI + MOVQ AX, (6*8)(DI) // err + +ok: + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET + +// syscallX calls a function in libc on behalf of the syscall package. +// syscallX takes a pointer to a struct like: +// struct { +// fn uintptr +// a1 uintptr +// a2 uintptr +// a3 uintptr +// r1 uintptr +// r2 uintptr +// err uintptr +// } +// syscallX must be called on the g0 stack with the +// C calling convention (use libcCall). +// +// syscallX is like syscall but expects a 64-bit result +// and tests for 64-bit -1 to decide there was an error. +TEXT runtime·syscallX(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), CX // fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL CX + + MOVQ (SP), DI + MOVQ AX, (4*8)(DI) // r1 + MOVQ DX, (5*8)(DI) // r2 + + // Standard libc functions return -1 on error + // and set errno. + CMPQ AX, $-1 + JNE ok + + // Get error code from libc. + CALL libc_error(SB) + MOVLQSX (AX), AX + MOVQ (SP), DI + MOVQ AX, (6*8)(DI) // err + +ok: + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET + +// syscallPtr is like syscallX except that the libc function reports an +// error by returning NULL and setting errno. +TEXT runtime·syscallPtr(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), CX // fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL CX + + MOVQ (SP), DI + MOVQ AX, (4*8)(DI) // r1 + MOVQ DX, (5*8)(DI) // r2 + + // syscallPtr libc functions return NULL on error + // and set errno. + TESTQ AX, AX + JNE ok + + // Get error code from libc. + CALL libc_error(SB) + MOVLQSX (AX), AX + MOVQ (SP), DI + MOVQ AX, (6*8)(DI) // err + +ok: + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET + +// syscall6 calls a function in libc on behalf of the syscall package. +// syscall6 takes a pointer to a struct like: +// struct { +// fn uintptr +// a1 uintptr +// a2 uintptr +// a3 uintptr +// a4 uintptr +// a5 uintptr +// a6 uintptr +// r1 uintptr +// r2 uintptr +// err uintptr +// } +// syscall6 must be called on the g0 stack with the +// C calling convention (use libcCall). +// +// syscall6 expects a 32-bit result and tests for 32-bit -1 +// to decide there was an error. 
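The syscall, syscallX, syscallPtr and syscall6 comments above all describe the same bridge protocol for the syscall package: one struct carries the target libc function, its arguments, the raw results, and an errno slot. A minimal sketch of the layout runtime·syscall6 below indexes (fn at 0*8 through err at 9*8); the names are illustrative, not the syscall package's actual declarations:

package sketch

// syscall6Args mirrors the offsets runtime·syscall6 uses: fn at 0*8,
// a1..a6 at 1*8..6*8 (loaded into DI, SI, DX, CX, R8, R9), r1/r2 at
// 7*8/8*8 (AX and DX after the call), and err at 9*8, filled from libc
// errno only when the comparison of r1 against -1 signals failure.
type syscall6Args struct {
	fn                     uintptr
	a1, a2, a3, a4, a5, a6 uintptr
	r1, r2                 uintptr
	err                    uintptr
}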
+TEXT runtime·syscall6(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), R11// fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ (4*8)(DI), CX // a4 + MOVQ (5*8)(DI), R8 // a5 + MOVQ (6*8)(DI), R9 // a6 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL R11 + + MOVQ (SP), DI + MOVQ AX, (7*8)(DI) // r1 + MOVQ DX, (8*8)(DI) // r2 + + CMPL AX, $-1 + JNE ok + + CALL libc_error(SB) + MOVLQSX (AX), AX + MOVQ (SP), DI + MOVQ AX, (9*8)(DI) // err + +ok: + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET + +// syscall6X calls a function in libc on behalf of the syscall package. +// syscall6X takes a pointer to a struct like: +// struct { +// fn uintptr +// a1 uintptr +// a2 uintptr +// a3 uintptr +// a4 uintptr +// a5 uintptr +// a6 uintptr +// r1 uintptr +// r2 uintptr +// err uintptr +// } +// syscall6X must be called on the g0 stack with the +// C calling convention (use libcCall). +// +// syscall6X is like syscall6 but expects a 64-bit result +// and tests for 64-bit -1 to decide there was an error. +TEXT runtime·syscall6X(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), R11// fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ (4*8)(DI), CX // a4 + MOVQ (5*8)(DI), R8 // a5 + MOVQ (6*8)(DI), R9 // a6 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL R11 + + MOVQ (SP), DI + MOVQ AX, (7*8)(DI) // r1 + MOVQ DX, (8*8)(DI) // r2 + + CMPQ AX, $-1 + JNE ok + + CALL libc_error(SB) + MOVLQSX (AX), AX + MOVQ (SP), DI + MOVQ AX, (9*8)(DI) // err + +ok: + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET + +// syscall_x509 is for crypto/x509. It is like syscall6 but does not check for errors, +// takes 5 uintptrs and 1 float64, and only returns one value, +// for use with standard C ABI functions. +TEXT runtime·syscall_x509(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + SUBQ $16, SP + MOVQ (0*8)(DI), R11// fn + MOVQ (2*8)(DI), SI // a2 + MOVQ (3*8)(DI), DX // a3 + MOVQ (4*8)(DI), CX // a4 + MOVQ (5*8)(DI), R8 // a5 + MOVQ (6*8)(DI), X0 // f1 + MOVQ DI, (SP) + MOVQ (1*8)(DI), DI // a1 + XORL AX, AX // vararg: say "no float args" + + CALL R11 + + MOVQ (SP), DI + MOVQ AX, (7*8)(DI) // r1 + + XORL AX, AX // no error (it's ignored anyway) + MOVQ BP, SP + POPQ BP + RET diff --git a/contrib/go/_std_1.18/src/runtime/sys_libc.go b/contrib/go/_std_1.18/src/runtime/sys_libc.go new file mode 100644 index 0000000000..7012b4167e --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_libc.go @@ -0,0 +1,53 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build darwin || (openbsd && !mips64) + +package runtime + +import "unsafe" + +// Call fn with arg as its argument. Return what fn returns. +// fn is the raw pc value of the entry point of the desired function. +// Switches to the system stack, if not already there. +// Preserves the calling point as the location where a profiler traceback will begin. +//go:nosplit +func libcCall(fn, arg unsafe.Pointer) int32 { + // Leave caller's PC/SP/G around for traceback. 
+ gp := getg() + var mp *m + if gp != nil { + mp = gp.m + } + if mp != nil && mp.libcallsp == 0 { + mp.libcallg.set(gp) + mp.libcallpc = getcallerpc() + // sp must be the last, because once async cpu profiler finds + // all three values to be non-zero, it will use them + mp.libcallsp = getcallersp() + } else { + // Make sure we don't reset libcallsp. This makes + // libcCall reentrant; We remember the g/pc/sp for the + // first call on an M, until that libcCall instance + // returns. Reentrance only matters for signals, as + // libc never calls back into Go. The tricky case is + // where we call libcX from an M and record g/pc/sp. + // Before that call returns, a signal arrives on the + // same M and the signal handling code calls another + // libc function. We don't want that second libcCall + // from within the handler to be recorded, and we + // don't want that call's completion to zero + // libcallsp. + // We don't need to set libcall* while we're in a sighandler + // (even if we're not currently in libc) because we block all + // signals while we're handling a signal. That includes the + // profile signal, which is the one that uses the libcall* info. + mp = nil + } + res := asmcgocall(fn, arg) + if mp != nil { + mp.libcallsp = 0 + } + return res +} diff --git a/contrib/go/_std_1.18/src/runtime/sys_linux_amd64.s b/contrib/go/_std_1.18/src/runtime/sys_linux_amd64.s new file mode 100644 index 0000000000..f0e58e11db --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_linux_amd64.s @@ -0,0 +1,765 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// System calls and other sys.stuff for AMD64, Linux +// + +#include "go_asm.h" +#include "go_tls.h" +#include "textflag.h" +#include "cgo/abi_amd64.h" + +#define AT_FDCWD -100 + +#define SYS_read 0 +#define SYS_write 1 +#define SYS_close 3 +#define SYS_mmap 9 +#define SYS_munmap 11 +#define SYS_brk 12 +#define SYS_rt_sigaction 13 +#define SYS_rt_sigprocmask 14 +#define SYS_rt_sigreturn 15 +#define SYS_pipe 22 +#define SYS_sched_yield 24 +#define SYS_mincore 27 +#define SYS_madvise 28 +#define SYS_nanosleep 35 +#define SYS_setittimer 38 +#define SYS_getpid 39 +#define SYS_socket 41 +#define SYS_connect 42 +#define SYS_clone 56 +#define SYS_exit 60 +#define SYS_kill 62 +#define SYS_fcntl 72 +#define SYS_sigaltstack 131 +#define SYS_arch_prctl 158 +#define SYS_gettid 186 +#define SYS_futex 202 +#define SYS_sched_getaffinity 204 +#define SYS_epoll_create 213 +#define SYS_timer_create 222 +#define SYS_timer_settime 223 +#define SYS_timer_delete 226 +#define SYS_clock_gettime 228 +#define SYS_exit_group 231 +#define SYS_epoll_ctl 233 +#define SYS_tgkill 234 +#define SYS_openat 257 +#define SYS_faccessat 269 +#define SYS_epoll_pwait 281 +#define SYS_epoll_create1 291 +#define SYS_pipe2 293 + +TEXT runtime·exit(SB),NOSPLIT,$0-4 + MOVL code+0(FP), DI + MOVL $SYS_exit_group, AX + SYSCALL + RET + +// func exitThread(wait *uint32) +TEXT runtime·exitThread(SB),NOSPLIT,$0-8 + MOVQ wait+0(FP), AX + // We're done using the stack. + MOVL $0, (AX) + MOVL $0, DI // exit code + MOVL $SYS_exit, AX + SYSCALL + // We may not even have a stack any more. + INT $3 + JMP 0(PC) + +TEXT runtime·open(SB),NOSPLIT,$0-20 + // This uses openat instead of open, because Android O blocks open. 
+ MOVL $AT_FDCWD, DI // AT_FDCWD, so this acts like open + MOVQ name+0(FP), SI + MOVL mode+8(FP), DX + MOVL perm+12(FP), R10 + MOVL $SYS_openat, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $-1, AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·closefd(SB),NOSPLIT,$0-12 + MOVL fd+0(FP), DI + MOVL $SYS_close, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $-1, AX + MOVL AX, ret+8(FP) + RET + +TEXT runtime·write1(SB),NOSPLIT,$0-28 + MOVQ fd+0(FP), DI + MOVQ p+8(FP), SI + MOVL n+16(FP), DX + MOVL $SYS_write, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0-28 + MOVL fd+0(FP), DI + MOVQ p+8(FP), SI + MOVL n+16(FP), DX + MOVL $SYS_read, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// func pipe() (r, w int32, errno int32) +TEXT runtime·pipe(SB),NOSPLIT,$0-12 + LEAQ r+0(FP), DI + MOVL $SYS_pipe, AX + SYSCALL + MOVL AX, errno+8(FP) + RET + +// func pipe2(flags int32) (r, w int32, errno int32) +TEXT runtime·pipe2(SB),NOSPLIT,$0-20 + LEAQ r+8(FP), DI + MOVL flags+0(FP), SI + MOVL $SYS_pipe2, AX + SYSCALL + MOVL AX, errno+16(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) + MOVL $1000, AX // usec to nsec + MULL DX + MOVQ AX, 8(SP) + + // nanosleep(&ts, 0) + MOVQ SP, DI + MOVL $0, SI + MOVL $SYS_nanosleep, AX + SYSCALL + RET + +TEXT runtime·gettid(SB),NOSPLIT,$0-4 + MOVL $SYS_gettid, AX + SYSCALL + MOVL AX, ret+0(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$0 + MOVL $SYS_getpid, AX + SYSCALL + MOVL AX, R12 + MOVL $SYS_gettid, AX + SYSCALL + MOVL AX, SI // arg 2 tid + MOVL R12, DI // arg 1 pid + MOVL sig+0(FP), DX // arg 3 + MOVL $SYS_tgkill, AX + SYSCALL + RET + +TEXT runtime·raiseproc(SB),NOSPLIT,$0 + MOVL $SYS_getpid, AX + SYSCALL + MOVL AX, DI // arg 1 pid + MOVL sig+0(FP), SI // arg 2 + MOVL $SYS_kill, AX + SYSCALL + RET + +TEXT ·getpid(SB),NOSPLIT,$0-8 + MOVL $SYS_getpid, AX + SYSCALL + MOVQ AX, ret+0(FP) + RET + +TEXT ·tgkill(SB),NOSPLIT,$0 + MOVQ tgid+0(FP), DI + MOVQ tid+8(FP), SI + MOVQ sig+16(FP), DX + MOVL $SYS_tgkill, AX + SYSCALL + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0-24 + MOVL mode+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $SYS_setittimer, AX + SYSCALL + RET + +TEXT runtime·timer_create(SB),NOSPLIT,$0-28 + MOVL clockid+0(FP), DI + MOVQ sevp+8(FP), SI + MOVQ timerid+16(FP), DX + MOVL $SYS_timer_create, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·timer_settime(SB),NOSPLIT,$0-28 + MOVL timerid+0(FP), DI + MOVL flags+4(FP), SI + MOVQ new+8(FP), DX + MOVQ old+16(FP), R10 + MOVL $SYS_timer_settime, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·timer_delete(SB),NOSPLIT,$0-12 + MOVL timerid+0(FP), DI + MOVL $SYS_timer_delete, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·mincore(SB),NOSPLIT,$0-28 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVQ dst+16(FP), DX + MOVL $SYS_mincore, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// func nanotime1() int64 +TEXT runtime·nanotime1(SB),NOSPLIT,$16-8 + // We don't know how much stack space the VDSO code will need, + // so switch to g0. + // In particular, a kernel configured with CONFIG_OPTIMIZE_INLINING=n + // and hardening can use a full page of stack space in gettime_sym + // due to stack probes inserted to avoid stack/heap collisions. + // See issue #20427. + + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. + + MOVQ g_m(R14), BX // BX unchanged by C code. + + // Set vdsoPC and vdsoSP for SIGPROF traceback. 
+ // Save the old values on stack and restore them on exit, + // so this function is reentrant. + MOVQ m_vdsoPC(BX), CX + MOVQ m_vdsoSP(BX), DX + MOVQ CX, 0(SP) + MOVQ DX, 8(SP) + + LEAQ ret+0(FP), DX + MOVQ -8(DX), CX + MOVQ CX, m_vdsoPC(BX) + MOVQ DX, m_vdsoSP(BX) + + CMPQ R14, m_curg(BX) // Only switch if on curg. + JNE noswitch + + MOVQ m_g0(BX), DX + MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBQ $16, SP // Space for results + ANDQ $~15, SP // Align for C code + + MOVL $1, DI // CLOCK_MONOTONIC + LEAQ 0(SP), SI + MOVQ runtime·vdsoClockgettimeSym(SB), AX + CMPQ AX, $0 + JEQ fallback + CALL AX +ret: + MOVQ 0(SP), AX // sec + MOVQ 8(SP), DX // nsec + MOVQ R12, SP // Restore real SP + // Restore vdsoPC, vdsoSP + // We don't worry about being signaled between the two stores. + // If we are not in a signal handler, we'll restore vdsoSP to 0, + // and no one will care about vdsoPC. If we are in a signal handler, + // we cannot receive another signal. + MOVQ 8(SP), CX + MOVQ CX, m_vdsoSP(BX) + MOVQ 0(SP), CX + MOVQ CX, m_vdsoPC(BX) + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET +fallback: + MOVQ $SYS_clock_gettime, AX + SYSCALL + JMP ret + +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 + MOVL how+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL size+24(FP), R10 + MOVL $SYS_rt_sigprocmask, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·rt_sigaction(SB),NOSPLIT,$0-36 + MOVQ sig+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVQ size+24(FP), R10 + MOVL $SYS_rt_sigaction, AX + SYSCALL + MOVL AX, ret+32(FP) + RET + +// Call the function stored in _cgo_sigaction using the GCC calling convention. +TEXT runtime·callCgoSigaction(SB),NOSPLIT,$16 + MOVQ sig+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVQ _cgo_sigaction(SB), AX + MOVQ SP, BX // callee-saved + ANDQ $~15, SP // alignment as per amd64 psABI + CALL AX + MOVQ BX, SP + MOVL AX, ret+24(FP) + RET + +TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 + MOVQ fn+0(FP), AX + MOVL sig+8(FP), DI + MOVQ info+16(FP), SI + MOVQ ctx+24(FP), DX + PUSHQ BP + MOVQ SP, BP + ANDQ $~15, SP // alignment for x86_64 ABI + CALL AX + MOVQ BP, SP + POPQ BP + RET + +// Called using C ABI. +TEXT runtime·sigtramp(SB),NOSPLIT,$0 + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + // Call into the Go signal handler + NOP SP // disable vet stack checking + ADJSP $24 + MOVQ DI, 0(SP) // sig + MOVQ SI, 8(SP) // info + MOVQ DX, 16(SP) // ctx + CALL ·sigtrampgo(SB) + ADJSP $-24 + + POP_REGS_HOST_TO_ABI0() + RET + +// Called using C ABI. +TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT,$0 + // Transition from C ABI to Go ABI. + PUSH_REGS_HOST_TO_ABI0() + + // Call into the Go signal handler + NOP SP // disable vet stack checking + ADJSP $24 + MOVL DI, 0(SP) // sig + MOVQ SI, 8(SP) // info + MOVQ DX, 16(SP) // ctx + CALL ·sigprofNonGo(SB) + ADJSP $-24 + + POP_REGS_HOST_TO_ABI0() + RET + +// Used instead of sigtramp in programs that use cgo. +// Arguments from kernel are in DI, SI, DX. +TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 + // If no traceback function, do usual sigtramp. + MOVQ runtime·cgoTraceback(SB), AX + TESTQ AX, AX + JZ sigtramp + + // If no traceback support function, which means that + // runtime/cgo was not linked in, do usual sigtramp. + MOVQ _cgo_callers(SB), AX + TESTQ AX, AX + JZ sigtramp + + // Figure out if we are currently in a cgo call. 
+ // If not, just do usual sigtramp. + get_tls(CX) + MOVQ g(CX),AX + TESTQ AX, AX + JZ sigtrampnog // g == nil + MOVQ g_m(AX), AX + TESTQ AX, AX + JZ sigtramp // g.m == nil + MOVL m_ncgo(AX), CX + TESTL CX, CX + JZ sigtramp // g.m.ncgo == 0 + MOVQ m_curg(AX), CX + TESTQ CX, CX + JZ sigtramp // g.m.curg == nil + MOVQ g_syscallsp(CX), CX + TESTQ CX, CX + JZ sigtramp // g.m.curg.syscallsp == 0 + MOVQ m_cgoCallers(AX), R8 + TESTQ R8, R8 + JZ sigtramp // g.m.cgoCallers == nil + MOVL m_cgoCallersUse(AX), CX + TESTL CX, CX + JNZ sigtramp // g.m.cgoCallersUse != 0 + + // Jump to a function in runtime/cgo. + // That function, written in C, will call the user's traceback + // function with proper unwind info, and will then call back here. + // The first three arguments, and the fifth, are already in registers. + // Set the two remaining arguments now. + MOVQ runtime·cgoTraceback(SB), CX + MOVQ $runtime·sigtramp(SB), R9 + MOVQ _cgo_callers(SB), AX + JMP AX + +sigtramp: + JMP runtime·sigtramp(SB) + +sigtrampnog: + // Signal arrived on a non-Go thread. If this is SIGPROF, get a + // stack trace. + CMPL DI, $27 // 27 == SIGPROF + JNZ sigtramp + + // Lock sigprofCallersUse. + MOVL $0, AX + MOVL $1, CX + MOVQ $runtime·sigprofCallersUse(SB), R11 + LOCK + CMPXCHGL CX, 0(R11) + JNZ sigtramp // Skip stack trace if already locked. + + // Jump to the traceback function in runtime/cgo. + // It will call back to sigprofNonGo, via sigprofNonGoWrapper, to convert + // the arguments to the Go calling convention. + // First three arguments to traceback function are in registers already. + MOVQ runtime·cgoTraceback(SB), CX + MOVQ $runtime·sigprofCallers(SB), R8 + MOVQ $runtime·sigprofNonGoWrapper<>(SB), R9 + MOVQ _cgo_callers(SB), AX + JMP AX + +// For cgo unwinding to work, this function must look precisely like +// the one in glibc. The glibc source code is: +// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c +// The code that cares about the precise instructions used is: +// https://gcc.gnu.org/viewcvs/gcc/trunk/libgcc/config/i386/linux-unwind.h?revision=219188&view=markup +TEXT runtime·sigreturn(SB),NOSPLIT,$0 + MOVQ $SYS_rt_sigreturn, AX + SYSCALL + INT $3 // not reached + +TEXT runtime·sysMmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL prot+16(FP), DX + MOVL flags+20(FP), R10 + MOVL fd+24(FP), R8 + MOVL off+28(FP), R9 + + MOVL $SYS_mmap, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS ok + NOTQ AX + INCQ AX + MOVQ $0, p+32(FP) + MOVQ AX, err+40(FP) + RET +ok: + MOVQ AX, p+32(FP) + MOVQ $0, err+40(FP) + RET + +// Call the function stored in _cgo_mmap using the GCC calling convention. +// This must be called on the system stack. +TEXT runtime·callCgoMmap(SB),NOSPLIT,$16 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL prot+16(FP), DX + MOVL flags+20(FP), CX + MOVL fd+24(FP), R8 + MOVL off+28(FP), R9 + MOVQ _cgo_mmap(SB), AX + MOVQ SP, BX + ANDQ $~15, SP // alignment as per amd64 psABI + MOVQ BX, 0(SP) + CALL AX + MOVQ 0(SP), SP + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·sysMunmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVQ $SYS_munmap, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// Call the function stored in _cgo_munmap using the GCC calling convention. +// This must be called on the system stack. 
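For the raw SYSCALL wrappers in this file, errors come back as small negative numbers in AX; runtime·sysMmap above turns that into a (pointer, errno) pair with an unsigned compare against 0xfffffffffffff001 followed by NOTQ/INCQ, which is two's-complement negation. A minimal Go sketch of that decoding, using an illustrative function name:

package sketch

// decodeMmapReturn mirrors the CMPQ/JLS/NOTQ/INCQ sequence in sysMmap:
// a raw return greater than 0xfffffffffffff001 (unsigned) is treated as a
// negated errno; anything else is a valid mapping address.
func decodeMmapReturn(raw uintptr) (p uintptr, errno uintptr) {
	if raw > ^uintptr(4094) { // ^uintptr(4094) == 0xfffffffffffff001
		return 0, -raw // negation recovers the positive errno value
	}
	return raw, 0
}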
+TEXT runtime·callCgoMunmap(SB),NOSPLIT,$16-16 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVQ _cgo_munmap(SB), AX + MOVQ SP, BX + ANDQ $~15, SP // alignment as per amd64 psABI + MOVQ BX, 0(SP) + CALL AX + MOVQ 0(SP), SP + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL flags+16(FP), DX + MOVQ $SYS_madvise, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int64 futex(int32 *uaddr, int32 op, int32 val, +// struct timespec *timeout, int32 *uaddr2, int32 val2); +TEXT runtime·futex(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVL op+8(FP), SI + MOVL val+12(FP), DX + MOVQ ts+16(FP), R10 + MOVQ addr2+24(FP), R8 + MOVL val3+32(FP), R9 + MOVL $SYS_futex, AX + SYSCALL + MOVL AX, ret+40(FP) + RET + +// int32 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·clone(SB),NOSPLIT,$0 + MOVL flags+0(FP), DI + MOVQ stk+8(FP), SI + MOVQ $0, DX + MOVQ $0, R10 + MOVQ $0, R8 + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers CX and R11. + MOVQ mp+16(FP), R13 + MOVQ gp+24(FP), R9 + MOVQ fn+32(FP), R12 + CMPQ R13, $0 // m + JEQ nog1 + CMPQ R9, $0 // g + JEQ nog1 + LEAQ m_tls(R13), R8 +#ifdef GOOS_android + // Android stores the TLS offset in runtime·tls_g. + SUBQ runtime·tls_g(SB), R8 +#else + ADDQ $8, R8 // ELF wants to use -8(FS) +#endif + ORQ $0x00080000, DI //add flag CLONE_SETTLS(0x00080000) to call clone +nog1: + MOVL $SYS_clone, AX + SYSCALL + + // In parent, return. + CMPQ AX, $0 + JEQ 3(PC) + MOVL AX, ret+40(FP) + RET + + // In child, on new stack. + MOVQ SI, SP + + // If g or m are nil, skip Go-related setup. + CMPQ R13, $0 // m + JEQ nog2 + CMPQ R9, $0 // g + JEQ nog2 + + // Initialize m->procid to Linux tid + MOVL $SYS_gettid, AX + SYSCALL + MOVQ AX, m_procid(R13) + + // In child, set up new stack + get_tls(CX) + MOVQ R13, g_m(R9) + MOVQ R9, g(CX) + MOVQ R9, R14 // set g register + CALL runtime·stackcheck(SB) + +nog2: + // Call fn. This is the PC of an ABI0 function. + CALL R12 + + // It shouldn't return. If it does, exit that thread. + MOVL $111, DI + MOVL $SYS_exit, AX + SYSCALL + JMP -3(PC) // keep exiting + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+0(FP), DI + MOVQ old+8(FP), SI + MOVQ $SYS_sigaltstack, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$32 +#ifdef GOOS_android + // Android stores the TLS offset in runtime·tls_g. 
+ SUBQ runtime·tls_g(SB), DI +#else + ADDQ $8, DI // ELF wants to use -8(FS) +#endif + MOVQ DI, SI + MOVQ $0x1002, DI // ARCH_SET_FS + MOVQ $SYS_arch_prctl, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVL $SYS_sched_yield, AX + SYSCALL + RET + +TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVQ pid+0(FP), DI + MOVQ len+8(FP), SI + MOVQ buf+16(FP), DX + MOVL $SYS_sched_getaffinity, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int32 runtime·epollcreate(int32 size); +TEXT runtime·epollcreate(SB),NOSPLIT,$0 + MOVL size+0(FP), DI + MOVL $SYS_epoll_create, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +// int32 runtime·epollcreate1(int32 flags); +TEXT runtime·epollcreate1(SB),NOSPLIT,$0 + MOVL flags+0(FP), DI + MOVL $SYS_epoll_create1, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +// func epollctl(epfd, op, fd int32, ev *epollEvent) int +TEXT runtime·epollctl(SB),NOSPLIT,$0 + MOVL epfd+0(FP), DI + MOVL op+4(FP), SI + MOVL fd+8(FP), DX + MOVQ ev+16(FP), R10 + MOVL $SYS_epoll_ctl, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); +TEXT runtime·epollwait(SB),NOSPLIT,$0 + // This uses pwait instead of wait, because Android O blocks wait. + MOVL epfd+0(FP), DI + MOVQ ev+8(FP), SI + MOVL nev+16(FP), DX + MOVL timeout+20(FP), R10 + MOVQ $0, R8 + MOVL $SYS_epoll_pwait, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $SYS_fcntl, AX + SYSCALL + RET + +// func runtime·setNonblock(int32 fd) +TEXT runtime·setNonblock(SB),NOSPLIT,$0-4 + MOVL fd+0(FP), DI // fd + MOVQ $3, SI // F_GETFL + MOVQ $0, DX + MOVL $SYS_fcntl, AX + SYSCALL + MOVL fd+0(FP), DI // fd + MOVQ $4, SI // F_SETFL + MOVQ $0x800, DX // O_NONBLOCK + ORL AX, DX + MOVL $SYS_fcntl, AX + SYSCALL + RET + +// int access(const char *name, int mode) +TEXT runtime·access(SB),NOSPLIT,$0 + // This uses faccessat instead of access, because Android O blocks access. + MOVL $AT_FDCWD, DI // AT_FDCWD, so this acts like access + MOVQ name+0(FP), SI + MOVL mode+8(FP), DX + MOVL $0, R10 + MOVL $SYS_faccessat, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +// int connect(int fd, const struct sockaddr *addr, socklen_t addrlen) +TEXT runtime·connect(SB),NOSPLIT,$0-28 + MOVL fd+0(FP), DI + MOVQ addr+8(FP), SI + MOVL len+16(FP), DX + MOVL $SYS_connect, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int socket(int domain, int type, int protocol) +TEXT runtime·socket(SB),NOSPLIT,$0-20 + MOVL domain+0(FP), DI + MOVL typ+4(FP), SI + MOVL prot+8(FP), DX + MOVL $SYS_socket, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +// func sbrk0() uintptr +TEXT runtime·sbrk0(SB),NOSPLIT,$0-8 + // Implemented as brk(NULL). + MOVQ $0, DI + MOVL $SYS_brk, AX + SYSCALL + MOVQ AX, ret+0(FP) + RET diff --git a/contrib/go/_std_1.18/src/runtime/sys_nonppc64x.go b/contrib/go/_std_1.18/src/runtime/sys_nonppc64x.go new file mode 100644 index 0000000000..653f1c999f --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_nonppc64x.go @@ -0,0 +1,10 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build !ppc64 && !ppc64le + +package runtime + +func prepGoExitFrame(sp uintptr) { +} diff --git a/contrib/go/_std_1.18/src/runtime/sys_x86.go b/contrib/go/_std_1.18/src/runtime/sys_x86.go new file mode 100644 index 0000000000..9fb36c2a66 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/sys_x86.go @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || 386 + +package runtime + +import ( + "internal/goarch" + "unsafe" +) + +// adjust Gobuf as if it executed a call to fn with context ctxt +// and then stopped before the first instruction in fn. +func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) { + sp := buf.sp + sp -= goarch.PtrSize + *(*uintptr)(unsafe.Pointer(sp)) = buf.pc + buf.sp = sp + buf.pc = uintptr(fn) + buf.ctxt = ctxt +} diff --git a/contrib/go/_std_1.18/src/runtime/textflag.h b/contrib/go/_std_1.18/src/runtime/textflag.h new file mode 100644 index 0000000000..214075e360 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/textflag.h @@ -0,0 +1,39 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file defines flags attached to various functions +// and data objects. The compilers, assemblers, and linker must +// all agree on these values. +// +// Keep in sync with src/cmd/internal/obj/textflag.go. + +// Don't profile the marked routine. This flag is deprecated. +#define NOPROF 1 +// It is ok for the linker to get multiple of these symbols. It will +// pick one of the duplicates to use. +#define DUPOK 2 +// Don't insert stack check preamble. +#define NOSPLIT 4 +// Put this data in a read-only section. +#define RODATA 8 +// This data contains no pointers. +#define NOPTR 16 +// This is a wrapper function and should not count as disabling 'recover'. +#define WRAPPER 32 +// This function uses its incoming context register. +#define NEEDCTXT 64 +// Allocate a word of thread local storage and store the offset from the +// thread local base to the thread local storage in this variable. +#define TLSBSS 256 +// Do not insert instructions to allocate a stack frame for this function. +// Only valid on functions that declare a frame size of 0. +// TODO(mwhudson): only implemented for ppc64x at present. +#define NOFRAME 512 +// Function can call reflect.Type.Method or reflect.Type.MethodByName. +#define REFLECTMETHOD 1024 +// Function is the outermost frame of the call stack. Call stack unwinders +// should stop at this function. +#define TOPFRAME 2048 +// Function is an ABI wrapper. +#define ABIWRAPPER 4096 diff --git a/contrib/go/_std_1.18/src/runtime/time.go b/contrib/go/_std_1.18/src/runtime/time.go new file mode 100644 index 0000000000..a9ad620776 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/time.go @@ -0,0 +1,1128 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Time-related runtime and pieces of package time. + +package runtime + +import ( + "internal/abi" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// Package time knows the layout of this structure. +// If this struct changes, adjust ../time/sleep.go:/runtimeTimer. +type timer struct { + // If this timer is on a heap, which P's heap it is on. 
+ // puintptr rather than *p to match uintptr in the versions + // of this struct defined in other packages. + pp puintptr + + // Timer wakes up at when, and then at when+period, ... (period > 0 only) + // each time calling f(arg, now) in the timer goroutine, so f must be + // a well-behaved function and not block. + // + // when must be positive on an active timer. + when int64 + period int64 + f func(any, uintptr) + arg any + seq uintptr + + // What to set the when field to in timerModifiedXX status. + nextwhen int64 + + // The status field holds one of the values below. + status uint32 +} + +// Code outside this file has to be careful in using a timer value. +// +// The pp, status, and nextwhen fields may only be used by code in this file. +// +// Code that creates a new timer value can set the when, period, f, +// arg, and seq fields. +// A new timer value may be passed to addtimer (called by time.startTimer). +// After doing that no fields may be touched. +// +// An active timer (one that has been passed to addtimer) may be +// passed to deltimer (time.stopTimer), after which it is no longer an +// active timer. It is an inactive timer. +// In an inactive timer the period, f, arg, and seq fields may be modified, +// but not the when field. +// It's OK to just drop an inactive timer and let the GC collect it. +// It's not OK to pass an inactive timer to addtimer. +// Only newly allocated timer values may be passed to addtimer. +// +// An active timer may be passed to modtimer. No fields may be touched. +// It remains an active timer. +// +// An inactive timer may be passed to resettimer to turn into an +// active timer with an updated when field. +// It's OK to pass a newly allocated timer value to resettimer. +// +// Timer operations are addtimer, deltimer, modtimer, resettimer, +// cleantimers, adjusttimers, and runtimer. +// +// We don't permit calling addtimer/deltimer/modtimer/resettimer simultaneously, +// but adjusttimers and runtimer can be called at the same time as any of those. +// +// Active timers live in heaps attached to P, in the timers field. +// Inactive timers live there too temporarily, until they are removed. 
+// +// addtimer: +// timerNoStatus -> timerWaiting +// anything else -> panic: invalid value +// deltimer: +// timerWaiting -> timerModifying -> timerDeleted +// timerModifiedEarlier -> timerModifying -> timerDeleted +// timerModifiedLater -> timerModifying -> timerDeleted +// timerNoStatus -> do nothing +// timerDeleted -> do nothing +// timerRemoving -> do nothing +// timerRemoved -> do nothing +// timerRunning -> wait until status changes +// timerMoving -> wait until status changes +// timerModifying -> wait until status changes +// modtimer: +// timerWaiting -> timerModifying -> timerModifiedXX +// timerModifiedXX -> timerModifying -> timerModifiedYY +// timerNoStatus -> timerModifying -> timerWaiting +// timerRemoved -> timerModifying -> timerWaiting +// timerDeleted -> timerModifying -> timerModifiedXX +// timerRunning -> wait until status changes +// timerMoving -> wait until status changes +// timerRemoving -> wait until status changes +// timerModifying -> wait until status changes +// cleantimers (looks in P's timer heap): +// timerDeleted -> timerRemoving -> timerRemoved +// timerModifiedXX -> timerMoving -> timerWaiting +// adjusttimers (looks in P's timer heap): +// timerDeleted -> timerRemoving -> timerRemoved +// timerModifiedXX -> timerMoving -> timerWaiting +// runtimer (looks in P's timer heap): +// timerNoStatus -> panic: uninitialized timer +// timerWaiting -> timerWaiting or +// timerWaiting -> timerRunning -> timerNoStatus or +// timerWaiting -> timerRunning -> timerWaiting +// timerModifying -> wait until status changes +// timerModifiedXX -> timerMoving -> timerWaiting +// timerDeleted -> timerRemoving -> timerRemoved +// timerRunning -> panic: concurrent runtimer calls +// timerRemoved -> panic: inconsistent timer heap +// timerRemoving -> panic: inconsistent timer heap +// timerMoving -> panic: inconsistent timer heap + +// Values for the timer status field. +const ( + // Timer has no status set yet. + timerNoStatus = iota + + // Waiting for timer to fire. + // The timer is in some P's heap. + timerWaiting + + // Running the timer function. + // A timer will only have this status briefly. + timerRunning + + // The timer is deleted and should be removed. + // It should not be run, but it is still in some P's heap. + timerDeleted + + // The timer is being removed. + // The timer will only have this status briefly. + timerRemoving + + // The timer has been stopped. + // It is not in any P's heap. + timerRemoved + + // The timer is being modified. + // The timer will only have this status briefly. + timerModifying + + // The timer has been modified to an earlier time. + // The new when value is in the nextwhen field. + // The timer is in some P's heap, possibly in the wrong place. + timerModifiedEarlier + + // The timer has been modified to the same or a later time. + // The new when value is in the nextwhen field. + // The timer is in some P's heap, possibly in the wrong place. + timerModifiedLater + + // The timer has been modified and is being moved. + // The timer will only have this status briefly. + timerMoving +) + +// maxWhen is the maximum value for timer's when field. +const maxWhen = 1<<63 - 1 + +// verifyTimers can be set to true to add debugging checks that the +// timer heaps are valid. +const verifyTimers = false + +// Package time APIs. +// Godoc uses the comments in package time, not these. + +// time.now is implemented in assembly. + +// timeSleep puts the current goroutine to sleep for at least ns nanoseconds. 
+//go:linkname timeSleep time.Sleep +func timeSleep(ns int64) { + if ns <= 0 { + return + } + + gp := getg() + t := gp.timer + if t == nil { + t = new(timer) + gp.timer = t + } + t.f = goroutineReady + t.arg = gp + t.nextwhen = nanotime() + ns + if t.nextwhen < 0 { // check for overflow. + t.nextwhen = maxWhen + } + gopark(resetForSleep, unsafe.Pointer(t), waitReasonSleep, traceEvGoSleep, 1) +} + +// resetForSleep is called after the goroutine is parked for timeSleep. +// We can't call resettimer in timeSleep itself because if this is a short +// sleep and there are many goroutines then the P can wind up running the +// timer function, goroutineReady, before the goroutine has been parked. +func resetForSleep(gp *g, ut unsafe.Pointer) bool { + t := (*timer)(ut) + resettimer(t, t.nextwhen) + return true +} + +// startTimer adds t to the timer heap. +//go:linkname startTimer time.startTimer +func startTimer(t *timer) { + if raceenabled { + racerelease(unsafe.Pointer(t)) + } + addtimer(t) +} + +// stopTimer stops a timer. +// It reports whether t was stopped before being run. +//go:linkname stopTimer time.stopTimer +func stopTimer(t *timer) bool { + return deltimer(t) +} + +// resetTimer resets an inactive timer, adding it to the heap. +//go:linkname resetTimer time.resetTimer +// Reports whether the timer was modified before it was run. +func resetTimer(t *timer, when int64) bool { + if raceenabled { + racerelease(unsafe.Pointer(t)) + } + return resettimer(t, when) +} + +// modTimer modifies an existing timer. +//go:linkname modTimer time.modTimer +func modTimer(t *timer, when, period int64, f func(any, uintptr), arg any, seq uintptr) { + modtimer(t, when, period, f, arg, seq) +} + +// Go runtime. + +// Ready the goroutine arg. +func goroutineReady(arg any, seq uintptr) { + goready(arg.(*g), 0) +} + +// addtimer adds a timer to the current P. +// This should only be called with a newly created timer. +// That avoids the risk of changing the when field of a timer in some P's heap, +// which could cause the heap to become unsorted. +func addtimer(t *timer) { + // when must be positive. A negative value will cause runtimer to + // overflow during its delta calculation and never expire other runtime + // timers. Zero will cause checkTimers to fail to notice the timer. + if t.when <= 0 { + throw("timer when must be positive") + } + if t.period < 0 { + throw("timer period must be non-negative") + } + if t.status != timerNoStatus { + throw("addtimer called with initialized timer") + } + t.status = timerWaiting + + when := t.when + + // Disable preemption while using pp to avoid changing another P's heap. + mp := acquirem() + + pp := getg().m.p.ptr() + lock(&pp.timersLock) + cleantimers(pp) + doaddtimer(pp, t) + unlock(&pp.timersLock) + + wakeNetPoller(when) + + releasem(mp) +} + +// doaddtimer adds t to the current P's heap. +// The caller must have locked the timers for pp. +func doaddtimer(pp *p, t *timer) { + // Timers rely on the network poller, so make sure the poller + // has started. + if netpollInited == 0 { + netpollGenericInit() + } + + if t.pp != 0 { + throw("doaddtimer: P already set in timer") + } + t.pp.set(pp) + i := len(pp.timers) + pp.timers = append(pp.timers, t) + siftupTimer(pp.timers, i) + if t == pp.timers[0] { + atomic.Store64(&pp.timer0When, uint64(t.when)) + } + atomic.Xadd(&pp.numTimers, 1) +} + +// deltimer deletes the timer t. It may be on some other P, so we can't +// actually remove it from the timers heap. We can only mark it as deleted. 
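The startTimer, stopTimer and resetTimer hooks above are the runtime side of the public time.Timer API, and their "reports whether ... before it was run" results are what time.(*Timer).Stop and Reset hand back to users. A small usage sketch of that mapping, shown only for orientation; it is ordinary user code, not part of the runtime:

package main

import (
	"fmt"
	"time"
)

func main() {
	t := time.NewTimer(50 * time.Millisecond) // startTimer -> addtimer
	if !t.Stop() {                            // stopTimer -> deltimer
		<-t.C // timer already fired: drain the channel before Reset
	}
	t.Reset(10 * time.Millisecond) // resetTimer -> resettimer -> modtimer
	<-t.C                          // fires about 10ms later; the timer's f sends on the channel
	fmt.Println("timer fired after Reset")
}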
+// It will be removed in due course by the P whose heap it is on. +// Reports whether the timer was removed before it was run. +func deltimer(t *timer) bool { + for { + switch s := atomic.Load(&t.status); s { + case timerWaiting, timerModifiedLater: + // Prevent preemption while the timer is in timerModifying. + // This could lead to a self-deadlock. See #38070. + mp := acquirem() + if atomic.Cas(&t.status, s, timerModifying) { + // Must fetch t.pp before changing status, + // as cleantimers in another goroutine + // can clear t.pp of a timerDeleted timer. + tpp := t.pp.ptr() + if !atomic.Cas(&t.status, timerModifying, timerDeleted) { + badTimer() + } + releasem(mp) + atomic.Xadd(&tpp.deletedTimers, 1) + // Timer was not yet run. + return true + } else { + releasem(mp) + } + case timerModifiedEarlier: + // Prevent preemption while the timer is in timerModifying. + // This could lead to a self-deadlock. See #38070. + mp := acquirem() + if atomic.Cas(&t.status, s, timerModifying) { + // Must fetch t.pp before setting status + // to timerDeleted. + tpp := t.pp.ptr() + if !atomic.Cas(&t.status, timerModifying, timerDeleted) { + badTimer() + } + releasem(mp) + atomic.Xadd(&tpp.deletedTimers, 1) + // Timer was not yet run. + return true + } else { + releasem(mp) + } + case timerDeleted, timerRemoving, timerRemoved: + // Timer was already run. + return false + case timerRunning, timerMoving: + // The timer is being run or moved, by a different P. + // Wait for it to complete. + osyield() + case timerNoStatus: + // Removing timer that was never added or + // has already been run. Also see issue 21874. + return false + case timerModifying: + // Simultaneous calls to deltimer and modtimer. + // Wait for the other call to complete. + osyield() + default: + badTimer() + } + } +} + +// dodeltimer removes timer i from the current P's heap. +// We are locked on the P when this is called. +// It returns the smallest changed index in pp.timers. +// The caller must have locked the timers for pp. +func dodeltimer(pp *p, i int) int { + if t := pp.timers[i]; t.pp.ptr() != pp { + throw("dodeltimer: wrong P") + } else { + t.pp = 0 + } + last := len(pp.timers) - 1 + if i != last { + pp.timers[i] = pp.timers[last] + } + pp.timers[last] = nil + pp.timers = pp.timers[:last] + smallestChanged := i + if i != last { + // Moving to i may have moved the last timer to a new parent, + // so sift up to preserve the heap guarantee. + smallestChanged = siftupTimer(pp.timers, i) + siftdownTimer(pp.timers, i) + } + if i == 0 { + updateTimer0When(pp) + } + atomic.Xadd(&pp.numTimers, -1) + return smallestChanged +} + +// dodeltimer0 removes timer 0 from the current P's heap. +// We are locked on the P when this is called. +// It reports whether it saw no problems due to races. +// The caller must have locked the timers for pp. +func dodeltimer0(pp *p) { + if t := pp.timers[0]; t.pp.ptr() != pp { + throw("dodeltimer0: wrong P") + } else { + t.pp = 0 + } + last := len(pp.timers) - 1 + if last > 0 { + pp.timers[0] = pp.timers[last] + } + pp.timers[last] = nil + pp.timers = pp.timers[:last] + if last > 0 { + siftdownTimer(pp.timers, 0) + } + updateTimer0When(pp) + atomic.Xadd(&pp.numTimers, -1) +} + +// modtimer modifies an existing timer. +// This is called by the netpoll code or time.Ticker.Reset or time.Timer.Reset. +// Reports whether the timer was modified before it was run. 
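doaddtimer and dodeltimer above keep pp.timers ordered as a min-heap on the when field via siftupTimer and siftdownTimer, which are not part of this hunk. A minimal binary-heap sketch of the sift-up idea, keyed the same way; the runtime's real heap layout and helper signatures may differ:

package sketch

type sketchTimer struct{ when int64 }

// siftUp restores the min-heap property after placing a timer at index i:
// the entry bubbles toward the root while its when is earlier than its parent's.
func siftUp(h []*sketchTimer, i int) {
	t := h[i]
	for i > 0 {
		parent := (i - 1) / 2
		if h[parent].when <= t.when {
			break
		}
		h[i] = h[parent]
		i = parent
	}
	h[i] = t
}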
+func modtimer(t *timer, when, period int64, f func(any, uintptr), arg any, seq uintptr) bool { + if when <= 0 { + throw("timer when must be positive") + } + if period < 0 { + throw("timer period must be non-negative") + } + + status := uint32(timerNoStatus) + wasRemoved := false + var pending bool + var mp *m +loop: + for { + switch status = atomic.Load(&t.status); status { + case timerWaiting, timerModifiedEarlier, timerModifiedLater: + // Prevent preemption while the timer is in timerModifying. + // This could lead to a self-deadlock. See #38070. + mp = acquirem() + if atomic.Cas(&t.status, status, timerModifying) { + pending = true // timer not yet run + break loop + } + releasem(mp) + case timerNoStatus, timerRemoved: + // Prevent preemption while the timer is in timerModifying. + // This could lead to a self-deadlock. See #38070. + mp = acquirem() + + // Timer was already run and t is no longer in a heap. + // Act like addtimer. + if atomic.Cas(&t.status, status, timerModifying) { + wasRemoved = true + pending = false // timer already run or stopped + break loop + } + releasem(mp) + case timerDeleted: + // Prevent preemption while the timer is in timerModifying. + // This could lead to a self-deadlock. See #38070. + mp = acquirem() + if atomic.Cas(&t.status, status, timerModifying) { + atomic.Xadd(&t.pp.ptr().deletedTimers, -1) + pending = false // timer already stopped + break loop + } + releasem(mp) + case timerRunning, timerRemoving, timerMoving: + // The timer is being run or moved, by a different P. + // Wait for it to complete. + osyield() + case timerModifying: + // Multiple simultaneous calls to modtimer. + // Wait for the other call to complete. + osyield() + default: + badTimer() + } + } + + t.period = period + t.f = f + t.arg = arg + t.seq = seq + + if wasRemoved { + t.when = when + pp := getg().m.p.ptr() + lock(&pp.timersLock) + doaddtimer(pp, t) + unlock(&pp.timersLock) + if !atomic.Cas(&t.status, timerModifying, timerWaiting) { + badTimer() + } + releasem(mp) + wakeNetPoller(when) + } else { + // The timer is in some other P's heap, so we can't change + // the when field. If we did, the other P's heap would + // be out of order. So we put the new when value in the + // nextwhen field, and let the other P set the when field + // when it is prepared to resort the heap. + t.nextwhen = when + + newStatus := uint32(timerModifiedLater) + if when < t.when { + newStatus = timerModifiedEarlier + } + + tpp := t.pp.ptr() + + if newStatus == timerModifiedEarlier { + updateTimerModifiedEarliest(tpp, when) + } + + // Set the new status of the timer. + if !atomic.Cas(&t.status, timerModifying, newStatus) { + badTimer() + } + releasem(mp) + + // If the new status is earlier, wake up the poller. + if newStatus == timerModifiedEarlier { + wakeNetPoller(when) + } + } + + return pending +} + +// resettimer resets the time when a timer should fire. +// If used for an inactive timer, the timer will become active. +// This should be called instead of addtimer if the timer value has been, +// or may have been, used previously. +// Reports whether the timer was modified before it was run. +func resettimer(t *timer, when int64) bool { + return modtimer(t, when, t.period, t.f, t.arg, t.seq) +} + +// cleantimers cleans up the head of the timer queue. This speeds up +// programs that create and delete timers; leaving them in the heap +// slows down addtimer. Reports whether no timer problems were found. +// The caller must have locked the timers for pp. 
+func cleantimers(pp *p) { + gp := getg() + for { + if len(pp.timers) == 0 { + return + } + + // This loop can theoretically run for a while, and because + // it is holding timersLock it cannot be preempted. + // If someone is trying to preempt us, just return. + // We can clean the timers later. + if gp.preemptStop { + return + } + + t := pp.timers[0] + if t.pp.ptr() != pp { + throw("cleantimers: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoving) { + continue + } + dodeltimer0(pp) + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + atomic.Xadd(&pp.deletedTimers, -1) + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + // Now we can change the when field. + t.when = t.nextwhen + // Move t to the right position. + dodeltimer0(pp) + doaddtimer(pp, t) + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + default: + // Head of timers does not need adjustment. + return + } + } +} + +// moveTimers moves a slice of timers to pp. The slice has been taken +// from a different P. +// This is currently called when the world is stopped, but the caller +// is expected to have locked the timers for pp. +func moveTimers(pp *p, timers []*timer) { + for _, t := range timers { + loop: + for { + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + t.pp = 0 + doaddtimer(pp, t) + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + break loop + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + t.when = t.nextwhen + t.pp = 0 + doaddtimer(pp, t) + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + break loop + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoved) { + continue + } + t.pp = 0 + // We no longer need this timer in the heap. + break loop + case timerModifying: + // Loop until the modification is complete. + osyield() + case timerNoStatus, timerRemoved: + // We should not see these status values in a timers heap. + badTimer() + case timerRunning, timerRemoving, timerMoving: + // Some other P thinks it owns this timer, + // which should not happen. + badTimer() + default: + badTimer() + } + } + } +} + +// adjusttimers looks through the timers in the current P's heap for +// any timers that have been modified to run earlier, and puts them in +// the correct place in the heap. While looking for those timers, +// it also moves timers that have been modified to run later, +// and removes deleted timers. The caller must have locked the timers for pp. +func adjusttimers(pp *p, now int64) { + // If we haven't yet reached the time of the first timerModifiedEarlier + // timer, don't do anything. This speeds up programs that adjust + // a lot of timers back and forth if the timers rarely expire. + // We'll postpone looking through all the adjusted timers until + // one would actually expire. + first := atomic.Load64(&pp.timerModifiedEarliest) + if first == 0 || int64(first) > now { + if verifyTimers { + verifyTimerHeap(pp) + } + return + } + + // We are going to clear all timerModifiedEarlier timers. 
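cleantimers, moveTimers and adjusttimers all follow the same discipline: load the status word, CAS it into a transient state (timerRemoving, timerMoving, ...), do the heap work, then CAS into the final state, yielding and retrying on contention. The following is a stripped-down sketch of that pattern with invented state names, not the runtime's actual timer states:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	stateIdle uint32 = iota
	stateModifying
	stateDeleted
)

// tryDelete mimics the shape of deltimer/cleantimers: pass through a
// transient state so concurrent modifiers can see an operation in progress.
func tryDelete(status *uint32) bool {
	for {
		switch s := atomic.LoadUint32(status); s {
		case stateIdle:
			if atomic.CompareAndSwapUint32(status, s, stateModifying) {
				// ... the removal work would happen here ...
				if !atomic.CompareAndSwapUint32(status, stateModifying, stateDeleted) {
					panic("unexpected concurrent status change")
				}
				return true
			}
		case stateDeleted:
			return false // already removed
		case stateModifying:
			// Someone else is mid-transition; the runtime calls osyield
			// here rather than busy-spinning.
			continue
		}
	}
}

func main() {
	status := stateIdle
	fmt.Println(tryDelete(&status)) // true
	fmt.Println(tryDelete(&status)) // false
}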
+ atomic.Store64(&pp.timerModifiedEarliest, 0) + + var moved []*timer + for i := 0; i < len(pp.timers); i++ { + t := pp.timers[i] + if t.pp.ptr() != pp { + throw("adjusttimers: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerDeleted: + if atomic.Cas(&t.status, s, timerRemoving) { + changed := dodeltimer(pp, i) + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + atomic.Xadd(&pp.deletedTimers, -1) + // Go back to the earliest changed heap entry. + // "- 1" because the loop will add 1. + i = changed - 1 + } + case timerModifiedEarlier, timerModifiedLater: + if atomic.Cas(&t.status, s, timerMoving) { + // Now we can change the when field. + t.when = t.nextwhen + // Take t off the heap, and hold onto it. + // We don't add it back yet because the + // heap manipulation could cause our + // loop to skip some other timer. + changed := dodeltimer(pp, i) + moved = append(moved, t) + // Go back to the earliest changed heap entry. + // "- 1" because the loop will add 1. + i = changed - 1 + } + case timerNoStatus, timerRunning, timerRemoving, timerRemoved, timerMoving: + badTimer() + case timerWaiting: + // OK, nothing to do. + case timerModifying: + // Check again after modification is complete. + osyield() + i-- + default: + badTimer() + } + } + + if len(moved) > 0 { + addAdjustedTimers(pp, moved) + } + + if verifyTimers { + verifyTimerHeap(pp) + } +} + +// addAdjustedTimers adds any timers we adjusted in adjusttimers +// back to the timer heap. +func addAdjustedTimers(pp *p, moved []*timer) { + for _, t := range moved { + doaddtimer(pp, t) + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + } +} + +// nobarrierWakeTime looks at P's timers and returns the time when we +// should wake up the netpoller. It returns 0 if there are no timers. +// This function is invoked when dropping a P, and must run without +// any write barriers. +//go:nowritebarrierrec +func nobarrierWakeTime(pp *p) int64 { + next := int64(atomic.Load64(&pp.timer0When)) + nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest)) + if next == 0 || (nextAdj != 0 && nextAdj < next) { + next = nextAdj + } + return next +} + +// runtimer examines the first timer in timers. If it is ready based on now, +// it runs the timer and removes or updates it. +// Returns 0 if it ran a timer, -1 if there are no more timers, or the time +// when the first timer should run. +// The caller must have locked the timers for pp. +// If a timer is run, this will temporarily unlock the timers. +//go:systemstack +func runtimer(pp *p, now int64) int64 { + for { + t := pp.timers[0] + if t.pp.ptr() != pp { + throw("runtimer: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if t.when > now { + // Not ready to run. + return t.when + } + + if !atomic.Cas(&t.status, s, timerRunning) { + continue + } + // Note that runOneTimer may temporarily unlock + // pp.timersLock. + runOneTimer(pp, t, now) + return 0 + + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoving) { + continue + } + dodeltimer0(pp) + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + atomic.Xadd(&pp.deletedTimers, -1) + if len(pp.timers) == 0 { + return -1 + } + + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + t.when = t.nextwhen + dodeltimer0(pp) + doaddtimer(pp, t) + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + + case timerModifying: + // Wait for modification to complete. 
+ osyield() + + case timerNoStatus, timerRemoved: + // Should not see a new or inactive timer on the heap. + badTimer() + case timerRunning, timerRemoving, timerMoving: + // These should only be set when timers are locked, + // and we didn't do it. + badTimer() + default: + badTimer() + } + } +} + +// runOneTimer runs a single timer. +// The caller must have locked the timers for pp. +// This will temporarily unlock the timers while running the timer function. +//go:systemstack +func runOneTimer(pp *p, t *timer, now int64) { + if raceenabled { + ppcur := getg().m.p.ptr() + if ppcur.timerRaceCtx == 0 { + ppcur.timerRaceCtx = racegostart(abi.FuncPCABIInternal(runtimer) + sys.PCQuantum) + } + raceacquirectx(ppcur.timerRaceCtx, unsafe.Pointer(t)) + } + + f := t.f + arg := t.arg + seq := t.seq + + if t.period > 0 { + // Leave in heap but adjust next time to fire. + delta := t.when - now + t.when += t.period * (1 + -delta/t.period) + if t.when < 0 { // check for overflow. + t.when = maxWhen + } + siftdownTimer(pp.timers, 0) + if !atomic.Cas(&t.status, timerRunning, timerWaiting) { + badTimer() + } + updateTimer0When(pp) + } else { + // Remove from heap. + dodeltimer0(pp) + if !atomic.Cas(&t.status, timerRunning, timerNoStatus) { + badTimer() + } + } + + if raceenabled { + // Temporarily use the current P's racectx for g0. + gp := getg() + if gp.racectx != 0 { + throw("runOneTimer: unexpected racectx") + } + gp.racectx = gp.m.p.ptr().timerRaceCtx + } + + unlock(&pp.timersLock) + + f(arg, seq) + + lock(&pp.timersLock) + + if raceenabled { + gp := getg() + gp.racectx = 0 + } +} + +// clearDeletedTimers removes all deleted timers from the P's timer heap. +// This is used to avoid clogging up the heap if the program +// starts a lot of long-running timers and then stops them. +// For example, this can happen via context.WithTimeout. +// +// This is the only function that walks through the entire timer heap, +// other than moveTimers which only runs when the world is stopped. +// +// The caller must have locked the timers for pp. +func clearDeletedTimers(pp *p) { + // We are going to clear all timerModifiedEarlier timers. + // Do this now in case new ones show up while we are looping. + atomic.Store64(&pp.timerModifiedEarliest, 0) + + cdel := int32(0) + to := 0 + changedHeap := false + timers := pp.timers +nextTimer: + for _, t := range timers { + for { + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if changedHeap { + timers[to] = t + siftupTimer(timers, to) + } + to++ + continue nextTimer + case timerModifiedEarlier, timerModifiedLater: + if atomic.Cas(&t.status, s, timerMoving) { + t.when = t.nextwhen + timers[to] = t + siftupTimer(timers, to) + to++ + changedHeap = true + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + continue nextTimer + } + case timerDeleted: + if atomic.Cas(&t.status, s, timerRemoving) { + t.pp = 0 + cdel++ + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + changedHeap = true + continue nextTimer + } + case timerModifying: + // Loop until modification complete. + osyield() + case timerNoStatus, timerRemoved: + // We should not see these status values in a timer heap. + badTimer() + case timerRunning, timerRemoving, timerMoving: + // Some other P thinks it owns this timer, + // which should not happen. + badTimer() + default: + badTimer() + } + } + } + + // Set remaining slots in timers slice to nil, + // so that the timer values can be garbage collected. 
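clearDeletedTimers compacts the heap in place, and the loop just below nils out the abandoned tail of the slice so the removed *timer values do not stay reachable through the backing array. The same idiom in isolation, on an item type invented for the example:

package main

import "fmt"

type item struct {
	name    string
	deleted bool
}

// compact filters s in place and nils the tail so removed *item values
// become collectable, mirroring what clearDeletedTimers does for timers.
func compact(s []*item) []*item {
	to := 0
	for _, it := range s {
		if !it.deleted {
			s[to] = it
			to++
		}
	}
	for i := to; i < len(s); i++ {
		s[i] = nil // drop the reference held by the backing array
	}
	return s[:to]
}

func main() {
	s := []*item{{name: "a"}, {name: "b", deleted: true}, {name: "c"}}
	for _, it := range compact(s) {
		fmt.Println(it.name)
	}
}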
+ for i := to; i < len(timers); i++ { + timers[i] = nil + } + + atomic.Xadd(&pp.deletedTimers, -cdel) + atomic.Xadd(&pp.numTimers, -cdel) + + timers = timers[:to] + pp.timers = timers + updateTimer0When(pp) + + if verifyTimers { + verifyTimerHeap(pp) + } +} + +// verifyTimerHeap verifies that the timer heap is in a valid state. +// This is only for debugging, and is only called if verifyTimers is true. +// The caller must have locked the timers. +func verifyTimerHeap(pp *p) { + for i, t := range pp.timers { + if i == 0 { + // First timer has no parent. + continue + } + + // The heap is 4-ary. See siftupTimer and siftdownTimer. + p := (i - 1) / 4 + if t.when < pp.timers[p].when { + print("bad timer heap at ", i, ": ", p, ": ", pp.timers[p].when, ", ", i, ": ", t.when, "\n") + throw("bad timer heap") + } + } + if numTimers := int(atomic.Load(&pp.numTimers)); len(pp.timers) != numTimers { + println("timer heap len", len(pp.timers), "!= numTimers", numTimers) + throw("bad timer heap len") + } +} + +// updateTimer0When sets the P's timer0When field. +// The caller must have locked the timers for pp. +func updateTimer0When(pp *p) { + if len(pp.timers) == 0 { + atomic.Store64(&pp.timer0When, 0) + } else { + atomic.Store64(&pp.timer0When, uint64(pp.timers[0].when)) + } +} + +// updateTimerModifiedEarliest updates the recorded nextwhen field of the +// earlier timerModifiedEarier value. +// The timers for pp will not be locked. +func updateTimerModifiedEarliest(pp *p, nextwhen int64) { + for { + old := atomic.Load64(&pp.timerModifiedEarliest) + if old != 0 && int64(old) < nextwhen { + return + } + if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) { + return + } + } +} + +// timeSleepUntil returns the time when the next timer should fire, +// and the P that holds the timer heap that that timer is on. +// This is only called by sysmon and checkdead. +func timeSleepUntil() (int64, *p) { + next := int64(maxWhen) + var pret *p + + // Prevent allp slice changes. This is like retake. + lock(&allpLock) + for _, pp := range allp { + if pp == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. + continue + } + + w := int64(atomic.Load64(&pp.timer0When)) + if w != 0 && w < next { + next = w + pret = pp + } + + w = int64(atomic.Load64(&pp.timerModifiedEarliest)) + if w != 0 && w < next { + next = w + pret = pp + } + } + unlock(&allpLock) + + return next, pret +} + +// Heap maintenance algorithms. +// These algorithms check for slice index errors manually. +// Slice index error can happen if the program is using racy +// access to timers. We don't want to panic here, because +// it will cause the program to crash with a mysterious +// "panic holding locks" message. Instead, we panic while not +// holding a lock. + +// siftupTimer puts the timer at position i in the right place +// in the heap by moving it up toward the top of the heap. +// It returns the smallest changed index. +func siftupTimer(t []*timer, i int) int { + if i >= len(t) { + badTimer() + } + when := t[i].when + if when <= 0 { + badTimer() + } + tmp := t[i] + for i > 0 { + p := (i - 1) / 4 // parent + if when >= t[p].when { + break + } + t[i] = t[p] + i = p + } + if tmp != t[i] { + t[i] = tmp + } + return i +} + +// siftdownTimer puts the timer at position i in the right place +// in the heap by moving it down toward the bottom of the heap. 
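siftupTimer above and siftdownTimer below maintain a 4-ary min-heap keyed on when: the parent of index i is (i-1)/4 and its children are 4i+1 through 4i+4. A tiny standalone sketch of the same sift operations on plain int64 keys, illustrative only:

package main

import "fmt"

// siftUp moves the element at index i toward the root of a 4-ary min-heap.
func siftUp(h []int64, i int) {
	v := h[i]
	for i > 0 {
		p := (i - 1) / 4 // parent in a 4-ary heap
		if v >= h[p] {
			break
		}
		h[i] = h[p]
		i = p
	}
	h[i] = v
}

// siftDown moves the element at index i toward the leaves.
func siftDown(h []int64, i int) {
	n := len(h)
	v := h[i]
	for {
		c := i*4 + 1 // first of up to four children: 4i+1 .. 4i+4
		if c >= n {
			break
		}
		least := c
		for k := c + 1; k < n && k <= c+3; k++ {
			if h[k] < h[least] {
				least = k
			}
		}
		if h[least] >= v {
			break
		}
		h[i] = h[least]
		i = least
	}
	h[i] = v
}

func main() {
	var h []int64
	for _, when := range []int64{50, 10, 40, 30, 20} {
		h = append(h, when)
		siftUp(h, len(h)-1)
	}
	// Pop minimums in order, the way dodeltimer0 removes the head timer.
	for len(h) > 0 {
		fmt.Print(h[0], " ")
		last := len(h) - 1
		h[0] = h[last]
		h = h[:last]
		if len(h) > 0 {
			siftDown(h, 0)
		}
	}
	fmt.Println() // 10 20 30 40 50
}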
+func siftdownTimer(t []*timer, i int) { + n := len(t) + if i >= n { + badTimer() + } + when := t[i].when + if when <= 0 { + badTimer() + } + tmp := t[i] + for { + c := i*4 + 1 // left child + c3 := c + 2 // mid child + if c >= n { + break + } + w := t[c].when + if c+1 < n && t[c+1].when < w { + w = t[c+1].when + c++ + } + if c3 < n { + w3 := t[c3].when + if c3+1 < n && t[c3+1].when < w3 { + w3 = t[c3+1].when + c3++ + } + if w3 < w { + w = w3 + c = c3 + } + } + if w >= when { + break + } + t[i] = t[c] + i = c + } + if tmp != t[i] { + t[i] = tmp + } +} + +// badTimer is called if the timer data structures have been corrupted, +// presumably due to racy use by the program. We panic here rather than +// panicing due to invalid slice access while holding locks. +// See issue #25686. +func badTimer() { + throw("timer data corruption") +} diff --git a/contrib/go/_std_1.18/src/runtime/time_linux_amd64.s b/contrib/go/_std_1.18/src/runtime/time_linux_amd64.s new file mode 100644 index 0000000000..1416d23230 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/time_linux_amd64.s @@ -0,0 +1,87 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !faketime + +#include "go_asm.h" +#include "go_tls.h" +#include "textflag.h" + +#define SYS_clock_gettime 228 + +// func time.now() (sec int64, nsec int32, mono int64) +TEXT time·now<ABIInternal>(SB),NOSPLIT,$16-24 + MOVQ SP, R12 // Save old SP; R12 unchanged by C code. + + MOVQ g_m(R14), BX // BX unchanged by C code. + + // Set vdsoPC and vdsoSP for SIGPROF traceback. + // Save the old values on stack and restore them on exit, + // so this function is reentrant. + MOVQ m_vdsoPC(BX), CX + MOVQ m_vdsoSP(BX), DX + MOVQ CX, 0(SP) + MOVQ DX, 8(SP) + + LEAQ sec+0(FP), DX + MOVQ -8(DX), CX // Sets CX to function return address. + MOVQ CX, m_vdsoPC(BX) + MOVQ DX, m_vdsoSP(BX) + + CMPQ R14, m_curg(BX) // Only switch if on curg. + JNE noswitch + + MOVQ m_g0(BX), DX + MOVQ (g_sched+gobuf_sp)(DX), SP // Set SP to g0 stack + +noswitch: + SUBQ $32, SP // Space for two time results + ANDQ $~15, SP // Align for C code + + MOVL $0, DI // CLOCK_REALTIME + LEAQ 16(SP), SI + MOVQ runtime·vdsoClockgettimeSym(SB), AX + CMPQ AX, $0 + JEQ fallback + CALL AX + + MOVL $1, DI // CLOCK_MONOTONIC + LEAQ 0(SP), SI + MOVQ runtime·vdsoClockgettimeSym(SB), AX + CALL AX + +ret: + MOVQ 16(SP), AX // realtime sec + MOVQ 24(SP), DI // realtime nsec (moved to BX below) + MOVQ 0(SP), CX // monotonic sec + IMULQ $1000000000, CX + MOVQ 8(SP), DX // monotonic nsec + + MOVQ R12, SP // Restore real SP + + // Restore vdsoPC, vdsoSP + // We don't worry about being signaled between the two stores. + // If we are not in a signal handler, we'll restore vdsoSP to 0, + // and no one will care about vdsoPC. If we are in a signal handler, + // we cannot receive another signal. + MOVQ 8(SP), SI + MOVQ SI, m_vdsoSP(BX) + MOVQ 0(SP), SI + MOVQ SI, m_vdsoPC(BX) + + // set result registers; AX is already correct + MOVQ DI, BX + ADDQ DX, CX + RET + +fallback: + MOVQ $SYS_clock_gettime, AX + SYSCALL + + MOVL $1, DI // CLOCK_MONOTONIC + LEAQ 0(SP), SI + MOVQ $SYS_clock_gettime, AX + SYSCALL + + JMP ret diff --git a/contrib/go/_std_1.18/src/runtime/time_nofake.go b/contrib/go/_std_1.18/src/runtime/time_nofake.go new file mode 100644 index 0000000000..70a2102b22 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/time_nofake.go @@ -0,0 +1,32 @@ +// Copyright 2019 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !faketime + +package runtime + +import "unsafe" + +// faketime is the simulated time in nanoseconds since 1970 for the +// playground. +// +// Zero means not to use faketime. +var faketime int64 + +//go:nosplit +func nanotime() int64 { + return nanotime1() +} + +var overrideWrite func(fd uintptr, p unsafe.Pointer, n int32) int32 + +// write must be nosplit on Windows (see write1) +// +//go:nosplit +func write(fd uintptr, p unsafe.Pointer, n int32) int32 { + if overrideWrite != nil { + return overrideWrite(fd, noescape(p), n) + } + return write1(fd, p, n) +} diff --git a/contrib/go/_std_1.18/src/runtime/timeasm.go b/contrib/go/_std_1.18/src/runtime/timeasm.go new file mode 100644 index 0000000000..0421388686 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/timeasm.go @@ -0,0 +1,14 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Declarations for operating systems implementing time.now directly in assembly. + +//go:build !faketime && (windows || (linux && amd64)) + +package runtime + +import _ "unsafe" + +//go:linkname time_now time.now +func time_now() (sec int64, nsec int32, mono int64) diff --git a/contrib/go/_std_1.18/src/runtime/timestub.go b/contrib/go/_std_1.18/src/runtime/timestub.go new file mode 100644 index 0000000000..1d2926b43d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/timestub.go @@ -0,0 +1,18 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Declarations for operating systems implementing time.now +// indirectly, in terms of walltime and nanotime assembly. + +//go:build !faketime && !windows && !(linux && amd64) + +package runtime + +import _ "unsafe" // for go:linkname + +//go:linkname time_now time.now +func time_now() (sec int64, nsec int32, mono int64) { + sec, nsec = walltime() + return sec, nsec, nanotime() +} diff --git a/contrib/go/_std_1.18/src/runtime/tls_stub.go b/contrib/go/_std_1.18/src/runtime/tls_stub.go new file mode 100644 index 0000000000..7bdfc6b89a --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/tls_stub.go @@ -0,0 +1,10 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (windows && !amd64) || !windows + +package runtime + +//go:nosplit +func osSetupTLS(mp *m) {} diff --git a/contrib/go/_std_1.18/src/runtime/trace.go b/contrib/go/_std_1.18/src/runtime/trace.go new file mode 100644 index 0000000000..8f60de2b05 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/trace.go @@ -0,0 +1,1260 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Go execution tracer. +// The tracer captures a wide range of execution events like goroutine +// creation/blocking/unblocking, syscall enter/exit/block, GC-related events, +// changes of heap size, processor start/stop, etc and writes them to a buffer +// in a compact form. A precise nanosecond-precision timestamp and a stack +// trace is captured for most events. +// See https://golang.org/s/go15trace for more info. 
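StartTrace, StopTrace and ReadTrace below are the runtime half of the public runtime/trace package, which is how programs normally drive them. A typical illustrative use (the output file name is arbitrary):

package main

import (
	"log"
	"os"
	"runtime/trace"
)

func main() {
	f, err := os.Create("trace.out")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// trace.Start calls runtime.StartTrace and then streams the buffers
	// handed out by runtime.ReadTrace into f from a background goroutine.
	if err := trace.Start(f); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop() // runtime.StopTrace plus a final flush

	// ... code to be traced goes here ...
}

The resulting file can then be inspected with go tool trace trace.out.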
+ +package runtime + +import ( + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// Event types in the trace, args are given in square brackets. +const ( + traceEvNone = 0 // unused + traceEvBatch = 1 // start of per-P batch of events [pid, timestamp] + traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)] + traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}] + traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id] + traceEvProcStart = 5 // start of P [timestamp, thread id] + traceEvProcStop = 6 // stop of P [timestamp] + traceEvGCStart = 7 // GC start [timestamp, seq, stack id] + traceEvGCDone = 8 // GC done [timestamp] + traceEvGCSTWStart = 9 // GC STW start [timestamp, kind] + traceEvGCSTWDone = 10 // GC STW done [timestamp] + traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] + traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed] + traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] + traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq] + traceEvGoEnd = 15 // goroutine ends [timestamp] + traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack] + traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack] + traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack] + traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack] + traceEvGoBlock = 20 // goroutine blocks [timestamp, stack] + traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack] + traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack] + traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack] + traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack] + traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack] + traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack] + traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack] + traceEvGoSysCall = 28 // syscall enter [timestamp, stack] + traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp] + traceEvGoSysBlock = 30 // syscall blocks [timestamp] + traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id] + traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id] + traceEvHeapAlloc = 33 // gcController.heapLive change [timestamp, heap_alloc] + traceEvHeapGoal = 34 // gcController.heapGoal (formerly next_gc) change [timestamp, heap goal in bytes] + traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id] + traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp] + traceEvString = 37 // string dictionary entry [ID, length, string] + traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id] + traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack] + traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp] + traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id] + traceEvGoBlockGC = 42 // goroutine 
blocks on GC assist [timestamp, stack] + traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack] + traceEvGCMarkAssistDone = 44 // GC mark assist done [timestamp] + traceEvUserTaskCreate = 45 // trace.NewContext [timestamp, internal task id, internal parent task id, stack, name string] + traceEvUserTaskEnd = 46 // end of a task [timestamp, internal task id, stack] + traceEvUserRegion = 47 // trace.WithRegion [timestamp, internal task id, mode(0:start, 1:end), stack, name string] + traceEvUserLog = 48 // trace.Log [timestamp, internal task id, key string id, stack, value string] + traceEvCount = 49 + // Byte is used but only 6 bits are available for event type. + // The remaining 2 bits are used to specify the number of arguments. + // That means, the max event type value is 63. +) + +const ( + // Timestamps in trace are cputicks/traceTickDiv. + // This makes absolute values of timestamp diffs smaller, + // and so they are encoded in less number of bytes. + // 64 on x86 is somewhat arbitrary (one tick is ~20ns on a 3GHz machine). + // The suggested increment frequency for PowerPC's time base register is + // 512 MHz according to Power ISA v2.07 section 6.2, so we use 16 on ppc64 + // and ppc64le. + // Tracing won't work reliably for architectures where cputicks is emulated + // by nanotime, so the value doesn't matter for those architectures. + traceTickDiv = 16 + 48*(goarch.Is386|goarch.IsAmd64) + // Maximum number of PCs in a single stack trace. + // Since events contain only stack id rather than whole stack trace, + // we can allow quite large values here. + traceStackSize = 128 + // Identifier of a fake P that is used when we trace without a real P. + traceGlobProc = -1 + // Maximum number of bytes to encode uint64 in base-128. + traceBytesPerNumber = 10 + // Shift of the number of arguments in the first event byte. + traceArgCountShift = 6 + // Flag passed to traceGoPark to denote that the previous wakeup of this + // goroutine was futile. For example, a goroutine was unblocked on a mutex, + // but another goroutine got ahead and acquired the mutex before the first + // goroutine is scheduled, so the first goroutine has to block again. + // Such wakeups happen on buffered channels and sync.Mutex, + // but are generally not interesting for end user. + traceFutileWakeup byte = 128 +) + +// trace is global tracing context. +var trace struct { + lock mutex // protects the following members + lockOwner *g // to avoid deadlocks during recursive lock locks + enabled bool // when set runtime traces events + shutdown bool // set when we are waiting for trace reader to finish after setting enabled to false + headerWritten bool // whether ReadTrace has emitted trace header + footerWritten bool // whether ReadTrace has emitted trace footer + shutdownSema uint32 // used to wait for ReadTrace completion + seqStart uint64 // sequence number when tracing was started + ticksStart int64 // cputicks when tracing was started + ticksEnd int64 // cputicks when tracing was stopped + timeStart int64 // nanotime when tracing was started + timeEnd int64 // nanotime when tracing was stopped + seqGC uint64 // GC start/done sequencer + reading traceBufPtr // buffer currently handed off to user + empty traceBufPtr // stack of empty buffers + fullHead traceBufPtr // queue of full buffers + fullTail traceBufPtr + reader guintptr // goroutine that called ReadTrace, or nil + stackTab traceStackTable // maps stack traces to unique ids + + // Dictionary for traceEvString. 
+ // + // TODO: central lock to access the map is not ideal. + // option: pre-assign ids to all user annotation region names and tags + // option: per-P cache + // option: sync.Map like data structure + stringsLock mutex + strings map[string]uint64 + stringSeq uint64 + + // markWorkerLabels maps gcMarkWorkerMode to string ID. + markWorkerLabels [len(gcMarkWorkerModeStrings)]uint64 + + bufLock mutex // protects buf + buf traceBufPtr // global trace buffer, used when running without a p +} + +// traceBufHeader is per-P tracing buffer. +type traceBufHeader struct { + link traceBufPtr // in trace.empty/full + lastTicks uint64 // when we wrote the last event + pos int // next write offset in arr + stk [traceStackSize]uintptr // scratch buffer for traceback +} + +// traceBuf is per-P tracing buffer. +// +//go:notinheap +type traceBuf struct { + traceBufHeader + arr [64<<10 - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for traceBufHeader.buf +} + +// traceBufPtr is a *traceBuf that is not traced by the garbage +// collector and doesn't have write barriers. traceBufs are not +// allocated from the GC'd heap, so this is safe, and are often +// manipulated in contexts where write barriers are not allowed, so +// this is necessary. +// +// TODO: Since traceBuf is now go:notinheap, this isn't necessary. +type traceBufPtr uintptr + +func (tp traceBufPtr) ptr() *traceBuf { return (*traceBuf)(unsafe.Pointer(tp)) } +func (tp *traceBufPtr) set(b *traceBuf) { *tp = traceBufPtr(unsafe.Pointer(b)) } +func traceBufPtrOf(b *traceBuf) traceBufPtr { + return traceBufPtr(unsafe.Pointer(b)) +} + +// StartTrace enables tracing for the current process. +// While tracing, the data will be buffered and available via ReadTrace. +// StartTrace returns an error if tracing is already enabled. +// Most clients should use the runtime/trace package or the testing package's +// -test.trace flag instead of calling StartTrace directly. +func StartTrace() error { + // Stop the world so that we can take a consistent snapshot + // of all goroutines at the beginning of the trace. + // Do not stop the world during GC so we ensure we always see + // a consistent view of GC-related events (e.g. a start is always + // paired with an end). + stopTheWorldGC("start tracing") + + // Prevent sysmon from running any code that could generate events. + lock(&sched.sysmonlock) + + // We are in stop-the-world, but syscalls can finish and write to trace concurrently. + // Exitsyscall could check trace.enabled long before and then suddenly wake up + // and decide to write to trace at a random point in time. + // However, such syscall will use the global trace.buf buffer, because we've + // acquired all p's by doing stop-the-world. So this protects us from such races. + lock(&trace.bufLock) + + if trace.enabled || trace.shutdown { + unlock(&trace.bufLock) + unlock(&sched.sysmonlock) + startTheWorldGC() + return errorString("tracing is already enabled") + } + + // Can't set trace.enabled yet. While the world is stopped, exitsyscall could + // already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here. + // That would lead to an inconsistent trace: + // - either GoSysExit appears before EvGoInSyscall, + // - or GoSysExit appears for a goroutine for which we don't emit EvGoInSyscall below. + // To instruct traceEvent that it must not ignore events below, we set startingtrace. + // trace.enabled is set afterwards once we have emitted all preliminary events. 
+ _g_ := getg() + _g_.m.startingtrace = true + + // Obtain current stack ID to use in all traceEvGoCreate events below. + mp := acquirem() + stkBuf := make([]uintptr, traceStackSize) + stackID := traceStackID(mp, stkBuf, 2) + releasem(mp) + + // World is stopped, no need to lock. + forEachGRace(func(gp *g) { + status := readgstatus(gp) + if status != _Gdead { + gp.traceseq = 0 + gp.tracelastp = getg().m.p + // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum. + id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum}) + traceEvent(traceEvGoCreate, -1, uint64(gp.goid), uint64(id), stackID) + } + if status == _Gwaiting { + // traceEvGoWaiting is implied to have seq=1. + gp.traceseq++ + traceEvent(traceEvGoWaiting, -1, uint64(gp.goid)) + } + if status == _Gsyscall { + gp.traceseq++ + traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid)) + } else { + gp.sysblocktraced = false + } + }) + traceProcStart() + traceGoStart() + // Note: ticksStart needs to be set after we emit traceEvGoInSyscall events. + // If we do it the other way around, it is possible that exitsyscall will + // query sysexitticks after ticksStart but before traceEvGoInSyscall timestamp. + // It will lead to a false conclusion that cputicks is broken. + trace.ticksStart = cputicks() + trace.timeStart = nanotime() + trace.headerWritten = false + trace.footerWritten = false + + // string to id mapping + // 0 : reserved for an empty string + // remaining: other strings registered by traceString + trace.stringSeq = 0 + trace.strings = make(map[string]uint64) + + trace.seqGC = 0 + _g_.m.startingtrace = false + trace.enabled = true + + // Register runtime goroutine labels. + _, pid, bufp := traceAcquireBuffer() + for i, label := range gcMarkWorkerModeStrings[:] { + trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label) + } + traceReleaseBuffer(pid) + + unlock(&trace.bufLock) + + unlock(&sched.sysmonlock) + + startTheWorldGC() + return nil +} + +// StopTrace stops tracing, if it was previously enabled. +// StopTrace only returns after all the reads for the trace have completed. +func StopTrace() { + // Stop the world so that we can collect the trace buffers from all p's below, + // and also to avoid races with traceEvent. + stopTheWorldGC("stop tracing") + + // See the comment in StartTrace. + lock(&sched.sysmonlock) + + // See the comment in StartTrace. + lock(&trace.bufLock) + + if !trace.enabled { + unlock(&trace.bufLock) + unlock(&sched.sysmonlock) + startTheWorldGC() + return + } + + traceGoSched() + + // Loop over all allocated Ps because dead Ps may still have + // trace buffers. + for _, p := range allp[:cap(allp)] { + buf := p.tracebuf + if buf != 0 { + traceFullQueue(buf) + p.tracebuf = 0 + } + } + if trace.buf != 0 { + buf := trace.buf + trace.buf = 0 + if buf.ptr().pos != 0 { + traceFullQueue(buf) + } + } + + for { + trace.ticksEnd = cputicks() + trace.timeEnd = nanotime() + // Windows time can tick only every 15ms, wait for at least one tick. + if trace.timeEnd != trace.timeStart { + break + } + osyield() + } + + trace.enabled = false + trace.shutdown = true + unlock(&trace.bufLock) + + unlock(&sched.sysmonlock) + + startTheWorldGC() + + // The world is started but we've set trace.shutdown, so new tracing can't start. + // Wait for the trace reader to flush pending buffers and stop. 
+ semacquire(&trace.shutdownSema) + if raceenabled { + raceacquire(unsafe.Pointer(&trace.shutdownSema)) + } + + // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world. + lock(&trace.lock) + for _, p := range allp[:cap(allp)] { + if p.tracebuf != 0 { + throw("trace: non-empty trace buffer in proc") + } + } + if trace.buf != 0 { + throw("trace: non-empty global trace buffer") + } + if trace.fullHead != 0 || trace.fullTail != 0 { + throw("trace: non-empty full trace buffer") + } + if trace.reading != 0 || trace.reader != 0 { + throw("trace: reading after shutdown") + } + for trace.empty != 0 { + buf := trace.empty + trace.empty = buf.ptr().link + sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys) + } + trace.strings = nil + trace.shutdown = false + unlock(&trace.lock) +} + +// ReadTrace returns the next chunk of binary tracing data, blocking until data +// is available. If tracing is turned off and all the data accumulated while it +// was on has been returned, ReadTrace returns nil. The caller must copy the +// returned data before calling ReadTrace again. +// ReadTrace must be called from one goroutine at a time. +func ReadTrace() []byte { + // This function may need to lock trace.lock recursively + // (goparkunlock -> traceGoPark -> traceEvent -> traceFlush). + // To allow this we use trace.lockOwner. + // Also this function must not allocate while holding trace.lock: + // allocation can call heap allocate, which will try to emit a trace + // event while holding heap lock. + lock(&trace.lock) + trace.lockOwner = getg() + + if trace.reader != 0 { + // More than one goroutine reads trace. This is bad. + // But we rather do not crash the program because of tracing, + // because tracing can be enabled at runtime on prod servers. + trace.lockOwner = nil + unlock(&trace.lock) + println("runtime: ReadTrace called from multiple goroutines simultaneously") + return nil + } + // Recycle the old buffer. + if buf := trace.reading; buf != 0 { + buf.ptr().link = trace.empty + trace.empty = buf + trace.reading = 0 + } + // Write trace header. + if !trace.headerWritten { + trace.headerWritten = true + trace.lockOwner = nil + unlock(&trace.lock) + return []byte("go 1.11 trace\x00\x00\x00") + } + // Wait for new data. + if trace.fullHead == 0 && !trace.shutdown { + trace.reader.set(getg()) + goparkunlock(&trace.lock, waitReasonTraceReaderBlocked, traceEvGoBlock, 2) + lock(&trace.lock) + } + // Write a buffer. + if trace.fullHead != 0 { + buf := traceFullDequeue() + trace.reading = buf + trace.lockOwner = nil + unlock(&trace.lock) + return buf.ptr().arr[:buf.ptr().pos] + } + // Write footer with timer frequency. + if !trace.footerWritten { + trace.footerWritten = true + // Use float64 because (trace.ticksEnd - trace.ticksStart) * 1e9 can overflow int64. + freq := float64(trace.ticksEnd-trace.ticksStart) * 1e9 / float64(trace.timeEnd-trace.timeStart) / traceTickDiv + if freq <= 0 { + throw("trace: ReadTrace got invalid frequency") + } + trace.lockOwner = nil + unlock(&trace.lock) + var data []byte + data = append(data, traceEvFrequency|0<<traceArgCountShift) + data = traceAppend(data, uint64(freq)) + // This will emit a bunch of full buffers, we will pick them up + // on the next iteration. + trace.stackTab.dump() + return data + } + // Done. + if trace.shutdown { + trace.lockOwner = nil + unlock(&trace.lock) + if raceenabled { + // Model synchronization on trace.shutdownSema, which race + // detector does not see. 
This is required to avoid false + // race reports on writer passed to trace.Start. + racerelease(unsafe.Pointer(&trace.shutdownSema)) + } + // trace.enabled is already reset, so can call traceable functions. + semrelease(&trace.shutdownSema) + return nil + } + // Also bad, but see the comment above. + trace.lockOwner = nil + unlock(&trace.lock) + println("runtime: spurious wakeup of trace reader") + return nil +} + +// traceReader returns the trace reader that should be woken up, if any. +func traceReader() *g { + if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) { + return nil + } + lock(&trace.lock) + if trace.reader == 0 || (trace.fullHead == 0 && !trace.shutdown) { + unlock(&trace.lock) + return nil + } + gp := trace.reader.ptr() + trace.reader.set(nil) + unlock(&trace.lock) + return gp +} + +// traceProcFree frees trace buffer associated with pp. +func traceProcFree(pp *p) { + buf := pp.tracebuf + pp.tracebuf = 0 + if buf == 0 { + return + } + lock(&trace.lock) + traceFullQueue(buf) + unlock(&trace.lock) +} + +// traceFullQueue queues buf into queue of full buffers. +func traceFullQueue(buf traceBufPtr) { + buf.ptr().link = 0 + if trace.fullHead == 0 { + trace.fullHead = buf + } else { + trace.fullTail.ptr().link = buf + } + trace.fullTail = buf +} + +// traceFullDequeue dequeues from queue of full buffers. +func traceFullDequeue() traceBufPtr { + buf := trace.fullHead + if buf == 0 { + return 0 + } + trace.fullHead = buf.ptr().link + if trace.fullHead == 0 { + trace.fullTail = 0 + } + buf.ptr().link = 0 + return buf +} + +// traceEvent writes a single event to trace buffer, flushing the buffer if necessary. +// ev is event type. +// If skip > 0, write current stack id as the last argument (skipping skip top frames). +// If skip = 0, this event type should contain a stack, but we don't want +// to collect and remember it for this particular call. +func traceEvent(ev byte, skip int, args ...uint64) { + mp, pid, bufp := traceAcquireBuffer() + // Double-check trace.enabled now that we've done m.locks++ and acquired bufLock. + // This protects from races between traceEvent and StartTrace/StopTrace. + + // The caller checked that trace.enabled == true, but trace.enabled might have been + // turned off between the check and now. Check again. traceLockBuffer did mp.locks++, + // StopTrace does stopTheWorld, and stopTheWorld waits for mp.locks to go back to zero, + // so if we see trace.enabled == true now, we know it's true for the rest of the function. + // Exitsyscall can run even during stopTheWorld. The race with StartTrace/StopTrace + // during tracing in exitsyscall is resolved by locking trace.bufLock in traceLockBuffer. + // + // Note trace_userTaskCreate runs the same check. + if !trace.enabled && !mp.startingtrace { + traceReleaseBuffer(pid) + return + } + + if skip > 0 { + if getg() == mp.curg { + skip++ // +1 because stack is captured in traceEventLocked. + } + } + traceEventLocked(0, mp, pid, bufp, ev, skip, args...) + traceReleaseBuffer(pid) +} + +func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev byte, skip int, args ...uint64) { + buf := bufp.ptr() + // TODO: test on non-zero extraBytes param. 
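traceEventLocked just below packs each event header into a single byte: the event type occupies the low 6 bits and the inline argument count (capped at 3, where 3 means an explicit length byte follows) occupies the top 2 bits, via ev | narg<<traceArgCountShift. A self-contained sketch of that packing, with helper names invented for the example:

package main

import "fmt"

const argCountShift = 6 // mirrors traceArgCountShift

// packEventByte combines a 6-bit event type with a 2-bit argument count,
// the same way traceEventLocked writes the first byte of an event.
func packEventByte(ev, narg byte) byte {
	if narg > 3 {
		narg = 3 // 3 means "length byte follows"
	}
	return ev | narg<<argCountShift
}

func unpackEventByte(b byte) (ev, narg byte) {
	return b & ((1 << argCountShift) - 1), b >> argCountShift
}

func main() {
	b := packEventByte(13, 2) // 13 is traceEvGoCreate in the table above
	ev, narg := unpackEventByte(b)
	fmt.Printf("byte=%#x ev=%d narg=%d\n", b, ev, narg) // byte=0x8d ev=13 narg=2
}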
+ maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params + if buf == nil || len(buf.arr)-buf.pos < maxSize { + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() + bufp.set(buf) + } + + // NOTE: ticks might be same after tick division, although the real cputicks is + // linear growth. + ticks := uint64(cputicks()) / traceTickDiv + tickDiff := ticks - buf.lastTicks + if tickDiff == 0 { + ticks = buf.lastTicks + 1 + tickDiff = 1 + } + + buf.lastTicks = ticks + narg := byte(len(args)) + if skip >= 0 { + narg++ + } + // We have only 2 bits for number of arguments. + // If number is >= 3, then the event type is followed by event length in bytes. + if narg > 3 { + narg = 3 + } + startPos := buf.pos + buf.byte(ev | narg<<traceArgCountShift) + var lenp *byte + if narg == 3 { + // Reserve the byte for length assuming that length < 128. + buf.varint(0) + lenp = &buf.arr[buf.pos-1] + } + buf.varint(tickDiff) + for _, a := range args { + buf.varint(a) + } + if skip == 0 { + buf.varint(0) + } else if skip > 0 { + buf.varint(traceStackID(mp, buf.stk[:], skip)) + } + evSize := buf.pos - startPos + if evSize > maxSize { + throw("invalid length of trace event") + } + if lenp != nil { + // Fill in actual length. + *lenp = byte(evSize - 2) + } +} + +func traceStackID(mp *m, buf []uintptr, skip int) uint64 { + _g_ := getg() + gp := mp.curg + var nstk int + if gp == _g_ { + nstk = callers(skip+1, buf) + } else if gp != nil { + gp = mp.curg + nstk = gcallers(gp, skip, buf) + } + if nstk > 0 { + nstk-- // skip runtime.goexit + } + if nstk > 0 && gp.goid == 1 { + nstk-- // skip runtime.main + } + id := trace.stackTab.put(buf[:nstk]) + return uint64(id) +} + +// traceAcquireBuffer returns trace buffer to use and, if necessary, locks it. +func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) { + mp = acquirem() + if p := mp.p.ptr(); p != nil { + return mp, p.id, &p.tracebuf + } + lock(&trace.bufLock) + return mp, traceGlobProc, &trace.buf +} + +// traceReleaseBuffer releases a buffer previously acquired with traceAcquireBuffer. +func traceReleaseBuffer(pid int32) { + if pid == traceGlobProc { + unlock(&trace.bufLock) + } + releasem(getg().m) +} + +// traceFlush puts buf onto stack of full buffers and returns an empty buffer. +func traceFlush(buf traceBufPtr, pid int32) traceBufPtr { + owner := trace.lockOwner + dolock := owner == nil || owner != getg().m.curg + if dolock { + lock(&trace.lock) + } + if buf != 0 { + traceFullQueue(buf) + } + if trace.empty != 0 { + buf = trace.empty + trace.empty = buf.ptr().link + } else { + buf = traceBufPtr(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys)) + if buf == 0 { + throw("trace: out of memory") + } + } + bufp := buf.ptr() + bufp.link.set(nil) + bufp.pos = 0 + + // initialize the buffer for a new batch + ticks := uint64(cputicks()) / traceTickDiv + if ticks == bufp.lastTicks { + ticks = bufp.lastTicks + 1 + } + bufp.lastTicks = ticks + bufp.byte(traceEvBatch | 1<<traceArgCountShift) + bufp.varint(uint64(pid)) + bufp.varint(ticks) + + if dolock { + unlock(&trace.lock) + } + return buf +} + +// traceString adds a string to the trace.strings and returns the id. +func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) { + if s == "" { + return 0, bufp + } + + lock(&trace.stringsLock) + if raceenabled { + // raceacquire is necessary because the map access + // below is race annotated. 
+ raceacquire(unsafe.Pointer(&trace.stringsLock)) + } + + if id, ok := trace.strings[s]; ok { + if raceenabled { + racerelease(unsafe.Pointer(&trace.stringsLock)) + } + unlock(&trace.stringsLock) + + return id, bufp + } + + trace.stringSeq++ + id := trace.stringSeq + trace.strings[s] = id + + if raceenabled { + racerelease(unsafe.Pointer(&trace.stringsLock)) + } + unlock(&trace.stringsLock) + + // memory allocation in above may trigger tracing and + // cause *bufp changes. Following code now works with *bufp, + // so there must be no memory allocation or any activities + // that causes tracing after this point. + + buf := bufp.ptr() + size := 1 + 2*traceBytesPerNumber + len(s) + if buf == nil || len(buf.arr)-buf.pos < size { + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() + bufp.set(buf) + } + buf.byte(traceEvString) + buf.varint(id) + + // double-check the string and the length can fit. + // Otherwise, truncate the string. + slen := len(s) + if room := len(buf.arr) - buf.pos; room < slen+traceBytesPerNumber { + slen = room + } + + buf.varint(uint64(slen)) + buf.pos += copy(buf.arr[buf.pos:], s[:slen]) + + bufp.set(buf) + return id, bufp +} + +// traceAppend appends v to buf in little-endian-base-128 encoding. +func traceAppend(buf []byte, v uint64) []byte { + for ; v >= 0x80; v >>= 7 { + buf = append(buf, 0x80|byte(v)) + } + buf = append(buf, byte(v)) + return buf +} + +// varint appends v to buf in little-endian-base-128 encoding. +func (buf *traceBuf) varint(v uint64) { + pos := buf.pos + for ; v >= 0x80; v >>= 7 { + buf.arr[pos] = 0x80 | byte(v) + pos++ + } + buf.arr[pos] = byte(v) + pos++ + buf.pos = pos +} + +// byte appends v to buf. +func (buf *traceBuf) byte(v byte) { + buf.arr[buf.pos] = v + buf.pos++ +} + +// traceStackTable maps stack traces (arrays of PC's) to unique uint32 ids. +// It is lock-free for reading. +type traceStackTable struct { + lock mutex + seq uint32 + mem traceAlloc + tab [1 << 13]traceStackPtr +} + +// traceStack is a single stack in traceStackTable. +type traceStack struct { + link traceStackPtr + hash uintptr + id uint32 + n int + stk [0]uintptr // real type [n]uintptr +} + +type traceStackPtr uintptr + +func (tp traceStackPtr) ptr() *traceStack { return (*traceStack)(unsafe.Pointer(tp)) } + +// stack returns slice of PCs. +func (ts *traceStack) stack() []uintptr { + return (*[traceStackSize]uintptr)(unsafe.Pointer(&ts.stk))[:ts.n] +} + +// put returns a unique id for the stack trace pcs and caches it in the table, +// if it sees the trace for the first time. +func (tab *traceStackTable) put(pcs []uintptr) uint32 { + if len(pcs) == 0 { + return 0 + } + hash := memhash(unsafe.Pointer(&pcs[0]), 0, uintptr(len(pcs))*unsafe.Sizeof(pcs[0])) + // First, search the hashtable w/o the mutex. + if id := tab.find(pcs, hash); id != 0 { + return id + } + // Now, double check under the mutex. + lock(&tab.lock) + if id := tab.find(pcs, hash); id != 0 { + unlock(&tab.lock) + return id + } + // Create new record. + tab.seq++ + stk := tab.newStack(len(pcs)) + stk.hash = hash + stk.id = tab.seq + stk.n = len(pcs) + stkpc := stk.stack() + for i, pc := range pcs { + stkpc[i] = pc + } + part := int(hash % uintptr(len(tab.tab))) + stk.link = tab.tab[part] + atomicstorep(unsafe.Pointer(&tab.tab[part]), unsafe.Pointer(stk)) + unlock(&tab.lock) + return stk.id +} + +// find checks if the stack trace pcs is already present in the table. 
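traceAppend and (*traceBuf).varint above emit unsigned integers in little-endian base-128 form: seven payload bits per byte, with the high bit set on every byte except the last. An encoder/decoder pair for the same format as a standalone sketch (the decoder is not part of this file; it is only here to show how the bytes read back):

package main

import "fmt"

// appendVarint mirrors traceAppend: little-endian base-128 encoding.
func appendVarint(buf []byte, v uint64) []byte {
	for ; v >= 0x80; v >>= 7 {
		buf = append(buf, 0x80|byte(v))
	}
	return append(buf, byte(v))
}

// readVarint decodes one value and reports how many bytes it consumed.
func readVarint(buf []byte) (v uint64, n int) {
	for shift := uint(0); ; shift += 7 {
		b := buf[n]
		n++
		v |= uint64(b&0x7f) << shift
		if b < 0x80 {
			return v, n
		}
	}
}

func main() {
	buf := appendVarint(nil, 300)
	fmt.Printf("% x\n", buf) // ac 02
	v, n := readVarint(buf)
	fmt.Println(v, n) // 300 2
}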
+func (tab *traceStackTable) find(pcs []uintptr, hash uintptr) uint32 { + part := int(hash % uintptr(len(tab.tab))) +Search: + for stk := tab.tab[part].ptr(); stk != nil; stk = stk.link.ptr() { + if stk.hash == hash && stk.n == len(pcs) { + for i, stkpc := range stk.stack() { + if stkpc != pcs[i] { + continue Search + } + } + return stk.id + } + } + return 0 +} + +// newStack allocates a new stack of size n. +func (tab *traceStackTable) newStack(n int) *traceStack { + return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*goarch.PtrSize)) +} + +// allFrames returns all of the Frames corresponding to pcs. +func allFrames(pcs []uintptr) []Frame { + frames := make([]Frame, 0, len(pcs)) + ci := CallersFrames(pcs) + for { + f, more := ci.Next() + frames = append(frames, f) + if !more { + return frames + } + } +} + +// dump writes all previously cached stacks to trace buffers, +// releases all memory and resets state. +func (tab *traceStackTable) dump() { + var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte + bufp := traceFlush(0, 0) + for _, stk := range tab.tab { + stk := stk.ptr() + for ; stk != nil; stk = stk.link.ptr() { + tmpbuf := tmp[:0] + tmpbuf = traceAppend(tmpbuf, uint64(stk.id)) + frames := allFrames(stk.stack()) + tmpbuf = traceAppend(tmpbuf, uint64(len(frames))) + for _, f := range frames { + var frame traceFrame + frame, bufp = traceFrameForPC(bufp, 0, f) + tmpbuf = traceAppend(tmpbuf, uint64(f.PC)) + tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID)) + tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID)) + tmpbuf = traceAppend(tmpbuf, uint64(frame.line)) + } + // Now copy to the buffer. + size := 1 + traceBytesPerNumber + len(tmpbuf) + if buf := bufp.ptr(); len(buf.arr)-buf.pos < size { + bufp = traceFlush(bufp, 0) + } + buf := bufp.ptr() + buf.byte(traceEvStack | 3<<traceArgCountShift) + buf.varint(uint64(len(tmpbuf))) + buf.pos += copy(buf.arr[buf.pos:], tmpbuf) + } + } + + lock(&trace.lock) + traceFullQueue(bufp) + unlock(&trace.lock) + + tab.mem.drop() + *tab = traceStackTable{} + lockInit(&((*tab).lock), lockRankTraceStackTab) +} + +type traceFrame struct { + funcID uint64 + fileID uint64 + line uint64 +} + +// traceFrameForPC records the frame information. +// It may allocate memory. +func traceFrameForPC(buf traceBufPtr, pid int32, f Frame) (traceFrame, traceBufPtr) { + bufp := &buf + var frame traceFrame + + fn := f.Function + const maxLen = 1 << 10 + if len(fn) > maxLen { + fn = fn[len(fn)-maxLen:] + } + frame.funcID, bufp = traceString(bufp, pid, fn) + frame.line = uint64(f.Line) + file := f.File + if len(file) > maxLen { + file = file[len(file)-maxLen:] + } + frame.fileID, bufp = traceString(bufp, pid, file) + return frame, (*bufp) +} + +// traceAlloc is a non-thread-safe region allocator. +// It holds a linked list of traceAllocBlock. +type traceAlloc struct { + head traceAllocBlockPtr + off uintptr +} + +// traceAllocBlock is a block in traceAlloc. +// +// traceAllocBlock is allocated from non-GC'd memory, so it must not +// contain heap pointers. Writes to pointers to traceAllocBlocks do +// not need write barriers. +// +//go:notinheap +type traceAllocBlock struct { + next traceAllocBlockPtr + data [64<<10 - goarch.PtrSize]byte +} + +// TODO: Since traceAllocBlock is now go:notinheap, this isn't necessary. 
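allFrames above turns raw PCs into runtime.Frame values through the public CallersFrames API, the same machinery an ordinary program uses to symbolize a stack. A small example of that public path, independent of the tracer:

package main

import (
	"fmt"
	"runtime"
)

func captureStack() []uintptr {
	pcs := make([]uintptr, 32)
	// Skip runtime.Callers and captureStack itself.
	n := runtime.Callers(2, pcs)
	return pcs[:n]
}

func main() {
	frames := runtime.CallersFrames(captureStack())
	for {
		f, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}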
+type traceAllocBlockPtr uintptr + +func (p traceAllocBlockPtr) ptr() *traceAllocBlock { return (*traceAllocBlock)(unsafe.Pointer(p)) } +func (p *traceAllocBlockPtr) set(x *traceAllocBlock) { *p = traceAllocBlockPtr(unsafe.Pointer(x)) } + +// alloc allocates n-byte block. +func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer { + n = alignUp(n, goarch.PtrSize) + if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) { + if n > uintptr(len(a.head.ptr().data)) { + throw("trace: alloc too large") + } + block := (*traceAllocBlock)(sysAlloc(unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys)) + if block == nil { + throw("trace: out of memory") + } + block.next.set(a.head.ptr()) + a.head.set(block) + a.off = 0 + } + p := &a.head.ptr().data[a.off] + a.off += n + return unsafe.Pointer(p) +} + +// drop frees all previously allocated memory and resets the allocator. +func (a *traceAlloc) drop() { + for a.head != 0 { + block := a.head.ptr() + a.head.set(block.next.ptr()) + sysFree(unsafe.Pointer(block), unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys) + } +} + +// The following functions write specific events to trace. + +func traceGomaxprocs(procs int32) { + traceEvent(traceEvGomaxprocs, 1, uint64(procs)) +} + +func traceProcStart() { + traceEvent(traceEvProcStart, -1, uint64(getg().m.id)) +} + +func traceProcStop(pp *p) { + // Sysmon and stopTheWorld can stop Ps blocked in syscalls, + // to handle this we temporary employ the P. + mp := acquirem() + oldp := mp.p + mp.p.set(pp) + traceEvent(traceEvProcStop, -1) + mp.p = oldp + releasem(mp) +} + +func traceGCStart() { + traceEvent(traceEvGCStart, 3, trace.seqGC) + trace.seqGC++ +} + +func traceGCDone() { + traceEvent(traceEvGCDone, -1) +} + +func traceGCSTWStart(kind int) { + traceEvent(traceEvGCSTWStart, -1, uint64(kind)) +} + +func traceGCSTWDone() { + traceEvent(traceEvGCSTWDone, -1) +} + +// traceGCSweepStart prepares to trace a sweep loop. This does not +// emit any events until traceGCSweepSpan is called. +// +// traceGCSweepStart must be paired with traceGCSweepDone and there +// must be no preemption points between these two calls. +func traceGCSweepStart() { + // Delay the actual GCSweepStart event until the first span + // sweep. If we don't sweep anything, don't emit any events. + _p_ := getg().m.p.ptr() + if _p_.traceSweep { + throw("double traceGCSweepStart") + } + _p_.traceSweep, _p_.traceSwept, _p_.traceReclaimed = true, 0, 0 +} + +// traceGCSweepSpan traces the sweep of a single page. +// +// This may be called outside a traceGCSweepStart/traceGCSweepDone +// pair; however, it will not emit any trace events in this case. +func traceGCSweepSpan(bytesSwept uintptr) { + _p_ := getg().m.p.ptr() + if _p_.traceSweep { + if _p_.traceSwept == 0 { + traceEvent(traceEvGCSweepStart, 1) + } + _p_.traceSwept += bytesSwept + } +} + +func traceGCSweepDone() { + _p_ := getg().m.p.ptr() + if !_p_.traceSweep { + throw("missing traceGCSweepStart") + } + if _p_.traceSwept != 0 { + traceEvent(traceEvGCSweepDone, -1, uint64(_p_.traceSwept), uint64(_p_.traceReclaimed)) + } + _p_.traceSweep = false +} + +func traceGCMarkAssistStart() { + traceEvent(traceEvGCMarkAssistStart, 1) +} + +func traceGCMarkAssistDone() { + traceEvent(traceEvGCMarkAssistDone, -1) +} + +func traceGoCreate(newg *g, pc uintptr) { + newg.traceseq = 0 + newg.tracelastp = getg().m.p + // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum. 
+ id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum}) + traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id)) +} + +func traceGoStart() { + _g_ := getg().m.curg + _p_ := _g_.m.p + _g_.traceseq++ + if _p_.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker { + traceEvent(traceEvGoStartLabel, -1, uint64(_g_.goid), _g_.traceseq, trace.markWorkerLabels[_p_.ptr().gcMarkWorkerMode]) + } else if _g_.tracelastp == _p_ { + traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid)) + } else { + _g_.tracelastp = _p_ + traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq) + } +} + +func traceGoEnd() { + traceEvent(traceEvGoEnd, -1) +} + +func traceGoSched() { + _g_ := getg() + _g_.tracelastp = _g_.m.p + traceEvent(traceEvGoSched, 1) +} + +func traceGoPreempt() { + _g_ := getg() + _g_.tracelastp = _g_.m.p + traceEvent(traceEvGoPreempt, 1) +} + +func traceGoPark(traceEv byte, skip int) { + if traceEv&traceFutileWakeup != 0 { + traceEvent(traceEvFutileWakeup, -1) + } + traceEvent(traceEv & ^traceFutileWakeup, skip) +} + +func traceGoUnpark(gp *g, skip int) { + _p_ := getg().m.p + gp.traceseq++ + if gp.tracelastp == _p_ { + traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid)) + } else { + gp.tracelastp = _p_ + traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq) + } +} + +func traceGoSysCall() { + traceEvent(traceEvGoSysCall, 1) +} + +func traceGoSysExit(ts int64) { + if ts != 0 && ts < trace.ticksStart { + // There is a race between the code that initializes sysexitticks + // (in exitsyscall, which runs without a P, and therefore is not + // stopped with the rest of the world) and the code that initializes + // a new trace. The recorded sysexitticks must therefore be treated + // as "best effort". If they are valid for this trace, then great, + // use them for greater accuracy. But if they're not valid for this + // trace, assume that the trace was started after the actual syscall + // exit (but before we actually managed to start the goroutine, + // aka right now), and assign a fresh time stamp to keep the log consistent. + ts = 0 + } + _g_ := getg().m.curg + _g_.traceseq++ + _g_.tracelastp = _g_.m.p + traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv) +} + +func traceGoSysBlock(pp *p) { + // Sysmon and stopTheWorld can declare syscalls running on remote Ps as blocked, + // to handle this we temporary employ the P. + mp := acquirem() + oldp := mp.p + mp.p.set(pp) + traceEvent(traceEvGoSysBlock, -1) + mp.p = oldp + releasem(mp) +} + +func traceHeapAlloc() { + traceEvent(traceEvHeapAlloc, -1, gcController.heapLive) +} + +func traceHeapGoal() { + if heapGoal := atomic.Load64(&gcController.heapGoal); heapGoal == ^uint64(0) { + // Heap-based triggering is disabled. + traceEvent(traceEvHeapGoal, -1, 0) + } else { + traceEvent(traceEvHeapGoal, -1, heapGoal) + } +} + +// To access runtime functions from runtime/trace. +// See runtime/trace/annotation.go + +//go:linkname trace_userTaskCreate runtime/trace.userTaskCreate +func trace_userTaskCreate(id, parentID uint64, taskType string) { + if !trace.enabled { + return + } + + // Same as in traceEvent. 
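trace_userTaskCreate and the trace_userTaskEnd, trace_userRegion and trace_userLog entry points that follow are the runtime side of the user annotation API in runtime/trace; they record events only while tracing is active. A typical caller-side example (the task, region and log names are arbitrary):

package main

import (
	"context"
	"fmt"
	"runtime/trace"
)

func main() {
	// trace.NewTask reaches trace_userTaskCreate through go:linkname.
	ctx, task := trace.NewTask(context.Background(), "rebuildIndex")
	defer task.End() // -> trace_userTaskEnd

	// trace.WithRegion emits the paired start/end traceEvUserRegion events.
	trace.WithRegion(ctx, "loadShard", func() {
		trace.Log(ctx, "shard", "42") // -> trace_userLog
		fmt.Println("working...")
	})
}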
+ mp, pid, bufp := traceAcquireBuffer() + if !trace.enabled && !mp.startingtrace { + traceReleaseBuffer(pid) + return + } + + typeStringID, bufp := traceString(bufp, pid, taskType) + traceEventLocked(0, mp, pid, bufp, traceEvUserTaskCreate, 3, id, parentID, typeStringID) + traceReleaseBuffer(pid) +} + +//go:linkname trace_userTaskEnd runtime/trace.userTaskEnd +func trace_userTaskEnd(id uint64) { + traceEvent(traceEvUserTaskEnd, 2, id) +} + +//go:linkname trace_userRegion runtime/trace.userRegion +func trace_userRegion(id, mode uint64, name string) { + if !trace.enabled { + return + } + + mp, pid, bufp := traceAcquireBuffer() + if !trace.enabled && !mp.startingtrace { + traceReleaseBuffer(pid) + return + } + + nameStringID, bufp := traceString(bufp, pid, name) + traceEventLocked(0, mp, pid, bufp, traceEvUserRegion, 3, id, mode, nameStringID) + traceReleaseBuffer(pid) +} + +//go:linkname trace_userLog runtime/trace.userLog +func trace_userLog(id uint64, category, message string) { + if !trace.enabled { + return + } + + mp, pid, bufp := traceAcquireBuffer() + if !trace.enabled && !mp.startingtrace { + traceReleaseBuffer(pid) + return + } + + categoryID, bufp := traceString(bufp, pid, category) + + extraSpace := traceBytesPerNumber + len(message) // extraSpace for the value string + traceEventLocked(extraSpace, mp, pid, bufp, traceEvUserLog, 3, id, categoryID) + // traceEventLocked reserved extra space for val and len(val) + // in buf, so buf now has room for the following. + buf := bufp.ptr() + + // double-check the message and its length can fit. + // Otherwise, truncate the message. + slen := len(message) + if room := len(buf.arr) - buf.pos; room < slen+traceBytesPerNumber { + slen = room + } + buf.varint(uint64(slen)) + buf.pos += copy(buf.arr[buf.pos:], message[:slen]) + + traceReleaseBuffer(pid) +} + +// the start PC of a goroutine for tracing purposes. If pc is a wrapper, +// it returns the PC of the wrapped function. Otherwise it returns pc. +func startPCforTrace(pc uintptr) uintptr { + f := findfunc(pc) + if !f.valid() { + return pc // should not happen, but don't care + } + w := funcdata(f, _FUNCDATA_WrapInfo) + if w == nil { + return pc // not a wrapper + } + return f.datap.textAddr(*(*uint32)(w)) +} diff --git a/contrib/go/_std_1.18/src/runtime/traceback.go b/contrib/go/_std_1.18/src/runtime/traceback.go new file mode 100644 index 0000000000..229e3a9105 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/traceback.go @@ -0,0 +1,1436 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/bytealg" + "internal/goarch" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// The code in this file implements stack trace walking for all architectures. +// The most important fact about a given architecture is whether it uses a link register. +// On systems with link registers, the prologue for a non-leaf function stores the +// incoming value of LR at the bottom of the newly allocated stack frame. +// On systems without link registers (x86), the architecture pushes a return PC during +// the call instruction, so the return PC ends up above the stack frame. +// In this file, the return PC is always called LR, no matter how it was found. + +const usesLR = sys.MinFrameSize > 0 + +// Generic traceback. 
Handles runtime stack prints (pcbuf == nil), +// the runtime.Callers function (pcbuf != nil), as well as the garbage +// collector (callback != nil). A little clunky to merge these, but avoids +// duplicating the code and all its subtlety. +// +// The skip argument is only valid with pcbuf != nil and counts the number +// of logical frames to skip rather than physical frames (with inlining, a +// PC in pcbuf can represent multiple calls). +func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max int, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer, flags uint) int { + if skip > 0 && callback != nil { + throw("gentraceback callback cannot be used with non-zero skip") + } + + // Don't call this "g"; it's too easy get "g" and "gp" confused. + if ourg := getg(); ourg == gp && ourg == ourg.m.curg { + // The starting sp has been passed in as a uintptr, and the caller may + // have other uintptr-typed stack references as well. + // If during one of the calls that got us here or during one of the + // callbacks below the stack must be grown, all these uintptr references + // to the stack will not be updated, and gentraceback will continue + // to inspect the old stack memory, which may no longer be valid. + // Even if all the variables were updated correctly, it is not clear that + // we want to expose a traceback that begins on one stack and ends + // on another stack. That could confuse callers quite a bit. + // Instead, we require that gentraceback and any other function that + // accepts an sp for the current goroutine (typically obtained by + // calling getcallersp) must not run on that goroutine's stack but + // instead on the g0 stack. + throw("gentraceback cannot trace user goroutine on its own stack") + } + level, _, _ := gotraceback() + + var ctxt *funcval // Context pointer for unstarted goroutines. See issue #25897. + + if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp. + if gp.syscallsp != 0 { + pc0 = gp.syscallpc + sp0 = gp.syscallsp + if usesLR { + lr0 = 0 + } + } else { + pc0 = gp.sched.pc + sp0 = gp.sched.sp + if usesLR { + lr0 = gp.sched.lr + } + ctxt = (*funcval)(gp.sched.ctxt) + } + } + + nprint := 0 + var frame stkframe + frame.pc = pc0 + frame.sp = sp0 + if usesLR { + frame.lr = lr0 + } + waspanic := false + cgoCtxt := gp.cgoCtxt + printing := pcbuf == nil && callback == nil + + // If the PC is zero, it's likely a nil function call. + // Start in the caller's frame. + if frame.pc == 0 { + if usesLR { + frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp)) + frame.lr = 0 + } else { + frame.pc = uintptr(*(*uintptr)(unsafe.Pointer(frame.sp))) + frame.sp += goarch.PtrSize + } + } + + // runtime/internal/atomic functions call into kernel helpers on + // arm < 7. See runtime/internal/atomic/sys_linux_arm.s. + // + // Start in the caller's frame. + if GOARCH == "arm" && goarm < 7 && GOOS == "linux" && frame.pc&0xffff0000 == 0xffff0000 { + // Note that the calls are simple BL without pushing the return + // address, so we use LR directly. + // + // The kernel helpers are frameless leaf functions, so SP and + // LR are not touched. 
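+		// The helpers in question are the Linux "kuser" entry points mapped
+		// near the top of the address space (for example __kuser_cmpxchg at
+		// 0xffff0fc0), which is what the 0xffff0000 check above matches.
+		// Because they build no frame, LR still holds the Go caller's return
+		// address and unwinding resumes from it directly below.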
+ frame.pc = frame.lr + frame.lr = 0 + } + + f := findfunc(frame.pc) + if !f.valid() { + if callback != nil || printing { + print("runtime: unknown pc ", hex(frame.pc), "\n") + tracebackHexdump(gp.stack, &frame, 0) + } + if callback != nil { + throw("unknown pc") + } + return 0 + } + frame.fn = f + + var cache pcvalueCache + + lastFuncID := funcID_normal + n := 0 + for n < max { + // Typically: + // pc is the PC of the running function. + // sp is the stack pointer at that program counter. + // fp is the frame pointer (caller's stack pointer) at that program counter, or nil if unknown. + // stk is the stack containing sp. + // The caller's program counter is lr, unless lr is zero, in which case it is *(uintptr*)sp. + f = frame.fn + if f.pcsp == 0 { + // No frame information, must be external function, like race support. + // See golang.org/issue/13568. + break + } + + // Compute function info flags. + flag := f.flag + if f.funcID == funcID_cgocallback { + // cgocallback does write SP to switch from the g0 to the curg stack, + // but it carefully arranges that during the transition BOTH stacks + // have cgocallback frame valid for unwinding through. + // So we don't need to exclude it with the other SP-writing functions. + flag &^= funcFlag_SPWRITE + } + if frame.pc == pc0 && frame.sp == sp0 && pc0 == gp.syscallpc && sp0 == gp.syscallsp { + // Some Syscall functions write to SP, but they do so only after + // saving the entry PC/SP using entersyscall. + // Since we are using the entry PC/SP, the later SP write doesn't matter. + flag &^= funcFlag_SPWRITE + } + + // Found an actual function. + // Derive frame pointer and link register. + if frame.fp == 0 { + // Jump over system stack transitions. If we're on g0 and there's a user + // goroutine, try to jump. Otherwise this is a regular call. + if flags&_TraceJumpStack != 0 && gp == gp.m.g0 && gp.m.curg != nil { + switch f.funcID { + case funcID_morestack: + // morestack does not return normally -- newstack() + // gogo's to curg.sched. Match that. + // This keeps morestack() from showing up in the backtrace, + // but that makes some sense since it'll never be returned + // to. + frame.pc = gp.m.curg.sched.pc + frame.fn = findfunc(frame.pc) + f = frame.fn + flag = f.flag + frame.sp = gp.m.curg.sched.sp + cgoCtxt = gp.m.curg.cgoCtxt + case funcID_systemstack: + // systemstack returns normally, so just follow the + // stack transition. + frame.sp = gp.m.curg.sched.sp + cgoCtxt = gp.m.curg.cgoCtxt + flag &^= funcFlag_SPWRITE + } + } + frame.fp = frame.sp + uintptr(funcspdelta(f, frame.pc, &cache)) + if !usesLR { + // On x86, call instruction pushes return PC before entering new function. + frame.fp += goarch.PtrSize + } + } + var flr funcInfo + if flag&funcFlag_TOPFRAME != 0 { + // This function marks the top of the stack. Stop the traceback. + frame.lr = 0 + flr = funcInfo{} + } else if flag&funcFlag_SPWRITE != 0 && (callback == nil || n > 0) { + // The function we are in does a write to SP that we don't know + // how to encode in the spdelta table. Examples include context + // switch routines like runtime.gogo but also any code that switches + // to the g0 stack to run host C code. Since we can't reliably unwind + // the SP (we might not even be on the stack we think we are), + // we stop the traceback here. + // This only applies for profiling signals (callback == nil). + // + // For a GC stack traversal (callback != nil), we should only see + // a function when it has voluntarily preempted itself on entry + // during the stack growth check. 
In that case, the function has + // not yet had a chance to do any writes to SP and is safe to unwind. + // isAsyncSafePoint does not allow assembly functions to be async preempted, + // and preemptPark double-checks that SPWRITE functions are not async preempted. + // So for GC stack traversal we leave things alone (this if body does not execute for n == 0) + // at the bottom frame of the stack. But farther up the stack we'd better not + // find any. + if callback != nil { + println("traceback: unexpected SPWRITE function", funcname(f)) + throw("traceback") + } + frame.lr = 0 + flr = funcInfo{} + } else { + var lrPtr uintptr + if usesLR { + if n == 0 && frame.sp < frame.fp || frame.lr == 0 { + lrPtr = frame.sp + frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr)) + } + } else { + if frame.lr == 0 { + lrPtr = frame.fp - goarch.PtrSize + frame.lr = uintptr(*(*uintptr)(unsafe.Pointer(lrPtr))) + } + } + flr = findfunc(frame.lr) + if !flr.valid() { + // This happens if you get a profiling interrupt at just the wrong time. + // In that context it is okay to stop early. + // But if callback is set, we're doing a garbage collection and must + // get everything, so crash loudly. + doPrint := printing + if doPrint && gp.m.incgo && f.funcID == funcID_sigpanic { + // We can inject sigpanic + // calls directly into C code, + // in which case we'll see a C + // return PC. Don't complain. + doPrint = false + } + if callback != nil || doPrint { + print("runtime: unexpected return pc for ", funcname(f), " called from ", hex(frame.lr), "\n") + tracebackHexdump(gp.stack, &frame, lrPtr) + } + if callback != nil { + throw("unknown caller pc") + } + } + } + + frame.varp = frame.fp + if !usesLR { + // On x86, call instruction pushes return PC before entering new function. + frame.varp -= goarch.PtrSize + } + + // For architectures with frame pointers, if there's + // a frame, then there's a saved frame pointer here. + // + // NOTE: This code is not as general as it looks. + // On x86, the ABI is to save the frame pointer word at the + // top of the stack frame, so we have to back down over it. + // On arm64, the frame pointer should be at the bottom of + // the stack (with R29 (aka FP) = RSP), in which case we would + // not want to do the subtraction here. But we started out without + // any frame pointer, and when we wanted to add it, we didn't + // want to break all the assembly doing direct writes to 8(RSP) + // to set the first parameter to a called function. + // So we decided to write the FP link *below* the stack pointer + // (with R29 = RSP - 8 in Go functions). + // This is technically ABI-compatible but not standard. + // And it happens to end up mimicking the x86 layout. + // Other architectures may make different decisions. + if frame.varp > frame.sp && framepointer_enabled { + frame.varp -= goarch.PtrSize + } + + // Derive size of arguments. + // Most functions have a fixed-size argument block, + // so we can use metadata about the function f. + // Not all, though: there are some variadic functions + // in package runtime and reflect, and for those we use call-specific + // metadata recorded by f's caller. + if callback != nil || printing { + frame.argp = frame.fp + sys.MinFrameSize + var ok bool + frame.arglen, frame.argmap, ok = getArgInfoFast(f, callback != nil) + if !ok { + frame.arglen, frame.argmap = getArgInfo(&frame, f, callback != nil, ctxt) + } + } + ctxt = nil // ctxt is only needed to get arg maps for the topmost frame + + // Determine frame's 'continuation PC', where it can continue. 
+ // Normally this is the return address on the stack, but if sigpanic + // is immediately below this function on the stack, then the frame + // stopped executing due to a trap, and frame.pc is probably not + // a safe point for looking up liveness information. In this panicking case, + // the function either doesn't return at all (if it has no defers or if the + // defers do not recover) or it returns from one of the calls to + // deferproc a second time (if the corresponding deferred func recovers). + // In the latter case, use a deferreturn call site as the continuation pc. + frame.continpc = frame.pc + if waspanic { + if frame.fn.deferreturn != 0 { + frame.continpc = frame.fn.entry() + uintptr(frame.fn.deferreturn) + 1 + // Note: this may perhaps keep return variables alive longer than + // strictly necessary, as we are using "function has a defer statement" + // as a proxy for "function actually deferred something". It seems + // to be a minor drawback. (We used to actually look through the + // gp._defer for a defer corresponding to this function, but that + // is hard to do with defer records on the stack during a stack copy.) + // Note: the +1 is to offset the -1 that + // stack.go:getStackMap does to back up a return + // address make sure the pc is in the CALL instruction. + } else { + frame.continpc = 0 + } + } + + if callback != nil { + if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) { + return n + } + } + + if pcbuf != nil { + pc := frame.pc + // backup to CALL instruction to read inlining info (same logic as below) + tracepc := pc + // Normally, pc is a return address. In that case, we want to look up + // file/line information using pc-1, because that is the pc of the + // call instruction (more precisely, the last byte of the call instruction). + // Callers expect the pc buffer to contain return addresses and do the + // same -1 themselves, so we keep pc unchanged. + // When the pc is from a signal (e.g. profiler or segv) then we want + // to look up file/line information using pc, and we store pc+1 in the + // pc buffer so callers can unconditionally subtract 1 before looking up. + // See issue 34123. + // The pc can be at function entry when the frame is initialized without + // actually running code, like runtime.mstart. + if (n == 0 && flags&_TraceTrap != 0) || waspanic || pc == f.entry() { + pc++ + } else { + tracepc-- + } + + // If there is inlining info, record the inner frames. + if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { + inltree := (*[1 << 20]inlinedCall)(inldata) + for { + ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, &cache) + if ix < 0 { + break + } + if inltree[ix].funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) { + // ignore wrappers + } else if skip > 0 { + skip-- + } else if n < max { + (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc + n++ + } + lastFuncID = inltree[ix].funcID + // Back up to an instruction in the "caller". + tracepc = frame.fn.entry() + uintptr(inltree[ix].parentPc) + pc = tracepc + 1 + } + } + // Record the main frame. + if f.funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) { + // Ignore wrapper functions (except when they trigger panics). + } else if skip > 0 { + skip-- + } else if n < max { + (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc + n++ + } + lastFuncID = f.funcID + n-- // offset n++ below + } + + if printing { + // assume skip=0 for printing. + // + // Never elide wrappers if we haven't printed + // any frames. 
And don't elide wrappers that + // called panic rather than the wrapped + // function. Otherwise, leave them out. + + // backup to CALL instruction to read inlining info (same logic as below) + tracepc := frame.pc + if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry() && !waspanic { + tracepc-- + } + // If there is inlining info, print the inner frames. + if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { + inltree := (*[1 << 20]inlinedCall)(inldata) + var inlFunc _func + inlFuncInfo := funcInfo{&inlFunc, f.datap} + for { + ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, nil) + if ix < 0 { + break + } + + // Create a fake _func for the + // inlined function. + inlFunc.nameoff = inltree[ix].func_ + inlFunc.funcID = inltree[ix].funcID + + if (flags&_TraceRuntimeFrames) != 0 || showframe(inlFuncInfo, gp, nprint == 0, inlFuncInfo.funcID, lastFuncID) { + name := funcname(inlFuncInfo) + file, line := funcline(f, tracepc) + print(name, "(...)\n") + print("\t", file, ":", line, "\n") + nprint++ + } + lastFuncID = inltree[ix].funcID + // Back up to an instruction in the "caller". + tracepc = frame.fn.entry() + uintptr(inltree[ix].parentPc) + } + } + if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0, f.funcID, lastFuncID) { + // Print during crash. + // main(0x1, 0x2, 0x3) + // /home/rsc/go/src/runtime/x.go:23 +0xf + // + name := funcname(f) + file, line := funcline(f, tracepc) + if name == "runtime.gopanic" { + name = "panic" + } + print(name, "(") + argp := unsafe.Pointer(frame.argp) + printArgs(f, argp, tracepc) + print(")\n") + print("\t", file, ":", line) + if frame.pc > f.entry() { + print(" +", hex(frame.pc-f.entry())) + } + if gp.m != nil && gp.m.throwing > 0 && gp == gp.m.curg || level >= 2 { + print(" fp=", hex(frame.fp), " sp=", hex(frame.sp), " pc=", hex(frame.pc)) + } + print("\n") + nprint++ + } + lastFuncID = f.funcID + } + n++ + + if f.funcID == funcID_cgocallback && len(cgoCtxt) > 0 { + ctxt := cgoCtxt[len(cgoCtxt)-1] + cgoCtxt = cgoCtxt[:len(cgoCtxt)-1] + + // skip only applies to Go frames. + // callback != nil only used when we only care + // about Go frames. + if skip == 0 && callback == nil { + n = tracebackCgoContext(pcbuf, printing, ctxt, n, max) + } + } + + waspanic = f.funcID == funcID_sigpanic + injectedCall := waspanic || f.funcID == funcID_asyncPreempt || f.funcID == funcID_debugCallV2 + + // Do not unwind past the bottom of the stack. + if !flr.valid() { + break + } + + // Unwind to next frame. + frame.fn = flr + frame.pc = frame.lr + frame.lr = 0 + frame.sp = frame.fp + frame.fp = 0 + frame.argmap = nil + + // On link register architectures, sighandler saves the LR on stack + // before faking a call. + if usesLR && injectedCall { + x := *(*uintptr)(unsafe.Pointer(frame.sp)) + frame.sp += alignUp(sys.MinFrameSize, sys.StackAlign) + f = findfunc(frame.pc) + frame.fn = f + if !f.valid() { + frame.pc = x + } else if funcspdelta(f, frame.pc, &cache) == 0 { + frame.lr = x + } + } + } + + if printing { + n = nprint + } + + // Note that panic != nil is okay here: there can be leftover panics, + // because the defers on the panic stack do not nest in frame order as + // they do on the defer stack. 
If you have: + // + // frame 1 defers d1 + // frame 2 defers d2 + // frame 3 defers d3 + // frame 4 panics + // frame 4's panic starts running defers + // frame 5, running d3, defers d4 + // frame 5 panics + // frame 5's panic starts running defers + // frame 6, running d4, garbage collects + // frame 6, running d2, garbage collects + // + // During the execution of d4, the panic stack is d4 -> d3, which + // is nested properly, and we'll treat frame 3 as resumable, because we + // can find d3. (And in fact frame 3 is resumable. If d4 recovers + // and frame 5 continues running, d3, d3 can recover and we'll + // resume execution in (returning from) frame 3.) + // + // During the execution of d2, however, the panic stack is d2 -> d3, + // which is inverted. The scan will match d2 to frame 2 but having + // d2 on the stack until then means it will not match d3 to frame 3. + // This is okay: if we're running d2, then all the defers after d2 have + // completed and their corresponding frames are dead. Not finding d3 + // for frame 3 means we'll set frame 3's continpc == 0, which is correct + // (frame 3 is dead). At the end of the walk the panic stack can thus + // contain defers (d3 in this case) for dead frames. The inversion here + // always indicates a dead frame, and the effect of the inversion on the + // scan is to hide those dead frames, so the scan is still okay: + // what's left on the panic stack are exactly (and only) the dead frames. + // + // We require callback != nil here because only when callback != nil + // do we know that gentraceback is being called in a "must be correct" + // context as opposed to a "best effort" context. The tracebacks with + // callbacks only happen when everything is stopped nicely. + // At other times, such as when gathering a stack for a profiling signal + // or when printing a traceback during a crash, everything may not be + // stopped nicely, and the stack walk may not be able to complete. + if callback != nil && n < max && frame.sp != gp.stktopsp { + print("runtime: g", gp.goid, ": frame.sp=", hex(frame.sp), " top=", hex(gp.stktopsp), "\n") + print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "] n=", n, " max=", max, "\n") + throw("traceback did not unwind completely") + } + + return n +} + +// printArgs prints function arguments in traceback. +func printArgs(f funcInfo, argp unsafe.Pointer, pc uintptr) { + // The "instruction" of argument printing is encoded in _FUNCDATA_ArgInfo. + // See cmd/compile/internal/ssagen.emitArgInfo for the description of the + // encoding. + // These constants need to be in sync with the compiler. 
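+	// A sketch of the byte stream this function decodes (offsets and sizes
+	// are illustrative): for func f(a int64, b struct{ x, y int32 }) the
+	// encoder could emit
+	//
+	//	0x00 0x08,  0xfe,  0x08 0x04,  0x0c 0x04,  0xfd,  0xff
+	//	  a          "{"     b.x         b.y         "}"    end
+	//
+	// i.e. (offset, size) pairs for scalar components, bracketed by
+	// _startAgg/_endAgg for aggregates and terminated by _endSeq.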
+ const ( + _endSeq = 0xff + _startAgg = 0xfe + _endAgg = 0xfd + _dotdotdot = 0xfc + _offsetTooLarge = 0xfb + ) + + const ( + limit = 10 // print no more than 10 args/components + maxDepth = 5 // no more than 5 layers of nesting + maxLen = (maxDepth*3+2)*limit + 1 // max length of _FUNCDATA_ArgInfo (see the compiler side for reasoning) + ) + + p := (*[maxLen]uint8)(funcdata(f, _FUNCDATA_ArgInfo)) + if p == nil { + return + } + + liveInfo := funcdata(f, _FUNCDATA_ArgLiveInfo) + liveIdx := pcdatavalue(f, _PCDATA_ArgLiveIndex, pc, nil) + startOffset := uint8(0xff) // smallest offset that needs liveness info (slots with a lower offset is always live) + if liveInfo != nil { + startOffset = *(*uint8)(liveInfo) + } + + isLive := func(off, slotIdx uint8) bool { + if liveInfo == nil || liveIdx <= 0 { + return true // no liveness info, always live + } + if off < startOffset { + return true + } + bits := *(*uint8)(add(liveInfo, uintptr(liveIdx)+uintptr(slotIdx/8))) + return bits&(1<<(slotIdx%8)) != 0 + } + + print1 := func(off, sz, slotIdx uint8) { + x := readUnaligned64(add(argp, uintptr(off))) + // mask out irrelevant bits + if sz < 8 { + shift := 64 - sz*8 + if goarch.BigEndian { + x = x >> shift + } else { + x = x << shift >> shift + } + } + print(hex(x)) + if !isLive(off, slotIdx) { + print("?") + } + } + + start := true + printcomma := func() { + if !start { + print(", ") + } + } + pi := 0 + slotIdx := uint8(0) // register arg spill slot index +printloop: + for { + o := p[pi] + pi++ + switch o { + case _endSeq: + break printloop + case _startAgg: + printcomma() + print("{") + start = true + continue + case _endAgg: + print("}") + case _dotdotdot: + printcomma() + print("...") + case _offsetTooLarge: + printcomma() + print("_") + default: + printcomma() + sz := p[pi] + pi++ + print1(o, sz, slotIdx) + if o >= startOffset { + slotIdx++ + } + } + start = false + } +} + +// reflectMethodValue is a partial duplicate of reflect.makeFuncImpl +// and reflect.methodValue. +type reflectMethodValue struct { + fn uintptr + stack *bitvector // ptrmap for both args and results + argLen uintptr // just args +} + +// getArgInfoFast returns the argument frame information for a call to f. +// It is short and inlineable. However, it does not handle all functions. +// If ok reports false, you must call getArgInfo instead. +// TODO(josharian): once we do mid-stack inlining, +// call getArgInfo directly from getArgInfoFast and stop returning an ok bool. +func getArgInfoFast(f funcInfo, needArgMap bool) (arglen uintptr, argmap *bitvector, ok bool) { + return uintptr(f.args), nil, !(needArgMap && f.args == _ArgsSizeUnknown) +} + +// getArgInfo returns the argument frame information for a call to f +// with call frame frame. +// +// This is used for both actual calls with active stack frames and for +// deferred calls or goroutines that are not yet executing. If this is an actual +// call, ctxt must be nil (getArgInfo will retrieve what it needs from +// the active stack frame). If this is a deferred call or unstarted goroutine, +// ctxt must be the function object that was deferred or go'd. +func getArgInfo(frame *stkframe, f funcInfo, needArgMap bool, ctxt *funcval) (arglen uintptr, argmap *bitvector) { + arglen = uintptr(f.args) + if needArgMap && f.args == _ArgsSizeUnknown { + // Extract argument bitmaps for reflect stubs from the calls they made to reflect. + switch funcname(f) { + case "reflect.makeFuncStub", "reflect.methodValueCall": + // These take a *reflect.methodValue as their + // context register. 
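+			// Frame layout relied on below (a sketch): the stub spills its
+			// context register into its argument area, so
+			//
+			//	frame.sp + sys.MinFrameSize  -> spilled *reflect.methodValue
+			//	     ... + 4*goarch.PtrSize  -> retValid, set by reflect once
+			//	                                the results have been copied in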
+ var mv *reflectMethodValue + var retValid bool + if ctxt != nil { + // This is not an actual call, but a + // deferred call or an unstarted goroutine. + // The function value is itself the *reflect.methodValue. + mv = (*reflectMethodValue)(unsafe.Pointer(ctxt)) + } else { + // This is a real call that took the + // *reflect.methodValue as its context + // register and immediately saved it + // to 0(SP). Get the methodValue from + // 0(SP). + arg0 := frame.sp + sys.MinFrameSize + mv = *(**reflectMethodValue)(unsafe.Pointer(arg0)) + // Figure out whether the return values are valid. + // Reflect will update this value after it copies + // in the return values. + retValid = *(*bool)(unsafe.Pointer(arg0 + 4*goarch.PtrSize)) + } + if mv.fn != f.entry() { + print("runtime: confused by ", funcname(f), "\n") + throw("reflect mismatch") + } + bv := mv.stack + arglen = uintptr(bv.n * goarch.PtrSize) + if !retValid { + arglen = uintptr(mv.argLen) &^ (goarch.PtrSize - 1) + } + argmap = bv + } + } + return +} + +// tracebackCgoContext handles tracing back a cgo context value, from +// the context argument to setCgoTraceback, for the gentraceback +// function. It returns the new value of n. +func tracebackCgoContext(pcbuf *uintptr, printing bool, ctxt uintptr, n, max int) int { + var cgoPCs [32]uintptr + cgoContextPCs(ctxt, cgoPCs[:]) + var arg cgoSymbolizerArg + anySymbolized := false + for _, pc := range cgoPCs { + if pc == 0 || n >= max { + break + } + if pcbuf != nil { + (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc + } + if printing { + if cgoSymbolizer == nil { + print("non-Go function at pc=", hex(pc), "\n") + } else { + c := printOneCgoTraceback(pc, max-n, &arg) + n += c - 1 // +1 a few lines down + anySymbolized = true + } + } + n++ + } + if anySymbolized { + arg.pc = 0 + callCgoSymbolizer(&arg) + } + return n +} + +func printcreatedby(gp *g) { + // Show what created goroutine, except main goroutine (goid 1). + pc := gp.gopc + f := findfunc(pc) + if f.valid() && showframe(f, gp, false, funcID_normal, funcID_normal) && gp.goid != 1 { + printcreatedby1(f, pc) + } +} + +func printcreatedby1(f funcInfo, pc uintptr) { + print("created by ", funcname(f), "\n") + tracepc := pc // back up to CALL instruction for funcline. + if pc > f.entry() { + tracepc -= sys.PCQuantum + } + file, line := funcline(f, tracepc) + print("\t", file, ":", line) + if pc > f.entry() { + print(" +", hex(pc-f.entry())) + } + print("\n") +} + +func traceback(pc, sp, lr uintptr, gp *g) { + traceback1(pc, sp, lr, gp, 0) +} + +// tracebacktrap is like traceback but expects that the PC and SP were obtained +// from a trap, not from gp->sched or gp->syscallpc/gp->syscallsp or getcallerpc/getcallersp. +// Because they are from a trap instead of from a saved pair, +// the initial PC must not be rewound to the previous instruction. +// (All the saved pairs record a PC that is a return address, so we +// rewind it into the CALL instruction.) +// If gp.m.libcall{g,pc,sp} information is available, it uses that information in preference to +// the pc/sp/lr passed in. +func tracebacktrap(pc, sp, lr uintptr, gp *g) { + if gp.m.libcallsp != 0 { + // We're in C code somewhere, traceback from the saved position. + traceback1(gp.m.libcallpc, gp.m.libcallsp, 0, gp.m.libcallg.ptr(), 0) + return + } + traceback1(pc, sp, lr, gp, _TraceTrap) +} + +func traceback1(pc, sp, lr uintptr, gp *g, flags uint) { + // If the goroutine is in cgo, and we have a cgo traceback, print that. 
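+	// When a symbolizer is registered, the C frames copied out here are
+	// printed ahead of the Go traceback in a form like (name, path and
+	// address illustrative):
+	//
+	//	mylib_compute
+	//		/src/mylib/compute.c:42 pc=0x7f3a1234
+	//
+	// See printCgoTraceback and printOneCgoTraceback below for the exact format.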
+ if iscgo && gp.m != nil && gp.m.ncgo > 0 && gp.syscallsp != 0 && gp.m.cgoCallers != nil && gp.m.cgoCallers[0] != 0 { + // Lock cgoCallers so that a signal handler won't + // change it, copy the array, reset it, unlock it. + // We are locked to the thread and are not running + // concurrently with a signal handler. + // We just have to stop a signal handler from interrupting + // in the middle of our copy. + atomic.Store(&gp.m.cgoCallersUse, 1) + cgoCallers := *gp.m.cgoCallers + gp.m.cgoCallers[0] = 0 + atomic.Store(&gp.m.cgoCallersUse, 0) + + printCgoTraceback(&cgoCallers) + } + + if readgstatus(gp)&^_Gscan == _Gsyscall { + // Override registers if blocked in system call. + pc = gp.syscallpc + sp = gp.syscallsp + flags &^= _TraceTrap + } + if gp.m != nil && gp.m.vdsoSP != 0 { + // Override registers if running in VDSO. This comes after the + // _Gsyscall check to cover VDSO calls after entersyscall. + pc = gp.m.vdsoPC + sp = gp.m.vdsoSP + flags &^= _TraceTrap + } + + // Print traceback. By default, omits runtime frames. + // If that means we print nothing at all, repeat forcing all frames printed. + n := gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags) + if n == 0 && (flags&_TraceRuntimeFrames) == 0 { + n = gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags|_TraceRuntimeFrames) + } + if n == _TracebackMaxFrames { + print("...additional frames elided...\n") + } + printcreatedby(gp) + + if gp.ancestors == nil { + return + } + for _, ancestor := range *gp.ancestors { + printAncestorTraceback(ancestor) + } +} + +// printAncestorTraceback prints the traceback of the given ancestor. +// TODO: Unify this with gentraceback and CallersFrames. +func printAncestorTraceback(ancestor ancestorInfo) { + print("[originating from goroutine ", ancestor.goid, "]:\n") + for fidx, pc := range ancestor.pcs { + f := findfunc(pc) // f previously validated + if showfuncinfo(f, fidx == 0, funcID_normal, funcID_normal) { + printAncestorTracebackFuncInfo(f, pc) + } + } + if len(ancestor.pcs) == _TracebackMaxFrames { + print("...additional frames elided...\n") + } + // Show what created goroutine, except main goroutine (goid 1). + f := findfunc(ancestor.gopc) + if f.valid() && showfuncinfo(f, false, funcID_normal, funcID_normal) && ancestor.goid != 1 { + printcreatedby1(f, ancestor.gopc) + } +} + +// printAncestorTraceback prints the given function info at a given pc +// within an ancestor traceback. The precision of this info is reduced +// due to only have access to the pcs at the time of the caller +// goroutine being created. 
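+//
+// With GODEBUG=tracebackancestors=N set, output produced through this path
+// looks like (goroutine id, names and locations illustrative):
+//
+//	[originating from goroutine 5]:
+//	main.startWorkers(...)
+//		/src/app/main.go:27 +0x4e
+//
+// Argument values are not recorded for ancestor goroutines, hence the "(...)".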
+func printAncestorTracebackFuncInfo(f funcInfo, pc uintptr) { + name := funcname(f) + if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil { + inltree := (*[1 << 20]inlinedCall)(inldata) + ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil) + if ix >= 0 { + name = funcnameFromNameoff(f, inltree[ix].func_) + } + } + file, line := funcline(f, pc) + if name == "runtime.gopanic" { + name = "panic" + } + print(name, "(...)\n") + print("\t", file, ":", line) + if pc > f.entry() { + print(" +", hex(pc-f.entry())) + } + print("\n") +} + +func callers(skip int, pcbuf []uintptr) int { + sp := getcallersp() + pc := getcallerpc() + gp := getg() + var n int + systemstack(func() { + n = gentraceback(pc, sp, 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0) + }) + return n +} + +func gcallers(gp *g, skip int, pcbuf []uintptr) int { + return gentraceback(^uintptr(0), ^uintptr(0), 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0) +} + +// showframe reports whether the frame with the given characteristics should +// be printed during a traceback. +func showframe(f funcInfo, gp *g, firstFrame bool, funcID, childID funcID) bool { + g := getg() + if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) { + return true + } + return showfuncinfo(f, firstFrame, funcID, childID) +} + +// showfuncinfo reports whether a function with the given characteristics should +// be printed during a traceback. +func showfuncinfo(f funcInfo, firstFrame bool, funcID, childID funcID) bool { + // Note that f may be a synthesized funcInfo for an inlined + // function, in which case only nameoff and funcID are set. + + level, _, _ := gotraceback() + if level > 1 { + // Show all frames. + return true + } + + if !f.valid() { + return false + } + + if funcID == funcID_wrapper && elideWrapperCalling(childID) { + return false + } + + name := funcname(f) + + // Special case: always show runtime.gopanic frame + // in the middle of a stack trace, so that we can + // see the boundary between ordinary code and + // panic-induced deferred code. + // See golang.org/issue/5832. + if name == "runtime.gopanic" && !firstFrame { + return true + } + + return bytealg.IndexByteString(name, '.') >= 0 && (!hasPrefix(name, "runtime.") || isExportedRuntime(name)) +} + +// isExportedRuntime reports whether name is an exported runtime function. +// It is only for runtime functions, so ASCII A-Z is fine. +func isExportedRuntime(name string) bool { + const n = len("runtime.") + return len(name) > n && name[:n] == "runtime." && 'A' <= name[n] && name[n] <= 'Z' +} + +// elideWrapperCalling reports whether a wrapper function that called +// function id should be elided from stack traces. +func elideWrapperCalling(id funcID) bool { + // If the wrapper called a panic function instead of the + // wrapped function, we want to include it in stacks. + return !(id == funcID_gopanic || id == funcID_sigpanic || id == funcID_panicwrap) +} + +var gStatusStrings = [...]string{ + _Gidle: "idle", + _Grunnable: "runnable", + _Grunning: "running", + _Gsyscall: "syscall", + _Gwaiting: "waiting", + _Gdead: "dead", + _Gcopystack: "copystack", + _Gpreempted: "preempted", +} + +func goroutineheader(gp *g) { + gpstatus := readgstatus(gp) + + isScan := gpstatus&_Gscan != 0 + gpstatus &^= _Gscan // drop the scan bit + + // Basic string status + var status string + if 0 <= gpstatus && gpstatus < uint32(len(gStatusStrings)) { + status = gStatusStrings[gpstatus] + } else { + status = "???" + } + + // Override. 
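+	// E.g. a goroutine blocked on a channel receive for several minutes and
+	// locked to its thread gets a header like (values illustrative):
+	//
+	//	goroutine 17 [chan receive, 3 minutes, locked to thread]:
+	//
+	// where "chan receive" is the wait reason substituted here.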
+ if gpstatus == _Gwaiting && gp.waitreason != waitReasonZero { + status = gp.waitreason.String() + } + + // approx time the G is blocked, in minutes + var waitfor int64 + if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && gp.waitsince != 0 { + waitfor = (nanotime() - gp.waitsince) / 60e9 + } + print("goroutine ", gp.goid, " [", status) + if isScan { + print(" (scan)") + } + if waitfor >= 1 { + print(", ", waitfor, " minutes") + } + if gp.lockedm != 0 { + print(", locked to thread") + } + print("]:\n") +} + +func tracebackothers(me *g) { + level, _, _ := gotraceback() + + // Show the current goroutine first, if we haven't already. + curgp := getg().m.curg + if curgp != nil && curgp != me { + print("\n") + goroutineheader(curgp) + traceback(^uintptr(0), ^uintptr(0), 0, curgp) + } + + // We can't call locking forEachG here because this may be during fatal + // throw/panic, where locking could be out-of-order or a direct + // deadlock. + // + // Instead, use forEachGRace, which requires no locking. We don't lock + // against concurrent creation of new Gs, but even with allglock we may + // miss Gs created after this loop. + forEachGRace(func(gp *g) { + if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 { + return + } + print("\n") + goroutineheader(gp) + // Note: gp.m == g.m occurs when tracebackothers is + // called from a signal handler initiated during a + // systemstack call. The original G is still in the + // running state, and we want to print its stack. + if gp.m != getg().m && readgstatus(gp)&^_Gscan == _Grunning { + print("\tgoroutine running on other thread; stack unavailable\n") + printcreatedby(gp) + } else { + traceback(^uintptr(0), ^uintptr(0), 0, gp) + } + }) +} + +// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp +// for debugging purposes. If the address bad is included in the +// hexdumped range, it will mark it as well. +func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) { + const expand = 32 * goarch.PtrSize + const maxExpand = 256 * goarch.PtrSize + // Start around frame.sp. + lo, hi := frame.sp, frame.sp + // Expand to include frame.fp. + if frame.fp != 0 && frame.fp < lo { + lo = frame.fp + } + if frame.fp != 0 && frame.fp > hi { + hi = frame.fp + } + // Expand a bit more. + lo, hi = lo-expand, hi+expand + // But don't go too far from frame.sp. + if lo < frame.sp-maxExpand { + lo = frame.sp - maxExpand + } + if hi > frame.sp+maxExpand { + hi = frame.sp + maxExpand + } + // And don't go outside the stack bounds. + if lo < stk.lo { + lo = stk.lo + } + if hi > stk.hi { + hi = stk.hi + } + + // Print the hex dump. + print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n") + hexdumpWords(lo, hi, func(p uintptr) byte { + switch p { + case frame.fp: + return '>' + case frame.sp: + return '<' + case bad: + return '!' + } + return 0 + }) +} + +// isSystemGoroutine reports whether the goroutine g must be omitted +// in stack dumps and deadlock detector. This is any goroutine that +// starts at a runtime.* entry point, except for runtime.main, +// runtime.handleAsyncEvent (wasm only) and sometimes runtime.runfinq. +// +// If fixed is true, any goroutine that can vary between user and +// system (that is, the finalizer goroutine) is considered a user +// goroutine. +func isSystemGoroutine(gp *g, fixed bool) bool { + // Keep this in sync with cmd/trace/trace.go:isSystemGoroutine. 
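+	// For example, goroutines started at runtime.bgsweep, runtime.bgscavenge
+	// or runtime.gcBgMarkWorker are reported as system goroutines and hidden
+	// from default tracebacks, while runtime.main and (depending on fixed and
+	// whether it is running user finalizers) the finalizer goroutine are not.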
+ f := findfunc(gp.startpc) + if !f.valid() { + return false + } + if f.funcID == funcID_runtime_main || f.funcID == funcID_handleAsyncEvent { + return false + } + if f.funcID == funcID_runfinq { + // We include the finalizer goroutine if it's calling + // back into user code. + if fixed { + // This goroutine can vary. In fixed mode, + // always consider it a user goroutine. + return false + } + return !fingRunning + } + return hasPrefix(funcname(f), "runtime.") +} + +// SetCgoTraceback records three C functions to use to gather +// traceback information from C code and to convert that traceback +// information into symbolic information. These are used when printing +// stack traces for a program that uses cgo. +// +// The traceback and context functions may be called from a signal +// handler, and must therefore use only async-signal safe functions. +// The symbolizer function may be called while the program is +// crashing, and so must be cautious about using memory. None of the +// functions may call back into Go. +// +// The context function will be called with a single argument, a +// pointer to a struct: +// +// struct { +// Context uintptr +// } +// +// In C syntax, this struct will be +// +// struct { +// uintptr_t Context; +// }; +// +// If the Context field is 0, the context function is being called to +// record the current traceback context. It should record in the +// Context field whatever information is needed about the current +// point of execution to later produce a stack trace, probably the +// stack pointer and PC. In this case the context function will be +// called from C code. +// +// If the Context field is not 0, then it is a value returned by a +// previous call to the context function. This case is called when the +// context is no longer needed; that is, when the Go code is returning +// to its C code caller. This permits the context function to release +// any associated resources. +// +// While it would be correct for the context function to record a +// complete a stack trace whenever it is called, and simply copy that +// out in the traceback function, in a typical program the context +// function will be called many times without ever recording a +// traceback for that context. Recording a complete stack trace in a +// call to the context function is likely to be inefficient. +// +// The traceback function will be called with a single argument, a +// pointer to a struct: +// +// struct { +// Context uintptr +// SigContext uintptr +// Buf *uintptr +// Max uintptr +// } +// +// In C syntax, this struct will be +// +// struct { +// uintptr_t Context; +// uintptr_t SigContext; +// uintptr_t* Buf; +// uintptr_t Max; +// }; +// +// The Context field will be zero to gather a traceback from the +// current program execution point. In this case, the traceback +// function will be called from C code. +// +// Otherwise Context will be a value previously returned by a call to +// the context function. The traceback function should gather a stack +// trace from that saved point in the program execution. The traceback +// function may be called from an execution thread other than the one +// that recorded the context, but only when the context is known to be +// valid and unchanging. The traceback function may also be called +// deeper in the call stack on the same thread that recorded the +// context. 
The traceback function may be called multiple times with +// the same Context value; it will usually be appropriate to cache the +// result, if possible, the first time this is called for a specific +// context value. +// +// If the traceback function is called from a signal handler on a Unix +// system, SigContext will be the signal context argument passed to +// the signal handler (a C ucontext_t* cast to uintptr_t). This may be +// used to start tracing at the point where the signal occurred. If +// the traceback function is not called from a signal handler, +// SigContext will be zero. +// +// Buf is where the traceback information should be stored. It should +// be PC values, such that Buf[0] is the PC of the caller, Buf[1] is +// the PC of that function's caller, and so on. Max is the maximum +// number of entries to store. The function should store a zero to +// indicate the top of the stack, or that the caller is on a different +// stack, presumably a Go stack. +// +// Unlike runtime.Callers, the PC values returned should, when passed +// to the symbolizer function, return the file/line of the call +// instruction. No additional subtraction is required or appropriate. +// +// On all platforms, the traceback function is invoked when a call from +// Go to C to Go requests a stack trace. On linux/amd64, linux/ppc64le, +// and freebsd/amd64, the traceback function is also invoked when a +// signal is received by a thread that is executing a cgo call. The +// traceback function should not make assumptions about when it is +// called, as future versions of Go may make additional calls. +// +// The symbolizer function will be called with a single argument, a +// pointer to a struct: +// +// struct { +// PC uintptr // program counter to fetch information for +// File *byte // file name (NUL terminated) +// Lineno uintptr // line number +// Func *byte // function name (NUL terminated) +// Entry uintptr // function entry point +// More uintptr // set non-zero if more info for this PC +// Data uintptr // unused by runtime, available for function +// } +// +// In C syntax, this struct will be +// +// struct { +// uintptr_t PC; +// char* File; +// uintptr_t Lineno; +// char* Func; +// uintptr_t Entry; +// uintptr_t More; +// uintptr_t Data; +// }; +// +// The PC field will be a value returned by a call to the traceback +// function. +// +// The first time the function is called for a particular traceback, +// all the fields except PC will be 0. The function should fill in the +// other fields if possible, setting them to 0/nil if the information +// is not available. The Data field may be used to store any useful +// information across calls. The More field should be set to non-zero +// if there is more information for this PC, zero otherwise. If More +// is set non-zero, the function will be called again with the same +// PC, and may return different information (this is intended for use +// with inlined functions). If More is zero, the function will be +// called with the next PC value in the traceback. When the traceback +// is complete, the function will be called once more with PC set to +// zero; this may be used to free any information. Each call will +// leave the fields of the struct set to the same values they had upon +// return, except for the PC field when the More field is zero. The +// function must not keep a copy of the struct pointer between calls. 
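+//
+// A registration sketch, with the callbacks implemented in C and their
+// addresses exposed to Go through void* globals (all names illustrative):
+//
+//	/*
+//	extern void myTraceback(void*);
+//	extern void mySymbolizer(void*);
+//	void *tracebackFn = (void*)myTraceback;
+//	void *symbolizerFn = (void*)mySymbolizer;
+//	*/
+//	import "C"
+//
+//	func init() {
+//		// nil context function: Go->C->Go calls then show no C frames.
+//		runtime.SetCgoTraceback(0, C.tracebackFn, nil, C.symbolizerFn)
+//	}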
+// +// When calling SetCgoTraceback, the version argument is the version +// number of the structs that the functions expect to receive. +// Currently this must be zero. +// +// The symbolizer function may be nil, in which case the results of +// the traceback function will be displayed as numbers. If the +// traceback function is nil, the symbolizer function will never be +// called. The context function may be nil, in which case the +// traceback function will only be called with the context field set +// to zero. If the context function is nil, then calls from Go to C +// to Go will not show a traceback for the C portion of the call stack. +// +// SetCgoTraceback should be called only once, ideally from an init function. +func SetCgoTraceback(version int, traceback, context, symbolizer unsafe.Pointer) { + if version != 0 { + panic("unsupported version") + } + + if cgoTraceback != nil && cgoTraceback != traceback || + cgoContext != nil && cgoContext != context || + cgoSymbolizer != nil && cgoSymbolizer != symbolizer { + panic("call SetCgoTraceback only once") + } + + cgoTraceback = traceback + cgoContext = context + cgoSymbolizer = symbolizer + + // The context function is called when a C function calls a Go + // function. As such it is only called by C code in runtime/cgo. + if _cgo_set_context_function != nil { + cgocall(_cgo_set_context_function, context) + } +} + +var cgoTraceback unsafe.Pointer +var cgoContext unsafe.Pointer +var cgoSymbolizer unsafe.Pointer + +// cgoTracebackArg is the type passed to cgoTraceback. +type cgoTracebackArg struct { + context uintptr + sigContext uintptr + buf *uintptr + max uintptr +} + +// cgoContextArg is the type passed to the context function. +type cgoContextArg struct { + context uintptr +} + +// cgoSymbolizerArg is the type passed to cgoSymbolizer. +type cgoSymbolizerArg struct { + pc uintptr + file *byte + lineno uintptr + funcName *byte + entry uintptr + more uintptr + data uintptr +} + +// cgoTraceback prints a traceback of callers. +func printCgoTraceback(callers *cgoCallers) { + if cgoSymbolizer == nil { + for _, c := range callers { + if c == 0 { + break + } + print("non-Go function at pc=", hex(c), "\n") + } + return + } + + var arg cgoSymbolizerArg + for _, c := range callers { + if c == 0 { + break + } + printOneCgoTraceback(c, 0x7fffffff, &arg) + } + arg.pc = 0 + callCgoSymbolizer(&arg) +} + +// printOneCgoTraceback prints the traceback of a single cgo caller. +// This can print more than one line because of inlining. +// Returns the number of frames printed. +func printOneCgoTraceback(pc uintptr, max int, arg *cgoSymbolizerArg) int { + c := 0 + arg.pc = pc + for c <= max { + callCgoSymbolizer(arg) + if arg.funcName != nil { + // Note that we don't print any argument + // information here, not even parentheses. + // The symbolizer must add that if appropriate. + println(gostringnocopy(arg.funcName)) + } else { + println("non-Go function") + } + print("\t") + if arg.file != nil { + print(gostringnocopy(arg.file), ":", arg.lineno, " ") + } + print("pc=", hex(pc), "\n") + c++ + if arg.more == 0 { + break + } + } + return c +} + +// callCgoSymbolizer calls the cgoSymbolizer function. +func callCgoSymbolizer(arg *cgoSymbolizerArg) { + call := cgocall + if panicking > 0 || getg().m.curg != getg() { + // We do not want to call into the scheduler when panicking + // or when on the system stack. 
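+		// The distinction, roughly: cgocall does entersyscall/exitsyscall
+		// bookkeeping and may re-enter the scheduler on return, while
+		// asmcgocall only switches to the g0 stack and calls the C function,
+		// which is all that is safe while panicking or already on g0.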
+ call = asmcgocall + } + if msanenabled { + msanwrite(unsafe.Pointer(arg), unsafe.Sizeof(cgoSymbolizerArg{})) + } + if asanenabled { + asanwrite(unsafe.Pointer(arg), unsafe.Sizeof(cgoSymbolizerArg{})) + } + call(cgoSymbolizer, noescape(unsafe.Pointer(arg))) +} + +// cgoContextPCs gets the PC values from a cgo traceback. +func cgoContextPCs(ctxt uintptr, buf []uintptr) { + if cgoTraceback == nil { + return + } + call := cgocall + if panicking > 0 || getg().m.curg != getg() { + // We do not want to call into the scheduler when panicking + // or when on the system stack. + call = asmcgocall + } + arg := cgoTracebackArg{ + context: ctxt, + buf: (*uintptr)(noescape(unsafe.Pointer(&buf[0]))), + max: uintptr(len(buf)), + } + if msanenabled { + msanwrite(unsafe.Pointer(&arg), unsafe.Sizeof(arg)) + } + if asanenabled { + asanwrite(unsafe.Pointer(&arg), unsafe.Sizeof(arg)) + } + call(cgoTraceback, noescape(unsafe.Pointer(&arg))) +} diff --git a/contrib/go/_std_1.18/src/runtime/type.go b/contrib/go/_std_1.18/src/runtime/type.go new file mode 100644 index 0000000000..da47147897 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/type.go @@ -0,0 +1,708 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Runtime type representation. + +package runtime + +import ( + "internal/abi" + "unsafe" +) + +// tflag is documented in reflect/type.go. +// +// tflag values must be kept in sync with copies in: +// cmd/compile/internal/reflectdata/reflect.go +// cmd/link/internal/ld/decodesym.go +// reflect/type.go +// internal/reflectlite/type.go +type tflag uint8 + +const ( + tflagUncommon tflag = 1 << 0 + tflagExtraStar tflag = 1 << 1 + tflagNamed tflag = 1 << 2 + tflagRegularMemory tflag = 1 << 3 // equal and hash can treat values of this type as a single region of t.size bytes +) + +// Needs to be in sync with ../cmd/link/internal/ld/decodesym.go:/^func.commonsize, +// ../cmd/compile/internal/reflectdata/reflect.go:/^func.dcommontype and +// ../reflect/type.go:/^type.rtype. +// ../internal/reflectlite/type.go:/^type.rtype. +type _type struct { + size uintptr + ptrdata uintptr // size of memory prefix holding all pointers + hash uint32 + tflag tflag + align uint8 + fieldAlign uint8 + kind uint8 + // function for comparing objects of this type + // (ptr to object A, ptr to object B) -> ==? + equal func(unsafe.Pointer, unsafe.Pointer) bool + // gcdata stores the GC type data for the garbage collector. + // If the KindGCProg bit is set in kind, gcdata is a GC program. + // Otherwise it is a ptrmask bitmap. See mbitmap.go for details. 
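+	// For example (64-bit sketch): for
+	//	type T struct{ p *int; n int; q *byte }
+	// size is 24, ptrdata is 24 (the pointer-bearing prefix runs through q),
+	// and the ptrmask holds the bits 1 0 1, one per pointer-sized word.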
+ gcdata *byte + str nameOff + ptrToThis typeOff +} + +func (t *_type) string() string { + s := t.nameOff(t.str).name() + if t.tflag&tflagExtraStar != 0 { + return s[1:] + } + return s +} + +func (t *_type) uncommon() *uncommontype { + if t.tflag&tflagUncommon == 0 { + return nil + } + switch t.kind & kindMask { + case kindStruct: + type u struct { + structtype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindPtr: + type u struct { + ptrtype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindFunc: + type u struct { + functype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindSlice: + type u struct { + slicetype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindArray: + type u struct { + arraytype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindChan: + type u struct { + chantype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindMap: + type u struct { + maptype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + case kindInterface: + type u struct { + interfacetype + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + default: + type u struct { + _type + u uncommontype + } + return &(*u)(unsafe.Pointer(t)).u + } +} + +func (t *_type) name() string { + if t.tflag&tflagNamed == 0 { + return "" + } + s := t.string() + i := len(s) - 1 + for i >= 0 && s[i] != '.' { + i-- + } + return s[i+1:] +} + +// pkgpath returns the path of the package where t was defined, if +// available. This is not the same as the reflect package's PkgPath +// method, in that it returns the package path for struct and interface +// types, not just named types. +func (t *_type) pkgpath() string { + if u := t.uncommon(); u != nil { + return t.nameOff(u.pkgpath).name() + } + switch t.kind & kindMask { + case kindStruct: + st := (*structtype)(unsafe.Pointer(t)) + return st.pkgPath.name() + case kindInterface: + it := (*interfacetype)(unsafe.Pointer(t)) + return it.pkgpath.name() + } + return "" +} + +// reflectOffs holds type offsets defined at run time by the reflect package. +// +// When a type is defined at run time, its *rtype data lives on the heap. +// There are a wide range of possible addresses the heap may use, that +// may not be representable as a 32-bit offset. Moreover the GC may +// one day start moving heap memory, in which case there is no stable +// offset that can be defined. +// +// To provide stable offsets, we add pin *rtype objects in a global map +// and treat the offset as an identifier. We use negative offsets that +// do not overlap with any compile-time module offsets. +// +// Entries are created by reflect.addReflectOff. 
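+//
+// For example (sketch): the first type registered at run time gets a small
+// negative identifier such as -1, the next -2, and resolveTypeOff and
+// resolveNameOff fall back to this map whenever the base pointer is not
+// inside any module's [types, etypes) range.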
+var reflectOffs struct { + lock mutex + next int32 + m map[int32]unsafe.Pointer + minv map[unsafe.Pointer]int32 +} + +func reflectOffsLock() { + lock(&reflectOffs.lock) + if raceenabled { + raceacquire(unsafe.Pointer(&reflectOffs.lock)) + } +} + +func reflectOffsUnlock() { + if raceenabled { + racerelease(unsafe.Pointer(&reflectOffs.lock)) + } + unlock(&reflectOffs.lock) +} + +func resolveNameOff(ptrInModule unsafe.Pointer, off nameOff) name { + if off == 0 { + return name{} + } + base := uintptr(ptrInModule) + for md := &firstmoduledata; md != nil; md = md.next { + if base >= md.types && base < md.etypes { + res := md.types + uintptr(off) + if res > md.etypes { + println("runtime: nameOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes)) + throw("runtime: name offset out of range") + } + return name{(*byte)(unsafe.Pointer(res))} + } + } + + // No module found. see if it is a run time name. + reflectOffsLock() + res, found := reflectOffs.m[int32(off)] + reflectOffsUnlock() + if !found { + println("runtime: nameOff", hex(off), "base", hex(base), "not in ranges:") + for next := &firstmoduledata; next != nil; next = next.next { + println("\ttypes", hex(next.types), "etypes", hex(next.etypes)) + } + throw("runtime: name offset base pointer out of range") + } + return name{(*byte)(res)} +} + +func (t *_type) nameOff(off nameOff) name { + return resolveNameOff(unsafe.Pointer(t), off) +} + +func resolveTypeOff(ptrInModule unsafe.Pointer, off typeOff) *_type { + if off == 0 || off == -1 { + // -1 is the sentinel value for unreachable code. + // See cmd/link/internal/ld/data.go:relocsym. + return nil + } + base := uintptr(ptrInModule) + var md *moduledata + for next := &firstmoduledata; next != nil; next = next.next { + if base >= next.types && base < next.etypes { + md = next + break + } + } + if md == nil { + reflectOffsLock() + res := reflectOffs.m[int32(off)] + reflectOffsUnlock() + if res == nil { + println("runtime: typeOff", hex(off), "base", hex(base), "not in ranges:") + for next := &firstmoduledata; next != nil; next = next.next { + println("\ttypes", hex(next.types), "etypes", hex(next.etypes)) + } + throw("runtime: type offset base pointer out of range") + } + return (*_type)(res) + } + if t := md.typemap[off]; t != nil { + return t + } + res := md.types + uintptr(off) + if res > md.etypes { + println("runtime: typeOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes)) + throw("runtime: type offset out of range") + } + return (*_type)(unsafe.Pointer(res)) +} + +func (t *_type) typeOff(off typeOff) *_type { + return resolveTypeOff(unsafe.Pointer(t), off) +} + +func (t *_type) textOff(off textOff) unsafe.Pointer { + if off == -1 { + // -1 is the sentinel value for unreachable code. + // See cmd/link/internal/ld/data.go:relocsym. 
+ return unsafe.Pointer(abi.FuncPCABIInternal(unreachableMethod)) + } + base := uintptr(unsafe.Pointer(t)) + var md *moduledata + for next := &firstmoduledata; next != nil; next = next.next { + if base >= next.types && base < next.etypes { + md = next + break + } + } + if md == nil { + reflectOffsLock() + res := reflectOffs.m[int32(off)] + reflectOffsUnlock() + if res == nil { + println("runtime: textOff", hex(off), "base", hex(base), "not in ranges:") + for next := &firstmoduledata; next != nil; next = next.next { + println("\ttypes", hex(next.types), "etypes", hex(next.etypes)) + } + throw("runtime: text offset base pointer out of range") + } + return res + } + res := md.textAddr(uint32(off)) + return unsafe.Pointer(res) +} + +func (t *functype) in() []*_type { + // See funcType in reflect/type.go for details on data layout. + uadd := uintptr(unsafe.Sizeof(functype{})) + if t.typ.tflag&tflagUncommon != 0 { + uadd += unsafe.Sizeof(uncommontype{}) + } + return (*[1 << 20]*_type)(add(unsafe.Pointer(t), uadd))[:t.inCount] +} + +func (t *functype) out() []*_type { + // See funcType in reflect/type.go for details on data layout. + uadd := uintptr(unsafe.Sizeof(functype{})) + if t.typ.tflag&tflagUncommon != 0 { + uadd += unsafe.Sizeof(uncommontype{}) + } + outCount := t.outCount & (1<<15 - 1) + return (*[1 << 20]*_type)(add(unsafe.Pointer(t), uadd))[t.inCount : t.inCount+outCount] +} + +func (t *functype) dotdotdot() bool { + return t.outCount&(1<<15) != 0 +} + +type nameOff int32 +type typeOff int32 +type textOff int32 + +type method struct { + name nameOff + mtyp typeOff + ifn textOff + tfn textOff +} + +type uncommontype struct { + pkgpath nameOff + mcount uint16 // number of methods + xcount uint16 // number of exported methods + moff uint32 // offset from this uncommontype to [mcount]method + _ uint32 // unused +} + +type imethod struct { + name nameOff + ityp typeOff +} + +type interfacetype struct { + typ _type + pkgpath name + mhdr []imethod +} + +type maptype struct { + typ _type + key *_type + elem *_type + bucket *_type // internal type representing a hash bucket + // function for hashing keys (ptr to key, seed) -> hash + hasher func(unsafe.Pointer, uintptr) uintptr + keysize uint8 // size of key slot + elemsize uint8 // size of elem slot + bucketsize uint16 // size of bucket + flags uint32 +} + +// Note: flag values must match those used in the TMAP case +// in ../cmd/compile/internal/reflectdata/reflect.go:writeType. 
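// An illustrative sketch (not part of the vendored source) of how the map
// implementation consults these accessors when reading a bucket slot:
//
//	k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
//	if t.indirectkey() {
//		k = *((*unsafe.Pointer)(k)) // slot holds a pointer to the key, not the key itself
//	}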
+func (mt *maptype) indirectkey() bool { // store ptr to key instead of key itself + return mt.flags&1 != 0 +} +func (mt *maptype) indirectelem() bool { // store ptr to elem instead of elem itself + return mt.flags&2 != 0 +} +func (mt *maptype) reflexivekey() bool { // true if k==k for all keys + return mt.flags&4 != 0 +} +func (mt *maptype) needkeyupdate() bool { // true if we need to update key on an overwrite + return mt.flags&8 != 0 +} +func (mt *maptype) hashMightPanic() bool { // true if hash function might panic + return mt.flags&16 != 0 +} + +type arraytype struct { + typ _type + elem *_type + slice *_type + len uintptr +} + +type chantype struct { + typ _type + elem *_type + dir uintptr +} + +type slicetype struct { + typ _type + elem *_type +} + +type functype struct { + typ _type + inCount uint16 + outCount uint16 +} + +type ptrtype struct { + typ _type + elem *_type +} + +type structfield struct { + name name + typ *_type + offsetAnon uintptr +} + +func (f *structfield) offset() uintptr { + return f.offsetAnon >> 1 +} + +type structtype struct { + typ _type + pkgPath name + fields []structfield +} + +// name is an encoded type name with optional extra data. +// See reflect/type.go for details. +type name struct { + bytes *byte +} + +func (n name) data(off int) *byte { + return (*byte)(add(unsafe.Pointer(n.bytes), uintptr(off))) +} + +func (n name) isExported() bool { + return (*n.bytes)&(1<<0) != 0 +} + +func (n name) readvarint(off int) (int, int) { + v := 0 + for i := 0; ; i++ { + x := *n.data(off + i) + v += int(x&0x7f) << (7 * i) + if x&0x80 == 0 { + return i + 1, v + } + } +} + +func (n name) name() (s string) { + if n.bytes == nil { + return "" + } + i, l := n.readvarint(1) + if l == 0 { + return "" + } + hdr := (*stringStruct)(unsafe.Pointer(&s)) + hdr.str = unsafe.Pointer(n.data(1 + i)) + hdr.len = l + return +} + +func (n name) tag() (s string) { + if *n.data(0)&(1<<1) == 0 { + return "" + } + i, l := n.readvarint(1) + i2, l2 := n.readvarint(1 + i + l) + hdr := (*stringStruct)(unsafe.Pointer(&s)) + hdr.str = unsafe.Pointer(n.data(1 + i + l + i2)) + hdr.len = l2 + return +} + +func (n name) pkgPath() string { + if n.bytes == nil || *n.data(0)&(1<<2) == 0 { + return "" + } + i, l := n.readvarint(1) + off := 1 + i + l + if *n.data(0)&(1<<1) != 0 { + i2, l2 := n.readvarint(off) + off += i2 + l2 + } + var nameOff nameOff + copy((*[4]byte)(unsafe.Pointer(&nameOff))[:], (*[4]byte)(unsafe.Pointer(n.data(off)))[:]) + pkgPathName := resolveNameOff(unsafe.Pointer(n.bytes), nameOff) + return pkgPathName.name() +} + +func (n name) isBlank() bool { + if n.bytes == nil { + return false + } + _, l := n.readvarint(1) + return l == 1 && *n.data(2) == '_' +} + +// typelinksinit scans the types from extra modules and builds the +// moduledata typemap used to de-duplicate type pointers. +func typelinksinit() { + if firstmoduledata.next == nil { + return + } + typehash := make(map[uint32][]*_type, len(firstmoduledata.typelinks)) + + modules := activeModules() + prev := modules[0] + for _, md := range modules[1:] { + // Collect types from the previous module into typehash. + collect: + for _, tl := range prev.typelinks { + var t *_type + if prev.typemap == nil { + t = (*_type)(unsafe.Pointer(prev.types + uintptr(tl))) + } else { + t = prev.typemap[typeOff(tl)] + } + // Add to typehash if not seen before. 
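		// typehash buckets the canonical *_type pointers seen in earlier
		// modules by their type hash; the per-module loop below consults
		// these buckets so that a duplicate type emitted into a later
		// module (possible with buildmode=shared) is mapped back to the
		// earlier, canonical pointer via md.typemap.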
+ tlist := typehash[t.hash] + for _, tcur := range tlist { + if tcur == t { + continue collect + } + } + typehash[t.hash] = append(tlist, t) + } + + if md.typemap == nil { + // If any of this module's typelinks match a type from a + // prior module, prefer that prior type by adding the offset + // to this module's typemap. + tm := make(map[typeOff]*_type, len(md.typelinks)) + pinnedTypemaps = append(pinnedTypemaps, tm) + md.typemap = tm + for _, tl := range md.typelinks { + t := (*_type)(unsafe.Pointer(md.types + uintptr(tl))) + for _, candidate := range typehash[t.hash] { + seen := map[_typePair]struct{}{} + if typesEqual(t, candidate, seen) { + t = candidate + break + } + } + md.typemap[typeOff(tl)] = t + } + } + + prev = md + } +} + +type _typePair struct { + t1 *_type + t2 *_type +} + +// typesEqual reports whether two types are equal. +// +// Everywhere in the runtime and reflect packages, it is assumed that +// there is exactly one *_type per Go type, so that pointer equality +// can be used to test if types are equal. There is one place that +// breaks this assumption: buildmode=shared. In this case a type can +// appear as two different pieces of memory. This is hidden from the +// runtime and reflect package by the per-module typemap built in +// typelinksinit. It uses typesEqual to map types from later modules +// back into earlier ones. +// +// Only typelinksinit needs this function. +func typesEqual(t, v *_type, seen map[_typePair]struct{}) bool { + tp := _typePair{t, v} + if _, ok := seen[tp]; ok { + return true + } + + // mark these types as seen, and thus equivalent which prevents an infinite loop if + // the two types are identical, but recursively defined and loaded from + // different modules + seen[tp] = struct{}{} + + if t == v { + return true + } + kind := t.kind & kindMask + if kind != v.kind&kindMask { + return false + } + if t.string() != v.string() { + return false + } + ut := t.uncommon() + uv := v.uncommon() + if ut != nil || uv != nil { + if ut == nil || uv == nil { + return false + } + pkgpatht := t.nameOff(ut.pkgpath).name() + pkgpathv := v.nameOff(uv.pkgpath).name() + if pkgpatht != pkgpathv { + return false + } + } + if kindBool <= kind && kind <= kindComplex128 { + return true + } + switch kind { + case kindString, kindUnsafePointer: + return true + case kindArray: + at := (*arraytype)(unsafe.Pointer(t)) + av := (*arraytype)(unsafe.Pointer(v)) + return typesEqual(at.elem, av.elem, seen) && at.len == av.len + case kindChan: + ct := (*chantype)(unsafe.Pointer(t)) + cv := (*chantype)(unsafe.Pointer(v)) + return ct.dir == cv.dir && typesEqual(ct.elem, cv.elem, seen) + case kindFunc: + ft := (*functype)(unsafe.Pointer(t)) + fv := (*functype)(unsafe.Pointer(v)) + if ft.outCount != fv.outCount || ft.inCount != fv.inCount { + return false + } + tin, vin := ft.in(), fv.in() + for i := 0; i < len(tin); i++ { + if !typesEqual(tin[i], vin[i], seen) { + return false + } + } + tout, vout := ft.out(), fv.out() + for i := 0; i < len(tout); i++ { + if !typesEqual(tout[i], vout[i], seen) { + return false + } + } + return true + case kindInterface: + it := (*interfacetype)(unsafe.Pointer(t)) + iv := (*interfacetype)(unsafe.Pointer(v)) + if it.pkgpath.name() != iv.pkgpath.name() { + return false + } + if len(it.mhdr) != len(iv.mhdr) { + return false + } + for i := range it.mhdr { + tm := &it.mhdr[i] + vm := &iv.mhdr[i] + // Note the mhdr array can be relocated from + // another module. See #17724. 
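			// For that reason the name and type offsets below are resolved
			// relative to each imethod entry itself (tm, vm), so every
			// offset is interpreted against the module that actually
			// contains it.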
+ tname := resolveNameOff(unsafe.Pointer(tm), tm.name) + vname := resolveNameOff(unsafe.Pointer(vm), vm.name) + if tname.name() != vname.name() { + return false + } + if tname.pkgPath() != vname.pkgPath() { + return false + } + tityp := resolveTypeOff(unsafe.Pointer(tm), tm.ityp) + vityp := resolveTypeOff(unsafe.Pointer(vm), vm.ityp) + if !typesEqual(tityp, vityp, seen) { + return false + } + } + return true + case kindMap: + mt := (*maptype)(unsafe.Pointer(t)) + mv := (*maptype)(unsafe.Pointer(v)) + return typesEqual(mt.key, mv.key, seen) && typesEqual(mt.elem, mv.elem, seen) + case kindPtr: + pt := (*ptrtype)(unsafe.Pointer(t)) + pv := (*ptrtype)(unsafe.Pointer(v)) + return typesEqual(pt.elem, pv.elem, seen) + case kindSlice: + st := (*slicetype)(unsafe.Pointer(t)) + sv := (*slicetype)(unsafe.Pointer(v)) + return typesEqual(st.elem, sv.elem, seen) + case kindStruct: + st := (*structtype)(unsafe.Pointer(t)) + sv := (*structtype)(unsafe.Pointer(v)) + if len(st.fields) != len(sv.fields) { + return false + } + if st.pkgPath.name() != sv.pkgPath.name() { + return false + } + for i := range st.fields { + tf := &st.fields[i] + vf := &sv.fields[i] + if tf.name.name() != vf.name.name() { + return false + } + if !typesEqual(tf.typ, vf.typ, seen) { + return false + } + if tf.name.tag() != vf.name.tag() { + return false + } + if tf.offsetAnon != vf.offsetAnon { + return false + } + } + return true + default: + println("runtime: impossible type kind", kind) + throw("runtime: impossible type kind") + return false + } +} diff --git a/contrib/go/_std_1.18/src/runtime/typekind.go b/contrib/go/_std_1.18/src/runtime/typekind.go new file mode 100644 index 0000000000..7087a9b046 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/typekind.go @@ -0,0 +1,43 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +const ( + kindBool = 1 + iota + kindInt + kindInt8 + kindInt16 + kindInt32 + kindInt64 + kindUint + kindUint8 + kindUint16 + kindUint32 + kindUint64 + kindUintptr + kindFloat32 + kindFloat64 + kindComplex64 + kindComplex128 + kindArray + kindChan + kindFunc + kindInterface + kindMap + kindPtr + kindSlice + kindString + kindStruct + kindUnsafePointer + + kindDirectIface = 1 << 5 + kindGCProg = 1 << 6 + kindMask = (1 << 5) - 1 +) + +// isDirectIface reports whether t is stored directly in an interface value. +func isDirectIface(t *_type) bool { + return t.kind&kindDirectIface != 0 +} diff --git a/contrib/go/_std_1.18/src/runtime/utf8.go b/contrib/go/_std_1.18/src/runtime/utf8.go new file mode 100644 index 0000000000..52b757662d --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/utf8.go @@ -0,0 +1,132 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Numbers fundamental to the encoding. +const ( + runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character" + runeSelf = 0x80 // characters below runeSelf are represented as themselves in a single byte. + maxRune = '\U0010FFFF' // Maximum valid Unicode code point. +) + +// Code points in the surrogate range are not valid for UTF-8. 
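// decoderune below rejects encoded surrogates and encoderune re-encodes them
// as runeError (U+FFFD), whose UTF-8 form is 0xEF 0xBF 0xBD.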
+const ( + surrogateMin = 0xD800 + surrogateMax = 0xDFFF +) + +const ( + t1 = 0x00 // 0000 0000 + tx = 0x80 // 1000 0000 + t2 = 0xC0 // 1100 0000 + t3 = 0xE0 // 1110 0000 + t4 = 0xF0 // 1111 0000 + t5 = 0xF8 // 1111 1000 + + maskx = 0x3F // 0011 1111 + mask2 = 0x1F // 0001 1111 + mask3 = 0x0F // 0000 1111 + mask4 = 0x07 // 0000 0111 + + rune1Max = 1<<7 - 1 + rune2Max = 1<<11 - 1 + rune3Max = 1<<16 - 1 + + // The default lowest and highest continuation byte. + locb = 0x80 // 1000 0000 + hicb = 0xBF // 1011 1111 +) + +// countrunes returns the number of runes in s. +func countrunes(s string) int { + n := 0 + for range s { + n++ + } + return n +} + +// decoderune returns the non-ASCII rune at the start of +// s[k:] and the index after the rune in s. +// +// decoderune assumes that caller has checked that +// the to be decoded rune is a non-ASCII rune. +// +// If the string appears to be incomplete or decoding problems +// are encountered (runeerror, k + 1) is returned to ensure +// progress when decoderune is used to iterate over a string. +func decoderune(s string, k int) (r rune, pos int) { + pos = k + + if k >= len(s) { + return runeError, k + 1 + } + + s = s[k:] + + switch { + case t2 <= s[0] && s[0] < t3: + // 0080-07FF two byte sequence + if len(s) > 1 && (locb <= s[1] && s[1] <= hicb) { + r = rune(s[0]&mask2)<<6 | rune(s[1]&maskx) + pos += 2 + if rune1Max < r { + return + } + } + case t3 <= s[0] && s[0] < t4: + // 0800-FFFF three byte sequence + if len(s) > 2 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) { + r = rune(s[0]&mask3)<<12 | rune(s[1]&maskx)<<6 | rune(s[2]&maskx) + pos += 3 + if rune2Max < r && !(surrogateMin <= r && r <= surrogateMax) { + return + } + } + case t4 <= s[0] && s[0] < t5: + // 10000-1FFFFF four byte sequence + if len(s) > 3 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) && (locb <= s[3] && s[3] <= hicb) { + r = rune(s[0]&mask4)<<18 | rune(s[1]&maskx)<<12 | rune(s[2]&maskx)<<6 | rune(s[3]&maskx) + pos += 4 + if rune3Max < r && r <= maxRune { + return + } + } + } + + return runeError, k + 1 +} + +// encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune. +// It returns the number of bytes written. +func encoderune(p []byte, r rune) int { + // Negative values are erroneous. Making it unsigned addresses the problem. + switch i := uint32(r); { + case i <= rune1Max: + p[0] = byte(r) + return 1 + case i <= rune2Max: + _ = p[1] // eliminate bounds checks + p[0] = t2 | byte(r>>6) + p[1] = tx | byte(r)&maskx + return 2 + case i > maxRune, surrogateMin <= i && i <= surrogateMax: + r = runeError + fallthrough + case i <= rune3Max: + _ = p[2] // eliminate bounds checks + p[0] = t3 | byte(r>>12) + p[1] = tx | byte(r>>6)&maskx + p[2] = tx | byte(r)&maskx + return 3 + default: + _ = p[3] // eliminate bounds checks + p[0] = t4 | byte(r>>18) + p[1] = tx | byte(r>>12)&maskx + p[2] = tx | byte(r>>6)&maskx + p[3] = tx | byte(r)&maskx + return 4 + } +} diff --git a/contrib/go/_std_1.18/src/runtime/vdso_elf64.go b/contrib/go/_std_1.18/src/runtime/vdso_elf64.go new file mode 100644 index 0000000000..d46d6f8c34 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/vdso_elf64.go @@ -0,0 +1,79 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build linux && (amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64) + +package runtime + +// ELF64 structure definitions for use by the vDSO loader + +type elfSym struct { + st_name uint32 + st_info byte + st_other byte + st_shndx uint16 + st_value uint64 + st_size uint64 +} + +type elfVerdef struct { + vd_version uint16 /* Version revision */ + vd_flags uint16 /* Version information */ + vd_ndx uint16 /* Version Index */ + vd_cnt uint16 /* Number of associated aux entries */ + vd_hash uint32 /* Version name hash value */ + vd_aux uint32 /* Offset in bytes to verdaux array */ + vd_next uint32 /* Offset in bytes to next verdef entry */ +} + +type elfEhdr struct { + e_ident [_EI_NIDENT]byte /* Magic number and other info */ + e_type uint16 /* Object file type */ + e_machine uint16 /* Architecture */ + e_version uint32 /* Object file version */ + e_entry uint64 /* Entry point virtual address */ + e_phoff uint64 /* Program header table file offset */ + e_shoff uint64 /* Section header table file offset */ + e_flags uint32 /* Processor-specific flags */ + e_ehsize uint16 /* ELF header size in bytes */ + e_phentsize uint16 /* Program header table entry size */ + e_phnum uint16 /* Program header table entry count */ + e_shentsize uint16 /* Section header table entry size */ + e_shnum uint16 /* Section header table entry count */ + e_shstrndx uint16 /* Section header string table index */ +} + +type elfPhdr struct { + p_type uint32 /* Segment type */ + p_flags uint32 /* Segment flags */ + p_offset uint64 /* Segment file offset */ + p_vaddr uint64 /* Segment virtual address */ + p_paddr uint64 /* Segment physical address */ + p_filesz uint64 /* Segment size in file */ + p_memsz uint64 /* Segment size in memory */ + p_align uint64 /* Segment alignment */ +} + +type elfShdr struct { + sh_name uint32 /* Section name (string tbl index) */ + sh_type uint32 /* Section type */ + sh_flags uint64 /* Section flags */ + sh_addr uint64 /* Section virtual addr at execution */ + sh_offset uint64 /* Section file offset */ + sh_size uint64 /* Section size in bytes */ + sh_link uint32 /* Link to another section */ + sh_info uint32 /* Additional section information */ + sh_addralign uint64 /* Section alignment */ + sh_entsize uint64 /* Entry size if section holds table */ +} + +type elfDyn struct { + d_tag int64 /* Dynamic entry type */ + d_val uint64 /* Integer value */ +} + +type elfVerdaux struct { + vda_name uint32 /* Version or dependency names */ + vda_next uint32 /* Offset in bytes to next verdaux entry */ +} diff --git a/contrib/go/_std_1.18/src/runtime/vdso_in_none.go b/contrib/go/_std_1.18/src/runtime/vdso_in_none.go new file mode 100644 index 0000000000..618bd39b42 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/vdso_in_none.go @@ -0,0 +1,13 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (linux && !386 && !amd64 && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !riscv64) || !linux + +package runtime + +// A dummy version of inVDSOPage for targets that don't use a VDSO. + +func inVDSOPage(pc uintptr) bool { + return false +} diff --git a/contrib/go/_std_1.18/src/runtime/vdso_linux.go b/contrib/go/_std_1.18/src/runtime/vdso_linux.go new file mode 100644 index 0000000000..cff2000767 --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/vdso_linux.go @@ -0,0 +1,292 @@ +// Copyright 2012 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && (386 || amd64 || arm || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64) + +package runtime + +import "unsafe" + +// Look up symbols in the Linux vDSO. + +// This code was originally based on the sample Linux vDSO parser at +// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/vDSO/parse_vdso.c + +// This implements the ELF dynamic linking spec at +// http://sco.com/developers/gabi/latest/ch5.dynamic.html + +// The version section is documented at +// https://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html + +const ( + _AT_SYSINFO_EHDR = 33 + + _PT_LOAD = 1 /* Loadable program segment */ + _PT_DYNAMIC = 2 /* Dynamic linking information */ + + _DT_NULL = 0 /* Marks end of dynamic section */ + _DT_HASH = 4 /* Dynamic symbol hash table */ + _DT_STRTAB = 5 /* Address of string table */ + _DT_SYMTAB = 6 /* Address of symbol table */ + _DT_GNU_HASH = 0x6ffffef5 /* GNU-style dynamic symbol hash table */ + _DT_VERSYM = 0x6ffffff0 + _DT_VERDEF = 0x6ffffffc + + _VER_FLG_BASE = 0x1 /* Version definition of file itself */ + + _SHN_UNDEF = 0 /* Undefined section */ + + _SHT_DYNSYM = 11 /* Dynamic linker symbol table */ + + _STT_FUNC = 2 /* Symbol is a code object */ + + _STT_NOTYPE = 0 /* Symbol type is not specified */ + + _STB_GLOBAL = 1 /* Global symbol */ + _STB_WEAK = 2 /* Weak symbol */ + + _EI_NIDENT = 16 + + // Maximum indices for the array types used when traversing the vDSO ELF structures. + // Computed from architecture-specific max provided by vdso_linux_*.go + vdsoSymTabSize = vdsoArrayMax / unsafe.Sizeof(elfSym{}) + vdsoDynSize = vdsoArrayMax / unsafe.Sizeof(elfDyn{}) + vdsoSymStringsSize = vdsoArrayMax // byte + vdsoVerSymSize = vdsoArrayMax / 2 // uint16 + vdsoHashSize = vdsoArrayMax / 4 // uint32 + + // vdsoBloomSizeScale is a scaling factor for gnuhash tables which are uint32 indexed, + // but contain uintptrs + vdsoBloomSizeScale = unsafe.Sizeof(uintptr(0)) / 4 // uint32 +) + +/* How to extract and insert information held in the st_info field. */ +func _ELF_ST_BIND(val byte) byte { return val >> 4 } +func _ELF_ST_TYPE(val byte) byte { return val & 0xf } + +type vdsoSymbolKey struct { + name string + symHash uint32 + gnuHash uint32 + ptr *uintptr +} + +type vdsoVersionKey struct { + version string + verHash uint32 +} + +type vdsoInfo struct { + valid bool + + /* Load information */ + loadAddr uintptr + loadOffset uintptr /* loadAddr - recorded vaddr */ + + /* Symbol table */ + symtab *[vdsoSymTabSize]elfSym + symstrings *[vdsoSymStringsSize]byte + chain []uint32 + bucket []uint32 + symOff uint32 + isGNUHash bool + + /* Version table */ + versym *[vdsoVerSymSize]uint16 + verdef *elfVerdef +} + +// see vdso_linux_*.go for vdsoSymbolKeys[] and vdso*Sym vars + +func vdsoInitFromSysinfoEhdr(info *vdsoInfo, hdr *elfEhdr) { + info.valid = false + info.loadAddr = uintptr(unsafe.Pointer(hdr)) + + pt := unsafe.Pointer(info.loadAddr + uintptr(hdr.e_phoff)) + + // We need two things from the segment table: the load offset + // and the dynamic table. 
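	// The first PT_LOAD segment fixes the bias between the vDSO's
	// link-time virtual addresses and the address the kernel actually
	// mapped it at (loadOffset = loadAddr + p_offset - p_vaddr); the
	// PT_DYNAMIC segment locates the dynamic table decoded below.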
+ var foundVaddr bool + var dyn *[vdsoDynSize]elfDyn + for i := uint16(0); i < hdr.e_phnum; i++ { + pt := (*elfPhdr)(add(pt, uintptr(i)*unsafe.Sizeof(elfPhdr{}))) + switch pt.p_type { + case _PT_LOAD: + if !foundVaddr { + foundVaddr = true + info.loadOffset = info.loadAddr + uintptr(pt.p_offset-pt.p_vaddr) + } + + case _PT_DYNAMIC: + dyn = (*[vdsoDynSize]elfDyn)(unsafe.Pointer(info.loadAddr + uintptr(pt.p_offset))) + } + } + + if !foundVaddr || dyn == nil { + return // Failed + } + + // Fish out the useful bits of the dynamic table. + + var hash, gnuhash *[vdsoHashSize]uint32 + info.symstrings = nil + info.symtab = nil + info.versym = nil + info.verdef = nil + for i := 0; dyn[i].d_tag != _DT_NULL; i++ { + dt := &dyn[i] + p := info.loadOffset + uintptr(dt.d_val) + switch dt.d_tag { + case _DT_STRTAB: + info.symstrings = (*[vdsoSymStringsSize]byte)(unsafe.Pointer(p)) + case _DT_SYMTAB: + info.symtab = (*[vdsoSymTabSize]elfSym)(unsafe.Pointer(p)) + case _DT_HASH: + hash = (*[vdsoHashSize]uint32)(unsafe.Pointer(p)) + case _DT_GNU_HASH: + gnuhash = (*[vdsoHashSize]uint32)(unsafe.Pointer(p)) + case _DT_VERSYM: + info.versym = (*[vdsoVerSymSize]uint16)(unsafe.Pointer(p)) + case _DT_VERDEF: + info.verdef = (*elfVerdef)(unsafe.Pointer(p)) + } + } + + if info.symstrings == nil || info.symtab == nil || (hash == nil && gnuhash == nil) { + return // Failed + } + + if info.verdef == nil { + info.versym = nil + } + + if gnuhash != nil { + // Parse the GNU hash table header. + nbucket := gnuhash[0] + info.symOff = gnuhash[1] + bloomSize := gnuhash[2] + info.bucket = gnuhash[4+bloomSize*uint32(vdsoBloomSizeScale):][:nbucket] + info.chain = gnuhash[4+bloomSize*uint32(vdsoBloomSizeScale)+nbucket:] + info.isGNUHash = true + } else { + // Parse the hash table header. + nbucket := hash[0] + nchain := hash[1] + info.bucket = hash[2 : 2+nbucket] + info.chain = hash[2+nbucket : 2+nbucket+nchain] + } + + // That's all we need. + info.valid = true +} + +func vdsoFindVersion(info *vdsoInfo, ver *vdsoVersionKey) int32 { + if !info.valid { + return 0 + } + + def := info.verdef + for { + if def.vd_flags&_VER_FLG_BASE == 0 { + aux := (*elfVerdaux)(add(unsafe.Pointer(def), uintptr(def.vd_aux))) + if def.vd_hash == ver.verHash && ver.version == gostringnocopy(&info.symstrings[aux.vda_name]) { + return int32(def.vd_ndx & 0x7fff) + } + } + + if def.vd_next == 0 { + break + } + def = (*elfVerdef)(add(unsafe.Pointer(def), uintptr(def.vd_next))) + } + + return -1 // cannot match any version +} + +func vdsoParseSymbols(info *vdsoInfo, version int32) { + if !info.valid { + return + } + + apply := func(symIndex uint32, k vdsoSymbolKey) bool { + sym := &info.symtab[symIndex] + typ := _ELF_ST_TYPE(sym.st_info) + bind := _ELF_ST_BIND(sym.st_info) + // On ppc64x, VDSO functions are of type _STT_NOTYPE. + if typ != _STT_FUNC && typ != _STT_NOTYPE || bind != _STB_GLOBAL && bind != _STB_WEAK || sym.st_shndx == _SHN_UNDEF { + return false + } + if k.name != gostringnocopy(&info.symstrings[sym.st_name]) { + return false + } + // Check symbol version. + if info.versym != nil && version != 0 && int32(info.versym[symIndex]&0x7fff) != version { + return false + } + + *k.ptr = info.loadOffset + uintptr(sym.st_value) + return true + } + + if !info.isGNUHash { + // Old-style DT_HASH table. + for _, k := range vdsoSymbolKeys { + for chain := info.bucket[k.symHash%uint32(len(info.bucket))]; chain != 0; chain = info.chain[chain] { + if apply(chain, k) { + break + } + } + } + return + } + + // New-style DT_GNU_HASH table. 
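	// In a GNU hash section, bucket[hash%nbucket] yields the first symbol
	// index of a chain and each chain entry stores the symbol's hash with
	// the low bit repurposed as an end-of-chain marker; comparing
	// hash|1 == k.gnuHash|1 below ignores that marker bit.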
+ for _, k := range vdsoSymbolKeys { + symIndex := info.bucket[k.gnuHash%uint32(len(info.bucket))] + if symIndex < info.symOff { + continue + } + for ; ; symIndex++ { + hash := info.chain[symIndex-info.symOff] + if hash|1 == k.gnuHash|1 { + // Found a hash match. + if apply(symIndex, k) { + break + } + } + if hash&1 != 0 { + // End of chain. + break + } + } + } +} + +func vdsoauxv(tag, val uintptr) { + switch tag { + case _AT_SYSINFO_EHDR: + if val == 0 { + // Something went wrong + return + } + var info vdsoInfo + // TODO(rsc): I don't understand why the compiler thinks info escapes + // when passed to the three functions below. + info1 := (*vdsoInfo)(noescape(unsafe.Pointer(&info))) + vdsoInitFromSysinfoEhdr(info1, (*elfEhdr)(unsafe.Pointer(val))) + vdsoParseSymbols(info1, vdsoFindVersion(info1, &vdsoLinuxVersion)) + } +} + +// vdsoMarker reports whether PC is on the VDSO page. +//go:nosplit +func inVDSOPage(pc uintptr) bool { + for _, k := range vdsoSymbolKeys { + if *k.ptr != 0 { + page := *k.ptr &^ (physPageSize - 1) + return pc >= page && pc < page+physPageSize + } + } + return false +} diff --git a/contrib/go/_std_1.18/src/runtime/vdso_linux_amd64.go b/contrib/go/_std_1.18/src/runtime/vdso_linux_amd64.go new file mode 100644 index 0000000000..4e9f748f4a --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/vdso_linux_amd64.go @@ -0,0 +1,23 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +const ( + // vdsoArrayMax is the byte-size of a maximally sized array on this architecture. + // See cmd/compile/internal/amd64/galign.go arch.MAXWIDTH initialization. + vdsoArrayMax = 1<<50 - 1 +) + +var vdsoLinuxVersion = vdsoVersionKey{"LINUX_2.6", 0x3ae75f6} + +var vdsoSymbolKeys = []vdsoSymbolKey{ + {"__vdso_gettimeofday", 0x315ca59, 0xb01bca00, &vdsoGettimeofdaySym}, + {"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &vdsoClockgettimeSym}, +} + +var ( + vdsoGettimeofdaySym uintptr + vdsoClockgettimeSym uintptr +) diff --git a/contrib/go/_std_1.18/src/runtime/write_err.go b/contrib/go/_std_1.18/src/runtime/write_err.go new file mode 100644 index 0000000000..81ae872e9c --- /dev/null +++ b/contrib/go/_std_1.18/src/runtime/write_err.go @@ -0,0 +1,13 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !android + +package runtime + +import "unsafe" + +func writeErr(b []byte) { + write(2, unsafe.Pointer(&b[0]), int32(len(b))) +} |
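The symHash and gnuHash values in vdsoSymbolKeys, and the verHash in vdsoLinuxVersion, are precomputed hashes of the corresponding names. The standalone sketch below (illustrative only, not part of the vendored sources) assumes they follow the standard SysV ELF and GNU hash algorithms and recomputes them for comparison:

package main

import "fmt"

// elfHash is the classic SysV ELF hash used by DT_HASH and verdef sections.
func elfHash(name string) uint32 {
	var h, g uint32
	for i := 0; i < len(name); i++ {
		h = h<<4 + uint32(name[i])
		g = h & 0xf0000000
		if g != 0 {
			h ^= g >> 24
		}
		h &^= g
	}
	return h
}

// gnuHash is the GNU hash used by DT_GNU_HASH sections (djb2: h = h*33 + c).
func gnuHash(name string) uint32 {
	h := uint32(5381)
	for i := 0; i < len(name); i++ {
		h = h*33 + uint32(name[i])
	}
	return h
}

func main() {
	for _, s := range []string{"__vdso_gettimeofday", "__vdso_clock_gettime", "LINUX_2.6"} {
		fmt.Printf("%-22s elf=%#x gnu=%#x\n", s, elfHash(s), gnuHash(s))
	}
}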