Update golang to 1.22.1

2967d19c907adf59101a1f47b4208bd0b04a6186
author: hiddenpath <hiddenpath@yandex-team.com> 2024-04-02 23:50:23 +0300
committer: hiddenpath <hiddenpath@yandex-team.com> 2024-04-03 00:02:31 +0300
commit: 8923c6d2c438e0aeed2e06b8b0275e1864eeee33 (patch)
tree: 6b5e476699fc0be5091cb650654ef5f602c8afff /contrib/go/_std_1.22/src/crypto/sha1
parent: d18afd09df2a08cd023012593b46109b77713a6c (diff)
download: ydb-8923c6d2c438e0aeed2e06b8b0275e1864eeee33.tar.gz
7 files changed, 2077 insertions, 0 deletions
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1.go b/contrib/go/_std_1.22/src/crypto/sha1/sha1.go
new file mode 100644
index 0000000000..ac10fa1557
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1.go
@@ -0,0 +1,265 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sha1 implements the SHA-1 hash algorithm as defined in RFC 3174.
+//
+// SHA-1 is cryptographically broken and should not be used for secure
+// applications.
+package sha1
+
+import (
+	"crypto"
+	"crypto/internal/boring"
+	"encoding/binary"
+	"errors"
+	"hash"
+)
+
+func init() {
+	crypto.RegisterHash(crypto.SHA1, New) // register New as the constructor for crypto.SHA1
+}
+
+// The size of a SHA-1 checksum in bytes.
+const Size = 20
+
+// The blocksize of SHA-1 in bytes.
+const BlockSize = 64
+
+const (
+	chunk = 64
+	init0 = 0x67452301
+	init1 = 0xEFCDAB89
+	init2 = 0x98BADCFE
+	init3 = 0x10325476
+	init4 = 0xC3D2E1F0
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	h   [5]uint32   // running hash words; Reset seeds them with init0..init4
+	x   [chunk]byte // buffer for input that does not yet fill a whole block
+	nx  int         // number of bytes currently buffered in x
+	len uint64      // total number of bytes passed to Write
+}
+
+const (
+	magic         = "sha\x01"
+	marshaledSize = len(magic) + 5*4 + chunk + 8
+)
+
+// MarshalBinary encodes the digest state as magic, the five hash words,
+// the zero-padded block buffer and the total length, all big-endian.
+func (d *digest) MarshalBinary() ([]byte, error) {
+	out := make([]byte, 0, marshaledSize)
+	out = append(out, magic...)
+	for _, word := range d.h {
+		out = binary.BigEndian.AppendUint32(out, word)
+	}
+	out = append(out, d.x[:d.nx]...)
+	out = out[:len(out)+len(d.x)-d.nx] // extend over the unused capacity, which make left zeroed
+	out = binary.BigEndian.AppendUint64(out, d.len)
+	return out, nil
+}
+
+// UnmarshalBinary restores a digest state produced by MarshalBinary.
+// It rejects input with a wrong magic prefix or a wrong total size.
+func (d *digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("crypto/sha1: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("crypto/sha1: invalid hash state size")
+	}
+	rest := b[len(magic):]
+	for i := range d.h {
+		rest, d.h[i] = consumeUint32(rest)
+	}
+	rest = rest[copy(d.x[:], rest):]
+	_, d.len = consumeUint64(rest)
+	d.nx = int(d.len % chunk) // the buffered byte count follows from the total length
+	return nil
+}
+
+// consumeUint64 decodes a big-endian uint64 from the first 8 bytes of b and
+// returns the remaining bytes together with the decoded value.
+func consumeUint64(b []byte) ([]byte, uint64) {
+	x := binary.BigEndian.Uint64(b) // panics if len(b) < 8, like the manual bounds check it replaces
+	return b[8:], x
+}
+
+// consumeUint32 decodes a big-endian uint32 from the front of b and returns the rest of b with it.
+func consumeUint32(b []byte) ([]byte, uint32) {
+	x := binary.BigEndian.Uint32(b) // panics if len(b) < 4, like the manual bounds check it replaces
+	return b[4:], x
+}
+
+// Reset returns the digest to its initial state: the five hash words are
+// set to the init0..init4 constants and the buffered byte count and the
+// total length are cleared. The block buffer x itself is not cleared;
+// only nx says how much of it holds pending data.
+func (d *digest) Reset() {
+	d.h = [5]uint32{init0, init1, init2, init3, init4}
+	d.nx = 0
+	d.len = 0
+}
+
+// New returns a new hash.Hash computing the SHA-1 checksum. The Hash also
+// implements [encoding.BinaryMarshaler] and [encoding.BinaryUnmarshaler] to
+// marshal and unmarshal the internal state of the hash.
+func New() hash.Hash {
+	if boring.Enabled {
+		return boring.NewSHA1() // hand off to the boring package when it is enabled
+	}
+	d := new(digest)
+	d.Reset() // a zeroed digest is not a valid start state; Reset loads init0..init4
+	return d
+}
+
+func (d *digest) Size() int { return Size } // checksum length in bytes: the package constant Size (20)
+
+func (d *digest) BlockSize() int { return BlockSize } // block length in bytes: the package constant BlockSize (64)
+
+// Write absorbs p into the hash state; it always returns len(p) and a nil error.
+func (d *digest) Write(p []byte) (nn int, err error) {
+	boring.Unreachable()
+	nn = len(p)
+	d.len += uint64(nn)
+	if d.nx > 0 {
+		filled := copy(d.x[d.nx:], p) // top up the partially filled buffer first
+		d.nx += filled
+		p = p[filled:]
+		if d.nx == chunk {
+			block(d, d.x[:])
+			d.nx = 0
+		}
+	}
+	if whole := len(p) &^ (chunk - 1); whole > 0 {
+		block(d, p[:whole]) // hash every complete block straight from p
+		p = p[whole:]
+	}
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p) // keep the tail for the next call
+	}
+	return
+}
+
+// Sum appends the current checksum to in without changing the digest state.
+func (d *digest) Sum(in []byte) []byte {
+	boring.Unreachable()
+	snapshot := *d // finalize a copy so the caller can keep writing and summing
+	sum := snapshot.checkSum()
+	return append(in, sum[:]...)
+}
+
+// checkSum pads the message, appends its bit length and returns the final
+// hash words serialized big-endian. It writes to d, so callers that want to
+// keep hashing must call it on a copy.
+func (d *digest) checkSum() [Size]byte {
+	total := d.len
+
+	// Padding: one 0x80 byte, then zeros until 56 bytes mod 64.
+	var tmp [64 + 8]byte // padding + length buffer
+	tmp[0] = 0x80
+	var pad uint64
+	if rem := total % 64; rem < 56 {
+		pad = 56 - rem
+	} else {
+		pad = 64 + 56 - rem
+	}
+
+	// The last 8 bytes carry the message length in bits.
+	padlen := tmp[:pad+8]
+	binary.BigEndian.PutUint64(padlen[pad:], total<<3)
+	d.Write(padlen)
+
+	if d.nx != 0 {
+		panic("d.nx != 0")
+	}
+
+	var out [Size]byte
+	for i, word := range d.h {
+		binary.BigEndian.PutUint32(out[i*4:], word)
+	}
+
+	return out
+}
+
+// ConstantTimeSum computes the same result as [Sum] but in constant time.
+func (d *digest) ConstantTimeSum(in []byte) []byte {
+	snapshot := *d
+	sum := snapshot.constSum()
+	return append(in, sum[:]...)
+}
+
+func (d *digest) constSum() [Size]byte { // finalizes d with a fixed operation sequence: block is always called twice
+	var length [8]byte
+	l := d.len << 3 // message length in bits
+	for i := uint(0); i < 8; i++ {
+		length[i] = byte(l >> (56 - 8*i)) // most significant byte first
+	}
+
+	nx := byte(d.nx)
+	t := nx - 56                 // if nx < 56 then the MSB of t is one
+	mask1b := byte(int8(t) >> 7) // mask1b is 0xFF iff one block is enough
+
+	separator := byte(0x80) // gets reset to 0x00 once used
+	for i := byte(0); i < chunk; i++ {
+		mask := byte(int8(i-nx) >> 7) // 0xFF while i < nx, 0x00 after the end of data
+
+		// if we reached the end of the data, replace with 0x80 or 0x00
+		d.x[i] = (^mask & separator) | (mask & d.x[i])
+
+		// zero the separator once used
+		separator &= mask
+
+		if i >= 56 {
+			// we might have to write the length here if all fit in one block
+			d.x[i] |= mask1b & length[i-56]
+		}
+	}
+
+	// compress, and only keep the digest if all fit in one block
+	block(d, d.x[:])
+
+	var digest [Size]byte
+	for i, s := range d.h {
+		digest[i*4] = mask1b & byte(s>>24)
+		digest[i*4+1] = mask1b & byte(s>>16)
+		digest[i*4+2] = mask1b & byte(s>>8)
+		digest[i*4+3] = mask1b & byte(s)
+	}
+
+	for i := byte(0); i < chunk; i++ {
+		// second block, it's always past the end of data, might start with 0x80
+		if i < 56 {
+			d.x[i] = separator
+			separator = 0
+		} else {
+			d.x[i] = length[i-56]
+		}
+	}
+
+	// compress, and only keep the digest if we actually needed the second block
+	block(d, d.x[:])
+
+	for i, s := range d.h {
+		digest[i*4] |= ^mask1b & byte(s>>24)
+		digest[i*4+1] |= ^mask1b & byte(s>>16)
+		digest[i*4+2] |= ^mask1b & byte(s>>8)
+		digest[i*4+3] |= ^mask1b & byte(s)
+	}
+
+	return digest // bytes of the first result where mask1b is 0xFF, of the second where it is 0x00
+}
+
+// Sum returns the SHA-1 checksum of the data.
+func Sum(data []byte) [Size]byte {
+	if boring.Enabled {
+		return boring.SHA1(data) // hand off to the boring package when it is enabled
+	}
+	var d digest // one-shot digest held by value; no hash.Hash is created
+	d.Reset()
+	d.Write(data)
+	return d.checkSum()
+}
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1block.go b/contrib/go/_std_1.22/src/crypto/sha1/sha1block.go
new file mode 100644
index 0000000000..1c1a7c5f31
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1block.go
@@ -0,0 +1,83 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import (
+	"math/bits"
+)
+
+const (
+	_K0 = 0x5A827999 // added in rounds 0-19 of blockGeneric
+	_K1 = 0x6ED9EBA1 // added in rounds 20-39
+	_K2 = 0x8F1BBCDC // added in rounds 40-59
+	_K3 = 0xCA62C1D6 // added in rounds 60-79
+)
+
+// blockGeneric is a portable, pure Go version of the SHA-1 block step.
+// It's used by sha1block_generic.go and tests.
+func blockGeneric(dig *digest, p []byte) {
+	var w [16]uint32 // message schedule, kept as a 16-word ring indexed with &0xf
+
+	h0, h1, h2, h3, h4 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4]
+	for len(p) >= chunk { // one iteration per complete block; a trailing partial block is ignored
+		// Can interlace the computation of w with the
+		// rounds below if needed for speed.
+		for i := 0; i < 16; i++ {
+			j := i * 4
+			w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3]) // big-endian load
+		}
+
+		a, b, c, d, e := h0, h1, h2, h3, h4
+
+		// Each of the four 20-iteration rounds
+		// differs only in the computation of f and
+		// the choice of K (_K0, _K1, etc).
+		i := 0
+		for ; i < 16; i++ { // rounds 0-15 use the loaded words unchanged
+			f := b&c | (^b)&d
+			t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0
+			a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+		}
+		for ; i < 20; i++ { // from round 16 on, each word is derived in place before use
+			tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+			w[i&0xf] = bits.RotateLeft32(tmp, 1)
+
+			f := b&c | (^b)&d
+			t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0
+			a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+		}
+		for ; i < 40; i++ {
+			tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+			w[i&0xf] = bits.RotateLeft32(tmp, 1)
+			f := b ^ c ^ d
+			t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K1
+			a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+		}
+		for ; i < 60; i++ {
+			tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+			w[i&0xf] = bits.RotateLeft32(tmp, 1)
+			f := ((b | c) & d) | (b & c)
+			t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K2
+			a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+		}
+		for ; i < 80; i++ {
+			tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+			w[i&0xf] = bits.RotateLeft32(tmp, 1)
+			f := b ^ c ^ d
+			t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K3
+			a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+		}
+
+		h0 += a // fold this block's result into the running state
+		h1 += b
+		h2 += c
+		h3 += d
+		h4 += e
+
+		p = p[chunk:]
+	}
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4] = h0, h1, h2, h3, h4 // written back once, after all blocks
+}
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.go b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.go
new file mode 100644
index 0000000000..039813d7dc
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.go
@@ -0,0 +1,34 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import "internal/cpu"
+
+//go:noescape
+func blockAVX2(dig *digest, p []byte) // declaration only: no Go body in this file
+
+//go:noescape
+func blockAMD64(dig *digest, p []byte) // declaration only: body is TEXT ·blockAMD64 in sha1block_amd64.s
+
+var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2 // block takes the AVX2 path only when all three are reported
+
+func block(dig *digest, p []byte) {
+	if !useAVX2 || len(p) < 256 {
+		blockAMD64(dig, p)
+		return
+	}
+	// blockAVX2 computes SHA-1 for two blocks per iteration and also
+	// interleaves the precalculation for the next block, so it may read
+	// up to 192 bytes past the end of p. Adding checks inside blockAVX2
+	// would just turn it into a copy of blockAMD64, so blockAVX2 is given
+	// only a prefix of p and blockAMD64 is called directly for the rest,
+	// which is at least the final 128 bytes.
+	safeLen := len(p) - 128
+	if safeLen%128 != 0 {
+		safeLen -= 64
+	}
+	blockAVX2(dig, p[:safeLen])
+	blockAMD64(dig, p[safeLen:])
+}
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.s b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.s
new file mode 100644
index 0000000000..9bdf24cf49
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_amd64.s
@@ -0,0 +1,1500 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// AVX2 version by Intel, same algorithm as code in Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+// Authors:
+// Ilya Albrekht <ilya.albrekht@intel.com>
+// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+// Ronen Zohar <ronen.zohar@intel.com>
+// Chandramouli Narayanan <mouli@linux.intel.com>
+
+
+#include "textflag.h"
+
+// SHA-1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
+//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+#define LOAD(index) \
+	MOVL	(index*4)(SI), R10; \
+	BSWAPL	R10; \
+	MOVL	R10, (index*4)(SP)	// R10 = byte-swapped input word at SI, also stored in slot index on the stack
+
+#define SHUFFLE(index) \
+	MOVL	(((index)&0xf)*4)(SP), R10; \
+	XORL	(((index-3)&0xf)*4)(SP), R10; \
+	XORL	(((index-8)&0xf)*4)(SP), R10; \
+	XORL	(((index-14)&0xf)*4)(SP), R10; \
+	ROLL	$1, R10; \
+	MOVL	R10, (((index)&0xf)*4)(SP)	// R10 = new schedule word, written back over slot index&0xf
+
+#define FUNC1(a, b, c, d, e) \
+	MOVL	d, R9; \
+	XORL	c, R9; \
+	ANDL	b, R9; \
+	XORL	d, R9	// R9 = ((c ^ d) & b) ^ d, the same value as b&c | ^b&d
+
+#define FUNC2(a, b, c, d, e) \
+	MOVL	b, R9; \
+	XORL	c, R9; \
+	XORL	d, R9	// R9 = b ^ c ^ d
+
+#define FUNC3(a, b, c, d, e) \
+	MOVL	b, R8; \
+	ORL	c, R8; \
+	ANDL	d, R8; \
+	MOVL	b, R9; \
+	ANDL	c, R9; \
+	ORL	R8, R9	// R9 = ((b | c) & d) | (b & c); R8 is scratch
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+	ROLL	$30, b; \
+	ADDL	R9, e; \
+	MOVL	a, R8; \
+	ROLL	$5, R8; \
+	LEAL	const(e)(R10*1), e; \
+	ADDL	R8, e	// net effect: b = rol(b, 30); e += R9 + const + R10 + rol(a, 5); R8 is scratch
+
+#define ROUND1(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0xCA62C1D6)
+
+TEXT ·blockAMD64(SB),NOSPLIT,$64-32
+	MOVQ	dig+0(FP),	BP	// BP = dig
+	MOVQ	p_base+8(FP),	SI	// SI = start of p
+	MOVQ	p_len+16(FP),	DX
+	SHRQ	$6,		DX
+	SHLQ	$6,		DX	// DX = len(p) rounded down to a multiple of 64
+
+	LEAQ	(SI)(DX*1),	DI	// DI = end of the last complete block
+	MOVL	(0*4)(BP),	AX	// load the five state words; BP itself is overwritten by the last load
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+	MOVL	(4*4)(BP),	BP
+
+	CMPQ	SI,		DI
+	JEQ	end	// no complete block to process
+
+loop:
+	MOVL	AX,	R11	// keep the state from before this block for the additions after round 79
+	MOVL	BX,	R12
+	MOVL	CX,	R13
+	MOVL	DX,	R14
+	MOVL	BP,	R15
+
+	ROUND1(AX, BX, CX, DX, BP, 0)
+	ROUND1(BP, AX, BX, CX, DX, 1)
+	ROUND1(DX, BP, AX, BX, CX, 2)
+	ROUND1(CX, DX, BP, AX, BX, 3)
+	ROUND1(BX, CX, DX, BP, AX, 4)
+	ROUND1(AX, BX, CX, DX, BP, 5)
+	ROUND1(BP, AX, BX, CX, DX, 6)
+	ROUND1(DX, BP, AX, BX, CX, 7)
+	ROUND1(CX, DX, BP, AX, BX, 8)
+	ROUND1(BX, CX, DX, BP, AX, 9)
+	ROUND1(AX, BX, CX, DX, BP, 10)
+	ROUND1(BP, AX, BX, CX, DX, 11)
+	ROUND1(DX, BP, AX, BX, CX, 12)
+	ROUND1(CX, DX, BP, AX, BX, 13)
+	ROUND1(BX, CX, DX, BP, AX, 14)
+	ROUND1(AX, BX, CX, DX, BP, 15)
+
+	ROUND1x(BP, AX, BX, CX, DX, 16)
+	ROUND1x(DX, BP, AX, BX, CX, 17)
+	ROUND1x(CX, DX, BP, AX, BX, 18)
+	ROUND1x(BX, CX, DX, BP, AX, 19)
+
+	ROUND2(AX, BX, CX, DX, BP, 20)
+	ROUND2(BP, AX, BX, CX, DX, 21)
+	ROUND2(DX, BP, AX, BX, CX, 22)
+	ROUND2(CX, DX, BP, AX, BX, 23)
+	ROUND2(BX, CX, DX, BP, AX, 24)
+	ROUND2(AX, BX, CX, DX, BP, 25)
+	ROUND2(BP, AX, BX, CX, DX, 26)
+	ROUND2(DX, BP, AX, BX, CX, 27)
+	ROUND2(CX, DX, BP, AX, BX, 28)
+	ROUND2(BX, CX, DX, BP, AX, 29)
+	ROUND2(AX, BX, CX, DX, BP, 30)
+	ROUND2(BP, AX, BX, CX, DX, 31)
+	ROUND2(DX, BP, AX, BX, CX, 32)
+	ROUND2(CX, DX, BP, AX, BX, 33)
+	ROUND2(BX, CX, DX, BP, AX, 34)
+	ROUND2(AX, BX, CX, DX, BP, 35)
+	ROUND2(BP, AX, BX, CX, DX, 36)
+	ROUND2(DX, BP, AX, BX, CX, 37)
+	ROUND2(CX, DX, BP, AX, BX, 38)
+	ROUND2(BX, CX, DX, BP, AX, 39)
+
+	ROUND3(AX, BX, CX, DX, BP, 40)
+	ROUND3(BP, AX, BX, CX, DX, 41)
+	ROUND3(DX, BP, AX, BX, CX, 42)
+	ROUND3(CX, DX, BP, AX, BX, 43)
+	ROUND3(BX, CX, DX, BP, AX, 44)
+	ROUND3(AX, BX, CX, DX, BP, 45)
+	ROUND3(BP, AX, BX, CX, DX, 46)
+	ROUND3(DX, BP, AX, BX, CX, 47)
+	ROUND3(CX, DX, BP, AX, BX, 48)
+	ROUND3(BX, CX, DX, BP, AX, 49)
+	ROUND3(AX, BX, CX, DX, BP, 50)
+	ROUND3(BP, AX, BX, CX, DX, 51)
+	ROUND3(DX, BP, AX, BX, CX, 52)
+	ROUND3(CX, DX, BP, AX, BX, 53)
+	ROUND3(BX, CX, DX, BP, AX, 54)
+	ROUND3(AX, BX, CX, DX, BP, 55)
+	ROUND3(BP, AX, BX, CX, DX, 56)
+	ROUND3(DX, BP, AX, BX, CX, 57)
+	ROUND3(CX, DX, BP, AX, BX, 58)
+	ROUND3(BX, CX, DX, BP, AX, 59)
+
+	ROUND4(AX, BX, CX, DX, BP, 60)
+	ROUND4(BP, AX, BX, CX, DX, 61)
+	ROUND4(DX, BP, AX, BX, CX, 62)
+	ROUND4(CX, DX, BP, AX, BX, 63)
+	ROUND4(BX, CX, DX, BP, AX, 64)
+	ROUND4(AX, BX, CX, DX, BP, 65)
+	ROUND4(BP, AX, BX, CX, DX, 66)
+	ROUND4(DX, BP, AX, BX, CX, 67)
+	ROUND4(CX, DX, BP, AX, BX, 68)
+	ROUND4(BX, CX, DX, BP, AX, 69)
+	ROUND4(AX, BX, CX, DX, BP, 70)
+	ROUND4(BP, AX, BX, CX, DX, 71)
+	ROUND4(DX, BP, AX, BX, CX, 72)
+	ROUND4(CX, DX, BP, AX, BX, 73)
+	ROUND4(BX, CX, DX, BP, AX, 74)
+	ROUND4(AX, BX, CX, DX, BP, 75)
+	ROUND4(BP, AX, BX, CX, DX, 76)
+	ROUND4(DX, BP, AX, BX, CX, 77)
+	ROUND4(CX, DX, BP, AX, BX, 78)
+	ROUND4(BX, CX, DX, BP, AX, 79)
+
+	ADDL	R11, AX	// add the pre-block state saved at the top of the loop
+	ADDL	R12, BX
+	ADDL	R13, CX
+	ADDL	R14, DX
+	ADDL	R15, BP
+
+	ADDQ	$64, SI	// advance to the next 64-byte block
+	CMPQ	SI, DI
+	JB	loop
+
+end:
+	MOVQ	dig+0(FP), DI	// reload dig (BP was reused for a state word) and store the state back
+	MOVL	AX, (0*4)(DI)
+	MOVL	BX, (1*4)(DI)
+	MOVL	CX, (2*4)(DI)
+	MOVL	DX, (3*4)(DI)
+	MOVL	BP, (4*4)(DI)
+	RET
+
+
+// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+// From http://software.intel.com/en-us/articles
+// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+// This implementation is 2x unrolled, and interleaves vector instructions,
+// used to precompute W, with scalar computation of current round
+// for optimal scheduling.
+
+// UPDATE_HASH adds the five words stored at (R9) into the given registers and stores each sum back to (R9).
+#define UPDATE_HASH(A,TB,C,D,E) \
+	ADDL	(R9), A \
+	MOVL	A, (R9) \
+	ADDL	4(R9), TB \
+	MOVL	TB, 4(R9) \
+	ADDL	8(R9), C \
+	MOVL	C, 8(R9) \
+	ADDL	12(R9), D \
+	MOVL	D, 12(R9) \
+	ADDL	16(R9), E \
+	MOVL	E, 16(R9)
+
+
+
+// Helper macros for PRECALC, which does precomputations
+#define PRECALC_0(OFFSET) \
+	VMOVDQU   OFFSET(R10),X0
+
+#define PRECALC_1(OFFSET) \
+	VINSERTI128 $1, OFFSET(R13), Y0, Y0
+
+#define PRECALC_2(YREG) \
+	VPSHUFB Y10, Y0, YREG
+
+#define PRECALC_4(YREG,K_OFFSET) \
+	VPADDD K_OFFSET(R8), YREG, Y0
+
+#define PRECALC_7(OFFSET) \
+	VMOVDQU Y0, (OFFSET*2)(R14)
+
+
+// Message scheduling pre-compute for rounds 0-15
+// R13 is a pointer to even 64-byte block
+// R10 is a pointer to odd 64-byte block
+// R14 is a pointer to temp buffer
+// X0/Y0 is used as temp register
+// YREG is clobbered as part of computation
+// OFFSET chooses 16 byte chunk within a block
+// R8 is a pointer to constants block
+// K_OFFSET chooses K constants relevant to this round
+// Y10 holds swap mask (the VPSHUFB control operand in PRECALC_2)
+#define PRECALC_00_15(OFFSET,YREG) \
+	PRECALC_0(OFFSET) \
+	PRECALC_1(OFFSET) \
+	PRECALC_2(YREG) \
+	PRECALC_4(YREG,0x0) \
+	PRECALC_7(OFFSET)
+
+
+// Helper macros for PRECALC_16_31
+#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
+	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
+	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
+
+#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
+	VPXOR  REG_SUB_8, REG, REG \
+	VPXOR  REG_SUB_16, Y0, Y0
+
+#define PRECALC_18(REG) \
+	VPXOR Y0, REG, REG \
+	VPSLLDQ $12, REG, Y9
+
+#define PRECALC_19(REG) \
+	VPSLLD $1, REG, Y0 \
+	VPSRLD $31, REG, REG
+
+#define PRECALC_20(REG) \
+	VPOR REG, Y0, Y0 \
+	VPSLLD $2, Y9,  REG
+
+#define PRECALC_21(REG) \
+	VPSRLD $30, Y9, Y9 \
+	VPXOR REG, Y0, Y0
+
+#define PRECALC_23(REG,K_OFFSET,OFFSET) \
+	VPXOR Y9, Y0, REG \
+	VPADDD K_OFFSET(R8), REG, Y0 \
+	VMOVDQU Y0, (OFFSET)(R14)
+
+// Message scheduling pre-compute for rounds 16-31
+// calculating last 32 w[i] values in 8 XMM registers
+// pre-calculate K+w[i] values and store to mem
+// for later load by ALU add instruction.
+// "brute force" vectorization for rounds 16-31 only
+// due to w[i]->w[i-3] dependency.
+// overwrites REG; the four REG_SUB* input ymm registers are only read
+// uses Y0 and Y9 as temp registers
+// As always, R8 is a pointer to constants block
+// and R14 is a pointer to temp buffer
+#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
+	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
+	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
+	PRECALC_18(REG) \
+	PRECALC_19(REG) \
+	PRECALC_20(REG) \
+	PRECALC_21(REG) \
+	PRECALC_23(REG,K_OFFSET,OFFSET)
+
+
+// Helper macros for PRECALC_32_79
+#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
+	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
+
+#define PRECALC_33(REG_SUB_28,REG) \
+	VPXOR REG_SUB_28, REG, REG
+
+#define PRECALC_34(REG_SUB_16) \
+	VPXOR REG_SUB_16, Y0, Y0
+
+#define PRECALC_35(REG) \
+	VPXOR Y0, REG, REG
+
+#define PRECALC_36(REG) \
+	VPSLLD $2, REG, Y0
+
+#define PRECALC_37(REG) \
+	VPSRLD $30, REG, REG \
+	VPOR REG, Y0, REG
+
+#define PRECALC_39(REG,K_OFFSET,OFFSET) \
+	VPADDD K_OFFSET(R8), REG, Y0 \
+	VMOVDQU Y0, (OFFSET)(R14)
+
+// Message scheduling pre-compute for rounds 32-79
+// In SHA-1 specification we have:
+// w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+// Which is the same as:
+// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+// This allows for more efficient vectorization,
+// since w[i]->w[i-3] dependency is broken
+#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
+	PRECALC_32(REG_SUB_8,REG_SUB_4) \
+	PRECALC_33(REG_SUB_28,REG) \
+	PRECALC_34(REG_SUB_16) \
+	PRECALC_35(REG) \
+	PRECALC_36(REG) \
+	PRECALC_37(REG) \
+	PRECALC_39(REG,K_OFFSET,OFFSET)
+
+#define PRECALC \
+	PRECALC_00_15(0,Y15) \
+	PRECALC_00_15(0x10,Y14) \
+	PRECALC_00_15(0x20,Y13) \
+	PRECALC_00_15(0x30,Y12) \
+	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
+	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
+	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
+	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
+	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
+	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
+	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
+	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
+	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
+	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
+	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
+	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
+	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
+	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
+	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
+	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
+
+// Macros calculating individual rounds have general form
+// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
+// CALC_ROUND_{PRE,POST} macros follow
+
+#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
+	ADDL OFFSET(R15),REG_E \
+	ANDNL REG_C,REG_A,BP \
+	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
+	RORXL $0x1b, REG_A, R12 \
+	RORXL $2, REG_A, REG_B         // for next round
+
+// Calculate F for the next round
+#define CALC_F1_POST(REG_A,REG_B,REG_E) \
+	ANDL REG_B,REG_A \             // b&c
+	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
+	LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5
+
+
+// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
+#define CALC_0 \
+	MOVL SI, BX \ // Precalculating first round
+	RORXL $2, SI, SI \
+	ANDNL AX, BX, BP \
+	ANDL DI, BX \
+	XORL BP, BX \
+	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
+	PRECALC_0(0x80) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_1 \
+	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
+	PRECALC_1(0x80) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_2 \
+	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
+	PRECALC_2(Y15) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_3 \
+	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_4 \
+	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
+	PRECALC_4(Y15,0x0) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_5 \
+	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_6 \
+	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_7 \
+	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
+	PRECALC_7(0x0) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_8 \
+	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
+	PRECALC_0(0x90) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_9 \
+	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
+	PRECALC_1(0x90) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_10 \
+	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
+	PRECALC_2(Y14) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_11 \
+	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_12 \
+	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
+	PRECALC_4(Y14,0x0) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_13 \
+	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_14 \
+	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_15 \
+	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
+	PRECALC_7(0x10) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_16 \
+	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
+	PRECALC_0(0xa0) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_17 \
+	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
+	PRECALC_1(0xa0) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_18 \
+	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
+	PRECALC_2(Y13) \
+	CALC_F1_POST(CX,SI,DX)
+
+
+#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
+	ADDL OFFSET(R15),REG_E \
+	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
+	RORXL $0x1b, REG_A, R12 \
+	RORXL $2, REG_A, REG_B         // for next round
+
+#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
+	XORL REG_B, REG_A \
+	ADDL R12, REG_E \
+	XORL REG_C, REG_A
+
+#define CALC_19 \
+	CALC_F2_PRE(0x8c,DX,CX,AX) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_20 \
+	CALC_F2_PRE(0xa0,AX,DX,DI) \
+	PRECALC_4(Y13,0x0) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_21 \
+	CALC_F2_PRE(0xa4,DI,AX,SI) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_22 \
+	CALC_F2_PRE(0xa8,SI,DI,BX) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_23 \
+	CALC_F2_PRE(0xac,BX,SI,CX) \
+	PRECALC_7(0x20) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_24 \
+	CALC_F2_PRE(0xc0,CX,BX,DX) \
+	PRECALC_0(0xb0) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_25 \
+	CALC_F2_PRE(0xc4,DX,CX,AX) \
+	PRECALC_1(0xb0) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_26 \
+	CALC_F2_PRE(0xc8,AX,DX,DI) \
+	PRECALC_2(Y12) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_27 \
+	CALC_F2_PRE(0xcc,DI,AX,SI) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_28 \
+	CALC_F2_PRE(0xe0,SI,DI,BX) \
+	PRECALC_4(Y12,0x0) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_29 \
+	CALC_F2_PRE(0xe4,BX,SI,CX) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_30 \
+	CALC_F2_PRE(0xe8,CX,BX,DX) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_31 \
+	CALC_F2_PRE(0xec,DX,CX,AX) \
+	PRECALC_7(0x30) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_32 \
+	CALC_F2_PRE(0x100,AX,DX,DI) \
+	PRECALC_16(Y15,Y14,Y12,Y8) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_33 \
+	CALC_F2_PRE(0x104,DI,AX,SI) \
+	PRECALC_17(Y15,Y13,Y8) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_34 \
+	CALC_F2_PRE(0x108,SI,DI,BX) \
+	PRECALC_18(Y8) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_35 \
+	CALC_F2_PRE(0x10c,BX,SI,CX) \
+	PRECALC_19(Y8) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_36 \
+	CALC_F2_PRE(0x120,CX,BX,DX) \
+	PRECALC_20(Y8) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_37 \
+	CALC_F2_PRE(0x124,DX,CX,AX) \
+	PRECALC_21(Y8) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_38 \
+	CALC_F2_PRE(0x128,AX,DX,DI) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+
+#define CALC_F3_PRE(OFFSET,REG_E) \
+	ADDL OFFSET(R15),REG_E
+
+#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
+	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
+	MOVL REG_B, BP \
+	ORL  REG_A, BP \
+	RORXL $0x1b, REG_A, R12 \
+	RORXL $2, REG_A, REG_TB \
+	ANDL REG_C, BP \		// Calculate F for the next round
+	ANDL REG_B, REG_A \
+	ORL  BP, REG_A \
+	ADDL R12, REG_E
+
+#define CALC_39 \
+	CALC_F3_PRE(0x12c,SI) \
+	PRECALC_23(Y8,0x0,0x80) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_40 \
+	CALC_F3_PRE(0x140,BX) \
+	PRECALC_16(Y14,Y13,Y8,Y7) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_41 \
+	CALC_F3_PRE(0x144,CX) \
+	PRECALC_17(Y14,Y12,Y7) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_42 \
+	CALC_F3_PRE(0x148,DX) \
+	PRECALC_18(Y7) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_43 \
+	CALC_F3_PRE(0x14c,AX) \
+	PRECALC_19(Y7) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_44 \
+	CALC_F3_PRE(0x160,DI) \
+	PRECALC_20(Y7) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_45 \
+	CALC_F3_PRE(0x164,SI) \
+	PRECALC_21(Y7) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_46 \
+	CALC_F3_PRE(0x168,BX) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_47 \
+	CALC_F3_PRE(0x16c,CX) \
+	VPXOR Y9, Y0, Y7 \
+	VPADDD 0x20(R8), Y7, Y0 \
+	VMOVDQU Y0, 0xa0(R14) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_48 \
+	CALC_F3_PRE(0x180,DX) \
+	PRECALC_16(Y13,Y12,Y7,Y5) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_49 \
+	CALC_F3_PRE(0x184,AX) \
+	PRECALC_17(Y13,Y8,Y5) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_50 \
+	CALC_F3_PRE(0x188,DI) \
+	PRECALC_18(Y5) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_51 \
+	CALC_F3_PRE(0x18c,SI) \
+	PRECALC_19(Y5) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_52 \
+	CALC_F3_PRE(0x1a0,BX) \
+	PRECALC_20(Y5) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_53 \
+	CALC_F3_PRE(0x1a4,CX) \
+	PRECALC_21(Y5) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_54 \
+	CALC_F3_PRE(0x1a8,DX) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_55 \
+	CALC_F3_PRE(0x1ac,AX) \
+	PRECALC_23(Y5,0x20,0xc0) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_56 \
+	CALC_F3_PRE(0x1c0,DI) \
+	PRECALC_16(Y12,Y8,Y5,Y3) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_57 \
+	CALC_F3_PRE(0x1c4,SI) \
+	PRECALC_17(Y12,Y7,Y3) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_58 \
+	CALC_F3_PRE(0x1c8,BX) \
+	PRECALC_18(Y3) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_59 \
+	CALC_F2_PRE(0x1cc,BX,SI,CX) \
+	PRECALC_19(Y3) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_60 \
+	CALC_F2_PRE(0x1e0,CX,BX,DX) \
+	PRECALC_20(Y3) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_61 \
+	CALC_F2_PRE(0x1e4,DX,CX,AX) \
+	PRECALC_21(Y3) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_62 \
+	CALC_F2_PRE(0x1e8,AX,DX,DI) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_63 \
+	CALC_F2_PRE(0x1ec,DI,AX,SI) \
+	PRECALC_23(Y3,0x20,0xe0) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_64 \
+	CALC_F2_PRE(0x200,SI,DI,BX) \
+	PRECALC_32(Y5,Y3) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_65 \
+	CALC_F2_PRE(0x204,BX,SI,CX) \
+	PRECALC_33(Y14,Y15) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_66 \
+	CALC_F2_PRE(0x208,CX,BX,DX) \
+	PRECALC_34(Y8) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_67 \
+	CALC_F2_PRE(0x20c,DX,CX,AX) \
+	PRECALC_35(Y15) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_68 \
+	CALC_F2_PRE(0x220,AX,DX,DI) \
+	PRECALC_36(Y15) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_69 \
+	CALC_F2_PRE(0x224,DI,AX,SI) \
+	PRECALC_37(Y15) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_70 \
+	CALC_F2_PRE(0x228,SI,DI,BX) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_71 \
+	CALC_F2_PRE(0x22c,BX,SI,CX) \
+	PRECALC_39(Y15,0x20,0x100) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_72 \
+	CALC_F2_PRE(0x240,CX,BX,DX) \
+	PRECALC_32(Y3,Y15) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_73 \
+	CALC_F2_PRE(0x244,DX,CX,AX) \
+	PRECALC_33(Y13,Y14) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_74 \
+	CALC_F2_PRE(0x248,AX,DX,DI) \
+	PRECALC_34(Y7) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_75 \
+	CALC_F2_PRE(0x24c,DI,AX,SI) \
+	PRECALC_35(Y14) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_76 \
+	CALC_F2_PRE(0x260,SI,DI,BX) \
+	PRECALC_36(Y14) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_77 \
+	CALC_F2_PRE(0x264,BX,SI,CX) \
+	PRECALC_37(Y14) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_78 \
+	CALC_F2_PRE(0x268,CX,BX,DX) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_79 \
+	ADDL 0x26c(R15), AX \
+	LEAL (AX)(CX*1), AX \
+	RORXL $0x1b, DX, R12 \
+	PRECALC_39(Y14,0x20,0x120) \
+	ADDL R12, AX
+
+// CALC_80 through CALC_98 are the first round macros CALC expands after its
+// first UPDATE_HASH (and the MOVL DX, CX that follows it). They use the
+// CALC_F1_* helpers, and their offsets walk the upper 16 bytes of each
+// 0x20-byte row (0x10, 0x14, 0x18, 0x1c, 0x30, ...), where CALC_62..CALC_79
+// walk the lower 16 bytes.
+// CALC_86 and CALC_94 carry no PRECALC_* step.
+//
+// CALC_80 has an extra prologue: it copies CX to DX, rotates CX right by 2,
+// and then forms DX = (DX AND BX) XOR (NOT DX AND SI), using BP as scratch.
+// Similar to CALC_0
+#define CALC_80 \
+	MOVL CX, DX \
+	RORXL $2, CX, CX \
+	ANDNL SI, DX, BP \
+	ANDL BX, DX \
+	XORL BP, DX \
+	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
+	PRECALC_32(Y15,Y14) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_81 \
+	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
+	PRECALC_33(Y12,Y13) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_82 \
+	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
+	PRECALC_34(Y5) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_83 \
+	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
+	PRECALC_35(Y13) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_84 \
+	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
+	PRECALC_36(Y13) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_85 \
+	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
+	PRECALC_37(Y13) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_86 \
+	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_87 \
+	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
+	PRECALC_39(Y13,0x40,0x140) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_88 \
+	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
+	PRECALC_32(Y14,Y13) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_89 \
+	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
+	PRECALC_33(Y8,Y12) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_90 \
+	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
+	PRECALC_34(Y3) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_91 \
+	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
+	PRECALC_35(Y12) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_92 \
+	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
+	PRECALC_36(Y12) \
+	CALC_F1_POST(AX,CX,DI)
+
+#define CALC_93 \
+	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
+	PRECALC_37(Y12) \
+	CALC_F1_POST(DI,DX,SI)
+
+#define CALC_94 \
+	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
+	CALC_F1_POST(SI,AX,BX)
+
+#define CALC_95 \
+	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
+	PRECALC_39(Y12,0x40,0x160) \
+	CALC_F1_POST(BX,DI,CX)
+
+#define CALC_96 \
+	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
+	PRECALC_32(Y13,Y12) \
+	CALC_F1_POST(CX,SI,DX)
+
+#define CALC_97 \
+	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
+	PRECALC_33(Y7,Y8) \
+	CALC_F1_POST(DX,BX,AX)
+
+#define CALC_98 \
+	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
+	PRECALC_34(Y15) \
+	CALC_F1_POST(AX,CX,DI)
+
+// CALC_99 through CALC_118 continue the rounds that follow CALC's first
+// UPDATE_HASH, now through the CALC_F2_* helpers. The switch from CALC_F1_*
+// happens at CALC_99, i.e. one macro before the 20th of this run (CALC_80 is
+// the first). Offsets keep walking the upper 16 bytes of each 0x20-byte row
+// (0x9c, 0xb0, 0xb4, ...).
+// CALC_102, CALC_110 and CALC_118 carry no PRECALC_* step.
+#define CALC_99 \
+	CALC_F2_PRE(0x9c,DI,AX,SI) \
+	PRECALC_35(Y8) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_100 \
+	CALC_F2_PRE(0xb0,SI,DI,BX) \
+	PRECALC_36(Y8) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_101 \
+	CALC_F2_PRE(0xb4,BX,SI,CX) \
+	PRECALC_37(Y8) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_102 \
+	CALC_F2_PRE(0xb8,CX,BX,DX) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_103 \
+	CALC_F2_PRE(0xbc,DX,CX,AX) \
+	PRECALC_39(Y8,0x40,0x180) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_104 \
+	CALC_F2_PRE(0xd0,AX,DX,DI) \
+	PRECALC_32(Y12,Y8) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_105 \
+	CALC_F2_PRE(0xd4,DI,AX,SI) \
+	PRECALC_33(Y5,Y7) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_106 \
+	CALC_F2_PRE(0xd8,SI,DI,BX) \
+	PRECALC_34(Y14) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_107 \
+	CALC_F2_PRE(0xdc,BX,SI,CX) \
+	PRECALC_35(Y7) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_108 \
+	CALC_F2_PRE(0xf0,CX,BX,DX) \
+	PRECALC_36(Y7) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_109 \
+	CALC_F2_PRE(0xf4,DX,CX,AX) \
+	PRECALC_37(Y7) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_110 \
+	CALC_F2_PRE(0xf8,AX,DX,DI) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_111 \
+	CALC_F2_PRE(0xfc,DI,AX,SI) \
+	PRECALC_39(Y7,0x40,0x1a0) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_112 \
+	CALC_F2_PRE(0x110,SI,DI,BX) \
+	PRECALC_32(Y8,Y7) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_113 \
+	CALC_F2_PRE(0x114,BX,SI,CX) \
+	PRECALC_33(Y3,Y5) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_114 \
+	CALC_F2_PRE(0x118,CX,BX,DX) \
+	PRECALC_34(Y13) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_115 \
+	CALC_F2_PRE(0x11c,DX,CX,AX) \
+	PRECALC_35(Y5) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_116 \
+	CALC_F2_PRE(0x130,AX,DX,DI) \
+	PRECALC_36(Y5) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_117 \
+	CALC_F2_PRE(0x134,DI,AX,SI) \
+	PRECALC_37(Y5) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_118 \
+	CALC_F2_PRE(0x138,SI,DI,BX) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+// CALC_119 through CALC_138 use the CALC_F3_* helpers. CALC_F3_PRE takes only
+// the offset and one register, while CALC_F3_POST takes five registers that
+// rotate from one macro to the next. Offsets keep walking the upper 16 bytes
+// of each 0x20-byte row (0x13c, 0x150, 0x154, ...).
+// CALC_126 and CALC_134 carry no PRECALC_* step.
+#define CALC_119 \
+	CALC_F3_PRE(0x13c,CX) \
+	PRECALC_39(Y5,0x40,0x1c0) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_120 \
+	CALC_F3_PRE(0x150,DX) \
+	PRECALC_32(Y7,Y5) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_121 \
+	CALC_F3_PRE(0x154,AX) \
+	PRECALC_33(Y15,Y3) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_122 \
+	CALC_F3_PRE(0x158,DI) \
+	PRECALC_34(Y12) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_123 \
+	CALC_F3_PRE(0x15c,SI) \
+	PRECALC_35(Y3) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_124 \
+	CALC_F3_PRE(0x170,BX) \
+	PRECALC_36(Y3) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_125 \
+	CALC_F3_PRE(0x174,CX) \
+	PRECALC_37(Y3) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_126 \
+	CALC_F3_PRE(0x178,DX) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_127 \
+	CALC_F3_PRE(0x17c,AX) \
+	PRECALC_39(Y3,0x60,0x1e0) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_128 \
+	CALC_F3_PRE(0x190,DI) \
+	PRECALC_32(Y5,Y3) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_129 \
+	CALC_F3_PRE(0x194,SI) \
+	PRECALC_33(Y14,Y15) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_130 \
+	CALC_F3_PRE(0x198,BX) \
+	PRECALC_34(Y8) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_131 \
+	CALC_F3_PRE(0x19c,CX) \
+	PRECALC_35(Y15) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_132 \
+	CALC_F3_PRE(0x1b0,DX) \
+	PRECALC_36(Y15) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_133 \
+	CALC_F3_PRE(0x1b4,AX) \
+	PRECALC_37(Y15) \
+	CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_134 \
+	CALC_F3_PRE(0x1b8,DI) \
+	CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_135 \
+	CALC_F3_PRE(0x1bc,SI) \
+	PRECALC_39(Y15,0x60,0x200) \
+	CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_136 \
+	CALC_F3_PRE(0x1d0,BX) \
+	PRECALC_32(Y3,Y15) \
+	CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_137 \
+	CALC_F3_PRE(0x1d4,CX) \
+	PRECALC_33(Y13,Y14) \
+	CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_138 \
+	CALC_F3_PRE(0x1d8,DX) \
+	PRECALC_34(Y7) \
+	CALC_F3_POST(CX,SI,DI,DX,BX)
+
+// CALC_139 through CALC_159 return to the CALC_F2_* helpers and are the last
+// round macros CALC expands before its second UPDATE_HASH. Offsets keep
+// walking the upper 16 bytes of each 0x20-byte row (0x1dc, 0x1f0, ...) up to
+// 0x27c in CALC_159.
+// CALC_142, CALC_150 and CALC_158 carry no PRECALC_* step.
+#define CALC_139 \
+	CALC_F2_PRE(0x1dc,DX,CX,AX) \
+	PRECALC_35(Y14) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_140 \
+	CALC_F2_PRE(0x1f0,AX,DX,DI) \
+	PRECALC_36(Y14) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_141 \
+	CALC_F2_PRE(0x1f4,DI,AX,SI) \
+	PRECALC_37(Y14) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_142 \
+	CALC_F2_PRE(0x1f8,SI,DI,BX) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_143 \
+	CALC_F2_PRE(0x1fc,BX,SI,CX) \
+	PRECALC_39(Y14,0x60,0x220) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_144 \
+	CALC_F2_PRE(0x210,CX,BX,DX) \
+	PRECALC_32(Y15,Y14) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_145 \
+	CALC_F2_PRE(0x214,DX,CX,AX) \
+	PRECALC_33(Y12,Y13) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_146 \
+	CALC_F2_PRE(0x218,AX,DX,DI) \
+	PRECALC_34(Y5) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_147 \
+	CALC_F2_PRE(0x21c,DI,AX,SI) \
+	PRECALC_35(Y13) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_148 \
+	CALC_F2_PRE(0x230,SI,DI,BX) \
+	PRECALC_36(Y13) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_149 \
+	CALC_F2_PRE(0x234,BX,SI,CX) \
+	PRECALC_37(Y13) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_150 \
+	CALC_F2_PRE(0x238,CX,BX,DX) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_151 \
+	CALC_F2_PRE(0x23c,DX,CX,AX) \
+	PRECALC_39(Y13,0x60,0x240) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_152 \
+	CALC_F2_PRE(0x250,AX,DX,DI) \
+	PRECALC_32(Y14,Y13) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_153 \
+	CALC_F2_PRE(0x254,DI,AX,SI) \
+	PRECALC_33(Y8,Y12) \
+	CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_154 \
+	CALC_F2_PRE(0x258,SI,DI,BX) \
+	PRECALC_34(Y3) \
+	CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_155 \
+	CALC_F2_PRE(0x25c,BX,SI,CX) \
+	PRECALC_35(Y12) \
+	CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_156 \
+	CALC_F2_PRE(0x270,CX,BX,DX) \
+	PRECALC_36(Y12) \
+	CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_157 \
+	CALC_F2_PRE(0x274,DX,CX,AX) \
+	PRECALC_37(Y12) \
+	CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_158 \
+	CALC_F2_PRE(0x278,AX,DX,DI) \
+	CALC_F2_POST(AX,CX,BX,DI)
+
+// CALC_159 is written out instruction by instruction instead of going through
+// CALC_F2_PRE/CALC_F2_POST. In CALC it is immediately followed by
+// UPDATE_HASH(SI,DI,DX,CX,BX).
+#define CALC_159 \
+	ADDL 0x27c(R15),SI \
+	LEAL (SI)(AX*1), SI \
+	RORXL $0x1b, DI, R12 \
+	PRECALC_39(Y12,0x60,0x260) \
+	ADDL R12, SI
+
+
+
+// CALC is the whole body of blockAVX2, including its RET.
+//
+// On entry it expects R8 = address of K_XMM_AR, R9 = pointer to the five hash
+// words, R10 and R13 = input pointers, R11 = end bound (all set up by
+// blockAVX2). It loads the five 32-bit words at (R9) into CX, SI, DI, AX, DX
+// and points R14 at SP and R15 at SP+(2*4*80+32); the two registers are
+// swapped after PRECALC and again at the end of every pass.
+//
+// Each pass through "loop" expands 160 round macros with an UPDATE_HASH after
+// CALC_79 and another after CALC_159:
+//   - loop:     return (VZEROUPPER, RET) as soon as R10 == R8.
+//   - CALC_59:  afterwards R10 is advanced by 128 and replaced with R8 when
+//               it is no longer below R11.
+//   - CALC_79:  afterwards UPDATE_HASH; if R10 == R8 jump back to loop, which
+//               then returns, skipping CALC_80..CALC_159.
+//   - CALC_139: afterwards R13 is advanced by 128 and compared with R11; the
+//               conditional move again targets R10.
+//   - CALC_159: afterwards UPDATE_HASH, the working registers are moved back
+//               into the arrangement CALC_0 expects, and R14/R15 are swapped.
+#define CALC \
+	MOVL	(R9), CX \
+	MOVL	4(R9), SI \
+	MOVL	8(R9), DI \
+	MOVL	12(R9), AX \
+	MOVL	16(R9), DX \
+	MOVQ    SP, R14 \
+	LEAQ    (2*4*80+32)(SP), R15 \
+	PRECALC \ // Precalc WK for first 2 blocks
+	XCHGQ   R15, R14 \
+loop: \  // this loop is unrolled
+	CMPQ    R10, R8 \ // we use R8 value (set below) as a signal of a last block
+	JNE	begin \
+	VZEROUPPER \
+	RET \
+begin: \
+	CALC_0 \
+	CALC_1 \
+	CALC_2 \
+	CALC_3 \
+	CALC_4 \
+	CALC_5 \
+	CALC_6 \
+	CALC_7 \
+	CALC_8 \
+	CALC_9 \
+	CALC_10 \
+	CALC_11 \
+	CALC_12 \
+	CALC_13 \
+	CALC_14 \
+	CALC_15 \
+	CALC_16 \
+	CALC_17 \
+	CALC_18 \
+	CALC_19 \
+	CALC_20 \
+	CALC_21 \
+	CALC_22 \
+	CALC_23 \
+	CALC_24 \
+	CALC_25 \
+	CALC_26 \
+	CALC_27 \
+	CALC_28 \
+	CALC_29 \
+	CALC_30 \
+	CALC_31 \
+	CALC_32 \
+	CALC_33 \
+	CALC_34 \
+	CALC_35 \
+	CALC_36 \
+	CALC_37 \
+	CALC_38 \
+	CALC_39 \
+	CALC_40 \
+	CALC_41 \
+	CALC_42 \
+	CALC_43 \
+	CALC_44 \
+	CALC_45 \
+	CALC_46 \
+	CALC_47 \
+	CALC_48 \
+	CALC_49 \
+	CALC_50 \
+	CALC_51 \
+	CALC_52 \
+	CALC_53 \
+	CALC_54 \
+	CALC_55 \
+	CALC_56 \
+	CALC_57 \
+	CALC_58 \
+	CALC_59 \
+	ADDQ $128, R10 \ // move to next even-64-byte block
+	CMPQ R10, R11 \ // is current block the last one?
+	CMOVQCC R8, R10 \ // signal the last iteration smartly
+	CALC_60 \
+	CALC_61 \
+	CALC_62 \
+	CALC_63 \
+	CALC_64 \
+	CALC_65 \
+	CALC_66 \
+	CALC_67 \
+	CALC_68 \
+	CALC_69 \
+	CALC_70 \
+	CALC_71 \
+	CALC_72 \
+	CALC_73 \
+	CALC_74 \
+	CALC_75 \
+	CALC_76 \
+	CALC_77 \
+	CALC_78 \
+	CALC_79 \
+	UPDATE_HASH(AX,DX,BX,SI,DI) \
+	CMPQ R10, R8 \ // is current block the last one?
+	JE loop\
+	MOVL DX, CX \
+	CALC_80 \
+	CALC_81 \
+	CALC_82 \
+	CALC_83 \
+	CALC_84 \
+	CALC_85 \
+	CALC_86 \
+	CALC_87 \
+	CALC_88 \
+	CALC_89 \
+	CALC_90 \
+	CALC_91 \
+	CALC_92 \
+	CALC_93 \
+	CALC_94 \
+	CALC_95 \
+	CALC_96 \
+	CALC_97 \
+	CALC_98 \
+	CALC_99 \
+	CALC_100 \
+	CALC_101 \
+	CALC_102 \
+	CALC_103 \
+	CALC_104 \
+	CALC_105 \
+	CALC_106 \
+	CALC_107 \
+	CALC_108 \
+	CALC_109 \
+	CALC_110 \
+	CALC_111 \
+	CALC_112 \
+	CALC_113 \
+	CALC_114 \
+	CALC_115 \
+	CALC_116 \
+	CALC_117 \
+	CALC_118 \
+	CALC_119 \
+	CALC_120 \
+	CALC_121 \
+	CALC_122 \
+	CALC_123 \
+	CALC_124 \
+	CALC_125 \
+	CALC_126 \
+	CALC_127 \
+	CALC_128 \
+	CALC_129 \
+	CALC_130 \
+	CALC_131 \
+	CALC_132 \
+	CALC_133 \
+	CALC_134 \
+	CALC_135 \
+	CALC_136 \
+	CALC_137 \
+	CALC_138 \
+	CALC_139 \
+	ADDQ $128, R13 \ //move to next even-64-byte block
+	CMPQ R13, R11 \ //is current block the last one?
+	CMOVQCC R8, R10 \ // the signal is written to R10, the register tested at loop
+	CALC_140 \
+	CALC_141 \
+	CALC_142 \
+	CALC_143 \
+	CALC_144 \
+	CALC_145 \
+	CALC_146 \
+	CALC_147 \
+	CALC_148 \
+	CALC_149 \
+	CALC_150 \
+	CALC_151 \
+	CALC_152 \
+	CALC_153 \
+	CALC_154 \
+	CALC_155 \
+	CALC_156 \
+	CALC_157 \
+	CALC_158 \
+	CALC_159 \
+	UPDATE_HASH(SI,DI,DX,CX,BX) \
+	MOVL	SI, R12 \ //Reset state for  AVX2 reg permutation
+	MOVL	DI, SI \
+	MOVL	DX, DI \
+	MOVL	BX, DX \
+	MOVL	CX, AX \
+	MOVL	R12, CX \
+	XCHGQ   R15, R14 \
+	JMP     loop
+
+
+
+// blockAVX2 sets up the registers the CALC macro expects and then expands it.
+// Arguments occupy 32 bytes: dig at 0(FP), then the base and length of p at
+// 8(FP) and 16(FP). The frame is 1408 bytes.
+//
+// Register state handed to CALC:
+//   R8  = address of K_XMM_AR (also the value CALC compares R10 against)
+//   R9  = dig
+//   R10 = p_base
+//   R13 = p_base + 64, or R8 when that is not below R11
+//   R11 = p_base + (p_len rounded down to a multiple of 64) + 64
+//   Y10 = BSWAP_SHUFB_CTL
+// NOTE(review): nothing here checks that p holds at least two full blocks;
+// presumably the Go caller guarantees a minimum length - confirm there.
+TEXT ·blockAVX2(SB),$1408-32
+
+	MOVQ	dig+0(FP),	DI
+	MOVQ	p_base+8(FP),	SI
+	MOVQ	p_len+16(FP),	DX
+	// Clear the low six bits: round the length down to a multiple of 64.
+	SHRQ	$6,		DX
+	SHLQ	$6,		DX
+
+	MOVQ	$K_XMM_AR<>(SB), R8
+
+	MOVQ	DI, R9
+	MOVQ	SI, R10
+	LEAQ	64(SI), R13
+
+	// R11 = p_base + rounded length + 64.
+	ADDQ	SI, DX
+	ADDQ	$64, DX
+	MOVQ	DX, R11
+
+	// Replace R13 with R8 when R13 is not below R11 (unsigned).
+	CMPQ	R13, R11
+	CMOVQCC	R8, R13
+
+	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
+
+	CALC // RET is inside macros
+
+// K_XMM_AR is a 128-byte read-only table of four 32-bit constants
+// (0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 - the SHA-1 round constants
+// of RFC 3174), each repeated eight times so that every constant fills one
+// 32-byte row at 0x00, 0x20, 0x40 and 0x60.
+// blockAVX2 loads the table's address into R8; CALC additionally uses that
+// address as the value of R10 that ends the loop.
+DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
+GLOBL K_XMM_AR<>(SB),RODATA,$128
+
+// BSWAP_SHUFB_CTL is a 32-byte read-only control vector that blockAVX2 loads
+// into Y10. The same 16-byte pattern is stored twice, once per 128-bit half.
+// Each 4-byte entry is stored little-endian, so in memory the index bytes
+// read 3,2,1,0, 7,6,5,4, ...: used as a byte-shuffle control this reverses
+// the bytes of every 32-bit word.
+// NOTE(review): the instruction that consumes Y10 lives in the PRECALC
+// macros, outside this chunk - confirm it is a byte shuffle.
+DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
+DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
+DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
+DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
+DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
+DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
+GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.go b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.go
new file mode 100644
index 0000000000..08d3df0000
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.go
@@ -0,0 +1,26 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import "internal/cpu"
+
+// k holds the four SHA-1 round constants (RFC 3174). block passes it to
+// sha1block, which loads all four values as one vector.
+var k = []uint32{
+	0x5A827999,
+	0x6ED9EBA1,
+	0x8F1BBCDC,
+	0xCA62C1D6,
+}
+
+// sha1block is implemented in sha1block_arm64.s. It reads the five hash words
+// from h, processes p in 64-byte blocks using the four constants in k, and
+// writes the updated five words back to h.
+//
+// The assembly loop subtracts 64 from the length after each block and stops
+// only when the remainder is exactly zero, so len(p) must be a non-zero
+// multiple of 64; h must hold at least five words and k at least four.
+//
+//go:noescape
+func sha1block(h []uint32, p []byte, k []uint32)
+
+// block feeds p into dig. When cpu.ARM64.HasSHA1 is set it hands the hash
+// words of dig, together with the constant table k, to the assembly routine
+// sha1block; otherwise it defers to blockGeneric.
+func block(dig *digest, p []byte) {
+	if cpu.ARM64.HasSHA1 {
+		sha1block(dig.h[:], p, k)
+		return
+	}
+	blockGeneric(dig, p)
+}
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.s b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.s
new file mode 100644
index 0000000000..d56838464d
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/sha1block_arm64.s
@@ -0,0 +1,152 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// The three HASHUPDATE* macros differ only in their first instruction
+// (SHA1C, SHA1P or SHA1M). Each one:
+//   1. runs the hash-update instruction with V16 (set by the VADD that
+//      precedes every use in sha1block) and V1 as inputs, updating V2;
+//   2. runs SHA1H on V3 to produce the next V1;
+//   3. copies the updated V2 into V3 for the following step.
+// Go assembler operand order puts the destination last.
+
+// HASHUPDATECHOOSE: hash update using SHA1C.
+#define HASHUPDATECHOOSE \
+	SHA1C	V16.S4, V1, V2 \
+	SHA1H	V3, V1 \
+	VMOV	V2.B16, V3.B16
+
+// HASHUPDATEPARITY: hash update using SHA1P.
+#define HASHUPDATEPARITY \
+	SHA1P	V16.S4, V1, V2 \
+	SHA1H	V3, V1 \
+	VMOV	V2.B16, V3.B16
+
+// HASHUPDATEMAJ: hash update using SHA1M.
+#define HASHUPDATEMAJ \
+	SHA1M	V16.S4, V1, V2 \
+	SHA1H	V3, V1 \
+	VMOV	V2.B16, V3.B16
+
+// func sha1block(h []uint32, p []byte, k []uint32)
+//
+// Register use, as set up below:
+//   R0 = &h[0], R1 = read cursor in p, R2 = &k[0], R3 = bytes of p left
+//   V0 = h[0..3], V20 (lane 0, loaded/stored through F20) = h[4]
+//   V4-V7 = the current 64-byte block
+//   V19 = the four constants from k; V17/V18 = one constant in every lane
+//   V16 = block words + constant, consumed by the HASHUPDATE* macros
+//   V1, V2, V3 = working values threaded through the HASHUPDATE* macros
+//
+// Every block runs twenty HASHUPDATE* steps: five CHOOSE, five PARITY, five
+// MAJ and five PARITY, one constant from k per group of five. Per the Arm
+// definitions of SHA1C/SHA1P/SHA1M each step covers four rounds.
+//
+// The loop exits only when R3 reaches exactly zero after subtracting 64, so
+// p_len must be a non-zero multiple of 64.
+TEXT ·sha1block(SB),NOSPLIT,$0
+	MOVD	h_base+0(FP), R0                             // hash value first address
+	MOVD	p_base+24(FP), R1                            // message first address
+	MOVD	k_base+48(FP), R2                            // k constants first address
+	MOVD	p_len+32(FP), R3                             // message length
+	VLD1.P	16(R0), [V0.S4]                              // V0 = h[0..3], R0 += 16
+	FMOVS	(R0), F20                                    // F20 = h[4]
+	SUB	$16, R0, R0                                  // restore R0 to &h[0]
+
+blockloop:
+
+	VLD1.P	16(R1), [V4.B16]                             // load 64 bytes of message, advancing R1
+	VLD1.P	16(R1), [V5.B16]
+	VLD1.P	16(R1), [V6.B16]
+	VLD1.P	16(R1), [V7.B16]
+	VLD1	(R2), [V19.S4]                               // load the four constants k[0..3]
+	VMOV	V0.B16, V2.B16                               // working copy of h[0..3]
+	VMOV	V20.S[0], V1                                 // working copy of h[4]
+	VMOV	V2.B16, V3.B16
+	VDUP	V19.S[0], V17.S4                             // V17 = k[0] in every lane
+	VREV32	V4.B16, V4.B16                               // reverse the bytes of each 32-bit message word
+	VREV32	V5.B16, V5.B16
+	VREV32	V6.B16, V6.B16
+	VREV32	V7.B16, V7.B16
+
+
+	// Steps 1-5: HASHUPDATECHOOSE with V17 = k[0]. V18 is preloaded with k[1].
+	// SHA1SU0/SHA1SU1 update the message words in V4-V7 in place for later steps.
+	VDUP	V19.S[1], V18.S4
+	VADD	V17.S4, V4.S4, V16.S4
+	SHA1SU0	V6.S4, V5.S4, V4.S4
+	HASHUPDATECHOOSE
+	SHA1SU1	V7.S4, V4.S4
+
+	VADD	V17.S4, V5.S4, V16.S4
+	SHA1SU0	V7.S4, V6.S4, V5.S4
+	HASHUPDATECHOOSE
+	SHA1SU1	V4.S4, V5.S4
+	VADD	V17.S4, V6.S4, V16.S4
+	SHA1SU0	V4.S4, V7.S4, V6.S4
+	HASHUPDATECHOOSE
+	SHA1SU1	V5.S4, V6.S4
+
+	VADD	V17.S4, V7.S4, V16.S4
+	SHA1SU0	V5.S4, V4.S4, V7.S4
+	HASHUPDATECHOOSE
+	SHA1SU1	V6.S4, V7.S4
+
+	VADD	V17.S4, V4.S4, V16.S4
+	SHA1SU0	V6.S4, V5.S4, V4.S4
+	HASHUPDATECHOOSE
+	SHA1SU1	V7.S4, V4.S4
+
+	// Steps 6-10: HASHUPDATEPARITY with V18 = k[1]. V17 is reloaded with k[2].
+	VDUP	V19.S[2], V17.S4
+	VADD	V18.S4, V5.S4, V16.S4
+	SHA1SU0	V7.S4, V6.S4, V5.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V4.S4, V5.S4
+
+	VADD	V18.S4, V6.S4, V16.S4
+	SHA1SU0	V4.S4, V7.S4, V6.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V5.S4, V6.S4
+
+	VADD	V18.S4, V7.S4, V16.S4
+	SHA1SU0	V5.S4, V4.S4, V7.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V6.S4, V7.S4
+
+	VADD	V18.S4, V4.S4, V16.S4
+	SHA1SU0	V6.S4, V5.S4, V4.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V7.S4, V4.S4
+
+	VADD	V18.S4, V5.S4, V16.S4
+	SHA1SU0	V7.S4, V6.S4, V5.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V4.S4, V5.S4
+
+	// Steps 11-15: HASHUPDATEMAJ with V17 = k[2]. V18 is reloaded with k[3].
+	VDUP	V19.S[3], V18.S4
+	VADD	V17.S4, V6.S4, V16.S4
+	SHA1SU0	V4.S4, V7.S4, V6.S4
+	HASHUPDATEMAJ
+	SHA1SU1	V5.S4, V6.S4
+
+	VADD	V17.S4, V7.S4, V16.S4
+	SHA1SU0	V5.S4, V4.S4, V7.S4
+	HASHUPDATEMAJ
+	SHA1SU1	V6.S4, V7.S4
+
+	VADD	V17.S4, V4.S4, V16.S4
+	SHA1SU0	V6.S4, V5.S4, V4.S4
+	HASHUPDATEMAJ
+	SHA1SU1	V7.S4, V4.S4
+
+	VADD	V17.S4, V5.S4, V16.S4
+	SHA1SU0	V7.S4, V6.S4, V5.S4
+	HASHUPDATEMAJ
+	SHA1SU1	V4.S4, V5.S4
+
+	VADD	V17.S4, V6.S4, V16.S4
+	SHA1SU0	V4.S4, V7.S4, V6.S4
+	HASHUPDATEMAJ
+	SHA1SU1	V5.S4, V6.S4
+
+	// Steps 16-20: HASHUPDATEPARITY with V18 = k[3]. Only the first of these
+	// still updates the message words; the last four just consume V4-V7.
+	VADD	V18.S4, V7.S4, V16.S4
+	SHA1SU0	V5.S4, V4.S4, V7.S4
+	HASHUPDATEPARITY
+	SHA1SU1	V6.S4, V7.S4
+
+	VADD	V18.S4, V4.S4, V16.S4
+	HASHUPDATEPARITY
+
+	VADD	V18.S4, V5.S4, V16.S4
+	HASHUPDATEPARITY
+
+	VADD	V18.S4, V6.S4, V16.S4
+	HASHUPDATEPARITY
+
+	VADD	V18.S4, V7.S4, V16.S4
+	HASHUPDATEPARITY
+
+	SUB	$64, R3, R3                                  // 64 bytes consumed; CBNZ below loops while any remain
+	VADD	V2.S4, V0.S4, V0.S4                          // fold the working values into h[0..3]
+	VADD	V1.S4, V20.S4, V20.S4                        // lane 0 folds the working value into h[4]
+	CBNZ	R3, blockloop
+
+sha1ret:
+
+	VST1.P	[V0.S4], 16(R0)                               // store hash value H(dcba)
+	FMOVS	F20, (R0)                                     // store hash value H(e)
+	RET
diff --git a/contrib/go/_std_1.22/src/crypto/sha1/ya.make b/contrib/go/_std_1.22/src/crypto/sha1/ya.make
new file mode 100644
index 0000000000..594fb1759a
--- /dev/null
+++ b/contrib/go/_std_1.22/src/crypto/sha1/ya.make
@@ -0,0 +1,17 @@
+GO_LIBRARY()
+# Each condition below lists all four RACE / CGO_ENABLED combinations for
+# every OS/arch pair, so the choice of sources depends on OS and architecture
+# only: arm64 sources for Darwin arm64 and Linux aarch64, amd64 sources for
+# Darwin, Linux and Windows x86_64. There is no ELSE branch, so any other
+# target gets no sources from this file.
+IF (OS_DARWIN AND ARCH_ARM64 AND RACE AND CGO_ENABLED OR OS_DARWIN AND ARCH_ARM64 AND RACE AND NOT CGO_ENABLED OR OS_DARWIN AND ARCH_ARM64 AND NOT RACE AND CGO_ENABLED OR OS_DARWIN AND ARCH_ARM64 AND NOT RACE AND NOT CGO_ENABLED OR OS_LINUX AND ARCH_AARCH64 AND RACE AND CGO_ENABLED OR OS_LINUX AND ARCH_AARCH64 AND RACE AND NOT CGO_ENABLED OR OS_LINUX AND ARCH_AARCH64 AND NOT RACE AND CGO_ENABLED OR OS_LINUX AND ARCH_AARCH64 AND NOT RACE AND NOT CGO_ENABLED)
+    SRCS(
+        sha1.go
+        sha1block.go
+        sha1block_arm64.go
+        sha1block_arm64.s
+    )
+ELSEIF (OS_DARWIN AND ARCH_X86_64 AND RACE AND CGO_ENABLED OR OS_DARWIN AND ARCH_X86_64 AND RACE AND NOT CGO_ENABLED OR OS_DARWIN AND ARCH_X86_64 AND NOT RACE AND CGO_ENABLED OR OS_DARWIN AND ARCH_X86_64 AND NOT RACE AND NOT CGO_ENABLED OR OS_LINUX AND ARCH_X86_64 AND RACE AND CGO_ENABLED OR OS_LINUX AND ARCH_X86_64 AND RACE AND NOT CGO_ENABLED OR OS_LINUX AND ARCH_X86_64 AND NOT RACE AND CGO_ENABLED OR OS_LINUX AND ARCH_X86_64 AND NOT RACE AND NOT CGO_ENABLED OR OS_WINDOWS AND ARCH_X86_64 AND RACE AND CGO_ENABLED OR OS_WINDOWS AND ARCH_X86_64 AND RACE AND NOT CGO_ENABLED OR OS_WINDOWS AND ARCH_X86_64 AND NOT RACE AND CGO_ENABLED OR OS_WINDOWS AND ARCH_X86_64 AND NOT RACE AND NOT CGO_ENABLED)
+    SRCS(
+        sha1.go
+        sha1block.go
+        sha1block_amd64.go
+        sha1block_amd64.s
+    )
+ENDIF()
+END()
author	hiddenpath <hiddenpath@yandex-team.com>	2024-04-02 23:50:23 +0300
committer	hiddenpath <hiddenpath@yandex-team.com>	2024-04-03 00:02:31 +0300
commit	8923c6d2c438e0aeed2e06b8b0275e1864eeee33 (patch)
tree	6b5e476699fc0be5091cb650654ef5f602c8afff /contrib/go/_std_1.22/src/crypto/sha1
parent	d18afd09df2a08cd023012593b46109b77713a6c (diff)
download	ydb-8923c6d2c438e0aeed2e06b8b0275e1864eeee33.tar.gz