// Source path: root/vendor/github.com/segmentio/asm/bswap/swap64_amd64.s
// Source blob: 887f5b4a4cfc8a8c6d08b068a2d35f1e116230c2
// Code generated by command: go run swap64_asm.go -pkg bswap -out ../bswap/swap64_amd64.s -stubs ../bswap/swap64_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func swap64(b []byte)
// Requires: AVX, AVX2
//
// swap64 reverses, in place, the byte order of every 8-byte word in b.
//
// Frame: $0-24 (no locals; 24 bytes of arguments for the slice header).
//
// Precondition: len(b) must be a multiple of 8. The tail loop loads and
// stores a full quadword whenever AX < DX, so any other length would read
// and write bytes past the end of the slice. The caller must enforce this.
//
// Register roles:
//   AX              cursor: address of the next unprocessed byte
//   DX              end:    b_base + b_len
//   CX              scratch: address just past the current block, or the
//                   word being swapped in the tail loop
//   Y0              VPSHUFB control mask (shuffle_mask<>)
//   Y1-Y4           128-byte block in the AVX2 loop
//   BX, SI, DI, R8  32-byte block in the scalar loop
TEXT ·swap64(SB), NOSPLIT, $0-24
	MOVQ    b_base+0(FP), AX
	MOVQ    b_len+8(FP), CX
	MOVQ    AX, DX
	ADDQ    CX, DX

	// Take the vector path only when bit 8 of the cpu.X86 feature word is
	// set (presumably the AVX2 flag; confirm against the cpu package).
	// The package path must be spelled with U+2215 DIVISION SLASH for "/"
	// and U+00B7 MIDDLE DOT for "."; without the slashes the name refers to
	// a symbol that does not exist and the link fails.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop
	VMOVDQU shuffle_mask<>+0(SB), Y0

avx2_loop:
	// Swap 128 bytes per iteration while a whole block still fits.
	// JA (not JAE) so that a remainder of exactly 128 bytes is still
	// handled here instead of falling through to the slower loops.
	MOVQ    AX, CX
	ADDQ    $0x80, CX
	CMPQ    CX, DX
	JA      avx2_done
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU 64(AX), Y3
	VMOVDQU 96(AX), Y4
	VPSHUFB Y0, Y1, Y1
	VPSHUFB Y0, Y2, Y2
	VPSHUFB Y0, Y3, Y3
	VPSHUFB Y0, Y4, Y4
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	MOVQ    CX, AX
	JMP     avx2_loop

avx2_done:
	// Clear the upper halves of the YMM registers before leaving the AVX
	// path so later SSE code does not pay a state-transition penalty.
	// Reached only from the AVX2 path; CPUs without AVX jump straight to
	// x86_loop and never execute this instruction.
	VZEROUPPER

x86_loop:
	// Swap 32 bytes (four quadwords) per iteration with BSWAPQ.
	MOVQ   AX, CX
	ADDQ   $0x20, CX
	CMPQ   CX, DX
	JA     slow_loop
	MOVQ   (AX), BX
	MOVQ   8(AX), SI
	MOVQ   16(AX), DI
	MOVQ   24(AX), R8
	BSWAPQ BX
	BSWAPQ SI
	BSWAPQ DI
	BSWAPQ R8
	MOVQ   BX, (AX)
	MOVQ   SI, 8(AX)
	MOVQ   DI, 16(AX)
	MOVQ   R8, 24(AX)
	MOVQ   CX, AX
	JMP    x86_loop

slow_loop:
	// Tail: one quadword at a time until the cursor reaches the end.
	CMPQ   AX, DX
	JAE    done
	MOVQ   (AX), CX
	BSWAPQ CX
	MOVQ   CX, (AX)
	ADDQ   $0x08, AX
	JMP    slow_loop

done:
	RET

// shuffle_mask<> is the 32-byte VPSHUFB control vector that swap64 loads
// into Y0.
//
// DATA stores each 8-byte immediate in little-endian order, so the bytes in
// memory are 07 06 05 04 03 02 01 00 followed by 0f 0e 0d 0c 0b 0a 09 08.
// Each index selects the mirror-image byte of its own 8-byte word, which
// reverses that word.
//
// The 16-byte pattern appears twice because VPSHUFB on YMM registers
// shuffles each 128-bit lane independently, using lane-relative indices.
DATA shuffle_mask<>+0(SB)/8, $0x0001020304050607
DATA shuffle_mask<>+8(SB)/8, $0x08090a0b0c0d0e0f
DATA shuffle_mask<>+16(SB)/8, $0x0001020304050607
DATA shuffle_mask<>+24(SB)/8, $0x08090a0b0c0d0e0f
// File-local (<>) symbol: read-only, contains no pointers, 32 bytes long.
// GLOBL has to come after the DATA directives that initialize it.
GLOBL shuffle_mask<>(SB), RODATA|NOPTR, $32