contrib/libs/asmlib/divfixedi64.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

%include "defs.asm"

;*************************  divfixedi64.asm  *********************************
; Author:           Agner Fog
; Date created:     2011-07-22
; Last modified:    2011-07-22
;
; Function prototypes:
; void setdivisori32(int buffer[2], int d);
; int dividefixedi32(const int buffer[2], int x);
; void setdivisoru32(uint32_t buffer[2], uint32_t d);
; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
;
; Description:
; Functions for fast repeated integer division by the same divisor, signed 
; and unsigned 32-bit integer versions. The divisor must be positive.
;
; The setdivisor functions calculate the reciprocal divisor and shift counts,
; the dividefixed functions do the division by multiplication and shift.
;
; The methods used are described by:
; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
;
; Mathematical formula, unsigned division:
; x = dividend
; d = divisor
; n = integer size, bits
; L = ceil(log2(d))
; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
; sh1 = min(L,1)
; sh2 = max(L-1,0)
; t = m*x >> n                    [high part of unsigned multiplication]
; x/d = (((x-t) >> sh1) + t) >> sh2
;
; Mathematical formula, signed division:
; x = dividend
; d = abs(divisor)
; n = integer size, bits
; L = ceil(log2(d))
; L = max(L,1)
; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
; sh1 = L-1
; q = x + (m*x >> n)              [high part of signed multiplication]
; q = (q >> sh1) - (x<0 ? -1 : 0)
; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
; x/d = q
;
; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
;******************************************************************************

%IFDEF  WINDOWS
%define par1   rcx                     ; function parameter 1
%define par2   edx                     ; function parameter 2
%define buf    r9                      ; copy of function parameter 1: buffer
%define rx     r8
%define rxd    r8d                     ; d or x
%ELSE   ; UNIX
%define par1   rdi                     ; function parameter 1
%define par2   esi                     ; function parameter 2
%define buf    rdi                     ; function parameter 1: buffer
%define rx     rsi
%define rxd    esi                     ; d or x
%ENDIF


section .text

; extern "C" void setdivisori32(int buffer[2], int d);
; 32 bit signed 

global setdivisori32: function
setdivisori32:
%IFDEF  WINDOWS
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ENDIF        
        dec     rxd                    ; rxd = r8d or esi
        mov     ecx, -1                ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
        bsr     ecx, rxd               ; floor(log2(d-1))
        inc     rxd
        js      H120                   ; d < 0. Generate error
        inc     ecx                    ; L = ceil(log2(d))        
        sub     ecx, 1                 ; shift count = L - 1
        adc     ecx, 0                 ; avoid negative shift count
        xor     eax, eax
        mov     edx, 1
        cmp     rxd, edx
        je      H110                   ; avoid overflow when d = 1
        shl     edx, cl
        div     rxd
H110:   inc     eax
        mov     [buf], eax             ; multiplier
        mov     [buf+4], ecx           ; shift count
        ret
        
H120:   ; d <= 0 not supported. Generate error
        mov     edx, 1
        div     edx                    ; will overflow
        ud2

        
; extern "C" int dividefixedi32(int buffer[2], int x);
global dividefixedi32: function
dividefixedi32:
%IFDEF  WINDOWS
        mov     eax, edx
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ELSE
        mov     eax, esi
%ENDIF        
        imul    dword [buf]            ; m
        lea     eax, [rdx+rx]          ; rx = r8 or rsi
        mov     ecx, [buf+4]           ; shift count
        sar     eax, cl
        sar     rxd, 31                ; sign(x)
        sub     eax, rxd
        ret


;extern "C" void setdivisoru32(int buffer[2], int d);
; 32 bit unsigned 

global setdivisoru32: function
setdivisoru32:
%IFDEF  WINDOWS
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ENDIF        
        dec     rxd                    ; rxd = r8d or esi
        mov     ecx, -1                ; value for bsr if r8d = 0
        bsr     ecx, rxd               ; floor(log2(d-1))
        inc     rxd
        inc     ecx                    ; L = ceil(log2(d))
        mov     edx, 1
        shl     rdx, cl                ; 2^L (64 bit shift because cl may be 32)
        sub     edx, rxd
        xor     eax, eax
        div     rxd
        inc     eax
        mov     [buf], eax             ; multiplier
        sub     ecx, 1
        setae   dl
        movzx   edx, dl                ; shift1
        seta    al
        neg     al
        and     al,cl
        movzx   eax, al                ; shift 2
        shl     eax, 8
        or      eax, edx
        mov     [buf+4], eax           ; shift 1 and shift 2
        ret
        
;extern "C" int dividefixedu32(int buffer[2], int x);
global dividefixedu32: function       ; unsigned
dividefixedu32:
%IFDEF  WINDOWS
        mov     eax, edx
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ELSE
        mov     eax, esi
%ENDIF        
        mul     dword [buf]            ; m
        sub     rxd, edx               ; x-t
        mov     ecx, [buf+4]           ; shift 1 and shift 2
        shr     rxd, cl
        lea     eax, [rx+rdx]
        shr     ecx, 8
        shr     eax, cl
        ret