Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@@ -0,0 +1,61 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __ashldi3(di_int input, int count);
// This routine has some extra memory traffic, loading the 64-bit input via two
// 32-bit loads, then immediately storing it back to the stack via a single 64-bit
// store. This is to avoid a write-small, read-large stall.
// However, if callers of this routine can be safely assumed to store the argument
// via a 64-bit store, this is unnecessary memory traffic, and should be avoided.
// It can be turned off by defining the TRUST_CALLERS_USE_64_BIT_STORES macro.
#ifdef __i386__
#ifdef __SSE2__
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__ashldi3)
	// SSE2 variant: assemble the 64-bit input in %xmm0, shift it left by
	// the count held in %xmm2, then split the result into EDX:EAX.
#ifdef TRUST_CALLERS_USE_64_BIT_STORES
	movq		4(%esp),	%xmm0	// xmm0 = input (single 64-bit load)
#else
	movd		8(%esp),	%xmm1	// xmm1 = high word of input
	movd		4(%esp),	%xmm0	// xmm0 = low word of input
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low
#endif
	movd		12(%esp),	%xmm2	// xmm2 = count
	psllq		%xmm2,		%xmm0	// xmm0 = input << count
	movd		%xmm0,		%eax	// eax = low word of result
	psrlq		$32,		%xmm0	// bring the high word down
	movd		%xmm0,		%edx	// edx = high word of result
	ret
END_COMPILERRT_FUNCTION(__ashldi3)
#else // Use GPRs instead of SSE2 instructions, if they aren't available.
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__ashldi3)
	// GPR variant: EDX:EAX = input, ECX = count; result is left in EDX:EAX.
	movl	4(%esp),	%eax	// eax = low word of input
	movl	8(%esp),	%edx	// edx = high word of input
	movl	12(%esp),	%ecx	// ecx = count
	testl	$0x20,		%ecx	// is bit 5 of count set (count >= 32)?
	jz	2f			// no: do the ordinary double-word shift
	// count >= 32: the low word moves into the high word.
	movl	%eax,		%edx	// high = low
	xorl	%eax,		%eax	// low = 0
	shll	%cl,		%edx	// high <<= (count - 32)
	ret
2:	shldl	%cl,	%eax,	%edx	// high = (high << count) | bits shifted out of low
	shll	%cl,		%eax	// low <<= count
	ret
END_COMPILERRT_FUNCTION(__ashldi3)
#endif // __SSE2__
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,72 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __ashrdi3(di_int input, int count);
#ifdef __i386__
#ifdef __SSE2__
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__ashrdi3)
	// SSE2 variant: perform a logical right shift in %xmm0 and, for negative
	// inputs only, OR in the run of sign bits that an arithmetic shift
	// would have produced. Result is returned in EDX:EAX.
#ifdef TRUST_CALLERS_USE_64_BIT_STORES
	movq		4(%esp),	%xmm0	// xmm0 = input (single 64-bit load)
#else
	movd		8(%esp),	%xmm1	// xmm1 = high word of input
	movd		4(%esp),	%xmm0	// xmm0 = low word of input
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low
#endif
	movd		12(%esp),	%xmm2	// xmm2 = count
	movl		8(%esp),	%eax	// eax = high word, kept for the sign test

	psrlq		%xmm2,		%xmm0	// xmm0 = input >> count (zero fill)

	testl		%eax,		%eax	// negative input?
	jns		1f			// no: the logical shift is already correct

	// Negative input: build (-1 << (64 - count)) and merge it in, because
	// there is no 64-bit arithmetic right shift on xmm registers.
	pcmpeqb		%xmm1,		%xmm1	// xmm1 = all ones
	psrlq		$58,		%xmm1	// xmm1 = 0x3f
	pandn		%xmm1,		%xmm2	// xmm2 = ~count & 0x3f = 63 - count
	pcmpeqb		%xmm1,		%xmm1	// xmm1 = all ones (-1)
	psubq		%xmm1,		%xmm2	// xmm2 = (63 - count) - (-1) = 64 - count
	psllq		%xmm2,		%xmm1	// xmm1 = -1 << (64 - count)
	por		%xmm1,		%xmm0	// fill the vacated top bits with ones

	// Hand the 64-bit result back in EDX:EAX.
1:	movd		%xmm0,		%eax	// eax = low word of result
	psrlq		$32,		%xmm0
	movd		%xmm0,		%edx	// edx = high word of result
	ret
END_COMPILERRT_FUNCTION(__ashrdi3)
#else // Use GPRs instead of SSE2 instructions, if they aren't available.
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__ashrdi3)
	// GPR variant: EDX:EAX = input, ECX = count; result is left in EDX:EAX.
	movl	4(%esp),	%eax	// eax = low word of input
	movl	8(%esp),	%edx	// edx = high word of input
	movl	12(%esp),	%ecx	// ecx = count
	testl	$0x20,		%ecx	// is bit 5 of count set (count >= 32)?
	jz	2f			// no: do the ordinary double-word shift
	// count >= 32: the high word moves into the low word.
	movl	%edx,		%eax	// low = high
	sarl	$31,		%edx	// high = sign fill (0 or -1)
	sarl	%cl,		%eax	// low >>= (count - 32), arithmetic
	ret
2:	shrdl	%cl,	%edx,	%eax	// low = (low >> count) | bits shifted out of high
	sarl	%cl,		%edx	// high >>= count, arithmetic
	ret
END_COMPILERRT_FUNCTION(__ashrdi3)
#endif // __SSE2__
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,34 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// _chkstk routine
// This routine is windows specific
// http://msdn.microsoft.com/en-us/library/ms648426.aspx
#ifdef __i386__
.text
.balign 4
// __chkstk_ms: read-probe the region of %eax bytes that lies directly below
// the caller's stack pointer, one probe per 0x1000-byte step.
//   In:  %eax = number of bytes to probe.
//   Out: nothing. %eax and %ecx are saved on entry and restored before
//        returning, and %esp is the same on return as on entry; only the
//        flags are modified.
DEFINE_COMPILERRT_FUNCTION(__chkstk_ms)
push %ecx
push %eax
cmp $0x1000,%eax
lea 12(%esp),%ecx // ecx = caller's esp (skips 2 saved registers + return address); lea leaves the cmp flags intact
jb 1f // fewer than 0x1000 bytes requested: only the final probe is needed
2:
sub $0x1000,%ecx // step down by 0x1000 bytes
test %ecx,(%ecx) // probe: read the word at the new address
sub $0x1000,%eax
cmp $0x1000,%eax
ja 2b // loop while more than 0x1000 bytes remain
1:
sub %eax,%ecx // step down by the remaining byte count
test %ecx,(%ecx) // final probe at the lowest address of the region
pop %eax
pop %ecx
ret
END_COMPILERRT_FUNCTION(__chkstk_ms)
#endif // __i386__

View File

@@ -0,0 +1,40 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
#ifdef __i386__
// _chkstk (_alloca) routine - probe stack between %esp and (%esp-%eax) in 4k increments,
// then decrement %esp by %eax. Preserves all registers except %esp and flags.
// This routine is windows specific
// http://msdn.microsoft.com/en-us/library/ms648426.aspx
.text
.balign 4
// In:  %eax = number of bytes to allocate.
// Out: %esp lowered by %eax relative to its value before the call; %eax and
//      %ecx hold their entry values again; flags are modified.
DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function
DEFINE_COMPILERRT_FUNCTION(__chkstk)
push %ecx
cmp $0x1000,%eax
lea 8(%esp),%ecx // esp before calling this routine -> ecx
jb 1f // fewer than 0x1000 bytes requested: only the final probe is needed
2:
sub $0x1000,%ecx // step down by 0x1000 bytes
test %ecx,(%ecx) // probe: read the word at the new address
sub $0x1000,%eax
cmp $0x1000,%eax
ja 2b // loop while more than 0x1000 bytes remain
1:
sub %eax,%ecx // ecx = new top of stack (caller's esp - requested size)
test %ecx,(%ecx) // final probe at the new top of stack
lea 4(%esp),%eax // load pointer to the return address into eax
mov %ecx,%esp // install the new top of stack pointer into esp
mov -4(%eax),%ecx // restore ecx
push (%eax) // push return address onto the stack
sub %esp,%eax // restore the original value in eax
ret
END_COMPILERRT_FUNCTION(__chkstk)
END_COMPILERRT_FUNCTION(_alloca)
#endif // __i386__

View File

@@ -0,0 +1,165 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __divdi3(di_int a, di_int b);
// result = a / b.
// both inputs and the output are 64-bit signed integers.
// This will do whatever the underlying hardware is set to do on division by zero.
// No other exceptions are generated, as the divide cannot overflow.
//
// This is targeted at 32-bit x86 *only*, as this can be done directly in hardware
// on x86_64. The performance goal is ~40 cycles per divide, which is faster than
// currently possible via simulation of integer divides on the x87 unit.
//
// Stephen Canon, December 2008
#ifdef __i386__
.text
.balign 4
// __divdi3: signed 64-bit division, quotient returned in EDX:EAX.
// On entry a is at 4(%esp) and b is at 12(%esp). Both stack slots are
// overwritten in place with abs(a) and abs(b). %esi holds the sign mask of
// the result (0 or -1) for the whole routine; %esi, %ebx and (on the paths
// that use it) %edi are saved and restored.
DEFINE_COMPILERRT_FUNCTION(__divdi3)
/* This is currently implemented by wrapping the unsigned divide up in an absolute
value, then restoring the correct sign at the end of the computation. This could
certainly be improved upon. */
pushl %esi
movl 20(%esp), %edx // high word of b
movl 16(%esp), %eax // low word of b
movl %edx, %ecx
sarl $31, %ecx // (b < 0) ? -1 : 0
xorl %ecx, %eax
xorl %ecx, %edx // EDX:EAX = (b < 0) ? not(b) : b
subl %ecx, %eax
sbbl %ecx, %edx // EDX:EAX = abs(b)
movl %edx, 20(%esp)
movl %eax, 16(%esp) // store abs(b) back to stack
movl %ecx, %esi // set aside sign of b
movl 12(%esp), %edx // high word of a
movl 8(%esp), %eax // low word of a
movl %edx, %ecx
sarl $31, %ecx // (a < 0) ? -1 : 0
xorl %ecx, %eax
xorl %ecx, %edx // EDX:EAX = (a < 0) ? not(a) : a
subl %ecx, %eax
sbbl %ecx, %edx // EDX:EAX = abs(a)
movl %edx, 12(%esp)
movl %eax, 8(%esp) // store abs(a) back to stack
xorl %ecx, %esi // sign of result = (sign of a) ^ (sign of b)
pushl %ebx
movl 24(%esp), %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
/* High word of b is known to be non-zero on this branch */
movl 20(%esp), %eax // Construct bhi, containing bits [1+i:32+i] of b
shrl %cl, %eax // Practically, this means that bhi is given by:
shrl %eax //
notl %ecx // bhi = (high word of b) << (31 - i) |
shll %cl, %ebx // (low word of b) >> (1 + i)
orl %eax, %ebx //
movl 16(%esp), %edx // Load the high and low words of a, and jump
movl 12(%esp), %eax // to [1] if the high word is larger than bhi
cmpl %ebx, %edx // to avoid overflowing the upcoming divide.
jae 1f
/* High word of a is less than (b >> (1 + i)) on this branch */
divl %ebx // eax <-- qs, edx <-- r such that ahi:alo = bs*qs + r
pushl %edi
notl %ecx
shrl %eax
shrl %cl, %eax // q = qs >> (1 + i)
movl %eax, %edi
mull 24(%esp) // q*blo
movl 16(%esp), %ebx
movl 20(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 28(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
sbbl $0, %edi // decrement q if remainder is negative
xorl %edx, %edx
movl %edi, %eax
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %edi // Restore callee-save registers
popl %ebx
popl %esi
retl // Return
1: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
subl %ebx, %edx // subtract bhi from ahi so that divide will not
divl %ebx // overflow, and find q and r such that
//
// ahi:alo = (1:q)*bhi + r
//
// Note that q is a number in (31-i).(1+i)
// fixed point.
pushl %edi
notl %ecx
shrl %eax
orl $0x80000000, %eax
shrl %cl, %eax // q = (1:qs) >> (1 + i)
movl %eax, %edi
mull 24(%esp) // q*blo
movl 16(%esp), %ebx
movl 20(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 28(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
sbbl $0, %edi // decrement q if remainder is negative
xorl %edx, %edx
movl %edi, %eax
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %edi // Restore callee-save registers
popl %ebx
popl %esi
retl // Return
9: /* High word of b is zero on this branch */
movl 16(%esp), %eax // Find qhi and rhi such that
movl 20(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
movl %eax, %ebx //
movl 12(%esp), %eax // Find qlo such that
divl %ecx //
movl %ebx, %edx // rhi:alo = qlo*b + rlo with 0 <= rlo < b
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %ebx // Restore callee-save registers
popl %esi
retl // Return
END_COMPILERRT_FUNCTION(__divdi3)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,42 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// double __floatdidf(di_int a);
#ifdef __i386__
// Constants used by __floatdidf, addressed relative to the local label 0
// whose run-time address is obtained in %eax (see REL_ADDR below).
CONST_SECTION
.balign 16
twop52:
.quad 0x4330000000000000 // 0x1.0p52 as an IEEE-754 double
.balign 16
twop32:
.quad 0x41f0000000000000 // 0x1.0p32 as an IEEE-754 double
// REL_ADDR(sym): address of sym expressed as an offset from label 0, with
// %eax holding the address of label 0.
#define REL_ADDR(_a) (_a)-0b(%eax)
.text
.balign 4
// __floatdidf: convert the signed 64-bit integer at 4(%esp) to double.
// The high word is converted as a signed value and scaled by 2^32; the low
// word is treated as unsigned by OR-ing it into the mantissa of 2^52.
// The result is written over the argument slot at 4(%esp) and reloaded onto
// the x87 stack before returning.
DEFINE_COMPILERRT_FUNCTION(__floatdidf)
cvtsi2sd 8(%esp), %xmm1 // (double)(signed high 32 bits of a)
movss 4(%esp), %xmm0 // low 32 bits of a
calll 0f // push the address of label 0 ...
0: popl %eax // ... and pop it into eax for REL_ADDR
mulsd REL_ADDR(twop32), %xmm1 // a_hi as a double (without rounding)
movsd REL_ADDR(twop52), %xmm2 // 0x1.0p52
subsd %xmm2, %xmm1 // a_hi - 0x1p52 (no rounding occurs)
orpd %xmm2, %xmm0 // 0x1p52 + a_lo (no rounding occurs)
addsd %xmm1, %xmm0 // a_hi + a_lo (round happens here)
movsd %xmm0, 4(%esp) // spill the result over the argument slot
fldl 4(%esp) // and load it onto the x87 stack
ret
END_COMPILERRT_FUNCTION(__floatdidf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,35 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// float __floatdisf(di_int a);
// This routine has some extra memory traffic, loading the 64-bit input via two
// 32-bit loads, then immediately storing it back to the stack via a single 64-bit
// store. This is to avoid a write-small, read-large stall.
// However, if callers of this routine can be safely assumed to store the argument
// via a 64-bit store, this is unnecessary memory traffic, and should be avoided.
// It can be turned off by defining the TRUST_CALLERS_USE_64_BIT_STORES macro.
#ifdef __i386__
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatdisf)
	// Convert the signed 64-bit integer at 4(%esp) to single precision using
	// the x87 unit; the value is left on the x87 stack.
#ifndef TRUST_CALLERS_USE_64_BIT_STORES
	// Re-store the argument with one 64-bit write so that the 64-bit
	// fildll below does not read data written by two 32-bit stores.
	movd		8(%esp),	%xmm1	// xmm1 = high word of a
	movd		4(%esp),	%xmm0	// xmm0 = low word of a
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low
	movq		%xmm0,		4(%esp)	// write a back as a single 64-bit store
#endif
	fildll		4(%esp)			// push a as a 64-bit signed integer
	fstps		4(%esp)			// pop it, rounded to single precision
	flds		4(%esp)			// push the rounded single-precision value
	ret
END_COMPILERRT_FUNCTION(__floatdisf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,33 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// long double __floatdixf(di_int a);
#ifdef __i386__
// This routine has some extra memory traffic, loading the 64-bit input via two
// 32-bit loads, then immediately storing it back to the stack via a single 64-bit
// store. This is to avoid a write-small, read-large stall.
// However, if callers of this routine can be safely assumed to store the argument
// via a 64-bit store, this is unnecessary memory traffic, and should be avoided.
// It can be turned off by defining the TRUST_CALLERS_USE_64_BIT_STORES macro.
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatdixf)
	// Load the signed 64-bit integer at 4(%esp) onto the x87 stack and
	// return it there without any narrowing store.
#ifndef TRUST_CALLERS_USE_64_BIT_STORES
	// Re-store the argument with one 64-bit write so that the 64-bit
	// fildll below does not read data written by two 32-bit stores.
	movd		8(%esp),	%xmm1	// xmm1 = high word of a
	movd		4(%esp),	%xmm0	// xmm0 = low word of a
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low
	movq		%xmm0,		4(%esp)	// write a back as a single 64-bit store
#endif
	fildll		4(%esp)			// push a as a 64-bit signed integer
	ret
END_COMPILERRT_FUNCTION(__floatdixf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,55 @@
//===-- floatundidf.S - Implement __floatundidf for i386 ------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements __floatundidf for the compiler_rt library.
//
//===----------------------------------------------------------------------===//
#include "../assembly.h"
// double __floatundidf(du_int a);
#ifdef __i386__
// Constants used by __floatundidf, addressed relative to the local label 0
// whose run-time address is obtained in %eax (see REL_ADDR below).
CONST_SECTION
.balign 16
twop52:
.quad 0x4330000000000000 // 0x1.0p52 as an IEEE-754 double
.balign 16
twop84_plus_twop52:
.quad 0x4530000000100000 // 0x1.0p84 + 0x1.0p52 as an IEEE-754 double
.balign 16
twop84:
.quad 0x4530000000000000 // 0x1.0p84 as an IEEE-754 double
// REL_ADDR(sym): address of sym expressed as an offset from label 0, with
// %eax holding the address of label 0.
#define REL_ADDR(_a) (_a)-0b(%eax)
.text
.balign 4
// __floatundidf: convert the unsigned 64-bit integer at 4(%esp) to double.
// Each 32-bit half is OR-ed into the mantissa of a power of two so that it
// becomes an exact double; the two doubles are then combined with a single
// rounding addition. The result is written over the argument slot at
// 4(%esp) and reloaded onto the x87 stack before returning.
DEFINE_COMPILERRT_FUNCTION(__floatundidf)
movss 8(%esp), %xmm1 // high 32 bits of a
movss 4(%esp), %xmm0 // low 32 bits of a
calll 0f // push the address of label 0 ...
0: popl %eax // ... and pop it into eax for REL_ADDR
orpd REL_ADDR(twop84), %xmm1 // 0x1p84 + a_hi (no rounding occurs)
subsd REL_ADDR(twop84_plus_twop52), %xmm1 // a_hi - 0x1p52 (no rounding occurs)
orpd REL_ADDR(twop52), %xmm0 // 0x1p52 + a_lo (no rounding occurs)
addsd %xmm1, %xmm0 // a_hi + a_lo (round happens here)
movsd %xmm0, 4(%esp) // spill the result over the argument slot
fldl 4(%esp) // and load it onto the x87 stack
ret
END_COMPILERRT_FUNCTION(__floatundidf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,108 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// float __floatundisf(du_int a);
// Note that there is a hardware instruction, fildll, that does most of what
// this function needs to do. However, because of our ia32 ABI, it will take
// a write-small read-large stall, so the software implementation here is
// actually several cycles faster.
// This is a branch-free implementation. A branchy implementation might be
// faster for the common case if you know something a priori about the input
// distribution.
/* branch-free x87 implementation - one cycle slower than without x87.
#ifdef __i386__
CONST_SECTION
.balign 8
.quad 0x43f0000000000000
twop64: .quad 0x0000000000000000
#define TWOp64 twop64-0b(%ecx,%eax,8)
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
movl 8(%esp), %eax
movd 8(%esp), %xmm1
movd 4(%esp), %xmm0
punpckldq %xmm1, %xmm0
calll 0f
0: popl %ecx
sarl $31, %eax
movq %xmm0, 4(%esp)
fildll 4(%esp)
faddl TWOp64
fstps 4(%esp)
flds 4(%esp)
ret
END_COMPILERRT_FUNCTION(__floatundisf)
#endif // __i386__
*/
/* branch-free, x87-free implementation - faster at the expense of code size */
#ifdef __i386__
// Constant table for __floatundisf. The layout matters: STICKY indexes
// relative to `sticky` with %eax equal to 0 or -1, so the quad placed
// immediately before `sticky` (0xfff) is the entry selected for %eax == -1
// and the quad at `sticky` (0) is the entry selected for %eax == 0.
CONST_SECTION
.balign 16
twop52:
.quad 0x4330000000000000 // 0x1.0p52 as an IEEE-754 double
.quad 0x0000000000000fff // low-12-bit mask; read as STICKY when eax == -1
.balign 16
sticky:
.quad 0x0000000000000000 // empty mask; read as STICKY when eax == 0
.long 0x00000012 // NOTE(review): not referenced by the code visible here - confirm whether it is needed
.balign 16
twelve:
.long 0x00000000 // NOTE(review): label `twelve` is not referenced below (the code uses the immediate $12) - confirm
// TWOp52 / STICKY: addresses expressed as offsets from label 0, with %ecx
// holding the address of label 0; STICKY is additionally indexed by %eax*8.
#define TWOp52 twop52-0b(%ecx)
#define STICKY sticky-0b(%ecx,%eax,8)
.text
.balign 4
// __floatundisf: convert the unsigned 64-bit integer at 4(%esp) to float
// without branches and without x87 arithmetic. For "big" inputs the value is
// shifted right by 12 with the discarded bits OR-ed back in as sticky bits,
// converted, and then rescaled by adding 12 to the float's exponent field.
// The result is written over the argument slot at 4(%esp) and reloaded onto
// the x87 stack before returning.
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
movl 8(%esp), %eax // high 32 bits of input, used for the size test
movd 8(%esp), %xmm1
movd 4(%esp), %xmm0
punpckldq %xmm1, %xmm0 // xmm0 = 64-bit input
calll 0f // push the address of label 0 ...
0: popl %ecx // ... and pop it into ecx for TWOp52 / STICKY
shrl %eax // high 31 bits of input as sint32
addl $0x7ff80000, %eax
sarl $31, %eax // (big input) ? -1 : 0
movsd STICKY, %xmm1 // (big input) ? 0xfff : 0
movl $12, %edx
andl %eax, %edx // (big input) ? 12 : 0
movd %edx, %xmm3
andpd %xmm0, %xmm1 // (big input) ? input & 0xfff : 0
movsd TWOp52, %xmm2 // 0x1.0p52
psrlq %xmm3, %xmm0 // (big input) ? input >> 12 : input
orpd %xmm2, %xmm1 // 0x1.0p52 + ((big input) ? input & 0xfff : input)
orpd %xmm1, %xmm0 // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
subsd %xmm2, %xmm0 // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
cvtsd2ss %xmm0, %xmm0 // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
pslld $23, %xmm3 // move 12 (or 0) into the float exponent field position
paddd %xmm3, %xmm0 // (float)input
movd %xmm0, 4(%esp) // spill the result over the argument slot
flds 4(%esp) // and load it onto the x87 stack
ret
END_COMPILERRT_FUNCTION(__floatundisf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,46 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// long double __floatundixf(du_int a);
#ifdef __i386__
CONST_SECTION
.balign 16
twop52:
.quad 0x4330000000000000
.balign 16
twop84_plus_twop52_neg:
.quad 0xc530000000100000
.balign 16
twop84:
.quad 0x4530000000000000
#define REL_ADDR(_a) (_a)-0b(%eax)
.text
.balign 4
// __floatundixf: convert the unsigned 64-bit integer at 4(%esp) to an x87
// value. Each 32-bit half is turned into an exact double by OR-ing it into
// the mantissa of a power of two; the two doubles are then summed on the
// x87 stack. The argument slot at 4(%esp) is reused as scratch space.
DEFINE_COMPILERRT_FUNCTION(__floatundixf)
calll 0f // push the address of label 0 ...
0: popl %eax // ... and pop it into eax for REL_ADDR
movss 8(%esp), %xmm0 // hi 32 bits of input
movss 4(%esp), %xmm1 // lo 32 bits of input
orpd REL_ADDR(twop84), %xmm0 // 2^84 + hi (as a double)
orpd REL_ADDR(twop52), %xmm1 // 2^52 + lo (as a double)
addsd REL_ADDR(twop84_plus_twop52_neg), %xmm0 // hi - 2^52 (no rounding occurs)
movsd %xmm1, 4(%esp) // spill (2^52 + lo) ...
fldl 4(%esp) // ... and push it onto the x87 stack
movsd %xmm0, 4(%esp) // spill (hi - 2^52) ...
faddl 4(%esp) // ... and add it: st(0) = hi + lo
ret
END_COMPILERRT_FUNCTION(__floatundixf)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,62 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __lshrdi3(di_int input, int count);
// This routine has some extra memory traffic, loading the 64-bit input via two
// 32-bit loads, then immediately storing it back to the stack via a single 64-bit
// store. This is to avoid a write-small, read-large stall.
// However, if callers of this routine can be safely assumed to store the argument
// via a 64-bit store, this is unnecessary memory traffic, and should be avoided.
// It can be turned off by defining the TRUST_CALLERS_USE_64_BIT_STORES macro.
#ifdef __i386__
#ifdef __SSE2__
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__lshrdi3)
	// SSE2 variant: assemble the 64-bit input in %xmm0, shift it right
	// (zero fill) by the count held in %xmm2, then split the result into
	// EDX:EAX.
#ifdef TRUST_CALLERS_USE_64_BIT_STORES
	movq		4(%esp),	%xmm0	// xmm0 = input (single 64-bit load)
#else
	movd		8(%esp),	%xmm1	// xmm1 = high word of input
	movd		4(%esp),	%xmm0	// xmm0 = low word of input
	punpckldq	%xmm1,		%xmm0	// xmm0 = high:low
#endif
	movd		12(%esp),	%xmm2	// xmm2 = count
	psrlq		%xmm2,		%xmm0	// xmm0 = input >> count
	movd		%xmm0,		%eax	// eax = low word of result
	psrlq		$32,		%xmm0	// bring the high word down
	movd		%xmm0,		%edx	// edx = high word of result
	ret
END_COMPILERRT_FUNCTION(__lshrdi3)
#else // Use GPRs instead of SSE2 instructions, if they aren't available.
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__lshrdi3)
	// GPR variant: EDX:EAX = input, ECX = count; result is left in EDX:EAX.
	movl	4(%esp),	%eax	// eax = low word of input
	movl	8(%esp),	%edx	// edx = high word of input
	movl	12(%esp),	%ecx	// ecx = count
	testl	$0x20,		%ecx	// is bit 5 of count set (count >= 32)?
	jz	2f			// no: do the ordinary double-word shift
	// count >= 32: the high word moves into the low word.
	movl	%edx,		%eax	// low = high
	xorl	%edx,		%edx	// high = 0
	shrl	%cl,		%eax	// low >>= (count - 32)
	ret
2:	shrdl	%cl,	%edx,	%eax	// low = (low >> count) | bits shifted out of high
	shrl	%cl,		%edx	// high >>= count
	ret
END_COMPILERRT_FUNCTION(__lshrdi3)
#endif // __SSE2__
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,169 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __moddi3(di_int a, di_int b);
// result = remainder of a / b.
// both inputs and the output are 64-bit signed integers.
// This will do whatever the underlying hardware is set to do on division by zero.
// No other exceptions are generated, as the divide cannot overflow.
//
// This is targeted at 32-bit x86 *only*, as this can be done directly in hardware
// on x86_64. The performance goal is ~40 cycles per divide, which is faster than
// currently possible via simulation of integer divides on the x87 unit.
//
// Stephen Canon, December 2008
#ifdef __i386__
.text
.balign 4
// __moddi3: signed 64-bit remainder, returned in EDX:EAX.
// On entry a is at 4(%esp) and b is at 12(%esp). Both stack slots are
// overwritten in place with abs(a) and abs(b). %esi holds the sign mask of a
// (0 or -1), which is applied to the remainder before returning; %esi, %ebx
// and (on the paths that use it) %edi are saved and restored.
DEFINE_COMPILERRT_FUNCTION(__moddi3)
/* This is currently implemented by wrapping the unsigned modulus up in an absolute
value. This could certainly be improved upon. */
pushl %esi
movl 20(%esp), %edx // high word of b
movl 16(%esp), %eax // low word of b
movl %edx, %ecx
sarl $31, %ecx // (b < 0) ? -1 : 0
xorl %ecx, %eax
xorl %ecx, %edx // EDX:EAX = (b < 0) ? not(b) : b
subl %ecx, %eax
sbbl %ecx, %edx // EDX:EAX = abs(b)
movl %edx, 20(%esp)
movl %eax, 16(%esp) // store abs(b) back to stack
movl 12(%esp), %edx // high word of a
movl 8(%esp), %eax // low word of a
movl %edx, %ecx
sarl $31, %ecx // (a < 0) ? -1 : 0
xorl %ecx, %eax
xorl %ecx, %edx // EDX:EAX = (a < 0) ? not(a) : a
subl %ecx, %eax
sbbl %ecx, %edx // EDX:EAX = abs(a)
movl %edx, 12(%esp)
movl %eax, 8(%esp) // store abs(a) back to stack
movl %ecx, %esi // set aside sign of a
pushl %ebx
movl 24(%esp), %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
/* High word of b is known to be non-zero on this branch */
movl 20(%esp), %eax // Construct bhi, containing bits [1+i:32+i] of b
shrl %cl, %eax // Practically, this means that bhi is given by:
shrl %eax //
notl %ecx // bhi = (high word of b) << (31 - i) |
shll %cl, %ebx // (low word of b) >> (1 + i)
orl %eax, %ebx //
movl 16(%esp), %edx // Load the high and low words of a, and jump
movl 12(%esp), %eax // to [2] if the high word is larger than bhi
cmpl %ebx, %edx // to avoid overflowing the upcoming divide.
jae 2f
/* High word of a is less than (b >> (1 + i)) on this branch */
divl %ebx // eax <-- qs, edx <-- r such that ahi:alo = bs*qs + r
pushl %edi
notl %ecx
shrl %eax
shrl %cl, %eax // q = qs >> (1 + i)
movl %eax, %edi
mull 24(%esp) // q*blo
movl 16(%esp), %ebx
movl 20(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 28(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
jnc 1f // if positive, this is the result.
addl 24(%esp), %ebx // otherwise
adcl 28(%esp), %ecx // ECX:EBX = a - (q-1)*b = result
1: movl %ebx, %eax
movl %ecx, %edx
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %edi // Restore callee-save registers
popl %ebx
popl %esi
retl // Return
2: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
subl %ebx, %edx // subtract bhi from ahi so that divide will not
divl %ebx // overflow, and find q and r such that
//
// ahi:alo = (1:q)*bhi + r
//
// Note that q is a number in (31-i).(1+i)
// fixed point.
pushl %edi
notl %ecx
shrl %eax
orl $0x80000000, %eax
shrl %cl, %eax // q = (1:qs) >> (1 + i)
movl %eax, %edi
mull 24(%esp) // q*blo
movl 16(%esp), %ebx
movl 20(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 28(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
jnc 3f // if positive, this is the result.
addl 24(%esp), %ebx // otherwise
adcl 28(%esp), %ecx // ECX:EBX = a - (q-1)*b = result
3: movl %ebx, %eax
movl %ecx, %edx
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %edi // Restore callee-save registers
popl %ebx
popl %esi
retl // Return
9: /* High word of b is zero on this branch */
movl 16(%esp), %eax // Find qhi and rhi such that
movl 20(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
// NOTE(review): the copy of qhi into ebx below is never read before ebx is
// popped; it looks like a leftover from the divide routine - confirm.
movl %eax, %ebx //
movl 12(%esp), %eax // Find rlo such that
divl %ecx //
movl %edx, %eax // rhi:alo = qlo*b + rlo with 0 <= rlo < b
popl %ebx //
xorl %edx, %edx // and return 0:rlo
addl %esi, %eax // Restore correct sign to result
adcl %esi, %edx
xorl %esi, %eax
xorl %esi, %edx
popl %esi
retl // Return
END_COMPILERRT_FUNCTION(__moddi3)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,33 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// di_int __muldi3(di_int a, di_int b);
#ifdef __i386__
.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__muldi3)
	// 64 x 64 -> 64 bit multiply. Only the low 64 bits of the product are
	// kept, so just three partial products are needed:
	//   result = a.lo*b.lo + ((a.hi*b.lo + a.lo*b.hi) << 32)
	// The result is returned in EDX:EAX; %ebx is saved and restored.
	pushl	%ebx
	// Fetch all four argument words first (offsets include the saved ebx).
	movl	8(%esp),	%edx	// edx = a.lo
	movl	12(%esp),	%ecx	// ecx = a.hi
	movl	16(%esp),	%eax	// eax = b.lo
	movl	20(%esp),	%ebx	// ebx = b.hi
	// Cross terms: only their low 32 bits contribute to the result.
	imull	%eax,		%ecx	// ecx = a.hi * b.lo
	imull	%edx,		%ebx	// ebx = a.lo * b.hi
	// Full 32 x 32 -> 64 product of the low words.
	mull	%edx			// EDX:EAX = a.lo * b.lo
	addl	%ecx,		%ebx	// ebx = a.lo*b.hi + a.hi*b.lo
	addl	%ebx,		%edx	// fold the cross terms into the high word
	popl	%ebx
	retl
END_COMPILERRT_FUNCTION(__muldi3)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,118 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// du_int __udivdi3(du_int a, du_int b);
// result = a / b.
// both inputs and the output are 64-bit unsigned integers.
// This will do whatever the underlying hardware is set to do on division by zero.
// No other exceptions are generated, as the divide cannot overflow.
//
// This is targeted at 32-bit x86 *only*, as this can be done directly in hardware
// on x86_64. The performance goal is ~40 cycles per divide, which is faster than
// currently possible via simulation of integer divides on the x87 unit.
//
// Stephen Canon, December 2008
#ifdef __i386__
.text
.balign 4
// __udivdi3: unsigned 64-bit division, quotient returned in EDX:EAX.
// On entry a is at 4(%esp) and b is at 12(%esp). %ebx and (on the paths
// that use it) %edi are saved and restored.
DEFINE_COMPILERRT_FUNCTION(__udivdi3)
pushl %ebx
movl 20(%esp), %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
/* High word of b is known to be non-zero on this branch */
movl 16(%esp), %eax // Construct bhi, containing bits [1+i:32+i] of b
shrl %cl, %eax // Practically, this means that bhi is given by:
shrl %eax //
notl %ecx // bhi = (high word of b) << (31 - i) |
shll %cl, %ebx // (low word of b) >> (1 + i)
orl %eax, %ebx //
movl 12(%esp), %edx // Load the high and low words of a, and jump
movl 8(%esp), %eax // to [1] if the high word is larger than bhi
cmpl %ebx, %edx // to avoid overflowing the upcoming divide.
jae 1f
/* High word of a is less than (b >> (1 + i)) on this branch */
divl %ebx // eax <-- qs, edx <-- r such that ahi:alo = bs*qs + r
pushl %edi
notl %ecx
shrl %eax
shrl %cl, %eax // q = qs >> (1 + i)
movl %eax, %edi
mull 20(%esp) // q*blo
movl 12(%esp), %ebx
movl 16(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 24(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
sbbl $0, %edi // decrement q if remainder is negative
xorl %edx, %edx // high word of the quotient is zero on this path
movl %edi, %eax
popl %edi
popl %ebx
retl
1: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
subl %ebx, %edx // subtract bhi from ahi so that divide will not
divl %ebx // overflow, and find q and r such that
//
// ahi:alo = (1:q)*bhi + r
//
// Note that q is a number in (31-i).(1+i)
// fixed point.
pushl %edi
notl %ecx
shrl %eax
orl $0x80000000, %eax
shrl %cl, %eax // q = (1:qs) >> (1 + i)
movl %eax, %edi
mull 20(%esp) // q*blo
movl 12(%esp), %ebx
movl 16(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 24(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
sbbl $0, %edi // decrement q if remainder is negative
xorl %edx, %edx // high word of the quotient is zero on this path
movl %edi, %eax
popl %edi
popl %ebx
retl
9: /* High word of b is zero on this branch */
movl 12(%esp), %eax // Find qhi and rhi such that
movl 16(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
movl %eax, %ebx //
movl 8(%esp), %eax // Find qlo such that
divl %ecx //
movl %ebx, %edx // rhi:alo = qlo*b + rlo with 0 <= rlo < b
popl %ebx //
retl // and return qhi:qlo
END_COMPILERRT_FUNCTION(__udivdi3)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE

View File

@@ -0,0 +1,129 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// du_int __umoddi3(du_int a, du_int b);
// result = remainder of a / b.
// both inputs and the output are 64-bit unsigned integers.
// This will do whatever the underlying hardware is set to do on division by zero.
// No other exceptions are generated, as the divide cannot overflow.
//
// This is targeted at 32-bit x86 *only*, as this can be done directly in hardware
// on x86_64. The performance goal is ~40 cycles per divide, which is faster than
// currently possible via simulation of integer divides on the x87 unit.
//
// Stephen Canon, December 2008
#ifdef __i386__
.text
.balign 4
// __umoddi3: unsigned 64-bit remainder, returned in EDX:EAX.
// On entry a is at 4(%esp) and b is at 12(%esp). %ebx and (on the paths
// that use it) %edi are saved and restored.
DEFINE_COMPILERRT_FUNCTION(__umoddi3)
pushl %ebx
movl 20(%esp), %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
/* High word of b is known to be non-zero on this branch */
movl 16(%esp), %eax // Construct bhi, containing bits [1+i:32+i] of b
shrl %cl, %eax // Practically, this means that bhi is given by:
shrl %eax //
notl %ecx // bhi = (high word of b) << (31 - i) |
shll %cl, %ebx // (low word of b) >> (1 + i)
orl %eax, %ebx //
movl 12(%esp), %edx // Load the high and low words of a, and jump
movl 8(%esp), %eax // to [2] if the high word is larger than bhi
cmpl %ebx, %edx // to avoid overflowing the upcoming divide.
jae 2f
/* High word of a is less than (b >> (1 + i)) on this branch */
divl %ebx // eax <-- qs, edx <-- r such that ahi:alo = bs*qs + r
pushl %edi
notl %ecx
shrl %eax
shrl %cl, %eax // q = qs >> (1 + i)
movl %eax, %edi
mull 20(%esp) // q*blo
movl 12(%esp), %ebx
movl 16(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 24(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
jnc 1f // if positive, this is the result.
addl 20(%esp), %ebx // otherwise
adcl 24(%esp), %ecx // ECX:EBX = a - (q-1)*b = result
1: movl %ebx, %eax
movl %ecx, %edx
popl %edi
popl %ebx
retl
2: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
subl %ebx, %edx // subtract bhi from ahi so that divide will not
divl %ebx // overflow, and find q and r such that
//
// ahi:alo = (1:q)*bhi + r
//
// Note that q is a number in (31-i).(1+i)
// fixed point.
pushl %edi
notl %ecx
shrl %eax
orl $0x80000000, %eax
shrl %cl, %eax // q = (1:qs) >> (1 + i)
movl %eax, %edi
mull 20(%esp) // q*blo
movl 12(%esp), %ebx
movl 16(%esp), %ecx // ECX:EBX = a
subl %eax, %ebx
sbbl %edx, %ecx // ECX:EBX = a - q*blo
movl 24(%esp), %eax
imull %edi, %eax // q*bhi
subl %eax, %ecx // ECX:EBX = a - q*b
jnc 3f // if positive, this is the result.
addl 20(%esp), %ebx // otherwise
adcl 24(%esp), %ecx // ECX:EBX = a - (q-1)*b = result
3: movl %ebx, %eax
movl %ecx, %edx
popl %edi
popl %ebx
retl
9: /* High word of b is zero on this branch */
movl 12(%esp), %eax // Find qhi and rhi such that
movl 16(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
// NOTE(review): the copy of qhi into ebx below is never read before ebx is
// popped; it looks like a leftover from the divide routine - confirm.
movl %eax, %ebx //
movl 8(%esp), %eax // Find rlo such that
divl %ecx //
movl %edx, %eax // rhi:alo = qlo*b + rlo with 0 <= rlo < b
popl %ebx //
xorl %edx, %edx // and return 0:rlo
retl //
END_COMPILERRT_FUNCTION(__umoddi3)
#endif // __i386__
NO_EXEC_STACK_DIRECTIVE