mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			
		
			
				
	
	
		
			321 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			321 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| /* Copyright (C) 2006-2019 Free Software Foundation, Inc.
 | |
| 
 | |
| This file is free software; you can redistribute it and/or modify it
 | |
| under the terms of the GNU General Public License as published by the
 | |
| Free Software Foundation; either version 3, or (at your option) any
 | |
| later version.
 | |
| 
 | |
| This file is distributed in the hope that it will be useful, but
 | |
| WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
| General Public License for more details.
 | |
| 
 | |
| Under Section 7 of GPL version 3, you are granted additional
 | |
| permissions described in the GCC Runtime Library Exception, version
 | |
| 3.1, as published by the Free Software Foundation.
 | |
| 
 | |
| You should have received a copy of the GNU General Public License and
 | |
| a copy of the GCC Runtime Library Exception along with this program;
 | |
| see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 | |
| <http://www.gnu.org/licenses/>.  */
 | |
| 
 | |
| /* Moderately Space-optimized libgcc routines for the Renesas SH /
 | |
|    STMicroelectronics ST40 CPUs.
 | |
|    Contributed by J"orn Rennecke joern.rennecke@st.com.  */
 | |
| 
 | |
| #include "lib1funcs.h"
 | |
| 
 | |
| #ifdef L_udivsi3_i4i
 | |
| 
 | |
| /* 88 bytes; sh4-200 cycle counts:
 | |
|    divisor  >= 2G: 11 cycles
 | |
|    dividend <  2G: 48 cycles
 | |
|    dividend >= 2G: divisor != 1: 54 cycles
 | |
|    dividend >= 2G, divisor == 1: 22 cycles */
 | |
| #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
 | |
| !! args in r4 and r5, result in r0, clobber r1
 | |
| /* udivsi3_i4i, FPU variant: unsigned division r0 = r4 / r5 computed with a
 | |
|    double-precision fdiv.  The caller's fpscr, fpul, dr0 and dr2 are saved
 | |
|    and restored around the computation.  Three cases are distinguished:
 | |
|    - divisor >= 2G (sign bit set): the quotient is 0 or 1 and is produced
 | |
|      by an unsigned compare at huge_divisor, without touching dr0/dr2.
 | |
|    - dividend < 2G: the signed int -> double conversion is used as is.
 | |
|    - dividend >= 2G: the signed conversion is off by 2**32, so the .double
 | |
|      stored after L1 is added back.  For divisor == 1 both the add and the
 | |
|      divide are skipped and the uncorrected value is truncated straight
 | |
|      back (presumably because ftrc produces a signed result, so a quotient
 | |
|      >= 2G could not be obtained from the corrected value - confirm).  */
 | |
| 
 | |
| 	.global GLOBAL(udivsi3_i4i)
 | |
| 	FUNC(GLOBAL(udivsi3_i4i))
 | |
| GLOBAL(udivsi3_i4i):
 | |
| 	mova L1,r0	/* r0 = address of the constants at L1 */
 | |
| 	cmp/pz r5	/* T = 1 if divisor < 2G (non-negative as signed) */
 | |
| 	sts fpscr,r1	/* r1 = caller's fpscr */
 | |
| 	lds.l @r0+,fpscr	/* fpscr = word at L1; r0 now points at the .double */
 | |
| 	sts.l fpul,@-r15	/* push caller's fpul */
 | |
| 	bf LOCAL(huge_divisor)	/* divisor >= 2G: compare-only path */
 | |
| 	mov.l r1,@-r15	/* push caller's fpscr */
 | |
| 	lds r4,fpul	/* fpul = dividend */
 | |
| 	cmp/pz r4	/* T = 1 if dividend < 2G */
 | |
| #ifdef FMOVD_WORKS
 | |
| 	fmov.d dr0,@-r15	/* push dr0 */
 | |
| 	float fpul,dr0	/* dr0 = dividend converted as a signed integer */
 | |
| 	fmov.d dr2,@-r15	/* push dr2 */
 | |
| 	bt LOCAL(dividend_adjusted)	/* dividend < 2G: no correction needed */
 | |
| 	mov #1,r1
 | |
| 	fmov.d @r0,dr2	/* dr2 = 4294967296.0 */
 | |
| 	cmp/eq r1,r5	/* T = 1 if divisor == 1 */
 | |
| 	bt LOCAL(div_by_1)	/* divisor == 1: skip correction and divide */
 | |
| 	fadd dr2,dr0	/* dr0 += 2**32: now the unsigned dividend value */
 | |
| LOCAL(dividend_adjusted):
 | |
| 	lds r5,fpul
 | |
| 	float fpul,dr2	/* dr2 = divisor (known to be < 2G here) */
 | |
| 	fdiv dr2,dr0	/* dr0 = dividend / divisor */
 | |
| LOCAL(div_by_1):
 | |
| 	fmov.d @r15+,dr2	/* pop dr2 */
 | |
| 	ftrc dr0,fpul	/* fpul = dr0 truncated to an integer */
 | |
| 	fmov.d @r15+,dr0	/* pop dr0 */
 | |
| #else /* !FMOVD_WORKS */
 | |
| /* Same sequence, but dr0/dr2 are moved as two single-precision halves.
 | |
|    DR00, DR01, DR20 and DR21 are not defined in this file; presumably they
 | |
|    come from lib1funcs.h and name the halves of dr0 and dr2 - confirm.  */
 | |
| 	fmov.s DR01,@-r15
 | |
| 	mov #1,r1
 | |
| 	fmov.s DR00,@-r15
 | |
| 	float fpul,dr0	/* dr0 = dividend converted as a signed integer */
 | |
| 	fmov.s DR21,@-r15
 | |
| 	bt/s LOCAL(dividend_adjusted)	/* dividend < 2G; delayed branch */
 | |
| 	fmov.s DR20,@-r15	/* delay slot: executed on both paths */
 | |
| 	cmp/eq r1,r5	/* T = 1 if divisor == 1 */
 | |
| 	bt LOCAL(div_by_1)
 | |
| 	fmov.s @r0+,DR20	/* load the 2**32 constant in two halves */
 | |
| 	fmov.s @r0,DR21
 | |
| 	fadd dr2,dr0	/* dr0 += 2**32 */
 | |
| LOCAL(dividend_adjusted):
 | |
| 	lds r5,fpul
 | |
| 	float fpul,dr2	/* dr2 = divisor */
 | |
| 	fdiv dr2,dr0	/* dr0 = dividend / divisor */
 | |
| LOCAL(div_by_1):
 | |
| 	fmov.s @r15+,DR20	/* pops mirror the pushes above in reverse order */
 | |
| 	fmov.s @r15+,DR21
 | |
| 	ftrc dr0,fpul	/* fpul = dr0 truncated to an integer */
 | |
| 	fmov.s @r15+,DR00
 | |
| 	fmov.s @r15+,DR01
 | |
| #endif /* !FMOVD_WORKS */
 | |
| 	lds.l @r15+,fpscr	/* restore caller's fpscr */
 | |
| 	sts fpul,r0	/* r0 = quotient */
 | |
| 	rts
 | |
| 	lds.l @r15+,fpul	/* delay slot: restore caller's fpul */
 | |
| 
 | |
| #ifdef FMOVD_WORKS
 | |
| 	.p2align 3        ! make double below 8 byte aligned.
 | |
| #endif
 | |
| LOCAL(huge_divisor):
 | |
| 	lds r1,fpscr	/* restore caller's fpscr */
 | |
| 	add #4,r15	/* drop the pushed fpul; fpul was not modified on this path */
 | |
| 	cmp/hs r5,r4	/* T = 1 if dividend >= divisor (unsigned) */
 | |
| 	rts
 | |
| 	movt r0	/* delay slot: quotient = T, i.e. 0 or 1 */
 | |
| 
 | |
| 	.p2align 2
 | |
| L1:
 | |
| /* First word: the value loaded into fpscr on entry.  NOTE(review): the
 | |
|    two values presumably select double precision, plus 64-bit fmov when
 | |
|    FMOVD_WORKS is defined - confirm against the FPSCR bit layout.  */
 | |
| #ifndef FMOVD_WORKS
 | |
| 	.long 0x80000
 | |
| #else
 | |
| 	.long 0x180000
 | |
| #endif
 | |
| 	.double 4294967296	/* 2**32, added to dividends >= 2G */
 | |
| 
 | |
| 	ENDFUNC(GLOBAL(udivsi3_i4i))
 | |
| #elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */
 | |
| 
 | |
| #if 0
 | |
| /* With 36 bytes, the following would probably be the most compact
 | |
|    implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
 | |
| /* NOTE(review): this variant is disabled by the surrounding #if 0 and is
 | |
|    never assembled; the notes below only describe the visible instructions
 | |
|    and have not been checked against a working build.  */
 | |
| GLOBAL(udivsi3_i4i):
 | |
| 	mov.l r2,@-r15	/* save r2 */
 | |
| 	mov #0,r1
 | |
| 	div0u
 | |
| 	mov r1,r2	/* r2 = 0 */
 | |
| 	mov.l r3,@-r15	/* save r3 */
 | |
| 	mov r1,r3	/* r3 = 0 */
 | |
| 	sett	/* T = 1: the bit that rotcr feeds into r2 on the first pass */
 | |
| 	mov r4,r0	/* r0 = dividend */
 | |
| LOCAL(loop):
 | |
| 	rotcr r2	/* rotate r2 right through T */
 | |
| 	;
 | |
| 	bt/s LOCAL(end)	/* leave the loop once a set bit has rotated out into T */
 | |
| 	cmp/gt r2,r3	/* delay slot: signed compare of r3 with r2 sets T */
 | |
| 	rotcl r0
 | |
| 	bra LOCAL(loop)
 | |
| 	div1 r5,r1	/* delay slot: one division step per iteration */
 | |
| LOCAL(end):
 | |
| 	rotcl r0
 | |
| 	mov.l @r15+,r3	/* restore r3 */
 | |
| 	rts
 | |
| 	mov.l @r15+,r2	/* delay slot: restore r2 */
 | |
| #endif /* 0 */
 | |
| 
 | |
| /* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
 | |
|    sh4-200 run times:
 | |
|    udiv small divisor: 55 cycles
 | |
|    udiv large divisor: 52 cycles
 | |
|    sdiv small divisor, positive result: 59 cycles
 | |
|    sdiv large divisor, positive result: 56 cycles
 | |
|    sdiv small divisor, negative result: 65 cycles (*)
 | |
|    sdiv large divisor, negative result: 62 cycles (*)
 | |
|    (*): r2 is restored in the rts delay slot and has a lingering latency
 | |
|         of two more cycles.  */
 | |
| /* udivsi3_i4i, integer variant: unsigned r0 = r4 / r5 built from div1
 | |
|    steps.  r4 and r5 are pushed on entry and popped before leaving.  pr is
 | |
|    copied to r1 because the bsr calls to the local helpers (div6, div7,
 | |
|    divx3) overwrite pr; the routine therefore leaves through jmp @r1.
 | |
|    A divisor that fits in 16 bits takes the sdiv_small_divisor path (two
 | |
|    rounds of 16 div1 steps); any other divisor takes the large_divisor
 | |
|    path (16 rotcl/div1 steps).  sdivsi3_i4i enters this code at
 | |
|    sdiv_small_divisor / sdiv_large_divisor with r4 and r5 already pushed.  */
 | |
| 	.balign 4
 | |
| 	.global	GLOBAL(udivsi3_i4i)
 | |
| 	FUNC(GLOBAL(udivsi3_i4i))
 | |
| 	FUNC(GLOBAL(sdivsi3_i4i))
 | |
| GLOBAL(udivsi3_i4i):
 | |
| 	sts pr,r1	/* r1 = return address; pr is overwritten by bsr below */
 | |
| 	mov.l r4,@-r15	/* save dividend */
 | |
| 	extu.w r5,r0	/* r0 = low 16 bits of divisor */
 | |
| 	cmp/eq r5,r0	/* T = 1 if divisor fits in 16 bits */
 | |
| 	swap.w r4,r0	/* r0 = dividend with its 16-bit halves swapped */
 | |
| 	shlr16 r4	/* r4 = high half of dividend */
 | |
| 	bf/s LOCAL(large_divisor)	/* divisor >= 64K */
 | |
| 	div0u	/* delay slot: reset the div1 state on both paths */
 | |
| 	mov.l r5,@-r15	/* save divisor */
 | |
| 	shll16 r5	/* divisor moved to bits 31..16 */
 | |
| LOCAL(sdiv_small_divisor):
 | |
| 	div1 r5,r4	/* first round: 2 x (1 + 1 + 6) = 16 div1 steps */
 | |
| 	bsr LOCAL(div6)
 | |
| 	div1 r5,r4	/* delay slot */
 | |
| 	div1 r5,r4
 | |
| 	bsr LOCAL(div6)
 | |
| 	div1 r5,r4	/* delay slot */
 | |
| 	xtrct r4,r0	/* r0 = (r4 << 16) | (r0 >> 16) */
 | |
| 	xtrct r0,r4	/* r4 = (r0 << 16) | (r4 >> 16) */
 | |
| 	bsr LOCAL(div7)	/* second round: 7 + 1 + 1 + 7 = 16 div1 steps */
 | |
| 	swap.w r4,r4	/* delay slot: swap the halves of r4 */
 | |
| 	div1 r5,r4
 | |
| 	bsr LOCAL(div7)
 | |
| 	div1 r5,r4	/* delay slot */
 | |
| 	xtrct r4,r0	/* r0 = (r4 << 16) | (r0 >> 16) */
 | |
| 	mov.l @r15+,r5	/* restore divisor */
 | |
| 	swap.w r0,r0
 | |
| 	mov.l @r15+,r4	/* restore dividend */
 | |
| 	jmp @r1	/* leave via r1 (return address, or as set by sdivsi3_i4i) */
 | |
| 	rotcl r0	/* delay slot: rotate T from the last div1 into r0 */
 | |
| LOCAL(div7):
 | |
| 	div1 r5,r4	/* one step, then fall through into div6: 7 in total */
 | |
| LOCAL(div6):
 | |
| 	            div1 r5,r4; div1 r5,r4; div1 r5,r4	/* six div1 steps ... */
 | |
| 	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4	/* ... the last in the rts delay slot */
 | |
| 
 | |
| LOCAL(divx3):
 | |
| 	rotcl r0	/* three rotcl/div1 pairs; the last div1 is the rts delay slot */
 | |
| 	div1 r5,r4
 | |
| 	rotcl r0
 | |
| 	div1 r5,r4
 | |
| 	rotcl r0
 | |
| 	rts
 | |
| 	div1 r5,r4
 | |
| 
 | |
| LOCAL(large_divisor):
 | |
| 	mov.l r5,@-r15	/* save divisor */
 | |
| LOCAL(sdiv_large_divisor):
 | |
| 	xor r4,r0	/* low half of r0 equals r4 here, so this clears it: r0 = dividend << 16 */
 | |
| 	.rept 4	/* 4 x (1 + 3) = 16 rotcl/div1 steps */
 | |
| 	rotcl r0
 | |
| 	bsr LOCAL(divx3)
 | |
| 	div1 r5,r4	/* delay slot */
 | |
| 	.endr
 | |
| 	mov.l @r15+,r5	/* restore divisor */
 | |
| 	mov.l @r15+,r4	/* restore dividend */
 | |
| 	jmp @r1	/* leave via r1 (return address, or as set by sdivsi3_i4i) */
 | |
| 	rotcl r0	/* delay slot: rotate T from the last div1 into r0 */
 | |
| 	ENDFUNC(GLOBAL(udivsi3_i4i))
 | |
| 
 | |
| /* sdivsi3_i4i, integer variant: signed r0 = r4 / r5.  Negative operands
 | |
|    are negated, then the unsigned div1 code of udivsi3_i4i is entered at
 | |
|    sdiv_small_divisor or sdiv_large_divisor; that code pops r5 and r4 and
 | |
|    leaves through jmp @r1.  When both operands have the same sign, r1 holds
 | |
|    the return address.  When the signs differ, r1 points at negate_result,
 | |
|    r2 carries the return address and the old r2 is parked in macl, so macl
 | |
|    is overwritten on that path.  */
 | |
| 	.global	GLOBAL(sdivsi3_i4i)
 | |
| GLOBAL(sdivsi3_i4i):
 | |
| 	mov.l r4,@-r15	/* save dividend (popped by the shared divide code) */
 | |
| 	cmp/pz r5	/* T = 1 if divisor >= 0 */
 | |
| 	mov.l r5,@-r15	/* save divisor (popped by the shared divide code) */
 | |
| 	bt/s LOCAL(pos_divisor)
 | |
| 	cmp/pz r4	/* delay slot: T = 1 if dividend >= 0 */
 | |
| 	neg r5,r5	/* divisor < 0: continue with its negation */
 | |
| 	extu.w r5,r0	/* r0 = low 16 bits of divisor */
 | |
| 	bt/s LOCAL(neg_result)	/* dividend >= 0, divisor < 0 */
 | |
| 	cmp/eq r5,r0	/* delay slot: T = 1 if divisor fits in 16 bits */
 | |
| 	neg r4,r4	/* both negative: negate dividend, result is positive */
 | |
| LOCAL(pos_result):
 | |
| 	swap.w r4,r0	/* r0 = dividend with its 16-bit halves swapped */
 | |
| 	bra LOCAL(sdiv_check_divisor)
 | |
| 	sts pr,r1	/* delay slot: r1 = return address */
 | |
| LOCAL(pos_divisor):
 | |
| 	extu.w r5,r0	/* r0 = low 16 bits of divisor */
 | |
| 	bt/s LOCAL(pos_result)	/* both operands non-negative */
 | |
| 	cmp/eq r5,r0	/* delay slot: T = 1 if divisor fits in 16 bits */
 | |
| 	neg r4,r4	/* dividend < 0, divisor >= 0: negate dividend */
 | |
| LOCAL(neg_result):
 | |
| 	mova LOCAL(negate_result),r0	/* r0 = address of negate_result */
 | |
| 	;
 | |
| 	mov r0,r1	/* the shared code's jmp @r1 will land on negate_result */
 | |
| 	swap.w r4,r0	/* r0 = dividend with its 16-bit halves swapped */
 | |
| 	lds r2,macl	/* park the caller's r2 in macl */
 | |
| 	sts pr,r2	/* r2 = return address */
 | |
| LOCAL(sdiv_check_divisor):
 | |
| 	shlr16 r4	/* r4 = high half of dividend */
 | |
| 	bf/s LOCAL(sdiv_large_divisor)	/* T still comes from the cmp/eq above */
 | |
| 	div0u	/* delay slot: reset the div1 state on both paths */
 | |
| 	bra LOCAL(sdiv_small_divisor)
 | |
| 	shll16 r5	/* delay slot: divisor moved to bits 31..16 */
 | |
| 	.balign 4	/* presumably because negate_result is a mova target - confirm */
 | |
| LOCAL(negate_result):
 | |
| 	neg r0,r0	/* operand signs differed: negate the quotient */
 | |
| 	jmp @r2	/* return to the caller */
 | |
| 	sts macl,r2	/* delay slot: restore the caller's r2 */
 | |
| 	ENDFUNC(GLOBAL(sdivsi3_i4i))
 | |
| #endif /* !__SH_FPU_DOUBLE__ */
 | |
| #endif /* L_udivsi3_i4i */
 | |
| 
 | |
| #ifdef L_sdivsi3_i4i
 | |
| #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
 | |
| /* 48 bytes, 45 cycles on sh4-200  */
 | |
| !! args in r4 and r5, result in r0, clobber r1
 | |
| /* sdivsi3_i4i, FPU variant: signed r0 = r4 / r5.  Both operands are
 | |
|    converted to double, divided with fdiv and the quotient is truncated
 | |
|    back to an integer with ftrc.  The caller's fpscr, dr0 and dr2 are saved
 | |
|    on the stack and fpul is kept in r1; all are restored before returning.  */
 | |
| 
 | |
| 	.global GLOBAL(sdivsi3_i4i)
 | |
| 	FUNC(GLOBAL(sdivsi3_i4i))
 | |
| GLOBAL(sdivsi3_i4i):
 | |
| 	sts.l fpscr,@-r15	/* push caller's fpscr */
 | |
| 	sts fpul,r1	/* r1 = caller's fpul */
 | |
| 	mova L1,r0	/* r0 = address of the fpscr constant at L1 */
 | |
| 	lds.l @r0+,fpscr	/* fpscr = word at L1 */
 | |
| 	lds r4,fpul	/* fpul = dividend */
 | |
| #ifdef FMOVD_WORKS
 | |
| 	fmov.d dr0,@-r15	/* push dr0 */
 | |
| 	float fpul,dr0	/* dr0 = dividend as a double */
 | |
| 	lds r5,fpul	/* fpul = divisor */
 | |
| 	fmov.d dr2,@-r15	/* push dr2 */
 | |
| #else
 | |
| /* DR00, DR01, DR20 and DR21 are not defined in this file; presumably they
 | |
|    come from lib1funcs.h and name the halves of dr0 and dr2 - confirm.  */
 | |
| 	fmov.s DR01,@-r15
 | |
| 	fmov.s DR00,@-r15
 | |
| 	float fpul,dr0	/* dr0 = dividend as a double */
 | |
| 	lds r5,fpul	/* fpul = divisor */
 | |
| 	fmov.s DR21,@-r15
 | |
| 	fmov.s DR20,@-r15
 | |
| #endif
 | |
| 	float fpul,dr2	/* dr2 = divisor as a double */
 | |
| 	fdiv dr2,dr0	/* dr0 = dividend / divisor */
 | |
| #ifdef FMOVD_WORKS
 | |
| 	fmov.d @r15+,dr2	/* pop dr2 */
 | |
| #else
 | |
| 	fmov.s @r15+,DR20
 | |
| 	fmov.s @r15+,DR21
 | |
| #endif
 | |
| 	ftrc dr0,fpul	/* fpul = quotient truncated to an integer */
 | |
| #ifdef FMOVD_WORKS
 | |
| 	fmov.d @r15+,dr0	/* pop dr0 */
 | |
| #else
 | |
| 	fmov.s @r15+,DR00
 | |
| 	fmov.s @r15+,DR01
 | |
| #endif
 | |
| 	lds.l @r15+,fpscr	/* restore caller's fpscr */
 | |
| 	sts fpul,r0	/* r0 = quotient */
 | |
| 	rts
 | |
| 	lds r1,fpul	/* delay slot: restore caller's fpul */
 | |
| 
 | |
| 	.p2align 2
 | |
| L1:
 | |
| /* Value loaded into fpscr on entry.  NOTE(review): presumably selects
 | |
|    double precision, plus 64-bit fmov when FMOVD_WORKS is defined -
 | |
|    confirm against the FPSCR bit layout.  */
 | |
| #ifndef FMOVD_WORKS
 | |
| 	.long 0x80000
 | |
| #else
 | |
| 	.long 0x180000
 | |
| #endif
 | |
| 
 | |
| 	ENDFUNC(GLOBAL(sdivsi3_i4i))
 | |
| #endif /* __SH_FPU_DOUBLE__ */
 | |
| #endif /* L_sdivsi3_i4i */
 |