/*
 * "memset" implementation for SH4
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2009  STMicroelectronics Limited
 * Author: Stuart Menefy
 */

/*
 * void *memset(void *s, int c, size_t n);
 *
 * In:    r4 = s, r5 = c, r6 = n
 * Out:   r0 = s
 * Clobb: r0-r3, r5, r6 and the T bit may be modified.  r4 is advanced
 *        to s + n on entry and walked back down to s by the stores.
 *
 * The buffer is filled backwards, from s + n down to s, using
 * pre-decrement stores:
 *
 *   1. Fewer than 12 bytes (signed compare): plain byte loop.
 *   2. Otherwise store 0-3 single bytes until the write pointer is
 *      4-byte aligned, then replicate the fill byte into all four byte
 *      lanes of r5.
 *   3. With 64 or more bytes left: store longwords until the pointer is
 *      32-byte aligned, then clear whole 32-byte blocks, the first
 *      longword of each block being written with movca.l.
 *   4. Store the remaining data 8 bytes per iteration, then finish the
 *      last 0-7 bytes with the byte loop.
 *
 * An instruction indented by one extra space sits in the delay slot of
 * the bt/s, bf/s or rts above it and executes whether or not the branch
 * is taken.
 */

/*
 * NOTE(review): the header name was missing from the reviewed text.
 * <linux/linkage.h> is the kernel header that defines ENTRY(); confirm
 * against the tree.
 */
#include <linux/linkage.h>

ENTRY(memset)
	mov	#12,r0
	add	r6,r4		! r4 = s + n, one past the last byte to fill
	cmp/gt	r6,r0		! T = (12 > n), signed
	bt/s	40f		! too small: set one byte at a time
	 mov	r4,r0
	and	#3,r0		! r0 = misalignment of the end pointer (0..3)
	cmp/eq	#0,r0
	bt/s	2f		! already 4-byte aligned
	 sub	r0,r6		! these r0 bytes are handled by loop 1 below

	! Store r0 (1..3) single bytes so that r4 becomes 4-byte aligned.
1:
	dt	r0
	bf/s	1b
	 mov.b	r5,@-r4
2:				! make VVVV
	extu.b	r5,r5		! keep only the low byte of c
	swap.b	r5,r0		!   V0
	or	r0,r5		!   VV
	swap.w	r5,r0		! VV00
	or	r0,r5		! VVVV

	! Check if enough bytes need to be copied to be worth the big loop
	mov	#0x40, r0	! (MT)
	cmp/gt	r6,r0		! (MT)  64 > len => slow loop

	bt/s	22f
	 mov	r6,r0		! r0 = len, as expected at label 22

	! align the dst to the cache block size if necessary
	mov	r4, r3
	mov	#~(0x1f), r1

	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
	cmp/eq	r3, r1

	bt/s	11f		! dst is already aligned
	 sub	r1, r3		! r3-r1 -> r3
	shlr2	r3		! number of loops

	! r3 is 1..7 here: r4 is 4-byte aligned but not 32-byte aligned.
10:	mov.l	r5,@-r4
	dt	r3
	bf/s	10b
	 add	#-4, r6		! account for the longword just stored

11:	! dst is 32byte aligned
	! r6 >= 36 here (at least 64 before at most 28 alignment bytes),
	! so the block count in r2 is at least 1.
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r2		! number of loops

	add	#-32, r4	! point r4 at the base of the first block
	mov	r5, r0		! the movca.l below stores from r0

	! One 32-byte block per iteration.
12:
	movca.l	r0,@r4
	mov.l	r5,@(4, r4)
	mov.l	r5,@(8, r4)
	mov.l	r5,@(12,r4)
	mov.l	r5,@(16,r4)
	mov.l	r5,@(20,r4)
	add	#-0x20, r6
	mov.l	r5,@(24,r4)
	dt	r2
	mov.l	r5,@(28,r4)
	bf/s	12b
	 add	#-32, r4

	add	#32, r4		! undo the final delay-slot decrement
	mov	#8, r0
	cmp/ge	r0, r6		! T = (remaining >= 8)
	bf	40f		! fewer than 8 bytes left: byte loop only

	mov	r6,r0

	! Reached with r0 = r6 >= 8, so the count computed below is >= 1.
22:
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3
3:
	dt	r0
	mov.l	r5,@-r4		! set 8-byte at once
	bf/s	3b
	 mov.l	r5,@-r4
	!
	mov	#7,r0
	and	r0,r6		! r6 = bytes still to fill (0..7)

	! fill bytes (length may be zero)
40:	tst	r6,r6
	bt	5f
4:
	dt	r6
	bf/s	4b
	 mov.b	r5,@-r4
5:
	rts
	 mov	r4,r0		! r4 is back at s: return the original pointer