Add initial support for the CRIS architecture used on Foxboards to OpenWrt

[openwrt/staging/dedeckeh.git] / target / linux / etrax-2.6 / patches / cris / 006-gcc-4.patch
diff --git a/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch b/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch

new file mode 100644 (file)

index 0000000..31a4107
--- /dev/null
+++ b/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch
@@ -0,0 +1,752 @@
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/memset.c linux-2.6.19.2/arch/cris/arch-v10/lib/memset.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/memset.c        2007-05-20 01:46:35.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/memset.c     2007-05-20 01:51:47.000000000 +0200
+@@ -29,224 +29,21 @@
+ 
+ #include <linux/types.h>
+ 
+-/* No, there's no macro saying 12*4, since it is "hard" to get it into
+-   the asm in a good way.  Thus better to expose the problem everywhere.
+-   */
+ 
+-/* Assuming 1 cycle per dword written or read (ok, not really true), and
+-   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
+-   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
+-
+-#define ZERO_BLOCK_SIZE (1*12*4)
+-
+-void *memset(void *pdst,
+-             int c,
+-             size_t plen)
++/**
++ * memset - Fill a region of memory with the given value
++ * @s: Pointer to the start of the area.
++ * @c: The byte to fill the area with
++ * @count: The size of the area.
++ *
++ * Do not use memset() to access IO space, use memset_io() instead.
++ */
++void *memset(void *s, int c, size_t count)
+ {
+-  /* Ok.  Now we want the parameters put in special registers.
+-     Make sure the compiler is able to make something useful of this. */
+-
+-  register char *return_dst __asm__ ("r10") = pdst;
+-  register int n __asm__ ("r12") = plen;
+-  register int lc __asm__ ("r11") = c;
+-
+-  /* Most apps use memset sanely.  Only those memsetting about 3..4
+-     bytes or less get penalized compared to the generic implementation
+-     - and that's not really sane use. */
+-
+-  /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
+-     they compile cascaded "x |= x << 8" sanely! */
+-  __asm__("movu.b %0,$r13\n\t"
+-          "lslq 8,$r13\n\t"
+-        "move.b %0,$r13\n\t"
+-        "move.d $r13,%0\n\t"
+-        "lslq 16,$r13\n\t"
+-        "or.d $r13,%0"
+-          : "=r" (lc) : "0" (lc) : "r13");
+-
+-  {
+-    register char *dst __asm__ ("r13") = pdst;
+- 
+-  /* This is NONPORTABLE, but since this whole routine is     */
+-  /* grossly nonportable that doesn't matter.                 */
+-
+-  if (((unsigned long) pdst & 3) != 0
+-     /* Oops! n=0 must be a legal call, regardless of alignment. */
+-      && n >= 3)
+-  {
+-    if ((unsigned long)dst & 1)
+-    {
+-      *dst = (char) lc;
+-      n--;
+-      dst++;
+-    }
+-
+-    if ((unsigned long)dst & 2)
+-    {
+-      *(short *)dst = lc;
+-      n -= 2;
+-      dst += 2;
+-    }
+-  }
+-
+-  /* Now the fun part.  For the threshold value of this, check the equation
+-     above. */
+-  /* Decide which copying method to use. */
+-  if (n >= ZERO_BLOCK_SIZE)
+-  {
+-    /* For large copies we use 'movem' */
+-
+-  /* It is not optimal to tell the compiler about clobbering any
+-     registers; that will move the saving/restoring of those registers
+-     to the function prologue/epilogue, and make non-movem sizes
+-     suboptimal.
+-
+-      This method is not foolproof; it assumes that the "asm reg"
+-     declarations at the beginning of the function really are used
+-     here (beware: they may be moved to temporary registers).
+-      This way, we do not have to save/move the registers around into
+-     temporaries; we can safely use them straight away.
+-
+-      If you want to check that the allocation was right; then
+-      check the equalities in the first comment.  It should say
+-      "r13=r13, r12=r12, r11=r11" */
+-    __asm__ volatile ("
+-        ;; Check that the following is true (same register names on
+-        ;; both sides of equal sign, as in r8=r8):
+-        ;; %0=r13, %1=r12, %4=r11
+-        ;;
+-      ;; Save the registers we'll clobber in the movem process
+-      ;; on the stack.  Don't mention them to gcc, it will only be
+-      ;; upset.
+-      subq    11*4,$sp
+-        movem   $r10,[$sp]
+-
+-        move.d  $r11,$r0
+-        move.d  $r11,$r1
+-        move.d  $r11,$r2
+-        move.d  $r11,$r3
+-        move.d  $r11,$r4
+-        move.d  $r11,$r5
+-        move.d  $r11,$r6
+-        move.d  $r11,$r7
+-        move.d  $r11,$r8
+-        move.d  $r11,$r9
+-        move.d  $r11,$r10
+-
+-        ;; Now we've got this:
+-      ;; r13 - dst
+-      ;; r12 - n
+-      
+-        ;; Update n for the first loop
+-        subq    12*4,$r12
+-0:
+-        subq   12*4,$r12
+-        bge     0b
+-      movem   $r11,[$r13+]
+-
+-        addq   12*4,$r12  ;; compensate for last loop underflowing n
+-
+-      ;; Restore registers from stack
+-        movem [$sp+],$r10" 
+-
+-     /* Outputs */ : "=r" (dst), "=r" (n)
+-     /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
+-    
+-  }
+-
+-    /* Either we directly starts copying, using dword copying
+-       in a loop, or we copy as much as possible with 'movem' 
+-       and then the last block (<44 bytes) is copied here.
+-       This will work since 'movem' will have updated src,dst,n. */
+-
+-    while ( n >= 16 )
+-    {
+-      *((long*)dst)++ = lc;
+-      *((long*)dst)++ = lc;
+-      *((long*)dst)++ = lc;
+-      *((long*)dst)++ = lc;
+-      n -= 16;
+-    }
++      char *xs = s;
+ 
+-    /* A switch() is definitely the fastest although it takes a LOT of code.
+-     * Particularly if you inline code this.
+-     */
+-    switch (n)
+-    {
+-      case 0:
+-        break;
+-      case 1:
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 2:
+-        *(short*)dst = (short) lc;
+-        break;
+-      case 3:
+-        *((short*)dst)++ = (short) lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 4:
+-        *((long*)dst)++ = lc;
+-        break;
+-      case 5:
+-        *((long*)dst)++ = lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 6:
+-        *((long*)dst)++ = lc;
+-        *(short*)dst = (short) lc;
+-        break;
+-      case 7:
+-        *((long*)dst)++ = lc;
+-        *((short*)dst)++ = (short) lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 8:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        break;
+-      case 9:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 10:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *(short*)dst = (short) lc;
+-        break;
+-      case 11:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((short*)dst)++ = (short) lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 12:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        break;
+-      case 13:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-      case 14:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *(short*)dst = (short) lc;
+-        break;
+-      case 15:
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((long*)dst)++ = lc;
+-        *((short*)dst)++ = (short) lc;
+-        *(char*)dst = (char) lc;
+-        break;
+-    }
+-  }
++      while (count--)
++              *xs++ = c;
++      return s;
++}
+ 
+-  return return_dst; /* destination pointer. */
+-} /* memset() */
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/string.c linux-2.6.19.2/arch/cris/arch-v10/lib/string.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/string.c        2007-05-20 01:46:35.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/string.c     2007-05-20 01:51:19.000000000 +0200
+@@ -33,193 +33,21 @@
+ 
+ #include <linux/types.h>
+ 
+-void *memcpy(void *pdst,
+-             const void *psrc,
+-             size_t pn)
++ /**
++ * memcpy - Copy one area of memory to another
++ * @dest: Where to copy to
++ * @src: Where to copy from
++ * @count: The size of the area.
++ *
++ * You should not use this function to access IO space, use memcpy_toio()
++ * or memcpy_fromio() instead.
++ */
++void *memcpy(void *dest, const void *src, size_t count)
+ {
+-  /* Ok.  Now we want the parameters put in special registers.
+-     Make sure the compiler is able to make something useful of this.
+-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
++      char *tmp = dest;
++      const char *s = src;
+ 
+-     If gcc was allright, it really would need no temporaries, and no
+-     stack space to save stuff on. */
+-
+-  register void *return_dst __asm__ ("r10") = pdst;
+-  register char *dst __asm__ ("r13") = pdst;
+-  register const char *src __asm__ ("r11") = psrc;
+-  register int n __asm__ ("r12") = pn;
+-  
+- 
+-  /* When src is aligned but not dst, this makes a few extra needless
+-     cycles.  I believe it would take as many to check that the
+-     re-alignment was unnecessary.  */
+-  if (((unsigned long) dst & 3) != 0
+-      /* Don't align if we wouldn't copy more than a few bytes; so we
+-       don't have to check further for overflows.  */
+-      && n >= 3)
+-  {
+-    if ((unsigned long) dst & 1)
+-    {
+-      n--;
+-      *(char*)dst = *(char*)src;
+-      src++;
+-      dst++;
+-    }
+-
+-    if ((unsigned long) dst & 2)
+-    {
+-      n -= 2;
+-      *(short*)dst = *(short*)src;
+-      src += 2;
+-      dst += 2;
+-    }
+-  }
+-
+-  /* Decide which copying method to use. */
+-  if (n >= 44*2)                /* Break even between movem and
+-                                   move16 is at 38.7*2, but modulo 44. */
+-  {
+-    /* For large copies we use 'movem' */
+-
+-  /* It is not optimal to tell the compiler about clobbering any
+-     registers; that will move the saving/restoring of those registers
+-     to the function prologue/epilogue, and make non-movem sizes
+-     suboptimal.
+-
+-      This method is not foolproof; it assumes that the "asm reg"
+-     declarations at the beginning of the function really are used
+-     here (beware: they may be moved to temporary registers).
+-      This way, we do not have to save/move the registers around into
+-     temporaries; we can safely use them straight away.
+-
+-      If you want to check that the allocation was right; then
+-      check the equalities in the first comment.  It should say
+-      "r13=r13, r11=r11, r12=r12" */
+-    __asm__ volatile ("
+-        ;; Check that the following is true (same register names on
+-        ;; both sides of equal sign, as in r8=r8):
+-        ;; %0=r13, %1=r11, %2=r12
+-        ;;
+-      ;; Save the registers we'll use in the movem process
+-      ;; on the stack.
+-      subq    11*4,$sp
+-      movem   $r10,[$sp]
+-
+-        ;; Now we've got this:
+-      ;; r11 - src
+-      ;; r13 - dst
+-      ;; r12 - n
+-      
+-        ;; Update n for the first loop
+-        subq    44,$r12
+-0:
+-      movem   [$r11+],$r10
+-        subq   44,$r12
+-        bge     0b
+-      movem   $r10,[$r13+]
+-
+-        addq   44,$r12  ;; compensate for last loop underflowing n
+-
+-      ;; Restore registers from stack
+-        movem [$sp+],$r10" 
+-
+-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) 
+-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+-    
+-  }
+-
+-  /* Either we directly starts copying, using dword copying
+-     in a loop, or we copy as much as possible with 'movem' 
+-     and then the last block (<44 bytes) is copied here.
+-     This will work since 'movem' will have updated src,dst,n. */
+-
+-  while ( n >= 16 )
+-  {
+-    *((long*)dst)++ = *((long*)src)++;
+-    *((long*)dst)++ = *((long*)src)++;
+-    *((long*)dst)++ = *((long*)src)++;
+-    *((long*)dst)++ = *((long*)src)++;
+-    n -= 16;
+-  }
+-
+-  /* A switch() is definitely the fastest although it takes a LOT of code.
+-   * Particularly if you inline code this.
+-   */
+-  switch (n)
+-  {
+-    case 0:
+-      break;
+-    case 1:
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 2:
+-      *(short*)dst = *(short*)src;
+-      break;
+-    case 3:
+-      *((short*)dst)++ = *((short*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 4:
+-      *((long*)dst)++ = *((long*)src)++;
+-      break;
+-    case 5:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 6:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(short*)dst = *(short*)src;
+-      break;
+-    case 7:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((short*)dst)++ = *((short*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 8:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      break;
+-    case 9:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 10:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(short*)dst = *(short*)src;
+-      break;
+-    case 11:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((short*)dst)++ = *((short*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 12:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      break;
+-    case 13:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-    case 14:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *(short*)dst = *(short*)src;
+-      break;
+-    case 15:
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((long*)dst)++ = *((long*)src)++;
+-      *((short*)dst)++ = *((short*)src)++;
+-      *(char*)dst = *(char*)src;
+-      break;
+-  }
+-
+-  return return_dst; /* destination pointer. */
+-} /* memcpy() */
++      while (count--)
++              *tmp++ = *s++;
++      return dest;
++} 
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/usercopy.c linux-2.6.19.2/arch/cris/arch-v10/lib/usercopy.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/usercopy.c      2007-05-16 22:11:26.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/usercopy.c   2007-05-16 23:17:41.000000000 +0200
+@@ -88,63 +88,38 @@
+        If you want to check that the allocation was right; then
+        check the equalities in the first comment.  It should say
+        "r13=r13, r11=r11, r12=r12".  */
+-    __asm__ volatile ("\
+-      .ifnc %0%1%2%3,$r13$r11$r12$r10                                 \n\
+-      .err                                                            \n\
+-      .endif                                                          \n\
+-
+-      ;; Save the registers we'll use in the movem process
+-      ;; on the stack.
+-      subq    11*4,$sp
+-      movem   $r10,[$sp]
+-
+-      ;; Now we've got this:
+-      ;; r11 - src
+-      ;; r13 - dst
+-      ;; r12 - n
+-
+-      ;; Update n for the first loop
+-      subq    44,$r12
+-
+-; Since the noted PC of a faulting instruction in a delay-slot of a taken
+-; branch, is that of the branch target, we actually point at the from-movem
+-; for this case.  There is no ambiguity here; if there was a fault in that
+-; instruction (meaning a kernel oops), the faulted PC would be the address
+-; after *that* movem.
+-
+-0:
+-      movem   [$r11+],$r10
+-      subq   44,$r12
+-      bge     0b
+-      movem   $r10,[$r13+]
+-1:
+-      addq   44,$r12  ;; compensate for last loop underflowing n
+-
+-      ;; Restore registers from stack
+-      movem [$sp+],$r10
+-2:
+-      .section .fixup,\"ax\"
+-
+-; To provide a correct count in r10 of bytes that failed to be copied,
+-; we jump back into the loop if the loop-branch was taken.  There is no
+-; performance penalty for sany use; the program will segfault soon enough.
+-
+-3:
+-      move.d [$sp],$r10
+-      addq 44,$r10
+-      move.d $r10,[$sp]
+-      jump 0b
+-4:
+-      movem [$sp+],$r10
+-      addq 44,$r10
+-      addq 44,$r12
+-      jump 2b
+-
+-      .previous
+-      .section __ex_table,\"a\"
+-      .dword 0b,3b
+-      .dword 1b,4b
+-      .previous"
++    __asm__ volatile (
++      ".ifnc %0%1%2%3,$r13$r11$r12$r10        \n\t"   
++      ".err                           \n\t"           
++      ".endif                 \n\t"                   
++      "subq   11*4,$sp\n\t"
++      "movem  $r10,[$sp]\n\t"
++      "subq   44,$r12\n\t"
++      "0:\n\t"
++      "movem  [$r11+],$r10\n\t"
++      "subq   44,$r12\n\t"
++      "bge    0b\n\t"
++      "movem  $r10,[$r13+]\n\t"
++      "1:\n\t"
++      "addq   44,$r12  \n\t"
++      "movem [$sp+],$r10\n\t"
++      "2:\n\t"
++      ".section .fixup,\"ax\"\n\t"
++      "3:\n\t"
++      "move.d [$sp],$r10\n\t"
++      "addq 44,$r10\n\t"
++      "move.d $r10,[$sp]\n\t"
++      "jump 0b\n\t"
++      "4:\n\t"
++      "movem [$sp+],$r10\n\t"
++      "addq 44,$r10\n\t"
++      "addq 44,$r12\n\t"
++      "jump 2b\n\t"
++      ".previous\n\t"
++      ".section __ex_table,\"a\"\n\t"
++      ".dword 0b,3b\n\t"
++      ".dword 1b,4b\n\t"
++      ".previous\n\t"
+ 
+      /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn)
+      /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn));
+@@ -253,60 +228,32 @@
+        If you want to check that the allocation was right; then
+        check the equalities in the first comment.  It should say
+        "r13=r13, r11=r11, r12=r12" */
+-    __asm__ volatile ("
+-      .ifnc %0%1%2%3,$r13$r11$r12$r10                                 \n\
+-      .err                                                            \n\
+-      .endif                                                          \n\
+-
+-      ;; Save the registers we'll use in the movem process
+-      ;; on the stack.
+-      subq    11*4,$sp
+-      movem   $r10,[$sp]
+-
+-      ;; Now we've got this:
+-      ;; r11 - src
+-      ;; r13 - dst
+-      ;; r12 - n
+-
+-      ;; Update n for the first loop
+-      subq    44,$r12
+-0:
+-      movem   [$r11+],$r10
+-1:
+-      subq   44,$r12
+-      bge     0b
+-      movem   $r10,[$r13+]
+-
+-      addq   44,$r12  ;; compensate for last loop underflowing n
+-
+-      ;; Restore registers from stack
+-      movem [$sp+],$r10
+-4:
+-      .section .fixup,\"ax\"
+-
+-;; Do not jump back into the loop if we fail.  For some uses, we get a
+-;; page fault somewhere on the line.  Without checking for page limits,
+-;; we don't know where, but we need to copy accurately and keep an
+-;; accurate count; not just clear the whole line.  To do that, we fall
+-;; down in the code below, proceeding with smaller amounts.  It should
+-;; be kept in mind that we have to cater to code like what at one time
+-;; was in fs/super.c:
+-;;  i = size - copy_from_user((void *)page, data, size);
+-;; which would cause repeated faults while clearing the remainder of
+-;; the SIZE bytes at PAGE after the first fault.
+-;; A caveat here is that we must not fall through from a failing page
+-;; to a valid page.
+-
+-3:
+-      movem  [$sp+],$r10
+-      addq    44,$r12 ;; Get back count before faulting point.
+-      subq    44,$r11 ;; Get back pointer to faulting movem-line.
+-      jump    4b      ;; Fall through, pretending the fault didn't happen.
+-
+-      .previous
+-      .section __ex_table,\"a\"
+-      .dword 1b,3b
+-      .previous"
++    __asm__ volatile (
++      ".ifnc %0%1%2%3,$r13$r11$r12$r10                \n\t"
++      ".err                                   \n\t"        
++      ".endif                         \n\t"                
++      "subq   11*4,$sp\n\t"
++      "movem  $r10,[$sp]\n\t"
++      "subq   44,$r12\n\t"
++      "0:\n\t"
++      "movem  [$r11+],$r10\n\t"
++      "1:\n\t"
++      "subq   44,$r12\n\t"
++      "bge    0b\n\t"
++      "movem  $r10,[$r13+]\n\t"
++      "addq   44,$r12  \n\t"
++      "movem [$sp+],$r10\n\t"
++      "4:\n\t"
++      ".section .fixup,\"ax\"\n\t"
++      "3:\n\t"
++      "movem  [$sp+],$r10\n\t"
++      "addq   44,$r12\n\t"
++      "subq   44,$r11\n\t"
++      "jump   4b      \n\t"
++      ".previous\n\t"
++      ".section __ex_table,\"a\"\n\t"
++      ".dword 1b,3b\n\t"
++      ".previous\n\t"
+ 
+      /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn)
+      /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn));
+@@ -425,66 +372,50 @@
+       If you want to check that the allocation was right; then
+       check the equalities in the first comment.  It should say
+       something like "r13=r13, r11=r11, r12=r12". */
+-    __asm__ volatile ("
+-      .ifnc %0%1%2,$r13$r12$r10                                       \n\
+-      .err                                                            \n\
+-      .endif                                                          \n\
+-
+-      ;; Save the registers we'll clobber in the movem process
+-      ;; on the stack.  Don't mention them to gcc, it will only be
+-      ;; upset.
+-      subq    11*4,$sp
+-      movem   $r10,[$sp]
+-
+-      clear.d $r0
+-      clear.d $r1
+-      clear.d $r2
+-      clear.d $r3
+-      clear.d $r4
+-      clear.d $r5
+-      clear.d $r6
+-      clear.d $r7
+-      clear.d $r8
+-      clear.d $r9
+-      clear.d $r10
+-      clear.d $r11
+-
+-      ;; Now we've got this:
+-      ;; r13 - dst
+-      ;; r12 - n
+-
+-      ;; Update n for the first loop
+-      subq    12*4,$r12
+-0:
+-      subq   12*4,$r12
+-      bge     0b
+-      movem   $r11,[$r13+]
+-1:
+-      addq   12*4,$r12        ;; compensate for last loop underflowing n
+-
+-      ;; Restore registers from stack
+-      movem [$sp+],$r10
+-2:
+-      .section .fixup,\"ax\"
+-3:
+-      move.d [$sp],$r10
+-      addq 12*4,$r10
+-      move.d $r10,[$sp]
+-      clear.d $r10
+-      jump 0b
+-
+-4:
+-      movem [$sp+],$r10
+-      addq 12*4,$r10
+-      addq 12*4,$r12
+-      jump 2b
+-
+-      .previous
+-      .section __ex_table,\"a\"
+-      .dword 0b,3b
+-      .dword 1b,4b
+-      .previous"
+-
++    __asm__ volatile (
++      ".ifnc %0%1%2,$r13$r12$r10\n\t"
++      ".err                           \n\t"
++      ".endif\n\t"
++      "subq   11*4,$sp\n\t"
++      "movem  $r10,[$sp]\n\t"
++      "clear.d $r0\n\t"
++      "clear.d $r1\n\t"
++      "clear.d $r2\n\t"
++      "clear.d $r3\n\t"
++      "clear.d $r4\n\t"
++      "clear.d $r5\n\t"
++      "clear.d $r6\n\t"
++      "clear.d $r7\n\t"
++      "clear.d $r8\n\t"
++      "clear.d $r9\n\t"
++      "clear.d $r10\n\t"
++      "clear.d $r11\n\t"
++      "subq   12*4,$r12\n\t"
++      "0:\n\t"
++      "subq   12*4,$r12\n\t"
++      "bge    0b\n\t"
++      "movem  $r11,[$r13+]\n\t"
++      "1:     \n\t"
++      "addq   12*4,$r12        \n\t"
++      "movem [$sp+],$r10\n\t"
++      "2:\n\t"
++      ".section .fixup,\"ax\"\n\t"
++      "3:\n\t"
++      "move.d [$sp],$r10\n\t"
++      "addq 12*4,$r10\n\t"
++      "move.d $r10,[$sp]\n\t"
++      "clear.d $r10\n\t"
++      "jump 0b\n\t"
++      "4:\n\t"
++      "movem [$sp+],$r10\n\t"
++      "addq 12*4,$r10\n\t"
++      "addq 12*4,$r12\n\t"
++      "jump 2b\n\t"
++      ".previous\n\t"
++      ".section __ex_table,\"a\"\n\t"
++      ".dword 0b,3b\n\t"
++      ".dword 1b,4b\n\t"
++      ".previous\n\t"
+      /* Outputs */ : "=r" (dst), "=r" (n), "=r" (retn)
+      /* Inputs */ : "0" (dst), "1" (n), "2" (retn)
+      /* Clobber */ : "r11");