[PATCH] i386: optimize swab64

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] i386: optimize swab64

Denis Vlasenko
I noticed that swab64 explicitly swaps 32-bit halves, but this is
not really needed because CPU is 32-bit anyway and we can
just tell GCC to treat registers as being swapped.

Example of resulting code:

       mov    0x20(%ecx,%edi,8),%eax
       mov    0x24(%ecx,%edi,8),%edx
       lea    0x1(%edi),%esi
       mov    %esi,0xfffffdf4(%ebp)
       mov    %eax,%ebx
       mov    %edx,%esi
       bswap  %ebx
       bswap  %esi
       mov    %esi,0xffffff74(%ebp,%edi,8)
       mov    %ebx,0xffffff78(%ebp,%edi,8)

As you can see, swap is achieved simply by using
appropriate registers in last two insns.

(Why does gcc do extra register moves just before bswaps
is another question. No regression here, old code had them too)

Run-tested.
--
vda

diff -urpN linux-2.6.12-rc2.0.orig/include/asm-i386/byteorder.h linux-2.6.12-rc2.z.cur/include/asm-i386/byteorder.h
--- linux-2.6.12-rc2.0.orig/include/asm-i386/byteorder.h Tue Oct 19 00:54:36 2004
+++ linux-2.6.12-rc2.z.cur/include/asm-i386/byteorder.h Sun Apr 24 22:38:14 2005
@@ -25,6 +25,8 @@ static __inline__ __attribute_const__ __
  return x;
 }

+/* NB: swap of 32-bit halves is achieved by asm constraints.
+** This will save a xchgl in many cases */
 static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 val)
 {
  union {
@@ -33,13 +35,13 @@ static __inline__ __attribute_const__ __
  } v;
  v.u = val;
 #ifdef CONFIG_X86_BSWAP
- asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
-    : "=r" (v.s.a), "=r" (v.s.b)
+ asm("bswapl %0 ; bswapl %1"
+    : "=r" (v.s.b), "=r" (v.s.a)
     : "0" (v.s.a), "1" (v.s.b));
 #else
-   v.s.a = ___arch__swab32(v.s.a);
+ v.s.a = ___arch__swab32(v.s.a);
  v.s.b = ___arch__swab32(v.s.b);
- asm("xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b));
+ asm("" : "=r" (v.s.b), "=r" (v.s.a) : "0" (v.s.a), "1" (v.s.b));
 #endif
  return v.u;
 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [hidden email]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] i386: optimize swab64

Andi Kleen-2
On Mon, Apr 25, 2005 at 10:19:30AM +0300, Denis Vlasenko wrote:
> I noticed that swab64 explicitly swaps 32-bit halves, but this is
> not really needed because CPU is 32-bit anyway and we can
> just tell GCC to treat registers as being swapped.

No, we went through this exactly when the code was originally done.
gcc puts long long only into aligned register pairs, and with
register swap you need at least 4 registers which blows near
all possible registers away and completely breaks register
allocation in the function. Dont apply this!

-Andi

>
> Example of resulting code:
>
>        mov    0x20(%ecx,%edi,8),%eax
>        mov    0x24(%ecx,%edi,8),%edx
>        lea    0x1(%edi),%esi
>        mov    %esi,0xfffffdf4(%ebp)
>        mov    %eax,%ebx
>        mov    %edx,%esi
>        bswap  %ebx
>        bswap  %esi
>        mov    %esi,0xffffff74(%ebp,%edi,8)
>        mov    %ebx,0xffffff78(%ebp,%edi,8)
>
> As you can see, swap is achieved simply by using
> appropriate registers in last two insns.
>
> (Why does gcc do extra register moves just before bswaps
> is another question. No regression here, old code had them too)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [hidden email]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] i386: optimize swab64

Denis Vlasenko
On Monday 25 April 2005 18:53, Andi Kleen wrote:
> On Mon, Apr 25, 2005 at 10:19:30AM +0300, Denis Vlasenko wrote:
> > I noticed that swab64 explicitly swaps 32-bit halves, but this is
> > not really needed because CPU is 32-bit anyway and we can
> > just tell GCC to treat registers as being swapped.
>
> No, we went through this exactly when the code was originally done.
> gcc puts long long only into aligned register pairs, and with

I don't see this. However, gcc is indeed does some unneeded moves,
both with and without xchgl. I filed a bug report:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21202

> register swap you need at least 4 registers which blows near
> all possible registers away and completely breaks register
> allocation in the function. Dont apply this!

Andi, are you saying that code gets worse with this patch?
This is not true at least for gcc 3.4.3 and 4.0.0.
I just re-checked this.

This is with original code:

# objdump -r -d crypto/wp512.o
00000000 <wp512_process_buffer>:
       0:       55                      push   %ebp
       1:       89 e5                   mov    %esp,%ebp
       3:       57                      push   %edi
       4:       56                      push   %esi
       5:       53                      push   %ebx
       6:       81 ec 00 02 00 00       sub    $0x200,%esp
       c:       31 ff                   xor    %edi,%edi
       e:       8b 4d 08                mov    0x8(%ebp),%ecx
      11:       8b 54 f9 24             mov    0x24(%ecx,%edi,8),%edx
      15:       8b 44 f9 20             mov    0x20(%ecx,%edi,8),%eax
      19:       8d 77 01                lea    0x1(%edi),%esi
      1c:       89 b5 f4 fd ff ff       mov    %esi,0xfffffdf4(%ebp)
      22:       89 c1                   mov    %eax,%ecx
      24:       89 d6                   mov    %edx,%esi
      26:       0f c9                   bswap  %ecx
      28:       0f ce                   bswap  %esi
      2a:       87 ce                   xchg   %ecx,%esi             <=======
      2c:       89 8c fd 74 ff ff ff    mov    %ecx,0xffffff74(%ebp,%edi,8)
      33:       89 b4 fd 78 ff ff ff    mov    %esi,0xffffff78(%ebp,%edi,8)

Patched:

# objdump -r -d crypto_noswap/wp512.o
00000000 <wp512_process_buffer>:
       0:       55                      push   %ebp
       1:       89 e5                   mov    %esp,%ebp
       3:       57                      push   %edi
       4:       56                      push   %esi
       5:       53                      push   %ebx
       6:       81 ec 00 02 00 00       sub    $0x200,%esp
       c:       31 ff                   xor    %edi,%edi
       e:       8b 4d 08                mov    0x8(%ebp),%ecx
      11:       8b 44 f9 20             mov    0x20(%ecx,%edi,8),%eax
      15:       8b 54 f9 24             mov    0x24(%ecx,%edi,8),%edx
      19:       8d 77 01                lea    0x1(%edi),%esi
      1c:       89 b5 f4 fd ff ff       mov    %esi,0xfffffdf4(%ebp)
      22:       89 c3                   mov    %eax,%ebx
      24:       89 d6                   mov    %edx,%esi
      26:       0f cb                   bswap  %ebx
      28:       0f ce                   bswap  %esi
                                                  <========= NO xchg
      2a:       89 b4 fd 74 ff ff ff    mov    %esi,0xffffff74(%ebp,%edi,8)
      31:       89 9c fd 78 ff ff ff    mov    %ebx,0xffffff78(%ebp,%edi,8)

It is not a win only for wp512, other crypto modules are a tiny bit smaller too:

# echo crypto*/*.o | xargs -n1 | grep -Fv .mod. | sort -t / -k2,99 | xargs size
   text    data     bss     dec     hex filename
  17743     108       0   17851    45bb crypto/khazad.o
  17735     108       0   17843    45b3 crypto_noswap/khazad.o
    666     108       0     774     306 crypto/sha1.o
    664     108       0     772     304 crypto_noswap/sha1.o
   5160     364       0    5524    1594 crypto/sha512.o
   5156     364       0    5520    1590 crypto_noswap/sha512.o
  10239     364       0   10603    296b crypto/tgr192.o
  10233     364       0   10597    2965 crypto_noswap/tgr192.o
  22774     364       0   23138    5a62 crypto/wp512.o
  22770     364       0   23134    5a5e crypto_noswap/wp512.o
--
vda

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [hidden email]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/