alsa-lib: NEON mixing optimizations
author Grazvydas Ignotas <notasas@gmail.com>
Mon, 10 Jun 2013 21:57:09 +0000 (00:57 +0300)
committer Grazvydas Ignotas <notasas@gmail.com>
Tue, 11 Jun 2013 10:23:38 +0000 (13:23 +0300)
measurements on pandora:
old code: ~280 cycles/sample
new (when only one stream): ~15 cycles/sample
new (many streams): ~47 cycles/sample (per stream)

recipes/alsa/alsa-lib/neon_mixer.patch [new file with mode: 0644]
recipes/alsa/alsa-lib_1.0.20.bb

diff --git a/recipes/alsa/alsa-lib/neon_mixer.patch b/recipes/alsa/alsa-lib/neon_mixer.patch
new file mode 100644 (file)
index 0000000..79c024e
--- /dev/null
@@ -0,0 +1,275 @@
+diff -Nur alsa-lib-1.0.20_orig/configure.in alsa-lib-1.0.20/configure.in
+--- alsa-lib-1.0.20_orig/configure.in  2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/configure.in       2013-06-11 00:15:10.049448655 +0300
+@@ -43,6 +43,7 @@
+ AC_DISABLE_STATIC
+ AC_LIBTOOL_DLOPEN
+ AM_PROG_LIBTOOL
++AM_PROG_AS
+ CC_NOUNDEFINED
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/Makefile.am alsa-lib-1.0.20/src/pcm/Makefile.am
+--- alsa-lib-1.0.20_orig/src/pcm/Makefile.am   2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/Makefile.am        2013-06-11 00:15:09.993448660 +0300
+@@ -66,6 +66,7 @@
+ endif
+ if BUILD_PCM_PLUGIN_DMIX
+ libpcm_la_SOURCES += pcm_dmix.c
++libpcm_la_SOURCES += pcm_dmix_arm_neon.S
+ endif
+ if BUILD_PCM_PLUGIN_DSHARE
+ libpcm_la_SOURCES += pcm_dshare.c
+@@ -103,7 +104,7 @@
+ libpcm_la_SOURCES += pcm_mmap_emul.c
+ endif
+-EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c
++EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c pcm_dmix_arm_neon.S
+ noinst_HEADERS = pcm_local.h pcm_plugin.h mask.h mask_inline.h \
+                interval.h interval_inline.h plugin_ops.h ladspa.h \
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S
+--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S   1970-01-01 03:00:00.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S        2013-06-11 00:15:09.973448652 +0300
+@@ -0,0 +1,118 @@
++/*
++ * (C) Gražvydas "notaz" Ignotas, 2013
++ *
++ * This work is licensed under the terms of any of these licenses
++ * (at your option):
++ *  - GNU GPL, version 2 or later.
++ *  - GNU LGPL, version 2.1 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#ifdef __ARM_NEON__
++
++.text
++.align 2
++@ do_sample_arm: scalar mix of one 16-bit sample. In: r0=dst, r1=src, r2=sum, r3=count. Clobbers r4-r6, r12.
++.macro do_sample_arm
++    ldrsh       r4, [r0]                @ r4 = *dst
++    ldrsh       r5, [r1], #2            @ r5 = *src++
++    ldr         r6, [r2]                @ r6 = *sum
++    tst         r4, r4
++    moveq       r6, #0                  @ dst sample is 0: discard the old sum
++    add         r6, r5                  @ r6 += src sample
++    ssat        r12, #16, r6            @ r12 = r6 saturated to signed 16 bits
++    subs        r3, #1                  @ count--; the stores below leave these flags intact
++    str         r6, [r2], #4            @ *sum++ = r6 (unsaturated)
++    strh        r12,[r0], #2            @ *dst++ = saturated result
++.endm
++@ mix_areas_16_arm_neon: add src into the 32-bit sum buffer and write the saturated 16-bit result to dst.
++.global mix_areas_16_arm_neon @ short *dst, short *src, int *sum, int samples
++.type mix_areas_16_arm_neon, %function
++mix_areas_16_arm_neon:
++    push        {r4-r6}       @ scratch registers used by do_sample_arm
++    cmp         r3, #11       @ at least one block + potential alignment?
++    blt         3f            @ too short for the NEON path: scalar loop only
++    tst         r2, #0x0f     @ sum buffer aligned?
++    beq         1f
++@ scalar samples until sum (r2) reaches a 16-byte boundary
++0:
++    do_sample_arm
++    tst         r2, #0x0f
++    bne         0b
++@ NEON path: 8 samples per iteration
++1:
++    sub         r3, #8        @ pre-bias the count so subs/bge exits when fewer than 8 remain
++    vmov.i8     q15, #0       @ zero vector for the dst != 0 compare
++@ main loop; sum is accessed with a :128 alignment qualifier
++0:
++    vld1.16     {q0}, [r0]              @ q0 = dst[0..7]
++    vld1.16     {q1}, [r1]!             @ q1 = src[0..7], src += 8
++    vld1.32     {q2,q3}, [r2, :128]     @ q2,q3 = sum[0..7]
++    vcgt.u16    q8, q0, q15     @ q8[n]=0xffff if dst[n] != 0
++    pld         [r0, #2*64]     @ prefetch 128 bytes ahead
++    vmovl.s16   q10, d16        @ sign-extend the mask to 32 bits per lane
++    vmovl.s16   q11, d17
++    pld         [r1, #2*64]
++    vand        q2, q10         @ sum[n] &= wide(q8[n])
++    vand        q3, q11
++    pld         [r2, #2*64]
++    vaddw.s16   q2, d2          @ sum[n] += src[n]
++    vaddw.s16   q3, d3
++    subs        r3, #8          @ flags are consumed by the bge at the end of the loop
++    vst1.32     {q2,q3}, [r2, :128]!    @ store the sums, sum += 8
++    vqmovn.s32  d0, q2          @ saturate the sums to signed 16 bits
++    vqmovn.s32  d1, q3
++    vst1.16     {q0}, [r0]!     @ store the mixed samples, dst += 8
++    bge         0b
++@ 0..7 samples may remain here
++2:
++    adds        r3, #8          @ undo the bias; zero means nothing is left
++    popeq       {r4-r6}
++    bxeq        lr
++@ scalar tail, also entered directly for short inputs
++3:
++    do_sample_arm              @ does subs r3, #1
++    bgt         3b
++@ NOTE(review): label 3 mixes one sample before testing the count, so samples must be >= 1
++    pop         {r4-r6}
++    bx          lr
++
++    .size       mix_areas_16_arm_neon, .-mix_areas_16_arm_neon
++
++@ expand_src_16_arm_neon: copy src to dst and store each sample sign-extended to 32 bits in sum (no mixing).
++.global expand_src_16_arm_neon @ short *dst, short *src, int *sum, int samples
++.type expand_src_16_arm_neon, %function
++expand_src_16_arm_neon:
++    subs        r3, #8          @ pre-bias the count; negative means fewer than 8 samples
++    blt         1f
++@ NEON path: 8 samples per iteration
++0:
++    vld1.16     {q0}, [r1]!     @ q0 = src[0..7], src += 8
++    subs        r3, #8          @ flags are consumed by the bge at the end of the loop
++    pld         [r1, #2*64]     @ prefetch 128 bytes ahead
++    vmovl.s16   q1, d0          @ widen to 32 bits for the sum buffer
++    vmovl.s16   q2, d1
++    vst1.16     {q0}, [r0]!     @ dst[0..7] = src[0..7], dst += 8
++    vst1.32     {q1,q2}, [r2]!  @ sum[0..7] = widened samples, sum += 8
++    bge         0b
++@ 0..7 samples may remain here
++1:
++    adds        r3, #8          @ undo the bias; zero means nothing is left
++    bxeq        lr
++@ scalar tail, one sample per iteration
++0:
++    ldrsh       r12, [r1], #2   @ r12 = *src++ (sign-extended)
++    subs        r3, #1
++    str         r12, [r2], #4   @ *sum++ = r12
++    strh        r12, [r0], #2   @ *dst++ = low 16 bits of r12
++    bgt         0b
++
++    bx          lr
++
++    .size       expand_src_16_arm_neon, .-expand_src_16_arm_neon
++
++#else
++#error meh
++#endif
++
++@ vim:filetype=armasm:shiftwidth=4:expandtab
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c alsa-lib-1.0.20/src/pcm/pcm_dmix.c
+--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c    2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/pcm_dmix.c 2013-06-11 00:15:09.977448646 +0300
+@@ -154,7 +154,8 @@
+                     const snd_pcm_channel_area_t *dst_areas,
+                     snd_pcm_uframes_t src_ofs,
+                     snd_pcm_uframes_t dst_ofs,
+-                    snd_pcm_uframes_t size)
++                    snd_pcm_uframes_t size,
++                    int single_player)
+ {
+       unsigned int src_step, dst_step;
+       unsigned int chn, dchn, channels, sample_size;
+@@ -163,6 +164,24 @@
+       channels = dmix->channels;
+       switch (dmix->shmptr->s.format) {
+       case SND_PCM_FORMAT_S16_LE:
++#if defined(__ARM_NEON__) && defined(SNDRV_LITTLE_ENDIAN)
++              if (dmix->interleaved) {
++                      extern void mix_areas_16_arm_neon(short *dst,
++                              const short *src, int *sum, int samples);
++                      extern void expand_src_16_arm_neon(short *dst,
++                              const short *src, int *sum, int samples);
++                      short *dst = (short *)dst_areas[0].addr + dst_ofs * channels;
++                      const short *src = (short *)src_areas[0].addr + src_ofs * channels;
++                      int *sum = dmix->u.dmix.sum_buffer + dst_ofs * channels;
++
++                      if (single_player)
++                              expand_src_16_arm_neon(dst, src, sum, size * channels);
++                      else
++                              mix_areas_16_arm_neon(dst, src, sum, size * channels);
++                      return;
++              }
++#endif
++              // FALLTHROUGH
+       case SND_PCM_FORMAT_S16_BE:
+               sample_size = 2;
+               do_mix_areas = (mix_areas_t *)dmix->u.dmix.mix_areas_16;
+@@ -291,6 +311,22 @@
+ #endif
+ #endif
++static unsigned int ttime(void) /* benchmark timestamp helper; in the visible hunks it is only called from disabled (#if 0) code */
++{
++#if 0 /* disabled alternative: wall-clock time in microseconds */
++      struct timeval tv;
++      gettimeofday(&tv, NULL);
++      //return tv.tv_sec * 1000 + tv.tv_usec / 1000;
++      return tv.tv_sec * 1000000 + tv.tv_usec;
++#else
++        unsigned int val;
++      __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" /* read CP15 c9,c13,0; presumably the cycle counter that the disabled setup code enables - confirm against the ARM ARM */
++                      : "=r"(val));
++      return val;
++/* NOTE(review): the mrc above is ARM-only and is not guarded by an architecture check in this function */
++#endif
++}
++
+ /*
+  *  synchronize shm ring buffer with hardware
+  */
+@@ -300,6 +336,7 @@
+       snd_pcm_uframes_t slave_hw_ptr, slave_appl_ptr, slave_size;
+       snd_pcm_uframes_t appl_ptr, size, transfer;
+       const snd_pcm_channel_area_t *src_areas, *dst_areas;
++      int single_player = 0;
+       
+       /* calculate the size to transfer */
+       /* check the available size in the local buffer
+@@ -360,13 +397,33 @@
+       dmix->slave_appl_ptr += size;
+       dmix->slave_appl_ptr %= dmix->slave_boundary;
+       dmix_down_sem(dmix);
++#ifdef NO_CONCURRENT_ACCESS
++      struct shmid_ds buf;
++      int ret = shmctl(dmix->shmid, IPC_STAT, &buf);
++      single_player = (ret == 0 && buf.shm_nattch == 1);
++#endif
++#if 0
++      static int ccr;
++      if (!ccr++) {
++              int v;
++              asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
++              v |= 5; // master enable, ccnt reset
++              v &= ~8; // ccnt divider 0
++              asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
++              // enable cycle counter
++              asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
++      }
++      unsigned int start = ttime();
++      unsigned int ssize = size * dmix->channels;
++#endif
++
+       for (;;) {
+               transfer = size;
+               if (appl_ptr + transfer > pcm->buffer_size)
+                       transfer = pcm->buffer_size - appl_ptr;
+               if (slave_appl_ptr + transfer > dmix->slave_buffer_size)
+                       transfer = dmix->slave_buffer_size - slave_appl_ptr;
+-              mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer);
++              mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer, single_player);
+               size -= transfer;
+               if (! size)
+                       break;
+@@ -375,6 +432,17 @@
+               appl_ptr += transfer;
+               appl_ptr %= pcm->buffer_size;
+       }
++
++#if 0
++      static unsigned int ct, tbytes;
++
++      ct += ttime() - start;
++      tbytes += ssize;
++      if (tbytes >= 64*1024) {
++              printf("%.3f\n", (float)ct / (float)tbytes);
++              ct = tbytes = 0;
++      }
++#endif
+       dmix_up_sem(dmix);
+ }
index 55951be..69d97fc 100644 (file)
@@ -2,7 +2,7 @@ DESCRIPTION = "Alsa sound library"
 HOMEPAGE = "http://www.alsa-project.org"
 SECTION = "libs/multimedia"
 LICENSE = "LGPLv2.1"
-PR = "r3"
+PR = "r4"
 
 # configure.in sets -D__arm__ on the command line for any arm system
 # (not just those with the ARM instruction set), this should be removed,
@@ -13,6 +13,7 @@ ARM_INSTRUCTION_SET = "arm"
 SRC_URI = "ftp://ftp.alsa-project.org/pub/lib/alsa-lib-${PV}.tar.bz2 \
            file://fix-tstamp-declaration.patch;patch=1 \
           file://fix_libmath.patch;patch=1 \
+          file://neon_mixer.patch;patch=1 \
          "
 
 inherit autotools pkgconfig