--- /dev/null
+diff -Nur alsa-lib-1.0.20_orig/configure.in alsa-lib-1.0.20/configure.in
+--- alsa-lib-1.0.20_orig/configure.in 2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/configure.in 2013-06-11 00:15:10.049448655 +0300
+@@ -43,6 +43,7 @@
+ AC_DISABLE_STATIC
+ AC_LIBTOOL_DLOPEN
+ AM_PROG_LIBTOOL
++AM_PROG_AS
+
+ CC_NOUNDEFINED
+
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/Makefile.am alsa-lib-1.0.20/src/pcm/Makefile.am
+--- alsa-lib-1.0.20_orig/src/pcm/Makefile.am 2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/Makefile.am 2013-06-11 00:15:09.993448660 +0300
+@@ -66,6 +66,7 @@
+ endif
+ if BUILD_PCM_PLUGIN_DMIX
+ libpcm_la_SOURCES += pcm_dmix.c
++libpcm_la_SOURCES += pcm_dmix_arm_neon.S
+ endif
+ if BUILD_PCM_PLUGIN_DSHARE
+ libpcm_la_SOURCES += pcm_dshare.c
+@@ -103,7 +104,7 @@
+ libpcm_la_SOURCES += pcm_mmap_emul.c
+ endif
+
+-EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c
++EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c pcm_dmix_arm_neon.S
+
+ noinst_HEADERS = pcm_local.h pcm_plugin.h mask.h mask_inline.h \
+ interval.h interval_inline.h plugin_ops.h ladspa.h \
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S
+--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S 1970-01-01 03:00:00.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S 2013-06-11 00:15:09.973448652 +0300
+@@ -0,0 +1,118 @@
++/*
++ * (C) Gražvydas "notaz" Ignotas, 2013
++ *
++ * This work is licensed under the terms of any of these licenses
++ * (at your option):
++ * - GNU GPL, version 2 or later.
++ * - GNU LGPL, version 2.1 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#ifdef __ARM_NEON__
++
++.text
++.align 2
++@ do_sample_arm: mix one sample with ARM core instructions (writes r4-r6, r12).
++.macro do_sample_arm
++ ldrsh r4, [r0] @ r4 = *dst, the current output sample
++ ldrsh r5, [r1], #2 @ r5 = *src++
++ ldr r6, [r2] @ r6 = *sum, the 32-bit running sum
++ tst r4, r4 @ Z is set when the output sample is zero
++ moveq r6, #0 @ ... in which case the old sum is dropped
++ add r6, r5 @ sum += src
++ ssat r12, #16, r6 @ r12 = sum saturated to signed 16 bits
++ subs r3, #1 @ count the sample; the stores below keep these flags
++ str r6, [r2], #4 @ *sum++ = unsaturated sum
++ strh r12,[r0], #2 @ *dst++ = saturated sum
++.endm
++
++@ mix_areas_16_arm_neon: sum[n] = (dst[n] ? sum[n] : 0) + src[n], dst[n] = sat16(sum[n])
++@ In: r0 = dst, r1 = src, r2 = sum, r3 = samples. A count <= 0 returns at once.
++.global mix_areas_16_arm_neon @ short *dst, short *src, int *sum, int samples
++.type mix_areas_16_arm_neon, %function
++mix_areas_16_arm_neon:
++ cmp r3, #0 @ the tail loop at 3: mixes a sample before testing the
++ bxle lr @ count, so an empty request must not reach it
++ push {r4-r6} @ scratch registers used by do_sample_arm
++ cmp r3, #11 @ at least one block + potential alignment?
++ blt 3f
++ tst r2, #0x0f @ sum buffer aligned?
++ beq 1f
++
++0: @ scalar samples until sum is 16-byte aligned
++ do_sample_arm
++ tst r2, #0x0f
++ bne 0b
++
++1:
++ sub r3, #8 @ bias the count: bge below means another full block
++ vmov.i8 q15, #0
++0: @ NEON loop, 8 samples per iteration
++ vld1.16 {q0}, [r0]
++ vld1.16 {q1}, [r1]!
++ vld1.32 {q2,q3}, [r2, :128]
++ vcgt.u16 q8, q0, q15 @ q8[n]=0xffff if dst[n] != 0
++ pld [r0, #2*64]
++ vmovl.s16 q10, d16
++ vmovl.s16 q11, d17
++ pld [r1, #2*64]
++ vand q2, q10 @ sum[n] &= wide(q8[n])
++ vand q3, q11
++ pld [r2, #2*64]
++ vaddw.s16 q2, d2 @ sum[n] += src[n]
++ vaddw.s16 q3, d3
++ subs r3, #8
++ vst1.32 {q2,q3}, [r2, :128]!
++ vqmovn.s32 d0, q2
++ vqmovn.s32 d1, q3
++ vst1.16 {q0}, [r0]!
++ bge 0b
++
++2:
++ adds r3, #8 @ r3 = samples left over (undo the bias)
++ popeq {r4-r6}
++ bxeq lr
++3: @ scalar tail, also the whole job for short requests
++ do_sample_arm @ does subs r3, #1
++ bgt 3b
++ pop {r4-r6}
++ bx lr
++ .size mix_areas_16_arm_neon, .-mix_areas_16_arm_neon
++
++
++@ Copy src[] to dst[] and store each sample sign-extended to 32 bits in sum[].
++.global expand_src_16_arm_neon @ short *dst, short *src, int *sum, int samples
++.type expand_src_16_arm_neon, %function
++expand_src_16_arm_neon:
++ subs r3, #8 @ fewer than 8 samples: skip the NEON loop
++ blt 1f
++
++0: @ NEON loop, 8 samples per iteration
++ vld1.16 {q0}, [r1]! @ q0 = 8 source samples, src advances past them
++ subs r3, #8 @ flags are consumed by bge at the end of the loop
++ pld [r1, #2*64]
++ vmovl.s16 q1, d0 @ widen samples 0-3 to 32 bits
++ vmovl.s16 q2, d1 @ widen samples 4-7 to 32 bits
++ vst1.16 {q0}, [r0]! @ dst receives the samples unchanged
++ vst1.32 {q1,q2}, [r2]! @ sum receives the widened copies
++ bge 0b
++
++1:
++ adds r3, #8 @ r3 = samples left over (undo the bias)
++ bxeq lr @ none left: done
++
++0: @ scalar tail, one sample per iteration
++ ldrsh r12, [r1], #2 @ r12 = *src++ (sign-extended)
++ subs r3, #1
++ str r12, [r2], #4 @ *sum++ = r12
++ strh r12, [r0], #2 @ *dst++ = low 16 bits of r12
++ bgt 0b
++ bx lr
++
++ .size expand_src_16_arm_neon, .-expand_src_16_arm_neon
++
++#else
++#error "pcm_dmix_arm_neon.S requires NEON support (__ARM_NEON__ is not defined)"
++#endif /* __ARM_NEON__ */
++
++@ vim:filetype=armasm:shiftwidth=4:expandtab
+diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c alsa-lib-1.0.20/src/pcm/pcm_dmix.c
+--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c 2009-05-06 10:07:23.000000000 +0300
++++ alsa-lib-1.0.20/src/pcm/pcm_dmix.c 2013-06-11 00:15:09.977448646 +0300
+@@ -154,7 +154,8 @@
+ const snd_pcm_channel_area_t *dst_areas,
+ snd_pcm_uframes_t src_ofs,
+ snd_pcm_uframes_t dst_ofs,
+- snd_pcm_uframes_t size)
++ snd_pcm_uframes_t size,
++ int single_player)
+ {
+ unsigned int src_step, dst_step;
+ unsigned int chn, dchn, channels, sample_size;
+@@ -163,6 +164,24 @@
+ channels = dmix->channels;
+ switch (dmix->shmptr->s.format) {
+ case SND_PCM_FORMAT_S16_LE:
++#if defined(__ARM_NEON__) && defined(SNDRV_LITTLE_ENDIAN)
++ if (dmix->interleaved) {
++ extern void mix_areas_16_arm_neon(short *dst,
++ const short *src, int *sum, int samples);
++ extern void expand_src_16_arm_neon(short *dst,
++ const short *src, int *sum, int samples);
++ short *dst = (short *)dst_areas[0].addr + dst_ofs * channels;
++ const short *src = (short *)src_areas[0].addr + src_ofs * channels;
++ int *sum = dmix->u.dmix.sum_buffer + dst_ofs * channels;
++
++ if (single_player)
++ expand_src_16_arm_neon(dst, src, sum, size * channels);
++ else
++ mix_areas_16_arm_neon(dst, src, sum, size * channels);
++ return;
++ }
++#endif
++ // FALLTHROUGH
+ case SND_PCM_FORMAT_S16_BE:
+ sample_size = 2;
+ do_mix_areas = (mix_areas_t *)dmix->u.dmix.mix_areas_16;
+@@ -291,6 +311,22 @@
+ #endif
+ #endif
+
++/* Raw timestamp for the disabled (#if 0) profiling code in this file: the
++ * value of cp15 register c9, c13, 0 (presumably the cycle counter). */
++static unsigned int ttime(void)
++{
++#if 0
++ struct timeval now;
++ gettimeofday(&now, NULL);
++ return now.tv_sec * 1000000 + now.tv_usec;
++#else
++ unsigned int counter;
++
++ __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(counter));
++ return counter;
++#endif
++}
++
+ /*
+ * synchronize shm ring buffer with hardware
+ */
+@@ -300,6 +336,7 @@
+ snd_pcm_uframes_t slave_hw_ptr, slave_appl_ptr, slave_size;
+ snd_pcm_uframes_t appl_ptr, size, transfer;
+ const snd_pcm_channel_area_t *src_areas, *dst_areas;
++ int single_player = 0;
+
+ /* calculate the size to transfer */
+ /* check the available size in the local buffer
+@@ -360,13 +397,33 @@
+ dmix->slave_appl_ptr += size;
+ dmix->slave_appl_ptr %= dmix->slave_boundary;
+ dmix_down_sem(dmix);
++#ifdef NO_CONCURRENT_ACCESS
++ struct shmid_ds buf;
++ int ret = shmctl(dmix->shmid, IPC_STAT, &buf);
++ single_player = (ret == 0 && buf.shm_nattch == 1);
++#endif
++#if 0
++ static int ccr;
++ if (!ccr++) {
++ int v;
++ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
++ v |= 5; // master enable, ccnt reset
++ v &= ~8; // ccnt divider 0
++ asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
++ // enable cycle counter
++ asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
++ }
++ unsigned int start = ttime();
++ unsigned int ssize = size * dmix->channels;
++#endif
++
+ for (;;) {
+ transfer = size;
+ if (appl_ptr + transfer > pcm->buffer_size)
+ transfer = pcm->buffer_size - appl_ptr;
+ if (slave_appl_ptr + transfer > dmix->slave_buffer_size)
+ transfer = dmix->slave_buffer_size - slave_appl_ptr;
+- mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer);
++ mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer, single_player);
+ size -= transfer;
+ if (! size)
+ break;
+@@ -375,6 +432,17 @@
+ appl_ptr += transfer;
+ appl_ptr %= pcm->buffer_size;
+ }
++
++#if 0
++ static unsigned int ct, tbytes;
++
++ ct += ttime() - start;
++ tbytes += ssize;
++ if (tbytes >= 64*1024) {
++ printf("%.3f\n", (float)ct / (float)tbytes);
++ ct = tbytes = 0;
++ }
++#endif
+ dmix_up_sem(dmix);
+ }
+