From: Grazvydas Ignotas Date: Mon, 10 Jun 2013 21:57:09 +0000 (+0300) Subject: alsa-lib: NEON mixing optimizations X-Git-Tag: sz_160~19 X-Git-Url: http://git.openpandora.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bb3feed69496c7d2e5145bc7dd0c406e4c10b5ea;p=openembedded.git alsa-lib: NEON mixing optimizations measurements on pandora: old code: ~280 cycles/sample new (when only one stream): ~15 new (many streams): ~47 (per stream) --- diff --git a/recipes/alsa/alsa-lib/neon_mixer.patch b/recipes/alsa/alsa-lib/neon_mixer.patch new file mode 100644 index 0000000000..79c024e848 --- /dev/null +++ b/recipes/alsa/alsa-lib/neon_mixer.patch @@ -0,0 +1,275 @@ +diff -Nur alsa-lib-1.0.20_orig/configure.in alsa-lib-1.0.20/configure.in +--- alsa-lib-1.0.20_orig/configure.in 2009-05-06 10:07:23.000000000 +0300 ++++ alsa-lib-1.0.20/configure.in 2013-06-11 00:15:10.049448655 +0300 +@@ -43,6 +43,7 @@ + AC_DISABLE_STATIC + AC_LIBTOOL_DLOPEN + AM_PROG_LIBTOOL ++AM_PROG_AS + + CC_NOUNDEFINED + +diff -Nur alsa-lib-1.0.20_orig/src/pcm/Makefile.am alsa-lib-1.0.20/src/pcm/Makefile.am +--- alsa-lib-1.0.20_orig/src/pcm/Makefile.am 2009-05-06 10:07:23.000000000 +0300 ++++ alsa-lib-1.0.20/src/pcm/Makefile.am 2013-06-11 00:15:09.993448660 +0300 +@@ -66,6 +66,7 @@ + endif + if BUILD_PCM_PLUGIN_DMIX + libpcm_la_SOURCES += pcm_dmix.c ++libpcm_la_SOURCES += pcm_dmix_arm_neon.S + endif + if BUILD_PCM_PLUGIN_DSHARE + libpcm_la_SOURCES += pcm_dshare.c +@@ -103,7 +104,7 @@ + libpcm_la_SOURCES += pcm_mmap_emul.c + endif + +-EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c ++EXTRA_DIST = pcm_dmix_i386.c pcm_dmix_x86_64.c pcm_dmix_generic.c pcm_dmix_arm_neon.S + + noinst_HEADERS = pcm_local.h pcm_plugin.h mask.h mask_inline.h \ + interval.h interval_inline.h plugin_ops.h ladspa.h \ +diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S +--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix_arm_neon.S 1970-01-01 03:00:00.000000000 +0300 ++++ 
alsa-lib-1.0.20/src/pcm/pcm_dmix_arm_neon.S 2013-06-11 00:15:09.973448652 +0300 +@@ -0,0 +1,118 @@ ++/* ++ * (C) Gražvydas "notaz" Ignotas, 2013 ++ * ++ * This work is licensed under the terms of any of these licenses ++ * (at your option): ++ * - GNU GPL, version 2 or later. ++ * - GNU LGPL, version 2.1 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#ifdef __ARM_NEON__ ++ ++.text ++.align 2 ++ ++.macro do_sample_arm ++ ldrsh r4, [r0] ++ ldrsh r5, [r1], #2 ++ ldr r6, [r2] ++ tst r4, r4 ++ moveq r6, #0 ++ add r6, r5 ++ ssat r12, #16, r6 ++ subs r3, #1 ++ str r6, [r2], #4 ++ strh r12,[r0], #2 ++.endm ++ ++.global mix_areas_16_arm_neon @ short *dst, short *src, int *sum, int samples ++.type mix_areas_16_arm_neon, %function ++mix_areas_16_arm_neon: ++ push {r4-r6} ++ cmp r3, #11 @ at least one block + potential alignment? ++ blt 3f ++ tst r2, #0x0f @ sum buffer aligned? ++ beq 1f ++ ++0: ++ do_sample_arm ++ tst r2, #0x0f ++ bne 0b ++ ++1: ++ sub r3, #8 ++ vmov.i8 q15, #0 ++ ++0: ++ vld1.16 {q0}, [r0] ++ vld1.16 {q1}, [r1]! ++ vld1.32 {q2,q3}, [r2, :128] ++ vcgt.u16 q8, q0, q15 @ q8[n]=0xffff if dst[n] != 0 ++ pld [r0, #2*64] ++ vmovl.s16 q10, d16 ++ vmovl.s16 q11, d17 ++ pld [r1, #2*64] ++ vand q2, q10 @ sum[n] &= wide(q8[n]) ++ vand q3, q11 ++ pld [r2, #2*64] ++ vaddw.s16 q2, d2 @ sum[n] += src[n] ++ vaddw.s16 q3, d3 ++ subs r3, #8 ++ vst1.32 {q2,q3}, [r2, :128]! ++ vqmovn.s32 d0, q2 ++ vqmovn.s32 d1, q3 ++ vst1.16 {q0}, [r0]! ++ bge 0b ++ ++2: ++ adds r3, #8 ++ popeq {r4-r6} ++ bxeq lr ++ ++3: ++ do_sample_arm @ does subs r3, #1 ++ bgt 3b ++ ++ pop {r4-r6} ++ bx lr ++ ++ .size mix_areas_16_arm_neon, .-mix_areas_16_arm_neon ++ ++ ++.global expand_src_16_arm_neon @ short *dst, short *src, int *sum, int samples ++.type expand_src_16_arm_neon, %function ++expand_src_16_arm_neon: ++ subs r3, #8 ++ blt 1f ++ ++0: ++ vld1.16 {q0}, [r1]! ++ subs r3, #8 ++ pld [r1, #2*64] ++ vmovl.s16 q1, d0 ++ vmovl.s16 q2, d1 ++ vst1.16 {q0}, [r0]! 
++ vst1.32 {q1,q2}, [r2]! ++ bge 0b ++ ++1: ++ adds r3, #8 ++ bxeq lr ++ ++0: ++ ldrsh r12, [r1], #2 ++ subs r3, #1 ++ str r12, [r2], #4 ++ strh r12, [r0], #2 ++ bgt 0b ++ ++ bx lr ++ ++ .size expand_src_16_arm_neon, .-expand_src_16_arm_neon ++ ++#else ++#error meh ++#endif ++ ++@ vim:filetype=armasm:shiftwidth=4:expandtab +diff -Nur alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c alsa-lib-1.0.20/src/pcm/pcm_dmix.c +--- alsa-lib-1.0.20_orig/src/pcm/pcm_dmix.c 2009-05-06 10:07:23.000000000 +0300 ++++ alsa-lib-1.0.20/src/pcm/pcm_dmix.c 2013-06-11 00:15:09.977448646 +0300 +@@ -154,7 +154,8 @@ + const snd_pcm_channel_area_t *dst_areas, + snd_pcm_uframes_t src_ofs, + snd_pcm_uframes_t dst_ofs, +- snd_pcm_uframes_t size) ++ snd_pcm_uframes_t size, ++ int single_player) + { + unsigned int src_step, dst_step; + unsigned int chn, dchn, channels, sample_size; +@@ -163,6 +164,24 @@ + channels = dmix->channels; + switch (dmix->shmptr->s.format) { + case SND_PCM_FORMAT_S16_LE: ++#if defined(__ARM_NEON__) && defined(SNDRV_LITTLE_ENDIAN) ++ if (dmix->interleaved) { ++ extern void mix_areas_16_arm_neon(short *dst, ++ const short *src, int *sum, int samples); ++ extern void expand_src_16_arm_neon(short *dst, ++ const short *src, int *sum, int samples); ++ short *dst = (short *)dst_areas[0].addr + dst_ofs * channels; ++ const short *src = (short *)src_areas[0].addr + src_ofs * channels; ++ int *sum = dmix->u.dmix.sum_buffer + dst_ofs * channels; ++ ++ if (single_player) ++ expand_src_16_arm_neon(dst, src, sum, size * channels); ++ else ++ mix_areas_16_arm_neon(dst, src, sum, size * channels); ++ return; ++ } ++#endif ++ // FALLTHROUGH + case SND_PCM_FORMAT_S16_BE: + sample_size = 2; + do_mix_areas = (mix_areas_t *)dmix->u.dmix.mix_areas_16; +@@ -291,6 +311,22 @@ + #endif + #endif + ++static unsigned int ttime(void) ++{ ++#if 0 ++ struct timeval tv; ++ gettimeofday(&tv, NULL); ++ //return tv.tv_sec * 1000 + tv.tv_usec / 1000; ++ return tv.tv_sec * 1000000 + tv.tv_usec; ++#else ++ unsigned 
int val; ++ __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" ++ : "=r"(val)); ++ return val; ++ ++#endif ++} ++ + /* + * synchronize shm ring buffer with hardware + */ +@@ -300,6 +336,7 @@ + snd_pcm_uframes_t slave_hw_ptr, slave_appl_ptr, slave_size; + snd_pcm_uframes_t appl_ptr, size, transfer; + const snd_pcm_channel_area_t *src_areas, *dst_areas; ++ int single_player = 0; + + /* calculate the size to transfer */ + /* check the available size in the local buffer +@@ -360,13 +397,33 @@ + dmix->slave_appl_ptr += size; + dmix->slave_appl_ptr %= dmix->slave_boundary; + dmix_down_sem(dmix); ++#ifdef NO_CONCURRENT_ACCESS ++ struct shmid_ds buf; ++ int ret = shmctl(dmix->shmid, IPC_STAT, &buf); ++ single_player = (ret == 0 && buf.shm_nattch == 1); ++#endif ++#if 0 ++ static int ccr; ++ if (!ccr++) { ++ int v; ++ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v)); ++ v |= 5; // master enable, ccnt reset ++ v &= ~8; // ccnt divider 0 ++ asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v)); ++ // enable cycle counter ++ asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31)); ++ } ++ unsigned int start = ttime(); ++ unsigned int ssize = size * dmix->channels; ++#endif ++ + for (;;) { + transfer = size; + if (appl_ptr + transfer > pcm->buffer_size) + transfer = pcm->buffer_size - appl_ptr; + if (slave_appl_ptr + transfer > dmix->slave_buffer_size) + transfer = dmix->slave_buffer_size - slave_appl_ptr; +- mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer); ++ mix_areas(dmix, src_areas, dst_areas, appl_ptr, slave_appl_ptr, transfer, single_player); + size -= transfer; + if (! 
size) + break; +@@ -375,6 +432,17 @@ + appl_ptr += transfer; + appl_ptr %= pcm->buffer_size; + } ++ ++#if 0 ++ static unsigned int ct, tbytes; ++ ++ ct += ttime() - start; ++ tbytes += ssize; ++ if (tbytes >= 64*1024) { ++ printf("%.3f\n", (float)ct / (float)tbytes); ++ ct = tbytes = 0; ++ } ++#endif + dmix_up_sem(dmix); + } + diff --git a/recipes/alsa/alsa-lib_1.0.20.bb b/recipes/alsa/alsa-lib_1.0.20.bb index 55951be70b..69d97fc08c 100644 --- a/recipes/alsa/alsa-lib_1.0.20.bb +++ b/recipes/alsa/alsa-lib_1.0.20.bb @@ -2,7 +2,7 @@ DESCRIPTION = "Alsa sound library" HOMEPAGE = "http://www.alsa-project.org" SECTION = "libs/multimedia" LICENSE = "LGPLv2.1" -PR = "r3" +PR = "r4" # configure.in sets -D__arm__ on the command line for any arm system # (not just those with the ARM instruction set), this should be removed, @@ -13,6 +13,7 @@ ARM_INSTRUCTION_SET = "arm" SRC_URI = "ftp://ftp.alsa-project.org/pub/lib/alsa-lib-${PV}.tar.bz2 \ file://fix-tstamp-declaration.patch;patch=1 \ file://fix_libmath.patch;patch=1 \ + file://neon_mixer.patch;patch=1 \ " inherit autotools pkgconfig