--- /dev/null
+diff -ur openal-soft-1.15.1.orig/Alc/ALu.c openal-soft-1.15.1/Alc/ALu.c
+--- openal-soft-1.15.1.orig/Alc/ALu.c 2014-07-02 03:36:17.874323362 +0300
++++ openal-soft-1.15.1/Alc/ALu.c 2014-07-03 02:41:18.000116397 +0300
+@@ -956,11 +956,96 @@
+ return SamplesToDo*numchans*sizeof(T); \
+ }
+
++/* Convert the mixed float DryBuffer into interleaved signed 16-bit samples.
++ *
++ * Hand-written replacement for the DECL_TEMPLATE(ALshort, aluF2S)
++ * instantiation: plain stereo output takes a NEON fast path on 32-bit ARM,
++ * every other case uses the generic per-channel loop.
++ *
++ * device: supplies DryBuffer, FmtChans and ChannelOffsets.
++ * buffer: destination for the interleaved ALshort frames.
++ * SamplesToDo: number of frames to convert.
++ * Returns the number of bytes written to buffer.
++ */
++static int Write_ALshort(ALCdevice *device, ALshort *RESTRICT buffer, ALuint SamplesToDo)
++{
++    ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = device->DryBuffer;
++    ALuint numchans = ChannelsFromDevFmt(device->FmtChans);
++    const ALuint *offsets = device->ChannelOffsets;
++    ALuint i, j;
++
++    /* The assembly is AArch32-only (r3, movw/movt), so require __arm__ too. */
++#if defined(__ARM_NEON__) && defined(__arm__)
++    /* An empty request must take the generic loop: the asm tail always
++     * stores at least one frame. */
++    if (SamplesToDo > 0 && numchans == 2 && offsets[0] == 0 && offsets[1] == 1)
++    {
++        ALfloat *cl = DryBuffer[0];
++        ALfloat *cr = DryBuffer[1];
++        ALuint samples = SamplesToDo;
++        /* NOTE(review): the :128 hints assume 16-byte aligned DryBuffer rows,
++         * and each iteration reads 4 floats per channel even for a partial
++         * final block -- confirm against the DryBuffer declaration. */
++        asm volatile (
++            "movw r3, #0x0000fe00\n"        /* r3 = 0x46fffe00, the bits of 32767.0f */
++            "movt r3, #0x46ff\n"
++            "vdup.32 d4, r3 @ 32767.0\n"
++            "0:\n"                          /* main loop: 4 frames per iteration */
++            "vld1.32 {q0}, [%1, :128]!\n"   /* q0 = 4 left samples */
++            "vld1.32 {q1}, [%2, :128]!\n"   /* q1 = 4 right samples */
++            "subs %3, #4\n"                 /* flags are consumed by blt/bgt below */
++            "vmul.f32 q0, d4[0]\n"
++            "vmul.f32 q1, d4[0]\n"
++            "vcvt.s32.f32 q0, q0\n"
++            "vcvt.s32.f32 q1, q1\n"
++            "pld [%1, #64*2]\n"
++            "pld [%2, #64*2]\n"
++            "vqmovn.s32 d0, q0\n"           /* saturating narrow: d0 = L0..L3 */
++            "vqmovn.s32 d1, q1\n"           /* saturating narrow: d1 = R0..R3 */
++            "blt 1f\n"                      /* fewer than 4 frames were left */
++            "vst2.16 {d0,d1}, [%0]!\n"      /* interleaving store: L0 R0 L1 R1 ... */
++            "bgt 0b\n"
++            "nop\n"
++            "b 2f\n" /* eq 4 - all done */
++            "1:\n"                          /* tail: 1..3 frames remain */
++            "vzip.16 d0, d1\n"              /* d0 = L0 R0 L1 R1, d1 = L2 R2 L3 R3; operands must differ */
++            "add %3, #4\n"                  /* undo the subs: %3 = frames remaining */
++            "vst1.32 {d0[0]}, [%0]!\n"      /* frame 0 */
++            "cmp %3, #1\n"
++            "ble 2f\n"
++            "vst1.32 {d0[1]}, [%0]!\n"      /* frame 1 */
++            "cmp %3, #2\n"
++            "ble 2f\n"
++            "vst1.32 {d1[0]}, [%0]!\n"      /* frame 2 */
++            "2:\n"
++            : "=&r"(buffer), "=&r"(cl), "=&r"(cr), "=&r"(samples)
++            : "0"(buffer), "1"(cl), "2"(cr), "3"(samples)
++            : "r3", "q0", "q1", "d4", "cc", "memory"
++        );
++        return SamplesToDo * numchans * sizeof(ALshort);
++    }
++#endif
++
++    /* Generic path: write each valid channel at its interleave offset. */
++    for(j = 0;j < MaxChannels;j++)
++    {
++        ALshort *RESTRICT out;
++
++        if(offsets[j] == INVALID_OFFSET)
++            continue;
++
++        out = buffer + offsets[j];
++        for(i = 0;i < SamplesToDo;i++)
++            out[i * numchans] = aluF2S(DryBuffer[j][i]);
++    }
++    return SamplesToDo * numchans * sizeof(ALshort);
++}
++
+ DECL_TEMPLATE(ALfloat, aluF2F)
+ DECL_TEMPLATE(ALuint, aluF2UI)
+ DECL_TEMPLATE(ALint, aluF2I)
+ DECL_TEMPLATE(ALushort, aluF2US)
+-DECL_TEMPLATE(ALshort, aluF2S)
++/* ALshort output is handled by the hand-written Write_ALshort above. */
+ DECL_TEMPLATE(ALubyte, aluF2UB)
+ DECL_TEMPLATE(ALbyte, aluF2B)
+