opus: Packaging fixups after #33311

Removes the custom code for VS2012 support in config.h, and reduces diff with upstream config.h. We still have many custom defines for ARM optimizations which we probably don't use properly, since we don't compile the included asm code, so a thorough review and cleanup would be welcome.
2019-11-12 12:57:27 +01:00 · 2019-11-12 12:57:27 +01:00 · 0387657fa4
commit 0387657fa4
parent 7d836a7cc3
6 changed files with 568 additions and 478 deletions
--- a/thirdparty/opus/celt/arm/armopts.s
+++ b/thirdparty/opus/celt/arm/armopts.s
@ -1,8 +1,4 @@
-/* Copyright (C) 2008 CSIRO */
+/* Copyright (C) 2013 Mozilla Corporation */
 /**
   @file fixed_c6x.h
   @brief Fixed-point operations for the TI C6x DSP family
 */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@ -28,43 +24,14 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#ifndef FIXED_C6X_H
+; Set the following to 1 if we have EDSP instructions
-#define FIXED_C6X_H
+;  (LDRD/STRD, etc., ARMv5E and later).
 OPUS_ARM_MAY_HAVE_EDSP  * 
-#undef MULT16_16SU
+; Set the following to 1 if we have ARMv6 media instructions.
-#define MULT16_16SU(a,b) _mpysu(a,b)
+OPUS_ARM_MAY_HAVE_MEDIA * 
-#undef MULT_16_16
+; Set the following to 1 if we have NEON (some ARMv7)
-#define MULT_16_16(a,b) _mpy(a,b)
+OPUS_ARM_MAY_HAVE_NEON  * 
-#define celt_ilog2(x) (30 - _norm(x))
+END
 #define OVERRIDE_CELT_ILOG2
 #undef MULT16_32_Q15
 #define MULT16_32_Q15(a,b) (_mpylill(a, b) >> 15)
 #if 0
 #include "dsplib.h"
 #undef MAX16
 #define MAX16(a,b) _max(a,b)
 #undef MIN16
 #define MIN16(a,b) _min(a,b)
 #undef MAX32
 #define MAX32(a,b) _lmax(a,b)
 #undef MIN32
 #define MIN32(a,b) _lmin(a,b)
 #undef VSHR32
 #define VSHR32(a, shift) _lshl(a,-(shift))
 #undef MULT16_16_Q15
 #define MULT16_16_Q15(a,b) (_smpy(a,b))
 #define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
 #define OVERRIDE_CELT_MAXABS16
 #endif /* FIXED_C6X_H */
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
+++ b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
@ -0,0 +1,555 @@
    .syntax unified
@ Copyright (c) 2007-2008 CSIRO
@ Copyright (c) 2007-2009 Xiph.Org Foundation
@ Copyright (c) 2013      Parrot
@ Written by Aurélien Zanelli
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:
@
@ - Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@
@ - Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    @ Assembler setup for the whole file: emit into .text, align to 2^2 bytes,
    @ and accept ARMv7-A instructions with the NEON register set enabled.
    .text;   .p2align 2;   .arch armv7-a
   .fpu neon
   @ NOTE(review): .object_arch presumably makes the object file advertise
   @ ARMv4T instead of ARMv7-A so it can be linked into builds for older
   @ cores -- confirm against the GNU as documentation.
   .object_arch armv4t
   @ Expected to define the OPUS_ARM_MAY_HAVE_* switches tested by every .if
   @ in this file (the included file itself is not visible here).
  .include "celt/arm/armopts-gnu.S"
 @ Export each public entry point only when its variant is enabled. The two
 @ xcorr_kernel_* helpers further down are never exported.
 .if OPUS_ARM_MAY_HAVE_EDSP
  .global celt_pitch_xcorr_edsp
 .endif
 .if OPUS_ARM_MAY_HAVE_NEON
  .global celt_pitch_xcorr_neon
 .endif
 .if OPUS_ARM_MAY_HAVE_NEON
@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
@ File-local helper (this file has no .global for it): celt_pitch_xcorr_neon
@ enters it with BL at xcorr_kernel_neon_start, with the arguments already
@ placed in the registers listed below. It returns by copying lr into pc.
 	.type	xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
 xcorr_kernel_neon_start:
  @ input:
  @   r3     = int         len
  @   r4     = opus_val16 *x
  @   r5     = opus_val16 *y
  @   q0     = opus_val32  sum[4]
  @ output:
  @   q0     = opus_val32  sum[4]
  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  @ internal usage:
  @   r12 = int j
  @   d3  = y_3|y_2|y_1|y_0
  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  @   q8  = scratch
  @
  @ r4 and r5 are post-incremented by every load, so they are NOT preserved.
  @ Between stages the most recently loaded y values always live in d5: each
  @ stage first copies them out of d5 (into d3 or d4) and then reloads d5.
  @ r12 is kept as "samples left minus a bias"; every stage re-biases it with
  @ ADDS so that a simple sign test decides whether the stage runs.
  @
  @ Load y[0...3]
  @ This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  @ r12 = len-8: the 8-wide loop only runs when more than 8 samples remain.
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
@ Process 8 samples at a time.
@ This loop loads one y value more than we actually need. Therefore we have to
@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
@ reading past the end of the array.
 xcorr_kernel_neon_process8:
  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
  @ - 2 cycles of ARM instructions,
  @ - 10 cycles of load/store/byte permute instructions, and
  @ - 9 cycles of data processing instructions.
  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  @ latter two categories, meaning the whole loop should run in 10 cycles per
  @ iteration, barring cache misses.
  @
  @ Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  @ The flags set here are the ones tested by the BGT at the bottom of the
  @ loop; no instruction in between has an S suffix.
  SUBS         r12, r12, #8
  @ Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  @ d16/d17 are scratch windows onto y built with VEXT at lane offsets 1...3,
  @ one for each half of x (d6 and d7).
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
@ Process 4 samples here if we have > 4 left (still reading one extra y value).
 xcorr_kernel_neon_process4:
  @ Re-bias r12 from "left-8" to "left-4"; skip this stage unless positive.
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  @ Load x[0...3]
  VLD1.16      d6, [r4]!
  @ Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  @ Account for the 4 samples consumed here (r12 is "left-4" again).
  SUB          r12, r12, #4
  @ Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
@ Process 2 samples here if we have > 2 left (still reading one extra y value).
 xcorr_kernel_neon_process2:
  @ Re-bias r12 from "left-4" to "left-2"; skip this stage unless positive.
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  @ Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  @ Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  @ Account for the 2 samples consumed here (r12 is "left-2" again).
  SUB          r12, r12, #2
  @ Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  @ instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
@ Process 1 sample using the extra y value we loaded above.
 xcorr_kernel_neon_process1:
  @ Load next *x
  VLD1.16      {d6[]}, [r4]!
  @ Re-bias r12 from "left-2" to "left-1": positive only if one more sample
  @ remains after the one being accumulated now.
  ADDS         r12, r12, #1
  @ y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  @ Return here if that was the last sample.
  MOVLE        pc, lr
@ Now process 1 last sample, not reading ahead.
  @ Load last *y
  VLD1.16      {d4[]}, [r5]!
  @ Merge d5 shifted down by one 16-bit lane into d4, leaving the freshly
  @ loaded y in the top lane, so d4 holds the next four consecutive y values.
  VSRI.64      d4, d5, #16
  @ Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
 	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
@  opus_val32 *xcorr, int len, int max_pitch, int arch)
@ Writes max_pitch correlation sums to xcorr -- four at a time through
@ xcorr_kernel_neon while at least four remain, then one at a time -- and
@ returns the largest sum written, or 1 if no sum exceeds 1 (maxcorr is
@ initialised to 1).
 	.type	celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
  @ input:
  @   r0  = opus_val16 *_x
  @   r1  = opus_val16 *_y
  @   r2  = opus_val32 *xcorr
  @   r3  = int         len
  @   max_pitch is not in a register on entry; it is loaded from the stack.
  @ output:
  @   r0  = int         maxcorr
  @ internal usage:
  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
  @   r6  = int         max_pitch
  @   r12 = int         j
  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  @ ignored:
  @         int         arch
  STMFD        sp!, {r4-r6, lr}
  @ r6 = max_pitch, fetched from just above the four registers pushed here
  @ (4 registers * 4 bytes = offset 16).
  LDR          r6, [sp, #16]
  @ maxcorr[0...3] = 1
  VMOV.S32     q15, #1
  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  @ From here on r6 is biased: it holds (sums still to compute) - 4.
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
 celt_pitch_xcorr_neon_process4:
  @ xcorr_kernel_neon parameters:
  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  @ So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  @ The flags from this SUBS are tested by the BGE below; nothing in between
  @ has an S suffix.
  SUBS         r6, r6, #4
  @ Store the four sums and advance xcorr by four entries.
  VST1.32      {q0}, [r2]!
  @ _y += 4
  ADD          r1, r1, #8
  @ Lane-wise running maximum of the four sums.
  VMAX.S32     q15, q15, q0
  @ if (at least 4 sums remain) goto celt_pitch_xcorr_neon_process4
  BGE celt_pitch_xcorr_neon_process4
@ We have less than 4 sums left to compute.
 celt_pitch_xcorr_neon_process4_done:
  @ Undo the bias: r6 = number of sums still to compute (0...3).
  ADDS         r6, r6, #4
  @ Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
@ Now compute each remaining sum one at a time.
 celt_pitch_xcorr_neon_process_remaining:
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  @ r12 = len-8; unlike the kernel, this path reads exactly len values of y,
  @ so a full group of 8 may be processed when exactly 8 remain (BLT/BGE).
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
@ Sum terms 8 at a time.
 celt_pitch_xcorr_neon_process_remaining_loop8:
  @ Load x[0...7]
  VLD1.16      {q1}, [r4]!
  @ Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
@ Sum terms 4 at a time.
 celt_pitch_xcorr_neon_process_remaining4:
  @ Re-bias r12 from "left-8" to "left-4".
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  @ Load x[0...3]
  VLD1.16      {d2}, [r4]!
  @ Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
 celt_pitch_xcorr_neon_process_remaining4_done:
  @ Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  @ Undo the bias: r12 = number of terms still to add (0...3).
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
@ Sum terms 1 at a time.
 celt_pitch_xcorr_neon_process_remaining_loop1:
  @ Each load replicates one sample across its register; only lane d0[0] of
  @ the accumulated result is stored and used for the final maxcorr.
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
 celt_pitch_xcorr_neon_process_remaining_loop_done:
  @ xcorr[i] = sum, then advance xcorr by one entry.
  VST1.32      {d0[0]}, [r2]!
  @ maxcorr = max(maxcorr, sum); only lane 0 of d30 is read back at the end.
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  @ _y++
  ADD          r1, r1, #2
  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
 celt_pitch_xcorr_neon_done:
  @ Return maxcorr in r0 and restore r4-r6; loading pc performs the return.
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
 	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
 .endif
 .if OPUS_ARM_MAY_HAVE_EDSP
@ This will get used on ARMv7 devices without NEON, so it has been optimized
@ to take advantage of dual-issuing where possible.
@ File-local helper (this file has no .global for it): celt_pitch_xcorr_edsp
@ enters it with BL at xcorr_kernel_edsp_start. It adds, into the four running
@ sums r6...r9, the products of x[j] with y[j], y[j+1], y[j+2] and y[j+3] for
@ j = 0...len-1 (see the per-instruction MAC16_16 comments).
 	.type	xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
 xcorr_kernel_edsp_start:
  @ input:
  @   r3      = int         len
  @   r4      = opus_val16 *_x (must be 32-bit aligned)
  @   r5      = opus_val16 *_y (must be 32-bit aligned)
  @   r6...r9 = opus_val32  sum[4]
  @ output:
  @   r6...r9 = opus_val32  sum[4]
  @ preserved: r0-r5
  @ internal usage
  @   r2      = int         j
  @   r12,r14 = opus_val16  x[4]
  @   r10,r11 = opus_val16  y[4]
  @
  @ r2, r4 and r5 are modified below, so they are saved here and restored on
  @ exit. r10, r11, r12 and r14 are used as scratch and are NOT restored; lr
  @ is saved because r14 is overwritten.
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      @ Load y[0...1]
  SUBS         r2, r3, #4         @ j = len-4
  LDR          r11, [r5], #4      @ Load y[2...3]
  @ Skip the 4-wide loop unless more than 4 samples are available.
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      @ Load x[0...1]
  @ Stall
 xcorr_kernel_edsp_process4:
  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
  @ other. Every other instruction here dual-issues with a multiply, and is
  @ thus "free". There should be no stalls in the body of the loop.
  @ The flags set by the SUBS below are used by both the LDRGT (prefetch of
  @ the next x pair) and the BGT at the bottom of the loop.
  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      @ Load x[2...3]
  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         @ j-=4
  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      @ Load y[4...5]
  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      @ Load x[0...1]
  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      @ Load y[6...7]
  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
 xcorr_kernel_edsp_process4_done:
  @ Undo the bias: r2 = samples still to process. The tail below handles them
  @ one at a time, returning as soon as the count reaches zero.
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      @ r12 = *x++
  SUBS         r2, r2, #1         @ j--
  @ Stall
  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2      @ r14 = *x++
  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         @ j--
  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2      @ r12 = *x++
  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
  @ The CMP below only sets the flags that "j--" would produce; r2 itself is
  @ not decremented because it is about to be reused to hold y_5.
  CMP          r2, #1             @ j--
  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]          @ r14 = *x
  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          @ r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
 xcorr_kernel_edsp_done:
  @ Restore r2, r4 and r5; loading the saved lr into pc performs the return.
  LDMFD        sp!, {r2,r4,r5,pc}
 	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
 	.type	celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
  @ Writes max_pitch correlation sums to xcorr and returns the largest sum
  @ written, or 1 if no sum exceeds 1 (maxcorr is initialised to 1).
  @ Stages, in order:
  @   1. if _y is not 32-bit aligned, one sum computed with halfword/shifted
  @      accesses so that _y becomes aligned for the stages that follow;
  @   2. groups of four sums through xcorr_kernel_edsp;
  @   3. at most one group of two sums;
  @   4. at most one final single sum.
  @ input:
  @   r0  = opus_val16 *_x (must be 32-bit aligned)
  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  @   r2  = opus_val32 *xcorr
  @   r3  = int         len
  @   max_pitch is not in a register on entry; it is loaded from the stack.
  @ output:
  @   r0  = maxcorr
  @ internal usage
  @   r4  = opus_val16 *x
  @   r5  = opus_val16 *y
  @   r6  = opus_val32  sum0
  @   r7  = opus_val32  sum1
  @   r8  = opus_val32  sum2
  @   r9  = opus_val32  sum3
  @   r1  = int         max_pitch
  @   r12 = int         j
  @   (outside the 4-sum stage, r6...r9 hold x/y data instead, r10/r11 hold
  @   the two sums of stage 3, and r14 holds the single sum of stages 1 and 4)
  @ ignored:
  @         int         arch
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  @ r1 = max_pitch, fetched from just above the nine registers pushed here
  @ (9 registers * 4 bytes = offset 36).
  LDR          r1, [sp, #36]
  MOV          r4, r0
  @ Z is set when _y is already 32-bit aligned. The MOV below does not touch
  @ the flags, so the BEQ still sees the result of this TST.
  TST          r5, #3
  @ maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
@ Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  @ r14 = sum = 0
  MOV          r14, #0
  @ r8 = y_0; after this halfword load r5 is 32-bit aligned.
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  @ Move y_0 into the top half of r8, matching the layout the loop leaves in
  @ r8 on later iterations (current y in the top half).
  MOV          r8, r8, LSL #16
 celt_pitch_xcorr_edsp_process1u_loop4:
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         @ j-=4
  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  @ Bring the pending y value back down to the bottom half for the
  @ one-at-a-time loop below.
  MOV          r8, r8, LSR #16
 celt_pitch_xcorr_edsp_process1u_loop4_done:
  @ Undo the bias: r12 = terms still to add for this sum.
  ADDS         r12, r12, #4
 celt_pitch_xcorr_edsp_process1u_loop1:
  @ Every instruction in this loop is conditional, so nothing is loaded or
  @ accumulated once the count has run out.
  LDRHGE       r6, [r4], #2
  @ Stall
  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  @ Restore _x
  SUB          r4, r4, r3, LSL #1
  @ Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  @ maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  @ One sum done: --max_pitch, and finish if none are left.
  SUBS         r1, r1, #1
  @ xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
 celt_pitch_xcorr_edsp_process1u_done:
  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  @ From here on r1 is biased: it holds (sums still to compute) - 4.
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
 celt_pitch_xcorr_edsp_process4:
  @ xcorr_kernel_edsp parameters:
  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  @ The kernel restores r4 and r5 itself but leaves r10-r12 and r14 changed.
  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  @ _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  @ Store the four sums and advance xcorr by four entries.
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
 celt_pitch_xcorr_edsp_process2:
  @ r1 was (left-4); it is now (left-2). Skip unless at least 2 sums remain.
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  @ {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
 celt_pitch_xcorr_edsp_process2_loop4:
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         @ j-=4
  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
 celt_pitch_xcorr_edsp_process2_loop_done:
  @ r12 was (left-4); it is now (left-2). Handle one pair of terms if more
  @ than 2 are left.
  ADDS         r12, r12, #2
  BLE  celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  @ Stall
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
 celt_pitch_xcorr_edsp_process2_1:
  @ One term is always processed here; the ADDS decides whether a second,
  @ final term follows.
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  @ Stall
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
 celt_pitch_xcorr_edsp_process2_done:
  @ Restore _x
  SUB          r4, r4, r3, LSL #1
  @ Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  @ maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  @ Two sums done; r1 is (left-2) again for the test at process1a.
  SUB          r1, r1, #2
  @ maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  @ xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
 celt_pitch_xcorr_edsp_process1a:
  @ r1 was (left-2); it is now (left-1). Finish unless one sum remains.
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  @ r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
 celt_pitch_xcorr_edsp_process1a_loop4:
  @ The LDRGE prefetches for the next iteration only happen when another
  @ full group of 4 terms remains.
  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         @ j-=4
  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
 celt_pitch_xcorr_edsp_process1a_loop_done:
  @ r12 was (left-4); it is now (left-2). The GE-conditional group below
  @ adds one pair of terms when at least 2 are left.
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  @ Stall
  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
  @ r12 is now (left-1). The GE-conditional group below adds the last term,
  @ if there is one.
  ADDS         r12, r12, #1
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  @ Stall
  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
  @ maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  @ xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
 celt_pitch_xcorr_edsp_done:
  @ Restore r4-r11; loading the saved lr into pc performs the return.
  LDMFD        sp!, {r4-r11, pc}
 	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
 .endif
@ END:
    .section	.note.GNU-stack,"",%progbits
--- a/thirdparty/opus/celt/fixed_c5x.h
+++ b/thirdparty/opus/celt/fixed_c5x.h
@ -1,79 +0,0 @@
 /* Copyright (C) 2003 Jean-Marc Valin */
 /**
   @file fixed_c5x.h
   @brief Fixed-point operations for the TI C5x DSP family
 */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef FIXED_C5X_H
 #define FIXED_C5X_H
 #include "dsplib.h"
 #undef IMUL32
 /* Multiply two longs using three calls to the _lmpy/_lmpyu intrinsics.
    NOTE(review): the intrinsics are declared outside this file (presumably by
    the TI toolchain or dsplib.h); their exact operand widths should be
    confirmed there. */
 static OPUS_INLINE long IMUL32(long i, long j)
 {
   /* Cross terms: each operand's upper half (>>16) paired with the other
      operand. */
   long upper_i_term = _lmpy(i>>16,j);
   long cross_terms = upper_i_term + _lmpy(i,j>>16);

   /* Unsigned-multiply term plus the cross terms moved up by 16 bits. */
   return _lmpyu(i,j) + (cross_terms<<16);
 }
 /* Each #undef/#define pair below replaces a generic fixed-point helper with
    a routine that is not declared in this file (presumably a TI C5x compiler
    intrinsic or a dsplib.h function -- confirm against the toolchain). */

 /* 16-bit and 32-bit max/min. */
 #undef MAX16
 #define MAX16(a,b) _max(a,b)
 #undef MIN16
 #define MIN16(a,b) _min(a,b)
 #undef MAX32
 #define MAX32(a,b) _lmax(a,b)
 #undef MIN32
 #define MIN32(a,b) _lmin(a,b)
 /* Variable shift right, expressed as a left shift by the negated amount. */
 #undef VSHR32
 #define VSHR32(a, shift) _lshl(a,-(shift))
 /* 16x16 multiplies. */
 #undef MULT16_16_Q15
 #define MULT16_16_Q15(a,b) (_smpy(a,b))
 #undef MULT16_16SU
 #define MULT16_16SU(a,b) _lmpysu(a,b)
 #undef MULT_16_16
 #define MULT_16_16(a,b) _lmpy(a,b)
 /* FIXME: This is technically incorrect and is bound to cause problems. Is there any cleaner solution? */
 /* 16x32 multiply built from two partial products: (a * (b>>16)) << 1 plus
    (a * b, via MULT16_16SU) >> 15. */
 #undef MULT16_32_Q15
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),(b)),15))
 /* The OVERRIDE_* macros signal that this header supplies its own
    celt_ilog2 / celt_maxabs16 (presumably so the generic versions are
    skipped elsewhere). */
 #define celt_ilog2(x) (30 - _lnorm(x))
 #define OVERRIDE_CELT_ILOG2
 /* Largest magnitude in x[0..len-1]: the larger of maxval() and -minval(). */
 #define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
 #define OVERRIDE_CELT_MAXABS16
 #endif /* FIXED_C5X_H */
--- a/thirdparty/opus/config.h
+++ b/thirdparty/opus/config.h
@ -35,7 +35,7 @@
 /* #undef FUZZING */
 /* Define to 1 if you have the <alloca.h> header file. */
-/*  #undef HAVE_ALLOCA_H  */
+/* #undef HAVE_ALLOCA_H */
 /* NE10 library is installed on host. Make sure it is on target! */
 /* #undef HAVE_ARM_NE10 */
@ -46,16 +46,12 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1
 #if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
 /* Define to 1 if you have the `lrint' function. */
 #define HAVE_LRINT 1
 /* Define to 1 if you have the `lrintf' function. */
 #define HAVE_LRINTF 1
 #endif
 /* Define to 1 if you have the <memory.h> header file. */
 #define HAVE_MEMORY_H 1
@ -83,8 +79,7 @@
 /* Define to 1 if you have the `__malloc_hook' function. */
 #define HAVE___MALLOC_HOOK 1
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
   */
 #define LT_OBJDIR ".libs/"
 #ifdef OPUS_ARM_OPT
@ -191,7 +186,7 @@
 #define PACKAGE_NAME "opus"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "opus unknown"
+#define PACKAGE_STRING "opus 1.3.1"
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "opus"
@ -200,7 +195,7 @@
 #define PACKAGE_URL ""
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "unknown"
+#define PACKAGE_VERSION "1.3.1"
 /* Define to 1 if you have the ANSI C header files. */
 #define STDC_HEADERS 1
@ -232,11 +227,7 @@
 /* Define to the equivalent of the C99 'restrict' keyword, or to
   nothing if this is not supported.  Do not define if restrict is
   supported directly.  */
 #if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
 #define restrict __restrict
 #else
 #undef restrict
 #endif
 /* Work around a bug in Sun C++: it does not support _Restrict or
   __restrict__, even though the corresponding Sun C compiler ends up with
   "#define restrict _Restrict" or "#define restrict __restrict__" in the
--- a/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h
+++ b/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h
@ -1,184 +0,0 @@
 /***********************************************************************
 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 - Redistributions of source code must retain the above copyright notice,
 this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 - Neither the name of Internet Society, IETF or IETF Trust, nor the
 names of specific contributors, may be used to endorse or promote
 products derived from this software without specific prior written
 permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***********************************************************************/
 #ifndef __PREFILTER_FIX_MIPSR1_H__
 #define __PREFILTER_FIX_MIPSR1_H__
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #include "main_FIX.h"
 #include "stack_alloc.h"
 #include "tuning_parameters.h"
 #define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
 /* MIPS variant of silk_warped_LPC_analysis_filter_FIX (announced by the
    OVERRIDE_ macro defined just above this function).
    Each pass of the outer loop consumes two input samples, input[n] and
    input[n+1], and produces two residuals, res_Q2[n] and res_Q2[n+1]; this
    is why length is asserted to be even. The two samples are filtered in an
    interleaved fashion: the first sample's intermediate values stay in
    locals (tmp1, tmp2, state_cur, state_next), and state[] is only written
    with values from the second sample's pass (tmp3, tmp4, input[n+1]).
    arch is accepted but unused. */
 void silk_warped_LPC_analysis_filter_FIX(
          opus_int32            state[],                    /* I/O  State [order + 1]                   */
          opus_int32            res_Q2[],                   /* O    Residual signal [length]            */
    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
    const opus_int16            input[],                    /* I    Input signal [length]               */
    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
    const opus_int              length,                     /* I    Length of input signal              */
    const opus_int              order,                      /* I    Filter order (even)                 */
               int              arch
 )
 {
    opus_int     n, i;
    /* acc_Q11 accumulates for sample n, acc_Q22 for sample n+1.
       NOTE(review): despite its name, acc_Q22 is seeded and rounded exactly
       like acc_Q11 (same initial value, same shift by 9), so it appears to
       be a second Q11 accumulator rather than a Q22 value. */
    opus_int32   acc_Q11, acc_Q22, tmp1, tmp2, tmp3, tmp4;
    /* First-sample section outputs, carried over as the "state" seen by the
       second sample's pass. */
    opus_int32   state_cur, state_next;
    (void)arch;
    /* Order must be even */
    /* Length must be even */
    silk_assert( ( order & 1 ) == 0 );
    silk_assert( ( length & 1 ) == 0 );
    for( n = 0; n < length; n+=2 ) {
        /* ---- Sample n: first two sections, reading the stored state ---- */
        /* Output of lowpass section */
        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
        state_cur = silk_LSHIFT( input[ n ], 14 );
        /* Output of allpass section */
        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
        state_next = tmp2;
        /* Accumulator is seeded from order (presumably a rounding bias). */
        acc_Q11 = silk_RSHIFT( order, 1 );
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
        /* ---- Sample n+1: same two sections, reading sample n's outputs ---- */
        /* Output of lowpass section */
        tmp4 = silk_SMLAWB( state_cur, state_next, lambda_Q16 );
        state[ 0 ] = silk_LSHIFT( input[ n+1 ], 14 );
        /* Output of allpass section */
        tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
        state[ 1 ] = tmp4;
        acc_Q22 = silk_RSHIFT( order, 1 );
        acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ 0 ] );
        /* Loop over allpass sections */
        /* Each iteration advances both samples by two sections. The reads of
           state[ i ], state[ i + 1 ] and state[ i + 2 ] for sample n happen
           before those slots are overwritten for sample n+1, so statement
           order matters here. */
        for( i = 2; i < order; i += 2 ) {
            /* Output of allpass section */
            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
            state_cur = tmp1;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
            /* Output of allpass section */
            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
            state_next = tmp2;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
            /* Output of allpass section */
            tmp4 = silk_SMLAWB( state_cur, state_next - tmp3, lambda_Q16 );
            state[ i ] = tmp3;
            acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ i - 1 ] );
            /* Output of allpass section */
            tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
            state[ i + 1 ] = tmp4;
            acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ i ] );
        }
        /* Last coefficient for sample n, then its residual:
           (input << 2) minus the accumulator rounded down by 9 bits. */
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
        /* Same for sample n+1; its final section output becomes the last
           state entry. */
        state[ order ] = tmp3;
        acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ order - 1 ] );
        res_Q2[ n+1 ] = silk_LSHIFT( (opus_int32)input[ n+1 ], 2 ) - silk_RSHIFT_ROUND( acc_Q22, 9 );
    }
 }
 /* Prefilter that turns the short-term residual into the quantizer input signal */
 #define OVERRIDE_silk_prefilt_FIX
 static inline void silk_prefilt_FIX(
    silk_prefilter_state_FIX    *P,                         /* I/O  Prefilter state                     */
    opus_int32                  st_res_Q12[],               /* I    Short-term residual signal          */
    opus_int32                  xw_Q3[],                    /* O    Prefiltered signal                  */
    opus_int32                  HarmShapeFIRPacked_Q12,     /* I    Harmonic shaping coefficients       */
    opus_int                    Tilt_Q14,                   /* I    Tilt shaping coefficient            */
    opus_int32                  LF_shp_Q14,                 /* I    Low-frequency shaping coefficients  */
    opus_int                    lag,                        /* I    Lag for harmonic shaping            */
    opus_int                    length                      /* I    Length of signals                   */
 )
 {
    /* Local copies of the state fields; the scalar ones are stored back before returning */
    opus_int16 *shp_buf = P->sLTP_shp;
    opus_int    buf_pos = P->sLTP_shp_buf_idx;
    opus_int32  ar_Q12  = P->sLF_AR_shp_Q12;
    opus_int32  ma_Q12  = P->sLF_MA_shp_Q12;
    opus_int32  harm_Q12, tilt_Q10, lf_Q10;
    opus_int    k, mid;
    if( lag <= 0 ) {
        /* No harmonic term: only the tilt and low-frequency contributions are subtracted */
        for( k = 0; k < length; k++ ) {
            tilt_Q10 = silk_SMULWB( ar_Q12, Tilt_Q14 );
            lf_Q10   = silk_SMULWT( ar_Q12, LF_shp_Q14 );
            lf_Q10   = silk_SMLAWB( lf_Q10, ma_Q12, LF_shp_Q14 );
            ar_Q12   = silk_SUB32( st_res_Q12[ k ], silk_LSHIFT( tilt_Q10, 2 ) );
            ma_Q12   = silk_SUB32( ar_Q12, silk_LSHIFT( lf_Q10, 2 ) );
            /* Step the masked history index backwards and record the new sample */
            buf_pos = ( buf_pos - 1 ) & LTP_MASK;
            shp_buf[ buf_pos ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( ma_Q12, 12 ) );
            xw_Q3[ k ] = silk_RSHIFT_ROUND( ma_Q12, 9 );
        }
    } else {
        for( k = 0; k < length; k++ ) {
            /* The three taps below are written out by hand */
            silk_assert( HARM_SHAPE_FIR_TAPS == 3 );
            /* Index of the centre tap, 'lag' entries away from the current write position */
            mid = lag + buf_pos - HARM_SHAPE_FIR_TAPS / 2;
            harm_Q12 = silk_SMULBB(           shp_buf[ ( mid - 1 ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
            harm_Q12 = silk_SMLABT( harm_Q12, shp_buf[ ( mid     ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
            harm_Q12 = silk_SMLABB( harm_Q12, shp_buf[ ( mid + 1 ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
            tilt_Q10 = silk_SMULWB( ar_Q12, Tilt_Q14 );
            lf_Q10   = silk_SMULWT( ar_Q12, LF_shp_Q14 );
            lf_Q10   = silk_SMLAWB( lf_Q10, ma_Q12, LF_shp_Q14 );
            ar_Q12   = silk_SUB32( st_res_Q12[ k ], silk_LSHIFT( tilt_Q10, 2 ) );
            ma_Q12   = silk_SUB32( ar_Q12, silk_LSHIFT( lf_Q10, 2 ) );
            /* Step the masked history index backwards and record the new sample */
            buf_pos = ( buf_pos - 1 ) & LTP_MASK;
            shp_buf[ buf_pos ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( ma_Q12, 12 ) );
            /* Output is the shaped sample minus the harmonic FIR contribution */
            xw_Q3[ k ] = silk_RSHIFT_ROUND( silk_SUB32( ma_Q12, harm_Q12 ), 9 );
        }
    }
    /* Store the updated filter memories and history index back into the state */
    P->sLF_AR_shp_Q12   = ar_Q12;
    P->sLF_MA_shp_Q12   = ma_Q12;
    P->sLTP_shp_buf_idx = buf_pos;
 }
 #endif /* __PREFILTER_FIX_MIPSR1_H__ */
--- a/thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c
+++ b/thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c
@ -1,160 +0,0 @@
 /* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <smmintrin.h>
 #include "main.h"
 #include "celt/x86/x86cpu.h"
 /*
  * Warped LPC analysis filter with an SSE4.1 fast path.
  *
  * When order == 10 and lambda_Q16 == 0 the residual is computed with SSE
  * intrinsics; every other configuration falls through to the scalar loop at
  * the end of the function. On return state[] holds the updated filter memory
  * and res_Q2[ 0 .. length-1 ] holds the residual.
  */
 void silk_warped_LPC_analysis_filter_FIX_sse4_1(
    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
    const opus_int16            input[],                    /* I    Input signal [length]               */
    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
    const opus_int              length,                     /* I    Length of input signal              */
    const opus_int              order                       /* I    Filter order (even)                 */
 )
 {
    opus_int     n, i;
    opus_int32   acc_Q11, tmp1, tmp2;
    /* Order must be even */
    celt_assert( ( order & 1 ) == 0 );
    if (order == 10)
    {
        if (0 == lambda_Q16)
        {
            /* Fast path: with a zero warping factor every SMLAWB( a, b, lambda_Q16 ) in the
               scalar loop below reduces to 'a', so state[] acts as a plain delay line. */
            __m128i coef_Q13_3210, coef_Q13_7654;
            __m128i coef_Q13_0123, coef_Q13_4567;
            __m128i state_0123, state_4567;
            __m128i xmm_product1, xmm_product2;
            __m128i xmm_tempa, xmm_tempb;
            register opus_int32 sum;
            register opus_int32 state_8, state_9, state_a;
            register opus_int64 coef_Q13_8, coef_Q13_9;
            celt_assert( length > 0 );
            /* NOTE(review): OP_CVTEPI16_EPI32_M64 is defined in celt/x86/x86cpu.h (not visible
               here); presumably it loads four 16-bit values and sign-extends them to 32-bit
               lanes - confirm against that header. */
            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
            /* _MM_SHUFFLE( 0, 1, 2, 3 ) reverses the order of the four 32-bit lanes */
            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            /* Taps 8 and 9 do not fit the two vectors and are handled as 64-bit scalars */
            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
            /* The state vectors are kept lane-reversed inside the loop and reversed back on store */
            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            state_8 = state[ 8 ];
            state_9 = state[ 9 ];
            /* Seed from the stored value rather than 0: it is overwritten on the first
               iteration, and if length is 0 (celt_assert may be compiled out) the
               write-back below then leaves state[ 10 ] unchanged, like the scalar loop. */
            state_a = state[ 10 ];
            for( n = 0; n < length; n++ )
            {
                /* _mm_mul_epi32 only multiplies 32-bit lanes 0 and 2, so each state vector is
                   multiplied twice: once as stored and once lane-reversed, each time against
                   the coefficient vector in the matching lane order. */
                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
                /* Accumulate the eight vector taps; the partial sums sit in lanes 0 and 2 */
                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
                sum  = (opus_int32)((coef_Q13_8 * state_8) >> 16);
                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
                /* Fold lane 2 into lane 0, then extract lane 0 */
                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
                sum += _mm_cvtsi128_si32( xmm_tempa);
                /* 5 == order >> 1 for order 10: the same seed the scalar loop puts in acc_Q11 */
                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
                /* move right */
                state_a = state_9;
                state_9 = state_8;
                state_8 = _mm_cvtsi128_si32( state_4567 );
                /* Shift both vectors by one 32-bit lane, feeding the new input sample in at the top */
                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
            }
            /* Undo the lane reversal and write the whole filter memory back */
            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
            state[ 8 ] = state_8;
            state[ 9 ] = state_9;
            state[ 10 ] = state_a;
            return;
        }
    }
    /* Scalar path for every other order / warping factor */
    for( n = 0; n < length; n++ ) {
        /* Output of lowpass section */
        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
        /* Output of allpass section */
        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
        state[ 1 ] = tmp2;
        /* Seed the accumulator with order / 2 before adding the tap products */
        acc_Q11 = silk_RSHIFT( order, 1 );
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
        /* Loop over allpass sections */
        for( i = 2; i < order; i += 2 ) {
            /* Output of allpass section */
            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
            state[ i ] = tmp1;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
            /* Output of allpass section */
            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
            state[ i + 1 ] = tmp2;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
        }
        state[ order ] = tmp1;
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
    }
 }