244 lines
12 KiB
C++
244 lines
12 KiB
C++
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $

 ********************************************************************/
|
|
|
|
#if !defined(_x86_vc_x86zigzag_H)
|
|
# define _x86_vc_x86zigzag_H (1)
|
|
# include "x86enc.h"
|
|
|
|
|
|
/*Converts DCT coefficients from transposed order into zig-zag scan order and
   stores them in Y.
  This relies on two macros to load the contents of each row:
   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
   first four and second four entries of each row into the specified register,
   respectively.
  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
   (because when the rows are already in SSE2 registers, loading the high half
   destructively modifies the register).
  The index of each output element in the original 64-element array should wind
   up in the following 8x8 matrix (the letters indicate the order we compute
   each 4-tuple below):
    A  0  8  1  2   9 16 24 17 B
    C 10  3  4 11  18 25 32 40 E
    F 33 26 19 12   5  6 13 20 D
    G 27 34 41 48  56 49 42 35 I
    L 28 21 14  7  15 22 29 36 M
    H 43 50 57 58  51 44 37 30 O
    N 23 31 38 45  52 59 60 53 J
    P 46 39 47 54  61 62 55 63 K
  The order of the coefficients within each tuple is reversed in the comments
   below to reflect the usual MSB to LSB notation.*/
/*MSVC inline-assembly body (Intel operand order: destination first).
  OC_ZZ_LOAD_ROW_LO(row,reg), OC_ZZ_LOAD_ROW_HI(row,reg) and Y must be defined
   at the point of expansion; Y is used as the base of the [Y+offset] store
   operands.
  All of mm0...mm7 are overwritten, and sixteen 64-bit tuples are stored to
   [Y+0x00] through [Y+0x78].
  No emms is issued here.
  In the register comments, each pair of digits is the source index of one
   16-bit word, ".." is a word zeroed by a shift, and "??" is a word whose
   contents are never used.
  Every line except the last must end in a backslash with nothing but
   instruction lines in between: any other line terminates the macro early.*/
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
  OC_ZZ_LOAD_ROW_LO(0,mm0)      /*mm0=03 02 01 00*/ \
  OC_ZZ_LOAD_ROW_LO(1,mm1)      /*mm1=11 10 09 08*/ \
  OC_ZZ_LOAD_ROW_LO(2,mm2)      /*mm2=19 18 17 16*/ \
  OC_ZZ_LOAD_ROW_LO(3,mm3)      /*mm3=27 26 25 24*/ \
  OC_ZZ_LOAD_ROW_HI(0,mm4)      /*mm4=07 06 05 04*/ \
  OC_ZZ_LOAD_ROW_HI(1,mm5)      /*mm5=15 14 13 12*/ \
  OC_ZZ_LOAD_ROW_HI(2,mm6)      /*mm6=23 22 21 20*/ \
  __asm movq mm7,mm0            /*mm7=03 02 01 00*/ \
  __asm punpckhdq mm0,mm1       /*mm0=11 10 03 02*/ \
  __asm pshufw mm4,mm4,0x39     /*mm4=04 07 06 05*/ \
  __asm punpcklwd mm1,mm0       /*mm1=03 09 02 08*/ \
  __asm pshufw mm5,mm5,0x39     /*mm5=12 15 14 13*/ \
  __asm punpcklwd mm7,mm1       /*mm7=02 01 08 00 *A*/ \
  __asm movq [Y+0x00],mm7 \
  __asm punpckhwd mm1,mm4       /*mm1=04 03 07 09*/ \
  __asm movq mm7,mm2            /*mm7=19 18 17 16*/ \
  __asm punpckhdq mm0,mm1       /*mm0=04 03 11 10*/ \
  __asm punpckhwd mm7,mm5       /*mm7=12 19 15 18*/ \
  __asm punpcklwd mm1,mm3       /*mm1=25 07 24 09*/ \
  __asm punpcklwd mm5,mm6       /*mm5=21 14 20 13*/ \
  __asm punpcklwd mm1,mm2       /*mm1=17 24 16 09 *B*/ \
  OC_ZZ_LOAD_ROW_LO(4,mm2)      /*mm2=35 34 33 32*/ \
  __asm movq [Y+0x08],mm1 \
  OC_ZZ_LOAD_ROW_LO(5,mm1)      /*mm1=43 42 41 40*/ \
  __asm pshufw mm0,mm0,0x78     /*mm0=11 04 03 10 *C*/ \
  __asm movq [Y+0x10],mm0 \
  __asm punpckhdq mm6,mm4       /*mm6=?? 07 23 22*/ \
  __asm punpckldq mm4,mm5       /*mm4=20 13 06 05 *D*/ \
  __asm movq [Y+0x28],mm4 \
  __asm psrlq mm3,16            /*mm3=.. 27 26 25*/ \
  __asm pshufw mm0,mm2,0x0E     /*mm0=?? ?? 35 34*/ \
  __asm movq mm4,mm7            /*mm4=12 19 15 18*/ \
  __asm punpcklwd mm2,mm3       /*mm2=26 33 25 32*/ \
  __asm punpcklwd mm4,mm1       /*mm4=41 15 40 18*/ \
  __asm punpckhwd mm3,mm1       /*mm3=43 .. 42 27*/ \
  __asm punpckldq mm4,mm2       /*mm4=25 32 40 18*/ \
  __asm punpcklwd mm3,mm0       /*mm3=35 42 34 27*/ \
  OC_ZZ_LOAD_ROW_LO(6,mm0)      /*mm0=51 50 49 48*/ \
  __asm pshufw mm4,mm4,0x6C     /*mm4=40 32 25 18 *E*/ \
  __asm movq [Y+0x18],mm4 \
  OC_ZZ_LOAD_ROW_LO(7,mm4)      /*mm4=59 58 57 56*/ \
  __asm punpckhdq mm2,mm7       /*mm2=12 19 26 33 *F*/ \
  __asm movq [Y+0x20],mm2 \
  __asm pshufw mm1,mm1,0xD0     /*mm1=43 41 ?? ??*/ \
  __asm pshufw mm0,mm0,0x87     /*mm0=50 48 49 51*/ \
  __asm movq mm2,mm3            /*mm2=35 42 34 27*/ \
  __asm punpckhwd mm1,mm0       /*mm1=50 43 48 41*/ \
  __asm pshufw mm4,mm4,0x93     /*mm4=58 57 56 59*/ \
  __asm punpckldq mm3,mm1       /*mm3=48 41 34 27 *G*/ \
  __asm movq [Y+0x30],mm3 \
  __asm punpckhdq mm1,mm4       /*mm1=58 57 50 43 *H*/ \
  __asm movq [Y+0x50],mm1 \
  OC_ZZ_LOAD_ROW_HI(7,mm1)      /*mm1=63 62 61 60*/ \
  __asm punpcklwd mm4,mm0       /*mm4=49 56 51 59*/ \
  OC_ZZ_LOAD_ROW_HI(6,mm0)      /*mm0=55 54 53 52*/ \
  __asm psllq mm6,16            /*mm6=07 23 22 ..*/ \
  __asm movq mm3,mm4            /*mm3=49 56 51 59*/ \
  __asm punpckhdq mm4,mm2       /*mm4=35 42 49 56 *I*/ \
  OC_ZZ_LOAD_ROW_HI(3,mm2)      /*mm2=31 30 29 28*/ \
  __asm movq [Y+0x38],mm4 \
  __asm punpcklwd mm3,mm1       /*mm3=61 51 60 59*/ \
  __asm punpcklwd mm7,mm6       /*mm7=22 15 .. ??*/ \
  __asm movq mm4,mm3            /*mm4=61 51 60 59*/ \
  __asm punpcklwd mm3,mm0       /*mm3=53 60 52 59*/ \
  __asm punpckhwd mm4,mm0       /*mm4=55 61 54 51*/ \
  OC_ZZ_LOAD_ROW_HI(4,mm0)      /*mm0=39 38 37 36*/ \
  __asm pshufw mm3,mm3,0xE1     /*mm3=53 60 59 52 *J*/ \
  __asm movq [Y+0x68],mm3 \
  __asm movq mm3,mm4            /*mm3=?? ?? 54 51*/ \
  __asm pshufw mm2,mm2,0x39     /*mm2=28 31 30 29*/ \
  __asm punpckhwd mm4,mm1       /*mm4=63 55 62 61 *K*/ \
  OC_ZZ_LOAD_ROW_HI(5,mm1)      /*mm1=47 46 45 44*/ \
  __asm movq [Y+0x78],mm4 \
  __asm punpckhwd mm6,mm2       /*mm6=28 07 31 23*/ \
  __asm punpcklwd mm2,mm0       /*mm2=37 30 36 29*/ \
  __asm punpckhdq mm5,mm6       /*mm5=28 07 21 14*/ \
  __asm pshufw mm2,mm2,0x4B     /*mm2=36 29 30 37*/ \
  __asm pshufw mm5,mm5,0x87     /*mm5=07 14 21 28 *L*/ \
  __asm movq [Y+0x40],mm5 \
  __asm punpckhdq mm7,mm2       /*mm7=36 29 22 15 *M*/ \
  __asm movq [Y+0x48],mm7 \
  __asm pshufw mm1,mm1,0x9C     /*mm1=46 45 47 44*/ \
  __asm punpckhwd mm0,mm1       /*mm0=46 39 45 38*/ \
  __asm punpcklwd mm3,mm1       /*mm3=47 54 44 51*/ \
  __asm punpckldq mm6,mm0       /*mm6=45 38 31 23 *N*/ \
  __asm movq [Y+0x60],mm6 \
  __asm punpckhdq mm0,mm3       /*mm0=47 54 46 39*/ \
  __asm punpckldq mm3,mm2       /*mm3=30 37 44 51 *O*/ \
  __asm movq [Y+0x58],mm3 \
  __asm pshufw mm0,mm0,0xB1     /*mm0=54 47 39 46 *P*/ \
  __asm movq [Y+0x70],mm0

/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
   order and stores them in %[qdct].
  The index of each output element in the original 64-element array should wind
   up in the following 8x8 matrix (the letters indicate the order we compute
   each 4-tuple below):
    A  0  1  8 16   9  2  3 10 B
    D 17 24 32 25  18 11  4  5 C
    E 12 19 26 33  40 48 41 34 I
    H 27 20 13  6   7 14 21 28 G
    K 35 42 49 56  57 50 43 36 J
    F 29 22 15 23  30 37 44 51 M
    P 58 59 52 45  38 31 39 46 L
    N 53 60 61 54  47 55 62 63 O
  The order of the coefficients within each tuple is reversed in the comments
   below to reflect the usual MSB to LSB notation.*/
/*GCC-style inline-assembly template fragment (AT&T operand order: source
   first, destination last).
  It expands to a run of adjacent string literals and is meant to be placed
   inside an extended asm statement that binds the named operands %[dct] and
   %[qdct].
  Sixteen 64-bit loads cover 0x00...0x78 from %[dct] and sixteen 64-bit stores
   cover 0x00...0x78 from %[qdct]; all of mm0...mm7 are overwritten.
  No emms is issued here.
  In the register comments, each pair of digits is the source index of one
   16-bit word, ".." is a word zeroed by a shift, and "??" is a word whose
   contents are never used.
  Every line except the last must end in a backslash with nothing but
   instruction lines in between: any other line terminates the macro early.*/
#define OC_ZIG_ZAG_MMXEXT \
  "movq 0x00(%[dct]),%%mm0\n\t"     /*mm0=03 02 01 00*/ \
  "movq 0x08(%[dct]),%%mm1\n\t"     /*mm1=07 06 05 04*/ \
  "movq 0x10(%[dct]),%%mm2\n\t"     /*mm2=11 10 09 08*/ \
  "movq 0x20(%[dct]),%%mm3\n\t"     /*mm3=19 18 17 16*/ \
  "movq 0x30(%[dct]),%%mm4\n\t"     /*mm4=27 26 25 24*/ \
  "movq 0x40(%[dct]),%%mm5\n\t"     /*mm5=35 34 33 32*/ \
  "movq %%mm2,%%mm7\n\t"            /*mm7=11 10 09 08*/ \
  "punpcklwd %%mm3,%%mm2\n\t"       /*mm2=17 09 16 08*/ \
  "movq %%mm0,%%mm6\n\t"            /*mm6=03 02 01 00*/ \
  "punpckldq %%mm2,%%mm0\n\t"       /*mm0=16 08 01 00 *A*/ \
  "movq %%mm0,0x00(%[qdct])\n\t" \
  "movq 0x18(%[dct]),%%mm0\n\t"     /*mm0=15 14 13 12*/ \
  "punpckhdq %%mm6,%%mm6\n\t"       /*mm6=03 02 03 02*/ \
  "psrlq $16,%%mm7\n\t"             /*mm7=.. 11 10 09*/ \
  "punpckldq %%mm7,%%mm6\n\t"       /*mm6=10 09 03 02*/ \
  "punpckhwd %%mm7,%%mm3\n\t"       /*mm3=.. 19 11 18*/ \
  "pshufw $0xD2,%%mm6,%%mm6\n\t"    /*mm6=10 03 02 09 *B*/ \
  "movq %%mm6,0x08(%[qdct])\n\t" \
  "psrlq $48,%%mm2\n\t"             /*mm2=.. .. .. 17*/ \
  "movq %%mm1,%%mm6\n\t"            /*mm6=07 06 05 04*/ \
  "punpcklwd %%mm5,%%mm2\n\t"       /*mm2=33 .. 32 17*/ \
  "movq %%mm3,%%mm7\n\t"            /*mm7=.. 19 11 18*/ \
  "punpckldq %%mm1,%%mm3\n\t"       /*mm3=05 04 11 18 *C*/ \
  "por %%mm2,%%mm7\n\t"             /*mm7=33 19 ?? ?? (ORs into the zero words)*/ \
  "punpcklwd %%mm4,%%mm2\n\t"       /*mm2=25 32 24 17 *D*/ \
  "movq %%mm2,0x10(%[qdct])\n\t" \
  "movq %%mm3,0x18(%[qdct])\n\t" \
  "movq 0x28(%[dct]),%%mm2\n\t"     /*mm2=23 22 21 20*/ \
  "movq 0x38(%[dct]),%%mm1\n\t"     /*mm1=31 30 29 28*/ \
  "pshufw $0x9C,%%mm0,%%mm3\n\t"    /*mm3=14 13 15 12*/ \
  "punpckhdq %%mm7,%%mm7\n\t"       /*mm7=33 19 33 19*/ \
  "punpckhwd %%mm3,%%mm6\n\t"       /*mm6=14 07 13 06*/ \
  "punpckldq %%mm0,%%mm0\n\t"       /*mm0=13 12 13 12*/ \
  "punpcklwd %%mm1,%%mm3\n\t"       /*mm3=29 15 28 12*/ \
  "punpckhwd %%mm4,%%mm0\n\t"       /*mm0=27 13 26 12*/ \
  "pshufw $0xB4,%%mm3,%%mm3\n\t"    /*mm3=15 29 28 12*/ \
  "psrlq $48,%%mm4\n\t"             /*mm4=.. .. .. 27*/ \
  "punpcklwd %%mm7,%%mm0\n\t"       /*mm0=33 26 19 12 *E*/ \
  "punpcklwd %%mm1,%%mm4\n\t"       /*mm4=29 .. 28 27*/ \
  "punpckhwd %%mm2,%%mm3\n\t"       /*mm3=23 15 22 29 *F*/ \
  "movq %%mm0,0x20(%[qdct])\n\t" \
  "movq %%mm3,0x50(%[qdct])\n\t" \
  "movq 0x60(%[dct]),%%mm3\n\t"     /*mm3=51 50 49 48*/ \
  "movq 0x70(%[dct]),%%mm7\n\t"     /*mm7=59 58 57 56*/ \
  "movq 0x50(%[dct]),%%mm0\n\t"     /*mm0=43 42 41 40*/ \
  "punpcklwd %%mm4,%%mm2\n\t"       /*mm2=28 21 27 20*/ \
  "psrlq $32,%%mm5\n\t"             /*mm5=.. .. 35 34*/ \
  "movq %%mm2,%%mm4\n\t"            /*mm4=28 21 27 20*/ \
  "punpckldq %%mm6,%%mm2\n\t"       /*mm2=13 06 27 20*/ \
  "punpckhdq %%mm4,%%mm6\n\t"       /*mm6=28 21 14 07 *G*/ \
  "movq %%mm3,%%mm4\n\t"            /*mm4=51 50 49 48*/ \
  "pshufw $0xB1,%%mm2,%%mm2\n\t"    /*mm2=06 13 20 27 *H*/ \
  "movq %%mm2,0x30(%[qdct])\n\t" \
  "movq %%mm6,0x38(%[qdct])\n\t" \
  "movq 0x48(%[dct]),%%mm2\n\t"     /*mm2=39 38 37 36*/ \
  "punpcklwd %%mm5,%%mm4\n\t"       /*mm4=35 49 34 48*/ \
  "movq 0x58(%[dct]),%%mm5\n\t"     /*mm5=47 46 45 44*/ \
  "punpckldq %%mm7,%%mm6\n\t"       /*mm6=57 56 14 07*/ \
  "psrlq $32,%%mm3\n\t"             /*mm3=.. .. 51 50*/ \
  "punpckhwd %%mm0,%%mm6\n\t"       /*mm6=43 57 42 56*/ \
  "punpcklwd %%mm4,%%mm0\n\t"       /*mm0=34 41 48 40 *I*/ \
  "pshufw $0x4E,%%mm6,%%mm6\n\t"    /*mm6=42 56 43 57*/ \
  "movq %%mm0,0x28(%[qdct])\n\t" \
  "punpcklwd %%mm2,%%mm3\n\t"       /*mm3=37 51 36 50*/ \
  "punpckhwd %%mm6,%%mm4\n\t"       /*mm4=42 35 56 49*/ \
  "punpcklwd %%mm3,%%mm6\n\t"       /*mm6=36 43 50 57 *J*/ \
  "pshufw $0x4E,%%mm4,%%mm4\n\t"    /*mm4=56 49 42 35 *K*/ \
  "movq %%mm4,0x40(%[qdct])\n\t" \
  "movq %%mm6,0x48(%[qdct])\n\t" \
  "movq 0x68(%[dct]),%%mm6\n\t"     /*mm6=55 54 53 52*/ \
  "movq 0x78(%[dct]),%%mm0\n\t"     /*mm0=63 62 61 60*/ \
  "psrlq $32,%%mm1\n\t"             /*mm1=.. .. 31 30*/ \
  "pshufw $0xD8,%%mm5,%%mm5\n\t"    /*mm5=47 45 46 44*/ \
  "pshufw $0x0B,%%mm3,%%mm3\n\t"    /*mm3=50 50 51 37*/ \
  "punpcklwd %%mm5,%%mm1\n\t"       /*mm1=46 31 44 30*/ \
  "pshufw $0xC9,%%mm6,%%mm6\n\t"    /*mm6=55 52 54 53*/ \
  "punpckhwd %%mm1,%%mm2\n\t"       /*mm2=46 39 31 38 *L*/ \
  "punpcklwd %%mm3,%%mm1\n\t"       /*mm1=51 44 37 30 *M*/ \
  "movq %%mm2,0x68(%[qdct])\n\t" \
  "movq %%mm1,0x58(%[qdct])\n\t" \
  "punpckhwd %%mm6,%%mm5\n\t"       /*mm5=55 47 52 45*/ \
  "punpckldq %%mm0,%%mm6\n\t"       /*mm6=61 60 54 53*/ \
  "pshufw $0x10,%%mm5,%%mm4\n\t"    /*mm4=45 52 45 45*/ \
  "pshufw $0x78,%%mm6,%%mm6\n\t"    /*mm6=54 61 60 53 *N*/ \
  "punpckhdq %%mm0,%%mm5\n\t"       /*mm5=63 62 55 47 *O*/ \
  "punpckhdq %%mm4,%%mm7\n\t"       /*mm7=45 52 59 58 *P*/ \
  "movq %%mm6,0x70(%[qdct])\n\t" \
  "movq %%mm5,0x78(%[qdct])\n\t" \
  "movq %%mm7,0x60(%[qdct])\n\t"

#endif
|