/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.

  All assembly in this file uses `__asm` inline blocks with Intel operand
   order (destination first).
  General-purpose registers are referred to through short-lived macros (SRC,
   DST, YSTRIDE, ...) that are #defined immediately before each block and
   #undef'd right after it.
  None of the routines here executes emms except oc_restore_fpu_mmx().*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The copy is done as two groups of four 8-byte (movq) row transfers.
  The expansion site must #define SRC, DST, YSTRIDE and YSTRIDE3 to the names
   of four distinct general-purpose registers before using this macro; those
   registers and mm0...mm3 are overwritten.
  _dst and _src are evaluated once each into local pointers; _ystride is
   pasted directly into a mov instruction, so it has to be something the
   inline assembler accepts as a mov source operand (the callers in this file
   pass an int parameter).*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    /*Evaluate the pointer expressions in C so the asm sees plain locals.*/ \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*Rows 0...3: src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4 source rows.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4 destination rows.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*Rows 4...7 (offsets are relative to the advanced pointers): \
      src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     The first byte of the destination block.
  _src:     The first byte of the source block.
  _ystride: The distance in bytes between consecutive rows; the same value is
             applied to both _src and _dst.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
/*Register assignment consumed by OC_FRAG_COPY_MMX.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  For every entry of _fragis the byte offset _frag_buf_offs[_fragis[i]] is
   looked up and the 8x8 block at that offset is copied from _src_frame to
   _dst_frame.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
                  Nothing is copied if this is zero or negative.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    /*Same offset is used in both frames.*/
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
/*Register assignment consumed by OC_FRAG_COPY_MMX.
  Note that YSTRIDE3 is edi here, whereas oc_frag_copy_mmx() uses esi.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}

/*Reconstructs an 8x8 intra block: each output pixel is the corresponding
   residue value plus 128, clamped to 0...255.
  The residue is read as 16 consecutive quadwords, two per row: the "low"
   quadword holds columns 0...3 and the "high" quadword columns 4...7, as
   16-bit values.
  The bias is added with signed saturation (paddsw) and the result is packed
   to bytes with unsigned saturation (packuswb).
  _dst:     The first byte of the destination block.
  _ystride: The distance in bytes between consecutive destination rows.
  _residue: The 64 residue values, in row order.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 addresses row 4, so rows 4...7 reuse the 0...3 row offsets.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*The bias constant is built in registers, interleaved with the loads.*/
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080 (128 in each 16-bit lane).*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

/*Reconstructs an 8x8 inter block from a single predictor: each output pixel
   is the predictor pixel plus the corresponding residue value, clamped to
   0...255.
  The block is processed two rows per loop iteration, four iterations in all.
  Predictor bytes are zero-extended to 16 bits by unpacking against the zero
   register mm0, the residue is added with signed saturation, and the result
   is packed back to bytes with unsigned saturation.
  _dst:     The first byte of the destination block.
  _src:     The first byte of the predictor block.
  _ystride: The distance in bytes between consecutive rows; the same value is
             applied to both _src and _dst.
  _residue: The 64 residue values, in row order (16 bytes per row).*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.
    This is done once, outside the loop; the asm block below relies on mm0
     still being zero on every iteration.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      /*Registers are reloaded from the parameters on every iteration.*/
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue (two rows of 16 bytes).*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameters so the next
         iteration's reloads pick them up.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

/*Reconstructs an 8x8 inter block from two predictors: each output pixel is
   the truncating average (src1+src2)>>1 of the two predictor pixels plus the
   corresponding residue value, clamped to 0...255.
  The block is processed two rows per loop iteration, four iterations in all.
  Predictor bytes are zero-extended to 16 bits by unpacking against the zero
   register mm7, so the sum is non-negative and the arithmetic shift (psraw)
   halves it rounding down.
  The residue is added with signed saturation and the result is packed back
   to bytes with unsigned saturation.
  _dst:     The first byte of the destination block.
  _src1:    The first byte of the first predictor block.
  _src2:    The first byte of the second predictor block.
  _ystride: The distance in bytes between consecutive rows; the same value is
             applied to _src1, _src2 and _dst.
  _residue: The 64 residue values, in row order (16 bytes per row).*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.
    This is done once, outside the loop; the asm block below relies on mm7
     still being zero on every iteration.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      /*Registers are reloaded from the parameters on every iteration.*/
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr (two rows of 16 bytes).*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameters so the next
         iteration's reloads pick them up.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

/*Executes emms, which marks the MMX/x87 register file as empty so that x87
   floating-point code can be used again after MMX instructions have run.
  NOTE(review): presumably callers invoke this once after a batch of the MMX
   routines above rather than after each one; confirm against the callers.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}
#endif