/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The block is moved as two groups of four rows: the rows of a group are
   loaded into mm0...mm3 and then stored, and both pointers are advanced by
   4*_ystride between the groups.
  The expansion site must #define SRC, DST, YSTRIDE, and YSTRIDE3 to the names
   of four distinct general-purpose registers before using this macro.
  All four of those registers are overwritten, as are mm0...mm3.
  _src and _dst are evaluated once into local variables, but _ystride is
   substituted directly into an asm operand, so it should be a plain variable
   rather than an expression.
  This macro does not execute emms.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm  mov SRC,src \
    __asm  mov DST,dst \
    __asm  mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm  lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm  lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)

/*Duplicates a single 8x8 pixel block using MMX loads and stores.
  _dst:     Where the block is written.
  _src:     Where the block is read from.
  _ystride: The distance in bytes from one row to the next; the same value is
             applied to both _src and _dst.
  The MMX state is left as the copy leaves it; no emms is executed here.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  /*Bind the register names that the copy macro expects for this function.*/
#define YSTRIDE3 esi
#define YSTRIDE ecx
#define DST eax
#define SRC edx
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
  /*Release the bindings so that later functions can choose their own.*/
#undef YSTRIDE3
#undef YSTRIDE
#undef DST
#undef SRC
}

/*Copies a list of 8x8 fragments from one frame into another.
  Both frames are addressed with the same per-fragment byte offset, so each
   fragment lands at the position it was read from.
  _dst_frame:     The reference frame that receives the fragments.
  _src_frame:     The reference frame that supplies the fragments.
  _ystride:       The row stride shared by the two reference frames.
  _fragis:        The list of fragment indices to copy, processed in order.
  _nfragis:       How many entries of _fragis to process; nothing is copied
                   when this is zero or negative.
  _frag_buf_offs: The table mapping a fragment index to its byte offset within
                   a reference frame.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  const ptrdiff_t *next_fragi;
  ptrdiff_t        nleft;
  next_fragi=_fragis;
  for(nleft=_nfragis;nleft>0;nleft--){
    ptrdiff_t pixel_off;
    /*Translate the next fragment index into a byte offset.*/
    pixel_off=_frag_buf_offs[*next_fragi++];
#define YSTRIDE3 edi
#define YSTRIDE ecx
#define DST eax
#define SRC edx
    OC_FRAG_COPY_MMX(_dst_frame+pixel_off,
     _src_frame+pixel_off,_ystride);
#undef YSTRIDE3
#undef YSTRIDE
#undef DST
#undef SRC
  }
}

/*Reconstructs an intra-coded 8x8 block directly from its residue.
  Each 16-bit residue value has 128 added to it with signed saturation and is
   then packed to an unsigned byte with saturation before being stored, one
   8-pixel row at a time.
  _dst:     The location to write the 8x8 block to.
  _ystride: The number of bytes between rows of _dst.
  _residue: The residue values; 128 bytes are read, 16 per row in row order,
             so this presumably holds 64 16-bit values.
  Uses mm0...mm6 and does not execute emms.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 addresses rows 4...7; YSTRIDE3 is used to reach rows 3 and 7.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*The bias constant in mm0 is built in three steps that are interleaved
       with the first residue loads.*/
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low  residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low  residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low  residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low  residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low  residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low  residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low  residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low  residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

/*Reconstructs an inter-coded 8x8 block by adding a residue to a predictor.
  Each predictor byte is zero-extended to 16 bits, the matching residue value
   is added with signed saturation, and the sum is packed back to an unsigned
   byte with saturation.
  The C loop runs four times and each pass of the asm block handles two rows.
  The working pointers are kept in the (by-value) parameters between passes:
   they are loaded into registers at the top of the asm block and stored back
   at the bottom.
  _dst:     The location to write the 8x8 block to.
  _src:     The predictor block to read from.
  _ystride: The number of bytes between rows, used for both _src and _dst.
  _residue: The residue values; 32 bytes are consumed per pass (16 per row),
             128 bytes in total.
  Uses mm0, mm2, mm3, mm4, and mm7, and does not execute emms.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*mm0 is zeroed once here and must remain zero for every pass of the loop,
     since it is the unpack operand used to zero-extend the source bytes.*/
  /*Zero mm0.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      /*Fetch the current positions from the parameters.*/
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low  source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw  mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low  source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced positions for the next pass.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

/*Reconstructs an inter-coded 8x8 block from the average of two predictors
   plus a residue.
  Bytes from both predictors are zero-extended to 16 bits, summed, and shifted
   right by one (so the average rounds down); the matching residue value is
   then added with signed saturation and the result is packed to an unsigned
   byte with saturation.
  The C loop runs four times and each pass of the asm block handles two rows.
  The working pointers are kept in the (by-value) parameters between passes:
   they are loaded into registers at the top of the asm block and stored back
   at the bottom.
  _dst:     The location to write the 8x8 block to.
  _src1:    The first predictor block to read from.
  _src2:    The second predictor block to read from.
  _ystride: The number of bytes between rows, used for _dst, _src1, and
             _src2.
  _residue: The residue values; 32 bytes are consumed per pass (16 per row),
             128 bytes in total.
  Uses mm0...mm5 and mm7, and does not execute emms.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*mm7 is zeroed once here and must remain zero for every pass of the loop,
     since it is the unpack operand used to zero-extend the source bytes.*/
  /*Zero mm7.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      /*Fetch the current positions from the parameters.*/
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb  mm5,mm3
      /*#1 Write row ptr.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced positions for the next pass.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif