Add libvpx thirdparty library

Only necessary files
Błażej Szczygieł 2016-09-14 22:10:03 +02:00
parent 2d77a6f5d3
commit 5268443fdf
275 changed files with 83150 additions and 0 deletions


@@ -114,6 +114,13 @@ Files extracted from upstream source:
- COPYING
## libvpx
- Upstream: http://www.webmproject.org/code/
- Version: 1.6.0
- License: BSD-3-Clause
## libwebp
- Upstream: https://chromium.googlesource.com/webm/libwebp/

thirdparty/libvpx/AUTHORS (vendored, new file)

@@ -0,0 +1,142 @@
# This file is automatically generated from the git commit history
# by tools/gen_authors.sh.
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
Ed Baker <edward.baker@intel.com>
Ehsan Akhgari <ehsan.akhgari@gmail.com>
Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
Jacek Caban <cjacek@gmail.com>
Jacky Chen <jackychen@google.com>
James Berry <jamesberry@google.com>
James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jean-Yves Avenard <jyavenard@mozilla.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
Jia Jia <jia.jia@linaro.org>
Jian Zhou <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Julia Robson <juliamrobson@gmail.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
Mans Rullgard <mans@mansr.com>
Marco Paniconi <marpan@google.com>
Mark Mentovai <mark@chromium.org>
Martin Ettl <ettl.martin78@googlemail.com>
Martin Storsjo <martin@martin.st>
Matthew Heaney <matthewjheaney@chromium.org>
Michael Kohler <michaelkohler@live.com>
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
Sean McGovern <gseanmcg@gmail.com>
Sergey Kolomenkin <kolomenkin@gmail.com>
Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
Yury Gitman <yuryg@google.com>
Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation

thirdparty/libvpx/CHANGELOG (vendored, new file)

@@ -0,0 +1,654 @@
2016-07-20 v1.6.0 "Khaki Campbell Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
- Upgrading:
This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
in vpx_image and some minor changes to the VP8_COMP structure.
The default key frame interval for VP9 has changed from 128 to 9999.
- Enhancement:
A core focus has been performance for low end Intel processors. SSSE3
instructions such as 'pshufb' have been avoided and instructions have been
reordered to better accommodate the more constrained pipelines.
As a result, devices based on Celeron processors have seen substantial
decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
decoding speed improved between 10 and 30%. Between Javan Whistling Duck
and Khaki Campbell Duck, it improved another 10 to 15%.
While Celeron benefited most, Core-i5 also improved 5% and 10% between the
respective releases.
Realtime performance for WebRTC for both speed and quality has received a
lot of attention.
- Bug Fixes:
A number of fuzzing issues, found variously by Mozilla, Chromium and others,
have been fixed and we strongly recommend updating.
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
- Upgrading:
This release is ABI incompatible with 1.4.0. It drops deprecated VP8
controls and adds a variety of VP9 controls for testing.
The vpxenc utility now prefers VP9 by default.
- Enhancements:
Faster VP9 encoding and decoding
Smaller library size by combining functions used by VP8 and VP9
- Bug Fixes:
A variety of fuzzing issues
2015-04-03 v1.4.0 "Indian Runner Duck"
This release includes significant improvements to the VP9 codec.
- Upgrading:
This release is ABI incompatible with 1.3.0. It drops the compatibility
layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
controls for VP9.
- Enhancements:
Faster VP9 encoding and decoding
Multithreaded VP9 decoding (tile and frame-based)
Multithreaded VP9 encoding - on by default
YUV 4:2:2 and 4:4:4 support in VP9
10 and 12bit support in VP9
64bit ARM support by replacing ARM assembly with intrinsics
- Bug Fixes:
Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
files.
- Known Issues:
Frame Parallel decoding fails for segmented and non-420 files.
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without
modification. However, some VP8 options do not map to VP9 in the same manner.
The VP9 encoder in this release is not feature complete. Users interested in
the encoder are advised to use the git master branch and discuss issues on
libvpx mailing lists.
- Upgrading:
This release is ABI and API compatible with Duclair (v1.0.0). Users
of older releases should refer to the Upgrading notes in this document
for that release.
- Enhancements:
Get rid of bashisms in the main build scripts
Added usage info on command line options
Add lossless compression mode
Dll build of libvpx
Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
Add option to disable documentation
configure: add --enable-external-build support
make: support V=1 as short form of verbose=yes
configure: support mingw-w64
configure: support hardfloat armv7 CHOSTS
configure: add support for android x86
Add estimated completion time to vpxenc
Don't exit on decode errors in vpxenc
vpxenc: support scaling prior to encoding
vpxdec: support scaling output
vpxenc: improve progress indicators with --skip
msvs: Don't link to winmm.lib
Add a new script for producing vcxproj files
Produce Visual Studio 10 and 11 project files
Produce Windows Phone project files
msvs-build: use msbuild for vs >= 2005
configure: default configure log to config.log
Add encoding option --static-thresh
- Speed:
Miscellaneous speed optimizations for VP8 and VP9.
- Quality:
In general, quality is consistent with the Eider release.
- Bug Fixes:
This release represents approximately a year of engineering effort,
and contains multiple bug fixes. Please refer to git history for details.
2012-12-21 v1.2.0
This release acts as a checkpoint for a large amount of internal refactoring
and testing. It also contains a number of small bugfixes, so all users are
encouraged to upgrade.
- Upgrading:
This release is ABI and API compatible with Duclair (v1.0.0). Users
of older releases should refer to the Upgrading notes in this
document for that release.
- Enhancements:
VP8 optimizations for MIPS dspr2
vpxenc: add -quiet option
- Speed:
Encoder and decoder speed is consistent with the Eider release.
- Quality:
In general, quality is consistent with the Eider release.
Minor tweaks to ARNR filtering
Minor improvements to real time encoding with multiple temporal layers
- Bug Fixes:
Fixes multithreaded encoder race condition in loopfilter
Fixes multi-resolution threaded encoding
Fix potential encoder dead-lock after picture resize
2012-05-09 v1.1.0 "Eider"
This introduces a number of enhancements, mostly focused on real-time
encoding. In addition, it fixes a decoder bug (first introduced in
Duclair) so all users of that release are encouraged to upgrade.
- Upgrading:
This release is ABI and API compatible with Duclair (v1.0.0). Users
of older releases should refer to the Upgrading notes in this
document for that release.
This release introduces a new temporal denoiser, controlled by the
VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
currently take a strength parameter, so the control is effectively
a boolean - zero (off) or non-zero (on). For compatibility with
existing applications, the values accepted are the same as those
for the spatial denoiser (0-6). The temporal denoiser is enabled
by default, and the older spatial denoiser may be restored by
configuring with --disable-temporal-denoising. The temporal denoiser
is more computationally intensive than the spatial one.
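A minimal sketch of enabling the temporal denoiser from application
code, assuming an encoder context already initialized with
vpx_codec_enc_init() (the variable name 'codec' is illustrative):
  #include "vpx/vpx_encoder.h"
  #include "vpx/vp8cx.h"
  /* 0 disables denoising; 1-6 enable it (the range matches the old
     spatial denoiser for compatibility, but acts as a boolean here) */
  if (vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, 1) != VPX_CODEC_OK)
      fprintf(stderr, "failed to set noise sensitivity\n");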
This release removes support for a legacy, decode only API that was
supported, but deprecated, at the initial release of libvpx
(v0.9.0). This is not expected to have any impact. If you are
impacted, you can apply a reversion to commit 2bf8fb58 locally.
Please update to the latest libvpx API if you are affected.
- Enhancements:
Adds a motion compensated temporal denoiser to the encoder, which
gives higher quality than the older spatial denoiser. (See above
for notes on upgrading).
In addition, support for new compilers and platforms were added,
including:
improved support for XCode
Android x86 NDK build
OS/2 support
SunCC support
Changing resolution with vpx_codec_enc_config_set() is now
supported. Previously, reinitializing the codec was required to
change the input resolution (see the sketch after these notes).
The vpxenc application has initial support for producing multiple
encodes from the same input in one call. Resizing is not yet
supported, but varying other codec parameters is. Use -- to
delineate output streams. Options persist from one stream to the
next.
Also, the vpxenc application will now use a keyframe interval of
5 seconds by default. Use the --kf-max-dist option to override.
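A minimal sketch of the resolution change mentioned above, assuming an
encoder context 'codec' initialized from a configuration 'cfg' (both
names are illustrative):
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
  /* ... vpx_codec_enc_init(&codec, vpx_codec_vp8_cx(), &cfg, 0), encode ... */
  cfg.g_w = 320; /* new dimensions; no codec re-initialization needed */
  cfg.g_h = 240;
  if (vpx_codec_enc_config_set(&codec, &cfg) != VPX_CODEC_OK)
      fprintf(stderr, "failed to change resolution\n");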
- Speed:
Decoder performance improved 2.5% versus Duclair. Encoder speed is
consistent with Duclair for most material. Two pass encoding of
slideshow-like material will see significant improvements.
Large realtime encoding speed gains at a small quality expense are
possible by configuring the on-the-fly bitpacking experiment with
--enable-onthefly-bitpacking. Realtime encoder can be up to 13%
faster (ARM) depending on the number of threads and bitrate
settings. This technique sees constant gain over the 5-16 speed
range. For VC style input the loss seen is up to 0.2dB. See commit
52cf4dca for further details.
- Quality:
On the whole, quality is consistent with the Duclair release. Some
tweaks:
Reduced blockiness in easy sections by applying a penalty to
intra modes.
Improved quality of static sections (like slideshows) with
two pass encoding.
Improved keyframe sizing with multiple temporal layers
- Bug Fixes:
Corrected alt-ref contribution to frame rate for visible updates
to the alt-ref buffer. This affected applications making manual
usage of the frame reference flags, or temporal layers.
Additional constraints were added to disable multi-frame quality
enhancement (MFQE) in sections of the frame where there is motion.
(#392)
Fixed corruption issues when vpx_codec_enc_config_set() was called
with spatial resampling enabled.
Fixed a decoder error introduced in Duclair where the segmentation
map was not being reinitialized on keyframes (#378)
2012-01-27 v1.0.0 "Duclair"
Our fourth named release, focused on performance and features related to
real-time encoding. It also fixes a decoder crash bug introduced in
v0.9.7, so all users of that release are encouraged to upgrade.
- Upgrading:
This release is ABI incompatible with prior releases of libvpx, so the
"major" version number has been bumped to 1. You must recompile your
applications against the latest version of the libvpx headers. The
API remains compatible, and this should not require code changes in most
applications.
- Enhancements:
This release introduces several substantial new features to the encoder,
of particular interest to real time streaming applications.
Temporal scalability allows the encoder to produce a stream that can
be decimated to different frame rates, with independent rate targeting
for each substream.
Multiframe quality enhancement postprocessing can make visual quality
more consistent in the presence of frames that are substantially
different quality than the surrounding frames, as in the temporal
scalability case and in some forced keyframe scenarios.
Multiple-resolution encoding support allows the encoding of the
same content at different resolutions faster than encoding them
separately.
- Speed:
Optimization targets for this release included the decoder and the real-
time modes of the encoder. Decoder speed on x86 has improved 10.5% with
this release. Encoder improvements followed a curve where speeds 1-3
improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
Cayuga release.
- Quality:
Encoder quality in the single stream case is consistent with the Cayuga
release.
- Bug Fixes:
This release fixes an OOB read decoder crash bug present in v0.9.7
related to the clamping of motion vectors in SPLITMV blocks. This
behavior could be triggered by corrupt input or by starting
decoding from a P-frame.
2011-08-15 v0.9.7-p1 "Cayuga" patch 1
This is an incremental bugfix release against Cayuga. All users of that
release are strongly encouraged to upgrade.
- Fix potential OOB reads (cdae03a)
An unbounded out of bounds read was discovered when the
decoder was requested to perform error concealment (new in
Cayuga) given a frame with corrupt partition sizes.
A bounded out of bounds read was discovered affecting all
versions of libvpx. Given a multipartition input frame that
is truncated between the mode/mv partition and the first
residual partition (in the block of partition offsets), up
to 3 extra bytes could have been read from the source buffer.
The code will not take any action regardless of the contents
of these undefined bytes, as the truncated buffer is detected
immediately following the read based on the calculated
starting position of the coefficient partition.
- Fix potential error concealment crash when the very first frame
is missing or corrupt (a609be5)
- Fix significant artifacts in error concealment (a4c2211, 99d870a)
- Revert 1-pass CBR rate control changes (e961317)
Further testing showed this change produced undesirable visual
artifacts, rolling back for now.
2011-08-02 v0.9.7 "Cayuga"
Our third named release, focused on a faster, higher quality, encoder.
- Upgrading:
This release is backwards compatible with Aylesbury (v0.9.5) and
Bali (v0.9.6). Users of older releases should refer to the Upgrading
notes in this document for that release.
- Enhancements:
Stereo 3D format support for vpxenc
Runtime detection of available processor cores.
Allow specifying --end-usage by enum name
vpxdec: test for frame corruption
vpxenc: add quantizer histogram display
vpxenc: add rate histogram display
Set VPX_FRAME_IS_DROPPABLE
update configure for ios sdk 4.3
Avoid text relocations in ARM vp8 decoder
Generate a vpx.pc file for pkg-config.
New ways of passing encoded data between encoder and decoder.
- Speed:
This release includes across-the-board speed improvements to the
encoder. On x86, these measure at approximately 11.5% in Best mode,
21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
On ARM Cortex A9 with Neon extensions, real-time encoding of video
telephony content is 35% faster than Bali on single core and 48%
faster on multi-core. On the NVidia Tegra2 platform, real time
encoding is 40% faster than Bali.
Decoder speed was not a priority for this release, but improved
approximately 8.4% on x86.
Reduce motion vector search on alt-ref frame.
Encoder loopfilter running in its own thread
Reworked loopfilter to precalculate more parameters
SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
Removed redundant checks
Reduced structure sizes
utilize preload in ARMv6 MC/LPF/Copy routines
ARM optimized quantization, dfct, variance, subtract
Increase chroma row alignment to 16 bytes.
disable trellis optimization for first pass
Write SSSE3 sub-pixel filter function
Improve SSE2 half-pixel filter functions
Add vp8_sub_pixel_variance16x8_ssse3 function
Reduce unnecessary distortion computation
Use diamond search to replace full search
Preload reference area in sub-pixel motion search (real-time mode)
- Quality:
This release focused primarily on one-pass use cases, including
video conferencing. Low latency data rate control was significantly
improved, improving streamability over bandwidth constrained links.
Added support for error concealment, allowing frames to maintain
visual quality in the presence of substantial packet loss.
Add rc_max_intra_bitrate_pct control
Limit size of initial keyframe in one-pass.
Improve framerate adaptation
Improved 1-pass CBR rate control
Improved KF insertion after fades to still.
Improved key frame detection.
Improved activity masking (lower PSNR impact for same SSIM boost)
Improved interaction between GF and ARFs
Adding error-concealment to the decoder.
Adding support for independent partitions
Adjusted rate-distortion constants
- Bug Fixes:
Removed firstpass motion map
Fix parallel make install
Fix multithreaded encoding for 1 MB wide frame
Fixed iwalsh_neon build problems with RVDS4.1
Fix semaphore emulation, spin-wait intrinsics on Windows
Fix build with xcode4 and simplify GLOBAL.
Mark ARM asm objects as allowing a non-executable stack.
Fix vpxenc encoding incorrect webm file header on big endian
2011-03-07 v0.9.6 "Bali"
Our second named release, focused on a faster, higher quality, encoder.
- Upgrading:
This release is backwards compatible with Aylesbury (v0.9.5). Users
of older releases should refer to the Upgrading notes in this
document for that release.
- Enhancements:
vpxenc --psnr shows a summary when encode completes
--tune=ssim option to enable activity masking
improved postproc visualizations for development
updated support for Apple iOS to SDK 4.2
query decoder to determine which reference frames were updated
implemented error tracking in the decoder
fix pipe support on windows
- Speed:
Primary focus was on good quality mode, speed 0. Average improvement
on x86 about 40%, up to 100% on user-generated content at that speed.
Best quality mode speed improved 35%, and realtime speed 10-20%. This
release also saw significant improvement in realtime encoding speed
on ARM platforms.
Improved encoder threading
Don't pick encoder filter level when loopfilter is disabled.
Avoid double copying of key frames into alt and golden buffer
FDCT optimizations.
x86 sse2 temporal filter
SSSE3 version of fast quantizer
vp8_rd_pick_best_mbsegmentation code restructure
Adjusted breakout RD for SPLITMV
Changed segmentation check order
Improved rd_pick_intra4x4block
Adds armv6 optimized variance calculation
ARMv6 optimized sad16x16
ARMv6 optimized half pixel variance calculations
Full search SAD function optimization in SSE4.1
Improve MV prediction accuracy to achieve performance gain
Improve MV prediction in vp8_pick_inter_mode() for speed>3
- Quality:
Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
also includes support for "activity masking," which greatly improves
SSIM at the expense of PSNR. For now, this feature is available with
the --tune=ssim option. Further experimentation in this area
is ongoing. This release also introduces a new rate control mode
called "CQ," which changes the allocation of bits within a clip to
the sections where they will have the most visual impact.
Tuning for the more exact quantizer.
Relax rate control for last few frames
CQ Mode
Limit key frame quantizer for forced key frames.
KF/GF Pulsing
Add simple version of activity masking.
make rdmult adaptive for intra in quantizer RDO
cap the best quantizer for 2nd order DC
change the threshold of DC check for encode breakout
- Bug Fixes:
Fix crash on Sparc Solaris.
Fix counter of fixed keyframe distance
ARNR filter pointer update bug fix
Fixed use of motion percentage in KF/GF group calc
Changed condition for using RD in Intra Mode
Fix encoder real-time only configuration.
Fix ARM encoder crash with multiple token partitions
Fixed bug where the first cluster timecode of the webm file was wrong.
Fixed various encoder bugs with odd-sized images
vp8e_get_preview fixed when spatial resampling enabled
quantizer: fix assertion in fast quantizer path
Allocate source buffers to be multiples of 16
Fix for manual Golden frame frequency
Fix drastic undershoot in long form content
2010-10-28 v0.9.5 "Aylesbury"
Our first named release, focused on a faster decoder, and a better encoder.
- Upgrading:
This release incorporates backwards-incompatible changes to the
ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
vpxdec
* the -q (quiet) option has been removed, and replaced with
-v (verbose). the output is quiet by default. Use -v to see
the version number of the binary.
* The default behavior is now to write output to a single file
instead of individual frames. The -y option has been removed.
Y4M output is the default.
* For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
options must be specified.
$ ivfdec -o OUTPUT INPUT
$ vpxdec --i420 -o OUTPUT INPUT
* If an output file is not specified, the default is to write
Y4M to stdout. This makes piping more natural.
$ ivfdec -y -o - INPUT | ...
$ vpxdec INPUT | ...
* The output file has additional flexibility for formatting the
filename. It supports escape characters for constructing a
filename from the width, height, and sequence number. This
replaces the -p option. To get the equivalent:
$ ivfdec -p frame INPUT
$ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
vpxenc
* The output file must be specified with -o, rather than as the
last argument.
$ ivfenc <options> INPUT OUTPUT
$ vpxenc <options> -o OUTPUT INPUT
* The output defaults to webm. To get IVF output, use the --ivf
option.
$ ivfenc <options> INPUT OUTPUT.ivf
$ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
- Enhancements:
ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
vpxdec supports .webm input
vpxdec writes .y4m by default
vpxenc writes .webm output by default
vpxenc --psnr now shows the average/overall PSNR at the end
ARM platforms now support runtime cpu detection
vpxdec visualizations added for motion vectors, block modes, references
vpxdec now silent by default
vpxdec --progress shows frame-by-frame timing information
vpxenc supports the distinction between --fps and --timebase
NASM is now a supported assembler
configure: enable PIC for shared libs by default
configure: add --enable-small
configure: support for ppc32-linux-gcc
configure: support for sparc-solaris-gcc
- Bugs:
Improve handling of invalid frames
Fix valgrind errors in the NEON loop filters.
Fix loopfilter delta zero transitions
Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
Build fixes for darwin-icc
- Speed:
20-40% (average 28%) improvement in libvpx decoder speed,
including:
Rewrite vp8_short_walsh4x4_sse2()
Optimizations on the loopfilters.
Miscellaneous improvements for Atom
Add 4-tap version of 2nd-pass ARMv6 MC filter.
Improved multithread utilization
Better instruction choices on x86
reorder data to use wider instructions
Update NEON wide idcts
Make block access to frame buffer sequential
Improved subset block search
Bilinear subpixel optimizations for ssse3.
Decrease memory footprint
Encoder speed improvements (percentage gain not measured):
Skip unnecessary search of identical frames
Add SSE2 subtract functions
Improve bounds checking in vp8_diamond_search_sadx4()
Added vp8_fast_quantize_b_sse2
- Quality:
Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
encoding mode, and up to 60% improvement on very noisy, still
or slow moving source video
Motion compensated temporal filter for Alt-Ref Noise Reduction
Improved use of trellis quantization on 2nd order Y blocks
Tune effect of motion on KF/GF boost in two pass
Allow coefficient optimization for good quality speed 0.
Improved control of active min quantizer for two pass.
Enable ARFs for non-lagged compress
2010-09-02 v0.9.2
- Enhancements:
Disable frame dropping by default
Improved multithreaded performance
Improved Force Key Frame Behaviour
Increased rate control buffer level precision
Fix bug in 1st pass motion compensation
ivfenc: correct fixed kf interval, --disable-kf
- Speed:
Changed above and left context data layout
Rework idct calling structure.
Removed unnecessary MB_MODE_INFO copies
x86: SSSE3 sixtap prediction
Reworked IDCT to include reconstruction (add) step
Swap alt/gold/new/last frame buffer ptrs instead of copying.
Improve SSE2 loopfilter functions
Change bitreader to use a larger window.
Avoid loopfilter reinitialization when possible
- Quality:
Normalize quantizer's zero bin and rounding factors
Add trellis quantization.
Make the quantizer exact.
Updates to ARNR filtering algorithm
Fix breakout thresh computation for golden & AltRef frames
Redo the forward 4x4 dct
Improve the accuracy of forward walsh-hadamard transform
Further adjustment of RD behaviour with Q and Zbin.
- Build System:
Allow linking of libs built with MinGW to MSVC
Fix target auto-detection on mingw32
Allow --cpu= to work for x86.
configure: pass original arguments through to make dist
Fix builds without runtime CPU detection
msvs: fix install of codec sources
msvs: Change devenv.com command line for better msys support
msvs: Add vs9 targets.
Add x86_64-linux-icc target
- Bugs:
Potential crashes on older MinGW builds
Fix two-pass framerate for Y4M input.
Fixed simple loop filter, other crashes on ARM v6
arm: fix missing dependency with --enable-shared
configure: support directories containing .o
Replace pinsrw (SSE) with MMX instructions
apple: include proper mach primitives
Fixed rate control bug with long key frame interval.
Fix DSO link errors on x86-64 when not using a version script
Fixed buffer selection for UV in AltRef filtering
2010-06-17 v0.9.1
- Enhancements:
* ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
* Speed optimizations
- Bugfixes:
* Rate control
* Prevent out-of-bounds accesses on invalid data
- Build system updates:
* Detect toolchain to be used automatically for native builds
* Support building shared libraries
* Better autotools emulation (--prefix, --libdir, DESTDIR)
- Updated LICENSE
* http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html
2010-05-18 v0.9.0
- Initial open source release. Welcome to WebM and VP8!

thirdparty/libvpx/LICENSE (vendored, new file)

@@ -0,0 +1,31 @@
Copyright (c) 2010, The WebM Project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Google, nor the WebM Project, nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

thirdparty/libvpx/PATENTS (vendored, new file)

@@ -0,0 +1,23 @@
Additional IP Rights Grant (Patents)
------------------------------------
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.

@@ -0,0 +1,18 @@
Copyright (C) 2005-2012 x264 project
Authors: Loren Merritt <lorenm@u.washington.edu>
Anton Mitrofanov <BugMaster@narod.ru>
Jason Garrett-Glaser <darkshikari@gmail.com>
Henrik Gramner <hengar-6@student.ltu.se>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

@@ -0,0 +1,20 @@
URL: https://git.videolan.org/git/x264.git
Version: d23d18655249944c1ca894b451e2c82c7a584c62
License: ISC
License File: LICENSE
Description:
x264/libav's framework for x86 assembly. Contains a variety of macros and
defines that help automatically allow assembly to work cross-platform.
Local Modifications:
Get configuration from vpx_config.asm.
Prefix functions with vpx by default.
Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
exist in libvpx.
Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
Set 'private_extern' visibility for macho targets.
Copy PIC 'GLOBAL' macros from x86_abi_support.asm
Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
Use .text with no alignment for aout
Only use 'hidden' visibility with Chromium

File diff suppressed because it is too large

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "alloccommon.h"
#include "blockd.h"
#include "vpx_mem/vpx_mem.h"
#include "onyxc_int.h"
#include "findnearmv.h"
#include "entropymode.h"
#include "systemdependent.h"
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
{
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
#if CONFIG_POSTPROC
vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
if (oci->post_proc_buffer_int_used)
vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
vpx_free(oci->pp_limits_buffer);
oci->pp_limits_buffer = NULL;
#endif
vpx_free(oci->above_context);
vpx_free(oci->mip);
#if CONFIG_ERROR_CONCEALMENT
vpx_free(oci->prev_mip);
oci->prev_mip = NULL;
#endif
oci->above_context = NULL;
oci->mip = NULL;
}
int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
{
int i;
vp8_de_alloc_frame_buffers(oci);
/* our internal buffers are always multiples of 16 */
if ((width & 0xf) != 0)
width += 16 - (width & 0xf);
if ((height & 0xf) != 0)
height += 16 - (height & 0xf);
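/* e.g. a width of 176 is left unchanged, while 180 is padded up to 192 */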
for (i = 0; i < NUM_YV12_BUFFERS; i++)
{
oci->fb_idx_ref_cnt[i] = 0;
oci->yv12_fb[i].flags = 0;
if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
goto allocation_fail;
}
oci->new_fb_idx = 0;
oci->lst_fb_idx = 1;
oci->gld_fb_idx = 2;
oci->alt_fb_idx = 3;
oci->fb_idx_ref_cnt[0] = 1;
oci->fb_idx_ref_cnt[1] = 1;
oci->fb_idx_ref_cnt[2] = 1;
oci->fb_idx_ref_cnt[3] = 1;
if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0)
goto allocation_fail;
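/* each macroblock covers 16x16 pixels, so the padded dimensions convert
 * directly to whole macroblock counts */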
oci->mb_rows = height >> 4;
oci->mb_cols = width >> 4;
oci->MBs = oci->mb_rows * oci->mb_cols;
oci->mode_info_stride = oci->mb_cols + 1;
oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
if (!oci->mip)
goto allocation_fail;
oci->mi = oci->mip + oci->mode_info_stride + 1;
/* Allocation of previous mode info will be done in vp8_decode_frame()
* as it is a decoder only data */
oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
if (!oci->above_context)
goto allocation_fail;
#if CONFIG_POSTPROC
if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
goto allocation_fail;
oci->post_proc_buffer_int_used = 0;
memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
memset(oci->post_proc_buffer.buffer_alloc, 128,
oci->post_proc_buffer.frame_size);
/* Allocate buffer to store post-processing filter coefficients.
*
* Note: Round up mb_cols to support SIMD reads
*/
oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
if (!oci->pp_limits_buffer)
goto allocation_fail;
#endif
return 0;
allocation_fail:
vp8_de_alloc_frame_buffers(oci);
return 1;
}
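/* The 'version' field is the 2-bit profile number (0-3) signaled in the VP8
 * frame header (see RFC 6386); it selects the loop filter variant and the
 * precision of the motion compensation filter. */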
void vp8_setup_version(VP8_COMMON *cm)
{
switch (cm->version)
{
case 0:
cm->no_lpf = 0;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 0;
cm->full_pixel = 0;
break;
case 1:
cm->no_lpf = 0;
cm->filter_type = SIMPLE_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->full_pixel = 0;
break;
case 2:
cm->no_lpf = 1;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->full_pixel = 0;
break;
case 3:
cm->no_lpf = 1;
cm->filter_type = SIMPLE_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->full_pixel = 1;
break;
default:
/*4,5,6,7 are reserved for future use*/
cm->no_lpf = 0;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 0;
cm->full_pixel = 0;
break;
}
}
void vp8_create_common(VP8_COMMON *oci)
{
vp8_machine_specific_config(oci);
vp8_init_mbmode_probs(oci);
vp8_default_bmode_probs(oci->fc.bmode_prob);
oci->mb_no_coeff_skip = 1;
oci->no_lpf = 0;
oci->filter_type = NORMAL_LOOPFILTER;
oci->use_bilinear_mc_filter = 0;
oci->full_pixel = 0;
oci->multi_token_partition = ONE_PARTITION;
oci->clamp_type = RECON_CLAMP_REQUIRED;
/* Initialize reference frame sign bias structure to defaults */
memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
/* Default disable buffer to buffer copying */
oci->copy_buffer_to_gf = 0;
oci->copy_buffer_to_arf = 0;
}
void vp8_remove_common(VP8_COMMON *oci)
{
vp8_de_alloc_frame_buffers(oci);
}

@@ -0,0 +1,31 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ALLOCCOMMON_H_
#define VP8_COMMON_ALLOCCOMMON_H_
#include "onyxc_int.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_create_common(VP8_COMMON *oci);
void vp8_remove_common(VP8_COMMON *oci);
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
void vp8_setup_version(VP8_COMMON *oci);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ALLOCCOMMON_H_

@@ -0,0 +1,181 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#define prototype_loopfilter(sym) \
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh, int count)
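/* The macro above declares an edge filter with the signature shared by the
 * ARMv6 kernels: source pointer, row pitch, the three filter thresholds,
 * and a count of 8-pixel edge segments (2 for a 16-pixel luma edge, 1 for
 * an 8-pixel chroma edge, as the calls below show). */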
#if HAVE_MEDIA
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
#if HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
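/* Unlike the ARMv6 filters, the NEON UV filters take both chroma pointers
 * and process a U row and a V row in a single call. */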
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
#if HAVE_MEDIA
/* ARMV6/MEDIA loopfilter functions*/
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
}
#endif
#if HAVE_NEON
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
}
#endif

@@ -0,0 +1,591 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const uint8_t bifilter4_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
{ 80, 48},
{ 64, 64},
{ 48, 80},
{ 32, 96},
{ 16, 112}
};
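/* Two-tap bilinear weights for the eight 1/8-pel offsets. Each pair sums to
 * 128, so the vqrshrn_n_u16(..., 7) narrowing steps below perform the
 * rounded divide by 128: out = (w0 * p0 + w1 * p1 + 64) >> 7. */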
void vp8_bilinear_predict8x4_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16;
uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
if (xoffset == 0) { // skip_1stpass_filter
d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d26u8 = vld1_u8(src_ptr);
} else {
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
d22u8 = vqrshrn_n_u16(q6u16, 7);
d23u8 = vqrshrn_n_u16(q7u16, 7);
d24u8 = vqrshrn_n_u16(q8u16, 7);
d25u8 = vqrshrn_n_u16(q9u16, 7);
d26u8 = vqrshrn_n_u16(q10u16, 7);
}
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d25u8);
} else {
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q1u16 = vmull_u8(d22u8, d0u8);
q2u16 = vmull_u8(d23u8, d0u8);
q3u16 = vmull_u8(d24u8, d0u8);
q4u16 = vmull_u8(d25u8, d0u8);
q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d5u8);
}
return;
}
void vp8_bilinear_predict8x8_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
if (xoffset == 0) { // skip_1stpass_filter
d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d30u8 = vld1_u8(src_ptr);
} else {
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
d22u8 = vqrshrn_n_u16(q6u16, 7);
d23u8 = vqrshrn_n_u16(q7u16, 7);
d24u8 = vqrshrn_n_u16(q8u16, 7);
d25u8 = vqrshrn_n_u16(q9u16, 7);
// first_pass filtering on the rest 5-line data
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
d26u8 = vqrshrn_n_u16(q6u16, 7);
d27u8 = vqrshrn_n_u16(q7u16, 7);
d28u8 = vqrshrn_n_u16(q8u16, 7);
d29u8 = vqrshrn_n_u16(q9u16, 7);
d30u8 = vqrshrn_n_u16(q10u16, 7);
}
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d29u8);
} else {
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q1u16 = vmull_u8(d22u8, d0u8);
q2u16 = vmull_u8(d23u8, d0u8);
q3u16 = vmull_u8(d24u8, d0u8);
q4u16 = vmull_u8(d25u8, d0u8);
q5u16 = vmull_u8(d26u8, d0u8);
q6u16 = vmull_u8(d27u8, d0u8);
q7u16 = vmull_u8(d28u8, d0u8);
q8u16 = vmull_u8(d29u8, d0u8);
q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d9u8);
}
return;
}
void vp8_bilinear_predict16x16_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
int i;
unsigned char tmp[272];
unsigned char *tmpp;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
uint8x8_t d19u8, d20u8, d21u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
if (xoffset == 0) { // secondpass_bfilter16x16_only
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q11u8 = vld1q_u8(src_ptr);
src_ptr += src_pixels_per_line;
for (i = 4; i > 0; i--) {
q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
q3u8 = vcombine_u8(d6u8, d7u8);
q4u8 = vcombine_u8(d8u8, d9u8);
q11u8 = q15u8;
vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
}
return;
}
if (yoffset == 0) { // firstpass_bfilter16x16_only
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
for (i = 4; i > 0; i--) {
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q7u16 = vmull_u8(d2u8, d0u8);
q8u16 = vmull_u8(d3u8, d0u8);
q9u16 = vmull_u8(d5u8, d0u8);
q10u16 = vmull_u8(d6u8, d0u8);
q11u16 = vmull_u8(d8u8, d0u8);
q12u16 = vmull_u8(d9u8, d0u8);
q13u16 = vmull_u8(d11u8, d0u8);
q14u16 = vmull_u8(d12u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
d11u8 = vext_u8(d11u8, d12u8, 1);
q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
d12u8 = vext_u8(d12u8, d13u8, 1);
q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
d14u8 = vqrshrn_n_u16(q7u16, 7);
d15u8 = vqrshrn_n_u16(q8u16, 7);
d16u8 = vqrshrn_n_u16(q9u16, 7);
d17u8 = vqrshrn_n_u16(q10u16, 7);
d18u8 = vqrshrn_n_u16(q11u16, 7);
d19u8 = vqrshrn_n_u16(q12u16, 7);
d20u8 = vqrshrn_n_u16(q13u16, 7);
d21u8 = vqrshrn_n_u16(q14u16, 7);
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
q10u8 = vcombine_u8(d20u8, d21u8);
vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
}
return;
}
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
// First Pass: output_height lines x output_width columns (17x16)
tmpp = tmp;
for (i = 3; i > 0; i--) {
q7u16 = vmull_u8(d2u8, d0u8);
q8u16 = vmull_u8(d3u8, d0u8);
q9u16 = vmull_u8(d5u8, d0u8);
q10u16 = vmull_u8(d6u8, d0u8);
q11u16 = vmull_u8(d8u8, d0u8);
q12u16 = vmull_u8(d9u8, d0u8);
q13u16 = vmull_u8(d11u8, d0u8);
q14u16 = vmull_u8(d12u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
d11u8 = vext_u8(d11u8, d12u8, 1);
q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
d12u8 = vext_u8(d12u8, d13u8, 1);
q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
d14u8 = vqrshrn_n_u16(q7u16, 7);
d15u8 = vqrshrn_n_u16(q8u16, 7);
d16u8 = vqrshrn_n_u16(q9u16, 7);
d17u8 = vqrshrn_n_u16(q10u16, 7);
d18u8 = vqrshrn_n_u16(q11u16, 7);
d19u8 = vqrshrn_n_u16(q12u16, 7);
d20u8 = vqrshrn_n_u16(q13u16, 7);
d21u8 = vqrshrn_n_u16(q14u16, 7);
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
q10u8 = vcombine_u8(d20u8, d21u8);
vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
}
// First-pass filtering for the remaining 5 of the 17 rows (the loop above covered 12)
d14u8 = vld1_u8(src_ptr);
d15u8 = vld1_u8(src_ptr + 8);
d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q9u16 = vmull_u8(d2u8, d0u8);
q10u16 = vmull_u8(d3u8, d0u8);
q11u16 = vmull_u8(d5u8, d0u8);
q12u16 = vmull_u8(d6u8, d0u8);
q13u16 = vmull_u8(d8u8, d0u8);
q14u16 = vmull_u8(d9u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
q1u16 = vmull_u8(d11u8, d0u8);
q2u16 = vmull_u8(d12u8, d0u8);
q3u16 = vmull_u8(d14u8, d0u8);
q4u16 = vmull_u8(d15u8, d0u8);
d11u8 = vext_u8(d11u8, d12u8, 1);
d14u8 = vext_u8(d14u8, d15u8, 1);
q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
d12u8 = vext_u8(d12u8, d13u8, 1);
d15u8 = vext_u8(d15u8, d16u8, 1);
q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
d10u8 = vqrshrn_n_u16(q9u16, 7);
d11u8 = vqrshrn_n_u16(q10u16, 7);
d12u8 = vqrshrn_n_u16(q11u16, 7);
d13u8 = vqrshrn_n_u16(q12u16, 7);
d14u8 = vqrshrn_n_u16(q13u16, 7);
d15u8 = vqrshrn_n_u16(q14u16, 7);
d16u8 = vqrshrn_n_u16(q1u16, 7);
d17u8 = vqrshrn_n_u16(q2u16, 7);
d18u8 = vqrshrn_n_u16(q3u16, 7);
d19u8 = vqrshrn_n_u16(q4u16, 7);
q5u8 = vcombine_u8(d10u8, d11u8);
q6u8 = vcombine_u8(d12u8, d13u8);
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q9u8);
// secondpass_filter
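    // Second (vertical) pass: reread the 17 filtered rows from tmp and
    // blend each adjacent pair with the yoffset coefficients, producing
    // four output rows per iteration.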
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
tmpp = tmp;
q11u8 = vld1q_u8(tmpp);
tmpp += 16;
for (i = 4; i > 0; i--) {
q12u8 = vld1q_u8(tmpp); tmpp += 16;
q13u8 = vld1q_u8(tmpp); tmpp += 16;
q14u8 = vld1q_u8(tmpp); tmpp += 16;
q15u8 = vld1q_u8(tmpp); tmpp += 16;
q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
q3u8 = vcombine_u8(d6u8, d7u8);
q4u8 = vcombine_u8(d8u8, d9u8);
q11u8 = q15u8;
vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
}
return;
}

View file

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_copy_mem8x4_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 4; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem8x8_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 8; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem16x16_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
int r;
uint8x16_t qtmp;
for (r = 0; r < 16; r++) {
qtmp = vld1q_u8(src);
vst1q_u8(dst, qtmp);
src += src_stride;
dst += dst_stride;
}
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_dc_only_idct_add_neon(
int16_t input_dc,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint16_t a1 = ((input_dc + 4) >> 3);
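    // A DC-only 4x4 inverse transform is the constant (input_dc + 4) >> 3,
    // so reconstruction reduces to adding a1 to every predicted pixel with
    // unsigned saturation.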
uint32x2_t d2u32 = vdup_n_u32(0);
uint8x8_t d2u8;
uint16x8_t q1u16;
uint16x8_t qAdd;
qAdd = vdupq_n_u16(a1);
for (i = 0; i < 2; i++) {
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
pred_ptr += pred_stride;
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
dst_ptr += dst_stride;
}
}

View file

@@ -0,0 +1,142 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
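// 20091/65536 ~= cos(pi/8) * sqrt(2) - 1, 35468/65536 ~= sin(pi/8) * sqrt(2).
// 35468 does not fit in int16_t and wraps to -30068; the vshrq_n_s16(.., 1)
// plus vqaddq_s16(.., q2) sequence below compensates both for that wrap and
// for the doubling built into vqdmulhq_n_s16.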
void vp8_dequant_idct_add_neon(
int16_t *input,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int32x2_t d14, d15;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
int16x8_t q1, q2, q3, q4, q5, q6;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x2x2_t d2tmp0, d2tmp1;
int16x4x2_t d2tmp2, d2tmp3;
d14 = d15 = vdup_n_s32(0);
// load input
q3 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
input += 8;
q4 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
// load dq
q5 = vld1q_s16(dq);
dq += 8;
q6 = vld1q_s16(dq);
// load src from dst
dst0 = dst;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
dst0 += stride;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
vreinterpretq_u16_s16(q5)));
q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
vreinterpretq_u16_s16(q6)));
d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
// loop 2
q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
vreinterpret_u8_s32(d14)));
q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
vreinterpret_u8_s32(d15)));
d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d14, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d14, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 1);
return;
}

View file

@@ -0,0 +1,25 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/common/blockd.h"
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
int16x8x2_t qQ, qDQC, qDQ;
qQ = vld2q_s16(d->qcoeff);
qDQC = vld2q_s16(DQC);
qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
vst2q_s16(d->dqcoeff, qDQ);
}

View file

@@ -0,0 +1,96 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
void idct_dequant_full_2x_neon(short *q, short *dq,
unsigned char *dst, int stride);
void idct_dequant_0_2x_neon(short *q, short dq,
unsigned char *dst, int stride);
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs)
{
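    /* eobs holds one end-of-block count per 4x4 subblock; reading it as
     * shorts tests two adjacent subblocks at once.  Masking with 0xfefe
     * clears bit 0 of each byte, so a zero result means both eobs are <= 1
     * (DC coefficient only) and the cheaper idct_dequant_0_2x_neon path
     * suffices.
     */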
int i;
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0])
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dst, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dst, stride);
}
if (((short *)(eobs))[1])
{
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
else
idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
}
q += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride, char *eobs)
{
if (((short *)(eobs))[0])
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstu, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
}
q += 32;
dstu += 4*stride;
if (((short *)(eobs))[1])
{
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstu, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
}
q += 32;
if (((short *)(eobs))[2])
{
if (((short *)eobs)[2] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstv, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
}
q += 32;
dstv += 4*stride;
if (((short *)(eobs))[3])
{
if (((short *)eobs)[3] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstv, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
}
}

View file

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void idct_dequant_0_2x_neon(
int16_t *q,
int16_t dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int i, a0, a1;
int16x8x2_t q2Add;
int32x2_t d2s32 = vdup_n_s32(0),
d4s32 = vdup_n_s32(0);
uint8x8_t d2u8, d4u8;
uint16x8_t q1u16, q2u16;
a0 = ((q[0] * dq) + 4) >> 3;
a1 = ((q[16] * dq) + 4) >> 3;
q[0] = q[16] = 0;
q2Add.val[0] = vdupq_n_s16((int16_t)a0);
q2Add.val[1] = vdupq_n_s16((int16_t)a1);
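    // a0/a1 are the dequantized, rounded DC values of two adjacent 4x4
    // blocks; each block is reconstructed as prediction plus that constant.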
for (i = 0; i < 2; i++, dst += 4) {
dst0 = dst;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
dst0 += stride;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d2s32));
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d4s32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
d2s32 = vreinterpret_s32_u8(d2u8);
d4s32 = vreinterpret_s32_u8(d4u8);
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d2s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d2s32, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 1);
}
return;
}

View file

@@ -0,0 +1,185 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 17734;
// 35468 (0x8a8c) has its lowest bit clear, so the constant can be
// pre-shifted to 17734; the doubling built into vqdmulhq_n_s16 then
// restores the full value and the post-shift used elsewhere is avoided.
void idct_dequant_full_2x_neon(
int16_t *q,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0, *dst1;
int32x2_t d28, d29, d30, d31;
int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x4x2_t q2tmp0, q2tmp1;
int16x8x2_t q2tmp2, q2tmp3;
int16x4_t dLow0, dLow1, dHigh0, dHigh1;
d28 = d29 = d30 = d31 = vdup_n_s32(0);
// load dq
q0 = vld1q_s16(dq);
dq += 8;
q1 = vld1q_s16(dq);
// load q
q2 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q3 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q4 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q5 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
// load src from dst
dst0 = dst;
dst1 = dst + 4;
d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
dst0 += stride;
d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
dst1 += stride;
d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
dst0 += stride;
d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
dst1 += stride;
d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
dst0 += stride;
d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
dst1 += stride;
d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
q2 = vmulq_s16(q2, q0);
q3 = vmulq_s16(q3, q1);
q4 = vmulq_s16(q4, q0);
q5 = vmulq_s16(q5, q1);
// vswp
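    // NEON has no vswp intrinsic, so regroup the 64-bit halves manually:
    // afterwards q2/q4/q3/q5 each hold one row (0/1/2/3) of both 4x4
    // blocks, letting the two blocks be transformed in parallel.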
dLow0 = vget_low_s16(q2);
dHigh0 = vget_high_s16(q2);
dLow1 = vget_low_s16(q4);
dHigh1 = vget_high_s16(q4);
q2 = vcombine_s16(dLow0, dLow1);
q4 = vcombine_s16(dHigh0, dHigh1);
dLow0 = vget_low_s16(q3);
dHigh0 = vget_high_s16(q3);
dLow1 = vget_low_s16(q5);
dHigh1 = vget_high_s16(q5);
q3 = vcombine_s16(dLow0, dLow1);
q5 = vcombine_s16(dHigh0, dHigh1);
q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
q10 = vqaddq_s16(q2, q3);
q11 = vqsubq_s16(q2, q3);
q8 = vshrq_n_s16(q8, 1);
q9 = vshrq_n_s16(q9, 1);
q4 = vqaddq_s16(q4, q8);
q5 = vqaddq_s16(q5, q9);
q2 = vqsubq_s16(q6, q5);
q3 = vqaddq_s16(q7, q4);
q4 = vqaddq_s16(q10, q3);
q5 = vqaddq_s16(q11, q2);
q6 = vqsubq_s16(q11, q2);
q7 = vqsubq_s16(q10, q3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
// loop 2
q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q10 = vshrq_n_s16(q10, 1);
q11 = vshrq_n_s16(q11, 1);
q10 = vqaddq_s16(q2tmp2.val[1], q10);
q11 = vqaddq_s16(q2tmp3.val[1], q11);
q8 = vqsubq_s16(q8, q11);
q9 = vqaddq_s16(q9, q10);
q4 = vqaddq_s16(q2, q9);
q5 = vqaddq_s16(q3, q8);
q6 = vqsubq_s16(q3, q8);
q7 = vqsubq_s16(q2, q9);
q4 = vrshrq_n_s16(q4, 3);
q5 = vrshrq_n_s16(q5, 3);
q6 = vrshrq_n_s16(q6, 3);
q7 = vrshrq_n_s16(q7, 3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
vreinterpret_u8_s32(d28)));
q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
vreinterpret_u8_s32(d29)));
q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
vreinterpret_u8_s32(d30)));
q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
vreinterpret_u8_s32(d31)));
d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
dst0 = dst;
dst1 = dst + 4;
vst1_lane_s32((int32_t *)dst0, d28, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d28, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d29, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d29, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d30, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d30, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d31, 0);
vst1_lane_s32((int32_t *)dst1, d31, 1);
return;
}

View file

@@ -0,0 +1,102 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_short_inv_walsh4x4_neon(
int16_t *input,
int16_t *mb_dqcoeff) {
int16x8_t q0s16, q1s16, q2s16, q3s16;
int16x4_t d4s16, d5s16, d6s16, d7s16;
int16x4x2_t v2tmp0, v2tmp1;
int32x2x2_t v2tmp2, v2tmp3;
int16x8_t qAdd3;
q0s16 = vld1q_s16(input);
q1s16 = vld1q_s16(input + 8);
// 1st for loop
d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
vreinterpret_s32_s16(vget_low_s16(q1s16)));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
vreinterpret_s32_s16(vget_high_s16(q1s16)));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
vreinterpret_s16_s32(v2tmp3.val[0]));
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
vreinterpret_s16_s32(v2tmp3.val[1]));
// 2nd for loop
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
qAdd3 = vdupq_n_s16(3);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
q0s16 = vaddq_s16(q0s16, qAdd3);
q1s16 = vaddq_s16(q1s16, qAdd3);
q0s16 = vshrq_n_s16(q0s16, 3);
q1s16 = vshrq_n_s16(q1s16, 3);
// store
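    // Each output lane is the second-order DC term of one of the 16 4x4
    // luma blocks, hence the stores at every 16th position of mb_dqcoeff
    // (offset 0 of each block's 16 coefficients).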
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
mb_dqcoeff += 16;
return;
}

View file

@@ -0,0 +1,111 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
uint8_t *sp;
uint8x16_t qblimit, q0u8;
uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
int16x8_t q2s16, q3s16, q13s16;
int8x8_t d8s8, d9s8;
int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
qblimit = vdupq_n_u8(*blimit);
sp = s - (p << 1);
q5u8 = vld1q_u8(sp);
sp += p;
q6u8 = vld1q_u8(sp);
sp += p;
q7u8 = vld1q_u8(sp);
sp += p;
q8u8 = vld1q_u8(sp);
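    // Simple-filter mask: 2 * |p0 - q0| + |p1 - q1| / 2 <= blimit, computed
    // below with saturating unsigned byte arithmetic (q5..q8 hold the p1,
    // p0, q0, q1 rows).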
q15u8 = vabdq_u8(q6u8, q7u8);
q14u8 = vabdq_u8(q5u8, q8u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q13s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q7u8 = veorq_u8(q7u8, q0u8);
q8u8 = veorq_u8(q8u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
vget_low_s8(vreinterpretq_s8_u8(q6u8)));
q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
vget_high_s8(vreinterpretq_s8_u8(q6u8)));
q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
vreinterpretq_s8_u8(q8u8));
q2s16 = vmulq_s16(q2s16, q13s16);
q3s16 = vmulq_s16(q3s16, q13s16);
q10u8 = vdupq_n_u8(3);
q9u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
d8s8 = vqmovn_s16(q2s16);
d9s8 = vqmovn_s16(q3s16);
q4s8 = vcombine_s8(d8s8, d9s8);
q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q3s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
vst1q_u8(s, q7u8);
s -= p;
vst1q_u8(s, q6u8);
return;
}
void vp8_loop_filter_bhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}

View file

@@ -0,0 +1,283 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"
#ifdef VPX_INCOMPATIBLE_GCC
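// Fallback paths for toolchains whose NEON intrinsics support is known to
// miscompile the lane-wise vst2/vld4 forms used in the #else branches.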
static INLINE void write_2x4(unsigned char *dst, int pitch,
const uint8x8x2_t result) {
/*
* uint8x8x2_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
---
* after vtrn_u8
00 10 02 12 | 04 14 06 16
01 11 03 13 | 05 15 07 17
*/
const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
result.val[1]);
const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
}
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
write_2x4(dst, pitch, result);
dst += pitch * 8;
write_2x4(dst, pitch, result2);
}
#else
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
vst2_lane_u8(dst, result, 0);
dst += pitch;
vst2_lane_u8(dst, result, 1);
dst += pitch;
vst2_lane_u8(dst, result, 2);
dst += pitch;
vst2_lane_u8(dst, result, 3);
dst += pitch;
vst2_lane_u8(dst, result, 4);
dst += pitch;
vst2_lane_u8(dst, result, 5);
dst += pitch;
vst2_lane_u8(dst, result, 6);
dst += pitch;
vst2_lane_u8(dst, result, 7);
dst += pitch;
vst2_lane_u8(dst, result2, 0);
dst += pitch;
vst2_lane_u8(dst, result2, 1);
dst += pitch;
vst2_lane_u8(dst, result2, 2);
dst += pitch;
vst2_lane_u8(dst, result2, 3);
dst += pitch;
vst2_lane_u8(dst, result2, 4);
dst += pitch;
vst2_lane_u8(dst, result2, 5);
dst += pitch;
vst2_lane_u8(dst, result2, 6);
dst += pitch;
vst2_lane_u8(dst, result2, 7);
}
#endif // VPX_INCOMPATIBLE_GCC
#ifdef VPX_INCOMPATIBLE_GCC
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
uint8x8x4_t x;
const uint8x8_t a = vld1_u8(src);
const uint8x8_t b = vld1_u8(src + pitch * 1);
const uint8x8_t c = vld1_u8(src + pitch * 2);
const uint8x8_t d = vld1_u8(src + pitch * 3);
const uint8x8_t e = vld1_u8(src + pitch * 4);
const uint8x8_t f = vld1_u8(src + pitch * 5);
const uint8x8_t g = vld1_u8(src + pitch * 6);
const uint8x8_t h = vld1_u8(src + pitch * 7);
const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
vreinterpret_u32_u8(e));
const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
vreinterpret_u32_u8(f));
const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
vreinterpret_u32_u8(g));
const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
vreinterpret_u32_u8(h));
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
vreinterpret_u16_u32(r26_u32.val[0]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
vreinterpret_u16_u32(r37_u32.val[0]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
/*
* after vtrn_u32
00 01 02 03 | 40 41 42 43
10 11 12 13 | 50 51 52 53
20 21 22 23 | 60 61 62 63
30 31 32 33 | 70 71 72 73
---
* after vtrn_u16
00 01 20 21 | 40 41 60 61
02 03 22 23 | 42 43 62 63
10 11 30 31 | 50 51 70 71
12 13 32 33 | 52 53 72 73
00 01 20 21 | 40 41 60 61
10 11 30 31 | 50 51 70 71
02 03 22 23 | 42 43 62 63
12 13 32 33 | 52 53 72 73
---
* after vtrn_u8
00 10 20 30 | 40 50 60 70
01 11 21 31 | 41 51 61 71
02 12 22 32 | 42 52 62 72
03 13 23 33 | 43 53 63 73
*/
x.val[0] = r01_u8.val[0];
x.val[1] = r01_u8.val[1];
x.val[2] = r23_u8.val[0];
x.val[3] = r23_u8.val[1];
return x;
}
#else
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
uint8x8x4_t x;
x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
x = vld4_lane_u8(src, x, 0);
src += pitch;
x = vld4_lane_u8(src, x, 1);
src += pitch;
x = vld4_lane_u8(src, x, 2);
src += pitch;
x = vld4_lane_u8(src, x, 3);
src += pitch;
x = vld4_lane_u8(src, x, 4);
src += pitch;
x = vld4_lane_u8(src, x, 5);
src += pitch;
x = vld4_lane_u8(src, x, 6);
src += pitch;
x = vld4_lane_u8(src, x, 7);
return x;
}
#endif // VPX_INCOMPATIBLE_GCC
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
unsigned char *src1;
uint8x16_t qblimit, q0u8;
uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
int16x8_t q2s16, q13s16, q11s16;
int8x8_t d28s8, d29s8;
int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
uint8x8x4_t d0u8x4; // d6, d7, d8, d9
uint8x8x4_t d1u8x4; // d10, d11, d12, d13
uint8x8x2_t d2u8x2; // d12, d13
uint8x8x2_t d3u8x2; // d14, d15
qblimit = vdupq_n_u8(*blimit);
src1 = s - 2;
d0u8x4 = read_4x8(src1, p);
src1 += p * 8;
d1u8x4 = read_4x8(src1, p);
q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
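    // After the transposed 4x8 loads: q3 = p1, q5 = p0, q4 = q0, q6 = q1
    // columns, so the same simple-filter math as the horizontal variant
    // applies.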
q15u8 = vabdq_u8(q5u8, q4u8);
q14u8 = vabdq_u8(q3u8, q6u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q11s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q3u8 = veorq_u8(q3u8, q0u8);
q4u8 = veorq_u8(q4u8, q0u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
vget_low_s8(vreinterpretq_s8_u8(q5u8)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
vget_high_s8(vreinterpretq_s8_u8(q5u8)));
q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
vreinterpretq_s8_u8(q6u8));
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q11u8 = vdupq_n_u8(3);
q12u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
d28s8 = vqmovn_s16(q2s16);
d29s8 = vqmovn_s16(q13s16);
q14s8 = vcombine_s8(d28s8, d29s8);
q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q14s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
d2u8x2.val[0] = vget_low_u8(q6u8); // d12
d2u8x2.val[1] = vget_low_u8(q7u8); // d14
d3u8x2.val[0] = vget_high_u8(q6u8); // d13
d3u8x2.val[1] = vget_high_u8(q7u8); // d15
src1 = s - 1;
write_2x8(src1, p, d2u8x2, d3u8x2);
}
void vp8_loop_filter_bvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}

View file

@@ -0,0 +1,625 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_mbloop_filter_neon(
uint8x16_t qblimit, // mblimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
        uint8x16_t q3,       // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
        uint8x16_t *q4r,     // p2
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r, // q1
        uint8x16_t *q9r) {   // q2
uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
int8x16_t q0s8, q12s8, q14s8, q15s8;
int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q1u8 = vabdq_u8(q9, q8);
q0u8 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q1u8 = vmaxq_u8(q1u8, q0u8);
q15u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vabdq_u8(q6, q7);
// vp8_hevmask
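    // hev ("high edge variance"): set where |p1 - p0| or |q1 - q0| exceeds
    // thresh.  hev pixels keep the short two-pixel adjustment; the wide
    // 27/18/9 taps below are masked off (vbicq_s8) where hev is set.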
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q1u8);
q15u8 = vcgeq_u8(qlimit, q15u8);
q1u8 = vabdq_u8(q5, q8);
q12u8 = vqaddq_u8(q12u8, q12u8);
// vp8_filter() function
// convert to signed
q0u8 = vdupq_n_u8(0x80);
q9 = veorq_u8(q9, q0u8);
q8 = veorq_u8(q8, q0u8);
q7 = veorq_u8(q7, q0u8);
q6 = veorq_u8(q6, q0u8);
q5 = veorq_u8(q5, q0u8);
q4 = veorq_u8(q4, q0u8);
q1u8 = vshrq_n_u8(q1u8, 1);
q12u8 = vqaddq_u8(q12u8, q1u8);
q14u8 = vorrq_u8(q13u8, q14u8);
q12u8 = vcgeq_u8(qblimit, q12u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q11s16 = vdupq_n_s16(3);
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q15u8 = vandq_u8(q15u8, q12u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
q12u8 = vdupq_n_u8(3);
q11u8 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2 = vqmovn_s16(q2s16);
d3 = vqmovn_s16(q13s16);
q1s8 = vcombine_s8(d2, d3);
q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q13s8 = vshrq_n_s8(q13s8, 3);
q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
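    // 63 is the rounding bias for the >> 7 narrowing shifts below: the wide
    // filter applies ((27|18|9) * filter + 63) >> 7 to p0/q0, p1/q1 and
    // p2/q2 respectively.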
d5 = vdup_n_s8(9);
d4 = vdup_n_s8(18);
q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
d5 = vdup_n_s8(27);
q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
    d0 = vqshrn_n_s16(q0s16, 7);
d1 = vqshrn_n_s16(q11s16, 7);
d24 = vqshrn_n_s16(q12s16, 7);
d25 = vqshrn_n_s16(q13s16, 7);
d28 = vqshrn_n_s16(q14s16, 7);
d29 = vqshrn_n_s16(q15s16, 7);
q0s8 = vcombine_s8(d0, d1);
q12s8 = vcombine_s8(d24, d25);
q14s8 = vcombine_s8(d28, d29);
q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
    q15s8 = vqsubq_s8(q7s8, q14s8);
    q14s8 = vqaddq_s8(q6s8, q14s8);
q1u8 = vdupq_n_u8(0x80);
*q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
*q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
*q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
return;
}
void vp8_mbloop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
src -= (pitch * 6);
vst1q_u8(src, q4);
src += pitch;
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
src += pitch;
vst1q_u8(src, q9);
return;
}
void vp8_mbloop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
u -= (pitch * 6);
v -= (pitch * 6);
vst1_u8(u, vget_low_u8(q4));
u += pitch;
vst1_u8(v, vget_high_u8(q4));
v += pitch;
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(u, vget_low_u8(q8));
u += pitch;
vst1_u8(v, vget_high_u8(q8));
v += pitch;
vst1_u8(u, vget_low_u8(q9));
vst1_u8(v, vget_high_u8(q9));
return;
}
void vp8_mbloop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s1, *s2;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s1 = src - 4;
s2 = s1 + 8 * pitch;
d6 = vld1_u8(s1);
s1 += pitch;
d7 = vld1_u8(s2);
s2 += pitch;
d8 = vld1_u8(s1);
s1 += pitch;
d9 = vld1_u8(s2);
s2 += pitch;
d10 = vld1_u8(s1);
s1 += pitch;
d11 = vld1_u8(s2);
s2 += pitch;
d12 = vld1_u8(s1);
s1 += pitch;
d13 = vld1_u8(s2);
s2 += pitch;
d14 = vld1_u8(s1);
s1 += pitch;
d15 = vld1_u8(s2);
s2 += pitch;
d16 = vld1_u8(s1);
s1 += pitch;
d17 = vld1_u8(s2);
s2 += pitch;
d18 = vld1_u8(s1);
s1 += pitch;
d19 = vld1_u8(s2);
s2 += pitch;
d20 = vld1_u8(s1);
d21 = vld1_u8(s2);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
s1 -= 7 * pitch;
s2 -= 7 * pitch;
vst1_u8(s1, vget_low_u8(q3));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q3));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q4));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q4));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q5));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q5));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q6));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q6));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q7));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q7));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q8));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q8));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q9));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q9));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q10));
vst1_u8(s2, vget_high_u8(q10));
return;
}
void vp8_mbloop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
vs = v - 4;
d6 = vld1_u8(us);
us += pitch;
d7 = vld1_u8(vs);
vs += pitch;
d8 = vld1_u8(us);
us += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d10 = vld1_u8(us);
us += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d12 = vld1_u8(us);
us += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d14 = vld1_u8(us);
us += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d16 = vld1_u8(us);
us += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d18 = vld1_u8(us);
us += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d20 = vld1_u8(us);
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
ud = u - 4;
vst1_u8(ud, vget_low_u8(q3));
ud += pitch;
vst1_u8(ud, vget_low_u8(q4));
ud += pitch;
vst1_u8(ud, vget_low_u8(q5));
ud += pitch;
vst1_u8(ud, vget_low_u8(q6));
ud += pitch;
vst1_u8(ud, vget_low_u8(q7));
ud += pitch;
vst1_u8(ud, vget_low_u8(q8));
ud += pitch;
vst1_u8(ud, vget_low_u8(q9));
ud += pitch;
vst1_u8(ud, vget_low_u8(q10));
vd = v - 4;
vst1_u8(vd, vget_high_u8(q3));
vd += pitch;
vst1_u8(vd, vget_high_u8(q4));
vd += pitch;
vst1_u8(vd, vget_high_u8(q5));
vd += pitch;
vst1_u8(vd, vget_high_u8(q6));
vd += pitch;
vst1_u8(vd, vget_high_u8(q7));
vd += pitch;
vst1_u8(vd, vget_high_u8(q8));
vd += pitch;
vst1_u8(vd, vget_high_u8(q9));
vd += pitch;
vst1_u8(vd, vget_high_u8(q10));
return;
}

View file

@ -0,0 +1,123 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
void vp8_short_idct4x4llm_neon(
int16_t *input,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint32x2_t d6u32 = vdup_n_u32(0);
uint8x8_t d1u8;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
uint16x8_t q1u16;
int16x8_t q1s16, q2s16, q3s16, q4s16;
int32x2x2_t v2tmp0, v2tmp1;
int16x4x2_t v2tmp2, v2tmp3;
d2 = vld1_s16(input);
d3 = vld1_s16(input + 4);
d4 = vld1_s16(input + 8);
d5 = vld1_s16(input + 12);
// 1st for loop
q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
q2s16 = vcombine_s16(d3, d5);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
// 2nd for loop
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
// add prediction and store, two rows at a time (same add/store pattern as dc_only_idct_add)
for (i = 0; i < 2; i++, q1s16 = q2s16) {
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
pred_ptr += pred_stride;
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
vreinterpret_u8_u32(d6u32));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
dst_ptr += dst_stride;
}
return;
}
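The two constants are Q16 fixed-point versions of the transform multipliers: 20091 ~= (sqrt(2)*cos(pi/8) - 1) * 65536 and 35468 ~= sqrt(2)*sin(pi/8) * 65536. Note that 35468 wraps to -30068 when stored as int16_t; the vqdmulh -> vshr #1 -> vqadd(x) sequence above compensates, since the final add supplies the missing "+1*x" for the cos term and cancels the 65536 lost to wraparound for the sin term, so both come out as (x*const) >> 16. A scalar sketch of one 1-D pass, paraphrasing the C reference vp8_short_idct4x4llm_c in vp8/common/idctllm.c:
/* One 1-D pass of the VP8 inverse transform, paraphrased from the scalar
 * reference in vp8/common/idctllm.c. in[0..3] is one row or column. */
static void idct4_1d_ref(const int in[4], int out[4])
{
    int a1 = in[0] + in[2];
    int b1 = in[0] - in[2];
    int c1 = ((in[1] * 35468) >> 16)
             - (in[3] + ((in[3] * 20091) >> 16));
    int d1 = (in[1] + ((in[1] * 20091) >> 16))
             + ((in[3] * 35468) >> 16);
    out[0] = a1 + d1;
    out[1] = b1 + c1;
    out[2] = b1 - c1;
    out[3] = a1 - d1;
}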

File diff suppressed because it is too large

View file

@ -0,0 +1,550 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"
static INLINE void vp8_loop_filter_neon(
uint8x16_t qblimit, // flimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r) { // q1
uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q2s16, q11s16;
uint16x8_t q4u16;
int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
int8x8_t d2s8, d3s8;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q3 = vabdq_u8(q9, q8);
q4 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q3 = vmaxq_u8(q3, q4);
q15u8 = vmaxq_u8(q11u8, q12u8);
q9 = vabdq_u8(q6, q7);
// vp8_hevmask
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q3);
q2u8 = vabdq_u8(q5, q8);
q9 = vqaddq_u8(q9, q9);
q15u8 = vcgeq_u8(qlimit, q15u8);
// vp8_filter() function
// convert to signed
q10 = vdupq_n_u8(0x80);
q8 = veorq_u8(q8, q10);
q7 = veorq_u8(q7, q10);
q6 = veorq_u8(q6, q10);
q5 = veorq_u8(q5, q10);
q2u8 = vshrq_n_u8(q2u8, 1);
q9 = vqaddq_u8(q9, q2u8);
q10 = vdupq_n_u8(3);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q9 = vcgeq_u8(qblimit, q9);
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q14u8 = vorrq_u8(q13u8, q14u8);
q4u16 = vmovl_u8(vget_low_u8(q10));
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
q15u8 = vandq_u8(q15u8, q9);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
q9 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2s8 = vqmovn_s16(q2s16);
d3s8 = vqmovn_s16(q11s16);
q1s8 = vcombine_s8(d2s8, d3s8);
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
q2s8 = vshrq_n_s8(q2s8, 3);
q1s8 = vshrq_n_s8(q1s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
q1s8 = vrshrq_n_s8(q1s8, 1);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
q0u8 = vdupq_n_u8(0x80);
*q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
return;
}
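Per lane, the intrinsics above implement the normal VP8 4-tap edge filter. A scalar sketch of the same core, paraphrasing vp8/common/loopfilter_filters.c (pixels are pre-converted to signed by XOR with 0x80, exactly as the veorq_u8 calls do; mask and hev are 0 or -1 per pixel, as computed from limit/blimit/thresh):
/* Scalar filter core, paraphrased from vp8/common/loopfilter_filters.c.
 * ps1/ps0/qs0/qs1 are the two pixels on each side of the edge. */
static signed char clamp8(int t)
{
    return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
}
static void filter4_ref(signed char mask, signed char hev,
                        signed char *ps1, signed char *ps0,
                        signed char *qs0, signed char *qs1)
{
    signed char f, f1, f2;
    f = clamp8(*ps1 - *qs1) & hev;              /* outer tap only on high edge variance */
    f = clamp8(f + 3 * (*qs0 - *ps0)) & mask;   /* vp8_filter = clamp(f + 3*(qs0 - ps0)) */
    f1 = clamp8(f + 4) >> 3;                    /* Filter1, applied to q0 */
    f2 = clamp8(f + 3) >> 3;                    /* Filter2, applied to p0 */
    *qs0 = clamp8(*qs0 - f1);
    *ps0 = clamp8(*ps0 + f2);
    f = (signed char)(((f1 + 1) >> 1) & ~hev);  /* rounded half step, skipped on hev */
    *qs1 = clamp8(*qs1 - f);
    *ps1 = clamp8(*ps1 + f);
}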
void vp8_loop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
src -= (pitch * 5);
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
return;
}
void vp8_loop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
u -= (pitch * 5);
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(u, vget_low_u8(q8));
v -= (pitch * 5);
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(v, vget_high_u8(q8));
return;
}
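The UV variants halve the call count by packing a U row into the low half of each q register and the matching V row into the high half, so every 16-lane filter step covers both chroma planes at once. The pattern in isolation (a sketch; the filter step itself is elided):
/* Sketch of the chroma packing used above: one 8-byte row from each
 * plane shares a single 128-bit register. */
static void pack_filter_unpack(unsigned char *u_row, unsigned char *v_row)
{
    uint8x16_t row = vcombine_u8(vld1_u8(u_row), vld1_u8(v_row)); /* U | V */
    /* ... 16-lane filtering of 'row' would happen here ... */
    vst1_u8(u_row, vget_low_u8(row));   /* low half back to U */
    vst1_u8(v_row, vget_high_u8(row));  /* high half back to V */
}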
static INLINE void write_4x8(unsigned char *dst, int pitch,
const uint8x8x4_t result) {
#ifdef VPX_INCOMPATIBLE_GCC
/*
* uint8x8x4_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
20 21 22 23 | 24 25 26 27
30 31 32 33 | 34 35 36 37
---
* after vtrn_u16
00 01 20 21 | 04 05 24 25
02 03 22 23 | 06 07 26 27
10 11 30 31 | 14 15 34 35
12 13 32 33 | 16 17 36 37
---
* after vtrn_u8
00 10 20 30 | 04 14 24 34
01 11 21 31 | 05 15 25 35
02 12 22 32 | 06 16 26 36
03 13 23 33 | 07 17 27 37
*/
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
vreinterpret_u16_u8(result.val[2]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
vreinterpret_u16_u8(result.val[3]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
vst4_lane_u8(dst, result, 0);
dst += pitch;
vst4_lane_u8(dst, result, 1);
dst += pitch;
vst4_lane_u8(dst, result, 2);
dst += pitch;
vst4_lane_u8(dst, result, 3);
dst += pitch;
vst4_lane_u8(dst, result, 4);
dst += pitch;
vst4_lane_u8(dst, result, 5);
dst += pitch;
vst4_lane_u8(dst, result, 6);
dst += pitch;
vst4_lane_u8(dst, result, 7);
#endif // VPX_INCOMPATIBLE_GCC
}
void vp8_loop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s, *d;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s = src - 4;
d6 = vld1_u8(s);
s += pitch;
d8 = vld1_u8(s);
s += pitch;
d10 = vld1_u8(s);
s += pitch;
d12 = vld1_u8(s);
s += pitch;
d14 = vld1_u8(s);
s += pitch;
d16 = vld1_u8(s);
s += pitch;
d18 = vld1_u8(s);
s += pitch;
d20 = vld1_u8(s);
s += pitch;
d7 = vld1_u8(s);
s += pitch;
d9 = vld1_u8(s);
s += pitch;
d11 = vld1_u8(s);
s += pitch;
d13 = vld1_u8(s);
s += pitch;
d15 = vld1_u8(s);
s += pitch;
d17 = vld1_u8(s);
s += pitch;
d19 = vld1_u8(s);
s += pitch;
d21 = vld1_u8(s);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
d = src - 2;
write_4x8(d, pitch, q4ResultL);
d += pitch * 8;
write_4x8(d, pitch, q4ResultH);
}
void vp8_loop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
d6 = vld1_u8(us);
us += pitch;
d8 = vld1_u8(us);
us += pitch;
d10 = vld1_u8(us);
us += pitch;
d12 = vld1_u8(us);
us += pitch;
d14 = vld1_u8(us);
us += pitch;
d16 = vld1_u8(us);
us += pitch;
d18 = vld1_u8(us);
us += pitch;
d20 = vld1_u8(us);
vs = v - 4;
d7 = vld1_u8(vs);
vs += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
ud = u - 2;
write_4x8(ud, pitch, q4ResultL);
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
vd = v - 2;
write_4x8(vd, pitch, q4ResultH);
}

22
thirdparty/libvpx/vp8/common/blockd.c vendored Normal file
View file

@ -0,0 +1,22 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "blockd.h"
#include "vpx_mem/vpx_mem.h"
const unsigned char vp8_block2left[25] =
{
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
const unsigned char vp8_block2above[25] =
{
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
};
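The 25 entries cover a macroblock's 16 Y blocks, 4 U, 4 V, and the Y2 (second-order DC) block; the tables map each flat block index to its row (vp8_block2left) and column (vp8_block2above) within ENTROPY_CONTEXT_PLANES, declared in blockd.h below, whose y1[4], u[2], v[2], y2 members are contiguous so one flat index serves all planes. A usage sketch (above_ctx/left_ctx are hypothetical helpers; the real lookups live in the tokenizer and detokenizer):
/* Hypothetical helpers showing the intended use of the tables: find the
 * above/left entropy contexts for block i (0..24) of the macroblock
 * described by xd (MACROBLOCKD, declared in blockd.h). */
static ENTROPY_CONTEXT *above_ctx(MACROBLOCKD *xd, int i)
{
    return (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[i];
}
static ENTROPY_CONTEXT *left_ctx(MACROBLOCKD *xd, int i)
{
    return (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[i];
}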

312
thirdparty/libvpx/vp8/common/blockd.h vendored Normal file
View file

@ -0,0 +1,312 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_BLOCKD_H_
#define VP8_COMMON_BLOCKD_H_
void vpx_log(const char *format, ...);
#include "vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "mv.h"
#include "treecoder.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
#define MB_FEATURE_TREE_PROBS 3
#define MAX_MB_SEGMENTS 4
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 4
/* Segment Feature Masks */
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
typedef struct
{
int r, c;
} POS;
#define PLANE_TYPE_Y_NO_DC 0
#define PLANE_TYPE_Y2 1
#define PLANE_TYPE_UV 2
#define PLANE_TYPE_Y_WITH_DC 3
typedef char ENTROPY_CONTEXT;
typedef struct
{
ENTROPY_CONTEXT y1[4];
ENTROPY_CONTEXT u[2];
ENTROPY_CONTEXT v[2];
ENTROPY_CONTEXT y2;
} ENTROPY_CONTEXT_PLANES;
extern const unsigned char vp8_block2left[25];
extern const unsigned char vp8_block2above[25];
#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
Dest = (A)+(B);
typedef enum
{
KEY_FRAME = 0,
INTER_FRAME = 1
} FRAME_TYPE;
typedef enum
{
DC_PRED, /* average of above and left pixels */
V_PRED, /* vertical prediction */
H_PRED, /* horizontal prediction */
TM_PRED, /* Truemotion prediction */
B_PRED, /* block based prediction, each block has its own prediction mode */
NEARESTMV,
NEARMV,
ZEROMV,
NEWMV,
SPLITMV,
MB_MODE_COUNT
} MB_PREDICTION_MODE;
/* Macroblock level features */
typedef enum
{
MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
MB_LVL_MAX = 2 /* Number of MB level features supported */
} MB_LVL_FEATURES;
/* Segment Feature Masks */
#define SEGMENT_ALTQ 0x01
#define SEGMENT_ALT_LF 0x02
#define VP8_YMODES (B_PRED + 1)
#define VP8_UV_MODES (TM_PRED + 1)
#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
typedef enum
{
B_DC_PRED, /* average of above and left pixels */
B_TM_PRED,
B_VE_PRED, /* vertical prediction */
B_HE_PRED, /* horizontal prediction */
B_LD_PRED,
B_RD_PRED,
B_VR_PRED,
B_VL_PRED,
B_HD_PRED,
B_HU_PRED,
LEFT4X4,
ABOVE4X4,
ZERO4X4,
NEW4X4,
B_MODE_COUNT
} B_PREDICTION_MODE;
#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
union b_mode_info
{
B_PREDICTION_MODE as_mode;
int_mv mv;
};
typedef enum
{
INTRA_FRAME = 0,
LAST_FRAME = 1,
GOLDEN_FRAME = 2,
ALTREF_FRAME = 3,
MAX_REF_FRAMES = 4
} MV_REFERENCE_FRAME;
typedef struct
{
uint8_t mode, uv_mode;
uint8_t ref_frame;
uint8_t is_4x4;
int_mv mv;
uint8_t partitioning;
uint8_t mb_skip_coeff; /* does this mb have coefficients at all? 1 = no coefficients, 0 = tokens need to be decoded */
uint8_t need_to_clamp_mvs;
uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */
} MB_MODE_INFO;
typedef struct modeinfo
{
MB_MODE_INFO mbmi;
union b_mode_info bmi[16];
} MODE_INFO;
#if CONFIG_MULTI_RES_ENCODING
/* The mb-level information needed to be stored for higher-resolution encoder */
typedef struct
{
MB_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
int_mv mv;
int dissim; /* dissimilarity level of the macroblock */
} LOWER_RES_MB_INFO;
/* The frame-level information needed to be stored for higher-resolution
* encoder */
typedef struct
{
FRAME_TYPE frame_type;
int is_frame_dropped;
// The frame rate for the lowest resolution.
double low_res_framerate;
/* The frame number of each reference frames */
unsigned int low_res_ref_frames[MAX_REF_FRAMES];
// The video frame counter value for the key frame, for lowest resolution.
unsigned int key_frame_counter_value;
LOWER_RES_MB_INFO *mb_info;
} LOWER_RES_FRAME_INFO;
#endif
typedef struct blockd
{
short *qcoeff;
short *dqcoeff;
unsigned char *predictor;
short *dequant;
int offset;
char *eob;
union b_mode_info bmi;
} BLOCKD;
typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
typedef struct macroblockd
{
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
DECLARE_ALIGNED(16, short, dequant_y1[16]);
DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
DECLARE_ALIGNED(16, short, dequant_y2[16]);
DECLARE_ALIGNED(16, short, dequant_uv[16]);
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
int fullpixel_mask;
YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
YV12_BUFFER_CONFIG dst;
MODE_INFO *mode_info_context;
int mode_info_stride;
FRAME_TYPE frame_type;
int up_available;
int left_available;
unsigned char *recon_above[3];
unsigned char *recon_left[3];
int recon_left_stride[2];
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context;
ENTROPY_CONTEXT_PLANES *left_context;
/* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
unsigned char segmentation_enabled;
/* 0 (do not update) 1 (update) the macroblock segmentation map. */
unsigned char update_mb_segmentation_map;
/* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char update_mb_segmentation_data;
/* 0 (SEGMENT_DELTADATA) segment feature data is coded as deltas, 1 (SEGMENT_ABSDATA) as absolute values. */
unsigned char mb_segement_abs_delta;
/* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
/* are enabled, and when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
/* mode_based Loop filter adjustment */
unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;
/* Delta values have the range +/- MAX_LOOP_FILTER */
signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
vp8_subpix_fn_t subpixel_predict;
vp8_subpix_fn_t subpixel_predict8x4;
vp8_subpix_fn_t subpixel_predict8x8;
vp8_subpix_fn_t subpixel_predict16x16;
void *current_bc;
int corrupted;
#if ARCH_X86 || ARCH_X86_64
/* This is an intermediate buffer currently used in sub-pixel motion search
* to keep a copy of the reference area. This buffer can be used for other
* purpose.
*/
DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
#endif
} MACROBLOCKD;
extern void vp8_build_block_doffsets(MACROBLOCKD *x);
extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_BLOCKD_H_
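The 25 BLOCKD entries do not own storage; they alias the aligned buffers at the top of MACROBLOCKD (25 blocks x 16 coefficients = the 400-entry qcoeff/dqcoeff arrays). A sketch of the pointer wiring done by vp8_setup_block_dptrs, which is declared above but defined elsewhere in vp8/common (the exact offsets here are an assumption):
/* Sketch (assumed layout) of the wiring vp8_setup_block_dptrs performs:
 * each BLOCKD aliases its 16-coefficient slice of the shared buffers. */
static void setup_block_ptrs_sketch(MACROBLOCKD *x)
{
    int i;
    for (i = 0; i < 25; i++)
    {
        x->block[i].qcoeff  = x->qcoeff  + i * 16;
        x->block[i].dqcoeff = x->dqcoeff + i * 16;
        x->block[i].eob     = x->eobs + i;
    }
}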

View file

@ -0,0 +1,197 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
#define VP8_COMMON_COEFUPDATEPROBS_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Update probabilities for the nodes in the token entropy tree.
Generated file included by entropy.c */
const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
{
{
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
{249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
{234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
{250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
},
{
{
{217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
{234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
},
{
{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
},
{
{
{186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
{234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
{251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
},
{
{255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
},
{
{
{248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
{248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
},
{
{255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
{248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
{250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
{
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
},
},
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_COEFUPDATEPROBS_H_
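Each entry is the probability that the bitstream updates the corresponding node of the token-probability table: the frame header carries one bool per node, coded with this probability, followed by a new 8-bit value only when the bool is set. A decode-side sketch, paraphrasing the header parsing in vp8/decoder/decodframe.c (BOOL_DECODER, vp8_read/vp8_read_literal and fc->coef_probs come from the decoder's dboolhuff.h/treereader.h and onyxc_int.h):
/* Sketch of how this table is consumed when parsing a frame header,
 * paraphrasing vp8/decoder/decodframe.c. */
static void read_coef_prob_updates(BOOL_DECODER *bc, FRAME_CONTEXT *fc)
{
    int i, j, k, l;
    for (i = 0; i < BLOCK_TYPES; i++)
        for (j = 0; j < COEF_BANDS; j++)
            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
                for (l = 0; l < ENTROPY_NODES; l++)
                    if (vp8_read(bc, vp8_coef_update_probs[i][j][k][l]))
                        fc->coef_probs[i][j][k][l] =
                            (vp8_prob)vp8_read_literal(bc, 8);
}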

48
thirdparty/libvpx/vp8/common/common.h vendored Normal file
View file

@ -0,0 +1,48 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_COMMON_H_
#define VP8_COMMON_COMMON_H_
#include <assert.h>
/* Interface header for common constant data structures and lookup tables */
#include "vpx_mem/vpx_mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Only needed for fixed-size arrays; for structs, just assign. */
#define vp8_copy( Dest, Src) { \
assert( sizeof( Dest) == sizeof( Src)); \
memcpy( Dest, Src, sizeof( Src)); \
}
/* Use this for variably-sized arrays. */
#define vp8_copy_array( Dest, Src, N) { \
assert( sizeof( *Dest) == sizeof( *Src)); \
memcpy( Dest, Src, N * sizeof( *Src)); \
}
#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest));
#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest));
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_COMMON_H_
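The point of these macros over bare memcpy/memset is the size check: vp8_copy asserts that source and destination have identical total size, so silent type drift fails at run time in debug builds. A usage sketch:
/* Usage sketch for the macros above. */
static void copy_macros_demo(void)
{
    int dst[16], src[16] = {1, 2, 3};
    vp8_zero(dst);               /* memset(&dst, 0, sizeof(dst)) */
    vp8_copy(dst, src);          /* asserts sizeof(dst) == sizeof(src) */
    vp8_copy_array(dst, src, 8); /* asserts matching element size, copies 8 */
}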

32
thirdparty/libvpx/vp8/common/copy_c.c vendored Normal file
View file

@ -0,0 +1,32 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h>
#include "./vp8_rtcd.h"
#include "vpx/vpx_integer.h"
/* Copy 2 macroblocks to a buffer */
void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
unsigned char *dst_ptr, int dst_stride,
int height)
{
int r;
for (r = 0; r < height; r++)
{
memcpy(dst_ptr, src_ptr, 32);
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}

View file

@ -0,0 +1,155 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include "blockd.h"
void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
{
int mb_row;
int mb_col;
int mb_index = 0;
FILE *mvs = fopen("mvs.stt", "a");
if (mvs == NULL) return;
/* print out the macroblock Y modes */
mb_index = 0;
fprintf(mvs, "Mb Modes for Frame %d\n", frame);
for (mb_row = 0; mb_row < rows; mb_row++)
{
for (mb_col = 0; mb_col < cols; mb_col++)
{
fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
mb_index++;
}
fprintf(mvs, "\n");
mb_index++;
}
fprintf(mvs, "\n");
mb_index = 0;
fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
for (mb_row = 0; mb_row < rows; mb_row++)
{
for (mb_col = 0; mb_col < cols; mb_col++)
{
fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
mb_index++;
}
fprintf(mvs, "\n");
mb_index++;
}
fprintf(mvs, "\n");
/* print out the macroblock UV modes */
mb_index = 0;
fprintf(mvs, "UV Modes for Frame %d\n", frame);
for (mb_row = 0; mb_row < rows; mb_row++)
{
for (mb_col = 0; mb_col < cols; mb_col++)
{
fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
mb_index++;
}
mb_index++;
fprintf(mvs, "\n");
}
fprintf(mvs, "\n");
/* print out the block modes */
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
int b_row;
for (b_row = 0; b_row < 4 * rows; b_row++)
{
int b_col;
int bindex;
for (b_col = 0; b_col < 4 * cols; b_col++)
{
mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
bindex = (b_row & 3) * 4 + (b_col & 3);
if (mi[mb_index].mbmi.mode == B_PRED)
fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
else
fprintf(mvs, "xx ");
}
fprintf(mvs, "\n");
}
}
fprintf(mvs, "\n");
/* print out the macroblock mvs */
mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
for (mb_row = 0; mb_row < rows; mb_row++)
{
for (mb_col = 0; mb_col < cols; mb_col++)
{
fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
mb_index++;
}
mb_index++;
fprintf(mvs, "\n");
}
fprintf(mvs, "\n");
/* print out the block modes */
fprintf(mvs, "MVs for Frame %d\n", frame);
{
int b_row;
for (b_row = 0; b_row < 4 * rows; b_row++)
{
int b_col;
int bindex;
for (b_col = 0; b_col < 4 * cols; b_col++)
{
mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
bindex = (b_row & 3) * 4 + (b_col & 3);
fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
}
fprintf(mvs, "\n");
}
}
fprintf(mvs, "\n");
fclose(mvs);
}

View file

@ -0,0 +1,200 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
#ifdef __cplusplus
extern "C" {
#endif
/*Generated file, included by entropy.c*/
static const vp8_prob default_coef_probs [BLOCK_TYPES]
[COEF_BANDS]
[PREV_COEF_CONTEXTS]
[ENTROPY_NODES] =
{
{ /* Block Type ( 0 ) */
{ /* Coeff Band ( 0 )*/
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 1 )*/
{ 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
{ 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
{ 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
},
{ /* Coeff Band ( 2 )*/
{ 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
{ 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
{ 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
},
{ /* Coeff Band ( 3 )*/
{ 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
{ 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
{ 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 4 )*/
{ 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
{ 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
{ 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
},
{ /* Coeff Band ( 5 )*/
{ 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
{ 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
{ 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 6 )*/
{ 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
{ 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
{ 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 7 )*/
{ 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ /* Block Type ( 1 ) */
{ /* Coeff Band ( 0 )*/
{ 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
{ 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
{ 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
},
{ /* Coeff Band ( 1 )*/
{ 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
{ 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
{ 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
},
{ /* Coeff Band ( 2 )*/
{ 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
{ 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
{ 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
},
{ /* Coeff Band ( 3 )*/
{ 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
{ 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
{ 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
},
{ /* Coeff Band ( 4 )*/
{ 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
{ 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
{ 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
},
{ /* Coeff Band ( 5 )*/
{ 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
{ 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
{ 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
},
{ /* Coeff Band ( 6 )*/
{ 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
{ 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
{ 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
},
{ /* Coeff Band ( 7 )*/
{ 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
}
},
{ /* Block Type ( 2 ) */
{ /* Coeff Band ( 0 )*/
{ 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
{ 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
{ 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
},
{ /* Coeff Band ( 1 )*/
{ 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
{ 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
{ 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
},
{ /* Coeff Band ( 2 )*/
{ 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
{ 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
{ 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
},
{ /* Coeff Band ( 3 )*/
{ 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 4 )*/
{ 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
{ 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 5 )*/
{ 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 6 )*/
{ 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ /* Coeff Band ( 7 )*/
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ /* Block Type ( 3 ) */
{ /* Coeff Band ( 0 )*/
{ 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
{ 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
{ 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
},
{ /* Coeff Band ( 1 )*/
{ 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
{ 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
{ 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
},
{ /* Coeff Band ( 2 )*/
{ 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
{ 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
{ 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
},
{ /* Coeff Band ( 3 )*/
{ 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
{ 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
{ 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
},
{ /* Coeff Band ( 4 )*/
{ 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
{ 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
{ 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
},
{ /* Coeff Band ( 5 )*/
{ 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
{ 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
{ 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
},
{ /* Coeff Band ( 6 )*/
{ 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
{ 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
{ 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
},
{ /* Coeff Band ( 7 )*/
{ 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
}
}
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_

View file

@ -0,0 +1,43 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
{
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
for (i = 0; i < 16; i++)
{
DQ[i] = Q[i] * DQC[i];
}
}
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride)
{
int i;
for (i = 0; i < 16; i++)
{
input[i] = dq[i] * input[i];
}
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
memset(input, 0, 32);
}
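Dequantization is a plain per-position multiply: with a DC step size of 8, a quantized DC of 5 reconstructs as 40. The trailing memset(input, 0, 32) clears all 16 int16 coefficients (32 bytes) so the buffer is ready for the next block. A call-site sketch (paraphrasing the decoder; b is a BLOCKD from blockd.h, and dst/stride, which address the reconstruction buffer, are assumptions):
/* Call-site sketch, paraphrasing the decoder: dequantize, inverse
 * transform, and add the result into the reconstruction in one call. */
vp8_dequant_idct_add_c(b->qcoeff, b->dequant, dst, stride);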

188
thirdparty/libvpx/vp8/common/entropy.c vendored Normal file
View file

@ -0,0 +1,188 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "entropy.h"
#include "blockd.h"
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
#include "coefupdateprobs.h"
DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
DECLARE_ALIGNED(16, const unsigned char,
vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
{
0, 1, 4, 8,
5, 2, 3, 6,
9, 12, 13, 10,
7, 11, 14, 15,
};
DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
{
1, 2, 6, 7,
3, 5, 8, 13,
4, 9, 12, 14,
10, 11, 15, 16
};
/* vp8_default_zig_zag_mask generated with:
void vp8_init_scan_order_mask()
{
int i;
for (i = 0; i < 16; i++)
{
vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
}
}
*/
DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
{
1, 2, 32, 64,
4, 16, 128, 4096,
8, 256, 2048, 8192,
512, 1024, 16384, -32768
};
const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
/* Array indices are identical to previously-existing CONTEXT_NODE indices */
const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
{
-DCT_EOB_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
-ONE_TOKEN, 6, /* 2 = ONE */
8, 12, /* 3 = LOW_VAL */
-TWO_TOKEN, 10, /* 4 = TWO */
-THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
14, 16, /* 6 = HIGH_LOW */
-DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
18, 20, /* 8 = CAT_THREEFOUR */
-DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
-DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
};
/* vp8_coef_encodings generated with:
vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
*/
vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
{
{2, 2},
{6, 3},
{28, 5},
{58, 6},
{59, 6},
{60, 6},
{61, 6},
{124, 7},
{125, 7},
{126, 7},
{127, 7},
{0, 1}
};
/* Trees for extra bits. Probabilities are constant and
do not depend on previously encoded bits */
static const vp8_prob Pcat1[] = { 159};
static const vp8_prob Pcat2[] = { 165, 145};
static const vp8_prob Pcat3[] = { 173, 148, 140};
static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
static const vp8_prob Pcat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
/* tree index tables generated with:
void init_bit_tree(vp8_tree_index *p, int n)
{
int i = 0;
while (++i < n)
{
p[0] = p[1] = i << 1;
p += 2;
}
p[0] = p[1] = 0;
}
void init_bit_trees()
{
init_bit_tree(cat1, 1);
init_bit_tree(cat2, 2);
init_bit_tree(cat3, 3);
init_bit_tree(cat4, 4);
init_bit_tree(cat5, 5);
init_bit_tree(cat6, 11);
}
*/
static const vp8_tree_index cat1[2] = { 0, 0 };
static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
const vp8_extra_bit_struct vp8_extra_bits[12] =
{
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
{ 0, 0, 0, 3},
{ 0, 0, 0, 4},
{ cat1, Pcat1, 1, 5},
{ cat2, Pcat2, 2, 7},
{ cat3, Pcat3, 3, 11},
{ cat4, Pcat4, 4, 19},
{ cat5, Pcat5, 5, 35},
{ cat6, Pcat6, 11, 67},
{ 0, 0, 0, 0}
};
#include "default_coef_probs.h"
void vp8_default_coef_probs(VP8_COMMON *pc)
{
memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
}
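Two properties of these tables worth noting: vp8_norm[i] is the number of leading zero bits of i as a byte (used to renormalize the bool decoder's range), and vp8_default_inv_zig_zag is the 1-based inverse of vp8_default_zig_zag1d. A quick self-check of the latter:
#include <assert.h>
/* Self-check of the scan tables above: the inverse table is stored
 * 1-based, i.e. inv[zigzag[i]] == i + 1 for all 16 positions. */
static void check_zigzag_tables(void)
{
    int i;
    for (i = 0; i < 16; i++)
        assert(vp8_default_inv_zig_zag[vp8_default_zig_zag1d[i]] == i + 1);
}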

109
thirdparty/libvpx/vp8/common/entropy.h vendored Normal file
View file

@ -0,0 +1,109 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ENTROPY_H_
#define VP8_COMMON_ENTROPY_H_
#include "treecoder.h"
#include "blockd.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Coefficient token alphabet */
#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define MAX_ENTROPY_TOKENS 12
#define ENTROPY_NODES 11
extern const vp8_tree_index vp8_coef_tree[];
extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
typedef struct
{
vp8_tree_p tree;
const vp8_prob *prob;
int Len;
int base_val;
} vp8_extra_bit_struct;
extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */
#define PROB_UPDATE_BASELINE_COST 7
#define MAX_PROB 255
#define DCT_MAX_VALUE 2048
/* Coefficients are predicted via a 3-dimensional probability table. */
/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
#define BLOCK_TYPES 4
/* Middle dimension is a coarsening of the coefficient's
position within the 4x4 DCT. */
#define COEF_BANDS 8
extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
/* Inside dimension is a 3-valued measure of nearby complexity, that is,
the extent to which nearby coefficients are nonzero. For the first
coefficient (DC, unless block type is 0), we look at the (already encoded)
blocks above and to the left of the current block. The context index is
then the number (0,1,or 2) of these blocks having nonzero coefficients.
After decoding a coefficient, the measure is roughly the size of the
most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
Note that the intuitive meaning of this measure changes as coefficients
are decoded, e.g., prior to the first token, a zero means that my neighbors
are empty while, after the first token, because of the use of end-of-block,
a zero means we just decoded a zero and hence guarantees that a non-zero
coefficient will appear later in this block. However, this shift
in meaning is perfectly OK because our context depends also on the
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
# define PREV_COEF_CONTEXTS 3
extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
struct VP8Common;
void vp8_default_coef_probs(struct VP8Common *);
extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
void vp8_coef_tree_initialize(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPY_H_

View file

@ -0,0 +1,171 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#define USE_PREBUILT_TABLES
#include "entropymode.h"
#include "entropy.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8_entropymodedata.h"
int vp8_mv_cont(const int_mv *l, const int_mv *a)
{
int lez = (l->as_int == 0);
int aez = (a->as_int == 0);
int lea = (l->as_int == a->as_int);
if (lea && lez)
return SUBMVREF_LEFT_ABOVE_ZED;
if (lea)
return SUBMVREF_LEFT_ABOVE_SAME;
if (aez)
return SUBMVREF_ABOVE_ZED;
if (lez)
return SUBMVREF_LEFT_ZED;
return SUBMVREF_NORMAL;
}
static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
{
{ 147, 136, 18 },
{ 106, 145, 1 },
{ 179, 121, 1 },
{ 223, 1 , 34 },
{ 208, 1 , 1 }
};
const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
{
{
0, 0, 0, 0,
0, 0, 0, 0,
1, 1, 1, 1,
1, 1, 1, 1,
},
{
0, 0, 1, 1,
0, 0, 1, 1,
0, 0, 1, 1,
0, 0, 1, 1,
},
{
0, 0, 1, 1,
0, 0, 1, 1,
2, 2, 3, 3,
2, 2, 3, 3,
},
{
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
}
};
const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */
{
-B_DC_PRED, 2, /* 0 = DC_NODE */
-B_TM_PRED, 4, /* 1 = TM_NODE */
-B_VE_PRED, 6, /* 2 = VE_NODE */
8, 12, /* 3 = COM_NODE */
-B_HE_PRED, 10, /* 4 = HE_NODE */
-B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
-B_LD_PRED, 14, /* 6 = LD_NODE */
-B_VL_PRED, 16, /* 7 = VL_NODE */
-B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
};
/* Again, these trees use the same probability indices as their
explicitly-programmed predecessors. */
const vp8_tree_index vp8_ymode_tree[8] =
{
-DC_PRED, 2,
4, 6,
-V_PRED, -H_PRED,
-TM_PRED, -B_PRED
};
const vp8_tree_index vp8_kf_ymode_tree[8] =
{
-B_PRED, 2,
4, 6,
-DC_PRED, -V_PRED,
-H_PRED, -TM_PRED
};
const vp8_tree_index vp8_uv_mode_tree[6] =
{
-DC_PRED, 2,
-V_PRED, 4,
-H_PRED, -TM_PRED
};
const vp8_tree_index vp8_mbsplit_tree[6] =
{
-3, 2,
-2, 4,
-0, -1
};
const vp8_tree_index vp8_mv_ref_tree[8] =
{
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, 6,
-NEWMV, -SPLITMV
};
const vp8_tree_index vp8_sub_mv_ref_tree[6] =
{
-LEFT4X4, 2,
-ABOVE4X4, 4,
-ZERO4X4, -NEW4X4
};
const vp8_tree_index vp8_small_mvtree [14] =
{
2, 8,
4, 6,
-0, -1,
-2, -3,
10, 12,
-4, -5,
-6, -7
};
void vp8_init_mbmode_probs(VP8_COMMON *x)
{
memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
}
void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
{
memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
}
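All the vp8_tree_index arrays above share one encoding: a non-negative entry is the index of another interior node pair, and a negative entry is a leaf holding the negated symbol. The walker is a few lines, paraphrasing vp8_treed_read in vp8/decoder/treereader.h (BOOL_DECODER and vp8_read are the bool-decoder primitives from dboolhuff.h):
/* Tree walk, paraphrasing vp8/decoder/treereader.h: read one bool per
 * interior node (node i uses probability p[i >> 1]) and stop at the
 * first negative entry, which is the negated symbol. */
static int treed_read(BOOL_DECODER *bc, const vp8_tree_index *t,
                      const vp8_prob *p)
{
    vp8_tree_index i = 0;
    while ((i = t[i + vp8_read(bc, p[i >> 1])]) > 0)
        continue;
    return -i;
}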

View file

@ -0,0 +1,88 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ENTROPYMODE_H_
#define VP8_COMMON_ENTROPYMODE_H_
#include "onyxc_int.h"
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef enum
{
SUBMVREF_NORMAL,
SUBMVREF_LEFT_ZED,
SUBMVREF_ABOVE_ZED,
SUBMVREF_LEFT_ABOVE_SAME,
SUBMVREF_LEFT_ABOVE_ZED
} sumvfref_t;
typedef int vp8_mbsplit[16];
#define VP8_NUMMBSPLITS 4
extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */
extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
#define SUBMVREF_COUNT 5
extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
extern const vp8_tree_index vp8_bmode_tree[];
extern const vp8_tree_index vp8_ymode_tree[];
extern const vp8_tree_index vp8_kf_ymode_tree[];
extern const vp8_tree_index vp8_uv_mode_tree[];
extern const vp8_tree_index vp8_mbsplit_tree[];
extern const vp8_tree_index vp8_mv_ref_tree[];
extern const vp8_tree_index vp8_sub_mv_ref_tree[];
extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
/* Inter mode values do not start at zero */
extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
extern const vp8_tree_index vp8_small_mvtree[];
extern const struct vp8_token_struct vp8_small_mvencodings[8];
/* Key frame default mode probs */
extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
[VP8_BINTRAMODES-1];
extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
void vp8_init_mbmode_probs(VP8_COMMON *x);
void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPYMODE_H_

View file

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "entropymv.h"
const MV_CONTEXT vp8_mv_update_probs[2] =
{
{{
237,
246,
253, 253, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 250, 250, 252, 254, 254
}},
{{
231,
243,
245, 253, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 251, 251, 254, 254, 254
}}
};
const MV_CONTEXT vp8_default_mv_context[2] =
{
{{
/* row */
162, /* is short */
128, /* sign */
225, 146, 172, 147, 214, 39, 156, /* short tree */
128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
}},
{{
/* same for column */
164, /* is short */
128,
204, 170, 119, 235, 140, 230, 228,
128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
}}
};

View file

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ENTROPYMV_H_
#define VP8_COMMON_ENTROPYMV_H_
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
mv_max = 1023, /* max absolute value of a MV component */
MVvals = (2 * mv_max) + 1, /* # possible values "" */
mvfp_max = 255, /* max absolute value of a full pixel MV component */
MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
mvlong_width = 10, /* Large MVs have 9 bit magnitudes */
mvnum_short = 8, /* magnitudes 0 through 7 */
/* probability offsets for coding each MV component */
mvpis_short = 0, /* short (<= 7) vs long (>= 8) */
MVPsign, /* sign for non-zero */
MVPshort, /* 8 short values = 7-position tree */
MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */
};
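/* Worked layout of the offsets above: prob[MVPcount] holds 19 entries per
 * component -- index 0 the is-short flag, index 1 the sign, indices 2..8
 * the seven short-tree node probabilities, and indices 9..18 the ten
 * long-magnitude bit probabilities (2 + 7 + 10 = 19).  The 19-value rows
 * of vp8_default_mv_context follow this layout. */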
typedef struct mv_context
{
vp8_prob prob[MVPcount]; /* often come in row, col pairs */
} MV_CONTEXT;
extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPYMV_H_

188
thirdparty/libvpx/vp8/common/extend.c vendored Normal file
View file

@@ -0,0 +1,188 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "extend.h"
#include "vpx_mem/vpx_mem.h"
static void copy_and_extend_plane
(
unsigned char *s, /* source */
int sp, /* source pitch */
unsigned char *d, /* destination */
int dp, /* destination pitch */
int h, /* height */
int w, /* width */
int et, /* extend top border */
int el, /* extend left border */
int eb, /* extend bottom border */
int er /* extend right border */
)
{
int i;
unsigned char *src_ptr1, *src_ptr2;
unsigned char *dest_ptr1, *dest_ptr2;
int linesize;
/* copy the left and right most columns out */
src_ptr1 = s;
src_ptr2 = s + w - 1;
dest_ptr1 = d - el;
dest_ptr2 = d + w;
for (i = 0; i < h; i++)
{
memset(dest_ptr1, src_ptr1[0], el);
memcpy(dest_ptr1 + el, src_ptr1, w);
memset(dest_ptr2, src_ptr2[0], er);
src_ptr1 += sp;
src_ptr2 += sp;
dest_ptr1 += dp;
dest_ptr2 += dp;
}
/* Now copy the top and bottom lines into each line of the respective
* borders
*/
src_ptr1 = d - el;
src_ptr2 = d + dp * (h - 1) - el;
dest_ptr1 = d + dp * (-et) - el;
dest_ptr2 = d + dp * (h) - el;
linesize = el + er + w;
for (i = 0; i < et; i++)
{
memcpy(dest_ptr1, src_ptr1, linesize);
dest_ptr1 += dp;
}
for (i = 0; i < eb; i++)
{
memcpy(dest_ptr2, src_ptr2, linesize);
dest_ptr2 += dp;
}
}
void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst)
{
int et = dst->border;
int el = dst->border;
int eb = dst->border + dst->y_height - src->y_height;
int er = dst->border + dst->y_width - src->y_width;
copy_and_extend_plane(src->y_buffer, src->y_stride,
dst->y_buffer, dst->y_stride,
src->y_height, src->y_width,
et, el, eb, er);
et = dst->border >> 1;
el = dst->border >> 1;
eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
er = (dst->border >> 1) + dst->uv_width - src->uv_width;
copy_and_extend_plane(src->u_buffer, src->uv_stride,
dst->u_buffer, dst->uv_stride,
src->uv_height, src->uv_width,
et, el, eb, er);
copy_and_extend_plane(src->v_buffer, src->uv_stride,
dst->v_buffer, dst->uv_stride,
src->uv_height, src->uv_width,
et, el, eb, er);
}
void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst,
int srcy, int srcx,
int srch, int srcw)
{
int et = dst->border;
int el = dst->border;
int eb = dst->border + dst->y_height - src->y_height;
int er = dst->border + dst->y_width - src->y_width;
int src_y_offset = srcy * src->y_stride + srcx;
int dst_y_offset = srcy * dst->y_stride + srcx;
int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
/* If the side is not touching the border then don't extend. */
if (srcy)
et = 0;
if (srcx)
el = 0;
if (srcy + srch != src->y_height)
eb = 0;
if (srcx + srcw != src->y_width)
er = 0;
copy_and_extend_plane(src->y_buffer + src_y_offset,
src->y_stride,
dst->y_buffer + dst_y_offset,
dst->y_stride,
srch, srcw,
et, el, eb, er);
et = (et + 1) >> 1;
el = (el + 1) >> 1;
eb = (eb + 1) >> 1;
er = (er + 1) >> 1;
srch = (srch + 1) >> 1;
srcw = (srcw + 1) >> 1;
copy_and_extend_plane(src->u_buffer + src_uv_offset,
src->uv_stride,
dst->u_buffer + dst_uv_offset,
dst->uv_stride,
srch, srcw,
et, el, eb, er);
copy_and_extend_plane(src->v_buffer + src_uv_offset,
src->uv_stride,
dst->v_buffer + dst_uv_offset,
dst->uv_stride,
srch, srcw,
et, el, eb, er);
}
/* Note: the extension is only for the last row, for intra prediction purposes */
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
unsigned char *YPtr,
unsigned char *UPtr,
unsigned char *VPtr)
{
int i;
YPtr += ybf->y_stride * 14;
UPtr += ybf->uv_stride * 6;
VPtr += ybf->uv_stride * 6;
for (i = 0; i < 4; i++)
{
YPtr[i] = YPtr[-1];
UPtr[i] = UPtr[-1];
VPtr[i] = VPtr[-1];
}
YPtr += ybf->y_stride;
UPtr += ybf->uv_stride;
VPtr += ybf->uv_stride;
for (i = 0; i < 4; i++)
{
YPtr[i] = YPtr[-1];
UPtr[i] = UPtr[-1];
VPtr[i] = VPtr[-1];
}
}

33
thirdparty/libvpx/vp8/common/extend.h vendored Normal file
View file

@@ -0,0 +1,33 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_EXTEND_H_
#define VP8_COMMON_EXTEND_H_
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst,
int srcy, int srcx,
int srch, int srcw);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_EXTEND_H_

493
thirdparty/libvpx/vp8/common/filter.c vendored Normal file
View file

@@ -0,0 +1,493 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "filter.h"
#include "./vp8_rtcd.h"
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
{
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
{
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
{ 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
};
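/* Note on the tables above: the taps in each row sum to VP8_FILTER_WEIGHT
 * (128); e.g. -6 + 123 + 12 - 1 = 128.  The >> VP8_FILTER_SHIFT (7) in the
 * filter passes therefore renormalizes the accumulated sum to pixel range. */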
static void filter_block2d_first_pass
(
unsigned char *src_ptr,
int *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
output_ptr[j] = Temp;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
static void filter_block2d_second_pass
(
int *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply filter */
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
output_ptr[j] = (unsigned char)Temp;
src_ptr++;
}
/* Start next row */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_pitch;
}
}
static void filter_block2d
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
int output_pitch,
const short *HFilter,
const short *VFilter
)
{
int FData[9*4]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
/* then filter vertically... */
filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
}
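/* Note on the staging above: the horizontal pass writes 9 rows x 4 columns
 * of intermediate data (the 4 output rows plus the 2 rows above and 3 below
 * required by the 6-tap window).  The vertical pass starts at FData + 8,
 * i.e. two rows in, so its taps at row offsets -2..+3 stay inside FData. */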
void vp8_sixtap_predict4x4_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
void vp8_sixtap_predict8x8_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
int FData[13*16]; /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
/* then filter vertically... */
filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
}
void vp8_sixtap_predict8x4_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
int FData[13*16]; /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
/* then filter vertically... */
filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
}
void vp8_sixtap_predict16x16_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
int FData[21*24]; /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */
filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
/* then filter vertically... */
filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_stride : Stride of source block.
* UINT32 height : Block height.
* UINT32 width : Block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
* in the horizontal direction to produce the filtered output
* block. Used to implement first-pass of 2-D separable filter.
*
* SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
*
****************************************************************************/
static void filter_block2d_bil_first_pass
(
unsigned char *src_ptr,
unsigned short *dst_ptr,
unsigned int src_stride,
unsigned int height,
unsigned int width,
const short *vp8_filter
)
{
unsigned int i, j;
for (i = 0; i < height; i++)
{
for (j = 0; j < width; j++)
{
/* Apply bilinear filter */
dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++;
}
/* Next row... */
src_ptr += src_stride - width;
dst_ptr += width;
}
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_second_pass
*
* INPUTS : UINT16 *src_ptr : Pointer to source block.
* UINT32 dst_pitch : Destination block pitch.
* UINT32 height : Block height.
* UINT32 width : Block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
* in the vertical direction to produce the filtered output
* block. Used to implement second-pass of 2-D separable filter.
*
* SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
*
****************************************************************************/
static void filter_block2d_bil_second_pass
(
unsigned short *src_ptr,
unsigned char *dst_ptr,
int dst_pitch,
unsigned int height,
unsigned int width,
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
for (i = 0; i < height; i++)
{
for (j = 0; j < width; j++)
{
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++;
}
/* Next row... */
dst_ptr += dst_pitch;
}
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pitch : Stride of source block.
* UINT32 dst_pitch : Stride of destination block.
* INT32 *HFilter : Array of 2 horizontal filter taps.
* INT32 *VFilter : Array of 2 vertical filter taps.
* INT32 Width : Block width
* INT32 Height : Block height
*
* OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : 2-D filters an input block by applying a 2-tap
* bi-linear filter horizontally followed by a 2-tap
* bi-linear filter vertically on the result.
*
* SPECIAL NOTES : The largest block size that can be handled here is 16x16
*
****************************************************************************/
static void filter_block2d_bil
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
unsigned int src_pitch,
unsigned int dst_pitch,
const short *HFilter,
const short *VFilter,
int Width,
int Height
)
{
unsigned short FData[17*16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
/* then 1-D vertically... */
filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
}
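/* Note on the staging above: the horizontal pass produces Height + 1
 * intermediate rows because the vertical 2-tap pass reads each output row
 * and the row below it (src_ptr[0] and src_ptr[width]).  FData is sized
 * 17x16 to cover the largest (16x16) block. */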
void vp8_bilinear_predict4x4_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
#if 0
{
int i;
unsigned char temp1[16];
unsigned char temp2[16];
bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
for (i = 0; i < 16; i++)
{
if (temp1[i] != temp2[i])
{
bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
}
}
}
#endif
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
}
void vp8_bilinear_predict8x8_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
}
void vp8_bilinear_predict8x4_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
}
void vp8_bilinear_predict16x16_c
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
}

32
thirdparty/libvpx/vp8/common/filter.h vendored Normal file
View file

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_FILTER_H_
#define VP8_COMMON_FILTER_H_
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_FILTER_H_

View file

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "findnearmv.h"
const unsigned char vp8_mbsplit_offset[4][16] = {
{ 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
};
/* Predict motion vectors using those from already-decoded nearby blocks.
Note that we only consider one 4x4 subblock from each candidate 16x16
macroblock. */
void vp8_find_near_mvs
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv *nearest,
int_mv *nearby,
int_mv *best_mv,
int cnt[4],
int refframe,
int *ref_frame_sign_bias
)
{
const MODE_INFO *above = here - xd->mode_info_stride;
const MODE_INFO *left = here - 1;
const MODE_INFO *aboveleft = above - 1;
int_mv near_mvs[4];
int_mv *mv = near_mvs;
int *cntx = cnt;
enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
/* Zero accumulators */
mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
/* Process above */
if (above->mbmi.ref_frame != INTRA_FRAME)
{
if (above->mbmi.mv.as_int)
{
(++mv)->as_int = above->mbmi.mv.as_int;
mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
++cntx;
}
*cntx += 2;
}
/* Process left */
if (left->mbmi.ref_frame != INTRA_FRAME)
{
if (left->mbmi.mv.as_int)
{
int_mv this_mv;
this_mv.as_int = left->mbmi.mv.as_int;
mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
if (this_mv.as_int != mv->as_int)
{
(++mv)->as_int = this_mv.as_int;
++cntx;
}
*cntx += 2;
}
else
cnt[CNT_INTRA] += 2;
}
/* Process above left */
if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
{
if (aboveleft->mbmi.mv.as_int)
{
int_mv this_mv;
this_mv.as_int = aboveleft->mbmi.mv.as_int;
mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
if (this_mv.as_int != mv->as_int)
{
(++mv)->as_int = this_mv.as_int;
++cntx;
}
*cntx += 1;
}
else
cnt[CNT_INTRA] += 1;
}
/* If we have three distinct MV's ... */
if (cnt[CNT_SPLITMV])
{
/* See if above-left MV can be merged with NEAREST */
if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
cnt[CNT_NEAREST] += 1;
}
cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ (left->mbmi.mode == SPLITMV)) * 2
+ (aboveleft->mbmi.mode == SPLITMV);
/* Swap near and nearest if necessary */
if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
{
int tmp;
tmp = cnt[CNT_NEAREST];
cnt[CNT_NEAREST] = cnt[CNT_NEAR];
cnt[CNT_NEAR] = tmp;
tmp = near_mvs[CNT_NEAREST].as_int;
near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
near_mvs[CNT_NEAR].as_int = tmp;
}
/* Use near_mvs[0] to store the "best" MV */
if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
/* Set up return values */
best_mv->as_int = near_mvs[0].as_int;
nearest->as_int = near_mvs[CNT_NEAREST].as_int;
nearby->as_int = near_mvs[CNT_NEAR].as_int;
}
static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
{
inv->as_mv.row = src->as_mv.row * -1;
inv->as_mv.col = src->as_mv.col * -1;
vp8_clamp_mv2(inv, xd);
vp8_clamp_mv2(src, xd);
}
int vp8_find_near_mvs_bias
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv mode_mv_sb[2][MB_MODE_COUNT],
int_mv best_mv_sb[2],
int cnt[4],
int refframe,
int *ref_frame_sign_bias
)
{
int sign_bias = ref_frame_sign_bias[refframe];
vp8_find_near_mvs(xd,
here,
&mode_mv_sb[sign_bias][NEARESTMV],
&mode_mv_sb[sign_bias][NEARMV],
&best_mv_sb[sign_bias],
cnt,
refframe,
ref_frame_sign_bias);
invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
&mode_mv_sb[sign_bias][NEARESTMV], xd);
invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
&mode_mv_sb[sign_bias][NEARMV], xd);
invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
&best_mv_sb[sign_bias], xd);
return sign_bias;
}
vp8_prob *vp8_mv_ref_probs(
vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
)
{
p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
/*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
return p;
}

View file

@@ -0,0 +1,195 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_FINDNEARMV_H_
#define VP8_COMMON_FINDNEARMV_H_
#include "./vpx_config.h"
#include "mv.h"
#include "blockd.h"
#include "modecont.h"
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
int_mv *mvp, const int *ref_frame_sign_bias)
{
if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
{
mvp->as_mv.row *= -1;
mvp->as_mv.col *= -1;
}
}
#define LEFT_TOP_MARGIN (16 << 3)
#define RIGHT_BOTTOM_MARGIN (16 << 3)
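/* Note: the mb_to_*_edge distances these margins are compared against are
 * pixel counts shifted left by 3, so (16 << 3) lets a motion vector point
 * up to 16 pixels past the frame edge into the extended border. */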
static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
{
if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
}
static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
int mb_to_right_edge, int mb_to_top_edge,
int mb_to_bottom_edge)
{
mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
mb_to_left_edge : mv->as_mv.col;
mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
mb_to_right_edge : mv->as_mv.col;
mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
mb_to_top_edge : mv->as_mv.row;
mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
mb_to_bottom_edge : mv->as_mv.row;
}
static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
int mb_to_right_edge,
int mb_to_top_edge,
int mb_to_bottom_edge)
{
unsigned int need_to_clamp;
need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
return need_to_clamp;
}
void vp8_find_near_mvs
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv *nearest, int_mv *nearby, int_mv *best,
int near_mv_ref_cts[4],
int refframe,
int *ref_frame_sign_bias
);
int vp8_find_near_mvs_bias
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv mode_mv_sb[2][MB_MODE_COUNT],
int_mv best_mv_sb[2],
int cnt[4],
int refframe,
int *ref_frame_sign_bias
);
vp8_prob *vp8_mv_ref_probs(
vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
);
extern const unsigned char vp8_mbsplit_offset[4][16];
static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
{
if (!(b & 3))
{
/* On L edge, get from MB to left of us */
--cur_mb;
if(cur_mb->mbmi.mode != SPLITMV)
return cur_mb->mbmi.mv.as_int;
b += 4;
}
return (cur_mb->bmi + b - 1)->mv.as_int;
}
static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
int mi_stride)
{
if (!(b >> 2))
{
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
if(cur_mb->mbmi.mode != SPLITMV)
return cur_mb->mbmi.mv.as_int;
b += 16;
}
return (cur_mb->bmi + (b - 4))->mv.as_int;
}
static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
{
if (!(b & 3))
{
/* On L edge, get from MB to left of us */
--cur_mb;
switch (cur_mb->mbmi.mode)
{
case B_PRED:
return (cur_mb->bmi + b + 3)->as_mode;
case DC_PRED:
return B_DC_PRED;
case V_PRED:
return B_VE_PRED;
case H_PRED:
return B_HE_PRED;
case TM_PRED:
return B_TM_PRED;
default:
return B_DC_PRED;
}
}
return (cur_mb->bmi + b - 1)->as_mode;
}
static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
int mi_stride)
{
if (!(b >> 2))
{
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
switch (cur_mb->mbmi.mode)
{
case B_PRED:
return (cur_mb->bmi + b + 12)->as_mode;
case DC_PRED:
return B_DC_PRED;
case V_PRED:
return B_VE_PRED;
case H_PRED:
return B_HE_PRED;
case TM_PRED:
return B_TM_PRED;
default:
return B_DC_PRED;
}
}
return (cur_mb->bmi + b - 4)->as_mode;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_FINDNEARMV_H_

View file

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#if ARCH_ARM
#include "vpx_ports/arm.h"
#elif ARCH_X86 || ARCH_X86_64
#include "vpx_ports/x86.h"
#endif
#include "vp8/common/onyxc_int.h"
#include "vp8/common/systemdependent.h"
#if CONFIG_MULTITHREAD
#if HAVE_UNISTD_H && !defined(__OS2__)
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
#elif defined(__OS2__)
#define INCL_DOS
#define INCL_DOSSPINLOCK
#include <os2.h>
#endif
#endif
#if CONFIG_MULTITHREAD
static int get_cpu_count()
{
int core_count = 16;
#if HAVE_UNISTD_H && !defined(__OS2__)
#if defined(_SC_NPROCESSORS_ONLN)
core_count = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
core_count = sysconf(_SC_NPROC_ONLN);
#endif
#elif defined(_WIN32)
{
#if _WIN32_WINNT >= 0x0501
SYSTEM_INFO sysinfo;
GetNativeSystemInfo(&sysinfo);
#else
PGNSI pGNSI;
SYSTEM_INFO sysinfo;
/* Call GetNativeSystemInfo if supported or
* GetSystemInfo otherwise. */
pGNSI = (PGNSI) GetProcAddress(
GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
if (pGNSI != NULL)
pGNSI(&sysinfo);
else
GetSystemInfo(&sysinfo);
#endif
core_count = sysinfo.dwNumberOfProcessors;
}
#elif defined(__OS2__)
{
ULONG proc_id;
ULONG status;
core_count = 0;
for (proc_id = 1; ; proc_id++)
{
if (DosGetProcessorStatus(proc_id, &status))
break;
if (status == PROC_ONLINE)
core_count++;
}
}
#else
/* other platforms */
#endif
return core_count > 0 ? core_count : 1;
}
#endif
void vp8_clear_system_state_c(void) {}
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
#if CONFIG_MULTITHREAD
ctx->processor_core_count = get_cpu_count();
#else
(void)ctx;
#endif /* CONFIG_MULTITHREAD */
#if ARCH_ARM
ctx->cpu_caps = arm_cpu_caps();
#elif ARCH_X86 || ARCH_X86_64
ctx->cpu_caps = x86_simd_caps();
#endif
}

51
thirdparty/libvpx/vp8/common/header.h vendored Normal file
View file

@@ -0,0 +1,51 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_HEADER_H_
#define VP8_COMMON_HEADER_H_
#ifdef __cplusplus
extern "C" {
#endif
/* 24 bits total */
typedef struct
{
unsigned int type: 1;
unsigned int version: 3;
unsigned int show_frame: 1;
/* Allow 2^20 bytes = 8 megabits for first partition */
unsigned int first_partition_length_in_bytes: 19;
#ifdef PACKET_TESTING
unsigned int frame_number;
unsigned int update_gold: 1;
unsigned int uses_gold: 1;
unsigned int update_last: 1;
unsigned int uses_last: 1;
#endif
} VP8_HEADER;
#ifdef PACKET_TESTING
#define VP8_HEADER_SIZE 8
#else
#define VP8_HEADER_SIZE 3
#endif
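/* Illustrative sketch (not part of the upstream header): since C bitfield
 * layout is implementation-defined, the 3-byte header can be unpacked from
 * a byte buffer `data` explicitly, matching the field order above:
 *
 *     unsigned int raw = data[0] | (data[1] << 8) | (data[2] << 16);
 *     int type                = raw & 1;
 *     int version             = (raw >> 1) & 7;
 *     int show_frame          = (raw >> 4) & 1;
 *     unsigned int first_part = (raw >> 5) & 0x7ffff;
 */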
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_HEADER_H_

90
thirdparty/libvpx/vp8/common/idct_blk.c vendored Normal file
View file

@@ -0,0 +1,90 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_mem/vpx_mem.h"
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride);
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
int pred_stride, unsigned char *dst_ptr,
int dst_stride);
void vp8_dequant_idct_add_y_block_c
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i, j;
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, dst, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
dst += 4;
}
dst += 4*stride - 16;
}
}
void vp8_dequant_idct_add_uv_block_c
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i, j;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, dstu, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
dstu += 4;
}
dstu += 4*stride - 8;
}
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, dstv, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
dstv += 4;
}
dstv += 4*stride - 8;
}
}

205
thirdparty/libvpx/vp8/common/idctllm.c vendored Normal file
View file

@@ -0,0 +1,205 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
/****************************************************************************
* Notes:
*
* This implementation makes use of 16 bit fixed point versions of two multiply
* constants:
* 1. sqrt(2) * cos (pi/8)
* 2. sqrt(2) * sin (pi/8)
* Because the first constant is bigger than 1, to maintain the same 16 bit
* fixed point precision as the second one, we use a trick of
* x * a = x + x*(a-1)
* so
* x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
**************************************************************************/
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
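/* Worked values for the constants above: sqrt(2) * cos(pi/8) ~= 1.30656,
 * so (1.30656 - 1) * 65536 ~= 20091, and sqrt(2) * sin(pi/8) ~= 0.54120,
 * so 0.54120 * 65536 ~= 35468.  A multiply by the first constant is thus
 * computed as x + ((x * cospi8sqrt2minus1) >> 16). */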
void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride)
{
int i;
int r, c;
int a1, b1, c1, d1;
short output[16];
short *ip = input;
short *op = output;
int temp1, temp2;
int shortpitch = 4;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[8];
b1 = ip[0] - ip[8];
temp1 = (ip[4] * sinpi8sqrt2) >> 16;
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
temp2 = (ip[12] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
op[shortpitch*0] = a1 + d1;
op[shortpitch*3] = a1 - d1;
op[shortpitch*1] = b1 + c1;
op[shortpitch*2] = b1 - c1;
ip++;
op++;
}
ip = output;
op = output;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[2];
b1 = ip[0] - ip[2];
temp1 = (ip[1] * sinpi8sqrt2) >> 16;
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
temp2 = (ip[3] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
op[0] = (a1 + d1 + 4) >> 3;
op[3] = (a1 - d1 + 4) >> 3;
op[1] = (b1 + c1 + 4) >> 3;
op[2] = (b1 - c1 + 4) >> 3;
ip += shortpitch;
op += shortpitch;
}
ip = output;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = ip[c] + pred_ptr[c] ;
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a ;
}
ip += 4;
dst_ptr += dst_stride;
pred_ptr += pred_stride;
}
}
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride)
{
int a1 = ((input_dc + 4) >> 3);
int r, c;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = a1 + pred_ptr[c] ;
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a ;
}
dst_ptr += dst_stride;
pred_ptr += pred_stride;
}
}
void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
{
short output[16];
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
short *ip = input;
short *op = output;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[12];
b1 = ip[4] + ip[8];
c1 = ip[4] - ip[8];
d1 = ip[0] - ip[12];
op[0] = a1 + b1;
op[4] = c1 + d1;
op[8] = a1 - b1;
op[12] = d1 - c1;
ip++;
op++;
}
ip = output;
op = output;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[3];
b1 = ip[1] + ip[2];
c1 = ip[1] - ip[2];
d1 = ip[0] - ip[3];
a2 = a1 + b1;
b2 = c1 + d1;
c2 = a1 - b1;
d2 = d1 - c1;
op[0] = (a2 + 3) >> 3;
op[1] = (b2 + 3) >> 3;
op[2] = (c2 + 3) >> 3;
op[3] = (d2 + 3) >> 3;
ip += 4;
op += 4;
}
for(i = 0; i < 16; i++)
{
mb_dqcoeff[i * 16] = output[i];
}
}
void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
{
int i;
int a1;
a1 = ((input[0] + 3) >> 3);
for(i = 0; i < 16; i++)
{
mb_dqcoeff[i * 16] = a1;
}
}

70
thirdparty/libvpx/vp8/common/invtrans.h vendored Normal file
View file

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_INVTRANS_H_
#define VP8_COMMON_INVTRANS_H_
#include "./vpx_config.h"
#include "vp8_rtcd.h"
#include "blockd.h"
#include "onyxc_int.h"
#if CONFIG_MULTITHREAD
#include "vpx_mem/vpx_mem.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
static void eob_adjust(char *eobs, short *diff)
{
/* eob adjust.... the idct can only skip if both the dc and eob are zero */
int js;
for(js = 0; js < 16; js++)
{
if((eobs[js] == 0) && (diff[0] != 0))
eobs[js]++;
diff+=16;
}
}
static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
{
short *DQC = xd->dequant_y1;
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
vp8_short_inv_walsh4x4
(&xd->block[24].dqcoeff[0], xd->qcoeff);
}
else
{
vp8_short_inv_walsh4x4_1
(&xd->block[24].dqcoeff[0], xd->qcoeff);
}
eob_adjust(xd->eobs, xd->qcoeff);
DQC = xd->dequant_y1_dc;
}
vp8_dequant_idct_add_y_block
(xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_INVTRANS_H_

View file

@@ -0,0 +1,113 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_LOOPFILTER_H_
#define VP8_COMMON_LOOPFILTER_H_
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_LOOP_FILTER 63
/* fraction of total macroblock rows to be used in fast filter level picking */
/* has to be > 2 */
#define PARTIAL_FRAME_FRACTION 8
typedef enum
{
NORMAL_LOOPFILTER = 0,
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
#if ARCH_ARM
#define SIMD_WIDTH 1
#else
#define SIMD_WIDTH 16
#endif
/* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
*/
typedef struct
{
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
unsigned char lvl[4][4][4];
unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
unsigned char mode_lf_lut[10];
} loop_filter_info_n;
typedef struct loop_filter_info
{
const unsigned char * mblim;
const unsigned char * blim;
const unsigned char * lim;
const unsigned char * hev_thr;
} loop_filter_info;
typedef void loop_filter_uvfunction
(
unsigned char *u, /* source pointer */
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
unsigned char *v
);
/* assorted loopfilter functions which get used elsewhere */
struct VP8Common;
struct macroblockd;
struct modeinfo;
void vp8_loop_filter_init(struct VP8Common *cm);
void vp8_loop_filter_frame_init(struct VP8Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
int frame_type);
void vp8_loop_filter_partial_frame(struct VP8Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
struct macroblockd *mbd,
int default_filt_lvl);
void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl);
void vp8_loop_filter_row_normal(struct VP8Common *cm,
struct modeinfo *mode_info_context,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr);
void vp8_loop_filter_row_simple(struct VP8Common *cm,
struct modeinfo *mode_info_context,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_LOOPFILTER_H_

View file

@@ -0,0 +1,430 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "loopfilter.h"
#include "onyxc_int.h"
typedef unsigned char uc;
static signed char vp8_signed_char_clamp(int t)
{
t = (t < -128 ? -128 : t);
t = (t > 127 ? 127 : t);
return (signed char) t;
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static signed char vp8_filter_mask(uc limit, uc blimit,
uc p3, uc p2, uc p1, uc p0,
uc q0, uc q1, uc q2, uc q3)
{
signed char mask = 0;
mask |= (abs(p3 - p2) > limit);
mask |= (abs(p2 - p1) > limit);
mask |= (abs(p1 - p0) > limit);
mask |= (abs(q1 - q0) > limit);
mask |= (abs(q2 - q1) > limit);
mask |= (abs(q3 - q2) > limit);
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
return mask - 1;
}
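/* Note on the return value above: each comparison contributes 0 or 1, so
 * mask ends up 1 if any difference exceeded its limit and 0 otherwise;
 * mask - 1 therefore yields 0x00 (suppress filtering across a likely real
 * edge) or 0xff (apply the filter), matching the convention noted above. */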
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
static void vp8_filter(signed char mask, uc hev, uc *op1,
uc *op0, uc *oq0, uc *oq1)
{
signed char ps0, qs0;
signed char ps1, qs1;
signed char filter_value, Filter1, Filter2;
signed char u;
ps1 = (signed char) * op1 ^ 0x80;
ps0 = (signed char) * op0 ^ 0x80;
qs0 = (signed char) * oq0 ^ 0x80;
qs1 = (signed char) * oq1 ^ 0x80;
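/* The XOR with 0x80 flips the sign bit, remapping unsigned pixels
 * [0, 255] to signed values [-128, 127] (i.e. value - 128), so the
 * clamped filter arithmetic below can be done in signed chars. */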
/* add outer taps if we have high edge variance */
filter_value = vp8_signed_char_clamp(ps1 - qs1);
filter_value &= hev;
/* inner taps */
filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
filter_value &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3
* if it equals 4 we'll set to adjust by -1 to account for the fact
* we'd round 3 the other way
*/
Filter1 = vp8_signed_char_clamp(filter_value + 4);
Filter2 = vp8_signed_char_clamp(filter_value + 3);
Filter1 >>= 3;
Filter2 >>= 3;
u = vp8_signed_char_clamp(qs0 - Filter1);
*oq0 = u ^ 0x80;
u = vp8_signed_char_clamp(ps0 + Filter2);
*op0 = u ^ 0x80;
filter_value = Filter1;
/* outer tap adjustments */
filter_value += 1;
filter_value >>= 1;
filter_value &= ~hev;
u = vp8_signed_char_clamp(qs1 - filter_value);
*oq1 = u ^ 0x80;
u = vp8_signed_char_clamp(ps1 + filter_value);
*op1 = u ^ 0x80;
}
void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
while (++i < count * 8);
}
void vp8_loop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
s += p;
}
while (++i < count * 8);
}
static void vp8_mbfilter(signed char mask, uc hev,
uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
{
signed char s, u;
signed char filter_value, Filter1, Filter2;
signed char ps2 = (signed char) * op2 ^ 0x80;
signed char ps1 = (signed char) * op1 ^ 0x80;
signed char ps0 = (signed char) * op0 ^ 0x80;
signed char qs0 = (signed char) * oq0 ^ 0x80;
signed char qs1 = (signed char) * oq1 ^ 0x80;
signed char qs2 = (signed char) * oq2 ^ 0x80;
/* add outer taps if we have high edge variance */
filter_value = vp8_signed_char_clamp(ps1 - qs1);
filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
filter_value &= mask;
Filter2 = filter_value;
Filter2 &= hev;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(Filter2 + 4);
Filter2 = vp8_signed_char_clamp(Filter2 + 3);
Filter1 >>= 3;
Filter2 >>= 3;
qs0 = vp8_signed_char_clamp(qs0 - Filter1);
ps0 = vp8_signed_char_clamp(ps0 + Filter2);
/* only apply wider filter if not high edge variance */
filter_value &= ~hev;
Filter2 = filter_value;
/* roughly 3/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
s = vp8_signed_char_clamp(qs0 - u);
*oq0 = s ^ 0x80;
s = vp8_signed_char_clamp(ps0 + u);
*op0 = s ^ 0x80;
/* roughly 2/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
s = vp8_signed_char_clamp(qs1 - u);
*oq1 = s ^ 0x80;
s = vp8_signed_char_clamp(ps1 + u);
*op1 = s ^ 0x80;
/* roughly 1/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
s = vp8_signed_char_clamp(qs2 - u);
*oq2 = s ^ 0x80;
s = vp8_signed_char_clamp(ps2 + u);
*op2 = s ^ 0x80;
}
void vp8_mbloop_filter_horizontal_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
++s;
}
while (++i < count * 8);
}
void vp8_mbloop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
s += p;
}
while (++i < count * 8);
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
{
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
return mask;
}
static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
{
signed char filter_value, Filter1, Filter2;
signed char p1 = (signed char) * op1 ^ 0x80;
signed char p0 = (signed char) * op0 ^ 0x80;
signed char q0 = (signed char) * oq0 ^ 0x80;
signed char q1 = (signed char) * oq1 ^ 0x80;
signed char u;
filter_value = vp8_signed_char_clamp(p1 - q1);
filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0));
filter_value &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(filter_value + 4);
Filter1 >>= 3;
u = vp8_signed_char_clamp(q0 - Filter1);
*oq0 = u ^ 0x80;
Filter2 = vp8_signed_char_clamp(filter_value + 3);
Filter2 >>= 3;
u = vp8_signed_char_clamp(p0 + Filter2);
*op0 = u ^ 0x80;
}
void vp8_loop_filter_simple_horizontal_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
)
{
signed char mask = 0;
int i = 0;
do
{
mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
while (++i < 16);
}
void vp8_loop_filter_simple_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
)
{
signed char mask = 0;
int i = 0;
do
{
mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
}
while (++i < 16);
}
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
}

68
thirdparty/libvpx/vp8/common/mbpitch.c vendored Normal file
View file

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "blockd.h"
void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
int r, c;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
}
}
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
}
}
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
}
}
for (r = 0; r < 25; r++)
{
x->block[r].qcoeff = x->qcoeff + r * 16;
x->block[r].dqcoeff = x->dqcoeff + r * 16;
x->block[r].eob = x->eobs + r;
}
}
void vp8_build_block_doffsets(MACROBLOCKD *x)
{
int block;
for (block = 0; block < 16; block++) /* y blocks */
{
x->block[block].offset =
(block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
}
for (block = 16; block < 20; block++) /* U and V blocks */
{
x->block[block+4].offset =
x->block[block].offset =
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
}
}

40
thirdparty/libvpx/vp8/common/modecont.c vendored Normal file
View file

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "entropy.h"
const int vp8_mode_contexts[6][4] =
{
{
/* 0 */
7, 1, 1, 143,
},
{
/* 1 */
14, 18, 14, 107,
},
{
/* 2 */
135, 64, 57, 68,
},
{
/* 3 */
60, 56, 128, 65,
},
{
/* 4 */
159, 134, 128, 34,
},
{
/* 5 */
234, 188, 128, 28,
},
};

25
thirdparty/libvpx/vp8/common/modecont.h vendored Normal file
View file

@@ -0,0 +1,25 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_MODECONT_H_
#define VP8_COMMON_MODECONT_H_
#ifdef __cplusplus
extern "C" {
#endif
extern const int vp8_mode_contexts[6][4];
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_MODECONT_H_

36
thirdparty/libvpx/vp8/common/mv.h vendored Normal file
View file

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_MV_H_
#define VP8_COMMON_MV_H_
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct
{
short row;
short col;
} MV;
typedef union int_mv
{
uint32_t as_int;
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
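/* e.g. a single 32-bit compare or store handles both components at once:
 *     if (a.as_int == b.as_int) ... ;    dst.as_int = src.as_int; */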
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_MV_H_

185
thirdparty/libvpx/vp8/common/onyxc_int.h vendored Normal file
View file

@@ -0,0 +1,185 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ONYXC_INT_H_
#define VP8_COMMON_ONYXC_INT_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "loopfilter.h"
#include "entropymv.h"
#include "entropy.h"
#if CONFIG_POSTPROC
#include "postproc.h"
#endif
/*#ifdef PACKET_TESTING*/
#include "header.h"
/*#endif*/
#ifdef __cplusplus
extern "C" {
#endif
#define MINQ 0
#define MAXQ 127
#define QINDEX_RANGE (MAXQ + 1)
#define NUM_YV12_BUFFERS 4
#define MAX_PARTITIONS 9
typedef struct frame_contexts
{
vp8_prob bmode_prob [VP8_BINTRAMODES-1];
vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
vp8_prob uv_mode_prob [VP8_UV_MODES-1];
vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
MV_CONTEXT mvc[2];
} FRAME_CONTEXT;
typedef enum
{
ONE_PARTITION = 0,
TWO_PARTITION = 1,
FOUR_PARTITION = 2,
EIGHT_PARTITION = 3
} TOKEN_PARTITION;
typedef enum
{
RECON_CLAMP_REQUIRED = 0,
RECON_CLAMP_NOTREQUIRED = 1
} CLAMP_TYPE;
typedef struct VP8Common
{
struct vpx_internal_error_info error;
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
int Width;
int Height;
int horiz_scale;
int vert_scale;
CLAMP_TYPE clamp_type;
YV12_BUFFER_CONFIG *frame_to_show;
YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
YV12_BUFFER_CONFIG temp_scale_frame;
#if CONFIG_POSTPROC
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG post_proc_buffer_int;
int post_proc_buffer_int_used;
unsigned char *pp_limits_buffer; /* post-processing filter coefficients */
#endif
FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
FRAME_TYPE frame_type;
int show_frame;
int frame_flags;
int MBs;
int mb_rows;
int mb_cols;
int mode_info_stride;
/* profile settings */
int mb_no_coeff_skip;
int no_lpf;
int use_bilinear_mc_filter;
int full_pixel;
int base_qindex;
int y1dc_delta_q;
int y2dc_delta_q;
int y2ac_delta_q;
int uvdc_delta_q;
int uvac_delta_q;
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
#if CONFIG_ERROR_CONCEALMENT
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
#endif
MODE_INFO *show_frame_mi; /* MODE_INFO for the last decoded frame
to show */
LOOPFILTERTYPE filter_type;
loop_filter_info_n lf_info;
int filter_level;
int last_sharpness_level;
int sharpness_level;
int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
FRAME_CONTEXT lfc; /* last frame entropy */
FRAME_CONTEXT fc; /* this frame entropy */
unsigned int current_video_frame;
int version;
TOKEN_PARTITION multi_token_partition;
#ifdef PACKET_TESTING
VP8_HEADER oh;
#endif
#if CONFIG_POSTPROC_VISUALIZER
double bitrate;
double framerate;
#endif
#if CONFIG_MULTITHREAD
int processor_core_count;
#endif
#if CONFIG_POSTPROC
struct postproc_state postproc_state;
#endif
int cpu_caps;
} VP8_COMMON;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ONYXC_INT_H_

63
thirdparty/libvpx/vp8/common/onyxd.h vendored Normal file
View file

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_ONYXD_H_
#define VP8_COMMON_ONYXD_H_
/* Create/destroy static data structures. */
#ifdef __cplusplus
extern "C"
{
#endif
#include "vpx_scale/yv12config.h"
#include "ppflags.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_codec.h"
#include "vpx/vp8.h"
struct VP8D_COMP;
typedef struct
{
int Width;
int Height;
int Version;
int postprocess;
int max_threads;
int error_concealment;
} VP8D_CONFIG;
typedef enum
{
VP8D_OK = 0
} VP8D_SETTING;
void vp8dx_initialize(void);
void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
size_t size, const uint8_t *dest,
int64_t time_stamp);
int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
#ifdef __cplusplus
}
#endif
#endif // VP8_COMMON_ONYXD_H_

49
thirdparty/libvpx/vp8/common/ppflags.h vendored Normal file
View file

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_PPFLAGS_H_
#define VP8_COMMON_PPFLAGS_H_
#ifdef __cplusplus
extern "C" {
#endif
enum
{
VP8D_NOFILTERING = 0,
VP8D_DEBLOCK = 1<<0,
VP8D_DEMACROBLOCK = 1<<1,
VP8D_ADDNOISE = 1<<2,
VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
VP8D_DEBUG_DRAW_MV = 1<<7,
VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
VP8D_MFQE = 1<<10
};
typedef struct
{
int post_proc_flag;
int deblocking_level;
int noise_level;
int display_ref_frame_flag;
int display_mb_modes_flag;
int display_b_modes_flag;
int display_mv_flag;
} vp8_ppflags_t;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_PPFLAGS_H_

135
thirdparty/libvpx/vp8/common/quant_common.c vendored Normal file
View file

@@ -0,0 +1,135 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "quant_common.h"
static const int dc_qlookup[QINDEX_RANGE] =
{
4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17,
18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
};
static const int ac_qlookup[QINDEX_RANGE] =
{
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
};
int vp8_dc_quant(int QIndex, int Delta)
{
int retval;
QIndex = QIndex + Delta;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
retval = dc_qlookup[ QIndex ];
return retval;
}
int vp8_dc2quant(int QIndex, int Delta)
{
int retval;
QIndex = QIndex + Delta;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
retval = dc_qlookup[ QIndex ] * 2;
return retval;
}
int vp8_dc_uv_quant(int QIndex, int Delta)
{
int retval;
QIndex = QIndex + Delta;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
retval = dc_qlookup[ QIndex ];
if (retval > 132)
retval = 132;
return retval;
}
int vp8_ac_yquant(int QIndex)
{
int retval;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
retval = ac_qlookup[ QIndex ];
return retval;
}
int vp8_ac2quant(int QIndex, int Delta)
{
int retval;
QIndex = QIndex + Delta;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
/* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
* The smallest precision for that is '(x*6349) >> 12' but 16 is a good
* word size. */
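/* Worked check at the top of the range: 284 * 155 / 100 = 440, and
 * (284 * 101581) >> 16 = 28849004 >> 16 = 440. */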
retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
if (retval < 8)
retval = 8;
return retval;
}
int vp8_ac_uv_quant(int QIndex, int Delta)
{
int retval;
QIndex = QIndex + Delta;
if (QIndex > 127)
QIndex = 127;
else if (QIndex < 0)
QIndex = 0;
retval = ac_qlookup[ QIndex ];
return retval;
}

34
thirdparty/libvpx/vp8/common/quant_common.h vendored Normal file
View file

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_QUANT_COMMON_H_
#define VP8_COMMON_QUANT_COMMON_H_
#include "string.h"
#include "blockd.h"
#include "onyxc_int.h"
#ifdef __cplusplus
extern "C" {
#endif
extern int vp8_ac_yquant(int QIndex);
extern int vp8_dc_quant(int QIndex, int Delta);
extern int vp8_dc2quant(int QIndex, int Delta);
extern int vp8_ac2quant(int QIndex, int Delta);
extern int vp8_dc_uv_quant(int QIndex, int Delta);
extern int vp8_ac_uv_quant(int QIndex, int Delta);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_QUANT_COMMON_H_

544
thirdparty/libvpx/vp8/common/reconinter.c vendored Normal file
View file

@@ -0,0 +1,544 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include <string.h>
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx/vpx_integer.h"
#include "blockd.h"
#include "reconinter.h"
#if CONFIG_RUNTIME_CPU_DETECT
#include "onyxc_int.h"
#endif
void vp8_copy_mem16x16_c(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride)
{
int r;
for (r = 0; r < 16; r++)
{
memcpy(dst, src, 16);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem8x8_c(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride)
{
int r;
for (r = 0; r < 8; r++)
{
memcpy(dst, src, 8);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem8x4_c(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride)
{
int r;
for (r = 0; r < 4; r++)
{
memcpy(dst, src, 8);
src += src_stride;
dst += dst_stride;
}
}
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
{
int r;
unsigned char *pred_ptr = d->predictor;
unsigned char *ptr;
ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
}
else
{
for (r = 0; r < 4; r++)
{
pred_ptr[0] = ptr[0];
pred_ptr[1] = ptr[1];
pred_ptr[2] = ptr[2];
pred_ptr[3] = ptr[3];
pred_ptr += pitch;
ptr += pre_stride;
}
}
}
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
{
unsigned char *ptr;
ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
}
}
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
{
unsigned char *ptr;
ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
}
}
static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
{
int r;
unsigned char *ptr;
ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
for (r = 0; r < 4; r++)
{
dst[0] = ptr[0];
dst[1] = ptr[1];
dst[2] = ptr[2];
dst[3] = ptr[3];
dst += dst_stride;
ptr += pre_stride;
}
}
}
/*encoder only*/
void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
{
unsigned char *uptr, *vptr;
unsigned char *upred_ptr = &x->predictor[256];
unsigned char *vpred_ptr = &x->predictor[320];
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int offset;
int pre_stride = x->pre.uv_stride;
/* calc uv motion vectors */
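    /* Halve each luma MV component, rounding away from zero: the sign
     * shift yields -1 for negative values and 0 otherwise, so (1 | sign)
     * adds +1 or -1 before the truncating divide by 2. fullpixel_mask
     * clears the subpel bits when only full-pixel MVs are allowed. */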
mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
mv_row /= 2;
mv_col /= 2;
mv_row &= x->fullpixel_mask;
mv_col &= x->fullpixel_mask;
offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
uptr = x->pre.u_buffer + offset;
vptr = x->pre.v_buffer + offset;
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
}
else
{
vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
}
}
/*encoder only*/
void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
{
int i, j;
int pre_stride = x->pre.uv_stride;
unsigned char *base_pre;
/* build uv mvs */
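    /* Each chroma MV component is the average of the four co-located luma
     * 4x4 MVs: adding 4 with the sign of the sum before the truncating
     * divide by 8 rounds to nearest, with ties away from zero. */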
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
int yoffset = i * 8 + j * 2;
int uoffset = 16 + i * 2 + j;
int voffset = 20 + i * 2 + j;
int temp;
temp = x->block[yoffset ].bmi.mv.as_mv.row
+ x->block[yoffset+1].bmi.mv.as_mv.row
+ x->block[yoffset+4].bmi.mv.as_mv.row
+ x->block[yoffset+5].bmi.mv.as_mv.row;
temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
temp = x->block[yoffset ].bmi.mv.as_mv.col
+ x->block[yoffset+1].bmi.mv.as_mv.col
+ x->block[yoffset+4].bmi.mv.as_mv.col
+ x->block[yoffset+5].bmi.mv.as_mv.col;
temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
}
}
base_pre = x->pre.u_buffer;
for (i = 16; i < 20; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
else
{
vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
}
}
base_pre = x->pre.v_buffer;
for (i = 20; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
else
{
vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
}
}
}
/*encoder only*/
void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
unsigned char *dst_y,
int dst_ystride)
{
unsigned char *ptr_base;
unsigned char *ptr;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->pre.y_stride;
ptr_base = x->pre.y_buffer;
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
dst_y, dst_ystride);
}
else
{
vp8_copy_mem16x16(ptr, pre_stride, dst_y,
dst_ystride);
}
}
static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
{
/* If the MV points so far into the UMV border that no visible pixels
* are used for reconstruction, the subpel part of the MV can be
* discarded and the MV limited to 16 pixels with equivalent results.
*
* This limit kicks in at 19 pixels for the top and left edges, for
* the 16 pixels plus 3 taps right of the central pixel when subpel
* filtering. The bottom and right edges use 16 pixels plus 2 pixels
* left of the central pixel when filtering.
*/
if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
mv->col = xd->mb_to_left_edge - (16 << 3);
else if (mv->col > xd->mb_to_right_edge + (18 << 3))
mv->col = xd->mb_to_right_edge + (16 << 3);
if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
mv->row = xd->mb_to_top_edge - (16 << 3);
else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
mv->row = xd->mb_to_bottom_edge + (16 << 3);
}
/* A version of the above function for chroma block MVs.*/
static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
{
mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ?
(xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ?
(xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ?
(xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ?
(xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
}
void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride)
{
int offset;
unsigned char *ptr;
unsigned char *uptr, *vptr;
int_mv _16x16mv;
unsigned char *ptr_base = x->pre.y_buffer;
int pre_stride = x->pre.y_stride;
_16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
if (x->mode_info_context->mbmi.need_to_clamp_mvs)
{
clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
}
ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
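    /* The low three bits of each packed 16-bit component are the
     * eighth-pel fraction; masking as_int with 0x00070007 tests both
     * components for a subpel offset in a single operation. */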
if ( _16x16mv.as_int & 0x00070007)
{
x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
}
else
{
vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
}
/* calc uv motion vectors */
_16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
_16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
_16x16mv.as_mv.row /= 2;
_16x16mv.as_mv.col /= 2;
_16x16mv.as_mv.row &= x->fullpixel_mask;
_16x16mv.as_mv.col &= x->fullpixel_mask;
pre_stride >>= 1;
offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
uptr = x->pre.u_buffer + offset;
vptr = x->pre.v_buffer + offset;
if ( _16x16mv.as_int & 0x00070007)
{
x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
}
else
{
vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
}
}
static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
int i;
unsigned char *base_dst = x->dst.y_buffer;
unsigned char *base_pre = x->pre.y_buffer;
if (x->mode_info_context->mbmi.partitioning < 3)
{
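        /* Partitionings 0-2 (16x8, 8x16, 8x8) carry at most four distinct
         * MVs; blocks 0, 2, 8 and 10 are the top-left 4x4 of each 8x8
         * quadrant, so four 8x8 predictions cover the whole macroblock. */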
BLOCKD *b;
int dst_stride = x->dst.y_stride;
x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
x->block[10].bmi = x->mode_info_context->bmi[10];
if (x->mode_info_context->mbmi.need_to_clamp_mvs)
{
clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
}
b = &x->block[ 0];
build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
b = &x->block[ 2];
build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
b = &x->block[ 8];
build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
b = &x->block[10];
build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
}
else
{
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
int dst_stride = x->dst.y_stride;
x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
if (x->mode_info_context->mbmi.need_to_clamp_mvs)
{
clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
}
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
else
{
build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
}
}
}
base_dst = x->dst.u_buffer;
base_pre = x->pre.u_buffer;
for (i = 16; i < 20; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
int dst_stride = x->dst.uv_stride;
/* Note: uv mvs already clamped in build_4x4uvmvs() */
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
else
{
build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
}
}
base_dst = x->dst.v_buffer;
base_pre = x->pre.v_buffer;
for (i = 20; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
int dst_stride = x->dst.uv_stride;
/* Note: uv mvs already clamped in build_4x4uvmvs() */
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
else
{
build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
}
}
}
static
void build_4x4uvmvs(MACROBLOCKD *x)
{
int i, j;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
int yoffset = i * 8 + j * 2;
int uoffset = 16 + i * 2 + j;
int voffset = 20 + i * 2 + j;
int temp;
temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+ x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+ x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+ x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+ x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+ x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+ x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
if (x->mode_info_context->mbmi.need_to_clamp_mvs)
clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
}
}
}
void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
{
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.y_stride, xd->dst.uv_stride);
}
else
{
build_4x4uvmvs(xd);
build_inter4x4_predictors_mb(xd);
}
}

43
thirdparty/libvpx/vp8/common/reconinter.h vendored Normal file
View file

@@ -0,0 +1,43 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_RECONINTER_H_
#define VP8_COMMON_RECONINTER_H_
#ifdef __cplusplus
extern "C" {
#endif
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
unsigned char *dst_y,
int dst_ystride);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
unsigned char *base_pre,
int pre_stride,
vp8_subpix_fn_t sppf);
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_RECONINTER_H_

117
thirdparty/libvpx/vp8/common/reconintra.c vendored Normal file
View file

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vp8_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_once.h"
#include "blockd.h"
#include "vp8/common/reconintra.h"
#include "vp8/common/reconintra4x4.h"
enum {
SIZE_16,
SIZE_8,
NUM_SIZES,
};
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
static intra_pred_fn pred[4][NUM_SIZES];
static intra_pred_fn dc_pred[2][2][NUM_SIZES];
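/* dc_pred is indexed [left available][above available]: with neither
 * neighbour the fixed 128 predictor is used, with one neighbour the
 * corresponding one-sided DC average, and with both the full DC
 * predictor. */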
static void vp8_init_intra_predictors_internal(void)
{
#define INIT_SIZE(sz) \
pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
\
dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
INIT_SIZE(16);
INIT_SIZE(8);
vp8_init_intra4x4_predictors_internal();
}
void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
unsigned char * yabove_row,
unsigned char * yleft,
int left_stride,
unsigned char * ypred_ptr,
int y_stride)
{
MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode;
DECLARE_ALIGNED(16, uint8_t, yleft_col[16]);
int i;
intra_pred_fn fn;
for (i = 0; i < 16; i++)
{
yleft_col[i] = yleft[i* left_stride];
}
if (mode == DC_PRED)
{
fn = dc_pred[x->left_available][x->up_available][SIZE_16];
}
else
{
fn = pred[mode][SIZE_16];
}
fn(ypred_ptr, y_stride, yabove_row, yleft_col);
}
void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
unsigned char * uabove_row,
unsigned char * vabove_row,
unsigned char * uleft,
unsigned char * vleft,
int left_stride,
unsigned char * upred_ptr,
unsigned char * vpred_ptr,
int pred_stride)
{
MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
unsigned char uleft_col[8];
unsigned char vleft_col[8];
int i;
intra_pred_fn fn;
for (i = 0; i < 8; i++)
{
uleft_col[i] = uleft[i * left_stride];
vleft_col[i] = vleft[i * left_stride];
}
if (uvmode == DC_PRED)
{
fn = dc_pred[x->left_available][x->up_available][SIZE_8];
}
else
{
fn = pred[uvmode][SIZE_8];
}
fn(upred_ptr, pred_stride, uabove_row, uleft_col);
fn(vpred_ptr, pred_stride, vabove_row, vleft_col);
}
void vp8_init_intra_predictors(void)
{
once(vp8_init_intra_predictors_internal);
}

44
thirdparty/libvpx/vp8/common/reconintra.h vendored Normal file
View file

@@ -0,0 +1,44 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_RECONINTRA_H_
#define VP8_COMMON_RECONINTRA_H_
#include "vp8/common/blockd.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
unsigned char *yabove_row,
unsigned char *yleft,
int left_stride,
unsigned char *ypred_ptr,
int y_stride);
void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
unsigned char * uabove_row,
unsigned char * vabove_row,
unsigned char * uleft,
unsigned char * vleft,
int left_stride,
unsigned char * upred_ptr,
unsigned char * vpred_ptr,
int pred_stride);
void vp8_init_intra_predictors(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_RECONINTRA_H_

54
thirdparty/libvpx/vp8/common/reconintra4x4.c vendored Normal file
View file

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <string.h>
#include "vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8_rtcd.h"
#include "blockd.h"
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
static intra_pred_fn pred[10];
void vp8_init_intra4x4_predictors_internal(void)
{
pred[B_DC_PRED] = vpx_dc_predictor_4x4;
pred[B_TM_PRED] = vpx_tm_predictor_4x4;
pred[B_VE_PRED] = vpx_ve_predictor_4x4;
pred[B_HE_PRED] = vpx_he_predictor_4x4;
pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
pred[B_RD_PRED] = vpx_d135_predictor_4x4;
pred[B_VR_PRED] = vpx_d117_predictor_4x4;
pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
pred[B_HD_PRED] = vpx_d153_predictor_4x4;
pred[B_HU_PRED] = vpx_d207_predictor_4x4;
}
void vp8_intra4x4_predict(unsigned char *above,
unsigned char *yleft, int left_stride,
B_PREDICTION_MODE b_mode,
unsigned char *dst, int dst_stride,
unsigned char top_left)
{
unsigned char Left[4];
unsigned char Aboveb[12], *Above = Aboveb + 4;
Left[0] = yleft[0];
Left[1] = yleft[left_stride];
Left[2] = yleft[2 * left_stride];
Left[3] = yleft[3 * left_stride];
memcpy(Above, above, 8);
Above[-1] = top_left;
pred[b_mode](dst, dst_stride, Above, Left);
}

48
thirdparty/libvpx/vp8/common/reconintra4x4.h vendored Normal file
View file

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_RECONINTRA4X4_H_
#define VP8_COMMON_RECONINTRA4X4_H_
#include "vp8/common/blockd.h"
#ifdef __cplusplus
extern "C" {
#endif
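/* Replicate the 32-bit word holding the macroblock's four above-right
 * pixels into the rows that serve as "above-right" context for the lower
 * 4x4 block rows, so diagonal intra predictors at the right edge see
 * valid samples. */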
static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
unsigned char *above_right_src)
{
int dst_stride = xd->dst.y_stride;
unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
unsigned int *src_ptr = (unsigned int *)above_right_src;
unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
*dst_ptr0 = *src_ptr;
*dst_ptr1 = *src_ptr;
*dst_ptr2 = *src_ptr;
}
void vp8_intra4x4_predict(unsigned char *Above,
unsigned char *yleft, int left_stride,
B_PREDICTION_MODE b_mode,
unsigned char *dst, int dst_stride,
unsigned char top_left);
void vp8_init_intra4x4_predictors_internal(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_RECONINTRA4X4_H_

19
thirdparty/libvpx/vp8/common/rtcd.c vendored Normal file
View file

@@ -0,0 +1,19 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#define RTCD_C
#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vp8_rtcd()
{
once(setup_rtcd_internal);
}

39
thirdparty/libvpx/vp8/common/setupintrarecon.c vendored Normal file
View file

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "setupintrarecon.h"
#include "vpx_mem/vpx_mem.h"
void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
{
int i;
/* set up new frame for intra coded blocks */
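    /* Per the VP8 spec, samples above the frame predict as 127 and
     * samples to the left as 129; writing these borders lets edge
     * macroblocks use the ordinary intra predictors unchanged. */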
memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
for (i = 0; i < ybf->y_height; i++)
ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
}
void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
{
memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
}

45
thirdparty/libvpx/vp8/common/setupintrarecon.h vendored Normal file
View file

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_SETUPINTRARECON_H_
#define VP8_COMMON_SETUPINTRARECON_H_
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
unsigned char *u_buffer,
unsigned char *v_buffer,
int y_stride,
int uv_stride)
{
int i;
for (i = 0; i < 16; i++)
y_buffer[y_stride *i] = (unsigned char) 129;
for (i = 0; i < 8; i++)
u_buffer[uv_stride *i] = (unsigned char) 129;
for (i = 0; i < 8; i++)
v_buffer[uv_stride *i] = (unsigned char) 129;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SETUPINTRARECON_H_

34
thirdparty/libvpx/vp8/common/swapyv12buffer.c vendored Normal file
View file

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "swapyv12buffer.h"
void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
{
unsigned char *temp;
temp = last_frame->buffer_alloc;
last_frame->buffer_alloc = new_frame->buffer_alloc;
new_frame->buffer_alloc = temp;
temp = last_frame->y_buffer;
last_frame->y_buffer = new_frame->y_buffer;
new_frame->y_buffer = temp;
temp = last_frame->u_buffer;
last_frame->u_buffer = new_frame->u_buffer;
new_frame->u_buffer = temp;
temp = last_frame->v_buffer;
last_frame->v_buffer = new_frame->v_buffer;
new_frame->v_buffer = temp;
}

27
thirdparty/libvpx/vp8/common/swapyv12buffer.h vendored Normal file
View file

@@ -0,0 +1,27 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
#define VP8_COMMON_SWAPYV12BUFFER_H_
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SWAPYV12BUFFER_H_

27
thirdparty/libvpx/vp8/common/systemdependent.h vendored Normal file
View file

@@ -0,0 +1,27 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
#define VP8_COMMON_SYSTEMDEPENDENT_H_
#include "vpx_config.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP8Common;
void vp8_machine_specific_config(struct VP8Common *);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SYSTEMDEPENDENT_H_

232
thirdparty/libvpx/vp8/common/threading.h vendored Normal file
View file

@@ -0,0 +1,232 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_THREADING_H_
#define VP8_COMMON_THREADING_H_
#include "./vpx_config.h"
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
/* Thread management macros */
#if defined(_WIN32) && !HAVE_PTHREAD_H
/* Win32 */
#include <process.h>
#include <windows.h>
#define THREAD_FUNCTION unsigned int __stdcall
#define THREAD_FUNCTION_RETURN DWORD
#define THREAD_SPECIFIC_INDEX DWORD
#define pthread_t HANDLE
#define pthread_attr_t DWORD
#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
#define thread_sleep(nms) Sleep(nms)
#define pthread_cancel(thread) terminate_thread(thread,0)
#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
#define pthread_self() GetCurrentThreadId()
#elif defined(__OS2__)
/* OS/2 */
#define INCL_DOS
#include <os2.h>
#include <stdlib.h>
#define THREAD_FUNCTION void *
#define THREAD_FUNCTION_RETURN void *
#define THREAD_SPECIFIC_INDEX PULONG
#define pthread_t TID
#define pthread_attr_t ULONG
#define pthread_detach(thread) 0
#define thread_sleep(nms) DosSleep(nms)
#define pthread_cancel(thread) DosKillThread(thread)
#define ts_key_create(ts_key, destructor) \
DosAllocThreadLocalMemory(1, &(ts_key));
#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
#define pthread_self() _gettid()
#else
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#include <time.h>
#include <unistd.h>
#else
#include <semaphore.h>
#endif
#include <pthread.h>
/* pthreads */
/* Nearly everything is already defined */
#define THREAD_FUNCTION void *
#define THREAD_FUNCTION_RETURN void *
#define THREAD_SPECIFIC_INDEX pthread_key_t
#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
#endif
/* Synchronization macros: Win32 and Pthreads */
#if defined(_WIN32) && !HAVE_PTHREAD_H
#define sem_t HANDLE
#define pause(voidpara) __asm PAUSE
#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
#define thread_sleep(nms) Sleep(nms)
#elif defined(__OS2__)
typedef struct
{
HEV event;
HMTX wait_mutex;
HMTX count_mutex;
int count;
} sem_t;
static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
{
DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
value > 0 ? TRUE : FALSE);
DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
sem->count = value;
return 0;
}
static inline int sem_wait(sem_t * sem)
{
DosRequestMutexSem(sem->wait_mutex, -1);
DosWaitEventSem(sem->event, -1);
DosRequestMutexSem(sem->count_mutex, -1);
sem->count--;
if (sem->count == 0)
{
ULONG post_count;
DosResetEventSem(sem->event, &post_count);
}
DosReleaseMutexSem(sem->count_mutex);
DosReleaseMutexSem(sem->wait_mutex);
return 0;
}
static inline int sem_post(sem_t * sem)
{
DosRequestMutexSem(sem->count_mutex, -1);
if (sem->count < 32768)
{
sem->count++;
DosPostEventSem(sem->event);
}
DosReleaseMutexSem(sem->count_mutex);
return 0;
}
static inline int sem_destroy(sem_t * sem)
{
DosCloseEventSem(sem->event);
DosCloseMutexSem(sem->wait_mutex);
DosCloseMutexSem(sem->count_mutex);
return 0;
}
#define thread_sleep(nms) DosSleep(nms)
#else
#ifdef __APPLE__
#define sem_t semaphore_t
#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
#define sem_wait(sem) (semaphore_wait(*sem) )
#define sem_post(sem) semaphore_signal(*sem)
#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
#else
#include <unistd.h>
#include <sched.h>
#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
#endif
/* Not Windows. Assume pthreads */
#endif
#if ARCH_X86 || ARCH_X86_64
#include "vpx_ports/x86.h"
#else
#define x86_pause_hint()
#endif
#include "vpx_util/vpx_thread.h"
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
const int kMaxTryLocks = 4000;
int locked = 0;
int i;
for (i = 0; i < kMaxTryLocks; ++i) {
if (!pthread_mutex_trylock(mutex)) {
locked = 1;
break;
}
}
if (!locked)
pthread_mutex_lock(mutex);
}
static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
int ret;
mutex_lock(mutex);
ret = *p;
pthread_mutex_unlock(mutex);
return ret;
}
static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
const int *last_row_current_mb_col,
const int nsync) {
while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
x86_pause_hint();
thread_sleep(0);
}
}
static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
mutex_lock(mutex);
*p = v;
pthread_mutex_unlock(mutex);
}
#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_THREADING_H_

143
thirdparty/libvpx/vp8/common/treecoder.c vendored Normal file
View file

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#if CONFIG_DEBUG
#include <assert.h>
#endif
#include <stdio.h>
#include "treecoder.h"
static void tree2tok(
struct vp8_token_struct *const p,
vp8_tree t,
int i,
int v,
int L
)
{
v += v;
++L;
do
{
const vp8_tree_index j = t[i++];
if (j <= 0)
{
p[-j].value = v;
p[-j].Len = L;
}
else
tree2tok(p, t, j, v, L);
}
while (++v & 1);
}
void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
{
tree2tok(p, t, 0, 0, 0);
}
void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
int offset)
{
tree2tok(p - offset, t, 0, 0, 0);
}
static void branch_counts(
int n, /* n = size of alphabet */
vp8_token tok [ /* n */ ],
vp8_tree tree,
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ]
)
{
const int tree_len = n - 1;
int t = 0;
#if CONFIG_DEBUG
assert(tree_len);
#endif
do
{
branch_ct[t][0] = branch_ct[t][1] = 0;
}
while (++t < tree_len);
t = 0;
do
{
int L = tok[t].Len;
const int enc = tok[t].value;
const unsigned int ct = num_events[t];
vp8_tree_index i = 0;
do
{
const int b = (enc >> --L) & 1;
const int j = i >> 1;
#if CONFIG_DEBUG
assert(j < tree_len && 0 <= L);
#endif
branch_ct [j] [b] += ct;
i = tree[ i + b];
}
while (i > 0);
#if CONFIG_DEBUG
assert(!L);
#endif
}
while (++t < n);
}
void vp8_tree_probs_from_distribution(
int n, /* n = size of alphabet */
vp8_token tok [ /* n */ ],
vp8_tree tree,
vp8_prob probs [ /* n-1 */ ],
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ],
unsigned int Pfac,
int rd
)
{
const int tree_len = n - 1;
int t = 0;
branch_counts(n, tok, tree, branch_ct, num_events);
do
{
const unsigned int *const c = branch_ct[t];
const unsigned int tot = c[0] + c[1];
#if CONFIG_DEBUG
assert(tot < (1 << 24)); /* no overflow below */
#endif
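        /* Probability of taking the 0-branch, scaled so that Pfac
         * represents certainty and optionally rounded (rd); clamped to
         * [1, 255] because 0 is not a valid vp8_prob. */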
if (tot)
{
const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
}
else
probs[t] = vp8_prob_half;
}
while (++t < tree_len);
}

98
thirdparty/libvpx/vp8/common/treecoder.h vendored Normal file
View file

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_TREECODER_H_
#define VP8_COMMON_TREECODER_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned char vp8bc_index_t; /* probability index */
typedef unsigned char vp8_prob;
#define vp8_prob_half ( (vp8_prob) 128)
typedef signed char vp8_tree_index;
struct bool_coder_spec;
typedef struct bool_coder_spec bool_coder_spec;
typedef struct bool_writer bool_writer;
typedef struct bool_reader bool_reader;
typedef const bool_coder_spec c_bool_coder_spec;
typedef const bool_writer c_bool_writer;
typedef const bool_reader c_bool_reader;
# define vp8_complement( x) (255 - x)
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of vp8_tree_indices.
Array index often references a corresponding probability table.
Index <= 0 means done encoding/decoding and value = -Index,
Index > 0 means need another bit, specification at index.
Nonnegative indices are always even; processing begins at node 0. */
typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
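/* For example, a three-symbol alphabet could use
 *     const vp8_tree_index t[4] = { 0, 2, -1, -2 };
 * at node 0 a 0-bit yields symbol 0 and a 1-bit continues at index 2,
 * where a 0-bit yields symbol 1 and a 1-bit symbol 2. */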
typedef const struct vp8_token_struct
{
int value;
int Len;
} vp8_token;
/* Construct encoding array from tree. */
void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
int offset);
/* Convert array of token occurrence counts into a table of probabilities
for the associated binary encoding tree. Also writes count of branches
taken for each node on the tree; this facilitates decisions as to
probability updates. */
void vp8_tree_probs_from_distribution(
int n, /* n = size of alphabet */
vp8_token tok [ /* n */ ],
vp8_tree tree,
vp8_prob probs [ /* n-1 */ ],
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ],
unsigned int Pfactor,
int Round
);
/* Variant of above using coder spec rather than hardwired 8-bit probs. */
void vp8bc_tree_probs_from_distribution(
int n, /* n = size of alphabet */
vp8_token tok [ /* n */ ],
vp8_tree tree,
vp8_prob probs [ /* n-1 */ ],
unsigned int branch_ct [ /* n-1 */ ] [2],
const unsigned int num_events[ /* n */ ],
c_bool_coder_spec *s
);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_TREECODER_H_

254
thirdparty/libvpx/vp8/common/vp8_entropymodedata.h vendored Normal file
View file

@@ -0,0 +1,254 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
#ifdef __cplusplus
extern "C" {
#endif
/*Generated file, included by entropymode.c*/
const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
{
{ 0, 1 },
{ 2, 2 },
{ 6, 3 },
{ 28, 5 },
{ 30, 5 },
{ 58, 6 },
{ 59, 6 },
{ 62, 6 },
{ 126, 7 },
{ 127, 7 }
};
const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
{
{ 0, 1 },
{ 4, 3 },
{ 5, 3 },
{ 6, 3 },
{ 7, 3 }
};
const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
{
{ 4, 3 },
{ 5, 3 },
{ 6, 3 },
{ 7, 3 },
{ 0, 1 }
};
const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
{
{ 0, 1 },
{ 2, 2 },
{ 6, 3 },
{ 7, 3 }
};
const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
{
{ 6, 3 },
{ 7, 3 },
{ 2, 2 },
{ 0, 1 }
};
const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
{
{ 2, 2 },
{ 6, 3 },
{ 0, 1 },
{ 14, 4 },
{ 15, 4 }
};
const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
{
{ 0, 1 },
{ 2, 2 },
{ 6, 3 },
{ 7, 3 }
};
const struct vp8_token_struct vp8_small_mvencodings[8] =
{
{ 0, 3 },
{ 1, 3 },
{ 2, 3 },
{ 3, 3 },
{ 4, 3 },
{ 5, 3 },
{ 6, 3 },
{ 7, 3 }
};
const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
{
112, 86, 140, 37
};
const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
{
145, 156, 163, 128
};
const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
{
162, 101, 204
};
const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
{
142, 114, 183
};
const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
{
120, 90, 79, 133, 87, 85, 80, 111, 151
};
const vp8_prob vp8_kf_bmode_prob
[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
{
{
{ 231, 120, 48, 89, 115, 113, 120, 152, 112 },
{ 152, 179, 64, 126, 170, 118, 46, 70, 95 },
{ 175, 69, 143, 80, 85, 82, 72, 155, 103 },
{ 56, 58, 10, 171, 218, 189, 17, 13, 152 },
{ 144, 71, 10, 38, 171, 213, 144, 34, 26 },
{ 114, 26, 17, 163, 44, 195, 21, 10, 173 },
{ 121, 24, 80, 195, 26, 62, 44, 64, 85 },
{ 170, 46, 55, 19, 136, 160, 33, 206, 71 },
{ 63, 20, 8, 114, 114, 208, 12, 9, 226 },
{ 81, 40, 11, 96, 182, 84, 29, 16, 36 }
},
{
{ 134, 183, 89, 137, 98, 101, 106, 165, 148 },
{ 72, 187, 100, 130, 157, 111, 32, 75, 80 },
{ 66, 102, 167, 99, 74, 62, 40, 234, 128 },
{ 41, 53, 9, 178, 241, 141, 26, 8, 107 },
{ 104, 79, 12, 27, 217, 255, 87, 17, 7 },
{ 74, 43, 26, 146, 73, 166, 49, 23, 157 },
{ 65, 38, 105, 160, 51, 52, 31, 115, 128 },
{ 87, 68, 71, 44, 114, 51, 15, 186, 23 },
{ 47, 41, 14, 110, 182, 183, 21, 17, 194 },
{ 66, 45, 25, 102, 197, 189, 23, 18, 22 }
},
{
{ 88, 88, 147, 150, 42, 46, 45, 196, 205 },
{ 43, 97, 183, 117, 85, 38, 35, 179, 61 },
{ 39, 53, 200, 87, 26, 21, 43, 232, 171 },
{ 56, 34, 51, 104, 114, 102, 29, 93, 77 },
{ 107, 54, 32, 26, 51, 1, 81, 43, 31 },
{ 39, 28, 85, 171, 58, 165, 90, 98, 64 },
{ 34, 22, 116, 206, 23, 34, 43, 166, 73 },
{ 68, 25, 106, 22, 64, 171, 36, 225, 114 },
{ 34, 19, 21, 102, 132, 188, 16, 76, 124 },
{ 62, 18, 78, 95, 85, 57, 50, 48, 51 }
},
{
{ 193, 101, 35, 159, 215, 111, 89, 46, 111 },
{ 60, 148, 31, 172, 219, 228, 21, 18, 111 },
{ 112, 113, 77, 85, 179, 255, 38, 120, 114 },
{ 40, 42, 1, 196, 245, 209, 10, 25, 109 },
{ 100, 80, 8, 43, 154, 1, 51, 26, 71 },
{ 88, 43, 29, 140, 166, 213, 37, 43, 154 },
{ 61, 63, 30, 155, 67, 45, 68, 1, 209 },
{ 142, 78, 78, 16, 255, 128, 34, 197, 171 },
{ 41, 40, 5, 102, 211, 183, 4, 1, 221 },
{ 51, 50, 17, 168, 209, 192, 23, 25, 82 }
},
{
{ 125, 98, 42, 88, 104, 85, 117, 175, 82 },
{ 95, 84, 53, 89, 128, 100, 113, 101, 45 },
{ 75, 79, 123, 47, 51, 128, 81, 171, 1 },
{ 57, 17, 5, 71, 102, 57, 53, 41, 49 },
{ 115, 21, 2, 10, 102, 255, 166, 23, 6 },
{ 38, 33, 13, 121, 57, 73, 26, 1, 85 },
{ 41, 10, 67, 138, 77, 110, 90, 47, 114 },
{ 101, 29, 16, 10, 85, 128, 101, 196, 26 },
{ 57, 18, 10, 102, 102, 213, 34, 20, 43 },
{ 117, 20, 15, 36, 163, 128, 68, 1, 26 }
},
{
{ 138, 31, 36, 171, 27, 166, 38, 44, 229 },
{ 67, 87, 58, 169, 82, 115, 26, 59, 179 },
{ 63, 59, 90, 180, 59, 166, 93, 73, 154 },
{ 40, 40, 21, 116, 143, 209, 34, 39, 175 },
{ 57, 46, 22, 24, 128, 1, 54, 17, 37 },
{ 47, 15, 16, 183, 34, 223, 49, 45, 183 },
{ 46, 17, 33, 183, 6, 98, 15, 32, 183 },
{ 65, 32, 73, 115, 28, 128, 23, 128, 205 },
{ 40, 3, 9, 115, 51, 192, 18, 6, 223 },
{ 87, 37, 9, 115, 59, 77, 64, 21, 47 }
},
{
{ 104, 55, 44, 218, 9, 54, 53, 130, 226 },
{ 64, 90, 70, 205, 40, 41, 23, 26, 57 },
{ 54, 57, 112, 184, 5, 41, 38, 166, 213 },
{ 30, 34, 26, 133, 152, 116, 10, 32, 134 },
{ 75, 32, 12, 51, 192, 255, 160, 43, 51 },
{ 39, 19, 53, 221, 26, 114, 32, 73, 255 },
{ 31, 9, 65, 234, 2, 15, 1, 118, 73 },
{ 88, 31, 35, 67, 102, 85, 55, 186, 85 },
{ 56, 21, 23, 111, 59, 205, 45, 37, 192 },
{ 55, 38, 70, 124, 73, 102, 1, 34, 98 }
},
{
{ 102, 61, 71, 37, 34, 53, 31, 243, 192 },
{ 69, 60, 71, 38, 73, 119, 28, 222, 37 },
{ 68, 45, 128, 34, 1, 47, 11, 245, 171 },
{ 62, 17, 19, 70, 146, 85, 55, 62, 70 },
{ 75, 15, 9, 9, 64, 255, 184, 119, 16 },
{ 37, 43, 37, 154, 100, 163, 85, 160, 1 },
{ 63, 9, 92, 136, 28, 64, 32, 201, 85 },
{ 86, 6, 28, 5, 64, 255, 25, 248, 1 },
{ 56, 8, 17, 132, 137, 255, 55, 116, 128 },
{ 58, 15, 20, 82, 135, 57, 26, 121, 40 }
},
{
{ 164, 50, 31, 137, 154, 133, 25, 35, 218 },
{ 51, 103, 44, 131, 131, 123, 31, 6, 158 },
{ 86, 40, 64, 135, 148, 224, 45, 183, 128 },
{ 22, 26, 17, 131, 240, 154, 14, 1, 209 },
{ 83, 12, 13, 54, 192, 255, 68, 47, 28 },
{ 45, 16, 21, 91, 64, 222, 7, 1, 197 },
{ 56, 21, 39, 155, 60, 138, 23, 102, 213 },
{ 85, 26, 85, 85, 128, 128, 32, 146, 171 },
{ 18, 11, 7, 63, 144, 171, 4, 4, 246 },
{ 35, 27, 10, 146, 174, 171, 12, 26, 128 }
},
{
{ 190, 80, 35, 99, 180, 80, 126, 54, 45 },
{ 85, 126, 47, 87, 176, 51, 41, 20, 32 },
{ 101, 75, 128, 139, 118, 146, 116, 128, 85 },
{ 56, 41, 15, 176, 236, 85, 37, 9, 62 },
{ 146, 36, 19, 30, 171, 255, 97, 27, 20 },
{ 71, 30, 17, 119, 118, 255, 17, 18, 138 },
{ 101, 38, 60, 138, 55, 70, 43, 26, 142 },
{ 138, 45, 61, 62, 219, 1, 81, 188, 64 },
{ 32, 41, 20, 117, 151, 142, 20, 21, 163 },
{ 112, 19, 12, 61, 195, 128, 48, 4, 24 }
}
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_

View file

@@ -0,0 +1,661 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "loopfilter.h"
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
static void lf_init_lut(loop_filter_info_n *lfi)
{
int filt_lvl;
for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
{
if (filt_lvl >= 40)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
}
else if (filt_lvl >= 20)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
}
else if (filt_lvl >= 15)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
}
else
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
}
}
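    /* Map each prediction mode to one of the four mode_lf_delta slots:
     * 0 = B_PRED, 1 = the remaining intra modes plus ZEROMV,
     * 2 = whole-macroblock motion (NEARESTMV/NEARMV/NEWMV), 3 = SPLITMV. */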
lfi->mode_lf_lut[DC_PRED] = 1;
lfi->mode_lf_lut[V_PRED] = 1;
lfi->mode_lf_lut[H_PRED] = 1;
lfi->mode_lf_lut[TM_PRED] = 1;
lfi->mode_lf_lut[B_PRED] = 0;
lfi->mode_lf_lut[ZEROMV] = 1;
lfi->mode_lf_lut[NEARESTMV] = 2;
lfi->mode_lf_lut[NEARMV] = 2;
lfi->mode_lf_lut[NEWMV] = 2;
lfi->mode_lf_lut[SPLITMV] = 3;
}
void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl)
{
int i;
/* For each possible value for the loop filter fill out limits */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
int block_inside_limit = 0;
/* Set loop filter parameters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
if (sharpness_lvl > 0)
{
if (block_inside_limit > (9 - sharpness_lvl))
block_inside_limit = (9 - sharpness_lvl);
}
if (block_inside_limit < 1)
block_inside_limit = 1;
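        /* Replicate each limit across SIMD_WIDTH bytes: lim bounds the
         * inner block edges, blim = 2 * filt_lvl + lim the outer block
         * edges, and mblim = 2 * (filt_lvl + 2) + lim the macroblock
         * edges. */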
memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
void vp8_loop_filter_init(VP8_COMMON *cm)
{
loop_filter_info_n *lfi = &cm->lf_info;
int i;
/* init limits for given sharpness*/
vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
/* init LUT for lvl and hev thr picking */
lf_init_lut(lfi);
/* init hev threshold const vectors */
for(i = 0; i < 4 ; i++)
{
memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
}
void vp8_loop_filter_frame_init(VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl)
{
int seg, /* segment number */
ref, /* index in ref_lf_deltas */
mode; /* index in mode_lf_deltas */
loop_filter_info_n *lfi = &cm->lf_info;
/* update limits if sharpness has changed */
if(cm->last_sharpness_level != cm->sharpness_level)
{
vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
}
for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
{
int lvl_seg = default_filt_lvl;
int lvl_ref, lvl_mode;
/* Note the baseline filter values for each segment */
if (mbd->segmentation_enabled)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
{
lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
}
else /* Delta Value */
{
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
}
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
if (!mbd->mode_ref_lf_delta_enabled)
{
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
*/
memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
continue;
}
/* INTRA_FRAME */
ref = INTRA_FRAME;
/* Apply delta for reference frame */
lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
/* Apply delta for Intra modes */
mode = 0; /* B_PRED */
/* Only the split mode BPRED has a further special case */
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
/* clamp */
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
lfi->lvl[seg][ref][mode] = lvl_mode;
mode = 1; /* all the rest of Intra modes */
/* clamp */
lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
lfi->lvl[seg][ref][mode] = lvl_mode;
/* LAST, GOLDEN, ALT */
for(ref = 1; ref < MAX_REF_FRAMES; ref++)
{
/* Apply delta for reference frame */
lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
/* Apply delta for Inter modes */
for (mode = 1; mode < 4; mode++)
{
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
/* clamp */
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
lfi->lvl[seg][ref][mode] = lvl_mode;
}
}
}
}
void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr)
{
int mb_col;
int filter_level;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
FRAME_TYPE frame_type = cm->frame_type;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
vp8_loop_filter_mbv
(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
if (!skip_lf)
vp8_loop_filter_bv
(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_mbh
(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
if (!skip_lf)
vp8_loop_filter_bh
(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mode_info_context++; /* step to next MB */
}
}
void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr)
{
int mb_col;
int filter_level;
loop_filter_info_n *lfi_n = &cm->lf_info;
(void)post_uvstride;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
if (mb_col > 0)
vp8_loop_filter_simple_mbv
(y_ptr, post_ystride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bv
(y_ptr, post_ystride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_simple_mbh
(y_ptr, post_ystride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bh
(y_ptr, post_ystride, lfi_n->blim[filter_level]);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mode_info_context++; /* step to next MB */
}
}
void vp8_loop_filter_frame(VP8_COMMON *cm,
MACROBLOCKD *mbd,
int frame_type)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
int mb_row;
int mb_col;
int mb_rows = cm->mb_rows;
int mb_cols = cm->mb_cols;
int filter_level;
unsigned char *y_ptr, *u_ptr, *v_ptr;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
int post_y_stride = post->y_stride;
int post_uv_stride = post->uv_stride;
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
u_ptr = post->u_buffer;
v_ptr = post->v_buffer;
/* vp8_filter each macro block */
if (cm->filter_type == NORMAL_LOOPFILTER)
{
for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
vp8_loop_filter_mbv
(y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
if (!skip_lf)
vp8_loop_filter_bv
(y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_mbh
(y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
if (!skip_lf)
vp8_loop_filter_bh
(y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mode_info_context++; /* step to next MB */
}
y_ptr += post_y_stride * 16 - post->y_width;
u_ptr += post_uv_stride * 8 - post->uv_width;
v_ptr += post_uv_stride * 8 - post->uv_width;
mode_info_context++; /* Skip border mb */
}
}
else /* SIMPLE_LOOPFILTER */
{
for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
const unsigned char * mblim = lfi_n->mblim[filter_level];
const unsigned char * blim = lfi_n->blim[filter_level];
if (mb_col > 0)
vp8_loop_filter_simple_mbv
(y_ptr, post_y_stride, mblim);
if (!skip_lf)
vp8_loop_filter_simple_bv
(y_ptr, post_y_stride, blim);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_simple_mbh
(y_ptr, post_y_stride, mblim);
if (!skip_lf)
vp8_loop_filter_simple_bh
(y_ptr, post_y_stride, blim);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mode_info_context++; /* step to next MB */
}
y_ptr += post_y_stride * 16 - post->y_width;
u_ptr += post_uv_stride * 8 - post->uv_width;
v_ptr += post_uv_stride * 8 - post->uv_width;
mode_info_context++; /* Skip border mb */
}
}
}
void vp8_loop_filter_frame_yonly
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
unsigned char *y_ptr;
int mb_row;
int mb_col;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
int filter_level;
FRAME_TYPE frame_type = cm->frame_type;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
#if 0
if(default_filt_lvl == 0) /* no filter applied */
return;
#endif
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
/* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
vp8_loop_filter_mbv
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
vp8_loop_filter_bv
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_mbh
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
vp8_loop_filter_bh
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
vp8_loop_filter_simple_mbv
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bv
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_simple_mbh
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bh
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
mode_info_context ++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mode_info_context ++; /* Skip border mb */
}
}
void vp8_loop_filter_partial_frame
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
unsigned char *y_ptr;
int mb_row;
int mb_col;
int mb_cols = post->y_width >> 4;
int mb_rows = post->y_height >> 4;
int linestocopy;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
int filter_level;
FRAME_TYPE frame_type = cm->frame_type;
const MODE_INFO *mode_info_context;
#if 0
if(default_filt_lvl == 0) /* no filter applied */
return;
#endif
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
/* number of MB rows to use in partial filtering */
linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
/* Set up the buffer pointers; the partial image starts near the middle
 * of the frame ((y_height >> 5) is the MB row at half height) */
y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
/* vp8_filter each macro block */
for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
const int mode_index =
lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
vp8_loop_filter_mbv
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
vp8_loop_filter_bv
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
vp8_loop_filter_mbh
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
vp8_loop_filter_bh
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
vp8_loop_filter_simple_mbv
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bv
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
vp8_loop_filter_simple_mbh
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
vp8_loop_filter_simple_bh
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
mode_info_context += 1; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mode_info_context += 1; /* Skip border mb */
}
}

View file

@@ -0,0 +1,93 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;dst_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
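    ; the loop below copies four 32-byte rows per iteration; the tail
    ; loop afterwards handles any remaining rows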
.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqu xmm4, XMMWORD PTR [rsi]
movdqu xmm5, XMMWORD PTR [rsi + 16]
movdqu xmm6, XMMWORD PTR [rsi + rax]
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
movdqa XMMWORD PTR [rdi + rdx], xmm2
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
lea rdi, [rdi+rdx*2]
movdqa XMMWORD PTR [rdi], xmm4
movdqa XMMWORD PTR [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi + rdx], xmm6
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
lea rdi, [rdi+rdx*2]
sub rcx, 4
cmp rcx, 4
jge .block_copy_sse2_loopx4
cmp rcx, 0
je .copy_is_done
.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
lea rdi, [rdi+rdx]
sub rcx, 1
jne .block_copy_sse2_loop
.copy_is_done:
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View file

@@ -0,0 +1,146 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define ref_ptr rdi
%define ref_stride rdx
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
%define max_sad arg(4)
%define height dword ptr arg(4)
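  ; result_ptr, max_sad and height all alias arg(4); each function
  ; built on this macro takes only one of them as its fifth argument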
push rbp
mov rbp, rsp
push rsi
push rdi
push rbx
mov rsi, arg(0) ; src_ptr
mov rdi, arg(2) ; ref_ptr
movsxd rax, dword ptr arg(1) ; src_stride
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%if LIBVPX_YASM_WIN64
SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_sad [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
%define ref_ptr rdx
%define ref_stride rcx
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_sad r8
%define height r8
%endif
%endif
%endmacro
%macro STACK_FRAME_DESTROY_X3 0
%define src_ptr
%define src_stride
%define ref_ptr
%define ref_stride
%define end_ptr
%define ret_var
%define result_ptr
%define max_sad
%define height
%if ABI_IS_32BIT
pop rbx
pop rdi
pop rsi
pop rbp
%else
%if LIBVPX_YASM_WIN64
RESTORE_XMM
%endif
%endif
ret
%endmacro
;void vp8_copy32xn_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
movdqu xmm4, XMMWORD PTR [end_ptr]
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
lea src_ptr, [src_ptr+src_stride*4]
lea end_ptr, [ref_ptr+ref_stride*2]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
movdqa XMMWORD PTR [end_ptr], xmm4
movdqa XMMWORD PTR [end_ptr + 16], xmm5
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
lea ref_ptr, [ref_ptr+ref_stride*4]
sub height, 4
cmp height, 4
jge .block_copy_sse3_loopx4
;Check to see if there are more rows that need to be copied.
cmp height, 0
je .copy_is_done
.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
jne .block_copy_sse3_loop
.copy_is_done:
STACK_FRAME_DESTROY_X3

View file

@@ -0,0 +1,258 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;sq
mov rdi, arg(1) ;dq
mov rax, arg(2) ;q
movq mm1, [rsi]
pmullw mm1, [rax+0] ; dequantize: mm1 = sq[0..3] * q[0..3]
movq [rdi], mm1
movq mm1, [rsi+8]
pmullw mm1, [rax+8] ; dequantize: mm1 = sq[4..7] * q[4..7]
movq [rdi+8], mm1
movq mm1, [rsi+16]
pmullw mm1, [rax+16] ; dequantize: mm1 = sq[8..11] * q[8..11]
movq [rdi+16], mm1
movq mm1, [rsi+24]
pmullw mm1, [rax+24] ; dequantize: mm1 = sq[12..15] * q[12..15]
movq [rdi+24], mm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_dequant_idct_add_mmx(
;short *input, 0
;short *dq, 1
;unsigned char *dest, 2
;int stride) 3
global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
push rdi
; end prolog
mov rax, arg(0) ;input
mov rdx, arg(1) ;dq
movq mm0, [rax ]
pmullw mm0, [rdx]
movq mm1, [rax +8]
pmullw mm1, [rdx +8]
movq mm2, [rax+16]
pmullw mm2, [rdx+16]
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
mov rdx, arg(2) ;dest
pxor mm7, mm7
movq [rax], mm7
movq [rax+8], mm7
movq [rax+16],mm7
movq [rax+24],mm7
movsxd rdi, dword ptr arg(3) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
movq mm3, mm5 ; 33 23 13 03
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
paddw mm0, [GLOBAL(fours)]
paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
psraw mm2, 3
psraw mm0, 3
psraw mm4, 3
psraw mm6, 3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
pxor mm7, mm7
movd mm4, [rdx]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
movd mm4, [rdx+rdi]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
add rdx, rdi
movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
x_s1sqr2:
times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
times 4 dw 0x4E7B
align 16
fours:
times 4 dw 0x0004

View file

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8/common/x86/filter_x86.h"
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
{
{ 128, 128, 128, 128, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 112, 112, 112, 112 }
};
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
{
{ 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
};

View file

@@ -0,0 +1,33 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_X86_FILTER_X86_H_
#define VP8_COMMON_X86_FILTER_X86_H_
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
* duplicated values */
/* duplicated 4x */
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
/* duplicated 8x */
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_X86_FILTER_X86_H_
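The duplication exists so that one aligned load per tap feeds a parallel multiply: the first coefficient fills one half of the register and the second the other. As a minimal scalar sketch of the filter these tables vectorize (illustrative only, not libvpx code; the helper name is invented, and VP8's usual rounding of 64 before the shift by 7 is assumed):

#include <stdint.h>

/* taps = one row of vp8_bilinear_filters_x86_4: the first coefficient
 * occupies taps[0..3], the second taps[4..7]; the two taps sum to 128. */
static void bilinear_h_4(const uint8_t *src, uint8_t *dst, const short *taps)
{
    int i;
    for (i = 0; i < 4; i++) {
        int v = src[i] * taps[0] + src[i + 1] * taps[4] + 64; /* rounding */
        dst[i] = (uint8_t)(v >> 7); /* no clamp needed: taps sum to 128 */
    }
}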

View file

@@ -0,0 +1,128 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
{
short *sq = (short *) d->qcoeff;
short *dq = (short *) d->dqcoeff;
vp8_dequantize_b_impl_mmx(sq, dq, DQC);
}
void vp8_dequant_idct_add_y_block_mmx
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, dst, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
dst+4, stride);
memset(q + 16, 0, 2 * sizeof(q[0]));
}
if (eobs[2] > 1)
vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
else if (eobs[2] == 1)
{
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
dst+8, stride);
memset(q + 32, 0, 2 * sizeof(q[0]));
}
if (eobs[3] > 1)
vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
else if (eobs[3] == 1)
{
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
dst+12, stride);
memset(q + 48, 0, 2 * sizeof(q[0]));
}
q += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_mmx
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i;
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
dstu += 4*stride;
eobs += 2;
}
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
dstv += 4*stride;
eobs += 2;
}
}

View file

@@ -0,0 +1,89 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
void vp8_idct_dequant_full_2x_sse2
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
void vp8_dequant_idct_add_y_block_sse2
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
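    /* eobs stores one coefficient count per 4x4 block. Reading it as a
     * short pairs two adjacent blocks: if any bit of 0xfefe is set, at
     * least one block of the pair has more than its DC coefficient and
     * the full IDCT runs; a nonzero pair without such bits takes the
     * DC-only path. */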
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0])
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
}
if (((short *)(eobs))[1])
{
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
else
vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
}
q += 64;
dst += stride*4;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_sse2
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
if (((short *)(eobs))[0])
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
}
q += 32;
dstu += stride*4;
if (((short *)(eobs))[1])
{
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
}
q += 32;
if (((short *)(eobs))[2])
{
if (((short *)(eobs))[2] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
}
q += 32;
dstv += stride*4;
if (((short *)(eobs))[3])
{
if (((short *)(eobs))[3] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
}
}

View file

@ -0,0 +1,295 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point version of two multiply
; * constants:
; * 1. sqrt(2) * cos (pi/8)
; * 2. sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; * x * a = x + x*(a-1)
; * so
; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
; *
; * For the second constant, the 16-bit version is 35468, which is
; * bigger than 32768, so in a signed 16-bit multiply it becomes a
; * negative number.
; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
; *
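; * As an illustrative aside (not part of the upstream notes), both
; * tricks can be written in C, assuming an arithmetic right shift
; * like the one pmulhw performs:
; *   x * sqrt(2) * cos(pi/8) ~= x + (int16_t)((x * 20091) >> 16)
; *   x * sqrt(2) * sin(pi/8) ~= (int16_t)((x * (35468 - 65536)) >> 16) + x
; * where 20091 (0x4E7B) and 35468 (0x8A8C) are the x_c1sqr2less1 and
; * x_s1sqr2 constants in SECTION_RODATA below.
; *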
; **************************************************************************/
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rax, arg(0) ;input
mov rsi, arg(1) ;pred
movq mm0, [rax ]
movq mm1, [rax+ 8]
movq mm2, [rax+16]
movq mm3, [rax+24]
%if 0
pxor mm7, mm7
movq [rax], mm7
movq [rax+8], mm7
movq [rax+16],mm7
movq [rax+24],mm7
%endif
movsxd rax, dword ptr arg(2) ;pitch
mov rdx, arg(3) ;dest
movsxd rdi, dword ptr arg(4) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
movq mm3, mm5 ; 33 23 13 03
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
paddw mm0, [GLOBAL(fours)]
paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
psraw mm2, 3
psraw mm0, 3
psraw mm4, 3
psraw mm6, 3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
pxor mm7, mm7
movd mm4, [rsi]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
movd mm4, [rsi+rax]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
movd mm4, [rsi+2*rax]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
add rdx, rdi
add rsi, rax
movd mm4, [rsi+2*rax]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
; end prolog
movd mm5, arg(0) ;input_dc
mov rax, arg(1) ;pred_ptr
movsxd rdx, dword ptr arg(2) ;pred_stride
pxor mm0, mm0
paddw mm5, [GLOBAL(fours)]
lea rcx, [rdx + rdx*2]
psraw mm5, 3
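    ; mm5 = (input_dc + 4) >> 3: the same rounding and final shift the
    ; full IDCT applies in its second pass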
punpcklwd mm5, mm5
punpckldq mm5, mm5
movd mm1, [rax]
movd mm2, [rax+rdx]
movd mm3, [rax+2*rdx]
movd mm4, [rax+rcx]
mov rax, arg(3) ;dst_ptr -- destination
movsxd rdx, dword ptr arg(4) ;dst_stride
punpcklbw mm1, mm0
paddsw mm1, mm5
packuswb mm1, mm0 ; pack and unpack to saturate
lea rcx, [rdx + rdx*2]
punpcklbw mm2, mm0
paddsw mm2, mm5
packuswb mm2, mm0 ; pack and unpack to saturate
punpcklbw mm3, mm0
paddsw mm3, mm5
packuswb mm3, mm0 ; pack and unpack to saturate
punpcklbw mm4, mm0
paddsw mm4, mm5
packuswb mm4, mm0 ; pack and unpack to saturate
movd [rax], mm1
movd [rax+rdx], mm2
movd [rax+2*rdx], mm3
movd [rax+rcx], mm4
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
x_s1sqr2:
times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
times 4 dw 0x4E7B
align 16
fours:
times 4 dw 0x0004

View file

@@ -0,0 +1,708 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_idct_dequant_0_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; )
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
; end prolog
mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff
movd xmm4, [rax]
movd xmm5, [rdx]
pinsrw xmm4, [rax+32], 4
pinsrw xmm5, [rdx], 4
pmullw xmm4, xmm5
; Zero out xmm5, for use in unpacking
pxor xmm5, xmm5
; clear coeffs
movd [rax], xmm5
movd [rax+32], xmm5
; broadcast each block's dc word across its half of the register
mov rax, arg(2) ; dst
movsxd rdx, dword ptr arg(3) ; dst_stride
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
lea rcx, [rdx + rdx*2]
paddw xmm4, [GLOBAL(fours)]
psraw xmm4, 3
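; xmm4 now holds block 0's (dc + 4) >> 3 in its low four words and
; block 1's in its high four words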
movq xmm0, [rax]
movq xmm1, [rax+rdx]
movq xmm2, [rax+2*rdx]
movq xmm3, [rax+rcx]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
paddw xmm1, xmm4
paddw xmm2, xmm4
paddw xmm3, xmm4
; pack up before storing
packuswb xmm0, xmm5
packuswb xmm1, xmm5
packuswb xmm2, xmm5
packuswb xmm3, xmm5
; store blocks back out
movq [rax], xmm0
movq [rax + rdx], xmm1
lea rax, [rax + 2*rdx]
movq [rax], xmm2
movq [rax + rdx], xmm3
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_idct_dequant_full_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; )
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
; load both blocks' quantized coefficients and dequantize them in place
mov rax, arg(0) ; qcoeff
mov rdx, arg(1) ; dequant
mov rdi, arg(2) ; dst
; Zero out xmm7, for use in unpacking
pxor xmm7, xmm7
; note the transpose of xmm1 and xmm2, necessary for shuffle
; to spit out sensible data
movdqa xmm0, [rax]
movdqa xmm2, [rax+16]
movdqa xmm1, [rax+32]
movdqa xmm3, [rax+48]
; Clear out coeffs
movdqa [rax], xmm7
movdqa [rax+16], xmm7
movdqa [rax+32], xmm7
movdqa [rax+48], xmm7
; dequantize qcoeff buffer
pmullw xmm0, [rdx]
pmullw xmm2, [rdx+16]
pmullw xmm1, [rdx]
pmullw xmm3, [rdx+16]
movsxd rdx, dword ptr arg(3) ; dst_stride
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm4, xmm1
pshufd xmm0, xmm0, 11011000b
pshufd xmm1, xmm4, 11011000b
movdqa xmm4, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm4, xmm3
pshufd xmm2, xmm2, 11011000b
pshufd xmm3, xmm4, 11011000b
; first pass
psubw xmm0, xmm2 ; b1 = 0-2
paddw xmm2, xmm2 ;
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
lea rcx, [rdx + rdx*2] ;dst_stride * 3
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
movdqa xmm5, xmm1
movdqa xmm4, xmm3
pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
paddw xmm2, xmm3 ;0
paddw xmm4, xmm7 ;1
psubw xmm0, xmm7 ;2
psubw xmm6, xmm3 ;3
; transpose for the second pass
movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
pshufd xmm0, xmm2, 11011000b
pshufd xmm2, xmm1, 11011000b
pshufd xmm1, xmm5, 11011000b
pshufd xmm3, xmm7, 11011000b
; second pass
psubw xmm0, xmm2 ; b1 = 0-2
paddw xmm2, xmm2
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
movdqa xmm5, xmm1
movdqa xmm4, xmm3
pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
paddw xmm0, [GLOBAL(fours)]
paddw xmm2, [GLOBAL(fours)]
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
paddw xmm2, xmm3 ;0
paddw xmm4, xmm7 ;1
psubw xmm0, xmm7 ;2
psubw xmm6, xmm3 ;3
psraw xmm2, 3
psraw xmm0, 3
psraw xmm4, 3
psraw xmm6, 3
; transpose to save
movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
pshufd xmm0, xmm2, 11011000b
pshufd xmm2, xmm1, 11011000b
pshufd xmm1, xmm5, 11011000b
pshufd xmm3, xmm7, 11011000b
pxor xmm7, xmm7
; Load up predict blocks
movq xmm4, [rdi]
movq xmm5, [rdi+rdx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm4, [rdi+2*rdx]
movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
paddw xmm2, xmm4
paddw xmm3, xmm5
.finish:
; pack up before storing
packuswb xmm0, xmm7
packuswb xmm1, xmm7
packuswb xmm2, xmm7
packuswb xmm3, xmm7
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
movq [rdi + rdx*2], xmm2
movq [rdi + rcx], xmm3
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_idct_dequant_dc_0_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; short *dc - 4
; )
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rdi
; end prolog
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
mov rdi, arg(2) ; dst
mov rdx, arg(4) ; dc
; Zero out xmm5, for use in unpacking
pxor xmm5, xmm5
; load the two dc words (2 x 16 bits = one doubleword)
movd xmm4, [rdx]
movsxd rdx, dword ptr arg(3) ; dst_stride
lea rcx, [rdx + rdx*2]
; Load up predict blocks
movq xmm0, [rdi]
movq xmm1, [rdi+rdx*1]
movq xmm2, [rdi+rdx*2]
movq xmm3, [rdi+rcx]
; Duplicate and expand dc across
punpcklwd xmm4, xmm4
punpckldq xmm4, xmm4
; Rounding to dequant and downshift
paddw xmm4, [GLOBAL(fours)]
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
paddw xmm1, xmm4
paddw xmm2, xmm4
paddw xmm3, xmm4
; pack up before storing
packuswb xmm0, xmm5
packuswb xmm1, xmm5
packuswb xmm2, xmm5
packuswb xmm3, xmm5
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
movq [rdi + rdx*2], xmm2
movq [rdi + rcx], xmm3
; begin epilog
pop rdi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; short *dc - 4
; )
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rdi
; end prolog
; load both blocks' quantized coefficients; their dc terms are
; replaced below from the separate dc argument
mov rax, arg(0) ; qcoeff
mov rdx, arg(1) ; dequant
mov rdi, arg(2) ; dst
; Zero out xmm7, for use in unpacking
pxor xmm7, xmm7
; note the transpose of xmm1 and xmm2, necessary for shuffle
; to spit out sensible data
movdqa xmm0, [rax]
movdqa xmm2, [rax+16]
movdqa xmm1, [rax+32]
movdqa xmm3, [rax+48]
; Clear out coeffs
movdqa [rax], xmm7
movdqa [rax+16], xmm7
movdqa [rax+32], xmm7
movdqa [rax+48], xmm7
; dequantize qcoeff buffer
pmullw xmm0, [rdx]
pmullw xmm2, [rdx+16]
pmullw xmm1, [rdx]
pmullw xmm3, [rdx+16]
; DC component
mov rdx, arg(4)
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
punpckldq xmm0, xmm1
punpckhdq xmm4, xmm1
pshufd xmm0, xmm0, 11011000b
pshufd xmm1, xmm4, 11011000b
movdqa xmm4, xmm2
punpckldq xmm2, xmm3
punpckhdq xmm4, xmm3
pshufd xmm2, xmm2, 11011000b
pshufd xmm3, xmm4, 11011000b
; insert DC component
pinsrw xmm0, [rdx], 0
pinsrw xmm0, [rdx+2], 4
; first pass
psubw xmm0, xmm2 ; b1 = 0-2
paddw xmm2, xmm2 ;
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
movdqa xmm5, xmm1
movdqa xmm4, xmm3
pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
paddw xmm2, xmm3 ;0
paddw xmm4, xmm7 ;1
psubw xmm0, xmm7 ;2
psubw xmm6, xmm3 ;3
; transpose for the second pass
movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
pshufd xmm0, xmm2, 11011000b
pshufd xmm2, xmm1, 11011000b
pshufd xmm1, xmm5, 11011000b
pshufd xmm3, xmm7, 11011000b
; second pass
psubw xmm0, xmm2 ; b1 = 0-2
paddw xmm2, xmm2
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
movdqa xmm5, xmm1
movdqa xmm4, xmm3
pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
paddw xmm0, [GLOBAL(fours)]
paddw xmm2, [GLOBAL(fours)]
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
paddw xmm2, xmm3 ;0
paddw xmm4, xmm7 ;1
psubw xmm0, xmm7 ;2
psubw xmm6, xmm3 ;3
psraw xmm2, 3
psraw xmm0, 3
psraw xmm4, 3
psraw xmm6, 3
; transpose to save
movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
pshufd xmm0, xmm2, 11011000b
pshufd xmm2, xmm1, 11011000b
pshufd xmm1, xmm5, 11011000b
pshufd xmm3, xmm7, 11011000b
pxor xmm7, xmm7
; Load up predict blocks
movsxd rdx, dword ptr arg(3) ; dst_stride
movq xmm4, [rdi]
movq xmm5, [rdi+rdx]
lea rcx, [rdx + rdx*2]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm4, [rdi+rdx*2]
movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
paddw xmm2, xmm4
paddw xmm3, xmm5
.finish:
; pack up before storing
packuswb xmm0, xmm7
packuswb xmm1, xmm7
packuswb xmm2, xmm7
packuswb xmm3, xmm7
; Load destination stride before writing out,
; doesn't need to persist
movsxd rdx, dword ptr arg(3) ; dst_stride
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
lea rdi, [rdi + 2*rdx]
movq [rdi], xmm2
movq [rdi + rdx], xmm3
; begin epilog
pop rdi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
fours:
times 8 dw 0x0004
align 16
x_s1sqr2:
times 8 dw 0x8A8C
align 16
x_c1sqr2less1:
times 8 dw 0x4E7B

View file

@@ -0,0 +1,140 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
sym(vp8_short_inv_walsh4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
; end prolog
mov rdx, arg(0)
mov rax, 30003h
movq mm0, [rdx + 0] ;ip[0]
movq mm1, [rdx + 8] ;ip[4]
movq mm7, rax
movq mm2, [rdx + 16] ;ip[8]
movq mm3, [rdx + 24] ;ip[12]
punpcklwd mm7, mm7 ;0003000300030003h
mov rdx, arg(1)
movq mm4, mm0
movq mm5, mm1
paddw mm4, mm3 ;ip[0] + ip[12] aka a1
paddw mm5, mm2 ;ip[4] + ip[8] aka b1
movq mm6, mm4 ;temp a1
paddw mm4, mm5 ;a1 + b1
psubw mm6, mm5 ;a1 - b1
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm1, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp d1
paddw mm0, mm1 ;d1 + c1
psubw mm5, mm1 ;d1 - c1
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq mm3, mm4 ; 03 02 01 00
punpcklwd mm4, mm0 ; 11 01 10 00
punpckhwd mm3, mm0 ; 13 03 12 02
movq mm1, mm6 ; 23 22 21 20
punpcklwd mm6, mm5 ; 31 21 30 20
punpckhwd mm1, mm5 ; 33 23 32 22
movq mm0, mm4 ; 11 01 10 00
movq mm2, mm3 ; 13 03 12 02
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
movq mm1, mm0
movq mm5, mm4
paddw mm1, mm3 ;ip[0] + ip[12] aka a1
paddw mm5, mm2 ;ip[4] + ip[8] aka b1
movq mm6, mm1 ;temp a1
paddw mm1, mm5 ;a1 + b1
psubw mm6, mm5 ;a1 - b1
paddw mm1, mm7
paddw mm6, mm7
psraw mm1, 3
psraw mm6, 3
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm4, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp d1
paddw mm0, mm4 ;d1 + c1
psubw mm5, mm4 ;d1 - c1
paddw mm0, mm7
paddw mm5, mm7
psraw mm0, 3
psraw mm5, 3
;~~~~~~~~~~~~~~~~~~~~~
movd eax, mm1
movd ecx, mm0
psrlq mm0, 32
psrlq mm1, 32
mov word ptr[rdx+32*0], ax
mov word ptr[rdx+32*1], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*4], ax
mov word ptr[rdx+32*5], cx
movd eax, mm1
movd ecx, mm0
mov word ptr[rdx+32*8], ax
mov word ptr[rdx+32*9], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*12], ax
mov word ptr[rdx+32*13], cx
movd eax, mm6
movd ecx, mm5
psrlq mm5, 32
psrlq mm6, 32
mov word ptr[rdx+32*2], ax
mov word ptr[rdx+32*3], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*6], ax
mov word ptr[rdx+32*7], cx
movd eax, mm6
movd ecx, mm5
mov word ptr[rdx+32*10], ax
mov word ptr[rdx+32*11], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*14], ax
mov word ptr[rdx+32*15], cx
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
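; For reference, a scalar sketch of the 4x4 inverse Walsh-Hadamard transform
; implemented above, reconstructed from the comments (illustrative only).
; The same butterfly runs first down columns, then across rows; mm7 supplies
; the +3 rounding before the psraw by 3, and the word ptr [rdx+32*n] stores
; place the 16 results 16 shorts apart in the output:
;
;   a1 = x0 + x3;   b1 = x1 + x2;
;   c1 = x1 - x2;   d1 = x0 - x3;
;   y0 = a1 + b1;   y1 = d1 + c1;
;   y2 = a1 - b1;   y3 = d1 - c1;
;   /* after the second pass: out = (y + 3) >> 3 */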

View file

@ -0,0 +1,121 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
sym(vp8_short_inv_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
; end prolog
mov rcx, arg(0)
mov rdx, arg(1)
mov rax, 30003h
movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm0 ;ip[4] ip[0]
paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
movdqa xmm1, xmm4 ;c1 b1
paddw xmm4, xmm0 ;d1+c1 a1+b1 aka op[4] op[0]
psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
movd xmm0, eax
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm3 ;d1 a1
punpckhqdq xmm5, xmm3 ;c1 b1
movdqa xmm1, xmm5 ;c1 b1
paddw xmm5, xmm4 ;d1+c1 a1+b1 aka op[4] op[0]
psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
paddw xmm5, xmm0
paddw xmm4, xmm0
psraw xmm5, 3
psraw xmm4, 3
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*0], ax
mov word ptr[rdx+32*2], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*4], ax
mov word ptr[rdx+32*6], cx
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*8], ax
mov word ptr[rdx+32*10], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*12], ax
mov word ptr[rdx+32*14], cx
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*1], ax
mov word ptr[rdx+32*3], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*5], ax
mov word ptr[rdx+32*7], cx
movd eax, xmm5
movd ecx, xmm4
mov word ptr[rdx+32*9], ax
mov word ptr[rdx+32*11], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*13], ax
mov word ptr[rdx+32*15], cx
; begin epilog
UNSHADOW_ARGS
pop rbp
ret

View file

@ -0,0 +1,815 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro LF_ABS 2
; %1 value not preserved
; %2 value preserved
; output in %1
movdqa scratch1, %2 ; v2
psubusb scratch1, %1 ; v2 - v1
psubusb %1, %2 ; v1 - v2
por %1, scratch1 ; abs(v2 - v1)
%endmacro
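; In scalar terms, LF_ABS computes the absolute difference of two unsigned
; bytes without branches: psubusb saturates at zero, so exactly one of the
; two differences is nonzero. Per-byte sketch (illustrative):
;
;   /* abs = (v2 > v1 ? v2 - v1 : 0) | (v1 > v2 ? v1 - v2 : 0); */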
%macro LF_FILTER_HEV_MASK 8-9
LF_ABS %1, %2 ; abs(p3 - p2)
LF_ABS %2, %3 ; abs(p2 - p1)
pmaxub %1, %2 ; accumulate mask
%if %0 == 8
movdqa scratch2, %3 ; save p1
LF_ABS scratch2, %4 ; abs(p1 - p0)
%endif
LF_ABS %4, %5 ; abs(p0 - q0)
LF_ABS %5, %6 ; abs(q0 - q1)
%if %0 == 8
pmaxub %5, scratch2 ; accumulate hev
%else
pmaxub %5, %9
%endif
pmaxub %1, %5 ; accumulate mask
LF_ABS %3, %6 ; abs(p1 - q1)
LF_ABS %6, %7 ; abs(q1 - q2)
pmaxub %1, %6 ; accumulate mask
LF_ABS %7, %8 ; abs(q2 - q3)
pmaxub %1, %7 ; accumulate mask
paddusb %4, %4 ; 2 * abs(p0 - q0)
pand %3, [GLOBAL(tfe)]
psrlw %3, 1 ; abs(p1 - q1) / 2
paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
psubusb %1, [limit]
psubusb %4, [blimit]
por %1, %4
pcmpeqb %1, zero ; mask
psubusb %5, [thresh]
pcmpeqb %5, zero ; ~hev
%endmacro
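; Per pixel, the macro above evaluates (scalar sketch, names illustrative):
;
;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
;          && 2*|p0-q0| + |p1-q1|/2 <= blimit;
;   hev  = max(|p1-p0|, |q1-q0|) > thresh;
;
; On exit, %1 is 0xFF where mask holds and %5 is 0xFF where hev does NOT
; hold, hence the "~hev" comment on the final pcmpeqb.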
%macro LF_FILTER 6
; %1-%4: p1-q1
; %5: mask
; %6: hev
movdqa scratch2, %6 ; save hev
pxor %1, [GLOBAL(t80)] ; ps1
pxor %4, [GLOBAL(t80)] ; qs1
movdqa scratch1, %1
psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
pandn scratch2, scratch1 ; vp8_filter &= hev
pxor %2, [GLOBAL(t80)] ; ps0
pxor %3, [GLOBAL(t80)] ; qs0
movdqa scratch1, %3
psubsb scratch1, %2 ; qs0 - ps0
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
pand %5, scratch2 ; &= mask
movdqa scratch2, %5
paddsb %5, [GLOBAL(t4)] ; Filter1
paddsb scratch2, [GLOBAL(t3)] ; Filter2
; Filter1 >> 3
movdqa scratch1, zero
pcmpgtb scratch1, %5
psrlw %5, 3
pand scratch1, [GLOBAL(te0)]
pand %5, [GLOBAL(t1f)]
por %5, scratch1
psubsb %3, %5 ; qs0 - Filter1
pxor %3, [GLOBAL(t80)]
; Filter2 >> 3
movdqa scratch1, zero
pcmpgtb scratch1, scratch2
psrlw scratch2, 3
pand scratch1, [GLOBAL(te0)]
pand scratch2, [GLOBAL(t1f)]
por scratch2, scratch1
paddsb %2, scratch2 ; ps0 + Filter2
pxor %2, [GLOBAL(t80)]
; outer tap adjustments
paddsb %5, [GLOBAL(t1)]
movdqa scratch1, zero
pcmpgtb scratch1, %5
psrlw %5, 1
pand scratch1, [GLOBAL(t80)]
pand %5, [GLOBAL(t7f)]
por %5, scratch1
pand %5, %6 ; vp8_filter &= ~hev
psubsb %4, %5 ; qs1 - vp8_filter
pxor %4, [GLOBAL(t80)]
paddsb %1, %5 ; ps1 + vp8_filter
pxor %1, [GLOBAL(t80)]
%endmacro
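; SSE2 has no per-byte arithmetic shift, so "Filter1 >> 3" above emulates
; one: psrlw shifts the 16-bit lanes logically, the t1f mask discards bits
; that leaked across byte boundaries, and the pcmpgtb/te0 pair re-inserts
; the sign bits. Sketch for one signed byte x (illustrative):
;
;   r = ((x & 0xff) >> 3) & 0x1f;       /* logical shift of the byte */
;   if (x & 0x80) r |= 0xe0;            /* re-insert the sign bits   */
;
; The outer-tap ">> 1" is emulated the same way with t7f and t80.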
;void vp8_loop_filter_bh_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):
%if LIBVPX_YASM_WIN64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10
%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13
push rbp
mov rbp, rsp
SAVE_XMM 11
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi ; src_ptr
%define stride rsi ; src_pixel_step
%define blimit rdx
%define limit rcx
%define thresh r8
%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif
%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7
%define i0 [src]
%define i1 [spp]
%define i2 [src + 2 * stride]
%define i3 [spp + 2 * stride]
%define i4 [src + 4 * stride]
%define i5 [spp + 4 * stride]
%define i6 [src + 2 * stride3]
%define i7 [spp + 2 * stride3]
%define i8 [src + 8 * stride]
%define i9 [spp + 8 * stride]
%define i10 [src + 2 * stride5]
%define i11 [spp + 2 * stride5]
%define i12 [src + 4 * stride3]
%define i13 [spp + 4 * stride3]
%define i14 [src + 2 * stride7]
%define i15 [spp + 2 * stride7]
; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]
pxor zero, zero
; load the first set into registers
movdqa xmm0, i0
movdqa xmm1, i1
movdqa xmm2, i2
movdqa xmm3, i3
movdqa xmm4, i4
movdqa xmm8, i5
movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
movdqa xmm10, i7
LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm3, i4
movdqa xmm8, i5
LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2
; second set
movdqa i4, xmm3
movdqa i5, xmm8
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm4, i8
movdqa xmm8, i9
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i6, xmm0
movdqa i7, xmm1
; last set
movdqa i8, xmm4
movdqa i9, xmm8
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm3, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm3, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm3
movdqa i13, xmm8
%if LIBVPX_YASM_WIN64
pop r13
pop r12
RESTORE_XMM
pop rbp
%endif
ret
;void vp8_loop_filter_bv_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):
%if LIBVPX_YASM_WIN64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10
%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13
push rbp
mov rbp, rsp
SAVE_XMM 15
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi
%define stride rsi
%define blimit rdx
%define limit rcx
%define thresh r8
%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif
%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7
%define s0 [src]
%define s1 [spp]
%define s2 [src + 2 * stride]
%define s3 [spp + 2 * stride]
%define s4 [src + 4 * stride]
%define s5 [spp + 4 * stride]
%define s6 [src + 2 * stride3]
%define s7 [spp + 2 * stride3]
%define s8 [src + 8 * stride]
%define s9 [spp + 8 * stride]
%define s10 [src + 2 * stride5]
%define s11 [spp + 2 * stride5]
%define s12 [src + 4 * stride3]
%define s13 [spp + 4 * stride3]
%define s14 [src + 2 * stride7]
%define s15 [spp + 2 * stride7]
%define i0 [rsp]
%define i1 [rsp + 16]
%define i2 [rsp + 32]
%define i3 [rsp + 48]
%define i4 [rsp + 64]
%define i5 [rsp + 80]
%define i6 [rsp + 96]
%define i7 [rsp + 112]
%define i8 [rsp + 128]
%define i9 [rsp + 144]
%define i10 [rsp + 160]
%define i11 [rsp + 176]
%define i12 [rsp + 192]
%define i13 [rsp + 208]
%define i14 [rsp + 224]
%define i15 [rsp + 240]
ALIGN_STACK 16, rax
; reserve stack space
%define temp_storage 0 ; size is 256 (16*16)
%define stack_size 256
sub rsp, stack_size
; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]
; 8-f
movdqa xmm0, s8
movdqa xmm1, xmm0
punpcklbw xmm0, s9 ; 80 90
punpckhbw xmm1, s9 ; 88 98
movdqa xmm2, s10
movdqa xmm3, xmm2
punpcklbw xmm2, s11 ; a0 b0
punpckhbw xmm3, s11 ; a8 b8
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8
punpckhwd xmm2, xmm3 ; 8c 9c ac bc
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, s12
movdqa xmm5, xmm3
punpcklbw xmm3, s13 ; c0 d0
punpckhbw xmm5, s13 ; c8 d8
movdqa xmm6, s14
movdqa xmm7, xmm6
punpcklbw xmm6, s15 ; e0 f0
punpckhbw xmm7, s15 ; e8 f8
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
punpckhwd xmm6, xmm7 ; cc dc ec fc
; pull the third and fourth sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
; save the calculations. we only have 15 registers ...
movdqa i0, xmm0
movdqa i1, xmm7
movdqa i2, xmm4
movdqa i3, xmm3
movdqa i4, xmm1
movdqa i5, xmm8
movdqa i6, xmm2
movdqa i7, xmm5
; 0-7
movdqa xmm0, s0
movdqa xmm1, xmm0
punpcklbw xmm0, s1 ; 00 10
punpckhbw xmm1, s1 ; 08 18
movdqa xmm2, s2
movdqa xmm3, xmm2
punpcklbw xmm2, s3 ; 20 30
punpckhbw xmm3, s3 ; 28 38
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 00 10 20 30
punpckhwd xmm4, xmm2 ; 04 14 24 34
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 08 18 28 38
punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, s4
movdqa xmm5, xmm3
punpcklbw xmm3, s5 ; 40 50
punpckhbw xmm5, s5 ; 48 58
movdqa xmm6, s6
movdqa xmm7, xmm6
punpcklbw xmm6, s7 ; 60 70
punpckhbw xmm7, s7 ; 68 78
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; 40 50 60 70
punpckhwd xmm8, xmm6 ; 44 54 64 74
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; 48 58 68 78
punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
; pull the first two sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
; final combination
movdqa xmm6, xmm0
punpcklqdq xmm0, i0
punpckhqdq xmm6, i0
movdqa xmm9, xmm7
punpcklqdq xmm7, i1
punpckhqdq xmm9, i1
movdqa xmm10, xmm4
punpcklqdq xmm4, i2
punpckhqdq xmm10, i2
movdqa xmm11, xmm3
punpcklqdq xmm3, i3
punpckhqdq xmm11, i3
movdqa xmm12, xmm1
punpcklqdq xmm1, i4
punpckhqdq xmm12, i4
movdqa xmm13, xmm8
punpcklqdq xmm8, i5
punpckhqdq xmm13, i5
movdqa xmm14, xmm2
punpcklqdq xmm2, i6
punpckhqdq xmm14, i6
movdqa xmm15, xmm5
punpcklqdq xmm5, i7
punpckhqdq xmm15, i7
movdqa i0, xmm0
movdqa i1, xmm6
movdqa i2, xmm7
movdqa i3, xmm9
movdqa i4, xmm4
movdqa i5, xmm10
movdqa i6, xmm3
movdqa i7, xmm11
movdqa i8, xmm1
movdqa i9, xmm12
movdqa i10, xmm8
movdqa i11, xmm13
movdqa i12, xmm2
movdqa i13, xmm14
movdqa i14, xmm5
movdqa i15, xmm15
; TRANSPOSED DATA AVAILABLE ON THE STACK
movdqa xmm12, xmm6
movdqa xmm13, xmm7
pxor zero, zero
LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm8, i4
movdqa xmm9, i5
LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2
; second set
movdqa i4, xmm8
movdqa i5, xmm9
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm3, i8
movdqa xmm4, i9
LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
movdqa i6, xmm0
movdqa i7, xmm1
; last set
movdqa i8, xmm3
movdqa i9, xmm4
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm8, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm4, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm4
movdqa i13, xmm8
; RESHUFFLE AND WRITE OUT
; 8-f
movdqa xmm0, i8
movdqa xmm1, xmm0
punpcklbw xmm0, i9 ; 80 90
punpckhbw xmm1, i9 ; 88 98
movdqa xmm2, i10
movdqa xmm3, xmm2
punpcklbw xmm2, i11 ; a0 b0
punpckhbw xmm3, i11 ; a8 b8
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8
punpckhwd xmm2, xmm3 ; 8c 9c ac bc
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, i12
movdqa xmm5, xmm3
punpcklbw xmm3, i13 ; c0 d0
punpckhbw xmm5, i13 ; c8 d8
movdqa xmm6, i14
movdqa xmm7, xmm6
punpcklbw xmm6, i15 ; e0 f0
punpckhbw xmm7, i15 ; e8 f8
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
punpckhwd xmm6, xmm7 ; cc dc ec fc
; pull the third and fourth sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
; save the calculations. we only have 15 registers ...
movdqa i8, xmm0
movdqa i9, xmm7
movdqa i10, xmm4
movdqa i11, xmm3
movdqa i12, xmm1
movdqa i13, xmm8
movdqa i14, xmm2
movdqa i15, xmm5
; 0-7
movdqa xmm0, i0
movdqa xmm1, xmm0
punpcklbw xmm0, i1 ; 00 10
punpckhbw xmm1, i1 ; 08 18
movdqa xmm2, i2
movdqa xmm3, xmm2
punpcklbw xmm2, i3 ; 20 30
punpckhbw xmm3, i3 ; 28 38
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 00 10 20 30
punpckhwd xmm4, xmm2 ; 04 14 24 34
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 08 18 28 38
punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, i4
movdqa xmm5, xmm3
punpcklbw xmm3, i5 ; 40 50
punpckhbw xmm5, i5 ; 48 58
movdqa xmm6, i6
movdqa xmm7, xmm6
punpcklbw xmm6, i7 ; 60 70
punpckhbw xmm7, i7 ; 68 78
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; 40 50 60 70
punpckhwd xmm8, xmm6 ; 44 54 64 74
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; 48 58 68 78
punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
; pull the first two sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
; final combination
movdqa xmm6, xmm0
punpcklqdq xmm0, i8
punpckhqdq xmm6, i8
movdqa xmm9, xmm7
punpcklqdq xmm7, i9
punpckhqdq xmm9, i9
movdqa xmm10, xmm4
punpcklqdq xmm4, i10
punpckhqdq xmm10, i10
movdqa xmm11, xmm3
punpcklqdq xmm3, i11
punpckhqdq xmm11, i11
movdqa xmm12, xmm1
punpcklqdq xmm1, i12
punpckhqdq xmm12, i12
movdqa xmm13, xmm8
punpcklqdq xmm8, i13
punpckhqdq xmm13, i13
movdqa xmm14, xmm2
punpcklqdq xmm2, i14
punpckhqdq xmm14, i14
movdqa xmm15, xmm5
punpcklqdq xmm5, i15
punpckhqdq xmm15, i15
movdqa s0, xmm0
movdqa s1, xmm6
movdqa s2, xmm7
movdqa s3, xmm9
movdqa s4, xmm4
movdqa s5, xmm10
movdqa s6, xmm3
movdqa s7, xmm11
movdqa s8, xmm1
movdqa s9, xmm12
movdqa s10, xmm8
movdqa s11, xmm13
movdqa s12, xmm2
movdqa s13, xmm14
movdqa s14, xmm5
movdqa s15, xmm15
; free stack space
add rsp, stack_size
; un-ALIGN_STACK
pop rsp
%if LIBVPX_YASM_WIN64
pop r13
pop r12
RESTORE_XMM
pop rbp
%endif
ret
SECTION_RODATA
align 16
te0:
times 16 db 0xe0
align 16
t7f:
times 16 db 0x7f
align 16
tfe:
times 16 db 0xfe
align 16
t1f:
times 16 db 0x1f
align 16
t80:
times 16 db 0x80
align 16
t1:
times 16 db 0x01
align 16
t3:
times 16 db 0x03
align 16
t4:
times 16 db 0x04

File diff suppressed because it is too large

View file

@ -0,0 +1,198 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
#define prototype_loopfilter(sym) \
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh, int count)
#define prototype_loopfilter_nc(sym) \
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh)
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
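/* For example, prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx)
 * expands to:
 *
 * void vp8_loop_filter_horizontal_edge_mmx(unsigned char *src, int pitch,
 *                                          const unsigned char *blimit,
 *                                          const unsigned char *limit,
 *                                          const unsigned char *thresh,
 *                                          int count);
 */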
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
#if HAVE_SSE2 && ARCH_X86_64
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
#else
prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
#endif
prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
#if HAVE_MMX
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
}
#endif
/* Horizontal MB filtering */
#if HAVE_SSE2
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
#if ARCH_X86_64
vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
#endif
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
}
void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
#if ARCH_X86_64
vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
#endif
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
}
void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
}
#endif

View file

@ -0,0 +1,274 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void copy_mem8x8_mmx(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
global sym(vp8_copy_mem8x8_mmx) PRIVATE
sym(vp8_copy_mem8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movq mm0, [rsi]
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movq mm1, [rsi+rax]
movq mm2, [rsi+rax*2]
movsxd rcx, dword ptr arg(3) ;dst_stride
lea rsi, [rsi+rax*2]
movq [rdi], mm0
add rsi, rax
movq [rdi+rcx], mm1
movq [rdi+rcx*2], mm2
lea rdi, [rdi+rcx*2]
movq mm3, [rsi]
add rdi, rcx
movq mm4, [rsi+rax]
movq mm5, [rsi+rax*2]
movq [rdi], mm3
lea rsi, [rsi+rax*2]
movq [rdi+rcx], mm4
movq [rdi+rcx*2], mm5
lea rdi, [rdi+rcx*2]
movq mm0, [rsi+rax]
movq mm1, [rsi+rax*2]
movq [rdi+rcx], mm0
movq [rdi+rcx*2],mm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void copy_mem8x4_mmx(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
global sym(vp8_copy_mem8x4_mmx) PRIVATE
sym(vp8_copy_mem8x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movq mm0, [rsi]
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movq mm1, [rsi+rax]
movq mm2, [rsi+rax*2]
movsxd rcx, dword ptr arg(3) ;dst_stride
lea rsi, [rsi+rax*2]
movq [rdi], mm0
movq [rdi+rcx], mm1
movq [rdi+rcx*2], mm2
lea rdi, [rdi+rcx*2]
movq mm3, [rsi+rax]
movq [rdi+rcx], mm3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void copy_mem16x16_mmx(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
global sym(vp8_copy_mem16x16_mmx) PRIVATE
sym(vp8_copy_mem16x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movsxd rcx, dword ptr arg(3) ;dst_stride
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
movq mm0, [rsi]
movq mm3, [rsi+8];
movq mm1, [rsi+rax]
movq mm4, [rsi+rax+8]
movq mm2, [rsi+rax*2]
movq mm5, [rsi+rax*2+8]
lea rsi, [rsi+rax*2]
add rsi, rax
movq [rdi], mm0
movq [rdi+8], mm3
movq [rdi+rcx], mm1
movq [rdi+rcx+8], mm4
movq [rdi+rcx*2], mm2
movq [rdi+rcx*2+8], mm5
lea rdi, [rdi+rcx*2]
add rdi, rcx
movq mm0, [rsi]
movq mm3, [rsi+8];
movq [rdi], mm0
movq [rdi+8], mm3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

View file

@ -0,0 +1,116 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void copy_mem16x16_sse2(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movdqu xmm0, [rsi]
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movdqu xmm1, [rsi+rax]
movdqu xmm2, [rsi+rax*2]
movsxd rcx, dword ptr arg(3) ;dst_stride
lea rsi, [rsi+rax*2]
movdqa [rdi], xmm0
add rsi, rax
movdqa [rdi+rcx], xmm1
movdqa [rdi+rcx*2],xmm2
lea rdi, [rdi+rcx*2]
movdqu xmm3, [rsi]
add rdi, rcx
movdqu xmm4, [rsi+rax]
movdqu xmm5, [rsi+rax*2]
lea rsi, [rsi+rax*2]
movdqa [rdi], xmm3
add rsi, rax
movdqa [rdi+rcx], xmm4
movdqa [rdi+rcx*2],xmm5
lea rdi, [rdi+rcx*2]
movdqu xmm0, [rsi]
add rdi, rcx
movdqu xmm1, [rsi+rax]
movdqu xmm2, [rsi+rax*2]
lea rsi, [rsi+rax*2]
movdqa [rdi], xmm0
add rsi, rax
movdqa [rdi+rcx], xmm1
movdqa [rdi+rcx*2], xmm2
movdqu xmm3, [rsi]
movdqu xmm4, [rsi+rax]
lea rdi, [rdi+rcx*2]
add rdi, rcx
movdqu xmm5, [rsi+rax*2]
lea rsi, [rsi+rax*2]
movdqa [rdi], xmm3
add rsi, rax
movdqa [rdi+rcx], xmm4
movdqa [rdi+rcx*2],xmm5
movdqu xmm0, [rsi]
lea rdi, [rdi+rcx*2]
movdqu xmm1, [rsi+rax]
add rdi, rcx
movdqu xmm2, [rsi+rax*2]
lea rsi, [rsi+rax*2]
movdqa [rdi], xmm0
movdqa [rdi+rcx], xmm1
movdqa [rdi+rcx*2],xmm2
movdqu xmm3, [rsi+rax]
lea rdi, [rdi+rcx*2]
movdqa [rdi+rcx], xmm3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

View file

@ -0,0 +1,702 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT 7
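; Each pair of bilinear taps in vp8_bilinear_filters_x86_8 sums to
; vp8_filter_weight (128), so the filter is a weighted average rounded by
; the rd constant (64) and narrowed by VP8_FILTER_SHIFT. Scalar sketch of
; one tap pair (illustrative):
;
;   /* F[0] + F[1] == 128 */
;   out = (src[0] * F[0] + src[1] * F[1] + 64) >> 7;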
;void vp8_filter_block1d_h6_mmx
;(
; unsigned char *src_ptr,
; unsigned short *output_ptr,
; unsigned int src_pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short * vp8_filter
;)
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdx, arg(6) ;vp8_filter
movq mm1, [rdx + 16] ; do both the negative taps first!!!
movq mm2, [rdx + 32] ;
movq mm6, [rdx + 48] ;
movq mm7, [rdx + 64] ;
mov rdi, arg(1) ;output_ptr
mov rsi, arg(0) ;src_ptr
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow:
movq mm3, [rsi-2] ; mm3 = p-2..p5
movq mm4, mm3 ; mm4 = p-2..p5
psrlq mm3, 8 ; mm3 = p-1..p5
punpcklbw mm3, mm0 ; mm3 = p-1..p2
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
movq mm5, mm4 ; mm5 = p-2..p5
punpckhbw mm4, mm0 ; mm4 = p2..p5
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, mm5 ; mm4 = p-2..p5
psrlq mm5, 16 ; mm5 = p0..p5
punpcklbw mm5, mm0 ; mm5 = p0..p3
pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
paddsw mm3, mm5 ; mm3 += mm5
movq mm5, mm4 ; mm5 = p-2..p5
psrlq mm4, 24 ; mm4 = p1..p5
punpcklbw mm4, mm0 ; mm4 = p1..p4
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
paddsw mm3, mm4 ; mm3 += mm4
; do outer positive taps
movd mm4, [rsi+3]
punpcklbw mm4, mm0 ; mm4 = p3..p6
pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers
paddsw mm3, mm4 ; mm3 += mm4
punpcklbw mm5, mm0 ; mm5 = p-2..p1
pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers
paddsw mm3, mm5 ; mm3 += mm5
paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and unpack to saturate
punpcklbw mm3, mm0 ;
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
add rdi, rax;
%else
movsxd r8, dword ptr arg(2) ;src_pixels_per_line
add rdi, rax;
add rsi, r8 ; next line
%endif
dec rcx ; decrement count
jnz .nextrow ; next row
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block1dc_v6_mmx
;(
; short *src_ptr,
; unsigned char *output_ptr,
; int output_pitch,
; unsigned int pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short * vp8_filter
;)
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
; end prolog
movq mm5, [GLOBAL(rd)]
push rbx
mov rbx, arg(7) ;vp8_filter
movq mm1, [rbx + 16] ; do both the negative taps first!!!
movq mm2, [rbx + 32] ;
movq mm6, [rbx + 48] ;
movq mm7, [rbx + 64] ;
movsxd rdx, dword ptr arg(3) ;pixels_per_line
mov rdi, arg(1) ;output_ptr
mov rsi, arg(0) ;src_ptr
sub rsi, rdx
sub rsi, rdx
movsxd rcx, DWORD PTR arg(5) ;output_height
movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow_cv:
movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi] ; mm4 = p0..p3 = row -2
pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
paddsw mm3, mm5 ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and saturate
movd [rdi],mm3 ; store the results in the destination
; the subsequent iterations repeat 3 out of 4 of these reads. Since the
; recon block should be in cache this shouldn't cost much. It is obviously
; avoidable.
lea rdi, [rdi+rax] ;
dec rcx ; decrement count
jnz .nextrow_cv ; next row
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict8x8_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
shl rax, 5 ; offset * 32
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ;
movq mm2, [rax+16] ;
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ;
shl rax, 5 ; offset*32
add rax, rcx ; VFilter
lea rcx, [rdi+rdx*8] ;
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ;
add rsi, rdx ; next line
.next_row_8x8:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
movq mm5, mm7 ;
movq mm6, mm7 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ;
pmullw mm3, [rax+16] ;
pmullw mm4, [rax+16] ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
packuswb mm3, mm4
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch
add rsi, rdx ; next line
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ;
jne .next_row_8x8
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict8x4_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
shl rax, 5
mov rsi, arg(0) ;src_ptr ;
add rax, rcx
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ;
movq mm2, [rax+16] ;
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ;
shl rax, 5
add rax, rcx
lea rcx, [rdi+rdx*4] ;
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ;
add rsi, rdx ; next line
.next_row_8x4:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
punpckhbw mm4, mm0 ;
pmullw mm3, mm1 ;
pmullw mm4, mm1 ;
movq mm5, [rsi+1] ;
movq mm6, mm5 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0 ;
pmullw mm5, mm2 ;
pmullw mm6, mm2 ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
movq mm5, mm7 ;
movq mm6, mm7 ;
punpcklbw mm5, mm0 ;
punpckhbw mm6, mm0
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
packuswb mm7, mm4 ;
pmullw mm3, [rax+16] ;
pmullw mm4, [rax+16] ;
paddw mm3, mm5 ;
paddw mm4, mm6 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
packuswb mm3, mm4
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch
add rsi, rdx ; next line
add rdi, r8
%endif
cmp rdi, rcx ;
jne .next_row_8x4
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void bilinear_predict4x4_mmx
;(
; unsigned char *src_ptr,
; int src_pixels_per_line,
; int xoffset,
; int yoffset,
; unsigned char *dst_ptr,
; int dst_pitch
;)
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
shl rax, 5
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
movsxd rdx, dword ptr arg(5) ;dst_pitch
movq mm1, [rax] ;
movq mm2, [rax+16] ;
movsxd rax, dword ptr arg(3) ;yoffset
pxor mm0, mm0 ;
shl rax, 5
add rax, rcx
lea rcx, [rdi+rdx*4] ;
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
; get the first horizontal line done ;
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
pmullw mm3, mm1 ;
movd mm5, [rsi+1] ;
punpcklbw mm5, mm0 ;
pmullw mm5, mm2 ;
paddw mm3, mm5 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
movq mm7, mm3 ;
packuswb mm7, mm0 ;
add rsi, rdx ; next line
.next_row_4x4:
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
pmullw mm3, mm1 ;
movd mm5, [rsi+1] ;
punpcklbw mm5, mm0 ;
pmullw mm5, mm2 ;
paddw mm3, mm5 ;
movq mm5, mm7 ;
punpcklbw mm5, mm0 ;
pmullw mm5, [rax] ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
movq mm7, mm3 ;
packuswb mm7, mm0 ;
pmullw mm3, [rax+16] ;
paddw mm3, mm5 ;
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0
movd [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, rdx ; next line
add rdi, dword ptr arg(5) ;dst_pitch ;
%else
movsxd r8, dword ptr arg(5) ;dst_pitch ;
add rsi, rdx ; next line
add rdi, r8
%endif
cmp rdi, rcx ;
jne .next_row_4x4
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
rd:
times 4 dw 0x40
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
times 8 dw 0
times 8 dw 0
times 8 dw 128
times 8 dw 0
times 8 dw 0
times 8 dw 0
times 8 dw 0
times 8 dw -6
times 8 dw 123
times 8 dw 12
times 8 dw -1
times 8 dw 0
times 8 dw 2
times 8 dw -11
times 8 dw 108
times 8 dw 36
times 8 dw -8
times 8 dw 1
times 8 dw 0
times 8 dw -9
times 8 dw 93
times 8 dw 50
times 8 dw -6
times 8 dw 0
times 8 dw 3
times 8 dw -16
times 8 dw 77
times 8 dw 77
times 8 dw -16
times 8 dw 3
times 8 dw 0
times 8 dw -6
times 8 dw 50
times 8 dw 93
times 8 dw -9
times 8 dw 0
times 8 dw 1
times 8 dw -8
times 8 dw 36
times 8 dw 108
times 8 dw -11
times 8 dw 2
times 8 dw 0
times 8 dw -1
times 8 dw 12
times 8 dw 123
times 8 dw -6
times 8 dw 0
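; Each 6*8-word block above is one subpel filter: six taps, each splatted
; across 8 words for pmullw, with every filter's taps summing to 128.
; Scalar sketch of applying one filter (illustrative):
;
;   /* F = one filter, e.g. {0, -6, 123, 12, -1, 0} for the second offset */
;   sum = F[0]*src[-2] + F[1]*src[-1] + F[2]*src[0]
;       + F[3]*src[1]  + F[4]*src[2]  + F[5]*src[3];
;   out = clamp((sum + 64) >> 7, 0, 255);   /* rd rounding + packuswb */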

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,625 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/mem.h"
#include "filter_x86.h"
extern const short vp8_six_tap_mmx[8][6*8];
extern void vp8_filter_block1d_h6_mmx
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block1dc_v6_mmx
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block1d8_h6_sse2
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block1d16_h6_sse2
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block1d8_v6_sse2
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int dst_pitch,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block1d16_v6_sse2
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int dst_pitch,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_unpack_block1d16_h6_sse2
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width
);
extern void vp8_filter_block1d8_h6_only_sse2
(
unsigned char *src_ptr,
unsigned int src_pixels_per_line,
unsigned char *output_ptr,
int dst_pitch,
unsigned int output_height,
const short *vp8_filter
);
extern void vp8_filter_block1d16_h6_only_sse2
(
unsigned char *src_ptr,
unsigned int src_pixels_per_line,
unsigned char *output_ptr,
int dst_pitch,
unsigned int output_height,
const short *vp8_filter
);
extern void vp8_filter_block1d8_v6_only_sse2
(
unsigned char *src_ptr,
unsigned int src_pixels_per_line,
unsigned char *output_ptr,
int dst_pitch,
unsigned int output_height,
const short *vp8_filter
);
#if HAVE_MMX
void vp8_sixtap_predict4x4_mmx
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
}
void vp8_sixtap_predict16x16_mmx
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
}
void vp8_sixtap_predict8x8_mmx
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
}
void vp8_sixtap_predict8x4_mmx
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
}
void vp8_bilinear_predict16x16_mmx
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
}
#endif
#if HAVE_SSE2
void vp8_sixtap_predict16x16_sse2
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
{
if (yoffset)
{
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
}
else
{
/* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
}
}
else
{
/* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
}
}
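/* Note on the intermediate heights: the vertical 6-tap reads two rows above
 * and three rows below each output row, so the horizontal pass must produce
 * output_height + 5 rows of 16-bit intermediates:
 *
 *   16x16: 16 + 5 = 21    8x8: 8 + 5 = 13    8x4: 4 + 5 = 9
 *
 * which matches the 21/13/9 heights passed to the first-pass filters here
 * and below. */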
void vp8_sixtap_predict8x8_sse2
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
{
if (yoffset)
{
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
}
else
{
/* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
}
}
else
{
/* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
}
}
void vp8_sixtap_predict8x4_sse2
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;

    if (xoffset)
    {
        if (yoffset)
        {
            HFilter = vp8_six_tap_mmx[xoffset];
            vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
            VFilter = vp8_six_tap_mmx[yoffset];
            vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, dst_pitch, VFilter);
        }
        else
        {
            /* First-pass only */
            HFilter = vp8_six_tap_mmx[xoffset];
            vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
        }
    }
    else
    {
        /* Second-pass only */
        VFilter = vp8_six_tap_mmx[yoffset];
        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
    }
}
#endif
#if HAVE_SSSE3
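/* The SSSE3 filter kernels are declared extern here and implemented in
 * assembly. Unlike the MMX/SSE2 paths, they take a small filter index
 * instead of a pointer to the filter taps. */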
extern void vp8_filter_block1d8_h6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);
extern void vp8_filter_block1d16_h6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);
extern void vp8_filter_block1d16_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);
extern void vp8_filter_block1d8_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);
extern void vp8_filter_block1d4_h6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);
extern void vp8_filter_block1d4_v6_ssse3
(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index
);

void vp8_sixtap_predict16x16_ssse3
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
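
    /* Same separable two-pass layout as the SSE2 version, but the SSSE3
     * kernels keep their intermediate data in 8 bits, so the scratch
     * buffer is unsigned char rather than unsigned short. */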
    if (xoffset)
    {
        if (yoffset)
        {
            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                          src_pixels_per_line, FData2,
                                          16, 21, xoffset);
            vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch,
                                          16, yoffset);
        }
        else
        {
            /* First-pass only */
            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pitch, 16, xoffset);
        }
    }
    else
    {
        if (yoffset)
        {
            /* Second-pass only */
            vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                          src_pixels_per_line,
                                          dst_ptr, dst_pitch, 16, yoffset);
        }
        else
        {
            /* The SSSE3 second-pass-only function cannot handle the
             * (xoffset == 0 && yoffset == 0) case correctly, so fall back
             * to a plain copy to guarantee the six-tap function handles
             * all possible offsets. */
            vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
        }
    }
}

void vp8_sixtap_predict8x8_ssse3
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    DECLARE_ALIGNED(16, unsigned char, FData2[256]);

    if (xoffset)
    {
        if (yoffset)
        {
            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line, FData2,
                                         8, 13, xoffset);
            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
                                         8, yoffset);
        }
        else
        {
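            /* First-pass only */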
            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pitch, 8, xoffset);
        }
    }
    else
    {
        if (yoffset)
        {
            /* Second-pass only */
            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line,
                                         dst_ptr, dst_pitch, 8, yoffset);
        }
        else
        {
            /* The SSSE3 second-pass-only function cannot handle the
             * (xoffset == 0 && yoffset == 0) case correctly, so fall back
             * to a plain copy to guarantee the six-tap function handles
             * all possible offsets. */
            vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
        }
    }
}

void vp8_sixtap_predict8x4_ssse3
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    DECLARE_ALIGNED(16, unsigned char, FData2[256]);

    if (xoffset)
    {
        if (yoffset)
        {
            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line, FData2,
                                         8, 9, xoffset);
            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
                                         4, yoffset);
        }
        else
        {
            /* First-pass only */
            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pitch, 4, xoffset);
        }
    }
    else
    {
        if (yoffset)
        {
            /* Second-pass only */
            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line,
                                         dst_ptr, dst_pitch, 4, yoffset);
        }
        else
        {
            /* The SSSE3 second-pass-only function cannot handle the
             * (xoffset == 0 && yoffset == 0) case correctly, so fall back
             * to a plain copy to guarantee the six-tap function handles
             * all possible offsets. */
            vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
        }
    }
}

void vp8_sixtap_predict4x4_ssse3
(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch
)
{
    DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);

    if (xoffset)
    {
        if (yoffset)
        {
            vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line,
                                         FData2, 4, 9, xoffset);
            vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
                                         4, yoffset);
        }
        else
        {
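            /* First-pass only */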
            vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pitch, 4, xoffset);
        }
    }
    else
    {
        if (yoffset)
        {
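            /* Second-pass only */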
            vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                         src_pixels_per_line,
                                         dst_ptr, dst_pitch, 4, yoffset);
        }
        else
        {
            /* The SSSE3 second-pass-only function cannot handle the
             * (xoffset == 0 && yoffset == 0) case correctly, so copy the
             * 4x4 block row by row to guarantee the six-tap function
             * handles all possible offsets. */
            int r;

            for (r = 0; r < 4; r++)
            {
                dst_ptr[0] = src_ptr[0];
                dst_ptr[1] = src_ptr[1];
                dst_ptr[2] = src_ptr[2];
                dst_ptr[3] = src_ptr[3];
                dst_ptr += dst_pitch;
                src_ptr += src_pixels_per_line;
            }
        }
    }
}
#endif
