-Missing files in new webp version

Juan Linietsky 2015-12-04 10:21:22 -03:00
parent bb392bde16
commit 504e464c8a
68 changed files with 22829 additions and 0 deletions

145
drivers/webp/config.h Normal file

@@ -0,0 +1,145 @@
/* src/webp/config.h. Generated from config.h.in by configure. */
/* src/webp/config.h.in. Generated from configure.ac by autoheader. */
/* Define if building universal (internal helper macro) */
/* #undef AC_APPLE_UNIVERSAL_BUILD */
/* Set to 1 if __builtin_bswap16 is available */
#define HAVE_BUILTIN_BSWAP16 1
/* Set to 1 if __builtin_bswap32 is available */
#define HAVE_BUILTIN_BSWAP32 1
/* Set to 1 if __builtin_bswap64 is available */
#define HAVE_BUILTIN_BSWAP64 1
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
/* Define to 1 if you have the <GLUT/glut.h> header file. */
/* #undef HAVE_GLUT_GLUT_H */
/* Define to 1 if you have the <GL/glut.h> header file. */
#define HAVE_GL_GLUT_H 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* Define to 1 if you have the <OpenGL/glut.h> header file. */
/* #undef HAVE_OPENGL_GLUT_H */
/* Have PTHREAD_PRIO_INHERIT. */
#define HAVE_PTHREAD_PRIO_INHERIT 1
/* Define to 1 if you have the <shlwapi.h> header file. */
/* #undef HAVE_SHLWAPI_H */
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* Define to 1 if you have the <wincodec.h> header file. */
/* #undef HAVE_WINCODEC_H */
/* Define to 1 if you have the <windows.h> header file. */
/* #undef HAVE_WINDOWS_H */
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#define LT_OBJDIR ".libs/"
/* Name of package */
#define PACKAGE "libwebp"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "http://code.google.com/p/webp/issues"
/* Define to the full name of this package. */
#define PACKAGE_NAME "libwebp"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "libwebp 0.4.4"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libwebp"
/* Define to the home page for this package. */
#define PACKAGE_URL "http://developers.google.com/speed/webp"
/* Define to the version of this package. */
#define PACKAGE_VERSION "0.4.4"
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
/* #undef PTHREAD_CREATE_JOINABLE */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Version number of package */
#define VERSION "0.4.4"
/* Enable experimental code */
/* #undef WEBP_EXPERIMENTAL_FEATURES */
/* Define to 1 to force aligned memory operations */
/* #undef WEBP_FORCE_ALIGNED */
/* Set to 1 if AVX2 is supported */
#define WEBP_HAVE_AVX2 1
/* Set to 1 if GIF library is installed */
/* #undef WEBP_HAVE_GIF */
/* Set to 1 if OpenGL is supported */
#define WEBP_HAVE_GL 1
/* Set to 1 if JPEG library is installed */
/* #undef WEBP_HAVE_JPEG */
/* Set to 1 if PNG library is installed */
#define WEBP_HAVE_PNG 1
/* Set to 1 if SSE2 is supported */
#define WEBP_HAVE_SSE2 1
/* Set to 1 if SSE4.1 is supported */
#define WEBP_HAVE_SSE41 1
/* Set to 1 if TIFF library is installed */
/* #undef WEBP_HAVE_TIFF */
/* Undefine this to disable thread support. */
#define WEBP_USE_THREAD 1
/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
significant byte first (like Motorola and SPARC, unlike Intel). */
#if defined AC_APPLE_UNIVERSAL_BUILD
# if defined __BIG_ENDIAN__
# define WORDS_BIGENDIAN 1
# endif
#else
# ifndef WORDS_BIGENDIAN
/* # undef WORDS_BIGENDIAN */
# endif
#endif
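The HAVE_BUILTIN_BSWAP* probes above typically gate a compiler intrinsic behind a portable fallback. A minimal sketch of that pattern (editorial illustration assuming GCC/Clang; not part of this commit):

#include <stdint.h>

static uint32_t SwapBytes32(uint32_t x) {
#if defined(HAVE_BUILTIN_BSWAP32)
  return __builtin_bswap32(x);    // intrinsic detected by configure
#else                             // portable shift-based fallback
  return ((x >> 24) & 0x000000ffu) | ((x >>  8) & 0x0000ff00u) |
         ((x <<  8) & 0x00ff0000u) | ((x << 24) & 0xff000000u);
#endif
}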

55
drivers/webp/dec/alphai.h Normal file

@@ -0,0 +1,55 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha decoder: internal header.
//
// Author: Urvang (urvang@google.com)
#ifndef WEBP_DEC_ALPHAI_H_
#define WEBP_DEC_ALPHAI_H_
#include "./webpi.h"
#include "../utils/filters.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP8LDecoder; // Defined in dec/vp8li.h.
typedef struct ALPHDecoder ALPHDecoder;
struct ALPHDecoder {
int width_;
int height_;
int method_;
WEBP_FILTER_TYPE filter_;
int pre_processing_;
struct VP8LDecoder* vp8l_dec_;
VP8Io io_;
int use_8b_decode; // Although alpha channel requires only 1 byte per
// pixel, sometimes VP8LDecoder may need to allocate
// 4 bytes per pixel internally during decode.
};
//------------------------------------------------------------------------------
// internal functions. Not public.
// Allocates a new alpha decoder instance.
ALPHDecoder* ALPHNew(void);
// Clears and deallocates an alpha decoder instance.
void ALPHDelete(ALPHDecoder* const dec);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_DEC_ALPHAI_H_ */
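A hedged sketch of the allocate/use/free lifecycle these declarations imply (the decode entry points live elsewhere in dec/ and are not shown in this commit; 'plane_width'/'plane_height' are placeholder inputs of mine):

#include "alphai.h"

static int AlphaLifecycleSketch(int plane_width, int plane_height) {
  ALPHDecoder* const alph = ALPHNew();   // calloc'd instance, or NULL on OOM
  if (alph == NULL) return 0;
  alph->width_ = plane_width;            // dimensions of the alpha plane
  alph->height_ = plane_height;
  // ... feed the ALPH chunk payload to the internal alpha routines ...
  ALPHDelete(alph);                      // clears and deallocates
  return 1;
}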

54
drivers/webp/dec/common.h Normal file

@@ -0,0 +1,54 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Definitions and macros common to encoding and decoding
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_COMMON_H_
#define WEBP_DEC_COMMON_H_
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
B_TM_PRED = 1,
B_VE_PRED = 2,
B_HE_PRED = 3,
B_RD_PRED = 4,
B_VR_PRED = 5,
B_LD_PRED = 6,
B_VL_PRED = 7,
B_HD_PRED = 8,
B_HU_PRED = 9,
NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED, // = 10
// Luma16 or UV modes
DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
B_PRED = NUM_BMODES, // refined I4x4 mode
NUM_PRED_MODES = 4,
// special modes
B_DC_PRED_NOTOP = 4,
B_DC_PRED_NOLEFT = 5,
B_DC_PRED_NOTOPLEFT = 6,
NUM_B_DC_MODES = 7 };
enum { MB_FEATURE_TREE_PROBS = 3,
NUM_MB_SEGMENTS = 4,
NUM_REF_LF_DELTAS = 4,
NUM_MODE_LF_DELTAS = 4, // I4x4, ZERO, *, SPLIT
MAX_NUM_PARTITIONS = 8,
// Probabilities
NUM_TYPES = 4, // 0: i16-AC, 1: i16-DC, 2:chroma-AC, 3:i4-AC
NUM_BANDS = 8,
NUM_CTX = 3,
NUM_PROBAS = 11
};
#endif // WEBP_DEC_COMMON_H_
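The probability-related constants at the end of this header are what dimension the VP8 coefficient probability tables. A sketch of the shape they imply (the struct name is hypothetical, not the library's):

#include <stdint.h>

typedef struct {
  // 4 types x 8 bands x 3 contexts x 11 probabilities = 1056 bytes
  uint8_t probas[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
} CoeffProbasSketch;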

364
drivers/webp/demux.h Normal file

@@ -0,0 +1,364 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Demux API.
// Enables extraction of image and extended format data from WebP files.
// Code Example: Demuxing WebP data to extract all the frames, ICC profile
// and EXIF/XMP metadata.
/*
WebPDemuxer* demux = WebPDemux(&webp_data);
uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
// ... (Get information about the features present in the WebP file).
uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
// ... (Iterate over all frames).
WebPIterator iter;
if (WebPDemuxGetFrame(demux, 1, &iter)) {
do {
// ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
// ... and get other frame properties like width, height, offsets etc.
// ... see 'struct WebPIterator' below for more info).
} while (WebPDemuxNextFrame(&iter));
WebPDemuxReleaseIterator(&iter);
}
// ... (Extract metadata).
WebPChunkIterator chunk_iter;
if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
// ... (Consume the ICC profile in 'chunk_iter.chunk').
WebPDemuxReleaseChunkIterator(&chunk_iter);
if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
// ... (Consume the EXIF metadata in 'chunk_iter.chunk').
WebPDemuxReleaseChunkIterator(&chunk_iter);
if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
// ... (Consume the XMP metadata in 'chunk_iter.chunk').
WebPDemuxReleaseChunkIterator(&chunk_iter);
WebPDemuxDelete(demux);
*/
#ifndef WEBP_WEBP_DEMUX_H_
#define WEBP_WEBP_DEMUX_H_
#include "./decode.h" // for WEBP_CSP_MODE
#include "./mux_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define WEBP_DEMUX_ABI_VERSION 0x0105 // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
// typedef enum WebPDemuxState WebPDemuxState;
// typedef enum WebPFormatFeature WebPFormatFeature;
typedef struct WebPDemuxer WebPDemuxer;
typedef struct WebPIterator WebPIterator;
typedef struct WebPChunkIterator WebPChunkIterator;
typedef struct WebPAnimInfo WebPAnimInfo;
typedef struct WebPAnimDecoderOptions WebPAnimDecoderOptions;
//------------------------------------------------------------------------------
// Returns the version number of the demux library, packed in hexadecimal using
// 8 bits for each of major/minor/revision. E.g., v2.5.7 is 0x020507.
WEBP_EXTERN(int) WebPGetDemuxVersion(void);
//------------------------------------------------------------------------------
// Life of a Demux object
typedef enum WebPDemuxState {
WEBP_DEMUX_PARSE_ERROR = -1, // An error occurred while parsing.
WEBP_DEMUX_PARSING_HEADER = 0, // Not enough data to parse full header.
WEBP_DEMUX_PARSED_HEADER = 1, // Header parsing complete,
// data may be available.
WEBP_DEMUX_DONE = 2 // Entire file has been parsed.
} WebPDemuxState;
// Internal, version-checked, entry point.
WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal(
const WebPData*, int, WebPDemuxState*, int);
// Parses the full WebP file given by 'data'.
// Returns a WebPDemuxer object on successful parse, NULL otherwise.
static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
return WebPDemuxInternal(data, 0, NULL, WEBP_DEMUX_ABI_VERSION);
}
// Parses the possibly incomplete WebP file given by 'data'.
// If 'state' is non-NULL it will be set to indicate the status of the demuxer.
// Returns NULL in case of error or if there isn't enough data to start parsing;
// and a WebPDemuxer object on successful parse.
// Note that WebPDemuxer keeps internal pointers to 'data' memory segment.
// If this data is volatile, the demuxer object should be deleted (by calling
// WebPDemuxDelete()) and WebPDemuxPartial() called again on the new data.
// This is usually an inexpensive operation.
static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
const WebPData* data, WebPDemuxState* state) {
return WebPDemuxInternal(data, 1, state, WEBP_DEMUX_ABI_VERSION);
}
// Frees memory associated with 'dmux'.
WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux);
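A hedged sketch of the re-demux pattern the note above describes, for data arriving incrementally ('AppendNewData' is a hypothetical I/O callback of mine, not part of this API; a fuller version would also bound retries when NULL signals a real parse error rather than missing data):

static WebPDemuxer* DemuxWhenReady(WebPData* const data) {
  WebPDemuxState state;
  WebPDemuxer* demux = WebPDemuxPartial(data, &state);
  while (demux == NULL || state == WEBP_DEMUX_PARSING_HEADER) {
    if (demux != NULL) WebPDemuxDelete(demux);  // cheap, per the note above
    if (!AppendNewData(data)) return NULL;      // hypothetical: grow 'data'
    demux = WebPDemuxPartial(data, &state);
  }
  return demux;  // state is WEBP_DEMUX_PARSED_HEADER or WEBP_DEMUX_DONE
}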
//------------------------------------------------------------------------------
// Data/information extraction.
typedef enum WebPFormatFeature {
WEBP_FF_FORMAT_FLAGS, // Extended format flags present in the 'VP8X' chunk.
WEBP_FF_CANVAS_WIDTH,
WEBP_FF_CANVAS_HEIGHT,
WEBP_FF_LOOP_COUNT,
WEBP_FF_BACKGROUND_COLOR,
WEBP_FF_FRAME_COUNT // Number of frames present in the demux object.
// In case of a partial demux, this is the number of
// frames seen so far, with the last frame possibly
// being partial.
} WebPFormatFeature;
// Get the 'feature' value from the 'dmux'.
// NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial()
// returned a state > WEBP_DEMUX_PARSING_HEADER.
WEBP_EXTERN(uint32_t) WebPDemuxGetI(
const WebPDemuxer* dmux, WebPFormatFeature feature);
//------------------------------------------------------------------------------
// Frame iteration.
struct WebPIterator {
int frame_num;
int num_frames; // equivalent to WEBP_FF_FRAME_COUNT.
int fragment_num;
int num_fragments;
int x_offset, y_offset; // offset relative to the canvas.
int width, height; // dimensions of this frame or fragment.
int duration; // display duration in milliseconds.
WebPMuxAnimDispose dispose_method; // dispose method for the frame.
int complete; // true if 'fragment' contains a full frame. partial images
// may still be decoded with the WebP incremental decoder.
WebPData fragment; // The frame or fragment given by 'frame_num' and
// 'fragment_num'.
int has_alpha; // True if the frame or fragment contains transparency.
WebPMuxAnimBlend blend_method; // Blend operation for the frame.
uint32_t pad[2]; // padding for later use.
void* private_; // for internal use only.
};
// Retrieves frame 'frame_number' from 'dmux'.
// 'iter->fragment' points to the first fragment on return from this function.
// Individual fragments may be extracted using WebPDemuxSelectFragment().
// Setting 'frame_number' equal to 0 will return the last frame of the image.
// Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
// Call WebPDemuxReleaseIterator() when use of the iterator is complete.
// NOTE: 'dmux' must persist for the lifetime of 'iter'.
WEBP_EXTERN(int) WebPDemuxGetFrame(
const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
// Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or
// previous ('iter->frame_num' - 1) frame. These functions do not loop.
// Returns true on success, false otherwise.
WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter);
WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter);
// Sets 'iter->fragment' to reflect fragment number 'fragment_num'.
// Returns true if fragment 'fragment_num' is present, false otherwise.
WEBP_EXTERN(int) WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num);
// Releases any memory associated with 'iter'.
// Must be called before any subsequent calls to WebPDemuxGetChunk() on the same
// iter. Also, must be called before destroying the associated WebPDemuxer with
// WebPDemuxDelete().
WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter);
//------------------------------------------------------------------------------
// Chunk iteration.
struct WebPChunkIterator {
// The current and total number of chunks with the fourcc given to
// WebPDemuxGetChunk().
int chunk_num;
int num_chunks;
WebPData chunk; // The payload of the chunk.
uint32_t pad[6]; // padding for later use
void* private_;
};
// Retrieves the 'chunk_number' instance of the chunk with id 'fourcc' from
// 'dmux'.
// 'fourcc' is a character array containing the fourcc of the chunk to return,
// e.g., "ICCP", "XMP ", "EXIF", etc.
// Setting 'chunk_number' equal to 0 will return the last chunk in a set.
// Returns true if the chunk is found, false otherwise. Image related chunk
// payloads are accessed through WebPDemuxGetFrame() and related functions.
// Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
// NOTE: 'dmux' must persist for the lifetime of the iterator.
WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux,
const char fourcc[4], int chunk_number,
WebPChunkIterator* iter);
// Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous
// ('iter->chunk_num' - 1) chunk. These functions do not loop.
// Returns true on success, false otherwise.
WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter);
WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter);
// Releases any memory associated with 'iter'.
// Must be called before destroying the associated WebPDemuxer with
// WebPDemuxDelete().
WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
//------------------------------------------------------------------------------
// WebPAnimDecoder API
//
// This API allows decoding (possibly) animated WebP images.
//
// Code Example:
/*
WebPAnimDecoderOptions dec_options;
WebPAnimDecoderOptionsInit(&dec_options);
// Tune 'dec_options' as needed.
WebPAnimDecoder* dec = WebPAnimDecoderNew(webp_data, &dec_options);
WebPAnimInfo anim_info;
WebPAnimDecoderGetInfo(dec, &anim_info);
for (uint32_t i = 0; i < anim_info.loop_count; ++i) {
while (WebPAnimDecoderHasMoreFrames(dec)) {
uint8_t* buf;
int timestamp;
WebPAnimDecoderGetNext(dec, &buf, &timestamp);
// ... (Render 'buf' based on 'timestamp').
// ... (Do NOT free 'buf', as it is owned by 'dec').
}
WebPAnimDecoderReset(dec);
}
const WebPDemuxer* demuxer = WebPAnimDecoderGetDemuxer(dec);
// ... (Do something using 'demuxer'; e.g. get EXIF/XMP/ICC data).
WebPAnimDecoderDelete(dec);
*/
typedef struct WebPAnimDecoder WebPAnimDecoder; // Main opaque object.
// Global options.
struct WebPAnimDecoderOptions {
// Output colorspace. Only the following modes are supported:
// MODE_RGBA, MODE_BGRA, MODE_rgbA and MODE_bgrA.
WEBP_CSP_MODE color_mode;
int use_threads; // If true, use multi-threaded decoding.
uint32_t padding[7]; // Padding for later use.
};
// Internal, version-checked, entry point.
WEBP_EXTERN(int) WebPAnimDecoderOptionsInitInternal(
WebPAnimDecoderOptions*, int);
// Should always be called, to initialize a fresh WebPAnimDecoderOptions
// structure before modification. Returns false in case of version mismatch.
// WebPAnimDecoderOptionsInit() must have succeeded before using the
// 'dec_options' object.
static WEBP_INLINE int WebPAnimDecoderOptionsInit(
WebPAnimDecoderOptions* dec_options) {
return WebPAnimDecoderOptionsInitInternal(dec_options,
WEBP_DEMUX_ABI_VERSION);
}
// Internal, version-checked, entry point.
WEBP_EXTERN(WebPAnimDecoder*) WebPAnimDecoderNewInternal(
const WebPData*, const WebPAnimDecoderOptions*, int);
// Creates and initializes a WebPAnimDecoder object.
// Parameters:
// webp_data - (in) WebP bitstream. This should remain unchanged during the
// lifetime of the output WebPAnimDecoder object.
// dec_options - (in) decoding options. Can be passed NULL to choose
// reasonable defaults (in particular, color mode MODE_RGBA
// will be picked).
// Returns:
// A pointer to the newly created WebPAnimDecoder object, or NULL in case of
// parsing error, invalid option or memory error.
static WEBP_INLINE WebPAnimDecoder* WebPAnimDecoderNew(
const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options) {
return WebPAnimDecoderNewInternal(webp_data, dec_options,
WEBP_DEMUX_ABI_VERSION);
}
// Global information about the animation.
struct WebPAnimInfo {
uint32_t canvas_width;
uint32_t canvas_height;
uint32_t loop_count;
uint32_t bgcolor;
uint32_t frame_count;
uint32_t pad[4]; // padding for later use
};
// Get global information about the animation.
// Parameters:
// dec - (in) decoder instance to get information from.
// info - (out) global information fetched from the animation.
// Returns:
// True on success.
WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
WebPAnimInfo* info);
// Fetch the next frame from 'dec' based on options supplied to
// WebPAnimDecoderNew(). This will be a fully reconstructed canvas of size
// 'canvas_width * 4 * canvas_height', and not just the frame sub-rectangle. The
// returned buffer 'buf' is valid only until the next call to
// WebPAnimDecoderGetNext(), WebPAnimDecoderReset() or WebPAnimDecoderDelete().
// Parameters:
// dec - (in/out) decoder instance from which the next frame is to be fetched.
// buf - (out) decoded frame.
// timestamp - (out) timestamp of the frame in milliseconds.
// Returns:
// False if any of the arguments are NULL, or if there is a parsing or
// decoding error, or if there are no more frames. Otherwise, returns true.
WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
uint8_t** buf, int* timestamp);
// Check if there are more frames left to decode.
// Parameters:
// dec - (in) decoder instance to be checked.
// Returns:
// True if 'dec' is not NULL and some frames are yet to be decoded.
// Otherwise, returns false.
WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
// Resets the WebPAnimDecoder object, so that next call to
// WebPAnimDecoderGetNext() will restart decoding from 1st frame. This would be
// helpful when all frames need to be decoded multiple times (e.g.
// info.loop_count times) without destroying and recreating the 'dec' object.
// Parameters:
// dec - (in/out) decoder instance to be reset
WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec);
// Grab the internal demuxer object.
// Getting the demuxer object can be useful if one wants to use operations only
// available through demuxer; e.g. to get XMP/EXIF/ICC metadata. The returned
// demuxer object is owned by 'dec' and is valid only until the next call to
// WebPAnimDecoderDelete().
//
// Parameters:
// dec - (in) decoder instance from which the demuxer object is to be fetched.
WEBP_EXTERN(const WebPDemuxer*) WebPAnimDecoderGetDemuxer(
const WebPAnimDecoder* dec);
// Deletes the WebPAnimDecoder object.
// Parameters:
// dec - (in/out) decoder instance to be deleted
WEBP_EXTERN(void) WebPAnimDecoderDelete(WebPAnimDecoder* dec);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_WEBP_DEMUX_H_ */
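Fleshing out the header's own demuxing example into a self-contained program (the file loading and error handling here are mine, not part of the commit):

#include <stdio.h>
#include <stdlib.h>
#include "webp/demux.h"

int main(int argc, char* argv[]) {
  uint8_t* bytes;
  WebPData webp_data;
  WebPDemuxer* demux;
  WebPIterator iter;
  size_t size;
  FILE* f;

  if (argc < 2 || (f = fopen(argv[1], "rb")) == NULL) return 1;
  fseek(f, 0, SEEK_END);
  size = (size_t)ftell(f);
  fseek(f, 0, SEEK_SET);
  bytes = (uint8_t*)malloc(size);
  if (bytes == NULL || fread(bytes, 1, size, f) != size) return 1;
  fclose(f);

  webp_data.bytes = bytes;
  webp_data.size = size;
  demux = WebPDemux(&webp_data);              // parse the full container
  if (demux == NULL) return 1;

  printf("canvas %ux%u, %u frame(s)\n",
         WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH),
         WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT),
         WebPDemuxGetI(demux, WEBP_FF_FRAME_COUNT));

  if (WebPDemuxGetFrame(demux, 1, &iter)) {   // frame numbers start at 1
    do {
      printf("frame %d: %dx%d at (%d,%d), %d ms\n", iter.frame_num,
             iter.width, iter.height, iter.x_offset, iter.y_offset,
             iter.duration);
    } while (WebPDemuxNextFrame(&iter));
    WebPDemuxReleaseIterator(&iter);          // before WebPDemuxDelete()
  }
  WebPDemuxDelete(demux);
  free(bytes);
  return 0;
}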

442
drivers/webp/demux/anim_decode.c Normal file

@@ -0,0 +1,442 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AnimDecoder implementation.
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#endif
#include <assert.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h"
#include "../webp/demux.h"
#define NUM_CHANNELS 4
typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
static void BlendPixelRowNonPremult(uint32_t* const src,
const uint32_t* const dst, int num_pixels);
static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
int num_pixels);
struct WebPAnimDecoder {
WebPDemuxer* demux_; // Demuxer created from given WebP bitstream.
WebPDecoderConfig config_; // Decoder config.
// Note: we use a pointer to a function blending multiple pixels at a time to
// allow possible inlining of per-pixel blending function.
BlendRowFunc blend_func_; // Pointer to the chosen blend row function.
WebPAnimInfo info_; // Global info about the animation.
uint8_t* curr_frame_; // Current canvas (not disposed).
uint8_t* prev_frame_disposed_; // Previous canvas (properly disposed).
int prev_frame_timestamp_; // Previous frame timestamp (milliseconds).
WebPIterator prev_iter_; // Iterator object for previous frame.
int prev_frame_was_keyframe_; // True if previous frame was a keyframe.
int next_frame_; // Index of the next frame to be decoded
// (starting from 1).
};
static void DefaultDecoderOptions(WebPAnimDecoderOptions* const dec_options) {
dec_options->color_mode = MODE_RGBA;
dec_options->use_threads = 0;
}
int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
int abi_version) {
if (dec_options == NULL ||
WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
return 0;
}
DefaultDecoderOptions(dec_options);
return 1;
}
static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
WebPAnimDecoder* const dec) {
WEBP_CSP_MODE mode;
WebPDecoderConfig* config = &dec->config_;
assert(dec_options != NULL);
mode = dec_options->color_mode;
if (mode != MODE_RGBA && mode != MODE_BGRA &&
mode != MODE_rgbA && mode != MODE_bgrA) {
return 0;
}
dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
? &BlendPixelRowNonPremult
: &BlendPixelRowPremult;
WebPInitDecoderConfig(config);
config->output.colorspace = mode;
config->output.is_external_memory = 1;
config->options.use_threads = dec_options->use_threads;
// Note: config->output.u.RGBA is set at the time of decoding each frame.
return 1;
}
WebPAnimDecoder* WebPAnimDecoderNewInternal(
const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options,
int abi_version) {
WebPAnimDecoderOptions options;
WebPAnimDecoder* dec = NULL;
if (webp_data == NULL ||
WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
return NULL;
}
// Note: calloc() so that the pointer members are initialized to NULL.
dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
if (dec == NULL) goto Error;
if (dec_options != NULL) {
options = *dec_options;
} else {
DefaultDecoderOptions(&options);
}
if (!ApplyDecoderOptions(&options, dec)) goto Error;
dec->demux_ = WebPDemux(webp_data);
if (dec->demux_ == NULL) goto Error;
dec->info_.canvas_width = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_WIDTH);
dec->info_.canvas_height = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_HEIGHT);
dec->info_.loop_count = WebPDemuxGetI(dec->demux_, WEBP_FF_LOOP_COUNT);
dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
{
const int canvas_bytes =
dec->info_.canvas_width * NUM_CHANNELS * dec->info_.canvas_height;
// Note: calloc() because we fill frame with zeroes as well.
dec->curr_frame_ = WebPSafeCalloc(1ULL, canvas_bytes);
if (dec->curr_frame_ == NULL) goto Error;
dec->prev_frame_disposed_ = WebPSafeCalloc(1ULL, canvas_bytes);
if (dec->prev_frame_disposed_ == NULL) goto Error;
}
WebPAnimDecoderReset(dec);
return dec;
Error:
WebPAnimDecoderDelete(dec);
return NULL;
}
int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, WebPAnimInfo* info) {
if (dec == NULL || info == NULL) return 0;
*info = dec->info_;
return 1;
}
// Returns true if the frame covers the full canvas.
static int IsFullFrame(int width, int height, int canvas_width,
int canvas_height) {
return (width == canvas_width && height == canvas_height);
}
// Clear the canvas to transparent.
static void ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
uint32_t canvas_height) {
memset(buf, 0, canvas_width * NUM_CHANNELS * canvas_height);
}
// Clear given frame rectangle to transparent.
static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
int y_offset, int width, int height) {
int j;
assert(width * NUM_CHANNELS <= buf_stride);
buf += y_offset * buf_stride + x_offset * NUM_CHANNELS;
for (j = 0; j < height; ++j) {
memset(buf, 0, width * NUM_CHANNELS);
buf += buf_stride;
}
}
// Copy width * height pixels from 'src' to 'dst'.
static void CopyCanvas(const uint8_t* src, uint8_t* dst,
uint32_t width, uint32_t height) {
assert(src != NULL && dst != NULL);
memcpy(dst, src, width * NUM_CHANNELS * height);
}
// Returns true if the current frame is a key-frame.
static int IsKeyFrame(const WebPIterator* const curr,
const WebPIterator* const prev,
int prev_frame_was_key_frame,
int canvas_width, int canvas_height) {
if (curr->frame_num == 1) {
return 1;
} else if ((!curr->has_alpha || curr->blend_method == WEBP_MUX_NO_BLEND) &&
IsFullFrame(curr->width, curr->height,
canvas_width, canvas_height)) {
return 1;
} else {
return (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) &&
(IsFullFrame(prev->width, prev->height, canvas_width,
canvas_height) ||
prev_frame_was_key_frame);
}
}
// Blend a single channel of 'src' over 'dst', given their alpha channel values.
// 'src' and 'dst' are assumed to be NOT pre-multiplied by alpha.
static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
uint32_t dst, uint8_t dst_a,
uint32_t scale, int shift) {
const uint8_t src_channel = (src >> shift) & 0xff;
const uint8_t dst_channel = (dst >> shift) & 0xff;
const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
assert(blend_unscaled < (1ULL << 32) / scale);
return (blend_unscaled * scale) >> 24;
}
// Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
const uint8_t src_a = (src >> 24) & 0xff;
if (src_a == 0) {
return dst;
} else {
const uint8_t dst_a = (dst >> 24) & 0xff;
// This is the approximate integer arithmetic for the actual formula:
// dst_factor_a = (dst_a * (255 - src_a)) / 255.
const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
const uint8_t blend_a = src_a + dst_factor_a;
const uint32_t scale = (1UL << 24) / blend_a;
const uint8_t blend_r =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
const uint8_t blend_g =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
const uint8_t blend_b =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
assert(src_a + dst_factor_a < 256);
return (blend_r << 0) |
(blend_g << 8) |
(blend_b << 16) |
((uint32_t)blend_a << 24);
}
}
// Blend 'num_pixels' in 'src' over 'dst' assuming they are NOT pre-multiplied
// by alpha.
static void BlendPixelRowNonPremult(uint32_t* const src,
const uint32_t* const dst, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint8_t src_alpha = (src[i] >> 24) & 0xff;
if (src_alpha != 0xff) {
src[i] = BlendPixelNonPremult(src[i], dst[i]);
}
}
}
// Individually multiply each channel in 'pix' by 'scale'.
static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
uint32_t mask = 0x00FF00FF;
uint32_t rb = ((pix & mask) * scale) >> 8;
uint32_t ag = ((pix >> 8) & mask) * scale;
return (rb & mask) | (ag & ~mask);
}
// Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
const uint8_t src_a = (src >> 24) & 0xff;
return src + ChannelwiseMultiply(dst, 256 - src_a);
}
// Blend 'num_pixels' in 'src' over 'dst' assuming they are pre-multiplied by
// alpha.
static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint8_t src_alpha = (src[i] >> 24) & 0xff;
if (src_alpha != 0xff) {
src[i] = BlendPixelPremult(src[i], dst[i]);
}
}
}
// Returns two ranges (<left, width> pairs) at row 'canvas_y', that belong to
// 'src' but not 'dst'. A point range is empty if the corresponding width is 0.
static void FindBlendRangeAtRow(const WebPIterator* const src,
const WebPIterator* const dst, int canvas_y,
int* const left1, int* const width1,
int* const left2, int* const width2) {
const int src_max_x = src->x_offset + src->width;
const int dst_max_x = dst->x_offset + dst->width;
const int dst_max_y = dst->y_offset + dst->height;
assert(canvas_y >= src->y_offset && canvas_y < (src->y_offset + src->height));
*left1 = -1;
*width1 = 0;
*left2 = -1;
*width2 = 0;
if (canvas_y < dst->y_offset || canvas_y >= dst_max_y ||
src->x_offset >= dst_max_x || src_max_x <= dst->x_offset) {
*left1 = src->x_offset;
*width1 = src->width;
return;
}
if (src->x_offset < dst->x_offset) {
*left1 = src->x_offset;
*width1 = dst->x_offset - src->x_offset;
}
if (src_max_x > dst_max_x) {
*left2 = dst_max_x;
*width2 = src_max_x - dst_max_x;
}
}
int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
uint8_t** buf_ptr, int* timestamp_ptr) {
WebPIterator iter;
uint32_t width;
uint32_t height;
int is_key_frame;
int timestamp;
BlendRowFunc blend_row;
if (dec == NULL || buf_ptr == NULL || timestamp_ptr == NULL) return 0;
if (!WebPAnimDecoderHasMoreFrames(dec)) return 0;
width = dec->info_.canvas_width;
height = dec->info_.canvas_height;
blend_row = dec->blend_func_;
// Get compressed frame.
if (!WebPDemuxGetFrame(dec->demux_, dec->next_frame_, &iter)) {
return 0;
}
timestamp = dec->prev_frame_timestamp_ + iter.duration;
// Initialize.
is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
dec->prev_frame_was_keyframe_, width, height);
if (is_key_frame) {
ZeroFillCanvas(dec->curr_frame_, width, height);
} else {
CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_, width, height);
}
// Decode.
{
const uint8_t* in = iter.fragment.bytes;
const size_t in_size = iter.fragment.size;
const size_t out_offset =
(iter.y_offset * width + iter.x_offset) * NUM_CHANNELS;
WebPDecoderConfig* const config = &dec->config_;
WebPRGBABuffer* const buf = &config->output.u.RGBA;
buf->stride = NUM_CHANNELS * width;
buf->size = buf->stride * iter.height;
buf->rgba = dec->curr_frame_ + out_offset;
if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {
goto Error;
}
}
// During the decoding of the current frame, we may have set some pixels to be
// transparent (i.e. alpha < 255). However, the value of each of these
// pixels should have been determined by blending it against the value of
// that pixel in the previous frame if the blend method is WEBP_MUX_BLEND.
if (iter.frame_num > 1 && iter.blend_method == WEBP_MUX_BLEND &&
!is_key_frame) {
if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_NONE) {
int y;
// Blend transparent pixels with pixels in previous canvas.
for (y = 0; y < iter.height; ++y) {
const size_t offset =
(iter.y_offset + y) * width + iter.x_offset;
blend_row((uint32_t*)dec->curr_frame_ + offset,
(uint32_t*)dec->prev_frame_disposed_ + offset, iter.width);
}
} else {
int y;
assert(dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND);
// We need to blend a transparent pixel with its value just after
// initialization. That is, blend it with:
// * Fully transparent pixel if it belongs to prevRect <-- No-op.
// * The pixel in the previous canvas otherwise <-- Need alpha-blending.
for (y = 0; y < iter.height; ++y) {
const int canvas_y = iter.y_offset + y;
int left1, width1, left2, width2;
FindBlendRangeAtRow(&iter, &dec->prev_iter_, canvas_y, &left1, &width1,
&left2, &width2);
if (width1 > 0) {
const size_t offset1 = canvas_y * width + left1;
blend_row((uint32_t*)dec->curr_frame_ + offset1,
(uint32_t*)dec->prev_frame_disposed_ + offset1, width1);
}
if (width2 > 0) {
const size_t offset2 = canvas_y * width + left2;
blend_row((uint32_t*)dec->curr_frame_ + offset2,
(uint32_t*)dec->prev_frame_disposed_ + offset2, width2);
}
}
}
}
// Update info of the previous frame and dispose it for the next iteration.
dec->prev_frame_timestamp_ = timestamp;
dec->prev_iter_ = iter;
dec->prev_frame_was_keyframe_ = is_key_frame;
CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
dec->prev_iter_.width, dec->prev_iter_.height);
}
++dec->next_frame_;
// All OK, fill in the values.
*buf_ptr = dec->curr_frame_;
*timestamp_ptr = timestamp;
return 1;
Error:
WebPDemuxReleaseIterator(&iter);
return 0;
}
int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
if (dec == NULL) return 0;
return (dec->next_frame_ <= (int)dec->info_.frame_count);
}
void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
if (dec != NULL) {
dec->prev_frame_timestamp_ = 0;
memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
dec->prev_frame_was_keyframe_ = 0;
dec->next_frame_ = 1;
}
}
const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
if (dec == NULL) return NULL;
return dec->demux_;
}
void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
if (dec != NULL) {
WebPDemuxDelete(dec->demux_);
WebPSafeFree(dec->curr_frame_);
WebPSafeFree(dec->prev_frame_disposed_);
WebPSafeFree(dec);
}
}
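BlendPixelNonPremult() above replaces the exact dst_factor_a = dst_a * (255 - src_a) / 255 with a shift-based approximation, (dst_a * (256 - src_a)) >> 8, which avoids a division and is off by at most one 8-bit step. A standalone check of one input pair (illustration only, not part of the commit):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t src_a = 128, dst_a = 200;
  const uint8_t exact  = (uint8_t)((dst_a * (255 - src_a)) / 255);  // 99
  const uint8_t approx = (uint8_t)((dst_a * (256 - src_a)) >> 8);   // 100
  printf("exact=%u approx=%u\n", exact, approx);
  return 0;
}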

957
drivers/webp/demux/demux.c Normal file

@@ -0,0 +1,957 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebP container demux.
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#endif
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h" // WebPGetFeatures
#include "../webp/demux.h"
#include "../webp/format_constants.h"
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 2
#define DMUX_REV_VERSION 2
typedef struct {
size_t start_; // start location of the data
size_t end_; // end location
size_t riff_end_; // riff chunk end location, can be > end_.
size_t buf_size_; // size of the buffer
const uint8_t* buf_;
} MemBuffer;
typedef struct {
size_t offset_;
size_t size_;
} ChunkData;
typedef struct Frame {
int x_offset_, y_offset_;
int width_, height_;
int has_alpha_;
int duration_;
WebPMuxAnimDispose dispose_method_;
WebPMuxAnimBlend blend_method_;
int is_fragment_; // this is a frame fragment (and not a full frame).
int frame_num_; // the referent frame number for use in assembling fragments.
int complete_; // img_components_ contains a full image.
ChunkData img_components_[2]; // 0=VP8{,L} 1=ALPH
struct Frame* next_;
} Frame;
typedef struct Chunk {
ChunkData data_;
struct Chunk* next_;
} Chunk;
struct WebPDemuxer {
MemBuffer mem_;
WebPDemuxState state_;
int is_ext_format_;
uint32_t feature_flags_;
int canvas_width_, canvas_height_;
int loop_count_;
uint32_t bgcolor_;
int num_frames_;
Frame* frames_;
Frame** frames_tail_;
Chunk* chunks_; // non-image chunks
Chunk** chunks_tail_;
};
typedef enum {
PARSE_OK,
PARSE_NEED_MORE_DATA,
PARSE_ERROR
} ParseStatus;
typedef struct ChunkParser {
uint8_t id[4];
ParseStatus (*parse)(WebPDemuxer* const dmux);
int (*valid)(const WebPDemuxer* const dmux);
} ChunkParser;
static ParseStatus ParseSingleImage(WebPDemuxer* const dmux);
static ParseStatus ParseVP8X(WebPDemuxer* const dmux);
static int IsValidSimpleFormat(const WebPDemuxer* const dmux);
static int IsValidExtendedFormat(const WebPDemuxer* const dmux);
static const ChunkParser kMasterChunks[] = {
{ { 'V', 'P', '8', ' ' }, ParseSingleImage, IsValidSimpleFormat },
{ { 'V', 'P', '8', 'L' }, ParseSingleImage, IsValidSimpleFormat },
{ { 'V', 'P', '8', 'X' }, ParseVP8X, IsValidExtendedFormat },
{ { '0', '0', '0', '0' }, NULL, NULL },
};
//------------------------------------------------------------------------------
int WebPGetDemuxVersion(void) {
return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
}
// -----------------------------------------------------------------------------
// MemBuffer
static int RemapMemBuffer(MemBuffer* const mem,
const uint8_t* data, size_t size) {
if (size < mem->buf_size_) return 0; // can't remap to a shorter buffer!
mem->buf_ = data;
mem->end_ = mem->buf_size_ = size;
return 1;
}
static int InitMemBuffer(MemBuffer* const mem,
const uint8_t* data, size_t size) {
memset(mem, 0, sizeof(*mem));
return RemapMemBuffer(mem, data, size);
}
// Return the remaining data size available in 'mem'.
static WEBP_INLINE size_t MemDataSize(const MemBuffer* const mem) {
return (mem->end_ - mem->start_);
}
// Return true if 'size' exceeds the end of the RIFF chunk.
static WEBP_INLINE int SizeIsInvalid(const MemBuffer* const mem, size_t size) {
return (size > mem->riff_end_ - mem->start_);
}
static WEBP_INLINE void Skip(MemBuffer* const mem, size_t size) {
mem->start_ += size;
}
static WEBP_INLINE void Rewind(MemBuffer* const mem, size_t size) {
mem->start_ -= size;
}
static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
return mem->buf_ + mem->start_;
}
// Read from 'mem' and skip the read bytes.
static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
const uint8_t byte = mem->buf_[mem->start_];
Skip(mem, 1);
return byte;
}
static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const int val = GetLE16(data);
Skip(mem, 2);
return val;
}
static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const int val = GetLE24(data);
Skip(mem, 3);
return val;
}
static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const uint32_t val = GetLE32(data);
Skip(mem, 4);
return val;
}
// -----------------------------------------------------------------------------
// Secondary chunk parsing
static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
*dmux->chunks_tail_ = chunk;
chunk->next_ = NULL;
dmux->chunks_tail_ = &chunk->next_;
}
// Add a frame to the end of the list, ensuring the last frame is complete.
// Returns true on success, false otherwise.
static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
const Frame* const last_frame = *dmux->frames_tail_;
if (last_frame != NULL && !last_frame->complete_) return 0;
*dmux->frames_tail_ = frame;
frame->next_ = NULL;
dmux->frames_tail_ = &frame->next_;
return 1;
}
// Store image bearing chunks to 'frame'.
static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
MemBuffer* const mem, Frame* const frame) {
int alpha_chunks = 0;
int image_chunks = 0;
int done = (MemDataSize(mem) < min_size);
ParseStatus status = PARSE_OK;
if (done) return PARSE_NEED_MORE_DATA;
do {
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t payload_size = ReadLE32(mem);
const uint32_t payload_size_padded = payload_size + (payload_size & 1);
const size_t payload_available = (payload_size_padded > MemDataSize(mem))
? MemDataSize(mem) : payload_size_padded;
const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
switch (fourcc) {
case MKFOURCC('A', 'L', 'P', 'H'):
if (alpha_chunks == 0) {
++alpha_chunks;
frame->img_components_[1].offset_ = chunk_start_offset;
frame->img_components_[1].size_ = chunk_size;
frame->has_alpha_ = 1;
frame->frame_num_ = frame_num;
Skip(mem, payload_available);
} else {
goto Done;
}
break;
case MKFOURCC('V', 'P', '8', 'L'):
if (alpha_chunks > 0) return PARSE_ERROR; // VP8L has its own alpha
// fall through
case MKFOURCC('V', 'P', '8', ' '):
if (image_chunks == 0) {
// Extract the bitstream features, tolerating failures when the data
// is incomplete.
WebPBitstreamFeatures features;
const VP8StatusCode vp8_status =
WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
&features);
if (status == PARSE_NEED_MORE_DATA &&
vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
return PARSE_NEED_MORE_DATA;
} else if (vp8_status != VP8_STATUS_OK) {
// We have enough data, and yet WebPGetFeatures() failed.
return PARSE_ERROR;
}
++image_chunks;
frame->img_components_[0].offset_ = chunk_start_offset;
frame->img_components_[0].size_ = chunk_size;
frame->width_ = features.width;
frame->height_ = features.height;
frame->has_alpha_ |= features.has_alpha;
frame->frame_num_ = frame_num;
frame->complete_ = (status == PARSE_OK);
Skip(mem, payload_available);
} else {
goto Done;
}
break;
Done:
default:
// Restore fourcc/size when moving up one level in parsing.
Rewind(mem, CHUNK_HEADER_SIZE);
done = 1;
break;
}
if (mem->start_ == mem->riff_end_) {
done = 1;
} else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
status = PARSE_NEED_MORE_DATA;
}
} while (!done && status == PARSE_OK);
return status;
}
// Creates a new Frame if 'actual_size' is within bounds and 'mem' contains
// enough data ('min_size') to parse the payload.
// Returns PARSE_OK on success with *frame pointing to the new Frame.
// Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
static ParseStatus NewFrame(const MemBuffer* const mem,
uint32_t min_size, uint32_t actual_size,
Frame** frame) {
if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
if (actual_size < min_size) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
*frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(**frame));
return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
}
// Parse a 'ANMF' chunk and any image bearing chunks that immediately follow.
// 'frame_chunk_size' is the previously validated, padded chunk size.
static ParseStatus ParseAnimationFrame(
WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
int added_frame = 0;
int bits;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
ParseStatus status =
NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
if (status != PARSE_OK) return status;
frame->x_offset_ = 2 * ReadLE24s(mem);
frame->y_offset_ = 2 * ReadLE24s(mem);
frame->width_ = 1 + ReadLE24s(mem);
frame->height_ = 1 + ReadLE24s(mem);
frame->duration_ = ReadLE24s(mem);
bits = ReadByte(mem);
frame->dispose_method_ =
(bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
WebPSafeFree(frame);
return PARSE_ERROR;
}
// Store a frame only if the animation flag is set and at least some data
// for this frame is available.
status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
added_frame = AddFrame(dmux, frame);
if (added_frame) {
++dmux->num_frames_;
} else {
status = PARSE_ERROR;
}
}
if (!added_frame) WebPSafeFree(frame);
return status;
}
// General chunk storage, starting with the header at 'start_offset', allowing
// the user to request the payload via a fourcc string. 'size' includes the
// header and the unpadded payload size.
// Returns true on success, false otherwise.
static int StoreChunk(WebPDemuxer* const dmux,
size_t start_offset, uint32_t size) {
Chunk* const chunk = (Chunk*)WebPSafeCalloc(1ULL, sizeof(*chunk));
if (chunk == NULL) return 0;
chunk->data_.offset_ = start_offset;
chunk->data_.size_ = size;
AddChunk(dmux, chunk);
return 1;
}
// -----------------------------------------------------------------------------
// Primary chunk parsing
static ParseStatus ReadHeader(MemBuffer* const mem) {
const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
uint32_t riff_size;
// Basic file level validation.
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
return PARSE_ERROR;
}
riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
// There's no point in reading past the end of the RIFF chunk.
mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
if (mem->buf_size_ > mem->riff_end_) {
mem->buf_size_ = mem->end_ = mem->riff_end_;
}
Skip(mem, RIFF_HEADER_SIZE);
return PARSE_OK;
}
static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
const size_t min_size = CHUNK_HEADER_SIZE;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
ParseStatus status;
int image_added = 0;
if (dmux->frames_ != NULL) return PARSE_ERROR;
if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (frame == NULL) return PARSE_ERROR;
// For the single image case we allow parsing of a partial frame, but we need
// at least CHUNK_HEADER_SIZE for parsing.
status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
if (status != PARSE_ERROR) {
const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
// Clear any alpha when the alpha flag is missing.
if (!has_alpha && frame->img_components_[1].size_ > 0) {
frame->img_components_[1].offset_ = 0;
frame->img_components_[1].size_ = 0;
frame->has_alpha_ = 0;
}
// Use the frame width/height as the canvas values for non-vp8x files.
// Also, set ALPHA_FLAG if this is a lossless image with alpha.
if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
dmux->canvas_width_ = frame->width_;
dmux->canvas_height_ = frame->height_;
dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
}
if (!AddFrame(dmux, frame)) {
status = PARSE_ERROR; // last frame was left incomplete
} else {
image_added = 1;
dmux->num_frames_ = 1;
}
}
if (!image_added) WebPSafeFree(frame);
return status;
}
static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
MemBuffer* const mem = &dmux->mem_;
int anim_chunks = 0;
ParseStatus status = PARSE_OK;
do {
int store_chunk = 1;
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t chunk_size = ReadLE32(mem);
const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
switch (fourcc) {
case MKFOURCC('V', 'P', '8', 'X'): {
return PARSE_ERROR;
}
case MKFOURCC('A', 'L', 'P', 'H'):
case MKFOURCC('V', 'P', '8', ' '):
case MKFOURCC('V', 'P', '8', 'L'): {
// check that this isn't an animation (all frames should be in an ANMF).
if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
Rewind(mem, CHUNK_HEADER_SIZE);
status = ParseSingleImage(dmux);
break;
}
case MKFOURCC('A', 'N', 'I', 'M'): {
if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;
if (MemDataSize(mem) < chunk_size_padded) {
status = PARSE_NEED_MORE_DATA;
} else if (anim_chunks == 0) {
++anim_chunks;
dmux->bgcolor_ = ReadLE32(mem);
dmux->loop_count_ = ReadLE16s(mem);
Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
} else {
store_chunk = 0;
goto Skip;
}
break;
}
case MKFOURCC('A', 'N', 'M', 'F'): {
if (anim_chunks == 0) return PARSE_ERROR; // 'ANIM' precedes frames.
status = ParseAnimationFrame(dmux, chunk_size_padded);
break;
}
case MKFOURCC('I', 'C', 'C', 'P'): {
store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
goto Skip;
}
case MKFOURCC('E', 'X', 'I', 'F'): {
store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
goto Skip;
}
case MKFOURCC('X', 'M', 'P', ' '): {
store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
goto Skip;
}
Skip:
default: {
if (chunk_size_padded <= MemDataSize(mem)) {
if (store_chunk) {
// Store only the chunk header and unpadded size as only the payload
// will be returned to the user.
if (!StoreChunk(dmux, chunk_start_offset,
CHUNK_HEADER_SIZE + chunk_size)) {
return PARSE_ERROR;
}
}
Skip(mem, chunk_size_padded);
} else {
status = PARSE_NEED_MORE_DATA;
}
}
}
if (mem->start_ == mem->riff_end_) {
break;
} else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
status = PARSE_NEED_MORE_DATA;
}
} while (status == PARSE_OK);
return status;
}
static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
MemBuffer* const mem = &dmux->mem_;
uint32_t vp8x_size;
if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
dmux->is_ext_format_ = 1;
Skip(mem, TAG_SIZE); // VP8X
vp8x_size = ReadLE32(mem);
if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
vp8x_size += vp8x_size & 1;
if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
dmux->feature_flags_ = ReadByte(mem);
Skip(mem, 3); // Reserved.
dmux->canvas_width_ = 1 + ReadLE24s(mem);
dmux->canvas_height_ = 1 + ReadLE24s(mem);
if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
return PARSE_ERROR; // image final dimension is too large
}
Skip(mem, vp8x_size - VP8X_CHUNK_SIZE); // skip any trailing data.
dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
return ParseVP8XChunks(dmux);
}
// -----------------------------------------------------------------------------
// Format validation
static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
const Frame* const frame = dmux->frames_;
if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
if (dmux->state_ == WEBP_DEMUX_DONE && frame == NULL) return 0;
if (frame->width_ <= 0 || frame->height_ <= 0) return 0;
return 1;
}
// If 'exact' is true, check that the image resolution matches the canvas.
// If 'exact' is false, check that the x/y offsets do not exceed the canvas.
// TODO(jzern): this is insufficient in the fragmented image case if the
// expectation is that the fragments completely cover the canvas.
static int CheckFrameBounds(const Frame* const frame, int exact,
int canvas_width, int canvas_height) {
if (exact) {
if (frame->x_offset_ != 0 || frame->y_offset_ != 0) {
return 0;
}
if (frame->width_ != canvas_width || frame->height_ != canvas_height) {
return 0;
}
} else {
if (frame->x_offset_ < 0 || frame->y_offset_ < 0) return 0;
if (frame->width_ + frame->x_offset_ > canvas_width) return 0;
if (frame->height_ + frame->y_offset_ > canvas_height) return 0;
}
return 1;
}
static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
const Frame* f = dmux->frames_;
if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
if (dmux->loop_count_ < 0) return 0;
if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
if (is_fragmented) return 0;
while (f != NULL) {
const int cur_frame_set = f->frame_num_;
int frame_count = 0, fragment_count = 0;
// Check frame properties; if the image is composed of fragments, check
// that each entry in the set is indeed a fragment.
for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
const ChunkData* const image = f->img_components_;
const ChunkData* const alpha = f->img_components_ + 1;
if (is_fragmented && !f->is_fragment_) return 0;
if (!is_fragmented && f->is_fragment_) return 0;
if (!is_animation && f->frame_num_ > 1) return 0;
if (f->complete_) {
if (alpha->size_ == 0 && image->size_ == 0) return 0;
// Ensure alpha precedes image bitstream.
if (alpha->size_ > 0 && alpha->offset_ > image->offset_) {
return 0;
}
if (f->width_ <= 0 || f->height_ <= 0) return 0;
} else {
// There shouldn't be a partial frame in a complete file.
if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
// Ensure alpha precedes image bitstream.
if (alpha->size_ > 0 && image->size_ > 0 &&
alpha->offset_ > image->offset_) {
return 0;
}
// There shouldn't be any frames after an incomplete one.
if (f->next_ != NULL) return 0;
}
if (f->width_ > 0 && f->height_ > 0 &&
!CheckFrameBounds(f, !(is_animation || is_fragmented),
dmux->canvas_width_, dmux->canvas_height_)) {
return 0;
}
fragment_count += f->is_fragment_;
++frame_count;
}
if (!is_fragmented && frame_count > 1) return 0;
if (fragment_count > 0 && frame_count != fragment_count) return 0;
}
return 1;
}
// -----------------------------------------------------------------------------
// WebPDemuxer object
static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
dmux->loop_count_ = 1;
dmux->bgcolor_ = 0xFFFFFFFF; // White background by default.
dmux->canvas_width_ = -1;
dmux->canvas_height_ = -1;
dmux->frames_tail_ = &dmux->frames_;
dmux->chunks_tail_ = &dmux->chunks_;
dmux->mem_ = *mem;
}
WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
WebPDemuxState* state, int version) {
const ChunkParser* parser;
int partial;
ParseStatus status = PARSE_ERROR;
MemBuffer mem;
WebPDemuxer* dmux;
if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
status = ReadHeader(&mem);
if (status != PARSE_OK) {
if (state != NULL) {
*state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
: WEBP_DEMUX_PARSE_ERROR;
}
return NULL;
}
partial = (mem.buf_size_ < mem.riff_end_);
if (!allow_partial && partial) return NULL;
dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
if (dmux == NULL) return NULL;
InitDemux(dmux, &mem);
status = PARSE_ERROR;
for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
status = parser->parse(dmux);
if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
break;
}
}
if (state != NULL) *state = dmux->state_;
if (status == PARSE_ERROR) {
WebPDemuxDelete(dmux);
return NULL;
}
return dmux;
}
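// Illustrative usage sketch (not part of the original source): the public
// WebPDemux() macro from demux.h calls WebPDemuxInternal() with
// allow_partial == 0 and a NULL state. 'file_bytes'/'file_size' stand for a
// caller-owned buffer holding a complete .webp file:
//   WebPData data = { file_bytes, file_size };
//   WebPDemuxer* const demux = WebPDemux(&data);
//   if (demux != NULL) {
//     // ... query features, iterate frames ...
//     WebPDemuxDelete(demux);
//   }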
void WebPDemuxDelete(WebPDemuxer* dmux) {
Chunk* c;
Frame* f;
if (dmux == NULL) return;
for (f = dmux->frames_; f != NULL;) {
Frame* const cur_frame = f;
f = f->next_;
WebPSafeFree(cur_frame);
}
for (c = dmux->chunks_; c != NULL;) {
Chunk* const cur_chunk = c;
c = c->next_;
WebPSafeFree(cur_chunk);
}
WebPSafeFree(dmux);
}
// -----------------------------------------------------------------------------
uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
if (dmux == NULL) return 0;
switch (feature) {
case WEBP_FF_FORMAT_FLAGS: return dmux->feature_flags_;
case WEBP_FF_CANVAS_WIDTH: return (uint32_t)dmux->canvas_width_;
case WEBP_FF_CANVAS_HEIGHT: return (uint32_t)dmux->canvas_height_;
case WEBP_FF_LOOP_COUNT: return (uint32_t)dmux->loop_count_;
case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
case WEBP_FF_FRAME_COUNT: return (uint32_t)dmux->num_frames_;
}
return 0;
}
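// Illustrative sketch (not part of the original source), continuing the
// usage example above: querying canvas features from a parsed demuxer.
//   const uint32_t w = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
//   const uint32_t h = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
//   const uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
//   const int has_animation = !!(flags & ANIMATION_FLAG);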
// -----------------------------------------------------------------------------
// Frame iteration
// Find the first 'frame_num' frame. There may be multiple such frames in a
// fragmented frame.
static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
const Frame* f;
for (f = dmux->frames_; f != NULL; f = f->next_) {
if (frame_num == f->frame_num_) break;
}
return f;
}
// Returns fragment 'fragment_num' and the total count.
static const Frame* GetFragment(
const Frame* const frame_set, int fragment_num, int* const count) {
const int this_frame = frame_set->frame_num_;
const Frame* f = frame_set;
const Frame* fragment = NULL;
int total;
for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
if (++total == fragment_num) fragment = f;
}
*count = total;
return fragment;
}
static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
const Frame* const frame,
size_t* const data_size) {
*data_size = 0;
if (frame != NULL) {
const ChunkData* const image = frame->img_components_;
const ChunkData* const alpha = frame->img_components_ + 1;
size_t start_offset = image->offset_;
*data_size = image->size_;
// If alpha exists, it precedes the image bitstream; update the size to
// account for any intervening chunks.
if (alpha->size_ > 0) {
const size_t inter_size = (image->offset_ > 0)
? image->offset_ - (alpha->offset_ + alpha->size_)
: 0;
start_offset = alpha->offset_;
*data_size += alpha->size_ + inter_size;
}
return mem_buf + start_offset;
}
return NULL;
}
// Create a whole 'frame' from VP8 (+ alpha) or lossless.
static int SynthesizeFrame(const WebPDemuxer* const dmux,
const Frame* const first_frame,
int fragment_num, WebPIterator* const iter) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
int num_fragments;
size_t payload_size = 0;
const Frame* const fragment =
GetFragment(first_frame, fragment_num, &num_fragments);
const uint8_t* const payload =
GetFramePayload(mem_buf, fragment, &payload_size);
if (payload == NULL) return 0;
assert(first_frame != NULL);
iter->frame_num = first_frame->frame_num_;
iter->num_frames = dmux->num_frames_;
iter->fragment_num = fragment_num;
iter->num_fragments = num_fragments;
iter->x_offset = fragment->x_offset_;
iter->y_offset = fragment->y_offset_;
iter->width = fragment->width_;
iter->height = fragment->height_;
iter->has_alpha = fragment->has_alpha_;
iter->duration = fragment->duration_;
iter->dispose_method = fragment->dispose_method_;
iter->blend_method = fragment->blend_method_;
iter->complete = fragment->complete_;
iter->fragment.bytes = payload;
iter->fragment.size = payload_size;
return 1;
}
static int SetFrame(int frame_num, WebPIterator* const iter) {
const Frame* frame;
const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
if (dmux == NULL || frame_num < 0) return 0;
if (frame_num > dmux->num_frames_) return 0;
if (frame_num == 0) frame_num = dmux->num_frames_;
frame = GetFrame(dmux, frame_num);
if (frame == NULL) return 0;
return SynthesizeFrame(dmux, frame, 1, iter);
}
int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
if (iter == NULL) return 0;
memset(iter, 0, sizeof(*iter));
iter->private_ = (void*)dmux;
return SetFrame(frame, iter);
}
int WebPDemuxNextFrame(WebPIterator* iter) {
if (iter == NULL) return 0;
return SetFrame(iter->frame_num + 1, iter);
}
int WebPDemuxPrevFrame(WebPIterator* iter) {
if (iter == NULL) return 0;
if (iter->frame_num <= 1) return 0;
return SetFrame(iter->frame_num - 1, iter);
}
int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
const Frame* const frame = GetFrame(dmux, iter->frame_num);
if (frame == NULL) return 0;
return SynthesizeFrame(dmux, frame, fragment_num, iter);
}
return 0;
}
void WebPDemuxReleaseIterator(WebPIterator* iter) {
(void)iter;
}
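// Illustrative iteration sketch (not part of the original source), assuming a
// successfully created 'demux'; frame numbers are 1-based and the iterator
// borrows memory from the demuxer, so release it before WebPDemuxDelete():
//   WebPIterator it;
//   if (WebPDemuxGetFrame(demux, 1, &it)) {
//     do {
//       // it.fragment.bytes / it.fragment.size hold this frame's payload.
//     } while (WebPDemuxNextFrame(&it));
//     WebPDemuxReleaseIterator(&it);
//   }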
// -----------------------------------------------------------------------------
// Chunk iteration
static int ChunkCount(const WebPDemuxer* const dmux, const char fourcc[4]) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* c;
int count = 0;
for (c = dmux->chunks_; c != NULL; c = c->next_) {
const uint8_t* const header = mem_buf + c->data_.offset_;
if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
}
return count;
}
static const Chunk* GetChunk(const WebPDemuxer* const dmux,
const char fourcc[4], int chunk_num) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* c;
int count = 0;
for (c = dmux->chunks_; c != NULL; c = c->next_) {
const uint8_t* const header = mem_buf + c->data_.offset_;
if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
if (count == chunk_num) break;
}
return c;
}
static int SetChunk(const char fourcc[4], int chunk_num,
WebPChunkIterator* const iter) {
const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
int count;
if (dmux == NULL || fourcc == NULL || chunk_num < 0) return 0;
count = ChunkCount(dmux, fourcc);
if (count == 0) return 0;
if (chunk_num == 0) chunk_num = count;
if (chunk_num <= count) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
iter->chunk.size = chunk->data_.size_ - CHUNK_HEADER_SIZE;
iter->num_chunks = count;
iter->chunk_num = chunk_num;
return 1;
}
return 0;
}
int WebPDemuxGetChunk(const WebPDemuxer* dmux,
const char fourcc[4], int chunk_num,
WebPChunkIterator* iter) {
if (iter == NULL) return 0;
memset(iter, 0, sizeof(*iter));
iter->private_ = (void*)dmux;
return SetChunk(fourcc, chunk_num, iter);
}
int WebPDemuxNextChunk(WebPChunkIterator* iter) {
if (iter != NULL) {
const char* const fourcc =
(const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
return SetChunk(fourcc, iter->chunk_num + 1, iter);
}
return 0;
}
int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
if (iter != NULL && iter->chunk_num > 1) {
const char* const fourcc =
(const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
return SetChunk(fourcc, iter->chunk_num - 1, iter);
}
return 0;
}
void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
(void)iter;
}
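// Illustrative sketch (not part of the original source): fetching the first
// EXIF chunk, assuming 'demux' was parsed from a file that carries one.
//   WebPChunkIterator ci;
//   if (WebPDemuxGetChunk(demux, "EXIF", 1, &ci)) {
//     // ci.chunk.bytes / ci.chunk.size give the chunk payload.
//     WebPDemuxReleaseChunkIterator(&ci);
//   }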

383
drivers/webp/dsp/alpha_processing.c Normal file
@@ -0,0 +1,383 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "./dsp.h"
// Tables can be faster on some platforms but incur some extra binary size (~2k).
// #define USE_TABLES_FOR_ALPHA_MULT
// -----------------------------------------------------------------------------
#define MFIX 24 // 24bit fixed-point arithmetic
#define HALF ((1u << MFIX) >> 1)
#define KINV_255 ((1u << MFIX) / 255u)
static uint32_t Mult(uint8_t x, uint32_t mult) {
const uint32_t v = (x * mult + HALF) >> MFIX;
assert(v <= 255); // <- 24bit precision is enough to ensure that.
return v;
}
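// Worked example (illustrative, using the definitions above): x = 200, a = 128.
// Forward:  mult = 128 * KINV_255 = 8421504, and Mult(200, mult)
//           = (200 * 8421504 + HALF) >> 24 = 100  (exact: 200 * 128 / 255 = 100.39).
// Inverse:  mult = (255 << 24) / 128 = 33423360, and Mult(100, mult)
//           = (100 * 33423360 + HALF) >> 24 = 199  (exact: 100 * 255 / 128 = 199.22).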
#ifdef USE_TABLES_FOR_ALPHA_MULT
static const uint32_t kMultTables[2][256] = {
{ // (255u << MFIX) / alpha
0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
{ // alpha * KINV_255
0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
};
static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return kMultTables[!inverse][a];
}
#else
static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return inverse ? (255u << MFIX) / a : a * KINV_255;
}
#endif // USE_TABLES_FOR_ALPHA_MULT
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
if (argb < 0xff000000u) { // alpha < 255
if (argb <= 0x00ffffffu) { // alpha == 0
ptr[x] = 0;
} else {
const uint32_t alpha = (argb >> 24) & 0xff;
const uint32_t scale = GetScale(alpha, inverse);
uint32_t out = argb & 0xff000000u;
out |= Mult(argb >> 0, scale) << 0;
out |= Mult(argb >> 8, scale) << 8;
out |= Mult(argb >> 16, scale) << 16;
ptr[x] = out;
}
}
}
}
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
if (a != 255) {
if (a == 0) {
ptr[x] = 0;
} else {
const uint32_t scale = GetScale(a, inverse);
ptr[x] = Mult(ptr[x], scale);
}
}
}
}
#undef KINV_255
#undef HALF
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
// Generic per-plane calls
void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
WebPMultARGBRow((uint32_t*)ptr, width, inverse);
ptr += stride;
}
}
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
WebPMultRow(ptr, alpha, width, inverse);
ptr += stride;
alpha += alpha_stride;
}
}
//------------------------------------------------------------------------------
// Premultiplied modes
// Non-dithered modes
// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
// for all 8-bit x and a. For bit-wise equivalence to (int)(x * a / 255. + .5),
// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
#if 1 // (int)(x * a / 255.)
#define MULTIPLIER(a) ((a) * 32897U)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
#else // (int)(x * a / 255. + .5)
#define MULTIPLIER(a) ((a) * 65793U)
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
#endif
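// Worked example (illustrative): for x = 200, a = 128,
// (200 * 128 * 32897) >> 23 = 842163200 >> 23 = 100, matching
// (int)(200 * 128 / 255.) = (int)100.39 = 100.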
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
while (h-- > 0) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
int i;
for (i = 0; i < w; ++i) {
const uint32_t a = alpha[4 * i];
if (a != 0xff) {
const uint32_t mult = MULTIPLIER(a);
rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
}
}
rgba += stride;
}
}
#undef MULTIPLIER
#undef PREMULTIPLY
// rgbA4444
#define MULTIPLIER(a) ((a) * 0x1111) // 0x1111 ~= (1 << 16) / 15
static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
return (x & 0xf0) | (x >> 4);
}
static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
return (x & 0x0f) | (x << 4);
}
static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
return (x * m) >> 16;
}
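// Example: dither_hi(0xd3) = 0xd0 | 0x0d = 0xdd and dither_lo(0xd3) =
// 0x03 | 0x30 = 0x33: each 4-bit component is expanded to 8 bits by
// replicating its nibble before the multiply.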
static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
while (h-- > 0) {
int i;
for (i = 0; i < w; ++i) {
const uint32_t rg = rgba4444[2 * i + rg_byte_pos];
const uint32_t ba = rgba4444[2 * i + (rg_byte_pos ^ 1)];
const uint8_t a = ba & 0x0f;
const uint32_t mult = MULTIPLIER(a);
const uint8_t r = multiply(dither_hi(rg), mult);
const uint8_t g = multiply(dither_lo(rg), mult);
const uint8_t b = multiply(dither_hi(ba), mult);
rgba4444[2 * i + rg_byte_pos] = (r & 0xf0) | ((g >> 4) & 0x0f);
rgba4444[2 * i + (rg_byte_pos ^ 1)] = (b & 0xf0) | a;
}
rgba4444 += stride;
}
}
#undef MULTIPLIER
static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
int w, int h, int stride) {
#ifdef WEBP_SWAP_16BIT_CSP
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
#else
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
#endif
}
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
const uint32_t alpha_value = alpha[i];
dst[4 * i] = alpha_value;
alpha_mask &= alpha_value;
}
alpha += alpha_stride;
dst += dst_stride;
}
return (alpha_mask != 0xff);
}
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
dst[i] = alpha[i] << 8; // leave A/R/B channels zero'd.
}
alpha += alpha_stride;
dst += dst_stride;
}
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
const uint8_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_mask &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
return (alpha_mask == 0xff);
}
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
//------------------------------------------------------------------------------
// Init function
extern void WebPInitAlphaProcessingMIPSdspR2(void);
extern void WebPInitAlphaProcessingSSE2(void);
extern void WebPInitAlphaProcessingSSE41(void);
static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
(VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPMultARGBRow = WebPMultARGBRowC;
WebPMultRow = WebPMultRowC;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}
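// Illustrative note (not part of the original source): the function pointers
// above are only valid once WebPInitAlphaProcessing() has run, e.g.
//   WebPInitAlphaProcessing();
//   WebPMultARGBRows(rgba, stride, width, num_rows, 0);  // 0 == premultiply
// where 'rgba', 'stride', 'width' and 'num_rows' describe a caller-owned
// 32-bit ARGB buffer.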

141
drivers/webp/dsp/alpha_processing_mips_dsp_r2.c Normal file
@@ -0,0 +1,141 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffff;
int i, j, temp0;
for (j = 0; j < height; ++j) {
uint8_t* pdst = dst;
const uint8_t* palpha = alpha;
for (i = 0; i < (width >> 2); ++i) {
int temp1, temp2, temp3;
__asm__ volatile (
"ulw %[temp0], 0(%[palpha]) \n\t"
"addiu %[palpha], %[palpha], 4 \n\t"
"addiu %[pdst], %[pdst], 16 \n\t"
"srl %[temp1], %[temp0], 8 \n\t"
"srl %[temp2], %[temp0], 16 \n\t"
"srl %[temp3], %[temp0], 24 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"sb %[temp0], -16(%[pdst]) \n\t"
"sb %[temp1], -12(%[pdst]) \n\t"
"sb %[temp2], -8(%[pdst]) \n\t"
"sb %[temp3], -4(%[pdst]) \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
[alpha_mask]"+r"(alpha_mask)
:
: "memory"
);
}
for (i = 0; i < (width & 3); ++i) {
__asm__ volatile (
"lbu %[temp0], 0(%[palpha]) \n\t"
"addiu %[palpha], %[palpha], 1 \n\t"
"sb %[temp0], 0(%[pdst]) \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"addiu %[pdst], %[pdst], 4 \n\t"
: [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
[alpha_mask]"+r"(alpha_mask)
:
: "memory"
);
}
alpha += alpha_stride;
dst += dst_stride;
}
__asm__ volatile (
"ext %[temp0], %[alpha_mask], 0, 16 \n\t"
"srl %[alpha_mask], %[alpha_mask], 16 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"ext %[temp0], %[alpha_mask], 0, 8 \n\t"
"srl %[alpha_mask], %[alpha_mask], 8 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
: [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
:
);
return (alpha_mask != 0xff);
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
int x;
const uint32_t c_00ffffff = 0x00ffffffu;
const uint32_t c_ff000000 = 0xff000000u;
const uint32_t c_8000000 = 0x00800000u;
const uint32_t c_8000080 = 0x00800080u;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
if (argb < 0xff000000u) { // alpha < 255
if (argb <= 0x00ffffffu) { // alpha == 0
ptr[x] = 0;
} else {
int temp0, temp1, temp2, temp3, alpha;
__asm__ volatile (
"srl %[alpha], %[argb], 24 \n\t"
"replv.qb %[temp0], %[alpha] \n\t"
"and %[temp0], %[temp0], %[c_00ffffff] \n\t"
"beqz %[inverse], 0f \n\t"
"divu $zero, %[c_ff000000], %[alpha] \n\t"
"mflo %[temp0] \n\t"
"0: \n\t"
"andi %[temp1], %[argb], 0xff \n\t"
"ext %[temp2], %[argb], 8, 8 \n\t"
"ext %[temp3], %[argb], 16, 8 \n\t"
"mul %[temp1], %[temp1], %[temp0] \n\t"
"mul %[temp2], %[temp2], %[temp0] \n\t"
"mul %[temp3], %[temp3], %[temp0] \n\t"
"precrq.ph.w %[temp1], %[temp2], %[temp1] \n\t"
"addu %[temp3], %[temp3], %[c_8000000] \n\t"
"addu %[temp1], %[temp1], %[c_8000080] \n\t"
"precrq.ph.w %[temp3], %[argb], %[temp3] \n\t"
"precrq.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
: [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
[c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
[c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
: "memory", "hi", "lo"
);
ptr[x] = temp1;
}
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha;
WebPMultARGBRow = MultARGBRow;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

298
drivers/webp/dsp/alpha_processing_sse2.c Normal file
@@ -0,0 +1,298 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i zero = _mm_setzero_si128();
const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~7;
for (j = 0; j < height; ++j) {
__m128i* out = (__m128i*)dst;
for (i = 0; i < limit; i += 8) {
// load 8 alpha bytes
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
// load 8 dst pixels (32 bytes)
const __m128i b0_lo = _mm_loadu_si128(out + 0);
const __m128i b0_hi = _mm_loadu_si128(out + 1);
// mask dst alpha values
const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
// combine
const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
// store
_mm_storeu_si128(out + 0, b2_lo);
_mm_storeu_si128(out + 1, b2_hi);
// accumulate eight alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, a0);
out += 2;
}
for (; i < width; ++i) {
const uint32_t alpha_value = alpha[i];
dst[4 * i] = alpha_value;
alpha_and &= alpha_value;
}
alpha += alpha_stride;
dst += dst_stride;
}
// Combine the eight alpha 'and' into an 8-bit mask.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and != 0xff);
}
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
for (j = 0; j < height; ++j) {
for (i = 0; i < limit; i += 16) { // process 16 alpha bytes
const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first!
const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
_mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo);
_mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi);
_mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo);
_mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
}
for (; i < width; ++i) dst[i] = alpha[i] << 8;
alpha += alpha_stride;
dst += dst_stride;
}
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'src[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~7;
for (j = 0; j < height; ++j) {
const __m128i* src = (const __m128i*)argb;
for (i = 0; i < limit; i += 8) {
// load 32 argb bytes
const __m128i a0 = _mm_loadu_si128(src + 0);
const __m128i a1 = _mm_loadu_si128(src + 1);
const __m128i b0 = _mm_and_si128(a0, a_mask);
const __m128i b1 = _mm_and_si128(a1, a_mask);
const __m128i c0 = _mm_packs_epi32(b0, b1);
const __m128i d0 = _mm_packus_epi16(c0, c0);
// store
_mm_storel_epi64((__m128i*)&alpha[i], d0);
// accumulate eight alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, d0);
src += 2;
}
for (; i < width; ++i) {
const uint32_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_and &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
// Combine the eight alpha 'and' into an 8-bit mask.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and == 0xff);
}
//------------------------------------------------------------------------------
// Non-dither premultiplied modes
#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
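// Note: 0x8081 == 32897 == ceil((1 << 23) / 255), the same multiplier as the
// portable code's 32897U, so (x * a * 0x8081) >> 23 == (int)(x * a / 255.)
// for all 8-bit x and a.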
// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro
// here.
#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \
const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \
const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \
const __m128i alpha0 = _mm_and_si128(argb1, MASK); \
const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \
const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \
/* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \
const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \
const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \
const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \
const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \
const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \
const __m128i argb5 = _mm_srli_epi16(argb4, 7); \
const __m128i argb6 = _mm_or_si128(argb5, alpha0); \
const __m128i argb7 = _mm_packus_epi16(argb6, zero); \
_mm_storel_epi64((__m128i*)&(RGBX), argb7); \
} while (0)
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
const __m128i zero = _mm_setzero_si128();
const int kSpan = 2;
const int w2 = w & ~(kSpan - 1);
while (h-- > 0) {
uint32_t* const rgbx = (uint32_t*)rgba;
int i;
if (!alpha_first) {
const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
const __m128i kMult =
_mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
for (i = 0; i < w2; i += kSpan) {
APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
}
} else {
const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
const __m128i kMult =
_mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
for (i = 0; i < w2; i += kSpan) {
APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
}
}
// Finish with left-overs.
for (; i < w; ++i) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
const uint32_t a = alpha[4 * i];
if (a != 0xff) {
const uint32_t mult = MULTIPLIER(a);
rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
}
}
rgba += stride;
}
}
#undef MULTIPLIER
#undef PREMULTIPLY
// -----------------------------------------------------------------------------
// Apply alpha value to rows
// We use: kINV255 = (1 << 24) / 255 = 0x010101
// So: a * kINV255 = (a << 16) | [(a << 8) | a]
// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
// and _mm_mullo_epi16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
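// E.g. a = 0x35: a * kINV255 = 0x35 * 0x010101 = 0x353535, i.e. the byte
// replicated into all three positions (no carries, since a < 256).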
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
int x = 0;
if (!inverse) {
const int kSpan = 2;
const __m128i zero = _mm_setzero_si128();
const __m128i kRound =
_mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
const __m128i kMult =
_mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
const int w2 = width & ~(kSpan - 1);
for (x = 0; x < w2; x += kSpan) {
const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
const __m128i argb6 = _mm_srli_epi16(argb5, 8);
const __m128i argb7 = _mm_packus_epi16(argb6, zero);
_mm_storel_epi64((__m128i*)&ptr[x], argb7);
}
}
width -= x;
if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
const int kSpan = 8;
const __m128i zero = _mm_setzero_si128();
const __m128i kRound = _mm_set1_epi16(1 << 7);
const int w2 = width & ~(kSpan - 1);
for (x = 0; x < w2; x += kSpan) {
const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
const __m128i v4 = _mm_adds_epu16(v2, v3);
const __m128i v5 = _mm_adds_epu16(v4, kRound);
const __m128i v6 = _mm_srli_epi16(v5, 8);
const __m128i v7 = _mm_packus_epi16(v6, zero);
_mm_storel_epi64((__m128i*)&ptr[x], v7);
}
}
width -= x;
if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPMultARGBRow = MultARGBRow;
WebPMultRow = MultRow;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
#endif // WEBP_USE_SSE2

92
drivers/webp/dsp/alpha_processing_sse41.c Normal file
@@ -0,0 +1,92 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel, SSE4.1 variant.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
//------------------------------------------------------------------------------
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i all_0xff = _mm_set1_epi32(~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'src[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~15;
const __m128i kCstAlpha0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, 12, 8, 4, 0);
const __m128i kCstAlpha1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
12, 8, 4, 0, -1, -1, -1, -1);
const __m128i kCstAlpha2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
-1, -1, -1, -1, -1, -1, -1, -1);
const __m128i kCstAlpha3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1);
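  // Each kCstAlphaN mask gathers bytes {0, 4, 8, 12} of one 16-byte load (the
  // alpha byte of each quadruplet, as in the scalar tail loop) into its own
  // 4-byte slot; -1 lanes produce zeros, so OR-ing the four shuffled results
  // packs 16 alpha bytes together.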
for (j = 0; j < height; ++j) {
const __m128i* src = (const __m128i*)argb;
for (i = 0; i < limit; i += 16) {
// load 64 argb bytes
const __m128i a0 = _mm_loadu_si128(src + 0);
const __m128i a1 = _mm_loadu_si128(src + 1);
const __m128i a2 = _mm_loadu_si128(src + 2);
const __m128i a3 = _mm_loadu_si128(src + 3);
const __m128i b0 = _mm_shuffle_epi8(a0, kCstAlpha0);
const __m128i b1 = _mm_shuffle_epi8(a1, kCstAlpha1);
const __m128i b2 = _mm_shuffle_epi8(a2, kCstAlpha2);
const __m128i b3 = _mm_shuffle_epi8(a3, kCstAlpha3);
const __m128i c0 = _mm_or_si128(b0, b1);
const __m128i c1 = _mm_or_si128(b2, b3);
const __m128i d0 = _mm_or_si128(c0, c1);
// store
_mm_storeu_si128((__m128i*)&alpha[i], d0);
// accumulate sixteen alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, d0);
src += 4;
}
for (; i < width; ++i) {
const uint32_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_and &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
// Combine the sixteen alpha 'and' into an 8-bit mask.
alpha_and |= 0xff00u; // pretend the upper bits [8..15] were tested ok.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and == 0xffffu);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
WebPExtractAlpha = ExtractAlpha;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
#endif // WEBP_USE_SSE41

68
drivers/webp/dsp/argb.c Normal file
@@ -0,0 +1,68 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions.
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
int, int, uint32_t*);
extern void VP8EncDspARGBInitMIPSdspR2(void);
extern void VP8EncDspARGBInitSSE2(void);
static volatile VP8CPUInfo argb_last_cpuinfo_used =
(VP8CPUInfo)&argb_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspARGBInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspARGBInitMIPSdspR2();
}
#endif
}
argb_last_cpuinfo_used = VP8GetCPUInfo;
}

110
drivers/webp/dsp/argb_mips_dsp_r2.c Normal file
@@ -0,0 +1,110 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (mips version).
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

67
drivers/webp/dsp/argb_sse2.c Normal file
@@ -0,0 +1,67 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (SSE2 version).
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
if (g == r + 1) { // RGBA input order. Need to swap R and B.
int i = 0;
const int len_max = len & ~3; // max length processed in main loop
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
assert(b == r + 2);
assert(a == r + 3);
for (; i < len_max; i += 4) {
const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F = _mm_or_si128(E, C);
_mm_storeu_si128((__m128i*)(out + i), F);
}
for (; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
} else {
assert(g == b + 1);
assert(r == b + 2);
assert(a == b + 3);
memcpy(out, b, len * 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
VP8PackARGB = PackARGB;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
#endif // WEBP_USE_SSE2

412
drivers/webp/dsp/cost.c Normal file
@@ -0,0 +1,412 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../enc/cost.h"
//------------------------------------------------------------------------------
// Boolean-coder cost table
const uint16_t VP8EntropyCost[256] = {
1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
10, 9, 7, 6, 4, 3
};
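// (Presumably these costs are expressed in 1/256th-of-a-bit units: an event
// with probability 128/256 costs VP8EntropyCost[128] = 256, i.e. exactly
// one bit.)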
//------------------------------------------------------------------------------
// Level cost tables
// Fixed costs for coding levels, deduced from the coding tree.
// This is only the part that doesn't depend on the probability state.
const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
0, 256, 256, 256, 256, 432, 618, 630,
731, 640, 640, 828, 901, 948, 1021, 1101,
1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
};
//------------------------------------------------------------------------------
// Tables for level coding
const uint8_t VP8EncBands[16 + 1] = {
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
0 // sentinel
};
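// Illustrative sketch (not from the original file): VP8EncBands[] maps a
// coefficient position n in [0..15] to one of 8 probability bands. The
// 'probas' parameter below is a hypothetical stand-in for the encoder's
// band/context/probability table (8 bands x 3 contexts x 11 probabilities).
static WEBP_INLINE const uint8_t* BandProbasSketch(
const uint8_t probas[8][3][11], int n, int ctx) {
return probas[VP8EncBands[n]][ctx]; // ctx is 0, 1 or 2
}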
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
for (; n < res->last; ++n) {
const int v = abs(res->coeffs[n]);
const int ctx = (v >= 2) ? 2 : v;
cost += VP8LevelCost(t, v);
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
for (n = 15; n >= 0; --n) {
if (coeffs[n]) {
res->last = n;
break;
}
}
res->coeffs = coeffs;
}
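// Usage sketch (an assumption, not from the original file): once the encoder
// has filled in res->first, res->prob and res->costs elsewhere, the two hooks
// above are typically paired like this:
static WEBP_INLINE int ResidualCostSketch(const int16_t coeffs[16], int ctx0,
VP8Residual* const res) {
SetResidualCoeffs(coeffs, res); // locates res->last, attaches coeffs
return GetResidualCost(ctx0, res); // bit-cost of coding the coefficients
}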
//------------------------------------------------------------------------------
// init function
VP8GetResidualCostFunc VP8GetResidualCost;
VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
extern void VP8EncDspCostInitMIPS32(void);
extern void VP8EncDspCostInitMIPSdspR2(void);
extern void VP8EncDspCostInitSSE2(void);
static volatile VP8CPUInfo cost_last_cpuinfo_used =
(VP8CPUInfo)&cost_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
// If non-NULL, use VP8GetCPUInfo() to overwrite some pointers with faster
// versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspCostInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspCostInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspCostInitSSE2();
}
#endif
}
cost_last_cpuinfo_used = VP8GetCPUInfo;
}
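// Usage note (illustrative): callers run the init once before touching the
// function pointers; the 'cost_last_cpuinfo_used' sentinel makes repeated
// calls a cheap no-op. For example:
//   VP8EncDspCostInit();
//   VP8SetResidualCoeffs(coeffs, &res);
//   cost = VP8GetResidualCost(ctx0, &res);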
//------------------------------------------------------------------------------

View file

@ -0,0 +1,154 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "../enc/cost.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"sll %[temp0], %[n], 1 \n\t"
"blez %[temp1], 2f \n\t"
" addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
"1: \n\t"
"lh %[v_reg], 0(%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"negu %[temp0], %[v_reg] \n\t"
"slti %[temp1], %[v_reg], 0 \n\t"
"movn %[v_reg], %[temp0], %[temp1] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
"lhu %[temp1], 0(%[temp1]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"addu %[v_reg], %[v_reg], %[t] \n\t"
"lhu %[temp0], 0(%[v_reg]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
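// Note (illustrative): the asm loop above mirrors the generic C loop
//   for (; n < res->last; ++n) {
//     const int v = abs(res->coeffs[n]);
//     cost += VP8LevelCost(t, v);
//     t = costs[n + 1][(v >= 2) ? 2 : v];
//   }
// with the VP8LevelCost() lookup split into its fixed part
// (VP8LevelFixedCosts[v]) and its clamped variable part (t[...]).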
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
const int16_t* p_coeffs = coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
"li %[n], 15 \n\t"
"li %[temp2], -1 \n\t"
"0: \n\t"
"ulw %[temp0], 0(%[p_coeffs]) \n\t"
"beqz %[temp0], 1f \n\t"
#if defined(WORDS_BIGENDIAN)
" sll %[temp1], %[temp0], 16 \n\t"
#else
" srl %[temp1], %[temp0], 16 \n\t"
#endif
"addiu %[n1], %[n], -1 \n\t"
"movz %[temp0], %[n1], %[temp1] \n\t"
"movn %[temp0], %[n], %[temp1] \n\t"
"j 2f \n\t"
" addiu %[temp2], %[temp0], 0 \n\t"
"1: \n\t"
"addiu %[n], %[n], -2 \n\t"
"bgtz %[n], 0b \n\t"
" addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
"2: \n\t"
".set pop \n\t"
: [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[n]"=&r"(n), [n1]"=&r"(n1)
:
: "memory"
);
res->last = temp2;
res->coeffs = coeffs;
}
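// Plain-C reference (an assumption about the asm above, which scans pairs of
// coefficients from the end using unaligned word loads):
//   int last = -1, n;
//   for (n = 15; n > 0; n -= 2) {
//     if (coeffs[n] != 0) { last = n; break; }
//     if (coeffs[n - 1] != 0) { last = n - 1; break; }
//   }
//   res->last = last;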
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
#endif // WEBP_USE_MIPS32

View file

@ -0,0 +1,107 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../enc/cost.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"blez %[temp1], 2f \n\t"
" nop \n\t"
"1: \n\t"
"sll %[temp0], %[n], 1 \n\t"
"lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"absq_s.w %[v_reg], %[v_reg] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"lhx %[temp0], %[v_reg](%[t]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
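// Note (illustrative): compared to the plain MIPS32 version, this variant
// leans on MIPS DSPr2 instructions: lhx folds the index arithmetic into the
// halfword load, and absq_s.w computes the absolute value in one instruction.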
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
VP8GetResidualCost = GetResidualCost;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View file

@ -0,0 +1,119 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of cost functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "../enc/cost.h"
#include "../enc/vp8enci.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.
const __m128i zero = _mm_setzero_si128();
const __m128i m0 = _mm_packs_epi16(c0, c1);
const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
// Get the comparison results as a bitmask into 16 bits. Invert the mask to
// get the positions of entries that are not equal to zero. We don't need to
// mask out the least significant bits according to res->first, since
// coeffs[0] is 0 if res->first > 0.
const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
// The position of the most significant non-zero bit indicates the position of
// the last non-zero value.
assert(res->first == 0 || coeffs[0] == 0);
res->last = mask ? BitsLog2Floor(mask) : -1;
res->coeffs = coeffs;
}
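// Scalar equivalent sketch (an assumption, for clarity): the vector code
// above computes the same result as
//   uint32_t mask = 0;
//   int i;
//   for (i = 0; i < 16; ++i) if (coeffs[i] != 0) mask |= 1u << i;
//   res->last = mask ? BitsLog2Floor(mask) : -1;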
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
{ // precompute clamped levels and contexts, packed to 8b.
const __m128i zero = _mm_setzero_si128();
const __m128i kCst2 = _mm_set1_epi8(2);
const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
const __m128i D0 = _mm_sub_epi16(zero, c0);
const __m128i D1 = _mm_sub_epi16(zero, c1);
const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
const __m128i E1 = _mm_max_epi16(c1, D1);
const __m128i F = _mm_packs_epi16(E0, E1);
const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
_mm_storeu_si128((__m128i*)&ctxs[0], G);
_mm_storeu_si128((__m128i*)&levels[0], H);
_mm_storeu_si128((__m128i*)&abs_levels[0], E0);
_mm_storeu_si128((__m128i*)&abs_levels[8], E1);
}
for (; n < res->last; ++n) {
const int ctx = ctxs[n];
const int level = levels[n];
const int flevel = abs_levels[n]; // full level
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int level = levels[n];
const int flevel = abs_levels[n];
assert(flevel != 0);
cost += VP8LevelFixedCosts[flevel] + t[level];
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = ctxs[n];
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
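// Note (assumption, for clarity): the scalar helper VP8LevelCost(t, v)
// expands to
//   VP8LevelFixedCosts[v] + t[(v > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : v]
// which is why precomputing the clamped index ('levels') and the full level
// ('abs_levels') makes the cost loop above branch-free.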
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
VP8GetResidualCost = GetResidualCostSSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
#endif // WEBP_USE_SSE2

View file

@ -0,0 +1,366 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Clipping tables for filtering
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#define USE_STATIC_TABLES // undefine to have run-time table initialization
#ifdef USE_STATIC_TABLES
static const uint8_t abs0[255 + 255 + 1] = {
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
0xf3, 0xf2, 0xf1, 0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8,
0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc,
0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac,
0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94,
0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c,
0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64,
0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58,
0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c,
0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28,
0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c,
0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
0x03, 0x02, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
static const int8_t sclip1[1020 + 1020 + 1] = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3,
0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3,
0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
};
static const int8_t sclip2[112 + 112 + 1] = {
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
};
static const uint8_t clip1[255 + 511 + 1] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
#else
// uninitialized tables
static uint8_t abs0[255 + 255 + 1];
static int8_t sclip1[1020 + 1020 + 1];
static int8_t sclip2[112 + 112 + 1];
static uint8_t clip1[255 + 511 + 1];
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
#endif
const int8_t* const VP8ksclip1 = &sclip1[1020];
const int8_t* const VP8ksclip2 = &sclip2[112];
const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
#if !defined(USE_STATIC_TABLES)
int i;
if (!tables_ok) {
for (i = -255; i <= 255; ++i) {
abs0[255 + i] = (i < 0) ? -i : i;
}
for (i = -1020; i <= 1020; ++i) {
sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
}
for (i = -112; i <= 112; ++i) {
sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
}
for (i = -255; i <= 255 + 255; ++i) {
clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
}
tables_ok = 1;
}
#endif // USE_STATIC_TABLES
}
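// Usage sketch (illustrative): the exported pointers are biased into the
// middle of each table so that signed values index them directly, e.g.:
//   a1 = VP8ksclip2[(a + 4) >> 3]; // (a + 4) >> 3 in [-112, 112] -> [-16, 15]
//   q0 = VP8kclip1[q0 - a1];       // q0 - a1 in [-255, 510] -> [0, 255]
//   d  = VP8kabs0[p1 - q1];        // p1 - q1 in [-255, 255] -> |p1 - q1|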

View file

@ -0,0 +1,587 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of dsp functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
static WEBP_INLINE int abs_mips32(int x) {
const int sign = x >> 31;
return (x ^ sign) - sign;
}
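// Worked example (for clarity): for x = -5, sign = -1, so
// (x ^ sign) - sign == (~x) + 1 == 5; for x >= 0, sign == 0 and x is
// returned unchanged. This computes |x| without a branch.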
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
const int a1 = VP8ksclip2[(a + 4) >> 3];
const int a2 = VP8ksclip2[(a + 3) >> 3];
p[-step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
const int a2 = VP8ksclip2[(a + 3) >> 3];
const int a3 = (a1 + 1) >> 1;
p[-2 * step] = VP8kclip1[p1 + a3];
p[- step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a3];
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
// a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
p[-3 * step] = VP8kclip1[p2 + a3];
p[-2 * step] = VP8kclip1[p1 + a2];
p[- step] = VP8kclip1[p0 + a1];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a2];
p[ 2 * step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
}
static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
}
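// Note (assumption): callers pass t == 2 * thresh + 1, so the test above is
// the bitstream's (2 * |p0 - q0| + |p1 - q1| / 2 <= thresh) scaled by 2 to
// stay in integer arithmetic.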
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step];
const int p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
return 0;
}
return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
}
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
do_filter6(p, hstride);
}
}
p += vstride;
}
}
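// Note (illustrative): 'hstride' steps perpendicular to the edge (from one
// filtered sample to the next), while 'vstride' walks along the edge; the
// vertical filters below pass (stride, 1) and the horizontal ones (1, stride).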
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
do_filter4(p, hstride);
}
}
p += vstride;
}
}
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
}
}
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
}
}
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
}
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14;
int temp15, temp16, temp17, temp18;
int16_t* p_in = (int16_t*)in;
// The two transform passes are unrolled and merged to avoid the use of a
// temporary buffer and to reduce the number of pipeline stalls. The MUL
// macro is written in assembly and inlined.
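// Plain-C reference (an assumption about the asm below): each 1-D pass
// computes, for the four inputs v0..v3 of each column (first pass) or row
// (second pass),
//   a = v0 + v2;  b = v0 - v2;
//   c = MUL(v1, kC2) - MUL(v3, kC1);
//   d = MUL(v1, kC1) + MUL(v3, kC2);
//   out = { a + d, b + c, b - c, a - d }
// where MUL(x, k) == (x * k) >> 16; the second pass adds a +4 bias, shifts
// by 3 and clips to [0, 255] while adding into dst.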
__asm__ volatile(
"lh %[temp0], 0(%[in]) \n\t"
"lh %[temp8], 16(%[in]) \n\t"
"lh %[temp4], 8(%[in]) \n\t"
"lh %[temp12], 24(%[in]) \n\t"
"addu %[temp16], %[temp0], %[temp8] \n\t"
"subu %[temp0], %[temp0], %[temp8] \n\t"
"mul %[temp8], %[temp4], %[kC2] \n\t"
"mul %[temp17], %[temp12], %[kC1] \n\t"
"mul %[temp4], %[temp4], %[kC1] \n\t"
"mul %[temp12], %[temp12], %[kC2] \n\t"
"lh %[temp1], 2(%[in]) \n\t"
"lh %[temp5], 10(%[in]) \n\t"
"lh %[temp9], 18(%[in]) \n\t"
"lh %[temp13], 26(%[in]) \n\t"
"sra %[temp8], %[temp8], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp4], %[temp4], 16 \n\t"
"sra %[temp12], %[temp12], 16 \n\t"
"lh %[temp2], 4(%[in]) \n\t"
"lh %[temp6], 12(%[in]) \n\t"
"lh %[temp10], 20(%[in]) \n\t"
"lh %[temp14], 28(%[in]) \n\t"
"subu %[temp17], %[temp8], %[temp17] \n\t"
"addu %[temp4], %[temp4], %[temp12] \n\t"
"addu %[temp8], %[temp16], %[temp4] \n\t"
"subu %[temp4], %[temp16], %[temp4] \n\t"
"addu %[temp16], %[temp1], %[temp9] \n\t"
"subu %[temp1], %[temp1], %[temp9] \n\t"
"lh %[temp3], 6(%[in]) \n\t"
"lh %[temp7], 14(%[in]) \n\t"
"lh %[temp11], 22(%[in]) \n\t"
"lh %[temp15], 30(%[in]) \n\t"
"addu %[temp12], %[temp0], %[temp17] \n\t"
"subu %[temp0], %[temp0], %[temp17] \n\t"
"mul %[temp9], %[temp5], %[kC2] \n\t"
"mul %[temp17], %[temp13], %[kC1] \n\t"
"mul %[temp5], %[temp5], %[kC1] \n\t"
"mul %[temp13], %[temp13], %[kC2] \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"subu %[temp17], %[temp9], %[temp17] \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"addu %[temp5], %[temp5], %[temp13] \n\t"
"addu %[temp13], %[temp1], %[temp17] \n\t"
"subu %[temp1], %[temp1], %[temp17] \n\t"
"mul %[temp17], %[temp14], %[kC1] \n\t"
"mul %[temp14], %[temp14], %[kC2] \n\t"
"addu %[temp9], %[temp16], %[temp5] \n\t"
"subu %[temp5], %[temp16], %[temp5] \n\t"
"addu %[temp16], %[temp2], %[temp10] \n\t"
"subu %[temp2], %[temp2], %[temp10] \n\t"
"mul %[temp10], %[temp6], %[kC2] \n\t"
"mul %[temp6], %[temp6], %[kC1] \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp14], %[temp14], 16 \n\t"
"sra %[temp10], %[temp10], 16 \n\t"
"sra %[temp6], %[temp6], 16 \n\t"
"subu %[temp17], %[temp10], %[temp17] \n\t"
"addu %[temp6], %[temp6], %[temp14] \n\t"
"addu %[temp10], %[temp16], %[temp6] \n\t"
"subu %[temp6], %[temp16], %[temp6] \n\t"
"addu %[temp14], %[temp2], %[temp17] \n\t"
"subu %[temp2], %[temp2], %[temp17] \n\t"
"mul %[temp17], %[temp15], %[kC1] \n\t"
"mul %[temp15], %[temp15], %[kC2] \n\t"
"addu %[temp16], %[temp3], %[temp11] \n\t"
"subu %[temp3], %[temp3], %[temp11] \n\t"
"mul %[temp11], %[temp7], %[kC2] \n\t"
"mul %[temp7], %[temp7], %[kC1] \n\t"
"addiu %[temp8], %[temp8], 4 \n\t"
"addiu %[temp12], %[temp12], 4 \n\t"
"addiu %[temp0], %[temp0], 4 \n\t"
"addiu %[temp4], %[temp4], 4 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp15], %[temp15], 16 \n\t"
"sra %[temp11], %[temp11], 16 \n\t"
"sra %[temp7], %[temp7], 16 \n\t"
"subu %[temp17], %[temp11], %[temp17] \n\t"
"addu %[temp7], %[temp7], %[temp15] \n\t"
"addu %[temp15], %[temp3], %[temp17] \n\t"
"subu %[temp3], %[temp3], %[temp17] \n\t"
"addu %[temp11], %[temp16], %[temp7] \n\t"
"subu %[temp7], %[temp16], %[temp7] \n\t"
"addu %[temp16], %[temp8], %[temp10] \n\t"
"subu %[temp8], %[temp8], %[temp10] \n\t"
"mul %[temp10], %[temp9], %[kC2] \n\t"
"mul %[temp17], %[temp11], %[kC1] \n\t"
"mul %[temp9], %[temp9], %[kC1] \n\t"
"mul %[temp11], %[temp11], %[kC2] \n\t"
"sra %[temp10], %[temp10], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp11], %[temp11], 16 \n\t"
"subu %[temp17], %[temp10], %[temp17] \n\t"
"addu %[temp11], %[temp9], %[temp11] \n\t"
"addu %[temp10], %[temp12], %[temp14] \n\t"
"subu %[temp12], %[temp12], %[temp14] \n\t"
"mul %[temp14], %[temp13], %[kC2] \n\t"
"mul %[temp9], %[temp15], %[kC1] \n\t"
"mul %[temp13], %[temp13], %[kC1] \n\t"
"mul %[temp15], %[temp15], %[kC2] \n\t"
"sra %[temp14], %[temp14], 16 \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"sra %[temp15], %[temp15], 16 \n\t"
"subu %[temp9], %[temp14], %[temp9] \n\t"
"addu %[temp15], %[temp13], %[temp15] \n\t"
"addu %[temp14], %[temp0], %[temp2] \n\t"
"subu %[temp0], %[temp0], %[temp2] \n\t"
"mul %[temp2], %[temp1], %[kC2] \n\t"
"mul %[temp13], %[temp3], %[kC1] \n\t"
"mul %[temp1], %[temp1], %[kC1] \n\t"
"mul %[temp3], %[temp3], %[kC2] \n\t"
"sra %[temp2], %[temp2], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"sra %[temp1], %[temp1], 16 \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"subu %[temp13], %[temp2], %[temp13] \n\t"
"addu %[temp3], %[temp1], %[temp3] \n\t"
"addu %[temp2], %[temp4], %[temp6] \n\t"
"subu %[temp4], %[temp4], %[temp6] \n\t"
"mul %[temp6], %[temp5], %[kC2] \n\t"
"mul %[temp1], %[temp7], %[kC1] \n\t"
"mul %[temp5], %[temp5], %[kC1] \n\t"
"mul %[temp7], %[temp7], %[kC2] \n\t"
"sra %[temp6], %[temp6], 16 \n\t"
"sra %[temp1], %[temp1], 16 \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp7], %[temp7], 16 \n\t"
"subu %[temp1], %[temp6], %[temp1] \n\t"
"addu %[temp7], %[temp5], %[temp7] \n\t"
"addu %[temp5], %[temp16], %[temp11] \n\t"
"subu %[temp16], %[temp16], %[temp11] \n\t"
"addu %[temp11], %[temp8], %[temp17] \n\t"
"subu %[temp8], %[temp8], %[temp17] \n\t"
"sra %[temp5], %[temp5], 3 \n\t"
"sra %[temp16], %[temp16], 3 \n\t"
"sra %[temp11], %[temp11], 3 \n\t"
"sra %[temp8], %[temp8], 3 \n\t"
"addu %[temp17], %[temp10], %[temp15] \n\t"
"subu %[temp10], %[temp10], %[temp15] \n\t"
"addu %[temp15], %[temp12], %[temp9] \n\t"
"subu %[temp12], %[temp12], %[temp9] \n\t"
"sra %[temp17], %[temp17], 3 \n\t"
"sra %[temp10], %[temp10], 3 \n\t"
"sra %[temp15], %[temp15], 3 \n\t"
"sra %[temp12], %[temp12], 3 \n\t"
"addu %[temp9], %[temp14], %[temp3] \n\t"
"subu %[temp14], %[temp14], %[temp3] \n\t"
"addu %[temp3], %[temp0], %[temp13] \n\t"
"subu %[temp0], %[temp0], %[temp13] \n\t"
"sra %[temp9], %[temp9], 3 \n\t"
"sra %[temp14], %[temp14], 3 \n\t"
"sra %[temp3], %[temp3], 3 \n\t"
"sra %[temp0], %[temp0], 3 \n\t"
"addu %[temp13], %[temp2], %[temp7] \n\t"
"subu %[temp2], %[temp2], %[temp7] \n\t"
"addu %[temp7], %[temp4], %[temp1] \n\t"
"subu %[temp4], %[temp4], %[temp1] \n\t"
"sra %[temp13], %[temp13], 3 \n\t"
"sra %[temp2], %[temp2], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"sra %[temp4], %[temp4], 3 \n\t"
"addiu %[temp6], $zero, 255 \n\t"
"lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp5] \n\t"
"sra %[temp5], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
"beqz %[temp5], 1f \n\t"
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"1: \n\t"
"lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp11] \n\t"
"sra %[temp11], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
"beqz %[temp11], 2f \n\t"
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"2: \n\t"
"lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp8] \n\t"
"sra %[temp8], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
"beqz %[temp8], 3f \n\t"
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"3: \n\t"
"lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp16] \n\t"
"sra %[temp16], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
"beqz %[temp16], 4f \n\t"
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"4: \n\t"
"sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp17] \n\t"
"addu %[temp8], %[temp8], %[temp15] \n\t"
"addu %[temp11], %[temp11], %[temp12] \n\t"
"addu %[temp16], %[temp16], %[temp10] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"beqz %[temp18], 5f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"5: \n\t"
"sra %[temp18], %[temp8], 8 \n\t"
"sra %[temp1], %[temp8], 31 \n\t"
"beqz %[temp18], 6f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp1] \n\t"
"6: \n\t"
"sra %[temp18], %[temp11], 8 \n\t"
"sra %[temp1], %[temp11], 31 \n\t"
"sra %[temp17], %[temp16], 8 \n\t"
"sra %[temp15], %[temp16], 31 \n\t"
"beqz %[temp18], 7f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp1] \n\t"
"7: \n\t"
"beqz %[temp17], 8f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp15] \n\t"
"8: \n\t"
"sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp9] \n\t"
"addu %[temp8], %[temp8], %[temp3] \n\t"
"addu %[temp11], %[temp11], %[temp0] \n\t"
"addu %[temp16], %[temp16], %[temp14] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"sra %[temp17], %[temp8], 8 \n\t"
"sra %[temp15], %[temp8], 31 \n\t"
"sra %[temp12], %[temp11], 8 \n\t"
"sra %[temp10], %[temp11], 31 \n\t"
"sra %[temp9], %[temp16], 8 \n\t"
"sra %[temp3], %[temp16], 31 \n\t"
"beqz %[temp18], 9f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"9: \n\t"
"beqz %[temp17], 10f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp15] \n\t"
"10: \n\t"
"beqz %[temp12], 11f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp10] \n\t"
"11: \n\t"
"beqz %[temp9], 12f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"12: \n\t"
"sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp13] \n\t"
"addu %[temp8], %[temp8], %[temp7] \n\t"
"addu %[temp11], %[temp11], %[temp4] \n\t"
"addu %[temp16], %[temp16], %[temp2] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"sra %[temp17], %[temp8], 8 \n\t"
"sra %[temp15], %[temp8], 31 \n\t"
"sra %[temp12], %[temp11], 8 \n\t"
"sra %[temp10], %[temp11], 31 \n\t"
"sra %[temp9], %[temp16], 8 \n\t"
"sra %[temp3], %[temp16], 31 \n\t"
"beqz %[temp18], 13f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"13: \n\t"
"beqz %[temp17], 14f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp15] \n\t"
"14: \n\t"
"beqz %[temp12], 15f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp10] \n\t"
"15: \n\t"
"beqz %[temp9], 16f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"16: \n\t"
"sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18)
: [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
: "memory", "hi", "lo"
);
}
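// Wrapper: the VP8 decoder hands over up to two adjacent 4x4 blocks at a
// time; the second block's coefficients start at in + 16 and its output
// lands 4 pixels to the right of dst.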
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
VP8InitClipTables();
VP8Transform = TransformTwo;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
#endif // WEBP_USE_MIPS32

View file

@@ -0,0 +1,994 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of dsp functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
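// Illustrative example of the Q16 fixed-point multiply above: with
// kC2 = 35468 (~0.5412 * 65536), MUL(100, kC2) = (100 * 35468) >> 16
// = 3546800 >> 16 = 54, approximating 100 * 0.5412.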
static void TransformDC(const int16_t* in, uint8_t* dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
__asm__ volatile (
LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
"lh %[temp5], 0(%[in]) \n\t"
"addiu %[temp5], %[temp5], 4 \n\t"
"ins %[temp5], %[temp5], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 3 \n\t"
CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
temp3, temp1, temp2, temp3, temp4)
STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_10()
: [in]"r"(in), [dst]"r"(dst)
: "memory"
);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
int c4 = MUL(in[4], kC2);
const int d4 = MUL(in[4], kC1);
const int c1 = MUL(in[1], kC2);
const int d1 = MUL(in[1], kC1);
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
__asm__ volatile (
"ins %[c4], %[d4], 16, 16 \n\t"
"replv.ph %[temp1], %[a] \n\t"
"replv.ph %[temp4], %[d1] \n\t"
ADD_SUB_HALVES(temp2, temp3, temp1, c4)
"replv.ph %[temp5], %[c1] \n\t"
SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
temp11, temp17, temp3, temp5, temp11, temp12)
PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
temp4, temp7, temp6, temp10, temp9)
STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
temp17, temp12, temp18, temp1, temp8, temp2, temp4,
temp7, temp6, dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_18(),
[c4]"+&r"(c4)
: [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
: "memory"
);
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
__asm__ volatile (
"ulw %[temp1], 0(%[in]) \n\t"
"ulw %[temp2], 16(%[in]) \n\t"
LOAD_IN_X2(temp5, temp6, 24, 26)
ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
LOAD_IN_X2(temp1, temp2, 8, 10)
MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
temp13, temp11, temp14, temp12)
INSERT_HALF_X2(temp8, temp7, temp10, temp9)
"ulw %[temp17], 4(%[in]) \n\t"
"ulw %[temp18], 20(%[in]) \n\t"
ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
LOAD_IN_X2(temp17, temp18, 12, 14)
LOAD_IN_X2(temp9, temp10, 28, 30)
MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
temp15, temp4, temp16, temp17)
INSERT_HALF_X2(temp11, temp12, temp13, temp14)
ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
// horizontal
SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
INSERT_HALF_X2(temp1, temp6, temp5, temp2)
SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
"repl.ph %[temp2], 0x4 \n\t"
INSERT_HALF_X2(temp3, temp8, temp17, temp4)
"addq.ph %[temp1], %[temp1], %[temp2] \n\t"
"addq.ph %[temp6], %[temp6], %[temp2] \n\t"
ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
temp6, temp17, temp8, temp18)
MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
temp18, temp12, temp17, temp16)
INSERT_HALF_X2(temp1, temp3, temp9, temp13)
INSERT_HALF_X2(temp6, temp8, temp11, temp15)
SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
temp6)
PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
temp16, temp11, temp10, temp15, temp14)
LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
temp11, temp10, temp11, temp14, temp15)
STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_18()
: [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
: "memory", "hi", "lo"
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"negu %[temp1], %[hstride] \n\t"
"addiu %[size], %[size], -1 \n\t"
"sll %[temp2], %[hstride], 1 \n\t"
"sll %[temp3], %[temp1], 1 \n\t"
"addu %[temp4], %[temp2], %[hstride] \n\t"
"addu %[temp5], %[temp3], %[temp1] \n\t"
"lbu %[temp7], 0(%[p]) \n\t"
"sll %[temp6], %[temp3], 1 \n\t"
"lbux %[temp8], %[temp5](%[p]) \n\t"
"lbux %[temp9], %[temp3](%[p]) \n\t"
"lbux %[temp10], %[temp1](%[p]) \n\t"
"lbux %[temp11], %[temp6](%[p]) \n\t"
"lbux %[temp12], %[hstride](%[p]) \n\t"
"lbux %[temp13], %[temp2](%[p]) \n\t"
"lbux %[temp14], %[temp4](%[p]) \n\t"
"subu %[temp1], %[temp10], %[temp7] \n\t"
"subu %[temp2], %[temp9], %[temp12] \n\t"
"absq_s.w %[temp3], %[temp1] \n\t"
"absq_s.w %[temp4], %[temp2] \n\t"
"negu %[temp1], %[temp1] \n\t"
"sll %[temp3], %[temp3], 2 \n\t"
"addu %[temp15], %[temp3], %[temp4] \n\t"
"subu %[temp3], %[temp15], %[thresh2] \n\t"
"sll %[temp6], %[temp1], 1 \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp4], %[temp11], %[temp8] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"shll_s.w %[temp2], %[temp2], 24 \n\t"
"subu %[temp4], %[temp4], %[ithresh] \n\t"
"bgtz %[temp4], 3f \n\t"
" subu %[temp3], %[temp8], %[temp9] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp5], %[temp9], %[temp10] \n\t"
"absq_s.w %[temp3], %[temp5] \n\t"
"absq_s.w %[temp5], %[temp5] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp3], %[temp14], %[temp13] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"slt %[temp5], %[hev_thresh], %[temp5] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp3], %[temp13], %[temp12] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"sra %[temp4], %[temp2], 24 \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp15], %[temp12], %[temp7] \n\t"
"absq_s.w %[temp3], %[temp15] \n\t"
"absq_s.w %[temp15], %[temp15] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" slt %[temp15], %[hev_thresh], %[temp15] \n\t"
"addu %[temp3], %[temp6], %[temp1] \n\t"
"or %[temp2], %[temp5], %[temp15] \n\t"
"addu %[temp5], %[temp4], %[temp3] \n\t"
"beqz %[temp2], 4f \n\t"
" shra_r.w %[temp1], %[temp5], 3 \n\t"
"addiu %[temp2], %[temp5], 3 \n\t"
"sra %[temp2], %[temp2], 3 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"subu %[temp3], %[p], %[hstride] \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"subu %[temp1], %[temp7], %[temp1] \n\t"
"addu %[temp2], %[temp10], %[temp2] \n\t"
"lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
"lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[temp3]) \n\t"
"j 3f \n\t"
" sb %[temp1], 0(%[p]) \n\t"
"4: \n\t"
"shll_s.w %[temp5], %[temp5], 24 \n\t"
"subu %[temp14], %[p], %[hstride] \n\t"
"subu %[temp11], %[temp14], %[hstride] \n\t"
"sra %[temp6], %[temp5], 24 \n\t"
"sll %[temp1], %[temp6], 3 \n\t"
"subu %[temp15], %[temp11], %[hstride] \n\t"
"addu %[temp2], %[temp6], %[temp1] \n\t"
"sll %[temp3], %[temp2], 1 \n\t"
"addu %[temp4], %[temp3], %[temp2] \n\t"
"addiu %[temp2], %[temp2], 63 \n\t"
"addiu %[temp3], %[temp3], 63 \n\t"
"addiu %[temp4], %[temp4], 63 \n\t"
"sra %[temp2], %[temp2], 7 \n\t"
"sra %[temp3], %[temp3], 7 \n\t"
"sra %[temp4], %[temp4], 7 \n\t"
"addu %[temp1], %[temp8], %[temp2] \n\t"
"addu %[temp5], %[temp9], %[temp3] \n\t"
"addu %[temp6], %[temp10], %[temp4] \n\t"
"subu %[temp8], %[temp7], %[temp4] \n\t"
"subu %[temp7], %[temp12], %[temp3] \n\t"
"addu %[temp10], %[p], %[hstride] \n\t"
"subu %[temp9], %[temp13], %[temp2] \n\t"
"addu %[temp12], %[temp10], %[hstride] \n\t"
"lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
"lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
"lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
"lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[temp15]) \n\t"
"sb %[temp3], 0(%[temp11]) \n\t"
"sb %[temp4], 0(%[temp14]) \n\t"
"sb %[temp5], 0(%[p]) \n\t"
"sb %[temp6], 0(%[temp10]) \n\t"
"sb %[temp8], 0(%[temp12]) \n\t"
"3: \n\t"
"bgtz %[size], 1b \n\t"
" addu %[p], %[p], %[vstride] \n\t"
".set pop \n\t"
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
[temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
[temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
[size]"+&r"(size), [p]"+&r"(p)
: [hstride]"r"(hstride), [thresh2]"r"(thresh2),
[ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
[VP8kclip1]"r"(VP8kclip1)
: "memory"
);
}
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
int p0, q0, p1, q1, p2, q2, p3, q3;
int step1, step2, temp1, temp2, temp3, temp4;
uint8_t* pTemp0;
uint8_t* pTemp1;
const int thresh2 = 2 * thresh + 1;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"bltz %[size], 3f \n\t"
" nop \n\t"
"2: \n\t"
"negu %[step1], %[hstride] \n\t"
"lbu %[q0], 0(%[p]) \n\t"
"lbux %[p0], %[step1](%[p]) \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"lbux %[q1], %[hstride](%[p]) \n\t"
"subu %[temp1], %[p0], %[q0] \n\t"
"lbux %[p1], %[step1](%[p]) \n\t"
"addu %[step2], %[hstride], %[hstride] \n\t"
"absq_s.w %[temp2], %[temp1] \n\t"
"subu %[temp3], %[p1], %[q1] \n\t"
"absq_s.w %[temp4], %[temp3] \n\t"
"sll %[temp2], %[temp2], 2 \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"subu %[temp4], %[temp2], %[thresh2] \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"bgtz %[temp4], 0f \n\t"
" lbux %[p2], %[step1](%[p]) \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"lbux %[q2], %[step2](%[p]) \n\t"
"lbux %[p3], %[step1](%[p]) \n\t"
"subu %[temp4], %[p2], %[p1] \n\t"
"addu %[step2], %[step2], %[hstride] \n\t"
"subu %[temp2], %[p3], %[p2] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"absq_s.w %[temp2], %[temp2] \n\t"
"lbux %[q3], %[step2](%[p]) \n\t"
"subu %[temp4], %[temp4], %[ithresh] \n\t"
"negu %[temp1], %[temp1] \n\t"
"bgtz %[temp4], 0f \n\t"
" subu %[temp2], %[temp2], %[ithresh] \n\t"
"subu %[p3], %[p1], %[p0] \n\t"
"bgtz %[temp2], 0f \n\t"
" absq_s.w %[p3], %[p3] \n\t"
"subu %[temp4], %[q3], %[q2] \n\t"
"subu %[pTemp0], %[p], %[hstride] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"subu %[temp2], %[p3], %[ithresh] \n\t"
"sll %[step1], %[temp1], 1 \n\t"
"bgtz %[temp2], 0f \n\t"
" subu %[temp4], %[temp4], %[ithresh] \n\t"
"subu %[temp2], %[q2], %[q1] \n\t"
"bgtz %[temp4], 0f \n\t"
" absq_s.w %[temp2], %[temp2] \n\t"
"subu %[q3], %[q1], %[q0] \n\t"
"absq_s.w %[q3], %[q3] \n\t"
"subu %[temp2], %[temp2], %[ithresh] \n\t"
"addu %[temp1], %[temp1], %[step1] \n\t"
"bgtz %[temp2], 0f \n\t"
" subu %[temp4], %[q3], %[ithresh] \n\t"
"slt %[p3], %[hev_thresh], %[p3] \n\t"
"bgtz %[temp4], 0f \n\t"
" slt %[q3], %[hev_thresh], %[q3] \n\t"
"or %[q3], %[q3], %[p3] \n\t"
"bgtz %[q3], 1f \n\t"
" shra_r.w %[temp2], %[temp1], 3 \n\t"
"addiu %[temp1], %[temp1], 3 \n\t"
"sra %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"addu %[pTemp1], %[p], %[hstride] \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"addiu %[step1], %[temp2], 1 \n\t"
"sra %[step1], %[step1], 1 \n\t"
"addu %[p0], %[p0], %[temp1] \n\t"
"addu %[p1], %[p1], %[step1] \n\t"
"subu %[q0], %[q0], %[temp2] \n\t"
"subu %[q1], %[q1], %[step1] \n\t"
"lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
"lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[pTemp0]) \n\t"
"lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
"subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
"sb %[temp3], 0(%[p]) \n\t"
"sb %[temp4], 0(%[pTemp1]) \n\t"
"j 0f \n\t"
" sb %[temp1], 0(%[pTemp0]) \n\t"
"1: \n\t"
"shll_s.w %[temp3], %[temp3], 24 \n\t"
"sra %[temp3], %[temp3], 24 \n\t"
"addu %[temp1], %[temp1], %[temp3] \n\t"
"shra_r.w %[temp2], %[temp1], 3 \n\t"
"addiu %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"addu %[p0], %[p0], %[temp1] \n\t"
"subu %[q0], %[q0], %[temp2] \n\t"
"lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
"lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[p]) \n\t"
"sb %[temp1], 0(%[pTemp0]) \n\t"
"0: \n\t"
"subu %[size], %[size], 1 \n\t"
"bgtz %[size], 2b \n\t"
" addu %[p], %[p], %[vstride] \n\t"
"3: \n\t"
".set pop \n\t"
: [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
[p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
[step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
[size]"+&r"(size)
: [vstride]"r"(vstride), [ithresh]"r"(ithresh),
[hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
[VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
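// Note: FilterLoop26/FilterLoop24 are direction-agnostic: hstride steps
// across the edge being filtered and vstride steps along it, so the same
// loop body serves both the vertical (hstride == stride, vstride == 1)
// and the horizontal (hstride == 1, vstride == stride) wrappers below.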
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#undef MUL
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
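// The entry test below mirrors the scalar simple-filter threshold check:
// the column is left untouched when, roughly,
// 4 * |p0 - q0| + |p1 - q1| > 2 * thresh + 1 (== thresh2).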
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
uint8_t* p1 = p - stride;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"li %[i], 16 \n\t"
"0: \n\t"
"negu %[temp4], %[stride] \n\t"
"sll %[temp5], %[temp4], 1 \n\t"
"lbu %[temp2], 0(%[p]) \n\t"
"lbux %[temp3], %[stride](%[p]) \n\t"
"lbux %[temp1], %[temp4](%[p]) \n\t"
"lbux %[temp0], %[temp5](%[p]) \n\t"
"subu %[temp7], %[temp1], %[temp2] \n\t"
"subu %[temp6], %[temp0], %[temp3] \n\t"
"absq_s.w %[temp4], %[temp7] \n\t"
"absq_s.w %[temp5], %[temp6] \n\t"
"sll %[temp4], %[temp4], 2 \n\t"
"subu %[temp5], %[temp5], %[thresh2] \n\t"
"addu %[temp5], %[temp4], %[temp5] \n\t"
"negu %[temp8], %[temp7] \n\t"
"bgtz %[temp5], 1f \n\t"
" addiu %[i], %[i], -1 \n\t"
"sll %[temp4], %[temp8], 1 \n\t"
"shll_s.w %[temp5], %[temp6], 24 \n\t"
"addu %[temp3], %[temp4], %[temp8] \n\t"
"sra %[temp5], %[temp5], 24 \n\t"
"addu %[temp3], %[temp3], %[temp5] \n\t"
"addiu %[temp7], %[temp3], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"shra_r.w %[temp8], %[temp3], 3 \n\t"
"shll_s.w %[temp0], %[temp7], 27 \n\t"
"shll_s.w %[temp4], %[temp8], 27 \n\t"
"sra %[temp0], %[temp0], 27 \n\t"
"sra %[temp4], %[temp4], 27 \n\t"
"addu %[temp7], %[temp1], %[temp0] \n\t"
"subu %[temp2], %[temp2], %[temp4] \n\t"
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
"sb %[temp3], 0(%[p1]) \n\t"
"sb %[temp4], 0(%[p]) \n\t"
"1: \n\t"
"addiu %[p1], %[p1], 1 \n\t"
"bgtz %[i], 0b \n\t"
" addiu %[p], %[p], 1 \n\t"
" .set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
A, A1, B, B1, C, C1, D, D1, SRC) \
"lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"li %[i], 16 \n\t"
"0: \n\t"
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
"subu %[temp7], %[temp1], %[temp2] \n\t"
"subu %[temp6], %[temp0], %[temp3] \n\t"
"absq_s.w %[temp4], %[temp7] \n\t"
"absq_s.w %[temp5], %[temp6] \n\t"
"sll %[temp4], %[temp4], 2 \n\t"
"addu %[temp5], %[temp4], %[temp5] \n\t"
"subu %[temp5], %[temp5], %[thresh2] \n\t"
"negu %[temp8], %[temp7] \n\t"
"bgtz %[temp5], 1f \n\t"
" addiu %[i], %[i], -1 \n\t"
"sll %[temp4], %[temp8], 1 \n\t"
"shll_s.w %[temp5], %[temp6], 24 \n\t"
"addu %[temp3], %[temp4], %[temp8] \n\t"
"sra %[temp5], %[temp5], 24 \n\t"
"addu %[temp3], %[temp3], %[temp5] \n\t"
"addiu %[temp7], %[temp3], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"shra_r.w %[temp8], %[temp3], 3 \n\t"
"shll_s.w %[temp0], %[temp7], 27 \n\t"
"shll_s.w %[temp4], %[temp8], 27 \n\t"
"sra %[temp0], %[temp0], 27 \n\t"
"sra %[temp4], %[temp4], 27 \n\t"
"addu %[temp7], %[temp1], %[temp0] \n\t"
"subu %[temp2], %[temp2], %[temp4] \n\t"
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
"sb %[temp3], -1(%[p]) \n\t"
"sb %[temp4], 0(%[p]) \n\t"
"1: \n\t"
"bgtz %[i], 0b \n\t"
" addu %[p], %[p], %[stride] \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[p]"+&r"(p), [i]"=&r"(i)
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
}
}
// DST[A * BPS] = TEMP0
// DST[B + C * BPS] = TEMP1
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
"usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
"usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
static void VE4(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile (
"ulw %[temp0], -1(%[top]) \n\t"
"ulh %[temp1], 3(%[top]) \n\t"
"preceu.ph.qbr %[temp2], %[temp0] \n\t"
"preceu.ph.qbl %[temp3], %[temp0] \n\t"
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
"packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
"packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
"shll.ph %[temp5], %[temp5], 1 \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp2], %[temp5], %[temp2] \n\t"
"addq.ph %[temp6], %[temp6], %[temp4] \n\t"
"addq.ph %[temp2], %[temp2], %[temp3] \n\t"
"addq.ph %[temp6], %[temp6], %[temp3] \n\t"
"shra_r.ph %[temp2], %[temp2], 2 \n\t"
"shra_r.ph %[temp6], %[temp6], 2 \n\t"
"precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6)
: [top]"r"(top), [dst]"r"(dst)
: "memory"
);
}
static void DC4(uint8_t* dst) { // DC
int temp0, temp1, temp2, temp3, temp4;
__asm__ volatile (
"ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
"ins %[temp1], %[temp2], 8, 8 \n\t"
"ins %[temp1], %[temp3], 16, 8 \n\t"
"ins %[temp1], %[temp4], 24, 8 \n\t"
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
: [dst]"r"(dst)
: "memory"
);
}
static void RD4(uint8_t* dst) { // Down-right
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8;
__asm__ volatile (
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
"ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"preceu.ph.qbr %[temp5], %[temp7] \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"preceu.ph.qbl %[temp4], %[temp7] \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"shll.ph %[temp2], %[temp2], 1 \n\t"
"addq.ph %[temp3], %[temp3], %[temp1] \n\t"
"packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
"addq.ph %[temp1], %[temp1], %[temp5] \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp1], %[temp1], %[temp6] \n\t"
"packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
"addq.ph %[temp8], %[temp5], %[temp4] \n\t"
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
"shll.ph %[temp0], %[temp0], 1 \n\t"
"shra_r.ph %[temp1], %[temp1], 2 \n\t"
"addq.ph %[temp8], %[temp0], %[temp8] \n\t"
"lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
"precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
"shra_r.ph %[temp8], %[temp8], 2 \n\t"
"ins %[temp7], %[temp5], 0, 8 \n\t"
"precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
"raddu.w.qb %[temp4], %[temp7] \n\t"
"precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
"shra_r.w %[temp4], %[temp4], 2 \n\t"
STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
"prepend %[temp2], %[temp8], 8 \n\t"
"prepend %[temp6], %[temp4], 8 \n\t"
STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [dst]"r"(dst)
: "memory"
);
}
// TEMP0 = SRC[A * BPS]
// TEMP1 = SRC[B + C * BPS]
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
"ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
static void LD4(uint8_t* dst) { // Down-Left
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
"preceu.ph.qbl %[temp2], %[temp0] \n\t"
"preceu.ph.qbr %[temp3], %[temp0] \n\t"
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
"preceu.ph.qbl %[temp5], %[temp1] \n\t"
"packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
"packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
"packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp9], %[temp2], %[temp6] \n\t"
"shll.ph %[temp7], %[temp7], 1 \n\t"
"addq.ph %[temp9], %[temp9], %[temp3] \n\t"
"shll.ph %[temp8], %[temp8], 1 \n\t"
"shra_r.ph %[temp9], %[temp9], 2 \n\t"
"addq.ph %[temp3], %[temp4], %[temp7] \n\t"
"addq.ph %[temp0], %[temp5], %[temp8] \n\t"
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
"addq.ph %[temp0], %[temp0], %[temp4] \n\t"
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
"shra_r.ph %[temp0], %[temp0], 2 \n\t"
"srl %[temp1], %[temp1], 24 \n\t"
"sll %[temp1], %[temp1], 1 \n\t"
"raddu.w.qb %[temp5], %[temp5] \n\t"
"precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
"precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
"addu %[temp1], %[temp1], %[temp5] \n\t"
"shra_r.w %[temp1], %[temp1], 2 \n\t"
STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
"prepend %[temp9], %[temp0], 8 \n\t"
"prepend %[temp3], %[temp1], 8 \n\t"
STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9)
: [dst]"r"(dst)
: "memory"
);
}
//------------------------------------------------------------------------------
// Chroma
static void DC8uv(uint8_t* dst) { // DC
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addu %[temp4], %[temp4], %[temp5] \n\t"
"addu %[temp6], %[temp6], %[temp7] \n\t"
"addu %[temp8], %[temp8], %[temp9] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"addu %[temp6], %[temp6], %[temp8] \n\t"
"addu %[temp0], %[temp0], %[temp2] \n\t"
"addu %[temp0], %[temp0], %[temp6] \n\t"
"shra_r.w %[temp0], %[temp0], 4 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9)
: [dst]"r"(dst)
: "memory"
);
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
int temp0, temp1;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
: [dst]"r"(dst)
: "memory"
);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8;
__asm__ volatile (
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addu %[temp4], %[temp4], %[temp5] \n\t"
"addu %[temp6], %[temp6], %[temp7] \n\t"
"addu %[temp8], %[temp8], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"addu %[temp6], %[temp6], %[temp8] \n\t"
"addu %[temp0], %[temp6], %[temp2] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [dst]"r"(dst)
: "memory"
);
}
#undef LOAD_8_BYTES
#undef STORE_8_BYTES
#undef LOAD_4_BYTES
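// The CLIPPING/CLIP_*_TO_DST machinery below implements TrueMotion
// prediction, dst[x] = clip(top[x] + left - top_left): the per-row term
// (left - top_left) is formed once (the subu.ph on dst_1) and then added
// to each top sample with saturation.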
#define CLIPPING(SIZE) \
"preceu.ph.qbl %[temp2], %[temp0] \n\t" \
"preceu.ph.qbr %[temp0], %[temp0] \n\t" \
".if " #SIZE " == 8 \n\t" \
"preceu.ph.qbl %[temp3], %[temp1] \n\t" \
"preceu.ph.qbr %[temp1], %[temp1] \n\t" \
".endif \n\t" \
"addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
"addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
".if " #SIZE " == 8 \n\t" \
"addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
"addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
".endif \n\t" \
"shll_s.ph %[temp2], %[temp2], 7 \n\t" \
"shll_s.ph %[temp0], %[temp0], 7 \n\t" \
".if " #SIZE " == 8 \n\t" \
"shll_s.ph %[temp3], %[temp3], 7 \n\t" \
"shll_s.ph %[temp1], %[temp1], 7 \n\t" \
".endif \n\t" \
"precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
".if " #SIZE " == 8 \n\t" \
"precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
".endif \n\t"
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
int temp0, temp1, temp2, temp3; \
__asm__ volatile ( \
".if " #SIZE " < 8 \n\t" \
"ulw %[temp0], 0(%[top]) \n\t" \
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
CLIPPING(4) \
"usw %[temp0], 0(%[dst]) \n\t" \
".else \n\t" \
"ulw %[temp0], 0(%[top]) \n\t" \
"ulw %[temp1], 4(%[top]) \n\t" \
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
CLIPPING(8) \
"usw %[temp0], 0(%[dst]) \n\t" \
"usw %[temp1], 4(%[dst]) \n\t" \
".if " #SIZE " == 16 \n\t" \
"ulw %[temp0], 8(%[top]) \n\t" \
"ulw %[temp1], 12(%[top]) \n\t" \
CLIPPING(8) \
"usw %[temp0], 8(%[dst]) \n\t" \
"usw %[temp1], 12(%[dst]) \n\t" \
".endif \n\t" \
".endif \n\t" \
: [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
: [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
: "memory" \
); \
} while (0)
#define CLIP_TO_DST(DST, SIZE) do { \
int y; \
const uint8_t* top = (DST) - BPS; \
const int top_1 = ((int)top[-1] << 16) + top[-1]; \
for (y = 0; y < (SIZE); ++y) { \
CLIP_8B_TO_DST((DST), top, (SIZE)); \
(DST) += BPS; \
} \
} while (0)
#define TRUE_MOTION(DST, SIZE) \
static void TrueMotion##SIZE(uint8_t* (DST)) { \
CLIP_TO_DST((DST), (SIZE)); \
}
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
#undef TRUE_MOTION
#undef CLIP_TO_DST
#undef CLIP_8B_TO_DST
#undef CLIPPING
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
VP8TransformDC = TransformDC;
VP8TransformAC3 = TransformAC3;
VP8Transform = TransformTwo;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
VP8PredLuma4[0] = DC4;
VP8PredLuma4[1] = TrueMotion4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[6] = LD4;
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TrueMotion8;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredLuma16[1] = TrueMotion16;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View file

@@ -0,0 +1,45 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some decoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include "../dec/vp8i.h"
static void HE16(uint8_t* dst) { // horizontal
int j;
const __m128i kShuffle3 = _mm_set1_epi8(3);
for (j = 16; j > 0; --j) {
const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4));
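// bytes 0..3 of 'in' are dst[-4..-1]; broadcasting byte 3 replicates
// dst[-1], the left-neighbor pixel, across the whole 16-byte row.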
const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
_mm_storeu_si128((__m128i*)dst, values);
dst += BPS;
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
VP8PredLuma16[3] = HE16;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8DspInitSSE41)
#endif // WEBP_USE_SSE41

View file

@@ -0,0 +1,21 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AVX2 version of speed-critical encoding functions.
#include "./dsp.h"
#if defined(WEBP_USE_AVX2)
#endif // WEBP_USE_AVX2
//------------------------------------------------------------------------------
// Entry point
WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2)

View file

@@ -0,0 +1,672 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of speed-critical encoding functions.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
#include "../enc/vp8enci.h"
#include "../enc/cost.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
// macro for one vertical pass in ITransformOne
// MUL macro inlined
// temp0..temp15 hold tmp[0]..tmp[15]
// A..D - offsets in bytes to load from in buffer
// TEMP0..TEMP3 - registers for corresponding tmp elements
// TEMP4..TEMP5 - temporary registers
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
"lh %[temp16], " #A "(%[temp20]) \n\t" \
"lh %[temp18], " #B "(%[temp20]) \n\t" \
"lh %[temp17], " #C "(%[temp20]) \n\t" \
"lh %[temp19], " #D "(%[temp20]) \n\t" \
"addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
"subu %[temp16], %[temp16], %[temp18] \n\t" \
"mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
"mul %[temp18], %[temp19], %[kC1] \n\t" \
"mul %[temp17], %[temp17], %[kC1] \n\t" \
"mul %[temp19], %[temp19], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
"sra %[temp18], %[temp18], 16 \n\n" \
"sra %[temp17], %[temp17], 16 \n\n" \
"sra %[temp19], %[temp19], 16 \n\n" \
"subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
"addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
"addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
"addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
// macro for one horizontal pass in ITransformOne
// MUL and STORE macros inlined
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// temp0..temp15 hold tmp[0]..tmp[15]
// A - offset in bytes to load from ref and store to dst buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
"addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
"mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
"mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
"mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"lw %[temp20], 0(%[args]) \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
"addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
"addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
"slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
"slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
"slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
"slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
"movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
"movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
"movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
"movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
"addiu %[temp20], $zero, 255 \n\t" \
"slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
"slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
"slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
"slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
"movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
"movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
"lw %[temp16], 8(%[args]) \n\t" \
"movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
"movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
"sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one inverse transform (the one-or-two wrapper is ITransform below).
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
__asm__ volatile(
"lw %[temp20], 4(%[args]) \n\t"
VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
: "memory", "hi", "lo"
);
}
static void ITransform(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
// macro for one pass through the for loop in QuantizeBlock
// QUANTDIV macro inlined
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
#define QUANTIZE_ONE(J, K, N) \
"lh %[temp0], " #J "(%[ppin]) \n\t" \
"lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
"lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
"sra %[sign], %[temp0], 15 \n\t" \
"xor %[coeff], %[temp0], %[sign] \n\t" \
"subu %[coeff], %[coeff], %[sign] \n\t" \
"addu %[coeff], %[coeff], %[temp1] \n\t" \
"slt %[temp4], %[temp2], %[coeff] \n\t" \
"addiu %[temp5], $zero, 0 \n\t" \
"addiu %[level], $zero, 0 \n\t" \
"beqz %[temp4], 2f \n\t" \
"lhu %[temp1], " #J "(%[ppiq]) \n\t" \
"lw %[temp2], " #K "(%[ppbias]) \n\t" \
"lhu %[temp3], " #J "(%[ppq]) \n\t" \
"mul %[level], %[coeff], %[temp1] \n\t" \
"addu %[level], %[level], %[temp2] \n\t" \
"sra %[level], %[level], 17 \n\t" \
"slt %[temp4], %[max_level], %[level] \n\t" \
"movn %[level], %[max_level], %[temp4] \n\t" \
"xor %[level], %[level], %[sign] \n\t" \
"subu %[level], %[level], %[sign] \n\t" \
"mul %[temp5], %[level], %[temp3] \n\t" \
"2: \n\t" \
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
int16_t* ppin = &in[0];
int16_t* pout = &out[0];
const uint16_t* ppsharpen = &mtx->sharpen_[0];
const uint32_t* ppzthresh = &mtx->zthresh_[0];
const uint16_t* ppq = &mtx->q_[0];
const uint16_t* ppiq = &mtx->iq_[0];
const uint32_t* ppbias = &mtx->bias_[0];
__asm__ volatile(
QUANTIZE_ONE( 0, 0, 0)
QUANTIZE_ONE( 2, 4, 2)
QUANTIZE_ONE( 8, 16, 4)
QUANTIZE_ONE(16, 32, 6)
QUANTIZE_ONE(10, 20, 8)
QUANTIZE_ONE( 4, 8, 10)
QUANTIZE_ONE( 6, 12, 12)
QUANTIZE_ONE(12, 24, 14)
QUANTIZE_ONE(18, 36, 16)
QUANTIZE_ONE(24, 48, 18)
QUANTIZE_ONE(26, 52, 20)
QUANTIZE_ONE(20, 40, 22)
QUANTIZE_ONE(14, 28, 24)
QUANTIZE_ONE(22, 44, 26)
QUANTIZE_ONE(28, 56, 28)
QUANTIZE_ONE(30, 60, 30)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[sign]"=&r"(sign), [coeff]"=&r"(coeff),
[level]"=&r"(level)
: [pout]"r"(pout), [ppin]"r"(ppin),
[ppiq]"r"(ppiq), [max_level]"r"(max_level),
[ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
[ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
: "memory", "hi", "lo"
);
// kept outside the asm macro so the function can return as soon as a
// non-zero coefficient is found
for (i = 15; i >= 0; i--) {
if (out[i]) return 1;
}
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#undef QUANTIZE_ONE
// macro for one horizontal pass in Disto4x4 (TTransform)
// two calls of the TTransform function are merged into a single one
// A - offset in bytes to load from a and b buffers
// E..H - offsets in bytes to store first results to tmp buffer
// E1..H1 - offsets in bytes to store second results to tmp buffer
#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
"lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"addu %[temp2], %[temp1], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp3] \n\t" \
"addu %[temp3], %[temp4], %[temp6] \n\t" \
"subu %[temp4], %[temp4], %[temp6] \n\t" \
"addu %[temp6], %[temp5], %[temp7] \n\t" \
"subu %[temp5], %[temp5], %[temp7] \n\t" \
"addu %[temp7], %[temp8], %[temp2] \n\t" \
"subu %[temp2], %[temp8], %[temp2] \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp3], %[temp6] \n\t" \
"subu %[temp3], %[temp3], %[temp6] \n\t" \
"addu %[temp6], %[temp4], %[temp5] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"sw %[temp7], " #E "(%[tmp]) \n\t" \
"sw %[temp2], " #H "(%[tmp]) \n\t" \
"sw %[temp8], " #F "(%[tmp]) \n\t" \
"sw %[temp0], " #G "(%[tmp]) \n\t" \
"sw %[temp1], " #E1 "(%[tmp]) \n\t" \
"sw %[temp3], " #H1 "(%[tmp]) \n\t" \
"sw %[temp6], " #F1 "(%[tmp]) \n\t" \
"sw %[temp4], " #G1 "(%[tmp]) \n\t"
// macro for one vertical pass in Disto4x4 (TTransform)
// two calls of the TTransform function are merged into a single one;
// since only one accumulator is available in the mips32r1 instruction set,
// the second TTransform call is evaluated first, followed by the first one:
// const int sum1 = TTransform(a, w);
// const int sum2 = TTransform(b, w);
// return abs(sum2 - sum1) >> 5;
// (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
// A..D - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H - offsets in bytes to load from w buffer
#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
"lw %[temp0], " #A1 "(%[tmp]) \n\t" \
"lw %[temp1], " #C1 "(%[tmp]) \n\t" \
"lw %[temp2], " #B1 "(%[tmp]) \n\t" \
"lw %[temp3], " #D1 "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp8], %[temp8], %[temp1] \n\t" \
"addu %[temp1], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp4], %[temp3], 31 \n\t" \
"sra %[temp5], %[temp1], 31 \n\t" \
"sra %[temp6], %[temp0], 31 \n\t" \
"sra %[temp7], %[temp8], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp4] \n\t" \
"xor %[temp1], %[temp1], %[temp5] \n\t" \
"xor %[temp0], %[temp0], %[temp6] \n\t" \
"xor %[temp8], %[temp8], %[temp7] \n\t" \
"subu %[temp3], %[temp3], %[temp4] \n\t" \
"subu %[temp1], %[temp1], %[temp5] \n\t" \
"subu %[temp0], %[temp0], %[temp6] \n\t" \
"subu %[temp8], %[temp8], %[temp7] \n\t" \
"lhu %[temp4], " #E "(%[w]) \n\t" \
"lhu %[temp5], " #F "(%[w]) \n\t" \
"lhu %[temp6], " #G "(%[w]) \n\t" \
"lhu %[temp7], " #H "(%[w]) \n\t" \
"madd %[temp4], %[temp3] \n\t" \
"madd %[temp5], %[temp1] \n\t" \
"madd %[temp6], %[temp0] \n\t" \
"madd %[temp7], %[temp8] \n\t" \
"lw %[temp0], " #A "(%[tmp]) \n\t" \
"lw %[temp1], " #C "(%[tmp]) \n\t" \
"lw %[temp2], " #B "(%[tmp]) \n\t" \
"lw %[temp3], " #D "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp1], %[temp8], %[temp1] \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp2], %[temp3], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp2] \n\t" \
"subu %[temp3], %[temp3], %[temp2] \n\t" \
"msub %[temp4], %[temp3] \n\t" \
"sra %[temp2], %[temp8], 31 \n\t" \
"sra %[temp3], %[temp0], 31 \n\t" \
"sra %[temp4], %[temp1], 31 \n\t" \
"xor %[temp8], %[temp8], %[temp2] \n\t" \
"xor %[temp0], %[temp0], %[temp3] \n\t" \
"xor %[temp1], %[temp1], %[temp4] \n\t" \
"subu %[temp8], %[temp8], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp4] \n\t" \
"msub %[temp5], %[temp8] \n\t" \
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile(
HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
"mthi $zero \n\t"
"mtlo $zero \n\t"
VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
"mflo %[temp0] \n\t"
"sra %[temp1], %[temp0], 31 \n\t"
"xor %[temp0], %[temp0], %[temp1] \n\t"
"subu %[temp0], %[temp0], %[temp1] \n\t"
"sra %[temp0], %[temp0], 5 \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
: "memory", "hi", "lo"
);
return temp0;
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
}
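// For reference (not part of this file): a plain-C sketch of what the merged
// assembly above computes, mirroring libwebp's generic C implementation.
// Names are illustrative; BPS comes from ./dsp.h and abs() from <stdlib.h>.
static int TTransformRef(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass: rows of the 4x4 block -> tmp[]
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass: columns of tmp[], weighted by w[]
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1);
    sum += w[4] * abs(a3 + a2);
    sum += w[8] * abs(a3 - a2);
    sum += w[12] * abs(a0 - a1);
  }
  return sum;
}
static int Disto4x4Ref(const uint8_t* const a, const uint8_t* const b,
                       const uint16_t* const w) {
  // the assembly folds both transforms into the single HI/LO accumulator,
  // using madd for the b-terms (sum2) and msub for the a-terms (sum1)
  const int sum1 = TTransformRef(a, w);
  const int sum2 = TTransformRef(b, w);
  return abs(sum2 - sum1) >> 5;
}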
// macro for one horizontal pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
// A - offset in bytes to load from src and ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
"lw %[" #TEMP1 "], 0(%[args]) \n\t" \
"lw %[" #TEMP2 "], 4(%[args]) \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[temp20], %[temp16], %[temp17] \n\t" \
"lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
"lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
"subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
"addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
"addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
"mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
"mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
"mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
"addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
"subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
"sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
"sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
"addiu %[temp16], %[temp16], 1812 \n\t" \
"addiu %[temp17], %[temp17], 937 \n\t" \
"addu %[temp16], %[temp16], %[temp19] \n\t" \
"subu %[temp17], %[temp17], %[temp18] \n\t" \
"sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
"sra %[" #TEMP3 "], %[temp17], 9 \n\t"
// macro for one vertical pass in FTransform
// temp0..temp15 hold tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
"mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
"mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
"mul %[temp18], %[temp18], %[c5352] \n\t" \
"addiu %[temp16], %[temp16], 7 \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
"subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
"addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
"subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
"movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
"sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
const int c2217 = 2217;
const int c5352 = 5352;
const int* const args[3] =
{ (const int*)src, (const int*)ref, (const int*)out };
__asm__ volatile(
HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
"lw %[temp20], 8(%[args]) \n\t"
VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
: "memory", "hi", "lo"
);
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
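// For reference (not part of this file): the plain-C forward transform that
// the two macro passes above implement, which is where the 5352/2217
// multipliers and the 1812/937/12000/51000 rounding constants come from
// (51000 is split into 30000 + 21000 in the assembly to fit addiu's 16-bit
// immediate). Names are illustrative; BPS comes from ./dsp.h.
static void FTransformRef(const uint8_t* src, const uint8_t* ref,
                          int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];  // 9-bit dynamic range ([-255, 255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[8 + i];
    const int a2 = tmp[4 + i] - tmp[8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i] = (a0 + a1 + 7) >> 4;
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}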
#if !defined(WORK_AROUND_GCC)
#define GET_SSE_INNER(A, B, C, D) \
"lbu %[temp0], " #A "(%[a]) \n\t" \
"lbu %[temp1], " #A "(%[b]) \n\t" \
"lbu %[temp2], " #B "(%[a]) \n\t" \
"lbu %[temp3], " #B "(%[b]) \n\t" \
"lbu %[temp4], " #C "(%[a]) \n\t" \
"lbu %[temp5], " #C "(%[b]) \n\t" \
"lbu %[temp6], " #D "(%[a]) \n\t" \
"lbu %[temp7], " #D "(%[b]) \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"subu %[temp6], %[temp6], %[temp7] \n\t" \
"madd %[temp0], %[temp0] \n\t" \
"madd %[temp2], %[temp2] \n\t" \
"madd %[temp4], %[temp4] \n\t" \
"madd %[temp6], %[temp6] \n\t"
#define GET_SSE(A, B, C, D) \
GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
#undef GET_SSE
#undef GET_SSE_INNER
#endif // !WORK_AROUND_GCC
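// For reference (not part of this file): the GET_SSE macros above accumulate
// the sum of squared pixel differences in the HI/LO register pair (one madd
// per pixel). A scalar equivalent for a w-by-h block (illustrative helper;
// BPS comes from ./dsp.h):
static int GetSSERef(const uint8_t* a, const uint8_t* b, int w, int h) {
  int sum = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = a[x + y * BPS] - b[x + y * BPS];
      sum += diff * diff;
    }
  }
  return sum;
}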
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
#endif
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
#endif // WEBP_USE_MIPS32

File diff suppressed because it is too large.

934
drivers/webp/dsp/enc_neon.c Normal file

@@ -0,0 +1,934 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "./neon.h"
#include "../enc/vp8enci.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for the subtraction to *ref. See the comments there for algorithmic
// explanations.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of the real kC2 (35468); see the comment in dec_neon.c.
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
// Store the results.
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
const uint8_t* const ref, uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
// Load the source pixels.
dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
{
// Convert to 16b.
const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
SaturateAndStore4x4(dst, out01, out23);
}
}
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
// b0 d0 b1 d1 b2 d2 ...
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
const int16x8_t B1 =
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
// C0 = kC1 * in4 | kC1 * in12
// C1 = kC2 * in4 | kC2 * in12
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 + in8
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 - in8
// c = kC2 * in4 - kC1 * in12
// d = kC1 * in4 + kC2 * in12
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2(E0, E1, rows);
}
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass(&rows);
TransformPass(&rows);
Add4x4(rows.val[0], rows.val[1], ref, dst);
}
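// For reference (not part of this file): a scalar view of the fixed-point
// multiplies in TransformPass(). With MUL(x, k) = (x * k) >> 16 and the
// full-width VP8 constants 20091 and 35468:
//   c = MUL(in4, 35468) - (in12 + MUL(in12, 20091))
//   d = (in4 + MUL(in4, 20091)) + MUL(in12, 35468)
// vqdmulh returns (2 * x * k) >> 16, which is why kC2 is stored halved and
// why the kC1 product is shifted right once more (vsraq_n_s16(..., 1))
// as it is added back onto B1.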
#else
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
"vld1.16 {d0}, [%[kC1C2]] \n"
// d2: in[0]
// d3: in[8]
// d4: in[4]
// d5: in[12]
"vswp d3, d4 \n"
// q8 = {in[4], in[12]} * kC1 * 2 >> 16
// q9 = {in[4], in[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = in[0] + in[8]
// d23 = b = in[0] - in[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
// q8 = in[4]/[12] * kC1 >> 16
"vshr.s16 q8, q8, #1 \n"
// Add {in[4], in[12]} back after the multiplication.
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
"vswp d3, d4 \n"
// q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
// q9 = {tmp[4], tmp[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = tmp[0] + tmp[8]
// d23 = b = tmp[0] - tmp[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
"vshr.s16 q8, q8, #1 \n"
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
// (val) + 4 >> 3
"vrshr.s16 d2, d2, #3 \n"
"vrshr.s16 d3, d3, #3 \n"
"vrshr.s16 d4, d4, #3 \n"
"vrshr.s16 d5, d5, #3 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
// Must accumulate before saturating
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vqadd.s16 q1, q1, q8 \n"
"vqadd.s16 q2, q2, q9 \n"
"vqmovun.s16 d0, q1 \n"
"vqmovun.s16 d1, q2 \n"
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
"vst1.32 d1[1], [%[dst]] \n"
: [in] "+r"(in), [dst] "+r"(dst) // modified registers
: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
);
}
#endif // WEBP_USE_INTRINSICS
static void ITransform(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
return vreinterpretq_u8_u32(out);
}
// Forward transform.
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
const int16x4_t C, const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
vreinterpret_s32_s16(CD.val[0]));
const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
vreinterpret_s32_s16(CD.val[1]));
*out01 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
vreinterpret_s64_s32(tmp13.val[0])));
*out32 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
vreinterpret_s64_s32(tmp02.val[1])));
}
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4(src);
const uint8x16_t R0 = Load4x4(ref);
const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1st pass
const int32x4_t kCst937 = vdupq_n_s32(937);
const int32x4_t kCst1812 = vdupq_n_s32(1812);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
const int32x4_t kCst51000 = vdupq_n_s32(51000);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
const int16x4_t a3_eq_0 =
vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#else
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
1812, 1812, 1812, 1812,
937, 937, 937, 937,
12000, 12000, 12000, 12000,
51000, 51000, 51000, 51000
};
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
const int16_t* coeff16 = kCoeff16;
const int32_t* coeff32 = kCoeff32;
__asm__ volatile (
// load src into q4, q5 in high half
"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d11}, [%[src_ptr]] \n"
// load ref into q6, q7 in high half
"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d15}, [%[ref_ptr]] \n"
// Pack the high values in to q4 and q6
"vtrn.32 q4, q5 \n"
"vtrn.32 q6, q7 \n"
// d[0-3] = src - ref
"vsubl.u8 q0, d8, d12 \n"
"vsubl.u8 q1, d9, d13 \n"
// load coeff16 into q8(d16=5352, d17=2217)
"vld1.16 {q8}, [%[coeff16]] \n"
// load coeff32 high half into q9 = 1812, q10 = 937
"vld1.32 {q9, q10}, [%[coeff32]]! \n"
// load coeff32 low half into q11=12000, q12=51000
"vld1.32 {q11,q12}, [%[coeff32]] \n"
// part 1
// Transpose. Register dN is the same as dN in C
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
"vadd.s16 d0, d4, d5 \n" // a0 + a1
"vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
"vsub.s16 d2, d4, d5 \n" // a0 - a1
"vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3
"vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
"vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
"vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
"vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
// temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
// temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
"vshrn.s32 d1, q9, #9 \n"
"vshrn.s32 d3, q10, #9 \n"
// part 2
// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vmov.s16 d26, #7 \n"
"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
"vadd.s16 d4, d4, d26 \n" // a1 + 7
"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
"vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
"vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
"vceq.s16 d4, d7, #0 \n"
"vshr.s16 d0, d0, #4 \n"
"vshr.s16 d2, d2, #4 \n"
"vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
"vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
"vmvn d4, d4 \n" // !(d1 == 0)
// op[4] = (c1*2217 + d1*5352 + 12000)>>16
"vshrn.s32 d1, q11, #16 \n"
// op[4] += (d1!=0)
"vsub.s16 d1, d1, d4 \n"
// op[12]= (d1*2217 - c1*5352 + 51000)>>16
"vshrn.s32 d3, q12, #16 \n"
// set result to out array
"vst1.16 {q0, q1}, [%[out]] \n"
: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
[coeff32] "+r"(coeff32) // modified registers
: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
[out] "r"(out) // constants
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13" // clobbered
);
}
#endif
#define LOAD_LANE_16b(VALUE, LANE) do { \
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
src += stride; \
} while (0)
static void FTransformWHT(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
int16x4x4_t in;
INIT_VECTOR4(in, zero, zero, zero, zero);
LOAD_LANE_16b(in.val[0], 0);
LOAD_LANE_16b(in.val[1], 0);
LOAD_LANE_16b(in.val[2], 0);
LOAD_LANE_16b(in.val[3], 0);
LOAD_LANE_16b(in.val[0], 1);
LOAD_LANE_16b(in.val[1], 1);
LOAD_LANE_16b(in.val[2], 1);
LOAD_LANE_16b(in.val[3], 1);
LOAD_LANE_16b(in.val[0], 2);
LOAD_LANE_16b(in.val[1], 2);
LOAD_LANE_16b(in.val[2], 2);
LOAD_LANE_16b(in.val[3], 2);
LOAD_LANE_16b(in.val[0], 3);
LOAD_LANE_16b(in.val[1], 3);
LOAD_LANE_16b(in.val[2], 3);
LOAD_LANE_16b(in.val[3], 3);
{
// a0 = in[0 * 16] + in[2 * 16]
// a1 = in[1 * 16] + in[3 * 16]
// a2 = in[1 * 16] - in[3 * 16]
// a3 = in[0 * 16] - in[2 * 16]
const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
tmp0.val[0] = vaddq_s32(a0, a1);
tmp0.val[1] = vaddq_s32(a3, a2);
tmp0.val[2] = vsubq_s32(a3, a2);
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
const int32x4x4_t tmp1 = Transpose4x4(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
// a3 = tmp[0 + i] - tmp[ 8 + i]
const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
const int16x4_t out0 = vmovn_s32(b0);
const int16x4_t out1 = vmovn_s32(b1);
const int16x4_t out2 = vmovn_s32(b2);
const int16x4_t out3 = vmovn_s32(b3);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#undef LOAD_LANE_16b
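// For reference (not part of this file): a plain-C sketch of the same WHT,
// mirroring libwebp's generic implementation. The input is read with a
// stride of 16 (one DC coefficient per 4x4 sub-block); the vhaddq/vhsubq
// calls above fold the final '>> 1' into the adds and subs.
static void FTransformWHTRef(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = in[0 * 16] + in[2 * 16];
    const int a1 = in[1 * 16] + in[3 * 16];
    const int a2 = in[1 * 16] - in[3 * 16];
    const int a3 = in[0 * 16] - in[2 * 16];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    out[ 0 + i] = (int16_t)((a0 + a1) >> 1);
    out[ 4 + i] = (int16_t)((a3 + a2) >> 1);
    out[ 8 + i] = (int16_t)((a3 - a2) >> 1);
    out[12 + i] = (int16_t)((a0 - a1) >> 1);
  }
}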
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
vreinterpret_u16_u8(d2_tmp1.val[0]));
const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
vreinterpret_u16_u8(d2_tmp1.val[1]));
d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
return d4_in;
}
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
vreinterpretq_s32_s16(q2_tmp1.val[0]));
const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
vreinterpretq_s32_s16(q2_tmp1.val[1]));
q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
return q4_in;
}
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
d4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
d4_in.val[3]));
const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
d4_in.val[2]));
const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
d4_in.val[3]));
int16x8x4_t q4_out;
// tmp[0] = a0 + a1
// tmp[1] = a3 + a2
// tmp[2] = a3 - a2
// tmp[3] = a0 - a1
INIT_VECTOR4(q4_out,
vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
return q4_out;
}
static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
q4_in.val[0] = vaddq_s16(q_a0, q_a1);
q4_in.val[1] = vaddq_s16(q_a3, q_a2);
q4_in.val[2] = vabdq_s16(q_a3, q_a2);
q4_in.val[3] = vabdq_s16(q_a0, q_a1);
q4_in.val[0] = vabsq_s16(q4_in.val[0]);
q4_in.val[1] = vabsq_s16(q4_in.val[1]);
return q4_in;
}
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
INIT_VECTOR4(d4_w,
vget_low_s16(vreinterpretq_s16_u16(q_w07)),
vget_high_s16(vreinterpretq_s16_u16(q_w07)),
vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
return d4_w;
}
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
// sum += w[ 8] * abs(b2);
// sum += w[12] * abs(b3);
int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
q_sum0 = vaddq_s32(q_sum0, q_sum1);
q_sum2 = vaddq_s32(q_sum2, q_sum3);
q_sum2 = vaddq_s32(q_sum0, q_sum2);
d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
d_sum = vpadd_s32(d_sum, d_sum);
return d_sum;
}
#define LOAD_LANE_32b(src, VALUE, LANE) \
(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
// Hadamard transform
// Returns the weighted sum of the absolute values of the transformed
// coefficients.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
uint8x8x4_t d4_in;
// load data a, b
LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
INIT_VECTOR4(d4_in,
vreinterpret_u8_u32(d_in_ab_0123),
vreinterpret_u8_u32(d_in_ab_4567),
vreinterpret_u8_u32(d_in_ab_89ab),
vreinterpret_u8_u32(d_in_ab_cdef));
{
// horizontal pass
const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
const int16x4x4_t d4_w = DistoLoadW(w);
// vertical pass
const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
int32x2_t d_sum = DistoSum(q4_v, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
d_sum = vshr_n_s32(d_sum, 5);
return vget_lane_s32(d_sum, 0);
}
}
#undef LOAD_LANE_32b
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
const int16x8_t b0 = vld1q_s16(out + 8);
const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
const uint16x8_t a2 = vshrq_n_u16(a1, 3);
const uint16x8_t b2 = vshrq_n_u16(b1, 3);
const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
*sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate
}
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
const uint8x8_t a0 = vld1_u8(a + y * BPS);
const uint8x8_t b0 = vld1_u8(b + y * BPS);
const uint8x8_t abs_diff = vabd_u8(a0, b0);
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
return SumToInt(sum);
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4(a);
const uint8x16_t b0 = Load4x4(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
return SumToInt(vpaddlq_u16(prod));
}
//------------------------------------------------------------------------------
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
const int16x8_t a = vld1q_s16(in + offset); // in
const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
const int16x8_t sign = vshrq_n_s16(a, 15); // sign
const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
const uint32x4_t m2 = vhaddq_u32(m0, bias0);
const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
vst1q_s16(in + offset, c4);
assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
return c3;
}
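// For reference (not part of this file): a scalar equivalent of Quantize()
// for one coefficient, mirroring libwebp's QUANTDIV semantics. The vhaddq
// '>> 1' plus the vshrn_n_u32 '>> 16' above together realize the '>> QFIX'
// with QFIX == 17. Names are illustrative.
static WEBP_INLINE int QuantizeOneRef(int16_t* const in, int j,
                                      const VP8Matrix* const mtx) {
  const int sign = (in[j] < 0);
  const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
  int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> 17);
  if (level > MAX_LEVEL) level = MAX_LEVEL;  // MAX_LEVEL == 2047
  if (sign) level = -level;
  in[j] = (int16_t)(level * mtx->q_[j]);  // store back the dequantized value
  return level;
}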
static const uint8_t kShuffles[4][8] = {
{ 0, 1, 2, 3, 8, 9, 16, 17 },
{ 10, 11, 4, 5, 6, 7, 12, 13 },
{ 18, 19, 24, 25, 26, 27, 20, 21 },
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize(in, mtx, 0);
const int16x8_t out1 = Quantize(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
uint8x16x2_t all_out;
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
INIT_VECTOR4(shuffles,
vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
uint8x8x4_t all_out;
INIT_VECTOR4(all_out,
vreinterpret_u8_s16(vget_low_s16(out0)),
vreinterpret_u8_s16(vget_high_s16(out0)),
vreinterpret_u8_s16(vget_low_s16(out1)),
vreinterpret_u8_s16(vget_high_s16(out1)));
INIT_VECTOR4(shuffles,
vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
// Zigzag reordering
vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
// test zeros
if (*(uint64_t*)(out + 0) != 0) return 1;
if (*(uint64_t*)(out + 4) != 0) return 1;
if (*(uint64_t*)(out + 8) != 0) return 1;
if (*(uint64_t*)(out + 12) != 0) return 1;
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransformWHT = FTransformWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
#endif
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
#endif // WEBP_USE_NEON

375
drivers/webp/dsp/enc_sse41.c Normal file

@@ -0,0 +1,375 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
#include "../enc/vp8enci.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
int k;
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
// Load.
const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
// v = abs(out) >> 3
const __m128i abs0 = _mm_abs_epi16(out0);
const __m128i abs1 = _mm_abs_epi16(out1);
const __m128i v0 = _mm_srai_epi16(abs0, 3);
const __m128i v1 = _mm_srai_epi16(abs1, 3);
// bin = min(v, MAX_COEFF_THRESH)
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
// Store.
_mm_storeu_si128((__m128i*)&out[0], bin0);
_mm_storeu_si128((__m128i*)&out[8], bin1);
}
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Returns the difference between the two weighted sums of the absolute
// values of the transformed coefficients of inA and inB.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
// Load, combine and transpose inputs.
{
const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
// a00 b00 a01 b01 a02 b02 a03 b03 0 0 0 0 0 0 0 0
// a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0
// a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0
// a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0
// Transpose the two 4x4, discarding the filling zeroes.
const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
// a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
// a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
// a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
// Convert to 16b.
tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Horizontal pass and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 b22 b32 b03 b13 b23 b33
tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Vertical pass and difference of weighted sums.
{
// Load all inputs.
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// Separate the transforms of inA and inB.
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
__m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
__m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
A_b0 = _mm_abs_epi16(A_b0);
A_b2 = _mm_abs_epi16(A_b2);
B_b0 = _mm_abs_epi16(B_b0);
B_b2 = _mm_abs_epi16(B_b2);
// weighted sums
A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8);
B_b0 = _mm_madd_epi16(B_b0, w_0);
B_b2 = _mm_madd_epi16(B_b2, w_8);
A_b0 = _mm_add_epi32(A_b0, A_b2);
B_b0 = _mm_add_epi32(B_b0, B_b2);
// difference of weighted sums
A_b2 = _mm_sub_epi32(A_b0, B_b0);
// cascading summation of the differences
B_b0 = _mm_hadd_epi32(A_b2, A_b2);
B_b2 = _mm_hadd_epi32(B_b0, B_b0);
return _mm_cvtsi128_si32(B_b2);
}
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
// Quantization
//
// Generates a pshufb constant for shuffling 16b words.
#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
_mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
__m128i packed_out;
// Load all inputs.
// TODO(cduvivier): Make variable declarations and allocations aligned so that
// we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
// coeff = abs(in)
__m128i coeff0 = _mm_abs_epi16(in0);
__m128i coeff8 = _mm_abs_epi16(in8);
// coeff = abs(in) + sharpen
if (sharpen != NULL) {
const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
coeff0 = _mm_add_epi16(coeff0, sharpen0);
coeff8 = _mm_add_epi16(coeff8, sharpen8);
}
// out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
// out = (coeff * iQ + B)
const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
// out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
out_12 = _mm_srai_epi32(out_12, QFIX);
// pack result as 16b
out0 = _mm_packs_epi32(out_00, out_04);
out8 = _mm_packs_epi32(out_08, out_12);
// if (coeff > 2047) coeff = 2047
out0 = _mm_min_epi16(out0, max_coeff_2047);
out8 = _mm_min_epi16(out8, max_coeff_2047);
}
// put sign back
out0 = _mm_sign_epi16(out0, in0);
out8 = _mm_sign_epi16(out8, in8);
// in = out * Q
in0 = _mm_mullo_epi16(out0, q0);
in8 = _mm_mullo_epi16(out8, q8);
_mm_storeu_si128((__m128i*)&in[0], in0);
_mm_storeu_si128((__m128i*)&in[8], in8);
// zigzag the output before storing it. The re-ordering is:
// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
// -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
// There are only two misplaced entries ([8] and [7]) that cross the
// register's boundaries.
// We use pshufb instead of pshuflo/pshufhi.
{
const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
_mm_storeu_si128((__m128i*)&out[0], out_z0);
_mm_storeu_si128((__m128i*)&out[8], out_z8);
packed_out = _mm_packs_epi16(out_z0, out_z8);
}
// detect if all 'out' values are zeroes or not
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
#undef PSHUFB_CST
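// For reference (not part of this file): the scan order that the two pshufb
// constants above reconstruct across the register boundary, i.e.
// out[i] = level[kZigzagRef[i]] (same table as libwebp's plain-C code):
static const uint8_t kZigzagRef[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};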
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, NULL, mtx);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
VP8CollectHistogram = CollectHistogram;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
#endif // WEBP_USE_SSE41

240
drivers/webp/dsp/filters.c Normal file

@@ -0,0 +1,240 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Spatial prediction using various filters
//
// Author: Urvang (urvang@google.com)
#include "./dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
} else {
for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
}
}
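// Worked example: with pred[] = {10, 20} and src[] = {13, 25}, the forward
// pass (inverse == 0) stores the residuals {3, 5}; feeding those residuals
// back with inverse == 1 and the same predictions restores {13, 25}. The two
// passes are exact inverses, with all arithmetic implicitly modulo 256.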
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLine(in, preds - stride, out, 1, inverse);
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line.
while (row < last_row) {
PredictLine(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
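// Worked example: a = 250 (left), b = 240 (above), c = 10 (above-left) give
// g = 480; its high bits make (g & ~0xff) non-zero and g is positive, so the
// predictor returns the clipped value 255.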
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
int w;
// leftmost pixel: predict from above.
PredictLine(in, preds - stride, out, 1, inverse);
for (w = 1; w < width; ++w) {
const int pred = GradientPredictor(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
out[w] = in[w] + (inverse ? pred : -pred);
}
++row;
preds += stride;
in += stride;
out += stride;
}
}
#undef SANITY_CHECK
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
//------------------------------------------------------------------------------
static void VerticalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
}
static void HorizontalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
}
static void GradientUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
}
//------------------------------------------------------------------------------
// Init function
WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
extern void VP8FiltersInitMIPSdspR2(void);
extern void VP8FiltersInitSSE2(void);
static volatile VP8CPUInfo filters_last_cpuinfo_used =
(VP8CPUInfo)&filters_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPFilters[WEBP_FILTER_NONE] = NULL;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
}
#endif
}
filters_last_cpuinfo_used = VP8GetCPUInfo;
}
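// Usage sketch (hypothetical caller, assuming the WebPFilterFunc signature
// declared alongside these tables in dsp.h): once VP8FiltersInit() has run,
// a plane can be filtered through the dispatch table:
//   const WebPFilterFunc filter = WebPFilters[WEBP_FILTER_GRADIENT];
//   if (filter != NULL) filter(src, width, height, stride, dst);
// WEBP_FILTER_NONE maps to NULL, so callers must check before dispatching.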


@ -0,0 +1,405 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Spatial prediction using various filters
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
// if INVERSE
// preds == &dst[-1] == &src[-1]
// else
// preds == &src[-1] != &dst[-1]
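// In other words, the inverse pass reconstructs in place: every byte it
// writes becomes the prediction for the next one, while the forward pass
// always predicts from the untouched source.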
#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
const uint8_t* psrc = (uint8_t*)(SRC); \
uint8_t* pdst = (uint8_t*)(DST); \
const int ilength = (int)(LENGTH); \
int temp0, temp1, temp2, temp3, temp4, temp5, temp6; \
__asm__ volatile ( \
".set push \n\t" \
".set noreorder \n\t" \
"srl %[temp0], %[length], 0x2 \n\t" \
"beqz %[temp0], 4f \n\t" \
" andi %[temp6], %[length], 0x3 \n\t" \
".if " #INVERSE " \n\t" \
"lbu %[temp1], -1(%[src]) \n\t" \
"1: \n\t" \
"lbu %[temp2], 0(%[src]) \n\t" \
"lbu %[temp3], 1(%[src]) \n\t" \
"lbu %[temp4], 2(%[src]) \n\t" \
"lbu %[temp5], 3(%[src]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"addu %[temp2], %[temp2], %[temp1] \n\t" \
"addu %[temp3], %[temp3], %[temp2] \n\t" \
"addu %[temp4], %[temp4], %[temp3] \n\t" \
"addu %[temp1], %[temp5], %[temp4] \n\t" \
"sb %[temp2], -4(%[src]) \n\t" \
"sb %[temp3], -3(%[src]) \n\t" \
"sb %[temp4], -2(%[src]) \n\t" \
"bnez %[temp0], 1b \n\t" \
" sb %[temp1], -1(%[src]) \n\t" \
".else \n\t" \
"1: \n\t" \
"ulw %[temp1], -1(%[src]) \n\t" \
"ulw %[temp2], 0(%[src]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"subu.qb %[temp3], %[temp2], %[temp1] \n\t" \
"usw %[temp3], 0(%[dst]) \n\t" \
"bnez %[temp0], 1b \n\t" \
" addiu %[dst], %[dst], 4 \n\t" \
".endif \n\t" \
"4: \n\t" \
"beqz %[temp6], 3f \n\t" \
" nop \n\t" \
"2: \n\t" \
"lbu %[temp1], -1(%[src]) \n\t" \
"lbu %[temp2], 0(%[src]) \n\t" \
"addiu %[src], %[src], 1 \n\t" \
".if " #INVERSE " \n\t" \
"addu %[temp3], %[temp1], %[temp2] \n\t" \
"sb %[temp3], -1(%[src]) \n\t" \
".else \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
".endif \n\t" \
"addiu %[temp6], %[temp6], -1 \n\t" \
"bnez %[temp6], 2b \n\t" \
" addiu %[dst], %[dst], 1 \n\t" \
"3: \n\t" \
".set pop \n\t" \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc) \
: [length]"r"(ilength) \
: "memory" \
); \
} while (0)
static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
int length, int inverse) {
if (inverse) {
DO_PREDICT_LINE(src, dst, length, 1);
} else {
DO_PREDICT_LINE(src, dst, length, 0);
}
}
#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \
const uint8_t* psrc = (uint8_t*)(SRC); \
const uint8_t* ppred = (uint8_t*)(PRED); \
uint8_t* pdst = (uint8_t*)(DST); \
const int ilength = (int)(LENGTH); \
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
__asm__ volatile ( \
".set push \n\t" \
".set noreorder \n\t" \
"srl %[temp0], %[length], 0x3 \n\t" \
"beqz %[temp0], 4f \n\t" \
" andi %[temp7], %[length], 0x7 \n\t" \
"1: \n\t" \
"ulw %[temp1], 0(%[src]) \n\t" \
"ulw %[temp2], 0(%[pred]) \n\t" \
"ulw %[temp3], 4(%[src]) \n\t" \
"ulw %[temp4], 4(%[pred]) \n\t" \
"addiu %[src], %[src], 8 \n\t" \
".if " #INVERSE " \n\t" \
"addu.qb %[temp5], %[temp1], %[temp2] \n\t" \
"addu.qb %[temp6], %[temp3], %[temp4] \n\t" \
".else \n\t" \
"subu.qb %[temp5], %[temp1], %[temp2] \n\t" \
"subu.qb %[temp6], %[temp3], %[temp4] \n\t" \
".endif \n\t" \
"addiu %[pred], %[pred], 8 \n\t" \
"usw %[temp5], 0(%[dst]) \n\t" \
"usw %[temp6], 4(%[dst]) \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"bnez %[temp0], 1b \n\t" \
" addiu %[dst], %[dst], 8 \n\t" \
"4: \n\t" \
"beqz %[temp7], 3f \n\t" \
" nop \n\t" \
"2: \n\t" \
"lbu %[temp1], 0(%[src]) \n\t" \
"lbu %[temp2], 0(%[pred]) \n\t" \
"addiu %[src], %[src], 1 \n\t" \
"addiu %[pred], %[pred], 1 \n\t" \
".if " #INVERSE " \n\t" \
"addu %[temp3], %[temp1], %[temp2] \n\t" \
".else \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
".endif \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
"addiu %[temp7], %[temp7], -1 \n\t" \
"bnez %[temp7], 2b \n\t" \
" addiu %[dst], %[dst], 1 \n\t" \
"3: \n\t" \
".set pop \n\t" \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred), \
[dst]"+&r"(pdst), [src]"+&r"(psrc) \
: [length]"r"(ilength) \
: "memory" \
); \
} while (0)
#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do { \
int temp1, temp2, temp3; \
__asm__ volatile ( \
"lbu %[temp1], 0(%[src]) \n\t" \
"lbu %[temp2], 0(%[pred]) \n\t" \
".if " #INVERSE " \n\t" \
"addu %[temp3], %[temp1], %[temp2] \n\t" \
".else \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
".endif \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
: [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \
: "memory" \
); \
} while (0)
//------------------------------------------------------------------------------
// Horizontal filter.
#define FILTER_LINE_BY_LINE(INVERSE) do { \
while (row < last_row) { \
PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE); \
DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
if (inverse) {
FILTER_LINE_BY_LINE(1);
} else {
FILTER_LINE_BY_LINE(0);
}
}
#undef FILTER_LINE_BY_LINE
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void HorizontalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
}
//------------------------------------------------------------------------------
// Vertical filter.
#define FILTER_LINE_BY_LINE(INVERSE) do { \
while (row < last_row) { \
DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line.
if (inverse) {
FILTER_LINE_BY_LINE(1);
} else {
FILTER_LINE_BY_LINE(0);
}
}
#undef FILTER_LINE_BY_LINE
#undef DO_PREDICT_LINE_VERTICAL
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void VerticalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
int temp0;
__asm__ volatile (
"addu %[temp0], %[a], %[b] \n\t"
"subu %[temp0], %[temp0], %[c] \n\t"
"shll_s.w %[temp0], %[temp0], 23 \n\t"
"precrqu_s.qb.ph %[temp0], %[temp0], $zero \n\t"
"srl %[temp0], %[temp0], 24 \n\t"
: [temp0]"=&r"(temp0)
: [a]"r"(a),[b]"r"(b),[c]"r"(c)
);
return temp0;
}
#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do { \
while (row < last_row) { \
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE); \
for (w = 1; w < width; ++w) { \
const int pred = GradientPredictor(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
if (inverse) {
FILTER_LINE_BY_LINE(1, out, +);
} else {
FILTER_LINE_BY_LINE(0, in, -);
}
}
#undef FILTER_LINE_BY_LINE
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
}
#undef PREDICT_LINE_ONE_PASS
#undef DO_PREDICT_LINE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2


@ -0,0 +1,352 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of alpha filters
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
if (inverse) {
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]);
const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]);
const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
const __m128i C0 = _mm_add_epi8(A0, B0);
const __m128i C1 = _mm_add_epi8(A1, B1);
_mm_storeu_si128((__m128i*)&dst[i + 0], C0);
_mm_storeu_si128((__m128i*)&dst[i + 16], C1);
}
for (; i < length; ++i) dst[i] = src[i] + pred[i];
} else {
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]);
const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]);
const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
const __m128i C0 = _mm_sub_epi8(A0, B0);
const __m128i C1 = _mm_sub_epi8(A1, B1);
_mm_storeu_si128((__m128i*)&dst[i + 0], C0);
_mm_storeu_si128((__m128i*)&dst[i + 16], C1);
}
for (; i < length; ++i) dst[i] = src[i] - pred[i];
}
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
int inverse) {
int i;
if (length <= 0) return;
if (inverse) {
const int max_pos = length & ~7;
__m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
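// The shift-and-add ladder below is an in-register prefix sum: after the
// 1-, 2- and 4-byte shifted additions, byte k of A7 holds
// dst[i - 1] + src[i] + ... + src[i + k] (mod 256), i.e. eight
// inverse-filtered pixels at once. The final srli keeps the highest of the
// eight bytes as the 'last' carry for the next iteration.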
for (i = 0; i < max_pos; i += 8) {
const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
const __m128i A1 = _mm_add_epi8(A0, last);
const __m128i A2 = _mm_slli_si128(A1, 1);
const __m128i A3 = _mm_add_epi8(A1, A2);
const __m128i A4 = _mm_slli_si128(A3, 2);
const __m128i A5 = _mm_add_epi8(A3, A4);
const __m128i A6 = _mm_slli_si128(A5, 4);
const __m128i A7 = _mm_add_epi8(A5, A6);
_mm_storel_epi64((__m128i*)(dst + i), A7);
last = _mm_srli_epi64(A7, 56);
}
for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
} else {
const int max_pos = length & ~31;
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 ));
const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 ));
const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
const __m128i C0 = _mm_sub_epi8(A0, B0);
const __m128i C1 = _mm_sub_epi8(A1, B1);
_mm_storeu_si128((__m128i*)(dst + i + 0), C0);
_mm_storeu_si128((__m128i*)(dst + i + 16), C1);
}
for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
}
}
static void PredictLineC(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
} else {
for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
}
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLineC(in, preds - stride, out, 1, inverse);
PredictLineLeft(in + 1, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft(in + 1, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line.
while (row < last_row) {
PredictLineTop(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static void GradientPredictDirect(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i < max_pos; i += 8) {
const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]);
const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]);
const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]);
const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
const __m128i B1 = _mm_unpacklo_epi8(B0, zero);
const __m128i C1 = _mm_unpacklo_epi8(C0, zero);
const __m128i E = _mm_add_epi16(A1, B1);
const __m128i F = _mm_sub_epi16(E, C1);
const __m128i G = _mm_packus_epi16(F, zero);
const __m128i H = _mm_sub_epi8(D, G);
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
}
}
static void GradientPredictInverse(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
if (length > 0) {
int i;
const int max_pos = length & ~7;
const __m128i zero = _mm_setzero_si128();
__m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample
for (i = 0; i < max_pos; i += 8) {
const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input
const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C
__m128i out = zero; // accumulator for output
__m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
int k = 8;
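// Unlike the forward pass, inverse gradient filtering is inherently serial:
// each reconstructed pixel becomes the left sample 'A' of the next one. The
// 8-step loop below resolves that dependency one 16-bit lane per iteration
// while keeping all partial results in SIMD registers.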
while (1) {
const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C
const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta
const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values
A = _mm_and_si128(tmp6, mask_hi); // keep only the current lane (mod 256)
out = _mm_or_si128(out, A); // accumulate output
if (--k == 0) break;
A = _mm_slli_si128(A, 2); // rotate left sample
mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask
}
A = _mm_srli_si128(A, 14); // prepare left sample for next iteration
_mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
}
for (; i < length; ++i) {
row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
}
}
}
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
if (inverse) {
PredictLineC(in, out - stride, out, 1, inverse); // predict from above
GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1);
} else {
PredictLineC(in, in - stride, out, 1, inverse);
GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
}
++row;
in += stride;
out += stride;
}
}
#undef SANITY_CHECK
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
//------------------------------------------------------------------------------
static void VerticalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
}
static void HorizontalUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
}
static void GradientUnfilter(int width, int height, int stride, int row,
int num_rows, uint8_t* data) {
DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8FiltersInitSSE2)
#endif // WEBP_USE_SSE2

File diff suppressed because it is too large


@ -0,0 +1,417 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "./lossless.h"
#if defined(WEBP_USE_MIPS32)
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#define APPROX_LOG_WITH_CORRECTION_MAX 65536
#define APPROX_LOG_MAX 4096
#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
static float FastSLog2Slow(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
const int c24 = 24;
const float v_f = (float)v;
uint32_t temp;
// Xf = 256 = 2^8
// log_cnt is index of leading one in upper 24 bits
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor: log(1 + d) ~ d; for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
correction = (23 * (v & (y - 1))) >> 4;
return v_f * (kLog2Table[temp] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
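// Worked example of the code above: v = 4003 has clz(v) = 20, so
// log_cnt = 4, y = 16, temp = v >> 4 = 250 and v & (y - 1) = 3. Hence
// correction = (23 * 3) >> 4 = 4, and the result is
// 4003 * (kLog2Table[250] + 4) + 4, close to the exact
// 4003 * log2(4003) = 4003 * (log2(250) + 4 + log2(1 + 3/4000)).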
static float FastLog2Slow(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
const int c24 = 24;
double log_2;
uint32_t temp;
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
log_2 = kLog2Table[temp] + log_cnt;
if (v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const uint32_t correction = (23 * (v & (y - 1))) >> 4;
log_2 += (double)correction / v;
}
return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pop = &population[4];
// const uint32_t* LoopEnd = &population[length];
// while (pop != LoopEnd) {
// ++i;
// cost += i * *pop;
// cost += i * *(pop + 1);
// pop += 2;
// }
// return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pop], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pop]) \n\t"
"lw %[temp1], 4(%[pop]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addiu %[pop], %[pop], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp1] \n\t"
"bne %[pop], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[i]"=&r"(i), [pop]"+r"(pop)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pX = &X[4];
// const uint32_t* pY = &Y[4];
// const uint32_t* LoopEnd = &X[length];
// while (pX != LoopEnd) {
// const uint32_t xy0 = *pX + *pY;
// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
// ++i;
// cost += i * xy0;
// cost += i * xy1;
// pX += 2;
// pY += 2;
// }
// return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
const uint32_t* const LoopEnd = &X[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pX], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pX]) \n\t"
"lw %[temp1], 0(%[pY]) \n\t"
"lw %[temp2], 4(%[pX]) \n\t"
"lw %[temp3], 4(%[pY]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addiu %[pX], %[pX], 8 \n\t"
"addiu %[pY], %[pY], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp2] \n\t"
"bne %[pX], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
#define HUFFMAN_COST_PASS \
__asm__ volatile( \
"sll %[temp1], %[temp0], 3 \n\t" \
"addiu %[temp3], %[streak], -3 \n\t" \
"addu %[temp2], %[pstreaks], %[temp1] \n\t" \
"blez %[temp3], 1f \n\t" \
"srl %[temp1], %[temp1], 1 \n\t" \
"addu %[temp3], %[pcnts], %[temp1] \n\t" \
"lw %[temp0], 4(%[temp2]) \n\t" \
"lw %[temp1], 0(%[temp3]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"addiu %[temp1], %[temp1], 1 \n\t" \
"sw %[temp0], 4(%[temp2]) \n\t" \
"sw %[temp1], 0(%[temp3]) \n\t" \
"b 2f \n\t" \
"1: \n\t" \
"lw %[temp0], 0(%[temp2]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"sw %[temp0], 0(%[temp2]) \n\t" \
"2: \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
[streak]"r"(streak) \
: "memory" \
);
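// A plain-C sketch of the update performed by this pass (with temp0 holding
// t = (value != 0), as set by the callers below):
//   stats.streaks[t][streak > 3] += streak;
//   stats.counts[t] += (streak > 3);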
// Returns the various RLE counts
static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
int i;
int streak = 0;
VP8LStreaks stats;
int* const pstreaks = &stats.streaks[0][0];
int* const pcnts = &stats.counts[0];
int temp0, temp1, temp2, temp3;
memset(&stats, 0, sizeof(stats));
for (i = 0; i < length - 1; ++i) {
++streak;
if (population[i] == population[i + 1]) {
continue;
}
temp0 = (population[i] != 0);
HUFFMAN_COST_PASS
streak = 0;
}
++streak;
temp0 = (population[i] != 0);
HUFFMAN_COST_PASS
return stats;
}
static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
const uint32_t* Y, int length) {
int i;
int streak = 0;
uint32_t xy_prev = 0xffffffff;
VP8LStreaks stats;
int* const pstreaks = &stats.streaks[0][0];
int* const pcnts = &stats.counts[0];
int temp0, temp1, temp2, temp3;
memset(&stats, 0, sizeof(stats));
for (i = 0; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
++streak;
if (xy != xy_prev) {
temp0 = (xy != 0);
HUFFMAN_COST_PASS
streak = 0;
xy_prev = xy;
}
}
return stats;
}
#define ASM_START \
__asm__ volatile( \
".set push \n\t" \
".set at \n\t" \
".set macro \n\t" \
"1: \n\t"
// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
// if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
"addu %[temp4], %[temp4], %[temp0] \n\t" \
"addu %[temp5], %[temp5], %[temp1] \n\t" \
"addu %[temp6], %[temp6], %[temp2] \n\t" \
"addu %[temp7], %[temp7], %[temp3] \n\t" \
"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
".if " #E " == 1 \n\t" \
"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
".endif \n\t" \
"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
".set pop \n\t"
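// In plain-C terms, one pass of the loop body above adds four uint32
// counters:
//   P2[0..3] = P0[0..3] + P1[0..3];
// P0 and P2 always advance by 16 bytes, P1 only when E == 1 (i.e. when the
// destination is not also the second source), and the "bne" loops until P0
// reaches LoopEnd.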
#define ASM_END_COMMON_0 \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
[pa]"+r"(pa), [pout]"+r"(pout)
#define ASM_END_COMMON_1 \
: [LoopEnd]"r"(LoopEnd) \
: "memory", "at" \
);
#define ASM_END_0 \
ASM_END_COMMON_0 \
, [pb]"+r"(pb) \
ASM_END_COMMON_1
#define ASM_END_1 \
ASM_END_COMMON_0 \
ASM_END_COMMON_1
#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \
const uint32_t* pa = (const uint32_t*)(A); \
const uint32_t* pb = (const uint32_t*)(B); \
uint32_t* pout = (uint32_t*)(OUT); \
const uint32_t* const LoopEnd = pa + (SIZE); \
assert((SIZE) % 4 == 0); \
ASM_START \
ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \
ASM_END_0 \
if ((EXTRA_SIZE) > 0) { \
const int last = (EXTRA_SIZE); \
int i; \
for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
} \
} while (0)
#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \
const uint32_t* pa = (const uint32_t*)(A); \
uint32_t* pout = (uint32_t*)(OUT); \
const uint32_t* const LoopEnd = pa + (SIZE); \
assert((SIZE) % 4 == 0); \
ASM_START \
ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \
ASM_END_1 \
if ((EXTRA_SIZE) > 0) { \
const int last = (EXTRA_SIZE); \
int i; \
for (i = 0; i < last; ++i) pout[i] += pa[i]; \
} \
} while (0)
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
ADD_VECTOR(a->literal_, b->literal_, out->literal_,
NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
ADD_VECTOR(a->distance_, b->distance_, out->distance_,
NUM_DISTANCE_CODES, 0);
ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
} else {
ADD_VECTOR_EQ(a->literal_, out->literal_,
NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
}
}
#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
VP8LHuffmanCostCount = HuffmanCostCount;
// TODO(mips team): rewrite VP8LGetCombinedEntropy (which used to use
// HuffmanCostCombinedCount) with MIPS optimizations
#if 0
VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
#else
(void)HuffmanCostCombinedCount;
#endif
VP8LHistogramAdd = HistogramAdd;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
#endif // WEBP_USE_MIPS32


@ -0,0 +1,275 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
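// Loop "0:" below handles four pixels per iteration: ext pulls out the green
// byte, replv.ph replicates it into both halves of a register, and subu.qb
// subtracts it from the red and blue bytes in one paired 8-bit operation.
// Loop "1:" then processes the 0-3 leftover pixels one at a time.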
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[argb_data], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"lw %[temp1], 4(%[argb_data]) \n\t"
"lw %[temp2], 8(%[argb_data]) \n\t"
"lw %[temp3], 12(%[argb_data]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[argb_data], %[argb_data], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
"replv.ph %[temp7], %[temp7] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"subu.qb %[temp1], %[temp1], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
"subu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[argb_data]) \n\t"
"sw %[temp1], -12(%[argb_data]) \n\t"
"sw %[temp2], -8(%[argb_data]) \n\t"
"bne %[argb_data], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[argb_data]) \n\t"
"3: \n\t"
"beq %[argb_data], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"addiu %[argb_data], %[argb_data], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[argb_data], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[argb_data]) \n\t"
"2: \n\t"
".set pop \n\t"
: [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
int8_t color) {
return (uint32_t)((int)(color_pred) * color) >> 5;
}
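// Worked example: color_pred = -2 (stored as 0xfe) and color = 100 give a
// product of -200; shifted by 5 this is -7, i.e. 249 after the caller's
// & 0xff. (The cast makes the shift logical rather than arithmetic, but the
// low 8 bits, the only ones the transform keeps, are the same either way.)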
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
"replv.ph %[temp2], %[R_to_B] \n\t"
"shll.ph %[temp0], %[temp0], 8 \n\t"
"shll.ph %[temp1], %[temp1], 8 \n\t"
"shll.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp0], %[temp0], 8 \n\t"
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[data]) \n\t"
"lw %[argb1], 4(%[data]) \n\t"
"lhu %[new_red], 2(%[data]) \n\t"
"lhu %[new_red1], 6(%[data]) \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"preceu.ph.qbla %[temp4], %[temp4] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
"shll.ph %[temp4], %[temp4], 8 \n\t"
"shra.ph %[temp3], %[temp3], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp5], %[temp3], %[temp0] \n\t"
"mul.ph %[temp3], %[temp3], %[temp1] \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"addiu %[data], %[data], 8 \n\t"
"ins %[new_red1], %[new_red], 16, 16 \n\t"
"ins %[argb1], %[argb], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp3], %[temp3], 5 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
"subu.ph %[argb1], %[argb1], %[temp3] \n\t"
"preceu.ph.qbra %[temp5], %[new_red1] \n\t"
"subu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -2(%[data]) \n\t"
"sb %[temp3], -4(%[data]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"sb %[temp5], -6(%[data]) \n\t"
"bne %[data], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
[argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
if (num_pixels & 1) {
const uint32_t argb_ = data[0];
const uint32_t green = argb_ >> 8;
const uint32_t red = argb_ >> 16;
uint32_t new_blue = argb_;
new_red = red;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
uint8_t red_to_blue,
uint32_t argb) {
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
uint8_t new_blue = argb;
new_blue -= ColorTransformDelta(green_to_blue, green);
new_blue -= ColorTransformDelta(red_to_blue, red);
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp5], %[temp2], %[rtb] \n\t"
"mul.ph %[temp6], %[temp3], %[gtb] \n\t"
"and %[temp4], %[temp1], %[mask] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp6], %[temp6], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
: [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
}
}
}
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const uint32_t green = argb >> 8;
uint32_t new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp2], %[temp3], %[gtr] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp2], %[temp2], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp2] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
: [gtr]"r"(gtr)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorRed(green_to_red, *p_argb)];
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2


@ -0,0 +1,143 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
#define USE_VTBLQ
#endif
#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
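// The per-pixel effect, as a plain-C sketch (the same transform the _C
// fallback applies to the tail): subtract green from red and blue, mod 256:
//   const uint32_t argb = argb_data[i];
//   const uint32_t green = (argb >> 8) & 0xff;
//   const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
//   const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
//   argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;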
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
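// A sketch of the fixed-point arithmetic: CST(m) evaluates to m << 2, each
// color channel sits in the high byte of its 16-bit lane (v << 8), and
// vqdmulhq_s16 computes (2 * a * b) >> 16, so the lane result is
// ((v << 8) * (m << 2) * 2) >> 16 == (v * m) >> 5, the transform delta.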
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_)
};
const int16x8_t mults_rb = vld1q_s16(rb);
const int16_t b2[8] = {
0, CST(red_to_blue_), 0, CST(red_to_blue_),
0, CST(red_to_blue_), 0, CST(red_to_blue_),
};
const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
static const uint8_t kg0g0[16] = {
255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
};
const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
// x db2 0 0
const int16x8_t C = vqdmulhq_s16(B, mults_b2);
// 0 0 x db2
const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
// x dr x db
const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
vreinterpretq_s8_s16(A));
// 0 dr 0 db
const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
vreinterpretq_s8_u32(F));
vst1q_s8((int8_t*)(argb_data + i), out);
}
// fallthrough and finish off with plain-C
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
#undef USE_VTBLQ
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
#endif // WEBP_USE_NEON


@ -0,0 +1,270 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
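// A sketch of why this works with _mm_mulhi_epi16 below: CST_5b(m)
// sign-extends the 8-bit multiplier and evaluates to m << 3; each channel is
// placed in the high byte of its 16-bit lane (v << 8), and mulhi returns
// ((v << 8) * (m << 3)) >> 16 == (v * m) >> 5, the color-transform delta.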
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_sub_epi8(in, C);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
const __m128i H = _mm_add_epi8(G, D); // x dr x db
const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
const __m128i out = _mm_sub_epi8(in, I);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
const __m128i A1 = _mm_slli_epi16(in1, 8);
const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i B1 = _mm_and_si128(in1, mask_g);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
const __m128i E1 = _mm_sub_epi8(in1, D1);
const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
const __m128i F1 = _mm_srli_epi32(C1, 16);
const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
const __m128i G1 = _mm_sub_epi8(E1, F1);
const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
const __m128i H1 = _mm_and_si128(G1, mask_b);
const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
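// For reference: per pixel this tallies the blue value that the candidate
// (green_to_blue, red_to_blue) pair would produce, i.e. (a sketch, with
// delta() as in the color transform above):
//   ++histo[(b - delta(green_to_blue, g) - delta(red_to_blue, r)) & 0xff];
// CollectColorRedTransforms below does the same for red; the encoder uses
// these histograms to pick the multipliers that compress best.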
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i A1 = _mm_and_si128(in1, mask_g);
const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i B1 = _mm_srli_epi32(in1, 16);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
const __m128i E1 = _mm_sub_epi8(B1, C1);
const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
const __m128i F1 = _mm_and_si128(E1, mask);
const __m128i I = _mm_packs_epi32(F0, F1);
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_red, histo);
}
}
}
#undef SPAN
//------------------------------------------------------------------------------
#define LINE_SIZE 16 // 8 or 16
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
}
static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
}
#undef LINE_SIZE
// Note: we add uint32_t values as *signed* int32_t (via _mm_add_epi32); that
// is OK since the histogram values are less than 1 << 28 (max picture size).
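// Concretely: two operands below 1 << 28 sum to less than 1 << 29, well
// within int32_t range, so no lane of the signed add can wrap.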
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
} else {
AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
}
for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
out->literal_[i] = a->literal_[i] + b->literal_[i];
}
for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
out->distance_[i] = a->distance_[i] + b->distance_[i];
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LHistogramAdd = HistogramAdd;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
#endif // WEBP_USE_SSE2


@ -0,0 +1,51 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4.1 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include "./lossless.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
const __m128i out = _mm_sub_epi8(in, in_0g0g);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
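// For reference: _mm_set_epi8 lists bytes e15..e0, and a control byte of -1
// (0xff) zeroes the destination byte. The control above therefore copies each
// pixel's green sample (source byte 4*k + 1) into that pixel's blue and red
// byte slots and zeroes alpha/green -- hence the name in_0g0g -- so a single
// _mm_sub_epi8 subtracts green from both red and blue.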
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
#endif // WEBP_USE_SSE41


@ -0,0 +1,680 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
const uint32_t* const color_map, \
TYPE* dst, int y_start, int y_end, \
int width) { \
int y; \
for (y = y_start; y < y_end; ++y) { \
int x; \
for (x = 0; x < (width >> 2); ++x) { \
int tmp1, tmp2, tmp3, tmp4; \
__asm__ volatile ( \
".ifc " #TYPE ", uint8_t \n\t" \
"lbu %[tmp1], 0(%[src]) \n\t" \
"lbu %[tmp2], 1(%[src]) \n\t" \
"lbu %[tmp3], 2(%[src]) \n\t" \
"lbu %[tmp4], 3(%[src]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
".endif \n\t" \
".ifc " #TYPE ", uint32_t \n\t" \
"lw %[tmp1], 0(%[src]) \n\t" \
"lw %[tmp2], 4(%[src]) \n\t" \
"lw %[tmp3], 8(%[src]) \n\t" \
"lw %[tmp4], 12(%[src]) \n\t" \
"ext %[tmp1], %[tmp1], 8, 8 \n\t" \
"ext %[tmp2], %[tmp2], 8, 8 \n\t" \
"ext %[tmp3], %[tmp3], 8, 8 \n\t" \
"ext %[tmp4], %[tmp4], 8, 8 \n\t" \
"addiu %[src], %[src], 16 \n\t" \
".endif \n\t" \
"sll %[tmp1], %[tmp1], 2 \n\t" \
"sll %[tmp2], %[tmp2], 2 \n\t" \
"sll %[tmp3], %[tmp3], 2 \n\t" \
"sll %[tmp4], %[tmp4], 2 \n\t" \
"lwx %[tmp1], %[tmp1](%[color_map]) \n\t" \
"lwx %[tmp2], %[tmp2](%[color_map]) \n\t" \
"lwx %[tmp3], %[tmp3](%[color_map]) \n\t" \
"lwx %[tmp4], %[tmp4](%[color_map]) \n\t" \
".ifc " #TYPE ", uint8_t \n\t" \
"ext %[tmp1], %[tmp1], 8, 8 \n\t" \
"ext %[tmp2], %[tmp2], 8, 8 \n\t" \
"ext %[tmp3], %[tmp3], 8, 8 \n\t" \
"ext %[tmp4], %[tmp4], 8, 8 \n\t" \
"sb %[tmp1], 0(%[dst]) \n\t" \
"sb %[tmp2], 1(%[dst]) \n\t" \
"sb %[tmp3], 2(%[dst]) \n\t" \
"sb %[tmp4], 3(%[dst]) \n\t" \
"addiu %[dst], %[dst], 4 \n\t" \
".endif \n\t" \
".ifc " #TYPE ", uint32_t \n\t" \
"sw %[tmp1], 0(%[dst]) \n\t" \
"sw %[tmp2], 4(%[dst]) \n\t" \
"sw %[tmp3], 8(%[dst]) \n\t" \
"sw %[tmp4], 12(%[dst]) \n\t" \
"addiu %[dst], %[dst], 16 \n\t" \
".endif \n\t" \
: [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), \
[tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst) \
: [color_map]"r"(color_map) \
: "memory" \
); \
} \
for (x = 0; x < (width & 3); ++x) { \
*dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
} \
} \
}
MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef MAP_COLOR_FUNCS
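// For reference, the two instantiations compute (cf. the VP8GetARGBIndex /
// VP8GetAlphaValue helpers used by the scalar leftover loop):
//   MapARGB:  dst[i] = color_map[(src[i] >> 8) & 0xff];   // full ARGB word
//   MapAlpha: dst[i] = (color_map[src[i]] >> 8) & 0xff;   // green channel only
// which is what the 'ext ..., 8, 8' extractions in the asm above implement.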
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"preceu.ph.qbr %[temp1], %[c0] \n\t"
"preceu.ph.qbl %[temp2], %[c0] \n\t"
"preceu.ph.qbr %[temp3], %[c1] \n\t"
"preceu.ph.qbl %[temp4], %[c1] \n\t"
"preceu.ph.qbr %[temp5], %[c2] \n\t"
"preceu.ph.qbl %[temp0], %[c2] \n\t"
"subq.ph %[temp3], %[temp3], %[temp5] \n\t"
"subq.ph %[temp4], %[temp4], %[temp0] \n\t"
"addq.ph %[temp1], %[temp1], %[temp3] \n\t"
"addq.ph %[temp2], %[temp2], %[temp4] \n\t"
"shll_s.ph %[temp1], %[temp1], 7 \n\t"
"shll_s.ph %[temp2], %[temp2], 7 \n\t"
"precrqu_s.qb.ph %[temp2], %[temp2], %[temp1] \n\t"
: [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
: [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
: "memory"
);
return temp2;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"adduh.qb %[temp5], %[c0], %[c1] \n\t"
"preceu.ph.qbr %[temp3], %[c2] \n\t"
"preceu.ph.qbr %[temp1], %[temp5] \n\t"
"preceu.ph.qbl %[temp2], %[temp5] \n\t"
"preceu.ph.qbl %[temp4], %[c2] \n\t"
"subq.ph %[temp3], %[temp1], %[temp3] \n\t"
"subq.ph %[temp4], %[temp2], %[temp4] \n\t"
"shrl.ph %[temp5], %[temp3], 15 \n\t"
"shrl.ph %[temp0], %[temp4], 15 \n\t"
"addq.ph %[temp3], %[temp3], %[temp5] \n\t"
"addq.ph %[temp4], %[temp0], %[temp4] \n\t"
"shra.ph %[temp3], %[temp3], 1 \n\t"
"shra.ph %[temp4], %[temp4], 1 \n\t"
"addq.ph %[temp1], %[temp1], %[temp3] \n\t"
"addq.ph %[temp2], %[temp2], %[temp4] \n\t"
"shll_s.ph %[temp1], %[temp1], 7 \n\t"
"shll_s.ph %[temp2], %[temp2], 7 \n\t"
"precrqu_s.qb.ph %[temp1], %[temp2], %[temp1] \n\t"
: [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
: [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
: "memory"
);
return temp1;
}
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"cmpgdu.lt.qb %[temp1], %[c], %[b] \n\t"
"pick.qb %[temp1], %[b], %[c] \n\t"
"pick.qb %[temp2], %[c], %[b] \n\t"
"cmpgdu.lt.qb %[temp4], %[c], %[a] \n\t"
"pick.qb %[temp4], %[a], %[c] \n\t"
"pick.qb %[temp5], %[c], %[a] \n\t"
"subu.qb %[temp3], %[temp1], %[temp2] \n\t"
"subu.qb %[temp0], %[temp4], %[temp5] \n\t"
"raddu.w.qb %[temp3], %[temp3] \n\t"
"raddu.w.qb %[temp0], %[temp0] \n\t"
"subu %[temp3], %[temp3], %[temp0] \n\t"
"slti %[temp0], %[temp3], 0x1 \n\t"
"movz %[a], %[b], %[temp0] \n\t"
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
[a]"+&r"(a)
: [b]"r"(b), [c]"r"(c)
);
return a;
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
__asm__ volatile (
"adduh.qb %[a0], %[a0], %[a1] \n\t"
: [a0]"+r"(a0)
: [a1]"r"(a1)
);
return a0;
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return Average2(Average2(a0, a2), a1);
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
return Average3(left, top[0], top[1]);
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
return Average2(left, top[-1]);
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
return Average2(left, top[0]);
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[-1], top[0]);
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[0], top[1]);
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
return Average4(left, top[-1], top[0], top[1]);
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
return Select(top[0], left, top[-1]);
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
return ClampedAddSubtractFull(left, top[0], top[-1]);
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
return ClampedAddSubtractHalf(left, top[0], top[-1]);
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = data + (num_pixels & ~3);
uint32_t* const p_loop2_end = data + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[data]) \n\t"
"lw %[temp1], 4(%[data]) \n\t"
"lw %[temp2], 8(%[data]) \n\t"
"lw %[temp3], 12(%[data]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[data], %[data], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
"replv.ph %[temp7], %[temp7] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"addu.qb %[temp1], %[temp1], %[temp5] \n\t"
"addu.qb %[temp2], %[temp2], %[temp6] \n\t"
"addu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[data]) \n\t"
"sw %[temp1], -12(%[data]) \n\t"
"sw %[temp2], -8(%[data]) \n\t"
"bne %[data], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[data]) \n\t"
"3: \n\t"
"beq %[data], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[data]) \n\t"
"addiu %[data], %[data], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[data], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[data]) \n\t"
"2: \n\t"
".set pop \n\t"
: [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
"replv.ph %[temp2], %[R_to_B] \n\t"
"shll.ph %[temp0], %[temp0], 8 \n\t"
"shll.ph %[temp1], %[temp1], 8 \n\t"
"shll.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp0], %[temp0], 8 \n\t"
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[data]) \n\t"
"lw %[argb1], 4(%[data]) \n\t"
"addiu %[data], %[data], 8 \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
"shra.ph %[temp3], %[temp3], 8 \n\t"
"mul.ph %[temp5], %[temp3], %[temp0] \n\t"
"mul.ph %[temp3], %[temp3], %[temp1] \n\t"
"precrq.ph.w %[new_red], %[argb], %[argb1] \n\t"
"ins %[argb1], %[argb], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp3], %[temp3], 5 \n\t"
"addu.ph %[new_red], %[new_red], %[temp5] \n\t"
"addu.ph %[argb1], %[argb1], %[temp3] \n\t"
"preceu.ph.qbra %[temp5], %[new_red] \n\t"
"shll.ph %[temp4], %[temp5], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"sb %[temp5], -2(%[data]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"addu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -6(%[data]) \n\t"
"sb %[temp3], -4(%[data]) \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"bne %[data], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red]"=&r"(new_red), [argb]"=&r"(argb),
[argb1]"=&r"(argb1), [data]"+&r"(data)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
// Fall-back to C-version for left-overs.
if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ins %[temp3], %[temp2], 24, 8 \n\t"
"sll %[temp2], %[temp2], 8 \n\t"
"rotr %[temp3], %[temp3], 16 \n\t"
"ins %[temp2], %[temp1], 0, 16 \n\t"
"sll %[temp1], %[temp1], 8 \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"balign %[temp0], %[temp1], 1 \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"usw %[temp3], 8(%[dst]) \n\t"
"rotr %[temp0], %[temp0], 16 \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
"addiu %[src], %[src], 16 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 12 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"wsbh %[temp1], %[temp0] \n\t"
"addiu %[dst], %[dst], 3 \n\t"
"ush %[temp1], -2(%[dst]) \n\t"
"sra %[temp0], %[temp0], 16 \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sb %[temp0], -3(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"wsbh %[temp1], %[temp1] \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"addiu %[src], %[src], 16 \n\t"
"balign %[temp0], %[temp0], 1 \n\t"
"balign %[temp1], %[temp1], 1 \n\t"
"balign %[temp2], %[temp2], 1 \n\t"
"balign %[temp3], %[temp3], 1 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp1], 4(%[dst]) \n\t"
"usw %[temp2], 8(%[dst]) \n\t"
"usw %[temp3], 12(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 16 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"addiu %[src], %[src], 4 \n\t"
"balign %[temp0], %[temp0], 1 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 4 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 28, 4 \n\t"
"ext %[temp5], %[temp0], 12, 4 \n\t"
"ins %[temp0], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp1], 28, 4 \n\t"
"ins %[temp0], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp1], 12, 4 \n\t"
"ins %[temp1], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp2], 28, 4 \n\t"
"ins %[temp1], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp2], 12, 4 \n\t"
"ins %[temp2], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp3], 28, 4 \n\t"
"ins %[temp2], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp3], 12, 4 \n\t"
"ins %[temp3], %[temp4], 0, 4 \n\t"
"precr.qb.ph %[temp1], %[temp1], %[temp0] \n\t"
"ins %[temp3], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 16 \n\t"
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#else
"wsbh %[temp1], %[temp1] \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 8 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ext %[temp4], %[temp0], 28, 4 \n\t"
"ext %[temp5], %[temp0], 12, 4 \n\t"
"ins %[temp0], %[temp4], 0, 4 \n\t"
"ins %[temp0], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
"ush %[temp0], 0(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
"ush %[temp0], 0(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 2 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 16 \n\t"
"ext %[temp5], %[temp0], 5, 11 \n\t"
"ext %[temp0], %[temp0], 3, 5 \n\t"
"ins %[temp4], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp1], 5, 11 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
"ext %[temp0], %[temp1], 8, 16 \n\t"
"ext %[temp1], %[temp1], 3, 5 \n\t"
"ins %[temp0], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp2], 5, 11 \n\t"
"ins %[temp0], %[temp1], 0, 5 \n\t"
"ext %[temp1], %[temp2], 8, 16 \n\t"
"ext %[temp2], %[temp2], 3, 5 \n\t"
"ins %[temp1], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp3], 5, 11 \n\t"
"ins %[temp1], %[temp2], 0, 5 \n\t"
"ext %[temp2], %[temp3], 8, 16 \n\t"
"ext %[temp3], %[temp3], 3, 5 \n\t"
"ins %[temp2], %[temp5], 0, 11 \n\t"
"append %[temp0], %[temp4], 16 \n\t"
"ins %[temp2], %[temp3], 0, 5 \n\t"
"addiu %[src], %[src], 16 \n\t"
"append %[temp2], %[temp1], 16 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 8 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 16 \n\t"
"ext %[temp5], %[temp0], 5, 11 \n\t"
"ext %[temp0], %[temp0], 3, 5 \n\t"
"ins %[temp4], %[temp5], 0, 11 \n\t"
"addiu %[src], %[src], 4 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
"ush %[temp4], 0(%[dst]) \n\t"
#else
"wsbh %[temp4], %[temp4] \n\t"
"ush %[temp4], 0(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 2 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ins %[temp0], %[temp1], 24, 8 \n\t"
"sra %[temp1], %[temp1], 8 \n\t"
"ins %[temp1], %[temp2], 16, 16 \n\t"
"sll %[temp2], %[temp2], 8 \n\t"
"balign %[temp3], %[temp2], 1 \n\t"
"addiu %[src], %[src], 16 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp1], 4(%[dst]) \n\t"
"usw %[temp3], 8(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 12 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"addiu %[dst], %[dst], 3 \n\t"
"ush %[temp0], -3(%[dst]) \n\t"
"sra %[temp0], %[temp0], 16 \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sb %[temp0], -1(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
VP8LMapColor32b = MapARGB;
VP8LMapColor8b = MapAlpha;
VP8LPredictors[5] = Predictor5;
VP8LPredictors[6] = Predictor6;
VP8LPredictors[7] = Predictor7;
VP8LPredictors[8] = Predictor8;
VP8LPredictors[9] = Predictor9;
VP8LPredictors[10] = Predictor10;
VP8LPredictors[11] = Predictor11;
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2


@ -0,0 +1,269 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 (NDK r9) had some trouble with this code, so we only enable it
// for gcc 4.8.x and later.
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
const uint8x16_t tmp = pixel.val[0];
pixel.val[0] = pixel.val[2];
pixel.val[2] = tmp;
vst4q_u8(dst, pixel);
dst += 64;
}
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
vst3q_u8(dst, tmp);
dst += 48;
}
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
vst3q_u8(dst, tmp);
dst += 48;
}
VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
}
#else // WORK_AROUND_GCC
// gcc-4.6.0 fallback
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
const uint8x8_t pixels = vld1_u8((uint8_t*)src);
vst1_u8(dst, vtbl1_u8(pixels, shuffle));
dst += 8;
}
VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs
}
static const uint8_t kBGRShuffle[3][8] = {
{ 0, 1, 2, 4, 5, 6, 8, 9 },
{ 10, 12, 13, 14, 16, 17, 18, 20 },
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
for (; src < end; src += 8) {
uint8x8x4_t pixels;
INIT_VECTOR4(pixels,
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)));
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs
}
static const uint8_t kRGBShuffle[3][8] = {
{ 2, 1, 0, 6, 5, 4, 10, 9 },
{ 8, 14, 13, 12, 18, 17, 16, 22 },
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
for (; src < end; src += 8) {
uint8x8x4_t pixels;
INIT_VECTOR4(pixels,
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)));
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs
}
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
#define USE_VTBLQ
#endif
#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
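// (The shift is 6 here, not 5 as in the SSE2 version, because vqdmulhq_s16
// below computes (2 * a * b) >> 16: the doubling is folded into the constant,
// so ((g << 8) * (t << 2) * 2) >> 16 == (g * t) >> 5.)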
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_)
};
const int16x8_t mults_rb = vld1q_s16(rb);
const int16_t b2[8] = {
0, CST(red_to_blue_), 0, CST(red_to_blue_),
0, CST(red_to_blue_), 0, CST(red_to_blue_),
};
const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
static const uint8_t kg0g0[16] = {
255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
};
const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// x r' x b'
const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
vreinterpretq_s8_s16(A));
// r' 0 b' 0
const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
// x db2 0 0
const int16x8_t D = vqdmulhq_s16(C, mults_b2);
// 0 x db2 0
const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
// r' x b'' 0
const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
vreinterpretq_s8_s16(C));
// 0 r' 0 b''
const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
vst1q_u32(argb_data + i, out);
}
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
#undef USE_VTBLQ
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8LDspInitNEON)
#endif // WEBP_USE_NEON


@ -0,0 +1,372 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
//------------------------------------------------------------------------------
// Predictor Transform
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
const __m128i V1 = _mm_add_epi16(C0, C1);
const __m128i V2 = _mm_sub_epi16(V1, C2);
const __m128i b = _mm_packus_epi16(V2, V2);
const uint32_t output = _mm_cvtsi128_si32(b);
return output;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
const __m128i avg = _mm_add_epi16(C1, C0);
const __m128i A0 = _mm_srli_epi16(avg, 1);
const __m128i A1 = _mm_sub_epi16(A0, B0);
const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
const __m128i A2 = _mm_sub_epi16(A1, BgtA);
const __m128i A3 = _mm_srai_epi16(A2, 1);
const __m128i A4 = _mm_add_epi16(A0, A3);
const __m128i A5 = _mm_packus_epi16(A4, A4);
const uint32_t output = _mm_cvtsi128_si32(A5);
return output;
}
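// For reference, per 8-bit channel this computes (a sketch of the scalar
// equivalent):
//   a = (c0 + c1) / 2;
//   out = clamp_255(a + (a - c2) / 2);
// The cmpgt/sub pair turns the arithmetic shift into C-style truncating
// division when (a - c2) is negative, and the final packus saturates to
// [0, 255].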
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
int pa_minus_pb;
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_cvtsi32_si128(a);
const __m128i B0 = _mm_cvtsi32_si128(b);
const __m128i C0 = _mm_cvtsi32_si128(c);
const __m128i AC0 = _mm_subs_epu8(A0, C0);
const __m128i CA0 = _mm_subs_epu8(C0, A0);
const __m128i BC0 = _mm_subs_epu8(B0, C0);
const __m128i CB0 = _mm_subs_epu8(C0, B0);
const __m128i AC = _mm_or_si128(AC0, CA0);
const __m128i BC = _mm_or_si128(BC0, CB0);
const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
const __m128i diff = _mm_sub_epi16(pb, pa);
{
int16_t out[8];
_mm_storeu_si128((__m128i*)out, diff);
pa_minus_pb = out[0] + out[1] + out[2] + out[3];
}
return (pa_minus_pb <= 0) ? a : b;
}
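// For reference: Select() returns 'a' when the channel-wise sum of |b - c|
// is <= the channel-wise sum of |a - c| (pa_minus_pb above accumulates the
// per-channel differences), matching the scalar implementation.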
static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(A1, A0);
const __m128i avg = _mm_srli_epi16(sum, 1);
return avg;
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
const __m128i avg = Average2_128i(a0, a1);
const __m128i A2 = _mm_packus_epi16(avg, avg);
const uint32_t output = _mm_cvtsi128_si32(A2);
return output;
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
const __m128i avg1 = Average2_128i(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
const __m128i A2 = _mm_packus_epi16(avg2, avg2);
const uint32_t output = _mm_cvtsi128_si32(A2);
return output;
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_128i(a0, a1);
const __m128i avg2 = Average2_128i(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
const uint32_t output = _mm_cvtsi128_si32(A0);
return output;
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_add_epi8(in, C);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_),
CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = _mm_set_epi16(
CST(red_to_blue_), 0, CST(red_to_blue_), 0,
CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#undef CST
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
const __m128i E = _mm_add_epi8(in, D); // x r' x b'
const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0
const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0
const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0
const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
const __m128i out = _mm_or_si128(J, A);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
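// For reference, per pixel this inverts the encoder's color transform (a
// sketch; cf. the VP8LTransformColorInverse_C fallback), with
// delta(t, c) = ((int8_t)t * (int8_t)c) >> 5:
//   g' = g;
//   r' = (r + delta(m->green_to_red_, g)) & 0xff;
//   b' = (b + delta(m->green_to_blue_, g) + delta(m->red_to_blue_, r')) & 0xff;
// Note the blue correction uses the already-reconstructed red, which is why
// the code re-multiplies F (the shifted r'/b' lanes) by mults_b2.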
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
_mm_storeu_si128(out++, rgba0);
_mm_storeu_si128(out++, rgba4);
num_pixels -= 8;
}
// left-overs
VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-
const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-b7
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
#ifdef WEBP_SWAP_16BIT_CSP
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
#endif
_mm_storeu_si128(out++, rgba);
num_pixels -= 8;
}
// left-overs
VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7
const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)
const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)
const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
const __m128i b1 = _mm_srli_epi16(b0, 3);
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
#ifdef WEBP_SWAP_16BIT_CSP
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
#else
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
#endif
_mm_storeu_si128(out++, rgba);
num_pixels -= 8;
}
// left-overs
VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
const uint8_t* const end = dst + num_pixels * 3;
// the last storel_epi64 below writes 8 bytes starting at offset 18
while (dst + 26 <= end) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0
const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0
const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0
const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0
const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00
const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00
const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00
const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00
const __m128i c2 = _mm_srli_si128(c0, 8);
const __m128i c6 = _mm_srli_si128(c4, 8);
_mm_storel_epi64((__m128i*)(dst + 0), c0);
_mm_storel_epi64((__m128i*)(dst + 6), c2);
_mm_storel_epi64((__m128i*)(dst + 12), c4);
_mm_storel_epi64((__m128i*)(dst + 18), c6);
dst += 24;
num_pixels -= 8;
}
// left-overs
VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictors[5] = Predictor5;
VP8LPredictors[6] = Predictor6;
VP8LPredictors[7] = Predictor7;
VP8LPredictors[8] = Predictor8;
VP8LPredictors[9] = Predictor9;
VP8LPredictors[10] = Predictor10;
VP8LPredictors[11] = Predictor11;
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)
#endif // WEBP_USE_SSE2


@ -0,0 +1,200 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS common macros
#ifndef WEBP_DSP_MIPS_MACRO_H_
#define WEBP_DSP_MIPS_MACRO_H_
#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
#define WORK_AROUND_GCC
#endif
#define STR(s) #s
#define XSTR(s) STR(s)
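// The usual two-step stringification idiom: STR() stringizes its argument
// literally, while XSTR() macro-expands the argument first, so numeric
// parameters (e.g. I9 below) can be pasted as text into the inline-asm
// offset expressions.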
// O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
// O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
// O - output
// I - input (macro doesn't change it)
#define ADD_SUB_HALVES(O0, O1, \
I0, I1) \
"addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
"subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t"
// O - output
// I - input (macro doesn't change it)
// I[0/1] - offset in bytes
#define LOAD_IN_X2(O0, O1, \
I0, I1) \
"lh %[" #O0 "], " #I0 "(%[in]) \n\t" \
"lh %[" #O1 "], " #I1 "(%[in]) \n\t"
// I0 - location
// I1..I9 - offsets in bytes
#define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3, \
I0, I1, I2, I3, I4, I5, I6, I7, I8, I9) \
"ulw %[" #O0 "], " #I1 "+" XSTR(I9) "*" #I5 "(%[" #I0 "]) \n\t" \
"ulw %[" #O1 "], " #I2 "+" XSTR(I9) "*" #I6 "(%[" #I0 "]) \n\t" \
"ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t" \
"ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t"
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7, \
IO0, IO1, IO2, IO3, \
I0, I1, I2, I3, I4, I5, I6, I7) \
"mul %[" #O0 "], %[" #I0 "], %[kC2] \n\t" \
"mul %[" #O1 "], %[" #I0 "], %[kC1] \n\t" \
"mul %[" #O2 "], %[" #I1 "], %[kC2] \n\t" \
"mul %[" #O3 "], %[" #I1 "], %[kC1] \n\t" \
"mul %[" #O4 "], %[" #I2 "], %[kC2] \n\t" \
"mul %[" #O5 "], %[" #I2 "], %[kC1] \n\t" \
"mul %[" #O6 "], %[" #I3 "], %[kC2] \n\t" \
"mul %[" #O7 "], %[" #I3 "], %[kC1] \n\t" \
"sra %[" #O0 "], %[" #O0 "], 16 \n\t" \
"sra %[" #O1 "], %[" #O1 "], 16 \n\t" \
"sra %[" #O2 "], %[" #O2 "], 16 \n\t" \
"sra %[" #O3 "], %[" #O3 "], 16 \n\t" \
"sra %[" #O4 "], %[" #O4 "], 16 \n\t" \
"sra %[" #O5 "], %[" #O5 "], 16 \n\t" \
"sra %[" #O6 "], %[" #O6 "], 16 \n\t" \
"sra %[" #O7 "], %[" #O7 "], 16 \n\t" \
"addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t" \
"addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t" \
"subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t" \
"subu %[" #IO3 "], %[" #IO3 "], %[" #I7 "] \n\t"
// O - output
// I - input (macro doesn't change it)
#define INSERT_HALF_X2(O0, O1, \
I0, I1) \
"ins %[" #O0 "], %[" #I0 "], 16, 16 \n\t" \
"ins %[" #O1 "], %[" #I1 "], 16, 16 \n\t"
// O - output
// I - input (macro doesn't change it)
#define SRA_16(O0, O1, O2, O3, \
I0, I1, I2, I3) \
"sra %[" #O0 "], %[" #I0 "], 16 \n\t" \
"sra %[" #O1 "], %[" #I1 "], 16 \n\t" \
"sra %[" #O2 "], %[" #I2 "], 16 \n\t" \
"sra %[" #O3 "], %[" #I3 "], 16 \n\t"
// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
// O - output
// I - input (macro doesn't change it)
#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7, \
I0, I1, I2, I3, I4, I5, I6, I7) \
"addq.ph %[" #O0 "], %[" #I0 "], %[" #I4 "] \n\t" \
"subq.ph %[" #O1 "], %[" #I0 "], %[" #I4 "] \n\t" \
"addq.ph %[" #O2 "], %[" #I1 "], %[" #I5 "] \n\t" \
"subq.ph %[" #O3 "], %[" #I1 "], %[" #I5 "] \n\t" \
"addq.ph %[" #O4 "], %[" #I2 "], %[" #I6 "] \n\t" \
"subq.ph %[" #O5 "], %[" #I2 "], %[" #I6 "] \n\t" \
"addq.ph %[" #O6 "], %[" #I3 "], %[" #I7 "] \n\t" \
"subq.ph %[" #O7 "], %[" #I3 "], %[" #I7 "] \n\t" \
"shra.ph %[" #O0 "], %[" #O0 "], 3 \n\t" \
"shra.ph %[" #O1 "], %[" #O1 "], 3 \n\t" \
"shra.ph %[" #O2 "], %[" #O2 "], 3 \n\t" \
"shra.ph %[" #O3 "], %[" #O3 "], 3 \n\t" \
"shra.ph %[" #O4 "], %[" #O4 "], 3 \n\t" \
"shra.ph %[" #O5 "], %[" #O5 "], 3 \n\t" \
"shra.ph %[" #O6 "], %[" #O6 "], 3 \n\t" \
"shra.ph %[" #O7 "], %[" #O7 "], 3 \n\t"
// precrq.ph.w temp0, temp8, temp2
// temp0 = temp8[31..16] | temp2[31..16]
// ins temp2, temp8, 16, 16
// temp2 = temp8[15..0] | temp2[15..0]
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3, \
IO0, IO1, IO2, IO3, \
I0, I1, I2, I3) \
"precrq.ph.w %[" #O0 "], %[" #I0 "], %[" #IO0 "] \n\t" \
"precrq.ph.w %[" #O1 "], %[" #I1 "], %[" #IO1 "] \n\t" \
"ins %[" #IO0 "], %[" #I0 "], 16, 16 \n\t" \
"ins %[" #IO1 "], %[" #I1 "], 16, 16 \n\t" \
"precrq.ph.w %[" #O2 "], %[" #I2 "], %[" #IO2 "] \n\t" \
"precrq.ph.w %[" #O3 "], %[" #I3 "], %[" #IO3 "] \n\t" \
"ins %[" #IO2 "], %[" #I2 "], 16, 16 \n\t" \
"ins %[" #IO3 "], %[" #I3 "], 16, 16 \n\t"
// preceu.ph.qbr temp0, temp8
// temp0 = 0 | temp8[15..8] | 0 | temp8[7..0]
// preceu.ph.qbl temp1, temp8
// temp1 = 0 | temp8[31..24] | 0 | temp8[23..16]
// O - output
// I - input (macro doesn't change it)
#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7, \
I0, I1, I2, I3) \
"preceu.ph.qbr %[" #O0 "], %[" #I0 "] \n\t" \
"preceu.ph.qbl %[" #O1 "], %[" #I0 "] \n\t" \
"preceu.ph.qbr %[" #O2 "], %[" #I1 "] \n\t" \
"preceu.ph.qbl %[" #O3 "], %[" #I1 "] \n\t" \
"preceu.ph.qbr %[" #O4 "], %[" #I2 "] \n\t" \
"preceu.ph.qbl %[" #O5 "], %[" #I2 "] \n\t" \
"preceu.ph.qbr %[" #O6 "], %[" #I3 "] \n\t" \
"preceu.ph.qbl %[" #O7 "], %[" #I3 "] \n\t"
// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
// temp1..temp7 same as temp0
// precrqu_s.qb.ph temp0, temp1, temp0:
// temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
// store temp0 to dst
// IO - input/output
// I - input (macro doesn't change it)
#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7, \
I0, I1, I2, I3, I4, I5, I6, I7, \
I8, I9, I10, I11, I12, I13) \
"addq.ph %[" #IO0 "], %[" #IO0 "], %[" #I0 "] \n\t" \
"addq.ph %[" #IO1 "], %[" #IO1 "], %[" #I1 "] \n\t" \
"addq.ph %[" #IO2 "], %[" #IO2 "], %[" #I2 "] \n\t" \
"addq.ph %[" #IO3 "], %[" #IO3 "], %[" #I3 "] \n\t" \
"addq.ph %[" #IO4 "], %[" #IO4 "], %[" #I4 "] \n\t" \
"addq.ph %[" #IO5 "], %[" #IO5 "], %[" #I5 "] \n\t" \
"addq.ph %[" #IO6 "], %[" #IO6 "], %[" #I6 "] \n\t" \
"addq.ph %[" #IO7 "], %[" #IO7 "], %[" #I7 "] \n\t" \
"shll_s.ph %[" #IO0 "], %[" #IO0 "], 7 \n\t" \
"shll_s.ph %[" #IO1 "], %[" #IO1 "], 7 \n\t" \
"shll_s.ph %[" #IO2 "], %[" #IO2 "], 7 \n\t" \
"shll_s.ph %[" #IO3 "], %[" #IO3 "], 7 \n\t" \
"shll_s.ph %[" #IO4 "], %[" #IO4 "], 7 \n\t" \
"shll_s.ph %[" #IO5 "], %[" #IO5 "], 7 \n\t" \
"shll_s.ph %[" #IO6 "], %[" #IO6 "], 7 \n\t" \
"shll_s.ph %[" #IO7 "], %[" #IO7 "], 7 \n\t" \
"precrqu_s.qb.ph %[" #IO0 "], %[" #IO1 "], %[" #IO0 "] \n\t" \
"precrqu_s.qb.ph %[" #IO2 "], %[" #IO3 "], %[" #IO2 "] \n\t" \
"precrqu_s.qb.ph %[" #IO4 "], %[" #IO5 "], %[" #IO4 "] \n\t" \
"precrqu_s.qb.ph %[" #IO6 "], %[" #IO7 "], %[" #IO6 "] \n\t" \
"usw %[" #IO0 "], " XSTR(I13) "*" #I9 "(%[" #I8 "]) \n\t" \
"usw %[" #IO2 "], " XSTR(I13) "*" #I10 "(%[" #I8 "]) \n\t" \
"usw %[" #IO4 "], " XSTR(I13) "*" #I11 "(%[" #I8 "]) \n\t" \
"usw %[" #IO6 "], " XSTR(I13) "*" #I12 "(%[" #I8 "]) \n\t"
#define OUTPUT_EARLY_CLOBBER_REGS_10() \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), \
[temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), \
[temp10]"=&r"(temp10)
#define OUTPUT_EARLY_CLOBBER_REGS_18() \
OUTPUT_EARLY_CLOBBER_REGS_10(), \
[temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
[temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
[temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
#endif // WEBP_DSP_MIPS_MACRO_H_

82
drivers/webp/dsp/neon.h Normal file

@ -0,0 +1,82 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON common code.
#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_
#include <arm_neon.h>
#include "./dsp.h"
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except aarch64 where the inline assembly is incompatible.
#if defined(__aarch64__)
#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
#define INIT_VECTOR2(v, a, b) do { \
v.val[0] = a; \
v.val[1] = b; \
} while (0)
#define INIT_VECTOR3(v, a, b, c) do { \
v.val[0] = a; \
v.val[1] = b; \
v.val[2] = c; \
} while (0)
#define INIT_VECTOR4(v, a, b, c, d) do { \
v.val[0] = a; \
v.val[1] = b; \
v.val[2] = c; \
v.val[3] = d; \
} while (0)
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WORK_AROUND_GCC
#endif
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
// Transpose 64-bit values (there's no vswp equivalent)
{
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
}
{
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
vreinterpretq_s32_u64(row01.val[1]));
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
vreinterpretq_s32_u64(row23.val[1]));
int32x4x4_t out;
out.val[0] = out01.val[0];
out.val[1] = out01.val[1];
out.val[2] = out23.val[0];
out.val[3] = out23.val[1];
return out;
}
}
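// Illustration: for input rows
//   rows.val[0] = [a0 a1 a2 a3] ... rows.val[3] = [d0 d1 d2 d3]
// the function returns
//   out.val[0] = [a0 b0 c0 d0] ... out.val[3] = [a3 b3 c3 d3],
// i.e. a plain 4x4 matrix transpose built from 64-bit swaps plus vtrnq_s32.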
#endif // WEBP_DSP_NEON_H_

238
drivers/webp/dsp/rescaler.c Normal file

@ -0,0 +1,238 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "./dsp.h"
#include "../utils/rescaler.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
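// MULT_FIX(x, y) computes round(x * y / WEBP_RESCALER_ONE): with
// WEBP_RESCALER_RFIX == 32 this is a 32.32 fixed-point multiply. Example:
// MULT_FIX(3u, WEBP_RESCALER_ONE / 2) == 2, i.e. 1.5 rounded up.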
//------------------------------------------------------------------------------
// Row import
void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
for (channel = 0; channel < x_stride; ++channel) {
int x_in = channel;
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
int left = src[x_in];
int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
x_out += x_stride;
if (x_out >= x_out_max) break;
accum -= wrk->x_sub;
if (accum < 0) {
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
right = src[x_in];
accum += wrk->x_add;
}
}
assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
}
}
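// Worked example (hand-picked values, one channel): expanding {10, 30} to
// four pixels with x_add == 3 and x_sub == 1 leaves frow == {30, 50, 70, 90},
// i.e. {10, 16.7, 23.3, 30} scaled by x_add; accum steps 3, 2, 1, 0 and
// weights left/right as accum : (x_add - accum).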
void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
assert(!WebPRescalerInputDone(wrk));
assert(!wrk->x_expand);
for (channel = 0; channel < x_stride; ++channel) {
int x_in = channel;
int x_out = channel;
uint32_t sum = 0;
int accum = 0;
while (x_out < x_out_max) {
uint32_t base = 0;
accum += wrk->x_add;
while (accum > 0) {
accum -= wrk->x_sub;
assert(x_in < wrk->src_width * x_stride);
base = src[x_in];
sum += base;
x_in += x_stride;
}
{ // Emit next horizontal pixel.
const rescaler_t frac = base * (-accum);
wrk->frow[x_out] = sum * wrk->x_sub - frac;
// fresh fractional start for next pixel
sum = (int)MULT_FIX(frac, wrk->fx_scale);
}
x_out += x_stride;
}
assert(accum == 0);
}
}
//------------------------------------------------------------------------------
// Row export
void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
}
}
void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = 0;
}
}
}
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
// Main entry calls
void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
assert(!WebPRescalerInputDone(wrk));
if (!wrk->x_expand) {
WebPRescalerImportRowShrink(wrk, src);
} else {
WebPRescalerImportRowExpand(wrk, src);
}
}
void WebPRescalerExportRow(WebPRescaler* const wrk) {
if (wrk->y_accum <= 0) {
assert(!WebPRescalerOutputDone(wrk));
if (wrk->y_expand) {
WebPRescalerExportRowExpand(wrk);
} else if (wrk->fxy_scale) {
WebPRescalerExportRowShrink(wrk);
} else { // very special case for src = dst = 1x1
int i;
assert(wrk->src_width == 1 && wrk->dst_width <= 2);
assert(wrk->src_height == 1 && wrk->dst_height == 1);
for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
wrk->dst[i] = wrk->irow[i];
wrk->irow[i] = 0;
}
}
wrk->y_accum += wrk->y_add;
wrk->dst += wrk->dst_stride;
++wrk->dst_y;
}
}
//------------------------------------------------------------------------------
WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
extern void WebPRescalerDspInitSSE2(void);
extern void WebPRescalerDspInitMIPS32(void);
extern void WebPRescalerDspInitMIPSdspR2(void);
extern void WebPRescalerDspInitNEON(void);
static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
(VP8CPUInfo)&rescaler_last_cpuinfo_used;
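// The sentinel is initialized to its own address, a value VP8GetCPUInfo can
// never equal, so the first call to WebPRescalerDspInit() always performs the
// setup below and later calls return early.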
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPRescalerDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPRescalerDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPRescalerDspInitMIPSdspR2();
}
#endif
}
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}

291
drivers/webp/dsp/rescaler_mips32.c Normal file

@ -0,0 +1,291 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of rescaling functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS32)
#include <assert.h>
#include "../utils/rescaler.h"
//------------------------------------------------------------------------------
// Row import
static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
const int x_add = wrk->x_add;
const int x_sub = wrk->x_sub;
const int x_stride1 = x_stride << 2;
int channel;
assert(!wrk->x_expand);
assert(!WebPRescalerInputDone(wrk));
for (channel = 0; channel < x_stride; ++channel) {
const uint8_t* src1 = src + channel;
rescaler_t* frow = wrk->frow + channel;
int temp1, temp2, temp3;
int base, frac, sum;
int accum, accum1;
int loop_c = x_out_max - channel;
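// Note on the asm idiom below: "mult temp1, temp2" seeds the HI/LO
// accumulator with 0x8000 * 0x10000 == 1 << 31, i.e. ROUNDER for
// WEBP_RESCALER_RFIX == 32; "maddu" then adds the unsigned 64-bit product
// frac * fx_scale, and "mfhi" reads back the top 32 bits -- together this
// evaluates MULT_FIX(frac, fx_scale) without any 64-bit C arithmetic.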
__asm__ volatile (
"li %[temp1], 0x8000 \n\t"
"li %[temp2], 0x10000 \n\t"
"li %[sum], 0 \n\t"
"li %[accum], 0 \n\t"
"1: \n\t"
"addu %[accum], %[accum], %[x_add] \n\t"
"li %[base], 0 \n\t"
"blez %[accum], 3f \n\t"
"2: \n\t"
"lbu %[base], 0(%[src1]) \n\t"
"subu %[accum], %[accum], %[x_sub] \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"addu %[sum], %[sum], %[base] \n\t"
"bgtz %[accum], 2b \n\t"
"3: \n\t"
"negu %[accum1], %[accum] \n\t"
"mul %[frac], %[base], %[accum1] \n\t"
"mul %[temp3], %[sum], %[x_sub] \n\t"
"subu %[loop_c], %[loop_c], %[x_stride] \n\t"
"mult %[temp1], %[temp2] \n\t"
"maddu %[frac], %[fx_scale] \n\t"
"mfhi %[sum] \n\t"
"subu %[temp3], %[temp3], %[frac] \n\t"
"sw %[temp3], 0(%[frow]) \n\t"
"addu %[frow], %[frow], %[x_stride1] \n\t"
"bgtz %[loop_c], 1b \n\t"
: [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
[sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
[frow]"+r"(frow), [accum1]"=&r"(accum1), [loop_c]"+r"(loop_c),
[temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
: [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
[x_sub]"r"(x_sub), [x_add]"r"(x_add),
[x_stride1]"r"(x_stride1)
: "memory", "hi", "lo"
);
assert(accum == 0);
}
}
static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
const int x_sub = wrk->x_sub;
const int src_width = wrk->src_width;
const int x_stride1 = x_stride << 2;
int channel;
assert(wrk->x_expand);
assert(!WebPRescalerInputDone(wrk));
for (channel = 0; channel < x_stride; ++channel) {
const uint8_t* src1 = src + channel;
rescaler_t* frow = wrk->frow + channel;
int temp1, temp2, temp3, temp4;
int frac;
int accum;
int x_out = channel;
__asm__ volatile (
"addiu %[temp3], %[src_width], -1 \n\t"
"lbu %[temp2], 0(%[src1]) \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"bgtz %[temp3], 0f \n\t"
"addiu %[temp1], %[temp2], 0 \n\t"
"b 3f \n\t"
"0: \n\t"
"lbu %[temp1], 0(%[src1]) \n\t"
"3: \n\t"
"addiu %[accum], %[x_add], 0 \n\t"
"1: \n\t"
"subu %[temp3], %[temp2], %[temp1] \n\t"
"mul %[temp3], %[temp3], %[accum] \n\t"
"mul %[temp4], %[temp1], %[x_add] \n\t"
"addu %[temp3], %[temp4], %[temp3] \n\t"
"sw %[temp3], 0(%[frow]) \n\t"
"addu %[frow], %[frow], %[x_stride1] \n\t"
"addu %[x_out], %[x_out], %[x_stride] \n\t"
"subu %[temp3], %[x_out], %[x_out_max] \n\t"
"bgez %[temp3], 2f \n\t"
"subu %[accum], %[accum], %[x_sub] \n\t"
"bgez %[accum], 4f \n\t"
"addiu %[temp2], %[temp1], 0 \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"lbu %[temp1], 0(%[src1]) \n\t"
"addu %[accum], %[accum], %[x_add] \n\t"
"4: \n\t"
"b 1b \n\t"
"2: \n\t"
: [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
: [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
[x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
[x_out_max]"r"(x_out_max)
: "memory", "hi", "lo"
);
assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
}
}
//------------------------------------------------------------------------------
// Row export
static void ExportRowExpand(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
int temp0, temp1, temp3, temp4, temp5, loop_end;
const int temp2 = (int)wrk->fy_scale;
const int temp6 = x_out_max << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[A], %[temp0] \n\t"
"maddu %[B], %[temp1] \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"mfhi %[temp5] \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp5], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
: "memory", "hi", "lo"
);
}
}
static void ExportRowShrink(WebPRescaler* const wrk) {
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const rescaler_t* frow = wrk->frow;
const int yscale = wrk->fy_scale * (-wrk->y_accum);
int temp0, temp1, temp3, temp4, temp5, loop_end;
const int temp2 = (int)wrk->fxy_scale;
const int temp6 = x_out_max << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
assert(wrk->fxy_scale != 0);
if (yscale) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"mult %[temp3], %[temp4] \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"maddu %[temp0], %[yscale] \n\t"
"mfhi %[temp1] \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"subu %[temp0], %[temp0], %[temp1] \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sw %[temp1], -4(%[irow]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
} else {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[irow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sw $zero, -4(%[irow]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[irow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPRescalerDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
WebPRescalerImportRowExpand = ImportRowExpand;
WebPRescalerImportRowShrink = ImportRowShrink;
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
#endif // WEBP_USE_MIPS32

314
drivers/webp/dsp/rescaler_mips_dsp_r2.c Normal file

@ -0,0 +1,314 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of rescaling functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
#include "../utils/rescaler.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
//------------------------------------------------------------------------------
// Row export
static void ExportRowShrink(WebPRescaler* const wrk) {
int i;
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const rescaler_t* frow = wrk->frow;
const int yscale = wrk->fy_scale * (-wrk->y_accum);
int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
const int temp7 = (int)wrk->fxy_scale;
const int temp6 = (x_out_max & ~0x3) << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
assert(wrk->fxy_scale != 0);
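// The MIPS DSP-R2 code below keeps four MULT_FIX pipelines in flight, one
// per accumulator ($ac0..$ac3), producing four output pixels per asm
// iteration; the short C loops after each asm block handle the remaining
// x_out_max % 4 pixels with the scalar MULT_FIX.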
if (yscale) {
if (x_out_max >= 4) {
int temp8, temp9, temp10, temp11;
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp5], 12(%[frow]) \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp0], %[yscale] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp1], %[yscale] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp2], %[yscale] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp5], %[yscale] \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp5], $ac3 \n\t"
"lw %[temp8], 0(%[irow]) \n\t"
"lw %[temp9], 4(%[irow]) \n\t"
"lw %[temp10], 8(%[irow]) \n\t"
"lw %[temp11], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"subu %[temp8], %[temp8], %[temp0] \n\t"
"subu %[temp9], %[temp9], %[temp1] \n\t"
"subu %[temp10], %[temp10], %[temp2] \n\t"
"subu %[temp11], %[temp11], %[temp5] \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp8], %[temp7] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp9], %[temp7] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp10], %[temp7] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp11], %[temp7] \n\t"
"mfhi %[temp8], $ac0 \n\t"
"mfhi %[temp9], $ac1 \n\t"
"mfhi %[temp10], $ac2 \n\t"
"mfhi %[temp11], $ac3 \n\t"
"sw %[temp0], -16(%[irow]) \n\t"
"sw %[temp1], -12(%[irow]) \n\t"
"sw %[temp2], -8(%[irow]) \n\t"
"sw %[temp5], -4(%[irow]) \n\t"
"sb %[temp8], -4(%[dst]) \n\t"
"sb %[temp9], -3(%[dst]) \n\t"
"sb %[temp10], -2(%[dst]) \n\t"
"sb %[temp11], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
[temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
[temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale);
const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
assert(v >= 0 && v <= 255);
*dst++ = v;
*irow++ = frac; // new fractional start
}
} else {
if (x_out_max >= 4) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[irow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"lw %[temp1], 4(%[irow]) \n\t"
"lw %[temp2], 8(%[irow]) \n\t"
"lw %[temp5], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp5], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp5], $ac3 \n\t"
"sw $zero, -16(%[irow]) \n\t"
"sw $zero, -12(%[irow]) \n\t"
"sw $zero, -8(%[irow]) \n\t"
"sw $zero, -4(%[irow]) \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[irow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const int v = (int)MULT_FIX(*irow, wrk->fxy_scale);
assert(v >= 0 && v <= 255);
*dst++ = v;
*irow++ = 0;
}
}
}
static void ExportRowExpand(WebPRescaler* const wrk) {
int i;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
const int temp6 = (x_out_max & ~0x3) << 2;
const int temp7 = (int)wrk->fy_scale;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
if (x_out_max >= 4) {
__asm__ volatile (
"li %[temp4], 0x10000 \n\t"
"li %[temp5], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp3], 12(%[frow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"mult $ac0, %[temp4], %[temp5] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp4], %[temp5] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp4], %[temp5] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp4], %[temp5] \n\t"
"maddu $ac3, %[temp3], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp3], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint32_t J = *frow++;
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
*dst++ = v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
if (x_out_max >= 4) {
int temp8, temp9, temp10, temp11;
__asm__ volatile (
"li %[temp8], 0x10000 \n\t"
"li %[temp9], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp3], 12(%[frow]) \n\t"
"lw %[temp4], 0(%[irow]) \n\t"
"lw %[temp5], 4(%[irow]) \n\t"
"lw %[temp10], 8(%[irow]) \n\t"
"lw %[temp11], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"mult $ac0, %[temp8], %[temp9] \n\t"
"maddu $ac0, %[A], %[temp0] \n\t"
"maddu $ac0, %[B], %[temp4] \n\t"
"mult $ac1, %[temp8], %[temp9] \n\t"
"maddu $ac1, %[A], %[temp1] \n\t"
"maddu $ac1, %[B], %[temp5] \n\t"
"mult $ac2, %[temp8], %[temp9] \n\t"
"maddu $ac2, %[A], %[temp2] \n\t"
"maddu $ac2, %[B], %[temp10] \n\t"
"mult $ac3, %[temp8], %[temp9] \n\t"
"maddu $ac3, %[A], %[temp3] \n\t"
"maddu $ac3, %[B], %[temp11] \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"mult $ac0, %[temp8], %[temp9] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp8], %[temp9] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp8], %[temp9] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp8], %[temp9] \n\t"
"maddu $ac3, %[temp3], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp3], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
[temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
[temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint64_t I = (uint64_t)A * *frow++
+ (uint64_t)B * *irow++;
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
*dst++ = v;
}
}
}
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
// Entry point
extern void WebPRescalerDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

186
drivers/webp/dsp/rescaler_neon.c Normal file

@ -0,0 +1,186 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include <assert.h>
#include "./neon.h"
#include "../utils/rescaler.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC))
#define LOAD_32x8(SRC, DST0, DST1) \
LOAD_32x4(SRC + 0, DST0); \
LOAD_32x4(SRC + 4, DST1)
#define STORE_32x8(SRC0, SRC1, DST) do { \
vst1q_u32((DST) + 0, SRC0); \
vst1q_u32((DST) + 4, SRC1); \
} while (0)
#if (WEBP_RESCALER_RFIX == 32)
#define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1))
#define MULT_FIX(A, B) /* note: B is actually scale>>1. See MAKE_HALF_CST */ \
vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
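// Why the half constant works: vqrdmulhq_s32 returns the high 32 bits of
// (a * b * 2) with rounding, so passing the scale pre-shifted right by one
// (MAKE_HALF_CST) cancels the doubling and yields (a * scale + round) >> 32,
// matching MULT_FIX_C when WEBP_RESCALER_RFIX == 32.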
#else
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
static uint32x4_t Interpolate(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
const uint64x2_t C1 = vmull_n_u32(vget_high_u32(A0), A);
const uint64x2_t D0 = vmlal_n_u32(C0, vget_low_u32(B0), B);
const uint64x2_t D1 = vmlal_n_u32(C1, vget_high_u32(B0), B);
const uint32x4_t E = vcombine_u32(
vrshrn_n_u64(D0, WEBP_RESCALER_RFIX),
vrshrn_n_u64(D1, WEBP_RESCALER_RFIX));
return E;
}
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int max_span = x_out_max & ~7;
const rescaler_t* const frow = wrk->frow;
const uint32_t fy_scale = wrk->fy_scale;
const int32x4_t fy_scale_half = MAKE_HALF_CST(fy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x4(frow + x_out + 0, A0);
LOAD_32x4(frow + x_out + 4, A1);
const uint32x4_t B0 = MULT_FIX(A0, fy_scale_half);
const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
const uint16x4_t C0 = vmovn_u32(B0);
const uint16x4_t C1 = vmovn_u32(B1);
const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1));
vst1_u8(dst + x_out, D);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX_C(J, fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < max_span; x_out += 8) {
const uint32x4_t C0 =
Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
const uint32x4_t C1 =
Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
const uint16x4_t E1 = vmovn_u32(D1);
const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1));
vst1_u8(dst + x_out, F);
}
for (; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX_C(J, fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
}
}
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int max_span = x_out_max & ~7;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
const uint32_t fxy_scale = wrk->fxy_scale;
const uint32x4_t zero = vdupq_n_u32(0);
const int32x4_t yscale_half = MAKE_HALF_CST(yscale);
const int32x4_t fxy_scale_half = MAKE_HALF_CST(fxy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(frow + x_out, in0, in1);
LOAD_32x8(irow + x_out, in2, in3);
const uint32x4_t A0 = MULT_FIX(in0, yscale_half);
const uint32x4_t A1 = MULT_FIX(in1, yscale_half);
const uint32x4_t B0 = vqsubq_u32(in2, A0);
const uint32x4_t B1 = vqsubq_u32(in3, A1);
const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
const uint16x4_t D0 = vmovn_u32(C0);
const uint16x4_t D1 = vmovn_u32(C1);
const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1));
vst1_u8(dst + x_out, E);
STORE_32x8(A0, A1, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale);
const int v = (int)MULT_FIX_C(irow[x_out] - frac, wrk->fxy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(irow + x_out, in0, in1);
const uint32x4_t A0 = MULT_FIX(in0, fxy_scale_half);
const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
const uint16x4_t B0 = vmovn_u32(A0);
const uint16x4_t B1 = vmovn_u32(A1);
const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1));
vst1_u8(dst + x_out, C);
STORE_32x8(zero, zero, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = 0;
}
}
}
//------------------------------------------------------------------------------
extern void WebPRescalerDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand;
WebPRescalerExportRowShrink = RescalerExportRowShrink;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPRescalerDspInitNEON)
#endif // WEBP_USE_NEON

373
drivers/webp/dsp/rescaler_sse2.c Normal file

@ -0,0 +1,373 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 Rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include <assert.h>
#include "../utils/rescaler.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
*out = _mm_unpacklo_epi16(B, C);
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadEightPixels(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
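// The interleaved layouts above let a single _mm_madd_epi16 against the
// 16-bit pair (accum, x_add - accum) compute, per 32-bit lane,
//   left * accum + right * (x_add - accum),
// which equals the C expansion formula
//   right * x_add + (left - right) * accum.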
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
int accum = x_add;
__m128i cur_pixels;
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpandC(wrk, src);
return;
}
LoadTwoPixels(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
_mm_storeu_si128((__m128i*)frow, out);
frow += 4;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
LoadTwoPixels(src, &cur_pixels);
src += 4;
accum += x_add;
}
}
} else {
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpandC(wrk, src);
return;
}
LoadEightPixels(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
*(uint32_t*)frow = _mm_cvtsi128_si32(out);
frow += 1;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadEightPixels(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
cur_pixels = _mm_srli_si128(cur_pixels, 2);
cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
src += 1;
left = 1;
}
accum += x_add;
}
}
}
assert(accum == 0);
}
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
const __m128i mult0 = _mm_set1_epi16(x_sub);
const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
__m128i sum = zero;
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
WebPRescalerImportRowShrinkC(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
assert(!wrk->x_expand);
for (; frow < frow_end; frow += 4) {
__m128i base = zero;
accum += wrk->x_add;
while (accum > 0) {
const __m128i A = _mm_cvtsi32_si128(*(int*)src);
src += 4;
base = _mm_unpacklo_epi8(A, zero);
// To avoid overflow, we need: base * x_add / x_sub < 32768
// => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
sum = _mm_add_epi16(sum, base);
accum -= x_sub;
}
{ // Emit next horizontal pixel.
const __m128i mult = _mm_set1_epi16(-accum);
const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b
const __m128i frac1 = _mm_mulhi_epu16(base, mult);
const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b
const __m128i A0 = _mm_mullo_epi16(sum, mult0);
const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub
const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac
const __m128i D0 = _mm_srli_epi64(frac, 32);
const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b
const __m128i D2 = _mm_mul_epu32(D0, mult1);
const __m128i E1 = _mm_add_epi64(D1, rounder);
const __m128i E2 = _mm_add_epi64(D2, rounder);
const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
const __m128i G = _mm_unpacklo_epi32(F1, F2);
sum = _mm_packs_epi32(G, zero);
_mm_storeu_si128((__m128i*)frow, frow_out);
}
}
assert(accum == 0);
}
//------------------------------------------------------------------------------
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
const __m128i A3 = _mm_srli_epi64(A1, 32);
if (mult != NULL) {
*out0 = _mm_mul_epu32(A0, *mult);
*out1 = _mm_mul_epu32(A1, *mult);
*out2 = _mm_mul_epu32(A2, *mult);
*out3 = _mm_mul_epu32(A3, *mult);
} else {
*out0 = A0;
*out1 = A1;
*out2 = A2;
*out3 = A3;
}
}
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
const __m128i B1 = _mm_mul_epu32(*A1, *mult);
const __m128i B2 = _mm_mul_epu32(*A2, *mult);
const __m128i B3 = _mm_mul_epu32(*A3, *mult);
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
const __m128i C2 = _mm_add_epi64(B2, rounder);
const __m128i C3 = _mm_add_epi64(B3, rounder);
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
const __m128i D2 =
_mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
const __m128i D3 =
_mm_and_si128(_mm_slli_epi64(C3, 32 - WEBP_RESCALER_RFIX), mask);
#else
const __m128i D2 = _mm_and_si128(C2, mask);
const __m128i D3 = _mm_and_si128(C3, mask);
#endif
const __m128i E0 = _mm_or_si128(D0, D2);
const __m128i E1 = _mm_or_si128(D1, D3);
const __m128i F = _mm_packs_epi32(E0, E1);
const __m128i G = _mm_packus_epi16(F, F);
_mm_storel_epi64((__m128i*)dst, G);
}
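// ProcessRow summary: the four inputs hold eight values as 64-bit lanes
// (even-indexed pixels in *A0/*A1, odd-indexed in *A2/*A3). Each lane is
// scaled by 'mult', rounded, shifted down by WEBP_RESCALER_RFIX, then the
// even/odd halves are merged back and narrowed twice with saturation
// (_mm_packs_epi32 + _mm_packus_epi16) into eight output bytes.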
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0);
assert(wrk->y_expand);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
const __m128i mA = _mm_set_epi32(0, A, 0, A);
const __m128i mB = _mm_set_epi32(0, B, 0, B);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(A0, B0);
const __m128i C1 = _mm_add_epi64(A1, B1);
const __m128i C2 = _mm_add_epi64(A2, B2);
const __m128i C3 = _mm_add_epi64(A3, B3);
const __m128i D0 = _mm_add_epi64(C0, rounder);
const __m128i D1 = _mm_add_epi64(C1, rounder);
const __m128i D2 = _mm_add_epi64(C2, rounder);
const __m128i D3 = _mm_add_epi64(C3, rounder);
const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX);
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
}
}
}
static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
const int scale_xy = wrk->fxy_scale;
const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
const __m128i C2 = _mm_add_epi64(B2, rounder);
const __m128i C3 = _mm_add_epi64(B3, rounder);
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); // = frac
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac
const __m128i E1 = _mm_sub_epi64(A1, D1);
const __m128i E2 = _mm_sub_epi64(A2, D2);
const __m128i E3 = _mm_sub_epi64(A3, D3);
const __m128i F2 = _mm_slli_epi64(D2, 32);
const __m128i F3 = _mm_slli_epi64(D3, 32);
const __m128i G0 = _mm_or_si128(D0, F2);
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = frac; // new fractional start
}
} else {
const uint32_t scale = wrk->fxy_scale;
const __m128i mult = _mm_set_epi32(0, scale, 0, scale);
const __m128i zero = _mm_setzero_si128();
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
assert(v >= 0 && v <= 255);
dst[x_out] = v;
irow[x_out] = 0;
}
}
}
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPRescalerDspInitSSE2)
#endif // WEBP_USE_SSE2

282
drivers/webp/dsp/upsampling_mips_dsp_r2.c Normal file

@ -0,0 +1,282 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV to RGB upsampling functions.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
#include "./yuv.h"
#if !defined(WEBP_YUV_USE_TABLE)
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
const int t1 = kYScale * Y; \
const int t2 = kVToG * V; \
R = kVToR * V; \
G = kUToG * U; \
B = kUToB * U; \
R = t1 + R; \
G = t1 - G; \
B = t1 + B; \
R = R + kRCst; \
G = G - t2 + kGCst; \
B = B + kBCst; \
__asm__ volatile ( \
"shll_s.w %[" #R "], %[" #R "], 9 \n\t" \
"shll_s.w %[" #G "], %[" #G "], 9 \n\t" \
"shll_s.w %[" #B "], %[" #B "], 9 \n\t" \
"precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \
"precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \
"precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \
"srl %[" #R "], %[" #R "], 24 \n\t" \
"srl %[" #G "], %[" #G "], 24 \n\t" \
"srl %[" #B "], %[" #B "], 24 \n\t" \
: [R]"+r"(R), [G]"+r"(G), [B]"+r"(B) \
: \
); \
} while (0)
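// The three saturating instructions at the end implement the descale/clip
// step: shll_s.w saturates the fixed-point sum after a left shift, and
// precrqu_s.qb.ph followed by srl extracts the resulting byte, clamping the
// channel to [0, 255] without explicit comparisons.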
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
rgb[0] = r;
rgb[1] = g;
rgb[2] = b;
}
static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
bgr[0] = b;
bgr[1] = g;
bgr[2] = r;
}
static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
{
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
rgb[0] = gb;
rgb[1] = rg;
#else
rgb[0] = rg;
rgb[1] = gb;
#endif
}
}
static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
uint8_t* const argb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
{
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
argb[0] = ba;
argb[1] = rg;
#else
argb[0] = rg;
argb[1] = ba;
#endif
}
}
#endif // WEBP_YUV_USE_TABLE
//-----------------------------------------------------------------------------
// Alpha handling variants
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
argb[0] = 0xff;
argb[1] = r;
argb[2] = g;
argb[3] = b;
}
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
bgra[0] = b;
bgra[1] = g;
bgra[2] = r;
bgra[3] = 0xff;
}
static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const rgba) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
rgba[0] = r;
rgba[1] = g;
rgba[2] = b;
rgba[3] = 0xff;
}
//------------------------------------------------------------------------------
// Fancy upsampler
#ifdef FANCY_UPSAMPLING
// Given samples laid out in a square as:
// [a b]
// [c d]
// we interpolate u/v as:
// ([9*a + 3*b + 3*c +   d   3*a + 9*b +   c + 3*d] + [8 8]) / 16
// ([3*a +   b + 9*c + 3*d     a + 3*b + 3*c + 9*d] + [8 8]) / 16
// We process u and v together stashed into 32bit (16bit each).
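// Added note: e.g. with a = 10, b = 20, c = 30, d = 40 the top-left output is
// (9*10 + 3*20 + 3*30 + 40 + 8) / 16 = 288 / 16 = 18; the weights always sum
// to 16, so the '+8' rounds to nearest. Packing u and v 16 bits apart is safe
// because every per-channel intermediate below stays under 2^12, so the two
// halves never carry into each other.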
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
assert(top_y != NULL); \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
} \
for (x = 1; x <= last_pixel_pair; ++x) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
/* precompute invariant values associated with first and second diagonals*/\
const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
{ \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
} \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (!(len & 1)) { \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
} \
} \
}
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#undef LOAD_UV
#undef UPSAMPLE_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitUpsamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
#undef YUV444_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
#endif

261
drivers/webp/dsp/upsampling_neon.c Normal file

@@ -0,0 +1,261 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of YUV to RGB upsampling functions.
//
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
#include "./neon.h"
#include "./yuv.h"
#ifdef FANCY_UPSAMPLING
//-----------------------------------------------------------------------------
// U/V upsampling
// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
#define UPSAMPLE_16PIXELS(r1, r2, out) { \
uint8x8_t a = vld1_u8(r1); \
uint8x8_t b = vld1_u8(r1 + 1); \
uint8x8_t c = vld1_u8(r2); \
uint8x8_t d = vld1_u8(r2 + 1); \
\
uint16x8_t al = vshll_n_u8(a, 1); \
uint16x8_t bl = vshll_n_u8(b, 1); \
uint16x8_t cl = vshll_n_u8(c, 1); \
uint16x8_t dl = vshll_n_u8(d, 1); \
\
uint8x8_t diag1, diag2; \
uint16x8_t sl; \
\
/* a + b + c + d */ \
sl = vaddl_u8(a, b); \
sl = vaddw_u8(sl, c); \
sl = vaddw_u8(sl, d); \
\
al = vaddq_u16(sl, al); /* 3a + b + c + d */ \
bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \
\
al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \
bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \
\
diag2 = vshrn_n_u16(al, 3); \
diag1 = vshrn_n_u16(bl, 3); \
\
a = vrhadd_u8(a, diag1); \
b = vrhadd_u8(b, diag2); \
c = vrhadd_u8(c, diag2); \
d = vrhadd_u8(d, diag1); \
\
{ \
uint8x8x2_t a_b, c_d; \
INIT_VECTOR2(a_b, a, b); \
INIT_VECTOR2(c_d, c, d); \
vst2_u8(out, a_b); \
vst2_u8(out + 32, c_d); \
} \
}
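// Added note: the two vrhadd_u8() rounding half-adds above implement the
// 9-3-3-1 filter in two stages; e.g. for 'a':
//   diag1 = (a + 3*b + 3*c + d) >> 3, then a' = (a + diag1 + 1) >> 1,
// which matches (9*a + 3*b + 3*c + d + 8) / 16 up to the rounding lost in
// the first shift.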
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
uint8_t *out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[9], r2[9]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
Upsample16Pixels(r1, r2, out); \
}
//-----------------------------------------------------------------------------
// YUV->RGB conversion
static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
#define v255 vdup_n_u8(255)
#define STORE_Rgb(out, r, g, b) do { \
uint8x8x3_t r_g_b; \
INIT_VECTOR3(r_g_b, r, g, b); \
vst3_u8(out, r_g_b); \
} while (0)
#define STORE_Bgr(out, r, g, b) do { \
uint8x8x3_t b_g_r; \
INIT_VECTOR3(b_g_r, b, g, r); \
vst3_u8(out, b_g_r); \
} while (0)
#define STORE_Rgba(out, r, g, b) do { \
uint8x8x4_t r_g_b_v255; \
INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \
vst4_u8(out, r_g_b_v255); \
} while (0)
#define STORE_Bgra(out, r, g, b) do { \
uint8x8x4_t b_g_r_v255; \
INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \
vst4_u8(out, b_g_r_v255); \
} while (0)
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
int i; \
for (i = 0; i < N; i += 8) { \
const int off = ((cur_x) + i) * XSTEP; \
uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
uint8x8_t u = vld1_u8((src_uv) + i); \
uint8x8_t v = vld1_u8((src_uv) + i + 16); \
const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \
int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \
const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\
const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \
int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \
const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \
const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \
gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \
gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \
yl = vmlaq_lane_s32(yl, bl, cf32, 0); \
yh = vmlaq_lane_s32(yh, bh, cf32, 0); \
/* vrshrn_n_s32() already incorporates the rounding constant */ \
y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \
vrshrn_n_s32(rh, YUV_FIX2))); \
u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \
vrshrn_n_s32(gh, YUV_FIX2))); \
v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \
vrshrn_n_s32(yh, YUV_FIX2))); \
STORE_ ## FMT(out + off, y, u, v); \
} \
}
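// Added note: on exit from the loop body above, the 'y', 'u' and 'v'
// registers no longer hold luma/chroma; they are reused to carry the packed
// R, G and B bytes that the STORE_ ## FMT macro interleaves into memory.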
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
int i; \
for (i = 0; i < N; i++) { \
const int off = ((cur_x) + i) * XSTEP; \
const int y = src_y[(cur_x) + i]; \
const int u = (src_uv)[i]; \
const int v = (src_uv)[i + 16]; \
FUNC(y, u, v, rgb + off); \
} \
}
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \
if (bottom_y != NULL) { \
CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \
} \
}
#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \
if (bottom_y != NULL) { \
CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
} \
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
const uint8_t *top_u, const uint8_t *top_v, \
const uint8_t *cur_u, const uint8_t *cur_v, \
uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
/* 9 pixels must be readable for each block */ \
const int num_blocks = (uv_len - 1) >> 3; \
const int leftover = uv_len - num_blocks * 8; \
const int last_pos = 1 + 16 * num_blocks; \
\
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t cf16 = vld1_s16(kCoeffs); \
const int32x2_t cf32 = vdup_n_s32(kUToB); \
const uint8x8_t u16 = vdup_n_u8(16); \
const uint8x8_t u128 = vdup_n_u8(128); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
{ \
const int u0 = (top_u[0] + u_diag) >> 1; \
const int v0 = (top_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
} \
if (bottom_y != NULL) { \
const int u0 = (cur_u[0] + u_diag) >> 1; \
const int v0 = (cur_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
} \
\
for (block = 0; block < num_blocks; ++block) { \
UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, 16 * block + 1, 16); \
top_u += 8; \
cur_u += 8; \
top_v += 8; \
cur_v += 8; \
} \
\
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, last_pos, len - last_pos); \
}
// NEON variants of the fancy upsampler.
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
}
#endif // FANCY_UPSAMPLING
#endif // WEBP_USE_NEON
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_NEON))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersNEON)
#endif

103
drivers/webp/dsp/yuv_mips32.c Normal file

@@ -0,0 +1,103 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of YUV to RGB upsampling functions.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i, r, g, b; \
int temp0, temp1, temp2, temp3, temp4; \
for (i = 0; i < (len >> 1); i++) { \
temp1 = kVToR * v[0]; \
temp3 = kVToG * v[0]; \
temp2 = kUToG * u[0]; \
temp4 = kUToB * u[0]; \
temp0 = kYScale * y[0]; \
temp1 += kRCst; \
temp3 -= kGCst; \
temp2 += temp3; \
temp4 += kBCst; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
temp0 = kYScale * y[1]; \
dst[R] = r; \
dst[G] = g; \
dst[B] = b; \
if (A) dst[A] = 0xff; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
dst[R + XSTEP] = r; \
dst[G + XSTEP] = g; \
dst[B + XSTEP] = b; \
if (A) dst[A + XSTEP] = 0xff; \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
} \
if (len & 1) { \
temp1 = kVToR * v[0]; \
temp3 = kVToG * v[0]; \
temp2 = kUToG * u[0]; \
temp4 = kUToB * u[0]; \
temp0 = kYScale * y[0]; \
temp1 += kRCst; \
temp3 -= kGCst; \
temp2 += temp3; \
temp4 += kBCst; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
dst[R] = r; \
dst[G] = g; \
dst[B] = b; \
if (A) dst[A] = 0xff; \
} \
}
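// Added note: in the (XSTEP, R, G, B, A) parameters below, XSTEP is the
// per-pixel byte stride and R/G/B/A are channel offsets; A == 0 doubles as
// "no alpha channel", which works because none of the layouts instantiated
// here stores alpha at offset 0.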
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
#undef ROW_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
#endif // WEBP_USE_MIPS32

134
drivers/webp/dsp/yuv_mips_dsp_r2.c Normal file

@@ -0,0 +1,134 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS DSPr2 version of YUV to RGB upsampling functions.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
#define ROW_FUNC_PART_1() \
"lbu %[temp3], 0(%[v]) \n\t" \
"lbu %[temp4], 0(%[u]) \n\t" \
"lbu %[temp0], 0(%[y]) \n\t" \
"mul %[temp1], %[t_con_1], %[temp3] \n\t" \
"mul %[temp3], %[t_con_2], %[temp3] \n\t" \
"mul %[temp2], %[t_con_3], %[temp4] \n\t" \
"mul %[temp4], %[t_con_4], %[temp4] \n\t" \
"mul %[temp0], %[t_con_5], %[temp0] \n\t" \
"addu %[temp1], %[temp1], %[t_con_6] \n\t" \
"subu %[temp3], %[temp3], %[t_con_7] \n\t" \
"addu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp4], %[temp4], %[t_con_8] \n\t" \
#define ROW_FUNC_PART_2(R, G, B, K) \
"addu %[temp5], %[temp0], %[temp1] \n\t" \
"subu %[temp6], %[temp0], %[temp2] \n\t" \
"addu %[temp7], %[temp0], %[temp4] \n\t" \
".if " #K " \n\t" \
"lbu %[temp0], 1(%[y]) \n\t" \
".endif \n\t" \
"shll_s.w %[temp5], %[temp5], 9 \n\t" \
"shll_s.w %[temp6], %[temp6], 9 \n\t" \
".if " #K " \n\t" \
"mul %[temp0], %[t_con_5], %[temp0] \n\t" \
".endif \n\t" \
"shll_s.w %[temp7], %[temp7], 9 \n\t" \
"precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \
"precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \
"precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \
"srl %[temp5], %[temp5], 24 \n\t" \
"srl %[temp6], %[temp6], 24 \n\t" \
"srl %[temp7], %[temp7], 24 \n\t" \
"sb %[temp5], " #R "(%[dst]) \n\t" \
"sb %[temp6], " #G "(%[dst]) \n\t" \
"sb %[temp7], " #B "(%[dst]) \n\t" \
#define ASM_CLOBBER_LIST() \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7) \
: [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3), \
[t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6), \
[u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst), \
[t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8) \
: "memory", "hi", "lo" \
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
const int t_con_1 = kVToR; \
const int t_con_2 = kVToG; \
const int t_con_3 = kUToG; \
const int t_con_4 = kUToB; \
const int t_con_5 = kYScale; \
const int t_con_6 = kRCst; \
const int t_con_7 = kGCst; \
const int t_con_8 = kBCst; \
for (i = 0; i < (len >> 1); i++) { \
__asm__ volatile ( \
ROW_FUNC_PART_1() \
ROW_FUNC_PART_2(R, G, B, 1) \
ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0) \
ASM_CLOBBER_LIST() \
); \
if (A) dst[A] = dst[A + XSTEP] = 0xff; \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
} \
if (len & 1) { \
__asm__ volatile ( \
ROW_FUNC_PART_1() \
ROW_FUNC_PART_2(R, G, B, 0) \
ASM_CLOBBER_LIST() \
); \
if (A) dst[A] = 0xff; \
} \
}
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
#undef ROW_FUNC
#undef ASM_CLOBBER_LIST
#undef ROW_FUNC_PART_2
#undef ROW_FUNC_PART_1
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

606
drivers/webp/dsp/yuv_sse2.c Normal file

@@ -0,0 +1,606 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include <string.h> // for memcpy
typedef union { // handy struct for converting SSE2 registers
int32_t i32[4];
uint8_t u8[16];
__m128i m;
} VP8kCstSSE2;
#if defined(WEBP_YUV_USE_SSE2_TABLES)
#include "./yuv_tables_sse2.h"
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInitSSE2(void) {}
#else
static int done_sse2 = 0;
static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInitSSE2(void) {
if (!done_sse2) {
int i;
for (i = 0; i < 256; ++i) {
VP8kYtoRGBA[i].i32[0] =
VP8kYtoRGBA[i].i32[1] =
VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
VP8kUtoRGBA[i].i32[0] = 0;
VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
VP8kUtoRGBA[i].i32[3] = 0;
VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
VP8kVtoRGBA[i].i32[2] = 0;
VP8kVtoRGBA[i].i32[3] = 0;
}
done_sse2 = 1;
#if 0 // code used to generate 'yuv_tables_sse2.h'
printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
for (i = 0; i < 256; ++i) {
printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
}
printf("};\n\n");
printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
for (i = 0; i < 256; ++i) {
printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n",
VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
}
printf("};\n\n");
printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
for (i = 0; i < 256; ++i) {
printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n",
VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
}
printf("};\n\n");
#endif
}
}
#endif // WEBP_YUV_USE_SSE2_TABLES
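// Added note: a hypothetical scalar equivalent (a sketch only, not part of
// the original sources) of the table path implemented by GetRGBA32bWithUV()
// below: each output channel is the sum of one entry from each of the three
// tables, descaled by YUV_FIX2; lane 3 of the Y table carries a constant
// 0xff alpha.
#if 0
static void YuvToRgbaScalar(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t rgba[4]) {
  int i;
  for (i = 0; i < 4; ++i) {   // R, G, B, A lanes
    const int32_t sum = VP8kYtoRGBA[y].i32[i] + VP8kUtoRGBA[u].i32[i] +
                        VP8kVtoRGBA[v].i32[i];
    const int32_t val = sum >> YUV_FIX2;
    rgba[i] = (val < 0) ? 0 : (val > 255) ? 255 : (uint8_t)val;
  }
}
#endif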
//-----------------------------------------------------------------------------
static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
const __m128i uv_part = _mm_add_epi32(u_part, v_part);
return uv_part;
}
static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
return rgba2;
}
static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
const __m128i uv_part = LoadUVPart(u, v);
return GetRGBA32bWithUV(y, uv_part);
}
static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const rgb) {
const __m128i tmp0 = GetRGBA32b(y, u, v);
const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
// Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
_mm_storel_epi64((__m128i*)rgb, tmp2);
}
static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgr) {
const __m128i tmp0 = GetRGBA32b(y, u, v);
const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
// Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
_mm_storel_epi64((__m128i*)bgr, tmp3);
}
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 4) {
const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
_mm_storeu_si128((__m128i*)dst, tmp2);
dst += 4 * 4;
}
}
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 2) {
const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
_mm_storel_epi64((__m128i*)dst, tmp3);
dst += 4 * 2;
}
}
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
uint8_t tmp0[2 * 3 + 5 + 15];
uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory
YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
}
// Last two pixels are special: we write in a tmp buffer before sending
// to dst.
YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
memcpy(dst + n * 3, tmp, 2 * 3);
}
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
uint8_t tmp0[2 * 3 + 5 + 15];
uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
for (n = 0; n < 30; ++n) {
YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
}
YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
memcpy(dst + n * 3, tmp, 2 * 3);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 4 <= len; n += 4) {
const __m128i uv_0 = LoadUVPart(u[0], v[0]);
const __m128i uv_1 = LoadUVPart(u[1], v[1]);
const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1);
const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1);
const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
_mm_storeu_si128((__m128i*)dst, tmp2);
dst += 4 * 4;
y += 4;
u += 2;
v += 2;
}
// Finish off
while (n < len) {
VP8YuvToRgba(y[0], u[0], v[0], dst);
dst += 4;
++y;
u += (n & 1);
v += (n & 1);
++n;
}
}
static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 2 <= len; n += 2) {
const __m128i uv_0 = LoadUVPart(u[0], v[0]);
const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
_mm_storel_epi64((__m128i*)dst, tmp3);
dst += 4 * 2;
y += 2;
++u;
++v;
}
// Finish off
if (len & 1) {
VP8YuvToBgra(y[0], u[0], v[0], dst);
}
}
static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 2 <= len; n += 2) {
const __m128i uv_0 = LoadUVPart(u[0], v[0]);
const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
_mm_storel_epi64((__m128i*)dst, tmp3);
dst += 4 * 2;
y += 2;
++u;
++v;
}
// Finish off
if (len & 1) {
VP8YuvToArgb(y[0], u[0], v[0], dst);
}
}
static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
dst += 3;
++y;
u += (n & 1);
v += (n & 1);
}
VP8YuvToRgb(y[0], u[0], v[0], dst);
if (len > 1) {
VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
}
}
static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
dst += 3;
++y;
u += (n & 1);
v += (n & 1);
}
VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
if (len > 1) {
VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_ARGB] = YuvToArgbRow;
}
//------------------------------------------------------------------------------
// RGB24/32 -> YUV converters
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
// Convert 8 packed RGB or BGR samples to r[], g[], b[]
static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
__m128i* const r,
__m128i* const g,
__m128i* const b,
int input_is_bgr) {
const __m128i zero = _mm_setzero_si128();
// in0: r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5
// in1: b2 r3 g3 b3 | r4 g4 b4 r5 | g5 b5 r6 g6 | b6 r7 g7 b7
const __m128i in0 = LOAD_16(rgb + 0);
const __m128i in1 = LOAD_16(rgb + 8);
// A0: | r2 g2 b2 r3 | g3 b3 r4 g4 | b4 r5 ...
// A1: ... b2 r3 | g3 b3 r4 g4 | b4 r5 g5 b5 |
const __m128i A0 = _mm_srli_si128(in0, 6);
const __m128i A1 = _mm_slli_si128(in1, 6);
// B0: r0 r2 g0 g2 | b0 b2 r1 r3 | g1 g3 b1 b3 | r2 r4 b2 b4
// B1: g3 g5 b3 b5 | r4 r6 g4 g6 | b4 b6 r5 r7 | g5 g7 b5 b7
const __m128i B0 = _mm_unpacklo_epi8(in0, A0);
const __m128i B1 = _mm_unpackhi_epi8(A1, in1);
// C0: r1 r3 g1 g3 | b1 b3 r2 r4 | b2 b4 ...
// C1: ... g3 g5 | b3 b5 r4 r6 | g4 g6 b4 b6
const __m128i C0 = _mm_srli_si128(B0, 6);
const __m128i C1 = _mm_slli_si128(B1, 6);
// D0: r0 r1 r2 r3 | g0 g1 g2 g3 | b0 b1 b2 b3 | r1 r2 r3 r4
// D1: b3 b4 b5 b6 | r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 |
const __m128i D0 = _mm_unpacklo_epi8(B0, C0);
const __m128i D1 = _mm_unpackhi_epi8(C1, B1);
// r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | 0
const __m128i D2 = _mm_srli_si128(D1, 4);
// r0 r1 r2 r3 | r4 r5 r6 r7 | g0 g1 g2 g3 | g4 g5 g6 g7
const __m128i E0 = _mm_unpacklo_epi32(D0, D2);
// b0 b1 b2 b3 | b4 b5 b6 b7 | r1 r2 r3 r4 | 0
const __m128i E1 = _mm_unpackhi_epi32(D0, D2);
// g0 g1 g2 g3 | g4 g5 g6 g7 | 0
const __m128i E2 = _mm_srli_si128(E0, 8);
const __m128i F0 = _mm_unpacklo_epi8(E0, zero); // -> R
const __m128i F1 = _mm_unpacklo_epi8(E1, zero); // -> B
const __m128i F2 = _mm_unpacklo_epi8(E2, zero); // -> G
*g = F2;
if (input_is_bgr) {
*r = F1;
*b = F0;
} else {
*r = F0;
*b = F1;
}
}
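// Added note: a hypothetical scalar reference (a sketch only, not part of the
// original sources) for the shuffle cascade above; the net effect is simply
// to deinterleave 8 packed RGB24 (or BGR24) pixels into three planes of
// 8 16-bit values each.
#if 0
static void RGB24PackedToPlanarScalar(const uint8_t* rgb, uint16_t r[8],
                                      uint16_t g[8], uint16_t b[8],
                                      int input_is_bgr) {
  int i;
  for (i = 0; i < 8; ++i) {
    r[i] = rgb[3 * i + (input_is_bgr ? 2 : 0)];
    g[i] = rgb[3 * i + 1];
    b[i] = rgb[3 * i + (input_is_bgr ? 0 : 2)];
  }
}
#endif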
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
__m128i* const r,
__m128i* const g,
__m128i* const b) {
const __m128i zero = _mm_setzero_si128();
const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0
const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4
// column-wise transpose
const __m128i A0 = _mm_unpacklo_epi8(in0, in1);
const __m128i A1 = _mm_unpackhi_epi8(in0, in1);
const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
// C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
// C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
// store 16b
*r = _mm_unpacklo_epi8(C1, zero);
*g = _mm_unpackhi_epi8(C0, zero);
*b = _mm_unpacklo_epi8(C0, zero);
}
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's implemented as a macro rather than a function because
// _mm_srai_epi32() requires an immediate (compile-time constant) shift count.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
ROUNDER, DESCALE_FIX, OUT) do { \
const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
(OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
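// Added note: the green weight is split across the two madd pairs because the
// full coefficient 33059 does not fit in a signed 16-bit lane: (33059 - 16384)
// rides in the RG product and the remaining 16384 in the GB product, so the
// sum works out to
//   Y = ((16839 * R + 33059 * G + 6420 * B + YUV_HALF) >> YUV_FIX) + 16.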
static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U, __m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
const __m128i kGB_v = MK_CST_16(-24116, -4684);
const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
kHALF_UV, YUV_FIX + 2, *U);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
kHALF_UV, YUV_FIX + 2, *V);
}
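// Added note: expanding the constants above,
//   U = -9719 * R - 19081 * G + 28800 * B and
//   V = 28800 * R - 24116 * G - 4684 * B,
// descaled by YUV_FIX + 2 because callers pass channel values at 4x scale
// (e.g. the doubled pair sums produced by HorizontalAddPack() below), with
// the matching 128 bias pre-shifted into kHALF_UV.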
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16, rgb += 3 * 16) {
__m128i r, g, b, Y0, Y1;
RGB24PackedToPlanar(rgb + 0 * 8, &r, &g, &b, 0);
ConvertRGBToY(&r, &g, &b, &Y0);
RGB24PackedToPlanar(rgb + 3 * 8, &r, &g, &b, 0);
ConvertRGBToY(&r, &g, &b, &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
int i;
const int max_width = width & ~15;
for (i = 0; i < max_width; i += 16, bgr += 3 * 16) {
__m128i r, g, b, Y0, Y1;
RGB24PackedToPlanar(bgr + 0 * 8, &r, &g, &b, 1);
ConvertRGBToY(&r, &g, &b, &Y0);
RGB24PackedToPlanar(bgr + 3 * 8, &r, &g, &b, 1);
ConvertRGBToY(&r, &g, &b, &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i r, g, b, Y0, Y1;
RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);
ConvertRGBToY(&r, &g, &b, &Y0);
RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);
ConvertRGBToY(&r, &g, &b, &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
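// Added note: the doubling by k2 keeps the horizontally-summed pairs at a
// uniform 4x scale, which is exactly what ConvertRGBToUV() above expects
// with its YUV_FIX + 2 descale.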
static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;
RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);
RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);
HorizontalAddPack(&r0, &r1, &r0);
HorizontalAddPack(&g0, &g1, &g0);
HorizontalAddPack(&b0, &b1, &b0);
ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);
RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);
HorizontalAddPack(&r0, &r1, &r0);
HorizontalAddPack(&g0, &g1, &g0);
HorizontalAddPack(&b0, &b1, &b0);
ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
if (!do_store) {
const __m128i prev_u = LOAD_16(u);
const __m128i prev_v = LOAD_16(v);
U0 = _mm_avg_epu8(U0, prev_u);
V0 = _mm_avg_epu8(V0, prev_v);
}
STORE_16(U0, u);
STORE_16(V0, v);
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
__m128i* const r,
__m128i* const g,
__m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
// column-wise transpose
const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 ..
const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x
const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 ..
const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x
*r = _mm_unpacklo_epi64(B0, B2);
*g = _mm_unpackhi_epi64(B0, B2);
*b = _mm_unpacklo_epi64(B1, B3);
}
static void ConvertRGBA32ToUV(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
v += 16;
rgb += 2 * 32;
}
if (max_width < width) { // left-over
WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertARGBToY = ConvertARGBToY;
WebPConvertARGBToUV = ConvertARGBToUV;
WebPConvertRGB24ToY = ConvertRGB24ToY;
WebPConvertBGR24ToY = ConvertBGR24ToY;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
#endif // WEBP_USE_SSE2

536
drivers/webp/dsp/yuv_tables_sse2.h Normal file

@@ -0,0 +1,536 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 tables for YUV->RGB conversion (12kB overall)
//
// Author: Skal (pascal.massimino@gmail.com)
// This file is not compiled, but #include'd directly from yuv_sse2.c
// Only used if WEBP_YUV_USE_SSE2_TABLES is defined.
static const VP8kCstSSE2 VP8kYtoRGBA[256] = {
{{0xfffb77b0, 0xfffb77b0, 0xfffb77b0, 0x003fc000}},
{{0xfffbc235, 0xfffbc235, 0xfffbc235, 0x003fc000}},
{{0xfffc0cba, 0xfffc0cba, 0xfffc0cba, 0x003fc000}},
{{0xfffc573f, 0xfffc573f, 0xfffc573f, 0x003fc000}},
{{0xfffca1c4, 0xfffca1c4, 0xfffca1c4, 0x003fc000}},
{{0xfffcec49, 0xfffcec49, 0xfffcec49, 0x003fc000}},
{{0xfffd36ce, 0xfffd36ce, 0xfffd36ce, 0x003fc000}},
{{0xfffd8153, 0xfffd8153, 0xfffd8153, 0x003fc000}},
{{0xfffdcbd8, 0xfffdcbd8, 0xfffdcbd8, 0x003fc000}},
{{0xfffe165d, 0xfffe165d, 0xfffe165d, 0x003fc000}},
{{0xfffe60e2, 0xfffe60e2, 0xfffe60e2, 0x003fc000}},
{{0xfffeab67, 0xfffeab67, 0xfffeab67, 0x003fc000}},
{{0xfffef5ec, 0xfffef5ec, 0xfffef5ec, 0x003fc000}},
{{0xffff4071, 0xffff4071, 0xffff4071, 0x003fc000}},
{{0xffff8af6, 0xffff8af6, 0xffff8af6, 0x003fc000}},
{{0xffffd57b, 0xffffd57b, 0xffffd57b, 0x003fc000}},
{{0x00002000, 0x00002000, 0x00002000, 0x003fc000}},
{{0x00006a85, 0x00006a85, 0x00006a85, 0x003fc000}},
{{0x0000b50a, 0x0000b50a, 0x0000b50a, 0x003fc000}},
{{0x0000ff8f, 0x0000ff8f, 0x0000ff8f, 0x003fc000}},
{{0x00014a14, 0x00014a14, 0x00014a14, 0x003fc000}},
{{0x00019499, 0x00019499, 0x00019499, 0x003fc000}},
{{0x0001df1e, 0x0001df1e, 0x0001df1e, 0x003fc000}},
{{0x000229a3, 0x000229a3, 0x000229a3, 0x003fc000}},
{{0x00027428, 0x00027428, 0x00027428, 0x003fc000}},
{{0x0002bead, 0x0002bead, 0x0002bead, 0x003fc000}},
{{0x00030932, 0x00030932, 0x00030932, 0x003fc000}},
{{0x000353b7, 0x000353b7, 0x000353b7, 0x003fc000}},
{{0x00039e3c, 0x00039e3c, 0x00039e3c, 0x003fc000}},
{{0x0003e8c1, 0x0003e8c1, 0x0003e8c1, 0x003fc000}},
{{0x00043346, 0x00043346, 0x00043346, 0x003fc000}},
{{0x00047dcb, 0x00047dcb, 0x00047dcb, 0x003fc000}},
{{0x0004c850, 0x0004c850, 0x0004c850, 0x003fc000}},
{{0x000512d5, 0x000512d5, 0x000512d5, 0x003fc000}},
{{0x00055d5a, 0x00055d5a, 0x00055d5a, 0x003fc000}},
{{0x0005a7df, 0x0005a7df, 0x0005a7df, 0x003fc000}},
{{0x0005f264, 0x0005f264, 0x0005f264, 0x003fc000}},
{{0x00063ce9, 0x00063ce9, 0x00063ce9, 0x003fc000}},
{{0x0006876e, 0x0006876e, 0x0006876e, 0x003fc000}},
{{0x0006d1f3, 0x0006d1f3, 0x0006d1f3, 0x003fc000}},
{{0x00071c78, 0x00071c78, 0x00071c78, 0x003fc000}},
{{0x000766fd, 0x000766fd, 0x000766fd, 0x003fc000}},
{{0x0007b182, 0x0007b182, 0x0007b182, 0x003fc000}},
{{0x0007fc07, 0x0007fc07, 0x0007fc07, 0x003fc000}},
{{0x0008468c, 0x0008468c, 0x0008468c, 0x003fc000}},
{{0x00089111, 0x00089111, 0x00089111, 0x003fc000}},
{{0x0008db96, 0x0008db96, 0x0008db96, 0x003fc000}},
{{0x0009261b, 0x0009261b, 0x0009261b, 0x003fc000}},
{{0x000970a0, 0x000970a0, 0x000970a0, 0x003fc000}},
{{0x0009bb25, 0x0009bb25, 0x0009bb25, 0x003fc000}},
{{0x000a05aa, 0x000a05aa, 0x000a05aa, 0x003fc000}},
{{0x000a502f, 0x000a502f, 0x000a502f, 0x003fc000}},
{{0x000a9ab4, 0x000a9ab4, 0x000a9ab4, 0x003fc000}},
{{0x000ae539, 0x000ae539, 0x000ae539, 0x003fc000}},
{{0x000b2fbe, 0x000b2fbe, 0x000b2fbe, 0x003fc000}},
{{0x000b7a43, 0x000b7a43, 0x000b7a43, 0x003fc000}},
{{0x000bc4c8, 0x000bc4c8, 0x000bc4c8, 0x003fc000}},
{{0x000c0f4d, 0x000c0f4d, 0x000c0f4d, 0x003fc000}},
{{0x000c59d2, 0x000c59d2, 0x000c59d2, 0x003fc000}},
{{0x000ca457, 0x000ca457, 0x000ca457, 0x003fc000}},
{{0x000ceedc, 0x000ceedc, 0x000ceedc, 0x003fc000}},
{{0x000d3961, 0x000d3961, 0x000d3961, 0x003fc000}},
{{0x000d83e6, 0x000d83e6, 0x000d83e6, 0x003fc000}},
{{0x000dce6b, 0x000dce6b, 0x000dce6b, 0x003fc000}},
{{0x000e18f0, 0x000e18f0, 0x000e18f0, 0x003fc000}},
{{0x000e6375, 0x000e6375, 0x000e6375, 0x003fc000}},
{{0x000eadfa, 0x000eadfa, 0x000eadfa, 0x003fc000}},
{{0x000ef87f, 0x000ef87f, 0x000ef87f, 0x003fc000}},
{{0x000f4304, 0x000f4304, 0x000f4304, 0x003fc000}},
{{0x000f8d89, 0x000f8d89, 0x000f8d89, 0x003fc000}},
{{0x000fd80e, 0x000fd80e, 0x000fd80e, 0x003fc000}},
{{0x00102293, 0x00102293, 0x00102293, 0x003fc000}},
{{0x00106d18, 0x00106d18, 0x00106d18, 0x003fc000}},
{{0x0010b79d, 0x0010b79d, 0x0010b79d, 0x003fc000}},
{{0x00110222, 0x00110222, 0x00110222, 0x003fc000}},
{{0x00114ca7, 0x00114ca7, 0x00114ca7, 0x003fc000}},
{{0x0011972c, 0x0011972c, 0x0011972c, 0x003fc000}},
{{0x0011e1b1, 0x0011e1b1, 0x0011e1b1, 0x003fc000}},
{{0x00122c36, 0x00122c36, 0x00122c36, 0x003fc000}},
{{0x001276bb, 0x001276bb, 0x001276bb, 0x003fc000}},
{{0x0012c140, 0x0012c140, 0x0012c140, 0x003fc000}},
{{0x00130bc5, 0x00130bc5, 0x00130bc5, 0x003fc000}},
{{0x0013564a, 0x0013564a, 0x0013564a, 0x003fc000}},
{{0x0013a0cf, 0x0013a0cf, 0x0013a0cf, 0x003fc000}},
{{0x0013eb54, 0x0013eb54, 0x0013eb54, 0x003fc000}},
{{0x001435d9, 0x001435d9, 0x001435d9, 0x003fc000}},
{{0x0014805e, 0x0014805e, 0x0014805e, 0x003fc000}},
{{0x0014cae3, 0x0014cae3, 0x0014cae3, 0x003fc000}},
{{0x00151568, 0x00151568, 0x00151568, 0x003fc000}},
{{0x00155fed, 0x00155fed, 0x00155fed, 0x003fc000}},
{{0x0015aa72, 0x0015aa72, 0x0015aa72, 0x003fc000}},
{{0x0015f4f7, 0x0015f4f7, 0x0015f4f7, 0x003fc000}},
{{0x00163f7c, 0x00163f7c, 0x00163f7c, 0x003fc000}},
{{0x00168a01, 0x00168a01, 0x00168a01, 0x003fc000}},
{{0x0016d486, 0x0016d486, 0x0016d486, 0x003fc000}},
{{0x00171f0b, 0x00171f0b, 0x00171f0b, 0x003fc000}},
{{0x00176990, 0x00176990, 0x00176990, 0x003fc000}},
{{0x0017b415, 0x0017b415, 0x0017b415, 0x003fc000}},
{{0x0017fe9a, 0x0017fe9a, 0x0017fe9a, 0x003fc000}},
{{0x0018491f, 0x0018491f, 0x0018491f, 0x003fc000}},
{{0x001893a4, 0x001893a4, 0x001893a4, 0x003fc000}},
{{0x0018de29, 0x0018de29, 0x0018de29, 0x003fc000}},
{{0x001928ae, 0x001928ae, 0x001928ae, 0x003fc000}},
{{0x00197333, 0x00197333, 0x00197333, 0x003fc000}},
{{0x0019bdb8, 0x0019bdb8, 0x0019bdb8, 0x003fc000}},
{{0x001a083d, 0x001a083d, 0x001a083d, 0x003fc000}},
{{0x001a52c2, 0x001a52c2, 0x001a52c2, 0x003fc000}},
{{0x001a9d47, 0x001a9d47, 0x001a9d47, 0x003fc000}},
{{0x001ae7cc, 0x001ae7cc, 0x001ae7cc, 0x003fc000}},
{{0x001b3251, 0x001b3251, 0x001b3251, 0x003fc000}},
{{0x001b7cd6, 0x001b7cd6, 0x001b7cd6, 0x003fc000}},
{{0x001bc75b, 0x001bc75b, 0x001bc75b, 0x003fc000}},
{{0x001c11e0, 0x001c11e0, 0x001c11e0, 0x003fc000}},
{{0x001c5c65, 0x001c5c65, 0x001c5c65, 0x003fc000}},
{{0x001ca6ea, 0x001ca6ea, 0x001ca6ea, 0x003fc000}},
{{0x001cf16f, 0x001cf16f, 0x001cf16f, 0x003fc000}},
{{0x001d3bf4, 0x001d3bf4, 0x001d3bf4, 0x003fc000}},
{{0x001d8679, 0x001d8679, 0x001d8679, 0x003fc000}},
{{0x001dd0fe, 0x001dd0fe, 0x001dd0fe, 0x003fc000}},
{{0x001e1b83, 0x001e1b83, 0x001e1b83, 0x003fc000}},
{{0x001e6608, 0x001e6608, 0x001e6608, 0x003fc000}},
{{0x001eb08d, 0x001eb08d, 0x001eb08d, 0x003fc000}},
{{0x001efb12, 0x001efb12, 0x001efb12, 0x003fc000}},
{{0x001f4597, 0x001f4597, 0x001f4597, 0x003fc000}},
{{0x001f901c, 0x001f901c, 0x001f901c, 0x003fc000}},
{{0x001fdaa1, 0x001fdaa1, 0x001fdaa1, 0x003fc000}},
{{0x00202526, 0x00202526, 0x00202526, 0x003fc000}},
{{0x00206fab, 0x00206fab, 0x00206fab, 0x003fc000}},
{{0x0020ba30, 0x0020ba30, 0x0020ba30, 0x003fc000}},
{{0x002104b5, 0x002104b5, 0x002104b5, 0x003fc000}},
{{0x00214f3a, 0x00214f3a, 0x00214f3a, 0x003fc000}},
{{0x002199bf, 0x002199bf, 0x002199bf, 0x003fc000}},
{{0x0021e444, 0x0021e444, 0x0021e444, 0x003fc000}},
{{0x00222ec9, 0x00222ec9, 0x00222ec9, 0x003fc000}},
{{0x0022794e, 0x0022794e, 0x0022794e, 0x003fc000}},
{{0x0022c3d3, 0x0022c3d3, 0x0022c3d3, 0x003fc000}},
{{0x00230e58, 0x00230e58, 0x00230e58, 0x003fc000}},
{{0x002358dd, 0x002358dd, 0x002358dd, 0x003fc000}},
{{0x0023a362, 0x0023a362, 0x0023a362, 0x003fc000}},
{{0x0023ede7, 0x0023ede7, 0x0023ede7, 0x003fc000}},
{{0x0024386c, 0x0024386c, 0x0024386c, 0x003fc000}},
{{0x002482f1, 0x002482f1, 0x002482f1, 0x003fc000}},
{{0x0024cd76, 0x0024cd76, 0x0024cd76, 0x003fc000}},
{{0x002517fb, 0x002517fb, 0x002517fb, 0x003fc000}},
{{0x00256280, 0x00256280, 0x00256280, 0x003fc000}},
{{0x0025ad05, 0x0025ad05, 0x0025ad05, 0x003fc000}},
{{0x0025f78a, 0x0025f78a, 0x0025f78a, 0x003fc000}},
{{0x0026420f, 0x0026420f, 0x0026420f, 0x003fc000}},
{{0x00268c94, 0x00268c94, 0x00268c94, 0x003fc000}},
{{0x0026d719, 0x0026d719, 0x0026d719, 0x003fc000}},
{{0x0027219e, 0x0027219e, 0x0027219e, 0x003fc000}},
{{0x00276c23, 0x00276c23, 0x00276c23, 0x003fc000}},
{{0x0027b6a8, 0x0027b6a8, 0x0027b6a8, 0x003fc000}},
{{0x0028012d, 0x0028012d, 0x0028012d, 0x003fc000}},
{{0x00284bb2, 0x00284bb2, 0x00284bb2, 0x003fc000}},
{{0x00289637, 0x00289637, 0x00289637, 0x003fc000}},
{{0x0028e0bc, 0x0028e0bc, 0x0028e0bc, 0x003fc000}},
{{0x00292b41, 0x00292b41, 0x00292b41, 0x003fc000}},
{{0x002975c6, 0x002975c6, 0x002975c6, 0x003fc000}},
{{0x0029c04b, 0x0029c04b, 0x0029c04b, 0x003fc000}},
{{0x002a0ad0, 0x002a0ad0, 0x002a0ad0, 0x003fc000}},
{{0x002a5555, 0x002a5555, 0x002a5555, 0x003fc000}},
{{0x002a9fda, 0x002a9fda, 0x002a9fda, 0x003fc000}},
{{0x002aea5f, 0x002aea5f, 0x002aea5f, 0x003fc000}},
{{0x002b34e4, 0x002b34e4, 0x002b34e4, 0x003fc000}},
{{0x002b7f69, 0x002b7f69, 0x002b7f69, 0x003fc000}},
{{0x002bc9ee, 0x002bc9ee, 0x002bc9ee, 0x003fc000}},
{{0x002c1473, 0x002c1473, 0x002c1473, 0x003fc000}},
{{0x002c5ef8, 0x002c5ef8, 0x002c5ef8, 0x003fc000}},
{{0x002ca97d, 0x002ca97d, 0x002ca97d, 0x003fc000}},
{{0x002cf402, 0x002cf402, 0x002cf402, 0x003fc000}},
{{0x002d3e87, 0x002d3e87, 0x002d3e87, 0x003fc000}},
{{0x002d890c, 0x002d890c, 0x002d890c, 0x003fc000}},
{{0x002dd391, 0x002dd391, 0x002dd391, 0x003fc000}},
{{0x002e1e16, 0x002e1e16, 0x002e1e16, 0x003fc000}},
{{0x002e689b, 0x002e689b, 0x002e689b, 0x003fc000}},
{{0x002eb320, 0x002eb320, 0x002eb320, 0x003fc000}},
{{0x002efda5, 0x002efda5, 0x002efda5, 0x003fc000}},
{{0x002f482a, 0x002f482a, 0x002f482a, 0x003fc000}},
{{0x002f92af, 0x002f92af, 0x002f92af, 0x003fc000}},
{{0x002fdd34, 0x002fdd34, 0x002fdd34, 0x003fc000}},
{{0x003027b9, 0x003027b9, 0x003027b9, 0x003fc000}},
{{0x0030723e, 0x0030723e, 0x0030723e, 0x003fc000}},
{{0x0030bcc3, 0x0030bcc3, 0x0030bcc3, 0x003fc000}},
{{0x00310748, 0x00310748, 0x00310748, 0x003fc000}},
{{0x003151cd, 0x003151cd, 0x003151cd, 0x003fc000}},
{{0x00319c52, 0x00319c52, 0x00319c52, 0x003fc000}},
{{0x0031e6d7, 0x0031e6d7, 0x0031e6d7, 0x003fc000}},
{{0x0032315c, 0x0032315c, 0x0032315c, 0x003fc000}},
{{0x00327be1, 0x00327be1, 0x00327be1, 0x003fc000}},
{{0x0032c666, 0x0032c666, 0x0032c666, 0x003fc000}},
{{0x003310eb, 0x003310eb, 0x003310eb, 0x003fc000}},
{{0x00335b70, 0x00335b70, 0x00335b70, 0x003fc000}},
{{0x0033a5f5, 0x0033a5f5, 0x0033a5f5, 0x003fc000}},
{{0x0033f07a, 0x0033f07a, 0x0033f07a, 0x003fc000}},
{{0x00343aff, 0x00343aff, 0x00343aff, 0x003fc000}},
{{0x00348584, 0x00348584, 0x00348584, 0x003fc000}},
{{0x0034d009, 0x0034d009, 0x0034d009, 0x003fc000}},
{{0x00351a8e, 0x00351a8e, 0x00351a8e, 0x003fc000}},
{{0x00356513, 0x00356513, 0x00356513, 0x003fc000}},
{{0x0035af98, 0x0035af98, 0x0035af98, 0x003fc000}},
{{0x0035fa1d, 0x0035fa1d, 0x0035fa1d, 0x003fc000}},
{{0x003644a2, 0x003644a2, 0x003644a2, 0x003fc000}},
{{0x00368f27, 0x00368f27, 0x00368f27, 0x003fc000}},
{{0x0036d9ac, 0x0036d9ac, 0x0036d9ac, 0x003fc000}},
{{0x00372431, 0x00372431, 0x00372431, 0x003fc000}},
{{0x00376eb6, 0x00376eb6, 0x00376eb6, 0x003fc000}},
{{0x0037b93b, 0x0037b93b, 0x0037b93b, 0x003fc000}},
{{0x003803c0, 0x003803c0, 0x003803c0, 0x003fc000}},
{{0x00384e45, 0x00384e45, 0x00384e45, 0x003fc000}},
{{0x003898ca, 0x003898ca, 0x003898ca, 0x003fc000}},
{{0x0038e34f, 0x0038e34f, 0x0038e34f, 0x003fc000}},
{{0x00392dd4, 0x00392dd4, 0x00392dd4, 0x003fc000}},
{{0x00397859, 0x00397859, 0x00397859, 0x003fc000}},
{{0x0039c2de, 0x0039c2de, 0x0039c2de, 0x003fc000}},
{{0x003a0d63, 0x003a0d63, 0x003a0d63, 0x003fc000}},
{{0x003a57e8, 0x003a57e8, 0x003a57e8, 0x003fc000}},
{{0x003aa26d, 0x003aa26d, 0x003aa26d, 0x003fc000}},
{{0x003aecf2, 0x003aecf2, 0x003aecf2, 0x003fc000}},
{{0x003b3777, 0x003b3777, 0x003b3777, 0x003fc000}},
{{0x003b81fc, 0x003b81fc, 0x003b81fc, 0x003fc000}},
{{0x003bcc81, 0x003bcc81, 0x003bcc81, 0x003fc000}},
{{0x003c1706, 0x003c1706, 0x003c1706, 0x003fc000}},
{{0x003c618b, 0x003c618b, 0x003c618b, 0x003fc000}},
{{0x003cac10, 0x003cac10, 0x003cac10, 0x003fc000}},
{{0x003cf695, 0x003cf695, 0x003cf695, 0x003fc000}},
{{0x003d411a, 0x003d411a, 0x003d411a, 0x003fc000}},
{{0x003d8b9f, 0x003d8b9f, 0x003d8b9f, 0x003fc000}},
{{0x003dd624, 0x003dd624, 0x003dd624, 0x003fc000}},
{{0x003e20a9, 0x003e20a9, 0x003e20a9, 0x003fc000}},
{{0x003e6b2e, 0x003e6b2e, 0x003e6b2e, 0x003fc000}},
{{0x003eb5b3, 0x003eb5b3, 0x003eb5b3, 0x003fc000}},
{{0x003f0038, 0x003f0038, 0x003f0038, 0x003fc000}},
{{0x003f4abd, 0x003f4abd, 0x003f4abd, 0x003fc000}},
{{0x003f9542, 0x003f9542, 0x003f9542, 0x003fc000}},
{{0x003fdfc7, 0x003fdfc7, 0x003fdfc7, 0x003fc000}},
{{0x00402a4c, 0x00402a4c, 0x00402a4c, 0x003fc000}},
{{0x004074d1, 0x004074d1, 0x004074d1, 0x003fc000}},
{{0x0040bf56, 0x0040bf56, 0x0040bf56, 0x003fc000}},
{{0x004109db, 0x004109db, 0x004109db, 0x003fc000}},
{{0x00415460, 0x00415460, 0x00415460, 0x003fc000}},
{{0x00419ee5, 0x00419ee5, 0x00419ee5, 0x003fc000}},
{{0x0041e96a, 0x0041e96a, 0x0041e96a, 0x003fc000}},
{{0x004233ef, 0x004233ef, 0x004233ef, 0x003fc000}},
{{0x00427e74, 0x00427e74, 0x00427e74, 0x003fc000}},
{{0x0042c8f9, 0x0042c8f9, 0x0042c8f9, 0x003fc000}},
{{0x0043137e, 0x0043137e, 0x0043137e, 0x003fc000}},
{{0x00435e03, 0x00435e03, 0x00435e03, 0x003fc000}},
{{0x0043a888, 0x0043a888, 0x0043a888, 0x003fc000}},
{{0x0043f30d, 0x0043f30d, 0x0043f30d, 0x003fc000}},
{{0x00443d92, 0x00443d92, 0x00443d92, 0x003fc000}},
{{0x00448817, 0x00448817, 0x00448817, 0x003fc000}},
{{0x0044d29c, 0x0044d29c, 0x0044d29c, 0x003fc000}},
{{0x00451d21, 0x00451d21, 0x00451d21, 0x003fc000}},
{{0x004567a6, 0x004567a6, 0x004567a6, 0x003fc000}},
{{0x0045b22b, 0x0045b22b, 0x0045b22b, 0x003fc000}}
};
static const VP8kCstSSE2 VP8kUtoRGBA[256] = {
{{0, 0x000c8980, 0xffbf7300, 0}}, {{0, 0x000c706d, 0xffbff41a, 0}},
{{0, 0x000c575a, 0xffc07534, 0}}, {{0, 0x000c3e47, 0xffc0f64e, 0}},
{{0, 0x000c2534, 0xffc17768, 0}}, {{0, 0x000c0c21, 0xffc1f882, 0}},
{{0, 0x000bf30e, 0xffc2799c, 0}}, {{0, 0x000bd9fb, 0xffc2fab6, 0}},
{{0, 0x000bc0e8, 0xffc37bd0, 0}}, {{0, 0x000ba7d5, 0xffc3fcea, 0}},
{{0, 0x000b8ec2, 0xffc47e04, 0}}, {{0, 0x000b75af, 0xffc4ff1e, 0}},
{{0, 0x000b5c9c, 0xffc58038, 0}}, {{0, 0x000b4389, 0xffc60152, 0}},
{{0, 0x000b2a76, 0xffc6826c, 0}}, {{0, 0x000b1163, 0xffc70386, 0}},
{{0, 0x000af850, 0xffc784a0, 0}}, {{0, 0x000adf3d, 0xffc805ba, 0}},
{{0, 0x000ac62a, 0xffc886d4, 0}}, {{0, 0x000aad17, 0xffc907ee, 0}},
{{0, 0x000a9404, 0xffc98908, 0}}, {{0, 0x000a7af1, 0xffca0a22, 0}},
{{0, 0x000a61de, 0xffca8b3c, 0}}, {{0, 0x000a48cb, 0xffcb0c56, 0}},
{{0, 0x000a2fb8, 0xffcb8d70, 0}}, {{0, 0x000a16a5, 0xffcc0e8a, 0}},
{{0, 0x0009fd92, 0xffcc8fa4, 0}}, {{0, 0x0009e47f, 0xffcd10be, 0}},
{{0, 0x0009cb6c, 0xffcd91d8, 0}}, {{0, 0x0009b259, 0xffce12f2, 0}},
{{0, 0x00099946, 0xffce940c, 0}}, {{0, 0x00098033, 0xffcf1526, 0}},
{{0, 0x00096720, 0xffcf9640, 0}}, {{0, 0x00094e0d, 0xffd0175a, 0}},
{{0, 0x000934fa, 0xffd09874, 0}}, {{0, 0x00091be7, 0xffd1198e, 0}},
{{0, 0x000902d4, 0xffd19aa8, 0}}, {{0, 0x0008e9c1, 0xffd21bc2, 0}},
{{0, 0x0008d0ae, 0xffd29cdc, 0}}, {{0, 0x0008b79b, 0xffd31df6, 0}},
{{0, 0x00089e88, 0xffd39f10, 0}}, {{0, 0x00088575, 0xffd4202a, 0}},
{{0, 0x00086c62, 0xffd4a144, 0}}, {{0, 0x0008534f, 0xffd5225e, 0}},
{{0, 0x00083a3c, 0xffd5a378, 0}}, {{0, 0x00082129, 0xffd62492, 0}},
{{0, 0x00080816, 0xffd6a5ac, 0}}, {{0, 0x0007ef03, 0xffd726c6, 0}},
{{0, 0x0007d5f0, 0xffd7a7e0, 0}}, {{0, 0x0007bcdd, 0xffd828fa, 0}},
{{0, 0x0007a3ca, 0xffd8aa14, 0}}, {{0, 0x00078ab7, 0xffd92b2e, 0}},
{{0, 0x000771a4, 0xffd9ac48, 0}}, {{0, 0x00075891, 0xffda2d62, 0}},
{{0, 0x00073f7e, 0xffdaae7c, 0}}, {{0, 0x0007266b, 0xffdb2f96, 0}},
{{0, 0x00070d58, 0xffdbb0b0, 0}}, {{0, 0x0006f445, 0xffdc31ca, 0}},
{{0, 0x0006db32, 0xffdcb2e4, 0}}, {{0, 0x0006c21f, 0xffdd33fe, 0}},
{{0, 0x0006a90c, 0xffddb518, 0}}, {{0, 0x00068ff9, 0xffde3632, 0}},
{{0, 0x000676e6, 0xffdeb74c, 0}}, {{0, 0x00065dd3, 0xffdf3866, 0}},
{{0, 0x000644c0, 0xffdfb980, 0}}, {{0, 0x00062bad, 0xffe03a9a, 0}},
{{0, 0x0006129a, 0xffe0bbb4, 0}}, {{0, 0x0005f987, 0xffe13cce, 0}},
{{0, 0x0005e074, 0xffe1bde8, 0}}, {{0, 0x0005c761, 0xffe23f02, 0}},
{{0, 0x0005ae4e, 0xffe2c01c, 0}}, {{0, 0x0005953b, 0xffe34136, 0}},
{{0, 0x00057c28, 0xffe3c250, 0}}, {{0, 0x00056315, 0xffe4436a, 0}},
{{0, 0x00054a02, 0xffe4c484, 0}}, {{0, 0x000530ef, 0xffe5459e, 0}},
{{0, 0x000517dc, 0xffe5c6b8, 0}}, {{0, 0x0004fec9, 0xffe647d2, 0}},
{{0, 0x0004e5b6, 0xffe6c8ec, 0}}, {{0, 0x0004cca3, 0xffe74a06, 0}},
{{0, 0x0004b390, 0xffe7cb20, 0}}, {{0, 0x00049a7d, 0xffe84c3a, 0}},
{{0, 0x0004816a, 0xffe8cd54, 0}}, {{0, 0x00046857, 0xffe94e6e, 0}},
{{0, 0x00044f44, 0xffe9cf88, 0}}, {{0, 0x00043631, 0xffea50a2, 0}},
{{0, 0x00041d1e, 0xffead1bc, 0}}, {{0, 0x0004040b, 0xffeb52d6, 0}},
{{0, 0x0003eaf8, 0xffebd3f0, 0}}, {{0, 0x0003d1e5, 0xffec550a, 0}},
{{0, 0x0003b8d2, 0xffecd624, 0}}, {{0, 0x00039fbf, 0xffed573e, 0}},
{{0, 0x000386ac, 0xffedd858, 0}}, {{0, 0x00036d99, 0xffee5972, 0}},
{{0, 0x00035486, 0xffeeda8c, 0}}, {{0, 0x00033b73, 0xffef5ba6, 0}},
{{0, 0x00032260, 0xffefdcc0, 0}}, {{0, 0x0003094d, 0xfff05dda, 0}},
{{0, 0x0002f03a, 0xfff0def4, 0}}, {{0, 0x0002d727, 0xfff1600e, 0}},
{{0, 0x0002be14, 0xfff1e128, 0}}, {{0, 0x0002a501, 0xfff26242, 0}},
{{0, 0x00028bee, 0xfff2e35c, 0}}, {{0, 0x000272db, 0xfff36476, 0}},
{{0, 0x000259c8, 0xfff3e590, 0}}, {{0, 0x000240b5, 0xfff466aa, 0}},
{{0, 0x000227a2, 0xfff4e7c4, 0}}, {{0, 0x00020e8f, 0xfff568de, 0}},
{{0, 0x0001f57c, 0xfff5e9f8, 0}}, {{0, 0x0001dc69, 0xfff66b12, 0}},
{{0, 0x0001c356, 0xfff6ec2c, 0}}, {{0, 0x0001aa43, 0xfff76d46, 0}},
{{0, 0x00019130, 0xfff7ee60, 0}}, {{0, 0x0001781d, 0xfff86f7a, 0}},
{{0, 0x00015f0a, 0xfff8f094, 0}}, {{0, 0x000145f7, 0xfff971ae, 0}},
{{0, 0x00012ce4, 0xfff9f2c8, 0}}, {{0, 0x000113d1, 0xfffa73e2, 0}},
{{0, 0x0000fabe, 0xfffaf4fc, 0}}, {{0, 0x0000e1ab, 0xfffb7616, 0}},
{{0, 0x0000c898, 0xfffbf730, 0}}, {{0, 0x0000af85, 0xfffc784a, 0}},
{{0, 0x00009672, 0xfffcf964, 0}}, {{0, 0x00007d5f, 0xfffd7a7e, 0}},
{{0, 0x0000644c, 0xfffdfb98, 0}}, {{0, 0x00004b39, 0xfffe7cb2, 0}},
{{0, 0x00003226, 0xfffefdcc, 0}}, {{0, 0x00001913, 0xffff7ee6, 0}},
{{0, 0x00000000, 0x00000000, 0}}, {{0, 0xffffe6ed, 0x0000811a, 0}},
{{0, 0xffffcdda, 0x00010234, 0}}, {{0, 0xffffb4c7, 0x0001834e, 0}},
{{0, 0xffff9bb4, 0x00020468, 0}}, {{0, 0xffff82a1, 0x00028582, 0}},
{{0, 0xffff698e, 0x0003069c, 0}}, {{0, 0xffff507b, 0x000387b6, 0}},
{{0, 0xffff3768, 0x000408d0, 0}}, {{0, 0xffff1e55, 0x000489ea, 0}},
{{0, 0xffff0542, 0x00050b04, 0}}, {{0, 0xfffeec2f, 0x00058c1e, 0}},
{{0, 0xfffed31c, 0x00060d38, 0}}, {{0, 0xfffeba09, 0x00068e52, 0}},
{{0, 0xfffea0f6, 0x00070f6c, 0}}, {{0, 0xfffe87e3, 0x00079086, 0}},
{{0, 0xfffe6ed0, 0x000811a0, 0}}, {{0, 0xfffe55bd, 0x000892ba, 0}},
{{0, 0xfffe3caa, 0x000913d4, 0}}, {{0, 0xfffe2397, 0x000994ee, 0}},
{{0, 0xfffe0a84, 0x000a1608, 0}}, {{0, 0xfffdf171, 0x000a9722, 0}},
{{0, 0xfffdd85e, 0x000b183c, 0}}, {{0, 0xfffdbf4b, 0x000b9956, 0}},
{{0, 0xfffda638, 0x000c1a70, 0}}, {{0, 0xfffd8d25, 0x000c9b8a, 0}},
{{0, 0xfffd7412, 0x000d1ca4, 0}}, {{0, 0xfffd5aff, 0x000d9dbe, 0}},
{{0, 0xfffd41ec, 0x000e1ed8, 0}}, {{0, 0xfffd28d9, 0x000e9ff2, 0}},
{{0, 0xfffd0fc6, 0x000f210c, 0}}, {{0, 0xfffcf6b3, 0x000fa226, 0}},
{{0, 0xfffcdda0, 0x00102340, 0}}, {{0, 0xfffcc48d, 0x0010a45a, 0}},
{{0, 0xfffcab7a, 0x00112574, 0}}, {{0, 0xfffc9267, 0x0011a68e, 0}},
{{0, 0xfffc7954, 0x001227a8, 0}}, {{0, 0xfffc6041, 0x0012a8c2, 0}},
{{0, 0xfffc472e, 0x001329dc, 0}}, {{0, 0xfffc2e1b, 0x0013aaf6, 0}},
{{0, 0xfffc1508, 0x00142c10, 0}}, {{0, 0xfffbfbf5, 0x0014ad2a, 0}},
{{0, 0xfffbe2e2, 0x00152e44, 0}}, {{0, 0xfffbc9cf, 0x0015af5e, 0}},
{{0, 0xfffbb0bc, 0x00163078, 0}}, {{0, 0xfffb97a9, 0x0016b192, 0}},
{{0, 0xfffb7e96, 0x001732ac, 0}}, {{0, 0xfffb6583, 0x0017b3c6, 0}},
{{0, 0xfffb4c70, 0x001834e0, 0}}, {{0, 0xfffb335d, 0x0018b5fa, 0}},
{{0, 0xfffb1a4a, 0x00193714, 0}}, {{0, 0xfffb0137, 0x0019b82e, 0}},
{{0, 0xfffae824, 0x001a3948, 0}}, {{0, 0xfffacf11, 0x001aba62, 0}},
{{0, 0xfffab5fe, 0x001b3b7c, 0}}, {{0, 0xfffa9ceb, 0x001bbc96, 0}},
{{0, 0xfffa83d8, 0x001c3db0, 0}}, {{0, 0xfffa6ac5, 0x001cbeca, 0}},
{{0, 0xfffa51b2, 0x001d3fe4, 0}}, {{0, 0xfffa389f, 0x001dc0fe, 0}},
{{0, 0xfffa1f8c, 0x001e4218, 0}}, {{0, 0xfffa0679, 0x001ec332, 0}},
{{0, 0xfff9ed66, 0x001f444c, 0}}, {{0, 0xfff9d453, 0x001fc566, 0}},
{{0, 0xfff9bb40, 0x00204680, 0}}, {{0, 0xfff9a22d, 0x0020c79a, 0}},
{{0, 0xfff9891a, 0x002148b4, 0}}, {{0, 0xfff97007, 0x0021c9ce, 0}},
{{0, 0xfff956f4, 0x00224ae8, 0}}, {{0, 0xfff93de1, 0x0022cc02, 0}},
{{0, 0xfff924ce, 0x00234d1c, 0}}, {{0, 0xfff90bbb, 0x0023ce36, 0}},
{{0, 0xfff8f2a8, 0x00244f50, 0}}, {{0, 0xfff8d995, 0x0024d06a, 0}},
{{0, 0xfff8c082, 0x00255184, 0}}, {{0, 0xfff8a76f, 0x0025d29e, 0}},
{{0, 0xfff88e5c, 0x002653b8, 0}}, {{0, 0xfff87549, 0x0026d4d2, 0}},
{{0, 0xfff85c36, 0x002755ec, 0}}, {{0, 0xfff84323, 0x0027d706, 0}},
{{0, 0xfff82a10, 0x00285820, 0}}, {{0, 0xfff810fd, 0x0028d93a, 0}},
{{0, 0xfff7f7ea, 0x00295a54, 0}}, {{0, 0xfff7ded7, 0x0029db6e, 0}},
{{0, 0xfff7c5c4, 0x002a5c88, 0}}, {{0, 0xfff7acb1, 0x002adda2, 0}},
{{0, 0xfff7939e, 0x002b5ebc, 0}}, {{0, 0xfff77a8b, 0x002bdfd6, 0}},
{{0, 0xfff76178, 0x002c60f0, 0}}, {{0, 0xfff74865, 0x002ce20a, 0}},
{{0, 0xfff72f52, 0x002d6324, 0}}, {{0, 0xfff7163f, 0x002de43e, 0}},
{{0, 0xfff6fd2c, 0x002e6558, 0}}, {{0, 0xfff6e419, 0x002ee672, 0}},
{{0, 0xfff6cb06, 0x002f678c, 0}}, {{0, 0xfff6b1f3, 0x002fe8a6, 0}},
{{0, 0xfff698e0, 0x003069c0, 0}}, {{0, 0xfff67fcd, 0x0030eada, 0}},
{{0, 0xfff666ba, 0x00316bf4, 0}}, {{0, 0xfff64da7, 0x0031ed0e, 0}},
{{0, 0xfff63494, 0x00326e28, 0}}, {{0, 0xfff61b81, 0x0032ef42, 0}},
{{0, 0xfff6026e, 0x0033705c, 0}}, {{0, 0xfff5e95b, 0x0033f176, 0}},
{{0, 0xfff5d048, 0x00347290, 0}}, {{0, 0xfff5b735, 0x0034f3aa, 0}},
{{0, 0xfff59e22, 0x003574c4, 0}}, {{0, 0xfff5850f, 0x0035f5de, 0}},
{{0, 0xfff56bfc, 0x003676f8, 0}}, {{0, 0xfff552e9, 0x0036f812, 0}},
{{0, 0xfff539d6, 0x0037792c, 0}}, {{0, 0xfff520c3, 0x0037fa46, 0}},
{{0, 0xfff507b0, 0x00387b60, 0}}, {{0, 0xfff4ee9d, 0x0038fc7a, 0}},
{{0, 0xfff4d58a, 0x00397d94, 0}}, {{0, 0xfff4bc77, 0x0039feae, 0}},
{{0, 0xfff4a364, 0x003a7fc8, 0}}, {{0, 0xfff48a51, 0x003b00e2, 0}},
{{0, 0xfff4713e, 0x003b81fc, 0}}, {{0, 0xfff4582b, 0x003c0316, 0}},
{{0, 0xfff43f18, 0x003c8430, 0}}, {{0, 0xfff42605, 0x003d054a, 0}},
{{0, 0xfff40cf2, 0x003d8664, 0}}, {{0, 0xfff3f3df, 0x003e077e, 0}},
{{0, 0xfff3dacc, 0x003e8898, 0}}, {{0, 0xfff3c1b9, 0x003f09b2, 0}},
{{0, 0xfff3a8a6, 0x003f8acc, 0}}, {{0, 0xfff38f93, 0x00400be6, 0}}
};
static const VP8kCstSSE2 VP8kVtoRGBA[256] = {
{{0xffcced80, 0x001a0400, 0, 0}}, {{0xffcd53a5, 0x0019cff8, 0, 0}},
{{0xffcdb9ca, 0x00199bf0, 0, 0}}, {{0xffce1fef, 0x001967e8, 0, 0}},
{{0xffce8614, 0x001933e0, 0, 0}}, {{0xffceec39, 0x0018ffd8, 0, 0}},
{{0xffcf525e, 0x0018cbd0, 0, 0}}, {{0xffcfb883, 0x001897c8, 0, 0}},
{{0xffd01ea8, 0x001863c0, 0, 0}}, {{0xffd084cd, 0x00182fb8, 0, 0}},
{{0xffd0eaf2, 0x0017fbb0, 0, 0}}, {{0xffd15117, 0x0017c7a8, 0, 0}},
{{0xffd1b73c, 0x001793a0, 0, 0}}, {{0xffd21d61, 0x00175f98, 0, 0}},
{{0xffd28386, 0x00172b90, 0, 0}}, {{0xffd2e9ab, 0x0016f788, 0, 0}},
{{0xffd34fd0, 0x0016c380, 0, 0}}, {{0xffd3b5f5, 0x00168f78, 0, 0}},
{{0xffd41c1a, 0x00165b70, 0, 0}}, {{0xffd4823f, 0x00162768, 0, 0}},
{{0xffd4e864, 0x0015f360, 0, 0}}, {{0xffd54e89, 0x0015bf58, 0, 0}},
{{0xffd5b4ae, 0x00158b50, 0, 0}}, {{0xffd61ad3, 0x00155748, 0, 0}},
{{0xffd680f8, 0x00152340, 0, 0}}, {{0xffd6e71d, 0x0014ef38, 0, 0}},
{{0xffd74d42, 0x0014bb30, 0, 0}}, {{0xffd7b367, 0x00148728, 0, 0}},
{{0xffd8198c, 0x00145320, 0, 0}}, {{0xffd87fb1, 0x00141f18, 0, 0}},
{{0xffd8e5d6, 0x0013eb10, 0, 0}}, {{0xffd94bfb, 0x0013b708, 0, 0}},
{{0xffd9b220, 0x00138300, 0, 0}}, {{0xffda1845, 0x00134ef8, 0, 0}},
{{0xffda7e6a, 0x00131af0, 0, 0}}, {{0xffdae48f, 0x0012e6e8, 0, 0}},
{{0xffdb4ab4, 0x0012b2e0, 0, 0}}, {{0xffdbb0d9, 0x00127ed8, 0, 0}},
{{0xffdc16fe, 0x00124ad0, 0, 0}}, {{0xffdc7d23, 0x001216c8, 0, 0}},
{{0xffdce348, 0x0011e2c0, 0, 0}}, {{0xffdd496d, 0x0011aeb8, 0, 0}},
{{0xffddaf92, 0x00117ab0, 0, 0}}, {{0xffde15b7, 0x001146a8, 0, 0}},
{{0xffde7bdc, 0x001112a0, 0, 0}}, {{0xffdee201, 0x0010de98, 0, 0}},
{{0xffdf4826, 0x0010aa90, 0, 0}}, {{0xffdfae4b, 0x00107688, 0, 0}},
{{0xffe01470, 0x00104280, 0, 0}}, {{0xffe07a95, 0x00100e78, 0, 0}},
{{0xffe0e0ba, 0x000fda70, 0, 0}}, {{0xffe146df, 0x000fa668, 0, 0}},
{{0xffe1ad04, 0x000f7260, 0, 0}}, {{0xffe21329, 0x000f3e58, 0, 0}},
{{0xffe2794e, 0x000f0a50, 0, 0}}, {{0xffe2df73, 0x000ed648, 0, 0}},
{{0xffe34598, 0x000ea240, 0, 0}}, {{0xffe3abbd, 0x000e6e38, 0, 0}},
{{0xffe411e2, 0x000e3a30, 0, 0}}, {{0xffe47807, 0x000e0628, 0, 0}},
{{0xffe4de2c, 0x000dd220, 0, 0}}, {{0xffe54451, 0x000d9e18, 0, 0}},
{{0xffe5aa76, 0x000d6a10, 0, 0}}, {{0xffe6109b, 0x000d3608, 0, 0}},
{{0xffe676c0, 0x000d0200, 0, 0}}, {{0xffe6dce5, 0x000ccdf8, 0, 0}},
{{0xffe7430a, 0x000c99f0, 0, 0}}, {{0xffe7a92f, 0x000c65e8, 0, 0}},
{{0xffe80f54, 0x000c31e0, 0, 0}}, {{0xffe87579, 0x000bfdd8, 0, 0}},
{{0xffe8db9e, 0x000bc9d0, 0, 0}}, {{0xffe941c3, 0x000b95c8, 0, 0}},
{{0xffe9a7e8, 0x000b61c0, 0, 0}}, {{0xffea0e0d, 0x000b2db8, 0, 0}},
{{0xffea7432, 0x000af9b0, 0, 0}}, {{0xffeada57, 0x000ac5a8, 0, 0}},
{{0xffeb407c, 0x000a91a0, 0, 0}}, {{0xffeba6a1, 0x000a5d98, 0, 0}},
{{0xffec0cc6, 0x000a2990, 0, 0}}, {{0xffec72eb, 0x0009f588, 0, 0}},
{{0xffecd910, 0x0009c180, 0, 0}}, {{0xffed3f35, 0x00098d78, 0, 0}},
{{0xffeda55a, 0x00095970, 0, 0}}, {{0xffee0b7f, 0x00092568, 0, 0}},
{{0xffee71a4, 0x0008f160, 0, 0}}, {{0xffeed7c9, 0x0008bd58, 0, 0}},
{{0xffef3dee, 0x00088950, 0, 0}}, {{0xffefa413, 0x00085548, 0, 0}},
{{0xfff00a38, 0x00082140, 0, 0}}, {{0xfff0705d, 0x0007ed38, 0, 0}},
{{0xfff0d682, 0x0007b930, 0, 0}}, {{0xfff13ca7, 0x00078528, 0, 0}},
{{0xfff1a2cc, 0x00075120, 0, 0}}, {{0xfff208f1, 0x00071d18, 0, 0}},
{{0xfff26f16, 0x0006e910, 0, 0}}, {{0xfff2d53b, 0x0006b508, 0, 0}},
{{0xfff33b60, 0x00068100, 0, 0}}, {{0xfff3a185, 0x00064cf8, 0, 0}},
{{0xfff407aa, 0x000618f0, 0, 0}}, {{0xfff46dcf, 0x0005e4e8, 0, 0}},
{{0xfff4d3f4, 0x0005b0e0, 0, 0}}, {{0xfff53a19, 0x00057cd8, 0, 0}},
{{0xfff5a03e, 0x000548d0, 0, 0}}, {{0xfff60663, 0x000514c8, 0, 0}},
{{0xfff66c88, 0x0004e0c0, 0, 0}}, {{0xfff6d2ad, 0x0004acb8, 0, 0}},
{{0xfff738d2, 0x000478b0, 0, 0}}, {{0xfff79ef7, 0x000444a8, 0, 0}},
{{0xfff8051c, 0x000410a0, 0, 0}}, {{0xfff86b41, 0x0003dc98, 0, 0}},
{{0xfff8d166, 0x0003a890, 0, 0}}, {{0xfff9378b, 0x00037488, 0, 0}},
{{0xfff99db0, 0x00034080, 0, 0}}, {{0xfffa03d5, 0x00030c78, 0, 0}},
{{0xfffa69fa, 0x0002d870, 0, 0}}, {{0xfffad01f, 0x0002a468, 0, 0}},
{{0xfffb3644, 0x00027060, 0, 0}}, {{0xfffb9c69, 0x00023c58, 0, 0}},
{{0xfffc028e, 0x00020850, 0, 0}}, {{0xfffc68b3, 0x0001d448, 0, 0}},
{{0xfffcced8, 0x0001a040, 0, 0}}, {{0xfffd34fd, 0x00016c38, 0, 0}},
{{0xfffd9b22, 0x00013830, 0, 0}}, {{0xfffe0147, 0x00010428, 0, 0}},
{{0xfffe676c, 0x0000d020, 0, 0}}, {{0xfffecd91, 0x00009c18, 0, 0}},
{{0xffff33b6, 0x00006810, 0, 0}}, {{0xffff99db, 0x00003408, 0, 0}},
{{0x00000000, 0x00000000, 0, 0}}, {{0x00006625, 0xffffcbf8, 0, 0}},
{{0x0000cc4a, 0xffff97f0, 0, 0}}, {{0x0001326f, 0xffff63e8, 0, 0}},
{{0x00019894, 0xffff2fe0, 0, 0}}, {{0x0001feb9, 0xfffefbd8, 0, 0}},
{{0x000264de, 0xfffec7d0, 0, 0}}, {{0x0002cb03, 0xfffe93c8, 0, 0}},
{{0x00033128, 0xfffe5fc0, 0, 0}}, {{0x0003974d, 0xfffe2bb8, 0, 0}},
{{0x0003fd72, 0xfffdf7b0, 0, 0}}, {{0x00046397, 0xfffdc3a8, 0, 0}},
{{0x0004c9bc, 0xfffd8fa0, 0, 0}}, {{0x00052fe1, 0xfffd5b98, 0, 0}},
{{0x00059606, 0xfffd2790, 0, 0}}, {{0x0005fc2b, 0xfffcf388, 0, 0}},
{{0x00066250, 0xfffcbf80, 0, 0}}, {{0x0006c875, 0xfffc8b78, 0, 0}},
{{0x00072e9a, 0xfffc5770, 0, 0}}, {{0x000794bf, 0xfffc2368, 0, 0}},
{{0x0007fae4, 0xfffbef60, 0, 0}}, {{0x00086109, 0xfffbbb58, 0, 0}},
{{0x0008c72e, 0xfffb8750, 0, 0}}, {{0x00092d53, 0xfffb5348, 0, 0}},
{{0x00099378, 0xfffb1f40, 0, 0}}, {{0x0009f99d, 0xfffaeb38, 0, 0}},
{{0x000a5fc2, 0xfffab730, 0, 0}}, {{0x000ac5e7, 0xfffa8328, 0, 0}},
{{0x000b2c0c, 0xfffa4f20, 0, 0}}, {{0x000b9231, 0xfffa1b18, 0, 0}},
{{0x000bf856, 0xfff9e710, 0, 0}}, {{0x000c5e7b, 0xfff9b308, 0, 0}},
{{0x000cc4a0, 0xfff97f00, 0, 0}}, {{0x000d2ac5, 0xfff94af8, 0, 0}},
{{0x000d90ea, 0xfff916f0, 0, 0}}, {{0x000df70f, 0xfff8e2e8, 0, 0}},
{{0x000e5d34, 0xfff8aee0, 0, 0}}, {{0x000ec359, 0xfff87ad8, 0, 0}},
{{0x000f297e, 0xfff846d0, 0, 0}}, {{0x000f8fa3, 0xfff812c8, 0, 0}},
{{0x000ff5c8, 0xfff7dec0, 0, 0}}, {{0x00105bed, 0xfff7aab8, 0, 0}},
{{0x0010c212, 0xfff776b0, 0, 0}}, {{0x00112837, 0xfff742a8, 0, 0}},
{{0x00118e5c, 0xfff70ea0, 0, 0}}, {{0x0011f481, 0xfff6da98, 0, 0}},
{{0x00125aa6, 0xfff6a690, 0, 0}}, {{0x0012c0cb, 0xfff67288, 0, 0}},
{{0x001326f0, 0xfff63e80, 0, 0}}, {{0x00138d15, 0xfff60a78, 0, 0}},
{{0x0013f33a, 0xfff5d670, 0, 0}}, {{0x0014595f, 0xfff5a268, 0, 0}},
{{0x0014bf84, 0xfff56e60, 0, 0}}, {{0x001525a9, 0xfff53a58, 0, 0}},
{{0x00158bce, 0xfff50650, 0, 0}}, {{0x0015f1f3, 0xfff4d248, 0, 0}},
{{0x00165818, 0xfff49e40, 0, 0}}, {{0x0016be3d, 0xfff46a38, 0, 0}},
{{0x00172462, 0xfff43630, 0, 0}}, {{0x00178a87, 0xfff40228, 0, 0}},
{{0x0017f0ac, 0xfff3ce20, 0, 0}}, {{0x001856d1, 0xfff39a18, 0, 0}},
{{0x0018bcf6, 0xfff36610, 0, 0}}, {{0x0019231b, 0xfff33208, 0, 0}},
{{0x00198940, 0xfff2fe00, 0, 0}}, {{0x0019ef65, 0xfff2c9f8, 0, 0}},
{{0x001a558a, 0xfff295f0, 0, 0}}, {{0x001abbaf, 0xfff261e8, 0, 0}},
{{0x001b21d4, 0xfff22de0, 0, 0}}, {{0x001b87f9, 0xfff1f9d8, 0, 0}},
{{0x001bee1e, 0xfff1c5d0, 0, 0}}, {{0x001c5443, 0xfff191c8, 0, 0}},
{{0x001cba68, 0xfff15dc0, 0, 0}}, {{0x001d208d, 0xfff129b8, 0, 0}},
{{0x001d86b2, 0xfff0f5b0, 0, 0}}, {{0x001decd7, 0xfff0c1a8, 0, 0}},
{{0x001e52fc, 0xfff08da0, 0, 0}}, {{0x001eb921, 0xfff05998, 0, 0}},
{{0x001f1f46, 0xfff02590, 0, 0}}, {{0x001f856b, 0xffeff188, 0, 0}},
{{0x001feb90, 0xffefbd80, 0, 0}}, {{0x002051b5, 0xffef8978, 0, 0}},
{{0x0020b7da, 0xffef5570, 0, 0}}, {{0x00211dff, 0xffef2168, 0, 0}},
{{0x00218424, 0xffeeed60, 0, 0}}, {{0x0021ea49, 0xffeeb958, 0, 0}},
{{0x0022506e, 0xffee8550, 0, 0}}, {{0x0022b693, 0xffee5148, 0, 0}},
{{0x00231cb8, 0xffee1d40, 0, 0}}, {{0x002382dd, 0xffede938, 0, 0}},
{{0x0023e902, 0xffedb530, 0, 0}}, {{0x00244f27, 0xffed8128, 0, 0}},
{{0x0024b54c, 0xffed4d20, 0, 0}}, {{0x00251b71, 0xffed1918, 0, 0}},
{{0x00258196, 0xffece510, 0, 0}}, {{0x0025e7bb, 0xffecb108, 0, 0}},
{{0x00264de0, 0xffec7d00, 0, 0}}, {{0x0026b405, 0xffec48f8, 0, 0}},
{{0x00271a2a, 0xffec14f0, 0, 0}}, {{0x0027804f, 0xffebe0e8, 0, 0}},
{{0x0027e674, 0xffebace0, 0, 0}}, {{0x00284c99, 0xffeb78d8, 0, 0}},
{{0x0028b2be, 0xffeb44d0, 0, 0}}, {{0x002918e3, 0xffeb10c8, 0, 0}},
{{0x00297f08, 0xffeadcc0, 0, 0}}, {{0x0029e52d, 0xffeaa8b8, 0, 0}},
{{0x002a4b52, 0xffea74b0, 0, 0}}, {{0x002ab177, 0xffea40a8, 0, 0}},
{{0x002b179c, 0xffea0ca0, 0, 0}}, {{0x002b7dc1, 0xffe9d898, 0, 0}},
{{0x002be3e6, 0xffe9a490, 0, 0}}, {{0x002c4a0b, 0xffe97088, 0, 0}},
{{0x002cb030, 0xffe93c80, 0, 0}}, {{0x002d1655, 0xffe90878, 0, 0}},
{{0x002d7c7a, 0xffe8d470, 0, 0}}, {{0x002de29f, 0xffe8a068, 0, 0}},
{{0x002e48c4, 0xffe86c60, 0, 0}}, {{0x002eaee9, 0xffe83858, 0, 0}},
{{0x002f150e, 0xffe80450, 0, 0}}, {{0x002f7b33, 0xffe7d048, 0, 0}},
{{0x002fe158, 0xffe79c40, 0, 0}}, {{0x0030477d, 0xffe76838, 0, 0}},
{{0x0030ada2, 0xffe73430, 0, 0}}, {{0x003113c7, 0xffe70028, 0, 0}},
{{0x003179ec, 0xffe6cc20, 0, 0}}, {{0x0031e011, 0xffe69818, 0, 0}},
{{0x00324636, 0xffe66410, 0, 0}}, {{0x0032ac5b, 0xffe63008, 0, 0}}
};

455
drivers/webp/enc/delta_palettization.c Normal file
View file

@ -0,0 +1,455 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#include "./delta_palettization.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "../webp/types.h"
#include "../dsp/lossless.h"
#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
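// For illustration: MK_COL(255u, 0u, 0u) == 0x00ff0000u, i.e. the packed
// 0x00RRGGBB layout used for the entries below.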
// The format allows palettes of up to 256 entries, but more entries increase
// entropy. In the future it may be useful to add entries that are far from
// the origin of the palette, or to choose the remaining entries dynamically.
#define DELTA_PALETTE_SIZE 226
// Palette used for delta_palettization. Entries are roughly sorted by distance
// of their signed equivalents from the origin.
static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
MK_COL(0u, 0u, 0u),
MK_COL(255u, 255u, 255u),
MK_COL(1u, 1u, 1u),
MK_COL(254u, 254u, 254u),
MK_COL(2u, 2u, 2u),
MK_COL(4u, 4u, 4u),
MK_COL(252u, 252u, 252u),
MK_COL(250u, 0u, 0u),
MK_COL(0u, 250u, 0u),
MK_COL(0u, 0u, 250u),
MK_COL(6u, 0u, 0u),
MK_COL(0u, 6u, 0u),
MK_COL(0u, 0u, 6u),
MK_COL(0u, 0u, 248u),
MK_COL(0u, 0u, 8u),
MK_COL(0u, 248u, 0u),
MK_COL(0u, 248u, 248u),
MK_COL(0u, 248u, 8u),
MK_COL(0u, 8u, 0u),
MK_COL(0u, 8u, 248u),
MK_COL(0u, 8u, 8u),
MK_COL(8u, 8u, 8u),
MK_COL(248u, 0u, 0u),
MK_COL(248u, 0u, 248u),
MK_COL(248u, 0u, 8u),
MK_COL(248u, 248u, 0u),
MK_COL(248u, 8u, 0u),
MK_COL(8u, 0u, 0u),
MK_COL(8u, 0u, 248u),
MK_COL(8u, 0u, 8u),
MK_COL(8u, 248u, 0u),
MK_COL(8u, 8u, 0u),
MK_COL(23u, 23u, 23u),
MK_COL(13u, 13u, 13u),
MK_COL(232u, 232u, 232u),
MK_COL(244u, 244u, 244u),
MK_COL(245u, 245u, 250u),
MK_COL(50u, 50u, 50u),
MK_COL(204u, 204u, 204u),
MK_COL(236u, 236u, 236u),
MK_COL(16u, 16u, 16u),
MK_COL(240u, 16u, 16u),
MK_COL(16u, 240u, 16u),
MK_COL(240u, 240u, 16u),
MK_COL(16u, 16u, 240u),
MK_COL(240u, 16u, 240u),
MK_COL(16u, 240u, 240u),
MK_COL(240u, 240u, 240u),
MK_COL(0u, 0u, 232u),
MK_COL(0u, 232u, 0u),
MK_COL(232u, 0u, 0u),
MK_COL(0u, 0u, 24u),
MK_COL(0u, 24u, 0u),
MK_COL(24u, 0u, 0u),
MK_COL(32u, 32u, 32u),
MK_COL(224u, 32u, 32u),
MK_COL(32u, 224u, 32u),
MK_COL(224u, 224u, 32u),
MK_COL(32u, 32u, 224u),
MK_COL(224u, 32u, 224u),
MK_COL(32u, 224u, 224u),
MK_COL(224u, 224u, 224u),
MK_COL(0u, 0u, 176u),
MK_COL(0u, 0u, 80u),
MK_COL(0u, 176u, 0u),
MK_COL(0u, 176u, 176u),
MK_COL(0u, 176u, 80u),
MK_COL(0u, 80u, 0u),
MK_COL(0u, 80u, 176u),
MK_COL(0u, 80u, 80u),
MK_COL(176u, 0u, 0u),
MK_COL(176u, 0u, 176u),
MK_COL(176u, 0u, 80u),
MK_COL(176u, 176u, 0u),
MK_COL(176u, 80u, 0u),
MK_COL(80u, 0u, 0u),
MK_COL(80u, 0u, 176u),
MK_COL(80u, 0u, 80u),
MK_COL(80u, 176u, 0u),
MK_COL(80u, 80u, 0u),
MK_COL(0u, 0u, 152u),
MK_COL(0u, 0u, 104u),
MK_COL(0u, 152u, 0u),
MK_COL(0u, 152u, 152u),
MK_COL(0u, 152u, 104u),
MK_COL(0u, 104u, 0u),
MK_COL(0u, 104u, 152u),
MK_COL(0u, 104u, 104u),
MK_COL(152u, 0u, 0u),
MK_COL(152u, 0u, 152u),
MK_COL(152u, 0u, 104u),
MK_COL(152u, 152u, 0u),
MK_COL(152u, 104u, 0u),
MK_COL(104u, 0u, 0u),
MK_COL(104u, 0u, 152u),
MK_COL(104u, 0u, 104u),
MK_COL(104u, 152u, 0u),
MK_COL(104u, 104u, 0u),
MK_COL(216u, 216u, 216u),
MK_COL(216u, 216u, 40u),
MK_COL(216u, 216u, 176u),
MK_COL(216u, 216u, 80u),
MK_COL(216u, 40u, 216u),
MK_COL(216u, 40u, 40u),
MK_COL(216u, 40u, 176u),
MK_COL(216u, 40u, 80u),
MK_COL(216u, 176u, 216u),
MK_COL(216u, 176u, 40u),
MK_COL(216u, 176u, 176u),
MK_COL(216u, 176u, 80u),
MK_COL(216u, 80u, 216u),
MK_COL(216u, 80u, 40u),
MK_COL(216u, 80u, 176u),
MK_COL(216u, 80u, 80u),
MK_COL(40u, 216u, 216u),
MK_COL(40u, 216u, 40u),
MK_COL(40u, 216u, 176u),
MK_COL(40u, 216u, 80u),
MK_COL(40u, 40u, 216u),
MK_COL(40u, 40u, 40u),
MK_COL(40u, 40u, 176u),
MK_COL(40u, 40u, 80u),
MK_COL(40u, 176u, 216u),
MK_COL(40u, 176u, 40u),
MK_COL(40u, 176u, 176u),
MK_COL(40u, 176u, 80u),
MK_COL(40u, 80u, 216u),
MK_COL(40u, 80u, 40u),
MK_COL(40u, 80u, 176u),
MK_COL(40u, 80u, 80u),
MK_COL(80u, 216u, 216u),
MK_COL(80u, 216u, 40u),
MK_COL(80u, 216u, 176u),
MK_COL(80u, 216u, 80u),
MK_COL(80u, 40u, 216u),
MK_COL(80u, 40u, 40u),
MK_COL(80u, 40u, 176u),
MK_COL(80u, 40u, 80u),
MK_COL(80u, 176u, 216u),
MK_COL(80u, 176u, 40u),
MK_COL(80u, 176u, 176u),
MK_COL(80u, 176u, 80u),
MK_COL(80u, 80u, 216u),
MK_COL(80u, 80u, 40u),
MK_COL(80u, 80u, 176u),
MK_COL(80u, 80u, 80u),
MK_COL(0u, 0u, 192u),
MK_COL(0u, 0u, 64u),
MK_COL(0u, 0u, 128u),
MK_COL(0u, 192u, 0u),
MK_COL(0u, 192u, 192u),
MK_COL(0u, 192u, 64u),
MK_COL(0u, 192u, 128u),
MK_COL(0u, 64u, 0u),
MK_COL(0u, 64u, 192u),
MK_COL(0u, 64u, 64u),
MK_COL(0u, 64u, 128u),
MK_COL(0u, 128u, 0u),
MK_COL(0u, 128u, 192u),
MK_COL(0u, 128u, 64u),
MK_COL(0u, 128u, 128u),
MK_COL(176u, 216u, 216u),
MK_COL(176u, 216u, 40u),
MK_COL(176u, 216u, 176u),
MK_COL(176u, 216u, 80u),
MK_COL(176u, 40u, 216u),
MK_COL(176u, 40u, 40u),
MK_COL(176u, 40u, 176u),
MK_COL(176u, 40u, 80u),
MK_COL(176u, 176u, 216u),
MK_COL(176u, 176u, 40u),
MK_COL(176u, 176u, 176u),
MK_COL(176u, 176u, 80u),
MK_COL(176u, 80u, 216u),
MK_COL(176u, 80u, 40u),
MK_COL(176u, 80u, 176u),
MK_COL(176u, 80u, 80u),
MK_COL(192u, 0u, 0u),
MK_COL(192u, 0u, 192u),
MK_COL(192u, 0u, 64u),
MK_COL(192u, 0u, 128u),
MK_COL(192u, 192u, 0u),
MK_COL(192u, 192u, 192u),
MK_COL(192u, 192u, 64u),
MK_COL(192u, 192u, 128u),
MK_COL(192u, 64u, 0u),
MK_COL(192u, 64u, 192u),
MK_COL(192u, 64u, 64u),
MK_COL(192u, 64u, 128u),
MK_COL(192u, 128u, 0u),
MK_COL(192u, 128u, 192u),
MK_COL(192u, 128u, 64u),
MK_COL(192u, 128u, 128u),
MK_COL(64u, 0u, 0u),
MK_COL(64u, 0u, 192u),
MK_COL(64u, 0u, 64u),
MK_COL(64u, 0u, 128u),
MK_COL(64u, 192u, 0u),
MK_COL(64u, 192u, 192u),
MK_COL(64u, 192u, 64u),
MK_COL(64u, 192u, 128u),
MK_COL(64u, 64u, 0u),
MK_COL(64u, 64u, 192u),
MK_COL(64u, 64u, 64u),
MK_COL(64u, 64u, 128u),
MK_COL(64u, 128u, 0u),
MK_COL(64u, 128u, 192u),
MK_COL(64u, 128u, 64u),
MK_COL(64u, 128u, 128u),
MK_COL(128u, 0u, 0u),
MK_COL(128u, 0u, 192u),
MK_COL(128u, 0u, 64u),
MK_COL(128u, 0u, 128u),
MK_COL(128u, 192u, 0u),
MK_COL(128u, 192u, 192u),
MK_COL(128u, 192u, 64u),
MK_COL(128u, 192u, 128u),
MK_COL(128u, 64u, 0u),
MK_COL(128u, 64u, 192u),
MK_COL(128u, 64u, 64u),
MK_COL(128u, 64u, 128u),
MK_COL(128u, 128u, 0u),
MK_COL(128u, 128u, 192u),
MK_COL(128u, 128u, 64u),
MK_COL(128u, 128u, 128u),
};
#undef MK_COL
//------------------------------------------------------------------------------
// TODO(skal): move the functions to dsp/lossless.c when the correct
// granularity is found. For now, we'll just copy-paste some useful bits
// here instead.
// In-place sum of each component, modulo 256.
static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
*a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
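// For example, with *a == 0x01ff02fe and b == 0x01010101 each channel wraps
// independently: A: 0x01+0x01=0x02, R: 0xff+0x01=0x00, G: 0x02+0x01=0x03,
// B: 0xfe+0x01=0xff, giving 0x020003ff (no carry leaks between channels).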
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
if (a < 256) {
return a;
}
// return 0 when a is a negative integer (wrapped to a large unsigned value),
// return 255 when a is a positive integer >= 256.
return ~a >> 24;
}
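// e.g. Clip255(77) == 77, Clip255(300) == 255 (~300u >> 24 == 0xff) and
// Clip255((uint32_t)-3) == 0 (~0xfffffffdu >> 24 == 0).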
// Delta palettization functions.
static WEBP_INLINE int Square(int x) {
return x * x;
}
static WEBP_INLINE uint32_t Intensity(uint32_t a) {
return
30 * ((a >> 16) & 0xff) +
59 * ((a >> 8) & 0xff) +
11 * ((a >> 0) & 0xff);
}
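// These weights are 100x the usual luma coefficients (0.30, 0.59, 0.11), so
// the result is roughly 100 times the perceived intensity.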
static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
uint32_t palette_entry) {
int i;
uint32_t distance = 0;
AddPixelsEq(&predicted_value, palette_entry);
for (i = 0; i < 32; i += 8) {
const int32_t av = (actual_value >> i) & 0xff;
const int32_t pv = (predicted_value >> i) & 0xff;
distance += Square(pv - av);
}
// The squared intensity difference is meant to weigh in with a factor of 10.
// Since Intensity() returns 100 times the true intensity, its square is
// scaled by 10000, so the color differences must be multiplied by 1000 to
// keep the intended 10:1 ratio.
distance *= 1000u;
distance += Square(Intensity(predicted_value)
- Intensity(actual_value));
return distance;
}
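// Worked example: a per-channel error of 2 contributes 1000 * 2^2 = 4000,
// while a true intensity error of 2 (200 after the 100x scaling) contributes
// 200^2 = 40000, i.e. ten times more, as intended.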
static uint32_t Predict(int x, int y, uint32_t* image) {
const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];
const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];
const uint32_t p =
(((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
(((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
(((((t >> 8) & 0xff) + ((l >> 8) & 0xff)) / 2) << 8) +
(((((t >> 0) & 0xff) + ((l >> 0) & 0xff)) / 2) << 0);
if (x == 0 && y == 0) return ARGB_BLACK;
if (x == 0) return t;
if (y == 0) return l;
return p;
}
static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
int a, int b, int c) {
return Clip255(a + ((b - c) >> 2));
}
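// i.e. a + (b - c) / 4, clipped to [0, 255]: a quarter-strength version of
// the clamped add-subtract-full predictor (a + b - c) from the lossless dsp
// code.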
static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
uint32_t c0, uint32_t c1, uint32_t c2) {
const int a = AddSubtractComponentFullWithCoefficient(
c0 >> 24, c1 >> 24, c2 >> 24);
const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
(c1 >> 16) & 0xff,
(c2 >> 16) & 0xff);
const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFullWithCoefficient(
c0 & 0xff, c1 & 0xff, c2 & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
//------------------------------------------------------------------------------
// Finds the palette entry that minimizes the error between the actual pixel
// value and predicted_value + palette_entry, and returns its index.
// ApplyBestPaletteEntry() below then propagates the pixel's error to its top
// and left neighbours in the src array and writes
// predicted_value + palette_entry to new_image.
static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
const uint32_t palette[], int palette_size) {
int i;
int idx = 0;
uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
for (i = 1; i < palette_size; ++i) {
const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
if (distance < best_distance) {
best_distance = distance;
idx = i;
}
}
return idx;
}
static void ApplyBestPaletteEntry(int x, int y,
uint32_t new_value, uint32_t palette_value,
uint32_t* src, int src_stride,
uint32_t* new_image) {
AddPixelsEq(&new_value, palette_value);
if (x > 0) {
src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
new_value, src[x]);
}
if (y > 0) {
src[x - src_stride] =
ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
new_value, src[x]);
}
new_image[x] = new_value;
}
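// Note: each neighbour is nudged by roughly a quarter of the difference
// between the reconstructed value and the current src pixel, an
// error-diffusion step in the spirit of dithering; since those neighbours
// were already coded in this scan, the adjustment mostly pays off on the
// next pass (num_passes > 1).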
//------------------------------------------------------------------------------
// Main entry point
static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
uint32_t src_stride,
uint32_t dst_stride,
const uint32_t* palette,
int palette_size,
int width, int height,
int num_passes) {
int x, y;
WebPEncodingError err = VP8_ENC_OK;
uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
if (new_image == NULL || tmp_row == NULL) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
while (num_passes--) {
uint32_t* cur_src = src;
uint32_t* cur_dst = dst;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
const uint32_t predicted_value = Predict(x, y, new_image);
tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
palette, palette_size);
ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
cur_src, src_stride, new_image);
}
for (x = 0; x < width; ++x) {
cur_dst[x] = palette[tmp_row[x]];
}
cur_src += src_stride;
cur_dst += dst_stride;
}
}
Error:
WebPSafeFree(new_image);
WebPSafeFree(tmp_row);
return err;
}
// replaces enc->argb_ by a palettizable approximation of it,
// and generates optimal enc->palette_[]
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
const WebPPicture* const pic = enc->pic_;
uint32_t* src = pic->argb;
uint32_t* dst = enc->argb_;
const int width = pic->width;
const int height = pic->height;
WebPEncodingError err = VP8_ENC_OK;
memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;
enc->palette_size_ = DELTA_PALETTE_SIZE;
err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,
enc->palette_, enc->palette_size_,
width, height, 2);
if (err != VP8_ENC_OK) goto Error;
Error:
return err;
}
#else // !WEBP_EXPERIMENTAL_FEATURES
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
(void)enc;
return VP8_ENC_ERROR_INVALID_CONFIGURATION;
}
#endif // WEBP_EXPERIMENTAL_FEATURES

25
drivers/webp/enc/delta_palettization.h Normal file
View file

@ -0,0 +1,25 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Mislav Bradac (mislavm@google.com)
//
#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
#define WEBP_ENC_DELTA_PALETTIZATION_H_
#include "../webp/encode.h"
#include "../enc/vp8li.h"
// Replaces enc->argb_[] input by a palettizable approximation of it,
// and generates optimal enc->palette_[].
// This function can revert enc->use_palette_ / enc->use_predict_ flag
// if delta-palettization is not producing expected saving.
WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
#endif // WEBP_ENC_DELTA_PALETTIZATION_H_

160
drivers/webp/enc/near_lossless.c Normal file
View file

@ -0,0 +1,160 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Near-lossless image preprocessing adjusts pixel values to help
// compressibility with a guarantee of maximum deviation between original and
// resulting pixel values.
//
// Author: Jyrki Alakuijala (jyrki@google.com)
// Converted to C by Aleksander Kramarz (akramarz@google.com)
#include <stdlib.h>
#include "../dsp/lossless.h"
#include "../utils/utils.h"
#include "./vp8enci.h"
#define MIN_DIM_FOR_NEAR_LOSSLESS 64
#define MAX_LIMIT_BITS 5
// Computes quantized pixel value and distance from original value.
static void GetValAndDistance(int a, int initial, int bits,
int* const val, int* const distance) {
const int mask = ~((1 << bits) - 1);
*val = (initial & mask) | (initial >> (8 - bits));
*distance = 2 * abs(a - *val);
}
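// The low bits cleared by 'mask' are refilled with the top bits of 'initial'
// so quantized values still span the full 0..255 range; e.g. with bits == 2,
// 0xff -> (0xfc | 0x03) == 0xff rather than 0xfc.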
// Clamps the value to range [0, 255].
static int Clamp8b(int val) {
const int min_val = 0;
const int max_val = 0xff;
return (val < min_val) ? min_val : (val > max_val) ? max_val : val;
}
// Quantizes values {a, a+(1<<bits), a-(1<<bits)} and returns the nearest one.
static int FindClosestDiscretized(int a, int bits) {
int best_val = a, i;
int min_distance = 256;
for (i = -1; i <= 1; ++i) {
int candidate, distance;
const int val = Clamp8b(a + i * (1 << bits));
GetValAndDistance(a, val, bits, &candidate, &distance);
if (i != 0) {
++distance;
}
// Smallest distance but favor i == 0 over i == -1 and i == 1
// since that keeps the overall intensity more constant in the
// images.
if (distance < min_distance) {
min_distance = distance;
best_val = candidate;
}
}
return best_val;
}
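// Example with bits == 2: for a == 78 the candidates {74, 78, 82} quantize
// to {73, 77, 81} with biased distances {11, 2, 7}, so 77 is returned.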
// Applies FindClosestDiscretized to all channels of pixel.
static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
return
(FindClosestDiscretized(a >> 24, bits) << 24) |
(FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
(FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
(FindClosestDiscretized(a & 0xff, bits));
}
// Checks if distance between corresponding channel values of pixels a and b
// is within the given limit.
static int IsNear(uint32_t a, uint32_t b, int limit) {
int k;
for (k = 0; k < 4; ++k) {
const int delta =
(int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
if (delta >= limit || delta <= -limit) {
return 0;
}
}
return 1;
}
static int IsSmooth(const uint32_t* const prev_row,
const uint32_t* const curr_row,
const uint32_t* const next_row,
int ix, int limit) {
// Check that all pixels in 4-connected neighborhood are smooth.
return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
IsNear(curr_row[ix], prev_row[ix], limit) &&
IsNear(curr_row[ix], next_row[ix], limit));
}
// Adjusts pixel values of image with given maximum error.
static void NearLossless(int xsize, int ysize, uint32_t* argb,
int limit_bits, uint32_t* copy_buffer) {
int x, y;
const int limit = 1 << limit_bits;
uint32_t* prev_row = copy_buffer;
uint32_t* curr_row = prev_row + xsize;
uint32_t* next_row = curr_row + xsize;
memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));
for (y = 1; y < ysize - 1; ++y) {
uint32_t* const curr_argb_row = argb + y * xsize;
uint32_t* const next_argb_row = curr_argb_row + xsize;
memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
for (x = 1; x < xsize - 1; ++x) {
if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
}
}
{
// Three-way swap.
uint32_t* const temp = prev_row;
prev_row = curr_row;
curr_row = next_row;
next_row = temp;
}
}
}
static int QualityToLimitBits(int quality) {
// quality mapping:
//   0..19 -> 5
//  20..39 -> 4
//  40..59 -> 3
//  60..79 -> 2
//  80..99 -> 1
//  100    -> 0
return MAX_LIMIT_BITS - quality / 20;
}
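// e.g. quality 70 gives 5 - 70 / 20 == 2 limit bits, so NearLossless() below
// runs with limit_bits 2 and then 1 (see VP8ApplyNearLossless).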
int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
int i;
uint32_t* const copy_buffer =
(uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
const int limit_bits = QualityToLimitBits(quality);
assert(argb != NULL);
assert(limit_bits >= 0);
assert(limit_bits <= MAX_LIMIT_BITS);
if (copy_buffer == NULL) {
return 0;
}
// For small icon images, don't attempt to apply near-lossless compression.
if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) {
WebPSafeFree(copy_buffer);
return 1;
}
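// Run the adjustment progressively, from the coarsest quantization down to
// the requested one; the IsSmooth() test in NearLossless() leaves smooth
// regions untouched at every step.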
for (i = limit_bits; i != 0; --i) {
NearLossless(xsize, ysize, argb, i, copy_buffer);
}
WebPSafeFree(copy_buffer);
return 1;
}

File diff suppressed because it is too large

175
drivers/webp/enc/picture_psnr.c Normal file
View file

@ -0,0 +1,175 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools for measuring distortion
//
// Author: Skal (pascal.massimino@gmail.com)
#include <math.h>
#include <stdlib.h>
#include "./vp8enci.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
// local-min distortion
//
// For every pixel in the *reference* picture, we search for the local best
// match in the compressed image. This is not a symmetrical measure.
#define RADIUS 2 // search radius. Shouldn't be too large.
static void AccumulateLSIM(const uint8_t* src, int src_stride,
const uint8_t* ref, int ref_stride,
int w, int h, DistoStats* stats) {
int x, y;
double total_sse = 0.;
for (y = 0; y < h; ++y) {
const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
for (x = 0; x < w; ++x) {
const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
double best_sse = 255. * 255.;
const double value = (double)ref[y * ref_stride + x];
int i, j;
for (j = y_0; j < y_1; ++j) {
const uint8_t* const s = src + j * src_stride;
for (i = x_0; i < x_1; ++i) {
const double diff = s[i] - value;
const double sse = diff * diff;
if (sse < best_sse) best_sse = sse;
}
}
total_sse += best_sse;
}
}
stats->w = w * h;
stats->xm = 0;
stats->ym = 0;
stats->xxm = total_sse;
stats->yym = 0;
stats->xym = 0;
}
#undef RADIUS
//------------------------------------------------------------------------------
// Distortion
// Max value returned in case of exact similarity.
static const double kMinDistortion_dB = 99.;
static float GetPSNR(const double v) {
return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
: kMinDistortion_dB);
}
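// -4.3429448 is -10 / ln(10), so for v == mean squared error this evaluates
// to the usual PSNR 10 * log10(255^2 / v) in dB; kMinDistortion_dB is
// returned for an exact match (v == 0).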
int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
int type, float result[5]) {
DistoStats stats[5];
int w, h;
memset(stats, 0, sizeof(stats));
if (src == NULL || ref == NULL ||
src->width != ref->width || src->height != ref->height ||
src->use_argb != ref->use_argb || result == NULL) {
return 0;
}
w = src->width;
h = src->height;
if (src->use_argb == 1) {
if (src->argb == NULL || ref->argb == NULL) {
return 0;
} else {
int i, j, c;
uint8_t* tmp1, *tmp2;
uint8_t* const tmp_plane =
(uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
if (tmp_plane == NULL) return 0;
tmp1 = tmp_plane;
tmp2 = tmp_plane + w * h;
for (c = 0; c < 4; ++c) {
for (j = 0; j < h; ++j) {
for (i = 0; i < w; ++i) {
tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
}
}
if (type >= 2) {
AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
} else {
VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
}
}
WebPSafeFree(tmp_plane);
}
} else {
int has_alpha, uv_w, uv_h;
if (src->y == NULL || ref->y == NULL ||
src->u == NULL || ref->u == NULL ||
src->v == NULL || ref->v == NULL) {
return 0;
}
has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
(has_alpha && (src->a == NULL || ref->a == NULL))) {
return 0;
}
uv_w = (src->width + 1) >> 1;
uv_h = (src->height + 1) >> 1;
if (type >= 2) {
AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
w, h, &stats[0]);
AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
uv_w, uv_h, &stats[1]);
AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
uv_w, uv_h, &stats[2]);
if (has_alpha) {
AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
w, h, &stats[3]);
}
} else {
VP8SSIMAccumulatePlane(src->y, src->y_stride,
ref->y, ref->y_stride,
w, h, &stats[0]);
VP8SSIMAccumulatePlane(src->u, src->uv_stride,
ref->u, ref->uv_stride,
uv_w, uv_h, &stats[1]);
VP8SSIMAccumulatePlane(src->v, src->uv_stride,
ref->v, ref->uv_stride,
uv_w, uv_h, &stats[2]);
if (has_alpha) {
VP8SSIMAccumulatePlane(src->a, src->a_stride,
ref->a, ref->a_stride,
w, h, &stats[3]);
}
}
}
// Final stat calculations.
{
int c;
for (c = 0; c <= 4; ++c) {
if (type == 1) {
const double v = VP8SSIMGet(&stats[c]);
result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
: kMinDistortion_dB);
} else {
const double v = VP8SSIMGetSquaredError(&stats[c]);
result[c] = GetPSNR(v);
}
// Accumulate forward
if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
}
}
return 1;
}
//------------------------------------------------------------------------------

264
drivers/webp/enc/picture_rescale.c Normal file
View file

@ -0,0 +1,264 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools: copy, crop, rescaling and view.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "./vp8enci.h"
#include "../utils/rescaler.h"
#include "../utils/utils.h"
#define HALVE(x) (((x) + 1) >> 1)
// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
// into 'dst'. Mark 'dst' as not owning any memory.
static void PictureGrabSpecs(const WebPPicture* const src,
WebPPicture* const dst) {
assert(src != NULL && dst != NULL);
*dst = *src;
WebPPictureResetBuffers(dst);
}
//------------------------------------------------------------------------------
// Adjust top-left corner to chroma sample position.
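// (For YUV pictures the chroma planes are subsampled 2x2, so the corner must
// land on an even coordinate; clearing the low bit snaps e.g. left == 5 down
// to 4.)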
static void SnapTopLeftPosition(const WebPPicture* const pic,
int* const left, int* const top) {
if (!pic->use_argb) {
*left &= ~1;
*top &= ~1;
}
}
// Adjust top-left corner and verify that the sub-rectangle is valid.
static int AdjustAndCheckRectangle(const WebPPicture* const pic,
int* const left, int* const top,
int width, int height) {
SnapTopLeftPosition(pic, left, top);
if ((*left) < 0 || (*top) < 0) return 0;
if (width <= 0 || height <= 0) return 0;
if ((*left) + width > pic->width) return 0;
if ((*top) + height > pic->height) return 0;
return 1;
}
int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
if (src == NULL || dst == NULL) return 0;
if (src == dst) return 1;
PictureGrabSpecs(src, dst);
if (!WebPPictureAlloc(dst)) return 0;
if (!src->use_argb) {
WebPCopyPlane(src->y, src->y_stride,
dst->y, dst->y_stride, dst->width, dst->height);
WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
HALVE(dst->width), HALVE(dst->height));
WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
HALVE(dst->width), HALVE(dst->height));
if (dst->a != NULL) {
WebPCopyPlane(src->a, src->a_stride,
dst->a, dst->a_stride, dst->width, dst->height);
}
} else {
WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
(uint8_t*)dst->argb, 4 * dst->argb_stride,
4 * dst->width, dst->height);
}
return 1;
}
int WebPPictureIsView(const WebPPicture* picture) {
if (picture == NULL) return 0;
if (picture->use_argb) {
return (picture->memory_argb_ == NULL);
}
return (picture->memory_ == NULL);
}
int WebPPictureView(const WebPPicture* src,
int left, int top, int width, int height,
WebPPicture* dst) {
if (src == NULL || dst == NULL) return 0;
// verify rectangle position.
if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
PictureGrabSpecs(src, dst);
}
dst->width = width;
dst->height = height;
if (!src->use_argb) {
dst->y = src->y + top * src->y_stride + left;
dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
dst->y_stride = src->y_stride;
dst->uv_stride = src->uv_stride;
if (src->a != NULL) {
dst->a = src->a + top * src->a_stride + left;
dst->a_stride = src->a_stride;
}
} else {
dst->argb = src->argb + top * src->argb_stride + left;
dst->argb_stride = src->argb_stride;
}
return 1;
}
//------------------------------------------------------------------------------
// Picture cropping
int WebPPictureCrop(WebPPicture* pic,
int left, int top, int width, int height) {
WebPPicture tmp;
if (pic == NULL) return 0;
if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
PictureGrabSpecs(pic, &tmp);
tmp.width = width;
tmp.height = height;
if (!WebPPictureAlloc(&tmp)) return 0;
if (!pic->use_argb) {
const int y_offset = top * pic->y_stride + left;
const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
WebPCopyPlane(pic->y + y_offset, pic->y_stride,
tmp.y, tmp.y_stride, width, height);
WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
if (tmp.a != NULL) {
const int a_offset = top * pic->a_stride + left;
WebPCopyPlane(pic->a + a_offset, pic->a_stride,
tmp.a, tmp.a_stride, width, height);
}
} else {
const uint8_t* const src =
(const uint8_t*)(pic->argb + top * pic->argb_stride + left);
WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
tmp.argb_stride * 4, width * 4, height);
}
WebPPictureFree(pic);
*pic = tmp;
return 1;
}
//------------------------------------------------------------------------------
// Simple picture rescaler
static void RescalePlane(const uint8_t* src,
int src_width, int src_height, int src_stride,
uint8_t* dst,
int dst_width, int dst_height, int dst_stride,
rescaler_t* const work,
int num_channels) {
WebPRescaler rescaler;
int y = 0;
WebPRescalerInit(&rescaler, src_width, src_height,
dst, dst_width, dst_height, dst_stride,
num_channels, work);
while (y < src_height) {
y += WebPRescalerImport(&rescaler, src_height - y,
src + y * src_stride, src_stride);
WebPRescalerExport(&rescaler);
}
}
static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
assert(pic->argb != NULL);
WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
pic->width, pic->height, inverse);
}
static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
if (pic->a != NULL) {
WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
pic->width, pic->height, inverse);
}
}
int WebPPictureRescale(WebPPicture* pic, int width, int height) {
WebPPicture tmp;
int prev_width, prev_height;
rescaler_t* work;
if (pic == NULL) return 0;
prev_width = pic->width;
prev_height = pic->height;
if (!WebPRescalerGetScaledDimensions(
prev_width, prev_height, &width, &height)) {
return 0;
}
PictureGrabSpecs(pic, &tmp);
tmp.width = width;
tmp.height = height;
if (!WebPPictureAlloc(&tmp)) return 0;
if (!pic->use_argb) {
work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
if (work == NULL) {
WebPPictureFree(&tmp);
return 0;
}
// If present, we need to rescale alpha first (for AlphaMultiplyY).
if (pic->a != NULL) {
WebPInitAlphaProcessing();
RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
tmp.a, width, height, tmp.a_stride, work, 1);
}
// We take transparency into account on the luma plane only; that's not
// totally exact blending, but it is still a good approximation.
AlphaMultiplyY(pic, 0);
RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
tmp.y, width, height, tmp.y_stride, work, 1);
AlphaMultiplyY(&tmp, 1);
RescalePlane(pic->u,
HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
tmp.u,
HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
RescalePlane(pic->v,
HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
tmp.v,
HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
} else {
work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
if (work == NULL) {
WebPPictureFree(&tmp);
return 0;
}
// In order to correctly interpolate colors, we need to apply the alpha
// weighting first (black-matting), scale the RGB values, and remove
// the premultiplication afterward (while preserving the alpha channel).
WebPInitAlphaProcessing();
AlphaMultiplyARGB(pic, 0);
RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
pic->argb_stride * 4,
(uint8_t*)tmp.argb, width, height,
tmp.argb_stride * 4,
work, 4);
AlphaMultiplyARGB(&tmp, 1);
}
WebPPictureFree(pic);
WebPSafeFree(work);
*pic = tmp;
return 1;
}
//------------------------------------------------------------------------------

206
drivers/webp/enc/picture_tools.c Normal file
View file

@ -0,0 +1,206 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools: alpha handling, etc.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./vp8enci.h"
#include "../dsp/yuv.h"
static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
return (0xff000000u | (r << 16) | (g << 8) | b);
}
//------------------------------------------------------------------------------
// Helper: clean up fully transparent area to help compressibility.
#define SIZE 8
#define SIZE2 (SIZE / 2)
static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
int y, x;
for (y = 0; y < size; ++y) {
for (x = 0; x < size; ++x) {
if (ptr[x]) {
return 0;
}
}
ptr += stride;
}
return 1;
}
static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
int y, x;
for (y = 0; y < size; ++y) {
for (x = 0; x < size; ++x) {
if (ptr[x] & 0xff000000u) {
return 0;
}
}
ptr += stride;
}
return 1;
}
static void flatten(uint8_t* ptr, int v, int stride, int size) {
int y;
for (y = 0; y < size; ++y) {
memset(ptr, v, size);
ptr += stride;
}
}
static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
int x, y;
for (y = 0; y < size; ++y) {
for (x = 0; x < size; ++x) ptr[x] = v;
ptr += stride;
}
}
void WebPCleanupTransparentArea(WebPPicture* pic) {
int x, y, w, h;
if (pic == NULL) return;
w = pic->width / SIZE;
h = pic->height / SIZE;
// note: we ignore the left-overs on right/bottom
if (pic->use_argb) {
uint32_t argb_value = 0;
for (y = 0; y < h; ++y) {
int need_reset = 1;
for (x = 0; x < w; ++x) {
const int off = (y * pic->argb_stride + x) * SIZE;
if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) {
if (need_reset) {
argb_value = pic->argb[off];
need_reset = 0;
}
flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE);
} else {
need_reset = 1;
}
}
}
} else {
const uint8_t* const a_ptr = pic->a;
int values[3] = { 0 };
if (a_ptr == NULL) return; // nothing to do
for (y = 0; y < h; ++y) {
int need_reset = 1;
for (x = 0; x < w; ++x) {
const int off_a = (y * pic->a_stride + x) * SIZE;
const int off_y = (y * pic->y_stride + x) * SIZE;
const int off_uv = (y * pic->uv_stride + x) * SIZE2;
if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
if (need_reset) {
values[0] = pic->y[off_y];
values[1] = pic->u[off_uv];
values[2] = pic->v[off_uv];
need_reset = 0;
}
flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
} else {
need_reset = 1;
}
}
}
}
}
#undef SIZE
#undef SIZE2
//------------------------------------------------------------------------------
// Blend color and remove transparency info
#define BLEND(V0, V1, ALPHA) \
((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
#define BLEND_10BIT(V0, V1, ALPHA) \
((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
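// (x * 0x101) >> 16 cheaply approximates x / 255 (and >> 18 approximates
// x / 1020 in the 10-bit variant), avoiding a division per pixel; the result
// can be off by one at the very top of the range.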
void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
const int red = (background_rgb >> 16) & 0xff;
const int green = (background_rgb >> 8) & 0xff;
const int blue = (background_rgb >> 0) & 0xff;
int x, y;
if (pic == NULL) return;
if (!pic->use_argb) {
const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
// VP8RGBToU/V expects the u/v values summed over four pixels
const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
if (!has_alpha || pic->a == NULL) return; // nothing to do
for (y = 0; y < pic->height; ++y) {
// Luma blending
uint8_t* const y_ptr = pic->y + y * pic->y_stride;
uint8_t* const a_ptr = pic->a + y * pic->a_stride;
for (x = 0; x < pic->width; ++x) {
const int alpha = a_ptr[x];
if (alpha < 0xff) {
y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
}
}
// Chroma blending every even line
if ((y & 1) == 0) {
uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
uint8_t* const a_ptr2 =
(y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
for (x = 0; x < uv_width; ++x) {
// Average four alpha values into a single blending weight.
// TODO(skal): might lead to visible contouring. Can we do better?
const int alpha =
a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
u[x] = BLEND_10BIT(U0, u[x], alpha);
v[x] = BLEND_10BIT(V0, v[x], alpha);
}
if (pic->width & 1) { // rightmost pixel
const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
u[x] = BLEND_10BIT(U0, u[x], alpha);
v[x] = BLEND_10BIT(V0, v[x], alpha);
}
}
memset(a_ptr, 0xff, pic->width);
}
} else {
uint32_t* argb = pic->argb;
const uint32_t background = MakeARGB32(red, green, blue);
for (y = 0; y < pic->height; ++y) {
for (x = 0; x < pic->width; ++x) {
const int alpha = (argb[x] >> 24) & 0xff;
if (alpha != 0xff) {
if (alpha > 0) {
int r = (argb[x] >> 16) & 0xff;
int g = (argb[x] >> 8) & 0xff;
int b = (argb[x] >> 0) & 0xff;
r = BLEND(red, r, alpha);
g = BLEND(green, g, alpha);
b = BLEND(blue, b, alpha);
argb[x] = MakeARGB32(r, g, b);
} else {
argb[x] = background;
}
}
}
argb += pic->argb_stride;
}
}
}
#undef BLEND
#undef BLEND_10BIT
//------------------------------------------------------------------------------

285
drivers/webp/enc/token.c Normal file
View file

@ -0,0 +1,285 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Paginated token buffer
//
// A 'token' is a bit value associated with a probability, either fixed
// or to be determined later, once statistics have been collected.
// For dynamic probability, we just record the slot id (idx) for the probability
// value in the final probability array (uint8_t* probas in VP8EmitTokens).
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "./cost.h"
#include "./vp8enci.h"
#include "../utils/utils.h"
#if !defined(DISABLE_TOKEN_BUFFER)
// we use pages to reduce the number of memcpy()
#define MIN_PAGE_SIZE 8192 // minimum number of tokens per page
#define FIXED_PROBA_BIT (1u << 14)
typedef uint16_t token_t; // bit #15: bit value
// bit #14: flags for constant proba or idx
// bits #0..13: slot or constant proba
struct VP8Tokens {
VP8Tokens* next_; // pointer to next page
};
// Token data is located in memory just after the next_ field.
// This macro is used to return their address and hide the trick.
#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
//------------------------------------------------------------------------------
void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
b->tokens_ = NULL;
b->pages_ = NULL;
b->last_page_ = &b->pages_;
b->left_ = 0;
b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
b->error_ = 0;
}
void VP8TBufferClear(VP8TBuffer* const b) {
if (b != NULL) {
VP8Tokens* p = b->pages_;
while (p != NULL) {
VP8Tokens* const next = p->next_;
WebPSafeFree(p);
p = next;
}
VP8TBufferInit(b, b->page_size_);
}
}
static int TBufferNewPage(VP8TBuffer* const b) {
VP8Tokens* page = NULL;
if (!b->error_) {
const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
}
if (page == NULL) {
b->error_ = 1;
return 0;
}
page->next_ = NULL;
*b->last_page_ = page;
b->last_page_ = &page->next_;
b->left_ = b->page_size_;
b->tokens_ = (token_t*)TOKEN_DATA(page);
return 1;
}
//------------------------------------------------------------------------------
#define TOKEN_ID(t, b, ctx) \
(NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
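// e.g. the bit recorded for (type t, band b, context ctx) is coded with
// probas[TOKEN_ID(t, b, ctx) + i] for some i in [0, NUM_PROBAS); that offset
// is the 14-bit slot stored in each token and looked up again at emission.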
static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
uint32_t bit, uint32_t proba_idx) {
assert(proba_idx < FIXED_PROBA_BIT);
assert(bit <= 1);
if (b->left_ > 0 || TBufferNewPage(b)) {
const int slot = --b->left_;
b->tokens_[slot] = (bit << 15) | proba_idx;
}
return bit;
}
static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
uint32_t bit, uint32_t proba) {
assert(proba < 256);
assert(bit <= 1);
if (b->left_ > 0 || TBufferNewPage(b)) {
const int slot = --b->left_;
b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
}
}
int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
int first, int last,
const int16_t* const coeffs,
VP8TBuffer* const tokens) {
int n = first;
uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
if (!AddToken(tokens, last >= 0, base_id + 0)) {
return 0;
}
while (n < 16) {
const int c = coeffs[n++];
const int sign = c < 0;
const uint32_t v = sign ? -c : c;
if (!AddToken(tokens, v != 0, base_id + 1)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0); // ctx=0
continue;
}
if (!AddToken(tokens, v > 1, base_id + 2)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1); // ctx=1
} else {
if (!AddToken(tokens, v > 4, base_id + 3)) {
if (AddToken(tokens, v != 2, base_id + 4))
AddToken(tokens, v == 4, base_id + 5);
} else if (!AddToken(tokens, v > 10, base_id + 6)) {
if (!AddToken(tokens, v > 6, base_id + 7)) {
AddConstantToken(tokens, v == 6, 159);
} else {
AddConstantToken(tokens, v >= 9, 165);
AddConstantToken(tokens, !(v & 1), 145);
}
} else {
int mask;
const uint8_t* tab;
uint32_t residue = v - 3;
if (residue < (8 << 1)) { // VP8Cat3 (3b)
AddToken(tokens, 0, base_id + 8);
AddToken(tokens, 0, base_id + 9);
residue -= (8 << 0);
mask = 1 << 2;
tab = VP8Cat3;
} else if (residue < (8 << 2)) { // VP8Cat4 (4b)
AddToken(tokens, 0, base_id + 8);
AddToken(tokens, 1, base_id + 9);
residue -= (8 << 1);
mask = 1 << 3;
tab = VP8Cat4;
} else if (residue < (8 << 3)) { // VP8Cat5 (5b)
AddToken(tokens, 1, base_id + 8);
AddToken(tokens, 0, base_id + 10);
residue -= (8 << 2);
mask = 1 << 4;
tab = VP8Cat5;
} else { // VP8Cat6 (11b)
AddToken(tokens, 1, base_id + 8);
AddToken(tokens, 1, base_id + 10);
residue -= (8 << 3);
mask = 1 << 10;
tab = VP8Cat6;
}
while (mask) {
AddConstantToken(tokens, !!(residue & mask), *tab++);
mask >>= 1;
}
}
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2); // ctx=2
}
AddConstantToken(tokens, sign, 128);
if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
return 1; // EOB
}
}
return 1;
}
#undef TOKEN_ID
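// Editor's note on the category branches above, derived from the code: once
// the v > 10 test succeeds, residue = v - 3 is at least 8, and the value falls
// into one of the VP8 "large value" categories:
//   VP8Cat3: residue in [8,16)   -> v in [11,18],    3 extra bits
//   VP8Cat4: residue in [16,32)  -> v in [19,34],    4 extra bits
//   VP8Cat5: residue in [32,64)  -> v in [35,66],    5 extra bits
//   VP8Cat6: residue >= 64       -> v in [67,2114], 11 extra bits
// The extra bits are coded MSB-first, each with its own fixed probability
// taken from the corresponding VP8Cat* table (the 'while (mask)' loop).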
//------------------------------------------------------------------------------
// This function works, but isn't currently used. Saved for later.
#if 0
static void Record(int bit, proba_t* const stats) {
proba_t p = *stats;
if (p >= 0xffff0000u) { // an overflow is inbound.
p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2.
}
// record bit count (lower 16 bits) and increment total count (upper 16 bits).
p += 0x00010000u + bit;
*stats = p;
}
void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
const VP8Tokens* p = b->pages_;
while (p != NULL) {
const int N = (p->next_ == NULL) ? b->left_ : 0;
int n = b->page_size_;  // was MAX_NUM_TOKEN, stale since pages became variable-sized
const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
const token_t token = tokens[n];
if (!(token & FIXED_PROBA_BIT)) {
Record((token >> 15) & 1, stats + (token & 0x3fffu));
}
}
p = p->next_;
}
}
#endif // 0
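// Editor's note: each proba_t above packs two 16-bit counters -- total count
// in the upper half, count of '1' bits in the lower half. A hedged sketch of
// mapping one stat back to an observed probability (hypothetical helper,
// scaled to [0,255] as a probability of coding '0'):
#if 0
static int StatsToProba(proba_t p) {
  const int total = (int)(p >> 16);        // how many bits were recorded
  const int nb_ones = (int)(p & 0xffff);   // how many of them were 1
  return (total == 0) ? 255 : 255 - (255 * nb_ones) / total;
}
#endif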
//------------------------------------------------------------------------------
// Final coding pass, with known probabilities
int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
const uint8_t* const probas, int final_pass) {
const VP8Tokens* p = b->pages_;
assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
int n = b->page_size_;
const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
const token_t token = tokens[n];
const int bit = (token >> 15) & 1;
if (token & FIXED_PROBA_BIT) {
VP8PutBit(bw, bit, token & 0xffu); // constant proba
} else {
VP8PutBit(bw, bit, probas[token & 0x3fffu]);
}
}
if (final_pass) WebPSafeFree((void*)p);
p = next;
}
if (final_pass) b->pages_ = NULL;
return 1;
}
// Size estimation
size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
size_t size = 0;
const VP8Tokens* p = b->pages_;
assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
int n = b->page_size_;
const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
const token_t token = tokens[n];
const int bit = token & (1 << 15);
if (token & FIXED_PROBA_BIT) {
size += VP8BitCost(bit, token & 0xffu);
} else {
size += VP8BitCost(bit, probas[token & 0x3fffu]);
}
}
p = next;
}
return size;
}
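// Editor's note: a minimal sketch of the flow around the two functions above
// (hypothetical driver code; the real encoder does this across its passes):
#if 0
static void TokenPassSketch(VP8TBuffer* const tb, VP8BitWriter* const bw,
                            const uint8_t* const probas) {
  // tokens were recorded earlier with VP8RecordCoeffTokens()...
  // probe the cost of a candidate probability set without writing anything:
  const size_t cost = VP8EstimateTokenSize(tb, probas);
  (void)cost;
  // final pass: emit the bitstream for real and release the token pages.
  VP8EmitTokens(tb, bw, probas, 1 /* final_pass */);
}
#endif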
//------------------------------------------------------------------------------
#else // DISABLE_TOKEN_BUFFER
void VP8TBufferInit(VP8TBuffer* const b) {
(void)b;
}
void VP8TBufferClear(VP8TBuffer* const b) {
(void)b;
}
#endif // !DISABLE_TOKEN_BUFFER

drivers/webp/extras.h Normal file

@ -0,0 +1,51 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
#ifndef WEBP_WEBP_EXTRAS_H_
#define WEBP_WEBP_EXTRAS_H_
#include "./types.h"
#ifdef __cplusplus
extern "C" {
#endif
#include "./encode.h"
#define WEBP_EXTRAS_ABI_VERSION 0x0000 // MAJOR(8b) + MINOR(8b)
//------------------------------------------------------------------------------
// Returns the version number of the extras library, packed in hexadecimal using
// 8 bits for each of major/minor/revision. E.g.: v2.5.7 is 0x020507.
WEBP_EXTERN(int) WebPGetExtrasVersion(void);
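// Editor's note: a hedged sketch of unpacking that version number
// (hypothetical helper; assumes <stdio.h>):
#if 0
static void PrintExtrasVersion(void) {
  const int v = WebPGetExtrasVersion();
  printf("extras v%d.%d.%d\n", (v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff);
}
#endif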
//------------------------------------------------------------------------------
// Ad-hoc colorspace importers.
// Import luma samples (a grayscale image) into 'picture'. The 'picture'
// width and height must be set prior to calling this function.
WEBP_EXTERN(int) WebPImportGray(const uint8_t* gray, WebPPicture* picture);
// Import RGB samples in RGB565 packed format into 'picture'. The 'picture'
// width and height must be set prior to calling this function.
WEBP_EXTERN(int) WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);
// Import RGB samples in RGB4444 packed format into 'picture'. The 'picture'
// width and height must be set prior to calling this function.
WEBP_EXTERN(int) WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);
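// Editor's note: a sketch of the usual RGB565 packing these importers deal
// with, assuming the common layout (R: bits 15..11, G: 10..5, B: 4..0) in a
// host-order uint16_t; the byte order of the input stream is the importer's
// concern, so treat this purely as illustration:
#if 0
static void Rgb565ToRgb888(uint16_t px, uint8_t* r, uint8_t* g, uint8_t* b) {
  *r = (uint8_t)(((px >> 11) & 0x1f) << 3);   // 5 -> 8 bits
  *g = (uint8_t)(((px >>  5) & 0x3f) << 2);   // 6 -> 8 bits
  *b = (uint8_t)(( px        & 0x1f) << 3);   // 5 -> 8 bits
}
#endif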
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_WEBP_EXTRAS_H_ */

File diff suppressed because it is too large.

drivers/webp/mux_types.h Normal file

@ -0,0 +1,97 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Data-types common to the mux and demux libraries.
//
// Author: Urvang (urvang@google.com)
#ifndef WEBP_WEBP_MUX_TYPES_H_
#define WEBP_WEBP_MUX_TYPES_H_
#include <stdlib.h> // free()
#include <string.h> // memset()
#include "./types.h"
#ifdef __cplusplus
extern "C" {
#endif
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
// typedef enum WebPFeatureFlags WebPFeatureFlags;
// typedef enum WebPMuxAnimDispose WebPMuxAnimDispose;
// typedef enum WebPMuxAnimBlend WebPMuxAnimBlend;
typedef struct WebPData WebPData;
// VP8X Feature Flags.
typedef enum WebPFeatureFlags {
FRAGMENTS_FLAG = 0x00000001,
ANIMATION_FLAG = 0x00000002,
XMP_FLAG = 0x00000004,
EXIF_FLAG = 0x00000008,
ALPHA_FLAG = 0x00000010,
ICCP_FLAG = 0x00000020
} WebPFeatureFlags;
// Dispose method (animation only). Indicates how the area used by the current
// frame is to be treated before rendering the next frame on the canvas.
typedef enum WebPMuxAnimDispose {
WEBP_MUX_DISPOSE_NONE, // Do not dispose.
WEBP_MUX_DISPOSE_BACKGROUND // Dispose to background color.
} WebPMuxAnimDispose;
// Blend operation (animation only). Indicates how transparent pixels of the
// current frame are blended with those of the previous canvas.
typedef enum WebPMuxAnimBlend {
WEBP_MUX_BLEND, // Blend.
WEBP_MUX_NO_BLEND // Do not blend.
} WebPMuxAnimBlend;
// Data type used to describe 'raw' data, e.g., chunk data
// (ICC profile, metadata) and WebP compressed image data.
struct WebPData {
const uint8_t* bytes;
size_t size;
};
// Initializes the contents of the 'webp_data' object with default values.
static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
if (webp_data != NULL) {
memset(webp_data, 0, sizeof(*webp_data));
}
}
// Clears the contents of the 'webp_data' object by calling free(). Does not
// deallocate the object itself.
static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
if (webp_data != NULL) {
free((void*)webp_data->bytes);
WebPDataInit(webp_data);
}
}
// Allocates necessary storage for 'dst' and copies the contents of 'src'.
// Returns true on success.
static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
if (src == NULL || dst == NULL) return 0;
WebPDataInit(dst);
if (src->bytes != NULL && src->size != 0) {
dst->bytes = (uint8_t*)malloc(src->size);
if (dst->bytes == NULL) return 0;
memcpy((void*)dst->bytes, src->bytes, src->size);
dst->size = src->size;
}
return 1;
}
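// Editor's note: a minimal usage sketch for the three helpers above
// (hypothetical caller code):
#if 0
static int DuplicateData(const WebPData* const src) {
  WebPData copy;
  WebPDataInit(&copy);                       // start from a zeroed struct
  if (!WebPDataCopy(src, &copy)) return 0;   // deep copy via malloc()
  // ... use copy.bytes / copy.size ...
  WebPDataClear(&copy);                      // free() the bytes, re-init
  return 1;
}
#endif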
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_WEBP_MUX_TYPES_H_ */

drivers/webp/utils/bit_reader_inl.h Normal file

@ -0,0 +1,172 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Specific inlined methods for boolean decoder [VP8GetBit() ...]
// This file should be included by the .c sources that actually need to call
// these methods.
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_UTILS_BIT_READER_INL_H_
#define WEBP_UTILS_BIT_READER_INL_H_
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#endif
#ifdef WEBP_FORCE_ALIGNED
#include <string.h> // memcpy
#endif
#include "../dsp/dsp.h"
#include "./bit_reader.h"
#include "./endian_inl.h"
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Derived type lbit_t = natural type for memory I/O
#if (BITS > 32)
typedef uint64_t lbit_t;
#elif (BITS > 16)
typedef uint32_t lbit_t;
#elif (BITS > 8)
typedef uint16_t lbit_t;
#else
typedef uint8_t lbit_t;
#endif
extern const uint8_t kVP8Log2Range[128];
extern const uint8_t kVP8NewRange[128];
// special case for the tail byte-reading
void VP8LoadFinalBytes(VP8BitReader* const br);
//------------------------------------------------------------------------------
// Inlined critical functions
// makes sure br->value_ has at least BITS bits worth of data
static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
assert(br != NULL && br->buf_ != NULL);
// Read 'BITS' bits at a time if possible.
if (br->buf_ < br->buf_max_) {
// convert memory type to register type (with some zero'ing!)
bit_t bits;
#if defined(WEBP_FORCE_ALIGNED)
lbit_t in_bits;
memcpy(&in_bits, br->buf_, sizeof(in_bits));
#elif defined(WEBP_USE_MIPS32)
// This is needed because of unaligned reads.
lbit_t in_bits;
lbit_t* p_buf_ = (lbit_t*)br->buf_;
__asm__ volatile(
".set push \n\t"
".set at \n\t"
".set macro \n\t"
"ulw %[in_bits], 0(%[p_buf_]) \n\t"
".set pop \n\t"
: [in_bits]"=r"(in_bits)
: [p_buf_]"r"(p_buf_)
: "memory", "at"
);
#else
const lbit_t in_bits = *(const lbit_t*)br->buf_;
#endif
br->buf_ += BITS >> 3;
#if !defined(WORDS_BIGENDIAN)
#if (BITS > 32)
bits = BSwap64(in_bits);
bits >>= 64 - BITS;
#elif (BITS >= 24)
bits = BSwap32(in_bits);
bits >>= (32 - BITS);
#elif (BITS == 16)
bits = BSwap16(in_bits);
#else // BITS == 8
bits = (bit_t)in_bits;
#endif // BITS > 32
#else // WORDS_BIGENDIAN
bits = (bit_t)in_bits;
if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
#endif
br->value_ = bits | (br->value_ << BITS);
br->bits_ += BITS;
} else {
VP8LoadFinalBytes(br); // no need to be inlined
}
}
// Read a bit with proba 'prob'. Speed-critical function!
static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter the br->range_ value.
range_t range = br->range_;
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
{
const int pos = br->bits_;
const range_t split = (range * prob) >> 8;
const range_t value = (range_t)(br->value_ >> pos);
#if defined(__arm__) || defined(_M_ARM) // ARM-specific
const int bit = ((int)(split - value) >> 31) & 1;
if (value > split) {
range -= split + 1;
br->value_ -= (bit_t)(split + 1) << pos;
} else {
range = split;
}
#else // faster version on x86
int bit;  // Don't use 'const int bit = (value > split);', it's slower.
if (value > split) {
range -= split + 1;
br->value_ -= (bit_t)(split + 1) << pos;
bit = 1;
} else {
range = split;
bit = 0;
}
#endif
if (range <= (range_t)0x7e) {
const int shift = kVP8Log2Range[range];
range = kVP8NewRange[range];
br->bits_ -= shift;
}
br->range_ = range;
return bit;
}
}
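// Editor's note: a worked instance of the arithmetic above. With range = 254
// and prob = 128, split = (254 * 128) >> 8 = 127. If the value lies above the
// split, bit = 1 and range shrinks to 254 - 128 = 126; otherwise bit = 0 and
// range becomes 127. Once range drops to 0x7e or below, the kVP8Log2Range /
// kVP8NewRange tables renormalize it back into the working interval.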
// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
{
const int pos = br->bits_;
const range_t split = br->range_ >> 1;
const range_t value = (range_t)(br->value_ >> pos);
const int32_t mask = (int32_t)(split - value) >> 31; // -1 or 0
br->bits_ -= 1;
br->range_ += mask;
br->range_ |= 1;
br->value_ -= (bit_t)((split + 1) & mask) << pos;
return (v ^ mask) - mask;
}
}
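// Editor's note on the branchless trick above: 'mask' is the arithmetic right
// shift of (split - value), i.e. all ones (-1) when value > split and 0
// otherwise; '(v ^ mask) - mask' then negates v exactly when mask == -1.
// E.g. v = 5, mask = -1: (5 ^ -1) - (-1) == ~5 + 1 == -5.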
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_UTILS_BIT_READER_INL_H_

drivers/webp/utils/endian_inl.h Normal file

@ -0,0 +1,100 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Endian related functions.
#ifndef WEBP_UTILS_ENDIAN_INL_H_
#define WEBP_UTILS_ENDIAN_INL_H_
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#endif
#include "../dsp/dsp.h"
#include "../webp/types.h"
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
#if defined(WORDS_BIGENDIAN)
#define HToLE32 BSwap32
#define HToLE16 BSwap16
#else
#define HToLE32(x) (x)
#define HToLE16(x) (x)
#endif
#if !defined(HAVE_CONFIG_H)
#if LOCAL_GCC_PREREQ(4,8) || __has_builtin(__builtin_bswap16)
#define HAVE_BUILTIN_BSWAP16
#endif
#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap32)
#define HAVE_BUILTIN_BSWAP32
#endif
#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap64)
#define HAVE_BUILTIN_BSWAP64
#endif
#endif // !HAVE_CONFIG_H
static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
#if defined(HAVE_BUILTIN_BSWAP16)
return __builtin_bswap16(x);
#elif defined(_MSC_VER)
return _byteswap_ushort(x);
#else
// gcc will recognize a 'rorw $8, ...' here:
return (x >> 8) | ((x & 0xff) << 8);
#endif // HAVE_BUILTIN_BSWAP16
}
static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
#if defined(WEBP_USE_MIPS32_R2)
uint32_t ret;
__asm__ volatile (
"wsbh %[ret], %[x] \n\t"
"rotr %[ret], %[ret], 16 \n\t"
: [ret]"=r"(ret)
: [x]"r"(x)
);
return ret;
#elif defined(HAVE_BUILTIN_BSWAP32)
return __builtin_bswap32(x);
#elif defined(__i386__) || defined(__x86_64__)
uint32_t swapped_bytes;
__asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
return swapped_bytes;
#elif defined(_MSC_VER)
return (uint32_t)_byteswap_ulong(x);
#else
return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
#endif // HAVE_BUILTIN_BSWAP32
}
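// Editor's note: e.g. BSwap32(0x11223344) == 0x44332211. The masked-shift
// fallback in BSwap64 below applies the same idea in three stages: swap the
// 32-bit halves, then the 16-bit quarters, then the bytes.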
static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
#if defined(HAVE_BUILTIN_BSWAP64)
return __builtin_bswap64(x);
#elif defined(__x86_64__)
uint64_t swapped_bytes;
__asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
return swapped_bytes;
#elif defined(_MSC_VER)
return (uint64_t)_byteswap_uint64(x);
#else // generic code for swapping 64-bit values (suggested by bdb@)
x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
return x;
#endif // HAVE_BUILTIN_BSWAP64
}
#endif // WEBP_UTILS_ENDIAN_INL_H_

drivers/webp/utils/quant_levels_dec.c Normal file

@ -0,0 +1,279 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Implement gradient smoothing: we replace a current alpha value by its
// surrounding average if it's close enough (that is: the change will be less
// than the minimum distance between two quantized levels).
// We use a sliding window to compute the 2D moving average.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./quant_levels_dec.h"
#include <string.h> // for memset
#include "./utils.h"
// #define USE_DITHERING // uncomment to enable ordered dithering (not vital)
#define FIX 16 // fix-point precision for averaging
#define LFIX 2 // extra precision for look-up table
#define LUT_SIZE ((1 << (8 + LFIX)) - 1) // look-up table size
#if defined(USE_DITHERING)
#define DFIX 4 // extra precision for ordered dithering
#define DSIZE 4 // dithering size (must be a power of two)
// cf. http://en.wikipedia.org/wiki/Ordered_dithering
static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
{ 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
{ 12, 4, 14, 6 },
{ 3, 11, 1, 9 },
{ 15, 7, 13, 5 }
};
#else
#define DFIX 0
#endif
typedef struct {
int width_, height_; // dimension
int row_; // current input row being processed
uint8_t* src_; // input pointer
uint8_t* dst_; // output pointer
int radius_; // filter radius (=delay)
int scale_; // normalization factor, in FIX bits precision
void* mem_; // all memory
// various scratch buffers
uint16_t* start_;
uint16_t* cur_;
uint16_t* end_;
uint16_t* top_;
uint16_t* average_;
// input levels distribution
int num_levels_; // number of quantized levels
int min_, max_; // min and max level values
int min_level_dist_; // smallest distance between two consecutive levels
int16_t* correction_; // size = 1 + 2*LUT_SIZE -> ~4k memory
} SmoothParams;
//------------------------------------------------------------------------------
#define CLIP_MASK (int)(~0U << (8 + DFIX))
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
}
// vertical accumulation
static void VFilter(SmoothParams* const p) {
const uint8_t* src = p->src_;
const int w = p->width_;
uint16_t* const cur = p->cur_;
const uint16_t* const top = p->top_;
uint16_t* const out = p->end_;
uint16_t sum = 0; // all arithmetic is modulo 16-bit
int x;
for (x = 0; x < w; ++x) {
uint16_t new_value;
sum += src[x];
new_value = top[x] + sum;
out[x] = new_value - cur[x]; // vertical sum of 'r' pixels.
cur[x] = new_value;
}
// move input pointers one row down
p->top_ = p->cur_;
p->cur_ += w;
if (p->cur_ == p->end_) p->cur_ = p->start_; // roll-over
// We replicate edges, as it's somewhat simpler as a boundary condition.
// That's why we don't update the 'src' pointer on the top/bottom rows:
if (p->row_ >= 0 && p->row_ < p->height_ - 1) {
p->src_ += p->width_;
}
}
// horizontal accumulation. We use mirror replication of missing pixels, as it's
// a little easier to implement (surprisingly).
static void HFilter(SmoothParams* const p) {
const uint16_t* const in = p->end_;
uint16_t* const out = p->average_;
const uint32_t scale = p->scale_;
const int w = p->width_;
const int r = p->radius_;
int x;
for (x = 0; x <= r; ++x) { // left mirroring
const uint16_t delta = in[x + r - 1] + in[r - x];
out[x] = (delta * scale) >> FIX;
}
for (; x < w - r; ++x) { // bulk middle run
const uint16_t delta = in[x + r] - in[x - r - 1];
out[x] = (delta * scale) >> FIX;
}
for (; x < w; ++x) { // right mirroring
const uint16_t delta =
2 * in[w - 1] - in[2 * w - 2 - r - x] - in[x - r - 1];
out[x] = (delta * scale) >> FIX;
}
}
// emit one filtered output row
static void ApplyFilter(SmoothParams* const p) {
const uint16_t* const average = p->average_;
const int w = p->width_;
const int16_t* const correction = p->correction_;
#if defined(USE_DITHERING)
const uint8_t* const dither = kOrderedDither[p->row_ % DSIZE];
#endif
uint8_t* const dst = p->dst_;
int x;
for (x = 0; x < w; ++x) {
const int v = dst[x];
if (v < p->max_ && v > p->min_) {
const int c = (v << DFIX) + correction[average[x] - (v << LFIX)];
#if defined(USE_DITHERING)
dst[x] = clip_8b(c + dither[x % DSIZE]);
#else
dst[x] = clip_8b(c);
#endif
}
}
p->dst_ += w; // advance output pointer
}
//------------------------------------------------------------------------------
// Initialize correction table
static void InitCorrectionLUT(int16_t* const lut, int min_dist) {
// The correction curve is:
// f(x) = x for x <= threshold2
// f(x) = 0 for x >= threshold1
// and a linear interpolation for range x=[threshold2, threshold1]
// (along with f(-x) = -f(x) symmetry).
// Note that threshold2 = 3/4 * threshold1.
const int threshold1 = min_dist << LFIX;
const int threshold2 = (3 * threshold1) >> 2;
const int max_threshold = threshold2 << DFIX;
const int delta = threshold1 - threshold2;
int i;
for (i = 1; i <= LUT_SIZE; ++i) {
int c = (i <= threshold2) ? (i << DFIX)
: (i < threshold1) ? max_threshold * (threshold1 - i) / delta
: 0;
c >>= LFIX;
lut[+i] = +c;
lut[-i] = -c;
}
lut[0] = 0;
}
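// Editor's note: a worked instance of the curve above. With min_dist = 8 and
// LFIX = 2: threshold1 = 32 and threshold2 = 24, so f(i) = i for i <= 24,
// falls linearly from 24 down to 0 over i in (24, 32), and is 0 for i >= 32
// (all in LFIX-scaled units, before the final '>> LFIX').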
static void CountLevels(const uint8_t* const data, int size,
SmoothParams* const p) {
int i, last_level;
uint8_t used_levels[256] = { 0 };
p->min_ = 255;
p->max_ = 0;
for (i = 0; i < size; ++i) {
const int v = data[i];
if (v < p->min_) p->min_ = v;
if (v > p->max_) p->max_ = v;
used_levels[v] = 1;
}
// Compute the minimum distance between two consecutive used levels.
p->min_level_dist_ = p->max_ - p->min_;
last_level = -1;
for (i = 0; i < 256; ++i) {
if (used_levels[i]) {
++p->num_levels_;
if (last_level >= 0) {
const int level_dist = i - last_level;
if (level_dist < p->min_level_dist_) {
p->min_level_dist_ = level_dist;
}
}
last_level = i;
}
}
}
// Initialize all params.
static int InitParams(uint8_t* const data, int width, int height,
int radius, SmoothParams* const p) {
const int R = 2 * radius + 1; // total size of the kernel
const size_t size_scratch_m = (R + 1) * width * sizeof(*p->start_);
const size_t size_m = width * sizeof(*p->average_);
const size_t size_lut = (1 + 2 * LUT_SIZE) * sizeof(*p->correction_);
const size_t total_size = size_scratch_m + size_m + size_lut;
uint8_t* mem = (uint8_t*)WebPSafeMalloc(1U, total_size);
if (mem == NULL) return 0;
p->mem_ = (void*)mem;
p->start_ = (uint16_t*)mem;
p->cur_ = p->start_;
p->end_ = p->start_ + R * width;
p->top_ = p->end_ - width;
memset(p->top_, 0, width * sizeof(*p->top_));
mem += size_scratch_m;
p->average_ = (uint16_t*)mem;
mem += size_m;
p->width_ = width;
p->height_ = height;
p->src_ = data;
p->dst_ = data;
p->radius_ = radius;
p->scale_ = (1 << (FIX + LFIX)) / (R * R); // normalization constant
p->row_ = -radius;
// analyze the input distribution so we can best-fit the threshold
CountLevels(data, width * height, p);
// correction table
p->correction_ = ((int16_t*)mem) + LUT_SIZE;
InitCorrectionLUT(p->correction_, p->min_level_dist_);
return 1;
}
static void CleanupParams(SmoothParams* const p) {
WebPSafeFree(p->mem_);
}
int WebPDequantizeLevels(uint8_t* const data, int width, int height,
int strength) {
const int radius = 4 * strength / 100;
if (strength < 0 || strength > 100) return 0;
if (data == NULL || width <= 0 || height <= 0) return 0; // bad params
if (radius > 0) {
SmoothParams p;
memset(&p, 0, sizeof(p));
if (!InitParams(data, width, height, radius, &p)) return 0;
if (p.num_levels_ > 2) {
for (; p.row_ < p.height_; ++p.row_) {
VFilter(&p); // accumulate average of input
// Need to wait a few rows in order to prime the filter
// before emitting some output.
if (p.row_ >= p.radius_) {
HFilter(&p);
ApplyFilter(&p);
}
}
}
CleanupParams(&p);
}
return 1;
}
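// Editor's note: a minimal usage sketch (hypothetical caller): smooth an
// 8-bit alpha plane that was quantized down to a few levels.
#if 0
static void SmoothAlphaSketch(uint8_t* alpha, int w, int h) {
  // strength is in [0..100]; 100 yields the maximum filter radius (4).
  if (!WebPDequantizeLevels(alpha, w, h, 80)) {
    // NULL data, bad dimensions, or out-of-range strength.
  }
}
#endif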

drivers/webp/utils/quant_levels_dec.h Normal file

@ -0,0 +1,35 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha plane de-quantization utility
//
// Author: Vikas Arora (vikasa@google.com)
#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
#define WEBP_UTILS_QUANT_LEVELS_DEC_H_
#include "../webp/types.h"
#ifdef __cplusplus
extern "C" {
#endif
// Apply post-processing to input 'data' of size 'width'x'height' assuming that
// the source was quantized to a reduced number of levels.
// Strength is in [0..100] and controls the amount of dithering applied.
// Returns false in case of error (data is NULL, invalid parameters,
// malloc failure, ...).
int WebPDequantizeLevels(uint8_t* const data, int width, int height,
int strength);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */

drivers/webp/utils/random.c Normal file

@ -0,0 +1,43 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Pseudo-random utilities
//
// Author: Skal (pascal.massimino@gmail.com)
#include <string.h>
#include "./random.h"
//------------------------------------------------------------------------------
// 31b-range values
static const uint32_t kRandomTable[VP8_RANDOM_TABLE_SIZE] = {
0x0de15230, 0x03b31886, 0x775faccb, 0x1c88626a, 0x68385c55, 0x14b3b828,
0x4a85fef8, 0x49ddb84b, 0x64fcf397, 0x5c550289, 0x4a290000, 0x0d7ec1da,
0x5940b7ab, 0x5492577d, 0x4e19ca72, 0x38d38c69, 0x0c01ee65, 0x32a1755f,
0x5437f652, 0x5abb2c32, 0x0faa57b1, 0x73f533e7, 0x685feeda, 0x7563cce2,
0x6e990e83, 0x4730a7ed, 0x4fc0d9c6, 0x496b153c, 0x4f1403fa, 0x541afb0c,
0x73990b32, 0x26d7cb1c, 0x6fcc3706, 0x2cbb77d8, 0x75762f2a, 0x6425ccdd,
0x24b35461, 0x0a7d8715, 0x220414a8, 0x141ebf67, 0x56b41583, 0x73e502e3,
0x44cab16f, 0x28264d42, 0x73baaefb, 0x0a50ebed, 0x1d6ab6fb, 0x0d3ad40b,
0x35db3b68, 0x2b081e83, 0x77ce6b95, 0x5181e5f0, 0x78853bbc, 0x009f9494,
0x27e5ed3c
};
void VP8InitRandom(VP8Random* const rg, float dithering) {
memcpy(rg->tab_, kRandomTable, sizeof(rg->tab_));
rg->index1_ = 0;
rg->index2_ = 31;
rg->amp_ = (dithering < 0.0) ? 0
: (dithering > 1.0) ? (1 << VP8_RANDOM_DITHER_FIX)
: (uint32_t)((1 << VP8_RANDOM_DITHER_FIX) * dithering);
}
//------------------------------------------------------------------------------

drivers/webp/utils/random.h Normal file

@ -0,0 +1,63 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Pseudo-random utilities
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_UTILS_RANDOM_H_
#define WEBP_UTILS_RANDOM_H_
#include <assert.h>
#include "../webp/types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VP8_RANDOM_DITHER_FIX 8 // fixed-point precision for dithering
#define VP8_RANDOM_TABLE_SIZE 55
typedef struct {
int index1_, index2_;
uint32_t tab_[VP8_RANDOM_TABLE_SIZE];
int amp_;
} VP8Random;
// Initializes the random generator with an amplitude 'dithering' in range [0..1].
void VP8InitRandom(VP8Random* const rg, float dithering);
// Returns a centered pseudo-random number with 'num_bits' amplitude.
// (uses D. Knuth's difference-based random generator).
// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
int amp) {
int diff;
assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
if (diff < 0) diff += (1u << 31);
rg->tab_[rg->index1_] = diff;
if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
// sign-extend, 0-center
diff = (int)((uint32_t)diff << 1) >> (32 - num_bits);
diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX; // restrict range
diff += 1 << (num_bits - 1); // shift back to 0.5-center
return diff;
}
static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
return VP8RandomBits2(rg, num_bits, rg->amp_);
}
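// Editor's note: a short usage sketch (hypothetical caller) drawing a dither
// offset in [0, 16), centered around 8:
#if 0
static void DitherSketch(void) {
  VP8Random rg;
  VP8InitRandom(&rg, 1.0f);              // full-strength dithering
  const int d = VP8RandomBits(&rg, 4);   // 4-bit amplitude
  (void)d;
}
#endif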
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* WEBP_UTILS_RANDOM_H_ */