Merge pull request #55136 from akien-mga/basisu-646a9f826
This commit is contained in:
commit
a74e2c5058
10 changed files with 52 additions and 62 deletions
2
thirdparty/README.md
vendored
2
thirdparty/README.md
vendored
|
@ -8,7 +8,7 @@ readability.
|
||||||
## basis_universal
|
## basis_universal
|
||||||
|
|
||||||
- Upstream: https://github.com/BinomialLLC/basis_universal
|
- Upstream: https://github.com/BinomialLLC/basis_universal
|
||||||
- Version: git (ba1c3e40f1d434ebaf9a167b44e9b11d2bf0f765, 2021)
|
- Version: git (646a9f826131cb0b9e14b5e4740874808315f83a, 2021)
|
||||||
- License: Apache 2.0
|
- License: Apache 2.0
|
||||||
|
|
||||||
Files extracted from upstream source:
|
Files extracted from upstream source:
|
||||||
|
|
4
thirdparty/basis_universal/encoder/apg_bmp.c
vendored
4
thirdparty/basis_universal/encoder/apg_bmp.c
vendored
|
@ -247,7 +247,7 @@ unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int*
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocate memory for the output pixels block. cast to size_t in case width and height are both the max of 65536 and n_dst_chans > 1
|
// allocate memory for the output pixels block. cast to size_t in case width and height are both the max of 65536 and n_dst_chans > 1
|
||||||
unsigned char* dst_img_ptr = malloc( (size_t)width * (size_t)height * (size_t)n_dst_chans );
|
unsigned char* dst_img_ptr = (unsigned char*)malloc( (size_t)width * (size_t)height * (size_t)n_dst_chans );
|
||||||
if ( !dst_img_ptr ) {
|
if ( !dst_img_ptr ) {
|
||||||
free( record.data );
|
free( record.data );
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -480,7 +480,7 @@ unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int
|
||||||
dib_hdr.bitmask_b = 0x0000FF00;
|
dib_hdr.bitmask_b = 0x0000FF00;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t* dst_pixels_ptr = malloc( dst_pixels_padded_sz );
|
uint8_t* dst_pixels_ptr = (uint8_t*)malloc( dst_pixels_padded_sz );
|
||||||
if ( !dst_pixels_ptr ) { return 0; }
|
if ( !dst_pixels_ptr ) { return 0; }
|
||||||
{
|
{
|
||||||
size_t dst_byte_idx = 0;
|
size_t dst_byte_idx = 0;
|
||||||
|
|
|
@ -174,9 +174,8 @@ static void astc_init()
|
||||||
} // range
|
} // range
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uint32_t astc_interpolate(uint32_t l, uint32_t h, uint32_t w)
|
static inline uint32_t astc_interpolate_linear(uint32_t l, uint32_t h, uint32_t w)
|
||||||
{
|
{
|
||||||
// This is for linear values, not sRGB.
|
|
||||||
l = (l << 8) | l;
|
l = (l << 8) | l;
|
||||||
h = (h << 8) | h;
|
h = (h << 8) | h;
|
||||||
uint32_t k = (l * (64 - w) + h * w + 32) >> 6;
|
uint32_t k = (l * (64 - w) + h * w + 32) >> 6;
|
||||||
|
@ -230,7 +229,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = (h << 4) | h;
|
uint32_t high = (h << 4) | h;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -259,7 +258,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = (h << 4) | h;
|
uint32_t high = (h << 4) | h;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -288,7 +287,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = g_astc_sorted_order_unquant[7][h].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[7][h].m_unquant;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -317,7 +316,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = g_astc_sorted_order_unquant[13][h].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[13][h].m_unquant;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_astc_weights4[BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_astc_weights4[BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -346,7 +345,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = g_astc_sorted_order_unquant[13][h].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[13][h].m_unquant;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -375,7 +374,7 @@ void bc7enc_compress_block_init()
|
||||||
{
|
{
|
||||||
uint32_t high = g_astc_sorted_order_unquant[11][h].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[11][h].m_unquant;
|
||||||
|
|
||||||
const int k = astc_interpolate(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
|
const int k = astc_interpolate_linear(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
|
||||||
const int err = (k - c) * (k - c);
|
const int err = (k - c) * (k - c);
|
||||||
|
|
||||||
if (err < best.m_error)
|
if (err < best.m_error)
|
||||||
|
@ -650,7 +649,7 @@ static uint64_t pack_astc_4bit_3bit_to_one_color(const color_cell_compressor_par
|
||||||
uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i];
|
uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i];
|
||||||
uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i];
|
uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i];
|
||||||
|
|
||||||
p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
|
p.m_c[i] = (uint8_t)astc_interpolate_linear(low, high, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
|
||||||
}
|
}
|
||||||
p.m_c[3] = 255;
|
p.m_c[3] = 255;
|
||||||
|
|
||||||
|
@ -689,7 +688,7 @@ static uint64_t pack_astc_4bit_2bit_to_one_color_rgba(const color_cell_compresso
|
||||||
uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i];
|
uint32_t low = (pResults->m_low_endpoint.m_c[i] << 4) | pResults->m_low_endpoint.m_c[i];
|
||||||
uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i];
|
uint32_t high = (pResults->m_high_endpoint.m_c[i] << 4) | pResults->m_high_endpoint.m_c[i];
|
||||||
|
|
||||||
p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
|
p.m_c[i] = (uint8_t)astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t total_err = 0;
|
uint64_t total_err = 0;
|
||||||
|
@ -728,7 +727,7 @@ static uint64_t pack_astc_range7_2bit_to_one_color(const color_cell_compressor_p
|
||||||
uint32_t low = g_astc_sorted_order_unquant[7][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
uint32_t low = g_astc_sorted_order_unquant[7][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
||||||
uint32_t high = g_astc_sorted_order_unquant[7][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[7][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
||||||
|
|
||||||
p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
|
p.m_c[i] = (uint8_t)astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
|
||||||
}
|
}
|
||||||
p.m_c[3] = 255;
|
p.m_c[3] = 255;
|
||||||
|
|
||||||
|
@ -768,7 +767,7 @@ static uint64_t pack_astc_range13_2bit_to_one_color(const color_cell_compressor_
|
||||||
uint32_t low = g_astc_sorted_order_unquant[13][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
uint32_t low = g_astc_sorted_order_unquant[13][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
||||||
uint32_t high = g_astc_sorted_order_unquant[13][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[13][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
||||||
|
|
||||||
p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
|
p.m_c[i] = (uint8_t)astc_interpolate_linear(low, high, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t total_err = 0;
|
uint64_t total_err = 0;
|
||||||
|
@ -807,7 +806,7 @@ static uint64_t pack_astc_range11_5bit_to_one_color(const color_cell_compressor_
|
||||||
uint32_t low = g_astc_sorted_order_unquant[11][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
uint32_t low = g_astc_sorted_order_unquant[11][pResults->m_low_endpoint.m_c[i]].m_unquant;
|
||||||
uint32_t high = g_astc_sorted_order_unquant[11][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
uint32_t high = g_astc_sorted_order_unquant[11][pResults->m_high_endpoint.m_c[i]].m_unquant;
|
||||||
|
|
||||||
p.m_c[i] = (uint8_t)astc_interpolate(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
|
p.m_c[i] = (uint8_t)astc_interpolate_linear(low, high, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t total_err = 0;
|
uint64_t total_err = 0;
|
||||||
|
@ -863,7 +862,7 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8
|
||||||
for (uint32_t i = 1; i < (N - 1); i++)
|
for (uint32_t i = 1; i < (N - 1); i++)
|
||||||
{
|
{
|
||||||
for (uint32_t j = 0; j < nc; j++)
|
for (uint32_t j = 0; j < nc; j++)
|
||||||
weightedColors[i].m_c[j] = (uint8_t)(astc_interpolate(actualMinColor.m_c[j], actualMaxColor.m_c[j], pParams->m_pSelector_weights[i]));
|
weightedColors[i].m_c[j] = (uint8_t)(astc_interpolate_linear(actualMinColor.m_c[j], actualMaxColor.m_c[j], pParams->m_pSelector_weights[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1300,7 +1299,7 @@ void check_best_overall_error(const color_cell_compressor_params *pParams, color
|
||||||
|
|
||||||
for (uint32_t i = 1; i < pParams->m_num_selector_weights - 1; i++)
|
for (uint32_t i = 1; i < pParams->m_num_selector_weights - 1; i++)
|
||||||
for (uint32_t c = 0; c < 4; c++)
|
for (uint32_t c = 0; c < 4; c++)
|
||||||
colors[i].m_c[c] = (uint8_t)astc_interpolate(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]);
|
colors[i].m_c[c] = (uint8_t)astc_interpolate_linear(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]);
|
||||||
|
|
||||||
uint64_t total_err = 0;
|
uint64_t total_err = 0;
|
||||||
for (uint32_t p = 0; p < pParams->m_num_pixels; p++)
|
for (uint32_t p = 0; p < pParams->m_num_pixels; p++)
|
||||||
|
@ -1815,10 +1814,10 @@ uint64_t color_cell_compression_est_astc(
|
||||||
weightedColors[num_weights - 1] = highColor;
|
weightedColors[num_weights - 1] = highColor;
|
||||||
for (uint32_t i = 1; i < (num_weights - 1); i++)
|
for (uint32_t i = 1; i < (num_weights - 1); i++)
|
||||||
{
|
{
|
||||||
weightedColors[i].m_c[0] = (uint8_t)astc_interpolate(lowColor.m_c[0], highColor.m_c[0], pWeight_table[i]);
|
weightedColors[i].m_c[0] = (uint8_t)astc_interpolate_linear(lowColor.m_c[0], highColor.m_c[0], pWeight_table[i]);
|
||||||
weightedColors[i].m_c[1] = (uint8_t)astc_interpolate(lowColor.m_c[1], highColor.m_c[1], pWeight_table[i]);
|
weightedColors[i].m_c[1] = (uint8_t)astc_interpolate_linear(lowColor.m_c[1], highColor.m_c[1], pWeight_table[i]);
|
||||||
weightedColors[i].m_c[2] = (uint8_t)astc_interpolate(lowColor.m_c[2], highColor.m_c[2], pWeight_table[i]);
|
weightedColors[i].m_c[2] = (uint8_t)astc_interpolate_linear(lowColor.m_c[2], highColor.m_c[2], pWeight_table[i]);
|
||||||
weightedColors[i].m_c[3] = (num_comps == 4) ? (uint8_t)astc_interpolate(lowColor.m_c[3], highColor.m_c[3], pWeight_table[i]) : 255;
|
weightedColors[i].m_c[3] = (num_comps == 4) ? (uint8_t)astc_interpolate_linear(lowColor.m_c[3], highColor.m_c[3], pWeight_table[i]) : 255;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute dots and thresholds
|
// Compute dots and thresholds
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
#pragma once
|
||||||
#include "basisu_enc.h"
|
#include "basisu_enc.h"
|
||||||
#include "../transcoder/basisu_transcoder_uastc.h"
|
#include "../transcoder/basisu_transcoder_uastc.h"
|
||||||
|
|
||||||
|
|
|
@ -467,7 +467,10 @@ namespace basisu
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m_params.m_status_output)
|
||||||
|
{
|
||||||
printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height());
|
printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height());
|
||||||
|
}
|
||||||
|
|
||||||
// Optionally load another image and put a grayscale version of it into the alpha channel.
|
// Optionally load another image and put a grayscale version of it into the alpha channel.
|
||||||
if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size()))
|
if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size()))
|
||||||
|
@ -1427,8 +1430,11 @@ namespace basisu
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m_params.m_status_output)
|
||||||
|
{
|
||||||
printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str());
|
printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t comp_size = 0;
|
size_t comp_size = 0;
|
||||||
if ((m_params.m_compute_stats) && (m_params.m_uastc) && (comp_data.size()))
|
if ((m_params.m_compute_stats) && (m_params.m_uastc) && (comp_data.size()))
|
||||||
|
|
|
@ -195,7 +195,7 @@ namespace basisu
|
||||||
{
|
{
|
||||||
QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(pTicks));
|
QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(pTicks));
|
||||||
}
|
}
|
||||||
#elif defined(__APPLE__)
|
#elif defined(__APPLE__) || defined(__OpenBSD__)
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
inline void query_counter(timer_ticks* pTicks)
|
inline void query_counter(timer_ticks* pTicks)
|
||||||
{
|
{
|
||||||
|
@ -1779,8 +1779,6 @@ namespace basisu
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t bytes_per_line = hdr.m_width * tga_bytes_per_pixel;
|
|
||||||
|
|
||||||
const uint8_t *pSrc = pBuf + sizeof(tga_header);
|
const uint8_t *pSrc = pBuf + sizeof(tga_header);
|
||||||
uint32_t bytes_remaining = buf_size - sizeof(tga_header);
|
uint32_t bytes_remaining = buf_size - sizeof(tga_header);
|
||||||
|
|
||||||
|
|
10
thirdparty/basis_universal/encoder/basisu_enc.h
vendored
10
thirdparty/basis_universal/encoder/basisu_enc.h
vendored
|
@ -1634,6 +1634,14 @@ namespace basisu
|
||||||
|
|
||||||
if ((!l_weight) || (!r_weight))
|
if ((!l_weight) || (!r_weight))
|
||||||
{
|
{
|
||||||
|
l_children.resize(0);
|
||||||
|
new_l_child.set(0.0f);
|
||||||
|
l_ttsum = 0.0f;
|
||||||
|
l_weight = 0;
|
||||||
|
r_children.resize(0);
|
||||||
|
new_r_child.set(0.0f);
|
||||||
|
r_ttsum = 0.0f;
|
||||||
|
r_weight = 0;
|
||||||
TrainingVectorType firstVec;
|
TrainingVectorType firstVec;
|
||||||
for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
|
for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
|
||||||
{
|
{
|
||||||
|
@ -1660,7 +1668,7 @@ namespace basisu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!l_weight)
|
if ((!l_weight) || (!r_weight))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,14 +15,6 @@
|
||||||
#include "basisu_resampler.h"
|
#include "basisu_resampler.h"
|
||||||
#include "basisu_resampler_filters.h"
|
#include "basisu_resampler_filters.h"
|
||||||
|
|
||||||
#ifndef max
|
|
||||||
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef min
|
|
||||||
#define min(a, b) (((a) < (b)) ? (a) : (b))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define RESAMPLER_DEBUG 0
|
#define RESAMPLER_DEBUG 0
|
||||||
|
|
||||||
namespace basisu
|
namespace basisu
|
||||||
|
|
32
thirdparty/basis_universal/encoder/cppspmd_sse.h
vendored
32
thirdparty/basis_universal/encoder/cppspmd_sse.h
vendored
|
@ -1327,33 +1327,15 @@ struct spmd_kernel
|
||||||
CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
|
CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
|
||||||
{
|
{
|
||||||
__m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
|
__m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
|
||||||
|
__m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
|
||||||
//#if CPPSPMD_SSE2
|
return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp));
|
||||||
#if 1
|
|
||||||
// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
|
|
||||||
__m128 shuf = _mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(2, 3, 0, 1));
|
|
||||||
__m128 sums = _mm_add_ps(k3210, shuf);
|
|
||||||
shuf = _mm_movehl_ps(shuf, sums);
|
|
||||||
sums = _mm_add_ss(sums, shuf);
|
|
||||||
return _mm_cvtss_f32(sums);
|
|
||||||
#else
|
|
||||||
// This is pretty slow.
|
|
||||||
__m128 a = _mm_hadd_ps(k3210, k3210);
|
|
||||||
__m128 b = _mm_hadd_ps(a, a);
|
|
||||||
return extractf_ps_x(b);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CPPSPMD_FORCE_INLINE int reduce_add(vint v)
|
CPPSPMD_FORCE_INLINE int reduce_add(vint v)
|
||||||
{
|
{
|
||||||
__m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
|
__m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
|
||||||
|
__m128i temp = _mm_add_epi32(_mm_shuffle_epi32(k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
|
||||||
// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
|
return extract_x(_mm_add_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(temp), _mm_castsi128_ps(temp))), temp));
|
||||||
__m128i shuf = _mm_shuffle_epi32(k3210, _MM_SHUFFLE(2, 3, 0, 1));
|
|
||||||
__m128i sums = _mm_add_epi32(k3210, shuf);
|
|
||||||
shuf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(shuf), _mm_castsi128_ps(sums)));
|
|
||||||
sums = _mm_add_epi32(sums, shuf);
|
|
||||||
return extract_x(sums);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#include "cppspmd_math_declares.h"
|
#include "cppspmd_math_declares.h"
|
||||||
|
@ -1686,6 +1668,12 @@ CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b
|
||||||
CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
|
CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
|
||||||
CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }
|
CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }
|
||||||
|
|
||||||
|
CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
|
||||||
|
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
|
||||||
|
|
||||||
|
CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
|
||||||
|
CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
|
||||||
|
|
||||||
// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
|
// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
|
||||||
#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
|
#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
|
||||||
|
|
||||||
|
|
|
@ -10778,8 +10778,6 @@ namespace basist
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0;
|
|
||||||
|
|
||||||
if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2)
|
if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2)
|
||||||
{
|
{
|
||||||
BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n");
|
BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n");
|
||||||
|
@ -17336,7 +17334,6 @@ namespace basist
|
||||||
|
|
||||||
bool ktx2_transcoder::decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data)
|
bool ktx2_transcoder::decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data)
|
||||||
{
|
{
|
||||||
const uint8_t* pComp_data = m_levels[level_index].m_byte_offset + m_pData;
|
|
||||||
const uint64_t comp_size = m_levels[level_index].m_byte_length;
|
const uint64_t comp_size = m_levels[level_index].m_byte_length;
|
||||||
|
|
||||||
const uint64_t uncomp_size = m_levels[level_index].m_uncompressed_byte_length;
|
const uint64_t uncomp_size = m_levels[level_index].m_uncompressed_byte_length;
|
||||||
|
@ -17361,6 +17358,7 @@ namespace basist
|
||||||
if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD)
|
if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD)
|
||||||
{
|
{
|
||||||
#if BASISD_SUPPORT_KTX2_ZSTD
|
#if BASISD_SUPPORT_KTX2_ZSTD
|
||||||
|
const uint8_t* pComp_data = m_levels[level_index].m_byte_offset + m_pData;
|
||||||
size_t actualUncompSize = ZSTD_decompress(uncomp_data.data(), (size_t)uncomp_size, pComp_data, (size_t)comp_size);
|
size_t actualUncompSize = ZSTD_decompress(uncomp_data.data(), (size_t)uncomp_size, pComp_data, (size_t)comp_size);
|
||||||
if (ZSTD_isError(actualUncompSize))
|
if (ZSTD_isError(actualUncompSize))
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue