Merge pull request #48455 from JFonS/3.x_embree_aarch64

[3.x] Switch to embree-aarch64
This commit is contained in:
Rémi Verschelde 2021-05-05 15:01:18 +02:00 committed by GitHub
commit b8d198eeed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
97 changed files with 6063 additions and 1294 deletions


@ -135,6 +135,11 @@ Copyright: 2018, Eric Lasota
2018, Microsoft Corp.
License: Expat
Files: ./thirdparty/embree/
Comment: Embree
Copyright: 2009-2021 Intel Corporation
License: Apache-2.0
Files: ./thirdparty/enet/
Comment: ENet
Copyright: 2002-2020, Lee Salzman


@ -6,23 +6,13 @@ def can_build(env, platform):
# `can_build()` for that module, so we need to duplicate that code as a short-term
# solution.
# Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are
# not supported.
# It's also only relevant for tools build and desktop platforms,
# as doing lightmap generation on Android or HTML5 would be a bit far-fetched.
supported_platform = platform in ["x11", "osx", "windows", "server"]
supported_bits = env["bits"] == "64"
supported_arch = env["arch"] != "arm64"
if platform == "android":
return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
# Hack to disable on Linux arm64. This won't work well for cross-compilation (checks
# host, not target) and would need a more thorough fix by refactoring our arch and
# bits-handling code.
from platform import machine
if platform in ["javascript", "server"]:
return False
if platform == "x11" and machine() != "x86_64":
supported_arch = False
return supported_platform and supported_bits and supported_arch
return True
def configure(env):


@ -70,25 +70,19 @@ if env["builtin_embree"]:
thirdparty_sources = [thirdparty_dir + file for file in embree_src]
env_raycast.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "include"])
env_raycast.Append(
CPPDEFINES=[
"EMBREE_TARGET_SSE2",
"EMBREE_LOWEST_ISA",
"TASKING_INTERNAL",
"NDEBUG",
"__SSE2__",
"__SSE__",
]
)
env_raycast.Append(CPPDEFINES=["EMBREE_TARGET_SSE2", "EMBREE_LOWEST_ISA", "TASKING_INTERNAL", "NDEBUG"])
if not env.msvc:
env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"])
if env["arch"] in ["x86", "x86_64"]:
env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"])
if env["platform"] == "windows":
env_raycast.Append(CPPFLAGS=["-mstackrealign"])
if env["platform"] == "windows":
if env.msvc:
env.Append(LINKFLAGS=["psapi.lib"])
env_raycast.Append(CPPDEFINES=["__SSE2__", "__SSE__"])
else:
env.Append(LIBS=["psapi"])


@ -1,21 +1,14 @@
def can_build(env, platform):
# Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are
# not supported.
# It's also only relevant for tools build and desktop platforms,
# as doing lightmap generation on Android or HTML5 would be a bit far-fetched.
supported_platform = platform in ["x11", "osx", "windows", "server"]
supported_bits = env["bits"] == "64"
supported_arch = env["arch"] != "arm64"
if not env["tools"]:
return False
# Hack to disable on Linux arm64. This won't work well for cross-compilation (checks
# host, not target) and would need a more thorough fix by refactoring our arch and
# bits-handling code.
from platform import machine
if platform == "android":
return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
if platform == "x11" and machine() != "x86_64":
supported_arch = False
if platform in ["javascript", "server"]:
return False
return env["tools"] and supported_platform and supported_bits and supported_arch
return True
def configure(env):


@ -74,17 +74,18 @@ cpp_files = [
os.chdir("../../thirdparty")
if os.path.exists("embree"):
shutil.rmtree("embree")
dir_name = "embree"
if os.path.exists(dir_name):
shutil.rmtree(dir_name)
subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"])
subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"])
os.chdir("embree-tmp")
commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
dest_dir = "../embree"
all_files = set(cpp_files)
dest_dir = os.path.join("..", dir_name)
for include_dir in include_dirs:
headers = glob.iglob(os.path.join(include_dir, "*.h"))
all_files.update(headers)


@ -190,8 +190,11 @@ LightmapRaycasterEmbree::~LightmapRaycasterEmbree() {
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
if (embree_scene != nullptr)
if (embree_scene != nullptr) {
rtcReleaseScene(embree_scene);
if (embree_device != nullptr)
}
if (embree_device != nullptr) {
rtcReleaseDevice(embree_device);
}
}
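
For context, the braces added above wrap the usual Embree teardown sequence. A minimal sketch of that create/release pairing, assuming the standard Embree 3 C API (illustration only, not part of this patch):

```cpp
// Minimal sketch of the Embree 3 object lifetime the destructor above guards.
// Assumption: standard Embree 3 API; this code is illustrative only.
#include <embree3/rtcore.h>

int main() {
    RTCDevice device = rtcNewDevice(nullptr); // default device configuration
    RTCScene scene = rtcNewScene(device);
    // ... attach geometry, rtcCommitScene(scene), trace rays ...
    rtcReleaseScene(scene);   // scene released before the device that owns it
    rtcReleaseDevice(device);
    return 0;
}
```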

14
thirdparty/README.md vendored

@ -41,19 +41,19 @@ Files extracted from upstream source:
## embree
- Upstream: https://github.com/embree/embree
- Version: 3.12.1 (69bd4c272f1ed608494f233ecfff3feec516880b, 2020)
- Upstream: https://github.com/lighttransport/embree-aarch64
- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020)
- License: Apache 2.0
Files extracted from upstream:
- All cpp files listed in `modules/raytrace/godot_update_embree.py`
- All header files in the directories listed in `modules/raytrace/godot_update_embree.py`
- All cpp files listed in `modules/raycast/godot_update_embree.py`
- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
The `modules/raytrace/godot_update_embree.py` script can be used to pull the
relevant files from the latest Embree release and apply some automatic changes.
The `modules/raycast/godot_update_embree.py` script can be used to pull the
relevant files from the latest Embree-aarch64 release and apply some automatic changes.
Some minor changes have been made in order to fix build errors.
Some changes have been made in order to remove exceptions and fix minor build errors.
They are marked with `// -- GODOT start --` and `// -- GODOT end --`
comments. Apply the patches in the `patches/` folder when syncing on newer upstream
commits.


@ -8,6 +8,12 @@
#include "../math/math.h"
#include "../math/range.h"
#if defined(TASKING_GCD) && defined(BUILD_IOS)
#include <dispatch/dispatch.h>
#include <algorithm>
#include <type_traits>
#endif
namespace embree
{
/* parallel_for without range */
@ -21,8 +27,29 @@ namespace embree
func(r.begin());
});
if (!TaskScheduler::wait())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
}
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
const size_t length = N;
const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
const size_t numBlocks = (length + blockSize-1) / blockSize;
dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
const size_t start = (currentBlock * blockSize);
const size_t blockLength = std::min(length - start, blockSize);
const size_t end = start + blockLength;
for(size_t i=start; i < end; i++)
{
func(i);
}
});
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
@ -31,13 +58,19 @@ namespace embree
func(i);
},context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
#elif defined(TASKING_PPL)
@ -57,7 +90,29 @@ namespace embree
#if defined(TASKING_INTERNAL)
TaskScheduler::spawn(first,last,minStepSize,func);
if (!TaskScheduler::wait())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
const size_t length = last - first;
const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
blockSize += blockSize % 4;
const size_t numBlocks = (length + blockSize-1) / blockSize;
dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
const size_t start = first + (currentBlock * blockSize);
const size_t end = std::min<size_t>(last, start + blockSize);
func( embree::range<Index>(start,end) );
});
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
@ -66,13 +121,19 @@ namespace embree
func(range<Index>(r.begin(),r.end()));
},context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
});
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
#elif defined(TASKING_PPL)
@ -104,13 +165,19 @@ namespace embree
func(i);
},tbb::simple_partitioner(),context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner());
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
}
@ -125,13 +192,19 @@ namespace embree
func(i);
},ap,context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap);
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
}
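
The new TASKING_GCD branches above split the iteration range into blocks before handing them to dispatch_apply. A simplified, standalone sketch of that block arithmetic (plain C++, no GCD; the ranged variant in the real code additionally uses 4× the thread count as its baseline block count and adjusts blockSize):

```cpp
// Sketch of the block partitioning used by the TASKING_GCD branches above.
// Assumption: plain C++ showing only the index math, not the actual dispatch.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t first = 0, last = 10, minStepSize = 1, threadCount = 4;
    const size_t length = last - first;
    const size_t blockSize = std::max(minStepSize, (length + threadCount - 1) / threadCount);
    const size_t numBlocks = (length + blockSize - 1) / blockSize;
    // With length=10 and 4 threads: blockSize=3, numBlocks=4 -> [0,3) [3,6) [6,9) [9,10)
    for (size_t block = 0; block < numBlocks; block++) {
        const size_t start = first + block * blockSize;
        const size_t end = std::min(last, start + blockSize);
        printf("block %zu: [%zu, %zu)\n", block, start, end);
    }
    return 0;
}
```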


@ -43,7 +43,7 @@ namespace embree
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
#if defined(TASKING_INTERNAL)
#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
/* fast path for small number of iterations */
Index taskCount = (last-first+minStepSize-1)/minStepSize;
@ -58,15 +58,19 @@ namespace embree
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// if (context.is_group_execution_cancelled())
// throw std::runtime_error("task cancelled");
// -- GODOT end --
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
// -- GODOT start --
// if (tbb::task::self().is_cancelled())
// throw std::runtime_error("task cancelled");
// -- GODOT end --
return v;
#endif
#else // TASKING_PPL


@ -5,6 +5,9 @@
#include "../simd/simd.h"
#include "parallel_for.h"
#if defined(TASKING_GCD) && defined(BUILD_IOS)
#include "../sys/alloc.h"
#endif
#include <algorithm>
namespace embree
@ -320,7 +323,7 @@ namespace embree
#pragma nounroll
#endif
for (size_t i=startID; i<endID; i++) {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const Key index = ((Key)src[i] >> shift) & mask;
@ -382,7 +385,7 @@ namespace embree
#endif
for (size_t i=startID; i<endID; i++) {
const Ty elt = src[i];
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const size_t index = ((Key)src[i] >> shift) & mask;


@ -39,7 +39,10 @@ namespace embree
std::vector<char> str; str.reserve(64);
while (cin->peek() != EOF && !isSeparator(cin->peek())) {
int c = cin->get();
if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
// -- GODOT start --
// if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
if (!isValidChar(c)) abort();
// -- GODOT end --
str.push_back((char)c);
}
str.push_back(0);

986
thirdparty/embree/common/math/AVX2NEON.h vendored Normal file

@ -0,0 +1,986 @@
#pragma once
#include "SSE2NEON.h"
#define AVX2NEON_ABI static inline __attribute__((always_inline))
struct __m256d;
struct __m256 {
__m128 lo,hi;
__m256() {}
};
struct __m256i {
__m128i lo,hi;
explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
__m256i() {}
};
struct __m256d {
float64x2_t lo,hi;
__m256d() {}
__m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
__m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
};
#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
#define _mm_stream_load_si128 _mm_load_si128
#define _mm256_stream_load_si256 _mm256_load_si256
AVX2NEON_ABI
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
{
__m128 res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
{
__m128i res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b))));
}
AVX2NEON_ABI
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
{
int64x2_t y;
y[0] = *(int64_t *)mem_addr;
y[1] = 0;
return __m128i(y);
}
AVX2NEON_ABI
int _mm_movemask_popcnt(__m128 a)
{
return __builtin_popcount(_mm_movemask_ps(a));
}
AVX2NEON_ABI
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
{
__m128 res;
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
}
return res;
}
AVX2NEON_ABI
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vnegq_f32(vfmaq_f32(c,a,b));
}
#define _mm_fnmsub_ss _mm_fnmsub_ps
AVX2NEON_ABI
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
{
return vfmsq_f32(c,a,b);
}
#define _mm_fnmadd_ss _mm_fnmadd_ps
AVX2NEON_ABI
__m128 _mm_broadcast_ss (float const * mem_addr)
{
return vdupq_n_f32(*mem_addr);
}
AVX2NEON_ABI
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vfmaq_f32(vnegq_f32(c),a,b);
}
#define _mm_fmsub_ss _mm_fmsub_ps
#define _mm_fmadd_ps _mm_madd_ps
#define _mm_fmadd_ss _mm_madd_ps
template<int code>
AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
{
float v;
v = 0;
v += (code & 0x10) ? a[0]*b[0] : 0;
v += (code & 0x20) ? a[1]*b[1] : 0;
v += (code & 0x40) ? a[2]*b[2] : 0;
v += (code & 0x80) ? a[3]*b[3] : 0;
float32x4_t res;
res[0] = (code & 0x1) ? v : 0;
res[1] = (code & 0x2) ? v : 0;
res[2] = (code & 0x4) ? v : 0;
res[3] = (code & 0x8) ? v : 0;
return res;
}
template<>
inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
m[3] = 0;
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
template<>
inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
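
In the generic template above, bits 4-7 of the immediate select which input lanes are multiplied and summed, and bits 0-3 select which output lanes receive the sum (the rest are zeroed); the 0x7f and 0xff cases are specialized with a horizontal add. A short sketch of what the 0x7f case computes (illustration only, not part of the header):

```cpp
// Illustrative only: the 3-component dot product the dpps_neon<0x7f>
// specialization above computes, written as plain NEON intrinsics.
#include <arm_neon.h>
#include <cstdio>

int main() {
    float32x4_t a = {1.0f, 2.0f, 3.0f, 4.0f};
    float32x4_t b = {5.0f, 6.0f, 7.0f, 8.0f};
    float32x4_t m = vmulq_f32(a, b);
    m = vsetq_lane_f32(0.0f, m, 3);       // exclude the w component
    float v = vaddvq_f32(m);              // 1*5 + 2*6 + 3*7 = 38
    float32x4_t res = vdupq_n_f32(v);     // broadcast, like _mm_set1_ps(v)
    printf("%f\n", vgetq_lane_f32(res, 0));
    return 0;
}
```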
AVX2NEON_ABI
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b))));
}
AVX2NEON_ABI
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
{
__m128 x;
for (int i=0;i<4;i++)
{
x[i] = a[b[i&3]];
}
return x;
}
AVX2NEON_ABI
__m256i _mm256_setzero_si256()
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_setzero_ps()
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(0.0f);
return res;
}
AVX2NEON_ABI
__m256i _mm256_undefined_si256()
{
return _mm256_setzero_si256();
}
AVX2NEON_ABI
__m256 _mm256_undefined_ps()
{
return _mm256_setzero_ps();
}
CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t)
CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
AVX2NEON_ABI
__m128 _mm256_castps256_ps128 (__m256 a)
{
return a.lo;
}
AVX2NEON_ABI
__m256i _mm256_castsi128_si256 (__m128i a)
{
__m256i res;
res.lo = a ;
res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m128i _mm256_castsi256_si128 (__m256i a)
{
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_castps128_ps256 (__m128 a)
{
__m256 res;
res.lo = a;
res.hi = vdupq_n_f32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ss (float const * mem_addr)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(*mem_addr);
return res;
}
AVX2NEON_ABI
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
__m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7};
__m256i res;
res.lo = lo; res.hi = hi;
return res;
}
AVX2NEON_ABI
__m256i _mm256_set1_epi32 (int a)
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(a);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_ps(const __m256& v)
{
return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
}
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_permute_ps (const __m256& a)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
return res;
}
#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
AVX2NEON_ABI
__m256i _mm256_set1_epi64x (long long a)
{
__m256i res;
int64x2_t t = vdupq_n_s64(a);
res.lo = res.hi = __m128i(t);
return res;
}
AVX2NEON_ABI
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
{
__m256 res;
__m128 tmp;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.lo = tmp;
imm8 >>= 4;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.hi = tmp;
return res;
}
AVX2NEON_ABI
__m256 _mm256_moveldup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[0];
res.lo[2] = res.lo[3] = a.lo[2];
res.hi[0] = res.hi[1] = a.hi[0];
res.hi[2] = res.hi[3] = a.hi[2];
return res;
}
AVX2NEON_ABI
__m256 _mm256_movehdup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[1];
res.lo[2] = res.lo[3] = a.lo[3];
res.hi[0] = res.hi[1] = a.hi[1];
res.hi[2] = res.hi[3] = a.hi[3];
return res;
}
AVX2NEON_ABI
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
{
__m256 res = a;
if (imm8 & 1) res.hi = b;
else res.lo = b;
return res;
}
AVX2NEON_ABI
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256d _mm256_movedup_pd (__m256d a)
{
__m256d res;
res.hi = a.hi;
res.lo[0] = res.lo[1] = a.lo[0];
return res;
}
AVX2NEON_ABI
__m256i _mm256_abs_epi32(__m256i a)
{
__m256i res;
res.lo = vabsq_s32(a.lo);
res.hi = vabsq_s32(a.hi);
return res;
}
UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
AVX2NEON_ABI
__m256i _mm256_cvtps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvtps_epi32(a.lo);
res.hi = _mm_cvtps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvttps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvttps_epi32(a.lo);
res.hi = _mm_cvttps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256 _mm256_loadu_ps (float const * mem_addr)
{
__m256 res;
res.lo = *(__m128 *)(mem_addr + 0);
res.hi = *(__m128 *)(mem_addr + 4);
return res;
}
#define _mm256_load_ps _mm256_loadu_ps
AVX2NEON_ABI
int _mm256_testz_ps (const __m256& a, const __m256& b)
{
__m256 t = a;
if (&a != &b)
t = _mm256_and_ps(a,b);
__m128i l = vshrq_n_s32(__m128i(t.lo),31);
__m128i h = vshrq_n_s32(__m128i(t.hi),31);
return vaddvq_s32(vaddq_s32(l,h)) == 0;
}
AVX2NEON_ABI
__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
{
__m256i res;
int64x2_t t0 = {e0,e1};
int64x2_t t1 = {e2,e3};
res.lo = __m128i(t0);
res.hi = __m128i(t1);
return res;
}
AVX2NEON_ABI
__m256d _mm256_setzero_pd ()
{
__m256d res;
res.lo = res.hi = vdupq_n_f64(0);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_pd (__m256d a)
{
int res = 0;
uint64x2_t x;
x = uint64x2_t(a.lo);
res |= (x[0] >> 63) ? 1 : 0;
res |= (x[0] >> 63) ? 2 : 0;
x = uint64x2_t(a.hi);
res |= (x[0] >> 63) ? 4 : 0;
res |= (x[0] >> 63) ? 8 : 0;
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
{
__m256i res;
res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
{
__m256i res;
res.lo = __m128i(vceqq_f64(a.lo,b.lo));
res.hi = __m128i(vceqq_f64(a.hi,b.hi));
return res;
}
AVX2NEON_ABI
int _mm256_testz_pd (const __m256d& a, const __m256d& b)
{
__m256d t = a;
if (&a != &b)
t = _mm256_and_pd(a,b);
return _mm256_movemask_pd(t) == 0;
}
AVX2NEON_ABI
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
{
__m256d res;
uint64x2_t t = uint64x2_t(mask.lo);
res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0];
res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1];
t = uint64x2_t(mask.hi);
res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0];
res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1];
return res;
}
template<int imm8>
__m256 __mm256_dp_ps (__m256 a, __m256 b)
{
__m256 res;
res.lo = _mm_dp_ps(a.lo,b.lo,imm8);
res.hi = _mm_dp_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
AVX2NEON_ABI
double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
{
switch (imm8 & 3) {
case 0:
return a.lo[0];
case 1:
return a.lo[1];
case 2:
return a.hi[0];
case 3:
return a.hi[1];
}
__builtin_unreachable();
return 0;
}
AVX2NEON_ABI
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
{
__m256d res;
res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);
return res;
}
AVX2NEON_ABI
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
{
return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
}
AVX2NEON_ABI
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
{
__m256i res;
res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
return res;
}
#define _mm256_load_si256 _mm256_loadu_si256
AVX2NEON_ABI
void _mm256_storeu_ps (float * mem_addr, __m256 a)
{
*(__m128 *)(mem_addr + 0) = a.lo;
*(__m128 *)(mem_addr + 4) = a.hi;
}
#define _mm256_store_ps _mm256_storeu_ps
#define _mm256_stream_ps _mm256_storeu_ps
AVX2NEON_ABI
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
{
*(__m128i *)((int *)mem_addr + 0) = a.lo;
*(__m128i *)((int *)mem_addr + 4) = a.hi;
}
#define _mm256_store_si256 _mm256_storeu_si256
AVX2NEON_ABI
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
{
__m256 res;
res.lo = _mm_maskload_ps(mem_addr,mask.lo);
res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu8_epi32 (__m128i a)
{
__m256i res;
uint8x16_t x = uint8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi8_epi32 (__m128i a)
{
__m256i res;
int8x16_t x = int8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu16_epi32 (__m128i a)
{
__m256i res;
uint16x8_t x = uint16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi16_epi32 (__m128i a)
{
__m256i res;
int16x8_t x = int16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
{
_mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
_mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
}
AVX2NEON_ABI
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_slli_epi32(a.lo,imm8);
res.hi = _mm_slli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srli_epi32(a.lo,imm8);
res.hi = _mm_srli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srai_epi32(a.lo,imm8);
res.hi = _mm_srai_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,count.lo);
res.hi = vshlq_s32(a.hi,count.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
return res;
}
AVX2NEON_ABI
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
{
return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
}
AVX2NEON_ABI
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_set1_ps(float x)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(x);
return res;
}
AVX2NEON_ABI
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
__m256 res;
res.lo = _mm_set_ps(e3,e2,e1,e0);
res.hi = _mm_set_ps(e7,e6,e5,e4);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
{
__m256 res;
res.lo = res.hi = *mem_addr;
return res;
}
AVX2NEON_ABI
__m256 _mm256_cvtepi32_ps (__m256i a)
{
__m256 res;
res.lo = _mm_cvtepi32_ps(a.lo);
res.hi = _mm_cvtepi32_ps(a.hi);
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
{
for (int i=0;i<4;i++) {
if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i];
if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i];
}
}
AVX2NEON_ABI
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
{
__m256d res;
res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
return res;
}
AVX2NEON_ABI
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
{
__m256 res;
res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
{
__m256i res;
res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
{
__m256i res;
for (int i=0;i<4;i++)
{
res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
{
__m256i res = _mm256_setzero_si256();
for (int i=0;i<4;i++)
{
if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
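
The header above emulates each 256-bit AVX type as a pair of 128-bit NEON halves (`lo`/`hi`), and the *_AVX_OP macros generate wrappers that apply the 128-bit operation to each half. A minimal sketch of that decomposition (illustration only, using plain NEON types):

```cpp
// Illustration of the lo/hi decomposition used by AVX2NEON.h, e.g. what
// BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) expands to in spirit.
#include <arm_neon.h>

struct m256_sketch { float32x4_t lo, hi; };  // stand-in for the emulated __m256

static inline m256_sketch add256(const m256_sketch& a, const m256_sketch& b) {
    m256_sketch r;
    r.lo = vaddq_f32(a.lo, b.lo);  // lower four floats
    r.hi = vaddq_f32(a.hi, b.hi);  // upper four floats
    return r;
}
```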

1753
thirdparty/embree/common/math/SSE2NEON.h vendored Normal file

File diff suppressed because it is too large


@ -77,7 +77,7 @@ namespace embree
return lower > upper;
}
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline bool BBox<Vec3fa>::empty() const {
return !all(le_mask(lower,upper));
}
@ -228,11 +228,11 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
#if defined __SSE__
#if defined (__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined __AVX__
#if defined (__AVX__)
#include "../simd/avx.h"
#endif


@ -42,6 +42,6 @@ namespace embree
}
/*! default template instantiations */
typedef Col3<unsigned char> Col3uc;
typedef Col3<uint8_t > Col3uc;
typedef Col3<float > Col3f;
}


@ -42,6 +42,6 @@ namespace embree
}
/*! default template instantiations */
typedef Col4<unsigned char> Col4uc;
typedef Col4<uint8_t > Col4uc;
typedef Col4<float > Col4f;
}


@ -52,17 +52,17 @@ namespace embree
__forceinline void set(Col3uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (unsigned char)(s[0]);
d.g = (unsigned char)(s[1]);
d.b = (unsigned char)(s[2]);
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
}
__forceinline void set(Col4uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (unsigned char)(s[0]);
d.g = (unsigned char)(s[1]);
d.b = (unsigned char)(s[2]);
d.a = (unsigned char)(s[3]);
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
d.a = (uint8_t)(s[3]);
}
////////////////////////////////////////////////////////////////////////////////
@ -114,16 +114,16 @@ namespace embree
__forceinline void set(Col3uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (unsigned char)(s[0]);
d.g = (unsigned char)(s[1]);
d.b = (unsigned char)(s[2]);
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
}
__forceinline void set(Col4uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (unsigned char)(s[0]);
d.g = (unsigned char)(s[1]);
d.b = (unsigned char)(s[2]);
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
d.a = 255;
}
@ -152,21 +152,37 @@ namespace embree
}
__forceinline const Color rcp ( const Color& a )
{
#if defined(__aarch64__) && defined(BUILD_IOS)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
return (const Color)reciprocal;
#else
#if defined(__AVX512VL__)
const Color r = _mm_rcp14_ps(a.m128);
#else
const Color r = _mm_rcp_ps(a.m128);
#endif
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
#endif //defined(__aarch64__) && defined(BUILD_IOS)
}
__forceinline const Color rsqrt( const Color& a )
{
#if defined(__aarch64__) && defined(BUILD_IOS)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
return r;
#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif //defined(__aarch64__) && defined(BUILD_IOS)
}
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
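
The aarch64 branches above refine the hardware reciprocal/rsqrt estimates with two Newton-Raphson steps: `vrecpsq_f32(a, r)` returns `2 - a*r`, so `r * vrecpsq_f32(a, r)` is one refinement step that roughly doubles the number of correct bits (`vrsqrtsq_f32` plays the same role for rsqrt). A standalone sketch of the reciprocal case (illustration only, not part of the patch):

```cpp
// Newton-Raphson refinement of the NEON reciprocal estimate, as used in the
// aarch64 rcp() branches above. Illustration only.
#include <arm_neon.h>

static inline float32x4_t rcp_refined(float32x4_t a) {
    float32x4_t r = vrecpeq_f32(a);        // ~8-bit initial estimate of 1/a
    r = vmulq_f32(vrecpsq_f32(a, r), r);   // r *= (2 - a*r): ~16 bits
    r = vmulq_f32(vrecpsq_f32(a, r), r);   // second step: ~full single precision
    return r;
}
```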


@ -1,6 +1,10 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#if defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "constants.h"
namespace embree
@ -24,4 +28,34 @@ namespace embree
ReverseStepTy reverse_step;
EmptyTy empty;
UndefinedTy undefined;
#if defined(__aarch64__)
const uint32x4_t movemask_mask = { 1, 2, 4, 8 };
const uint32x4_t vzero = { 0, 0, 0, 0 };
const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15};
const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
#endif
}
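
The `movemask_mask = {1, 2, 4, 8}` constant above is the kind of lookup used to pack per-lane sign/comparison bits into an integer mask on NEON (the job `_mm_movemask_ps` does on SSE). A hedged sketch of that pattern; the exact use in the vendored code may differ:

```cpp
// Sketch: packing the sign bit of each lane into bits 0-3 with a {1,2,4,8}
// mask and a horizontal add. Illustration only; assumes AArch64 NEON.
#include <arm_neon.h>

static inline int movemask_ps_sketch(float32x4_t a) {
    const uint32x4_t lane_bits = {1u, 2u, 4u, 8u};
    uint32x4_t sign = vshrq_n_u32(vreinterpretq_u32_f32(a), 31); // 0 or 1 per lane
    return (int)vaddvq_u32(vmulq_u32(sign, lane_bits));          // bit i set if lane i negative
}
```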


@ -12,6 +12,19 @@
#include <cfloat>
#include <climits>
// Math constants may not be defined in libcxx + mingw + strict C++ standard
#if defined(__MINGW32__)
// TODO(LTE): use constexpr
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_1_PI
#define M_1_PI 0.31830988618379067154
#endif
#endif // __MINGW32__
namespace embree
{
static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
@ -44,8 +57,8 @@ namespace embree
__forceinline operator unsigned int ( ) const { return 0; }
__forceinline operator short ( ) const { return 0; }
__forceinline operator unsigned short ( ) const { return 0; }
__forceinline operator char ( ) const { return 0; }
__forceinline operator unsigned char ( ) const { return 0; }
__forceinline operator int8_t ( ) const { return 0; }
__forceinline operator uint8_t ( ) const { return 0; }
};
extern MAYBE_UNUSED ZeroTy zero;
@ -62,8 +75,8 @@ namespace embree
__forceinline operator unsigned int ( ) const { return 1; }
__forceinline operator short ( ) const { return 1; }
__forceinline operator unsigned short ( ) const { return 1; }
__forceinline operator char ( ) const { return 1; }
__forceinline operator unsigned char ( ) const { return 1; }
__forceinline operator int8_t ( ) const { return 1; }
__forceinline operator uint8_t ( ) const { return 1; }
};
extern MAYBE_UNUSED OneTy one;
@ -80,8 +93,8 @@ namespace embree
__forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); }
__forceinline operator short ( ) const { return std::numeric_limits<short>::min(); }
__forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); }
__forceinline operator char ( ) const { return std::numeric_limits<char>::min(); }
__forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::min(); }
__forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::min(); }
__forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::min(); }
};
@ -99,8 +112,8 @@ namespace embree
__forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); }
__forceinline operator short ( ) const { return std::numeric_limits<short>::max(); }
__forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); }
__forceinline operator char ( ) const { return std::numeric_limits<char>::max(); }
__forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); }
__forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::max(); }
__forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::max(); }
};
extern MAYBE_UNUSED PosInfTy inf;
@ -194,4 +207,33 @@ namespace embree
};
extern MAYBE_UNUSED UndefinedTy undefined;
#if defined(__aarch64__)
extern const uint32x4_t movemask_mask;
extern const uint32x4_t vzero;
extern const uint32x4_t v0x80000000;
extern const uint32x4_t v0x7fffffff;
extern const uint32x4_t v000F;
extern const uint32x4_t v00F0;
extern const uint32x4_t v00FF;
extern const uint32x4_t v0F00;
extern const uint32x4_t v0F0F;
extern const uint32x4_t v0FF0;
extern const uint32x4_t v0FFF;
extern const uint32x4_t vF000;
extern const uint32x4_t vF00F;
extern const uint32x4_t vF0F0;
extern const uint32x4_t vF0FF;
extern const uint32x4_t vFF00;
extern const uint32x4_t vFF0F;
extern const uint32x4_t vFFF0;
extern const uint32x4_t vFFFF;
extern const uint8x16_t v0022;
extern const uint8x16_t v1133;
extern const uint8x16_t v0101;
extern const float32x4_t vOne;
extern const float32x4_t vmOne;
extern const float32x4_t vInf;
extern const float32x4_t vmInf;
#endif
}


@ -8,12 +8,19 @@
#include "constants.h"
#include <cmath>
#if defined(__ARM_NEON)
#include "SSE2NEON.h"
#if defined(NEON_AVX2_EMULATION)
#include "AVX2NEON.h"
#endif
#else
#include <emmintrin.h>
#include <xmmintrin.h>
#include <immintrin.h>
#endif
#if defined(__WIN32__)
#if defined(_MSC_VER) && (_MSC_VER <= 1700)
#if defined(__WIN32__) && !defined(__MINGW32__)
#if (__MSV_VER <= 1700)
namespace std
{
__forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
@ -40,7 +47,7 @@ namespace embree
__forceinline int toInt (const float& a) { return int(a); }
__forceinline float toFloat(const int& a) { return float(a); }
#if defined(__WIN32__)
#if defined(__WIN32__) && !defined(__MINGW32__)
__forceinline bool finite ( const float x ) { return _finite(x) != 0; }
#endif
@ -49,6 +56,16 @@ namespace embree
__forceinline float rcp ( const float x )
{
#if defined(__aarch64__)
// Move scalar to vector register and do rcp.
__m128 a;
a[0] = x;
float32x4_t reciprocal = vrecpeq_f32(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
return reciprocal[0];
#else
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
@ -62,19 +79,61 @@ namespace embree
#else
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif
#endif //defined(__aarch64__)
}
__forceinline float signmsk ( const float x ) {
#if defined(__aarch64__)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128i b;
a[0] = x;
b[0] = 0x80000000;
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
return a[0];
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#endif
}
__forceinline float xorf( const float x, const float y ) {
#if defined(__aarch64__)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128 b;
a[0] = x;
b[0] = y;
a = _mm_xor_ps(a, b);
return a[0];
#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
#endif
}
__forceinline float andf( const float x, const unsigned y ) {
#if defined(__aarch64__)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128i b;
a[0] = x;
b[0] = y;
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
return a[0];
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
#endif
}
__forceinline float rsqrt( const float x )
{
#if defined(__aarch64__)
// FP and Neon shares same vector register in arm64
__m128 a;
a[0] = x;
__m128 value = _mm_rsqrt_ps(a);
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
return value[0];
#else
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
@ -84,9 +143,10 @@ namespace embree
const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
_mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
return _mm_cvtss_f32(c);
#endif
}
#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__)
__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
__forceinline int roundf(float f) { return (int)(f + 0.5f); }
@ -140,7 +200,17 @@ namespace embree
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }
#if defined(__SSE4_1__)
#if defined(__aarch64__)
__forceinline float mini(float a, float b) {
// FP and Neon shares same vector register in arm64
__m128 x;
__m128 y;
x[0] = a;
y[0] = b;
x = _mm_min_ps(x, y);
return x[0];
}
#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@ -149,7 +219,17 @@ namespace embree
}
#endif
#if defined(__SSE4_1__)
#if defined(__aarch64__)
__forceinline float maxi(float a, float b) {
// FP and Neon shares same vector register in arm64
__m128 x;
__m128 y;
x[0] = a;
y[0] = b;
x = _mm_max_ps(x, y);
return x[0];
}
#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@ -166,7 +246,7 @@ namespace embree
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
@ -183,7 +263,7 @@ namespace embree
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
@ -225,6 +305,16 @@ namespace embree
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
#elif defined (__aarch64__) && defined(__clang__)
#pragma clang fp contract(fast)
__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
#pragma clang fp contract(on)
#else
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
@ -273,6 +363,15 @@ namespace embree
/*! exchange */
template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) {
#if 1//!defined(__aarch64__)
return msub(a,b,c*d);
#else
return nmadd(c,d,a*b);
#endif
}
/*! bit reverse operation */
template<class T>
__forceinline T bitReverse(const T& vin)
@ -290,7 +389,7 @@ namespace embree
template<class T>
__forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
{
T x = xin, y = yin, z = zin;
T x = xin, y = yin, z = zin;
x = (x | (x << 16)) & 0x030000FF;
x = (x | (x << 8)) & 0x0300F00F;
x = (x | (x << 4)) & 0x030C30C3;
@ -309,7 +408,7 @@ namespace embree
return x | (y << 1) | (z << 2);
}
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)


@ -205,11 +205,11 @@ namespace embree
#include "vec2fa.h"
#if defined __SSE__
#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined __AVX__
#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@ -221,7 +221,7 @@ namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif


@ -97,6 +97,12 @@ namespace embree
__forceinline Vec2fa rcp ( const Vec2fa& a )
{
#if defined(__aarch64__)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
return (const Vec2fa)reciprocal;
#else
#if defined(__AVX512VL__)
const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
@ -111,6 +117,7 @@ namespace embree
#endif
return res;
#endif //defined(__aarch64__)
}
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
@ -118,12 +125,21 @@ namespace embree
__forceinline Vec2fa rsqrt( const Vec2fa& a )
{
#if defined(__aarch64__)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
return r;
#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
}
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
@ -156,7 +172,7 @@ namespace embree
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@ -165,7 +181,7 @@ namespace embree
}
#endif
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@ -275,7 +291,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if defined (__SSE4_1__)
#if defined(__aarch64__)
__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
#elif defined (__SSE4_1__)
//__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }


@ -206,8 +206,7 @@ namespace embree
template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); }
template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); }
template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
{
const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
@ -266,11 +265,11 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
#if defined __SSE__
#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined __AVX__
#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@ -291,14 +290,14 @@ namespace embree
template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
x = a.x; y = a.y; z = a.z;
}
#elif defined(__SSE__)
#elif defined(__SSE__) || defined(__ARM_NEON)
template<>
__forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
}
#endif
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
}


@ -55,7 +55,13 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
static __forceinline Vec3fa load( const void* const a ) {
#if defined(__aarch64__)
__m128 t = _mm_load_ps((float*)a);
t[3] = 0.0f;
return Vec3fa(t);
#else
return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
#endif
}
static __forceinline Vec3fa loadu( const void* const a ) {
@ -89,19 +95,42 @@ namespace embree
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
__forceinline Vec3fa operator -( const Vec3fa& a ) {
#if defined(__aarch64__)
return vnegq_f32(a.m128);
#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
#endif
}
__forceinline Vec3fa abs ( const Vec3fa& a ) {
#if defined(__aarch64__)
return _mm_abs_ps(a.m128);
#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
#endif
}
__forceinline Vec3fa sign ( const Vec3fa& a ) {
#if defined(__aarch64__)
Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f)));
return r;
#else
return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
#endif
}
__forceinline Vec3fa rcp ( const Vec3fa& a )
{
#if defined(__aarch64__) && defined(BUILD_IOS)
return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
#elif defined(__aarch64__)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
return (const Vec3fa)reciprocal;
#else
#if defined(__AVX512VL__)
const Vec3fa r = _mm_rcp14_ps(a.m128);
#else
@ -116,6 +145,7 @@ namespace embree
#endif
return res;
#endif //defined(__aarch64__)
}
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
@ -123,12 +153,20 @@ namespace embree
__forceinline Vec3fa rsqrt( const Vec3fa& a )
{
#if defined(__aarch64__)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
return r;
#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
}
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
@ -161,7 +199,7 @@ namespace embree
__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@ -170,7 +208,7 @@ namespace embree
}
#endif
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@ -192,11 +230,30 @@ namespace embree
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
#else
#if defined(__aarch64__)
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c;
}
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c;
}
__forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128);
return -t;
}
__forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c
}
#else
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
#endif
#endif
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
@ -218,7 +275,25 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(BUILD_IOS)
__forceinline float reduce_add(const Vec3fa& v) {
float32x4_t t = v.m128;
t[3] = 0.0f;
return vaddvq_f32(t);
}
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fa& v) {
float32x4_t t = v.m128;
t[3] = t[2];
return vminvq_f32(t);
}
__forceinline float reduce_max(const Vec3fa& v) {
float32x4_t t = v.m128;
t[3] = t[2];
return vmaxvq_f32(t);
}
#else
__forceinline float reduce_add(const Vec3fa& v) {
const vfloat4 a(v.m128);
const vfloat4 b = shuffle<1>(a);
@ -229,6 +304,7 @@ namespace embree
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
#endif
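The aarch64 reductions above reuse 4-wide horizontal instructions (vaddvq/vminvq/vmaxvq) on a 3-component value by first overwriting the unused lane with something that cannot affect the result: 0 for the sum, a copy of lane 2 for min/max. Scalar sketch of the padding trick (illustration only):
#include <algorithm>
inline float reduce_add3(const float v[4]) {
    const float t3 = 0.0f;                       // identity element of +
    return v[0] + v[1] + v[2] + t3;
}
inline float reduce_min3(const float v[4]) {
    const float t3 = v[2];                       // duplicating a live lane leaves min unchanged
    return std::min(std::min(v[0], v[1]), std::min(v[2], t3));
}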
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
@ -241,8 +317,13 @@ namespace embree
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
#if defined(__aarch64__)
__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
#else
__forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
__forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
#endif
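gt_mask and ge_mask switch from the negated x86 predicates (cmpnle/cmpnlt) to the ordered cmpgt/cmpge on aarch64. If my reading of the x86 semantics is right, the two forms only disagree when an operand is NaN: the negated predicates report true for unordered inputs, the ordered ones report false. A scalar model of that difference (assumption, not something the patch asserts):
#include <cmath>
inline bool gt_negated(float a, float b) { return !(a <= b); }  // _mm_cmpnle_ps-style: true if unordered
inline bool gt_ordered(float a, float b) { return a > b; }      // _mm_cmpgt_ps / vcgtq_f32-style: false if unordered
// gt_negated(NAN, 1.0f) == true, while gt_ordered(NAN, 1.0f) == false.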
__forceinline bool isvalid ( const Vec3fa& v ) {
return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
@ -280,7 +361,7 @@ namespace embree
vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
vfloat4 b1 = vfloat4(b.m128);
return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)));
}
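The cross product above shuffles both operands so that one a0*b0 - a1*b1 per lane produces the usual formula; prod_diff(a0,b0,a1,b1) is exactly that difference (presumably evaluated with FMA where available in this port for better cancellation behaviour). Scalar reference for what the shuffled lanes end up computing (sketch, not the patch's code):
struct V3 { float x, y, z; };
// cross(a,b) = (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x)
inline V3 cross_ref(const V3& a, const V3& b) {
    return { a.y * b.z - a.z * b.y,   // lane 0 after the final <1,2,0,3> shuffle
             a.z * b.x - a.x * b.z,
             a.x * b.y - a.y * b.x };
}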
__forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
@ -335,7 +416,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if defined (__SSE4_1__)
#if defined(__aarch64__)
__forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
__forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
__forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
#elif defined (__SSE4_1__)
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
@ -394,7 +479,9 @@ namespace embree
__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
__forceinline Vec3fx( const Vec3fa& other, const float w1) {
#if defined (__SSE4_1__)
#if defined (__aarch64__)
m128 = other.m128; m128[3] = w1;
#elif defined (__SSE4_1__)
m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
#else
const vint4 mask(-1,-1,-1,0);
@ -526,7 +613,7 @@ namespace embree
__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
#if defined(__SSE4_1__)
#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@ -535,7 +622,7 @@ namespace embree
}
#endif
#if defined(__SSE4_1__)
#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@ -700,7 +787,7 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if defined (__SSE4_1__)
#if defined (__SSE4_1__) && !defined(__aarch64__)
__forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }


@ -65,7 +65,9 @@ namespace embree
__forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
__forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
#if defined(__SSSE3__)
#if (defined(__aarch64__))
__forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
#elif defined(__SSSE3__)
__forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
#endif
@ -81,7 +83,7 @@ namespace embree
__forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); }
__forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; }
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
__forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); }
__forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; }
@ -99,12 +101,14 @@ namespace embree
__forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); }
__forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
#if !defined(__ARM_NEON)
__forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); }
__forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); }
__forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); }
__forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); }
__forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); }
#endif
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
@ -116,7 +120,7 @@ namespace embree
__forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
__forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; }
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
__forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; }
#endif
@ -127,18 +131,38 @@ namespace embree
__forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
__forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; }
#if !defined(__ARM_NEON)
__forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
__forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
#endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__)
__forceinline int reduce_add(const Vec3ia& v) {
int32x4_t t = v.m128;
t[3] = 0;
return vaddvq_s32(t);
}
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
__forceinline int reduce_min(const Vec3ia& v) {
int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0);
return vminvq_s32(t);
}
__forceinline int reduce_max(const Vec3ia& v) {
int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0);
return vmaxvq_s32(t);
}
#else
__forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
__forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
__forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
#endif
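For Vec3ia the same padding idea uses a masked blend: v0x7fffffff and v0x80000000 appear to be vectors of INT_MAX and INT_MIN (the identity elements of min and max), and vFFF0 a mask keeping lanes 0-2, so the padding lane never wins the reduction (my reading of those helper constants, which are defined elsewhere in the port). Scalar sketch:
#include <algorithm>
#include <climits>
inline int reduce_min3i(const int v[4]) {
    const int t3 = INT_MAX;                      // identity of min in the padding lane
    return std::min(std::min(v[0], v[1]), std::min(v[2], t3));
}
inline int reduce_max3i(const int v[4]) {
    const int t3 = INT_MIN;                      // identity of max in the padding lane
    return std::max(std::max(v[0], v[1]), std::max(v[2], t3));
}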
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
@ -161,14 +185,14 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
#else
return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
#endif
}
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
#else


@ -192,7 +192,7 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
typedef Vec4<bool > Vec4b;
typedef Vec4<unsigned char> Vec4uc;
typedef Vec4<uint8_t > Vec4uc;
typedef Vec4<int > Vec4i;
typedef Vec4<float > Vec4f;
}
@ -205,7 +205,7 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
#if defined __SSE__
#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
@ -225,13 +225,13 @@ namespace embree
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
x = a.x; y = a.y; z = a.z; w = a.w;
}
#elif defined(__SSE__)
#elif defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
}
#endif
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) {
return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
}


@ -6,7 +6,7 @@
#include "../math/math.h"
/* include SSE wrapper classes */
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
# include "sse.h"
#endif


@ -11,7 +11,7 @@
namespace embree
{
#if defined(__SSE4_1__)
#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
__forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
return _mm_blendv_ps(f,t,mask);
}


@ -56,7 +56,11 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
#if !defined(__aarch64__)
__forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
#else
__forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
#endif
////////////////////////////////////////////////////////////////////////////////
/// Array Access
@ -101,9 +105,10 @@ namespace embree
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
#if !defined(__aarch64__)
__forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
__forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
#endif
#if defined(__AVX2__)
template<int i0, int i1, int i2, int i3>


@ -37,9 +37,13 @@ namespace embree
: v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
__forceinline vboolf(bool a, bool b, bool c, bool d)
: v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
#if defined(__aarch64__) && defined(BUILD_IOS)
__forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; }
__forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; }
#else
__forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; }
__forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }
#endif
/* return int32 mask */
__forceinline __m128i mask32() const {
return _mm_castps_si128(v);
@ -56,8 +60,13 @@ namespace embree
/// Array Access
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(BUILD_IOS)
__forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; }
__forceinline int& operator [](size_t index) { return i[index]; }
#else
__forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; }
__forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; }
#endif
};
////////////////////////////////////////////////////////////////////////////////
@ -92,7 +101,7 @@ namespace embree
__forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
__forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
#if defined(__SSE4_1__)
#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
return _mm_blendv_ps(f, t, m);
#else
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
@ -106,6 +115,17 @@ namespace embree
__forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
__forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vboolf4 shuffle(const vboolf4& v) {
return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3>
__forceinline vboolf4 shuffle(const vboolf4& v) {
return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
@ -115,6 +135,7 @@ namespace embree
__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
#endif
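NEON has no single arbitrary 32-bit lane shuffle driven by an immediate, so the aarch64 branch above uses a byte table lookup (vqtbl1q_u8/vqtbl2q_u8) with a precomputed index vector; _MN_SHUFFLE/_MF_SHUFFLE are presumably helpers from this port that expand the four lane indices into 16 byte indices. A sketch of that expansion (hypothetical helper, the name is mine):
#include <cstdint>
// Expand 32-bit lane indices i0..i3 into the 16 byte indices a tbl lookup needs.
inline void make_byte_shuffle(int i0, int i1, int i2, int i3, uint8_t out[16]) {
    const int lanes[4] = { i0, i1, i2, i3 };
    for (int l = 0; l < 4; ++l)
        for (int b = 0; b < 4; ++b)
            out[4 * l + b] = static_cast<uint8_t>(4 * lanes[l] + b); // little-endian lane bytes
}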
template<int i0>
__forceinline vboolf4 shuffle(const vboolf4& v) {
@ -127,7 +148,7 @@ namespace embree
template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
#endif
#if defined(__SSE4_1__)
#if defined(__SSE4_1__) && !defined(__aarch64__)
template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
@ -149,10 +170,14 @@ namespace embree
__forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
__forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
#if defined(__aarch64__) && defined(BUILD_IOS)
__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); }
#else
#if defined(__SSE4_2__)
__forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
#else
__forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
#endif
#endif
////////////////////////////////////////////////////////////////////////////////


@ -68,8 +68,11 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
#if !defined(__aarch64__)
__forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
#else
__forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {}
#endif
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////


@ -181,13 +181,20 @@ namespace embree
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
#else
#elif !defined(__aarch64__)
__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
#else
__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); }
__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); }
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); }
#endif
__forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); }


@ -34,6 +34,11 @@ namespace embree
__forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
__forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
#if defined(__aarch64__)
__forceinline explicit vfloat(const vuint4& x) {
v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
}
#else
__forceinline explicit vfloat(const vuint4& x) {
const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31
@ -41,7 +46,7 @@ namespace embree
const __m128 bf = _mm_castsi128_ps(b);
v = _mm_add_ps(af,bf);
}
#endif
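SSE2 has no unsigned-int-to-float conversion, so the non-NEON fallback above splits the value: the low 31 bits convert exactly through the signed path, and if the top bit was set a float with bit pattern 0x4F000000 (that is, 2^31) is added back. Scalar equivalent (illustration only):
#include <cstdint>
inline float u32_to_float_ref(uint32_t x) {
    const float lo = static_cast<float>(static_cast<int32_t>(x & 0x7FFFFFFFu)); // exact signed convert of the low 31 bits
    const float hi = (x & 0x80000000u) ? 2147483648.0f : 0.0f;                  // restore 2^31 if the top bit was set
    return lo + hi;
}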
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
@ -106,28 +111,40 @@ namespace embree
#endif
}
#if defined(__SSE4_1__)
static __forceinline vfloat4 load(const char* ptr) {
#if defined(__aarch64__)
static __forceinline vfloat4 load(const int8_t* ptr) {
return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
}
#elif defined(__SSE4_1__)
static __forceinline vfloat4 load(const int8_t* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
#else
static __forceinline vfloat4 load(const char* ptr) {
static __forceinline vfloat4 load(const int8_t* ptr) {
return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
#endif
#if defined(__SSE4_1__)
static __forceinline vfloat4 load(const unsigned char* ptr) {
#if defined(__aarch64__)
static __forceinline vfloat4 load(const uint8_t* ptr) {
return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
}
#elif defined(__SSE4_1__)
static __forceinline vfloat4 load(const uint8_t* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
#else
static __forceinline vfloat4 load(const unsigned char* ptr) {
static __forceinline vfloat4 load(const uint8_t* ptr) {
//return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
#endif
#if defined(__SSE4_1__)
#if defined(__aarch64__)
static __forceinline vfloat4 load(const short* ptr) {
return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
}
#elif defined(__SSE4_1__)
static __forceinline vfloat4 load(const short* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
@ -144,7 +161,11 @@ namespace embree
static __forceinline void store_nt(void* ptr, const vfloat4& v)
{
#if defined (__SSE4_1__)
#if defined(__aarch64__)
_mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v));
#else
_mm_stream_ps((float*)ptr,v);
#endif
#else
_mm_store_ps((float*)ptr,v);
#endif
@ -152,14 +173,14 @@ namespace embree
template<int scale = 4>
static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_ps(ptr, index, scale);
#else
return vfloat4(
*(float*)(((char*)ptr)+scale*index[0]),
*(float*)(((char*)ptr)+scale*index[1]),
*(float*)(((char*)ptr)+scale*index[2]),
*(float*)(((char*)ptr)+scale*index[3]));
*(float*)(((int8_t*)ptr)+scale*index[0]),
*(float*)(((int8_t*)ptr)+scale*index[1]),
*(float*)(((int8_t*)ptr)+scale*index[2]),
*(float*)(((int8_t*)ptr)+scale*index[3]));
#endif
}
@ -168,13 +189,13 @@ namespace embree
vfloat4 r = zero;
#if defined(__AVX512VL__)
return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
#elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
#else
if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
return r;
#endif
}
@ -185,10 +206,10 @@ namespace embree
#if defined(__AVX512VL__)
_mm_i32scatter_ps((float*)ptr, index, v, scale);
#else
*(float*)(((char*)ptr)+scale*index[0]) = v[0];
*(float*)(((char*)ptr)+scale*index[1]) = v[1];
*(float*)(((char*)ptr)+scale*index[2]) = v[2];
*(float*)(((char*)ptr)+scale*index[3]) = v[3];
*(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
*(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
*(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
*(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif
}
@ -198,14 +219,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
#else
if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3];
if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif
}
static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) {
static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) {
scatter<1>(mask,ptr,ofs,v);
}
static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) {
@ -222,7 +243,7 @@ namespace embree
friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
#if defined(__AVX512VL__)
return _mm_mask_blend_ps(m, f, t);
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) || (defined(__aarch64__))
return _mm_blendv_ps(f, t, m);
#else
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
@ -243,18 +264,47 @@ namespace embree
__forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); }
__forceinline vfloat4 operator +(const vfloat4& a) { return a; }
#if defined(__aarch64__)
__forceinline vfloat4 operator -(const vfloat4& a) {
return vnegq_f32(a);
}
#else
__forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif
#if defined(__aarch64__)
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
#else
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
#endif
#if defined(__AVX512VL__)
__forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
#else
__forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
#endif
#if defined(__aarch64__)
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); }
#else
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif
__forceinline vfloat4 rcp(const vfloat4& a)
{
#if defined(__aarch64__)
#if defined(BUILD_IOS)
return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
#else //BUILD_IOS
__m128 reciprocal = _mm_rcp_ps(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
// +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp.
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
return (const vfloat4)reciprocal;
#endif // BUILD_IOS
#else
#if defined(__AVX512VL__)
const vfloat4 r = _mm_rcp14_ps(a);
#else
@ -266,12 +316,22 @@ namespace embree
#else
return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
#endif
#endif //defined(__aarch64__)
}
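The non-iOS aarch64 branch above runs one more refinement step than the Vec3fa version because, as the in-code comment says, NEON's reciprocal estimate starts with fewer correct bits; each Newton-Raphson step roughly doubles the number of correct bits, so the extra step restores the margin. A rough model of that rule (an assumption about estimate precision, not something this patch measures):
// Rule of thumb: each refinement step roughly doubles the correct mantissa bits
// (ignoring the rounding error of the step itself).
inline int refined_bits(int estimate_bits, int steps) {
    return estimate_bits << steps;
}
// A coarser estimate therefore needs an extra step to land safely past
// float's 24-bit mantissa once the rounding inside each step is accounted for.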
__forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
__forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
__forceinline vfloat4 rsqrt(const vfloat4& a)
{
#if defined(__aarch64__)
vfloat4 r = _mm_rsqrt_ps(a);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
return r;
#else
#if defined(__AVX512VL__)
const vfloat4 r = _mm_rsqrt14_ps(a);
#else
@ -284,11 +344,17 @@ namespace embree
#else
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
#endif
}
__forceinline vboolf4 isnan(const vfloat4& a) {
#if defined(__aarch64__)
const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff));
#else
const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
#endif
#if defined(__AVX512VL__)
return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT);
#else
@ -329,7 +395,8 @@ namespace embree
__forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); }
__forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
#if defined(__SSE4_1__)
#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@ -377,10 +444,24 @@ namespace embree
__forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); }
__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
#else
#if defined(__aarch64__)
__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) {
return _mm_madd_ps(a, b, c); //a*b+c;
}
__forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
return _mm_msub_ps(a, b, c); //-a*b+c;
}
__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
return vnegq_f32(vfmaq_f32(c,a, b));
}
#else
__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
__forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
#endif
__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
#endif
////////////////////////////////////////////////////////////////////////////////
@ -414,8 +495,13 @@ namespace embree
__forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
__forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
__forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
#if defined(__aarch64__)
__forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
__forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
#else
__forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
__forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
#endif
__forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
#endif
@ -470,6 +556,57 @@ namespace embree
#endif
}
#if defined(__aarch64__)
template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero));
}
template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F));
}
template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0));
}
template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF));
}
template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00));
}
template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F));
}
template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0));
}
template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF));
}
template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000));
}
template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F));
}
template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0));
}
template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF));
}
template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00));
}
template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F));
}
template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0));
}
template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) {
return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF));
}
#endif
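The specializations above bake the 4-bit immediate of select<mask>(t, f) into precomputed per-lane blend masks (v000F through vFFFF, constants defined elsewhere in this port); bit i of the immediate picks t for lane i. Scalar reference (sketch only):
// Lane i takes t[i] when bit i of the compile-time mask is set, else f[i].
template <int Mask>
inline void select_ref(const float t[4], const float f[4], float out[4]) {
    for (int i = 0; i < 4; ++i)
        out[i] = ((Mask >> i) & 1) ? t[i] : f[i];
}
// select_ref<5>(t, f, out) keeps lanes 0 and 2 from t and lanes 1 and 3 from f,
// matching the v0F0F-style constant used by select<5> above.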
__forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
return madd(t,b-a,a);
}
@ -490,11 +627,16 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if defined (__SSE4_1__)
__forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
__forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
__forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
#if defined(__aarch64__)
__forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
__forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
__forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
__forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
#elif defined (__SSE4_1__)
__forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
__forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
__forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd
#else
__forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
__forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
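The aarch64 branch maps the four rounding helpers onto dedicated NEON rounding instructions; note that both round() variants round ties to even, which is not the schoolbook half-away-from-zero rounding. A quick scalar illustration (not code from the patch):
#include <cmath>
// Ties-to-even, as vrndnq_f32 / _MM_FROUND_TO_NEAREST_INT behave
// (assumes the default FE_TONEAREST rounding mode):
inline float round_ties_even(float x) { return std::nearbyint(x); }
// round_ties_even(2.5f) == 2.0f and round_ties_even(3.5f) == 4.0f,
// whereas std::round(2.5f) == 3.0f (half away from zero).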
@ -504,7 +646,9 @@ namespace embree
__forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
__forceinline vint4 floori(const vfloat4& a) {
#if defined(__SSE4_1__)
#if defined(__aarch64__)
return vcvtq_s32_f32(floor(a));
#elif defined(__SSE4_1__)
return vint4(floor(a));
#else
return vint4(a-vfloat4(0.5f));
@ -518,6 +662,16 @@ namespace embree
__forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
__forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vfloat4 shuffle(const vfloat4& v) {
return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3>
__forceinline vfloat4 shuffle(const vfloat4& v) {
return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
@ -527,6 +681,7 @@ namespace embree
__forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
#endif
#if defined (__SSSE3__)
__forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) {
@ -534,7 +689,11 @@ namespace embree
}
#endif
#if defined(__SSE3__)
#if defined(__aarch64__)
template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); }
template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); }
template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); }
#elif defined(__SSE3__)
template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
@ -545,14 +704,56 @@ namespace embree
return shuffle<i,i,i,i>(v);
}
#if defined (__SSE4_1__) && !defined(__GNUC__)
#if defined(__aarch64__)
template<int i> __forceinline float extract(const vfloat4& a);
template<> __forceinline float extract<0>(const vfloat4& b) {
return b[0];
}
template<> __forceinline float extract<1>(const vfloat4& b) {
return b[1];
}
template<> __forceinline float extract<2>(const vfloat4& b) {
return b[2];
}
template<> __forceinline float extract<3>(const vfloat4& b) {
return b[3];
}
#elif defined (__SSE4_1__) && !defined(__GNUC__)
template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
#else
template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
#endif
template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
#endif
#if defined (__SSE4_1__)
#if defined(__aarch64__)
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b);
template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b)
{
vfloat4 c = a;
c[0] = b;
return c;
}
template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b)
{
vfloat4 c = a;
c[1] = b;
return c;
}
template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b)
{
vfloat4 c = a;
c[2] = b;
return c;
}
template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b)
{
vfloat4 c = a;
c[3] = b;
return c;
}
#elif defined (__SSE4_1__)
template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
@ -561,8 +762,13 @@ namespace embree
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
#endif
#if defined(__aarch64__)
__forceinline float toScalar(const vfloat4& v) {
return v[0];
}
#else
__forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
#endif
__forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) {
return vfloat4::broadcast(&a[k]);
}
@ -658,14 +864,25 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__)
__forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
__forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
__forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
#else
__forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
__forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
__forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
#endif
#if defined(__aarch64__)
__forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
__forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
__forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
#else
__forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
__forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
__forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
#endif
__forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
{
@ -694,7 +911,7 @@ namespace embree
const vfloat4 b0 = shuffle<1,2,0,3>(b);
const vfloat4 a1 = shuffle<1,2,0,3>(a);
const vfloat4 b1 = b;
return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1));
}
////////////////////////////////////////////////////////////////////////////////


@ -33,7 +33,7 @@ namespace embree
__forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
__forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
__forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
__forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {}
__forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
__forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
__forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
@ -75,7 +75,7 @@ namespace embree
return _mm256_broadcast_ps((__m128*)ptr);
}
static __forceinline vfloat8 load(const char* ptr) {
static __forceinline vfloat8 load(const int8_t* ptr) {
#if defined(__AVX2__)
return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
#else
@ -83,7 +83,7 @@ namespace embree
#endif
}
static __forceinline vfloat8 load(const unsigned char* ptr) {
static __forceinline vfloat8 load(const uint8_t* ptr) {
#if defined(__AVX2__)
return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
#else
@ -119,6 +119,12 @@ namespace embree
static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
#elif defined(__aarch64__)
static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
#else
static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
@ -139,18 +145,18 @@ namespace embree
template<int scale = 4>
static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _mm256_i32gather_ps(ptr, index ,scale);
#else
return vfloat8(
*(float*)(((char*)ptr)+scale*index[0]),
*(float*)(((char*)ptr)+scale*index[1]),
*(float*)(((char*)ptr)+scale*index[2]),
*(float*)(((char*)ptr)+scale*index[3]),
*(float*)(((char*)ptr)+scale*index[4]),
*(float*)(((char*)ptr)+scale*index[5]),
*(float*)(((char*)ptr)+scale*index[6]),
*(float*)(((char*)ptr)+scale*index[7]));
*(float*)(((int8_t*)ptr)+scale*index[0]),
*(float*)(((int8_t*)ptr)+scale*index[1]),
*(float*)(((int8_t*)ptr)+scale*index[2]),
*(float*)(((int8_t*)ptr)+scale*index[3]),
*(float*)(((int8_t*)ptr)+scale*index[4]),
*(float*)(((int8_t*)ptr)+scale*index[5]),
*(float*)(((int8_t*)ptr)+scale*index[6]),
*(float*)(((int8_t*)ptr)+scale*index[7]));
#endif
}
@ -159,17 +165,17 @@ namespace embree
vfloat8 r = zero;
#if defined(__AVX512VL__)
return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
#elif defined(__AVX2__) && !defined(__aarch64__)
return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
#else
if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]);
return r;
#endif
}
@ -180,14 +186,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
#else
*(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
*(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
*(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
*(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
*(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
*(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
*(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
*(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif
}
@ -197,18 +203,18 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
#else
if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif
}
static __forceinline void store(const vboolf8& mask, char* ptr, const vint8& ofs, const vfloat8& v) {
static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) {
scatter<1>(mask,ptr,ofs,v);
}
static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) {
@ -235,27 +241,60 @@ namespace embree
__forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }
__forceinline vfloat8 operator +(const vfloat8& a) { return a; }
#if !defined(__aarch64__)
__forceinline vfloat8 operator -(const vfloat8& a) {
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
return _mm256_xor_ps(a, mask);
}
__forceinline vfloat8 abs(const vfloat8& a) {
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
return _mm256_and_ps(a, mask);
}
#else
__forceinline vfloat8 operator -(const vfloat8& a) {
__m256 res;
res.lo = vnegq_f32(a.v.lo);
res.hi = vnegq_f32(a.v.hi);
return res;
}
#endif
#if !defined(__aarch64__)
__forceinline vfloat8 abs(const vfloat8& a) {
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
return _mm256_and_ps(a, mask);
}
#else
__forceinline vfloat8 abs(const vfloat8& a) {
__m256 res;
res.lo = vabsq_f32(a.v.lo);
res.hi = vabsq_f32(a.v.hi);
return res;
}
#endif
#if !defined(__aarch64__)
__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
#else
__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
#endif
__forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
static __forceinline vfloat8 rcp(const vfloat8& a)
{
#if defined(BUILD_IOS) && defined(__aarch64__)
// ios devices are faster doing full divide, no need for NR fixup
vfloat8 ret;
const float32x4_t one = vdupq_n_f32(1.0f);
ret.v.lo = vdivq_f32(one, a.v.lo);
ret.v.hi = vdivq_f32(one, a.v.hi);
return ret;
#endif
#if defined(__AVX512VL__)
const vfloat8 r = _mm256_rcp14_ps(a);
#else
const vfloat8 r = _mm256_rcp_ps(a);
#endif
#if defined(__AVX2__)
#if defined(__AVX2__) //&& !defined(aarch64)
return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
#else
return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
@ -404,17 +443,29 @@ namespace embree
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
return _mm256_mask_blend_ps(m, f, t);
}
#else
static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
#elif !defined(__aarch64__)
__forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
__forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
__forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
__forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
__forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
__forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
__forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
return _mm256_blendv_ps(f, t, m);
}
#else
__forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); }
__forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
__forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); }
__forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); }
__forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); }
__forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); }
__forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
return _mm256_blendv_ps(f, t, m);
}
#endif
template<int mask>
@ -483,10 +534,17 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if !defined(__aarch64__)
__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
__forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
__forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
#else
__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
__forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
#endif
__forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
////////////////////////////////////////////////////////////////////////////////
@ -521,9 +579,11 @@ namespace embree
return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
#if !defined(__aarch64__)
template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
#endif
__forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
@ -534,8 +594,8 @@ namespace embree
__forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); }
#if defined (__AVX2__)
static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
#if defined (__AVX2__) && !defined(__aarch64__)
__forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
return _mm256_permutevar8x32_ps(a, index);
}
#endif
@ -639,7 +699,7 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
#if !defined(__aarch64__)
__forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
__forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
__forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
@ -655,7 +715,14 @@ namespace embree
__forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
__forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
__forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
#else
__forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); }
__forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
__forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); }
__forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
__forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); }
#endif
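// Standalone sketch of the reduction scheme used above (assumes <arm_neon.h>; names are
// illustrative): the emulated 8-wide register is a pair of 4-wide halves, so a reduction
// is an element-wise op on the two halves followed by an across-lanes reduce.
#include <arm_neon.h>
static inline float reduce_min8_sketch(float32x4_t lo, float32x4_t hi) {
  return vminvq_f32(vminq_f32(lo, hi)); // vminq_f32: per-lane min, vminvq_f32: min across lanes
}
static inline float reduce_add8_sketch(float32x4_t lo, float32x4_t hi) {
  return vaddvq_f32(vaddq_f32(lo, hi)); // add the halves, then horizontal add
}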
__forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
{
const vfloat8 a = select(valid,v,vfloat8(pos_inf));

View file

@ -90,10 +90,10 @@ namespace embree
static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); }
static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); }
static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); }

View file

@ -98,53 +98,73 @@ namespace embree
#endif
#if defined(__SSE4_1__)
static __forceinline vint4 load(const unsigned char* ptr) {
#if defined(__aarch64__)
static __forceinline vint4 load(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
static __forceinline vint4 loadu(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
#elif defined(__SSE4_1__)
static __forceinline vint4 load(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
}
static __forceinline vint4 loadu(const unsigned char* ptr) {
static __forceinline vint4 loadu(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
}
#else
static __forceinline vint4 load(const unsigned char* ptr) {
static __forceinline vint4 load(const uint8_t* ptr) {
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
static __forceinline vint4 loadu(const unsigned char* ptr) {
static __forceinline vint4 loadu(const uint8_t* ptr) {
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
#endif
static __forceinline vint4 load(const unsigned short* ptr) {
#if defined (__SSE4_1__)
#if defined(__aarch64__)
return __m128i(vmovl_u16(vld1_u16(ptr)));
#elif defined (__SSE4_1__)
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
#else
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
#endif
}
static __forceinline void store(unsigned char* ptr, const vint4& v) {
#if defined(__SSE4_1__)
static __forceinline void store(uint8_t* ptr, const vint4& v) {
#if defined(__aarch64__)
int32x4_t x = v;
uint16x4_t y = vqmovn_u32(uint32x4_t(x));
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
#elif defined(__SSE4_1__)
__m128i x = v;
x = _mm_packus_epi32(x, x);
x = _mm_packus_epi16(x, x);
*(int*)ptr = _mm_cvtsi128_si32(x);
#else
for (size_t i=0;i<4;i++)
ptr[i] = (unsigned char)v[i];
ptr[i] = (uint8_t)v[i];
#endif
}
static __forceinline void store(unsigned short* ptr, const vint4& v) {
#if defined(__aarch64__)
uint32x4_t x = uint32x4_t(v.v);
uint16x4_t y = vqmovn_u32(x);
vst1_u16(ptr, y);
#else
for (size_t i=0;i<4;i++)
ptr[i] = (unsigned short)v[i];
#endif
}
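// Standalone sketch of the saturating-narrow stores above (assumes <arm_neon.h>, a
// little-endian target and a suitably aligned destination; the name is illustrative):
// four 32-bit lanes are narrowed to 16 and then 8 bits with unsigned saturation, and
// only the four result bytes are written.
#include <arm_neon.h>
#include <stdint.h>
static inline void store4_u8_sketch(uint8_t* dst, uint32x4_t v) {
  uint16x4_t n16 = vqmovn_u32(v);                      // 32 -> 16 bit, saturating
  uint8x8_t  n8  = vqmovn_u16(vcombine_u16(n16, n16)); // 16 -> 8 bit, saturating
  vst1_lane_u32((uint32_t*)dst, vreinterpret_u32_u8(n8), 0); // write the low 4 bytes
}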
static __forceinline vint4 load_nt(void* ptr) {
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
return _mm_stream_load_si128((__m128i*)ptr);
#else
return _mm_load_si128((__m128i*)ptr);
@ -152,7 +172,7 @@ namespace embree
}
static __forceinline void store_nt(void* ptr, const vint4& v) {
#if defined(__SSE4_1__)
#if !defined(__aarch64__) && defined(__SSE4_1__)
_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
#else
_mm_store_si128((__m128i*)ptr,v);
@ -161,14 +181,14 @@ namespace embree
template<int scale = 4>
static __forceinline vint4 gather(const int* ptr, const vint4& index) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_epi32(ptr, index, scale);
#else
return vint4(
*(int*)(((char*)ptr)+scale*index[0]),
*(int*)(((char*)ptr)+scale*index[1]),
*(int*)(((char*)ptr)+scale*index[2]),
*(int*)(((char*)ptr)+scale*index[3]));
*(int*)(((int8_t*)ptr)+scale*index[0]),
*(int*)(((int8_t*)ptr)+scale*index[1]),
*(int*)(((int8_t*)ptr)+scale*index[2]),
*(int*)(((int8_t*)ptr)+scale*index[3]));
#endif
}
@ -177,13 +197,13 @@ namespace embree
vint4 r = zero;
#if defined(__AVX512VL__)
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
#elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
#else
if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
return r;
#endif
}
@ -194,10 +214,10 @@ namespace embree
#if defined(__AVX512VL__)
_mm_i32scatter_epi32((int*)ptr, index, v, scale);
#else
*(int*)(((char*)ptr)+scale*index[0]) = v[0];
*(int*)(((char*)ptr)+scale*index[1]) = v[1];
*(int*)(((char*)ptr)+scale*index[2]) = v[2];
*(int*)(((char*)ptr)+scale*index[3]) = v[3];
*(int*)(((int8_t*)ptr)+scale*index[0]) = v[0];
*(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
*(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
*(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif
}
@ -207,14 +227,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
#else
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3];
if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif
}
#if defined(__x86_64__)
#if defined(__x86_64__) || defined(__aarch64__)
static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
#endif
@ -228,6 +248,8 @@ namespace embree
friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
#if defined(__AVX512VL__)
return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
#elif defined(__aarch64__)
return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
#elif defined(__SSE4_1__)
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
#else
@ -248,7 +270,9 @@ namespace embree
__forceinline vint4 operator +(const vint4& a) { return a; }
__forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
#if defined(__SSSE3__)
#if defined(__aarch64__)
__forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
#elif defined(__SSSE3__)
__forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
#endif
@ -264,7 +288,7 @@ namespace embree
__forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); }
__forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; }
#if defined(__SSE4_1__)
#if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
#else
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
@ -284,8 +308,8 @@ namespace embree
__forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); }
__forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; }
__forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
__forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }
__forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
__forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); }
__forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
__forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
@ -301,7 +325,7 @@ namespace embree
__forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
__forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; }
#if defined(__SSE4_1__)
#if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
__forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; }
#endif
@ -385,7 +409,8 @@ namespace embree
#endif
}
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
__forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
@ -409,16 +434,25 @@ namespace embree
__forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
__forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) {
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) {
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}
#endif
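// Standalone sketch of how a 32-bit lane shuffle maps onto the byte-table lookup used
// above (assumes <arm_neon.h> and little-endian lane order; _MN_SHUFFLE/_MF_SHUFFLE come
// from the bundled NEON compatibility header, so this sketch builds the index vector by
// hand): result lane k takes the four bytes of source lane ik.
#include <arm_neon.h>
#include <stdint.h>
template<int i0, int i1, int i2, int i3>
static inline int32x4_t shuffle_s32_sketch(int32x4_t v) {
  const uint8_t idx[16] = {
    uint8_t(4*i0+0), uint8_t(4*i0+1), uint8_t(4*i0+2), uint8_t(4*i0+3),
    uint8_t(4*i1+0), uint8_t(4*i1+1), uint8_t(4*i1+2), uint8_t(4*i1+3),
    uint8_t(4*i2+0), uint8_t(4*i2+1), uint8_t(4*i2+2), uint8_t(4*i2+3),
    uint8_t(4*i3+0), uint8_t(4*i3+1), uint8_t(4*i3+2), uint8_t(4*i3+3)};
  return vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(v), vld1q_u8(idx)));
}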
#if defined(__SSE3__)
template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@ -430,7 +464,10 @@ namespace embree
return shuffle<i,i,i,i>(v);
}
#if defined(__SSE4_1__)
#if defined(__aarch64__)
template<int src> __forceinline int extract(const vint4& b);
template<int dst> __forceinline vint4 insert(const vint4& a, const int b);
#elif defined(__SSE4_1__)
template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
#else
@ -438,7 +475,53 @@ namespace embree
template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
#endif
#if defined(__aarch64__)
template<> __forceinline int extract<0>(const vint4& b) {
return b.v[0];
}
template<> __forceinline int extract<1>(const vint4& b) {
return b.v[1];
}
template<> __forceinline int extract<2>(const vint4& b) {
return b.v[2];
}
template<> __forceinline int extract<3>(const vint4& b) {
return b.v[3];
}
template<> __forceinline vint4 insert<0>(const vint4& a, int b)
{
vint4 c = a;
c[0] = b;
return c;
}
template<> __forceinline vint4 insert<1>(const vint4& a, int b)
{
vint4 c = a;
c[1] = b;
return c;
}
template<> __forceinline vint4 insert<2>(const vint4& a, int b)
{
vint4 c = a;
c[2] = b;
return c;
}
template<> __forceinline vint4 insert<3>(const vint4& a, int b)
{
vint4 c = a;
c[3] = b;
return c;
}
__forceinline int toScalar(const vint4& v) {
return v[0];
}
__forceinline size_t toSizeT(const vint4& v) {
uint64x2_t x = uint64x2_t(v.v);
return x[0];
}
#else
template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
__forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
@ -446,10 +529,14 @@ namespace embree
__forceinline size_t toSizeT(const vint4& v) {
#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
return toScalar(v);
#elif defined(__ARM_NEON)
// FIXME(LTE): Do we need a swap(i.e. use lane 1)?
return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
#else
return _mm_cvtsi128_si64(v);
#endif
}
#endif
#if defined(__AVX512VL__)
@ -467,7 +554,17 @@ namespace embree
/// Reductions
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
#if defined(__aarch64__)
__forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); }
__forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
__forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
__forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
__forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
__forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
#else
__forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
__forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
__forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
@ -475,6 +572,7 @@ namespace embree
__forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
__forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
__forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
#endif
__forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
__forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
@ -494,7 +592,7 @@ namespace embree
/// Sorting networks
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
#if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4 usort_ascending(const vint4& v)
{

View file

@ -71,20 +71,25 @@ namespace embree
static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
#if !defined(__aarch64__)
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
#else
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
#endif
static __forceinline void store_nt(void* ptr, const vint8& v) {
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
}
static __forceinline vint8 load(const unsigned char* ptr) {
static __forceinline vint8 load(const uint8_t* ptr) {
vint4 il = vint4::load(ptr+0);
vint4 ih = vint4::load(ptr+4);
return vint8(il,ih);
}
static __forceinline vint8 loadu(const unsigned char* ptr) {
static __forceinline vint8 loadu(const uint8_t* ptr) {
vint4 il = vint4::loadu(ptr+0);
vint4 ih = vint4::loadu(ptr+4);
return vint8(il,ih);
@ -102,7 +107,7 @@ namespace embree
return vint8(il,ih);
}
static __forceinline void store(unsigned char* ptr, const vint8& i) {
static __forceinline void store(uint8_t* ptr, const vint8& i) {
vint4 il(i.vl);
vint4 ih(i.vh);
vint4::store(ptr + 0,il);
@ -117,54 +122,54 @@ namespace embree
template<int scale = 4>
static __forceinline vint8 gather(const int* ptr, const vint8& index) {
return vint8(
*(int*)(((char*)ptr)+scale*index[0]),
*(int*)(((char*)ptr)+scale*index[1]),
*(int*)(((char*)ptr)+scale*index[2]),
*(int*)(((char*)ptr)+scale*index[3]),
*(int*)(((char*)ptr)+scale*index[4]),
*(int*)(((char*)ptr)+scale*index[5]),
*(int*)(((char*)ptr)+scale*index[6]),
*(int*)(((char*)ptr)+scale*index[7]));
*(int*)(((int8_t*)ptr)+scale*index[0]),
*(int*)(((int8_t*)ptr)+scale*index[1]),
*(int*)(((int8_t*)ptr)+scale*index[2]),
*(int*)(((int8_t*)ptr)+scale*index[3]),
*(int*)(((int8_t*)ptr)+scale*index[4]),
*(int*)(((int8_t*)ptr)+scale*index[5]),
*(int*)(((int8_t*)ptr)+scale*index[6]),
*(int*)(((int8_t*)ptr)+scale*index[7]));
}
template<int scale = 4>
static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) {
vint8 r = zero;
if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]);
if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]);
return r;
}
template<int scale = 4>
static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
{
*(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
*(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
}
template<int scale = 4>
static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
{
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
}

View file

@ -67,8 +67,8 @@ namespace embree
/// Loads and Stores
////////////////////////////////////////////////////////////////////////////////
static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
@ -108,7 +108,7 @@ namespace embree
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
}
static __forceinline void store(unsigned char* ptr, const vint8& i)
static __forceinline void store(uint8_t* ptr, const vint8& i)
{
for (size_t j=0; j<8; j++)
ptr[j] = i[j];
@ -140,14 +140,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
#else
*(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
*(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif
}
@ -157,14 +157,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
#else
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7];
if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif
}
@ -385,7 +385,9 @@ namespace embree
__forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
__forceinline vint8 permute(const vint8& v, const __m256i& index) {
#if !defined(__aarch64__)
__forceinline vint8 permute(const vint8& v, const __m256i& index) {
return _mm256_permutevar8x32_epi32(v, index);
}
@ -393,6 +395,8 @@ namespace embree
return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
}
template<int i>
static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
#if defined(__AVX512VL__)
@ -402,6 +406,9 @@ namespace embree
#endif
}
#endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

View file

@ -78,7 +78,7 @@ namespace embree
return _mm512_load_si512(addr);
}
static __forceinline vllong8 load(const unsigned char* ptr) {
static __forceinline vllong8 load(const uint8_t* ptr) {
return _mm512_cvtepu8_epi64(*(__m128i*)ptr);
}

View file

@ -83,7 +83,7 @@ namespace embree
return _mm512_loadu_si512(addr);
}
static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
static __forceinline vuint16 load(const vuint16* addr) {

View file

@ -87,44 +87,64 @@ namespace embree
static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
#endif
#if defined(__SSE4_1__)
static __forceinline vuint4 load(const unsigned char* ptr) {
#if defined(__aarch64__)
static __forceinline vuint4 load(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
static __forceinline vuint4 loadu(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
#elif defined(__SSE4_1__)
static __forceinline vuint4 load(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
}
static __forceinline vuint4 loadu(const unsigned char* ptr) {
static __forceinline vuint4 loadu(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
}
#endif
static __forceinline vuint4 load(const unsigned short* ptr) {
#if defined (__SSE4_1__)
#if defined(__aarch64__)
return _mm_load4epu16_epi32(((__m128i*)ptr));
#elif defined (__SSE4_1__)
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
#else
return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
#endif
}
static __forceinline void store_uchar(unsigned char* ptr, const vuint4& v) {
#if defined(__SSE4_1__)
static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) {
#if defined(__aarch64__)
uint32x4_t x = uint32x4_t(v.v);
uint16x4_t y = vqmovn_u32(x);
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0);
#elif defined(__SSE4_1__)
__m128i x = v;
x = _mm_packus_epi32(x, x);
x = _mm_packus_epi16(x, x);
*(unsigned*)ptr = _mm_cvtsi128_si32(x);
#else
for (size_t i=0;i<4;i++)
ptr[i] = (unsigned char)v[i];
ptr[i] = (uint8_t)v[i];
#endif
}
static __forceinline void store_uchar(unsigned short* ptr, const vuint4& v) {
static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) {
#if defined(__aarch64__)
uint32x4_t x = (uint32x4_t)v.v;
uint16x4_t y = vqmovn_u32(x);
vst1_u16(ptr, y);
#else
for (size_t i=0;i<4;i++)
ptr[i] = (unsigned short)v[i];
#endif
}
static __forceinline vuint4 load_nt(void* ptr) {
#if defined(__SSE4_1__)
#if (defined(__aarch64__)) || defined(__SSE4_1__)
return _mm_stream_load_si128((__m128i*)ptr);
#else
return _mm_load_si128((__m128i*)ptr);
@ -132,8 +152,8 @@ namespace embree
}
static __forceinline void store_nt(void* ptr, const vuint4& v) {
#if defined(__SSE4_1__)
_mm_stream_ps((float*)ptr,_mm_castsi128_ps(v));
#if !defined(__aarch64__) && defined(__SSE4_1__)
_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
#else
_mm_store_si128((__m128i*)ptr,v);
#endif
@ -141,14 +161,14 @@ namespace embree
template<int scale = 4>
static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_epi32((const int*)ptr, index, scale);
#else
return vuint4(
*(unsigned int*)(((char*)ptr)+scale*index[0]),
*(unsigned int*)(((char*)ptr)+scale*index[1]),
*(unsigned int*)(((char*)ptr)+scale*index[2]),
*(unsigned int*)(((char*)ptr)+scale*index[3]));
*(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[3]));
#endif
}
@ -157,13 +177,13 @@ namespace embree
vuint4 r = zero;
#if defined(__AVX512VL__)
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
#elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
#else
if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
return r;
#endif
}
@ -353,16 +373,25 @@ namespace embree
__forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
__forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& v) {
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& v) {
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}
#endif
#if defined(__SSE3__)
template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@ -374,7 +403,10 @@ namespace embree
return shuffle<i,i,i,i>(v);
}
#if defined(__SSE4_1__)
#if defined(__aarch64__)
template<int src> __forceinline unsigned int extract(const vuint4& b);
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b);
#elif defined(__SSE4_1__)
template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
#else
@ -382,10 +414,49 @@ namespace embree
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
#endif
#if defined(__aarch64__)
template<> __forceinline unsigned int extract<0>(const vuint4& b) {
return b[0];
}
template<> __forceinline unsigned int extract<1>(const vuint4& b) {
return b[1];
}
template<> __forceinline unsigned int extract<2>(const vuint4& b) {
return b[2];
}
template<> __forceinline unsigned int extract<3>(const vuint4& b) {
return b[3];
}
template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){
vuint4 c = a;
c[0] = b;
return c;
}
template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){
vuint4 c = a;
c[1] = b;
return c;
}
template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){
vuint4 c = a;
c[2] = b;
return c;
}
template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){
vuint4 c = a;
c[3] = b;
return c;
}
__forceinline unsigned int toScalar(const vuint4& v) {
return v[0];
}
#else
template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
__forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
#endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions

View file

@ -69,20 +69,24 @@ namespace embree
static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
#if !defined(__aarch64__)
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
#else
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
#endif
static __forceinline void store_nt(void* ptr, const vuint8& v) {
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
}
static __forceinline vuint8 load(const unsigned char* ptr) {
static __forceinline vuint8 load(const uint8_t* ptr) {
vuint4 il = vuint4::load(ptr+0);
vuint4 ih = vuint4::load(ptr+4);
return vuint8(il,ih);
}
static __forceinline vuint8 loadu(const unsigned char* ptr) {
static __forceinline vuint8 loadu(const uint8_t* ptr) {
vuint4 il = vuint4::loadu(ptr+0);
vuint4 ih = vuint4::loadu(ptr+4);
return vuint8(il,ih);
@ -100,7 +104,7 @@ namespace embree
return vuint8(il,ih);
}
static __forceinline void store(unsigned char* ptr, const vuint8& i) {
static __forceinline void store(uint8_t* ptr, const vuint8& i) {
vuint4 il(i.vl);
vuint4 ih(i.vh);
vuint4::store(ptr + 0,il);
@ -115,54 +119,54 @@ namespace embree
template<int scale = 4>
static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
return vuint8(
*(unsigned int*)(((char*)ptr)+scale*index[0]),
*(unsigned int*)(((char*)ptr)+scale*index[1]),
*(unsigned int*)(((char*)ptr)+scale*index[2]),
*(unsigned int*)(((char*)ptr)+scale*index[3]),
*(unsigned int*)(((char*)ptr)+scale*index[4]),
*(unsigned int*)(((char*)ptr)+scale*index[5]),
*(unsigned int*)(((char*)ptr)+scale*index[6]),
*(unsigned int*)(((char*)ptr)+scale*index[7]));
*(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[3]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[4]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[5]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[6]),
*(unsigned int*)(((int8_t*)ptr)+scale*index[7]));
}
template<int scale = 4>
static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
vuint8 r = zero;
if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]);
if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]);
return r;
}
template<int scale = 4>
static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
{
*(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
*(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
*(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
*(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
*(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
*(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
*(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
}
template<int scale = 4>
static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
{
if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
}

View file

@ -66,8 +66,8 @@ namespace embree
/// Loads and Stores
////////////////////////////////////////////////////////////////////////////////
static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
@ -107,7 +107,7 @@ namespace embree
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
}
static __forceinline void store(unsigned char* ptr, const vuint8& i)
static __forceinline void store(uint8_t* ptr, const vuint8& i)
{
for (size_t j=0; j<8; j++)
ptr[j] = i[j];
@ -139,14 +139,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
#else
*(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
*(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
*(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
*(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
*(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
*(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
*(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6];
*(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7];
#endif
}
@ -156,14 +156,14 @@ namespace embree
#if defined(__AVX512VL__)
_mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
#else
if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7];
if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif
}
@ -379,6 +379,8 @@ namespace embree
__forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
#if !defined(__aarch64__)
__forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
return _mm256_permutevar8x32_epi32(v, index);
}
@ -396,6 +398,9 @@ namespace embree
#endif
}
#endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

View file

@ -21,7 +21,10 @@ namespace embree
void* ptr = _mm_malloc(size,align);
if (size != 0 && ptr == nullptr)
throw std::bad_alloc();
// -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return ptr;
}
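// The pattern applied throughout this file: the upstream throw is kept as a comment and
// replaced by abort(), since the engine is typically built without C++ exception support.
// Illustrative shape of the change (names and the allocator call are placeholders):
#include <cstddef>
#include <cstdlib>
static void* checked_alloc_sketch(size_t size) {
  void* p = malloc(size);
  if (size != 0 && p == nullptr) abort(); // was: throw std::bad_alloc();
  return p;
}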
@ -128,7 +131,10 @@ namespace embree
/* fall back to 4k pages */
int flags = MEM_COMMIT | MEM_RESERVE;
char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
if (ptr == nullptr) throw std::bad_alloc();
// -- GODOT start --
// if (ptr == nullptr) throw std::bad_alloc();
if (ptr == nullptr) abort();
// -- GODOT end --
hugepages = false;
return ptr;
}
@ -145,7 +151,10 @@ namespace embree
return bytesOld;
if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
throw std::bad_alloc();
// -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return bytesNew;
}
@ -156,7 +165,10 @@ namespace embree
return;
if (!VirtualFree(ptr,0,MEM_RELEASE))
throw std::bad_alloc();
// -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
}
void os_advise(void *ptr, size_t bytes)
@ -260,7 +272,10 @@ namespace embree
/* fallback to 4k pages */
void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) throw std::bad_alloc();
// -- GODOT start --
// if (ptr == MAP_FAILED) throw std::bad_alloc();
if (ptr == MAP_FAILED) abort();
// -- GODOT end --
hugepages = false;
/* advise huge page hint for THP */
@ -277,7 +292,10 @@ namespace embree
return bytesOld;
if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
throw std::bad_alloc();
// -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return bytesNew;
}
@ -291,7 +309,10 @@ namespace embree
const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
bytes = (bytes+pageSize-1) & ~(pageSize-1);
if (munmap(ptr,bytes) == -1)
throw std::bad_alloc();
// -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
}
/* hint for transparent huge pages (THP) */

View file

@ -139,7 +139,7 @@ namespace embree
__forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; }
__forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; }
__forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
#endif
@ -196,7 +196,7 @@ namespace embree
__forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
__forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
#endif

View file

@ -9,7 +9,14 @@
#include <intrin.h>
#endif
#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"
#if defined(NEON_AVX2_EMULATION)
#include "../math/AVX2NEON.h"
#endif
#else
#include <immintrin.h>
#endif
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
@ -20,6 +27,14 @@
#endif
#endif
#if defined(__aarch64__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __builtin_clz
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __builtin_clzll
#endif
#else
#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
@ -28,16 +43,13 @@
#define _lzcnt_u64 __lzcnt64
#endif
#endif
#endif
#if defined(__WIN32__)
// -- GODOT start --
#if !defined(NOMINMAX)
// -- GODOT end --
#define NOMINMAX
// -- GODOT start --
#endif
#include "windows.h"
// -- GODOT end --
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#endif
/* normally defined in pmmintrin.h, but we always need this */
@ -65,7 +77,7 @@ namespace embree
}
__forceinline int bsf(int v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _tzcnt_u32(v);
#else
unsigned long r = 0; _BitScanForward(&r,v); return r;
@ -73,7 +85,7 @@ namespace embree
}
__forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _tzcnt_u32(v);
#else
unsigned long r = 0; _BitScanForward(&r,v); return r;
@ -114,7 +126,7 @@ namespace embree
#endif
__forceinline int bsr(int v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v);
#else
unsigned long r = 0; _BitScanReverse(&r,v); return r;
@ -122,7 +134,7 @@ namespace embree
}
__forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v);
#else
unsigned long r = 0; _BitScanReverse(&r,v); return r;
@ -141,7 +153,7 @@ namespace embree
__forceinline int lzcnt(const int x)
{
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _lzcnt_u32(x);
#else
if (unlikely(x == 0)) return 32;
@ -210,47 +222,72 @@ namespace embree
#else
__forceinline void __cpuid(int out[4], int op) {
#if defined(__ARM_NEON)
if (op == 0) { // Get CPU name
out[0] = 0x41524d20;
out[1] = 0x41524d20;
out[2] = 0x41524d20;
out[3] = 0x41524d20;
}
#else
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
#endif
}
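// The constant above packs the ASCII codes 'A' 'R' 'M' ' ' (0x41, 0x52, 0x4D, 0x20) into
// one 32-bit word, so callers that inspect the raw cpuid words get a recognizable dummy
// vendor value instead of garbage. Quick compile-time check (illustrative only):
static_assert((0x41524d20u >> 24) == 'A' && ((0x41524d20u >> 16) & 0xffu) == 'R' &&
              ((0x41524d20u >> 8) & 0xffu) == 'M' && (0x41524d20u & 0xffu) == ' ',
              "0x41524d20 spells 'ARM ' from the most significant byte down");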
#if !defined(__ARM_NEON)
__forceinline void __cpuid_count(int out[4], int op1, int op2) {
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
}
#endif
#endif
__forceinline uint64_t read_tsc() {
#if defined(__ARM_NEON)
return 0; // FIXME(LTE): mimic rdtsc
#else
uint32_t high,low;
asm volatile ("rdtsc" : "=d"(high), "=a"(low));
return (((uint64_t)high) << 32) + (uint64_t)low;
#endif
}
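// The aarch64 branch above simply returns 0. If a monotonic, cycle-like counter were ever
// wanted here, one common option (an assumption for illustration, not something this patch
// does) is the generic timer's virtual counter, readable from user space on typical setups:
#include <cstdint>
static inline uint64_t read_virtual_counter_sketch() {
  uint64_t cnt;
  asm volatile("mrs %0, cntvct_el0" : "=r"(cnt)); // fixed-frequency counter, not CPU cycles
  return cnt;
}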
__forceinline int bsf(int v) {
#if defined(__ARM_NEON)
return __builtin_ctz(v);
#else
#if defined(__AVX2__)
return _tzcnt_u32(v);
#else
int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif
}
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned bsf(unsigned v)
{
#if defined(__ARM_NEON)
return __builtin_ctz(v);
#else
#if defined(__AVX2__)
return _tzcnt_u32(v);
#else
unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif
}
#endif
__forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
return _tzcnt_u64(v);
#else
return _tzcnt_u32(v);
#endif
#elif defined(__ARM_NEON)
return __builtin_ctzl(v);
#else
size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
@ -263,7 +300,7 @@ namespace embree
return i;
}
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned int bscf(unsigned int& v)
{
unsigned int i = bsf(v);
@ -280,17 +317,21 @@ namespace embree
}
__forceinline int bsr(int v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
return __builtin_clz(v)^31;
#else
int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
}
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
return __builtin_clz(v)^31;
#else
unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
@ -298,12 +339,14 @@ namespace embree
#endif
__forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
return 63 - _lzcnt_u64(v);
#else
return 31 - _lzcnt_u32(v);
#endif
#elif defined(__aarch64__)
return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#else
size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
@ -311,7 +354,7 @@ namespace embree
__forceinline int lzcnt(const int x)
{
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
return _lzcnt_u32(x);
#else
if (unlikely(x == 0)) return 32;
@ -320,7 +363,7 @@ namespace embree
}
__forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
return _blsr_u64(v);
#else
@ -336,27 +379,65 @@ namespace embree
}
__forceinline int btc(int v, int i) {
#if defined(__aarch64__)
// _bittestandcomplement(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a ^ (1 << b);
// return x;
// We only need `*a`
return (v ^ (1 << i));
#else
int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
}
__forceinline int bts(int v, int i) {
#if defined(__aarch64__)
// _bittestandset(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a | (1 << b);
// return x;
return (v | (1 << i));
#else
int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
}
__forceinline int btr(int v, int i) {
#if defined(__aarch64__)
// _bittestandreset(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a & ~(1 << b);
// return x;
return (v & ~(1 << i));
#else
int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
}
__forceinline size_t btc(size_t v, size_t i) {
#if defined(__aarch64__)
return (v ^ (((size_t)1) << i));
#else
size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
}
__forceinline size_t bts(size_t v, size_t i) {
#if defined(__aarch64__)
return (v | (((size_t)1) << i));
#else
size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
}
__forceinline size_t btr(size_t v, size_t i) {
#if defined(__ARM_NEON)
return (v & ~(((size_t)1) << i));
#else
size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
}
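// Plain-C++ recap of what the aarch64 branches above compute (names are illustrative; only
// the updated value is produced, since the carry-flag result of the x86 bt* instructions is
// not needed by the callers here):
#include <cstddef>
static inline int    btc_sketch(int v, int i)       { return v ^  (1 << i); }        // toggle bit i
static inline int    bts_sketch(int v, int i)       { return v |  (1 << i); }        // set bit i
static inline int    btr_sketch(int v, int i)       { return v & ~(1 << i); }        // clear bit i
static inline size_t bts_sketch(size_t v, size_t i) { return v | ((size_t)1 << i); } // 64-bit-safe shift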
__forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
@ -390,7 +471,7 @@ namespace embree
#endif
#endif
#if defined(__SSE4_2__)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
__forceinline int popcnt(int in) {
return _mm_popcnt_u32(in);
@ -400,7 +481,7 @@ namespace embree
return _mm_popcnt_u32(in);
}
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__ARM_NEON)
__forceinline size_t popcnt(size_t in) {
return _mm_popcnt_u64(in);
}
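// Standalone sketch of what the scalar population count amounts to on aarch64 (the
// _mm_popcnt_* calls above go through the NEON compatibility layer; the compiler builtin
// shown here is an equivalent formulation, used as an assumption for illustration):
#include <cstddef>
static inline size_t popcnt_sketch(size_t v) { return (size_t)__builtin_popcountll(v); }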
@ -443,7 +524,7 @@ namespace embree
__forceinline void prefetchL2EX(const void* ptr) {
prefetchEX(ptr);
}
#if defined(__AVX2__)
#if defined(__AVX2__) && !defined(__aarch64__)
__forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
__forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)

View file

@ -27,9 +27,7 @@ namespace embree
/* returns address of a symbol from the library */
void* getSymbol(lib_t lib, const std::string& sym) {
// -- GODOT start --
return (void*) GetProcAddress(HMODULE(lib),sym.c_str());
// -- GODOT end --
return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str()));
}
/* closes the shared library */

View file

@ -36,6 +36,7 @@ namespace embree
MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
assert(ok);
delete (pthread_mutex_t*)mutex;
mutex = nullptr;
}
void MutexSys::lock()

View file

@ -91,7 +91,7 @@
#define dll_import
#endif
#if defined(__WIN32__) && !defined(__MINGW32__)
#ifdef __WIN32__
#if !defined(__noinline)
#define __noinline __declspec(noinline)
#endif
@ -103,11 +103,16 @@
#define __restrict__ //__restrict // causes issues with MSVC
#endif
#if !defined(__thread)
// NOTE: Require `-fms-extensions` for clang
#define __thread __declspec(thread)
#endif
#if !defined(__aligned)
#if defined(__MINGW32__)
#define __aligned(...) __attribute__((aligned(__VA_ARGS__)))
#else
#define __aligned(...) __declspec(align(__VA_ARGS__))
#endif
#endif
//#define __FUNCTION__ __FUNCTION__
#define debugbreak() __debugbreak()
@ -142,7 +147,7 @@
#endif
// -- GODOT start --
#if !defined(likely)
#ifndef likely
// -- GODOT end --
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define likely(expr) (expr)
@ -169,11 +174,19 @@
#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
#if defined(DEBUG) // only report file and line in debug mode
// -- GODOT start --
// #define THROW_RUNTIME_ERROR(str)
// throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define THROW_RUNTIME_ERROR(str) \
throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
// -- GODOT end --
#else
// -- GODOT start --
// #define THROW_RUNTIME_ERROR(str)
// throw std::runtime_error(str);
#define THROW_RUNTIME_ERROR(str) \
throw std::runtime_error(str);
abort();
// -- GODOT end --
#endif
#define FATAL(x) THROW_RUNTIME_ERROR(x)
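Illustrative note: the Godot patch reroutes THROW_RUNTIME_ERROR away from C++ exceptions (Godot builds Embree with exceptions disabled) and into printing plus abort(). A minimal sketch of such an exception-free fatal macro, with hypothetical names:

#include <cstdio>
#include <cstdlib>

#define FATAL_ABORT(msg) \
  do { std::fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, (msg)); std::abort(); } while (0)

int main() {
  if (false) FATAL_ABORT("unreachable example error"); // never taken; shown for usage only
  return 0;
}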
@ -192,7 +205,7 @@ namespace embree {
/* windows does not have ssize_t */
#if defined(__WIN32__)
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
typedef int64_t ssize_t;
#else
typedef int32_t ssize_t;

View file

@ -21,7 +21,13 @@ namespace embree
std::string getPlatformName()
{
#if defined(__LINUX__) && !defined(__X86_64__)
#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON)
return "Android Linux (aarch64 / arm64)";
#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__)
return "Android Linux (x64)";
#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86))
return "Android Linux (x86)";
#elif defined(__LINUX__) && !defined(__X86_64__)
return "Linux (32bit)";
#elif defined(__LINUX__) && defined(__X86_64__)
return "Linux (64bit)";
@ -37,10 +43,16 @@ namespace embree
return "Windows (32bit)";
#elif defined(__WIN32__) && defined(__X86_64__)
return "Windows (64bit)";
#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__)
return "iOS Simulator (x64)";
#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON)
return "iOS (aarch64 / arm64)";
#elif defined(__MACOSX__) && !defined(__X86_64__)
return "Mac OS X (32bit)";
#elif defined(__MACOSX__) && defined(__X86_64__)
return "Mac OS X (64bit)";
#elif defined(__UNIX__) && defined(__aarch64__)
return "Unix (aarch64)";
#elif defined(__UNIX__) && !defined(__X86_64__)
return "Unix (32bit)";
#elif defined(__UNIX__) && defined(__X86_64__)
@ -183,11 +195,13 @@ namespace embree
case CPU::NEHALEM : return "Nehalem";
case CPU::CORE2 : return "Core2";
case CPU::CORE1 : return "Core";
case CPU::ARM : return "Arm";
case CPU::UNKNOWN : return "Unknown CPU";
}
return "Unknown CPU (error)";
}
#if !defined(__ARM_NEON)
/* constants to access destination registers of CPUID instruction */
static const int EAX = 0;
static const int EBX = 1;
@ -230,10 +244,13 @@ namespace embree
/* cpuid[eax=7,ecx=0].ecx */
static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions)
#endif
#if !defined(__ARM_NEON)
__noinline int64_t get_xcr0()
{
#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */
// https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466
#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
xcr0 = _xgetbv(0);
return xcr0;
@ -243,9 +260,32 @@ namespace embree
return xcr0;
#endif
}
#endif
int getCPUFeatures()
{
#if defined(__ARM_NEON)
int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2;
#if defined(NEON_AVX2_EMULATION)
cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
cpu_features |= CPU_FEATURE_XMM_ENABLED;
cpu_features |= CPU_FEATURE_YMM_ENABLED;
cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
cpu_features |= CPU_FEATURE_POPCNT;
cpu_features |= CPU_FEATURE_AVX;
cpu_features |= CPU_FEATURE_AVX2;
cpu_features |= CPU_FEATURE_FMA3;
cpu_features |= CPU_FEATURE_LZCNT;
cpu_features |= CPU_FEATURE_BMI1;
cpu_features |= CPU_FEATURE_BMI2;
cpu_features |= CPU_FEATURE_NEON_2X;
#endif
return cpu_features;
#else
/* cache CPU features access */
static int cpu_features = 0;
if (cpu_features)
@ -297,8 +337,8 @@ namespace embree
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2;
@ -318,6 +358,7 @@ namespace embree
if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
return cpu_features;
#endif
}
std::string stringOfCPUFeatures(int features)
@ -350,6 +391,8 @@ namespace embree
if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
if (features & CPU_FEATURE_NEON) str += "NEON ";
if (features & CPU_FEATURE_NEON_2X) str += "2xNEON ";
return str;
}
@ -365,6 +408,8 @@ namespace embree
if (isa == AVX2) return "AVX2";
if (isa == AVX512KNL) return "AVX512KNL";
if (isa == AVX512SKX) return "AVX512SKX";
if (isa == NEON) return "NEON";
if (isa == NEON_2X) return "2xNEON";
return "UNKNOWN";
}
@ -386,6 +431,8 @@ namespace embree
if (hasISA(features,AVX2)) v += "AVX2 ";
if (hasISA(features,AVX512KNL)) v += "AVX512KNL ";
if (hasISA(features,AVX512SKX)) v += "AVX512SKX ";
if (hasISA(features,NEON)) v += "NEON ";
if (hasISA(features,NEON_2X)) v += "2xNEON ";
return v;
}
}
@ -596,7 +643,7 @@ namespace embree
static int nThreads = -1;
if (nThreads != -1) return nThreads;
#if defined(__MACOSX__)
#if defined(__MACOSX__) || defined(__ANDROID__)
nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
assert(nThreads);
#else

View file

@ -59,6 +59,11 @@
# define isa sse
# define ISA SSE
# define ISA_STR "SSE"
#elif defined(__ARM_NEON)
// NOTE(LTE): Use sse2 for `isa` for compatibility at the moment.
#define isa sse2
#define ISA NEON
#define ISA_STR "NEON"
#else
#error Unknown ISA
#endif
@ -87,6 +92,7 @@ namespace embree
NEHALEM,
CORE2,
CORE1,
ARM,
UNKNOWN,
};
@ -136,6 +142,8 @@ namespace embree
static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
static const int CPU_FEATURE_NEON = 1 << 28;
static const int CPU_FEATURE_NEON_2X = 1 << 29;
/*! get CPU features */
int getCPUFeatures();
@ -158,6 +166,8 @@ namespace embree
static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED;
static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;
static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2;
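Illustrative note: ISA levels are defined as supersets of individual feature bits, so an "is this ISA supported" query reduces to a subset test against the reported CPU features. A small sketch with assumed flag values (only CPU_FEATURE_NEON's 1 << 28 is taken from the header above):

#include <cstdio>

static const int FEAT_SSE  = 1 << 0;   // assumed value, for illustration
static const int FEAT_SSE2 = 1 << 1;   // assumed value, for illustration
static const int FEAT_NEON = 1 << 28;

// NEON advertises the SSE2 baseline it emulates, mirroring the NEON constant above.
static const int ISA_NEON = FEAT_NEON | FEAT_SSE | FEAT_SSE2;

static bool hasIsaSubset(int cpu_features, int isa) { return (cpu_features & isa) == isa; }

int main() {
  int cpu = FEAT_NEON | FEAT_SSE | FEAT_SSE2; // what a NEON core would report
  std::printf("NEON ISA supported: %d\n", hasIsaSubset(cpu, ISA_NEON));
  return 0;
}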
/*! converts ISA bitvector into a string */
std::string stringOfISA(int features);

View file

@ -6,7 +6,11 @@
#include "string.h"
#include <iostream>
#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"
#else
#include <xmmintrin.h>
#endif
#if defined(PTHREADS_WIN32)
#pragma comment (lib, "pthreadVC.lib")
@ -95,6 +99,7 @@ namespace embree
_mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
parg->f(parg->arg);
delete parg;
parg = nullptr;
return 0;
}
@ -120,12 +125,6 @@ namespace embree
CloseHandle(HANDLE(tid));
}
/*! destroy a hardware thread by its handle */
void destroyThread(thread_t tid) {
TerminateThread(HANDLE(tid),0);
CloseHandle(HANDLE(tid));
}
/*! creates thread local storage */
tls_t createTls() {
return tls_t(size_t(TlsAlloc()));
@ -160,11 +159,16 @@ namespace embree
#include <sstream>
#include <algorithm>
#if defined(__ANDROID__)
#include <pthread.h>
#endif
namespace embree
{
static MutexSys mutex;
static std::vector<size_t> threadIDs;
#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target
/* changes thread ID mapping such that we first fill up all thread on one core */
size_t mapThreadID(size_t threadID)
{
@ -229,16 +233,21 @@ namespace embree
return ID;
}
#endif
/*! set affinity of the calling thread */
void setAffinity(ssize_t affinity)
{
#if defined(__ANDROID__)
// TODO(LTE): Implement
#else
cpu_set_t cset;
CPU_ZERO(&cset);
size_t threadID = mapThreadID(affinity);
CPU_SET(threadID, &cset);
pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
#endif
}
}
#endif
@ -326,6 +335,7 @@ namespace embree
parg->f(parg->arg);
delete parg;
parg = nullptr;
return nullptr;
}
@ -347,7 +357,7 @@ namespace embree
pthread_attr_destroy(&attr);
/* set affinity */
#if defined(__LINUX__)
#if defined(__LINUX__) && !defined(__ANDROID__)
if (threadID >= 0) {
cpu_set_t cset;
CPU_ZERO(&cset);
@ -379,12 +389,6 @@ namespace embree
delete (pthread_t*)tid;
}
/*! destroy a hardware thread by its handle */
void destroyThread(thread_t tid) {
pthread_cancel(*(pthread_t*)tid);
delete (pthread_t*)tid;
}
/*! creates thread local storage */
tls_t createTls()
{

View file

@ -29,9 +29,6 @@ namespace embree
/*! waits until the given thread has terminated */
void join(thread_t tid);
/*! destroy handle of a thread */
void destroyThread(thread_t tid);
/*! type for handle to thread local storage */
typedef struct opaque_tls_t* tls_t;

View file

@ -5,6 +5,8 @@
#if defined(TASKING_INTERNAL)
# include "taskschedulerinternal.h"
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
# include "taskschedulergcd.h"
#elif defined(TASKING_TBB)
# include "taskschedulertbb.h"
#elif defined(TASKING_PPL)

View file

@ -0,0 +1,49 @@
#pragma once
#include "../sys/platform.h"
#include "../sys/alloc.h"
#include "../sys/barrier.h"
#include "../sys/thread.h"
#include "../sys/mutex.h"
#include "../sys/condition.h"
#include "../sys/ref.h"
#include <dispatch/dispatch.h>
namespace embree
{
struct TaskScheduler
{
/*! initializes the task scheduler */
static void create(size_t numThreads, bool set_affinity, bool start_threads);
/*! destroys the task scheduler again */
static void destroy() {}
/* returns the ID of the current thread */
static __forceinline size_t threadID()
{
return threadIndex();
}
/* returns the index (0..threadCount-1) of the current thread */
static __forceinline size_t threadIndex()
{
currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads;
return currentThreadIndex;
}
/* returns the total number of threads */
static __forceinline size_t threadCount()
{
return GCDNumThreads;
}
private:
static size_t GCDNumThreads;
static size_t currentThreadIndex;
};
};
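Illustrative note: the GCD scheduler above hands out thread indices round-robin rather than tracking real worker identities. A toy model of that scheme (not part of the patch):

#include <cstddef>
#include <cstdio>

struct RoundRobin {
  size_t count;
  size_t current;
  size_t next() { current = (current + 1) % count; return current; } // same update as threadIndex() above
};

int main() {
  RoundRobin rr{4, 0};
  for (int i = 0; i < 6; i++) std::printf("%zu ", rr.next()); // prints: 1 2 3 0 1 2
  std::printf("\n");
  return 0;
}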

View file

@ -48,13 +48,15 @@ namespace embree
{
Task* prevTask = thread.task;
thread.task = this;
try {
if (thread.scheduler->cancellingException == nullptr)
// -- GODOT start --
// try {
// if (thread.scheduler->cancellingException == nullptr)
closure->execute();
} catch (...) {
if (thread.scheduler->cancellingException == nullptr)
thread.scheduler->cancellingException = std::current_exception();
}
// } catch (...) {
// if (thread.scheduler->cancellingException == nullptr)
// thread.scheduler->cancellingException = std::current_exception();
// }
// -- GODOT end --
thread.task = prevTask;
add_dependencies(-1);
}
@ -152,6 +154,12 @@ namespace embree
assert(newNumThreads);
newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
// We are observing a few % gain by increasing the number of threads by 2x on aarch64.
#if defined(__aarch64__) && defined(BUILD_IOS)
numThreads = newNumThreads*2;
#else
numThreads = newNumThreads;
#endif
numThreads = newNumThreads;
if (!startThreads && !running) return;
running = true;
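Illustrative note: a sketch of the thread-count policy above, under the assumption that std::thread::hardware_concurrency() stands in for getNumberOfLogicalThreads(); the 2x factor models the aarch64/iOS oversubscription branch.

#include <algorithm>
#include <cstdio>
#include <thread>

static size_t pick_worker_count(size_t requested, bool oversubscribe2x) {
  size_t logical = std::max<size_t>(1, std::thread::hardware_concurrency());
  size_t n = std::min(requested ? requested : logical, logical); // clamp to the logical core count
  return oversubscribe2x ? n * 2 : n;
}

int main() {
  std::printf("workers: %zu\n", pick_worker_count(0, /*oversubscribe2x=*/true));
  return 0;
}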
@ -291,8 +299,11 @@ namespace embree
size_t threadIndex = allocThreadIndex();
condition.wait(mutex, [&] () { return hasRootTask.load(); });
mutex.unlock();
std::exception_ptr except = thread_loop(threadIndex);
if (except != nullptr) std::rethrow_exception(except);
// -- GODOT start --
// std::exception_ptr except = thread_loop(threadIndex);
// if (except != nullptr) std::rethrow_exception(except);
thread_loop(threadIndex);
// -- GODOT end --
}
void TaskScheduler::reset() {
@ -324,7 +335,10 @@ namespace embree
return thread->scheduler->cancellingException == nullptr;
}
std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
// -- GODOT start --
// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
void TaskScheduler::thread_loop(size_t threadIndex)
// -- GODOT end --
{
/* allocate thread structure */
std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
@ -347,9 +361,10 @@ namespace embree
swapThread(oldThread);
/* remember exception to throw */
std::exception_ptr except = nullptr;
if (cancellingException != nullptr) except = cancellingException;
// -- GODOT start --
// std::exception_ptr except = nullptr;
// if (cancellingException != nullptr) except = cancellingException;
// -- GODOT end --
/* wait for all threads to terminate */
threadCounter--;
#if defined(__WIN32__)
@ -367,7 +382,10 @@ namespace embree
yield();
#endif
}
return except;
// -- GODOT start --
// return except;
return;
// -- GODOT end --
}
bool TaskScheduler::steal_from_other_threads(Thread& thread)

View file

@ -123,7 +123,10 @@ namespace embree
{
size_t ofs = bytes + ((align - stackPtr) & (align-1));
if (stackPtr + ofs > CLOSURE_STACK_SIZE)
throw std::runtime_error("closure stack overflow");
// -- GODOT start --
// throw std::runtime_error("closure stack overflow");
abort();
// -- GODOT end --
stackPtr += ofs;
return &stack[stackPtr-bytes];
}
@ -132,12 +135,16 @@ namespace embree
__forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
{
if (right >= TASK_STACK_SIZE)
throw std::runtime_error("task stack overflow");
// -- GODOT start --
// throw std::runtime_error("task stack overflow");
abort();
// -- GODOT end --
/* allocate new task on right side of stack */
size_t oldStackPtr = stackPtr;
TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
new (&tasks[right]) Task(func,thread.task,oldStackPtr,size);
/* gcc 8 or later fails to compile without explicit .load() */
new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
right++;
/* also move left pointer */
@ -238,7 +245,10 @@ namespace embree
void wait_for_threads(size_t threadCount);
/*! thread loop for all worker threads */
std::exception_ptr thread_loop(size_t threadIndex);
// -- GODOT start --
// std::exception_ptr thread_loop(size_t threadIndex);
void thread_loop(size_t threadIndex);
// -- GODOT end --
/*! steals a task from a different thread */
bool steal_from_other_threads(Thread& thread);

View file

@ -12,13 +12,7 @@
#include "../sys/ref.h"
#if defined(__WIN32__)
// -- GODOT start --
#if !defined(NOMINMAX)
// -- GODOT end --
# define NOMINMAX
// -- GODOT start --
#endif
// -- GODOT end --
#endif
// We need to define these to avoid implicit linkage against

View file

@ -19,7 +19,7 @@ typedef int ssize_t;
#endif
#endif
#if defined(_WIN32) && defined(_MSC_VER)
#if defined(_WIN32) && !defined(__MINGW32__)
# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
#else
# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))

View file

@ -43,7 +43,7 @@ namespace embree
{
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(settings.sahBlockSize);
if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast<size_t>(settings.sahBlockSize));
if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost;

View file

@ -51,7 +51,7 @@ namespace embree
template<int N>
void BVHN<N>::layoutLargeNodes(size_t num)
{
#if defined(__X86_64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
struct NodeArea
{
__forceinline NodeArea() {}
@ -183,7 +183,7 @@ namespace embree
template class BVHN<8>;
#endif
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
template class BVHN<4>;
#endif
}

View file

@ -18,7 +18,7 @@
#include "../geometry/object.h"
#include "../geometry/instance.h"
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
#else
# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues

View file

@ -172,12 +172,23 @@ namespace embree
TravRayKStream<K,robust> &p = packets[rayID / K];
const size_t i = rayID % K;
const vint<Nx> bitmask(shiftTable[rayID]);
#if defined (__aarch64__)
const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
const vfloat<Nx> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
const vfloat<Nx> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
const vfloat<Nx> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
#else
const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
#endif
const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));

View file

@ -102,7 +102,7 @@ namespace embree
/*! Sets the barrier bit. */
__forceinline void setBarrier() {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
assert(!isBarrier());
ptr |= barrier_mask;
#else
@ -112,7 +112,7 @@ namespace embree
/*! Clears the barrier bit. */
__forceinline void clearBarrier() {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
ptr &= ~barrier_mask;
#else
assert(false);

View file

@ -150,7 +150,10 @@ namespace embree
}
}
else {
throw std::runtime_error("not supported node type in bvh_statistics");
// -- GODOT start --
// throw std::runtime_error("not supported node type in bvh_statistics");
abort();
// -- GODOT end --
}
return s;
}
@ -159,7 +162,7 @@ namespace embree
template class BVHNStatistics<8>;
#endif
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
template class BVHNStatistics<4>;
#endif
}

View file

@ -5,6 +5,15 @@
#include "node_intersector.h"
#if defined(__AVX2__)
#define __FMA_X4__
#endif
#if defined(__aarch64__)
#define __FMA_X4__
#endif
namespace embree
{
namespace isa
@ -29,9 +38,15 @@ namespace embree
org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
#if defined(__AVX2__)
#if defined(__FMA_X4__)
const Vec3fa ray_org_rdir = ray_org*ray_rdir;
#if !defined(__aarch64__)
org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
#else
//for aarch64 we do not have an equivalent msub instruction, so we negate org_rdir and use madd
//x86 will use msub
neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
#endif
#endif
nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
@ -59,8 +74,12 @@ namespace embree
org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
#if defined(__AVX2__)
org_rdir = org*rdir;
#if defined(__FMA_X4__)
#if !defined(__aarch64__)
org_rdir = org*rdir;
#else
neg_org_rdir = -(org*rdir);
#endif
#endif
nearX = nearXYZ.x[k];
nearY = nearXYZ.y[k];
@ -81,8 +100,14 @@ namespace embree
Vec3fa org_xyz, dir_xyz;
Vec3vf<Nx> org, dir, rdir;
#if defined(__AVX2__)
#if defined(__FMA_X4__)
#if !defined(__aarch64__)
Vec3vf<Nx> org_rdir;
#else
//the aarch64 version keeps the negation of org_rdir and uses madd
//x86 uses msub
Vec3vf<Nx> neg_org_rdir;
#endif
#endif
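Illustrative note: a worked scalar check (not part of the patch) of why storing the negated org*rdir lets the aarch64 path use madd where x86 uses msub: msub(b, rdir, org_rdir) = b*rdir - org*rdir, which is the same as madd(b, rdir, -org*rdir).

#include <cassert>
#include <cmath>

static float msub(float a, float b, float c) { return a * b - c; }         // x86-style multiply-subtract, modeled without fusion
static float madd(float a, float b, float c) { return std::fma(a, b, c); } // aarch64-style fused multiply-add

int main() {
  float lower = 2.0f, org = 0.5f, rdir = 4.0f;
  float org_rdir     = org * rdir;
  float neg_org_rdir = -org_rdir;
  // Both forms produce the same slab-test term (lower - org) * rdir, here exactly 6.0f.
  assert(msub(lower, rdir, org_rdir) == madd(lower, rdir, neg_org_rdir));
  return 0;
}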
#if defined(__AVX512ER__) // KNL+
vint16 permX, permY, permZ;
@ -110,7 +135,6 @@ namespace embree
dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
@ -447,13 +471,22 @@ namespace embree
template<>
__forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
{
#if defined(__AVX2__)
#if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@ -463,7 +496,12 @@ namespace embree
const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
#endif
#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
#if defined(__aarch64__)
const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
const vbool4 vmask = asInt(tNear) <= asInt(tFar);
const size_t mask = movemask(vmask);
#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool4 vmask = asInt(tNear) > asInt(tFar);
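Illustrative note: a one-axis scalar version of the slab test that the SIMD code above evaluates for four or eight box bounds at once (hypothetical helper; the vector path's sign-based near/far selection is simplified away).

#include <algorithm>
#include <cassert>

// Intersect one ray with one slab [lower, upper] along a single axis and
// accumulate the result into the running [tnear, tfar] interval.
static bool hit_slab(float org, float rdir, float lower, float upper, float& tnear, float& tfar) {
  float t0 = (lower - org) * rdir;
  float t1 = (upper - org) * rdir;
  tnear = std::max(tnear, std::min(t0, t1));
  tfar  = std::min(tfar,  std::max(t0, t1));
  return tnear <= tfar;
}

int main() {
  float tnear = 0.0f, tfar = 1e30f;
  bool hit = hit_slab(/*org=*/0.0f, /*rdir=*/1.0f, /*lower=*/1.0f, /*upper=*/2.0f, tnear, tfar);
  assert(hit && tnear == 1.0f && tfar == 2.0f);
  return 0;
}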
@ -489,12 +527,22 @@ namespace embree
__forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
{
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@ -638,13 +686,22 @@ namespace embree
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
#if defined(__AVX2__)
#if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@ -653,7 +710,7 @@ namespace embree
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
#endif
#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool<N> vmask = asInt(tNear) > asInt(tFar);
@ -714,13 +771,22 @@ namespace embree
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
#if defined (__AVX2__)
#if defined (__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@ -729,7 +795,7 @@ namespace embree
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
#endif
#if defined(__AVX2__) && !defined(__AVX512F__)
#if defined(__FMA_X4__) && !defined(__AVX512F__)
const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
#else
@ -803,13 +869,22 @@ namespace embree
const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z);
#if defined(__AVX2__)
#if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@ -819,7 +894,7 @@ namespace embree
const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
#endif
#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool4 vmask = asInt(tNear) > asInt(tFar);
@ -892,12 +967,21 @@ namespace embree
const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z);
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@ -1078,13 +1162,22 @@ namespace embree
const vfloat<N> upper_y = node->dequantizeUpperY(time);
const vfloat<N> lower_z = node->dequantizeLowerZ(time);
const vfloat<N> upper_z = node->dequantizeUpperZ(time);
#if defined(__AVX2__)
#if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else
const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;

View file

@ -81,9 +81,13 @@ namespace embree
min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
#if defined (__aarch64__)
neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
#else
min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
#endif
min_dist = reduced_min_dist;
max_dist = reduced_max_dist;
@ -101,9 +105,13 @@ namespace embree
Vec3fa min_rdir;
Vec3fa max_rdir;
#if defined (__aarch64__)
Vec3fa neg_min_org_rdir;
Vec3fa neg_max_org_rdir;
#else
Vec3fa min_org_rdir;
Vec3fa max_org_rdir;
#endif
float min_dist;
float max_dist;
};
@ -203,13 +211,21 @@ namespace embree
const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
#if defined (__aarch64__)
const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x));
const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y));
const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z));
const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x));
const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y));
const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z));
#else
const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x));
const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y));
const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z));
const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x));
const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y));
const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z));
#endif
const vfloat<Nx> fmin = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist));
dist = fmin;
const vfloat<Nx> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist));

View file

@ -39,10 +39,11 @@ namespace embree
org = ray_org;
dir = ray_dir;
rdir = rcp_safe(ray_dir);
#if defined(__AVX2__)
#if defined(__aarch64__)
neg_org_rdir = -(org * rdir);
#elif defined(__AVX2__)
org_rdir = org * rdir;
#endif
if (N)
{
const int size = sizeof(float)*N;
@ -55,7 +56,9 @@ namespace embree
Vec3vf<K> org;
Vec3vf<K> dir;
Vec3vf<K> rdir;
#if defined(__AVX2__)
#if defined(__aarch64__)
Vec3vf<K> neg_org_rdir;
#elif defined(__AVX2__)
Vec3vf<K> org_rdir;
#endif
Vec3vi<K> nearXYZ;
@ -119,7 +122,14 @@ namespace embree
const TravRayKFast<K>& ray, vfloat<K>& dist)
{
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
@ -199,7 +209,14 @@ namespace embree
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@ -302,7 +319,14 @@ namespace embree
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@ -464,7 +488,14 @@ namespace embree
const vfloat<N> lower_z = node->dequantizeLowerZ();
const vfloat<N> upper_z = node->dequantizeUpperZ();
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
@ -549,7 +580,14 @@ namespace embree
const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
#if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);

View file

@ -32,11 +32,19 @@ namespace embree
__forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
{
rdir = rcp_safe(ray_dir);
#if defined(__aarch64__)
neg_org_rdir = -(ray_org * rdir);
#else
org_rdir = ray_org * rdir;
#endif
}
Vec3vf<K> rdir;
#if defined(__aarch64__)
Vec3vf<K> neg_org_rdir;
#else
Vec3vf<K> org_rdir;
#endif
vfloat<K> tnear;
vfloat<K> tfar;
};
@ -87,12 +95,21 @@ namespace embree
const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
#if defined (__aarch64__)
const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
#else
const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
#endif
const vfloat<Nx> rmin = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
const vfloat<Nx> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
@ -113,12 +130,21 @@ namespace embree
const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
#if defined (__aarch64__)
const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
#endif
const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear);
const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);

View file

@ -332,7 +332,7 @@ namespace embree
intersectorN.intersect(this,rayN,N,context);
}
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
const vint<4> mask = valid.mask32();
intersect4(&mask,(RTCRayHit4&)ray,context);
@ -388,7 +388,7 @@ namespace embree
intersectorN.occluded(this,rayN,N,context);
}
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
const vint<4> mask = valid.mask32();
occluded4(&mask,(RTCRay4&)ray,context);

View file

@ -97,7 +97,7 @@ namespace embree
for (size_t i=0; i<This->accels.size(); i++) {
if (This->accels[i]->isEmpty()) continue;
This->accels[i]->intersectors.occluded4(valid,ray,context);
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
vbool4 valid0 = asBool(((vint4*)valid)[0]);
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
if (unlikely(none(valid0 & hit0))) break;
@ -111,7 +111,7 @@ namespace embree
for (size_t i=0; i<This->accels.size(); i++) {
if (This->accels[i]->isEmpty()) continue;
This->accels[i]->intersectors.occluded8(valid,ray,context);
#if defined(__SSE2__) // FIXME: use higher ISA
#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
vbool4 valid0 = asBool(((vint4*)valid)[0]);
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
vbool4 valid1 = asBool(((vint4*)valid)[1]);
@ -127,7 +127,7 @@ namespace embree
for (size_t i=0; i<This->accels.size(); i++) {
if (This->accels[i]->isEmpty()) continue;
This->accels[i]->intersectors.occluded16(valid,ray,context);
#if defined(__SSE2__) // FIXME: use higher ISA
#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
vbool4 valid0 = asBool(((vint4*)valid)[0]);
vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
vbool4 valid1 = asBool(((vint4*)valid)[1]);

View file

@ -3,6 +3,9 @@
#include "alloc.h"
#include "../../common/sys/thread.h"
#if defined(__aarch64__) && defined(BUILD_IOS)
#include "../../common/sys/barrier.h"
#endif
namespace embree
{

View file

@ -8,6 +8,10 @@
#include "scene.h"
#include "primref.h"
#if defined(__aarch64__) && defined(BUILD_IOS)
#include <mutex>
#endif
namespace embree
{
class FastAllocator
@ -26,7 +30,7 @@ namespace embree
public:
struct ThreadLocal2;
enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
/*! Per thread structure holding the current memory block. */
struct __aligned(64) ThreadLocal
@ -132,7 +136,11 @@ namespace embree
{
assert(alloc_i);
if (alloc.load() == alloc_i) return;
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(mutex);
#else
Lock<SpinLock> lock(mutex);
#endif
//if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
if (alloc.load()) {
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
@ -150,7 +158,11 @@ namespace embree
{
assert(alloc_i);
if (alloc.load() != alloc_i) return;
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(mutex);
#else
Lock<SpinLock> lock(mutex);
#endif
if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
@ -161,7 +173,11 @@ namespace embree
}
public:
#if defined(__aarch64__) && defined(BUILD_IOS)
std::mutex mutex;
#else
SpinLock mutex; //!< required as unbind is called from other threads
#endif
std::atomic<FastAllocator*> alloc; //!< parent allocator
ThreadLocal alloc0;
ThreadLocal alloc1;
@ -169,7 +185,7 @@ namespace embree
FastAllocator (Device* device, bool osAllocation)
: device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
primrefarray(device,0)
{
for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
@ -206,7 +222,7 @@ namespace embree
void setOSallocation(bool flag)
{
atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
}
private:
@ -217,7 +233,11 @@ namespace embree
ThreadLocal2* alloc = thread_local_allocator2;
if (alloc == nullptr) {
thread_local_allocator2 = alloc = new ThreadLocal2;
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(s_thread_local_allocators_lock);
#else
Lock<SpinLock> lock(s_thread_local_allocators_lock);
#endif
s_thread_local_allocators.push_back(make_unique(alloc));
}
return alloc;
@ -227,7 +247,11 @@ namespace embree
__forceinline void join(ThreadLocal2* alloc)
{
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(s_thread_local_allocators_lock);
#else
Lock<SpinLock> lock(thread_local_allocators_lock);
#endif
thread_local_allocators.push_back(alloc);
}
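Illustrative note: the allocator above swaps its spinlock for a std::mutex on aarch64 iOS. A self-contained sketch of the same compile-time selection, where SpinLockStub is a hypothetical stand-in for embree's SpinLock:

#include <atomic>
#include <mutex>

struct SpinLockStub {
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
  void lock()   { while (flag.test_and_set(std::memory_order_acquire)) { } }
  void unlock() { flag.clear(std::memory_order_release); }
};

#if defined(__aarch64__) && defined(BUILD_IOS)
using AllocMutex = std::mutex;
#else
using AllocMutex = SpinLockStub;
#endif

int main() {
  AllocMutex m;
  std::lock_guard<AllocMutex> lock(m); // both types satisfy BasicLockable
  return 0;
}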
@ -496,7 +520,11 @@ namespace embree
/* parallel block creation in case of no freeBlocks, avoids single global mutex */
if (likely(freeBlocks.load() == nullptr))
{
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(slotMutex[slot]);
#else
Lock<SpinLock> lock(slotMutex[slot]);
#endif
if (myUsedBlocks == threadUsedBlocks[slot]) {
const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
@ -509,7 +537,11 @@ namespace embree
/* if this fails allocate new block */
{
Lock<SpinLock> lock(mutex);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(mutex);
#else
Lock<SpinLock> lock(mutex);
#endif
if (myUsedBlocks == threadUsedBlocks[slot])
{
if (freeBlocks.load() != nullptr) {
@ -531,7 +563,11 @@ namespace embree
/*! add new block */
void addBlock(void* ptr, ssize_t bytes)
{
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(mutex);
#else
Lock<SpinLock> lock(mutex);
#endif
const size_t sizeof_Header = offsetof(Block,data[0]);
void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
size_t ofs = (size_t) aptr - (size_t) ptr;
@ -617,8 +653,8 @@ namespace embree
bytesWasted(alloc->bytesWasted),
stat_all(alloc,ANY_TYPE),
stat_malloc(alloc,ALIGNED_MALLOC),
stat_4K(alloc,OS_MALLOC,false),
stat_2M(alloc,OS_MALLOC,true),
stat_4K(alloc,EMBREE_OS_MALLOC,false),
stat_2M(alloc,EMBREE_OS_MALLOC,true),
stat_shared(alloc,SHARED) {}
AllStatistics (size_t bytesUsed,
@ -711,7 +747,7 @@ namespace embree
/* We avoid using os_malloc for small blocks as this could
* cause a risk of fragmenting the virtual address space and
* reach the limit of vm.max_map_count = 65k under Linux. */
if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
atype = ALIGNED_MALLOC;
/* we need to additionally allocate some header */
@ -720,7 +756,7 @@ namespace embree
bytesReserve = sizeof_Header+bytesReserve;
/* consume full 4k pages with using os_malloc */
if (atype == OS_MALLOC) {
if (atype == EMBREE_OS_MALLOC) {
bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
}
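Illustrative note: a tiny standalone check of the page-rounding arithmetic used for the EMBREE_OS_MALLOC path above.

#include <cassert>
#include <cstddef>

static const size_t PAGE_SIZE = 4096;

static size_t round_up_to_page(size_t bytes) {
  return (bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); // same expression as above
}

int main() {
  assert(round_up_to_page(1)    == 4096);
  assert(round_up_to_page(4096) == 4096);
  assert(round_up_to_page(4097) == 8192);
  return 0;
}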
@ -752,11 +788,11 @@ namespace embree
return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
}
}
else if (atype == OS_MALLOC)
else if (atype == EMBREE_OS_MALLOC)
{
if (device) device->memoryMonitor(bytesAllocate,false);
bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
}
else
assert(false);
@ -800,7 +836,7 @@ namespace embree
if (device) device->memoryMonitor(-sizeof_Alloced,true);
}
else if (atype == OS_MALLOC) {
else if (atype == EMBREE_OS_MALLOC) {
size_t sizeof_This = sizeof_Header+reserveEnd;
os_free(this,sizeof_This,huge_pages);
if (device) device->memoryMonitor(-sizeof_Alloced,true);
@ -861,7 +897,7 @@ namespace embree
bool hasType(AllocationType atype_i, bool huge_pages_i) const
{
if (atype_i == ANY_TYPE ) return true;
else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
else return atype_i == atype;
}
@ -910,7 +946,7 @@ namespace embree
void print_block() const
{
if (atype == ALIGNED_MALLOC) std::cout << "A";
else if (atype == OS_MALLOC) std::cout << "O";
else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
else if (atype == SHARED) std::cout << "S";
if (huge_pages) std::cout << "H";
size_t bytesUsed = getBlockUsedBytes();
@ -940,7 +976,11 @@ namespace embree
std::atomic<Block*> freeBlocks;
std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
#if defined(__aarch64__) && defined(BUILD_IOS)
std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#else
SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
#endif
bool use_single_mode;
size_t defaultBlockSize;
@ -954,7 +994,11 @@ namespace embree
static __thread ThreadLocal2* thread_local_allocator2;
static SpinLock s_thread_local_allocators_lock;
static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
#if defined(__aarch64__) && defined(BUILD_IOS)
std::mutex thread_local_allocators_lock;
#else
SpinLock thread_local_allocators_lock;
#endif
std::vector<ThreadLocal2*> thread_local_allocators;
AllocationType atype;
mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes

View file

@ -55,6 +55,11 @@
#include <utility>
#include <sstream>
#if !defined(_DEBUG) && defined(BUILD_IOS)
#undef assert
#define assert(_EXPR)
#endif
namespace embree
{
////////////////////////////////////////////////////////////////////////////////

View file

@ -221,6 +221,9 @@ namespace embree
#if defined(TASKING_INTERNAL)
std::cout << "internal_tasking_system ";
#endif
#if defined(TASKING_GCD) && defined(BUILD_IOS)
std::cout << "GCD tasking system ";
#endif
#if defined(TASKING_PPL)
std::cout << "PPL ";
#endif
@ -504,6 +507,10 @@ namespace embree
case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
#endif
#if defined(TASKING_GCD) && defined(BUILD_IOS)
case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3;
#endif
#if defined(EMBREE_GEOMETRY_TRIANGLE)
case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
#else

View file

@ -46,7 +46,7 @@ namespace embree
#define SELECT_SYMBOL_DEFAULT(features,intersector) \
intersector = isa::intersector;
#if defined(__SSE__)
#if defined(__SSE__) || defined(__ARM_NEON)
#if !defined(EMBREE_TARGET_SIMD4)
#define EMBREE_TARGET_SIMD4
#endif

View file

@ -29,7 +29,7 @@ namespace embree
__forceinline PrimRef (const BBox3fa& bounds, size_t id)
{
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF));
upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF));
#else
@ -79,7 +79,7 @@ namespace embree
/*! returns an size_t sized ID */
__forceinline size_t ID() const {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
return size_t(lower.u) + (size_t(upper.u) << 32);
#else
return size_t(lower.u);

View file

@ -32,7 +32,7 @@ namespace embree
: lbounds((LBBox3fx)lbounds_i), time_range(time_range)
{
assert(activeTimeSegments > 0);
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
lbounds.bounds0.lower.a = id & 0xFFFFFFFF;
lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF;
#else
@ -47,7 +47,7 @@ namespace embree
: lbounds((LBBox3fx)lbounds_i), time_range(time_range)
{
assert(activeTimeSegments > 0);
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
#else
@ -115,7 +115,7 @@ namespace embree
/*! returns an size_t sized ID */
__forceinline size_t ID() const {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
#else
return size_t(lbounds.bounds0.lower.u);
@ -163,7 +163,7 @@ namespace embree
: bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
{
assert(activeTimeSegments > 0);
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
bbox.lower.u = id & 0xFFFFFFFF;
bbox.upper.u = (id >> 32) & 0xFFFFFFFF;
#else
@ -229,7 +229,7 @@ namespace embree
/*! returns an size_t sized ID */
__forceinline size_t ID() const {
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32);
#else
return size_t(bbox.lower.u);
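These hunks extend the 64-bit branch, which splits a size_t primitive ID across two spare 32-bit fields next to the bounds, to aarch64. A tiny self-contained sketch of that pack/unpack round trip; DemoPrimRef and its members are invented names:

// Standalone sketch of the ID packing used on 64-bit targets: a 64-bit id is
// split into two 32-bit halves stored alongside the bounds and recombined later.
#include <cassert>
#include <cstdint>

struct DemoPrimRef {           // hypothetical mini version of a PrimRef
  uint32_t lower_u;            // would normally sit in the padding of the lower bound
  uint32_t upper_u;            // would normally sit in the padding of the upper bound

  void setID(uint64_t id) {
    lower_u = (uint32_t)(id & 0xFFFFFFFF);
    upper_u = (uint32_t)((id >> 32) & 0xFFFFFFFF);
  }
  uint64_t ID() const {
    return uint64_t(lower_u) | (uint64_t(upper_u) << 32);
  }
};

int main() {
  DemoPrimRef p;
  p.setID(0x123456789ABCDEF0ull);
  assert(p.ID() == 0x123456789ABCDEF0ull);   // round trip is lossless
  return 0;
}

On 32-bit targets the upper half cannot be stored, which is why the diff keeps a separate #else branch that only keeps the low 32 bits.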

View file

@ -8,18 +8,31 @@
#include "scene.h"
#include "context.h"
#include "../../include/embree3/rtcore_ray.h"
#if defined(__aarch64__) && defined(BUILD_IOS)
#include <mutex>
#endif
using namespace embree;
RTC_NAMESPACE_BEGIN;
/* mutex to make API thread safe */
static MutexSys g_mutex;
#if defined(__aarch64__) && defined(BUILD_IOS)
static std::mutex g_mutex;
#else
static MutexSys g_mutex;
#endif
RTC_API RTCDevice rtcNewDevice(const char* config)
{
RTC_CATCH_BEGIN;
RTC_TRACE(rtcNewDevice);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
Device* device = new Device(config);
return (RTCDevice) device->refInc();
RTC_CATCH_END(nullptr);
@ -32,7 +45,11 @@ RTC_NAMESPACE_BEGIN;
RTC_CATCH_BEGIN;
RTC_TRACE(rtcRetainDevice);
RTC_VERIFY_HANDLE(hdevice);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
device->refInc();
RTC_CATCH_END(nullptr);
}
@ -43,7 +60,11 @@ RTC_NAMESPACE_BEGIN;
RTC_CATCH_BEGIN;
RTC_TRACE(rtcReleaseDevice);
RTC_VERIFY_HANDLE(hdevice);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
device->refDec();
RTC_CATCH_END(nullptr);
}
@ -54,7 +75,11 @@ RTC_NAMESPACE_BEGIN;
RTC_CATCH_BEGIN;
RTC_TRACE(rtcGetDeviceProperty);
RTC_VERIFY_HANDLE(hdevice);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
return device->getProperty(prop);
RTC_CATCH_END(device);
return 0;
@ -67,7 +92,11 @@ RTC_NAMESPACE_BEGIN;
RTC_TRACE(rtcSetDeviceProperty);
const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004;
if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
device->setProperty(prop,val);
RTC_CATCH_END(device);
}
@ -183,7 +212,11 @@ RTC_NAMESPACE_BEGIN;
RTC_CATCH_BEGIN;
RTC_TRACE(rtcSetSceneProgressMonitorFunction);
RTC_VERIFY_HANDLE(hscene);
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(g_mutex);
#else
Lock<MutexSys> lock(g_mutex);
#endif
scene->setProgressMonitorFunction(progress,ptr);
RTC_CATCH_END2(scene);
}
@ -197,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
if (quality != RTC_BUILD_QUALITY_LOW &&
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH)
throw std::runtime_error("invalid build quality");
// -- GODOT start --
// throw std::runtime_error("invalid build quality");
abort();
// -- GODOT end --
scene->setBuildQuality(quality);
RTC_CATCH_END2(scene);
}
@ -479,12 +515,12 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
Ray4* ray4 = (Ray4*) rayhit;
RayHit4* rayhit4 = (RayHit4*)rayhit;
for (size_t i=0; i<4; i++) {
if (!valid[i]) continue;
RayHit ray1; ray4->get(i,ray1);
RayHit ray1; rayhit4->get(i,ray1);
scene->intersectors.intersect((RTCRayHit&)ray1,&context);
ray4->set(i,ray1);
rayhit4->set(i,ray1);
}
#else
scene->intersectors.intersect4(valid,*rayhit,&context);
@ -510,12 +546,12 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
Ray8* ray8 = (Ray8*) rayhit;
RayHit8* rayhit8 = (RayHit8*) rayhit;
for (size_t i=0; i<8; i++) {
if (!valid[i]) continue;
RayHit ray1; ray8->get(i,ray1);
RayHit ray1; rayhit8->get(i,ray1);
scene->intersectors.intersect((RTCRayHit&)ray1,&context);
ray8->set(i,ray1);
rayhit8->set(i,ray1);
}
#else
if (likely(scene->intersectors.intersector8))
@ -543,12 +579,12 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
Ray16* ray16 = (Ray16*) rayhit;
RayHit16* rayhit16 = (RayHit16*) rayhit;
for (size_t i=0; i<16; i++) {
if (!valid[i]) continue;
RayHit ray1; ray16->get(i,ray1);
RayHit ray1; rayhit16->get(i,ray1);
scene->intersectors.intersect((RTCRayHit&)ray1,&context);
ray16->set(i,ray1);
rayhit16->set(i,ray1);
}
#else
if (likely(scene->intersectors.intersector16))
@ -730,12 +766,12 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
RayHit4* ray4 = (RayHit4*) ray;
Ray4* ray4 = (Ray4*) ray;
for (size_t i=0; i<4; i++) {
if (!valid[i]) continue;
RayHit ray1; ray4->get(i,ray1);
Ray ray1; ray4->get(i,ray1);
scene->intersectors.occluded((RTCRay&)ray1,&context);
ray4->geomID[i] = ray1.geomID;
ray4->set(i,ray1);
}
#else
scene->intersectors.occluded4(valid,*ray,&context);
@ -761,10 +797,10 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
RayHit8* ray8 = (RayHit8*) ray;
Ray8* ray8 = (Ray8*) ray;
for (size_t i=0; i<8; i++) {
if (!valid[i]) continue;
RayHit ray1; ray8->get(i,ray1);
Ray ray1; ray8->get(i,ray1);
scene->intersectors.occluded((RTCRay&)ray1,&context);
ray8->set(i,ray1);
}
@ -795,10 +831,10 @@ RTC_NAMESPACE_BEGIN;
IntersectContext context(scene,user_context);
#if !defined(EMBREE_RAY_PACKETS)
RayHit16* ray16 = (RayHit16*) ray;
Ray16* ray16 = (Ray16*) ray;
for (size_t i=0; i<16; i++) {
if (!valid[i]) continue;
RayHit ray1; ray16->get(i,ray1);
Ray ray1; ray16->get(i,ray1);
scene->intersectors.occluded((RTCRay&)ray1,&context);
ray16->set(i,ray1);
}
@ -1350,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH &&
quality != RTC_BUILD_QUALITY_REFIT)
throw std::runtime_error("invalid build quality");
// -- GODOT start --
// throw std::runtime_error("invalid build quality");
abort();
// -- GODOT end --
geometry->setBuildQuality(quality);
RTC_CATCH_END2(geometry);
}
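The packet-query hunks above fix the scalar fallback compiled when EMBREE_RAY_PACKETS is disabled: intersect calls now go through RayHit packets and occlusion calls through Ray packets, with each active lane traced as a single ray. A hedged sketch of that lane-by-lane pattern using invented stand-in types:

// Sketch of a lane-by-lane fallback for a 4-wide occlusion query when no
// packet kernels are compiled in. Types and functions are illustrative only.
#include <cstddef>

struct DemoRay  { float tfar; };                // stand-in for a single ray
struct DemoRay4 { float tfar[4]; };             // stand-in for a 4-wide packet

// Hypothetical single-ray kernel: marks the ray occluded by setting tfar < 0.
void occluded1(DemoRay& ray) { ray.tfar = -1.0f; }

void occluded4(const int* valid, DemoRay4& packet) {
  for (size_t i = 0; i < 4; i++) {
    if (!valid[i]) continue;                    // skip inactive lanes
    DemoRay ray1{ packet.tfar[i] };             // extract lane i
    occluded1(ray1);                            // trace it as a single ray
    packet.tfar[i] = ray1.tfar;                 // write the result back
  }
}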

View file

@ -25,52 +25,58 @@ namespace embree
#endif
/*! Macros used in the rtcore API implementation */
#define RTC_CATCH_BEGIN try {
// -- GODOT start --
// #define RTC_CATCH_BEGIN try {
#define RTC_CATCH_BEGIN
#define RTC_CATCH_END(device) \
} catch (std::bad_alloc&) { \
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
} catch (rtcore_error& e) { \
Device::process_error(device,e.error,e.what()); \
} catch (std::exception& e) { \
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
} catch (...) { \
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
}
// #define RTC_CATCH_END(device) \
// } catch (std::bad_alloc&) { \
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
// } catch (rtcore_error& e) { \
// Device::process_error(device,e.error,e.what()); \
// } catch (std::exception& e) { \
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
// } catch (...) { \
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
// }
#define RTC_CATCH_END(device)
#define RTC_CATCH_END2(scene) \
} catch (std::bad_alloc&) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
} catch (rtcore_error& e) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,e.error,e.what()); \
} catch (std::exception& e) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
} catch (...) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
}
// #define RTC_CATCH_END2(scene) \
// } catch (std::bad_alloc&) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
// } catch (rtcore_error& e) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,e.error,e.what()); \
// } catch (std::exception& e) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
// } catch (...) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
// }
#define RTC_CATCH_END2(scene)
#define RTC_CATCH_END2_FALSE(scene) \
} catch (std::bad_alloc&) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
return false; \
} catch (rtcore_error& e) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,e.error,e.what()); \
return false; \
} catch (std::exception& e) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
return false; \
} catch (...) { \
Device* device = scene ? scene->device : nullptr; \
Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
return false; \
}
// #define RTC_CATCH_END2_FALSE(scene) \
// } catch (std::bad_alloc&) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
// return false; \
// } catch (rtcore_error& e) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,e.error,e.what()); \
// return false; \
// } catch (std::exception& e) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
// return false; \
// } catch (...) { \
// Device* device = scene ? scene->device : nullptr; \
// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
// return false; \
// }
#define RTC_CATCH_END2_FALSE(scene) return false;
// -- GODOT end --
#define RTC_VERIFY_HANDLE(handle) \
if (handle == nullptr) { \
@ -97,28 +103,38 @@ namespace embree
#define RTC_TRACE(x)
#endif
/*! used to throw embree API errors */
struct rtcore_error : public std::exception
{
__forceinline rtcore_error(RTCError error, const std::string& str)
: error(error), str(str) {}
~rtcore_error() throw() {}
const char* what () const throw () {
return str.c_str();
}
RTCError error;
std::string str;
};
// -- GODOT begin --
// /*! used to throw embree API errors */
// struct rtcore_error : public std::exception
// {
// __forceinline rtcore_error(RTCError error, const std::string& str)
// : error(error), str(str) {}
//
// ~rtcore_error() throw() {}
//
// const char* what () const throw () {
// return str.c_str();
// }
//
// RTCError error;
// std::string str;
// };
// -- GODOT end --
#if defined(DEBUG) // only report file and line in debug mode
// -- GODOT begin --
// #define throw_RTCError(error,str) \
// throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define throw_RTCError(error,str) \
throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
// -- GODOT end --
#else
// -- GODOT begin --
// #define throw_RTCError(error,str) \
// throw rtcore_error(error,str);
#define throw_RTCError(error,str) \
throw rtcore_error(error,str);
abort();
// -- GODOT end --
#endif
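The rewritten macros above are the Godot-side change that lets embree build with C++ exceptions disabled: the try/catch scaffolding becomes a no-op and throw_RTCError terminates instead of throwing. A minimal sketch of the same idea under those assumptions; report_error and DEMO_THROW_ERROR are invented names, and the real patch simply prints and calls abort():

// Minimal sketch of exception-free error reporting behind a macro, so call
// sites look the same whether or not the library is built with exceptions.
#include <cstdio>
#include <cstdlib>

enum DemoError { DEMO_ERROR_INVALID_ARGUMENT = 1 };

// Hypothetical hook; a real integration might forward to a user callback.
static void report_error(DemoError code, const char* msg, const char* file, int line) {
  std::fprintf(stderr, "%s (%d): error %d: %s\n", file, line, (int)code, msg);
  std::abort();                      // no exceptions: fail hard instead of throwing
}

#define DEMO_THROW_ERROR(code, msg) report_error((code), (msg), __FILE__, __LINE__)

void set_build_quality(int quality) {
  if (quality < 0 || quality > 2)
    DEMO_THROW_ERROR(DEMO_ERROR_INVALID_ARGUMENT, "invalid build quality");
  // ... apply the setting ...
}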
#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \

View file

@ -594,7 +594,11 @@ namespace embree
unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry)
{
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(geometriesMutex);
#else
Lock<SpinLock> lock(geometriesMutex);
#endif
if (geomID == RTC_INVALID_GEOMETRY_ID) {
geomID = id_pool.allocate();
if (geomID == RTC_INVALID_GEOMETRY_ID)
@ -620,7 +624,11 @@ namespace embree
void Scene::detachGeometry(size_t geomID)
{
#if defined(__aarch64__) && defined(BUILD_IOS)
std::scoped_lock lock(geometriesMutex);
#else
Lock<SpinLock> lock(geometriesMutex);
#endif
if (geomID >= geometries.size())
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
@ -792,21 +800,23 @@ namespace embree
}
/* initiate build */
try {
// -- GODOT start --
// try {
scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
}
catch (...) {
accels_clear();
updateInterface();
Lock<MutexSys> lock(schedulerMutex);
this->scheduler = nullptr;
throw;
}
// }
// catch (...) {
// accels_clear();
// updateInterface();
// Lock<MutexSys> lock(schedulerMutex);
// this->scheduler = nullptr;
// throw;
// }
// -- GODOT end --
}
#endif
#if defined(TASKING_TBB)
#if defined(TASKING_TBB) || defined(TASKING_GCD)
void Scene::commit (bool join)
{
@ -828,6 +838,9 @@ namespace embree
do {
#if defined(TASKING_GCD)
// Do Nothing
#else
#if USE_TASK_ARENA
if (join) {
device->arena->execute([&]{ group.wait(); });
@ -837,9 +850,11 @@ namespace embree
{
group.wait();
}
#endif
pause_cpu();
yield();
} while (!buildMutex.try_lock());
buildMutex.unlock();
@ -851,6 +866,7 @@ namespace embree
_mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
try {
#if defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION_MAJOR < 8
tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
#else
@ -879,6 +895,13 @@ namespace embree
/* reset MXCSR register again */
_mm_setcsr(mxcsr);
#elif defined(TASKING_GCD)
commit_task();
#endif // #if defined(TASKING_TBB)
}
catch (...)
{

View file

@ -275,11 +275,11 @@ namespace embree
parallel_set<uint32_t> holeSet;
/*! fast lookup table to detect invalid faces */
mvector<char> invalid_face;
mvector<int8_t> invalid_face;
/*! test if face i is invalid in timestep j */
__forceinline char& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; }
__forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
__forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; }
__forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
/*! interpolation cache */
public:
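The invalid_face change is representative of the many char to int8_t swaps in this port: plain char has implementation-defined signedness and is unsigned on several ARM ABIs (for example AArch64 Linux and Android), so code that stores negative sentinel values in it must spell int8_t explicitly. A tiny standalone illustration of the hazard:

// Why the port prefers int8_t over plain char: where char is unsigned, a
// negative sentinel silently turns into a large positive value.
#include <cstdint>
#include <cstdio>

int main() {
  char   c = -1;        // stores 255 on targets where char is unsigned (e.g. AArch64 Linux)
  int8_t s = -1;        // always signed, always -1
  std::printf("char: %d, int8_t: %d\n", (int)c, (int)s);
  return 0;
}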

View file

@ -147,7 +147,20 @@ namespace embree
}
bool State::checkISASupport() {
#if defined(__ARM_NEON)
/*
* NEON CPU type is a mixture of NEON and SSE2
*/
bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
/* this will be true when the Device is explicitly initialized with the `isa=neon` config */
bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
return hasSSE2 || hasNEON;
#else
return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
#endif
}
void State::verify()
@ -160,8 +173,10 @@ namespace embree
* functions */
#if defined(DEBUG)
#if defined(EMBREE_TARGET_SSE2)
#if !defined(__ARM_NEON)
assert(sse2::getISA() <= SSE2);
#endif
#endif
#if defined(EMBREE_TARGET_SSE42)
assert(sse42::getISA() <= SSE42);
#endif
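checkISASupport now treats the NEON build as SSE2-capable, so either reported feature satisfies the check. A standalone sketch of the same bitmask test; the DEMO_CPU_FEATURE_* values are hypothetical and do not match embree's real flag encoding:

// Sketch of the feature check described above, with hypothetical flag values.
#include <cstdint>

enum : uint64_t {
  DEMO_CPU_FEATURE_SSE2 = 1ull << 0,
  DEMO_CPU_FEATURE_NEON = 1ull << 1,
};

bool check_isa_support(uint64_t cpu_features, uint64_t enabled_features) {
#if defined(__ARM_NEON)
  // On the NEON build either flag is enough: SSE2 is provided via emulation.
  const uint64_t have = cpu_features & enabled_features;
  return (have & DEMO_CPU_FEATURE_SSE2) || (have & DEMO_CPU_FEATURE_NEON);
#else
  // Default rule: every enabled feature must actually be present.
  return (cpu_features & enabled_features) == enabled_features;
#endif
}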

View file

@ -43,10 +43,10 @@ namespace embree
__forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
{
size_t end = min(begin+M,_end);
N = (unsigned char)(end-begin);
N = (uint8_t)(end-begin);
const unsigned int geomID0 = prims[begin].geomID();
this->geomID(N) = geomID0;
ty = (unsigned char) scene->get(geomID0)->getType();
ty = (uint8_t) scene->get(geomID0)->getType();
/* encode all primitives */
BBox3fa bounds = empty;
@ -76,25 +76,25 @@ namespace embree
const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
bounds_vx_x(N)[i] = (char) space3.vx.x;
bounds_vx_y(N)[i] = (char) space3.vx.y;
bounds_vx_z(N)[i] = (char) space3.vx.z;
bounds_vx_x(N)[i] = (int8_t) space3.vx.x;
bounds_vx_y(N)[i] = (int8_t) space3.vx.y;
bounds_vx_z(N)[i] = (int8_t) space3.vx.z;
bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
bounds_vy_x(N)[i] = (char) space3.vy.x;
bounds_vy_y(N)[i] = (char) space3.vy.y;
bounds_vy_z(N)[i] = (char) space3.vy.z;
bounds_vy_x(N)[i] = (int8_t) space3.vy.x;
bounds_vy_y(N)[i] = (int8_t) space3.vy.y;
bounds_vy_z(N)[i] = (int8_t) space3.vy.z;
bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
bounds_vz_x(N)[i] = (char) space3.vz.x;
bounds_vz_y(N)[i] = (char) space3.vz.y;
bounds_vz_z(N)[i] = (char) space3.vz.z;
bounds_vz_x(N)[i] = (int8_t) space3.vz.x;
bounds_vz_y(N)[i] = (int8_t) space3.vz.y;
bounds_vz_z(N)[i] = (int8_t) space3.vz.z;
bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
@ -114,15 +114,15 @@ namespace embree
for (size_t i=0; i<items; i++) {
accel[i].fill(prims,start,set.end(),bvh->scene);
}
return bvh->encodeLeaf((char*)accel,items);
return bvh->encodeLeaf((int8_t*)accel,items);
};
public:
// 27.6 - 46 bytes per primitive
unsigned char ty;
unsigned char N;
unsigned char data[4+25*M+16];
uint8_t ty;
uint8_t N;
uint8_t data[4+25*M+16];
/*
struct Layout
@ -130,21 +130,21 @@ namespace embree
unsigned int geomID;
unsigned int primID[N];
char bounds_vx_x[N];
char bounds_vx_y[N];
char bounds_vx_z[N];
int8_t bounds_vx_x[N];
int8_t bounds_vx_y[N];
int8_t bounds_vx_z[N];
short bounds_vx_lower[N];
short bounds_vx_upper[N];
char bounds_vy_x[N];
char bounds_vy_y[N];
char bounds_vy_z[N];
int8_t bounds_vy_x[N];
int8_t bounds_vy_y[N];
int8_t bounds_vy_z[N];
short bounds_vy_lower[N];
short bounds_vy_upper[N];
char bounds_vz_x[N];
char bounds_vz_y[N];
char bounds_vz_z[N];
int8_t bounds_vz_x[N];
int8_t bounds_vz_y[N];
int8_t bounds_vz_z[N];
short bounds_vz_lower[N];
short bounds_vz_upper[N];
@ -153,65 +153,65 @@ namespace embree
};
*/
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); }
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
__forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); }
__forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); }
__forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); }
__forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); }
__forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); }
__forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); }
__forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline short* bounds_vx_lower(size_t N) { return (short*)((char*)this+6+7*N); }
__forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); }
__forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); }
__forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
__forceinline short* bounds_vx_upper(size_t N) { return (short*)((char*)this+6+9*N); }
__forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); }
__forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); }
__forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
__forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+11*N); }
__forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); }
__forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); }
__forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); }
__forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+12*N); }
__forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); }
__forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); }
__forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); }
__forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+13*N); }
__forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); }
__forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); }
__forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); }
__forceinline short* bounds_vy_lower(size_t N) { return (short*)((char*)this+6+14*N); }
__forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); }
__forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); }
__forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); }
__forceinline short* bounds_vy_upper(size_t N) { return (short*)((char*)this+6+16*N); }
__forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); }
__forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); }
__forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); }
__forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+18*N); }
__forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); }
__forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); }
__forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); }
__forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+19*N); }
__forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); }
__forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); }
__forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); }
__forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+20*N); }
__forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); }
__forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); }
__forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); }
__forceinline short* bounds_vz_lower(size_t N) { return (short*)((char*)this+6+21*N); }
__forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((char*)this+6+21*N); }
__forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); }
__forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); }
__forceinline short* bounds_vz_upper(size_t N) { return (short*)((char*)this+6+23*N); }
__forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); }
__forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); }
__forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+25*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); }
__forceinline float* scale(size_t N) { return (float*)((char*)this+6+25*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); }
__forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); }
__forceinline char* end(size_t N) { return (char*)this+6+25*N+16; }
__forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; }
__forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; }
__forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; }
};
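All of the accessor rewrites above follow the same compact leaf layout: a 2-byte header (ty, N), a 4-byte geomID at offset 2, N primIDs at offset 6, then quantized frames and bounds at offsets linear in N, ending at 6 + 25*N + 16 bytes. A small sketch that reproduces the offset arithmetic for illustration; DemoLeaf is abbreviated and only shows a few of the arrays:

// Abbreviated sketch of the offset arithmetic behind the CurveNi accessors.
// Only a few fields are shown; the real leaf packs many more arrays this way.
#include <cstdint>
#include <cstdio>

struct DemoLeaf {
  uint8_t ty;                     // geometry type
  uint8_t N;                      // number of primitives in this leaf
  uint8_t data[4 + 25 * 4 + 16];  // payload for up to M = 4 primitives

  uint32_t* primID(size_t n)          { return (uint32_t*)((int8_t*)this + 6); }
  int8_t*   bounds_vx_x(size_t n)     { return (int8_t*)  ((int8_t*)this + 6 + 4 * n); }
  int16_t*  bounds_vx_lower(size_t n) { return (int16_t*) ((int8_t*)this + 6 + 7 * n); }
  int8_t*   end(size_t n)             { return (int8_t*)this + 6 + 25 * n + 16; }
};

int main() {
  DemoLeaf leaf{};
  leaf.N = 4;
  // For N = 4: primIDs at byte 6, vx.x bytes at 6+16, vx lower bounds at 6+28,
  // and the leaf ends at 6 + 25*4 + 16 = 122 bytes == sizeof(DemoLeaf).
  std::printf("end offset: %td, sizeof: %zu\n",
              leaf.end(leaf.N) - (int8_t*)&leaf, sizeof(DemoLeaf));
  return 0;
}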
template<int M>

View file

@ -43,10 +43,10 @@ namespace embree
__forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
{
size_t end = min(begin+M,_end);
N = (unsigned char)(end-begin);
N = (uint8_t)(end-begin);
const unsigned int geomID0 = prims[begin].geomID();
this->geomID(N) = geomID0;
ty = (unsigned char) scene->get(geomID0)->getType();
ty = (uint8_t) scene->get(geomID0)->getType();
/* encode all primitives */
LBBox3fa lbounds = empty;
@ -79,10 +79,10 @@ namespace embree
const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range);
// NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug
bounds_vx_x(N)[i] = (char) (short) space3.vx.x;
bounds_vx_y(N)[i] = (char) (short) space3.vx.y;
bounds_vx_z(N)[i] = (char) (short) space3.vx.z;
// NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug
bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x;
bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y;
bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z;
bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f);
bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
@ -92,9 +92,9 @@ namespace embree
assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
bounds_vy_x(N)[i] = (char) (short) space3.vy.x;
bounds_vy_y(N)[i] = (char) (short) space3.vy.y;
bounds_vy_z(N)[i] = (char) (short) space3.vy.z;
bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x;
bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y;
bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z;
bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
@ -104,9 +104,9 @@ namespace embree
assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
bounds_vz_x(N)[i] = (char) (short) space3.vz.x;
bounds_vz_y(N)[i] = (char) (short) space3.vz.y;
bounds_vz_z(N)[i] = (char) (short) space3.vz.z;
bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x;
bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y;
bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z;
bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
@ -130,7 +130,7 @@ namespace embree
size_t items = CurveNiMB::blocks(prims.size());
size_t numbytes = CurveNiMB::bytes(prims.size());
CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items);
LBBox3fa bounds = empty;
for (size_t i=0; i<items; i++)
@ -143,9 +143,9 @@ namespace embree
public:
// 27.6 - 46 bytes per primitive
unsigned char ty;
unsigned char N;
unsigned char data[4+37*M+24];
uint8_t ty;
uint8_t N;
uint8_t data[4+37*M+24];
/*
struct Layout
@ -153,25 +153,25 @@ namespace embree
unsigned int geomID;
unsigned int primID[N];
char bounds_vx_x[N];
char bounds_vx_y[N];
char bounds_vx_z[N];
int8_t bounds_vx_x[N];
int8_t bounds_vx_y[N];
int8_t bounds_vx_z[N];
short bounds_vx_lower0[N];
short bounds_vx_upper0[N];
short bounds_vx_lower1[N];
short bounds_vx_upper1[N];
char bounds_vy_x[N];
char bounds_vy_y[N];
char bounds_vy_z[N];
int8_t bounds_vy_x[N];
int8_t bounds_vy_y[N];
int8_t bounds_vy_z[N];
short bounds_vy_lower0[N];
short bounds_vy_upper0[N];
short bounds_vy_lower1[N];
short bounds_vy_upper1[N];
char bounds_vz_x[N];
char bounds_vz_y[N];
char bounds_vz_z[N];
int8_t bounds_vz_x[N];
int8_t bounds_vz_y[N];
int8_t bounds_vz_z[N];
short bounds_vz_lower0[N];
short bounds_vz_upper0[N];
short bounds_vz_lower1[N];
@ -185,89 +185,89 @@ namespace embree
};
*/
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); }
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
__forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); }
__forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); }
__forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); }
__forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); }
__forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); }
__forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); }
__forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline short* bounds_vx_lower0(size_t N) { return (short*)((char*)this+6+7*N); }
__forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((char*)this+6+7*N); }
__forceinline short* bounds_vx_lower0(size_t N) { return (short*)((int8_t*)this+6+7*N); }
__forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
__forceinline short* bounds_vx_upper0(size_t N) { return (short*)((char*)this+6+9*N); }
__forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((char*)this+6+9*N); }
__forceinline short* bounds_vx_upper0(size_t N) { return (short*)((int8_t*)this+6+9*N); }
__forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
__forceinline short* bounds_vx_lower1(size_t N) { return (short*)((char*)this+6+11*N); }
__forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((char*)this+6+11*N); }
__forceinline short* bounds_vx_lower1(size_t N) { return (short*)((int8_t*)this+6+11*N); }
__forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); }
__forceinline short* bounds_vx_upper1(size_t N) { return (short*)((char*)this+6+13*N); }
__forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((char*)this+6+13*N); }
__forceinline short* bounds_vx_upper1(size_t N) { return (short*)((int8_t*)this+6+13*N); }
__forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); }
__forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+15*N); }
__forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+15*N); }
__forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+15*N); }
__forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); }
__forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+16*N); }
__forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+16*N); }
__forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+16*N); }
__forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); }
__forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+17*N); }
__forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+17*N); }
__forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+17*N); }
__forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); }
__forceinline short* bounds_vy_lower0(size_t N) { return (short*)((char*)this+6+18*N); }
__forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((char*)this+6+18*N); }
__forceinline short* bounds_vy_lower0(size_t N) { return (short*)((int8_t*)this+6+18*N); }
__forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); }
__forceinline short* bounds_vy_upper0(size_t N) { return (short*)((char*)this+6+20*N); }
__forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((char*)this+6+20*N); }
__forceinline short* bounds_vy_upper0(size_t N) { return (short*)((int8_t*)this+6+20*N); }
__forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); }
__forceinline short* bounds_vy_lower1(size_t N) { return (short*)((char*)this+6+22*N); }
__forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((char*)this+6+22*N); }
__forceinline short* bounds_vy_lower1(size_t N) { return (short*)((int8_t*)this+6+22*N); }
__forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); }
__forceinline short* bounds_vy_upper1(size_t N) { return (short*)((char*)this+6+24*N); }
__forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((char*)this+6+24*N); }
__forceinline short* bounds_vy_upper1(size_t N) { return (short*)((int8_t*)this+6+24*N); }
__forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); }
__forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+26*N); }
__forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+26*N); }
__forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+26*N); }
__forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); }
__forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+27*N); }
__forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+27*N); }
__forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+27*N); }
__forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); }
__forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+28*N); }
__forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+28*N); }
__forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+28*N); }
__forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); }
__forceinline short* bounds_vz_lower0(size_t N) { return (short*)((char*)this+6+29*N); }
__forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((char*)this+6+29*N); }
__forceinline short* bounds_vz_lower0(size_t N) { return (short*)((int8_t*)this+6+29*N); }
__forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); }
__forceinline short* bounds_vz_upper0(size_t N) { return (short*)((char*)this+6+31*N); }
__forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((char*)this+6+31*N); }
__forceinline short* bounds_vz_upper0(size_t N) { return (short*)((int8_t*)this+6+31*N); }
__forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); }
__forceinline short* bounds_vz_lower1(size_t N) { return (short*)((char*)this+6+33*N); }
__forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((char*)this+6+33*N); }
__forceinline short* bounds_vz_lower1(size_t N) { return (short*)((int8_t*)this+6+33*N); }
__forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); }
__forceinline short* bounds_vz_upper1(size_t N) { return (short*)((char*)this+6+35*N); }
__forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((char*)this+6+35*N); }
__forceinline short* bounds_vz_upper1(size_t N) { return (short*)((int8_t*)this+6+35*N); }
__forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+37*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+37*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+37*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); }
__forceinline float* scale(size_t N) { return (float*)((char*)this+6+37*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+37*N+12); }
__forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+37*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); }
__forceinline float& time_offset(size_t N) { return *(float*)((char*)this+6+37*N+16); }
__forceinline const float& time_offset(size_t N) const { return *(float*)((char*)this+6+37*N+16); }
__forceinline float& time_offset(size_t N) { return *(float*)((int8_t*)this+6+37*N+16); }
__forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); }
__forceinline float& time_scale(size_t N) { return *(float*)((char*)this+6+37*N+20); }
__forceinline const float& time_scale(size_t N) const { return *(float*)((char*)this+6+37*N+20); }
__forceinline float& time_scale(size_t N) { return *(float*)((int8_t*)this+6+37*N+20); }
__forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); }
__forceinline char* end(size_t N) { return (char*)this+6+37*N+24; }
__forceinline const char* end(size_t N) const { return (char*)this+6+37*N+24; }
__forceinline int8_t* end(size_t N) { return (int8_t*)this+6+37*N+24; }
__forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; }
};
template<int M>

View file

@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}

View file

@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}

View file

@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}

View file

@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}

View file

@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}

View file

@ -0,0 +1,22 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim);
#if defined (__AVX__)
void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
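These new headers split virtual-curve intersector registration into per-curve-type units (Bezier, B-spline, Catmull-Rom, Hermite, linear, point), each declaring functions that add their kernels to a VirtualCurveIntersector. A rough sketch of that kind of additive registration pattern; every name below is invented for illustration:

// Rough sketch of a registration pattern like the one these headers expose:
// each translation unit contributes entries to a shared dispatch table.
#include <functional>
#include <map>
#include <string>

struct DemoVirtualIntersector {
  std::map<std::string, std::function<void()>> table;   // curve type -> kernel
};

// Hypothetical per-type registration functions (one per compilation unit).
void AddDemoBezierIntersector4(DemoVirtualIntersector& prim) {
  prim.table["bezier4"] = [] { /* 4-wide Bezier kernel would run here */ };
}
void AddDemoLinearIntersector4(DemoVirtualIntersector& prim) {
  prim.table["linear4"] = [] { /* 4-wide linear-segment kernel would run here */ };
}

DemoVirtualIntersector build_table() {
  DemoVirtualIntersector prim;
  AddDemoBezierIntersector4(prim);     // registration is additive, so optional
  AddDemoLinearIntersector4(prim);     // curve types can simply be left out
  return prim;
}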

View file

@ -41,7 +41,7 @@ namespace embree
}
const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);
size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
#if !defined(__X86_64__)
#if !defined(__X86_64__) && !defined(__aarch64__)
rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
#endif
void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
@ -62,8 +62,8 @@ namespace embree
__forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
/*! returns pointer to BVH array */
__forceinline char* bvhData() { return &data[0]; }
__forceinline const char* bvhData() const { return &data[0]; }
__forceinline int8_t* bvhData() { return &data[0]; }
__forceinline const int8_t* bvhData() const { return &data[0]; }
/*! returns pointer to Grid array */
__forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; }
@ -253,7 +253,7 @@ namespace embree
public:
BVH4::NodeRef troot;
#if !defined(__X86_64__)
#if !defined(__X86_64__) && !defined(__aarch64__)
unsigned align1;
#endif
unsigned time_steps;
@ -269,7 +269,7 @@ namespace embree
unsigned gridBytes;
unsigned rootOffset;
char data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots
int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots
};
}
}
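GridSOA keeps everything in one allocation: the header, the BVH nodes, a grid of 4 floats per sample per time step, and the per-time-step roots, with a few extra bytes on 32-bit targets because lookups read slightly past the grid. A small sketch of that size computation; DemoGridSOA and the sample inputs are assumptions for illustration:

// Sketch of the single-allocation size computation suggested by the hunk above.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct DemoGridSOA {
  unsigned time_steps, width, height;
  unsigned bvhBytes, gridBytes, rootOffset;
  int8_t   data[1];          // BVH, then grids, then roots live past the struct
};

size_t demo_alloc_size(unsigned width, unsigned height, unsigned time_steps,
                       size_t bvhBytes, size_t rootRefBytes) {
  const size_t gridBytes = 4 * size_t(width) * size_t(height) * sizeof(float);
  size_t rootBytes = time_steps * rootRefBytes;
#if !defined(__X86_64__) && !defined(__aarch64__)
  rootBytes += 4;            // 32-bit: pad because 2 elements past the grid are read
#endif
  return offsetof(DemoGridSOA, data) + bvhBytes + time_steps * gridBytes + rootBytes;
}

int main() {
  std::printf("bytes: %zu\n",
              demo_alloc_size(17, 17, 2, /*bvhBytes*/ 4096, /*rootRefBytes*/ 8));
  return 0;
}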

View file

@ -2,4 +2,4 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#define RTC_HASH "69bd4c272f1ed608494f233ecfff3feec516880b"
#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6"

View file

@ -63,7 +63,7 @@ namespace embree
static const size_t NUM_CACHE_SEGMENTS = 8;
static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__X86_64__)
#if defined(__X86_64__) || defined(__aarch64__)
static const size_t REF_TAG_MASK = 0xffffffffff;
#else
static const size_t REF_TAG_MASK = 0x7FFFFFFF;
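The widened mask gives the reference tag 40 bits on 64-bit targets, matching COMMIT_INDEX_SHIFT = 32+8, which leaves the bits above it free for a commit index in the same 64-bit word. A small sketch of that packing, purely illustrative of the layout the constants suggest:

// Illustration of packing a 40-bit reference tag and a commit index into one
// 64-bit word, mirroring REF_TAG_MASK / COMMIT_INDEX_SHIFT above.
#include <cassert>
#include <cstdint>

constexpr uint64_t kCommitIndexShift = 32 + 8;              // 40
constexpr uint64_t kRefTagMask       = 0xffffffffffull;     // low 40 bits

constexpr uint64_t pack(uint64_t ref, uint64_t commit_index) {
  return (ref & kRefTagMask) | (commit_index << kCommitIndexShift);
}
constexpr uint64_t unpack_ref(uint64_t word)    { return word & kRefTagMask; }
constexpr uint64_t unpack_commit(uint64_t word) { return word >> kCommitIndexShift; }

int main() {
  const uint64_t word = pack(0x1234567890ull, 7);   // 40-bit ref, commit index 7
  assert(unpack_ref(word) == 0x1234567890ull);
  assert(unpack_commit(word) == 7);
  return 0;
}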

View file

@ -1,215 +1,630 @@
diff --git a/common/math/math.h b/common/math/math.h
index 5af0691a2..1982c27c1 100644
--- a/common/math/math.h
+++ b/common/math/math.h
@@ -13,7 +13,7 @@
#include <immintrin.h>
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
index 76c6b740aa..51d296fb16 100644
--- a/thirdparty/embree/common/algorithms/parallel_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -27,7 +27,10 @@ namespace embree
func(r.begin());
});
if (!TaskScheduler::wait())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
}
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
#if defined(__WIN32__)
-#if (__MSV_VER <= 1700)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
namespace std
{
__forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
@@ -86,7 +86,7 @@
return _mm_cvtss_f32(c);
@@ -55,13 +58,19 @@ namespace embree
func(i);
},context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
#elif defined(TASKING_PPL)
@@ -81,7 +90,10 @@ namespace embree
#if defined(TASKING_INTERNAL)
TaskScheduler::spawn(first,last,minStepSize,func);
if (!TaskScheduler::wait())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
@@ -109,13 +121,19 @@ namespace embree
func(range<Index>(r.begin(),r.end()));
},context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
});
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
#elif defined(TASKING_PPL)
@@ -147,13 +165,19 @@ namespace embree
func(i);
},tbb::simple_partitioner(),context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner());
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
}
-#if defined(__WIN32__) && (__MSC_VER <= 1700)
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
__forceinline int roundf(float f) { return (int)(f + 0.5f); }
diff --git a/common/sys/intrinsics.h b/common/sys/intrinsics.h
index 3f0619cac..58f5c3bb4 100644
--- a/common/sys/intrinsics.h
+++ b/common/sys/intrinsics.h
@@ -11,6 +11,12 @@
#include <immintrin.h>
+// -- GODOT start --
+#if defined(__WIN32__) && defined(__MINGW32__)
+#include <unistd.h>
+#endif
+// -- GODOT end --
+
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
@@ -30,8 +36,14 @@
#endif
#if defined(__WIN32__)
-# define NOMINMAX
-# include <windows.h>
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+#define NOMINMAX
+// -- GODOT start --
+#endif
+#include "windows.h"
+// -- GODOT end --
#endif
/* normally defined in pmmintrin.h, but we always need this */
@@ -413,8 +425,16 @@ namespace embree
__forceinline void pause_cpu(const size_t N = 8)
{
+// -- GODOT start --
for (size_t i=0; i<N; i++)
+#if !(defined(__WIN32__) && defined(__MINGW32__))
+// -- GODOT end --
_mm_pause();
+// -- GODOT start --
+#else
+ usleep(1);
+#endif
+// -- GODOT end --
@@ -168,13 +192,19 @@ namespace embree
func(i);
},ap,context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap);
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
}
/* prefetches */
diff --git a/common/sys/library.cpp b/common/sys/library.cpp
index e448b195d..8ec918660 100644
--- a/common/sys/library.cpp
+++ b/common/sys/library.cpp
@@ -27,7 +27,9 @@ namespace embree
/* returns address of a symbol from the library */
void* getSymbol(lib_t lib, const std::string& sym) {
- return GetProcAddress(HMODULE(lib),sym.c_str());
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
index d444b6a2e4..0daf94e50e 100644
--- a/thirdparty/embree/common/algorithms/parallel_reduce.h
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -58,15 +58,19 @@ namespace embree
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
- if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ return (void*) GetProcAddress(HMODULE(lib),sym.c_str());
+ // if (context.is_group_execution_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
- if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // if (tbb::task::self().is_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
return v;
#endif
#else // TASKING_PPL
diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
index 7e7b9faef8..98dc80ad59 100644
--- a/thirdparty/embree/common/lexers/stringstream.cpp
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -39,7 +39,10 @@ namespace embree
std::vector<char> str; str.reserve(64);
while (cin->peek() != EOF && !isSeparator(cin->peek())) {
int c = cin->get();
- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+ // -- GODOT start --
+ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+ if (!isValidChar(c)) abort();
+ // -- GODOT end --
str.push_back((char)c);
}
str.push_back(0);
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
index 4e8928242e..12f143f131 100644
--- a/thirdparty/embree/common/sys/alloc.cpp
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -21,7 +21,10 @@ namespace embree
void* ptr = _mm_malloc(size,align);
if (size != 0 && ptr == nullptr)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return ptr;
}
@@ -128,7 +131,10 @@ namespace embree
/* fall back to 4k pages */
int flags = MEM_COMMIT | MEM_RESERVE;
char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
- if (ptr == nullptr) throw std::bad_alloc();
+ // -- GODOT start --
+ // if (ptr == nullptr) throw std::bad_alloc();
+ if (ptr == nullptr) abort();
+ // -- GODOT end --
hugepages = false;
return ptr;
}
@@ -145,7 +151,10 @@ namespace embree
return bytesOld;
if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return bytesNew;
}
@@ -156,7 +165,10 @@ namespace embree
return;
if (!VirtualFree(ptr,0,MEM_RELEASE))
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
}
void os_advise(void *ptr, size_t bytes)
@@ -260,7 +272,10 @@ namespace embree
/* fallback to 4k pages */
void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
- if (ptr == MAP_FAILED) throw std::bad_alloc();
+ // -- GODOT start --
+ // if (ptr == MAP_FAILED) throw std::bad_alloc();
+ if (ptr == MAP_FAILED) abort();
+ // -- GODOT end --
hugepages = false;
/* advise huge page hint for THP */
@@ -277,7 +292,10 @@ namespace embree
return bytesOld;
if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return bytesNew;
}
@@ -291,7 +309,10 @@ namespace embree
const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
bytes = (bytes+pageSize-1) & ~(pageSize-1);
if (munmap(ptr,bytes) == -1)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
}
/* hint for transparent huge pages (THP) */
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
index 7914eb7a52..737f14aa6e 100644
--- a/thirdparty/embree/common/sys/platform.h
+++ b/thirdparty/embree/common/sys/platform.h
@@ -174,11 +174,19 @@
#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
#if defined(DEBUG) // only report file and line in debug mode
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define THROW_RUNTIME_ERROR(str) \
- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+ // -- GODOT end --
#else
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(str);
#define THROW_RUNTIME_ERROR(str) \
- throw std::runtime_error(str);
+ abort();
+ // -- GODOT end --
#endif
#define FATAL(x) THROW_RUNTIME_ERROR(x)
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
index 98d7fb9249..ebf656d1a0 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
@@ -48,13 +48,15 @@ namespace embree
{
Task* prevTask = thread.task;
thread.task = this;
- try {
- if (thread.scheduler->cancellingException == nullptr)
+ // -- GODOT start --
+ // try {
+ // if (thread.scheduler->cancellingException == nullptr)
closure->execute();
- } catch (...) {
- if (thread.scheduler->cancellingException == nullptr)
- thread.scheduler->cancellingException = std::current_exception();
- }
+ // } catch (...) {
+ // if (thread.scheduler->cancellingException == nullptr)
+ // thread.scheduler->cancellingException = std::current_exception();
+ // }
+ // -- GODOT end --
thread.task = prevTask;
add_dependencies(-1);
}
@@ -297,8 +299,11 @@ namespace embree
size_t threadIndex = allocThreadIndex();
condition.wait(mutex, [&] () { return hasRootTask.load(); });
mutex.unlock();
- std::exception_ptr except = thread_loop(threadIndex);
- if (except != nullptr) std::rethrow_exception(except);
+ // -- GODOT start --
+ // std::exception_ptr except = thread_loop(threadIndex);
+ // if (except != nullptr) std::rethrow_exception(except);
+ thread_loop(threadIndex);
+ // -- GODOT end --
}
   void TaskScheduler::reset() {
@@ -330,7 +335,10 @@ namespace embree
     return thread->scheduler->cancellingException == nullptr;
   }
-  std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT start --
+// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+  void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
   {
     /* allocate thread structure */
     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
@@ -353,9 +361,10 @@ namespace embree
     swapThread(oldThread);
     /* remember exception to throw */
-    std::exception_ptr except = nullptr;
-    if (cancellingException != nullptr) except = cancellingException;
-
+    // -- GODOT start --
+    // std::exception_ptr except = nullptr;
+    // if (cancellingException != nullptr) except = cancellingException;
+    // -- GODOT end --
     /* wait for all threads to terminate */
     threadCounter--;
 #if defined(__WIN32__)
@@ -373,7 +382,10 @@ namespace embree
       yield();
 #endif
     }
-    return except;
+    // -- GODOT start --
+    // return except;
+    return;
+    // -- GODOT end --
   }
   bool TaskScheduler::steal_from_other_threads(Thread& thread)
diff --git a/common/sys/mutex.h b/common/sys/mutex.h
index 1164210f2..f0f55340a 100644
--- a/common/sys/mutex.h
+++ b/common/sys/mutex.h
@@ -47,8 +47,17 @@ namespace embree
     {
       while (flag.load())
       {
+// -- GODOT start --
+#if !(defined (__WIN32__) && defined (__MINGW32__))
+// -- GODOT end --
         _mm_pause();
         _mm_pause();
+// -- GODOT start --
+#else
+        __builtin_ia32_pause();
+        __builtin_ia32_pause();
+#endif
+// -- GODOT end --
       }
       bool expected = false;
@@ -74,8 +82,17 @@ namespace embree
     {
       while(flag.load())
       {
+// -- GODOT start --
+#if !(defined (__WIN32__) && defined(__MINGW32__))
+// -- GODOT end --
         _mm_pause();
         _mm_pause();
+// -- GODOT start --
+#else
+        __builtin_ia32_pause();
+        __builtin_ia32_pause();
+#endif
+// -- GODOT end --
       }
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
index c2a9391aea..8bd70b2b8c 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -123,7 +123,10 @@ namespace embree
{
size_t ofs = bytes + ((align - stackPtr) & (align-1));
if (stackPtr + ofs > CLOSURE_STACK_SIZE)
- throw std::runtime_error("closure stack overflow");
+ // -- GODOT start --
+ // throw std::runtime_error("closure stack overflow");
+ abort();
+ // -- GODOT end --
stackPtr += ofs;
return &stack[stackPtr-bytes];
}
@@ -132,7 +135,10 @@ namespace embree
__forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
{
if (right >= TASK_STACK_SIZE)
- throw std::runtime_error("task stack overflow");
+ // -- GODOT start --
+ // throw std::runtime_error("task stack overflow");
+ abort();
+ // -- GODOT end --
/* allocate new task on right side of stack */
size_t oldStackPtr = stackPtr;
@@ -239,7 +245,10 @@ namespace embree
void wait_for_threads(size_t threadCount);
/*! thread loop for all worker threads */
- std::exception_ptr thread_loop(size_t threadIndex);
+ // -- GODOT start --
+ // std::exception_ptr thread_loop(size_t threadIndex);
+ void thread_loop(size_t threadIndex);
+ // -- GODOT end --
/*! steals a task from a different thread */
bool steal_from_other_threads(Thread& thread);
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
index 20cdd2d320..aa56035026 100644
--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -150,7 +150,10 @@ namespace embree
       }
     }
     else {
-        throw std::runtime_error("not supported node type in bvh_statistics");
+        // -- GODOT start --
+        // throw std::runtime_error("not supported node type in bvh_statistics");
+        abort();
+        // -- GODOT end --
     }
     return s;
   }
diff --git a/common/sys/platform.h b/common/sys/platform.h
index 96f9aab01..08617452f 100644
--- a/common/sys/platform.h
+++ b/common/sys/platform.h
@@ -141,6 +141,9 @@
   #define DELETED  = delete
 #endif
+// -- GODOT start --
+#if !defined(likely)
+// -- GODOT end --
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 #define   likely(expr)   (expr)
 #define unlikely(expr)   (expr)
@@ -148,6 +151,9 @@
 #define   likely(expr)   __builtin_expect((bool)(expr),true )
 #define unlikely(expr)   __builtin_expect((bool)(expr),false)
 #endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
index ee5c37b238..625fbf6d4f 100644
--- a/thirdparty/embree/kernels/common/rtcore.cpp
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
if (quality != RTC_BUILD_QUALITY_LOW &&
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH)
- throw std::runtime_error("invalid build quality");
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
scene->setBuildQuality(quality);
RTC_CATCH_END2(scene);
}
@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH &&
quality != RTC_BUILD_QUALITY_REFIT)
- throw std::runtime_error("invalid build quality");
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
geometry->setBuildQuality(quality);
RTC_CATCH_END2(geometry);
}
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
index 6583d12d57..4b070e122b 100644
--- a/thirdparty/embree/kernels/common/rtcore.h
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -25,52 +25,58 @@ namespace embree
#endif
/*! Macros used in the rtcore API implementation */
-#define RTC_CATCH_BEGIN try {
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
-#define RTC_CATCH_END(device) \
- } catch (std::bad_alloc&) { \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- } catch (rtcore_error& e) { \
- Device::process_error(device,e.error,e.what()); \
- } catch (std::exception& e) { \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- } catch (...) { \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- }
+// #define RTC_CATCH_END(device) \
+// } catch (std::bad_alloc&) { \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END(device)
-#define RTC_CATCH_END2(scene) \
- } catch (std::bad_alloc&) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- } catch (rtcore_error& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,e.error,e.what()); \
- } catch (std::exception& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- } catch (...) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- }
+// #define RTC_CATCH_END2(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END2(scene)
-#define RTC_CATCH_END2_FALSE(scene) \
- } catch (std::bad_alloc&) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- return false; \
- } catch (rtcore_error& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,e.error,e.what()); \
- return false; \
- } catch (std::exception& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- return false; \
- } catch (...) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- return false; \
- }
+// #define RTC_CATCH_END2_FALSE(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// return false; \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// return false; \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// return false; \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// return false; \
+// }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
#define RTC_VERIFY_HANDLE(handle) \
if (handle == nullptr) { \
@@ -97,28 +103,38 @@ namespace embree
#define RTC_TRACE(x)
#endif
- /*! used to throw embree API errors */
- struct rtcore_error : public std::exception
- {
- __forceinline rtcore_error(RTCError error, const std::string& str)
- : error(error), str(str) {}
-
- ~rtcore_error() throw() {}
-
- const char* what () const throw () {
- return str.c_str();
- }
-
- RTCError error;
- std::string str;
- };
+// -- GODOT begin --
+// /*! used to throw embree API errors */
+// struct rtcore_error : public std::exception
+// {
+// __forceinline rtcore_error(RTCError error, const std::string& str)
+// : error(error), str(str) {}
+//
+// ~rtcore_error() throw() {}
+//
+// const char* what () const throw () {
+// return str.c_str();
+// }
+//
+// RTCError error;
+// std::string str;
+// };
+// -- GODOT end --
 #if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
   #define throw_RTCError(error,str) \
-    throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
 #else
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,str);
   #define throw_RTCError(error,str) \
-    throw rtcore_error(error,str);
+    abort();
+  // -- GODOT end --
 #endif
////////////////////////////////////////////////////////////////////////////////
/// Error handling and debugging
diff --git a/common/sys/sysinfo.cpp b/common/sys/sysinfo.cpp
index eb0a10eaf..74438260d 100644
--- a/common/sys/sysinfo.cpp
+++ b/common/sys/sysinfo.cpp
@@ -233,7 +233,7 @@ namespace embree
__noinline int64_t get_xcr0()
{
-#if defined (__WIN32__)
+#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */
int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
xcr0 = _xgetbv(0);
return xcr0;
diff --git a/common/tasking/taskschedulerinternal.cpp b/common/tasking/taskschedulerinternal.cpp
index 2152e92f4..923d62f83 100644
--- a/common/tasking/taskschedulerinternal.cpp
+++ b/common/tasking/taskschedulerinternal.cpp
@@ -361,7 +361,15 @@ namespace embree
if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
yield();
else
+// -- GODOT start --
+#if !defined(__MINGW32__)
+// -- GODOT end --
_mm_pause();
+// -- GODOT start --
+#else
+ usleep(1);
+#endif
+// -- GODOT end --
loopIndex++;
yield();
diff --git a/common/tasking/taskschedulertbb.h b/common/tasking/taskschedulertbb.h
index 98dba2687..369e5edf0 100644
--- a/common/tasking/taskschedulertbb.h
+++ b/common/tasking/taskschedulertbb.h
@@ -12,7 +12,13 @@
#include "../sys/ref.h"
#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
# define NOMINMAX
+// -- GODOT start --
+#endif
+// -- GODOT end --
#endif
// We need to define these to avoid implicit linkage against
diff a/include/embree3/rtcore_common.h b/include/embree3/rtcore_common.h
--- a/include/embree3/rtcore_common.h
+++ b/include/embree3/rtcore_common.h
@@ -19,7 +19,7 @@
 #endif
 #endif
-#ifdef _WIN32
+#if defined(_WIN32) && defined(_MSC_VER)
 #  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
 #else
 #  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
 #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
index e75aa968f9..1e23aeb415 100644
--- a/thirdparty/embree/kernels/common/scene.cpp
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -800,16 +800,18 @@ namespace embree
     }
/* initiate build */
- try {
+ // -- GODOT start --
+ // try {
scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
- }
- catch (...) {
- accels_clear();
- updateInterface();
- Lock<MutexSys> lock(schedulerMutex);
- this->scheduler = nullptr;
- throw;
- }
+ // }
+ // catch (...) {
+ // accels_clear();
+ // updateInterface();
+ // Lock<MutexSys> lock(schedulerMutex);
+ // this->scheduler = nullptr;
+ // throw;
+ // }
+ // -- GODOT end --
}
#endif