Merge pull request #48455 from JFonS/3.x_embree_aarch64

[3.x] Switch to embree-aarch64
Rémi Verschelde, 2021-05-05 15:01:18 +02:00
commit b8d198eeed
97 changed files with 6063 additions and 1294 deletions


@@ -135,6 +135,11 @@ Copyright: 2018, Eric Lasota
   2018, Microsoft Corp.
 License: Expat
 
+Files: ./thirdparty/embree/
+Comment: Embree
+Copyright: 2009-2021 Intel Corporation
+License: Apache-2.0
+
 Files: ./thirdparty/enet/
 Comment: ENet
 Copyright: 2002-2020, Lee Salzman


@@ -6,23 +6,13 @@ def can_build(env, platform):
     # `can_build()` for that module, so we need to duplicate that code as a short-term
     # solution.
-    # Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are
-    # not supported.
-    # It's also only relevant for tools build and desktop platforms,
-    # as doing lightmap generation on Android or HTML5 would be a bit far-fetched.
-    supported_platform = platform in ["x11", "osx", "windows", "server"]
-    supported_bits = env["bits"] == "64"
-    supported_arch = env["arch"] != "arm64"
-
-    # Hack to disable on Linux arm64. This won't work well for cross-compilation (checks
-    # host, not target) and would need a more thorough fix by refactoring our arch and
-    # bits-handling code.
-    from platform import machine
-
-    if platform == "x11" and machine() != "x86_64":
-        supported_arch = False
-
-    return supported_platform and supported_bits and supported_arch
+    if platform == "android":
+        return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
+
+    if platform in ["javascript", "server"]:
+        return False
+
+    return True
 
 
 def configure(env):


@@ -70,25 +70,19 @@ if env["builtin_embree"]:
     thirdparty_sources = [thirdparty_dir + file for file in embree_src]
 
     env_raycast.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "include"])
-    env_raycast.Append(
-        CPPDEFINES=[
-            "EMBREE_TARGET_SSE2",
-            "EMBREE_LOWEST_ISA",
-            "TASKING_INTERNAL",
-            "NDEBUG",
-            "__SSE2__",
-            "__SSE__",
-        ]
-    )
+    env_raycast.Append(CPPDEFINES=["EMBREE_TARGET_SSE2", "EMBREE_LOWEST_ISA", "TASKING_INTERNAL", "NDEBUG"])
 
     if not env.msvc:
-        env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"])
+        if env["arch"] in ["x86", "x86_64"]:
+            env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"])
 
         if env["platform"] == "windows":
             env_raycast.Append(CPPFLAGS=["-mstackrealign"])
 
     if env["platform"] == "windows":
         if env.msvc:
             env.Append(LINKFLAGS=["psapi.lib"])
+            env_raycast.Append(CPPDEFINES=["__SSE2__", "__SSE__"])
         else:
             env.Append(LIBS=["psapi"])


@@ -1,21 +1,14 @@
 def can_build(env, platform):
-    # Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are
-    # not supported.
-    # It's also only relevant for tools build and desktop platforms,
-    # as doing lightmap generation on Android or HTML5 would be a bit far-fetched.
-    supported_platform = platform in ["x11", "osx", "windows", "server"]
-    supported_bits = env["bits"] == "64"
-    supported_arch = env["arch"] != "arm64"
-
-    # Hack to disable on Linux arm64. This won't work well for cross-compilation (checks
-    # host, not target) and would need a more thorough fix by refactoring our arch and
-    # bits-handling code.
-    from platform import machine
-
-    if platform == "x11" and machine() != "x86_64":
-        supported_arch = False
-
-    return env["tools"] and supported_platform and supported_bits and supported_arch
+    if not env["tools"]:
+        return False
+
+    if platform == "android":
+        return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
+
+    if platform in ["javascript", "server"]:
+        return False
+
+    return True
 
 
 def configure(env):


@@ -74,17 +74,18 @@ cpp_files = [
 os.chdir("../../thirdparty")
 
-if os.path.exists("embree"):
-    shutil.rmtree("embree")
+dir_name = "embree"
+if os.path.exists(dir_name):
+    shutil.rmtree(dir_name)
 
-subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"])
+subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"])
 os.chdir("embree-tmp")
 
 commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
 
-dest_dir = "../embree"
 all_files = set(cpp_files)
+dest_dir = os.path.join("..", dir_name)
 for include_dir in include_dirs:
     headers = glob.iglob(os.path.join(include_dir, "*.h"))
     all_files.update(headers)


@@ -190,8 +190,11 @@ LightmapRaycasterEmbree::~LightmapRaycasterEmbree() {
 	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
 	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
 
-	if (embree_scene != nullptr)
+	if (embree_scene != nullptr) {
 		rtcReleaseScene(embree_scene);
-	if (embree_device != nullptr)
+	}
+	if (embree_device != nullptr) {
 		rtcReleaseDevice(embree_device);
+	}
 }

thirdparty/README.md

@@ -41,19 +41,19 @@ Files extracted from upstream source:
 ## embree
 
-- Upstream: https://github.com/embree/embree
-- Version: 3.12.1 (69bd4c272f1ed608494f233ecfff3feec516880b, 2020)
+- Upstream: https://github.com/lighttransport/embree-aarch64
+- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020)
 - License: Apache 2.0
 
 Files extracted from upstream:
 
-- All cpp files listed in `modules/raytrace/godot_update_embree.py`
-- All header files in the directories listed in `modules/raytrace/godot_update_embree.py`
+- All cpp files listed in `modules/raycast/godot_update_embree.py`
+- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
 
-The `modules/raytrace/godot_update_embree.py` script can be used to pull the
-relevant files from the latest Embree release and apply some automatic changes.
+The `modules/raycast/godot_update_embree.py` script can be used to pull the
+relevant files from the latest Embree-aarch64 release and apply some automatic changes.
 
-Some minor changes have been made in order to fix build errors.
+Some changes have been made in order to remove exceptions and fix minor build errors.
 They are marked with `// -- GODOT start --` and `// -- GODOT end --`
 comments. Apply the patches in the `patches/` folder when syncing on newer upstream
 commits.
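For illustration, the exception-removal edits described above take the following shape in the vendored sources (this exact pattern appears in the parallel_for.h hunks further below):

    // -- GODOT start --
    // throw std::runtime_error("task cancelled");
    abort();
    // -- GODOT end --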


@@ -8,6 +8,12 @@
 #include "../math/math.h"
 #include "../math/range.h"
 
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include <dispatch/dispatch.h>
+#include <algorithm>
+#include <type_traits>
+#endif
+
 namespace embree
 {
   /* parallel_for without range */
@@ -21,9 +27,30 @@ namespace embree
         func(r.begin());
       });
       if (!TaskScheduler::wait())
-        throw std::runtime_error("task cancelled");
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort();
+        // -- GODOT end --
     }
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
+    const size_t length = N;
+    const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+        const size_t start = (currentBlock * blockSize);
+        const size_t blockLength = std::min(length - start, blockSize);
+        const size_t end = start + blockLength;
+
+        for(size_t i=start; i < end; i++)
+        {
+            func(i);
+        }
+    });
+
 #elif defined(TASKING_TBB)
   #if TBB_INTERFACE_VERSION >= 12002
     tbb::task_group_context context;
@@ -31,13 +58,19 @@ namespace embree
       func(i);
     },context);
     if (context.is_group_execution_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #else
     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
     });
     if (tbb::task::self().is_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #endif
 
 #elif defined(TASKING_PPL)
@@ -57,7 +90,29 @@ namespace embree
 #if defined(TASKING_INTERNAL)
     TaskScheduler::spawn(first,last,minStepSize,func);
     if (!TaskScheduler::wait())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
+
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
+    const size_t length = last - first;
+    const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
+    size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
+    blockSize += blockSize % 4;
+
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+        const size_t start = first + (currentBlock * blockSize);
+        const size_t end = std::min<size_t>(last, start + blockSize);
+
+        func( embree::range<Index>(start,end) );
+    });
 
 #elif defined(TASKING_TBB)
   #if TBB_INTERFACE_VERSION >= 12002
@@ -66,13 +121,19 @@ namespace embree
       func(range<Index>(r.begin(),r.end()));
     },context);
     if (context.is_group_execution_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #else
     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
       func(range<Index>(r.begin(),r.end()));
     });
     if (tbb::task::self().is_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #endif
 
 #elif defined(TASKING_PPL)
@@ -104,13 +165,19 @@ namespace embree
       func(i);
     },tbb::simple_partitioner(),context);
     if (context.is_group_execution_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #else
     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
       func(i);
     },tbb::simple_partitioner());
     if (tbb::task::self().is_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #endif
   }
@@ -125,13 +192,19 @@ namespace embree
       func(i);
     },ap,context);
     if (context.is_group_execution_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #else
     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
      func(i);
     },ap);
     if (tbb::task::self().is_cancelled())
-      throw std::runtime_error("task cancelled");
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort();
+      // -- GODOT end --
   #endif
   }
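The TASKING_GCD branches added above split the iteration space into blocks and hand them to Grand Central Dispatch. A minimal standalone sketch of the same partitioning scheme, assuming Apple's libdispatch and clang's blocks extension (the function name is illustrative, not part of this patch):

    #include <dispatch/dispatch.h>
    #include <algorithm>
    #include <cstddef>

    // Sketch only: mirrors the block partitioning in the hunk above.
    template <typename Func>
    void parallel_for_gcd(size_t N, size_t numThreads, const Func& func) {
      const size_t blockSize = (N + numThreads - 1) / numThreads;  // round up
      const size_t numBlocks = (N + blockSize - 1) / blockSize;
      dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t block) {
        const size_t start = block * blockSize;
        const size_t end = std::min(N, start + blockSize);
        for (size_t i = start; i < end; i++)
          func(i);  // per-index callback, as in the internal tasking path
      });
    }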


@@ -43,7 +43,7 @@ namespace embree
   template<typename Index, typename Value, typename Func, typename Reduction>
   __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
   {
-#if defined(TASKING_INTERNAL)
+#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
 
     /* fast path for small number of iterations */
     Index taskCount = (last-first+minStepSize-1)/minStepSize;
@@ -58,15 +58,19 @@ namespace embree
     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
       reduction,context);
-    if (context.is_group_execution_cancelled())
-      throw std::runtime_error("task cancelled");
+    // -- GODOT start --
+    // if (context.is_group_execution_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
     return v;
   #else
     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
       reduction);
-    if (tbb::task::self().is_cancelled())
-      throw std::runtime_error("task cancelled");
+    // -- GODOT start --
+    // if (tbb::task::self().is_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
     return v;
   #endif
 #else // TASKING_PPL
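For context, the parallel_reduce signature shown above takes a per-range partial computation plus a binary reduction. A hedged usage sketch (data and N are hypothetical inputs; the call shape follows the template parameters above):

    // Sum N floats in chunks of at least 1024 elements (sketch only).
    float total = embree::parallel_reduce(size_t(0), N, size_t(1024), 0.0f,
        [&](const embree::range<size_t>& r) {
          float partial = 0.0f;
          for (size_t i = r.begin(); i < r.end(); i++) partial += data[i];
          return partial;  // partial value for this block
        },
        [](float a, float b) { return a + b; });  // fold partials together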


@@ -5,6 +5,9 @@
 
 #include "../simd/simd.h"
 #include "parallel_for.h"
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include "../sys/alloc.h"
+#endif
 #include <algorithm>
 
 namespace embree
@@ -320,7 +323,7 @@ namespace embree
 #pragma nounroll
 #endif
       for (size_t i=startID; i<endID; i++) {
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
         const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
 #else
         const Key index = ((Key)src[i] >> shift) & mask;
@@ -382,7 +385,7 @@ namespace embree
 #endif
       for (size_t i=startID; i<endID; i++) {
         const Ty elt = src[i];
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
         const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
 #else
         const size_t index = ((Key)src[i] >> shift) & mask;


@@ -39,7 +39,10 @@ namespace embree
       std::vector<char> str; str.reserve(64);
       while (cin->peek() != EOF && !isSeparator(cin->peek())) {
         int c = cin->get();
-        if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+        // -- GODOT start --
+        // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+        if (!isValidChar(c)) abort();
+        // -- GODOT end --
         str.push_back((char)c);
       }
       str.push_back(0);

thirdparty/embree/common/math/AVX2NEON.h (new file; all 986 lines added, shown verbatim)

@@ -0,0 +1,986 @@
#pragma once
#include "SSE2NEON.h"
#define AVX2NEON_ABI static inline __attribute__((always_inline))
struct __m256d;
struct __m256 {
__m128 lo,hi;
__m256() {}
};
struct __m256i {
__m128i lo,hi;
explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
__m256i() {}
};
struct __m256d {
float64x2_t lo,hi;
__m256d() {}
__m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
__m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
};
#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
#define _mm_stream_load_si128 _mm_load_si128
#define _mm256_stream_load_si256 _mm256_load_si256
AVX2NEON_ABI
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
{
__m128 res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
{
__m128i res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b))));
}
AVX2NEON_ABI
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
{
int64x2_t y;
y[0] = *(int64_t *)mem_addr;
y[1] = 0;
return __m128i(y);
}
AVX2NEON_ABI
int _mm_movemask_popcnt(__m128 a)
{
return __builtin_popcount(_mm_movemask_ps(a));
}
AVX2NEON_ABI
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
{
__m128 res;
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
}
return res;
}
AVX2NEON_ABI
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vnegq_f32(vfmaq_f32(c,a,b));
}
#define _mm_fnmsub_ss _mm_fnmsub_ps
AVX2NEON_ABI
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
{
return vfmsq_f32(c,a,b);
}
#define _mm_fnmadd_ss _mm_fnmadd_ps
AVX2NEON_ABI
__m128 _mm_broadcast_ss (float const * mem_addr)
{
return vdupq_n_f32(*mem_addr);
}
AVX2NEON_ABI
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vfmaq_f32(vnegq_f32(c),a,b);
}
#define _mm_fmsub_ss _mm_fmsub_ps
#define _mm_fmadd_ps _mm_madd_ps
#define _mm_fmadd_ss _mm_madd_ps
template<int code>
AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
{
float v;
v = 0;
v += (code & 0x10) ? a[0]*b[0] : 0;
v += (code & 0x20) ? a[1]*b[1] : 0;
v += (code & 0x40) ? a[2]*b[2] : 0;
v += (code & 0x80) ? a[3]*b[3] : 0;
float32x4_t res;
res[0] = (code & 0x1) ? v : 0;
res[1] = (code & 0x2) ? v : 0;
res[2] = (code & 0x4) ? v : 0;
res[3] = (code & 0x8) ? v : 0;
return res;
}
template<>
inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
m[3] = 0;
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
template<>
inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
AVX2NEON_ABI
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b))));
}
AVX2NEON_ABI
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
{
__m128 x;
for (int i=0;i<4;i++)
{
x[i] = a[b[i] & 3];
}
return x;
}
AVX2NEON_ABI
__m256i _mm256_setzero_si256()
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_setzero_ps()
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(0.0f);
return res;
}
AVX2NEON_ABI
__m256i _mm256_undefined_si256()
{
return _mm256_setzero_si256();
}
AVX2NEON_ABI
__m256 _mm256_undefined_ps()
{
return _mm256_setzero_ps();
}
CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t)
CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
AVX2NEON_ABI
__m128 _mm256_castps256_ps128 (__m256 a)
{
return a.lo;
}
AVX2NEON_ABI
__m256i _mm256_castsi128_si256 (__m128i a)
{
__m256i res;
res.lo = a ;
res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m128i _mm256_castsi256_si128 (__m256i a)
{
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_castps128_ps256 (__m128 a)
{
__m256 res;
res.lo = a;
res.hi = vdupq_n_f32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ss (float const * mem_addr)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(*mem_addr);
return res;
}
AVX2NEON_ABI
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
__m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7};
__m256i res;
res.lo = lo; res.hi = hi;
return res;
}
AVX2NEON_ABI
__m256i _mm256_set1_epi32 (int a)
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(a);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_ps(const __m256& v)
{
return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
}
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_permute_ps (const __m256& a)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
return res;
}
#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
AVX2NEON_ABI
__m256i _mm256_set1_epi64x (long long a)
{
__m256i res;
int64x2_t t = vdupq_n_s64(a);
res.lo = res.hi = __m128i(t);
return res;
}
AVX2NEON_ABI
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
{
__m256 res;
__m128 tmp;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.lo = tmp;
imm8 >>= 4;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.hi = tmp;
return res;
}
AVX2NEON_ABI
__m256 _mm256_moveldup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[0];
res.lo[2] = res.lo[3] = a.lo[2];
res.hi[0] = res.hi[1] = a.hi[0];
res.hi[2] = res.hi[3] = a.hi[2];
return res;
}
AVX2NEON_ABI
__m256 _mm256_movehdup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[1];
res.lo[2] = res.lo[3] = a.lo[3];
res.hi[0] = res.hi[1] = a.hi[1];
res.hi[2] = res.hi[3] = a.hi[3];
return res;
}
AVX2NEON_ABI
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
{
__m256 res = a;
if (imm8 & 1) res.hi = b;
else res.lo = b;
return res;
}
AVX2NEON_ABI
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256d _mm256_movedup_pd (__m256d a)
{
__m256d res;
res.hi = a.hi;
res.lo[0] = res.lo[1] = a.lo[0];
return res;
}
AVX2NEON_ABI
__m256i _mm256_abs_epi32(__m256i a)
{
__m256i res;
res.lo = vabsq_s32(a.lo);
res.hi = vabsq_s32(a.hi);
return res;
}
UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
AVX2NEON_ABI
__m256i _mm256_cvtps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvtps_epi32(a.lo);
res.hi = _mm_cvtps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvttps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvttps_epi32(a.lo);
res.hi = _mm_cvttps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256 _mm256_loadu_ps (float const * mem_addr)
{
__m256 res;
res.lo = *(__m128 *)(mem_addr + 0);
res.hi = *(__m128 *)(mem_addr + 4);
return res;
}
#define _mm256_load_ps _mm256_loadu_ps
AVX2NEON_ABI
int _mm256_testz_ps (const __m256& a, const __m256& b)
{
__m256 t = a;
if (&a != &b)
t = _mm256_and_ps(a,b);
__m128i l = vshrq_n_s32(__m128i(t.lo),31);
__m128i h = vshrq_n_s32(__m128i(t.hi),31);
return vaddvq_s32(vaddq_s32(l,h)) == 0;
}
AVX2NEON_ABI
__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
{
__m256i res;
int64x2_t t0 = {e0,e1};
int64x2_t t1 = {e2,e3};
res.lo = __m128i(t0);
res.hi = __m128i(t1);
return res;
}
AVX2NEON_ABI
__m256d _mm256_setzero_pd ()
{
__m256d res;
res.lo = res.hi = vdupq_n_f64(0);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_pd (__m256d a)
{
int res = 0;
uint64x2_t x;
x = uint64x2_t(a.lo);
res |= (x[0] >> 63) ? 1 : 0;
res |= (x[1] >> 63) ? 2 : 0;
x = uint64x2_t(a.hi);
res |= (x[0] >> 63) ? 4 : 0;
res |= (x[1] >> 63) ? 8 : 0;
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
{
__m256i res;
res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
{
__m256i res;
res.lo = __m128i(vceqq_f64(a.lo,b.lo));
res.hi = __m128i(vceqq_f64(a.hi,b.hi));
return res;
}
AVX2NEON_ABI
int _mm256_testz_pd (const __m256d& a, const __m256d& b)
{
__m256d t = a;
if (&a != &b)
t = _mm256_and_pd(a,b);
return _mm256_movemask_pd(t) == 0;
}
AVX2NEON_ABI
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
{
__m256d res;
uint64x2_t t = uint64x2_t(mask.lo);
res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0];
res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1];
t = uint64x2_t(mask.hi);
res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0];
res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1];
return res;
}
template<int imm8>
__m256 __mm256_dp_ps (__m256 a, __m256 b)
{
__m256 res;
res.lo = _mm_dp_ps(a.lo,b.lo,imm8);
res.hi = _mm_dp_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
AVX2NEON_ABI
double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
{
switch (imm8 & 3) {
case 0:
return a.lo[0];
case 1:
return a.lo[1];
case 2:
return a.hi[0];
case 3:
return a.hi[1];
}
__builtin_unreachable();
return 0;
}
AVX2NEON_ABI
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
{
__m256d res;
res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);
return res;
}
AVX2NEON_ABI
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
{
return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
}
AVX2NEON_ABI
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
{
__m256i res;
res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
return res;
}
#define _mm256_load_si256 _mm256_loadu_si256
AVX2NEON_ABI
void _mm256_storeu_ps (float * mem_addr, __m256 a)
{
*(__m128 *)(mem_addr + 0) = a.lo;
*(__m128 *)(mem_addr + 4) = a.hi;
}
#define _mm256_store_ps _mm256_storeu_ps
#define _mm256_stream_ps _mm256_storeu_ps
AVX2NEON_ABI
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
{
*(__m128i *)((int *)mem_addr + 0) = a.lo;
*(__m128i *)((int *)mem_addr + 4) = a.hi;
}
#define _mm256_store_si256 _mm256_storeu_si256
AVX2NEON_ABI
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
{
__m256 res;
res.lo = _mm_maskload_ps(mem_addr,mask.lo);
res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu8_epi32 (__m128i a)
{
__m256i res;
uint8x16_t x = uint8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi8_epi32 (__m128i a)
{
__m256i res;
int8x16_t x = int8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu16_epi32 (__m128i a)
{
__m256i res;
uint16x8_t x = uint16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi16_epi32 (__m128i a)
{
__m256i res;
int16x8_t x = int16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
{
_mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
_mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
}
AVX2NEON_ABI
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_slli_epi32(a.lo,imm8);
res.hi = _mm_slli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srli_epi32(a.lo,imm8);
res.hi = _mm_srli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srai_epi32(a.lo,imm8);
res.hi = _mm_srai_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,count.lo);
res.hi = vshlq_s32(a.hi,count.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
return res;
}
AVX2NEON_ABI
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
{
return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
}
AVX2NEON_ABI
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_set1_ps(float x)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(x);
return res;
}
AVX2NEON_ABI
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
__m256 res;
res.lo = _mm_set_ps(e3,e2,e1,e0);
res.hi = _mm_set_ps(e7,e6,e5,e4);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
{
__m256 res;
res.lo = res.hi = *mem_addr;
return res;
}
AVX2NEON_ABI
__m256 _mm256_cvtepi32_ps (__m256i a)
{
__m256 res;
res.lo = _mm_cvtepi32_ps(a.lo);
res.hi = _mm_cvtepi32_ps(a.hi);
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
{
for (int i=0;i<4;i++) {
if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i];
if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i];
}
}
AVX2NEON_ABI
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
{
__m256d res;
res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
return res;
}
AVX2NEON_ABI
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
{
__m256 res;
res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
{
__m256i res;
res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
{
__m256i res;
for (int i=0;i<4;i++)
{
res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
{
__m256i res = _mm256_setzero_si256();
for (int i=0;i<4;i++)
{
if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
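The header follows one pattern throughout: a 256-bit AVX type is modeled as a pair of 128-bit NEON registers (lo/hi), and each AVX intrinsic is emulated by applying the corresponding SSE2NEON intrinsic to both halves. A hedged usage sketch, assuming an AArch64 build with SSE2NEON.h and this header on the include path:

    #include "SSE2NEON.h"
    #include "AVX2NEON.h"

    // Adding two 8-float vectors through the emulation layer: each
    // _mm256_* call expands to two 128-bit NEON operations on .lo/.hi.
    static void add8(const float a[8], const float b[8], float out[8]) {
      __m256 va = _mm256_loadu_ps(a);     // va.lo = a[0..3], va.hi = a[4..7]
      __m256 vb = _mm256_loadu_ps(b);
      __m256 vc = _mm256_add_ps(va, vb);  // BINARY_AVX_OP -> two vaddq_f32
      _mm256_storeu_ps(out, vc);
    }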

thirdparty/embree/common/math/SSE2NEON.h (new file, 1753 lines; diff too large to display)


@@ -77,7 +77,7 @@ namespace embree
     return lower > upper;
   }
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
   template<> __forceinline bool BBox<Vec3fa>::empty() const {
     return !all(le_mask(lower,upper));
   }
@@ -228,11 +228,11 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined __SSE__
+#if defined (__SSE__) || defined(__ARM_NEON)
 #include "../simd/sse.h"
 #endif
 
-#if defined __AVX__
+#if defined (__AVX__)
 #include "../simd/avx.h"
 #endif


@@ -42,6 +42,6 @@ namespace embree
   }
 
   /*! default template instantiations */
-  typedef Col3<unsigned char> Col3uc;
+  typedef Col3<uint8_t      > Col3uc;
   typedef Col3<float        > Col3f;
 }


@@ -42,6 +42,6 @@ namespace embree
   }
 
   /*! default template instantiations */
-  typedef Col4<unsigned char> Col4uc;
+  typedef Col4<uint8_t      > Col4uc;
   typedef Col4<float        > Col4f;
 }


@@ -52,17 +52,17 @@ namespace embree
     __forceinline void set(Col3uc& d) const
     {
       vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (unsigned char)(s[0]);
-      d.g = (unsigned char)(s[1]);
-      d.b = (unsigned char)(s[2]);
+      d.r = (uint8_t)(s[0]);
+      d.g = (uint8_t)(s[1]);
+      d.b = (uint8_t)(s[2]);
     }
     __forceinline void set(Col4uc& d) const
     {
       vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (unsigned char)(s[0]);
-      d.g = (unsigned char)(s[1]);
-      d.b = (unsigned char)(s[2]);
-      d.a = (unsigned char)(s[3]);
+      d.r = (uint8_t)(s[0]);
+      d.g = (uint8_t)(s[1]);
+      d.b = (uint8_t)(s[2]);
+      d.a = (uint8_t)(s[3]);
     }
 
     ////////////////////////////////////////////////////////////////////////////////
@@ -114,16 +114,16 @@ namespace embree
     __forceinline void set(Col3uc& d) const
     {
       vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (unsigned char)(s[0]);
-      d.g = (unsigned char)(s[1]);
-      d.b = (unsigned char)(s[2]);
+      d.r = (uint8_t)(s[0]);
+      d.g = (uint8_t)(s[1]);
+      d.b = (uint8_t)(s[2]);
     }
     __forceinline void set(Col4uc& d) const
     {
       vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (unsigned char)(s[0]);
-      d.g = (unsigned char)(s[1]);
-      d.b = (unsigned char)(s[2]);
+      d.r = (uint8_t)(s[0]);
+      d.g = (uint8_t)(s[1]);
+      d.b = (uint8_t)(s[2]);
       d.a = 255;
     }
@@ -152,21 +152,37 @@ namespace embree
   }
   __forceinline const Color rcp  ( const Color& a )
   {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 reciprocal = _mm_rcp_ps(a.m128);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    return (const Color)reciprocal;
+#else
 #if defined(__AVX512VL__)
     const Color r = _mm_rcp14_ps(a.m128);
 #else
     const Color r = _mm_rcp_ps(a.m128);
 #endif
     return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
   }
   __forceinline const Color rsqrt( const Color& a )
   {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 r = _mm_rsqrt_ps(a.m128);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    return r;
+#else
 #if defined(__AVX512VL__)
     __m128 r = _mm_rsqrt14_ps(a.m128);
 #else
     __m128 r = _mm_rsqrt_ps(a.m128);
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
   }
   __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
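Both NEON paths above use the same recipe: take the hardware estimate (_mm_rcp_ps/_mm_rsqrt_ps, which map to NEON's vrecpeq_f32/vrsqrteq_f32) and refine it twice. For the reciprocal case, vrecpsq_f32(x, r) returns 2 - x*r, so each multiply is one Newton-Raphson step. A scalar sketch of that refinement, for intuition only (not code from this patch):

    // Scalar model of the two refinement steps in rcp() above.
    // r0 stands in for the NEON estimate, which has roughly 8 good bits.
    float refine_rcp(float x, float r0) {
      float r = r0;
      r = r * (2.0f - x * r);  // Newton-Raphson step 1: ~16 bits
      r = r * (2.0f - x * r);  // Newton-Raphson step 2: ~full float precision
      return r;
    }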


@@ -1,6 +1,10 @@
 // Copyright 2009-2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
 #include "constants.h"
 
 namespace embree
@@ -24,4 +28,34 @@ namespace embree
   ReverseStepTy reverse_step;
   EmptyTy empty;
   UndefinedTy undefined;
+
+#if defined(__aarch64__)
+  const uint32x4_t movemask_mask = { 1, 2, 4, 8 };
+  const uint32x4_t vzero = { 0, 0, 0, 0 };
+  const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+  const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+  const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
+  const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+  const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+  const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+  const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+  const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+  const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+  const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
+  const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
+  const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+  const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+  const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+  const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+  const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+  const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+  const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
+  const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15};
+  const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
+  const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
+  const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
+  const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
+  const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
+#endif
 }
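The movemask_mask = { 1, 2, 4, 8 } constant supports the usual lane-weight trick for emulating SSE movemask on NEON, which has no single-instruction equivalent: AND each all-ones/all-zeros comparison lane with its bit weight, then horizontally add. A hedged sketch of that formulation (the exact use inside SSE2NEON.h is not shown in this diff):

    #include <arm_neon.h>

    // Collapse a 4-lane comparison result (each lane 0x00000000 or
    // 0xFFFFFFFF) into a 4-bit integer mask, like _mm_movemask_ps.
    static inline int movemask_ps_neon(uint32x4_t cmp) {
      const uint32x4_t weights = { 1, 2, 4, 8 };  // same as movemask_mask above
      return (int)vaddvq_u32(vandq_u32(cmp, weights));
    }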


@@ -12,6 +12,19 @@
 #include <cfloat>
 #include <climits>
 
+// Math constants may not be defined in libcxx + mingw + strict C++ standard
+#if defined(__MINGW32__)
+// TODO(LTE): use constexpr
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_1_PI
+#define M_1_PI 0.31830988618379067154
+#endif
+#endif // __MINGW32__
+
 namespace embree
 {
   static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
@@ -44,8 +57,8 @@ namespace embree
     __forceinline operator unsigned int   ( ) const { return 0; }
     __forceinline operator short          ( ) const { return 0; }
     __forceinline operator unsigned short ( ) const { return 0; }
-    __forceinline operator char           ( ) const { return 0; }
-    __forceinline operator unsigned char  ( ) const { return 0; }
+    __forceinline operator int8_t         ( ) const { return 0; }
+    __forceinline operator uint8_t        ( ) const { return 0; }
   };
 
   extern MAYBE_UNUSED ZeroTy zero;
@@ -62,8 +75,8 @@ namespace embree
     __forceinline operator unsigned int   ( ) const { return 1; }
     __forceinline operator short          ( ) const { return 1; }
     __forceinline operator unsigned short ( ) const { return 1; }
-    __forceinline operator char           ( ) const { return 1; }
-    __forceinline operator unsigned char  ( ) const { return 1; }
+    __forceinline operator int8_t         ( ) const { return 1; }
+    __forceinline operator uint8_t        ( ) const { return 1; }
   };
 
   extern MAYBE_UNUSED OneTy one;
@@ -80,8 +93,8 @@ namespace embree
     __forceinline operator unsigned int   ( ) const { return std::numeric_limits<unsigned int>::min(); }
     __forceinline operator short          ( ) const { return std::numeric_limits<short>::min(); }
     __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); }
-    __forceinline operator char           ( ) const { return std::numeric_limits<char>::min(); }
-    __forceinline operator unsigned char  ( ) const { return std::numeric_limits<unsigned char>::min(); }
+    __forceinline operator int8_t         ( ) const { return std::numeric_limits<int8_t>::min(); }
+    __forceinline operator uint8_t        ( ) const { return std::numeric_limits<uint8_t>::min(); }
   };
 
@@ -99,8 +112,8 @@ namespace embree
     __forceinline operator unsigned int   ( ) const { return std::numeric_limits<unsigned int>::max(); }
     __forceinline operator short          ( ) const { return std::numeric_limits<short>::max(); }
     __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); }
-    __forceinline operator char           ( ) const { return std::numeric_limits<char>::max(); }
-    __forceinline operator unsigned char  ( ) const { return std::numeric_limits<unsigned char>::max(); }
+    __forceinline operator int8_t         ( ) const { return std::numeric_limits<int8_t>::max(); }
+    __forceinline operator uint8_t        ( ) const { return std::numeric_limits<uint8_t>::max(); }
   };
 
   extern MAYBE_UNUSED PosInfTy inf;
@@ -194,4 +207,33 @@ namespace embree
   };
 
   extern MAYBE_UNUSED UndefinedTy undefined;
+
+#if defined(__aarch64__)
+  extern const uint32x4_t movemask_mask;
+  extern const uint32x4_t vzero;
+  extern const uint32x4_t v0x80000000;
+  extern const uint32x4_t v0x7fffffff;
+  extern const uint32x4_t v000F;
+  extern const uint32x4_t v00F0;
+  extern const uint32x4_t v00FF;
+  extern const uint32x4_t v0F00;
+  extern const uint32x4_t v0F0F;
+  extern const uint32x4_t v0FF0;
+  extern const uint32x4_t v0FFF;
+  extern const uint32x4_t vF000;
+  extern const uint32x4_t vF00F;
+  extern const uint32x4_t vF0F0;
+  extern const uint32x4_t vF0FF;
+  extern const uint32x4_t vFF00;
+  extern const uint32x4_t vFF0F;
+  extern const uint32x4_t vFFF0;
+  extern const uint32x4_t vFFFF;
+  extern const uint8x16_t v0022;
+  extern const uint8x16_t v1133;
+  extern const uint8x16_t v0101;
+  extern const float32x4_t vOne;
+  extern const float32x4_t vmOne;
+  extern const float32x4_t vInf;
+  extern const float32x4_t vmInf;
+#endif
 }


@@ -8,12 +8,19 @@
 #include "constants.h"
 #include <cmath>
 
+#if defined(__ARM_NEON)
+#include "SSE2NEON.h"
+#if defined(NEON_AVX2_EMULATION)
+#include "AVX2NEON.h"
+#endif
+#else
 #include <emmintrin.h>
 #include <xmmintrin.h>
 #include <immintrin.h>
+#endif
 
-#if defined(__WIN32__)
-#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+#if defined(__WIN32__) && !defined(__MINGW32__)
+#if (__MSV_VER <= 1700)
 namespace std
 {
   __forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
@@ -40,7 +47,7 @@ namespace embree
   __forceinline int   toInt  (const float& a) { return int(a); }
   __forceinline float toFloat(const int&   a) { return float(a); }
 
-#if defined(__WIN32__)
+#if defined(__WIN32__) && !defined(__MINGW32__)
   __forceinline bool finite ( const float x ) { return _finite(x) != 0; }
 #endif
@@ -49,6 +56,16 @@ namespace embree
 
   __forceinline float rcp  ( const float x )
   {
+#if defined(__aarch64__)
+    // Move scalar to vector register and do rcp.
+    __m128 a;
+    a[0] = x;
+    float32x4_t reciprocal = vrecpeq_f32(a);
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    return reciprocal[0];
+#else
     const __m128 a = _mm_set_ss(x);
 
 #if defined(__AVX512VL__)
@@ -62,19 +79,61 @@ namespace embree
 #else
     return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
 #endif
+
+#endif  //defined(__aarch64__)
   }
 
   __forceinline float signmsk ( const float x ) {
+#if defined(__aarch64__)
+    // FP and Neon shares same vector register in arm64
+    __m128 a;
+    __m128i b;
+    a[0] = x;
+    b[0] = 0x80000000;
+    a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+    return a[0];
+#else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
   }
   __forceinline float xorf( const float x, const float y ) {
+#if defined(__aarch64__)
+    // FP and Neon shares same vector register in arm64
+    __m128 a;
+    __m128 b;
+    a[0] = x;
+    b[0] = y;
+    a = _mm_xor_ps(a, b);
+    return a[0];
+#else
     return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
   }
   __forceinline float andf( const float x, const unsigned y ) {
+#if defined(__aarch64__)
+    // FP and Neon shares same vector register in arm64
+    __m128 a;
+    __m128i b;
+    a[0] = x;
+    b[0] = y;
+    a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+    return a[0];
+#else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
   }
 
   __forceinline float rsqrt( const float x )
   {
+#if defined(__aarch64__)
+    // FP and Neon shares same vector register in arm64
+    __m128 a;
+    a[0] = x;
+    __m128 value = _mm_rsqrt_ps(a);
+    value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+    value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+    return value[0];
+#else
     const __m128 a = _mm_set_ss(x);
 #if defined(__AVX512VL__)
     const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
@@ -84,9 +143,10 @@ namespace embree
     const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
                                 _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
     return _mm_cvtss_f32(c);
+#endif
   }
 
-#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
+#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__)
   __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
   __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
   __forceinline int roundf(float f) { return (int)(f + 0.5f); }
@@ -140,7 +200,17 @@ namespace embree
   __forceinline double floor( const double x ) { return ::floor (x); }
   __forceinline double ceil ( const double x ) { return ::ceil  (x); }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+  __forceinline float mini(float a, float b) {
+    // FP and Neon shares same vector register in arm64
+    __m128 x;
+    __m128 y;
+    x[0] = a;
+    y[0] = b;
+    x = _mm_min_ps(x, y);
+    return x[0];
+  }
+#elif defined(__SSE4_1__)
   __forceinline float mini(float a, float b) {
     const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
     const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -149,7 +219,17 @@ namespace embree
   }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+  __forceinline float maxi(float a, float b) {
+    // FP and Neon shares same vector register in arm64
+    __m128 x;
+    __m128 y;
+    x[0] = a;
+    y[0] = b;
+    x = _mm_max_ps(x, y);
+    return x[0];
+  }
+#elif defined(__SSE4_1__)
   __forceinline float maxi(float a, float b) {
     const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
     const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -166,7 +246,7 @@ namespace embree
   __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
   __forceinline float   min(float   a, float   b) { return a<b ? a:b; }
   __forceinline double  min(double  a, double  b) { return a<b ? a:b; }
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   __forceinline size_t  min(size_t  a, size_t  b) { return a<b ? a:b; }
 #endif
@@ -183,7 +263,7 @@ namespace embree
   __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
   __forceinline float   max(float   a, float   b) { return a<b ? b:a; }
   __forceinline double  max(double  a, double  b) { return a<b ? b:a; }
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   __forceinline size_t  max(size_t  a, size_t  b) { return a<b ? b:a; }
 #endif
@ -225,6 +305,16 @@ namespace embree
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
#elif defined (__aarch64__) && defined(__clang__)
#pragma clang fp contract(fast)
__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
#pragma clang fp contract(on)
#else #else
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
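The AArch64 branch above relies on clang's floating-point contraction pragma: inside the contract(fast) region, clang may fuse each multiply-add into a single fmadd/fmla instruction with one rounding step. A minimal, hedged sketch of the same mechanism outside embree (clang-only; the function name is illustrative):

    #pragma clang fp contract(fast)
    static inline float fused_madd(float a, float b, float c) {
      return a * b + c; // clang may emit one fmadd here instead of fmul + fadd
    }
    #pragma clang fp contract(on) // back to per-expression contraction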
@@ -273,6 +363,15 @@ namespace embree
/*! exchange */
template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) {
+#if 1//!defined(__aarch64__)
+return msub(a,b,c*d);
+#else
+return nmadd(c,d,a*b);
+#endif
+}
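prod_diff(a,b,c,d) centralizes the a*b - c*d pattern that cross() now uses. The subtlety: an FMA rounds only once, so msub(a,b,c*d) subtracts a rounded c*d from an exact a*b, and expressions like cross(v,v) can pick up a tiny nonzero residue. A self-contained illustration of that residue (plain C++, not embree code):

    #include <cmath>
    #include <cstdio>

    int main() {
      float a = 1.0f + 0x1p-23f, b = 3.0f + 0x1p-22f;
      float p = a * b;                   // product rounded to float once
      float fused = std::fmaf(a, b, -p); // exact a*b minus the rounded product
      // fused equals the product's rounding error: tiny but nonzero here,
      // which is exactly the asymmetry prod_diff exists to manage.
      std::printf("residue = %g\n", fused);
      return 0;
    }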
/*! bit reverse operation */
template<class T>
__forceinline T bitReverse(const T& vin)
@@ -290,7 +389,7 @@ namespace embree
template<class T>
__forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
{
T x = xin, y = yin, z = zin;
x = (x | (x << 16)) & 0x030000FF;
x = (x | (x << 8)) & 0x0300F00F;
x = (x | (x << 4)) & 0x030C30C3;
@@ -309,7 +408,7 @@ namespace embree
return x | (y << 1) | (z << 2);
}
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
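The scalar bitInterleave above is a classic Morton-code builder: each coordinate's low bits are spread to every third bit position, then the three results are merged with shifts of 0, 1 and 2. With the AVX2 specialization now compiled out on aarch64, this scalar path is what runs there. A usage sketch restating the same bit-spreading steps (standalone C++):

    #include <cstdint>
    #include <cstdio>

    // Spread the low 10 bits of x so they occupy every third bit position.
    static uint32_t part1by2(uint32_t x) {
      x &= 0x3FF;
      x = (x | (x << 16)) & 0x030000FF;
      x = (x | (x << 8))  & 0x0300F00F;
      x = (x | (x << 4))  & 0x030C30C3;
      x = (x | (x << 2))  & 0x09249249;
      return x;
    }

    int main() {
      uint32_t code = part1by2(5) | (part1by2(3) << 1) | (part1by2(7) << 2);
      std::printf("morton(5,3,7) = 0x%x\n", code);
      return 0;
    }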
View file
@@ -205,11 +205,11 @@ namespace embree
#include "vec2fa.h"
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
-#if defined __AVX__
+#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@@ -221,7 +221,7 @@ namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif
View file
@@ -97,6 +97,12 @@ namespace embree
__forceinline Vec2fa rcp ( const Vec2fa& a )
{
+#if defined(__aarch64__)
+__m128 reciprocal = _mm_rcp_ps(a.m128);
+reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+return (const Vec2fa)reciprocal;
+#else
#if defined(__AVX512VL__)
const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
@@ -111,6 +117,7 @@ namespace embree
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -118,12 +125,21 @@ namespace embree
__forceinline Vec2fa rsqrt( const Vec2fa& a )
{
+#if defined(__aarch64__)
+__m128 r = _mm_rsqrt_ps(a.m128);
+r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+return r;
+#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
}
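The aarch64 branches above follow the standard estimate-plus-refinement recipe: vrecpsq_f32(x, r) returns 2 - x*r, so multiplying it into r performs one Newton-Raphson step toward 1/x, and two steps bring NEON's coarse reciprocal estimate close to full single precision; vrsqrtsq_f32 plays the same role for 1/sqrt. A standalone sketch of both refinements with the plain NEON intrinsics (AArch64 only; embree reaches them through its _mm_* wrappers):

    #include <arm_neon.h>

    static inline float32x4_t rcp_nr(float32x4_t x) {
      float32x4_t r = vrecpeq_f32(x);        // coarse estimate of 1/x
      r = vmulq_f32(vrecpsq_f32(x, r), r);   // r *= (2 - x*r)
      r = vmulq_f32(vrecpsq_f32(x, r), r);   // second step: near full precision
      return r;
    }

    static inline float32x4_t rsqrt_nr(float32x4_t x) {
      float32x4_t r = vrsqrteq_f32(x);                    // estimate of 1/sqrt(x)
      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r)); // r *= (3 - x*r*r)/2
      r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r));
      return r;
    }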
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
@@ -156,7 +172,7 @@ namespace embree
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -165,7 +181,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -275,7 +291,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
+__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+#elif defined (__SSE4_1__)
//__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
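Rounding needs no emulation on aarch64: vrndmq_f32 and vrndpq_f32 are direct round-toward-negative/positive-infinity instructions, matching the SSE4.1 _MM_FROUND_TO_NEG_INF/_MM_FROUND_TO_POS_INF modes. A tiny sketch (AArch64 only, illustrative names):

    #include <arm_neon.h>

    static inline float32x4_t floor4(float32x4_t v) { return vrndmq_f32(v); } // toward -inf
    static inline float32x4_t ceil4 (float32x4_t v) { return vrndpq_f32(v); } // toward +inf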
View file
@@ -206,8 +206,7 @@ namespace embree
template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); }
template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
-template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
+template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); }
template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
{
const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
@@ -266,11 +265,11 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
-#if defined __AVX__
+#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@@ -291,14 +290,14 @@ namespace embree
template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
x = a.x; y = a.y; z = a.z;
}
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
template<>
__forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
}
#endif
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
}
View file
@@ -55,7 +55,13 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+__m128 t = _mm_load_ps((float*)a);
+t[3] = 0.0f;
+return Vec3fa(t);
+#else
return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+#endif
}
static __forceinline Vec3fa loadu( const void* const a ) {
@@ -89,19 +95,42 @@ namespace embree
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
__forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+return vnegq_f32(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa abs ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+return _mm_abs_ps(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa sign ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f)));
+return r;
+#else
return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
+#endif
}
__forceinline Vec3fa rcp ( const Vec3fa& a )
{
+#if defined(__aarch64__) && defined(BUILD_IOS)
+return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#elif defined(__aarch64__)
+__m128 reciprocal = _mm_rcp_ps(a.m128);
+reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+return (const Vec3fa)reciprocal;
+#else
#if defined(__AVX512VL__)
const Vec3fa r = _mm_rcp14_ps(a.m128);
#else
@@ -116,6 +145,7 @@ namespace embree
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -123,12 +153,20 @@ namespace embree
__forceinline Vec3fa rsqrt( const Vec3fa& a )
{
+#if defined(__aarch64__)
+__m128 r = _mm_rsqrt_ps(a.m128);
+r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+return r;
+#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
}
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
@@ -161,7 +199,7 @@ namespace embree
__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -170,7 +208,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -192,11 +230,30 @@ namespace embree
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+#if defined(__aarch64__)
+__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c;
+}
+__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c;
+}
+__forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128);
+return -t;
+}
+__forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c
+}
+#else
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
-__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
+#endif
#endif
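_mm_madd_ps and _mm_msub_ps are not standard SSE intrinsics; from context they come from embree-aarch64's SSE-to-NEON compatibility layer, where a fused multiply-add maps directly onto NEON's vfmaq_f32. A hypothetical sketch of what such helpers can look like (names and shapes assumed here, not the project's actual definitions):

    #include <arm_neon.h>

    // Hypothetical shims: fused c + a*b and c - a*b, each with a single rounding.
    static inline float32x4_t madd4(float32x4_t a, float32x4_t b, float32x4_t c) {
      return vfmaq_f32(c, a, b);
    }
    static inline float32x4_t nmadd4(float32x4_t a, float32x4_t b, float32x4_t c) {
      return vfmsq_f32(c, a, b);
    }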
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
@@ -218,18 +275,37 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) && defined(BUILD_IOS)
__forceinline float reduce_add(const Vec3fa& v) {
+float32x4_t t = v.m128;
+t[3] = 0.0f;
+return vaddvq_f32(t);
+}
+__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+__forceinline float reduce_min(const Vec3fa& v) {
+float32x4_t t = v.m128;
+t[3] = t[2];
+return vminvq_f32(t);
+}
+__forceinline float reduce_max(const Vec3fa& v) {
+float32x4_t t = v.m128;
+t[3] = t[2];
+return vmaxvq_f32(t);
+}
+#else
+__forceinline float reduce_add(const Vec3fa& v) {
const vfloat4 a(v.m128);
const vfloat4 b = shuffle<1>(a);
const vfloat4 c = shuffle<2>(a);
return _mm_cvtss_f32(a+b+c);
}
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
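The iOS/AArch64 reductions above lean on NEON's across-lane instructions (vaddvq_f32, vminvq_f32, vmaxvq_f32), which collapse all four lanes at once; since Vec3fa keeps three components in a four-lane register, the unused .w lane is first neutralized, with 0 for the sum and a copy of .z for min/max. The same idea written with explicit lane intrinsics (AArch64 only, illustrative):

    #include <arm_neon.h>

    static inline float reduce_add3(float32x4_t v) {
      v = vsetq_lane_f32(0.0f, v, 3);                 // w lane must not affect the sum
      return vaddvq_f32(v);
    }
    static inline float reduce_min3(float32x4_t v) {
      v = vsetq_lane_f32(vgetq_lane_f32(v, 2), v, 3); // duplicate z into w
      return vminvq_f32(v);
    }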
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
@@ -241,8 +317,13 @@ namespace embree
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
-__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
-__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+#if defined(__aarch64__)
+__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
+__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+#else
+__forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
+__forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+#endif
__forceinline bool isvalid ( const Vec3fa& v ) {
return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
@@ -280,7 +361,7 @@ namespace embree
vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
vfloat4 b1 = vfloat4(b.m128);
-return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)));
}
__forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
@@ -335,7 +416,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+__forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
+__forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
+__forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#elif defined (__SSE4_1__)
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
@@ -393,8 +478,10 @@ namespace embree
__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
__forceinline Vec3fx( const Vec3fa& other, const float w1) {
-#if defined (__SSE4_1__)
+#if defined (__aarch64__)
+m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
#else
const vint4 mask(-1,-1,-1,0);
@@ -526,7 +613,7 @@ namespace embree
__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -535,7 +622,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -584,11 +671,11 @@ namespace embree
/// Reductions
////////////////////////////////////////////////////////////////////////////////
__forceinline float reduce_add(const Vec3fx& v) {
const vfloat4 a(v.m128);
const vfloat4 b = shuffle<1>(a);
const vfloat4 c = shuffle<2>(a);
return _mm_cvtss_f32(a+b+c);
}
__forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
@@ -700,7 +787,7 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
+#if defined (__SSE4_1__) && !defined(__aarch64__)
__forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
View file
@@ -65,7 +65,9 @@ namespace embree
__forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
__forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
-#if defined(__SSSE3__)
+#if (defined(__aarch64__))
+__forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
+#elif defined(__SSSE3__)
__forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
#endif
@@ -81,7 +83,7 @@ namespace embree
__forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); }
__forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
__forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); }
__forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; }
@@ -99,12 +101,14 @@ namespace embree
__forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); }
__forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
+#if !defined(__ARM_NEON)
__forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); }
__forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); }
__forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); }
__forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); }
__forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); }
+#endif
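The shift operators are compiled out under __ARM_NEON because NEON has no shift-by-scalar form with a run-time count: the immediate variants (vshlq_n_s32) need a compile-time constant, and the general vshlq_s32 takes a per-lane count vector, where negative counts shift right. A hedged sketch of how run-time counts can still be expressed directly in NEON:

    #include <arm_neon.h>

    static inline int32x4_t sll4(int32x4_t a, int n) {
      return vshlq_s32(a, vdupq_n_s32(n));   // splat the count, shift left
    }
    static inline int32x4_t sra4(int32x4_t a, int n) {
      return vshlq_s32(a, vdupq_n_s32(-n));  // negative count = arithmetic shift right
    }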
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
@@ -116,7 +120,7 @@ namespace embree
__forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
__forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
__forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; }
#endif
@@ -127,18 +131,38 @@ namespace embree
__forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
__forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; }
+#if !defined(__ARM_NEON)
__forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
__forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+#endif
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+__forceinline int reduce_add(const Vec3ia& v) {
+int32x4_t t = v.m128;
+t[3] = 0;
+return vaddvq_s32(t);
+}
+__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+__forceinline int reduce_min(const Vec3ia& v) {
+int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0);
+return vminvq_s32(t);
+}
+__forceinline int reduce_max(const Vec3ia& v) {
+int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0);
+return vmaxvq_s32(t);
+}
+#else
__forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
__forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
__forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
+#endif
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
@@ -161,14 +185,14 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
#else
return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
#endif
}
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
#else
View file
@@ -192,7 +192,7 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
typedef Vec4<bool > Vec4b;
-typedef Vec4<unsigned char> Vec4uc;
+typedef Vec4<uint8_t > Vec4uc;
typedef Vec4<int > Vec4i;
typedef Vec4<float > Vec4f;
}
@@ -205,7 +205,7 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
@@ -225,13 +225,13 @@ namespace embree
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
x = a.x; y = a.y; z = a.z; w = a.w;
}
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
}
#endif
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) {
return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
}
View file
@@ -6,7 +6,7 @@
#include "../math/math.h"
/* include SSE wrapper classes */
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
# include "sse.h"
#endif
View file
@@ -11,7 +11,7 @@
namespace embree
{
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
__forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) {
return _mm_blendv_ps(f,t,mask);
}
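blendv_ps picks lanes of t where the mask is set. NEON's counterpart is the bitwise select vbslq_f32, which behaves identically as long as each mask lane is all-ones or all-zeros, which is exactly what comparisons produce. A short sketch (AArch64 only, illustrative naming):

    #include <arm_neon.h>

    static inline float32x4_t select4(uint32x4_t mask, float32x4_t t, float32x4_t f) {
      return vbslq_f32(mask, t, f); // bits of t where mask is 1, bits of f elsewhere
    }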
View file
@@ -56,8 +56,12 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+#if !defined(__aarch64__)
__forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {}
+#else
+__forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
+#endif
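The fallback builds the all-true mask by comparing zero with itself: compare-equal of identical operands sets every lane to all-ones, avoiding _mm256_cmp_pd's immediate-predicate form (evidently not provided by the AVX-on-NEON emulation, judging from this guard). The same trick in plain NEON terms:

    #include <arm_neon.h>

    static inline uint32x4_t true_mask4(void) {
      float32x4_t z = vdupq_n_f32(0.0f);
      return vceqq_f32(z, z); // 0.0f == 0.0f in every lane -> 0xFFFFFFFF
    }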
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
@@ -101,9 +105,10 @@ namespace embree
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__)
__forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
__forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
+#endif
#if defined(__AVX2__)
template<int i0, int i1, int i2, int i3>
View file
@@ -37,9 +37,13 @@ namespace embree
: v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
__forceinline vboolf(bool a, bool b, bool c, bool d)
: v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+#if defined(__aarch64__) && defined(BUILD_IOS)
+__forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; }
+__forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; }
+#else
__forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; }
__forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; }
+#endif
/* return int32 mask */
__forceinline __m128i mask32() const {
return _mm_castps_si128(v);
@@ -56,8 +60,13 @@ namespace embree
/// Array Access
////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) && defined(BUILD_IOS)
+__forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; }
+__forceinline int& operator [](size_t index) { return i[index]; }
+#else
__forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; }
__forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; }
+#endif
};
////////////////////////////////////////////////////////////////////////////////
@@ -92,7 +101,7 @@ namespace embree
__forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
__forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
-#if defined(__SSE4_1__)
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
return _mm_blendv_ps(f, t, m);
#else
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
@@ -106,6 +115,17 @@ namespace embree
__forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
__forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
+#if defined(__aarch64__)
+template<int i0, int i1, int i2, int i3>
+__forceinline vboolf4 shuffle(const vboolf4& v) {
+return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
+}
+template<int i0, int i1, int i2, int i3>
+__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+}
+#else
template<int i0, int i1, int i2, int i3>
__forceinline vboolf4 shuffle(const vboolf4& v) {
return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -115,7 +135,8 @@ namespace embree
__forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
+#endif
template<int i0>
__forceinline vboolf4 shuffle(const vboolf4& v) {
return shuffle<i0,i0,i0,i0>(v);
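On aarch64, arbitrary lane shuffles are expressed with the byte table-lookup instruction vqtbl1q_u8 (vqtbl2q_u8 for two sources); _MN_SHUFFLE/_MF_SHUFFLE are helpers from the compatibility layer that presumably expand four lane indices into a 16-byte index table (their exact definition is assumed here). A sketch with the byte table written out for a <2,3,0,1> lane shuffle (AArch64 only):

    #include <arm_neon.h>

    static inline float32x4_t shuffle_2301(float32x4_t v) {
      // Byte i of the result takes byte idx[i] of the source; each lane is 4 bytes.
      const uint8x16_t idx = { 8, 9, 10, 11, 12, 13, 14, 15,
                               0, 1,  2,  3,  4,  5,  6,  7 };
      return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(v), idx));
    }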
@@ -127,7 +148,7 @@ namespace embree
template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
#endif
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && !defined(__aarch64__)
template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); }
template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); }
@@ -149,10 +170,14 @@ namespace embree
__forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }
__forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
+#if defined(__aarch64__) && defined(BUILD_IOS)
+__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); }
+#else
#if defined(__SSE4_2__)
__forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
#else
__forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
+#endif
#endif
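popcnt here counts the active lanes of a mask via its 4-bit movemask; _mm_movemask_popcnt_ps looks like another compatibility-layer helper (an assumption from context). Without hardware popcount, the scalar fallback above simply sums the four booleans, and a compiler builtin does the same job:

    // Count active lanes from a 4-bit movemask; __builtin_popcount is a GCC/clang builtin.
    static inline int count_lanes(int movemask4) {
      return __builtin_popcount(movemask4 & 0xF);
    }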
////////////////////////////////////////////////////////////////////////////////
View file
@@ -68,8 +68,11 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
__forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
+#if !defined(__aarch64__)
__forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+#else
+__forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {}
+#endif
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
View file
@@ -181,13 +181,20 @@ namespace embree
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
-#else
+#elif !defined(__aarch64__)
__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
+#else
+__forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); }
+__forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
+__forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); }
+__forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
+__forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
+__forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); }
#endif
__forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); }
View file
@@ -10,18 +10,18 @@ namespace embree
struct vfloat<4>
{
ALIGNED_STRUCT_(16);
typedef vboolf4 Bool;
typedef vint4 Int;
typedef vfloat4 Float;
enum { size = 4 }; // number of SIMD elements
union { __m128 v; float f[4]; int i[4]; }; // data
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vfloat() {}
__forceinline vfloat(const vfloat4& other) { v = other.v; }
__forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
@@ -34,14 +34,19 @@ namespace embree
__forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
__forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
+#if defined(__aarch64__)
+__forceinline explicit vfloat(const vuint4& x) {
+v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+}
+#else
__forceinline explicit vfloat(const vuint4& x) {
const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31
const __m128 af = _mm_cvtepi32_ps(a);
const __m128 bf = _mm_castsi128_ps(b);
v = _mm_add_ps(af,bf);
}
+#endif
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
@@ -102,32 +107,44 @@ namespace embree
#if defined (__SSE4_1__)
return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
#else
return _mm_load_ps(ptr);
#endif
}
-#if defined(__SSE4_1__)
-static __forceinline vfloat4 load(const char* ptr) {
+#if defined(__aarch64__)
+static __forceinline vfloat4 load(const int8_t* ptr) {
+return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+}
+#elif defined(__SSE4_1__)
+static __forceinline vfloat4 load(const int8_t* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
#else
-static __forceinline vfloat4 load(const char* ptr) {
+static __forceinline vfloat4 load(const int8_t* ptr) {
return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
#endif
-#if defined(__SSE4_1__)
-static __forceinline vfloat4 load(const unsigned char* ptr) {
+#if defined(__aarch64__)
+static __forceinline vfloat4 load(const uint8_t* ptr) {
+return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+}
+#elif defined(__SSE4_1__)
+static __forceinline vfloat4 load(const uint8_t* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
#else
-static __forceinline vfloat4 load(const unsigned char* ptr) {
+static __forceinline vfloat4 load(const uint8_t* ptr) {
//return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions
return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
}
#endif
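_mm_load4epi8_f32 and _mm_load4epu8_f32 again look like compatibility-layer helpers (assumed from context) that load four 8-bit values and widen them to float. In plain NEON the same widening is a chain of vmovl moves plus a convert; note the 8-byte vld1_u8 reads past the four used bytes, just as the SSE4.1 path's 16-byte _mm_loadu_si128 does:

    #include <arm_neon.h>
    #include <cstdint>

    static inline float32x4_t load4_u8_to_f32(const uint8_t* p) {
      uint8x8_t  b = vld1_u8(p);                  // loads 8 bytes, only 4 are used
      uint16x8_t h = vmovl_u8(b);                 // u8  -> u16
      uint32x4_t w = vmovl_u16(vget_low_u16(h));  // low four u16 -> u32
      return vcvtq_f32_u32(w);                    // u32 -> f32
    }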
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+static __forceinline vfloat4 load(const short* ptr) {
+return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+}
+#elif defined(__SSE4_1__)
static __forceinline vfloat4 load(const short* ptr) {
return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
}
@@ -140,11 +157,15 @@ namespace embree
static __forceinline vfloat4 load(const unsigned short* ptr) {
return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f));
}
static __forceinline void store_nt(void* ptr, const vfloat4& v)
{
#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+_mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v));
+#else
_mm_stream_ps((float*)ptr,v);
+#endif
#else
_mm_store_ps((float*)ptr,v);
#endif
@@ -152,14 +173,14 @@ namespace embree
template<int scale = 4>
static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_ps(ptr, index, scale);
#else
return vfloat4(
-*(float*)(((char*)ptr)+scale*index[0]),
-*(float*)(((char*)ptr)+scale*index[1]),
-*(float*)(((char*)ptr)+scale*index[2]),
-*(float*)(((char*)ptr)+scale*index[3]));
+*(float*)(((int8_t*)ptr)+scale*index[0]),
+*(float*)(((int8_t*)ptr)+scale*index[1]),
+*(float*)(((int8_t*)ptr)+scale*index[2]),
+*(float*)(((int8_t*)ptr)+scale*index[3]));
#endif
}
@@ -168,13 +189,13 @@ namespace embree
vfloat4 r = zero;
#if defined(__AVX512VL__)
return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
#else
-if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
-if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
-if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
-if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
+if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
+if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
return r;
#endif
}
@ -185,10 +206,10 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm_i32scatter_ps((float*)ptr, index, v, scale); _mm_i32scatter_ps((float*)ptr, index, v, scale);
#else #else
*(float*)(((char*)ptr)+scale*index[0]) = v[0]; *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
*(float*)(((char*)ptr)+scale*index[1]) = v[1]; *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
*(float*)(((char*)ptr)+scale*index[2]) = v[2]; *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
*(float*)(((char*)ptr)+scale*index[3]) = v[3]; *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif #endif
} }
@ -198,20 +219,20 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
#else #else
if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2]; if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3]; if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif #endif
} }
static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) { static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) {
scatter<1>(mask,ptr,ofs,v); scatter<1>(mask,ptr,ofs,v);
} }
static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) {
scatter<4>(mask,ptr,ofs,v); scatter<4>(mask,ptr,ofs,v);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Array Access /// Array Access
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -222,10 +243,10 @@ namespace embree
friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
return _mm_mask_blend_ps(m, f, t); return _mm_mask_blend_ps(m, f, t);
#elif defined(__SSE4_1__) #elif defined(__SSE4_1__) || (defined(__aarch64__))
return _mm_blendv_ps(f, t, m); return _mm_blendv_ps(f, t, m);
#else #else
return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
#endif #endif
} }
}; };
@ -243,18 +264,47 @@ namespace embree
__forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); }
__forceinline vfloat4 operator +(const vfloat4& a) { return a; } __forceinline vfloat4 operator +(const vfloat4& a) { return a; }
#if defined(__aarch64__)
__forceinline vfloat4 operator -(const vfloat4& a) {
return vnegq_f32(a);
}
#else
__forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif
#if defined(__aarch64__)
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
#else
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
#endif
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
__forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
#else #else
__forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
#endif #endif
#if defined(__aarch64__)
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); }
#else
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif
__forceinline vfloat4 rcp(const vfloat4& a) __forceinline vfloat4 rcp(const vfloat4& a)
{ {
#if defined(__aarch64__)
#if defined(BUILD_IOS)
return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
#else //BUILD_IOS
__m128 reciprocal = _mm_rcp_ps(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
// +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp.
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
return (const vfloat4)reciprocal;
#endif // BUILD_IOS
#else
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
const vfloat4 r = _mm_rcp14_ps(a); const vfloat4 r = _mm_rcp14_ps(a);
#else #else
@ -266,12 +316,22 @@ namespace embree
#else #else
return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
#endif #endif
#endif //defined(__aarch64__)
} }
__forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
__forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
__forceinline vfloat4 rsqrt(const vfloat4& a) __forceinline vfloat4 rsqrt(const vfloat4& a)
{ {
#if defined(__aarch64__)
vfloat4 r = _mm_rsqrt_ps(a);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
return r;
#else
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
const vfloat4 r = _mm_rsqrt14_ps(a); const vfloat4 r = _mm_rsqrt14_ps(a);
#else #else
@ -284,11 +344,17 @@ namespace embree
#else #else
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
#endif #endif
} }
__forceinline vboolf4 isnan(const vfloat4& a) { __forceinline vboolf4 isnan(const vfloat4& a) {
#if defined(__aarch64__)
const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff));
#else
const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
#endif
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT);
#else #else
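A note on the NEON reciprocal path above: `vrecpsq_f32(a, x)` computes `2 - a*x`, so `vmulq_f32(vrecpsq_f32(a, x), x)` is exactly one Newton-Raphson step toward `1/a`, roughly doubling the number of correct bits each time. A self-contained sketch starting from the raw NEON estimate (the diff starts from the `_mm_rcp_ps` shim instead, hence its extra third step):

    #include <arm_neon.h>

    static inline float32x4_t rcp_nr_sketch(float32x4_t a) {
        float32x4_t x = vrecpeq_f32(a);          // ~8-bit estimate of 1/a
        x = vmulq_f32(vrecpsq_f32(a, x), x);     // NR step: x *= (2 - a*x), ~16 bits
        x = vmulq_f32(vrecpsq_f32(a, x), x);     // second step, ~full float precision
        return x;
    }

The rsqrt path is the same pattern with `vrsqrteq_f32`/`vrsqrtsq_f32`, whose correction term is `(3 - a*x*x)/2`.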
@@ -329,7 +395,8 @@ namespace embree
 __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); }
 __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
 __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
 const vint4 ai = _mm_castps_si128(a);
 const vint4 bi = _mm_castps_si128(b);
@@ -377,10 +444,24 @@ namespace embree
 __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); }
 __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
 #else
+#if defined(__aarch64__)
+__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+return _mm_madd_ps(a, b, c); //a*b+c;
+}
+__forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+return _mm_msub_ps(a, b, c); //-a*b+c;
+}
+__forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) {
+return vnegq_f32(vfmaq_f32(c,a, b));
+}
+#else
 __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
-__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
 __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
 __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+#endif
+__forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
 #endif
 ////////////////////////////////////////////////////////////////////////////////
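`_mm_madd_ps` / `_mm_msub_ps` here are again shim names rather than real SSE intrinsics; judging by the trailing comments they wrap the NEON fused multiply-add ops. A sketch of the identities the branch relies on:

    #include <arm_neon.h>

    // vfmaq_f32(c, a, b) ==  a*b + c   -> madd
    // vfmsq_f32(c, a, b) == -a*b + c   -> nmadd
    static inline float32x4_t madd_sketch (float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(c, a, b); }
    static inline float32x4_t nmadd_sketch(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmsq_f32(c, a, b); }
    static inline float32x4_t nmsub_sketch(float32x4_t a, float32x4_t b, float32x4_t c) { return vnegq_f32(vfmaq_f32(c, a, b)); }  // -(a*b + c)

Note these are fused (a single rounding), unlike the x86 fallback's plain `a*b+c`, so results can differ in the last bit.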
@@ -414,8 +495,13 @@ namespace embree
 __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
 __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
 __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+#if defined(__aarch64__)
+__forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
+__forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
+#else
 __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
 __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+#endif
 __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
 #endif
@@ -427,7 +513,7 @@ namespace embree
 __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); }
 __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; }
 __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); }
 __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; }
@@ -463,17 +549,68 @@ namespace embree
 template<int mask>
 __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f)
 {
 #if defined(__SSE4_1__)
 return _mm_blend_ps(f, t, mask);
 #else
 return select(vboolf4(mask), t, f);
 #endif
 }
+#if defined(__aarch64__)
+template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero));
+}
+template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F));
+}
+template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0));
+}
+template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF));
+}
+template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00));
+}
+template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F));
+}
+template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0));
+}
+template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF));
+}
+template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000));
+}
+template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F));
+}
+template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0));
+}
+template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF));
+}
+template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00));
+}
+template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F));
+}
+template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0));
+}
+template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) {
+return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF));
+}
+#endif
 __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
 return madd(t,b-a,a);
 }
 __forceinline bool isvalid(const vfloat4& v) {
 return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE)));
 }
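The `vzero`, `v000F`, ..., `vFFFF` constants are presumably precomputed per-lane bit masks defined elsewhere in the shim, where bit i of the immediate selects lane i from `t`. A generic sketch of the same blend without the baked-in constants:

    #include <arm_neon.h>

    // Hypothetical equivalent of select<mask>: build the lane mask at run time.
    static inline float32x4_t select_imm_sketch(int mask, float32x4_t t, float32x4_t f) {
        const uint32_t m[4] = { (mask & 1) ? 0xFFFFFFFFu : 0u,
                                (mask & 2) ? 0xFFFFFFFFu : 0u,
                                (mask & 4) ? 0xFFFFFFFFu : 0u,
                                (mask & 8) ? 0xFFFFFFFFu : 0u };
        return vbslq_f32(vld1q_u32(m), t, f);  // bit-select per lane: mask ? t : f
    }

The explicit specializations exist so that each mask is a compile-time constant load instead of being rebuilt on every call.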
@@ -485,16 +622,21 @@ namespace embree
 __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) {
 return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
 }
 ////////////////////////////////////////////////////////////////////////////////
 /// Rounding Functions
 ////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
-__forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
-__forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
-__forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
-__forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#if defined(__aarch64__)
+__forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
+__forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
+__forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
+__forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
+#elif defined (__SSE4_1__)
+__forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+__forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+__forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
+__forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd
 #else
 __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
 __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
@@ -504,7 +646,9 @@ namespace embree
 __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
 __forceinline vint4 floori(const vfloat4& a) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+return vcvtq_s32_f32(floor(a));
+#elif defined(__SSE4_1__)
 return vint4(floor(a));
 #else
 return vint4(a-vfloat4(0.5f));
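The rounding-mode mapping used above, for reference (all four NEON ops round a full float32x4 lane-wise):

    // _MM_FROUND_TO_NEG_INF     -> vrndmq_f32  (floor, toward -inf)
    // _MM_FROUND_TO_POS_INF     -> vrndpq_f32  (ceil,  toward +inf)
    // _MM_FROUND_TO_ZERO        -> vrndq_f32   (trunc, toward zero)
    // _MM_FROUND_TO_NEAREST_INT -> vrndnq_f32  (round, ties to even)

Both ISAs use ties-to-even for `round`, so the two paths should agree bit-for-bit on representable inputs.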
@@ -518,6 +662,16 @@ namespace embree
 __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
 __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
+#if defined(__aarch64__)
+template<int i0, int i1, int i2, int i3>
+__forceinline vfloat4 shuffle(const vfloat4& v) {
+return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+}
+template<int i0, int i1, int i2, int i3>
+__forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+}
+#else
 template<int i0, int i1, int i2, int i3>
 __forceinline vfloat4 shuffle(const vfloat4& v) {
 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -527,14 +681,19 @@ namespace embree
 __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
 return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
+#endif
 #if defined (__SSSE3__)
 __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) {
 return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
 }
 #endif
-#if defined(__SSE3__)
+#if defined(__aarch64__)
+template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); }
+template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); }
+template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); }
+#elif defined(__SSE3__)
 template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
 template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
 template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
@@ -545,14 +704,56 @@ namespace embree
 return shuffle<i,i,i,i>(v);
 }
-#if defined (__SSE4_1__) && !defined(__GNUC__)
+#if defined(__aarch64__)
+template<int i> __forceinline float extract(const vfloat4& a);
+template<> __forceinline float extract<0>(const vfloat4& b) {
+return b[0];
+}
+template<> __forceinline float extract<1>(const vfloat4& b) {
+return b[1];
+}
+template<> __forceinline float extract<2>(const vfloat4& b) {
+return b[2];
+}
+template<> __forceinline float extract<3>(const vfloat4& b) {
+return b[3];
+}
+#elif defined (__SSE4_1__) && !defined(__GNUC__)
 template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
+template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
 #else
 template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
-#endif
 template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+#endif
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b);
+template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b)
+{
+vfloat4 c = a;
+c[0] = b;
+return c;
+}
+template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b)
+{
+vfloat4 c = a;
+c[1] = b;
+return c;
+}
+template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b)
+{
+vfloat4 c = a;
+c[2] = b;
+return c;
+}
+template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b)
+{
+vfloat4 c = a;
+c[3] = b;
+return c;
+}
+#elif defined (__SSE4_1__)
 template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
 template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
 template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
@@ -561,14 +762,19 @@ namespace embree
 template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
 #endif
+#if defined(__aarch64__)
+__forceinline float toScalar(const vfloat4& v) {
+return v[0];
+}
+#else
 __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
+#endif
 __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) {
 return vfloat4::broadcast(&a[k]);
 }
 __forceinline vfloat4 shift_right_1(const vfloat4& x) {
 return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4));
 }
 #if defined (__AVX2__)
@@ -584,7 +790,7 @@ namespace embree
 template<int i>
 __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
 return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
 }
 #endif
@@ -658,28 +864,39 @@ namespace embree
 ////////////////////////////////////////////////////////////////////////////////
 /// Reductions
 ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+__forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+__forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+__forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else
 __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
 __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
 __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
+#endif
+#if defined(__aarch64__)
+__forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+__forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+__forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else
 __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
 __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
 __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
 __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
 {
 const vfloat4 a = select(valid,v,vfloat4(pos_inf));
 const vbool4 valid_min = valid & (a == vreduce_min(a));
 return bsf(movemask(any(valid_min) ? valid_min : valid));
 }
 __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
 {
 const vfloat4 a = select(valid,v,vfloat4(neg_inf));
 const vbool4 valid_max = valid & (a == vreduce_max(a));
 return bsf(movemask(any(valid_max) ? valid_max : valid));
 }
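AArch64 folds each horizontal reduction into a single across-vector instruction, where the SSE path needs log2(4) = 2 shuffle+op rounds. A sketch of both shapes, expressed in NEON for comparison:

    #include <arm_neon.h>

    static inline float reduce_min_ladder(float32x4_t v) {        // SSE-style ladder
        float32x4_t h = vminq_f32(vrev64q_f32(v), v);             // pair swap: {1,0,3,2}
        h = vminq_f32(vextq_f32(h, h, 2), h);                     // half swap: {2,3,0,1}
        return vgetq_lane_f32(h, 0);
    }
    static inline float reduce_min_direct(float32x4_t v) {
        return vminvq_f32(v);                                     // one instruction
    }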
 ////////////////////////////////////////////////////////////////////////////////
 /// Euclidian Space Operators
 ////////////////////////////////////////////////////////////////////////////////
@@ -694,7 +911,7 @@ namespace embree
 const vfloat4 b0 = shuffle<1,2,0,3>(b);
 const vfloat4 a1 = shuffle<1,2,0,3>(a);
 const vfloat4 b1 = b;
-return shuffle<1,2,0,3>(msub(a0,b0,a1*b1));
+return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1));
 }
 ////////////////////////////////////////////////////////////////////////////////
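The old `msub(a0,b0,a1*b1)` form rounds `a1*b1` before the subtraction; `prod_diff` presumably computes `a0*b0 - a1*b1` with an FMA-based correction (the classic difference-of-products trick), which avoids catastrophic cancellation in near-degenerate cross products. A scalar sketch of that construction, assuming prod_diff follows it:

    #include <cmath>

    static inline float prod_diff_sketch(float a, float b, float c, float d) {
        float cd  = c * d;
        float err = std::fma(-c, d, cd);   // exact rounding error of c*d
        float dop = std::fma(a, b, -cd);   // a*b - cd with a single rounding
        return dop + err;                  // accurate a*b - c*d
    }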
@@ -33,7 +33,7 @@ namespace embree
 __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
 __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
-__forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {}
+__forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {}
 __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
 __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
 __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
@@ -75,7 +75,7 @@ namespace embree
 return _mm256_broadcast_ps((__m128*)ptr);
 }
-static __forceinline vfloat8 load(const char* ptr) {
+static __forceinline vfloat8 load(const int8_t* ptr) {
 #if defined(__AVX2__)
 return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
 #else
@@ -83,7 +83,7 @@ namespace embree
 #endif
 }
-static __forceinline vfloat8 load(const unsigned char* ptr) {
+static __forceinline vfloat8 load(const uint8_t* ptr) {
 #if defined(__AVX2__)
 return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
 #else
@@ -119,6 +119,12 @@ namespace embree
 static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
 static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__aarch64__)
+static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
+static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
+static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
+static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
 #else
 static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
 static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
@@ -139,18 +145,18 @@ namespace embree
 template<int scale = 4>
 static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
 return _mm256_i32gather_ps(ptr, index ,scale);
 #else
 return vfloat8(
-*(float*)(((char*)ptr)+scale*index[0]),
-*(float*)(((char*)ptr)+scale*index[1]),
-*(float*)(((char*)ptr)+scale*index[2]),
-*(float*)(((char*)ptr)+scale*index[3]),
-*(float*)(((char*)ptr)+scale*index[4]),
-*(float*)(((char*)ptr)+scale*index[5]),
-*(float*)(((char*)ptr)+scale*index[6]),
-*(float*)(((char*)ptr)+scale*index[7]));
+*(float*)(((int8_t*)ptr)+scale*index[0]),
+*(float*)(((int8_t*)ptr)+scale*index[1]),
+*(float*)(((int8_t*)ptr)+scale*index[2]),
+*(float*)(((int8_t*)ptr)+scale*index[3]),
+*(float*)(((int8_t*)ptr)+scale*index[4]),
+*(float*)(((int8_t*)ptr)+scale*index[5]),
+*(float*)(((int8_t*)ptr)+scale*index[6]),
+*(float*)(((int8_t*)ptr)+scale*index[7]));
 #endif
 }
@@ -159,17 +165,17 @@ namespace embree
 vfloat8 r = zero;
 #if defined(__AVX512VL__)
 return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) && !defined(__aarch64__)
 return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
 #else
-if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
-if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
-if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]);
-if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]);
-if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]);
-if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]);
-if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]);
-if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]);
+if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
+if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
+if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]);
+if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]);
+if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]);
+if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]);
 return r;
 #endif
 }
@@ -180,14 +186,14 @@ namespace embree
 #if defined(__AVX512VL__)
 _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
 #else
-*(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
-*(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
-*(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
-*(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
-*(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
-*(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
-*(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
-*(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+*(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+*(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+*(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+*(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+*(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+*(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+*(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+*(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
 #endif
 }
@@ -197,18 +203,18 @@ namespace embree
 #if defined(__AVX512VL__)
 _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
 #else
-if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
-if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
-if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2];
-if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3];
-if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4];
-if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5];
-if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6];
-if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7];
+if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
 #endif
 }
-static __forceinline void store(const vboolf8& mask, char* ptr, const vint8& ofs, const vfloat8& v) {
+static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) {
 scatter<1>(mask,ptr,ofs,v);
 }
 static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) {
@@ -235,27 +241,60 @@ namespace embree
 __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }
 __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+#if !defined(__aarch64__)
 __forceinline vfloat8 operator -(const vfloat8& a) {
 const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 return _mm256_xor_ps(a, mask);
 }
-__forceinline vfloat8 abs(const vfloat8& a) {
-const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
-return _mm256_and_ps(a, mask);
-}
+#else
+__forceinline vfloat8 operator -(const vfloat8& a) {
+__m256 res;
+res.lo = vnegq_f32(a.v.lo);
+res.hi = vnegq_f32(a.v.hi);
+return res;
+}
+#endif
+#if !defined(__aarch64__)
+__forceinline vfloat8 abs(const vfloat8& a) {
+const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+return _mm256_and_ps(a, mask);
+}
+#else
+__forceinline vfloat8 abs(const vfloat8& a) {
+__m256 res;
+res.lo = vabsq_f32(a.v.lo);
+res.hi = vabsq_f32(a.v.hi);
+return res;
+}
+#endif
+#if !defined(__aarch64__)
 __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+#else
+__forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
+#endif
 __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
 static __forceinline vfloat8 rcp(const vfloat8& a)
 {
+#if defined(BUILD_IOS) && defined(__aarch64__)
+// ios devices are faster doing full divide, no need for NR fixup
+vfloat8 ret;
+const float32x4_t one = vdupq_n_f32(1.0f);
+ret.v.lo = vdivq_f32(one, a.v.lo);
+ret.v.hi = vdivq_f32(one, a.v.hi);
+return ret;
+#endif
 #if defined(__AVX512VL__)
 const vfloat8 r = _mm256_rcp14_ps(a);
 #else
 const vfloat8 r = _mm256_rcp_ps(a);
 #endif
-#if defined(__AVX2__)
+#if defined(__AVX2__) //&& !defined(aarch64)
 return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
 #else
 return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
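The `a.v.lo` / `a.v.hi` accesses imply that on AArch64 the shim defines `__m256` as a pair of 128-bit NEON halves rather than a native 256-bit register; every AVX-width op is then applied once per half. A sketch of that assumed layout (names illustrative only):

    #include <arm_neon.h>

    struct m256_sketch { float32x4_t lo, hi; };   // assumed shim layout of __m256

    static inline m256_sketch neg256_sketch(m256_sketch a) {
        return { vnegq_f32(a.lo), vnegq_f32(a.hi) };  // apply the op to each half
    }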
@@ -404,17 +443,29 @@ namespace embree
 static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
 return _mm256_mask_blend_ps(m, f, t);
 }
-#else
-static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
-static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
-static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
-static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
-static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
-static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
-static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+#elif !defined(__aarch64__)
+__forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
+__forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
+__forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
+__forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
+__forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
+__forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
+__forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
 return _mm256_blendv_ps(f, t, m);
 }
+#else
+__forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); }
+__forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
+__forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); }
+__forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); }
+__forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); }
+__forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); }
+__forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+return _mm256_blendv_ps(f, t, m);
+}
 #endif
 template<int mask>
@@ -483,10 +534,17 @@ namespace embree
 /// Rounding Functions
 ////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__)
 __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
 __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
 __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
 __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else
+__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
+__forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+#endif
 __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); }
 ////////////////////////////////////////////////////////////////////////////////
@@ -521,9 +579,11 @@ namespace embree
 return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
+#if !defined(__aarch64__)
 template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
 template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
 template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+#endif
 __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
 template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
@@ -534,8 +594,8 @@ namespace embree
 __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); }
-#if defined (__AVX2__)
-static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
+#if defined (__AVX2__) && !defined(__aarch64__)
+__forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
 return _mm256_permutevar8x32_ps(a, index);
 }
 #endif
@@ -639,7 +699,7 @@ namespace embree
 ////////////////////////////////////////////////////////////////////////////////
 /// Reductions
 ////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__)
 __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
 __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
 __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
@@ -655,7 +715,14 @@ namespace embree
 __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
 __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
 __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+#else
+__forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); }
+__forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
+__forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); }
+__forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
+__forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); }
+#endif
 __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
 {
 const vfloat8 a = select(valid,v,vfloat8(pos_inf));
@@ -90,10 +90,10 @@ namespace embree
 static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); }
-static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
+static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); }
 static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); }
-static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
 static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
 static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); }
@ -23,7 +23,7 @@ namespace embree
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators /// Constructors, Assignment & Cast Operators
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__forceinline vint() {} __forceinline vint() {}
__forceinline vint(const vint4& a) { v = a.v; } __forceinline vint(const vint4& a) { v = a.v; }
__forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
@ -68,7 +68,7 @@ namespace embree
static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
@ -98,61 +98,81 @@ namespace embree
#endif #endif
#if defined(__SSE4_1__) #if defined(__aarch64__)
static __forceinline vint4 load(const unsigned char* ptr) { static __forceinline vint4 load(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
static __forceinline vint4 loadu(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
#elif defined(__SSE4_1__)
static __forceinline vint4 load(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
} }
static __forceinline vint4 loadu(const unsigned char* ptr) { static __forceinline vint4 loadu(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
} }
#else #else
static __forceinline vint4 load(const unsigned char* ptr) { static __forceinline vint4 load(const uint8_t* ptr) {
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
} }
static __forceinline vint4 loadu(const unsigned char* ptr) { static __forceinline vint4 loadu(const uint8_t* ptr) {
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
} }
#endif #endif
static __forceinline vint4 load(const unsigned short* ptr) { static __forceinline vint4 load(const unsigned short* ptr) {
#if defined (__SSE4_1__) #if defined(__aarch64__)
return __m128i(vmovl_u16(vld1_u16(ptr)));
#elif defined (__SSE4_1__)
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
#else #else
return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
#endif #endif
} }
static __forceinline void store(unsigned char* ptr, const vint4& v) { static __forceinline void store(uint8_t* ptr, const vint4& v) {
#if defined(__SSE4_1__) #if defined(__aarch64__)
int32x4_t x = v;
uint16x4_t y = vqmovn_u32(uint32x4_t(x));
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
#elif defined(__SSE4_1__)
__m128i x = v; __m128i x = v;
x = _mm_packus_epi32(x, x); x = _mm_packus_epi32(x, x);
x = _mm_packus_epi16(x, x); x = _mm_packus_epi16(x, x);
*(int*)ptr = _mm_cvtsi128_si32(x); *(int*)ptr = _mm_cvtsi128_si32(x);
#else #else
for (size_t i=0;i<4;i++) for (size_t i=0;i<4;i++)
ptr[i] = (unsigned char)v[i]; ptr[i] = (uint8_t)v[i];
#endif #endif
} }
static __forceinline void store(unsigned short* ptr, const vint4& v) { static __forceinline void store(unsigned short* ptr, const vint4& v) {
#if defined(__aarch64__)
uint32x4_t x = uint32x4_t(v.v);
uint16x4_t y = vqmovn_u32(x);
vst1_u16(ptr, y);
#else
for (size_t i=0;i<4;i++) for (size_t i=0;i<4;i++)
ptr[i] = (unsigned short)v[i]; ptr[i] = (unsigned short)v[i];
#endif
} }
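Both new aarch64 store paths above narrow with saturating NEON instructions: vqmovn_u32 clamps each unsigned 32-bit lane to [0, 65535], and vqmovn_u16 clamps to [0, 255]. A small worked example (values chosen for illustration):

uint32x4_t v = {1u, 70000u, 0xFFFFFFFFu, 42u};
uint16x4_t n = vqmovn_u32(v);   // {1, 65535, 65535, 42} -- saturated
uint16_t out[4];
vst1_u16(out, n);               // matches the new store(unsigned short*) path

One caveat worth noting: the SSE4.1 byte path uses _mm_packus_epi32, which saturates signed input (negative lanes become 0), while the NEON path reinterprets the lanes as unsigned first, so the two branches can disagree for negative inputs; presumably only non-negative values reach these stores.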
static __forceinline vint4 load_nt(void* ptr) { static __forceinline vint4 load_nt(void* ptr) {
#if defined(__SSE4_1__) #if defined(__aarch64__) || defined(__SSE4_1__)
return _mm_stream_load_si128((__m128i*)ptr); return _mm_stream_load_si128((__m128i*)ptr);
#else #else
return _mm_load_si128((__m128i*)ptr); return _mm_load_si128((__m128i*)ptr);
#endif #endif
} }
static __forceinline void store_nt(void* ptr, const vint4& v) { static __forceinline void store_nt(void* ptr, const vint4& v) {
#if defined(__SSE4_1__) #if !defined(__aarch64__) && defined(__SSE4_1__)
_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
#else #else
_mm_store_si128((__m128i*)ptr,v); _mm_store_si128((__m128i*)ptr,v);
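The added !defined(__aarch64__) in store_nt looks redundant next to __SSE4_1__, but the compatibility layer can define the SSE feature macros on ARM, so the extra check is what actually routes aarch64 onto the plain store; NEON has no direct counterpart to the movntps streaming-store hint. The effective aarch64 behavior reduces to:

// Effective aarch64 path (sketch): the non-temporal hint is simply dropped.
static inline void store_nt_sketch(void* ptr, int32x4_t v) {
  vst1q_s32((int32_t*)ptr, v); // regular 128-bit store
}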
@ -161,14 +181,14 @@ namespace embree
template<int scale = 4> template<int scale = 4>
static __forceinline vint4 gather(const int* ptr, const vint4& index) { static __forceinline vint4 gather(const int* ptr, const vint4& index) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_epi32(ptr, index, scale); return _mm_i32gather_epi32(ptr, index, scale);
#else #else
return vint4( return vint4(
*(int*)(((char*)ptr)+scale*index[0]), *(int*)(((int8_t*)ptr)+scale*index[0]),
*(int*)(((char*)ptr)+scale*index[1]), *(int*)(((int8_t*)ptr)+scale*index[1]),
*(int*)(((char*)ptr)+scale*index[2]), *(int*)(((int8_t*)ptr)+scale*index[2]),
*(int*)(((char*)ptr)+scale*index[3])); *(int*)(((int8_t*)ptr)+scale*index[3]));
#endif #endif
} }
@ -177,13 +197,13 @@ namespace embree
vint4 r = zero; vint4 r = zero;
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
#elif defined(__AVX2__) #elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
#else #else
if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
return r; return r;
#endif #endif
} }
@ -194,10 +214,10 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm_i32scatter_epi32((int*)ptr, index, v, scale); _mm_i32scatter_epi32((int*)ptr, index, v, scale);
#else #else
*(int*)(((char*)ptr)+scale*index[0]) = v[0]; *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0];
*(int*)(((char*)ptr)+scale*index[1]) = v[1]; *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
*(int*)(((char*)ptr)+scale*index[2]) = v[2]; *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
*(int*)(((char*)ptr)+scale*index[3]) = v[3]; *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif #endif
} }
@ -207,14 +227,14 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
#else #else
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2]; if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3]; if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
#endif #endif
} }
#if defined(__x86_64__) #if defined(__x86_64__) || defined(__aarch64__)
static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
#endif #endif
@ -228,10 +248,12 @@ namespace embree
friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
#elif defined(__aarch64__)
return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
#elif defined(__SSE4_1__) #elif defined(__SSE4_1__)
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
#else #else
return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
#endif #endif
} }
}; };
@ -248,7 +270,9 @@ namespace embree
__forceinline vint4 operator +(const vint4& a) { return a; } __forceinline vint4 operator +(const vint4& a) { return a; }
__forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
#if defined(__SSSE3__) #if defined(__aarch64__)
__forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
#elif defined(__SSSE3__)
__forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
#endif #endif
@ -264,7 +288,7 @@ namespace embree
__forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); }
__forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; }
#if defined(__SSE4_1__) #if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
#else #else
__forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
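The multiply guard is widened here because a packed 32-bit multiply only exists from SSE4.1 onward (_mm_mullo_epi32) on x86, whereas NEON has always had one; on aarch64 the intrinsic presumably maps straight to it in the shim layer:

// Assumed mapping in the compatibility layer, shown for clarity only:
static inline int32x4_t mullo_epi32_sketch(int32x4_t a, int32x4_t b) {
  return vmulq_s32(a, b); // per-lane multiply, low 32 bits of each product
}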
@ -284,34 +308,34 @@ namespace embree
__forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); }
__forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; }
__forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
__forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); } __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); }
__forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
__forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
__forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators /// Assignment Operators
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
__forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; }
__forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
__forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; }
#if defined(__SSE4_1__) #if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
__forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; }
#endif #endif
__forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
__forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; }
__forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
__forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; }
__forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
__forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }
@ -378,14 +402,15 @@ namespace embree
template<int mask> template<int mask>
__forceinline vint4 select(const vint4& t, const vint4& f) { __forceinline vint4 select(const vint4& t, const vint4& f) {
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
#else #else
return select(vboolf4(mask), t, f); return select(vboolf4(mask), t, f);
#endif #endif
} }
#if defined(__SSE4_1__)
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
__forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
@ -409,16 +434,25 @@ namespace embree
__forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
__forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) {
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3> template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) { __forceinline vint4 shuffle(const vint4& v) {
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
} }
template<int i0, int i1, int i2, int i3> template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) { __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
} }
#endif
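The NEON shuffles above are byte-table lookups: _MN_SHUFFLE and _MF_SHUFFLE are embree-aarch64 macros (not shown in this diff) that presumably expand the four lane indices into sixteen byte indices for vqtbl1q_u8/vqtbl2q_u8. A hand-rolled equivalent of the single-vector case:

// Sketch of what shuffle<i0,i1,i2,i3> computes on aarch64; the index table
// places element iK into lane K, four bytes at a time.
template<int i0, int i1, int i2, int i3>
static inline int32x4_t shuffle_sketch(int32x4_t v) {
  const uint8x16_t idx = {
    uint8_t(4*i0), uint8_t(4*i0+1), uint8_t(4*i0+2), uint8_t(4*i0+3),
    uint8_t(4*i1), uint8_t(4*i1+1), uint8_t(4*i1+2), uint8_t(4*i1+3),
    uint8_t(4*i2), uint8_t(4*i2+1), uint8_t(4*i2+2), uint8_t(4*i2+3),
    uint8_t(4*i3), uint8_t(4*i3+1), uint8_t(4*i3+2), uint8_t(4*i3+3)};
  return vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(v), idx));
}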
#if defined(__SSE3__) #if defined(__SSE3__)
template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@ -430,7 +464,10 @@ namespace embree
return shuffle<i,i,i,i>(v); return shuffle<i,i,i,i>(v);
} }
#if defined(__SSE4_1__) #if defined(__aarch64__)
template<int src> __forceinline int extract(const vint4& b);
template<int dst> __forceinline vint4 insert(const vint4& a, const int b);
#elif defined(__SSE4_1__)
template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
#else #else
@ -438,19 +475,69 @@ namespace embree
template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
#endif #endif
#if defined(__aarch64__)
template<> __forceinline int extract<0>(const vint4& b) {
return b.v[0];
}
template<> __forceinline int extract<1>(const vint4& b) {
return b.v[1];
}
template<> __forceinline int extract<2>(const vint4& b) {
return b.v[2];
}
template<> __forceinline int extract<3>(const vint4& b) {
return b.v[3];
}
template<> __forceinline vint4 insert<0>(const vint4& a, int b)
{
vint4 c = a;
c[0] = b;
return c;
}
template<> __forceinline vint4 insert<1>(const vint4& a, int b)
{
vint4 c = a;
c[1] = b;
return c;
}
template<> __forceinline vint4 insert<2>(const vint4& a, int b)
{
vint4 c = a;
c[2] = b;
return c;
}
template<> __forceinline vint4 insert<3>(const vint4& a, int b)
{
vint4 c = a;
c[3] = b;
return c;
}
__forceinline int toScalar(const vint4& v) {
return v[0];
}
__forceinline size_t toSizeT(const vint4& v) {
uint64x2_t x = uint64x2_t(v.v);
return x[0];
}
#else
template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
__forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
__forceinline size_t toSizeT(const vint4& v) { __forceinline size_t toSizeT(const vint4& v) {
#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround #if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
return toScalar(v); return toScalar(v);
#elif defined(__ARM_NEON)
// FIXME(LTE): Do we need a swap (i.e., use lane 1)?
return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
#else #else
return _mm_cvtsi128_si64(v); return _mm_cvtsi128_si64(v);
#endif #endif
} }
#endif
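toSizeT on aarch64 reinterprets the four 32-bit lanes as two 64-bit lanes and reads lane 0, which on a little-endian target combines elements 0 and 1 exactly as _mm_cvtsi128_si64 does on x86; that also answers the FIXME in the __ARM_NEON fallback above, since lane 0 is the correct choice there. Worked example (illustrative values):

int32x4_t v = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
uint64x2_t q = vreinterpretq_u64_s32(v);
size_t s = vgetq_lane_u64(q, 0); // 0x2222222211111111 (little-endian)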
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
__forceinline vint4 permute(const vint4 &a, const vint4 &index) { __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
@ -459,15 +546,25 @@ namespace embree
template<int i> template<int i>
__forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
return _mm_alignr_epi32(a, b, i); return _mm_alignr_epi32(a, b, i);
} }
#endif #endif
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Reductions /// Reductions
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__) #if defined(__aarch64__) || defined(__SSE4_1__)
#if defined(__aarch64__)
__forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); }
__forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
__forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
__forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
__forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
__forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
#else
__forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
__forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
__forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; }
@ -475,7 +572,8 @@ namespace embree
__forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
__forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
__forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
#endif
__forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
__forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
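On aarch64 the horizontal reductions collapse to single across-vector instructions instead of the shuffle/min ladders used on SSE. A quick worked example:

int32x4_t v = {7, -2, 9, 4};
int lo  = vminvq_s32(v);  // -2
int hi  = vmaxvq_s32(v);  //  9
int sum = vaddvq_s32(v);  // 18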
@ -494,7 +592,7 @@ namespace embree
/// Sorting networks /// Sorting networks
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__) #if (defined(__aarch64__)) || defined(__SSE4_1__)
__forceinline vint4 usort_ascending(const vint4& v) __forceinline vint4 usort_ascending(const vint4& v)
{ {
View file
@ -71,20 +71,25 @@ namespace embree
static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
#if !defined(__aarch64__)
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
#else
static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
#endif
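The only difference between the two maskstore branches is the explicit .v: on aarch64 the vboolf8 wrapper evidently lacks the implicit conversion to the emulated __m256i, so the member is named directly. Shape of the pattern, with an assumed layout rather than Embree's actual definition:

// Assumed wrapper layout, for illustration only (needs <immintrin.h>):
struct vboolf8_sketch { __m256 v; };
static inline void maskstore_sketch(float* ptr, const vboolf8_sketch& m, __m256 f) {
  _mm256_maskstore_ps(ptr, (__m256i)m.v, f); // explicit .v on aarch64 builds
}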
static __forceinline void store_nt(void* ptr, const vint8& v) { static __forceinline void store_nt(void* ptr, const vint8& v) {
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
} }
static __forceinline vint8 load(const unsigned char* ptr) { static __forceinline vint8 load(const uint8_t* ptr) {
vint4 il = vint4::load(ptr+0); vint4 il = vint4::load(ptr+0);
vint4 ih = vint4::load(ptr+4); vint4 ih = vint4::load(ptr+4);
return vint8(il,ih); return vint8(il,ih);
} }
static __forceinline vint8 loadu(const unsigned char* ptr) { static __forceinline vint8 loadu(const uint8_t* ptr) {
vint4 il = vint4::loadu(ptr+0); vint4 il = vint4::loadu(ptr+0);
vint4 ih = vint4::loadu(ptr+4); vint4 ih = vint4::loadu(ptr+4);
return vint8(il,ih); return vint8(il,ih);
@ -102,7 +107,7 @@ namespace embree
return vint8(il,ih); return vint8(il,ih);
} }
static __forceinline void store(unsigned char* ptr, const vint8& i) { static __forceinline void store(uint8_t* ptr, const vint8& i) {
vint4 il(i.vl); vint4 il(i.vl);
vint4 ih(i.vh); vint4 ih(i.vh);
vint4::store(ptr + 0,il); vint4::store(ptr + 0,il);
@ -117,54 +122,54 @@ namespace embree
template<int scale = 4> template<int scale = 4>
static __forceinline vint8 gather(const int* ptr, const vint8& index) { static __forceinline vint8 gather(const int* ptr, const vint8& index) {
return vint8( return vint8(
*(int*)(((char*)ptr)+scale*index[0]), *(int*)(((int8_t*)ptr)+scale*index[0]),
*(int*)(((char*)ptr)+scale*index[1]), *(int*)(((int8_t*)ptr)+scale*index[1]),
*(int*)(((char*)ptr)+scale*index[2]), *(int*)(((int8_t*)ptr)+scale*index[2]),
*(int*)(((char*)ptr)+scale*index[3]), *(int*)(((int8_t*)ptr)+scale*index[3]),
*(int*)(((char*)ptr)+scale*index[4]), *(int*)(((int8_t*)ptr)+scale*index[4]),
*(int*)(((char*)ptr)+scale*index[5]), *(int*)(((int8_t*)ptr)+scale*index[5]),
*(int*)(((char*)ptr)+scale*index[6]), *(int*)(((int8_t*)ptr)+scale*index[6]),
*(int*)(((char*)ptr)+scale*index[7])); *(int*)(((int8_t*)ptr)+scale*index[7]));
} }
template<int scale = 4> template<int scale = 4>
static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) {
vint8 r = zero; vint8 r = zero;
if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]); if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]); if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]); if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]); if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]);
return r; return r;
} }
template<int scale = 4> template<int scale = 4>
static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
{ {
*(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
} }
template<int scale = 4> template<int scale = 4>
static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
{ {
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
} }
View file
@ -67,8 +67,8 @@ namespace embree
/// Loads and Stores /// Loads and Stores
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
@ -108,7 +108,7 @@ namespace embree
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
} }
static __forceinline void store(unsigned char* ptr, const vint8& i) static __forceinline void store(uint8_t* ptr, const vint8& i)
{ {
for (size_t j=0; j<8; j++) for (size_t j=0; j<8; j++)
ptr[j] = i[j]; ptr[j] = i[j];
@ -140,14 +140,14 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
#else #else
*(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif #endif
} }
@ -157,14 +157,14 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
#else #else
if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif #endif
} }
@ -385,7 +385,9 @@ namespace embree
__forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
__forceinline vint8 permute(const vint8& v, const __m256i& index) { #if !defined(__aarch64__)
__forceinline vint8 permute(const vint8& v, const __m256i& index) {
return _mm256_permutevar8x32_epi32(v, index); return _mm256_permutevar8x32_epi32(v, index);
} }
@ -393,6 +395,8 @@ namespace embree
return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
} }
template<int i> template<int i>
static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
@ -402,6 +406,9 @@ namespace embree
#endif #endif
} }
#endif
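permute and align_shift_right for vint8 are compiled out entirely on aarch64, so any caller must be guarded the same way. If a variable permute were ever needed on ARM, the standard emulation is again a byte-table lookup; a 128-bit sketch of the idea (not part of this patch):

static inline int32x4_t permutevar_sketch(int32x4_t v, uint32x4_t index) {
  uint32x4_t base = vshlq_n_u32(vandq_u32(index, vdupq_n_u32(3)), 2); // 4*idx
  uint8x16_t rep  = vreinterpretq_u8_u32(vmulq_n_u32(base, 0x01010101u));
  const uint8x16_t ladder = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
  uint8x16_t idx  = vaddq_u8(rep, ladder);   // per-byte source positions
  return vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(v), idx));
}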
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Reductions /// Reductions
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
View file
@ -78,7 +78,7 @@ namespace embree
return _mm512_load_si512(addr); return _mm512_load_si512(addr);
} }
static __forceinline vllong8 load(const unsigned char* ptr) { static __forceinline vllong8 load(const uint8_t* ptr) {
return _mm512_cvtepu8_epi64(*(__m128i*)ptr); return _mm512_cvtepu8_epi64(*(__m128i*)ptr);
} }
View file
@ -83,7 +83,7 @@ namespace embree
return _mm512_loadu_si512(addr); return _mm512_loadu_si512(addr);
} }
static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }
static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }
static __forceinline vuint16 load(const vuint16* addr) { static __forceinline vuint16 load(const vuint16* addr) {
View file
@ -87,44 +87,64 @@ namespace embree
static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
#endif #endif
#if defined(__SSE4_1__) #if defined(__aarch64__)
static __forceinline vuint4 load(const unsigned char* ptr) { static __forceinline vuint4 load(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
static __forceinline vuint4 loadu(const uint8_t* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
#elif defined(__SSE4_1__)
static __forceinline vuint4 load(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
} }
static __forceinline vuint4 loadu(const unsigned char* ptr) { static __forceinline vuint4 loadu(const uint8_t* ptr) {
return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
} }
#endif #endif
static __forceinline vuint4 load(const unsigned short* ptr) { static __forceinline vuint4 load(const unsigned short* ptr) {
#if defined (__SSE4_1__) #if defined(__aarch64__)
return _mm_load4epu16_epi32(((__m128i*)ptr));
#elif defined (__SSE4_1__)
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
#else #else
return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
#endif #endif
} }
static __forceinline void store_uchar(unsigned char* ptr, const vuint4& v) { static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) {
#if defined(__SSE4_1__) #if defined(__aarch64__)
uint32x4_t x = uint32x4_t(v.v);
uint16x4_t y = vqmovn_u32(x);
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0);
#elif defined(__SSE4_1__)
__m128i x = v; __m128i x = v;
x = _mm_packus_epi32(x, x); x = _mm_packus_epi32(x, x);
x = _mm_packus_epi16(x, x); x = _mm_packus_epi16(x, x);
*(unsigned*)ptr = _mm_cvtsi128_si32(x); *(unsigned*)ptr = _mm_cvtsi128_si32(x);
#else #else
for (size_t i=0;i<4;i++) for (size_t i=0;i<4;i++)
ptr[i] = (unsigned char)v[i]; ptr[i] = (uint8_t)v[i];
#endif #endif
} }
static __forceinline void store_uchar(unsigned short* ptr, const vuint4& v) { static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) {
#if defined(__aarch64__)
uint32x4_t x = (uint32x4_t)v.v;
uint16x4_t y = vqmovn_u32(x);
vst1_u16(ptr, y);
#else
for (size_t i=0;i<4;i++) for (size_t i=0;i<4;i++)
ptr[i] = (unsigned short)v[i]; ptr[i] = (unsigned short)v[i];
#endif
} }
static __forceinline vuint4 load_nt(void* ptr) { static __forceinline vuint4 load_nt(void* ptr) {
#if defined(__SSE4_1__) #if (defined(__aarch64__)) || defined(__SSE4_1__)
return _mm_stream_load_si128((__m128i*)ptr); return _mm_stream_load_si128((__m128i*)ptr);
#else #else
return _mm_load_si128((__m128i*)ptr); return _mm_load_si128((__m128i*)ptr);
@ -132,8 +152,8 @@ namespace embree
} }
static __forceinline void store_nt(void* ptr, const vuint4& v) { static __forceinline void store_nt(void* ptr, const vuint4& v) {
#if defined(__SSE4_1__) #if !defined(__aarch64__) && defined(__SSE4_1__)
_mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
#else #else
_mm_store_si128((__m128i*)ptr,v); _mm_store_si128((__m128i*)ptr,v);
#endif #endif
@ -141,14 +161,14 @@ namespace embree
template<int scale = 4> template<int scale = 4>
static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _mm_i32gather_epi32((const int*)ptr, index, scale); return _mm_i32gather_epi32((const int*)ptr, index, scale);
#else #else
return vuint4( return vuint4(
*(unsigned int*)(((char*)ptr)+scale*index[0]), *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
*(unsigned int*)(((char*)ptr)+scale*index[1]), *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
*(unsigned int*)(((char*)ptr)+scale*index[2]), *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
*(unsigned int*)(((char*)ptr)+scale*index[3])); *(unsigned int*)(((int8_t*)ptr)+scale*index[3]));
#endif #endif
} }
@ -157,13 +177,13 @@ namespace embree
vuint4 r = zero; vuint4 r = zero;
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
#elif defined(__AVX2__) #elif defined(__AVX2__) && !defined(__aarch64__)
return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
#else #else
if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
return r; return r;
#endif #endif
} }
@ -353,16 +373,25 @@ namespace embree
__forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
__forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
#if defined(__aarch64__)
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& v) {
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
}
template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
}
#else
template<int i0, int i1, int i2, int i3> template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& v) { __forceinline vuint4 shuffle(const vuint4& v) {
return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
} }
template<int i0, int i1, int i2, int i3> template<int i0, int i1, int i2, int i3>
__forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) {
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
} }
#endif
#if defined(__SSE3__) #if defined(__SSE3__)
template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
@ -374,7 +403,10 @@ namespace embree
return shuffle<i,i,i,i>(v); return shuffle<i,i,i,i>(v);
} }
#if defined(__SSE4_1__) #if defined(__aarch64__)
template<int src> __forceinline unsigned int extract(const vuint4& b);
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b);
#elif defined(__SSE4_1__)
template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
#else #else
@ -382,11 +414,50 @@ namespace embree
template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; }
#endif #endif
#if defined(__aarch64__)
template<> __forceinline unsigned int extract<0>(const vuint4& b) {
return b[0];
}
template<> __forceinline unsigned int extract<1>(const vuint4& b) {
return b[1];
}
template<> __forceinline unsigned int extract<2>(const vuint4& b) {
return b[2];
}
template<> __forceinline unsigned int extract<3>(const vuint4& b) {
return b[3];
}
template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){
vuint4 c = a;
c[0] = b;
return c;
}
template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){
vuint4 c = a;
c[1] = b;
return c;
}
template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){
vuint4 c = a;
c[2] = b;
return c;
}
template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){
vuint4 c = a;
c[3] = b;
return c;
}
__forceinline unsigned int toScalar(const vuint4& v) {
return v[0];
}
#else
template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); }
__forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); }
#endif
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Reductions /// Reductions
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
View file
@ -69,20 +69,24 @@ namespace embree
static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
#if !defined(__aarch64__)
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
#else
static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
#endif
static __forceinline void store_nt(void* ptr, const vuint8& v) { static __forceinline void store_nt(void* ptr, const vuint8& v) {
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
} }
static __forceinline vuint8 load(const unsigned char* ptr) { static __forceinline vuint8 load(const uint8_t* ptr) {
vuint4 il = vuint4::load(ptr+0); vuint4 il = vuint4::load(ptr+0);
vuint4 ih = vuint4::load(ptr+4); vuint4 ih = vuint4::load(ptr+4);
return vuint8(il,ih); return vuint8(il,ih);
} }
static __forceinline vuint8 loadu(const unsigned char* ptr) { static __forceinline vuint8 loadu(const uint8_t* ptr) {
vuint4 il = vuint4::loadu(ptr+0); vuint4 il = vuint4::loadu(ptr+0);
vuint4 ih = vuint4::loadu(ptr+4); vuint4 ih = vuint4::loadu(ptr+4);
return vuint8(il,ih); return vuint8(il,ih);
@ -100,7 +104,7 @@ namespace embree
return vuint8(il,ih); return vuint8(il,ih);
} }
static __forceinline void store(unsigned char* ptr, const vuint8& i) { static __forceinline void store(uint8_t* ptr, const vuint8& i) {
vuint4 il(i.vl); vuint4 il(i.vl);
vuint4 ih(i.vh); vuint4 ih(i.vh);
vuint4::store(ptr + 0,il); vuint4::store(ptr + 0,il);
@ -115,54 +119,54 @@ namespace embree
template<int scale = 4> template<int scale = 4>
static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
return vuint8( return vuint8(
*(unsigned int*)(((char*)ptr)+scale*index[0]), *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
*(unsigned int*)(((char*)ptr)+scale*index[1]), *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
*(unsigned int*)(((char*)ptr)+scale*index[2]), *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
*(unsigned int*)(((char*)ptr)+scale*index[3]), *(unsigned int*)(((int8_t*)ptr)+scale*index[3]),
*(unsigned int*)(((char*)ptr)+scale*index[4]), *(unsigned int*)(((int8_t*)ptr)+scale*index[4]),
*(unsigned int*)(((char*)ptr)+scale*index[5]), *(unsigned int*)(((int8_t*)ptr)+scale*index[5]),
*(unsigned int*)(((char*)ptr)+scale*index[6]), *(unsigned int*)(((int8_t*)ptr)+scale*index[6]),
*(unsigned int*)(((char*)ptr)+scale*index[7])); *(unsigned int*)(((int8_t*)ptr)+scale*index[7]));
} }
template<int scale = 4> template<int scale = 4>
static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
vuint8 r = zero; vuint8 r = zero;
if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]); if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]);
if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]); if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]);
if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]); if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]);
if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]); if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]);
return r; return r;
} }
template<int scale = 4> template<int scale = 4>
static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
{ {
*(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
*(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
*(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
*(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
*(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
*(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
*(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
*(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
} }
template<int scale = 4> template<int scale = 4>
static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
{ {
if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
} }
View file
@ -66,8 +66,8 @@ namespace embree
/// Loads and Stores /// Loads and Stores
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
@ -107,7 +107,7 @@ namespace embree
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
} }
static __forceinline void store(unsigned char* ptr, const vuint8& i) static __forceinline void store(uint8_t* ptr, const vuint8& i)
{ {
for (size_t j=0; j<8; j++) for (size_t j=0; j<8; j++)
ptr[j] = i[j]; ptr[j] = i[j];
@ -139,14 +139,14 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
#else #else
*(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0];
*(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1];
*(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2];
*(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3];
*(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4];
*(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5];
*(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6];
*(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7];
#endif #endif
} }
@ -156,14 +156,14 @@ namespace embree
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
_mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
#else #else
if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
#endif #endif
} }
@ -379,6 +379,8 @@ namespace embree
__forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
#if !defined(__aarch64__)
__forceinline vuint8 permute(const vuint8& v, const __m256i& index) { __forceinline vuint8 permute(const vuint8& v, const __m256i& index) {
return _mm256_permutevar8x32_epi32(v, index); return _mm256_permutevar8x32_epi32(v, index);
} }
@ -394,7 +396,10 @@ namespace embree
#else #else
return _mm256_alignr_epi8(a, b, 4*i); return _mm256_alignr_epi8(a, b, 4*i);
#endif #endif
} }
#endif
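The guard above compiles the permute/align helpers out on aarch64, where the AVX2-emulation layer has no _mm256_permutevar8x32_epi32. For reference, a portable scalar routine with the same semantics (each index is taken modulo 8, as the intrinsic does); the name is illustrative, not what the NEON path ultimately uses:

#include <cstdint>

// 8-lane 32-bit permute: out[i] = v[index[i] & 7].
static inline void permute8(const uint32_t v[8], const uint32_t index[8],
                            uint32_t out[8]) {
    for (int i = 0; i < 8; ++i)
        out[i] = v[index[i] & 7u];
}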
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Reductions /// Reductions

View file

@ -21,7 +21,10 @@ namespace embree
void* ptr = _mm_malloc(size,align); void* ptr = _mm_malloc(size,align);
if (size != 0 && ptr == nullptr) if (size != 0 && ptr == nullptr)
throw std::bad_alloc(); // -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return ptr; return ptr;
} }
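Godot builds Embree with C++ exceptions disabled, so every failed allocation aborts instead of throwing. A minimal sketch of an equivalent aligned-allocation wrapper in portable C++17 (the name is illustrative, not Embree's):

#include <cstdio>
#include <cstdlib>

// Aligned allocation that aborts instead of throwing, matching the patched
// behavior. std::aligned_alloc is C++17 (absent on MSVC, which would need
// _aligned_malloc instead) and requires the size to be a multiple of the
// alignment, hence the round-up; `align` must be a power of two.
static void* alignedMallocOrAbort(std::size_t size, std::size_t align) {
    std::size_t rounded = (size + align - 1) & ~(align - 1);
    void* ptr = std::aligned_alloc(align, rounded);
    if (rounded != 0 && ptr == nullptr) {
        std::fprintf(stderr, "allocation of %zu bytes failed\n", rounded);
        std::abort();
    }
    return ptr;
}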
@ -128,7 +131,10 @@ namespace embree
/* fall back to 4k pages */ /* fall back to 4k pages */
int flags = MEM_COMMIT | MEM_RESERVE; int flags = MEM_COMMIT | MEM_RESERVE;
char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
if (ptr == nullptr) throw std::bad_alloc(); // -- GODOT start --
// if (ptr == nullptr) throw std::bad_alloc();
if (ptr == nullptr) abort();
// -- GODOT end --
hugepages = false; hugepages = false;
return ptr; return ptr;
} }
@ -145,7 +151,10 @@ namespace embree
return bytesOld; return bytesOld;
if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
throw std::bad_alloc(); // -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return bytesNew; return bytesNew;
} }
@ -156,7 +165,10 @@ namespace embree
return; return;
if (!VirtualFree(ptr,0,MEM_RELEASE)) if (!VirtualFree(ptr,0,MEM_RELEASE))
throw std::bad_alloc(); // -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
} }
void os_advise(void *ptr, size_t bytes) void os_advise(void *ptr, size_t bytes)
@ -260,7 +272,10 @@ namespace embree
/* fallback to 4k pages */ /* fallback to 4k pages */
void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (ptr == MAP_FAILED) throw std::bad_alloc(); // -- GODOT start --
// if (ptr == MAP_FAILED) throw std::bad_alloc();
if (ptr == MAP_FAILED) abort();
// -- GODOT end --
hugepages = false; hugepages = false;
/* advise huge page hint for THP */ /* advise huge page hint for THP */
@ -277,7 +292,10 @@ namespace embree
return bytesOld; return bytesOld;
if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
throw std::bad_alloc(); // -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
return bytesNew; return bytesNew;
} }
@ -291,7 +309,10 @@ namespace embree
const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
bytes = (bytes+pageSize-1) & ~(pageSize-1); bytes = (bytes+pageSize-1) & ~(pageSize-1);
if (munmap(ptr,bytes) == -1) if (munmap(ptr,bytes) == -1)
throw std::bad_alloc(); // -- GODOT start --
// throw std::bad_alloc();
abort();
// -- GODOT end --
} }
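The Unix branch above pairs an anonymous mmap fallback with a transparent-huge-page hint. A Linux-only sketch of that pattern, condensed into one function (illustrative, with the same abort-on-failure convention):

#include <sys/mman.h>
#include <cstdlib>

// Anonymous mapping plus a THP hint. MADV_HUGEPAGE is advisory; the kernel
// may ignore it, which is why the allocator keeps the 4K fallback path.
static void* os_malloc_thp(std::size_t bytes) {
    void* ptr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED) std::abort(); // exceptions disabled in this build
#if defined(MADV_HUGEPAGE)
    madvise(ptr, bytes, MADV_HUGEPAGE);
#endif
    return ptr;
}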
/* hint for transparent huge pages (THP) */ /* hint for transparent huge pages (THP) */

View file

@ -139,7 +139,7 @@ namespace embree
__forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; }
__forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
__forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; }
__forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
#endif #endif
@ -196,7 +196,7 @@ namespace embree
__forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
__forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
__forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
#endif #endif

View file

@ -9,7 +9,14 @@
#include <intrin.h> #include <intrin.h>
#endif #endif
#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"
#if defined(NEON_AVX2_EMULATION)
#include "../math/AVX2NEON.h"
#endif
#else
#include <immintrin.h> #include <immintrin.h>
#endif
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) #if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32) #if !defined(_tzcnt_u32)
@ -20,6 +27,14 @@
#endif #endif
#endif #endif
#if defined(__aarch64__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __builtin_clz
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __builtin_clzll
#endif
#else
#if defined(__LZCNT__) #if defined(__LZCNT__)
#if !defined(_lzcnt_u32) #if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32 #define _lzcnt_u32 __lzcnt32
@ -28,16 +43,13 @@
#define _lzcnt_u64 __lzcnt64 #define _lzcnt_u64 __lzcnt64
#endif #endif
#endif #endif
#endif
#if defined(__WIN32__) #if defined(__WIN32__)
// -- GODOT start -- # ifndef NOMINMAX
#if !defined(NOMINMAX) # define NOMINMAX
// -- GODOT end -- # endif
#define NOMINMAX # include <windows.h>
// -- GODOT start --
#endif
#include "windows.h"
// -- GODOT end --
#endif #endif
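The include selection above is the heart of the port: SSE2NEON.h (and, when AVX2 emulation is enabled, AVX2NEON.h) re-implements the _mm_*/_mm256_* intrinsics on top of NEON, so intrinsic-level code compiles unchanged on aarch64. A minimal illustration, assuming the same tree layout:

#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"   // maps _mm_* intrinsics onto NEON
#else
#include <immintrin.h>
#endif

// On x86 this emits real SSE instructions; on aarch64 the header rewrites
// the same calls to NEON equivalents.
static inline __m128 add4(__m128 a, __m128 b) {
    return _mm_add_ps(a, b);
}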
/* normally defined in pmmintrin.h, but we always need this */ /* normally defined in pmmintrin.h, but we always need this */
@ -50,133 +62,133 @@
namespace embree namespace embree
{ {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Windows Platform /// Windows Platform
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#if defined(__WIN32__) #if defined(__WIN32__)
__forceinline size_t read_tsc() __forceinline size_t read_tsc()
{ {
LARGE_INTEGER li; LARGE_INTEGER li;
QueryPerformanceCounter(&li); QueryPerformanceCounter(&li);
return (size_t)li.QuadPart; return (size_t)li.QuadPart;
} }
__forceinline int bsf(int v) { __forceinline int bsf(int v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _tzcnt_u32(v); return _tzcnt_u32(v);
#else #else
unsigned long r = 0; _BitScanForward(&r,v); return r; unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif #endif
} }
__forceinline unsigned bsf(unsigned v) { __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _tzcnt_u32(v); return _tzcnt_u32(v);
#else #else
unsigned long r = 0; _BitScanForward(&r,v); return r; unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif #endif
} }
#if defined(__X86_64__) #if defined(__X86_64__)
__forceinline size_t bsf(size_t v) { __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) #if defined(__AVX2__)
return _tzcnt_u64(v); return _tzcnt_u64(v);
#else #else
unsigned long r = 0; _BitScanForward64(&r,v); return r; unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif #endif
} }
#endif #endif
__forceinline int bscf(int& v) __forceinline int bscf(int& v)
{ {
int i = bsf(v); int i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
__forceinline unsigned bscf(unsigned& v) __forceinline unsigned bscf(unsigned& v)
{ {
unsigned i = bsf(v); unsigned i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
#if defined(__X86_64__) #if defined(__X86_64__)
__forceinline size_t bscf(size_t& v) __forceinline size_t bscf(size_t& v)
{ {
size_t i = bsf(v); size_t i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
#endif #endif
__forceinline int bsr(int v) { __forceinline int bsr(int v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v); return 31 - _lzcnt_u32(v);
#else #else
unsigned long r = 0; _BitScanReverse(&r,v); return r; unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif #endif
} }
__forceinline unsigned bsr(unsigned v) { __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v); return 31 - _lzcnt_u32(v);
#else #else
unsigned long r = 0; _BitScanReverse(&r,v); return r; unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif #endif
} }
#if defined(__X86_64__) #if defined(__X86_64__)
__forceinline size_t bsr(size_t v) { __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) #if defined(__AVX2__)
return 63 -_lzcnt_u64(v); return 63 -_lzcnt_u64(v);
#else #else
unsigned long r = 0; _BitScanReverse64(&r, v); return r; unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif #endif
} }
#endif #endif
__forceinline int lzcnt(const int x) __forceinline int lzcnt(const int x)
{ {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _lzcnt_u32(x); return _lzcnt_u32(x);
#else #else
if (unlikely(x == 0)) return 32; if (unlikely(x == 0)) return 32;
return 31 - bsr(x); return 31 - bsr(x);
#endif #endif
} }
__forceinline int btc(int v, int i) { __forceinline int btc(int v, int i) {
long r = v; _bittestandcomplement(&r,i); return r; long r = v; _bittestandcomplement(&r,i); return r;
} }
__forceinline int bts(int v, int i) { __forceinline int bts(int v, int i) {
long r = v; _bittestandset(&r,i); return r; long r = v; _bittestandset(&r,i); return r;
} }
__forceinline int btr(int v, int i) { __forceinline int btr(int v, int i) {
long r = v; _bittestandreset(&r,i); return r; long r = v; _bittestandreset(&r,i); return r;
} }
#if defined(__X86_64__) #if defined(__X86_64__)
__forceinline size_t btc(size_t v, size_t i) { __forceinline size_t btc(size_t v, size_t i) {
size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
} }
__forceinline size_t bts(size_t v, size_t i) { __forceinline size_t bts(size_t v, size_t i) {
__int64 r = v; _bittestandset64(&r,i); return r; __int64 r = v; _bittestandset64(&r,i); return r;
} }
__forceinline size_t btr(size_t v, size_t i) { __forceinline size_t btr(size_t v, size_t i) {
__int64 r = v; _bittestandreset64(&r,i); return r; __int64 r = v; _bittestandreset64(&r,i); return r;
} }
#endif #endif
__forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
return _InterlockedCompareExchange((volatile long*)p,v,c); return _InterlockedCompareExchange((volatile long*)p,v,c);
} }
@ -184,143 +196,174 @@ namespace embree
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// Unix Platform /// Unix Platform
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#else #else
#if defined(__i386__) && defined(__PIC__) #if defined(__i386__) && defined(__PIC__)
__forceinline void __cpuid(int out[4], int op) __forceinline void __cpuid(int out[4], int op)
{ {
asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t" "cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t" "xchg{l}\t{%%}ebx, %1\n\t"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "0"(op)); : "0"(op));
} }
__forceinline void __cpuid_count(int out[4], int op1, int op2) __forceinline void __cpuid_count(int out[4], int op1, int op2)
{ {
asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t" "cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t" "xchg{l}\t{%%}ebx, %1\n\t"
: "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
: "0" (op1), "2" (op2)); : "0" (op1), "2" (op2));
} }
#else #else
__forceinline void __cpuid(int out[4], int op) { __forceinline void __cpuid(int out[4], int op) {
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); #if defined(__ARM_NEON)
} if (op == 0) { // Get CPU name
out[0] = 0x41524d20;
__forceinline void __cpuid_count(int out[4], int op1, int op2) { out[1] = 0x41524d20;
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); out[2] = 0x41524d20;
} out[3] = 0x41524d20;
}
#else
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
#endif #endif
}
#if !defined(__ARM_NEON)
__forceinline void __cpuid_count(int out[4], int op1, int op2) {
asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
}
#endif
#endif
__forceinline uint64_t read_tsc() { __forceinline uint64_t read_tsc() {
#if defined(__ARM_NEON)
return 0; // FIXME(LTE): mimic rdtsc
#else
uint32_t high,low; uint32_t high,low;
asm volatile ("rdtsc" : "=d"(high), "=a"(low)); asm volatile ("rdtsc" : "=d"(high), "=a"(low));
return (((uint64_t)high) << 32) + (uint64_t)low; return (((uint64_t)high) << 32) + (uint64_t)low;
#endif
} }
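The FIXME above leaves the time-stamp counter stubbed out on ARM. The aarch64 generic timer would be a natural substitute; a hedged sketch, not part of this patch (values tick at the cntfrq_el0 frequency, not CPU cycles):

#include <cstdint>

// aarch64 analogue of rdtsc: read the virtual counter register.
static inline uint64_t read_tsc_aarch64() {
#if defined(__aarch64__)
    uint64_t val;
    asm volatile("mrs %0, cntvct_el0" : "=r"(val));
    return val;
#else
    return 0;
#endif
}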
__forceinline int bsf(int v) { __forceinline int bsf(int v) {
#if defined(__AVX2__) #if defined(__ARM_NEON)
return __builtin_ctz(v);
#else
#if defined(__AVX2__)
return _tzcnt_u32(v); return _tzcnt_u32(v);
#else #else
int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif #endif
} }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned bsf(unsigned v) __forceinline unsigned bsf(unsigned v)
{ {
#if defined(__AVX2__) #if defined(__ARM_NEON)
return __builtin_ctz(v);
#else
#if defined(__AVX2__)
return _tzcnt_u32(v); return _tzcnt_u32(v);
#else #else
unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif #endif
} }
#endif #endif
__forceinline size_t bsf(size_t v) { __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__) #if defined(__X86_64__)
return _tzcnt_u64(v); return _tzcnt_u64(v);
#else #else
return _tzcnt_u32(v); return _tzcnt_u32(v);
#endif #endif
#elif defined(__ARM_NEON)
return __builtin_ctzl(v);
#else #else
size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif #endif
} }
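All three branches above (tzcnt, bsf inline asm, and the GCC/Clang builtin) agree for nonzero inputs: each returns the index of the least significant set bit. A quick check of the builtin path (__builtin_ctz is undefined for 0, just like bsf):

#include <cassert>

int main() {
    assert(__builtin_ctz(0x8u) == 3);
    assert(__builtin_ctz(0x1u) == 0);
    assert(__builtin_ctzll(0x100000000ull) == 32); // 64-bit variant
    return 0;
}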
__forceinline int bscf(int& v) __forceinline int bscf(int& v)
{ {
int i = bsf(v); int i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned int bscf(unsigned int& v) __forceinline unsigned int bscf(unsigned int& v)
{ {
unsigned int i = bsf(v); unsigned int i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
#endif #endif
__forceinline size_t bscf(size_t& v) __forceinline size_t bscf(size_t& v)
{ {
size_t i = bsf(v); size_t i = bsf(v);
v &= v-1; v &= v-1;
return i; return i;
} }
__forceinline int bsr(int v) { __forceinline int bsr(int v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return 31 - _lzcnt_u32(v); return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
return __builtin_clz(v)^31;
#else #else
int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif #endif
} }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
__forceinline unsigned bsr(unsigned v) { __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) #if defined(__AVX2__)
return 31 - _lzcnt_u32(v); return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
return __builtin_clz(v)^31;
#else #else
unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif #endif
} }
#endif #endif
__forceinline size_t bsr(size_t v) { __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__) #if defined(__X86_64__)
return 63 - _lzcnt_u64(v); return 63 - _lzcnt_u64(v);
#else #else
return 31 - _lzcnt_u32(v); return 31 - _lzcnt_u32(v);
#endif #endif
#elif defined(__aarch64__)
return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#else #else
size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif #endif
} }
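The `__builtin_clz(v) ^ 31` idiom used in the NEON branches works because `x ^ 31 == 31 - x` for 0 <= x <= 31, so it converts a leading-zero count into the index of the highest set bit without a subtraction:

#include <cassert>

int main() {
    unsigned v = 0x00008000u;              // highest set bit: index 15
    assert(__builtin_clz(v) == 16);        // 16 leading zeros in 32 bits
    assert((__builtin_clz(v) ^ 31) == 15); // x ^ 31 == 31 - x here
    return 0;
}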
__forceinline int lzcnt(const int x) __forceinline int lzcnt(const int x)
{ {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
return _lzcnt_u32(x); return _lzcnt_u32(x);
#else #else
if (unlikely(x == 0)) return 32; if (unlikely(x == 0)) return 32;
return 31 - bsr(x); return 31 - bsr(x);
#endif #endif
} }
__forceinline size_t blsr(size_t v) { __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER) #if defined(__INTEL_COMPILER)
return _blsr_u64(v); return _blsr_u64(v);
#else #else
@ -334,41 +377,79 @@ namespace embree
return v & (v-1); return v & (v-1);
#endif #endif
} }
__forceinline int btc(int v, int i) { __forceinline int btc(int v, int i) {
#if defined(__aarch64__)
// _bittestandcomplement(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a ^ (1 << b);
// return x;
// We only need `*a`
return (v ^ (1 << i));
#else
int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
} }
__forceinline int bts(int v, int i) { __forceinline int bts(int v, int i) {
#if defined(__aarch64__)
// _bittestandset(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a | (1 << b);
// return x;
return (v | (1 << i));
#else
int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
} }
__forceinline int btr(int v, int i) { __forceinline int btr(int v, int i) {
#if defined(__aarch64__)
// _bittestandreset(long *a, long b) {
// unsigned char x = (*a >> b) & 1;
// *a = *a & ~(1 << b);
// return x;
return (v & ~(1 << i));
#else
int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
} }
__forceinline size_t btc(size_t v, size_t i) { __forceinline size_t btc(size_t v, size_t i) {
#if defined(__aarch64__)
return (v ^ (((size_t)1) << i));
#else
size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
} }
__forceinline size_t bts(size_t v, size_t i) { __forceinline size_t bts(size_t v, size_t i) {
#if defined(__aarch64__)
return (v | (((size_t)1) << i));
#else
size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
} }
__forceinline size_t btr(size_t v, size_t i) { __forceinline size_t btr(size_t v, size_t i) {
#if defined(__ARM_NEON)
return (v & ~(((size_t)1) << i));
#else
size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
} }
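A small sanity harness for the emulated bit-test helpers above, checking them against the semantics spelled out in the comments (set, reset, or complement bit i and return the new value; the x86 asm's tested-bit result is unused by the callers):

#include <cassert>

static int btc(int v, int i) { return v ^ (1 << i); }
static int bts(int v, int i) { return v | (1 << i); }
static int btr(int v, int i) { return v & ~(1 << i); }

int main() {
    assert(btc(0b1010, 1) == 0b1000); // complement bit 1
    assert(bts(0b1000, 0) == 0b1001); // set bit 0
    assert(btr(0b1001, 3) == 0b0001); // reset bit 3
    return 0;
}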
__forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
return __sync_val_compare_and_swap(value, comparand, input); return __sync_val_compare_and_swap(value, comparand, input);
} }
#endif #endif
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// All Platforms /// All Platforms
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#if defined(__clang__) || defined(__GNUC__) #if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps) #if !defined(_mm_undefined_ps)
__forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
@ -390,39 +471,39 @@ namespace embree
#endif #endif
#endif #endif
#if defined(__SSE4_2__) #if defined(__SSE4_2__) || defined(__ARM_NEON)
__forceinline int popcnt(int in) { __forceinline int popcnt(int in) {
return _mm_popcnt_u32(in); return _mm_popcnt_u32(in);
} }
__forceinline unsigned popcnt(unsigned in) { __forceinline unsigned popcnt(unsigned in) {
return _mm_popcnt_u32(in); return _mm_popcnt_u32(in);
} }
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__ARM_NEON)
__forceinline size_t popcnt(size_t in) { __forceinline size_t popcnt(size_t in) {
return _mm_popcnt_u64(in); return _mm_popcnt_u64(in);
} }
#endif #endif
#endif #endif
__forceinline uint64_t rdtsc() __forceinline uint64_t rdtsc()
{ {
int dummy[4]; int dummy[4];
__cpuid(dummy,0); __cpuid(dummy,0);
uint64_t clock = read_tsc(); uint64_t clock = read_tsc();
__cpuid(dummy,0); __cpuid(dummy,0);
return clock; return clock;
} }
__forceinline void pause_cpu(const size_t N = 8) __forceinline void pause_cpu(const size_t N = 8)
{ {
for (size_t i=0; i<N; i++) for (size_t i=0; i<N; i++)
_mm_pause(); _mm_pause();
} }
/* prefetches */ /* prefetches */
__forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
__forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
@ -432,18 +513,18 @@ namespace embree
#if defined(__INTEL_COMPILER) #if defined(__INTEL_COMPILER)
_mm_prefetch((const char*)ptr,_MM_HINT_ET0); _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else #else
_mm_prefetch((const char*)ptr,_MM_HINT_T0); _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif #endif
} }
__forceinline void prefetchL1EX(const void* ptr) { __forceinline void prefetchL1EX(const void* ptr) {
prefetchEX(ptr); prefetchEX(ptr);
} }
__forceinline void prefetchL2EX(const void* ptr) { __forceinline void prefetchL2EX(const void* ptr) {
prefetchEX(ptr); prefetchEX(ptr);
} }
#if defined(__AVX2__) #if defined(__AVX2__) && !defined(__aarch64__)
__forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
__forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__) #if defined(__X86_64__)

View file

@ -27,9 +27,7 @@ namespace embree
/* returns address of a symbol from the library */ /* returns address of a symbol from the library */
void* getSymbol(lib_t lib, const std::string& sym) { void* getSymbol(lib_t lib, const std::string& sym) {
// -- GODOT start -- return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str()));
return (void*) GetProcAddress(HMODULE(lib),sym.c_str());
// -- GODOT end --
} }
/* closes the shared library */ /* closes the shared library */
@ -63,7 +61,7 @@ namespace embree
lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW);
if (lib == nullptr) { if (lib == nullptr) {
const char* error = dlerror(); const char* error = dlerror();
if (error) { if (error) {
THROW_RUNTIME_ERROR(error); THROW_RUNTIME_ERROR(error);
} else { } else {
THROW_RUNTIME_ERROR("could not load library "+executable.str()); THROW_RUNTIME_ERROR("could not load library "+executable.str());

View file

@ -36,6 +36,7 @@ namespace embree
MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
assert(ok); assert(ok);
delete (pthread_mutex_t*)mutex; delete (pthread_mutex_t*)mutex;
mutex = nullptr;
} }
void MutexSys::lock() void MutexSys::lock()

View file

@ -47,7 +47,7 @@ namespace embree
{ {
while (flag.load()) while (flag.load())
{ {
_mm_pause(); _mm_pause();
_mm_pause(); _mm_pause();
} }
@ -74,7 +74,7 @@ namespace embree
{ {
while(flag.load()) while(flag.load())
{ {
_mm_pause(); _mm_pause();
_mm_pause(); _mm_pause();
} }
} }

View file

@ -88,10 +88,10 @@
#define dll_import __declspec(dllimport) #define dll_import __declspec(dllimport)
#else #else
#define dll_export __attribute__ ((visibility ("default"))) #define dll_export __attribute__ ((visibility ("default")))
#define dll_import #define dll_import
#endif #endif
#if defined(__WIN32__) && !defined(__MINGW32__) #ifdef __WIN32__
#if !defined(__noinline) #if !defined(__noinline)
#define __noinline __declspec(noinline) #define __noinline __declspec(noinline)
#endif #endif
@ -103,11 +103,16 @@
#define __restrict__ //__restrict // causes issues with MSVC #define __restrict__ //__restrict // causes issues with MSVC
#endif #endif
#if !defined(__thread) #if !defined(__thread)
// NOTE: Requires `-fms-extensions` with Clang
#define __thread __declspec(thread) #define __thread __declspec(thread)
#endif #endif
#if !defined(__aligned) #if !defined(__aligned)
#if defined(__MINGW32__)
#define __aligned(...) __attribute__((aligned(__VA_ARGS__)))
#else
#define __aligned(...) __declspec(align(__VA_ARGS__)) #define __aligned(...) __declspec(align(__VA_ARGS__))
#endif #endif
#endif
//#define __FUNCTION__ __FUNCTION__ //#define __FUNCTION__ __FUNCTION__
#define debugbreak() __debugbreak() #define debugbreak() __debugbreak()
@ -142,7 +147,7 @@
#endif #endif
// -- GODOT start -- // -- GODOT start --
#if !defined(likely) #ifndef likely
// -- GODOT end -- // -- GODOT end --
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define likely(expr) (expr) #define likely(expr) (expr)
@ -169,11 +174,19 @@
#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
#if defined(DEBUG) // only report file and line in debug mode #if defined(DEBUG) // only report file and line in debug mode
// -- GODOT start --
// #define THROW_RUNTIME_ERROR(str)
// throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define THROW_RUNTIME_ERROR(str) \ #define THROW_RUNTIME_ERROR(str) \
throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); printf("%s", (std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)).c_str()), abort();
// -- GODOT end --
#else #else
// -- GODOT start --
// #define THROW_RUNTIME_ERROR(str)
// throw std::runtime_error(str);
#define THROW_RUNTIME_ERROR(str) \ #define THROW_RUNTIME_ERROR(str) \
throw std::runtime_error(str); abort();
// -- GODOT end --
#endif #endif
#define FATAL(x) THROW_RUNTIME_ERROR(x) #define FATAL(x) THROW_RUNTIME_ERROR(x)
@ -192,7 +205,7 @@ namespace embree {
/* windows does not have ssize_t */ /* windows does not have ssize_t */
#if defined(__WIN32__) #if defined(__WIN32__)
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
typedef int64_t ssize_t; typedef int64_t ssize_t;
#else #else
typedef int32_t ssize_t; typedef int32_t ssize_t;
@ -316,7 +329,7 @@ __forceinline std::string toString(long long value) {
/// Some macros for static profiling /// Some macros for static profiling
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#if defined (__GNUC__) #if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \ #define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \ __asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \ "\n\t movl $"#MARK_ID", %%ebx" \
@ -355,7 +368,7 @@ namespace embree
bool active; bool active;
const Closure f; const Closure f;
}; };
template <typename Closure> template <typename Closure>
OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { OnScopeExitHelper<Closure> OnScopeExit(const Closure f) {
return OnScopeExitHelper<Closure>(f); return OnScopeExitHelper<Closure>(f);

View file

@ -18,10 +18,16 @@ typedef cpuset_t cpu_set_t;
namespace embree namespace embree
{ {
NullTy null; NullTy null;
std::string getPlatformName() std::string getPlatformName()
{ {
#if defined(__LINUX__) && !defined(__X86_64__) #if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON)
return "Android Linux (aarch64 / arm64)";
#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__)
return "Android Linux (x64)";
#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86))
return "Android Linux (x86)";
#elif defined(__LINUX__) && !defined(__X86_64__)
return "Linux (32bit)"; return "Linux (32bit)";
#elif defined(__LINUX__) && defined(__X86_64__) #elif defined(__LINUX__) && defined(__X86_64__)
return "Linux (64bit)"; return "Linux (64bit)";
@ -37,10 +43,16 @@ namespace embree
return "Windows (32bit)"; return "Windows (32bit)";
#elif defined(__WIN32__) && defined(__X86_64__) #elif defined(__WIN32__) && defined(__X86_64__)
return "Windows (64bit)"; return "Windows (64bit)";
#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__)
return "iOS Simulator (x64)";
#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON)
return "iOS (aarch64 / arm64)";
#elif defined(__MACOSX__) && !defined(__X86_64__) #elif defined(__MACOSX__) && !defined(__X86_64__)
return "Mac OS X (32bit)"; return "Mac OS X (32bit)";
#elif defined(__MACOSX__) && defined(__X86_64__) #elif defined(__MACOSX__) && defined(__X86_64__)
return "Mac OS X (64bit)"; return "Mac OS X (64bit)";
#elif defined(__UNIX__) && defined(__aarch64__)
return "Unix (aarch64)";
#elif defined(__UNIX__) && !defined(__X86_64__) #elif defined(__UNIX__) && !defined(__X86_64__)
return "Unix (32bit)"; return "Unix (32bit)";
#elif defined(__UNIX__) && defined(__X86_64__) #elif defined(__UNIX__) && defined(__X86_64__)
@ -79,8 +91,8 @@ namespace embree
std::string getCPUVendor() std::string getCPUVendor()
{ {
int cpuinfo[4]; int cpuinfo[4];
__cpuid (cpuinfo, 0); __cpuid (cpuinfo, 0);
int name[4]; int name[4];
name[0] = cpuinfo[1]; name[0] = cpuinfo[1];
name[1] = cpuinfo[3]; name[1] = cpuinfo[3];
@ -89,11 +101,11 @@ namespace embree
return (char*)name; return (char*)name;
} }
CPU getCPUModel() CPU getCPUModel()
{ {
if (getCPUVendor() != "GenuineIntel") if (getCPUVendor() != "GenuineIntel")
return CPU::UNKNOWN; return CPU::UNKNOWN;
int out[4]; int out[4];
__cpuid(out, 0); __cpuid(out, 0);
if (out[0] < 1) return CPU::UNKNOWN; if (out[0] < 1) return CPU::UNKNOWN;
@ -183,11 +195,13 @@ namespace embree
case CPU::NEHALEM : return "Nehalem"; case CPU::NEHALEM : return "Nehalem";
case CPU::CORE2 : return "Core2"; case CPU::CORE2 : return "Core2";
case CPU::CORE1 : return "Core"; case CPU::CORE1 : return "Core";
case CPU::ARM : return "Arm";
case CPU::UNKNOWN : return "Unknown CPU"; case CPU::UNKNOWN : return "Unknown CPU";
} }
return "Unknown CPU (error)"; return "Unknown CPU (error)";
} }
#if !defined(__ARM_NEON)
/* constants to access destination registers of CPUID instruction */ /* constants to access destination registers of CPUID instruction */
static const int EAX = 0; static const int EAX = 0;
static const int EBX = 1; static const int EBX = 1;
@ -227,13 +241,16 @@ namespace embree
static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions)
static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions)
static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions)
/* cpuid[eax=7,ecx=0].ecx */ /* cpuid[eax=7,ecx=0].ecx */
static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions)
#endif
__noinline int64_t get_xcr0() #if !defined(__ARM_NEON)
__noinline int64_t get_xcr0()
{ {
#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */ // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466
#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
xcr0 = _xgetbv(0); xcr0 = _xgetbv(0);
return xcr0; return xcr0;
@ -243,21 +260,44 @@ namespace embree
return xcr0; return xcr0;
#endif #endif
} }
#endif
int getCPUFeatures() int getCPUFeatures()
{ {
#if defined(__ARM_NEON)
int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2;
#if defined(NEON_AVX2_EMULATION)
cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
cpu_features |= CPU_FEATURE_XMM_ENABLED;
cpu_features |= CPU_FEATURE_YMM_ENABLED;
cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
cpu_features |= CPU_FEATURE_POPCNT;
cpu_features |= CPU_FEATURE_AVX;
cpu_features |= CPU_FEATURE_AVX2;
cpu_features |= CPU_FEATURE_FMA3;
cpu_features |= CPU_FEATURE_LZCNT;
cpu_features |= CPU_FEATURE_BMI1;
cpu_features |= CPU_FEATURE_BMI2;
cpu_features |= CPU_FEATURE_NEON_2X;
#endif
return cpu_features;
#else
/* cache CPU features access */ /* cache CPU features access */
static int cpu_features = 0; static int cpu_features = 0;
if (cpu_features) if (cpu_features)
return cpu_features; return cpu_features;
/* get number of CPUID leaves */ /* get number of CPUID leaves */
int cpuid_leaf0[4]; int cpuid_leaf0[4];
__cpuid(cpuid_leaf0, 0x00000000); __cpuid(cpuid_leaf0, 0x00000000);
unsigned nIds = cpuid_leaf0[EAX]; unsigned nIds = cpuid_leaf0[EAX];
/* get number of extended CPUID leaves */ /* get number of extended CPUID leaves */
int cpuid_leafe[4]; int cpuid_leafe[4];
__cpuid(cpuid_leafe, 0x80000000); __cpuid(cpuid_leafe, 0x80000000);
unsigned nExIds = cpuid_leafe[EAX]; unsigned nExIds = cpuid_leafe[EAX];
@ -289,7 +329,7 @@ namespace embree
if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED;
if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED;
if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED;
if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE;
if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3;
@ -297,8 +337,8 @@ namespace embree
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C;
if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2;
@ -310,7 +350,7 @@ namespace embree
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW;
if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA;
@ -318,6 +358,7 @@ namespace embree
if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
return cpu_features; return cpu_features;
#endif
} }
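On ARM, getCPUFeatures() returns a synthesized mask rather than probing CPUID, so downstream ISA checks work unchanged. A self-contained mirror of that check (bit positions for SSE/SSE2 are assumed from the visible sequence in the header hunk; NEON is 1 << 28 as shown):

#include <iostream>

static const int CPU_FEATURE_SSE  = 1 << 0;
static const int CPU_FEATURE_SSE2 = 1 << 1;
static const int CPU_FEATURE_NEON = 1 << 28;
static const int NEON_ISA = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;

// Same test Embree's hasISA uses: every bit of the ISA must be present.
static bool hasISA(int features, int isa) { return (features & isa) == isa; }

int main() {
    int features = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; // ARM path
    std::cout << (hasISA(features, NEON_ISA) ? "NEON supported" : "NEON missing") << "\n";
    return 0;
}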
std::string stringOfCPUFeatures(int features) std::string stringOfCPUFeatures(int features)
@ -350,9 +391,11 @@ namespace embree
if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
if (features & CPU_FEATURE_NEON) str += "NEON ";
if (features & CPU_FEATURE_NEON_2X) str += "2xNEON ";
return str; return str;
} }
std::string stringOfISA (int isa) std::string stringOfISA (int isa)
{ {
if (isa == SSE) return "SSE"; if (isa == SSE) return "SSE";
@ -365,13 +408,15 @@ namespace embree
if (isa == AVX2) return "AVX2"; if (isa == AVX2) return "AVX2";
if (isa == AVX512KNL) return "AVX512KNL"; if (isa == AVX512KNL) return "AVX512KNL";
if (isa == AVX512SKX) return "AVX512SKX"; if (isa == AVX512SKX) return "AVX512SKX";
if (isa == NEON) return "NEON";
if (isa == NEON_2X) return "2xNEON";
return "UNKNOWN"; return "UNKNOWN";
} }
bool hasISA(int features, int isa) { bool hasISA(int features, int isa) {
return (features & isa) == isa; return (features & isa) == isa;
} }
std::string supportedTargetList (int features) std::string supportedTargetList (int features)
{ {
std::string v; std::string v;
@ -386,6 +431,8 @@ namespace embree
if (hasISA(features,AVX2)) v += "AVX2 "; if (hasISA(features,AVX2)) v += "AVX2 ";
if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; if (hasISA(features,AVX512KNL)) v += "AVX512KNL ";
if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; if (hasISA(features,AVX512SKX)) v += "AVX512SKX ";
if (hasISA(features,NEON)) v += "NEON ";
if (hasISA(features,NEON_2X)) v += "2xNEON ";
return v; return v;
} }
} }
@ -409,7 +456,7 @@ namespace embree
return std::string(filename); return std::string(filename);
} }
unsigned int getNumberOfLogicalThreads() unsigned int getNumberOfLogicalThreads()
{ {
static int nThreads = -1; static int nThreads = -1;
if (nThreads != -1) return nThreads; if (nThreads != -1) return nThreads;
@ -420,11 +467,11 @@ namespace embree
GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount");
if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount)
{ {
int groups = pGetActiveProcessorGroupCount(); int groups = pGetActiveProcessorGroupCount();
int totalProcessors = 0; int totalProcessors = 0;
for (int i = 0; i < groups; i++) for (int i = 0; i < groups; i++)
totalProcessors += pGetActiveProcessorCount(i); totalProcessors += pGetActiveProcessorCount(i);
nThreads = totalProcessors; nThreads = totalProcessors;
} }
@ -438,7 +485,7 @@ namespace embree
return nThreads; return nThreads;
} }
int getTerminalWidth() int getTerminalWidth()
{ {
HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
if (handle == INVALID_HANDLE_VALUE) return 80; if (handle == INVALID_HANDLE_VALUE) return 80;
@ -448,7 +495,7 @@ namespace embree
return info.dwSize.X; return info.dwSize.X;
} }
double getSeconds() double getSeconds()
{ {
LARGE_INTEGER freq, val; LARGE_INTEGER freq, val;
QueryPerformanceFrequency(&freq); QueryPerformanceFrequency(&freq);
@ -487,7 +534,7 @@ namespace embree
namespace embree namespace embree
{ {
std::string getExecutableFileName() std::string getExecutableFileName()
{ {
std::string pid = "/proc/" + toString(getpid()) + "/exe"; std::string pid = "/proc/" + toString(getpid()) + "/exe";
char buf[4096]; char buf[4096];
@ -540,7 +587,7 @@ namespace embree
size_t getVirtualMemoryBytes() { size_t getVirtualMemoryBytes() {
return 0; return 0;
} }
size_t getResidentMemoryBytes() { size_t getResidentMemoryBytes() {
return 0; return 0;
} }
@ -570,7 +617,7 @@ namespace embree
size_t getVirtualMemoryBytes() { size_t getVirtualMemoryBytes() {
return 0; return 0;
} }
size_t getResidentMemoryBytes() { size_t getResidentMemoryBytes() {
return 0; return 0;
} }
@ -591,12 +638,12 @@ namespace embree
namespace embree namespace embree
{ {
unsigned int getNumberOfLogicalThreads() unsigned int getNumberOfLogicalThreads()
{ {
static int nThreads = -1; static int nThreads = -1;
if (nThreads != -1) return nThreads; if (nThreads != -1) return nThreads;
#if defined(__MACOSX__) #if defined(__MACOSX__) || defined(__ANDROID__)
nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container
assert(nThreads); assert(nThreads);
#else #else
@ -604,12 +651,12 @@ namespace embree
if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
nThreads = CPU_COUNT(&set); nThreads = CPU_COUNT(&set);
#endif #endif
assert(nThreads); assert(nThreads);
return nThreads; return nThreads;
} }
int getTerminalWidth() int getTerminalWidth()
{ {
struct winsize info; struct winsize info;
if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80;

View file

@ -59,7 +59,12 @@
# define isa sse # define isa sse
# define ISA SSE # define ISA SSE
# define ISA_STR "SSE" # define ISA_STR "SSE"
#else #elif defined(__ARM_NEON)
// NOTE(LTE): Use sse2 for `isa` for compatibility for the time being.
#define isa sse2
#define ISA NEON
#define ISA_STR "NEON"
#else
#error Unknown ISA #error Unknown ISA
#endif #endif
@ -87,6 +92,7 @@ namespace embree
NEHALEM, NEHALEM,
CORE2, CORE2,
CORE1, CORE1,
ARM,
UNKNOWN, UNKNOWN,
}; };
@ -114,7 +120,7 @@ namespace embree
static const int CPU_FEATURE_SSE3 = 1 << 2; static const int CPU_FEATURE_SSE3 = 1 << 2;
static const int CPU_FEATURE_SSSE3 = 1 << 3; static const int CPU_FEATURE_SSSE3 = 1 << 3;
static const int CPU_FEATURE_SSE41 = 1 << 4; static const int CPU_FEATURE_SSE41 = 1 << 4;
static const int CPU_FEATURE_SSE42 = 1 << 5; static const int CPU_FEATURE_SSE42 = 1 << 5;
static const int CPU_FEATURE_POPCNT = 1 << 6; static const int CPU_FEATURE_POPCNT = 1 << 6;
static const int CPU_FEATURE_AVX = 1 << 7; static const int CPU_FEATURE_AVX = 1 << 7;
static const int CPU_FEATURE_F16C = 1 << 8; static const int CPU_FEATURE_F16C = 1 << 8;
@ -125,7 +131,7 @@ namespace embree
static const int CPU_FEATURE_BMI1 = 1 << 13; static const int CPU_FEATURE_BMI1 = 1 << 13;
static const int CPU_FEATURE_BMI2 = 1 << 14; static const int CPU_FEATURE_BMI2 = 1 << 14;
static const int CPU_FEATURE_AVX512F = 1 << 16; static const int CPU_FEATURE_AVX512F = 1 << 16;
static const int CPU_FEATURE_AVX512DQ = 1 << 17; static const int CPU_FEATURE_AVX512DQ = 1 << 17;
static const int CPU_FEATURE_AVX512PF = 1 << 18; static const int CPU_FEATURE_AVX512PF = 1 << 18;
static const int CPU_FEATURE_AVX512ER = 1 << 19; static const int CPU_FEATURE_AVX512ER = 1 << 19;
static const int CPU_FEATURE_AVX512CD = 1 << 20; static const int CPU_FEATURE_AVX512CD = 1 << 20;
@ -136,7 +142,9 @@ namespace embree
static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
static const int CPU_FEATURE_NEON = 1 << 28;
static const int CPU_FEATURE_NEON_2X = 1 << 29;
/*! get CPU features */ /*! get CPU features */
int getCPUFeatures(); int getCPUFeatures();
@ -147,7 +155,7 @@ namespace embree
std::string supportedTargetList (int isa); std::string supportedTargetList (int isa);
/*! ISAs */ /*! ISAs */
static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED;
static const int SSE2 = SSE | CPU_FEATURE_SSE2; static const int SSE2 = SSE | CPU_FEATURE_SSE2;
static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; static const int SSE3 = SSE2 | CPU_FEATURE_SSE3;
static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3;
@ -158,6 +166,8 @@ namespace embree
static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED;
static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2;
static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2;
/*! converts ISA bitvector into a string */ /*! converts ISA bitvector into a string */
std::string stringOfISA(int features); std::string stringOfISA(int features);

View file

@ -6,7 +6,11 @@
#include "string.h" #include "string.h"
#include <iostream> #include <iostream>
#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"
#else
#include <xmmintrin.h> #include <xmmintrin.h>
#endif
#if defined(PTHREADS_WIN32) #if defined(PTHREADS_WIN32)
#pragma comment (lib, "pthreadVC.lib") #pragma comment (lib, "pthreadVC.lib")
@ -35,7 +39,7 @@ namespace embree
GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount");
SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity");
SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx");
if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx)
{ {
int groups = pGetActiveProcessorGroupCount(); int groups = pGetActiveProcessorGroupCount();
int totalProcessors = 0, group = 0, number = 0; int totalProcessors = 0, group = 0, number = 0;
@ -48,7 +52,7 @@ namespace embree
} }
totalProcessors += processors; totalProcessors += processors;
} }
GROUP_AFFINITY groupAffinity; GROUP_AFFINITY groupAffinity;
groupAffinity.Group = (WORD)group; groupAffinity.Group = (WORD)group;
groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
@ -57,15 +61,15 @@ namespace embree
groupAffinity.Reserved[2] = 0; groupAffinity.Reserved[2] = 0;
if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
PROCESSOR_NUMBER processorNumber; PROCESSOR_NUMBER processorNumber;
processorNumber.Group = group; processorNumber.Group = group;
processorNumber.Number = number; processorNumber.Number = number;
processorNumber.Reserved = 0; processorNumber.Reserved = 0;
if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
} }
else else
{ {
if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
WARNING("SetThreadAffinityMask failed"); // on purpose only a warning WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
@ -79,10 +83,10 @@ namespace embree
setAffinity(GetCurrentThread(), affinity); setAffinity(GetCurrentThread(), affinity);
} }
struct ThreadStartupData struct ThreadStartupData
{ {
public: public:
ThreadStartupData (thread_func f, void* arg) ThreadStartupData (thread_func f, void* arg)
: f(f), arg(arg) {} : f(f), arg(arg) {}
public: public:
thread_func f; thread_func f;
@ -95,6 +99,7 @@ namespace embree
_mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
parg->f(parg->arg); parg->f(parg->arg);
delete parg; delete parg;
parg = nullptr;
return 0; return 0;
} }
@ -120,12 +125,6 @@ namespace embree
CloseHandle(HANDLE(tid)); CloseHandle(HANDLE(tid));
} }
/*! destroy a hardware thread by its handle */
void destroyThread(thread_t tid) {
TerminateThread(HANDLE(tid),0);
CloseHandle(HANDLE(tid));
}
/*! creates thread local storage */ /*! creates thread local storage */
tls_t createTls() { tls_t createTls() {
return tls_t(size_t(TlsAlloc())); return tls_t(size_t(TlsAlloc()));
@ -160,16 +159,21 @@ namespace embree
#include <sstream> #include <sstream>
#include <algorithm> #include <algorithm>
#if defined(__ANDROID__)
#include <pthread.h>
#endif
namespace embree namespace embree
{ {
static MutexSys mutex; static MutexSys mutex;
static std::vector<size_t> threadIDs; static std::vector<size_t> threadIDs;
#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target
/* changes thread ID mapping such that we first fill up all thread on one core */ /* changes thread ID mapping such that we first fill up all thread on one core */
size_t mapThreadID(size_t threadID) size_t mapThreadID(size_t threadID)
{ {
Lock<MutexSys> lock(mutex); Lock<MutexSys> lock(mutex);
if (threadIDs.size() == 0) if (threadIDs.size() == 0)
{ {
/* parse thread/CPU topology */ /* parse thread/CPU topology */
@ -181,11 +185,11 @@ namespace embree
if (fs.fail()) break; if (fs.fail()) break;
int i; int i;
while (fs >> i) while (fs >> i)
{ {
if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; }))
threadIDs.push_back(i); threadIDs.push_back(i);
if (fs.peek() == ',') if (fs.peek() == ',')
fs.ignore(); fs.ignore();
} }
fs.close(); fs.close();
@ -229,16 +233,21 @@ namespace embree
return ID; return ID;
} }
#endif
/*! set affinity of the calling thread */ /*! set affinity of the calling thread */
void setAffinity(ssize_t affinity) void setAffinity(ssize_t affinity)
{ {
#if defined(__ANDROID__)
// TODO(LTE): Implement
#else
cpu_set_t cset; cpu_set_t cset;
CPU_ZERO(&cset); CPU_ZERO(&cset);
size_t threadID = mapThreadID(affinity); size_t threadID = mapThreadID(affinity);
CPU_SET(threadID, &cset); CPU_SET(threadID, &cset);
pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
#endif
} }
} }
#endif #endif
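The Android TODO above could be filled with sched_setaffinity, which bionic exposes even though it lacks pthread_setaffinity_np. A hedged sketch, not part of this patch (pid 0 means the calling thread; best effort only):

#include <sched.h>

static void setAffinityAndroid(int cpu) {
    cpu_set_t cset;
    CPU_ZERO(&cset);
    CPU_SET(cpu, &cset);
    sched_setaffinity(0, sizeof(cset), &cset);
}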
@ -303,21 +312,21 @@ namespace embree
namespace embree namespace embree
{ {
struct ThreadStartupData struct ThreadStartupData
{ {
public: public:
ThreadStartupData (thread_func f, void* arg, int affinity) ThreadStartupData (thread_func f, void* arg, int affinity)
: f(f), arg(arg), affinity(affinity) {} : f(f), arg(arg), affinity(affinity) {}
public: public:
thread_func f; thread_func f;
void* arg; void* arg;
ssize_t affinity; ssize_t affinity;
}; };
static void* threadStartup(ThreadStartupData* parg) static void* threadStartup(ThreadStartupData* parg)
{ {
_mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
/*! Mac OS X does not support setting affinity at thread creation time */ /*! Mac OS X does not support setting affinity at thread creation time */
#if defined(__MACOSX__) #if defined(__MACOSX__)
if (parg->affinity >= 0) if (parg->affinity >= 0)
@ -326,6 +335,7 @@ namespace embree
parg->f(parg->arg); parg->f(parg->arg);
delete parg; delete parg;
parg = nullptr;
return nullptr; return nullptr;
} }
@ -341,13 +351,13 @@ namespace embree
pthread_t* tid = new pthread_t; pthread_t* tid = new pthread_t;
if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) {
pthread_attr_destroy(&attr); pthread_attr_destroy(&attr);
delete tid; delete tid;
FATAL("pthread_create failed"); FATAL("pthread_create failed");
} }
pthread_attr_destroy(&attr); pthread_attr_destroy(&attr);
/* set affinity */ /* set affinity */
#if defined(__LINUX__) #if defined(__LINUX__) && !defined(__ANDROID__)
if (threadID >= 0) { if (threadID >= 0) {
cpu_set_t cset; cpu_set_t cset;
CPU_ZERO(&cset); CPU_ZERO(&cset);
@ -379,14 +389,8 @@ namespace embree
delete (pthread_t*)tid; delete (pthread_t*)tid;
} }
/*! destroy a hardware thread by its handle */
void destroyThread(thread_t tid) {
pthread_cancel(*(pthread_t*)tid);
delete (pthread_t*)tid;
}
/*! creates thread local storage */ /*! creates thread local storage */
tls_t createTls() tls_t createTls()
{ {
pthread_key_t* key = new pthread_key_t; pthread_key_t* key = new pthread_key_t;
if (pthread_key_create(key,nullptr) != 0) { if (pthread_key_create(key,nullptr) != 0) {
@ -398,14 +402,14 @@ namespace embree
} }
/*! return the thread local storage pointer */ /*! return the thread local storage pointer */
void* getTls(tls_t tls) void* getTls(tls_t tls)
{ {
assert(tls); assert(tls);
return pthread_getspecific(*(pthread_key_t*)tls); return pthread_getspecific(*(pthread_key_t*)tls);
} }
/*! set the thread local storage pointer */ /*! set the thread local storage pointer */
void setTls(tls_t tls, void* const ptr) void setTls(tls_t tls, void* const ptr)
{ {
assert(tls); assert(tls);
if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0)
@ -413,7 +417,7 @@ namespace embree
} }
/*! destroys thread local storage identifier */ /*! destroys thread local storage identifier */
void destroyTls(tls_t tls) void destroyTls(tls_t tls)
{ {
assert(tls); assert(tls);
if (pthread_key_delete(*(pthread_key_t*)tls) != 0) if (pthread_key_delete(*(pthread_key_t*)tls) != 0)

View file

@ -29,9 +29,6 @@ namespace embree
/*! waits until the given thread has terminated */ /*! waits until the given thread has terminated */
void join(thread_t tid); void join(thread_t tid);
/*! destroy handle of a thread */
void destroyThread(thread_t tid);
/*! type for handle to thread local storage */ /*! type for handle to thread local storage */
typedef struct opaque_tls_t* tls_t; typedef struct opaque_tls_t* tls_t;

View file

@ -5,6 +5,8 @@
#if defined(TASKING_INTERNAL) #if defined(TASKING_INTERNAL)
# include "taskschedulerinternal.h" # include "taskschedulerinternal.h"
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
# include "taskschedulergcd.h"
#elif defined(TASKING_TBB) #elif defined(TASKING_TBB)
# include "taskschedulertbb.h" # include "taskschedulertbb.h"
#elif defined(TASKING_PPL) #elif defined(TASKING_PPL)

View file

@ -0,0 +1,49 @@
#pragma once
#include "../sys/platform.h"
#include "../sys/alloc.h"
#include "../sys/barrier.h"
#include "../sys/thread.h"
#include "../sys/mutex.h"
#include "../sys/condition.h"
#include "../sys/ref.h"
#include <dispatch/dispatch.h>
namespace embree
{
struct TaskScheduler
{
/*! initializes the task scheduler */
static void create(size_t numThreads, bool set_affinity, bool start_threads);
/*! destroys the task scheduler again */
static void destroy() {}
/* returns the ID of the current thread */
static __forceinline size_t threadID()
{
return threadIndex();
}
/* returns the index (0..threadCount-1) of the current thread */
static __forceinline size_t threadIndex()
{
currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads;
return currentThreadIndex;
}
/* returns the total number of threads */
static __forceinline size_t threadCount()
{
return GCDNumThreads;
}
private:
static size_t GCDNumThreads;
static size_t currentThreadIndex;
};
};
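Note that this GCD backend does not track real worker identity: threadID()/threadIndex() above simply hand out indices round-robin from a shared counter. A minimal sketch of that indexing scheme (RoundRobinIndex is a hypothetical name; the header keeps the counters as private statics):

    #include <cstddef>

    // Hands out indices in [0, numThreads), wrapping around. As in the
    // header above, this is not synchronized: concurrent callers can
    // observe duplicate or skipped indices.
    struct RoundRobinIndex
    {
      explicit RoundRobinIndex(std::size_t numThreads)
        : numThreads(numThreads), current(0) {}

      std::size_t next()
      {
        current = (current + 1) % numThreads;
        return current;
      }

      std::size_t numThreads;
      std::size_t current;
    };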

View file

@ -48,13 +48,15 @@ namespace embree
{ {
Task* prevTask = thread.task; Task* prevTask = thread.task;
thread.task = this; thread.task = this;
try { // -- GODOT start --
if (thread.scheduler->cancellingException == nullptr) // try {
// if (thread.scheduler->cancellingException == nullptr)
closure->execute(); closure->execute();
} catch (...) { // } catch (...) {
if (thread.scheduler->cancellingException == nullptr) // if (thread.scheduler->cancellingException == nullptr)
thread.scheduler->cancellingException = std::current_exception(); // thread.scheduler->cancellingException = std::current_exception();
} // }
// -- GODOT end --
thread.task = prevTask; thread.task = prevTask;
add_dependencies(-1); add_dependencies(-1);
} }
@ -152,6 +154,12 @@ namespace embree
assert(newNumThreads); assert(newNumThreads);
newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
// We are observing a few % gain from doubling the number of threads on aarch64.
#if defined(__aarch64__) && defined(BUILD_IOS)
numThreads = newNumThreads*2;
#else
numThreads = newNumThreads;
#endif
numThreads = newNumThreads; numThreads = newNumThreads;
if (!startThreads && !running) return; if (!startThreads && !running) return;
running = true; running = true;
@ -291,8 +299,11 @@ namespace embree
size_t threadIndex = allocThreadIndex(); size_t threadIndex = allocThreadIndex();
condition.wait(mutex, [&] () { return hasRootTask.load(); }); condition.wait(mutex, [&] () { return hasRootTask.load(); });
mutex.unlock(); mutex.unlock();
std::exception_ptr except = thread_loop(threadIndex); // -- GODOT start --
if (except != nullptr) std::rethrow_exception(except); // std::exception_ptr except = thread_loop(threadIndex);
// if (except != nullptr) std::rethrow_exception(except);
thread_loop(threadIndex);
// -- GODOT end --
} }
void TaskScheduler::reset() { void TaskScheduler::reset() {
@ -324,7 +335,10 @@ namespace embree
return thread->scheduler->cancellingException == nullptr; return thread->scheduler->cancellingException == nullptr;
} }
std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) // -- GODOT start --
// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
void TaskScheduler::thread_loop(size_t threadIndex)
// -- GODOT end --
{ {
/* allocate thread structure */ /* allocate thread structure */
std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
@ -347,9 +361,10 @@ namespace embree
swapThread(oldThread); swapThread(oldThread);
/* remember exception to throw */ /* remember exception to throw */
std::exception_ptr except = nullptr; // -- GODOT start --
if (cancellingException != nullptr) except = cancellingException; // std::exception_ptr except = nullptr;
// if (cancellingException != nullptr) except = cancellingException;
// -- GODOT end --
/* wait for all threads to terminate */ /* wait for all threads to terminate */
threadCounter--; threadCounter--;
#if defined(__WIN32__) #if defined(__WIN32__)
@ -367,7 +382,10 @@ namespace embree
yield(); yield();
#endif #endif
} }
return except; // -- GODOT start --
// return except;
return;
// -- GODOT end --
} }
bool TaskScheduler::steal_from_other_threads(Thread& thread) bool TaskScheduler::steal_from_other_threads(Thread& thread)
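All of the GODOT-marked edits in this file follow one rule: Godot compiles the bundled Embree with C++ exceptions disabled, so the throw/catch paths either run the closure unguarded or become a hard abort. A minimal sketch of the substitution (check_capacity is a hypothetical name; the real sites abort without printing):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // was: if (used > capacity) throw std::runtime_error("task stack overflow");
    void check_capacity(std::size_t used, std::size_t capacity)
    {
      if (used > capacity) {
        // With -fno-exceptions there is no unwinding to do, so fail hard.
        std::fprintf(stderr, "task stack overflow\n");
        std::abort();
      }
    }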

View file

@ -123,7 +123,10 @@ namespace embree
{ {
size_t ofs = bytes + ((align - stackPtr) & (align-1)); size_t ofs = bytes + ((align - stackPtr) & (align-1));
if (stackPtr + ofs > CLOSURE_STACK_SIZE) if (stackPtr + ofs > CLOSURE_STACK_SIZE)
throw std::runtime_error("closure stack overflow"); // -- GODOT start --
// throw std::runtime_error("closure stack overflow");
abort();
// -- GODOT end --
stackPtr += ofs; stackPtr += ofs;
return &stack[stackPtr-bytes]; return &stack[stackPtr-bytes];
} }
@ -132,12 +135,16 @@ namespace embree
__forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
{ {
if (right >= TASK_STACK_SIZE) if (right >= TASK_STACK_SIZE)
throw std::runtime_error("task stack overflow"); // -- GODOT start --
// throw std::runtime_error("task stack overflow");
abort();
// -- GODOT end --
/* allocate new task on right side of stack */ /* allocate new task on right side of stack */
size_t oldStackPtr = stackPtr; size_t oldStackPtr = stackPtr;
TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); /* gcc 8 or later fails to compile without explicit .load() */
new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
right++; right++;
/* also move left pointer */ /* also move left pointer */
@ -238,7 +245,10 @@ namespace embree
void wait_for_threads(size_t threadCount); void wait_for_threads(size_t threadCount);
/*! thread loop for all worker threads */ /*! thread loop for all worker threads */
std::exception_ptr thread_loop(size_t threadIndex); // -- GODOT start --
// std::exception_ptr thread_loop(size_t threadIndex);
void thread_loop(size_t threadIndex);
// -- GODOT end --
/*! steals a task from a different thread */ /*! steals a task from a different thread */
bool steal_from_other_threads(Thread& thread); bool steal_from_other_threads(Thread& thread);
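The explicit right.load() above works around a gcc 8+ compile failure when the std::atomic counter is used directly as the array index inside the placement-new expression (per the comment in the patch). Isolated, the shape of the fix looks like this (names hypothetical):

    #include <atomic>
    #include <cstddef>
    #include <new>

    struct Task { explicit Task(int p) : payload(p) {} int payload; };

    alignas(Task) static unsigned char storage[64 * sizeof(Task)];
    static Task* const tasks = reinterpret_cast<Task*>(storage);
    static std::atomic<std::size_t> right{0};

    void push(int p)   // no overflow check in this sketch
    {
      // Spelling out the atomic read keeps gcc 8+ happy where the implicit
      // conversion in tasks[right] does not.
      new (&tasks[right.load()]) Task(p);
      right++;
    }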

View file

@ -12,13 +12,7 @@
#include "../sys/ref.h" #include "../sys/ref.h"
#if defined(__WIN32__) #if defined(__WIN32__)
// -- GODOT start --
#if !defined(NOMINMAX)
// -- GODOT end --
# define NOMINMAX # define NOMINMAX
// -- GODOT start --
#endif
// -- GODOT end --
#endif #endif
// We need to define these to avoid implicit linkage against // We need to define these to avoid implicit linkage against

View file

@ -19,7 +19,7 @@ typedef int ssize_t;
#endif #endif
#endif #endif
#if defined(_WIN32) && defined(_MSC_VER) #if defined(_WIN32) && !defined(__MINGW32__)
# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
#else #else
# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) # define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
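The widened condition above sends MinGW (which defines _WIN32 but targets GCC, where the __attribute__ form is the reliable spelling) down the attribute branch. The macro logic in isolation, with DEMO_ALIGN as a stand-in name:

    #if defined(_WIN32) && !defined(__MINGW32__)
    #  define DEMO_ALIGN(n) __declspec(align(n))
    #else
    #  define DEMO_ALIGN(n) __attribute__((aligned(n)))
    #endif

    struct DEMO_ALIGN(16) Query
    {
      float x, y, z, time;
    };

    static_assert(alignof(Query) == 16, "Query must be 16-byte aligned");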
@ -35,7 +35,7 @@ typedef int ssize_t;
#endif #endif
#endif #endif
#if defined(_WIN32) #if defined(_WIN32)
# define RTC_FORCEINLINE __forceinline # define RTC_FORCEINLINE __forceinline
#else #else
# define RTC_FORCEINLINE inline __attribute__((always_inline)) # define RTC_FORCEINLINE inline __attribute__((always_inline))
@ -224,13 +224,13 @@ RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context
} }
/* Point query structure for closest point query */ /* Point query structure for closest point query */
struct RTC_ALIGN(16) RTCPointQuery struct RTC_ALIGN(16) RTCPointQuery
{ {
float x; // x coordinate of the query point float x; // x coordinate of the query point
float y; // y coordinate of the query point float y; // y coordinate of the query point
float z; // z coordinate of the query point float z; // z coordinate of the query point
float time; // time of the point query float time; // time of the point query
float radius; // radius of the point query float radius; // radius of the point query
}; };
/* Structure of a packet of 4 query points */ /* Structure of a packet of 4 query points */
@ -250,7 +250,7 @@ struct RTC_ALIGN(32) RTCPointQuery8
float y[8]; // y coordinate of the query point float y[8]; // y coordinate of the query point
float z[8]; // z coordinate of the query point float z[8]; // z coordinate of the query point
float time[8]; // time of the point query float time[8]; // time of the point query
float radius[8]; // radius of the point query float radius[8]; // radius of the point query
}; };
/* Structure of a packet of 16 query points */ /* Structure of a packet of 16 query points */
@ -269,11 +269,11 @@ struct RTC_ALIGN(16) RTCPointQueryContext
{ {
// accumulated 4x4 column major matrices from world space to instance space. // accumulated 4x4 column major matrices from world space to instance space.
// undefined if size == 0. // undefined if size == 0.
float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
// accumulated 4x4 column major matrices from instance space to world space. // accumulated 4x4 column major matrices from instance space to world space.
// undefined if size == 0. // undefined if size == 0.
float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
// instance ids. // instance ids.
unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
@ -301,13 +301,13 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
void* userPtr; void* userPtr;
// primitive and geometry ID of primitive // primitive and geometry ID of primitive
unsigned int primID; unsigned int primID;
unsigned int geomID; unsigned int geomID;
// the context with transformation and instance ID stack // the context with transformation and instance ID stack
struct RTCPointQueryContext* context; struct RTCPointQueryContext* context;
// If the current instance transform M (= context->world2inst[context->instStackSize]) // If the current instance transform M (= context->world2inst[context->instStackSize])
// is a similarity matrix, i.e there is a constant factor similarityScale such that, // is a similarity matrix, i.e there is a constant factor similarityScale such that,
// for all x,y: dist(Mx, My) = similarityScale * dist(x, y), // for all x,y: dist(Mx, My) = similarityScale * dist(x, y),
// The similarity scale is 0, if the current instance transform is not a // The similarity scale is 0, if the current instance transform is not a
@ -322,5 +322,5 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
}; };
typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
RTC_NAMESPACE_END RTC_NAMESPACE_END

View file

@ -43,7 +43,7 @@ namespace embree
{ {
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(settings.sahBlockSize); if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast<size_t>(settings.sahBlockSize));
if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost;

View file

@ -51,7 +51,7 @@ namespace embree
template<int N> template<int N>
void BVHN<N>::layoutLargeNodes(size_t num) void BVHN<N>::layoutLargeNodes(size_t num)
{ {
#if defined(__X86_64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues #if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
struct NodeArea struct NodeArea
{ {
__forceinline NodeArea() {} __forceinline NodeArea() {}
@ -183,7 +183,7 @@ namespace embree
template class BVHN<8>; template class BVHN<8>;
#endif #endif
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) #if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
template class BVHN<4>; template class BVHN<4>;
#endif #endif
} }

View file

@ -81,7 +81,7 @@ namespace embree
struct CreateAlloc : public FastAllocator::Create { struct CreateAlloc : public FastAllocator::Create {
__forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {}
}; };
typedef BVHNodeRecord<NodeRef> NodeRecord; typedef BVHNodeRecord<NodeRef> NodeRecord;
typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;

View file

@ -18,7 +18,7 @@
#include "../geometry/object.h" #include "../geometry/object.h"
#include "../geometry/instance.h" #include "../geometry/instance.h"
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform # define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
#else #else
# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues # define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues

View file

@ -172,12 +172,23 @@ namespace embree
TravRayKStream<K,robust> &p = packets[rayID / K]; TravRayKStream<K,robust> &p = packets[rayID / K];
const size_t i = rayID % K; const size_t i = rayID % K;
const vint<Nx> bitmask(shiftTable[rayID]); const vint<Nx> bitmask(shiftTable[rayID]);
#if defined (__aarch64__)
const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
const vfloat<Nx> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
const vfloat<Nx> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
const vfloat<Nx> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
#else
const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
#endif
const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i])); const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i])); const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));
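The aarch64 branches here and in the node intersector files below all rest on one identity: the slab term (b - org) * rdir equals b*rdir - org*rdir. x86 evaluates that with a fused msub; NEON has no fused op of exactly that shape (per the comment in the patch), so neg_org_rdir = -(org * rdir) is precomputed once per ray and a fused madd is used instead. In scalar form, with std::fma standing in for both fused ops:

    #include <cassert>
    #include <cmath>

    int main()
    {
      const float b = 2.5f, org = 1.0f, rdir = 4.0f;

      const float org_rdir     = org * rdir;  // precomputed per ray (x86 path)
      const float neg_org_rdir = -org_rdir;   // precomputed per ray (aarch64 path)

      const float msub_form = std::fma(b, rdir, -org_rdir);    // msub(b, rdir, org_rdir)
      const float madd_form = std::fma(b, rdir, neg_org_rdir); // madd(b, rdir, neg_org_rdir)

      // IEEE negation is exact, so both forms round identically.
      assert(msub_form == madd_form);
      return 0;
    }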

View file

@ -102,7 +102,7 @@ namespace embree
/*! Sets the barrier bit. */ /*! Sets the barrier bit. */
__forceinline void setBarrier() { __forceinline void setBarrier() {
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
assert(!isBarrier()); assert(!isBarrier());
ptr |= barrier_mask; ptr |= barrier_mask;
#else #else
@ -112,7 +112,7 @@ namespace embree
/*! Clears the barrier bit. */ /*! Clears the barrier bit. */
__forceinline void clearBarrier() { __forceinline void clearBarrier() {
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
ptr &= ~barrier_mask; ptr &= ~barrier_mask;
#else #else
assert(false); assert(false);
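Both hunks extend the 64-bit-only guard to aarch64 because the barrier flag is stored in a spare high bit of the node pointer itself; only 64-bit targets have such a bit free. A sketch of the tagging scheme, assuming barrier_mask is the top pointer bit:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      static_assert(sizeof(void*) == 8, "pointer tagging needs 64-bit pointers");

      const std::uint64_t barrier_mask = std::uint64_t(1) << 63;
      std::uint64_t ptr = 0x00007f00deadbee0u; // stand-in for a NodeRef

      ptr |= barrier_mask;                     // setBarrier()
      assert((ptr & barrier_mask) != 0);       // isBarrier()

      ptr &= ~barrier_mask;                    // clearBarrier()
      assert(ptr == 0x00007f00deadbee0u);
      return 0;
    }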

View file

@ -150,7 +150,10 @@ namespace embree
} }
} }
else { else {
throw std::runtime_error("not supported node type in bvh_statistics"); // -- GODOT start --
// throw std::runtime_error("not supported node type in bvh_statistics");
abort();
// -- GODOT end --
} }
return s; return s;
} }
@ -159,7 +162,7 @@ namespace embree
template class BVHNStatistics<8>; template class BVHNStatistics<8>;
#endif #endif
#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) #if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
template class BVHNStatistics<4>; template class BVHNStatistics<4>;
#endif #endif
} }

View file

@ -5,6 +5,15 @@
#include "node_intersector.h" #include "node_intersector.h"
#if defined(__AVX2__)
#define __FMA_X4__
#endif
#if defined(__aarch64__)
#define __FMA_X4__
#endif
namespace embree namespace embree
{ {
namespace isa namespace isa
@ -29,9 +38,15 @@ namespace embree
org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
#if defined(__AVX2__) #if defined(__FMA_X4__)
const Vec3fa ray_org_rdir = ray_org*ray_rdir; const Vec3fa ray_org_rdir = ray_org*ray_rdir;
#if !defined(__aarch64__)
org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
#else
//for aarch64, we do not have an msub-equivalent instruction, so we negate org and use madd
//x86 will use msub
neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
#endif
#endif #endif
nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
@ -59,8 +74,12 @@ namespace embree
org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
#if defined(__AVX2__) #if defined(__FMA_X4__)
org_rdir = org*rdir; #if !defined(__aarch64__)
org_rdir = org*rdir;
#else
neg_org_rdir = -(org*rdir);
#endif
#endif #endif
nearX = nearXYZ.x[k]; nearX = nearXYZ.x[k];
nearY = nearXYZ.y[k]; nearY = nearXYZ.y[k];
@ -81,8 +100,14 @@ namespace embree
Vec3fa org_xyz, dir_xyz; Vec3fa org_xyz, dir_xyz;
Vec3vf<Nx> org, dir, rdir; Vec3vf<Nx> org, dir, rdir;
#if defined(__AVX2__) #if defined(__FMA_X4__)
#if !defined(__aarch64__)
Vec3vf<Nx> org_rdir; Vec3vf<Nx> org_rdir;
#else
//the aarch64 version keeps the negation of org_rdir and uses madd
//x86 uses msub
Vec3vf<Nx> neg_org_rdir;
#endif
#endif #endif
#if defined(__AVX512ER__) // KNL+ #if defined(__AVX512ER__) // KNL+
vint16 permX, permY, permZ; vint16 permX, permY, permZ;
@ -110,7 +135,6 @@ namespace embree
dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
@ -447,13 +471,22 @@ namespace embree
template<> template<>
__forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
{ {
#if defined(__AVX2__) #if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@ -462,8 +495,13 @@ namespace embree
const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
#endif #endif
#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW #if defined(__aarch64__)
const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
const vbool4 vmask = asInt(tNear) <= asInt(tFar);
const size_t mask = movemask(vmask);
#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool4 vmask = asInt(tNear) > asInt(tFar); const vbool4 vmask = asInt(tNear) > asInt(tFar);
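The new aarch64 branch above compares the slab distances as reinterpreted integers: for non-negative IEEE-754 floats the bit patterns order the same way as the values, so the integer compare matches the float compare while staying cheap on NEON. A scalar sketch of the asInt trick, assuming non-negative inputs:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Reinterpret the float's bits as a signed 32-bit integer (what asInt does).
    static std::int32_t as_int(float f)
    {
      std::int32_t i;
      std::memcpy(&i, &f, sizeof(i));
      return i;
    }

    int main()
    {
      const float tNear = 0.25f, tFar = 3.0f;
      assert((as_int(tNear) <= as_int(tFar)) == (tNear <= tFar));
      return 0;
    }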
@ -489,12 +527,22 @@ namespace embree
__forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
{ {
#if defined(__AVX2__) #if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@ -638,13 +686,22 @@ namespace embree
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
#if defined(__AVX2__) #if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@ -653,7 +710,7 @@ namespace embree
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
#endif #endif
#if defined(__AVX2__) && !defined(__AVX512F__) // HSW #if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool<N> vmask = asInt(tNear) > asInt(tFar); const vbool<N> vmask = asInt(tNear) > asInt(tFar);
@ -714,13 +771,22 @@ namespace embree
const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
#if defined (__AVX2__) #if defined (__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@ -729,7 +795,7 @@ namespace embree
const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
#endif #endif
#if defined(__AVX2__) && !defined(__AVX512F__) #if defined(__FMA_X4__) && !defined(__AVX512F__)
const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
#else #else
@ -803,13 +869,22 @@ namespace embree
const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z);
#if defined(__AVX2__) #if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@ -819,7 +894,7 @@ namespace embree
const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
#endif #endif
#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW #if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
const vbool4 vmask = asInt(tNear) > asInt(tFar); const vbool4 vmask = asInt(tNear) > asInt(tFar);
@ -892,12 +967,21 @@ namespace embree
const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z);
#if defined(__AVX2__) #if defined(__AVX2__)
#if defined(__aarch64__)
const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@ -1078,13 +1162,22 @@ namespace embree
const vfloat<N> upper_y = node->dequantizeUpperY(time); const vfloat<N> upper_y = node->dequantizeUpperY(time);
const vfloat<N> lower_z = node->dequantizeLowerZ(time); const vfloat<N> lower_z = node->dequantizeLowerZ(time);
const vfloat<N> upper_z = node->dequantizeUpperZ(time); const vfloat<N> upper_z = node->dequantizeUpperZ(time);
#if defined(__AVX2__) #if defined(__FMA_X4__)
#if defined(__aarch64__)
const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
#endif
#else #else
const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;

View file

@ -81,9 +81,13 @@ namespace embree
min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
#if defined (__aarch64__)
neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
#else
min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
#endif
min_dist = reduced_min_dist; min_dist = reduced_min_dist;
max_dist = reduced_max_dist; max_dist = reduced_max_dist;
@ -101,9 +105,13 @@ namespace embree
Vec3fa min_rdir; Vec3fa min_rdir;
Vec3fa max_rdir; Vec3fa max_rdir;
#if defined (__aarch64__)
Vec3fa neg_min_org_rdir;
Vec3fa neg_max_org_rdir;
#else
Vec3fa min_org_rdir; Vec3fa min_org_rdir;
Vec3fa max_org_rdir; Vec3fa max_org_rdir;
#endif
float min_dist; float min_dist;
float max_dist; float max_dist;
}; };
@ -203,13 +211,21 @@ namespace embree
const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
#if defined (__aarch64__)
const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x));
const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y));
const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z));
const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x));
const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y));
const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z));
#else
const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x)); const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x));
const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y)); const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y));
const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z)); const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z));
const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x)); const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x));
const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y)); const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y));
const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z)); const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z));
#endif
const vfloat<Nx> fmin = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); const vfloat<Nx> fmin = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist));
dist = fmin; dist = fmin;
const vfloat<Nx> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); const vfloat<Nx> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist));

View file

@ -39,10 +39,11 @@ namespace embree
org = ray_org; org = ray_org;
dir = ray_dir; dir = ray_dir;
rdir = rcp_safe(ray_dir); rdir = rcp_safe(ray_dir);
#if defined(__AVX2__) #if defined(__aarch64__)
neg_org_rdir = -(org * rdir);
#elif defined(__AVX2__)
org_rdir = org * rdir; org_rdir = org * rdir;
#endif #endif
if (N) if (N)
{ {
const int size = sizeof(float)*N; const int size = sizeof(float)*N;
@ -55,7 +56,9 @@ namespace embree
Vec3vf<K> org; Vec3vf<K> org;
Vec3vf<K> dir; Vec3vf<K> dir;
Vec3vf<K> rdir; Vec3vf<K> rdir;
#if defined(__AVX2__) #if defined(__aarch64__)
Vec3vf<K> neg_org_rdir;
#elif defined(__AVX2__)
Vec3vf<K> org_rdir; Vec3vf<K> org_rdir;
#endif #endif
Vec3vi<K> nearXYZ; Vec3vi<K> nearXYZ;
@ -119,7 +122,14 @@ namespace embree
const TravRayKFast<K>& ray, vfloat<K>& dist) const TravRayKFast<K>& ray, vfloat<K>& dist)
{ {
#if defined(__AVX2__) #if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
@ -199,7 +209,14 @@ namespace embree
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
#if defined(__AVX2__) #if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@ -302,7 +319,14 @@ namespace embree
const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
#if defined(__AVX2__) #if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@ -464,7 +488,14 @@ namespace embree
const vfloat<N> lower_z = node->dequantizeLowerZ(); const vfloat<N> lower_z = node->dequantizeLowerZ();
const vfloat<N> upper_z = node->dequantizeUpperZ(); const vfloat<N> upper_z = node->dequantizeUpperZ();
#if defined(__AVX2__) #if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
@ -549,7 +580,14 @@ namespace embree
const vfloat<K> lower_z = node->dequantizeLowerZ(i,time); const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
const vfloat<K> upper_z = node->dequantizeUpperZ(i,time); const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
#if defined(__AVX2__) #if defined(__aarch64__)
const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
#elif defined(__AVX2__)
const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);

View file

@@ -32,11 +32,19 @@ namespace embree
   __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
   {
     rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+    neg_org_rdir = -(ray_org * rdir);
+#else
     org_rdir = ray_org * rdir;
+#endif
   }
   Vec3vf<K> rdir;
+#if defined(__aarch64__)
+  Vec3vf<K> neg_org_rdir;
+#else
   Vec3vf<K> org_rdir;
+#endif
   vfloat<K> tnear;
   vfloat<K> tfar;
 };
@@ -87,12 +95,21 @@ namespace embree
 const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
 const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+#if defined (__aarch64__)
+ const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+ const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+ const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+ const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+ const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+ const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+#else
 const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
 const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
 const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
 const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
 const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
 const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+#endif
 const vfloat<Nx> rmin = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
 const vfloat<Nx> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
@@ -113,12 +130,21 @@ namespace embree
 const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
 const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+#if defined (__aarch64__)
+ const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
 const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
 const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
 const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
 const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
 const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
 const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
 const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear);
 const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
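For context, both variants implement the classic slab test: the per-axis entry and exit distances are (bound - org) * rdir, expanded to bound*rdir - org*rdir so the second term can be precomputed once per ray in init(). A scalar sketch under that formulation (names are illustrative, not embree's):

    #include <algorithm>
    #include <cstdio>

    // Minimal scalar slab test using the same precomputation as TravRay:
    // t = bound * rdir - org_rdir, with org_rdir = org * rdir.
    struct Ray1 { float org[3], rdir[3], org_rdir[3], tnear, tfar; };

    static bool hitAABB(const Ray1& r, const float lo[3], const float hi[3]) {
        float tmin = r.tnear, tmax = r.tfar;
        for (int a = 0; a < 3; ++a) {
            float t0 = lo[a] * r.rdir[a] - r.org_rdir[a];
            float t1 = hi[a] * r.rdir[a] - r.org_rdir[a];
            tmin = std::max(tmin, std::min(t0, t1));
            tmax = std::min(tmax, std::max(t0, t1));
        }
        return tmin <= tmax;
    }

    int main() {
        Ray1 r{{0, 0, 0}, {1, 1, 1}, {0, 0, 0}, 0.0f, 100.0f};
        const float lo[3] = {1, 1, 1}, hi[3] = {2, 2, 2};
        std::printf("hit: %d\n", hitAABB(r, lo, hi)); // prints 1
        return 0;
    }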

View file

@@ -332,7 +332,7 @@ namespace embree
   intersectorN.intersect(this,rayN,N,context);
 }
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
   const vint<4> mask = valid.mask32();
   intersect4(&mask,(RTCRayHit4&)ray,context);
@@ -388,7 +388,7 @@ namespace embree
   intersectorN.occluded(this,rayN,N,context);
 }
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
   const vint<4> mask = valid.mask32();
   occluded4(&mask,(RTCRay4&)ray,context);

View file

@@ -97,7 +97,7 @@ namespace embree
 for (size_t i=0; i<This->accels.size(); i++) {
   if (This->accels[i]->isEmpty()) continue;
   This->accels[i]->intersectors.occluded4(valid,ray,context);
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 vbool4 valid0 = asBool(((vint4*)valid)[0]);
 vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
 if (unlikely(none(valid0 & hit0))) break;
@@ -111,7 +111,7 @@ namespace embree
 for (size_t i=0; i<This->accels.size(); i++) {
   if (This->accels[i]->isEmpty()) continue;
   This->accels[i]->intersectors.occluded8(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
 vbool4 valid0 = asBool(((vint4*)valid)[0]);
 vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
 vbool4 valid1 = asBool(((vint4*)valid)[1]);
@@ -127,7 +127,7 @@ namespace embree
 for (size_t i=0; i<This->accels.size(); i++) {
   if (This->accels[i]->isEmpty()) continue;
   This->accels[i]->intersectors.occluded16(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
 vbool4 valid0 = asBool(((vint4*)valid)[0]);
 vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
 vbool4 valid1 = asBool(((vint4*)valid)[1]);
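These widened guards work because embree-aarch64 compiles the 4-wide SSE code paths on NEON through an intrinsic-translation layer in the spirit of sse2neon, so __ARM_NEON targets provide the same vfloat4/vint4 types. A hedged sketch of the idea; the sse_* names here are hypothetical and the snippet builds only on AArch64:

    #include <arm_neon.h>
    #include <cstdio>

    // Illustrative only: each 4-wide SSE intrinsic maps onto a NEON
    // counterpart (real layers cover the full instruction set, including
    // the trickier shuffle and comparison semantics).
    static inline float32x4_t sse_add_ps(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); }
    static inline float32x4_t sse_min_ps(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); }

    int main() {
        const float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1};
        float32x4_t r = sse_min_ps(sse_add_ps(vld1q_f32(a), vld1q_f32(b)), vdupq_n_f32(4.0f));
        float out[4];
        vst1q_f32(out, r);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 4 4 4 4
        return 0;
    }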

View file

@ -3,6 +3,9 @@
#include "alloc.h" #include "alloc.h"
#include "../../common/sys/thread.h" #include "../../common/sys/thread.h"
#if defined(__aarch64__) && defined(BUILD_IOS)
#include "../../common/sys/barrier.h"
#endif
namespace embree namespace embree
{ {

View file

@@ -8,6 +8,10 @@
 #include "scene.h"
 #include "primref.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
 namespace embree
 {
   class FastAllocator
@@ -26,7 +30,7 @@ namespace embree
   public:
     struct ThreadLocal2;
-    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
     /*! Per thread structure holding the current memory block. */
     struct __aligned(64) ThreadLocal
@@ -132,7 +136,11 @@ namespace embree
     {
       assert(alloc_i);
       if (alloc.load() == alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(mutex);
+#else
       Lock<SpinLock> lock(mutex);
+#endif
       //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
       if (alloc.load()) {
         alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
@@ -150,7 +158,11 @@ namespace embree
     {
       assert(alloc_i);
       if (alloc.load() != alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(mutex);
+#else
       Lock<SpinLock> lock(mutex);
+#endif
       if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
       alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
       alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
@@ -161,7 +173,11 @@ namespace embree
     }
   public:
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex mutex;
+#else
     SpinLock mutex; //!< required as unbind is called from other threads
+#endif
     std::atomic<FastAllocator*> alloc; //!< parent allocator
     ThreadLocal alloc0;
     ThreadLocal alloc1;
@@ -169,7 +185,7 @@ namespace embree
   FastAllocator (Device* device, bool osAllocation)
     : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
-      growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
+      growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
       primrefarray(device,0)
   {
     for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
@@ -206,7 +222,7 @@ namespace embree
   void setOSallocation(bool flag)
   {
-    atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
+    atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
   }
 private:
@@ -217,7 +233,11 @@ namespace embree
   ThreadLocal2* alloc = thread_local_allocator2;
   if (alloc == nullptr) {
     thread_local_allocator2 = alloc = new ThreadLocal2;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
     Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
     s_thread_local_allocators.push_back(make_unique(alloc));
   }
   return alloc;
@@ -227,7 +247,11 @@ namespace embree
   __forceinline void join(ThreadLocal2* alloc)
   {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
    Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
    thread_local_allocators.push_back(alloc);
   }
@@ -496,7 +520,11 @@ namespace embree
   /* parallel block creation in case of no freeBlocks, avoids single global mutex */
   if (likely(freeBlocks.load() == nullptr))
   {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(slotMutex[slot]);
+#else
     Lock<SpinLock> lock(slotMutex[slot]);
+#endif
     if (myUsedBlocks == threadUsedBlocks[slot]) {
       const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
       const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
@@ -509,7 +537,11 @@ namespace embree
   /* if this fails allocate new block */
   {
-    Lock<SpinLock> lock(mutex);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(mutex);
+#else
+    Lock<SpinLock> lock(mutex);
+#endif
     if (myUsedBlocks == threadUsedBlocks[slot])
     {
       if (freeBlocks.load() != nullptr) {
@@ -531,7 +563,11 @@ namespace embree
   /*! add new block */
   void addBlock(void* ptr, ssize_t bytes)
   {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(mutex);
+#else
     Lock<SpinLock> lock(mutex);
+#endif
     const size_t sizeof_Header = offsetof(Block,data[0]);
     void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
     size_t ofs = (size_t) aptr - (size_t) ptr;
@@ -617,8 +653,8 @@ namespace embree
       bytesWasted(alloc->bytesWasted),
       stat_all(alloc,ANY_TYPE),
       stat_malloc(alloc,ALIGNED_MALLOC),
-      stat_4K(alloc,OS_MALLOC,false),
-      stat_2M(alloc,OS_MALLOC,true),
+      stat_4K(alloc,EMBREE_OS_MALLOC,false),
+      stat_2M(alloc,EMBREE_OS_MALLOC,true),
       stat_shared(alloc,SHARED) {}
   AllStatistics (size_t bytesUsed,
@@ -711,7 +747,7 @@ namespace embree
     /* We avoid using os_malloc for small blocks as this could
      * cause a risk of fragmenting the virtual address space and
      * reach the limit of vm.max_map_count = 65k under Linux. */
-    if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
+    if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
       atype = ALIGNED_MALLOC;
     /* we need to additionally allocate some header */
@@ -720,7 +756,7 @@ namespace embree
     bytesReserve = sizeof_Header+bytesReserve;
     /* consume full 4k pages with using os_malloc */
-    if (atype == OS_MALLOC) {
+    if (atype == EMBREE_OS_MALLOC) {
       bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
       bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
     }
@@ -752,11 +788,11 @@ namespace embree
         return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
       }
     }
-    else if (atype == OS_MALLOC)
+    else if (atype == EMBREE_OS_MALLOC)
     {
       if (device) device->memoryMonitor(bytesAllocate,false);
       bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
-      return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+      return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
     }
     else
       assert(false);
@@ -800,7 +836,7 @@ namespace embree
       if (device) device->memoryMonitor(-sizeof_Alloced,true);
     }
-    else if (atype == OS_MALLOC) {
+    else if (atype == EMBREE_OS_MALLOC) {
      size_t sizeof_This = sizeof_Header+reserveEnd;
      os_free(this,sizeof_This,huge_pages);
      if (device) device->memoryMonitor(-sizeof_Alloced,true);
@@ -861,7 +897,7 @@ namespace embree
     bool hasType(AllocationType atype_i, bool huge_pages_i) const
     {
       if (atype_i == ANY_TYPE ) return true;
-      else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+      else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
       else return atype_i == atype;
     }
@@ -910,7 +946,7 @@ namespace embree
     void print_block() const
     {
       if (atype == ALIGNED_MALLOC) std::cout << "A";
-      else if (atype == OS_MALLOC) std::cout << "O";
+      else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
       else if (atype == SHARED) std::cout << "S";
       if (huge_pages) std::cout << "H";
       size_t bytesUsed = getBlockUsedBytes();
@@ -940,7 +976,11 @@ namespace embree
     std::atomic<Block*> freeBlocks;
     std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
     SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
     bool use_single_mode;
     size_t defaultBlockSize;
@@ -954,7 +994,11 @@ namespace embree
     static __thread ThreadLocal2* thread_local_allocator2;
     static SpinLock s_thread_local_allocators_lock;
     static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex thread_local_allocators_lock;
+#else
     SpinLock thread_local_allocators_lock;
+#endif
     std::vector<ThreadLocal2*> thread_local_allocators;
     AllocationType atype;
     mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
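All of the blocks above swap embree's user-space SpinLock for std::mutex on aarch64 iOS builds, where busy-waiting interacts poorly with the scheduler. A hypothetical consolidation of the pattern, shown only to make it explicit (the patch itself repeats the #if at every site):

    #include <atomic>
    #include <mutex>

    // Minimal stand-in for embree's SpinLock (the real one lives in
    // common/sys/mutex.h and exposes the same lock()/unlock() interface).
    struct SpinLock {
        std::atomic_flag flag = ATOMIC_FLAG_INIT;
        void lock()   { while (flag.test_and_set(std::memory_order_acquire)) {} }
        void unlock() { flag.clear(std::memory_order_release); }
    };

    // Choose the lock type once; any RAII guard that needs lock()/unlock()
    // then works unchanged for either choice.
    #if defined(__aarch64__) && defined(BUILD_IOS)
    using AllocMutex = std::mutex;
    #else
    using AllocMutex = SpinLock;
    #endif

    int main() {
        AllocMutex m;
        std::lock_guard<AllocMutex> guard(m); // same call site for either type
        return 0;
    }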

View file

@@ -55,6 +55,11 @@
 #include <utility>
 #include <sstream>
+#if !defined(_DEBUG) && defined(BUILD_IOS)
+#undef assert
+#define assert(_EXPR)
+#endif
 namespace embree
 {
 ////////////////////////////////////////////////////////////////////////////////

View file

@@ -221,6 +221,9 @@ namespace embree
 #if defined(TASKING_INTERNAL)
   std::cout << "internal_tasking_system ";
 #endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+  std::cout << "GCD tasking system ";
+#endif
 #if defined(TASKING_PPL)
   std::cout << "PPL ";
 #endif
@@ -503,6 +506,10 @@ namespace embree
 #if defined(TASKING_PPL)
     case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
 #endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3;
+#endif
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
     case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
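With the new case, RTC_DEVICE_PROPERTY_TASKING_SYSTEM can report a fourth backend. A hedged usage sketch of the public embree3 API (returned values: 0 internal scheduler, 1 TBB, 2 PPL, and 3 for GCD per this patch):

    #include <embree3/rtcore.h>
    #include <cstdio>

    int main() {
        RTCDevice device = rtcNewDevice(nullptr);
        // Query which tasking system the library was built with.
        long long tasking = (long long)rtcGetDeviceProperty(device, RTC_DEVICE_PROPERTY_TASKING_SYSTEM);
        printf("tasking system: %lld\n", tasking); // 0=internal, 1=TBB, 2=PPL, 3=GCD
        rtcReleaseDevice(device);
        return 0;
    }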

View file

@@ -46,7 +46,7 @@ namespace embree
 #define SELECT_SYMBOL_DEFAULT(features,intersector) \
   intersector = isa::intersector;
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 #if !defined(EMBREE_TARGET_SIMD4)
 #define EMBREE_TARGET_SIMD4
 #endif

View file

@@ -29,7 +29,7 @@ namespace embree
 __forceinline PrimRef (const BBox3fa& bounds, size_t id)
 {
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF));
   upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF));
 #else
@@ -79,7 +79,7 @@ namespace embree
 /*! returns an size_t sized ID */
 __forceinline size_t ID() const {
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   return size_t(lower.u) + (size_t(upper.u) << 32);
 #else
   return size_t(lower.u);
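The widened guard treats aarch64 like 64-bit x86: the full size_t primitive ID is split across the two 32-bit padding lanes of the bounds vectors and reassembled in ID(). A scalar sketch of the pack/unpack round-trip:

    #include <cassert>
    #include <cstdint>
    #include <cstddef>

    int main() {
        size_t id = 0x123456789ABCDEF0ull;
        // Pack: low half into lower.u, high half into upper.u (as in PrimRef).
        uint32_t lower_u = (uint32_t)(id & 0xFFFFFFFF);
        uint32_t upper_u = (uint32_t)((id >> 32) & 0xFFFFFFFF);
        // Unpack, mirroring PrimRef::ID() on 64-bit targets:
        size_t roundtrip = (size_t)lower_u + ((size_t)upper_u << 32);
        assert(roundtrip == id);
        return 0;
    }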

View file

@@ -32,7 +32,7 @@ namespace embree
   : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
 {
   assert(activeTimeSegments > 0);
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   lbounds.bounds0.lower.a = id & 0xFFFFFFFF;
   lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF;
 #else
@@ -47,7 +47,7 @@ namespace embree
   : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
 {
   assert(activeTimeSegments > 0);
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
   lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
 #else
@@ -115,7 +115,7 @@ namespace embree
 /*! returns an size_t sized ID */
 __forceinline size_t ID() const {
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
 #else
   return size_t(lbounds.bounds0.lower.u);
@@ -163,7 +163,7 @@ namespace embree
   : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
 {
   assert(activeTimeSegments > 0);
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   bbox.lower.u = id & 0xFFFFFFFF;
   bbox.upper.u = (id >> 32) & 0xFFFFFFFF;
 #else
@@ -229,7 +229,7 @@ namespace embree
 /*! returns an size_t sized ID */
 __forceinline size_t ID() const {
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined(__aarch64__)
   return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32);
 #else
   return size_t(bbox.lower.u);

View file

@@ -8,18 +8,31 @@
 #include "scene.h"
 #include "context.h"
 #include "../../include/embree3/rtcore_ray.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
 using namespace embree;
 RTC_NAMESPACE_BEGIN;
 /* mutex to make API thread safe */
-static MutexSys g_mutex;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+static std::mutex g_mutex;
+#else
+static MutexSys g_mutex;
+#endif
 RTC_API RTCDevice rtcNewDevice(const char* config)
 {
   RTC_CATCH_BEGIN;
   RTC_TRACE(rtcNewDevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   Device* device = new Device(config);
   return (RTCDevice) device->refInc();
   RTC_CATCH_END(nullptr);
@@ -32,7 +45,11 @@ RTC_NAMESPACE_BEGIN;
   RTC_CATCH_BEGIN;
   RTC_TRACE(rtcRetainDevice);
   RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   device->refInc();
   RTC_CATCH_END(nullptr);
 }
@@ -43,7 +60,11 @@ RTC_NAMESPACE_BEGIN;
   RTC_CATCH_BEGIN;
   RTC_TRACE(rtcReleaseDevice);
   RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   device->refDec();
   RTC_CATCH_END(nullptr);
 }
@@ -54,7 +75,11 @@ RTC_NAMESPACE_BEGIN;
   RTC_CATCH_BEGIN;
   RTC_TRACE(rtcGetDeviceProperty);
   RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   return device->getProperty(prop);
   RTC_CATCH_END(device);
   return 0;
@@ -67,7 +92,11 @@ RTC_NAMESPACE_BEGIN;
   RTC_TRACE(rtcSetDeviceProperty);
   const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004;
   if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   device->setProperty(prop,val);
   RTC_CATCH_END(device);
 }
@@ -183,7 +212,11 @@ RTC_NAMESPACE_BEGIN;
   RTC_CATCH_BEGIN;
   RTC_TRACE(rtcSetSceneProgressMonitorFunction);
   RTC_VERIFY_HANDLE(hscene);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(g_mutex);
+#else
   Lock<MutexSys> lock(g_mutex);
+#endif
   scene->setProgressMonitorFunction(progress,ptr);
   RTC_CATCH_END2(scene);
 }
@@ -197,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
   if (quality != RTC_BUILD_QUALITY_LOW &&
       quality != RTC_BUILD_QUALITY_MEDIUM &&
       quality != RTC_BUILD_QUALITY_HIGH)
-    throw std::runtime_error("invalid build quality");
+    // -- GODOT start --
+    // throw std::runtime_error("invalid build quality");
+    abort();
+    // -- GODOT end --
   scene->setBuildQuality(quality);
   RTC_CATCH_END2(scene);
 }
@@ -479,12 +515,12 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  Ray4* ray4 = (Ray4*) rayhit;
+  RayHit4* rayhit4 = (RayHit4*)rayhit;
   for (size_t i=0; i<4; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray4->get(i,ray1);
+    RayHit ray1; rayhit4->get(i,ray1);
     scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-    ray4->set(i,ray1);
+    rayhit4->set(i,ray1);
   }
 #else
   scene->intersectors.intersect4(valid,*rayhit,&context);
@@ -510,12 +546,12 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  Ray8* ray8 = (Ray8*) rayhit;
+  RayHit8* rayhit8 = (RayHit8*) rayhit;
   for (size_t i=0; i<8; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray8->get(i,ray1);
+    RayHit ray1; rayhit8->get(i,ray1);
     scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-    ray8->set(i,ray1);
+    rayhit8->set(i,ray1);
   }
 #else
   if (likely(scene->intersectors.intersector8))
@@ -543,12 +579,12 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  Ray16* ray16 = (Ray16*) rayhit;
+  RayHit16* rayhit16 = (RayHit16*) rayhit;
   for (size_t i=0; i<16; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray16->get(i,ray1);
+    RayHit ray1; rayhit16->get(i,ray1);
     scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-    ray16->set(i,ray1);
+    rayhit16->set(i,ray1);
   }
 #else
   if (likely(scene->intersectors.intersector16))
@@ -730,12 +766,12 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  RayHit4* ray4 = (RayHit4*) ray;
+  Ray4* ray4 = (Ray4*) ray;
   for (size_t i=0; i<4; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray4->get(i,ray1);
+    Ray ray1; ray4->get(i,ray1);
     scene->intersectors.occluded((RTCRay&)ray1,&context);
-    ray4->geomID[i] = ray1.geomID;
+    ray4->set(i,ray1);
   }
 #else
   scene->intersectors.occluded4(valid,*ray,&context);
@@ -761,10 +797,10 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  RayHit8* ray8 = (RayHit8*) ray;
+  Ray8* ray8 = (Ray8*) ray;
   for (size_t i=0; i<8; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray8->get(i,ray1);
+    Ray ray1; ray8->get(i,ray1);
     scene->intersectors.occluded((RTCRay&)ray1,&context);
     ray8->set(i,ray1);
   }
@@ -795,10 +831,10 @@ RTC_NAMESPACE_BEGIN;
   IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-  RayHit16* ray16 = (RayHit16*) ray;
+  Ray16* ray16 = (Ray16*) ray;
   for (size_t i=0; i<16; i++) {
     if (!valid[i]) continue;
-    RayHit ray1; ray16->get(i,ray1);
+    Ray ray1; ray16->get(i,ray1);
     scene->intersectors.occluded((RTCRay&)ray1,&context);
     ray16->set(i,ray1);
   }
@@ -1350,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
       quality != RTC_BUILD_QUALITY_MEDIUM &&
       quality != RTC_BUILD_QUALITY_HIGH &&
       quality != RTC_BUILD_QUALITY_REFIT)
-    throw std::runtime_error("invalid build quality");
+    // -- GODOT start --
+    // throw std::runtime_error("invalid build quality");
+    abort();
+    // -- GODOT end --
   geometry->setBuildQuality(quality);
   RTC_CATCH_END2(geometry);
 }
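The single-ray fallback fixes above are genuine bug fixes: the occlusion entry points were casting RTCRay packets to RayHit types, whose layout appends hit fields the smaller struct does not carry, and intersect4 was doing the reverse. A small sketch using embree's public packet types to show the layout difference (sizes are platform-dependent):

    #include <embree3/rtcore.h>
    #include <cstdio>

    int main() {
        // RTCRay4 carries only ray fields; RTCRayHit4 appends hit fields.
        // Casting one to the other, as the old fallback did, misreads memory.
        printf("sizeof(RTCRay4)    = %zu\n", sizeof(RTCRay4));
        printf("sizeof(RTCRayHit4) = %zu\n", sizeof(RTCRayHit4));
        return 0;
    }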

View file

@@ -25,52 +25,58 @@ namespace embree
 #endif
 /*! Macros used in the rtcore API implementation */
-#define RTC_CATCH_BEGIN try {
-#define RTC_CATCH_END(device) \
-  } catch (std::bad_alloc&) { \
-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
-  } catch (rtcore_error& e) { \
-    Device::process_error(device,e.error,e.what()); \
-  } catch (std::exception& e) { \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
-  } catch (...) { \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-  }
-#define RTC_CATCH_END2(scene) \
-  } catch (std::bad_alloc&) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
-  } catch (rtcore_error& e) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,e.error,e.what()); \
-  } catch (std::exception& e) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
-  } catch (...) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-  }
-#define RTC_CATCH_END2_FALSE(scene) \
-  } catch (std::bad_alloc&) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
-    return false; \
-  } catch (rtcore_error& e) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,e.error,e.what()); \
-    return false; \
-  } catch (std::exception& e) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
-    return false; \
-  } catch (...) { \
-    Device* device = scene ? scene->device : nullptr; \
-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-    return false; \
-  }
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+// #define RTC_CATCH_END(device) \
+//   } catch (std::bad_alloc&) { \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+//   } catch (rtcore_error& e) { \
+//     Device::process_error(device,e.error,e.what()); \
+//   } catch (std::exception& e) { \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+//   } catch (...) { \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END(device)
+// #define RTC_CATCH_END2(scene) \
+//   } catch (std::bad_alloc&) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+//   } catch (rtcore_error& e) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,e.error,e.what()); \
+//   } catch (std::exception& e) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+//   } catch (...) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END2(scene)
+// #define RTC_CATCH_END2_FALSE(scene) \
+//   } catch (std::bad_alloc&) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+//     return false; \
+//   } catch (rtcore_error& e) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,e.error,e.what()); \
+//     return false; \
+//   } catch (std::exception& e) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+//     return false; \
+//   } catch (...) { \
+//     Device* device = scene ? scene->device : nullptr; \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//     return false; \
+//   }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
 #define RTC_VERIFY_HANDLE(handle) \
   if (handle == nullptr) { \
@@ -97,28 +103,38 @@ namespace embree
 #define RTC_TRACE(x)
 #endif
-/*! used to throw embree API errors */
-struct rtcore_error : public std::exception
-{
-  __forceinline rtcore_error(RTCError error, const std::string& str)
-    : error(error), str(str) {}
-
-  ~rtcore_error() throw() {}
-
-  const char* what () const throw () {
-    return str.c_str();
-  }
-
-  RTCError error;
-  std::string str;
-};
+// -- GODOT begin --
+// /*! used to throw embree API errors */
+// struct rtcore_error : public std::exception
+// {
+//   __forceinline rtcore_error(RTCError error, const std::string& str)
+//     : error(error), str(str) {}
+//
+//   ~rtcore_error() throw() {}
+//
+//   const char* what () const throw () {
+//     return str.c_str();
+//   }
+//
+//   RTCError error;
+//   std::string str;
+// };
+// -- GODOT end --
 #if defined(DEBUG) // only report file and line in debug mode
+// -- GODOT begin --
+// #define throw_RTCError(error,str) \
+//   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
 #define throw_RTCError(error,str) \
-  throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+// -- GODOT end --
 #else
+// -- GODOT begin --
+// #define throw_RTCError(error,str) \
+//   throw rtcore_error(error,str);
 #define throw_RTCError(error,str) \
-  throw rtcore_error(error,str);
+  abort();
+// -- GODOT end --
 #endif
 #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
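One caveat in the DEBUG branch above: printf is handed a std::string concatenation, which would not compile if that branch were ever enabled, since printf expects a C string. A hypothetical variant that preserves the intent (report the location, then abort) could look like this:

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Hypothetical compiling form of the no-exceptions debug macro:
    // format the location with fprintf instead of passing a std::string.
    #define throw_RTCError_sketch(error, str) \
        do { \
            fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()); \
            abort(); \
        } while (0)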

View file

@@ -6,7 +6,7 @@
 #include "../bvh/bvh4_factory.h"
 #include "../bvh/bvh8_factory.h"
 #include "../../common/algorithms/parallel_reduce.h"
 namespace embree
 {
   /* error raising rtcIntersect and rtcOccluded functions */
@@ -40,7 +40,7 @@ namespace embree
   {
     device->refDec();
   }
   void Scene::printStatistics()
   {
     /* calculate maximum number of time segments */
@@ -56,12 +56,12 @@ namespace embree
     statistics[i].resize(max_time_steps);
   /* gather statistics */
   for (size_t i=0; i<size(); i++)
   {
     if (!get(i)) continue;
     int ty = get(i)->getType();
     assert(ty<Geometry::GTY_END);
     int timesegments = get(i)->numTimeSegments();
     assert((unsigned int)timesegments < max_time_steps);
     statistics[ty][timesegments] += get(i)->size();
   }
@@ -76,7 +76,7 @@ namespace embree
   for (size_t t=0; t<max_time_steps; t++)
     std::cout << "----------";
   std::cout << std::endl;
   for (size_t p=0; p<Geometry::GTY_END; p++)
   {
     if (std::string(Geometry::gtype_names[p]) == "") continue;
@@ -90,34 +90,34 @@ namespace embree
   void Scene::createTriangleAccel()
   {
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
     if (device->tri_accel == "default")
     {
       if (quality_flags != RTC_BUILD_QUALITY_LOW)
       {
         int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
         switch (mode) {
         case /*0b00*/ 0:
 #if defined (EMBREE_TARGET_SIMD8)
           if (device->canUseAVX())
           {
             if (quality_flags == RTC_BUILD_QUALITY_HIGH)
               accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
             else
               accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
           }
           else
 #endif
           {
             if (quality_flags == RTC_BUILD_QUALITY_HIGH)
               accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
             else
               accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
           }
           break;
         case /*0b01*/ 1:
 #if defined (EMBREE_TARGET_SIMD8)
           if (device->canUseAVX())
             accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
           else
 #endif
@@ -175,8 +175,8 @@ namespace embree
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
     if (device->tri_accel_mb == "default")
     {
       int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
 #if defined (EMBREE_TARGET_SIMD8)
       if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines
       {
@@ -211,18 +211,18 @@ namespace embree
   void Scene::createQuadAccel()
   {
 #if defined(EMBREE_GEOMETRY_QUAD)
     if (device->quad_accel == "default")
     {
       if (quality_flags != RTC_BUILD_QUALITY_LOW)
       {
         /* static */
         int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
         switch (mode) {
         case /*0b00*/ 0:
 #if defined (EMBREE_TARGET_SIMD8)
           if (device->canUseAVX())
           {
             if (quality_flags == RTC_BUILD_QUALITY_HIGH)
               accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
             else
               accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
@@ -230,7 +230,7 @@ namespace embree
           else
 #endif
           {
             if (quality_flags == RTC_BUILD_QUALITY_HIGH)
               accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
             else
               accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
@@ -292,9 +292,9 @@ namespace embree
   void Scene::createQuadMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_QUAD)
     if (device->quad_accel_mb == "default")
     {
       int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
       switch (mode) {
       case /*0b00*/ 0:
 #if defined (EMBREE_TARGET_SIMD8)
@@ -416,7 +416,7 @@ namespace embree
   void Scene::createUserGeometryAccel()
   {
 #if defined(EMBREE_GEOMETRY_USER)
     if (device->object_accel == "default")
     {
 #if defined (EMBREE_TARGET_SIMD8)
       if (device->canUseAVX() && !isCompactAccel())
@@ -554,7 +554,7 @@ namespace embree
   {
     BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
 #if defined(EMBREE_GEOMETRY_GRID)
     if (device->grid_accel == "default")
     {
 #if defined (EMBREE_TARGET_SIMD8)
       if (device->canUseAVX() && !isCompactAccel())
@@ -579,7 +579,7 @@ namespace embree
   void Scene::createGridMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_GRID)
     if (device->grid_accel_mb == "default")
     {
       accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
     }
@@ -588,13 +588,17 @@ namespace embree
 #endif
 }
 void Scene::clear() {
 }
 unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry)
 {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(geometriesMutex);
+#else
   Lock<SpinLock> lock(geometriesMutex);
+#endif
   if (geomID == RTC_INVALID_GEOMETRY_ID) {
     geomID = id_pool.allocate();
     if (geomID == RTC_INVALID_GEOMETRY_ID)
@@ -620,15 +624,19 @@ namespace embree
 void Scene::detachGeometry(size_t geomID)
 {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  std::scoped_lock lock(geometriesMutex);
+#else
   Lock<SpinLock> lock(geometriesMutex);
+#endif
   if (geomID >= geometries.size())
     throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
   Ref<Geometry>& geometry = geometries[geomID];
   if (geometry == null)
     throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
   if (geometry->isEnabled()) {
     setModified ();
   }
@@ -650,21 +658,21 @@ namespace embree
     if (!isModified()) {
       return;
     }
     /* print scene statistics */
     if (device->verbosity(2))
       printStatistics();
     progress_monitor_counter = 0;
     /* gather scene stats and call preCommit function of each geometry */
     this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (),
       [this](const range<size_t>& r)->GeometryCounts
       {
         GeometryCounts c;
         for (auto i=r.begin(); i<r.end(); ++i)
         {
           if (geometries[i] && geometries[i]->isEnabled())
           {
             geometries[i]->preCommit();
             geometries[i]->addElementsToCount (c);
@@ -675,19 +683,19 @@ namespace embree
       },
       std::plus<GeometryCounts>()
     );
     /* select acceleration structures to build */
     unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
     if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
     {
       accels_init();
       /* we need to make all geometries modified, otherwise two level builder will
          not rebuild currently not modified geometries */
       parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
         geometryModCounters_[i] = 0;
       });
       if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
       if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
       if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
@@ -704,14 +712,14 @@ namespace embree
       if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
       if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
       if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
       flags_modified = false;
       enabled_geometry_types = new_enabled_geometry_types;
     }
     /* select fast code path if no filter function is present */
     accels_select(hasFilterFunction());
     /* build all hierarchies of this scene */
     accels_build();
@@ -729,7 +737,7 @@ namespace embree
         geometryModCounters_[i] = geometries[i]->getModCounter();
       }
     });
     updateInterface();
     if (device->verbosity(2)) {
@@ -738,7 +746,7 @@ namespace embree
       std::cout << "selected scene intersector" << std::endl;
       intersectors.print(2);
     }
     setModified(false);
   }
@@ -763,16 +771,16 @@ namespace embree
   RTCSceneFlags Scene::getSceneFlags() const {
     return scene_flags;
   }
 #if defined(TASKING_INTERNAL)
   void Scene::commit (bool join)
   {
     Lock<MutexSys> buildLock(buildMutex,false);
     /* allocates own taskscheduler for each build */
     Ref<TaskScheduler> scheduler = nullptr;
     {
       Lock<MutexSys> lock(schedulerMutex);
       scheduler = this->scheduler;
       if (scheduler == null) {
@ -784,31 +792,33 @@ namespace embree
/* worker threads join build */ /* worker threads join build */
if (!buildLock.isLocked()) if (!buildLock.isLocked())
{ {
if (!join) if (!join)
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
scheduler->join(); scheduler->join();
return; return;
} }
/* initiate build */ /* initiate build */
try { // -- GODOT start --
// try {
scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
} // }
catch (...) { // catch (...) {
accels_clear(); // accels_clear();
updateInterface(); // updateInterface();
Lock<MutexSys> lock(schedulerMutex); // Lock<MutexSys> lock(schedulerMutex);
this->scheduler = nullptr; // this->scheduler = nullptr;
throw; // throw;
} // }
// -- GODOT end --
} }
#endif #endif
#if defined(TASKING_TBB) #if defined(TASKING_TBB) || defined(TASKING_GCD)
void Scene::commit (bool join) void Scene::commit (bool join)
{ {
#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) #if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
if (join) if (join)
@ -822,12 +832,15 @@ namespace embree
if (!lock.isLocked()) if (!lock.isLocked())
{ {
#if !TASKING_TBB_USE_TASK_ISOLATION #if !TASKING_TBB_USE_TASK_ISOLATION
if (!join) if (!join)
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
#endif #endif
do { do {
#if defined(TASKING_GCD)
// Do Nothing
#else
#if USE_TASK_ARENA #if USE_TASK_ARENA
if (join) { if (join) {
device->arena->execute([&]{ group.wait(); }); device->arena->execute([&]{ group.wait(); });
@ -837,21 +850,24 @@ namespace embree
{ {
group.wait(); group.wait();
} }
#endif
pause_cpu(); pause_cpu();
yield(); yield();
} while (!buildMutex.try_lock()); } while (!buildMutex.try_lock());
buildMutex.unlock(); buildMutex.unlock();
return; return;
} }
/* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
const unsigned int mxcsr = _mm_getcsr(); const unsigned int mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
try { try {
#if TBB_INTERFACE_VERSION_MAJOR < 8 #if defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION_MAJOR < 8
tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
#else #else
tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
@@ -876,15 +892,22 @@ namespace embree
}); });
group.wait(); group.wait();
} }
/* reset MXCSR register again */ /* reset MXCSR register again */
_mm_setcsr(mxcsr); _mm_setcsr(mxcsr);
}
#elif defined(TASKING_GCD)
commit_task();
#endif // #if defined(TASKING_TBB)
}
catch (...) catch (...)
{ {
/* reset MXCSR register again */ /* reset MXCSR register again */
_mm_setcsr(mxcsr); _mm_setcsr(mxcsr);
accels_clear(); accels_clear();
updateInterface(); updateInterface();
throw; throw;
@@ -894,7 +917,7 @@ namespace embree
#if defined(TASKING_PPL) #if defined(TASKING_PPL)
void Scene::commit (bool join) void Scene::commit (bool join)
{ {
#if defined(TASKING_PPL) #if defined(TASKING_PPL)
if (join) if (join)
@@ -912,7 +935,7 @@ namespace embree
/* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
const unsigned int mxcsr = _mm_getcsr(); const unsigned int mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
try { try {
group.run([&]{ group.run([&]{
@@ -922,12 +945,12 @@ namespace embree
/* reset MXCSR register again */ /* reset MXCSR register again */
_mm_setcsr(mxcsr); _mm_setcsr(mxcsr);
} }
catch (...) catch (...)
{ {
/* reset MXCSR register again */ /* reset MXCSR register again */
_mm_setcsr(mxcsr); _mm_setcsr(mxcsr);
accels_clear(); accels_clear();
updateInterface(); updateInterface();
throw; throw;
@@ -935,7 +958,7 @@ namespace embree
} }
#endif #endif
void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr)
{ {
progress_monitor_function = func; progress_monitor_function = func;
progress_monitor_ptr = ptr; progress_monitor_ptr = ptr;
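Note: the Scene::commit variants above bracket the build with FTZ/DAZ bits in MXCSR and have to repeat the restore in every catch path. A minimal RAII sketch of the same pattern (assuming an x86 target or the port's SSE-to-NEON shim, which emulates _mm_getcsr/_mm_setcsr; the class name is illustrative, not Embree's):

#include <xmmintrin.h>

// Scoped guard: set flush-to-zero (bit 15) and denormals-are-zero (bit 6)
// in MXCSR for the duration of a build, then restore the previous state.
struct FtzDazScope {
  unsigned int saved;
  FtzDazScope() : saved(_mm_getcsr()) {
    _mm_setcsr(saved | (1u << 15) /* FTZ */ | (1u << 6) /* DAZ */);
  }
  ~FtzDazScope() { _mm_setcsr(saved); }
};

With such a guard the catch blocks would not need to restore the register by hand.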
View file
@@ -275,11 +275,11 @@ namespace embree
parallel_set<uint32_t> holeSet; parallel_set<uint32_t> holeSet;
/*! fast lookup table to detect invalid faces */ /*! fast lookup table to detect invalid faces */
mvector<char> invalid_face; mvector<int8_t> invalid_face;
/*! test if face i is invalid in timestep j */ /*! test if face i is invalid in timestep j */
__forceinline char& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; }
__forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
/*! interpolation cache */ /*! interpolation cache */
public: public:
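The char to int8_t churn in this and the following files is the core of the port: plain char is signed on x86 ABIs but unsigned on most ARM ABIs, so any field that stores negative values must spell out the signedness. A small self-contained illustration (not from the patch):

#include <cstdint>
#include <cstdio>

int main() {
  char   c = (char)0xFF;   // implementation-defined: -1 on x86, 255 on most ARM ABIs
  int8_t s = (int8_t)0xFF; // -1 everywhere
  std::printf("char: %d, int8_t: %d\n", (int)c, (int)s);
  return 0;
}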
View file
@@ -147,7 +147,20 @@ namespace embree
} }
bool State::checkISASupport() { bool State::checkISASupport() {
#if defined(__ARM_NEON)
/*
* NEON CPU targets report a mixture of NEON and SSE2 features
*/
bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
/* this is true when the Device was explicitly initialized with the `isa=neon` config */
bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
return hasSSE2 || hasNEON;
#else
return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
#endif
} }
void State::verify() void State::verify()
@@ -160,8 +173,10 @@ namespace embree
* functions */ * functions */
#if defined(DEBUG) #if defined(DEBUG)
#if defined(EMBREE_TARGET_SSE2) #if defined(EMBREE_TARGET_SSE2)
#if !defined(__ARM_NEON)
assert(sse2::getISA() <= SSE2); assert(sse2::getISA() <= SSE2);
#endif #endif
#endif
#if defined(EMBREE_TARGET_SSE42) #if defined(EMBREE_TARGET_SSE42)
assert(sse42::getISA() <= SSE42); assert(sse42::getISA() <= SSE42);
#endif #endif
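The NEON branch above relaxes the usual exact-match test because a NEON device advertises SSE2 through the emulation layer. A condensed sketch of the two policies (the feature constants are illustrative stand-ins, not Embree's actual bit values):

#include <cstdint>

enum : uint64_t { CPU_FEATURE_SSE2 = 1u << 0, CPU_FEATURE_NEON = 1u << 1 }; // illustrative bits

bool isaSupported(uint64_t cpu_features, uint64_t enabled_features) {
#if defined(__ARM_NEON)
  // Either flag is enough: NEON targets satisfy SSE2 via the sse2neon layer.
  const uint64_t present = cpu_features & enabled_features;
  return (present & CPU_FEATURE_SSE2) || (present & CPU_FEATURE_NEON);
#else
  // Exact match: every requested ISA feature must be present on the CPU.
  return (cpu_features & enabled_features) == enabled_features;
#endif
}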
View file
@@ -43,10 +43,10 @@ namespace embree
__forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
{ {
size_t end = min(begin+M,_end); size_t end = min(begin+M,_end);
N = (unsigned char)(end-begin); N = (uint8_t)(end-begin);
const unsigned int geomID0 = prims[begin].geomID(); const unsigned int geomID0 = prims[begin].geomID();
this->geomID(N) = geomID0; this->geomID(N) = geomID0;
ty = (unsigned char) scene->get(geomID0)->getType(); ty = (uint8_t) scene->get(geomID0)->getType();
/* encode all primitives */ /* encode all primitives */
BBox3fa bounds = empty; BBox3fa bounds = empty;
@@ -76,25 +76,25 @@ namespace embree
const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
bounds_vx_x(N)[i] = (char) space3.vx.x; bounds_vx_x(N)[i] = (int8_t) space3.vx.x;
bounds_vx_y(N)[i] = (char) space3.vx.y; bounds_vx_y(N)[i] = (int8_t) space3.vx.y;
bounds_vx_z(N)[i] = (char) space3.vx.z; bounds_vx_z(N)[i] = (int8_t) space3.vx.z;
bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
bounds_vy_x(N)[i] = (char) space3.vy.x; bounds_vy_x(N)[i] = (int8_t) space3.vy.x;
bounds_vy_y(N)[i] = (char) space3.vy.y; bounds_vy_y(N)[i] = (int8_t) space3.vy.y;
bounds_vy_z(N)[i] = (char) space3.vy.z; bounds_vy_z(N)[i] = (int8_t) space3.vy.z;
bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
bounds_vz_x(N)[i] = (char) space3.vz.x; bounds_vz_x(N)[i] = (int8_t) space3.vz.x;
bounds_vz_y(N)[i] = (char) space3.vz.y; bounds_vz_y(N)[i] = (int8_t) space3.vz.y;
bounds_vz_z(N)[i] = (char) space3.vz.z; bounds_vz_z(N)[i] = (int8_t) space3.vz.z;
bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f); bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
@@ -114,15 +114,15 @@ namespace embree
for (size_t i=0; i<items; i++) { for (size_t i=0; i<items; i++) {
accel[i].fill(prims,start,set.end(),bvh->scene); accel[i].fill(prims,start,set.end(),bvh->scene);
} }
return bvh->encodeLeaf((char*)accel,items); return bvh->encodeLeaf((int8_t*)accel,items);
}; };
public: public:
// 27.6 - 46 bytes per primitive // 27.6 - 46 bytes per primitive
unsigned char ty; uint8_t ty;
unsigned char N; uint8_t N;
unsigned char data[4+25*M+16]; uint8_t data[4+25*M+16];
/* /*
struct Layout struct Layout
@@ -130,21 +130,21 @@ namespace embree
unsigned int geomID; unsigned int geomID;
unsigned int primID[N]; unsigned int primID[N];
char bounds_vx_x[N]; int8_t bounds_vx_x[N];
char bounds_vx_y[N]; int8_t bounds_vx_y[N];
char bounds_vx_z[N]; int8_t bounds_vx_z[N];
short bounds_vx_lower[N]; short bounds_vx_lower[N];
short bounds_vx_upper[N]; short bounds_vx_upper[N];
char bounds_vy_x[N]; int8_t bounds_vy_x[N];
char bounds_vy_y[N]; int8_t bounds_vy_y[N];
char bounds_vy_z[N]; int8_t bounds_vy_z[N];
short bounds_vy_lower[N]; short bounds_vy_lower[N];
short bounds_vy_upper[N]; short bounds_vy_upper[N];
char bounds_vz_x[N]; int8_t bounds_vz_x[N];
char bounds_vz_y[N]; int8_t bounds_vz_y[N];
char bounds_vz_z[N]; int8_t bounds_vz_z[N];
short bounds_vz_lower[N]; short bounds_vz_lower[N];
short bounds_vz_upper[N]; short bounds_vz_upper[N];
@@ -153,65 +153,65 @@ namespace embree
}; };
*/ */
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
__forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline short* bounds_vx_lower(size_t N) { return (short*)((char*)this+6+7*N); } __forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); }
__forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); } __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
__forceinline short* bounds_vx_upper(size_t N) { return (short*)((char*)this+6+9*N); } __forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); }
__forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); } __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
__forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+11*N); } __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); }
__forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); } __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); }
__forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+12*N); } __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); }
__forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); } __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); }
__forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+13*N); } __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); }
__forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); } __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); }
__forceinline short* bounds_vy_lower(size_t N) { return (short*)((char*)this+6+14*N); } __forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); }
__forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); } __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); }
__forceinline short* bounds_vy_upper(size_t N) { return (short*)((char*)this+6+16*N); } __forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); }
__forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); } __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); }
__forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+18*N); } __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); }
__forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); } __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); }
__forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+19*N); } __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); }
__forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); } __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); }
__forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+20*N); } __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); }
__forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); } __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); }
__forceinline short* bounds_vz_lower(size_t N) { return (short*)((char*)this+6+21*N); } __forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); }
__forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((char*)this+6+21*N); } __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); }
__forceinline short* bounds_vz_upper(size_t N) { return (short*)((char*)this+6+23*N); } __forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); }
__forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); } __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+25*N); } __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); } __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); }
__forceinline float* scale(size_t N) { return (float*)((char*)this+6+25*N+12); } __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); } __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); }
__forceinline char* end(size_t N) { return (char*)this+6+25*N+16; } __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; }
__forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; } __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; }
}; };
template<int M> template<int M>
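For context on all the casts above: CurveNi stores an oriented-bounds frame at 8-bit precision (axis components scaled by 126) and the bound endpoints at 16-bit precision, which is why the signed 8-bit type now has to be explicit. A sketch of the quantizers the code implies (helper names are mine, not the library's):

#include <algorithm>
#include <cmath>
#include <cstdint>

// A unit-length frame component in [-1,1] is scaled by 126 and truncated,
// so it always fits an int8_t with a little headroom.
int8_t quantizeAxis(float v) { return (int8_t)std::trunc(126.0f * v); }

// Bound endpoints are rounded outward (floor/ceil) and clamped to +/-32767
// before being narrowed to short, matching the asserts above.
short quantizeLower(float v) { return (short)std::clamp(std::floor(v), -32767.0f, 32767.0f); }
short quantizeUpper(float v) { return (short)std::clamp(std::ceil(v),  -32767.0f, 32767.0f); }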
View file
@@ -43,10 +43,10 @@ namespace embree
__forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
{ {
size_t end = min(begin+M,_end); size_t end = min(begin+M,_end);
N = (unsigned char)(end-begin); N = (uint8_t)(end-begin);
const unsigned int geomID0 = prims[begin].geomID(); const unsigned int geomID0 = prims[begin].geomID();
this->geomID(N) = geomID0; this->geomID(N) = geomID0;
ty = (unsigned char) scene->get(geomID0)->getType(); ty = (uint8_t) scene->get(geomID0)->getType();
/* encode all primitives */ /* encode all primitives */
LBBox3fa lbounds = empty; LBBox3fa lbounds = empty;
@@ -79,10 +79,10 @@ namespace embree
const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range);
// NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug
bounds_vx_x(N)[i] = (char) (short) space3.vx.x; bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x;
bounds_vx_y(N)[i] = (char) (short) space3.vx.y; bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y;
bounds_vx_z(N)[i] = (char) (short) space3.vx.z; bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z;
bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f);
bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
@@ -92,9 +92,9 @@ namespace embree
assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f); assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
bounds_vy_x(N)[i] = (char) (short) space3.vy.x; bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x;
bounds_vy_y(N)[i] = (char) (short) space3.vy.y; bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y;
bounds_vy_z(N)[i] = (char) (short) space3.vy.z; bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z;
bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
@@ -104,9 +104,9 @@ namespace embree
assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f); assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
bounds_vz_x(N)[i] = (char) (short) space3.vz.x; bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x;
bounds_vz_y(N)[i] = (char) (short) space3.vz.y; bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y;
bounds_vz_z(N)[i] = (char) (short) space3.vz.z; bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z;
bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
@@ -130,7 +130,7 @@ namespace embree
size_t items = CurveNiMB::blocks(prims.size()); size_t items = CurveNiMB::blocks(prims.size());
size_t numbytes = CurveNiMB::bytes(prims.size()); size_t numbytes = CurveNiMB::bytes(prims.size());
CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items);
LBBox3fa bounds = empty; LBBox3fa bounds = empty;
for (size_t i=0; i<items; i++) for (size_t i=0; i<items; i++)
@@ -143,9 +143,9 @@ namespace embree
public: public:
// 27.6 - 46 bytes per primitive // 27.6 - 46 bytes per primitive
unsigned char ty; uint8_t ty;
unsigned char N; uint8_t N;
unsigned char data[4+37*M+24]; uint8_t data[4+37*M+24];
/* /*
struct Layout struct Layout
@@ -153,25 +153,25 @@ namespace embree
unsigned int geomID; unsigned int geomID;
unsigned int primID[N]; unsigned int primID[N];
char bounds_vx_x[N]; int8_t bounds_vx_x[N];
char bounds_vx_y[N]; int8_t bounds_vx_y[N];
char bounds_vx_z[N]; int8_t bounds_vx_z[N];
short bounds_vx_lower0[N]; short bounds_vx_lower0[N];
short bounds_vx_upper0[N]; short bounds_vx_upper0[N];
short bounds_vx_lower1[N]; short bounds_vx_lower1[N];
short bounds_vx_upper1[N]; short bounds_vx_upper1[N];
char bounds_vy_x[N]; int8_t bounds_vy_x[N];
char bounds_vy_y[N]; int8_t bounds_vy_y[N];
char bounds_vy_z[N]; int8_t bounds_vy_z[N];
short bounds_vy_lower0[N]; short bounds_vy_lower0[N];
short bounds_vy_upper0[N]; short bounds_vy_upper0[N];
short bounds_vy_lower1[N]; short bounds_vy_lower1[N];
short bounds_vy_upper1[N]; short bounds_vy_upper1[N];
char bounds_vz_x[N]; int8_t bounds_vz_x[N];
char bounds_vz_y[N]; int8_t bounds_vz_y[N];
char bounds_vz_z[N]; int8_t bounds_vz_z[N];
short bounds_vz_lower0[N]; short bounds_vz_lower0[N];
short bounds_vz_upper0[N]; short bounds_vz_upper0[N];
short bounds_vz_lower1[N]; short bounds_vz_lower1[N];
@@ -185,89 +185,89 @@ namespace embree
}; };
*/ */
__forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
__forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
__forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
__forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
__forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
__forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
__forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
__forceinline short* bounds_vx_lower0(size_t N) { return (short*)((char*)this+6+7*N); } __forceinline short* bounds_vx_lower0(size_t N) { return (short*)((int8_t*)this+6+7*N); }
__forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((char*)this+6+7*N); } __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
__forceinline short* bounds_vx_upper0(size_t N) { return (short*)((char*)this+6+9*N); } __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((int8_t*)this+6+9*N); }
__forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((char*)this+6+9*N); } __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
__forceinline short* bounds_vx_lower1(size_t N) { return (short*)((char*)this+6+11*N); } __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((int8_t*)this+6+11*N); }
__forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((char*)this+6+11*N); } __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); }
__forceinline short* bounds_vx_upper1(size_t N) { return (short*)((char*)this+6+13*N); } __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((int8_t*)this+6+13*N); }
__forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((char*)this+6+13*N); } __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); }
__forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+15*N); } __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+15*N); }
__forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+15*N); } __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); }
__forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+16*N); } __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+16*N); }
__forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+16*N); } __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); }
__forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+17*N); } __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+17*N); }
__forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+17*N); } __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); }
__forceinline short* bounds_vy_lower0(size_t N) { return (short*)((char*)this+6+18*N); } __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((int8_t*)this+6+18*N); }
__forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((char*)this+6+18*N); } __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); }
__forceinline short* bounds_vy_upper0(size_t N) { return (short*)((char*)this+6+20*N); } __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((int8_t*)this+6+20*N); }
__forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((char*)this+6+20*N); } __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); }
__forceinline short* bounds_vy_lower1(size_t N) { return (short*)((char*)this+6+22*N); } __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((int8_t*)this+6+22*N); }
__forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((char*)this+6+22*N); } __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); }
__forceinline short* bounds_vy_upper1(size_t N) { return (short*)((char*)this+6+24*N); } __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((int8_t*)this+6+24*N); }
__forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((char*)this+6+24*N); } __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); }
__forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+26*N); } __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+26*N); }
__forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+26*N); } __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); }
__forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+27*N); } __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+27*N); }
__forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+27*N); } __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); }
__forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+28*N); } __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+28*N); }
__forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+28*N); } __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); }
__forceinline short* bounds_vz_lower0(size_t N) { return (short*)((char*)this+6+29*N); } __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((int8_t*)this+6+29*N); }
__forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((char*)this+6+29*N); } __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); }
__forceinline short* bounds_vz_upper0(size_t N) { return (short*)((char*)this+6+31*N); } __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((int8_t*)this+6+31*N); }
__forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((char*)this+6+31*N); } __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); }
__forceinline short* bounds_vz_lower1(size_t N) { return (short*)((char*)this+6+33*N); } __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((int8_t*)this+6+33*N); }
__forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((char*)this+6+33*N); } __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); }
__forceinline short* bounds_vz_upper1(size_t N) { return (short*)((char*)this+6+35*N); } __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((int8_t*)this+6+35*N); }
__forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((char*)this+6+35*N); } __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); }
__forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+37*N); } __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+37*N); }
__forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+37*N); } __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); }
__forceinline float* scale(size_t N) { return (float*)((char*)this+6+37*N+12); } __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+37*N+12); }
__forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+37*N+12); } __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); }
__forceinline float& time_offset(size_t N) { return *(float*)((char*)this+6+37*N+16); } __forceinline float& time_offset(size_t N) { return *(float*)((int8_t*)this+6+37*N+16); }
__forceinline const float& time_offset(size_t N) const { return *(float*)((char*)this+6+37*N+16); } __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); }
__forceinline float& time_scale(size_t N) { return *(float*)((char*)this+6+37*N+20); } __forceinline float& time_scale(size_t N) { return *(float*)((int8_t*)this+6+37*N+20); }
__forceinline const float& time_scale(size_t N) const { return *(float*)((char*)this+6+37*N+20); } __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); }
__forceinline char* end(size_t N) { return (char*)this+6+37*N+24; } __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+37*N+24; }
__forceinline const char* end(size_t N) const { return (char*)this+6+37*N+24; } __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; }
}; };
template<int M> template<int M>
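The motion-blur variant adds time_offset/time_scale so a leaf can map the ray's global time into its local motion segment. The remapping the fields imply, as a sketch rather than the library's code verbatim:

// Map a global ray time into the leaf-local time range.
float toLocalTime(float t_global, float time_offset, float time_scale) {
  return (t_global - time_offset) * time_scale;
}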
View file
@@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
View file
@@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
View file
@@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
View file
@@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
View file
@@ -0,0 +1,21 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim);
#if defined(__AVX__)
void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
View file
@@ -0,0 +1,22 @@
// Copyright 2020 Light Transport Entertainment Inc.
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "curve_intersector_virtual.h"
namespace embree
{
namespace isa
{
void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim);
#if defined (__AVX__)
void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
#endif
}
}
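These six new headers declare per-curve-type registration hooks so that the virtual curve intersectors can be split across translation units instead of instantiating everything in one file. A toy sketch of that registration pattern (the types and names here are illustrative, not Embree's):

#include <functional>
#include <map>
#include <string>

// Stand-in for VirtualCurveIntersector: a table of leaf-intersection callbacks.
struct DispatchTable {
  std::map<std::string, std::function<void()>> entries;
};

// Each translation unit contributes only its own entries, so no single
// object file has to instantiate every curve type.
void AddBezierIntersectors(DispatchTable& table) {
  table.entries["bezier4i"] = [] { /* intersect a 4-wide indexed Bezier leaf */ };
}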
View file
@@ -41,7 +41,7 @@ namespace embree
} }
const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float); const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);
size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
#if !defined(__X86_64__) #if !defined(__X86_64__) && !defined(__aarch64__)
rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding. rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
#endif #endif
void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes); void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
@@ -62,8 +62,8 @@ namespace embree
__forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
/*! returns pointer to BVH array */ /*! returns pointer to BVH array */
__forceinline char* bvhData() { return &data[0]; } __forceinline int8_t* bvhData() { return &data[0]; }
__forceinline const char* bvhData() const { return &data[0]; } __forceinline const int8_t* bvhData() const { return &data[0]; }
/*! returns pointer to Grid array */ /*! returns pointer to Grid array */
__forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; }
@@ -253,7 +253,7 @@ namespace embree
public: public:
BVH4::NodeRef troot; BVH4::NodeRef troot;
#if !defined(__X86_64__) #if !defined(__X86_64__) && !defined(__aarch64__)
unsigned align1; unsigned align1;
#endif #endif
unsigned time_steps; unsigned time_steps;
@@ -269,7 +269,7 @@ namespace embree
unsigned gridBytes; unsigned gridBytes;
unsigned rootOffset; unsigned rootOffset;
char data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots
}; };
} }
} }
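The padding tweak above now treats aarch64 like x86_64: the intersector reads two elements past the grid, and on 64-bit targets the pointer-sized root references stored behind it already cover that overread. A sketch of the size computation, ignoring the struct header (the function name is mine):

#include <cstddef>

std::size_t gridAllocationBytes(std::size_t bvhBytes, std::size_t width,
                                std::size_t height, std::size_t time_steps) {
  const std::size_t gridBytes = 4 * width * height * sizeof(float);
  std::size_t rootBytes = time_steps * sizeof(std::size_t);  // BVH4::NodeRef is pointer-sized
  if (sizeof(std::size_t) == 4)
    rootBytes += 4;  // 32-bit only: pad so the 2-element overread stays in bounds
  return bvhBytes + time_steps * gridBytes + rootBytes;
}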
View file
@@ -2,4 +2,4 @@
// Copyright 2009-2020 Intel Corporation // Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
#define RTC_HASH "69bd4c272f1ed608494f233ecfff3feec516880b" #define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6"
View file
@@ -63,7 +63,7 @@ namespace embree
static const size_t NUM_CACHE_SEGMENTS = 8; static const size_t NUM_CACHE_SEGMENTS = 8;
static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512; static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
static const size_t COMMIT_INDEX_SHIFT = 32+8; static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__X86_64__) #if defined(__X86_64__) || defined(__aarch64__)
static const size_t REF_TAG_MASK = 0xffffffffff; static const size_t REF_TAG_MASK = 0xffffffffff;
#else #else
static const size_t REF_TAG_MASK = 0x7FFFFFFF; static const size_t REF_TAG_MASK = 0x7FFFFFFF;
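Same 64-bit assumption here: the subdivision cache packs a reference and a commit counter into one word, keeping the low 40 bits for the reference on 64-bit targets (now including aarch64) and only 31 bits on 32-bit ones. A sketch of the split (helper names are mine):

#include <cstdint>

static const uint64_t REF_TAG_MASK = 0xffffffffffull;  // low 40 bits on 64-bit targets

uint64_t refBits(uint64_t word) { return word & REF_TAG_MASK; }   // cache reference
uint64_t tagBits(uint64_t word) { return word & ~REF_TAG_MASK; }  // commit counter tag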
View file
@@ -1,215 +1,630 @@
diff --git a/common/math/math.h b/common/math/math.h diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
index 5af0691a2..1982c27c1 100644 index 76c6b740aa..51d296fb16 100644
--- a/common/math/math.h --- a/thirdparty/embree/common/algorithms/parallel_for.h
+++ b/common/math/math.h +++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -13,7 +13,7 @@ @@ -27,7 +27,10 @@ namespace embree
#include <immintrin.h> func(r.begin());
});
if (!TaskScheduler::wait())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
}
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
@@ -55,13 +58,19 @@ namespace embree
func(i);
},context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
#if defined(__WIN32__) #elif defined(TASKING_PPL)
-#if (__MSV_VER <= 1700) @@ -81,7 +90,10 @@ namespace embree
+#if defined(_MSC_VER) && (_MSC_VER <= 1700) #if defined(TASKING_INTERNAL)
namespace std TaskScheduler::spawn(first,last,minStepSize,func);
{ if (!TaskScheduler::wait())
__forceinline bool isinf ( const float x ) { return _finite(x) == 0; } - throw std::runtime_error("task cancelled");
@@ -86,7 +86,7 @@ + // -- GODOT start --
return _mm_cvtss_f32(c); + // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
@@ -109,13 +121,19 @@ namespace embree
func(range<Index>(r.begin(),r.end()));
},context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
});
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
#elif defined(TASKING_PPL)
@@ -147,13 +165,19 @@ namespace embree
func(i);
},tbb::simple_partitioner(),context);
if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner());
if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
#endif
} }
-#if defined(__WIN32__) && (__MSC_VER <= 1700) @@ -168,13 +192,19 @@ namespace embree
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) func(i);
__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } },ap,context);
__forceinline double nextafter(double x, double y) { return _nextafter(x, y); } if (context.is_group_execution_cancelled())
__forceinline int roundf(float f) { return (int)(f + 0.5f); } - throw std::runtime_error("task cancelled");
diff --git a/common/sys/intrinsics.h b/common/sys/intrinsics.h + // -- GODOT start --
index 3f0619cac..58f5c3bb4 100644 + // throw std::runtime_error("task cancelled");
--- a/common/sys/intrinsics.h + abort();
+++ b/common/sys/intrinsics.h + // -- GODOT end --
@@ -11,6 +11,12 @@ #else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
#include <immintrin.h> func(i);
},ap);
+// -- GODOT start -- if (tbb::task::self().is_cancelled())
+#if defined(__WIN32__) && defined(__MINGW32__) - throw std::runtime_error("task cancelled");
+#include <unistd.h> + // -- GODOT start --
+#endif + // throw std::runtime_error("task cancelled");
+// -- GODOT end -- + abort();
+ + // -- GODOT end --
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) #endif
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
@@ -30,8 +36,14 @@
#endif
#if defined(__WIN32__)
-# define NOMINMAX
-# include <windows.h>
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
+#define NOMINMAX
+// -- GODOT start --
+#endif
+#include "windows.h"
+// -- GODOT end --
#endif
/* normally defined in pmmintrin.h, but we always need this */
@@ -413,8 +425,16 @@ namespace embree
__forceinline void pause_cpu(const size_t N = 8)
{
+// -- GODOT start --
for (size_t i=0; i<N; i++)
+#if !(defined(__WIN32__) && defined(__MINGW32__))
+// -- GODOT end --
_mm_pause();
+// -- GODOT start --
+#else
+ usleep(1);
+#endif
+// -- GODOT end --
} }
/* prefetches */
diff --git a/common/sys/library.cpp b/common/sys/library.cpp
index e448b195d..8ec918660 100644
--- a/common/sys/library.cpp
+++ b/common/sys/library.cpp
@@ -27,7 +27,9 @@ namespace embree
/* returns address of a symbol from the library */ diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
void* getSymbol(lib_t lib, const std::string& sym) { index d444b6a2e4..0daf94e50e 100644
- return GetProcAddress(HMODULE(lib),sym.c_str()); --- a/thirdparty/embree/common/algorithms/parallel_reduce.h
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -58,15 +58,19 @@ namespace embree
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
- if (context.is_group_execution_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start -- + // -- GODOT start --
+ return (void*) GetProcAddress(HMODULE(lib),sym.c_str()); + // if (context.is_group_execution_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
- if (tbb::task::self().is_cancelled())
- throw std::runtime_error("task cancelled");
+ // -- GODOT start --
+ // if (tbb::task::self().is_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
return v;
#endif
#else // TASKING_PPL
diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
index 7e7b9faef8..98dc80ad59 100644
--- a/thirdparty/embree/common/lexers/stringstream.cpp
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -39,7 +39,10 @@ namespace embree
std::vector<char> str; str.reserve(64);
while (cin->peek() != EOF && !isSeparator(cin->peek())) {
int c = cin->get();
- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+ // -- GODOT start --
+ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+ if (!isValidChar(c)) abort();
+ // -- GODOT end --
str.push_back((char)c);
}
str.push_back(0);
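The hunks in this patch file all follow one rule: Godot builds Embree with C++ exceptions disabled, so every throw on these paths is downgraded to abort(). A minimal sketch of the resulting failure pattern (a stand-in allocator, not the patched function itself):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

void* allocOrDie(std::size_t size) {
  void* ptr = std::malloc(size);  // stand-in for _mm_malloc(size, align)
  if (size != 0 && ptr == nullptr) {
    std::fprintf(stderr, "embree: allocation failed\n");
    std::abort();                 // was: throw std::bad_alloc()
  }
  return ptr;
}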
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
index 4e8928242e..12f143f131 100644
--- a/thirdparty/embree/common/sys/alloc.cpp
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -21,7 +21,10 @@ namespace embree
void* ptr = _mm_malloc(size,align);
if (size != 0 && ptr == nullptr)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return ptr;
}
@@ -128,7 +131,10 @@ namespace embree
/* fall back to 4k pages */
int flags = MEM_COMMIT | MEM_RESERVE;
char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
- if (ptr == nullptr) throw std::bad_alloc();
+ // -- GODOT start --
+ // if (ptr == nullptr) throw std::bad_alloc();
+ if (ptr == nullptr) abort();
+ // -- GODOT end --
hugepages = false;
return ptr;
}
@@ -145,7 +151,10 @@ namespace embree
return bytesOld;
if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return bytesNew;
}
@@ -156,7 +165,10 @@ namespace embree
return;
if (!VirtualFree(ptr,0,MEM_RELEASE))
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
}
void os_advise(void *ptr, size_t bytes)
@@ -260,7 +272,10 @@ namespace embree
/* fallback to 4k pages */
void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
- if (ptr == MAP_FAILED) throw std::bad_alloc();
+ // -- GODOT start --
+ // if (ptr == MAP_FAILED) throw std::bad_alloc();
+ if (ptr == MAP_FAILED) abort();
+ // -- GODOT end --
hugepages = false;
/* advise huge page hint for THP */
@@ -277,7 +292,10 @@ namespace embree
return bytesOld;
if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
return bytesNew;
}
@@ -291,7 +309,10 @@ namespace embree
const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
bytes = (bytes+pageSize-1) & ~(pageSize-1);
if (munmap(ptr,bytes) == -1)
- throw std::bad_alloc();
+ // -- GODOT start --
+ // throw std::bad_alloc();
+ abort();
+ // -- GODOT end --
}
/* hint for transparent huge pages (THP) */
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
index 7914eb7a52..737f14aa6e 100644
--- a/thirdparty/embree/common/sys/platform.h
+++ b/thirdparty/embree/common/sys/platform.h
@@ -174,11 +174,19 @@
#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
#if defined(DEBUG) // only report file and line in debug mode
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define THROW_RUNTIME_ERROR(str) \
- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+ // -- GODOT end --
#else
+ // -- GODOT start --
+ // #define THROW_RUNTIME_ERROR(str)
+ // throw std::runtime_error(str);
#define THROW_RUNTIME_ERROR(str) \
- throw std::runtime_error(str);
+ abort();
+ // -- GODOT end --
#endif
#define FATAL(x) THROW_RUNTIME_ERROR(x)
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
index 98d7fb9249..ebf656d1a0 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
@@ -48,13 +48,15 @@ namespace embree
{
Task* prevTask = thread.task;
thread.task = this;
- try {
- if (thread.scheduler->cancellingException == nullptr)
+ // -- GODOT start --
+ // try {
+ // if (thread.scheduler->cancellingException == nullptr)
closure->execute();
- } catch (...) {
- if (thread.scheduler->cancellingException == nullptr)
- thread.scheduler->cancellingException = std::current_exception();
- }
+ // } catch (...) {
+ // if (thread.scheduler->cancellingException == nullptr)
+ // thread.scheduler->cancellingException = std::current_exception();
+ // }
+ // -- GODOT end --
thread.task = prevTask;
add_dependencies(-1);
}
@@ -297,8 +299,11 @@ namespace embree
size_t threadIndex = allocThreadIndex();
condition.wait(mutex, [&] () { return hasRootTask.load(); });
mutex.unlock();
- std::exception_ptr except = thread_loop(threadIndex);
- if (except != nullptr) std::rethrow_exception(except);
+ // -- GODOT start --
+ // std::exception_ptr except = thread_loop(threadIndex);
+ // if (except != nullptr) std::rethrow_exception(except);
+ thread_loop(threadIndex);
+ // -- GODOT end -- + // -- GODOT end --
} }
/* closes the shared library */ void TaskScheduler::reset() {
diff --git a/common/sys/mutex.h b/common/sys/mutex.h @@ -330,7 +335,10 @@ namespace embree
index 1164210f2..f0f55340a 100644 return thread->scheduler->cancellingException == nullptr;
--- a/common/sys/mutex.h }
+++ b/common/sys/mutex.h
@@ -47,8 +47,17 @@ namespace embree - std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT start --
+// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+ void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
{
/* allocate thread structure */
std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
@@ -353,9 +361,10 @@ namespace embree
swapThread(oldThread);
/* remember exception to throw */
- std::exception_ptr except = nullptr;
- if (cancellingException != nullptr) except = cancellingException;
-
+ // -- GODOT start --
+ // std::exception_ptr except = nullptr;
+ // if (cancellingException != nullptr) except = cancellingException;
+ // -- GODOT end --
/* wait for all threads to terminate */
threadCounter--;
#if defined(__WIN32__)
@@ -373,7 +382,10 @@ namespace embree
yield();
#endif
}
- return except;
+ // -- GODOT start --
+ // return except;
+ return;
+ // -- GODOT end --
}
bool TaskScheduler::steal_from_other_threads(Thread& thread)
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
index c2a9391aea..8bd70b2b8c 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -123,7 +123,10 @@ namespace embree
{
size_t ofs = bytes + ((align - stackPtr) & (align-1));
if (stackPtr + ofs > CLOSURE_STACK_SIZE)
- throw std::runtime_error("closure stack overflow");
+ // -- GODOT start --
+ // throw std::runtime_error("closure stack overflow");
+ abort();
+ // -- GODOT end --
stackPtr += ofs;
return &stack[stackPtr-bytes];
}
@@ -132,7 +135,10 @@ namespace embree
__forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
{
if (right >= TASK_STACK_SIZE)
- throw std::runtime_error("task stack overflow");
+ // -- GODOT start --
+ // throw std::runtime_error("task stack overflow");
+ abort();
+ // -- GODOT end --
/* allocate new task on right side of stack */
size_t oldStackPtr = stackPtr;
@@ -239,7 +245,10 @@ namespace embree
void wait_for_threads(size_t threadCount);
/*! thread loop for all worker threads */
- std::exception_ptr thread_loop(size_t threadIndex);
+ // -- GODOT start --
+ // std::exception_ptr thread_loop(size_t threadIndex);
+ void thread_loop(size_t threadIndex);
+ // -- GODOT end --
/*! steals a task from a different thread */
bool steal_from_other_threads(Thread& thread);
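Both overflow checks above trade throw std::runtime_error(...) for abort(): without exceptions there is no way to unwind out of a full task stack. A hedged sketch of a fixed-capacity stack with that failure mode (FixedStack is illustrative, not embree's type):

// Sketch only: a fixed-capacity stack that aborts instead of throwing
// when a push would overflow.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

template <typename T, std::size_t Capacity>
class FixedStack {
    T data[Capacity];
    std::size_t top = 0;
public:
    void push(const T& value) {
        if (top >= Capacity) {
            // No unwinding is possible without exceptions, so the only
            // safe reaction to a would-be overflow is to terminate.
            std::fprintf(stderr, "task stack overflow\n");
            std::abort();
        }
        data[top++] = value;
    }
    T pop() { return data[--top]; } // caller must check empty() first
    bool empty() const { return top == 0; }
};

int main() {
    FixedStack<int, 4> stack;
    for (int i = 0; i < 4; i++) stack.push(i); // a fifth push would abort
    while (!stack.empty()) std::printf("%d\n", stack.pop());
    return 0;
}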
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
index 20cdd2d320..aa56035026 100644
--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -150,7 +150,10 @@ namespace embree
}
}
else {
- throw std::runtime_error("not supported node type in bvh_statistics");
+ // -- GODOT start --
+ // throw std::runtime_error("not supported node type in bvh_statistics");
+ abort();
+ // -- GODOT end --
}
return s;
}
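The bvh_statistics change is the same idea applied to an unreachable default case: an unsupported node type is a logic error, and with throw unavailable it becomes fatal. A small stand-in sketch (NodeType and name() are hypothetical, not embree's):

// Sketch only: abort-as-unreachable for an unsupported enumerator.
#include <cstdio>
#include <cstdlib>

enum class NodeType { AlignedNode, UnalignedNode, Leaf };

static const char* name(NodeType type) {
    switch (type) {
        case NodeType::AlignedNode:   return "aligned";
        case NodeType::UnalignedNode: return "unaligned";
        case NodeType::Leaf:          return "leaf";
    }
    std::fprintf(stderr, "not supported node type\n");
    std::abort(); // unreachable for valid inputs, fatal for invalid ones
}

int main() {
    std::printf("%s\n", name(NodeType::Leaf));
    return 0;
}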
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
index ee5c37b238..625fbf6d4f 100644
--- a/thirdparty/embree/kernels/common/rtcore.cpp
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
if (quality != RTC_BUILD_QUALITY_LOW &&
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH)
- throw std::runtime_error("invalid build quality");
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
scene->setBuildQuality(quality);
RTC_CATCH_END2(scene);
}
@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
quality != RTC_BUILD_QUALITY_MEDIUM &&
quality != RTC_BUILD_QUALITY_HIGH &&
quality != RTC_BUILD_QUALITY_REFIT)
- throw std::runtime_error("invalid build quality");
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
geometry->setBuildQuality(quality);
RTC_CATCH_END2(geometry);
}
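Both rtcore.cpp hunks keep the argument validation but make the failure path terminate. An illustrative sketch of such a check, with placeholder enum values rather than the real RTCBuildQuality ABI:

// Sketch only: validate an enum-like argument, abort on bad input.
#include <cstdio>
#include <cstdlib>

enum BuildQuality { QUALITY_LOW, QUALITY_MEDIUM, QUALITY_HIGH };

static void set_build_quality(int quality) {
    if (quality != QUALITY_LOW && quality != QUALITY_MEDIUM &&
        quality != QUALITY_HIGH) {
        std::fprintf(stderr, "invalid build quality\n");
        std::abort(); // mirrors the patch: abort() where throw used to be
    }
    std::printf("build quality set to %d\n", quality);
}

int main() {
    set_build_quality(QUALITY_HIGH);
    return 0;
}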
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
index 6583d12d57..4b070e122b 100644
--- a/thirdparty/embree/kernels/common/rtcore.h
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -25,52 +25,58 @@ namespace embree
#endif
/*! Macros used in the rtcore API implementation */
-#define RTC_CATCH_BEGIN try {
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
-#define RTC_CATCH_END(device) \
- } catch (std::bad_alloc&) { \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- } catch (rtcore_error& e) { \
- Device::process_error(device,e.error,e.what()); \
- } catch (std::exception& e) { \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- } catch (...) { \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- }
+// #define RTC_CATCH_END(device) \
+// } catch (std::bad_alloc&) { \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END(device)
-#define RTC_CATCH_END2(scene) \
- } catch (std::bad_alloc&) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- } catch (rtcore_error& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,e.error,e.what()); \
- } catch (std::exception& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- } catch (...) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- }
+// #define RTC_CATCH_END2(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END2(scene)
-#define RTC_CATCH_END2_FALSE(scene) \
- } catch (std::bad_alloc&) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
- return false; \
- } catch (rtcore_error& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,e.error,e.what()); \
- return false; \
- } catch (std::exception& e) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
- return false; \
- } catch (...) { \
- Device* device = scene ? scene->device : nullptr; \
- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
- return false; \
- }
+// #define RTC_CATCH_END2_FALSE(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// return false; \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// return false; \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// return false; \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// return false; \
+// }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
#define RTC_VERIFY_HANDLE(handle) \
if (handle == nullptr) { \
@@ -97,28 +103,38 @@ namespace embree
#define RTC_TRACE(x)
#endif
- /*! used to throw embree API errors */
- struct rtcore_error : public std::exception
- {
- __forceinline rtcore_error(RTCError error, const std::string& str)
- : error(error), str(str) {}
-
- ~rtcore_error() throw() {}
-
- const char* what () const throw () {
- return str.c_str();
- }
-
- RTCError error;
- std::string str;
- };
+// -- GODOT begin --
+// /*! used to throw embree API errors */
+// struct rtcore_error : public std::exception
+// {
+// __forceinline rtcore_error(RTCError error, const std::string& str)
+// : error(error), str(str) {}
+//
+// ~rtcore_error() throw() {}
+//
+// const char* what () const throw () {
+// return str.c_str();
+// }
+//
+// RTCError error;
+// std::string str;
+// };
+// -- GODOT end --
#if defined(DEBUG) // only report file and line in debug mode
+ // -- GODOT begin --
+ // #define throw_RTCError(error,str) \
+ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
#define throw_RTCError(error,str) \
- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+ // -- GODOT end --
#else
+ // -- GODOT begin --
+ // #define throw_RTCError(error,str) \
+ // throw rtcore_error(error,str);
#define throw_RTCError(error,str) \
- throw rtcore_error(error,str);
+ abort();
+ // -- GODOT end --
#endif
#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
index e75aa968f9..1e23aeb415 100644
--- a/thirdparty/embree/kernels/common/scene.cpp
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -800,16 +800,18 @@ namespace embree
}
/* initiate build */
- try {
+ // -- GODOT start --
+ // try {
scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
- }
- catch (...) {
- accels_clear();
- updateInterface();
- Lock<MutexSys> lock(schedulerMutex);
- this->scheduler = nullptr;
- throw;
- }
+ // }
+ // catch (...) {
+ // accels_clear();
+ // updateInterface();
+ // Lock<MutexSys> lock(schedulerMutex);
+ // this->scheduler = nullptr;
+ // throw;
+ // }
+ // -- GODOT end --
}
#endif
/* closes the shared library */
diff --git a/common/sys/mutex.h b/common/sys/mutex.h
index 1164210f2..f0f55340a 100644
--- a/common/sys/mutex.h
+++ b/common/sys/mutex.h
@@ -47,8 +47,17 @@ namespace embree
{
while (flag.load())
{
+// -- GODOT start --
+#if !(defined (__WIN32__) && defined (__MINGW32__))
+// -- GODOT end --
_mm_pause();
_mm_pause();
+// -- GODOT start --
+#else
+ __builtin_ia32_pause();
+ __builtin_ia32_pause();
+#endif
+// -- GODOT end --
}
bool expected = false;
@@ -74,8 +82,17 @@ namespace embree
{
while(flag.load())
{
+// -- GODOT start --
+#if !(defined (__WIN32__) && defined(__MINGW32__))
+// -- GODOT end --
_mm_pause();
_mm_pause();
+// -- GODOT start --
+#else
+ __builtin_ia32_pause();
+ __builtin_ia32_pause();
+#endif
+// -- GODOT end --
}
}
diff --git a/common/sys/platform.h b/common/sys/platform.h
index 96f9aab01..08617452f 100644
--- a/common/sys/platform.h
+++ b/common/sys/platform.h
@@ -141,6 +141,9 @@
#define DELETED = delete
#endif
+// -- GODOT start --
+#if !defined(likely)
+// -- GODOT end --
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define likely(expr) (expr)
#define unlikely(expr) (expr)
@@ -148,6 +151,9 @@
#define likely(expr) __builtin_expect((bool)(expr),true )
#define unlikely(expr) __builtin_expect((bool)(expr),false)
#endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
////////////////////////////////////////////////////////////////////////////////
/// Error handling and debugging
diff --git a/common/sys/sysinfo.cpp b/common/sys/sysinfo.cpp
index eb0a10eaf..74438260d 100644
--- a/common/sys/sysinfo.cpp
+++ b/common/sys/sysinfo.cpp
@@ -233,7 +233,7 @@ namespace embree
__noinline int64_t get_xcr0()
{
-#if defined (__WIN32__)
+#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */
int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
xcr0 = _xgetbv(0);
return xcr0;
diff --git a/common/tasking/taskschedulerinternal.cpp b/common/tasking/taskschedulerinternal.cpp
index 2152e92f4..923d62f83 100644
--- a/common/tasking/taskschedulerinternal.cpp
+++ b/common/tasking/taskschedulerinternal.cpp
@@ -361,7 +361,15 @@ namespace embree
if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
yield();
else
+// -- GODOT start --
+#if !defined(__MINGW32__)
+// -- GODOT end --
_mm_pause();
+// -- GODOT start --
+#else
+ usleep(1);
+#endif
+// -- GODOT end --
loopIndex++;
#else
yield();
diff --git a/common/tasking/taskschedulertbb.h b/common/tasking/taskschedulertbb.h
index 98dba2687..369e5edf0 100644
--- a/common/tasking/taskschedulertbb.h
+++ b/common/tasking/taskschedulertbb.h
@@ -12,7 +12,13 @@
#include "../sys/ref.h"
#if defined(__WIN32__)
+// -- GODOT start --
+#if !defined(NOMINMAX)
+// -- GODOT end --
# define NOMINMAX
+// -- GODOT start --
+#endif
+// -- GODOT end --
#endif
// We need to define these to avoid implicit linkage against
diff a/include/embree3/rtcore_common.h b/include/embree3/rtcore_common.h
--- a/include/embree3/rtcore_common.h
+++ b/include/embree3/rtcore_common.h
@@ -19,7 +19,7 @@
#endif
#endif
-#ifdef _WIN32
+#if defined(_WIN32) && defined(_MSC_VER)
# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
#else
# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
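The older patch shown above carried the MinGW-specific spin-wait workarounds (__builtin_ia32_pause() in mutex.h, usleep(1) in the scheduler loop). A minimal sketch of a portable pause helper built on the same builtins, assuming an x86 target; cpu_pause() is a hypothetical wrapper, not embree code:

// Sketch only: a pause hint inside a busy-wait loop.
#include <atomic>
#include <cstdio>
#include <thread>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

static inline void cpu_pause() {
#if defined(_MSC_VER)
    _mm_pause();            // MSVC intrinsic
#else
    __builtin_ia32_pause(); // GCC/Clang builtin, the MinGW-safe spelling
#endif
}

int main() {
    std::atomic<bool> ready{false};
    std::thread producer([&] { ready.store(true); });
    while (!ready.load())
        cpu_pause(); // hint to the CPU that this is a spin-wait
    producer.join();
    std::printf("done\n");
    return 0;
}

Both spellings emit the same x86 PAUSE instruction; on POSIX toolchains this example needs -pthread to link.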