// Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "default.h" namespace embree { /*! An item on the stack holds the node ID and distance of that node. */ template struct __aligned(16) StackItemT { /*! assert that the xchg function works */ static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); __forceinline StackItemT() {} __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} /*! use SSE instructions to swap stack items */ __forceinline static void xchg(StackItemT& a, StackItemT& b) { const vfloat4 sse_a = vfloat4::load((float*)&a); const vfloat4 sse_b = vfloat4::load((float*)&b); vfloat4::store(&a,sse_b); vfloat4::store(&b,sse_a); } /*! Sort 2 stack items. */ __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { if (s2.dist < s1.dist) xchg(s2,s1); } /*! Sort 3 stack items. */ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) { if (s2.dist < s1.dist) xchg(s2,s1); if (s3.dist < s2.dist) xchg(s3,s2); if (s2.dist < s1.dist) xchg(s2,s1); } /*! Sort 4 stack items. */ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) { if (s2.dist < s1.dist) xchg(s2,s1); if (s4.dist < s3.dist) xchg(s4,s3); if (s3.dist < s1.dist) xchg(s3,s1); if (s4.dist < s2.dist) xchg(s4,s2); if (s3.dist < s2.dist) xchg(s3,s2); } /*! use SSE instructions to swap stack items */ __forceinline static void cmp_xchg(vint4& a, vint4& b) { #if defined(__AVX512VL__) const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); #else const vboolf4 mask0(b < a); const vboolf4 mask(shuffle<2,2,2,2>(mask0)); #endif const vint4 c = select(mask,b,a); const vint4 d = select(mask,a,b); a = c; b = d; } /*! Sort 3 stack items. */ __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) { cmp_xchg(s2,s1); cmp_xchg(s3,s2); cmp_xchg(s2,s1); } /*! Sort 4 stack items. */ __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) { cmp_xchg(s2,s1); cmp_xchg(s4,s3); cmp_xchg(s3,s1); cmp_xchg(s4,s2); cmp_xchg(s3,s2); } /*! Sort N stack items. */ __forceinline friend void sort(StackItemT* begin, StackItemT* end) { for (StackItemT* i = begin+1; i != end; ++i) { const vfloat4 item = vfloat4::load((float*)i); const unsigned dist = i->dist; StackItemT* j = i; while ((j != begin) && ((j-1)->dist < dist)) { vfloat4::store(j, vfloat4::load((float*)(j-1))); --j; } vfloat4::store(j, item); } } public: T ptr; unsigned dist; }; /*! An item on the stack holds the node ID and active ray mask. */ template struct __aligned(8) StackItemMaskT { T ptr; size_t mask; }; struct __aligned(8) StackItemMaskCoherent { size_t mask; size_t parent; size_t child; }; }