// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #define MBLUR_NUM_TEMPORAL_BINS 2 #define MBLUR_NUM_OBJECT_BINS 32 #include "../bvh/bvh.h" #include "../common/primref_mb.h" #include "heuristic_binning_array_aligned.h" #include "heuristic_timesplit_array.h" namespace embree { namespace isa { template<typename T> struct SharedVector { __forceinline SharedVector() {} __forceinline SharedVector(T* ptr, size_t refCount = 1) : prims(ptr), refCount(refCount) {} __forceinline void incRef() { refCount++; } __forceinline void decRef() { if (--refCount == 0) delete prims; } T* prims; size_t refCount; }; template<typename BuildRecord, int MAX_BRANCHING_FACTOR> struct LocalChildListT { typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; __forceinline LocalChildListT (const BuildRecord& record) : numChildren(1), numSharedPrimVecs(1) { /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ children[0] = record; primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); } __forceinline ~LocalChildListT() { for (size_t i = 0; i < numChildren; i++) primvecs[i]->decRef(); } __forceinline BuildRecord& operator[] ( const size_t i ) { return children[i]; } __forceinline size_t size() const { return numChildren; } __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector) { SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; if (lrecord.prims.prims == bsharedPrimVec->prims) { primvecs[bestChild] = bsharedPrimVec; bsharedPrimVec->incRef(); } else { primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); } if (rrecord.prims.prims == bsharedPrimVec->prims) { primvecs[numChildren] = bsharedPrimVec; bsharedPrimVec->incRef(); } else { primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); } bsharedPrimVec->decRef(); new_vector.release(); children[bestChild] = lrecord; children[numChildren] = rrecord; numChildren++; } public: array_t<BuildRecord,MAX_BRANCHING_FACTOR> children; array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs; size_t numChildren; array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; size_t numSharedPrimVecs; }; template<typename Mesh> struct RecalculatePrimRef { Scene* scene; __forceinline RecalculatePrimRef (Scene* scene) : scene(scene) {} __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const { const unsigned geomID = prim.geomID(); const unsigned primID = prim.primID(); const Mesh* mesh = scene->get<Mesh>(geomID); const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); const range<int> tbounds = mesh->timeSegmentRange(time_range); return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); } // __noinline is workaround for ICC16 bug under MacOSX __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { const unsigned geomID = prim.geomID(); const unsigned primID = prim.primID(); const Mesh* mesh = scene->get<Mesh>(geomID); const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); const range<int> tbounds = mesh->timeSegmentRange(time_range); return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); } __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range); } // __noinline is workaround for ICC16 bug under MacOSX __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range); } }; struct VirtualRecalculatePrimRef { Scene* scene; __forceinline VirtualRecalculatePrimRef (Scene* scene) : scene(scene) {} __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const { const unsigned geomID = prim.geomID(); const unsigned primID = prim.primID(); const Geometry* mesh = scene->get(geomID); const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); const range<int> tbounds = mesh->timeSegmentRange(time_range); return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); } __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { const unsigned geomID = prim.geomID(); const unsigned primID = prim.primID(); const Geometry* mesh = scene->get(geomID); const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); const range<int> tbounds = mesh->timeSegmentRange(time_range); return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); } __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); } __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); } }; struct BVHBuilderMSMBlur { /*! settings for msmblur builder */ struct Settings { /*! default settings */ Settings () : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), singleThreadThreshold(1024) {} Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) { minLeafSize = min(minLeafSize,maxLeafSize); } public: size_t branchingFactor; //!< branching factor of BVH to build size_t maxDepth; //!< maximum depth of BVH to build size_t logBlockSize; //!< log2 of blocksize for SAH heuristic size_t minLeafSize; //!< minimum size of a leaf size_t maxLeafSize; //!< maximum size of a leaf float travCost; //!< estimated cost of one traversal step float intCost; //!< estimated cost of one primitive intersection bool singleLeafTimeSegment; //!< split time to single time range size_t singleThreadThreshold; //!< threshold when we switch to single threaded build }; struct BuildRecord { public: __forceinline BuildRecord () {} __forceinline BuildRecord (size_t depth) : depth(depth) {} __forceinline BuildRecord (const SetMB& prims, size_t depth) : depth(depth), prims(prims) {} __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { return a.prims.size() < b.prims.size(); } __forceinline size_t size() const { return prims.size(); } public: size_t depth; //!< Depth of the root of this subtree. SetMB prims; //!< The list of primitives. }; struct BuildRecordSplit : public BuildRecord { __forceinline BuildRecordSplit () {} __forceinline BuildRecordSplit (size_t depth) : BuildRecord(depth) {} __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split) : BuildRecord(record), split(split) {} BinSplit<MBLUR_NUM_OBJECT_BINS> split; }; template< typename NodeRef, typename RecalculatePrimRef, typename Allocator, typename CreateAllocFunc, typename CreateNodeFunc, typename SetNodeFunc, typename CreateLeafFunc, typename ProgressMonitor> class BuilderT { ALIGNED_CLASS_(16); static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; typedef mvector<PrimRefMB>* PrimRefVector; typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit; public: BuilderT (MemoryMonitorInterface* device, const RecalculatePrimRef recalculatePrimRef, const CreateAllocFunc createAlloc, const CreateNodeFunc createNode, const SetNodeFunc setNode, const CreateLeafFunc createLeaf, const ProgressMonitor progressMonitor, const Settings& settings) : cfg(settings), heuristicObjectSplit(), heuristicTemporalSplit(device, recalculatePrimRef), recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), progressMonitor(progressMonitor) { if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); } /*! finds the best split */ const Split find(const SetMB& set) { /* first try standard object split */ const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); const float object_split_sah = object_split.splitSAH(); /* test temporal splits only when object split was bad */ const float leaf_sah = set.leafSAH(cfg.logBlockSize); if (object_split_sah < 0.50f*leaf_sah) return object_split; /* do temporal splits only if the time range is big enough */ if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) { const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); const float temporal_split_sah = temporal_split.splitSAH(); /* take temporal split if it improved SAH */ if (temporal_split_sah < object_split_sah) return temporal_split; } return object_split; } /*! array partitioning */ __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) { /* perform object split */ if (likely(split.data == Split::SPLIT_OBJECT)) { heuristicObjectSplit.split(split,set,lset,rset); } /* perform temporal split */ else if (likely(split.data == Split::SPLIT_TEMPORAL)) { return heuristicTemporalSplit.split(split,set,lset,rset); } /* perform fallback split */ else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { set.deterministic_order(); splitFallback(set,lset,rset); } /* split by geometry */ else if (unlikely(split.data == Split::SPLIT_GEOMID)) { set.deterministic_order(); splitByGeometry(set,lset,rset); } else assert(false); return std::unique_ptr<mvector<PrimRefMB>>(); } /*! finds the best fallback split */ __noinline Split findFallback(const SetMB& set) { /* split if primitives are not from same geometry */ if (!sameGeometry(set)) return Split(0.0f,Split::SPLIT_GEOMID); /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ if (cfg.singleLeafTimeSegment) { /* test if one primitive has more than one time segment in time range, if so split time */ for (size_t i=set.begin(); i<set.end(); i++) { const PrimRefMB& prim = (*set.prims)[i]; const range<int> itime_range = prim.timeSegmentRange(set.time_range); const int localTimeSegments = itime_range.size(); assert(localTimeSegments > 0); if (localTimeSegments > 1) { const int icenter = (itime_range.begin() + itime_range.end())/2; const float splitTime = prim.timeStep(icenter); return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); } } } /* otherwise return fallback split */ return Split(0.0f,Split::SPLIT_FALLBACK); } /*! performs fallback split */ void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) { mvector<PrimRefMB>& prims = *set.prims; const size_t begin = set.begin(); const size_t end = set.end(); const size_t center = (begin + end)/2; PrimInfoMB linfo = empty; for (size_t i=begin; i<center; i++) linfo.add_primref(prims[i]); PrimInfoMB rinfo = empty; for (size_t i=center; i<end; i++) rinfo.add_primref(prims[i]); new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); } /*! checks if all primitives are from the same geometry */ __forceinline bool sameGeometry(const SetMB& set) { if (set.size() == 0) return true; mvector<PrimRefMB>& prims = *set.prims; const size_t begin = set.begin(); const size_t end = set.end(); unsigned int firstGeomID = prims[begin].geomID(); for (size_t i=begin+1; i<end; i++) { if (prims[i].geomID() != firstGeomID){ return false; } } return true; } /* split by geometry ID */ void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) { assert(set.size() > 1); mvector<PrimRefMB>& prims = *set.prims; const size_t begin = set.begin(); const size_t end = set.end(); PrimInfoMB left(empty); PrimInfoMB right(empty); unsigned int geomID = prims[begin].geomID(); size_t center = serial_partitioning(prims.data(),begin,end,left,right, [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); } const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) { /* this should never occur but is a fatal error */ if (in.depth > cfg.maxDepth) throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); /* replace already found split by fallback split */ const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); /* special case when directly creating leaf without any splits that could shrink time_range */ bool force_split = false; if (current.depth == 1 && current.size() > 0) { BBox1f c = empty; BBox1f p = current.prims.time_range; for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { mvector<PrimRefMB>& prims = *current.prims.prims; c.extend(prims[i].time_range); } force_split = c.lower > p.lower || c.upper < p.upper; } /* create leaf for few primitives */ if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) return createLeaf(current,alloc); /* fill all children by always splitting the largest one */ bool hasTimeSplits = false; NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; LocalChildListSplit children(current); do { /* find best child with largest bounding box area */ size_t bestChild = -1; size_t bestSize = 0; for (size_t i=0; i<children.size(); i++) { /* ignore leaves as they cannot get split */ if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split) continue; force_split = false; /* remember child with largest size */ if (children[i].size() > bestSize) { bestSize = children[i].size(); bestChild = i; } } if (bestChild == -1) break; /* perform best found split */ BuildRecordSplit& brecord = children[bestChild]; BuildRecordSplit lrecord(current.depth+1); BuildRecordSplit rrecord(current.depth+1); std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); hasTimeSplits |= new_vector != nullptr; /* find new splits */ lrecord.split = findFallback(lrecord.prims); rrecord.split = findFallback(rrecord.prims); children.split(bestChild,lrecord,rrecord,std::move(new_vector)); } while (children.size() < cfg.branchingFactor); /* detect time_ranges that have shrunken */ for (size_t i=0; i<children.size(); i++) { const BBox1f c = children[i].prims.time_range; const BBox1f p = in.prims.time_range; hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; } /* create node */ auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); /* recurse into each child and perform reduction */ LBBox3fa gbounds = empty; for (size_t i=0; i<children.size(); i++) { values[i] = createLargeLeaf(children[i],alloc); gbounds.extend(values[i].lbounds); } setNode(current,children.children.data(),node,values,children.numChildren); /* calculate geometry bounds of this node */ if (hasTimeSplits) return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); else return NodeRecordMB4D(node,gbounds,current.prims.time_range); } const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel) { /* get thread local allocator */ if (!alloc) alloc = createAlloc(); /* call memory monitor function to signal progress */ if (toplevel && current.size() <= cfg.singleThreadThreshold) progressMonitor(current.size()); /*! find best split */ const Split csplit = find(current.prims); /*! compute leaf and split cost */ const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH(); assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); /*! create a leaf node when threshold reached or SAH tells us to stop */ if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { current.prims.deterministic_order(); return createLargeLeaf(current,alloc); } /*! perform initial split */ SetMB lprims,rprims; std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims); bool hasTimeSplits = new_vector != nullptr; NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; LocalChildList children(current); { BuildRecord lrecord(lprims,current.depth+1); BuildRecord rrecord(rprims,current.depth+1); children.split(0,lrecord,rrecord,std::move(new_vector)); } /*! split until node is full or SAH tells us to stop */ while (children.size() < cfg.branchingFactor) { /*! find best child to split */ float bestArea = neg_inf; ssize_t bestChild = -1; for (size_t i=0; i<children.size(); i++) { if (children[i].size() <= cfg.minLeafSize) continue; if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) { bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); } } if (bestChild == -1) break; /* perform split */ BuildRecord& brecord = children[bestChild]; BuildRecord lrecord(current.depth+1); BuildRecord rrecord(current.depth+1); Split csplit = find(brecord.prims); std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); hasTimeSplits |= new_vector != nullptr; children.split(bestChild,lrecord,rrecord,std::move(new_vector)); } /* detect time_ranges that have shrunken */ for (size_t i=0; i<children.size(); i++) { const BBox1f c = children[i].prims.time_range; const BBox1f p = current.prims.time_range; hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; } /* sort buildrecords for simpler shadow ray traversal */ //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! /*! create an inner node */ auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); LBBox3fa gbounds = empty; /* spawn tasks */ if (unlikely(current.size() > cfg.singleThreadThreshold)) { /*! parallel_for is faster than spawing sub-tasks */ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { for (size_t i=r.begin(); i<r.end(); i++) { values[i] = recurse(children[i],nullptr,true); _mm_mfence(); // to allow non-temporal stores during build } }); /*! merge bounding boxes */ for (size_t i=0; i<children.size(); i++) gbounds.extend(values[i].lbounds); } /* recurse into each child */ else { //for (size_t i=0; i<children.size(); i++) for (ssize_t i=children.size()-1; i>=0; i--) { values[i] = recurse(children[i],alloc,false); gbounds.extend(values[i].lbounds); } } setNode(current,children.children.data(),node,values,children.numChildren); /* calculate geometry bounds of this node */ if (unlikely(hasTimeSplits)) return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); else return NodeRecordMB4D(node,gbounds,current.prims.time_range); } /*! builder entry function */ __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) { const SetMB set(pinfo,&prims); auto ret = recurse(BuildRecord(set,1),nullptr,true); _mm_mfence(); // to allow non-temporal stores during build return ret; } private: Settings cfg; HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit; HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit; const RecalculatePrimRef recalculatePrimRef; const CreateAllocFunc createAlloc; const CreateNodeFunc createNode; const SetNodeFunc setNode; const CreateLeafFunc createLeaf; const ProgressMonitor progressMonitor; }; template<typename NodeRef, typename RecalculatePrimRef, typename CreateAllocFunc, typename CreateNodeFunc, typename SetNodeFunc, typename CreateLeafFunc, typename ProgressMonitorFunc> static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo, MemoryMonitorInterface* device, const RecalculatePrimRef recalculatePrimRef, const CreateAllocFunc createAlloc, const CreateNodeFunc createNode, const SetNodeFunc setNode, const CreateLeafFunc createLeaf, const ProgressMonitorFunc progressMonitor, const Settings& settings) { typedef BuilderT< NodeRef, RecalculatePrimRef, decltype(createAlloc()), CreateAllocFunc, CreateNodeFunc, SetNodeFunc, CreateLeafFunc, ProgressMonitorFunc> Builder; Builder builder(device, recalculatePrimRef, createAlloc, createNode, setNode, createLeaf, progressMonitor, settings); return builder(prims,pinfo); } }; } }