326 lines
11 KiB
C++
326 lines
11 KiB
C++
|
// Copyright 2009-2020 Intel Corporation
|
||
|
// SPDX-License-Identifier: Apache-2.0
|
||
|
|
||
|
#pragma once
|
||
|
|
||
|
#include "../common/default.h"
|
||
|
|
||
|
/* force a complete cache invalidation when running out of allocation space */
|
||
|
#define FORCE_SIMPLE_FLUSH 0
|
||
|
|
||
|
#define THREAD_BLOCK_ATOMIC_ADD 4
|
||
|
|
||
|
#if defined(DEBUG)
|
||
|
#define CACHE_STATS(x)
|
||
|
#else
|
||
|
#define CACHE_STATS(x)
|
||
|
#endif
|
||
|
|
||
|
namespace embree
|
||
|
{
|
||
|
class SharedTessellationCacheStats
|
||
|
{
|
||
|
public:
|
||
|
/* stats */
|
||
|
static std::atomic<size_t> cache_accesses;
|
||
|
static std::atomic<size_t> cache_hits;
|
||
|
static std::atomic<size_t> cache_misses;
|
||
|
static std::atomic<size_t> cache_flushes;
|
||
|
static size_t cache_num_patches;
|
||
|
__aligned(64) static SpinLock mtx;
|
||
|
|
||
|
/* print stats for debugging */
|
||
|
static void printStats();
|
||
|
static void clearStats();
|
||
|
};
|
||
|
|
||
|
void resizeTessellationCache(size_t new_size);
|
||
|
void resetTessellationCache();
|
||
|
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
struct __aligned(64) ThreadWorkState
|
||
|
{
|
||
|
ALIGNED_STRUCT_(64);
|
||
|
|
||
|
std::atomic<size_t> counter;
|
||
|
ThreadWorkState* next;
|
||
|
bool allocated;
|
||
|
|
||
|
__forceinline ThreadWorkState(bool allocated = false)
|
||
|
: counter(0), next(nullptr), allocated(allocated)
|
||
|
{
|
||
|
assert( ((size_t)this % 64) == 0 );
|
||
|
}
|
||
|
};
|
||
|
|
||
|
class __aligned(64) SharedLazyTessellationCache
|
||
|
{
|
||
|
public:
|
||
|
|
||
|
static const size_t NUM_CACHE_SEGMENTS = 8;
|
||
|
static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
|
||
|
static const size_t COMMIT_INDEX_SHIFT = 32+8;
|
||
|
#if defined(__X86_64__) || defined(__aarch64__)
|
||
|
static const size_t REF_TAG_MASK = 0xffffffffff;
|
||
|
#else
|
||
|
static const size_t REF_TAG_MASK = 0x7FFFFFFF;
|
||
|
#endif
|
||
|
static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
|
||
|
static const size_t BLOCK_SIZE = 64;
|
||
|
|
||
|
|
||
|
/*! Per thread tessellation ref cache */
|
||
|
static __thread ThreadWorkState* init_t_state;
|
||
|
static ThreadWorkState* current_t_state;
|
||
|
|
||
|
static __forceinline ThreadWorkState *threadState()
|
||
|
{
|
||
|
if (unlikely(!init_t_state))
|
||
|
/* sets init_t_state, can't return pointer due to macosx icc bug*/
|
||
|
SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
|
||
|
return init_t_state;
|
||
|
}
|
||
|
|
||
|
struct Tag
|
||
|
{
|
||
|
__forceinline Tag() : data(0) {}
|
||
|
|
||
|
__forceinline Tag(void* ptr, size_t combinedTime) {
|
||
|
init(ptr,combinedTime);
|
||
|
}
|
||
|
|
||
|
__forceinline Tag(size_t ptr, size_t combinedTime) {
|
||
|
init((void*)ptr,combinedTime);
|
||
|
}
|
||
|
|
||
|
__forceinline void init(void* ptr, size_t combinedTime)
|
||
|
{
|
||
|
if (ptr == nullptr) {
|
||
|
data = 0;
|
||
|
return;
|
||
|
}
|
||
|
int64_t new_root_ref = (int64_t) ptr;
|
||
|
new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
|
||
|
assert( new_root_ref <= (int64_t)REF_TAG_MASK );
|
||
|
new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
|
||
|
data = new_root_ref;
|
||
|
}
|
||
|
|
||
|
__forceinline int64_t get() const { return data.load(); }
|
||
|
__forceinline void set( int64_t v ) { data.store(v); }
|
||
|
__forceinline void reset() { data.store(0); }
|
||
|
|
||
|
private:
|
||
|
atomic<int64_t> data;
|
||
|
};
|
||
|
|
||
|
static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }
|
||
|
|
||
|
struct CacheEntry
|
||
|
{
|
||
|
Tag tag;
|
||
|
SpinLock mutex;
|
||
|
};
|
||
|
|
||
|
private:
|
||
|
|
||
|
float *data;
|
||
|
bool hugepages;
|
||
|
size_t size;
|
||
|
size_t maxBlocks;
|
||
|
ThreadWorkState *threadWorkState;
|
||
|
|
||
|
__aligned(64) std::atomic<size_t> localTime;
|
||
|
__aligned(64) std::atomic<size_t> next_block;
|
||
|
__aligned(64) SpinLock reset_state;
|
||
|
__aligned(64) SpinLock linkedlist_mtx;
|
||
|
__aligned(64) std::atomic<size_t> switch_block_threshold;
|
||
|
__aligned(64) std::atomic<size_t> numRenderThreads;
|
||
|
|
||
|
|
||
|
public:
|
||
|
|
||
|
|
||
|
SharedLazyTessellationCache();
|
||
|
~SharedLazyTessellationCache();
|
||
|
|
||
|
void getNextRenderThreadWorkState();
|
||
|
|
||
|
__forceinline size_t maxAllocSize() const {
|
||
|
return switch_block_threshold;
|
||
|
}
|
||
|
|
||
|
__forceinline size_t getCurrentIndex() { return localTime.load(); }
|
||
|
__forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }
|
||
|
|
||
|
__forceinline size_t getTime(const size_t globalTime) {
|
||
|
return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
|
||
|
}
|
||
|
|
||
|
|
||
|
__forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
|
||
|
__forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }
|
||
|
|
||
|
__forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
|
||
|
|
||
|
static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); }
|
||
|
static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
|
||
|
static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
|
||
|
static __forceinline size_t getState() { return threadState()->counter.load(); }
|
||
|
static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
|
||
|
|
||
|
static __forceinline size_t getTCacheTime(const size_t globalTime) {
|
||
|
return sharedLazyTessellationCache.getTime(globalTime);
|
||
|
}
|
||
|
|
||
|
/* per thread lock */
|
||
|
__forceinline void lockThreadLoop (ThreadWorkState *const t_state)
|
||
|
{
|
||
|
while(1)
|
||
|
{
|
||
|
size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
|
||
|
if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
|
||
|
{
|
||
|
/* lock failed wait until sync phase is over */
|
||
|
sharedLazyTessellationCache.unlockThread(t_state,-1);
|
||
|
sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
|
||
|
}
|
||
|
else
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
|
||
|
{
|
||
|
const int64_t subdiv_patch_root_ref = entry.tag.get();
|
||
|
CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
|
||
|
|
||
|
if (likely(subdiv_patch_root_ref != 0))
|
||
|
{
|
||
|
const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
|
||
|
const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
|
||
|
|
||
|
if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
|
||
|
{
|
||
|
CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
|
||
|
return (void*) subdiv_patch_root;
|
||
|
}
|
||
|
}
|
||
|
CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
|
||
|
return nullptr;
|
||
|
}
|
||
|
|
||
|
template<typename Constructor>
|
||
|
static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
|
||
|
{
|
||
|
ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
|
||
|
|
||
|
while (true)
|
||
|
{
|
||
|
sharedLazyTessellationCache.lockThreadLoop(t_state);
|
||
|
void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
|
||
|
if (patch) return (decltype(constructor())) patch;
|
||
|
|
||
|
if (entry.mutex.try_lock())
|
||
|
{
|
||
|
if (!validTag(entry.tag,globalTime))
|
||
|
{
|
||
|
auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
|
||
|
auto ret = constructor(); // thread is locked here!
|
||
|
assert(ret);
|
||
|
/* this should never return nullptr */
|
||
|
auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
|
||
|
auto time = before ? timeBefore : timeAfter;
|
||
|
__memory_barrier();
|
||
|
entry.tag = SharedLazyTessellationCache::Tag(ret,time);
|
||
|
__memory_barrier();
|
||
|
entry.mutex.unlock();
|
||
|
return ret;
|
||
|
}
|
||
|
entry.mutex.unlock();
|
||
|
}
|
||
|
SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
|
||
|
{
|
||
|
#if FORCE_SIMPLE_FLUSH == 1
|
||
|
return i == getTime(globalTime);
|
||
|
#else
|
||
|
return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
|
||
|
{
|
||
|
return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
|
||
|
}
|
||
|
|
||
|
|
||
|
static __forceinline bool validTag(const Tag& tag, size_t globalTime)
|
||
|
{
|
||
|
const int64_t subdiv_patch_root_ref = tag.get();
|
||
|
if (subdiv_patch_root_ref == 0) return false;
|
||
|
const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
|
||
|
return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
|
||
|
}
|
||
|
|
||
|
void waitForUsersLessEqual(ThreadWorkState *const t_state,
|
||
|
const unsigned int users);
|
||
|
|
||
|
__forceinline size_t alloc(const size_t blocks)
|
||
|
{
|
||
|
if (unlikely(blocks >= switch_block_threshold))
|
||
|
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
|
||
|
|
||
|
assert(blocks < switch_block_threshold);
|
||
|
size_t index = next_block.fetch_add(blocks);
|
||
|
if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
|
||
|
return index;
|
||
|
}
|
||
|
|
||
|
static __forceinline void* malloc(const size_t bytes)
|
||
|
{
|
||
|
size_t block_index = -1;
|
||
|
ThreadWorkState *const t_state = threadState();
|
||
|
while (true)
|
||
|
{
|
||
|
block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
|
||
|
if (block_index == (size_t)-1)
|
||
|
{
|
||
|
sharedLazyTessellationCache.unlockThread(t_state);
|
||
|
sharedLazyTessellationCache.allocNextSegment();
|
||
|
sharedLazyTessellationCache.lockThread(t_state);
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
return sharedLazyTessellationCache.getBlockPtr(block_index);
|
||
|
}
|
||
|
|
||
|
__forceinline void *getBlockPtr(const size_t block_index)
|
||
|
{
|
||
|
assert(block_index < maxBlocks);
|
||
|
assert(data);
|
||
|
assert(block_index*16 <= size);
|
||
|
return (void*)&data[block_index*16];
|
||
|
}
|
||
|
|
||
|
__forceinline void* getDataPtr() { return data; }
|
||
|
__forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
|
||
|
__forceinline size_t getMaxBlocks() { return maxBlocks; }
|
||
|
__forceinline size_t getSize() { return size; }
|
||
|
|
||
|
void allocNextSegment();
|
||
|
void realloc(const size_t newSize);
|
||
|
|
||
|
void reset();
|
||
|
|
||
|
static SharedLazyTessellationCache sharedLazyTessellationCache;
|
||
|
};
|
||
|
}
|