Robustify multi-threading primitives

2023-01-28 13:34:06 +01:00 · 2023-01-28 13:34:06 +01:00 · 32a9227f5b
commit 32a9227f5b
parent 7d135cb962
6 changed files with 94 additions and 8 deletions
--- a/core/bind/core_bind.cpp
+++ b/core/bind/core_bind.cpp
@ -2807,7 +2807,27 @@ void _Thread::_start_func(void *ud) {

 	Thread::set_name(t->target_method);

-	t->ret = target_instance->call(t->target_method, arg, argc, ce);
+	// To avoid a circular reference between the thread and the script which can possibly contain a reference
+	// to the thread, we will do the call (keeping a reference up to that point) and then break chains with it.
+	// When the call returns, we will reference the thread again if possible.
+	ObjectID th_instance_id = t->get_instance_id();
+	StringName target_method = t->target_method;
+	t = Ref<_Thread>();
+
+	Variant ret;
+	ret = target_instance->call(target_method, arg, argc, ce);
+
+	// If script properly kept a reference to the thread, we should be able to re-reference it now
+	// (well, or if the call failed, since we had to break chains anyway because the outcome isn't known upfront).
+	t = Ref<_Thread>(ObjectDB::get_instance(th_instance_id));
+	if (t.is_valid()) {
+		t->ret = ret;
+		t->running.clear();
+	} else {
+		// We could print a warning here, but the Thread object will be eventually destroyed
+		// noticing wait_to_finish() hasn't been called on it, and it will print a warning itself.
+	}
+
 	if (ce.error != Variant::CallError::CALL_OK) {
 		String reason;
 		switch (ce.error) {
@ -2826,12 +2846,8 @@ void _Thread::_start_func(void *ud) {
 			default: {
 			}
 		}
-
-		t->running.clear();
-		ERR_FAIL_MSG("Could not call function '" + t->target_method.operator String() + "' to start thread " + t->get_id() + ": " + reason + ".");
+		ERR_FAIL_MSG("Could not call function '" + target_method.operator String() + "' to start thread " + uitos(Thread::get_caller_id()) + ": " + reason + ".");
 	}
-
-	t->running.clear();
 }

 Error _Thread::start(Object *p_instance, const StringName &p_method, const Variant &p_userdata, Priority p_priority) {
--- a/core/os/semaphore.h
+++ b/core/os/semaphore.h
@ -33,6 +33,9 @@

 #include "core/error_list.h"
 #include "core/typedefs.h"
+#ifdef DEBUG_ENABLED
+#include "core/error_macros.h"
+#endif

 #if !defined(NO_THREADS)

@ -44,6 +47,9 @@ private:
 	mutable std::mutex mutex;
 	mutable std::condition_variable condition;
 	mutable uint32_t count = 0; // Initialized as locked.
+#ifdef DEBUG_ENABLED
+	mutable uint32_t awaiters = 0;
+#endif

 public:
 	_ALWAYS_INLINE_ void post() const {
@ -54,10 +60,16 @@ public:

 	_ALWAYS_INLINE_ void wait() const {
 		std::unique_lock<std::mutex> lock(mutex);
+#ifdef DEBUG_ENABLED
+		++awaiters;
+#endif
 		while (!count) { // Handle spurious wake-ups.
 			condition.wait(lock);
 		}
-		count--;
+		--count;
+#ifdef DEBUG_ENABLED
+		--awaiters;
+#endif
 	}

 	_ALWAYS_INLINE_ bool try_wait() const {
@ -74,6 +86,47 @@ public:
 		std::lock_guard<std::mutex> lock(mutex);
 		return count;
 	}
+
+#ifdef DEBUG_ENABLED
+	~Semaphore() {
+		// Destroying an std::condition_variable when not all threads waiting on it have been notified
+		// invokes undefined behavior (e.g., it may be nicely destroyed or it may be awaited forever).
+		// That means other threads could still be running the body of std::condition_variable::wait()
+		// but already past the safety checkpoint. That's the case, for instance, if that function is
+		// already waiting to lock the mutex again.
+		//
+		// We will make the rule a bit more restrictive and simpler to understand at the same time: there
+		// should not be any threads at any stage of the waiting by the time the semaphore is destroyed.
+		//
+		// We do so because of the following reasons:
+		// - We have the guideline that threads must be awaited (i.e., completed), so the waiting thread
+		//   must be completely done by the time the thread controlling it finally destroys the semaphore.
+		//   Therefore, only a coding mistake could make the program run into such an attempt at premature
+		//   destruction of the semaphore.
+		// - In scripting, given that Semaphores are wrapped by RefCounted classes, in general it can't
+		//   happen that a thread is trying to destroy a Semaphore while another is still doing whatever with
+		//   it, so the simplification is mostly transparent to script writers.
+		// - The redefined rule can be checked for failure to meet it, which is what this implementation does.
+		//   This is useful to detect a few cases of potential misuse; namely:
+		//   a) In scripting:
+		//      * The coder is naughtily dealing with the reference count, causing a semaphore to die prematurely.
+		//      * The coder is letting the project reach its termination without having cleanly finished threads
+		//        that await on semaphores (or at least, let the usual semaphore-controlled loop exit).
+		//   b) On the native side, where Semaphore is not a ref-counted beast and certain coding mistakes can
+		//      lead to its premature destruction as well.
+		//
+		// Let's let users know they are doing it wrong, but apply a somewhat hacky countermeasure against UB
+		// in debug builds.
+		std::lock_guard<std::mutex> lock(mutex);
+		if (awaiters) {
+			WARN_PRINT(
+					"A Semaphore object is being destroyed while one or more threads are still waiting on it.\n"
+					"Please call post() on it as necessary to prevent such a situation and so ensure correct cleanup.");
+			// Hacky countermeasure: placement-new a fresh condition variable over the old one, so the old one is never destroyed (i.e., it is leaked).
+			new (&condition) std::condition_variable();
+		}
+	}
+#endif
 };

 #else
--- a/core/os/thread.cpp
+++ b/core/os/thread.cpp
@ -121,7 +121,9 @@ Error Thread::set_name(const String &p_name) {
 Thread::~Thread() {
 	if (id != _thread_id_hash(std::thread::id())) {
 #ifdef DEBUG_ENABLED
-		WARN_PRINT("A Thread object has been destroyed without wait_to_finish() having been called on it. Please do so to ensure correct cleanup of the thread.");
+		WARN_PRINT(
+				"A Thread object is being destroyed without its completion having been realized.\n"
+				"Please call wait_to_finish() on it to ensure correct cleanup.");
 #endif
 		thread.detach();
 	}
--- a/doc/classes/Mutex.xml
+++ b/doc/classes/Mutex.xml
@ -5,6 +5,11 @@
 	</brief_description>
 	<description>
 		A synchronization mutex (mutual exclusion). This is used to synchronize multiple [Thread]s, and is equivalent to a binary [Semaphore]. It guarantees that only one thread can ever acquire the lock at a time. A mutex can be used to protect a critical section; however, be careful to avoid deadlocks.
+		It's of the recursive kind, so it can be locked multiple times by one thread, provided it also unlocks it as many times.
+		[b]Warning:[/b]
+		To guarantee that the operating system is able to perform proper cleanup (no crashes, no deadlocks), these conditions must be met:
+		- By the time a [Mutex]'s reference count reaches zero and therefore it is destroyed, no threads (including the one on which the destruction will happen) must have it locked.
+		- By the time a [Thread]'s reference count reaches zero and therefore it is destroyed, it must not have any mutex locked.
 	</description>
 	<tutorials>
 		<link>$DOCS_URL/tutorials/performance/threads/using_multiple_threads.html</link>
@ -29,6 +34,7 @@
 			<description>
 				Unlocks this [Mutex], leaving it to other threads.
 				[b]Note:[/b] If a thread called [method lock] or [method try_lock] multiple times while already having ownership of the mutex, it must also call [method unlock] the same number of times in order to unlock it correctly.
+				[b]Warning:[/b] Calling [method unlock] more times than [method lock] on a given thread, thus ending up trying to unlock a non-locked mutex, is wrong and may cause crashes or deadlocks.
 			</description>
 		</method>
 	</methods>
--- a/doc/classes/Semaphore.xml
+++ b/doc/classes/Semaphore.xml
@ -5,6 +5,10 @@
 	</brief_description>
 	<description>
 		A synchronization semaphore which can be used to synchronize multiple [Thread]s. Initialized to zero on creation. Be careful to avoid deadlocks. For a binary version, see [Mutex].
+		[b]Warning:[/b]
+		To guarantee that the operating system is able to perform proper cleanup (no crashes, no deadlocks), these conditions must be met:
+		- By the time a [Semaphore]'s reference count reaches zero and therefore it is destroyed, no threads must be waiting on it.
+		- By the time a [Thread]'s reference count reaches zero and therefore it is destroyed, it must not be waiting on any semaphore.
 	</description>
 	<tutorials>
 		<link>$DOCS_URL/tutorials/performance/threads/using_multiple_threads.html</link>
--- a/doc/classes/Thread.xml
+++ b/doc/classes/Thread.xml
@ -6,6 +6,11 @@
 	<description>
 		A unit of execution in a process. Can run methods on [Object]s simultaneously. The use of synchronization via [Mutex] or [Semaphore] is advised if working with shared objects.
 		[b]Note:[/b] Breakpoints won't break on code if it's running in a thread. This is a current limitation of the GDScript debugger.
+		[b]Warning:[/b]
+		To guarantee that the operating system is able to perform proper cleanup (no crashes, no deadlocks), these conditions must be met by the time a [Thread]'s reference count reaches zero and therefore it is destroyed:
+		- It must not have any [Mutex] objects locked.
+		- It must not be waiting on any [Semaphore] objects.
+		- [method wait_to_finish] should have been called on it.
 	</description>
 	<tutorials>
 		<link title="Using multiple threads">$DOCS_URL/tutorials/performance/threads/using_multiple_threads.html</link>