/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <atomic>
#include <chrono>
#include <cstdint>
#include <memory>
#include <mutex>
#include <utility>

#include <folly/Executor.h>
#include <folly/Indestructible.h>
#include <folly/Portability.h>
#include <folly/Utility.h>
#include <folly/detail/TurnSequencer.h>
#include <folly/executors/QueuedImmediateExecutor.h>
#include <folly/synchronization/detail/ThreadCachedLists.h>
#include <folly/synchronization/detail/ThreadCachedReaders.h>

// Implementation of the proposed Read-Copy-Update (RCU) C++ API
// http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf

// Overview
//
// This file provides a low-overhead mechanism to guarantee ordering
// between operations on shared data. In the simplest usage pattern,
// readers enter a critical section, view some state, and leave the
// critical section, while writers modify shared state and then defer
// some cleanup operations. Proper use of these classes will guarantee
// that a cleanup operation that is deferred during a reader critical
// section will not be executed until after that critical section is
// over.

// Example
//
// As an example, suppose we have some configuration data that gets
// periodically updated. We need to protect ourselves on *every* read
// path, even if updates are only vanishingly rare, because we don't know
// if some writer will come along and yank the data out from under us.
//
// Here's how that might look without deferral:
//
// void doSomethingWith(IPAddress host);
//
// folly::SharedMutex sm;
// ConfigData* globalConfigData;
//
// void reader() {
//   while (true) {
//     sm.lock_shared();
//     IPAddress curManagementServer = globalConfigData->managementServerIP;
//     sm.unlock_shared();
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* oldConfigData;
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     sm.lock();
//     oldConfigData = globalConfigData;
//     globalConfigData = newConfigData;
//     sm.unlock();
//     delete oldConfigData;
//   }
// }
//
// The readers have to lock_shared and unlock_shared every iteration, even
// during the overwhelming majority of the time in which there's no writer
// present. These functions are surprisingly expensive; there's around 30ns
// of overhead per critical section on my machine.
//
// Let's get rid of the locking. The readers and writer may proceed
// concurrently. Here's how this would look:
//
// void doSomethingWith(IPAddress host);
//
// std::atomic<ConfigData*> globalConfigData;
//
// void reader() {
//   while (true) {
//     ConfigData* configData = globalConfigData.load();
//     IPAddress curManagementServer = configData->managementServerIP;
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     globalConfigData.store(newConfigData);
//     // We can't delete the old ConfigData; we don't know when the readers
//     // will be done with it! I guess we have to leak it...
//   }
// }
//
// This works and is fast, but we don't ever reclaim the memory we
// allocated for the copy of the data.
// We need a way for the writer to know that no readers observed the old
// value of the pointer and are still using it. Tweaking this slightly, we
// want a way for the writer to say "I want some operation (deleting the
// old ConfigData) to happen, but only after I know that all readers that
// started before I requested the action have finished." The classes in
// this namespace allow this. Here's how we'd express this idea:
//
// void doSomethingWith(IPAddress host);
//
// std::atomic<ConfigData*> globalConfigData;
//
// void reader() {
//   while (true) {
//     IPAddress curManagementServer;
//     {
//       // We're about to do some reads we want to protect; if we read a
//       // pointer, we need to make sure that if the writer comes along and
//       // updates it, the writer's cleanup operation won't happen until
//       // we're done accessing the pointed-to data. We get a guard on that
//       // domain; as long as it exists, no function subsequently passed to
//       // rcu_retire() will execute.
//       std::scoped_lock<folly::rcu_domain> guard(rcu_default_domain());
//       ConfigData* configData = globalConfigData.load();
//       // We created a guard before we read globalConfigData; we know that
//       // the pointer will remain valid until the guard is destroyed.
//       curManagementServer = configData->managementServerIP;
//       // The guard on the RCU domain is released here; retired objects
//       // may be freed.
//     }
//     doSomethingWith(curManagementServer);
//   }
// }
//
// void writer() {
//   while (true) {
//     std::this_thread::sleep_for(std::chrono::seconds(60));
//     ConfigData* oldConfigData = globalConfigData.load();
//     ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//     globalConfigData.store(newConfigData);
//     rcu_retire(oldConfigData);
//     // Alternatively, in a blocking manner:
//     //   rcu_synchronize();
//     //   delete oldConfigData;
//   }
// }
//
// This gets us close to the speed of the second solution, without
// leaking memory. An std::scoped_lock<folly::rcu_domain> costs about 5 ns,
// faster than the lock_shared() / unlock_shared() pair in the more
// traditional mutex-based approach from our first attempt, and the writer
// never blocks the readers.

// Notes
//
// This implementation does implement an rcu_domain, and provides a default
// one for use per the standard implementation.
//
// rcu_domain:
//   A "universe" of deferred execution. Each rcu_domain has an
//   executor on which deferred functions may execute. Readers enter
//   a read region in an rcu_domain by creating an instance of an
//   std::scoped_lock<folly::rcu_domain> object on the domain. The scoped
//   lock provides RAII semantics for read region protection over the
//   domain.
//
//   rcu_domains should in general be completely separated;
//   std::scoped_lock<folly::rcu_domain> locks created on one domain do not
//   prevent functions deferred on other domains from running. It's intended
//   that most callers should only ever use the default, global domain.
//
//   You should use a custom rcu_domain if you can't avoid sleeping
//   during reader critical sections (because you don't want to block
//   all other users of the domain while you sleep), or if you want to
//   change the default executor type on which retire callbacks are invoked.
//   Otherwise, users are strongly encouraged to use the default domain.
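//
// For example, a custom domain can route deleters to a thread pool instead
// of running them in whichever thread happens to trigger reclamation. A
// minimal sketch (deleterPool and slowReaderDomain are illustrative names,
// not part of this API; the executor must outlive the domain, and the
// domain must outlive all of its readers):
//
// folly::CPUThreadPoolExecutor deleterPool(1);
// folly::rcu_domain slowReaderDomain(&deleterPool);
//
// void reader() {
//   std::scoped_lock<folly::rcu_domain> guard(slowReaderDomain);
//   // A possibly long read-side critical section; it only delays
//   // reclamation in slowReaderDomain, not in the default domain.
// }
//
// void writer(ConfigData* newConfigData) {
//   ConfigData* oldConfigData = globalConfigData.exchange(newConfigData);
//   folly::rcu_retire(
//       oldConfigData, std::default_delete<ConfigData>(), slowReaderDomain);
// }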
// API correctness limitations:
//
//  - fork():
//    Invoking fork() in a multithreaded program with any thread other than
//    the forking thread being present in a read region will result in
//    undefined behavior. Similarly, a forking thread must immediately
//    invoke exec() if fork() is invoked while in a read region. Invoking
//    fork() inside of a read region, and then exiting before invoking
//    exec(), will similarly result in undefined behavior.
//
//  - Exceptions:
//    In short, nothing about this is exception safe. Retired functions
//    should not throw exceptions in their destructors, move constructors,
//    or call operators.
//
// Performance limitations:
//
//  - Blocking:
//    A blocked reader will block invocation of deferred functions until it
//    becomes unblocked. Sleeping while holding an
//    std::scoped_lock<folly::rcu_domain> lock can have bad performance
//    consequences.
//
// API questions you might have:
//
//  - Nested critical sections:
//    These are fine. The following is explicitly allowed by the standard,
//    up to a nesting level of 100:
//
//      std::scoped_lock<folly::rcu_domain> reader1(rcu_default_domain());
//      doSomeWork();
//      std::scoped_lock<folly::rcu_domain> reader2(rcu_default_domain());
//      doSomeMoreWork();
//
//  - Restrictions on retire()d functions:
//    Any operation is safe from within a retired function's execution; you
//    can retire additional functions or add a new call to the domain.
//    However, when using the default domain or the default executor, it is
//    not legal to hold a lock across rcu_retire() or call() that is also
//    acquired by the deleter. This is normally not a problem when using the
//    default deleter delete, which does not acquire any user locks.
//    However, even when using the default deleter, an object having a
//    user-defined destructor that acquires locks held across the
//    corresponding call to rcu_retire() can still deadlock.
//
//  - rcu_domain destruction:
//    Destruction of a domain assumes previous synchronization: all
//    remaining call() and retire() callbacks are immediately handed to the
//    executor.

// Overheads
//
// Deferral latency is as low as is practical: the overhead involves running
// several global memory barriers on the machine to ensure all readers are
// gone.
//
// Currently, use of MPMCQueue is the bottleneck for deferred calls; more
// specialized queues could be used if available, since only a single reader
// reads the queue and could splice all of the items to the executor.
//
// rcu_synchronize() call latency is on the order of 10ms. Multiple
// separate threads can share a synchronized period and should scale.
//
// rcu_retire() is a queue push on the order of 150 ns; however, the current
// implementation may synchronize if the retire queue is full, resulting in
// tail latencies of ~10ms.
//
// std::scoped_lock<folly::rcu_domain> creation/destruction is ~5ns. By
// comparison, a folly::SharedMutex::lock_shared() + unlock_shared() pair
// is ~26ns.

// Hazard pointers vs. RCU:
//
// Hazard pointers protect pointers, generally malloc()d pieces of memory,
// and each hazptr only protects one such block.
//
// RCU protects critical sections: *all* memory is protected as long
// as the critical section is active.
//
// For example, this has implications for linked lists: if you wish to
// free an entire linked list, you simply rcu_retire() each node with
// RCU: readers will see either an entirely valid list, or no list at
// all. (A sketch follows below.)
//
// Conversely, with hazptrs, lists are generally walked with hand-over-hand
// hazptrs. Only one node is protected at a time. So anywhere in the middle
// of the list, hazptrs may read NULL and throw away all current work.
// Alternatively, reference counting can be used (as if each node were a
// shared_ptr), so that readers will always see *the remaining part* of the
// list as valid; however, parts before the current hazptr may be freed.
//
// So roughly: RCU is simple, but an all-or-nothing affair. A single
// std::scoped_lock<folly::rcu_domain> can block all reclamation. Hazptrs
// will reclaim exactly as much as possible, at the cost of extra work
// writing traversal code.
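//
// A minimal sketch of the linked-list case above (Node and listHead are
// illustrative names, not part of this API):
//
// struct Node {
//   int value;
//   Node* next;
// };
// std::atomic<Node*> listHead{nullptr};
//
// void unlinkAndRetireAll() {
//   Node* n = listHead.exchange(nullptr);
//   while (n != nullptr) {
//     Node* next = n->next;
//     // Deferred: each node is freed only after all pre-existing read
//     // regions have ended, so a reader inside a read region sees either
//     // the whole list or an empty one, never a partially freed list.
//     folly::rcu_retire(n);
//     n = next;
//   }
// }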
// Reproduced from
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/n4895.pdf
//
//                                 Reference Counting  RCU            Hazptr
//
//   Unreclaimed objects           Bounded             Unbounded      Bounded
//
//   Contention among readers      High                None           None
//
//   Traversal forward progress    lock-free           wait-free      lock-free
//
//   Reclamation forward progress  lock-free           blocking       wait-free
//
//   Traversal speed               atomic              no-overhead    folly's is
//                                                                    no-overhead
//
//   Reference acquisition         unconditional       unconditional  conditional
//
//   Automatic reclamation         yes                 no             no
//
//   Purpose of domains            N/A                 isolate slow   configuration
//                                                     readers

namespace folly {

// Defines an RCU domain. RCU readers within a given domain block updaters
// (rcu_synchronize, call, retire, or rcu_retire) only within that same
// domain, and have no effect on updaters associated with other rcu_domains.
//
// Custom domains are normally not necessary, because the default domain
// works in most cases. But it makes sense to create a separate domain for
// uses having unusually long read-side critical sections (many
// milliseconds) or uses that cannot tolerate moderately long read-side
// critical sections from others.
//
// The executor runs grace-period processing and invokes deleters. The
// default of QueuedImmediateExecutor is very lightweight (compared to, say,
// a thread pool). However, the flip side of this light weight is that the
// overhead of this processing and invocation is incurred within the
// executor invoking the RCU primitive, for example, rcu_retire().
//
// The domain must survive all its readers.
class rcu_domain {
  using list_head = typename detail::ThreadCachedLists::ListHead;
  using list_node = typename detail::ThreadCachedLists::Node;

 public:
  /*
   * If an executor is passed, it is used to run calls and delete
   * retired objects.
   */
  explicit rcu_domain(Executor* executor = nullptr) noexcept
      : executor_(executor ? executor : &QueuedImmediateExecutor::instance()) {
  }

  rcu_domain(const rcu_domain&) = delete;
  rcu_domain(rcu_domain&&) = delete;
  rcu_domain& operator=(const rcu_domain&) = delete;
  rcu_domain& operator=(rcu_domain&&) = delete;

  // Reader locks: Prevent any calls from occurring, retired memory
  // from being freed, and synchronize() calls from completing until
  // all preceding lock() sections are finished.

  // Note: can potentially allocate on thread first use.
  FOLLY_ALWAYS_INLINE void lock() {
    counters_.increment(version_.load(std::memory_order_acquire));
  }
  FOLLY_ALWAYS_INLINE void unlock() { counters_.decrement(); }

  // Invokes cbin() some time after all pre-existing RCU readers have
  // completed. See rcu_synchronize() for more information about RCU
  // readers and domains.
  template <typename T>
  void call(T&& cbin) {
    auto node = new list_node;
    node->cb_ = [node, cb = std::forward<T>(cbin)]() {
      cb();
      delete node;
    };
    retire(node);
  }
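
  // For example (a sketch; releaseCachedBuffers() is an illustrative
  // application function, not part of this API), arbitrary cleanup can be
  // deferred until all pre-existing readers have finished:
  //
  //   rcu_default_domain().call([] { releaseCachedBuffers(); });
  //
  // The callback runs on the domain's executor once a grace period has
  // elapsed.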
  // Invokes node->cb_() some time after all pre-existing RCU readers
  // have completed. See rcu_synchronize() for more information about RCU
  // readers and domains.
  void retire(list_node* node) noexcept {
    q_.push(node);

    // Note that it's likely we hold a read lock here,
    // so we can only half_sync(false). half_sync(true)
    // or a synchronize() call might block forever.
    uint64_t time = std::chrono::duration_cast<std::chrono::milliseconds>(
                        std::chrono::steady_clock::now().time_since_epoch())
                        .count();
    auto syncTime = syncTime_.load(std::memory_order_relaxed);
    if (time > syncTime + syncTimePeriod_ &&
        syncTime_.compare_exchange_strong(
            syncTime, time, std::memory_order_relaxed)) {
      list_head finished;
      {
        std::lock_guard g(syncMutex_);
        half_sync(false, finished);
      }
      // Callbacks are called outside of syncMutex_.
      finished.forEach(
          [&](list_node* item) { executor_->add(std::move(item->cb_)); });
    }
  }

  // Ensure concurrent critical sections have finished.
  // Always waits for full synchronization.
  // The read lock *must not* be held.
  void synchronize() noexcept {
    auto curr = version_.load(std::memory_order_acquire);
    // Target is two epochs away.
    auto target = curr + 2;
    while (true) {
      // Try to assign ourselves to do the sync work.
      // If someone else is already assigned, we can wait for
      // the work to be finished by waiting on turn_.
      auto work = work_.load(std::memory_order_acquire);
      auto tmp = work;
      if (work < target && work_.compare_exchange_strong(tmp, target)) {
        list_head finished;
        {
          std::lock_guard g(syncMutex_);
          while (version_.load(std::memory_order_acquire) < target) {
            half_sync(true, finished);
          }
        }
        // Callbacks are called outside of syncMutex_.
        finished.forEach(
            [&](list_node* node) { executor_->add(std::move(node->cb_)); });
        return;
      } else {
        if (version_.load(std::memory_order_acquire) >= target) {
          return;
        }
        std::atomic<uint32_t> cutoff{100};
        // Wait for someone to finish the work.
        turn_.tryWaitForTurn(to_narrow(work), cutoff, false);
      }
    }
  }

 private:
  detail::ThreadCachedReaders counters_;
  // Global epoch.
  std::atomic<uint64_t> version_{0};
  // Future epochs being driven by threads in synchronize.
  std::atomic<uint64_t> work_{0};
  // Matches version_, for waking up threads in synchronize that are
  // sharing an epoch.
  detail::TurnSequencer<std::atomic> turn_;
  // Only one thread can drive half_sync.
  std::mutex syncMutex_;
  // Currently half_sync takes ~16ms due to heavy barriers.
  // Ensure that if used in a single thread, max GC time
  // is limited to 1% of total CPU time.
  static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */};
  std::atomic<uint64_t> syncTime_{0};
  // call()s waiting to move through two epochs.
  detail::ThreadCachedLists q_;
  // Executor that callbacks will eventually be run on.
  Executor* executor_{nullptr};
  // Queues for callbacks waiting to go through two epochs.
  list_head queues_[2]{};

  // Move queues through one epoch (half of a full synchronize()).
  // Will block waiting for readers to exit if blocking is true.
  // blocking must *not* be true if the current thread holds the read
  // lock, or this will deadlock.
  //
  // Returns a list of callbacks ready to run in finished.
  void half_sync(bool blocking, list_head& finished) {
    auto curr = version_.load(std::memory_order_acquire);
    auto next = curr + 1;

    // Push all work to a queue for moving through two epochs. One
    // version is not enough because of late readers of the version_
    // counter in lock_shared.
    //
    // Note that for a similar reason we can't swap out the q here,
    // and instead drain it, so concurrent calls to call() are safe,
    // and will wait for the next epoch.
    q_.collect(queues_[0]);

    if (blocking) {
      counters_.waitForZero(next & 1);
    } else {
      if (!counters_.epochIsClear(next & 1)) {
        return;
      }
    }

    // Run callbacks that have been through two epochs, and swap queues
    // for those only through a single epoch.
    finished.splice(queues_[1]);
    queues_[1].splice(queues_[0]);

    version_.store(next, std::memory_order_release);
    // Notify synchronous waiters in synchronize().
    turn_.completeTurn(to_narrow(curr));
  }
};
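
// rcu_domain::lock()/unlock() satisfy BasicLockable, so the standard RAII
// wrappers can mark read regions directly. A minimal sketch:
//
//   {
//     std::unique_lock<folly::rcu_domain> g(folly::rcu_default_domain());
//     // Read-side critical section: objects retired while this region is
//     // active will not be reclaimed until it ends.
//   } // Read region ends here; deferred deleters may now run.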

extern folly::Indestructible<rcu_domain*> rcu_default_domain_;

inline rcu_domain& rcu_default_domain() {
  return **rcu_default_domain_;
}

// Waits for all pre-existing RCU readers to complete.
// RCU readers will normally be marked using the RAII interface
// std::scoped_lock<folly::rcu_domain>, as in:
//
//   std::scoped_lock<folly::rcu_domain> rcuReader(rcu_default_domain());
//
// Other locking primitives that provide movable semantics, such as
// std::unique_lock, may be used as well.
//
// Note that rcu_synchronize is not obligated to wait for RCU readers that
// start after rcu_synchronize starts. Note also that holding a lock across
// rcu_synchronize that is acquired by any deleter (that is, one passed to
// rcu_retire, retire, or call) will result in deadlock. Such deadlock will
// normally only occur with user-written deleters, as the default deleter,
// delete, does not acquire any user locks.
inline void rcu_synchronize(
    rcu_domain& domain = rcu_default_domain()) noexcept {
  domain.synchronize();
}

// Waits for all in-flight deleters to complete.
//
// An in-flight deleter is one that has already been passed to rcu_retire,
// retire, or call. In other words, rcu_barrier is not obligated to wait
// on any deleters passed to later calls to rcu_retire, retire, or call.
//
// And yes, the current implementation is buggy, and will be fixed.
inline void rcu_barrier(rcu_domain& domain = rcu_default_domain()) noexcept {
  domain.synchronize();
}

// Free-function retire. Always allocates.
//
// This will invoke the deleter d(p) asynchronously some time after all
// pre-existing RCU readers have completed. See rcu_synchronize() for more
// information about RCU readers and domains.
template <typename T, typename D = std::default_delete<T>>
void rcu_retire(T* p, D d = {}, rcu_domain& domain = rcu_default_domain()) {
  domain.call([p, del = std::move(d)]() { del(p); });
}

// Base class for rcu objects. retire() will use preallocated storage
// from rcu_obj_base, vs. rcu_retire() which always allocates.
template <typename T, typename D = std::default_delete<T>>
class rcu_obj_base : detail::ThreadCachedListsBase::Node {
 public:
  void retire(D d = {}, rcu_domain& domain = rcu_default_domain()) {
    // This implementation assumes folly::Function has enough
    // inline storage for D; otherwise, it allocates.
    this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); };
    domain.retire(this);
  }
};

} // namespace folly
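
// Example use of rcu_obj_base::retire() (a sketch; ConfigData and
// globalConfigData are the illustrative types from the overview comments
// above):
//
//   struct ConfigData : folly::rcu_obj_base<ConfigData> {
//     IPAddress managementServerIP;
//   };
//   std::atomic<ConfigData*> globalConfigData;
//
//   void writer(ConfigData* newConfigData) {
//     ConfigData* oldConfigData = globalConfigData.exchange(newConfigData);
//     // Uses the node storage embedded in rcu_obj_base; unlike
//     // rcu_retire(), this does not allocate a separate node.
//     oldConfigData->retire();
//   }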