/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <limits.h>

#include <atomic>
#include <functional>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <glog/logging.h>

#include <folly/Exception.h>
#include <folly/Function.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/SharedMutex.h>
#include <folly/container/Foreach.h>
#include <folly/detail/AtFork.h>
#include <folly/detail/StaticSingletonManager.h>
#include <folly/lang/Exception.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/synchronization/MicroSpinLock.h>
#include <folly/system/ThreadId.h>

namespace folly {

enum class TLPDestructionMode { THIS_THREAD, ALL_THREADS };
struct AccessModeStrict {};

namespace threadlocal_detail {

constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();

struct ThreadEntry;
/* This represents a node in a doubly linked list where all the nodes
 * are part of an ElementWrapper struct that has the same id.
 * We cannot store prev and next as ThreadEntryNode pointers, since
 * ThreadEntry::elements can be reallocated and those pointers would then
 * become stale. Instead we keep a pointer to the parent ThreadEntry struct
 * for both prev and next, plus the id, and recompute the node addresses
 * on each hop.
 * We traverse and update the list only while holding
 * StaticMetaBase::lock_.
 */
struct ThreadEntryNode {
  uint32_t id : 31; // Note: this will never be kEntryIDInvalid.
  bool isZero : 1; // equivalent to !next, but used only in one thread
  ThreadEntry* parent;
  ThreadEntry* prev;
  ThreadEntry* next;

  void initIfZero(bool locked);

  void init(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    isZero = false;
    parent = prev = next = entry;
  }

  void initZero(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    isZero = true;
    parent = entry;
    prev = next = nullptr;
  }

  // returns true if the list this node is part of is empty
  FOLLY_ALWAYS_INLINE bool empty() const { return (next == parent); }

  FOLLY_ALWAYS_INLINE bool zero() const { return isZero; }

  FOLLY_ALWAYS_INLINE ThreadEntry* getThreadEntry() { return parent; }

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getPrev();

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getNext();

  void push_back(ThreadEntry* head);

  void eraseZero();
};
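
/*
 * Illustrative sketch (the helper name is hypothetical, not a folly API):
 * traversing the per-id circular list while holding StaticMetaBase::lock_.
 * Each hop goes back through the parent ThreadEntry and re-indexes into its
 * elements array, so the walk stays valid even if some thread's elements
 * array was reallocated earlier.
 *
 *   void forEachNodeWithIdHoldingLock(ThreadEntry* head, uint32_t id) {
 *     ThreadEntryNode* stop = &head->elements[id].node;
 *     for (auto* n = stop->getNext(); n != stop; n = n->getNext()) {
 *       ThreadEntry* e = n->getThreadEntry(); // a thread with a slot for id
 *       (void)e; // ... inspect or update e->elements[id] here ...
 *     }
 *   }
 */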

/**
 * POD wrapper around an element (a void*) and an associated deleter.
 * This must be POD, as we memset() it to 0 and memcpy() it around.
 */
struct ElementWrapper {
  using DeleterFunType = void(void*, TLPDestructionMode);

  bool dispose(TLPDestructionMode mode) {
    if (ptr == nullptr) {
      return false;
    }

    DCHECK(deleter1 != nullptr);
    ownsDeleter ? (*deleter2)(ptr, mode) : (*deleter1)(ptr, mode);
    return true;
  }

  void* release() {
    auto retPtr = ptr;

    if (ptr != nullptr) {
      cleanup();
    }

    return retPtr;
  }

  template <class Ptr>
  void set(Ptr p) {
    auto guard = makeGuard([&] { delete p; });
    DCHECK(ptr == nullptr);
    DCHECK(deleter1 == nullptr);

    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      deleter1 = [](void* pt, TLPDestructionMode) {
        delete static_cast<Ptr>(pt);
      };
      ownsDeleter = false;
      guard.dismiss();
    }
  }

  template <class Ptr, class Deleter>
  void set(Ptr p, const Deleter& d) {
    auto guard = makeGuard([&] {
      if (p) {
        d(p, TLPDestructionMode::THIS_THREAD);
      }
    });
    DCHECK(ptr == nullptr);
    DCHECK(deleter2 == nullptr);
    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      auto d2 = d; // gcc-4.8 doesn't decay types correctly in lambda captures
      deleter2 = new std::function<DeleterFunType>(
          [d2](void* pt, TLPDestructionMode mode) {
            d2(static_cast<Ptr>(pt), mode);
          });
      ownsDeleter = true;
      guard.dismiss();
    }
  }

  void cleanup() {
    if (ownsDeleter) {
      delete deleter2;
    }
    ptr = nullptr;
    deleter1 = nullptr;
    ownsDeleter = false;
  }

  void* ptr;
  union {
    DeleterFunType* deleter1;
    std::function<DeleterFunType>* deleter2;
  };
  bool ownsDeleter;
  ThreadEntryNode node;
};
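
/*
 * Illustrative sketch of the ElementWrapper lifecycle (Widget, widgetPtr, and
 * releaseToPool are hypothetical; real callers are the ThreadLocalPtr
 * internals, which pair dispose() with a reset of the slot):
 *
 *   ElementWrapper& w = ...;                      // zero-initialized slot
 *   w.set(new Widget());                          // deleter1: plain delete
 *   w.dispose(TLPDestructionMode::THIS_THREAD);   // runs the deleter
 *   w.cleanup();                                  // resets ptr/deleter state
 *
 *   w.set(widgetPtr, [](Widget* p, TLPDestructionMode) { releaseToPool(p); });
 *   // deleter2 path: the wrapper heap-allocates and owns a std::function
 */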

struct StaticMetaBase;
struct ThreadEntryList;

/**
 * Per-thread entry.  Each thread using a StaticMeta object has one.
 * This is written from the owning thread only (under the lock), read
 * from the owning thread (no lock necessary), and read from other threads
 * (under the lock).
 * The elementsCapacity of StaticMetaBase::head_ can be read from any thread
 * during reallocate (no lock needed).
 */
struct ThreadEntry {
  ElementWrapper* elements{nullptr};
  std::atomic<size_t> elementsCapacity{0};
  ThreadEntryList* list{nullptr};
  ThreadEntry* listNext{nullptr};
  StaticMetaBase* meta{nullptr};
  bool removed_{false};
  uint64_t tid_os{};
  aligned_storage_for_t<std::thread::id> tid_data{};

  size_t getElementsCapacity() const noexcept {
    return elementsCapacity.load(std::memory_order_relaxed);
  }

  void setElementsCapacity(size_t capacity) noexcept {
    elementsCapacity.store(capacity, std::memory_order_relaxed);
  }

  std::thread::id& tid() {
    return *reinterpret_cast<std::thread::id*>(&tid_data);
  }
};

struct ThreadEntryList {
  ThreadEntry* head{nullptr};
  size_t count{0};
};

struct PthreadKeyUnregisterTester;

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getPrev() {
  return &prev->elements[id].node;
}

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getNext() {
  return &next->elements[id].node;
}

/**
 * We want to disable the onThreadExit call at the end of shutdown, since we
 * don't care about leaking memory at that point.
 *
 * Otherwise, if ThreadLocal is used in a shared library, onThreadExit may be
 * called after dlclose().
 *
 * This class has a single static instance; however, since it is so widely
 * used, directly or indirectly, by so many classes, we need to take care to
 * avoid problems stemming from the Static Initialization/Destruction Order
 * Fiasco. Therefore this class needs to be constexpr-constructible, so that
 * it does not need to participate in init/destruction ordering at all.
 */
class PthreadKeyUnregister {
 public:
  static constexpr size_t kMaxKeys = 1UL << 16;

  ~PthreadKeyUnregister() {
    // If static constructor priorities are not supported then
    // ~PthreadKeyUnregister logic is not safe.
#if !defined(__APPLE__) && !defined(_MSC_VER)
    MSLGuard lg(lock_);
    while (size_) {
      pthread_key_delete(keys_[--size_]);
    }
#endif
  }

  static void registerKey(pthread_key_t key) { instance_.registerKeyImpl(key); }

 private:
  /**
   * Only one global instance should exist, hence this is private.
   * See also the important note at the top of this class about `constexpr`
   * usage.
   */
  constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() {}
  friend struct folly::threadlocal_detail::PthreadKeyUnregisterTester;

  void registerKeyImpl(pthread_key_t key) {
    MSLGuard lg(lock_);
    if (size_ == kMaxKeys) {
      throw_exception<std::logic_error>(
          "pthread_key limit has already been reached");
    }
    keys_[size_++] = key;
  }

  MicroSpinLock lock_;
  size_t size_;
  pthread_key_t keys_[kMaxKeys];

  static PthreadKeyUnregister instance_;
};
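
/*
 * Illustrative sketch (mirrors how StaticMetaBase's constructor is expected
 * to use this class; not a verbatim copy of the .cpp): register each pthread
 * key right after creating it, so that all keys can be deleted before static
 * destruction finishes tearing down the handlers behind them.
 *
 *   pthread_key_t key;
 *   int ret = pthread_key_create(&key, &StaticMetaBase::onThreadExit);
 *   checkPosixError(ret, "pthread_key_create failed");
 *   PthreadKeyUnregister::registerKey(key);
 */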

struct StaticMetaBase {
  // In general, emutls cleanup is not guaranteed to play nice with the way
  // StaticMeta mixes direct pthread calls and the use of __thread. This has
  // caused problems on multiple platforms, so we don't use __thread there.
  //
  // XXX: Ideally we would instead determine if emutls is in use at runtime as
  // it is possible to configure glibc on Linux to use emutls regardless.
  static constexpr bool kUseThreadLocal = !kIsMobile && !kIsApple && !kMscVer;

  // Represents an ID of a thread local object. Initially set to the maximum
  // uint32_t value. This representation lets us avoid a branch when accessing
  // TLS data: the access path tests capacity > id, and while id is still the
  // maximum value that test always fails, so unallocated IDs fall through to
  // the slow path with no extra check. It also lets us keep a constexpr
  // constructor and avoid SIOF.
  class EntryID {
   public:
    std::atomic<uint32_t> value;

    constexpr EntryID() : value(kEntryIDInvalid) {}

    EntryID(EntryID&& other) noexcept : value(other.value.load()) {
      other.value = kEntryIDInvalid;
    }

    EntryID& operator=(EntryID&& other) noexcept {
      assert(this != &other);
      value = other.value.load();
      other.value = kEntryIDInvalid;
      return *this;
    }

    EntryID(const EntryID& other) = delete;
    EntryID& operator=(const EntryID& other) = delete;

    uint32_t getOrInvalid() { return value.load(std::memory_order_acquire); }

    uint32_t getOrAllocate(StaticMetaBase& meta) {
      uint32_t id = getOrInvalid();
      if (id != kEntryIDInvalid) {
        return id;
      }
      // The lock inside allocate ensures that a single value is allocated
      return meta.allocate(this);
    }
  };
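
  // Illustrative sketch of the branch-avoidance described above (`entryId`
  // and `capacity` are hypothetical locals, not members): a single bounds
  // check covers both "id not yet allocated" and "this thread's elements
  // array not yet grown to cover id".
  //
  //   uint32_t id = entryId.getOrInvalid();
  //   if (capacity <= id) {   // always true while id == kEntryIDInvalid
  //     // slow path: allocate the id and/or reserve space for it
  //   }
  //   // fast path: elements[id] is valid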

  StaticMetaBase(ThreadEntry* (*threadEntry)(), bool strict);

  FOLLY_EXPORT static ThreadEntryList* getThreadEntryList();

  static bool dying();

  static void onThreadExit(void* ptr);

  // returns the elementsCapacity of the
  // current thread's ThreadEntry struct
  uint32_t elementsCapacity() const;

  uint32_t allocate(EntryID* ent);

  void destroy(EntryID* ent);

  /**
   * Reserve enough space in ThreadEntry::elements for the item
   * @id to fit.
   */
  void reserve(EntryID* id);

  ElementWrapper& getElement(EntryID* ent);

  // reserve an id in the head_ ThreadEntry->elements
  // array if not already there
  void reserveHeadUnlocked(uint32_t id);

  // push back an entry onto the doubly linked list
  // that corresponds to index id
  void pushBackLocked(ThreadEntry* t, uint32_t id);
  void pushBackUnlocked(ThreadEntry* t, uint32_t id);

  // static helper method to reallocate ThreadEntry::elements;
  // returns non-nullptr if ThreadEntry::elements was reallocated,
  // nullptr if ThreadEntry::elements was just extended in place,
  // and throws std::bad_alloc if memory cannot be allocated
  static ElementWrapper* reallocate(
      ThreadEntry* threadEntry, uint32_t idval, size_t& newCapacity);
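
  // Illustrative sketch (mirrors how a caller such as reserve() is expected
  // to consume the contract above; not a verbatim copy of the .cpp):
  //
  //   size_t newCapacity = 0;
  //   ElementWrapper* reallocated = reallocate(threadEntry, id, newCapacity);
  //   if (reallocated) {
  //     // storage moved: copy the old slots into `reallocated`, publish it
  //     // as threadEntry->elements, then free the old array
  //   }
  //   threadEntry->setElementsCapacity(newCapacity);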

  uint32_t nextId_;
  std::vector<uint32_t> freeIds_;
  std::mutex lock_;
  SharedMutex accessAllThreadsLock_;
  pthread_key_t pthreadKey_;
  ThreadEntry head_;
  ThreadEntry* (*threadEntry_)();
  bool strict_;
};

// Held in a singleton to track our global instances.
// We have one of these per "Tag", by default one for the whole system
// (Tag=void).
//
// Creating and destroying ThreadLocalPtr objects, as well as thread exit
// for threads that use ThreadLocalPtr objects, collide on a lock inside
// StaticMeta; you can specify multiple Tag types to break that lock.
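//
// Illustrative sketch (MySubsystemTag and Widget are hypothetical; user code
// normally goes through folly::ThreadLocal/ThreadLocalPtr rather than
// StaticMeta directly):
//
//   struct MySubsystemTag {};
//   folly::ThreadLocalPtr<Widget, MySubsystemTag> widgetTls;
//   // backed by StaticMeta<MySubsystemTag, void>, so its create/destroy and
//   // thread-exit paths don't contend with default (Tag=void) users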
template <class Tag, class AccessMode>
struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
  StaticMeta()
      : StaticMetaBase(
            &StaticMeta::getThreadEntrySlow,
            std::is_same<AccessMode, AccessModeStrict>::value) {
    detail::AtFork::registerHandler(
        this,
        /*prepare*/ &StaticMeta::preFork,
        /*parent*/ &StaticMeta::onForkParent,
        /*child*/ &StaticMeta::onForkChild);
  }

  static StaticMeta<Tag, AccessMode>& instance() {
    // Leak it on exit; there's only one per process, and we don't have to
    // worry about synchronization with exiting threads.
    return detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
  }

  FOLLY_EXPORT FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) {
    // Eliminate as many branches and as much extra code as possible in the
    // cached fast path, leaving only one branch here and one indirection below.
    uint32_t id = ent->getOrInvalid();

    static thread_local ThreadEntry* threadEntryTL{};
    ThreadEntry* threadEntryNonTL{};
    auto& threadEntry = kUseThreadLocal ? threadEntryTL : threadEntryNonTL;

    static thread_local size_t capacityTL{};
    size_t capacityNonTL{};
    auto& capacity = kUseThreadLocal ? capacityTL : capacityNonTL;

    if (FOLLY_UNLIKELY(capacity <= id)) {
      getSlowReserveAndCache(ent, id, threadEntry, capacity);
    }
    return threadEntry->elements[id];
  }

  FOLLY_NOINLINE static void getSlowReserveAndCache(
      EntryID* ent, uint32_t& id, ThreadEntry*& threadEntry, size_t& capacity) {
    auto& inst = instance();
    threadEntry = inst.threadEntry_();
    if (FOLLY_UNLIKELY(threadEntry->getElementsCapacity() <= id)) {
      inst.reserve(ent);
      id = ent->getOrInvalid();
    }
    capacity = threadEntry->getElementsCapacity();
    assert(capacity > id);
  }

  FOLLY_EXPORT FOLLY_NOINLINE static ThreadEntry* getThreadEntrySlow() {
    auto& meta = instance();
    auto key = meta.pthreadKey_;
    ThreadEntry* threadEntry =
        static_cast<ThreadEntry*>(pthread_getspecific(key));
    if (!threadEntry) {
      ThreadEntryList* threadEntryList = StaticMeta::getThreadEntryList();
      if (kUseThreadLocal) {
        static thread_local ThreadEntry threadEntrySingleton;
        threadEntry = &threadEntrySingleton;
      } else {
        threadEntry = new ThreadEntry();
      }
      // If the ThreadEntry already exists but pthread_getspecific returned
      // NULL, do not add the same entry to the list twice, since that would
      // create a loop in the list.
      if (!threadEntry->list) {
        threadEntry->list = threadEntryList;
        threadEntry->listNext = threadEntryList->head;
        threadEntryList->head = threadEntry;
      }

      threadEntry->tid() = std::this_thread::get_id();
      threadEntry->tid_os = folly::getOSThreadID();

      // If we're adding a thread entry, we need to increment the list count
      // even if the entry is reused.
      threadEntryList->count++;

      threadEntry->meta = &meta;
      int ret = pthread_setspecific(key, threadEntry);
      checkPosixError(ret, "pthread_setspecific failed");
    }
    return threadEntry;
  }

  static bool preFork() {
    return instance().lock_.try_lock(); // Make sure it's created
  }

  static void onForkParent() { instance().lock_.unlock(); }

  static void onForkChild() {
    // only the current thread survives
    auto& head = instance().head_;
    // init the circular lists
    auto elementsCapacity = head.getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      head.elements[i].node.init(&head, static_cast<uint32_t>(i));
    }
    // init the thread entry
    ThreadEntry* threadEntry = instance().threadEntry_();
    elementsCapacity = threadEntry->getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      if (!threadEntry->elements[i].node.zero()) {
        threadEntry->elements[i].node.initZero(
            threadEntry, static_cast<uint32_t>(i));
        threadEntry->elements[i].node.initIfZero(false /*locked*/);
      }
    }

    instance().lock_.unlock();
  }
};
} // namespace threadlocal_detail
} // namespace folly