/* pocl_llvm_utils.cc: various helpers for pocl LLVM API.

   Copyright (c) 2013 Kalle Raiskila
                 2013-2020 Pekka Jääskeläinen
                 2024 Pekka Jääskeläinen / Intel Finland Oy

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

#include "CompilerWarnings.h"
IGNORE_COMPILER_WARNING("-Wmaybe-uninitialized")
#include <llvm/ADT/StringRef.h>
#include <llvm/ADT/StringMap.h>
#include <llvm/TargetParser/Triple.h>
POP_COMPILER_DIAGS
IGNORE_COMPILER_WARNING("-Wunused-parameter")

#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/Signals.h>
#include <llvm/IR/Constants.h>
#include <llvm/IR/DiagnosticInfo.h>
#include <llvm/IR/DiagnosticPrinter.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/PassRegistry.h>
#include <llvm/InitializePasses.h>
#include <llvm/Support/CommandLine.h>
#include <llvm-c/Core.h>

#include <llvm/Analysis/CGSCCPassManager.h>
#include <llvm/Analysis/LoopAnalysisManager.h>
#include <llvm/Passes/PassBuilder.h>
#include <llvm/TargetParser/Host.h>

#include "config.h"
#include "pocl_debug.h"
#include "pocl_file_util.h"
#include "pocl_llvm.h"
#include "pocl_llvm_api.h"
#include "pocl_runtime_config.h"

using namespace llvm;

#include <string>
#include <map>

/* Parses the LLVM IR file at 'path' into a module created in context 'c'.
 * Ownership of the returned module is released to the caller. The parser
 * diagnostic is discarded, so a failed parse is only visible as a null
 * result. */
llvm::Module *parseModuleIR(const char *path, llvm::LLVMContext *c) {
  SMDiagnostic ParseDiag;
  std::unique_ptr<llvm::Module> Parsed = parseIRFile(path, ParseDiag, *c);
  return Parsed.release();
}

/* Reads the total global variable size recorded in the module metadata
 * (key PoclGVarMDName) and stores it in
 * program->global_var_total_size[device_i].
 *
 * Nothing happens when the metadata is missing or zero. If a nonzero size
 * was already recorded for the device, it is only asserted to be equal to
 * the value found in the module. */
void parseModuleGVarSize(cl_program program, unsigned device_i,
                         llvm::Module *ProgramBC) {

  unsigned long GVarBytes = 0;
  const bool HasGVarMD =
      getModuleIntMetadata(*ProgramBC, PoclGVarMDName, GVarBytes);
  if (!HasGVarMD || GVarBytes == 0)
    return;

  if (program->global_var_total_size[device_i] != 0) {
    assert(program->global_var_total_size[device_i] == GVarBytes);
  } else {
    program->global_var_total_size[device_i] = GVarBytes;
  }

  POCL_MSG_PRINT_LLVM("Total Global Variable Bytes: %zu\n",
                      (size_t)GVarBytes);
}

/* Serializes 'mod' as LLVM bitcode into 'dest'. The bytes are written through
 * a stream attached to 'dest', so they are appended to whatever 'dest'
 * already holds. */
void writeModuleIRtoString(const llvm::Module *mod, std::string& dest) {
  llvm::raw_string_ostream BitcodeStream(dest);
  WriteBitcodeToFile(*mod, BitcodeStream);
  // make sure everything has reached 'dest' before the stream goes away
  BitcodeStream.flush();
}

int pocl_write_module(void *module, const char *path) {
  assert(module);
  assert(path);

  std::string binary;
  writeModuleIRtoString((const Module *)module, binary);

  return pocl_write_file(path, binary.data(), (uint64_t)binary.size(), 0);
}

/* Parses 'size' bytes of LLVM bitcode starting at 'input_stream' into a
 * module created in context 'c'. The input is copied before parsing.
 * Returns the module (ownership released to the caller), or nullptr after
 * logging the parser's error message. */
llvm::Module *parseModuleIRMem(const char *input_stream, size_t size,
                               llvm::LLVMContext *c) {
  std::unique_ptr<MemoryBuffer> BitcodeCopy =
      MemoryBuffer::getMemBufferCopy(StringRef(input_stream, size));

  auto ModuleOrErr = parseBitcodeFile(BitcodeCopy->getMemBufferRef(), *c);
  if (ModuleOrErr)
    return ModuleOrErr.get().release();

  auto ParseError = ModuleOrErr.takeError();
  POCL_MSG_ERR("parseBitcodeFile failed:\n%s\n",
               toString(std::move(ParseError)).c_str());
  return nullptr;
}

static int getModuleTriple(const char *input_stream, size_t size,
                           std::string &triple) {
  StringRef input_stream_ref(input_stream, size);
  std::unique_ptr<MemoryBuffer> buffer =
      MemoryBuffer::getMemBufferCopy(input_stream_ref);
  if (!isBitcode((const unsigned char*)input_stream,
                 (const unsigned char*)input_stream+size))
    return -1;

  auto triple_e = getBitcodeTargetTriple(buffer->getMemBufferRef());
  if (!triple_e)
    return -1;
  triple = triple_e.get();
  return 0;
}

/* Returns the ABI name to use for the host CPU, or nullptr if none is known.
 *
 * If HOST_CPU_TARGET_ABI is defined and non-empty, it is returned as is.
 * If it is defined but empty, a 64-bit RISC-V build returns the "lp64*" name
 * matching the compiler's __riscv_float_abi_* macro. In every other case
 * (including HOST_CPU_TARGET_ABI not being defined at all) the result is
 * nullptr. The returned pointer refers to a string literal. */
const char *pocl_get_llvm_cpu_abi() {
#ifdef HOST_CPU_TARGET_ABI
  if (strlen(HOST_CPU_TARGET_ABI) > 0)
    return HOST_CPU_TARGET_ABI;
  else {
#if defined(__riscv) && (HOST_DEVICE_ADDRESS_BITS == 64)

    // the first float-ABI macro that is defined decides the result
#ifdef __riscv_float_abi_soft
    return "lp64";
#endif

#ifdef __riscv_float_abi_single
    return "lp64f";
#endif

#ifdef __riscv_float_abi_double
    return "lp64d";
#endif

#ifdef __riscv_float_abi_quad
    return "lp64q";
#endif

#endif // __riscv
  }
#endif // HOST_CPU_TARGET_ABI
  // no configured ABI and no known default
  return nullptr;
}

char *pocl_get_llvm_cpu_name() {
  const char *custom = pocl_get_string_option("POCL_LLVM_CPU_NAME", NULL);
  StringRef r = custom ? StringRef(custom) : llvm::sys::getHostCPUName();

  // LLVM may return an empty string -- treat as generic
  if (r.empty())
    r = "generic";

#ifndef KERNELLIB_HOST_DISTRO_VARIANTS
  if (r.str() == "generic" && strlen(OCL_KERNEL_TARGET_CPU)) {
    POCL_MSG_WARN("LLVM does not recognize your cpu, trying to use "
                   OCL_KERNEL_TARGET_CPU " for -target-cpu\n");
    r = StringRef(OCL_KERNEL_TARGET_CPU);
  }
#endif

  assert(r.size() > 0);
  char *cpu_name = (char *)malloc(r.size() + 1);
  strncpy(cpu_name, r.data(), r.size());
  cpu_name[r.size()] = 0;
  return cpu_name;
}

#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
/* Table mapping each "distro" kernel library variant to the LLVM CPU name it
 * is built for and the host CPU features it requires.
 *
 * Each entry is written as three consecutive initializers (variant name, cpu
 * name, feature list) relying on brace elision. A feature list is terminated
 * by NULL; the whole table is terminated by an entry whose kernellib_variant
 * is NULL. */
const struct kernellib_features {
  const char *kernellib_variant; // name of the kernel library variant
  const char *cpu_name;          // CPU name associated with the variant
  const char *features[12];      // NULL-terminated list of required features
} kernellib_feature_map[] = {
// order the entries s.t. if a cpu matches multiple entries, the "best" match
// comes last
#if defined(__i386__) || defined(_M_IX86)
    "i386",
    "i386",
    {NULL},
    // FIXME: Can we distinguish i686 from i386 using feature flags?
    "i686",
    "i686",
    {NULL},
    "mmx",
    "pentium-mmx",
    {"mmx", NULL},
    "sse",
    "pentium3",
    {"sse", NULL},
#endif
#if defined(__i386__) || defined(_M_IX86) || \
    defined(__x86_64__) || defined(_M_X64)
    "sse2",
    "x86-64",
    {"sse2", NULL},
    "ssse3",
    "core2",
    {"sse2", "ssse3", "cx16", NULL},
    "sse41",
    "penryn",
    {"sse2", "sse4.1", "cx16", NULL},
    "avx",
    "sandybridge",
    {"sse2", "avx", "cx16", "popcnt", NULL},
    "avx_f16c",
    "ivybridge",
    {"sse2", "avx", "cx16", "popcnt", "f16c", NULL},
    "avx_fma4",
    "bdver1",
    {"sse2", "avx", "cx16", "popcnt", "xop", "fma4", NULL},
    "avx2",
    "haswell",
    {"sse2", "avx", "avx2", "cx16", "popcnt", "lzcnt", "f16c", "fma", "bmi",
     "bmi2", NULL},
    "avx512",
    "skylake-avx512",
    {"sse2", "avx512f", NULL},
#endif
    // sentinel entry: terminates the table
    NULL,
    NULL,
    {NULL}};

/* for "distro" style kernel libs, return which kernellib to use, at runtime
 *
 * On x86 hosts, walks kernellib_feature_map and returns the variant name of
 * the last entry whose required features are all reported by the host CPU.
 * If the POCL_KERNELLIB_NAME option names an entry that the host supports,
 * the search stops at that entry. Returns NULL if the host features cannot
 * be queried or no entry matches.
 *
 * On other hosts the result of pocl_get_llvm_cpu_name() is returned
 * directly. */
const char *pocl_get_distro_kernellib_variant() {
  StringMap<bool> Features;

#if defined(__i386__) || defined(_M_IX86) || \
    defined(__x86_64__) || defined(_M_X64)

#if LLVM_MAJOR < 19
  if (!llvm::sys::getHostCPUFeatures(Features)) {
    POCL_MSG_WARN("LLVM can't get host CPU flags!\n");
    return NULL;
  }
#else
  Features = llvm::sys::getHostCPUFeatures();
#endif

#else
  return pocl_get_llvm_cpu_name();
#endif

  const char *custom = pocl_get_string_option("POCL_KERNELLIB_NAME", NULL);

  const kernellib_features *best_match = NULL;
  for (const kernellib_features *kf = kernellib_feature_map; kf->kernellib_variant;
       ++kf) {
    bool matches = true;
    // lookup() yields false for unknown features without adding them to the
    // map, unlike operator[]
    for (const char *const *f = kf->features; *f; ++f)
      matches = matches && Features.lookup(*f);
    if (matches) {
      best_match = kf;
      // a user-requested variant wins over later ("better") entries, but
      // only if the host actually supports it
      if (custom && !strcmp(custom, kf->kernellib_variant))
        break;
    }
  }

  if (!best_match) {
    // getHostCPUName() returns a StringRef, which must not be handed to a
    // printf-style "%s"; convert it to a NUL-terminated string first.
    POCL_MSG_WARN("Can't find a kernellib supported by the host CPU (%s)\n",
                  llvm::sys::getHostCPUName().str().c_str());
    return NULL;
  }

  return best_match->kernellib_variant;
}

/* for "distro" style kernel libs, return which target cpu to use for a given
 * kernellib
 *
 * On x86 hosts, returns the cpu_name of the kernellib_feature_map entry whose
 * variant name equals 'kernellib_variant', or NULL (with a warning) if there
 * is no such entry. With LLVM < 19, NULL is also returned when LLVM fails to
 * report the host CPU features.
 *
 * On other hosts the result of pocl_get_llvm_cpu_name() is returned
 * directly. */
const char *pocl_get_distro_cpu_name(const char *kernellib_variant) {
#if defined(__i386__) || defined(_M_IX86) || \
    defined(__x86_64__) || defined(_M_X64)

#if LLVM_MAJOR < 19
  // The features themselves are not needed for the name lookup; the query is
  // kept only so that a failing host feature detection still yields NULL.
  StringMap<bool> Features;
  if (!llvm::sys::getHostCPUFeatures(Features)) {
    POCL_MSG_WARN("LLVM can't get host CPU flags!\n");
    return NULL;
  }
#endif

#else
  return pocl_get_llvm_cpu_name();
#endif

  for (const kernellib_features *kf = kernellib_feature_map; kf->kernellib_variant;
       ++kf) {
    if (!strcmp(kernellib_variant, kf->kernellib_variant))
      return kf->cpu_name;
  }

  POCL_MSG_WARN("Can't find a cpu name matching the kernellib (%s)\n",
                kernellib_variant);
  return NULL;
}
#endif

int pocl_bitcode_is_triple(const char *bitcode, size_t size, const char *triple) {
  std::string Triple;
  if (getModuleTriple(bitcode, size, Triple) == 0)
    return Triple.find(triple) != std::string::npos;
  else
    return 0;
}

// TODO this should be fixed to not require LLVM eventually,
// so that LLVM-less builds also report FMA correctly.
int cpu_has_fma() {
  StringMap<bool> Features;
#if LLVM_MAJOR < 19
  bool Res = llvm::sys::getHostCPUFeatures(Features);
#else
  const bool Res = true;
  Features = llvm::sys::getHostCPUFeatures();
#endif
  return ((Res && (Features["fma"] || Features["fma4"])) ? 1 : 0);
}

/* Number of elements of type x that fit into 'lane_width' bytes, clamped to
 * the range [1, 16]. The expansion refers to a variable named 'lane_width',
 * which must be in scope wherever the macro is used. */
#define VECWIDTH(x)                                                            \
  std::min(std::max((lane_width / (unsigned)(sizeof(x))), 1U), 16U)

/* Fills in the native and preferred vector width fields of 'dev' from the
 * host CPU features reported by LLVM.
 *
 * The SIMD width starts at the machine word size and is raised according to
 * the detected features of the architecture this file is compiled for.
 * The double and half widths are set to 0 when dev->extensions does not
 * mention cl_khr_fp64 / cl_khr_fp16 respectively. */
void cpu_setup_vector_widths(cl_device_id dev) {

  StringMap<bool> Features;
#if LLVM_MAJOR < 19
  const bool HaveFeatures = llvm::sys::getHostCPUFeatures(Features);
#else
  const bool HaveFeatures = true;
  Features = llvm::sys::getHostCPUFeatures();
#endif

  // Width in bytes; the minimum is the word size. The variable must be
  // called lane_width because the VECWIDTH macro refers to it by name.
#if HOST_DEVICE_ADDRESS_BITS == 64
  unsigned lane_width = 8;
#else
  unsigned lane_width = 4;
#endif

  if (HaveFeatures) {
#if defined(__arm__) || defined(__aarch64__)
    const bool HasArmSimd =
        Features["sve"] || Features["sve2"] || Features["neon"];
    if (HasArmSimd)
      lane_width = 16;
#endif
#if defined(__linux__) && defined(__riscv)
    if (Features["v"]) {
      // the last defined zvl*b macro decides the width
#ifdef __riscv_zvl128b
      lane_width = 16;
#endif
#ifdef __riscv_zvl256b
      lane_width = 32;
#endif
#ifdef __riscv_zvl512b
      lane_width = 64;
#endif
#ifdef __riscv_zvl1024b
      lane_width = 128;
#endif
    }
#endif
#if defined(__i386__) || defined(_M_IX86) || \
    defined(__x86_64__) || defined(_M_X64)
    // widest available extension wins
    if (Features["avx512f"])
      lane_width = 64;
    else if (Features["avx"])
      lane_width = 32;
    else if (Features["sse"])
      lane_width = 16;
#endif
  }
  dev->native_vector_width_in_bits = lane_width * 8;

  const bool HasFP64 = strstr(dev->extensions, "cl_khr_fp64") != NULL;
  const bool HasFP16 = strstr(dev->extensions, "cl_khr_fp16") != NULL;

  dev->native_vector_width_char = dev->preferred_vector_width_char =
      VECWIDTH(cl_char);
  dev->native_vector_width_short = dev->preferred_vector_width_short =
      VECWIDTH(cl_short);
  dev->native_vector_width_int = dev->preferred_vector_width_int =
      VECWIDTH(cl_int);
  dev->native_vector_width_long = dev->preferred_vector_width_long =
      VECWIDTH(cl_long);
  dev->native_vector_width_float = dev->preferred_vector_width_float =
      VECWIDTH(float);

  dev->native_vector_width_double = dev->preferred_vector_width_double =
      HasFP64 ? VECWIDTH(double) : 0U;

  // half has the same size as cl_short
  dev->native_vector_width_half = dev->preferred_vector_width_half =
      HasFP16 ? VECWIDTH(cl_short) : 0U;
}

/* Asks LLVM to delete 'file' if the process is terminated by a signal.
 *
 * Returns 0 on success and -1 on failure.
 *
 * NOTE(review): llvm::sys::RemoveFileOnSignal() follows the LLVM convention
 * of returning true on *error*; the previous code mapped true to 0 and
 * therefore reported the opposite of what happened. Confirm no caller relied
 * on the inverted value. */
int pocl_llvm_remove_file_on_signal_create(const char *file) {
  const bool Failed = llvm::sys::RemoveFileOnSignal(StringRef(file));
  return Failed ? -1 : 0;
}

/* Runs LLVM's interrupt handlers. The 'file' argument is not used: the call
 * is not specific to one file. Always returns 0. */
int pocl_llvm_remove_file_on_signal_destroy(const char *file) {
  (void)file;
  llvm::sys::RunInterruptHandlers();
  return 0;
}

/*
 * Use one global LLVMContext across all LLVM bitcodes. This is because
 * we want to cache the bitcode IR libraries and reuse them when linking
 * new kernels. CloneModule etc. seem to assume we are linking
 * bitcodes that share the same LLVMContext. Unfortunately, this requires
 * serializing all calls to the LLVM APIs with a mutex.
 * Freeing/deleting the context crashes LLVM 3.2 (at program exit); as a
 * workaround, allocate it from the heap.
 */

/* LLVM diagnostic callback: prints the diagnostic 'DI', followed by a
 * newline, to the DiagnosticPrinterRawOStream passed as 'diagprinter'. */
static void diagHandler(LLVMDiagnosticInfoRef DI, void *diagprinter) {
  assert(diagprinter);
  DiagnosticPrinterRawOStream &Printer =
      *(DiagnosticPrinterRawOStream *)diagprinter;
  unwrap(DI)->print(Printer);
  Printer << "\n";
}

std::string getDiagString(void *PoclCtx) {
  PoclLLVMContextData *llvm_ctx = (PoclLLVMContextData *)PoclCtx;
  llvm_ctx->poclDiagStream->flush();
  std::string ret(*llvm_ctx->poclDiagString);
  llvm_ctx->poclDiagString->clear();
  return ret;
}

/* Convenience overload: fetches (and clears) the diagnostics of the LLVM
 * context data attached to 'ctx'. */
std::string getDiagString(cl_context ctx) {
  PoclLLVMContextData *CtxData = (PoclLLVMContextData *)ctx->llvm_context_data;
  return getDiagString((void *)CtxData);
}

/* The LLVM API interface functions are not thread safe at the moment;
 * PoCL needs to ensure that only one thread is using this layer at a time.
 */
/* Remembers the lock and acquires it; it is released by the destructor. */
PoclCompilerMutexGuard::PoclCompilerMutexGuard(pocl_lock_t *ptr) : lock(ptr) {
  POCL_LOCK(*lock);
}

/* Releases the lock acquired by the constructor. */
PoclCompilerMutexGuard::~PoclCompilerMutexGuard() {
  POCL_UNLOCK(*lock);
}

// Work-group generation method in use; assigned by InitializeLLVM() from the
// POCL_WORK_GROUP_METHOD option.
std::string CurrentWgMethod;

// True once the LLVM targets and passes have been registered.
static bool LLVMInitialized = false;
// True once the LLVM command line options have been set; never reset in
// this file.
static bool LLVMOptionsInitialized = false;
// Whether all cl_contexts share one PoclLLVMContextData; refreshed from the
// POCL_LLVM_GLOBAL_CONTEXT option on every InitializeLLVM() call.
static bool LLVMUseGlobalContext = true;
/* must be called with kernelCompilerLock locked */
void InitializeLLVM() {

  if (!LLVMInitialized) {

    LLVMInitialized = true;
    // We have not initialized any pass managers for any device yet.
    // Run the global LLVM pass initialization functions.
    InitializeAllTargets();
    InitializeAllTargetMCs();
    InitializeAllAsmPrinters();
    InitializeAllAsmParsers();

    PassRegistry &Registry = *PassRegistry::getPassRegistry();

    initializeCore(Registry);
    initializeScalarOpts(Registry);
    initializeVectorization(Registry);
    initializeIPO(Registry);
    initializeAnalysis(Registry);
    initializeTransformUtils(Registry);
    initializeInstCombine(Registry);
#if LLVM_MAJOR < 16
    initializeInstrumentation(Registry);
#endif
    initializeTarget(Registry);
  }

  if (pocl_get_bool_option("POCL_LLVM_GLOBAL_CONTEXT", 1) == 1)
    LLVMUseGlobalContext = true;
  else
    LLVMUseGlobalContext = false;

  // Set the options only once. TODO: fix it so that each
  // device can reset their own options. Now one cannot compile
  // with different options to different devices at one run.

  if (!LLVMOptionsInitialized) {

    LLVMOptionsInitialized = true;

    StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();

    llvm::cl::Option *O = nullptr;

    CurrentWgMethod =
        pocl_get_string_option("POCL_WORK_GROUP_METHOD", "loopvec");
    if (CurrentWgMethod == "auto")
      CurrentWgMethod = "loopvec";

    if (CurrentWgMethod == "loopvec" || CurrentWgMethod == "loops" ||
        CurrentWgMethod == "cbs") {

      if (pocl_get_bool_option("POCL_VECTORIZER_REMARKS", 0) == 1) {
        // Enable diagnostics from the loop vectorizer.
        O = opts["pass-remarks-missed"];
        assert(O && "could not find LLVM option 'pass-remarks-missed'");
        O->addOccurrence(1, StringRef("pass-remarks-missed"),
                         StringRef("loop-vectorize"), false);

        O = opts["pass-remarks-analysis"];
        assert(O && "could not find LLVM option 'pass-remarks-analysis'");
        O->addOccurrence(1, StringRef("pass-remarks-analysis"),
                         StringRef("loop-vectorize"), false);

        O = opts["pass-remarks"];
        assert(O && "could not find LLVM option 'pass-remarks'");
        O->addOccurrence(1, StringRef("pass-remarks"),
                         StringRef("loop-vectorize"), false);
      }

      // Force the loop vectorizer to use the same width for all loops.
      if (int VecWidth =
              pocl_get_int_option("POCL_VECTORIZER_FORCE_VECTOR_WIDTH", 0)) {
        O = opts["force-vector-width"];
        assert(O && "could not find LLVM option 'force-vector-width'");
        O->addOccurrence(1, StringRef("force-vector-width"),
                         StringRef(std::to_string(VecWidth)), false);
      }
    }
    if (pocl_get_bool_option("POCL_DEBUG_LLVM_PASSES", 0) == 1) {
      O = opts["debug"];
      assert(O && "could not find LLVM option 'debug'");
      O->addOccurrence(1, StringRef("debug"), StringRef("true"), false);
#if 0
      O = opts["debug-only"];
      assert(O && "could not find LLVM option 'debug'");
      O->addOccurrence(1, StringRef("debug-only"), StringRef("inline"), false);
#endif
    }
    O = opts["inline-threshold"];
    assert(O && "inline-threshold not found");
    O->addOccurrence(1, StringRef("inline-threshold"), StringRef("1200"));
#if 0
    O = opts["inline-enable-cost-benefit-analysis"];
    assert(O && "inline-enable-cost-benefit-analysis not found");
    O->addOccurrence(1, StringRef("inline-enable-cost-benefit-analysis"), StringRef("true"));
#endif
  }
}

/* re-initialization causes errors like this:
clang: for the   --scalarize-load-store option: may only occur zero or one
times! clang: for the   --vectorizer-min-trip-count option: may only occur zero
or one times! clang: for the   --unroll-threshold option: may only occur zero or
one times!
*/

/* Marks LLVM as uninitialized so that the next InitializeLLVM() call runs
 * the target/pass registration again. LLVMOptionsInitialized is left
 * untouched here, so the command line options are not set a second time. */
void UnInitializeLLVM() {
  LLVMInitialized = false;
}

// Context data shared by all cl_contexts while LLVMUseGlobalContext is set;
// nullptr when no shared context data exists.
static PoclLLVMContextData *GlobalLLVMContext = nullptr;
// Number of cl_contexts currently referring to GlobalLLVMContext.
static unsigned GlobalLLVMContextRefcount = 0;

/* Attaches LLVM context data to 'ctx'.
 *
 * In global-context mode an already existing shared PoclLLVMContextData is
 * reused and its reference count incremented. Otherwise a new one is
 * allocated (LLVMContext, diagnostic string/stream/printer, kernel library
 * map, lock), the diagnostic handler is installed on the new LLVMContext,
 * and in global-context mode the new data becomes the shared instance. */
void pocl_llvm_create_context(cl_context ctx) {

  const bool ReuseGlobal =
      LLVMUseGlobalContext && (GlobalLLVMContext != nullptr);
  if (ReuseGlobal) {
    ctx->llvm_context_data = GlobalLLVMContext;
    ++GlobalLLVMContextRefcount;
    return;
  }

  PoclLLVMContextData *NewData = new PoclLLVMContextData;
  assert(NewData);

  NewData->Context = new llvm::LLVMContext();
  assert(NewData->Context);
  NewData->number_of_IRs = 0;

  // diagnostics chain: printer -> stream -> string
  NewData->poclDiagString = new std::string;
  NewData->poclDiagStream =
      new llvm::raw_string_ostream(*NewData->poclDiagString);
  NewData->poclDiagPrinter =
      new DiagnosticPrinterRawOStream(*NewData->poclDiagStream);

  NewData->kernelLibraryMap = new kernelLibraryMapTy;
  assert(NewData->kernelLibraryMap);
  POCL_INIT_LOCK(NewData->Lock);

  LLVMContextSetDiagnosticHandler(wrap(NewData->Context),
                                  (LLVMDiagnosticHandler)diagHandler,
                                  (void *)NewData->poclDiagPrinter);

  assert(ctx->llvm_context_data == nullptr);
  ctx->llvm_context_data = NewData;

  if (LLVMUseGlobalContext) {
    GlobalLLVMContext = NewData;
    ++GlobalLLVMContextRefcount;
  }

  POCL_MSG_PRINT_LLVM("Created context %" PRId64 " (%p)\n", ctx->id, ctx);
}

/* Releases the LLVM context data attached to 'ctx'.
 *
 * Does nothing if no data is attached. Refuses (with an error message) to
 * release while IR references remain. In global-context mode the shared
 * reference count is decremented and the data is destroyed only when it
 * reaches zero; in the early-return cases ctx->llvm_context_data is left
 * as it is. */
void pocl_llvm_release_context(cl_context ctx) {

  POCL_MSG_PRINT_LLVM("releasing LLVM context\n");

  PoclLLVMContextData *CtxData = (PoclLLVMContextData *)ctx->llvm_context_data;
  // this can happen if clContextCreate runs into an error
  if (CtxData == NULL)
    return;

  {
    // everything in this scope runs with the context's lock held; the guard
    // must be gone before the lock itself is destroyed below
    PoclCompilerMutexGuard LockGuard(&CtxData->Lock);
    if (CtxData->number_of_IRs > 0) {
      POCL_MSG_ERR("still have IR references - can't release LLVM context !\n");
      return;
    }

    // other cl_contexts still share the global data: only drop our reference
    if (LLVMUseGlobalContext && (--GlobalLLVMContextRefcount > 0))
      return;

    delete CtxData->poclDiagPrinter;
    delete CtxData->poclDiagStream;
    delete CtxData->poclDiagString;

    assert(CtxData->kernelLibraryMap);
    // free the cached kernel library modules
    for (auto &Entry : *CtxData->kernelLibraryMap)
      delete (llvm::Module *)Entry.second;
    CtxData->kernelLibraryMap->clear();
    delete CtxData->kernelLibraryMap;
  }

  POCL_DESTROY_LOCK(CtxData->Lock);
  delete CtxData->Context;
  delete CtxData;
  ctx->llvm_context_data = nullptr;
  if (LLVMUseGlobalContext)
    GlobalLLVMContext = nullptr;
}

/* Appends 'Log' to the build log of device 'DeviceI' of 'Program'.
 *
 * Log:     heap-allocated text. When LogSize is nonzero the function takes
 *          ownership: the buffer is either installed as the build log or
 *          freed after being copied.
 * LogSize: number of bytes of 'Log' to append. Nothing happens (and 'Log' is
 *          not freed) when it is 0.
 *
 * The existing log is measured with strlen(), so the combined log is always
 * NUL-terminated here, whether or not 'LogSize' already covers a terminator.
 * If the combined buffer cannot be allocated, the existing log is kept and
 * 'Log' is dropped. */
void pocl_append_to_buildlog(cl_program Program, cl_uint DeviceI, char *Log,
                             size_t LogSize) {
  if (LogSize == 0) {
    return;
  }

  char *ExistingLog = Program->build_log[DeviceI];
  if (ExistingLog == nullptr) {
    // first log for this device: adopt the buffer as is
    Program->build_log[DeviceI] = Log;
    return;
  }

  const size_t ExistingLogSize = strlen(ExistingLog);
  const size_t TotalLogSize = LogSize + ExistingLogSize;
  // +1: room for an explicit terminator
  char *NewLog = (char *)malloc(TotalLogSize + 1);
  assert(NewLog);
  if (NewLog == nullptr) {
    free(Log);
    return;
  }

  memcpy(NewLog, ExistingLog, ExistingLogSize);
  memcpy(NewLog + ExistingLogSize, Log, LogSize);
  NewLog[TotalLogSize] = '\0';

  free(Log);
  free(ExistingLog);
  Program->build_log[DeviceI] = NewLog;
}

/* Name of the module-level named metadata node under which the
 * setModule*Metadata / getModule*Metadata helpers keep their key/value
 * pairs. */
#define POCL_METADATA_ROOT "pocl_meta"

/* Appends a {key, 64-bit integer} pair to the POCL_METADATA_ROOT named
 * metadata of 'mod'. An existing pair with the same key is not replaced; a
 * new one is added after it. */
void setModuleIntMetadata(llvm::Module *mod, const char *key, unsigned long data) {
  LLVMContext &Ctx = mod->getContext();

  Metadata *KeyMD = MDString::get(Ctx, key);
  Metadata *ValueMD = llvm::ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt64Ty(Ctx), data));
  llvm::Metadata *Pair[] = {KeyMD, ValueMD};

  MDNode *Entry = MDNode::get(Ctx, Pair);
  mod->getOrInsertNamedMetadata(POCL_METADATA_ROOT)->addOperand(Entry);
}

/* Appends a {key, string} pair to the POCL_METADATA_ROOT named metadata of
 * 'mod'. An existing pair with the same key is not replaced; a new one is
 * added after it. */
void setModuleStringMetadata(llvm::Module *mod, const char *key,
                             const char *data) {
  LLVMContext &Ctx = mod->getContext();

  Metadata *KeyMD = MDString::get(Ctx, key);
  Metadata *ValueMD = MDString::get(Ctx, data);
  llvm::Metadata *Pair[] = {KeyMD, ValueMD};

  MDNode *Entry = MDNode::get(Ctx, Pair);
  mod->getOrInsertNamedMetadata(POCL_METADATA_ROOT)->addOperand(Entry);
}

/* Appends a {key, boolean} pair to the POCL_METADATA_ROOT named metadata of
 * 'mod'. The boolean is stored as an 8-bit integer with value 1 or 0. An
 * existing pair with the same key is not replaced; a new one is added after
 * it. */
void setModuleBoolMetadata(llvm::Module *mod, const char *key, bool data) {
  LLVMContext &Ctx = mod->getContext();

  Metadata *KeyMD = MDString::get(Ctx, key);
  Metadata *ValueMD = llvm::ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt8Ty(Ctx), data ? 1 : 0));
  llvm::Metadata *Pair[] = {KeyMD, ValueMD};

  MDNode *Entry = MDNode::get(Ctx, Pair);
  mod->getOrInsertNamedMetadata(POCL_METADATA_ROOT)->addOperand(Entry);
}

/* Looks up the integer stored under 'key' in the POCL_METADATA_ROOT named
 * metadata of 'mod'.
 *
 * data: receives the zero-extended value; untouched if nothing is found.
 *
 * Returns true if a matching pair was found. When the key occurs several
 * times, the last occurrence wins. Entries that do not have the expected
 * {string key, integer value} shape are skipped, so malformed metadata
 * cannot cause a null dereference or an invalid cast in builds without
 * assertions. */
bool getModuleIntMetadata(const llvm::Module &mod, const char *key,
                          unsigned long &data) {
  NamedMDNode *Root = mod.getNamedMetadata(POCL_METADATA_ROOT);
  if (!Root)
    return false;

  bool found = false;

  for (unsigned i = 0, e = Root->getNumOperands(); i < e; ++i) {
    MDNode *MD = Root->getOperand(i);
    if (MD == nullptr || MD->getNumOperands() < 2)
      continue;

    Metadata *KeyMD = MD->getOperand(0);
    MDString *Key = dyn_cast_or_null<MDString>(KeyMD);
    if (Key == nullptr || Key->getString() != key)
      continue;

    Metadata *ValueMD = MD->getOperand(1);
    // yields nullptr unless the value is a constant integer
    ConstantInt *CI = mdconst::dyn_extract_or_null<ConstantInt>(ValueMD);
    if (CI == nullptr)
      continue;

    data = CI->getZExtValue();
    found = true;
  }
  return found;
}

/* Looks up the string stored under 'key' in the POCL_METADATA_ROOT named
 * metadata of 'mod'.
 *
 * data: receives a copy of the string; untouched if nothing is found.
 *
 * Returns true if a matching pair was found. When the key occurs several
 * times, the last occurrence wins. Entries that do not have the expected
 * {string key, string value} shape are skipped instead of dereferencing a
 * failed cast. */
bool getModuleStringMetadata(const llvm::Module &mod, const char *key,
                             std::string &data) {
  NamedMDNode *Root = mod.getNamedMetadata(POCL_METADATA_ROOT);
  if (!Root)
    return false;

  bool found = false;

  for (unsigned i = 0, e = Root->getNumOperands(); i < e; ++i) {
    MDNode *MD = Root->getOperand(i);
    if (MD == nullptr || MD->getNumOperands() < 2)
      continue;

    Metadata *KeyMD = MD->getOperand(0);
    MDString *Key = dyn_cast_or_null<MDString>(KeyMD);
    if (Key == nullptr || Key->getString() != key)
      continue;

    Metadata *ValueMD = MD->getOperand(1);
    MDString *StringValue = dyn_cast_or_null<MDString>(ValueMD);
    if (StringValue == nullptr)
      continue;

    data = StringValue->getString().str();
    found = true;
  }
  return found;
}

/* Looks up the boolean stored under 'key' in the module metadata. The value
 * is read as an integer and any nonzero value counts as true. 'data' is left
 * untouched, and false is returned, when the key is not found. */
bool getModuleBoolMetadata(const llvm::Module &mod, const char *key,
                           bool &data) {
  unsigned long RawValue = 0;
  if (!getModuleIntMetadata(mod, key, RawValue))
    return false;

  data = (RawValue != 0);
  return true;
}
