add xinhe runtime code

Change-Id: I3580a997dfa9f53df3cc686055b335da1fb1b66b
This commit is contained in:
shilei1108 2024-09-18 15:21:12 +08:00
parent 6e786339be
commit 026f284ba4
5 changed files with 485 additions and 0 deletions

View File

@ -0,0 +1,5 @@
# CMCC -- XINHE Runtime
XINHE Runtime is a cross-architecture runtime system for multi-vendor, multi-type architectures. XINHE Runtime discovers available functionality, manages multiple diverse programming
systems (e.g., CUDA, HIP, Level Zero, DTK, Vasti) in the same application, represents data dependencies, orchestrates data movement proactively, and allows configurable work schedulers for diverse multi-vendor devices.

View File

@ -0,0 +1,146 @@
#ifndef HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H
#define HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
/* C++ consumers: give the declarations that follow C linkage. */
extern "C" {
#else
/*
 * C consumers: use the standard boolean type.  A hand-rolled
 * `typedef int8_t bool;` fails to compile when <stdbool.h> has already been
 * included (bool is then a macro for _Bool), is ill-formed in C23 where bool
 * is a keyword, and does not normalise non-zero values to 1.
 */
#include <stdbool.h>
#endif
/*
 * Public constants of the runtime.
 *
 * Every macro whose expansion contains an operator is fully parenthesized so
 * that it keeps its value when used inside a larger expression.
 */

/* Capacity limits. */
#define HYCR_MAX_NPLATFORMS 64
#define HYCR_MAX_NDEVS ((1 << 8) - 1)

/* Device-type bit flags; each occupies its own bit so they can be OR-ed. */
#define hycr_default (1 << 5)
#define hycr_cpu (1 << 6)
#define hycr_gpu_nvidia (1 << 7)
#define hycr_gpu_amd (1 << 8)
#define hycr_gpu_intel (1 << 9)
#define hycr_gpu_hygon (1 << 10)
#define hycr_gpu_iltar (1 << 11)
/* Union of all GPU vendor bits defined above. */
#define hycr_ogpu (hycr_gpu_nvidia | hycr_gpu_amd | hycr_gpu_intel | hycr_gpu_hygon | hycr_gpu_iltar)
#define hycr_npu_ascend (1 << 13)
#define hycr_dla_vasti (1 << 14)
#define hycr_dla_enflame (1 << 15)

/* Programming-model identifiers (a device reports one of these as its model). */
#define hycr_cuda 1
#define hycr_hip 3
#define hycr_levelzero 4
#define hycr_dtk 5
#define hycr_ixc 6
#define hycr_cann 7
#define hycr_vasr 8

/*
 * Negative mode constants.
 * NOTE(review): the names suggest read / write / read-write access modes for
 * kernel memory arguments -- confirm against the implementation.
 */
#define hycr_r (-1)
#define hycr_w (-2)
#define hycr_rw (-3)
#define hycr_xr (-4)
#define hycr_xw (-5)
#define hycr_xrw (-6)

/* Element-type bit flags. */
#define hycr_int (1 << 0)
#define hycr_long (1 << 1)
#define hycr_float (1 << 2)
#define hycr_double (1 << 3)

/* Reduction flags: hycr_sum / hycr_max / hycr_min all carry the hycr_reduction bit. */
#define hycr_normal (1 << 10)
#define hycr_reduction (1 << 11)
#define hycr_sum ((1 << 12) | hycr_reduction)
#define hycr_max ((1 << 13) | hycr_reduction)
#define hycr_min ((1 << 14) | hycr_reduction)

/*
 * Information selectors.
 * NOTE(review): presumably the `param` argument of hycr_platform_info() and
 * hycr_device_info() -- confirm.
 */
#define hycr_platform 0x3401
#define hycr_vendor 0x3402
#define hycr_name 0x3403
#define hycr_type 0x3404

/* NOTE(review): purpose of these selectors is not evident from this header. */
#define hycr_ncmds 1
#define hycr_ncmds_kernel 2
#define hycr_ncmds_memcpy 3
#define hycr_cmds 4
/*
 * No #endif here: the only conditional still open at this point is the
 * include guard, which is closed at the end of the header.
 */
/* Opaque handles: each is a pointer to a structure that is never defined in this header. */
typedef struct _hycr_task *hycr_task;
typedef struct _hycr_mem *hycr_mem;
typedef struct _hycr_kernel *hycr_kernel;
typedef struct _hycr_graph *hycr_graph;

/* Callback signatures; every callback returns an int. */
typedef int (*hycr_host_task)(void *params, const int *device);
typedef int (*command_handler)(void *params, void *device);
typedef int (*hook_task)(void *task);
typedef int (*hook_command)(void *command);
/* Receives the task, caller-supplied params and a kernel-name buffer. */
typedef int (*hycr_selector_kernel)(hycr_task task, void *params, char *kernel_name);
/*
 * Runtime lifecycle, environment, and platform/device query entry points.
 * Only the prototypes are visible in this header; every function returns an
 * int (NOTE(review): presumably a success/error status -- confirm against the
 * implementation).
 */

/* Takes pointers to the program's argc/argv and a `sync` flag. */
extern int hycr_init(int *argc, char ***argv, int sync);
/* `(void)`: in C an empty list would declare unspecified arguments, not "no arguments". */
extern int hycr_finalize(void);

/* Environment access by key; the getter writes through `value` and `vallen`. */
extern int hycr_env_set(const char *key, const char *value);
extern int hycr_env_get(const char *key, char **value, size_t *vallen);

/* Platform queries; results are written through the pointer arguments. */
extern int hycr_platform_count(int *nplatforms);
extern int hycr_platform_info(int platform, int param, void *value, size_t *size);

/* Device queries and default-device selection. */
extern int hycr_device_count(int *ndevs);
extern int hycr_device_info(int device, int param, void *value, size_t *size);
extern int hycr_device_set_default(int device);
extern int hycr_device_get_default(int *device);
/* Takes an array of `ndevs` device ids. */
extern int hycr_device_synchronize(int ndevs, int *devices);
/*
 * Kernel object API.  A kernel is referred to by name and handed back through
 * an hycr_kernel handle; arguments are bound by index `idx`.
 */
extern int hycr_kernel_create(const char *name, hycr_kernel *kernel);
extern int hycr_kernel_get(const char *name, hycr_kernel *kernel);
/* Binds `size` bytes at `value` to argument `idx`. */
extern int hycr_kernel_setarg(hycr_kernel kernel, int idx, size_t size, void *value);
/* Binds a memory handle to argument `idx`; the `_off` variant also takes an offset. */
extern int hycr_kernel_setmem(hycr_kernel kernel, int idx, hycr_mem mem, size_t mode);
extern int hycr_kernel_setmem_off(hycr_kernel kernel, int idx, hycr_mem mem, size_t off, size_t mode);
/* Binds a host pointer to argument `idx`. */
extern int hycr_kernel_setmap(hycr_kernel kernel, int idx, void *host, size_t mode);
extern int hycr_kernel_release(hycr_kernel kernel);
/*
 * Task API: creation, dependencies, and the commands attached to a task.
 * NOTE(review): "h2d"/"d2h" presumably mean host-to-device / device-to-host
 * transfers -- confirm against the implementation.
 */

/* Task creation; the new handle is written through `task`. */
extern int hycr_create(hycr_task *task);
extern int hycr_create_name(const char *name, hycr_task *task);
/* Takes an array of `ntasks` task handles. */
extern int hycr_depend(hycr_task task, int ntasks, hycr_task *tasks);

/* Memory commands on a task. */
extern int hycr_malloc(hycr_task task, hycr_mem mem);
extern int hycr_cmd_reset_mem(hycr_task task, hycr_mem mem, uint8_t reset);

/* Transfer commands: ranged, multi-dimensional ("offsets"), flush, and whole-object variants. */
extern int hycr_h2d(hycr_task task, hycr_mem mem, size_t off, size_t size, void *host);
extern int hycr_h2d_offsets(hycr_task task, hycr_mem mem, size_t *off, size_t *host_sizes,
                            size_t *dev_sizes, size_t elem_size, int dim, void *host);
extern int hycr_d2h(hycr_task task, hycr_mem mem, size_t off, size_t size, void *host);
extern int hycr_d2h_offsets(hycr_task task, hycr_mem mem, size_t *off, size_t *host_sizes,
                            size_t *dev_sizes, size_t elem_size, int dim, void *host);
extern int hycr_dmem_flush_out(hycr_task task, hycr_mem mem);
extern int hycr_h2d_full(hycr_task task, hycr_mem mem, void *host);
extern int hycr_d2h_full(hycr_task task, hycr_mem mem, void *host);

/* Kernel commands: `dim` plus offset / global / local size arrays, or a selector callback. */
extern int hycr_arch_kernel_object(hycr_task task, hycr_kernel kernel, int dim,
                                   size_t *off, size_t *gws, size_t *lws);
extern int hycr_arch_kernel_selector(hycr_task task, hycr_selector_kernel func,
                                     void *params, size_t params_size);

/* Submission and release. */
extern int hycr_arch_submit(hycr_task task, int device, const char *opt, int sync);
extern int hycr_arch_release(hycr_task task);
/*
 * Memory object API.  Handles are returned through `hycr_mem*` out-parameters.
 *
 * NOTE(review): hycr_mem_create is declared twice below with incompatible
 * parameter lists, (size_t, hycr_mem*) and (hycr_mem*, void*, size_t).  C has
 * no overloading, and C-linkage functions cannot be overloaded in C++ either,
 * so one of the two declarations needs a distinct name.  The intended name
 * cannot be determined from this header -- confirm against the implementation.
 */
extern int hycr_mem_create(size_t size, hycr_mem* mem);
extern int hycr_mem_init_reset(hycr_mem mem, int reset);
/* Host-pointer based creation; conflicts with the declaration above (see note). */
extern int hycr_mem_create(hycr_mem* mem, void *host, size_t size);
extern int hycr_mem_update(hycr_mem mem, void *host);
/* Creates a handle from `root_mem` and a region index. */
extern int hycr_mem_create_region(hycr_mem* mem, hycr_mem root_mem, int region);
extern int hycr_mem_enable_outer_dim_regions(hycr_mem mem);
/* Tile creation: `dim` dimensions described by the off / host_size / dev_size arrays. */
extern int hycr_mem_create_tile(hycr_mem* mem, void *host, size_t *off, size_t *host_size, size_t *dev_size, size_t elem_size, int dim);
/* Writes a pointer for the given device through `arch`. */
extern int hycr_mem_arch(hycr_mem mem, int device, void** arch);
/* NOTE(review): `mode`/`type` presumably take the hycr_sum/... and hycr_int/... constants -- confirm. */
extern int hycr_mem_reduce(hycr_mem mem, int mode, int type);
extern int hycr_mem_release(hycr_mem mem);
/*
 * Recording, timing and consistency-check switches.
 * Parameterless functions are declared with (void) so that they are real
 * prototypes in C as well as in C++.
 */
extern int hycr_record_start(void);
extern int hycr_record_stop(void);
/* Writes the current time through `time`. */
extern int hycr_timer_now(double *time);
extern void hycr_disable_consistency_check(void);
extern void hycr_enable_consistency_check(void);

#ifdef __cplusplus
} /* extern "C" -- closes the linkage block opened at the top of this header */
#endif
#endif /* HYCR_INCLUDE_HYCR_HYCR_RUNTIME_H */

View File

@ -0,0 +1,123 @@
#ifndef HYCR_SRC_RT_DEVICE_H
#define HYCR_SRC_RT_DEVICE_H
#include "Debug.h"
#include "Config.h"
#include "Timer.h"
#include <map>
#ifndef ASYNC_STREAMING
#define SYNC_EXECUTION
#endif
namespace hycr {
namespace runtime {
class Device {
public:
Device(int devs, int platform);
virtual ~Device();
virtual void TaskPre(Task* task) { return; }
virtual void TaskPost(Task* task) { return; }
void Execute(Task* task);
void ExecuteInit(Command* cmd);
virtual void ExecuteKernel(Command* cmd);
void ExecuteMalloc(Command* cmd);
void InvokeDMemInDataTransfer(Task *task, Command *cmd, DMemType *mem);
void ExecuteMemResetInput(Task *task, Command* cmd);
void ExecuteMemIn(Task *task, Command* cmd);
void ExecuteMemInDMemIn(Task *task, Command* cmd, DataMem *mem);
void ExecuteMemInDMemRegionIn(Task *task, Command* cmd, DataMemRegion *mem);
void ExecuteMemOut(Task *task, Command* cmd);
void ExecuteMemFlushOut(Command* cmd);
void ExecuteH2D(Command* cmd);
void ExecuteH2DNP(Command* cmd);
void ExecuteD2H(Command* cmd);
void ExecuteMap(Command* cmd);
void ExecuteReleaseMem(Command* cmd);
void ExecuteHost(Command* cmd);
virtual int ResetMemory(BaseMem *mem, uint8_t reset_value)=0;
virtual void ResetContext() { }
virtual bool IsContextChangeRequired() { return false; }
virtual int Compile(char* src) { return HYCR_SUCCESS; }
virtual int Init() = 0;
virtual int BuildProgram(char* path) { return HYCR_SUCCESS; }
virtual int MemAlloc(void** mem, size_t size, bool reset=false) = 0;
virtual int MemFree(void* mem) = 0;
virtual int MemD2D(Task *task, BaseMem *mem, void *dst, void *src, size_t size) { _error("Device:%d:%s doesn't support MemD2D", devs_, name()); return HYCR_ERROR; }
virtual int MemH2D(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="") = 0;
virtual int MemD2H(Task *task, BaseMem* mem, size_t *off, size_t *host_sizes, size_t *dev_sizes, size_t elem_size, int dim, size_t size, void* host, const char *tag="") = 0;
virtual int KernelGet(Kernel *kernel, void** kernel_bin, const char* name) = 0;
virtual int KernelLaunchInit(Kernel* kernel) { return HYCR_SUCCESS; }
virtual int KernelSetArg(Kernel* kernel, int idx, int kindex, size_t size, void* value) = 0;
virtual int KernelSetMem(Kernel* kernel, int idx, int kindex, BaseMem* mem, size_t off) = 0;
virtual int KernelLaunch(Kernel* kernel, int dim, size_t* off, size_t* gws, size_t* lws) = 0;
virtual int Synchronize() = 0;
virtual int AddCallback(Task* task) = 0;
virtual int Custom(int tag, char* params) { return HYCR_SUCCESS; }
virtual int RecreateContext() { return HYCR_ERROR; }
virtual void SetPeerDevices(int *peers, int count) { }
virtual const char* kernel_src() { return " "; }
virtual const char* kernel_bin() { return " "; }
void set_shared_memory_buffers(bool flag=true) { shared_memory_buffers_ = flag; }
bool is_shared_memory_buffers() { return shared_memory_buffers_ && can_share_host_memory_; }
int platform() { return platform_; }
int devs() { return devs_; }
int type() { return type_; }
int model() { return model_; }
char* vendor() { return vendor_; }
char* name() { return name_; }
bool busy() { return busy_; }
bool idle() { return !busy_; }
bool enable() { return enable_; }
void enableD2D() { is_d2d_possible_ = true; }
bool isD2DEnabled() { return is_d2d_possible_; }
int ok() { return errid_; }
void set_worker(Worker* worker) { worker_ = worker; }
Worker* worker() { return worker_; }
double Now() { return timer_->Now(); }
protected:
int devs_;
int platform_;
int type_;
int model_;
char vendor_[128];
char name_[256];
char version_[64];
int driver_version_;
size_t max_compute_units_;
size_t max_work_group_size_;
size_t max_work_item_sizes_[3];
int max_block_dims_[3];
int nqueues_;
int q_;
int errid_;
char kernel_path_[256];
bool busy_;
bool enable_;
bool shared_memory_buffers_;
bool can_share_host_memory_;
bool is_d2d_possible_;
std::map<int, command_handler> cmd_handlers_;
};
} /* namespace rt */
} /* namespace hycr */
#endif /* HYCR_SRC_RT_DEVICE_H */

View File

@ -0,0 +1,137 @@
#include "DeviceCUDA.h"
namespace hycr {
namespace runtime {
// Builds the CUDA device wrapper: stores the loader handles, resets the
// bookkeeping counters, and fills in the identification and capability fields
// inherited from Device by querying the CUDA driver for `cudev`.
//
// Driver errors are reported through _cuerror but do not abort construction.
DeviceCUDA::DeviceCUDA(CUDA* ld, Host2CUDA *host2cuda_ld, CUdevice cudev, int devs, int platform) : Device(devs, platform) {
  ld_ = ld;
  host2cuda_ld_ = host2cuda_ld;
  peers_count_ = 0;
  max_arg_idx_ = 0;
  ngarbage_ = 0;
  shared_mem_bytes_ = 0;
  dev_ = cudev;

  // Bounded copies: neither string can overrun its destination buffer.
  snprintf(vendor_, sizeof(vendor_), "%s", "NVIDIA");
  enableD2D();
  err_ = ld_->cuDeviceGetName(name_, sizeof(name_), dev_);
  _cuerror(err_);
  // hycr_gpu_nvidia is the NVIDIA device-type flag defined by the public header.
  type_ = hycr_gpu_nvidia;
  model_ = hycr_cuda;
  err_ = ld_->cuDriverGetVersion(&driver_version_);
  _cuerror(err_);
  snprintf(version_, sizeof(version_), "NVIDIA CUDA %d", driver_version_);

  // cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev)
  // returns the value through its first argument.  Zero-initialised so a
  // failed query leaves a defined value behind.
  int threads_per_block = 0;
  int multiprocessors = 0;
  int block_dim[3] = { 0, 0, 0 };
  int grid_dim[3] = { 0, 0, 0 };
  int concurrent_kernels = 0;
  int async_engines = 0;
  const struct { CUdevice_attribute attr; int* value; } queries[] = {
    { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, &threads_per_block },
    { CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, &multiprocessors },
    { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, &block_dim[0] },
    { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, &block_dim[1] },
    { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, &block_dim[2] },
    { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, &grid_dim[0] },
    { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, &grid_dim[1] },
    { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, &grid_dim[2] },
    { CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, &concurrent_kernels },
    { CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, &async_engines },
  };
  for (size_t i = 0; i < sizeof(queries) / sizeof(queries[0]); i++) {
    err_ = ld_->cuDeviceGetAttribute(queries[i].value, queries[i].attr, dev_);
    _cuerror(err_);
  }

  max_work_group_size_ = threads_per_block;
  max_compute_units_ = multiprocessors;
  for (int i = 0; i < 3; i++) {
    max_block_dims_[i] = block_dim[i];
    // NOTE(review): stored as block extent x grid extent per dimension, i.e.
    // the largest launch extent; confirm this is what readers of
    // max_work_item_sizes_ expect.
    max_work_item_sizes_[i] = (size_t) block_dim[i] * (size_t) grid_dim[i];
  }
  // Queried as before, but Device has no member to keep these two values.
  (void) concurrent_kernels;
  (void) async_engines;
}
// Runs the two optional host2cuda teardown hooks; a hook is called only when
// its entry in the loader is set.
//
// NOTE(review): the hooks are looked up on ld_ although the class also keeps
// a dedicated host2cuda_ld_ loader -- confirm which loader is meant to own
// them.
DeviceCUDA::~DeviceCUDA() {
  if (ld_->hycr_host2cuda_finalize) ld_->hycr_host2cuda_finalize();
  if (ld_->hycr_host2cuda_finalize_handles) ld_->hycr_host2cuda_finalize_handles(dev_);
}
// Compiles the CUDA source file `src` to PTX by running
// "nvcc -ptx <src> -o <kernel_path_>" through system().
//
// Returns HYCR_SUCCESS when the command exits with EXIT_SUCCESS.  Returns
// HYCR_ERROR (and bumps the platform error count) when `src` is NULL, when
// the command line does not fit into the local buffer, or when the command
// fails.
//
// NOTE(review): the command is handed to the shell unquoted, so `src` and
// kernel_path_ must come from trusted input.
int DeviceCUDA::Compile(char* src) {
  if (src == NULL) {
    _error("src[%p]", (void*) src);
    worker_->platform()->IncrementErrorCount();
    return HYCR_ERROR;
  }
  char cmd[1024];
  // snprintf always NUL-terminates and reports the length it needed, so a
  // too-long path is detected instead of overrunning the buffer.
  const int len = snprintf(cmd, sizeof(cmd), "nvcc -ptx %s -o %s", src, kernel_path_);
  if (len < 0 || (size_t) len >= sizeof(cmd)) {
    _error("nvcc command line too long for src[%s]", src);
    worker_->platform()->IncrementErrorCount();
    return HYCR_ERROR;
  }
  if (system(cmd) != EXIT_SUCCESS) {
    _error("cmd[%s]", cmd);
    worker_->platform()->IncrementErrorCount();
    return HYCR_ERROR;
  }
  return HYCR_SUCCESS;
}
int DeviceCUDA::Init() {
err_ = ld_->cudaSetDevice(dev_);
err_ = ld_->cuCtxCreate(&ctx_, CU_CTX_SCHED_AUTO, dev_);
EnablePeerAccess();
_cuerror(err_);
for (int i = 0; i < nqueues_; i++) {
err_ = ld_->cuStreamCreate(streams_ + i, CU_STREAM_DEFAULT);
_cuerror(err_);
}
char* path = kernel_path_;
char* src = NULL;
size_t srclen = 0;
if (Utils::ReadFile(path, &src, &srclen) == HYCR_ERROR) {
return HYCR_SUCCESS;
}
err_ = ld_->cuModuleLoad(&module_, path);
if (err_ != CUDA_SUCCESS) {
_cuerror(err_);
if (src) free(src);
platform()->IncrementErrorCount();
return HYCR_ERROR;
}
if (src) free(src);
return HYCR_SUCCESS;
}
// Fills the device buffer behind `mem` with the byte `reset_value`.
// The buffer address comes from mem->arch(this) and its length from
// mem->size().  Returns HYCR_SUCCESS, or HYCR_ERROR after bumping the
// platform error count when the memset fails.
int DeviceCUDA::ResetMemory(BaseMem *mem, uint8_t reset_value) {
  err_ = ld_->cudaMemset(mem->arch(this), reset_value, mem->size());
  _cuerror(err_);
  if (err_ == CUDA_SUCCESS) {
    return HYCR_SUCCESS;
  }
  worker_->platform()->IncrementErrorCount();
  return HYCR_ERROR;
}
int DeviceCUDA::MemAlloc(void** mem, size_t size, bool reset) {
CUdeviceptr* cumem = (CUdeviceptr*) mem;
err_ = ld_->cuMemAlloc(cumem, size);
if (reset) ld_->cudaMemset(*mem, 0, size);
if (err_ != CUDA_SUCCESS){
worker_->platform()->IncrementErrorCount();
return HYCR_ERROR;
}
return HYCR_SUCCESS;
}
// Records the device pointer `mem` in the garbage_ list instead of releasing
// it here; no driver free call is made by this function.
// When the list already holds HYCR_MAX_GABAGES entries the pointer is not
// recorded and an error is logged.  Always returns HYCR_SUCCESS.
//
// NOTE(review): the queued pointers are presumably released later by
// ClearGarbage() -- confirm.
int DeviceCUDA::MemFree(void* mem) {
  CUdeviceptr cumem = (CUdeviceptr) mem;
  if (ngarbage_ < HYCR_MAX_GABAGES) {
    garbage_[ngarbage_++] = cumem;
  } else {
    _error("ngarbage[%d]", ngarbage_);
  }
  return HYCR_SUCCESS;
}
// Calls cuCtxSynchronize through the loader and reports the result.
// Returns HYCR_SUCCESS, or HYCR_ERROR after bumping the platform error count
// when the driver reports a failure.
int DeviceCUDA::Synchronize() {
  err_ = ld_->cuCtxSynchronize();
  _cuerror(err_);
  if (err_ == CUDA_SUCCESS) {
    return HYCR_SUCCESS;
  }
  worker_->platform()->IncrementErrorCount();
  return HYCR_ERROR;
}
} /* namespace runtime */
} /* namespace hycr */

View File

@ -0,0 +1,74 @@
#ifndef HYCR_SRC_RT_DEVICE_CUDA_H
#define HYCR_SRC_RT_DEVICE_CUDA_H
#include "Device.h"
#include <map>
#define HYCR_MAX_GABAGES 256
namespace hycr {
namespace runtime {

// CUDA backend of Device.  It talks to the driver through loader objects and
// keeps the per-device CUDA state (device, context, streams, module).
//
// NOTE(review): the constructor takes CUDA* / Host2CUDA* while the members
// that store them are declared LoaderCUDA* / LoaderHost2CUDA*; confirm that
// these name the same types.
class DeviceCUDA : public Device {
public:
  DeviceCUDA(CUDA *ld, Host2CUDA *host2cuda_ld, CUdevice cudev, int devs, int platform);
  ~DeviceCUDA();

  // Device interface implemented by this backend.
  int Compile(char *src);
  int Init();
  int ResetMemory(BaseMem *mem, uint8_t reset_value);
  int MemAlloc(void **mem, size_t size, bool reset = false);
  int MemFree(void *mem);

  // Peer-device handling and copy helpers.
  void EnablePeerAccess();
  void SetPeerDevices(int *peers, int count);
  void MemCpy3D(CUdeviceptr dev, uint8_t *host, size_t *off,
                size_t *dev_sizes, size_t *host_sizes,
                size_t elem_size, bool host_2_dev);
  int MemD2D(Task *task, BaseMem *mem, void *dst, void *src, size_t size);
  int MemH2D(Task *task, BaseMem *mem, size_t *off, size_t *host_sizes, size_t *dev_sizes,
             size_t elem_size, int dim, size_t size, void *host, const char *tag = "");
  int MemD2H(Task *task, BaseMem *mem, size_t *off, size_t *host_sizes, size_t *dev_sizes,
             size_t elem_size, int dim, size_t size, void *host, const char *tag = "");

  // Kernel lookup, argument binding and launch.
  int KernelGet(Kernel *kernel, void **kernel_bin, const char *name);
  int KernelLaunchInit(Kernel *kernel);
  int KernelSetArg(Kernel *kernel, int idx, int kindex, size_t size, void *value);
  int KernelSetMem(Kernel *kernel, int idx, int kindex, BaseMem *mem, size_t off);
  int KernelLaunch(Kernel *kernel, int dim, size_t *off, size_t *gws, size_t *lws);

  int Synchronize();
  int AddCallback(Task *task);
  int Custom(int tag, char *params);
  const char *kernel_src() { return "KERNEL_SRC_CUDA"; }
  virtual void TaskPre(Task *task);
  // Returns the stored CUdevice as an int.
  int cudev() { return dev_; }
  void ResetContext();
  bool IsContextChangeRequired();

private:
  static void Callback(CUstream stream, CUresult status, void *data);
  void ClearGarbage();

  // Loader handles received by the constructor.
  LoaderCUDA *ld_;
  LoaderHost2CUDA *host2cuda_ld_;

  // Driver objects for this device.
  CUdevice dev_;
  CUdevice peers_[HYCR_MAX_NDEVS];
  int peers_count_;
  CUcontext ctx_;
  CUstream streams_[HYCR_MAX_DEVICE_NQUEUES];
  CUmodule module_;
  // Result of the most recent driver call.
  CUresult err_;

  // Kernel-argument bookkeeping.
  unsigned int shared_mem_bytes_;
  unsigned int shared_mem_offs_[HYCR_MAX_KERNEL_NARGS];
  void *params_[HYCR_MAX_KERNEL_NARGS];
  int max_arg_idx_;

  // Device pointers queued by MemFree(); ngarbage_ counts the used slots.
  CUdeviceptr garbage_[HYCR_MAX_GABAGES];
  int ngarbage_;

  std::map<CUfunction, CUfunction> kernels_offs_;
};

} /* namespace runtime */
} /* namespace hycr */
#endif /* HYCR_SRC_RT_DEVICE_CUDA_H */