diff --git a/cmake/legion_defines.h.in b/cmake/legion_defines.h.in index 2517ec32fa..852162078d 100644 --- a/cmake/legion_defines.h.in +++ b/cmake/legion_defines.h.in @@ -41,6 +41,8 @@ #cmakedefine LEGION_USE_HDF5 +#cmakedefine LEGION_USE_FPGA + #cmakedefine LEGION_SPY #cmakedefine LEGION_USE_HIP diff --git a/cmake/realm_defines.h.in b/cmake/realm_defines.h.in index 90d1fa70f0..f5fc71e659 100644 --- a/cmake/realm_defines.h.in +++ b/cmake/realm_defines.h.in @@ -59,6 +59,8 @@ #cmakedefine REALM_USE_HDF5 +#cmakedefine REALM_USE_FPGA + #cmakedefine REALM_USE_LIBDL #cmakedefine REALM_USE_DLMOPEN diff --git a/examples/fpga/Makefile b/examples/fpga/Makefile new file mode 100644 index 0000000000..20db046ebc --- /dev/null +++ b/examples/fpga/Makefile @@ -0,0 +1,82 @@ +# Copyright 2021 Stanford University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +ifndef LG_RT_DIR +$(error LG_RT_DIR variable is not defined, aborting build) +endif + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 3 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Include FPGA support (requires Xilinx FPGA) +USE_FPGA ?= 1 +# Path to the Xilinx platform definition file +PLATFORM ?= /home/centos/src/project_data/aws-fpga/Vitis/aws_platform/xilinx_aws-vu9p-f1_shell-v04261818_201920_2/xilinx_aws-vu9p-f1_shell-v04261818_201920_2.xpfm +# FPGA kenrel compilation target: Software emulation (sw_emu), hardware emulation (hw_emu), hardware (hw) +TARGET ?= sw_emu +# v++ flags +VPPFLAGS = -g -I. + +# Put the binary file name here +OUTFILE ?= fpga +# List all the application source files here +GEN_SRC ?= fpga.cc # .cc files + +# You can modify these variables, some will be appended to by the runtime makefile +INC_FLAGS ?= +CC_FLAGS ?= +NVCC_FLAGS ?= +HIPCC_FLAGS ?= +GASNET_FLAGS ?= +LD_FLAGS ?= + +.PHONY: run +run: kernel all + XCL_EMULATION_MODE=$(TARGET) ./fpga -ll:fpga 1 -ll:fpga_size 4 -ll:fpga_xclbin vadd.xclbin + +kernel: vadd.xclbin emconfig + +vadd.xo: vadd.cpp + v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -c -k vadd -o vadd.xo vadd.cpp + +# create 4 CUs +vadd.xclbin: vadd.xo + v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -l --connectivity.nk vadd:4 --connectivity.sp vadd_1.m_axi_gmem:DDR[0] --connectivity.sp vadd_2.m_axi_gmem:DDR[0] --connectivity.sp vadd_3.m_axi_gmem:DDR[0] --connectivity.sp vadd_4.m_axi_gmem:DDR[0] -o vadd.xclbin vadd.xo + +emconfig: emconfig.json +emconfig.json: + emconfigutil --platform $(PLATFORM) + +.PHONY: cleankernel +cleankernel: + rm -rf _x *.xo *.xclbin *.log *.xclbin.* *.xo.* emconfig.json .Xil .run + +.PHONY: cleanall +cleanall: clean cleankernel + +########################################################################### +# +# Don't change anything below here +# +########################################################################### + +include $(LG_RT_DIR)/runtime.mk + diff --git a/examples/fpga/README.md b/examples/fpga/README.md new file mode 100644 index 0000000000..60c727d73e --- /dev/null +++ b/examples/fpga/README.md @@ -0,0 +1,32 @@ +This is an example of using Xilinx FPGAs as accelators in Realm. +=============== +## Prerequisite: +Xilinx XRT 2021.1 and Xilinx Vitis 2021.1. + +This example is tested on an F1 node on AWS. Please refer to `https://github.com/aws/aws-fpga` for more information about using FPGA on AWS. + +## Steps: +1. Create an AWS instance with FPGA Developer AMI (1.11.0 tested) in AWS Marketplace + +2. Get AWS FGPA Development Kit: +``` + git clone https://github.com/aws/aws-fpga.git $AWS_FPGA_REPO_DIR +``` + +3. Set up environment: +``` + source /home/centos/src/project_data/aws-fpga/vitis_setup.sh + source /opt/Xilinx/Vivado/2021.1/settings64.sh + + export LG_RT_DIR=/home/centos/programs/legion_fpga/runtime +``` + +4. Build and run the example +``` + make run +``` + +5. Clean +``` + make cleanall +``` diff --git a/examples/fpga/fpga.cc b/examples/fpga/fpga.cc new file mode 100644 index 0000000000..ddb6b16cac --- /dev/null +++ b/examples/fpga/fpga.cc @@ -0,0 +1,409 @@ +/* Copyright 2021 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "legion.h" +#include +#include +#include +#include "realm/fpga/fpga_utils.h" +#include "realm/fpga/xcl2.hpp" + +using namespace Legion; + +#define DEFAULT_COMP_UNITS 1 + +enum TaskIDs +{ + TOP_LEVEL_TASK_ID, + INIT_FIELD_TASK_ID, + VADD_TASK_ID, + CHECK_TASK_ID, +}; + +enum FieldIDs +{ + FID_X, + FID_Y, + FID_Z, +}; + +#define DATA_TYPE int + +void top_level_task(const Task *task, + const std::vector ®ions, + Context ctx, Runtime *runtime) +{ + int num_elements = 8; + int num_subregions = 4; + // See if we have any command line arguments to parse + // Note we now have a new command line parameter which specifies + // how many subregions we should make. + { + const InputArgs &command_args = Runtime::get_input_args(); + for (int i = 1; i < command_args.argc; i++) + { + if (!strcmp(command_args.argv[i], "-n")) + num_elements = atoi(command_args.argv[++i]); + if (!strcmp(command_args.argv[i], "-b")) + num_subregions = atoi(command_args.argv[++i]); + } + } + printf("Running vadd for %d elements...\n", num_elements); + printf("Partitioning data into %d sub-regions...\n", num_subregions); + + // Create our logical regions using the same schemas as earlier examples + Rect<1> elem_rect(0, num_elements - 1); + IndexSpace is = runtime->create_index_space(ctx, elem_rect); + runtime->attach_name(is, "is"); + FieldSpace input_fs = runtime->create_field_space(ctx); + runtime->attach_name(input_fs, "input_fs"); + { + FieldAllocator allocator = + runtime->create_field_allocator(ctx, input_fs); + allocator.allocate_field(sizeof(DATA_TYPE), FID_X); + runtime->attach_name(input_fs, FID_X, "X"); + allocator.allocate_field(sizeof(DATA_TYPE), FID_Y); + runtime->attach_name(input_fs, FID_Y, "Y"); + } + FieldSpace output_fs = runtime->create_field_space(ctx); + runtime->attach_name(output_fs, "output_fs"); + { + FieldAllocator allocator = + runtime->create_field_allocator(ctx, output_fs); + allocator.allocate_field(sizeof(DATA_TYPE), FID_Z); + runtime->attach_name(output_fs, FID_Z, "Z"); + } + LogicalRegion input_lr = runtime->create_logical_region(ctx, is, input_fs); + runtime->attach_name(input_lr, "input_lr"); + LogicalRegion output_lr = runtime->create_logical_region(ctx, is, output_fs); + runtime->attach_name(output_lr, "output_lr"); + + // In addition to using rectangles and domains for launching index spaces + // of tasks (see example 02), Legion also uses them for performing + // operations on logical regions. Here we create a rectangle and a + // corresponding domain for describing the space of subregions that we + // want to create. Each subregion is assigned a 'color' which is why + // we name the variables 'color_bounds' and 'color_domain'. We'll use + // these below when we partition the region. + Rect<1> color_bounds(0, num_subregions - 1); + IndexSpace color_is = runtime->create_index_space(ctx, color_bounds); + + // Parallelism in Legion is implicit. This means that rather than + // explicitly saying what should run in parallel, Legion applications + // partition up data and tasks specify which regions they access. + // The Legion runtime computes non-interference as a function of + // regions, fields, and privileges and then determines which tasks + // are safe to run in parallel. + // + // Data partitioning is performed on index spaces. The partitioning + // operation is used to break an index space of points into subsets + // of points each of which will become a sub index space. Partitions + // created on an index space are then transitively applied to all the + // logical regions created using the index space. We will show how + // to get names to the subregions later in this example. + // + // Here we want to create the IndexPartition 'ip'. We'll illustrate + // two ways of creating an index partition depending on whether the + // array being partitioned can be evenly partitioned into subsets + // or not. There are other methods to partitioning index spaces + // which are not covered here. We'll cover the case of coloring + // individual points in an index space in our capstone circuit example. + IndexPartition ip = runtime->create_equal_partition(ctx, is, color_is); + runtime->attach_name(ip, "ip"); + + // The index space 'is' was used in creating two logical regions: 'input_lr' + // and 'output_lr'. By creating an IndexPartitiong of 'is' we implicitly + // created a LogicalPartition for each of the logical regions created using + // 'is'. The Legion runtime provides several ways of getting the names for + // these LogicalPartitions. We'll look at one of them here. The + // 'get_logical_partition' method takes a LogicalRegion and an IndexPartition + // and returns the LogicalPartition of the given LogicalRegion that corresponds + // to the given IndexPartition. + LogicalPartition input_lp = runtime->get_logical_partition(ctx, input_lr, ip); + runtime->attach_name(input_lp, "input_lp"); + LogicalPartition output_lp = runtime->get_logical_partition(ctx, output_lr, ip); + runtime->attach_name(output_lp, "output_lp"); + + // Create our launch domain. Note that is the same as color domain + // as we are going to launch one task for each subregion we created. + ArgumentMap arg_map; + + // As in previous examples, we now want to launch tasks for initializing + // both the fields. However, to increase the amount of parallelism + // exposed to the runtime we will launch separate sub-tasks for each of + // the logical subregions created by our partitioning. To express this + // we create an IndexLauncher for launching an index space of tasks + // the same as example 02. + IndexLauncher init_launcher(INIT_FIELD_TASK_ID, color_is, + TaskArgument(NULL, 0), arg_map); + // For index space task launches we don't want to have to explicitly + // enumerate separate region requirements for all points in our launch + // domain. Instead Legion allows applications to place an upper bound + // on privileges required by subtasks and then specify which privileges + // each subtask receives using a projection function. In the case of + // the field initialization task, we say that all the subtasks will be + // using some subregion of the LogicalPartition 'input_lp'. Applications + // may also specify upper bounds using logical regions and not partitions. + // + // The Legion implementation assumes that all all points in an index + // space task launch request non-interfering privileges and for performance + // reasons this is unchecked. This means if two tasks in the same index + // space are accessing aliased data, then they must either both be + // with read-only or reduce privileges. + // + // When the runtime enumerates the launch_domain, it will invoke the + // projection function for each point in the space and use the resulting + // LogicalRegion computed for each point in the index space of tasks. + // The projection ID '0' is reserved and corresponds to the identity + // function which simply zips the space of tasks with the space of + // subregions in the partition. Applications can register their own + // projections functions via the 'register_region_projection' and + // 'register_partition_projection' functions before starting + // the runtime similar to how tasks are registered. + init_launcher.add_region_requirement( + RegionRequirement(input_lp, 0 /*projection ID*/, + WRITE_DISCARD, EXCLUSIVE, input_lr)); + init_launcher.region_requirements[0].add_field(FID_X); + runtime->execute_index_space(ctx, init_launcher); + + // Modify our region requirement to initialize the other field + // in the same way. Note that after we do this we have exposed + // 2*num_subregions task-level parallelism to the runtime because + // we have launched tasks that are both data-parallel on + // sub-regions and task-parallel on accessing different fields. + // The power of Legion is that it allows programmers to express + // these data usage patterns and automatically extracts both + // kinds of parallelism in a unified programming framework. + init_launcher.region_requirements[0].privilege_fields.clear(); + init_launcher.region_requirements[0].instance_fields.clear(); + init_launcher.region_requirements[0].add_field(FID_Y); + runtime->execute_index_space(ctx, init_launcher); + + // We launch the subtasks for performing the vadd computation + // in a similar way to the initialize field tasks. Note we + // again make use of two RegionRequirements which use a + // partition as the upper bound for the privileges for the task. + IndexLauncher vadd_launcher(VADD_TASK_ID, color_is, + TaskArgument(NULL, 0), arg_map); + vadd_launcher.add_region_requirement( + RegionRequirement(input_lp, 0 /*projection ID*/, + READ_ONLY, EXCLUSIVE, input_lr)); + vadd_launcher.region_requirements[0].add_field(FID_X); + vadd_launcher.region_requirements[0].add_field(FID_Y); + vadd_launcher.add_region_requirement( + RegionRequirement(output_lp, 0 /*projection ID*/, + WRITE_DISCARD, EXCLUSIVE, output_lr)); + vadd_launcher.region_requirements[1].add_field(FID_Z); + runtime->execute_index_space(ctx, vadd_launcher); + + // While we could also issue parallel subtasks for the checking + // task, we only issue a single task launch to illustrate an + // important Legion concept. Note the checking task operates + // on the entire 'input_lr' and 'output_lr' regions and not + // on the subregions. Even though the previous tasks were + // all operating on subregions, Legion will correctly compute + // data dependences on all the subtasks that generated the + // data in these two regions. + TaskLauncher check_launcher(CHECK_TASK_ID, TaskArgument(NULL, 0)); + check_launcher.add_region_requirement( + RegionRequirement(input_lr, READ_ONLY, EXCLUSIVE, input_lr)); + check_launcher.region_requirements[0].add_field(FID_X); + check_launcher.region_requirements[0].add_field(FID_Y); + check_launcher.add_region_requirement( + RegionRequirement(output_lr, READ_ONLY, EXCLUSIVE, output_lr)); + check_launcher.region_requirements[1].add_field(FID_Z); + runtime->execute_task(ctx, check_launcher); + + runtime->destroy_logical_region(ctx, input_lr); + runtime->destroy_logical_region(ctx, output_lr); + runtime->destroy_field_space(ctx, input_fs); + runtime->destroy_field_space(ctx, output_fs); + runtime->destroy_index_space(ctx, is); + runtime->destroy_index_space(ctx, color_is); +} + +void init_field_task(const Task *task, + const std::vector ®ions, + Context ctx, Runtime *runtime) +{ + assert(regions.size() == 1); + assert(task->regions.size() == 1); + assert(task->regions[0].privilege_fields.size() == 1); + + FieldID fid = *(task->regions[0].privilege_fields.begin()); + const int point = task->index_point.point_data[0]; + printf("Initializing field %d for block %d... with ( ", fid, point); + + const FieldAccessor acc(regions[0], fid); + // Note here that we get the domain for the subregion for + // this task from the runtime which makes it safe for running + // both as a single task and as part of an index space of tasks. + Rect<1> rect = runtime->get_index_space_domain(ctx, + task->regions[0].region.get_index_space()); + for (PointInRectIterator<1> pir(rect); pir(); pir++) + { + int rand = drand48() * 100; + acc[*pir] = rand; + printf("%d ", rand); + } + printf(")\n"); +} + +void vadd_task(const Task *task, + const std::vector ®ions, + Context ctx, Runtime *runtime) +{ + assert(regions.size() == 2); + assert(task->regions.size() == 2); + const int point = task->index_point.point_data[0]; + + const FieldAccessor> acc_x(regions[0], FID_X); + const FieldAccessor> acc_y(regions[0], FID_Y); + const FieldAccessor> acc_z(regions[1], FID_Z); + printf("Running vadd computation for point %d...\n", point); + Rect<1> rect = runtime->get_index_space_domain(ctx, + task->regions[0].region.get_index_space()); + printf("rect.volume = %lu, acc_x.ptr = %p, acc_y.ptr = %p, acc_z.ptr = %p \n", rect.volume(), acc_x.accessor.ptr(rect.lo), acc_y.accessor.ptr(rect.lo), acc_z.accessor.ptr(rect.lo)); + // for (PointInRectIterator<1> pir(rect); pir(); pir++) + // acc_z[*pir] = acc_x[*pir] + acc_y[*pir]; + + size_t data_size = rect.volume(); + int num_cu = DEFAULT_COMP_UNITS; + std::vector krnls(num_cu); + cl::Program program = FPGAGetCurrentProgram(); + cl_int err; + // Creating Kernel objects + for (int i = 0; i < num_cu; i++) + { + OCL_CHECK(err, krnls[i] = cl::Kernel(program, "vadd", &err)); + } + + // Creating sub-buffers + cl::Buffer device_buff = FPGAGetCurrentBuffer(); + void *base_ptr_sys = FPGAGetBasePtrSys(); + auto chunk_size = data_size / num_cu; + size_t vector_size_bytes = sizeof(int) * chunk_size; + std::vector buffer_in1(num_cu); + std::vector buffer_in2(num_cu); + std::vector buffer_output(num_cu); + + for (int i = 0; i < num_cu; i++) { + cl_buffer_region buffer_in1_info = {(uint64_t)acc_x.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_in1[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err)); + cl_buffer_region buffer_in2_info = {(uint64_t)acc_y.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_in2[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in2_info, &err)); + cl_buffer_region buffer_output_info = {(uint64_t)acc_z.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_output[i] = device_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err)); + } + + for (int i = 0; i < num_cu; i++) { + int narg = 0; + + // Setting kernel arguments + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size)); + } + + cl::Event task_events[num_cu]; + cl::CommandQueue command_queue = FPGAGetCurrentCommandQueue(); + for (int i = 0; i < num_cu; i++) { + // Launch the kernel + OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i])); + } + + std::vector wait_events[num_cu]; + // Copy result from device global memory to host local memory + for (int i = 0; i < num_cu; i++) { + wait_events[i].push_back(task_events[i]); + OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr)); + } + // OCL_CHECK(err, err = command_queue.finish()); + OCL_CHECK(err, err = command_queue.flush()); + + printf("fpga kernels flushed\n"); +} + +void check_task(const Task *task, + const std::vector ®ions, + Context ctx, Runtime *runtime) +{ + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + const FieldAccessor acc_x(regions[0], FID_X); + const FieldAccessor acc_y(regions[0], FID_Y); + const FieldAccessor acc_z(regions[1], FID_Z); + + Rect<1> rect = runtime->get_index_space_domain(ctx, + task->regions[0].region.get_index_space()); + printf("Checking results...\n"); + bool all_passed = true; + for (PointInRectIterator<1> pir(rect); pir(); pir++) + { + DATA_TYPE expected = acc_x[*pir] + acc_y[*pir]; + DATA_TYPE received = acc_z[*pir]; + // Probably shouldn't check for floating point equivalence but + // the order of operations are the same should they should + // be bitwise equal. + if (expected != received) + { + all_passed = false; + } + printf("expected = %d received = %d\n", expected, received); + } + if (all_passed) + printf("SUCCESS!\n"); + else + printf("FAILURE!\n"); +} + +int main(int argc, char **argv) +{ + Runtime::set_top_level_task_id(TOP_LEVEL_TASK_ID); + + { + TaskVariantRegistrar registrar(TOP_LEVEL_TASK_ID, "top_level"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + Runtime::preregister_task_variant(registrar, "top_level"); + } + + { + TaskVariantRegistrar registrar(INIT_FIELD_TASK_ID, "init_field"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant(registrar, "init_field"); + } + + { + TaskVariantRegistrar registrar(VADD_TASK_ID, "vadd"); + registrar.add_constraint(ProcessorConstraint(Processor::FPGA_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant(registrar, "vadd"); + } + + { + TaskVariantRegistrar registrar(CHECK_TASK_ID, "check"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + Runtime::preregister_task_variant(registrar, "check"); + } + + return Runtime::start(argc, argv); +} diff --git a/examples/fpga/vadd.cpp b/examples/fpga/vadd.cpp new file mode 100644 index 0000000000..81b95028f6 --- /dev/null +++ b/examples/fpga/vadd.cpp @@ -0,0 +1,45 @@ +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ + +// TRIPCOUNT indentifier +const unsigned int c_min = 4096; +const unsigned int c_max = 4 * 1024 * 1024; + +/* + Vector Addition Kernel Implementation + Arguments: + in1 (input) --> Input Vector1 + in2 (input) --> Input Vector2 + out_r (output) --> Output Vector + size (input) --> Size of Vector in Integer +*/ + +extern "C" { +void vadd(const int* in1, // Read-Only Vector 1 + const int* in2, // Read-Only Vector 2 + int* out_r, // Output Result + int size // Size in integer + ) { +// Unoptimized vector addition kernel to increase the kernel execution time +// Large execution time required to showcase parallel execution of multiple +// compute units in this example. +vadd1: + for (int i = 0; i < size; i++) { +#pragma HLS LOOP_TRIPCOUNT min = c_min max = c_max + out_r[i] = in1[i] + in2[i]; + } +} +} diff --git a/examples/realm_fpga/Makefile b/examples/realm_fpga/Makefile new file mode 100644 index 0000000000..16e55b3e41 --- /dev/null +++ b/examples/realm_fpga/Makefile @@ -0,0 +1,158 @@ +# Copyright 2021 Stanford University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +ifndef LG_RT_DIR +$(error LG_RT_DIR variable is not defined, aborting build) +endif +# vpath stuff below doesn't like a trailing slash in LG_RT_DIR +override LG_RT_DIR := $(patsubst %/,%,$(LG_RT_DIR)) + +# Flags for directing the runtime makefile what to include +DEBUG ?= 1 # Include debugging symbols +MAX_DIM ?= 3 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 0 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 0 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Include FPGA support (requires Xilinx FPGA) +USE_FPGA ?= 1 +# Path to the Xilinx platform definition file +PLATFORM ?= /home/centos/src/project_data/aws-fpga/Vitis/aws_platform/xilinx_aws-vu9p-f1_shell-v04261818_201920_2/xilinx_aws-vu9p-f1_shell-v04261818_201920_2.xpfm +# FPGA kenrel compilation target: Software emulation (sw_emu), hardware emulation (hw_emu), hardware (hw) +TARGET ?= sw_emu +# v++ flags +VPPFLAGS = -g -I. + + +# Put the binary file name here +OUTFILE := +# List all the application source files here +GEN_SRC := # .cc files +GEN_GPU_SRC := # .cu files + +# You can modify these variables, some will be appended to by the runtime makefile +INC_FLAGS ?= +CC_FLAGS ?= +NVCC_FLAGS ?= +GASNET_FLAGS ?= +LD_FLAGS ?= +CXX ?= g++ + +# we're going to include runtime.mk to get variable settings, but then +# do our own build steps +NO_BUILD_RULES=1 +include $(LG_RT_DIR)/runtime.mk + +TESTS := fpga_vadd + +# can set arguments to be passed to a test when running +TESTARGS_fpga_vadd := -ll:fpga 1 -ll:fpga_size 4 -ll:fpga_xclbin vadd.xclbin -level fpga=1 -level app=1 + +REALM_OBJS := $(patsubst %.cc,%.o,$(notdir $(REALM_SRC))) \ + $(patsubst %.cc.o,%.o,$(notdir $(REALM_INST_OBJS))) \ + $(patsubst %.S,%.o,$(notdir $(ASM_SRC))) +EMPTY := +SPACE := $(EMPTY) $(EMPTY) +RUNTIME_VPATH := $(subst $(SPACE),:,$(sort $(dir $(REALM_SRC)))) +vpath %.cc .:$(RUNTIME_VPATH) +vpath %.S .:$(RUNTIME_VPATH) + +REALM_LIB := librealm.a + +TEST_OBJS := $(TESTS:%=%.o) + +.PHONY: run_all +run_all : $(TESTS:%=run_%) + +.PHONY: run_% +run_% : % vadd.xclbin emconfig + @# this echos exactly once, even if -s was specified + @echo XCL_EMULATION_MODE=$(TARGET) $(LAUNCHER) ./$* $(TESTARGS_$*) + @XCL_EMULATION_MODE=$(TARGET) $(LAUNCHER) ./$* $(TESTARGS_$*) + +.PHONY: build +build : $(TESTS) vadd.xclbin emconfig + +.PHONY: cleankernel +cleankernel: + rm -rf _x *.xo *.xclbin *.log *.xclbin.* *.xo.* emconfig.json .Xil .run + +.PHONY: clean +clean : cleankernel + rm -f $(REALM_LIB) $(REALM_OBJS) $(TESTS) $(TEST_OBJS) + rm -f legion_defines.h realm_defines.h + +$(TESTS) : % : %.o librealm.a + $(CXX) -o $@ $< -L. -lrealm $(LEGION_LD_FLAGS) $(LD_FLAGS) + +$(REALM_LIB) : $(REALM_OBJS) + rm -f $(REALM_LIB) + ar rc $(REALM_LIB) $(REALM_OBJS) + +%.o : %.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER) + $(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(CC_FLAGS) + +$(REALM_OBJS) : CC_FLAGS+=$(REALM_SYMBOL_VISIBILITY) + +%.o : %.S + $(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(CC_FLAGS) + +# deppart-related stuff +ifneq ($(USE_PGI),1) +image_%.o : image_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER) + $(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*)) + +preimage_%.o : preimage_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER) + $(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*)) + +byfield_%.o : byfield_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER) + $(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*)) +else +# nvc++ names some symbols based on the source filename, so the trick above +# of compiling multiple things from the same template with different defines +# causes linker errors - work around by generating a different source file for +# each case, but don't leave them lying around +image_%.cc : + echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@ + echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@ + echo '#include' '"realm/deppart/image_tmpl.cc"' >> $@ + +preimage_%.cc : + echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@ + echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@ + echo '#include' '"realm/deppart/preimage_tmpl.cc"' >> $@ + +byfield_%.cc : + echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@ + echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@ + echo '#include' '"realm/deppart/byfield_tmpl.cc"' >> $@ + +.INTERMEDIATE: $(patsubst %.cc.o,%.cc,$(notdir $(REALM_INST_OBJS))) +endif + +# Create FPGA kernel (vadd with 4 CUs) +vadd.xo: vadd.cpp + v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -c -k vadd -o vadd.xo vadd.cpp + +vadd.xclbin: vadd.xo + v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -l --connectivity.nk vadd:4 --connectivity.sp vadd_1.m_axi_gmem:DDR[0] --connectivity.sp vadd_2.m_axi_gmem:DDR[0] --connectivity.sp vadd_3.m_axi_gmem:DDR[0] --connectivity.sp vadd_4.m_axi_gmem:DDR[0] -o vadd.xclbin vadd.xo + +# Create emulation config file +emconfig: emconfig.json +emconfig.json: + emconfigutil --platform $(PLATFORM) diff --git a/examples/realm_fpga/README.md b/examples/realm_fpga/README.md new file mode 100644 index 0000000000..4b6acfd66c --- /dev/null +++ b/examples/realm_fpga/README.md @@ -0,0 +1,33 @@ +This is an example of using Xilinx FPGAs as accelators in Realm. +=============== +## Prerequisite: +Xilinx XRT 2021.1 and Xilinx Vitis 2021.1. + +This example is tested on an F1 node on AWS. Please refer to `https://github.com/aws/aws-fpga` for more information about using FPGA on AWS. + +## Steps: +1. Create an AWS instance with FPGA Developer AMI (1.11.0 tested) in AWS Marketplace + +2. Get AWS FGPA Development Kit: +``` + git clone https://github.com/aws/aws-fpga.git $AWS_FPGA_REPO_DIR +``` + +3. Set up environment: +``` + source /home/centos/src/project_data/aws-fpga/vitis_setup.sh + source /opt/Xilinx/Vivado/2021.1/settings64.sh + + export LG_RT_DIR=/home/centos/programs/legion_fpga/runtime +``` + +4. Build and run the example +``` + make build + make run_fpga_vadd +``` + +5. Clean +``` + make clean +``` diff --git a/examples/realm_fpga/fpga_vadd.cc b/examples/realm_fpga/fpga_vadd.cc new file mode 100644 index 0000000000..5b9e6b051a --- /dev/null +++ b/examples/realm_fpga/fpga_vadd.cc @@ -0,0 +1,317 @@ +#include +#include +#include +#include //sleep + +#include "realm.h" +#include "realm/fpga/fpga_utils.h" + +#include "realm/fpga/xcl2.hpp" + +using namespace Realm; + +#define DEFAULT_COMP_UNITS 1 +#define DATA_SIZE 12 + +enum +{ + FID_X = 101, + FID_Y = 102, + FID_Z = 103, +}; + +// execute a task on FPGA Processor +Logger log_app("app"); + +enum +{ + TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0, + FPGA_TASK, +}; + +struct FPGAArgs +{ + RegionInstance x_inst, y_inst, z_inst; + Rect<1> bounds; +}; + +void fpga_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "fpga task started"; + const FPGAArgs &local_args = *(const FPGAArgs *)args; + + // get affine accessors for each of our three instances + AffineAccessor ra_x = AffineAccessor(local_args.x_inst, + FID_X); + AffineAccessor ra_y = AffineAccessor(local_args.y_inst, + FID_Y); + AffineAccessor ra_z = AffineAccessor(local_args.z_inst, + FID_Z); + size_t data_size = local_args.bounds.volume(); + + int num_cu = DEFAULT_COMP_UNITS; + std::vector krnls(num_cu); + cl::Program program = FPGAGetCurrentProgram(); + cl_int err; + // Creating Kernel objects + for (int i = 0; i < num_cu; i++) + { + OCL_CHECK(err, krnls[i] = cl::Kernel(program, "vadd", &err)); + } + + // Creating sub-buffers + cl::Buffer device_buff = FPGAGetCurrentBuffer(); + void *base_ptr_sys = FPGAGetBasePtrSys(); + auto chunk_size = data_size / num_cu; + size_t vector_size_bytes = sizeof(int) * chunk_size; + std::vector buffer_in1(num_cu); + std::vector buffer_in2(num_cu); + std::vector buffer_output(num_cu); + + for (int i = 0; i < num_cu; i++) { + cl_buffer_region buffer_in1_info = {(uint64_t)ra_x.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_in1[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err)); + cl_buffer_region buffer_in2_info = {(uint64_t)ra_y.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_in2[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in2_info, &err)); + cl_buffer_region buffer_output_info = {(uint64_t)ra_z.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_output[i] = device_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err)); + log_app.info() << "buffer_output_info " << buffer_output_info.origin << " " << buffer_output_info.size; + } + + for (int i = 0; i < num_cu; i++) { + int narg = 0; + + // Setting kernel arguments + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size)); + } + + cl::Event task_events[num_cu]; + cl::CommandQueue command_queue = FPGAGetCurrentCommandQueue(); + for (int i = 0; i < num_cu; i++) { + // Launch the kernel + OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i])); + } + + std::vector wait_events[num_cu]; + // Copy result from device global memory to host local memory + for (int i = 0; i < num_cu; i++) { + wait_events[i].push_back(task_events[i]); + OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr)); + } + // OCL_CHECK(err, err = command_queue.finish()); + OCL_CHECK(err, err = command_queue.flush()); + + log_app.print() << "fpga kernels flushed"; +} + +void top_level_task(const void *args, size_t arglen, + const void *userdata, size_t userlen, Processor p) +{ + log_app.print() << "top task running on " << p; + Machine machine = Machine::get_machine(); + std::set all_processors; + machine.get_all_processors(all_processors); + for (std::set::const_iterator it = all_processors.begin(); + it != all_processors.end(); + it++) + { + Processor pp = (*it); + if (pp.kind() == Processor::FPGA_PROC) + { + Memory cpu_mem = Memory::NO_MEMORY; + Memory fpga_mem = Memory::NO_MEMORY; + std::set visible_mems; + machine.get_visible_memories(pp, visible_mems); + for (std::set::const_iterator it = visible_mems.begin(); + it != visible_mems.end(); it++) + { + if (it->kind() == Memory::FPGA_MEM) + { + fpga_mem = *it; + log_app.print() << "fpga memory: " << *it << " capacity=" + << (it->capacity() >> 20) << " MB"; + } + if (it->kind() == Memory::SYSTEM_MEM) + { + cpu_mem = *it; + log_app.print() << "sys memory: " << *it << " capacity=" + << (it->capacity() >> 20) << " MB"; + } + } + + int init_x_value = 1; + int init_y_value = 2; + int init_z_value = 9; + + Rect<1> bounds(0, DATA_SIZE - 1); + + std::map field_sizes; + field_sizes[FID_X] = sizeof(int); + field_sizes[FID_Y] = sizeof(int); + field_sizes[FID_Z] = sizeof(int); + + RegionInstance cpu_inst; + RegionInstance::create_instance(cpu_inst, cpu_mem, + bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + log_app.print() << "created cpu memory instance: " << cpu_inst; + + CopySrcDstField cpu_x_field, cpu_y_field, cpu_z_field; + cpu_x_field.inst = cpu_inst; + cpu_x_field.field_id = FID_X; + cpu_x_field.size = sizeof(int); + + cpu_y_field.inst = cpu_inst; + cpu_y_field.field_id = FID_Y; + cpu_y_field.size = sizeof(int); + + cpu_z_field.inst = cpu_inst; + cpu_z_field.field_id = FID_Z; + cpu_z_field.size = sizeof(int); + + RegionInstance fpga_inst; + RegionInstance::create_instance(fpga_inst, fpga_mem, + bounds, field_sizes, + 0 /*SOA*/, ProfilingRequestSet()) + .wait(); + log_app.print() << "created fpga memory instance: " << fpga_inst; + + CopySrcDstField fpga_x_field, fpga_y_field, fpga_z_field; + fpga_x_field.inst = fpga_inst; + fpga_x_field.field_id = FID_X; + fpga_x_field.size = sizeof(int); + + fpga_y_field.inst = fpga_inst; + fpga_y_field.field_id = FID_Y; + fpga_y_field.size = sizeof(int); + + fpga_z_field.inst = fpga_inst; + fpga_z_field.field_id = FID_Z; + fpga_z_field.size = sizeof(int); + + AffineAccessor fpga_ra_x = AffineAccessor(fpga_inst, FID_X); + AffineAccessor cpu_ra_y = AffineAccessor(cpu_inst, FID_Y); + AffineAccessor fpga_ra_y = AffineAccessor(fpga_inst, FID_Y); + + //Test fill: fill fpga memory directly + Event fill_x; + { + std::vector fill_vec; + fill_vec.push_back(fpga_x_field); + fill_x = bounds.fill(fill_vec, ProfilingRequestSet(), + &init_x_value, sizeof(init_x_value)); + } + fill_x.wait(); + + Event fill_z; + { + std::vector fill_vec; + fill_vec.push_back(fpga_z_field); + fill_z = bounds.fill(fill_vec, ProfilingRequestSet(), + &init_z_value, sizeof(init_z_value)); + } + fill_z.wait(); + + printf("fpga_ra_x:\n"); + for (int i = bounds.lo; i <= bounds.hi; i++) + { + printf("%d ", fpga_ra_x[i]); + } + printf("\n"); + + // fill cpu mem and copy to fpga mem + Event fill_y_cpu; + { + std::vector fill_vec; + fill_vec.push_back(cpu_y_field); + fill_y_cpu = bounds.fill(fill_vec, ProfilingRequestSet(), + &init_y_value, sizeof(init_y_value)); + } + fill_y_cpu.wait(); + + Event copy_y; + { + std::vector srcs, dsts; + srcs.push_back(cpu_y_field); + dsts.push_back(fpga_y_field); + copy_y = bounds.copy(srcs, dsts, ProfilingRequestSet()); + } + copy_y.wait(); + + printf("cpu_ra_y:\n"); + for (int i = bounds.lo; i <= bounds.hi; i++) + { + printf("%d ", cpu_ra_y[i]); + } + printf("\n"); + printf("fpga_ra_y:\n"); + for (int i = bounds.lo; i <= bounds.hi; i++) + { + printf("%d ", fpga_ra_y[i]); + } + printf("\n"); + + FPGAArgs fpga_args; + fpga_args.x_inst = fpga_inst; + fpga_args.y_inst = fpga_inst; + fpga_args.z_inst = fpga_inst; + fpga_args.bounds = bounds; + Event e = pp.spawn(FPGA_TASK, &fpga_args, sizeof(fpga_args)); + + // Copy back + Event z_ready; + { + std::vector srcs, dsts; + srcs.push_back(fpga_z_field); + dsts.push_back(cpu_z_field); + z_ready = bounds.copy(srcs, dsts, ProfilingRequestSet(), e); + } + z_ready.wait(); + + AffineAccessor ra_z = AffineAccessor(cpu_inst, FID_Z); + for (int i = bounds.lo; i <= bounds.hi; i++) + { + printf("%d ", ra_z[i]); + } + printf("\n"); + } + } + + log_app.print() << "all done!"; +} + +int main(int argc, char **argv) +{ + // sleep(20); + Runtime rt; + rt.init(&argc, &argv); + rt.register_task(TOP_LEVEL_TASK, top_level_task); + + Processor::register_task_by_kind(Processor::FPGA_PROC, false /*!global*/, + FPGA_TASK, + CodeDescriptor(fpga_task), + ProfilingRequestSet()) + .wait(); + + // select a processor to run the top level task on + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + assert(p.exists()); + + // collective launch of a single task - everybody gets the same finish event + Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0); + + // request shutdown once that task is complete + rt.shutdown(e); + + // now sleep this thread until that shutdown actually happens + rt.wait_for_shutdown(); + + return 0; +} diff --git a/examples/realm_fpga/vadd.cpp b/examples/realm_fpga/vadd.cpp new file mode 100644 index 0000000000..81b95028f6 --- /dev/null +++ b/examples/realm_fpga/vadd.cpp @@ -0,0 +1,45 @@ +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ + +// TRIPCOUNT indentifier +const unsigned int c_min = 4096; +const unsigned int c_max = 4 * 1024 * 1024; + +/* + Vector Addition Kernel Implementation + Arguments: + in1 (input) --> Input Vector1 + in2 (input) --> Input Vector2 + out_r (output) --> Output Vector + size (input) --> Size of Vector in Integer +*/ + +extern "C" { +void vadd(const int* in1, // Read-Only Vector 1 + const int* in2, // Read-Only Vector 2 + int* out_r, // Output Result + int size // Size in integer + ) { +// Unoptimized vector addition kernel to increase the kernel execution time +// Large execution time required to showcase parallel execution of multiple +// compute units in this example. +vadd1: + for (int i = 0; i < size; i++) { +#pragma HLS LOOP_TRIPCOUNT min = c_min max = c_max + out_r[i] = in1[i] + in2[i]; + } +} +} diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index ade87017aa..5184b8eae2 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -12313,6 +12313,11 @@ namespace Legion { LegionSpy::log_processor_kind(kind, "Python"); break; } + case Processor::FPGA_PROC: + { + LegionSpy::log_processor_kind(kind, "FPGA"); + break; + } default: assert(false); // unknown processor kind } diff --git a/runtime/mappers/default_mapper.cc b/runtime/mappers/default_mapper.cc index ec2920bb1c..049d8cd572 100644 --- a/runtime/mappers/default_mapper.cc +++ b/runtime/mappers/default_mapper.cc @@ -59,13 +59,15 @@ namespace Legion { own ? name : strdup(name)), next_local_gpu(0), next_local_cpu(0), next_local_io(0), next_local_procset(0), next_local_omp(0), next_local_py(0), + next_local_fpga(0), next_global_gpu(Processor::NO_PROC), next_global_cpu(Processor::NO_PROC), next_global_io(Processor::NO_PROC), next_global_procset(Processor::NO_PROC), next_global_omp(Processor::NO_PROC), next_global_py(Processor::NO_PROC), + next_global_fpga(Processor::NO_PROC), global_gpu_query(NULL), global_cpu_query(NULL), global_io_query(NULL), global_procset_query(NULL), global_omp_query(NULL), - global_py_query(NULL), + global_py_query(NULL), global_fpga_query(NULL), max_steals_per_theft(STATIC_MAX_PERMITTED_STEALS), max_steal_count(STATIC_MAX_STEAL_COUNT), breadth_first_traversal(STATIC_BREADTH_FIRST), @@ -155,6 +157,11 @@ namespace Legion { local_omps.push_back(*it); break; } + case Processor::FPGA_PROC: + { + local_fpgas.push_back(*it); + break; + } default: // ignore anything else break; } @@ -215,6 +222,15 @@ namespace Legion { remote_omps[node] = *it; break; } + case Processor::FPGA_PROC: + { + // See if we already have a target FPGA processor for this node + if (node >= remote_fpgas.size()) + remote_fpgas.resize(node+1, Processor::NO_PROC); + if (!remote_fpgas[node].exists()) + remote_fpgas[node] = *it; + break; + } default: // ignore anything else break; } @@ -282,6 +298,19 @@ namespace Legion { } } if (total_nodes == 0) total_nodes = remote_pys.size(); + } + if (!local_fpgas.empty()) { + for (unsigned idx = 0; idx < remote_fpgas.size(); idx++) { + if (idx == node_id) continue; // ignore our own node + if (!remote_fpgas[idx].exists()) { + log_mapper.error("Default mapper has FPGA procs on node %d, but " + "could not detect FPGA procs on node %d. The " + "current default mapper implementation assumes " + "symmetric heterogeneity.", node_id, idx); + assert(false); + } + } + if (total_nodes == 0) total_nodes = remote_fpgas.size(); } // Initialize our random number generator const size_t short_bits = 8*sizeof(unsigned short); @@ -403,6 +432,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::FPGA_PROC: + return default_get_next_local_fpga(); default: // make warnings go away break; } @@ -432,6 +463,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::FPGA_PROC: + return default_get_next_local_fpga(); default: // make warnings go away break; } @@ -457,6 +490,8 @@ namespace Legion { return default_get_next_global_omp(); case Processor::PY_PROC: return default_get_next_global_py(); + case Processor::FPGA_PROC: + return default_get_next_global_fpga(); default: // make warnings go away break; } @@ -479,6 +514,8 @@ namespace Legion { return default_get_next_local_omp(); case Processor::PY_PROC: return default_get_next_local_py(); + case Processor::FPGA_PROC: + return default_get_next_local_fpga(); default: // make warnings go away break; } @@ -691,6 +728,38 @@ namespace Legion { return result; } + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_local_fpga(void) + //-------------------------------------------------------------------------- + { + Processor result = local_fpgas[next_local_fpga++]; + if (next_local_fpga == local_fpgas.size()) + next_local_fpga = 0; + return result; + } + + //-------------------------------------------------------------------------- + Processor DefaultMapper::default_get_next_global_fpga(void) + //-------------------------------------------------------------------------- + { + if (total_nodes == 1) + return default_get_next_local_fpga(); + if (!next_global_fpga.exists()) + { + global_fpga_query = new Machine::ProcessorQuery(machine); + global_fpga_query->only_kind(Processor::FPGA_PROC); + next_global_fpga = global_fpga_query->first(); + } + Processor result = next_global_fpga; + next_global_fpga = global_fpga_query->next(result); + if (!next_global_fpga.exists()) + { + delete global_fpga_query; + global_fpga_query = NULL; + } + return result; + } + //-------------------------------------------------------------------------- DefaultMapper::VariantInfo DefaultMapper::default_find_preferred_variant( const Task &task, MapperContext ctx, @@ -799,6 +868,13 @@ namespace Legion { continue; break; } + case Processor::FPGA_PROC: + { + kindString += "FPGA_PROC "; + if (local_fpgas.empty()) + continue; + break; + } default: assert(false); // unknown processor type } @@ -922,8 +998,8 @@ namespace Legion { //-------------------------------------------------------------------------- { // Default mapper is ignorant about task IDs so just do whatever: - // 1) GPU > OMP > procset > cpu > IO > Python (default) - // 2) OMP > procset > cpu > IO > Python > GPU (with PREFER_CPU_VARIANT) + // 1) GPU > OMP > procset > cpu > IO > Python > FPGA (default) + // 2) OMP > procset > cpu > IO > Python > GPU > FPGA (with PREFER_CPU_VARIANT) // It is up to the caller to filter out processor kinds that aren't // suitable for a given task bool prefer_cpu = ((task.tag & PREFER_CPU_VARIANT) != 0); @@ -936,6 +1012,7 @@ namespace Legion { if (local_pys.size() > 0) ranking.push_back(Processor::PY_PROC); if ((local_gpus.size() > 0) && prefer_cpu) ranking.push_back(Processor::TOC_PROC); + if (local_fpgas.size() > 0) ranking.push_back(Processor::FPGA_PROC); } //-------------------------------------------------------------------------- @@ -1095,6 +1172,23 @@ namespace Legion { } break; } + case Processor::FPGA_PROC: + { + if (task.index_domain.get_volume() > local_fpgas.size()) + { + if (!global_memory.exists()) + { + log_mapper.error("Default mapper failure. No memory found " + "for FPGA task %s (ID %lld) which is visible " + "for all point in the index space.", + task.get_task_name(), task.get_unique_id()); + assert(false); + } + else + target_memory = global_memory; + } + break; + } default: assert(false); // unrecognized processor kind } @@ -1137,6 +1231,7 @@ namespace Legion { case Processor::PROC_SET: case Processor::OMP_PROC: case Processor::PY_PROC: + case Processor::FPGA_PROC: //TODO: use system mem for FPGA { visible_memories.only_kind(Memory::SYSTEM_MEM); if (visible_memories.count() == 0) @@ -1284,6 +1379,12 @@ namespace Legion { input, output, omp_slices_cache); break; } + case Processor::FPGA_PROC: + { + default_slice_task(task, local_fpgas, remote_fpgas, + input, output, fpga_slices_cache); + break; + } default: assert(false); // unimplemented processor kind } @@ -1747,6 +1848,16 @@ namespace Legion { } break; } + case Processor::FPGA_PROC: + { + // TODO: + if (!task.must_epoch_task) + target_procs.insert(target_procs.end(), + local_fpgas.begin(), local_fpgas.end()); + else + target_procs.push_back(task.target_proc); + break; + } default: assert(false); // unrecognized processor kind } @@ -3168,6 +3279,11 @@ namespace Legion { *result = local_pys.size(); break; } + case DEFAULT_TUNABLE_LOCAL_FPGAS: + { + *result = local_fpgas.size(); + break; + } case DEFAULT_TUNABLE_GLOBAL_GPUS: { // TODO: deal with machine asymmetry here @@ -3198,6 +3314,12 @@ namespace Legion { *result = (local_pys.size() * total_nodes); break; } + case DEFAULT_TUNABLE_GLOBAL_FPGAS: + { + // TODO: deal with machine asymmetry here + *result = (local_fpgas.size() * total_nodes); + break; + } default: { log_mapper.error("Default mapper error. Unrecognized tunable ID %d " @@ -3524,6 +3646,15 @@ namespace Legion { } break; } + case Processor::FPGA_PROC: + { + if (local_fpgas.empty()) + { + ++it; + continue; + } + break; + } default: assert(false); // unknown processor kind } diff --git a/runtime/mappers/default_mapper.h b/runtime/mappers/default_mapper.h index 6c0c458301..cc17c4bce8 100644 --- a/runtime/mappers/default_mapper.h +++ b/runtime/mappers/default_mapper.h @@ -46,12 +46,14 @@ namespace Legion { DEFAULT_TUNABLE_LOCAL_IOS = 3, DEFAULT_TUNABLE_LOCAL_OMPS = 4, DEFAULT_TUNABLE_LOCAL_PYS = 5, - DEFAULT_TUNABLE_GLOBAL_CPUS = 6, - DEFAULT_TUNABLE_GLOBAL_GPUS = 7, - DEFAULT_TUNABLE_GLOBAL_IOS = 8, - DEFAULT_TUNABLE_GLOBAL_OMPS = 9, - DEFAULT_TUNABLE_GLOBAL_PYS = 10, - DEFAULT_TUNABLE_LAST = 11, // this one must always be last and unused + DEFAULT_TUNABLE_LOCAL_FPGAS = 6, + DEFAULT_TUNABLE_GLOBAL_CPUS = 7, + DEFAULT_TUNABLE_GLOBAL_GPUS = 8, + DEFAULT_TUNABLE_GLOBAL_IOS = 9, + DEFAULT_TUNABLE_GLOBAL_OMPS = 10, + DEFAULT_TUNABLE_GLOBAL_PYS = 11, + DEFAULT_TUNABLE_GLOBAL_FPGAS = 12, + DEFAULT_TUNABLE_LAST = 13, // this one must always be last and unused }; enum MappingKind { TASK_MAPPING, @@ -383,6 +385,8 @@ namespace Legion { Processor default_get_next_global_procset(void); Processor default_get_next_local_omp(void); Processor default_get_next_global_omp(void); + Processor default_get_next_local_fpga(void); + Processor default_get_next_global_fpga(void); VariantInfo default_find_preferred_variant( const Task &task, MapperContext ctx, bool needs_tight_bound, bool cache = true, @@ -472,12 +476,14 @@ namespace Legion { std::vector local_procsets; std::vector local_omps; std::vector local_pys; + std::vector local_fpgas; std::vector remote_gpus; std::vector remote_cpus; std::vector remote_ios; std::vector remote_procsets; std::vector remote_omps; std::vector remote_pys; + std::vector remote_fpgas; // multipleNumaDomainsPresent is set to true when the target machine // has multiple separate NUMA memories (SOCKET_MEM). This controls how // processor selection for NUMA aware allocations is performed. @@ -485,20 +491,24 @@ namespace Legion { protected: // For doing round-robining of tasks onto processors unsigned next_local_gpu, next_local_cpu, next_local_io, - next_local_procset, next_local_omp, next_local_py; + next_local_procset, next_local_omp, next_local_py, + next_local_fpga; Processor next_global_gpu, next_global_cpu, next_global_io, - next_global_procset, next_global_omp, next_global_py; + next_global_procset, next_global_omp, next_global_py, + next_global_fpga; Machine::ProcessorQuery *global_gpu_query, *global_cpu_query, *global_io_query, *global_procset_query, - *global_omp_query, *global_py_query; - protected: + *global_omp_query, *global_py_query, + *global_fpga_query; + protected: // Cached mapping information about the application std::map > gpu_slices_cache, cpu_slices_cache, io_slices_cache, procset_slices_cache, omp_slices_cache, - py_slices_cache; + py_slices_cache, + fpga_slices_cache; std::map, VariantInfo> preferred_variants; std::map, diff --git a/runtime/mappers/mapping_utilities.cc b/runtime/mappers/mapping_utilities.cc index 5520e21a8a..18ef032c1f 100644 --- a/runtime/mappers/mapping_utilities.cc +++ b/runtime/mappers/mapping_utilities.cc @@ -1063,6 +1063,7 @@ namespace Legion { case Processor::PROC_SET: return "PROC_SET"; case Processor::OMP_PROC: return "OMP_PROC"; case Processor::PY_PROC: return "PY_PROC"; + case Processor::FPGA_PROC: return "FPGA_PROC"; default: assert(false); return ""; } } @@ -1084,6 +1085,7 @@ namespace Legion { case Memory::LEVEL3_CACHE: return "LEVEL3_CACHE"; case Memory::LEVEL2_CACHE: return "LEVEL2_CACHE"; case Memory::LEVEL1_CACHE: return "LEVEL1_CACHE"; + case Memory::FPGA_MEM: return "FPGA_MEM"; default: assert(false); return ""; } } diff --git a/runtime/realm/fpga/fpga_module.cc b/runtime/realm/fpga/fpga_module.cc new file mode 100644 index 0000000000..f3bbf570aa --- /dev/null +++ b/runtime/realm/fpga/fpga_module.cc @@ -0,0 +1,1825 @@ +#include "realm/fpga/fpga_module.h" + +#include "realm/logging.h" +#include "realm/cmdline.h" +#include "realm/utils.h" + +#include +#include + +namespace Realm +{ + namespace FPGA + { + + namespace ThreadLocal + { + static REALM_THREAD_LOCAL FPGAProcessor *current_fpga_proc = NULL; + } + + Logger log_fpga("fpga"); + + // need types with various powers-of-2 size/alignment - we have up to + // uint64_t as builtins, but we need trivially-copyable 16B and 32B things + struct dummy_16b_t + { + uint64_t a, b; + }; + struct dummy_32b_t + { + uint64_t a, b, c, d; + }; + REALM_ALIGNED_TYPE_CONST(aligned_16b_t, dummy_16b_t, 16); + REALM_ALIGNED_TYPE_CONST(aligned_32b_t, dummy_32b_t, 32); + + template + static void fpga_memcpy_2d_typed(uintptr_t dst_base, uintptr_t dst_lstride, + uintptr_t src_base, uintptr_t src_lstride, + size_t bytes, size_t lines) + { + for (size_t i = 0; i < lines; i++) + { + std::copy(reinterpret_cast(src_base), + reinterpret_cast(src_base + bytes), + reinterpret_cast(dst_base)); + // manual strength reduction + src_base += src_lstride; + dst_base += dst_lstride; + } + } + + static void fpga_memcpy_2d(uintptr_t dst_base, uintptr_t dst_lstride, + uintptr_t src_base, uintptr_t src_lstride, + size_t bytes, size_t lines) + { + // by subtracting 1 from bases, strides, and lengths, we get LSBs set + // based on the common alignment of every parameter in the copy + unsigned alignment = ((dst_base - 1) & (dst_lstride - 1) & + (src_base - 1) & (src_lstride - 1) & + (bytes - 1)); + // TODO: consider jump table approach? + if ((alignment & 31) == 31) + fpga_memcpy_2d_typed(dst_base, dst_lstride, + src_base, src_lstride, + bytes, lines); + else if ((alignment & 15) == 15) + fpga_memcpy_2d_typed(dst_base, dst_lstride, + src_base, src_lstride, + bytes, lines); + else if ((alignment & 7) == 7) + fpga_memcpy_2d_typed(dst_base, dst_lstride, src_base, src_lstride, + bytes, lines); + else if ((alignment & 3) == 3) + fpga_memcpy_2d_typed(dst_base, dst_lstride, src_base, src_lstride, + bytes, lines); + else if ((alignment & 1) == 1) + fpga_memcpy_2d_typed(dst_base, dst_lstride, src_base, src_lstride, + bytes, lines); + else + fpga_memcpy_2d_typed(dst_base, dst_lstride, src_base, src_lstride, + bytes, lines); + } + + /** + * FPGAWorker: it is responsible for making progress on one or more Command Queues. + * This may be done directly by an FPGAProcessor or in a background thread spawned for the purpose. + */ + FPGAWorker::FPGAWorker(void) + : BackgroundWorkItem("FPGA device worker"), + condvar(lock), + core_rsrv(0), + worker_thread(0), + thread_sleeping(false), + worker_shutdown_requested(false) + { + } + + FPGAWorker::~FPGAWorker(void) + { + // shutdown should have already been called + assert(worker_thread == 0); + } + + void FPGAWorker::start_background_thread( + Realm::CoreReservationSet &crs, + size_t stack_size) + { + assert(manager == 0); + core_rsrv = new Realm::CoreReservation("A worker thread", crs, + Realm::CoreReservationParameters()); + Realm::ThreadLaunchParameters tlp; + worker_thread = Realm::Thread::create_kernel_thread(this, tlp, *core_rsrv, 0); + } + + void FPGAWorker::shutdown_background_thread(void) + { + { + AutoLock<> al(lock); + worker_shutdown_requested.store(true); + if (thread_sleeping) + { + thread_sleeping = false; + condvar.broadcast(); + } + } + + worker_thread->join(); + delete worker_thread; + worker_thread = 0; + + delete core_rsrv; + core_rsrv = 0; + } + + void FPGAWorker::add_queue(FPGAQueue *queue) + { + bool was_empty = false; + { + AutoLock<> al(lock); +#ifdef DEBUG_REALM + // insist that the caller de-duplicate these + for (ActiveQueue::iterator it = active_queues.begin(); + it != active_queues.end(); + ++it) + assert(*it != queue); +#endif + was_empty = active_queues.empty(); + active_queues.push_back(queue); + if (thread_sleeping) + { + thread_sleeping = false; + condvar.broadcast(); + } + } + // if we're a background work item, request attention if needed + if (was_empty && (manager != 0)) + make_active(); + } + + bool FPGAWorker::do_work(TimeLimit work_until) + { + // pop the first queue off the list and immediately become re-activ if more queues remain + FPGAQueue *queue = 0; + bool still_not_empty = false; + { + AutoLock<> al(lock); + + assert(!active_queues.empty()); + queue = active_queues.front(); + active_queues.pop_front(); + still_not_empty = !active_queues.empty(); + } + if (still_not_empty) + make_active(); + // do work for the queue we popped, paying attention to the cutoff time + bool requeue_q = false; + + if (queue->reap_events(work_until)) + { + // still work (e.g. copies) to do + if (work_until.is_expired()) + { + // out of time - save it for later + requeue_q = true; + } + else if (queue->issue_copies(work_until)) + requeue_q = true; + } + + bool was_empty = false; + if (requeue_q) + { + AutoLock<> al(lock); + was_empty = active_queues.empty(); + active_queues.push_back(queue); + } + // note that we can need requeueing even if we called make_active above! + return was_empty; + } + + bool FPGAWorker::process_queues(bool sleep_on_empty) + { + FPGAQueue *cur_queue = 0; + FPGAQueue *first_queue = 0; + bool requeue_queue = false; + while (true) + { + // grab the front queue in the list + { + AutoLock<> al(lock); + // if we didn't finish work on the queue from the previous + // iteration, add it back to the end + if (requeue_queue) + active_queues.push_back(cur_queue); + + while (active_queues.empty()) + { + // sleep only if this was the first attempt to get a queue + if (sleep_on_empty && (first_queue == 0) && + !worker_shutdown_requested.load()) + { + thread_sleeping = true; + condvar.wait(); + } + else + return false; + } + cur_queue = active_queues.front(); + // did we wrap around? if so, stop for now + if (cur_queue == first_queue) + return true; + + active_queues.pop_front(); + if (!first_queue) + first_queue = cur_queue; + } + // and do some work for it + requeue_queue = false; + // reap_events report whether any kind of work + if (!cur_queue->reap_events(TimeLimit())) + continue; + if (!cur_queue->issue_copies(TimeLimit())) + continue; + // if we fall, the queues never went empty at any time, so it's up to us to requeue + requeue_queue = true; + } + } + + void FPGAWorker::thread_main(void) + { + // TODO: consider busy-waiting in some cases to reduce latency? + while (!worker_shutdown_requested.load()) + { + bool work_left = process_queues(true); + // if there was work left, yield our thread + // for now to avoid a tight spin loop + if (work_left) + Realm::Thread::yield(); + } + } + + /** + * FPGAWorkFence: used to determine when a device kernel completes execution + */ + FPGAWorkFence::FPGAWorkFence(Realm::Operation *op) + : Realm::Operation::AsyncWorkItem(op) + { + } + + void FPGAWorkFence::request_cancellation(void) + { + // ignored - no way to shoot down FPGA work + } + + void FPGAWorkFence::print(std::ostream &os) const + { + os << "FPGAWorkFence"; + } + + void FPGAWorkFence::enqueue(FPGAQueue *queue) + { + queue->add_fence(this); + } + + /** + * FPGAQueue: device command queue + */ + FPGAQueue::FPGAQueue(FPGADevice *fpga_device, FPGAWorker *fpga_worker, cl::CommandQueue &command_queue) + : fpga_device(fpga_device), fpga_worker(fpga_device->fpga_worker), command_queue(command_queue) + { + log_fpga.info() << "Create FPGAQueue "; + pending_events.clear(); + pending_copies.clear(); + } + + FPGAQueue::~FPGAQueue(void) + { + } + + cl::CommandQueue &FPGAQueue::get_command_queue() const + { + return command_queue; + } + + void FPGAQueue::add_fence(FPGAWorkFence *fence) + { + cl::Event opencl_event; + cl_int err = 0; + OCL_CHECK(err, err = command_queue.enqueueMarkerWithWaitList(nullptr, &opencl_event)); + add_event(opencl_event, fence, 0); + } + + void FPGAQueue::add_notification(FPGACompletionNotification *notification) + { + cl::Event opencl_event; + cl_int err = 0; + OCL_CHECK(err, err = command_queue.enqueueMarkerWithWaitList(nullptr, &opencl_event)); + + add_event(opencl_event, 0, notification); + } + + // add event to worker so it can be progressed + void FPGAQueue::add_event(cl::Event opencl_event, + FPGAWorkFence *fence, + FPGACompletionNotification *n) + + { + bool add_to_worker = false; + // assert(opencl_event != nullptr); + { + AutoLock<> al(mutex); + // remember to add ourselves + // to the worker if we didn't already have work + add_to_worker = pending_events.empty() && + pending_copies.empty(); + PendingEvent e; + e.opencl_event = opencl_event; + e.fence = fence; + e.notification = n; + pending_events.push_back(e); + } + if (add_to_worker) + fpga_worker->add_queue(this); + } + + bool FPGAQueue::has_work(void) const + { + return (!pending_events.empty() || + !pending_copies.empty()); + } + + bool FPGAQueue::reap_events(TimeLimit work_until) + { + // peek at the first event + cl::Event opencl_event; + FPGACompletionNotification *notification = 0; + bool event_valid = false; + { + AutoLock<> al(mutex); + + if (pending_events.empty()) + // no events left, but command queue + // might have other work left + return has_work(); + opencl_event = pending_events.front().opencl_event; + notification = pending_events.front().notification; + event_valid = true; + } + // we'll keep looking at events + // until we find one that hasn't triggered + bool work_left = true; + while (event_valid) + { + cl_int err = 0; + cl_int status; + OCL_CHECK(err, err = opencl_event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + if (status == CL_QUEUED || status == CL_SUBMITTED || status == CL_RUNNING) + { + // event is not finished - check again later + return true; + } + else if (status != CL_COMPLETE) + { + log_fpga.fatal() << "Error reported on FPGA " << fpga_device->name; + } + + // this event has triggered + FPGAWorkFence *fence = 0; + { + AutoLock<> al(mutex); + + const PendingEvent &e = pending_events.front(); + assert(e.opencl_event == opencl_event); + fence = e.fence; + notification = e.notification; + pending_events.pop_front(); + + if (pending_events.empty()) + { + event_valid = false; + work_left = has_work(); + } + else + { + opencl_event = pending_events.front().opencl_event; + } + } + if (fence) + { + fence->mark_finished(true /*successful*/); // set preconditions for next tasks + } + if (notification) + { + notification->request_completed(); + } + if (event_valid && work_until.is_expired()) + return true; + } + // if we get here, we ran out of events, but there might have been + // other kinds of work that we need to let the caller know about + return work_left; + } + + void FPGAQueue::add_copy(FPGADeviceMemcpy *copy) + { + bool add_to_worker = false; + { + AutoLock<> al(mutex); + // add if we haven't been added yet + add_to_worker = + pending_copies.empty() && pending_events.empty(); + pending_copies.push_back(copy); + } + if (add_to_worker) + fpga_worker->add_queue(this); + } + + bool FPGAQueue::issue_copies(TimeLimit work_until) + { + while (true) + { + // if we cause the list to go empty, + // we stop even if more copies show + // up because we don't want to requeue ourselves twice + bool list_exhausted = false; + FPGADeviceMemcpy *copy = 0; + { + AutoLock<> al(mutex); + if (pending_copies.empty()) + // no copies left, + // but queue might have other work left + return has_work(); + copy = pending_copies.front(); + pending_copies.pop_front(); + list_exhausted = !has_work(); + } + copy->execute(this); + delete copy; + // if the list was exhausted, let the caller know + if (list_exhausted) + return false; + + // if we still have work, but time's up, return also + if (work_until.is_expired()) + return true; + } + return false; // should never reach here + } + + FPGADevice::FPGADevice(cl::Device &device, std::string name, std::string xclbin, FPGAWorker *fpga_worker, size_t fpga_coprocessor_num_cu, std::string fpga_coprocessor_kernel) + : name(name), fpga_worker(fpga_worker), fpga_coprocessor_num_cu(fpga_coprocessor_num_cu), fpga_coprocessor_kernel(fpga_coprocessor_kernel) + { + cl_int err; + this->device = device; + // Create FPGA context and command queue for each device + OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err)); + OCL_CHECK(err, command_queue = cl::CommandQueue(context, device, + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); + + // Program the device + auto fileBuf = xcl::read_binary_file(xclbin); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + log_fpga.info() << "Trying to program device " << device.getInfo(); + OCL_CHECK(err, program = cl::Program(context, {device}, bins, nullptr, &err)); + + fpga_mem = nullptr; + local_sysmem = nullptr; + local_ibmem = nullptr; + fpga_queue = nullptr; + create_fpga_queues(); + } + + FPGADevice::~FPGADevice() + { + if (fpga_queue != nullptr) + { + delete fpga_queue; + fpga_queue = nullptr; + } + command_queue.finish(); + } + + void FPGADevice::create_dma_channels(RuntimeImpl *runtime) + { + if (!fpga_mem) + { + return; + } + + const std::vector &local_mems = runtime->nodes[Network::my_node_id].memories; + for (std::vector::const_iterator it = local_mems.begin(); + it != local_mems.end(); + it++) + { + if ((*it)->lowlevel_kind == Memory::SYSTEM_MEM) + { + this->local_sysmem = *it; + log_fpga.info() << "local_sysmem " << std::hex << (*it)->me.id << std::dec << " kind: " << (*it)->kind << " low-level kind: " << (*it)->lowlevel_kind; + break; + } + } + + const std::vector &local_ib_mems = runtime->nodes[Network::my_node_id].ib_memories; + for (std::vector::const_iterator it = local_ib_mems.begin(); + it != local_ib_mems.end(); + it++) + { + if ((*it)->lowlevel_kind == Memory::REGDMA_MEM) + { + this->local_ibmem = *it; + log_fpga.info() << "local_ibmem " << std::hex << (*it)->me.id << std::dec << " kind: " << (*it)->kind << " low-level kind: " << (*it)->lowlevel_kind; + break; + } + } + + runtime->add_dma_channel(new FPGAfillChannel(this, &runtime->bgwork)); + runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_IN_DEV, &runtime->bgwork)); + runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_TO_DEV, &runtime->bgwork)); + runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_FROM_DEV, &runtime->bgwork)); + runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_COMP, &runtime->bgwork)); + + Machine::MemoryMemoryAffinity mma; + mma.m1 = fpga_mem->me; + mma.m2 = local_sysmem->me; + mma.bandwidth = 20; // TODO + mma.latency = 200; + runtime->add_mem_mem_affinity(mma); + + mma.m1 = fpga_mem->me; + mma.m2 = local_ibmem->me; + mma.bandwidth = 20; // TODO + mma.latency = 200; + runtime->add_mem_mem_affinity(mma); + + // TODO: create p2p channel + // runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_PEER_DEV, &runtime->bgwork)); + } + + void FPGADevice::create_fpga_mem(RuntimeImpl *runtime, size_t size) + { + // TODO: only use membank 0 for now + cl_int err; + void *base_ptr_sys = nullptr; + posix_memalign((void **)&base_ptr_sys, 4096, size); + OCL_CHECK(err, buff = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, base_ptr_sys, &err)); + Memory m = runtime->next_local_memory_id(); + this->fpga_mem = new FPGADeviceMemory(m, this, base_ptr_sys, size); + runtime->add_memory(fpga_mem); + log_fpga.info() << "create_fpga_mem: " + << device.getInfo() + << ", size = " + << (size >> 20) << " MB" + << ", base_ptr_sys = " << base_ptr_sys; + } + + void FPGADevice::create_fpga_ib(RuntimeImpl *runtime, size_t size) + { + // TODO: only use membank 0 for now + cl_int err; + void *base_ptr_sys = nullptr; + posix_memalign((void **)&base_ptr_sys, 4096, size); + OCL_CHECK(err, ib_buff = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, base_ptr_sys, &err)); + Memory m = runtime->next_local_ib_memory_id(); + IBMemory *ib_mem; + ib_mem = new IBMemory(m, size, + MemoryImpl::MKIND_FPGA, Memory::FPGA_MEM, + base_ptr_sys, 0); + this->fpga_ib = ib_mem; + runtime->add_ib_memory(ib_mem); + log_fpga.info() << "create_fpga_ib: " + << device.getInfo() + << ", size = " + << (size >> 20) << " MB" + << ", base_ptr_sys = " << base_ptr_sys; + } + + void FPGADevice::create_fpga_queues() + { + fpga_queue = new FPGAQueue(this, fpga_worker, command_queue); + } + + void FPGADevice::copy_to_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification) + { + log_fpga.info() << "copy_to_fpga: src = " << src << " dst = " << dst + << " src_offset = " << src_offset << "dst_offset = " << dst_offset + << " bytes = " << bytes << " notification = " << notification; + + FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this, + dst, + src, + bytes, + dst_offset, + FPGA_MEMCPY_HOST_TO_DEVICE, + notification); + fpga_queue->add_copy(copy); + } + + void FPGADevice::copy_from_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification) + { + log_fpga.info() << "copy_from_fpga: src = " << src << " dst = " << dst + << " src_offset = " << src_offset << "dst_offset = " << dst_offset + << " bytes = " << bytes << " notification = " << notification; + + FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this, + dst, + src, + bytes, + src_offset, + FPGA_MEMCPY_DEVICE_TO_HOST, + notification); + fpga_queue->add_copy(copy); + } + + void FPGADevice::copy_within_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification) + { + log_fpga.info() << "copy_within_fpga: src = " << src << " dst = " << dst + << " src_offset = " << src_offset << "dst_offset = " << dst_offset + << " bytes = " << bytes << " notification = " << notification; + FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this, + dst, + src, + bytes, + dst_offset, + FPGA_MEMCPY_DEVICE_TO_DEVICE, + notification); + fpga_queue->add_copy(copy); + } + + void FPGADevice::copy_to_peer(FPGADevice *dst_dev, void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification) + { + log_fpga.info() << "copy_to_peer(not implemented!): dst_dev = " << dst_dev + << "src = " << src << " dst = " << dst + << " src_offset = " << src_offset << "dst_offset = " << dst_offset + << " bytes = " << bytes << " notification = " << notification; + assert(0); + } + + // TODO: FPGA coprocessor kernels are invoked in this function + void FPGADevice::comp(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification) + { + log_fpga.info() << "comp: src = " << src << " dst = " << dst + << " src_offset = " << src_offset << "dst_offset = " << dst_offset + << " bytes = " << bytes << " notification = " << notification; + // An example of invoking an FPGA coprocessor kernel + // program device + int num_cu = fpga_coprocessor_num_cu; + std::vector krnls(num_cu); + cl_int err; + // Creating Kernel objects + for (int i = 0; i < num_cu; i++) + { + OCL_CHECK(err, krnls[i] = cl::Kernel(program, fpga_coprocessor_kernel.c_str(), &err)); + } + // Creating sub-buffers + auto chunk_size = bytes / num_cu; + size_t vector_size_bytes = sizeof(int) * chunk_size; + std::vector buffer_in1(num_cu); + std::vector buffer_in2(num_cu); + std::vector buffer_output(num_cu); + + // I/O data vectors + std::vector> source_in2(bytes, 1); + + for (int i = 0; i < num_cu; i++) + { + OCL_CHECK(err, buffer_in2[i] = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_size_bytes, + source_in2.data() + i * chunk_size, &err)); + } + for (int i = 0; i < num_cu; i++) + { + cl_buffer_region buffer_in1_info = {src_offset + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_in1[i] = ib_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err)); + cl_buffer_region buffer_output_info = {dst_offset + i * vector_size_bytes, vector_size_bytes}; + OCL_CHECK(err, buffer_output[i] = ib_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err)); + } + + for (int i = 0; i < num_cu; i++) + { + int narg = 0; + + // Setting kernel arguments + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i])); + OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size)); + } + + cl::Event task_events[num_cu]; + for (int i = 0; i < num_cu; i++) + { + // Launch the kernel + OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i])); + } + + std::vector wait_events[num_cu]; + // Copy result from device global memory to host local memory + for (int i = 0; i < num_cu; i++) + { + wait_events[i].push_back(task_events[i]); + OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr)); + } + + OCL_CHECK(err, err = command_queue.flush()); + fpga_queue->add_notification(notification); + } + + bool FPGADevice::is_in_buff(void *ptr) + { + uint64_t base_ptr = (uint64_t)(fpga_mem->base_ptr_sys); + if ((uint64_t)ptr >= base_ptr && (uint64_t)ptr <= base_ptr + fpga_mem->size) + { + return true; + } + return false; + } + + bool FPGADevice::is_in_ib_buff(void *ptr) + { + uint64_t base_ptr = (uint64_t)(fpga_ib->get_direct_ptr(0, 0)); + if ((uint64_t)ptr >= base_ptr && (uint64_t)ptr <= base_ptr + fpga_ib->size) + { + return true; + } + return false; + } + + /** + * Device Memory Copy Operations + */ + FPGADeviceMemcpy::FPGADeviceMemcpy(FPGADevice *fpga_device, + FPGAMemcpyKind kind, + FPGACompletionNotification *notification) + : fpga_device(fpga_device), kind(kind), notification(notification) + { + } + + /** + * 1D Memory Copy Operation + */ + FPGADeviceMemcpy1D::FPGADeviceMemcpy1D(FPGADevice *fpga_device, + void *dst, + const void *src, + size_t bytes, + off_t buff_offset, + FPGAMemcpyKind kind, + FPGACompletionNotification *notification) + : FPGADeviceMemcpy(fpga_device, kind, notification), + dst(dst), src(src), elmt_size(bytes), buff_offset(buff_offset) + { + } + + FPGADeviceMemcpy1D::~FPGADeviceMemcpy1D(void) + { + } + + void FPGADeviceMemcpy1D::do_span(off_t pos, size_t len) + { + off_t span_start = pos * elmt_size; + size_t span_bytes = len * elmt_size; + void *dstptr = ((uint8_t *)dst) + span_start; + void *srcptr = ((uint8_t *)src) + span_start; + size_t size = span_bytes; + cl_int err = 0; + log_fpga.debug() << "do_span: buff_offset " << buff_offset << " size " << size << " srcptr " << srcptr << " dstptr " << dstptr; + cl::Buffer *temp_buff = nullptr; + if (kind == FPGA_MEMCPY_HOST_TO_DEVICE) + { + if (fpga_device->is_in_ib_buff(dstptr)) + temp_buff = &(fpga_device->ib_buff); + else if (fpga_device->is_in_buff(dstptr)) + temp_buff = &(fpga_device->buff); + else + { + log_fpga.error() << "dstptr is not in buffer or ib_buffer"; + assert(0); + } + OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueWriteBuffer(*temp_buff, // buffer on the FPGA + CL_FALSE, // blocking call + buff_offset, // buffer offset in bytes + size, // Size in bytes + (void *)srcptr, // Pointer to the data to copy + nullptr, nullptr)); + } + else if (kind == FPGA_MEMCPY_DEVICE_TO_HOST) + { + if (fpga_device->is_in_ib_buff(srcptr)) + temp_buff = &(fpga_device->ib_buff); + else if (fpga_device->is_in_buff(srcptr)) + temp_buff = &(fpga_device->buff); + else + { + log_fpga.error() << "srcptr is not in buffer or ib_buffer"; + assert(0); + } + OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueReadBuffer(*temp_buff, // buffer on the FPGA + CL_FALSE, // blocking call + buff_offset, // buffer offset in bytes + size, // Size in bytes + (void *)dstptr, // Pointer to the data to copy + nullptr, nullptr)); + } + else if (kind == FPGA_MEMCPY_DEVICE_TO_DEVICE) + { + if (fpga_device->is_in_ib_buff(dstptr)) + temp_buff = &(fpga_device->ib_buff); + else if (fpga_device->is_in_buff(dstptr)) + temp_buff = &(fpga_device->buff); + else + { + log_fpga.error() << "dstptr is not in buffer or ib_buffer"; + assert(0); + } + OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueWriteBuffer(*temp_buff, // buffer on the FPGA + CL_FALSE, // blocking call + buff_offset, // buffer offset in bytes + size, // Size in bytes + (void *)srcptr, // Pointer to the data to copy + nullptr, nullptr)); + } + else if (kind == FPGA_MEMCPY_PEER_TO_PEER) + { + log_fpga.error() << "FPGA_MEMCPY_PEER_TO_PEER not implemented"; + assert(0); + } + else + { + log_fpga.error() << "FPGADeviceMemcpy kind error"; + assert(0); + } + OCL_CHECK(err, err = fpga_queue->get_command_queue().flush()); + } + + void FPGADeviceMemcpy1D::execute(FPGAQueue *queue) + { + log_fpga.info("FPGADevice memcpy: dst=%p src=%p bytes=%zd kind=%d", + dst, src, elmt_size, kind); + // save queue into local variable + // for do_span (which may be called indirectly by ElementMask::forall_ranges) + fpga_queue = queue; + do_span(0, 1); + if (notification) + { + fpga_queue->add_notification(notification); + } + log_fpga.info("fpga memcpy 1d issued: dst=%p src=%p bytes=%zd kind=%d", + dst, src, elmt_size, kind); + } + + FPGAModule::FPGAModule() + : Module("fpga"), cfg_num_fpgas(0), cfg_use_worker_threads(false), + cfg_use_shared_worker(true), cfg_fpga_mem_size(4 << 20), cfg_fpga_ib_size(4 << 20), + cfg_fpga_coprocessor_num_cu(1) + { + shared_worker = nullptr; + cfg_fpga_xclbin = ""; + fpga_devices.clear(); + dedicated_workers.clear(); + fpga_procs_.clear(); + cfg_fpga_coprocessor_kernel = ""; + } + + FPGAModule::~FPGAModule(void) + { + if (!this->fpga_devices.empty()) + { + for (size_t i = 0; i < fpga_devices.size(); i++) + { + + // xclClose(fpga_devices[i]->dev_handle); + delete this->fpga_devices[i]; + } + } + } + + Module *FPGAModule::create_module(RuntimeImpl *runtime, std::vector &cmdline) + { + FPGAModule *m = new FPGAModule; + log_fpga.info() << "create_module"; + // first order of business - read command line parameters + { + Realm::CommandLineParser cp; + cp.add_option_int("-ll:fpga", m->cfg_num_fpgas); + cp.add_option_bool("-ll:fpga_work_thread", m->cfg_use_worker_threads); + cp.add_option_bool("-ll:fpga_shared_worker", m->cfg_use_shared_worker); + cp.add_option_int_units("-ll:fpga_size", m->cfg_fpga_mem_size, 'm'); + cp.add_option_int_units("-ll:fpga_ib_size", m->cfg_fpga_ib_size, 'm'); + cp.add_option_string("-ll:fpga_xclbin", m->cfg_fpga_xclbin); + cp.add_option_int("-ll:fpga_coprocessor_num_cu", m->cfg_fpga_coprocessor_num_cu); + cp.add_option_string("-ll:fpga_coprocessor_kernel", m->cfg_fpga_coprocessor_kernel); + + bool ok = cp.parse_command_line(cmdline); + if (!ok) + { + log_fpga.error() << "error reading fpga parameters"; + exit(1); + } + } + return m; + } + + // do any general initialization - this is called after all configuration is complete + void FPGAModule::initialize(RuntimeImpl *runtime) + { + log_fpga.info() << "initialize"; + Module::initialize(runtime); + + std::vector devices = xcl::get_xil_devices(); + if (cfg_num_fpgas > devices.size()) + { + log_fpga.error() << cfg_num_fpgas << " FPGA Processors requested, but only " << devices.size() << " available!"; + exit(1); + } + + // if we are using a shared worker, create that next + if (cfg_use_shared_worker) + { + shared_worker = new FPGAWorker; + if (cfg_use_worker_threads) + { + shared_worker->start_background_thread( + runtime->core_reservation_set(), + 1 << 20); // hardcoded worker stack size + } + else + shared_worker->add_to_manager(&(runtime->bgwork)); + } + + // set up fpga_devices + for (unsigned int i = 0; i < cfg_num_fpgas; i++) + { + // for each device record the FPGAWorker + FPGAWorker *worker; + if (cfg_use_shared_worker) + { + worker = shared_worker; + } + else + { + worker = new FPGAWorker; + if (cfg_use_worker_threads) + worker->start_background_thread( + runtime->core_reservation_set(), + 1 << 20); // hardcoded worker stack size + else + worker->add_to_manager(&(runtime->bgwork)); + } + + FPGADevice *fpga_device = new FPGADevice(devices[i], "fpga" + std::to_string(i), cfg_fpga_xclbin, worker, cfg_fpga_coprocessor_num_cu, cfg_fpga_coprocessor_kernel); + fpga_devices.push_back(fpga_device); + + if (!cfg_use_shared_worker) + { + log_fpga.info() << "add to dedicated workers " << worker; + dedicated_workers[fpga_device] = worker; + } + } + } + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + void FPGAModule::create_memories(RuntimeImpl *runtime) + { + log_fpga.info() << "create_memories"; + Module::create_memories(runtime); + if (cfg_fpga_mem_size > 0) + { + for (size_t i = 0; i < cfg_num_fpgas; i++) + { + fpga_devices[i]->create_fpga_mem(runtime, cfg_fpga_mem_size); + } + } + if (cfg_fpga_ib_size > 0) + { + for (size_t i = 0; i < cfg_num_fpgas; i++) + { + fpga_devices[i]->create_fpga_ib(runtime, cfg_fpga_ib_size); + } + } + } + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from RuntimeImpl::next_local_processor_id) + void FPGAModule::create_processors(RuntimeImpl *runtime) + { + Module::create_processors(runtime); + for (size_t i = 0; i < cfg_num_fpgas; i++) + { + Processor p = runtime->next_local_processor_id(); + FPGAProcessor *proc = new FPGAProcessor(fpga_devices[i], p, runtime->core_reservation_set()); + fpga_procs_.push_back(proc); + runtime->add_processor(proc); + log_fpga.info() << "create fpga processor " << i; + + // create mem affinities to add a proc to machine model + // create affinities between this processor and system/reg memories + // if the memory is one we created, use the kernel-reported distance + // to adjust the answer + std::vector &local_mems = runtime->nodes[Network::my_node_id].memories; + for (std::vector::iterator it = local_mems.begin(); + it != local_mems.end(); + ++it) + { + Memory::Kind kind = (*it)->get_kind(); + if (kind == Memory::SYSTEM_MEM or kind == Memory::FPGA_MEM) + { + Machine::ProcessorMemoryAffinity pma; + pma.p = p; + pma.m = (*it)->me; + + // use the same made-up numbers as in + // runtime_impl.cc + if (kind == Memory::SYSTEM_MEM) + { + pma.bandwidth = 100; // "large" + pma.latency = 5; // "small" + } + else if (kind == Memory::FPGA_MEM) + { + pma.bandwidth = 200; // "large" + pma.latency = 10; // "small" + } + else + { + assert(0 && "wrong memory kind"); + } + + runtime->add_proc_mem_affinity(pma); + } + } + } + } + + // create any DMA channels provided by the module (default == do nothing) + void FPGAModule::create_dma_channels(RuntimeImpl *runtime) + { + log_fpga.info() << "create_dma_channels"; + for (std::vector::iterator it = fpga_devices.begin(); it != fpga_devices.end(); it++) + { + (*it)->create_dma_channels(runtime); + } + + Module::create_dma_channels(runtime); + } + + // create any code translators provided by the module (default == do nothing) + void FPGAModule::create_code_translators(RuntimeImpl *runtime) + { + log_fpga.info() << "create_code_translators"; + Module::create_code_translators(runtime); + } + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + void FPGAModule::cleanup(void) + { + log_fpga.info() << "cleanup"; + // clean up worker(s) + if (shared_worker) + { +#ifdef DEBUG_REALM + shared_worker->shutdown_work_item(); +#endif + if (cfg_use_worker_threads) + shared_worker->shutdown_background_thread(); + + delete shared_worker; + shared_worker = 0; + } + for (std::map::iterator it = dedicated_workers.begin(); + it != dedicated_workers.end(); + it++) + { + log_fpga.info() << "shutdown worker in cleanup"; + FPGAWorker *worker = it->second; +#ifdef DEBUG_REALM + worker->shutdown_work_item(); +#endif + if (cfg_use_worker_threads) + worker->shutdown_background_thread(); + + delete worker; + } + dedicated_workers.clear(); + Module::cleanup(); + } + + template + class FPGATaskScheduler : public T + { + public: + FPGATaskScheduler(Processor proc, Realm::CoreReservation &core_rsrv, FPGAProcessor *fpga_proc); + virtual ~FPGATaskScheduler(void); + + protected: + virtual bool execute_task(Task *task); + virtual void execute_internal_task(InternalTask *task); + FPGAProcessor *fpga_proc_; + }; + + template + FPGATaskScheduler::FPGATaskScheduler(Processor proc, + Realm::CoreReservation &core_rsrv, + FPGAProcessor *fpga_proc) : T(proc, core_rsrv), fpga_proc_(fpga_proc) + { + } + + template + FPGATaskScheduler::~FPGATaskScheduler(void) + { + } + + template + bool FPGATaskScheduler::execute_task(Task *task) + { + assert(ThreadLocal::current_fpga_proc == NULL); + ThreadLocal::current_fpga_proc = fpga_proc_; + FPGAQueue *queue = fpga_proc_->fpga_device->fpga_queue; + log_fpga.info() << "execute_task " << task; + + // we'll use a "work fence" to track when the kernels launched by this task actually + // finish - this must be added to the task _BEFORE_ we execute + FPGAWorkFence *fence = new FPGAWorkFence(task); + task->add_async_work_item(fence); + bool ok = T::execute_task(task); + fence->enqueue(queue); + + assert(ThreadLocal::current_fpga_proc == fpga_proc_); + ThreadLocal::current_fpga_proc = NULL; + return ok; + } + + template + void FPGATaskScheduler::execute_internal_task(InternalTask *task) + { + assert(ThreadLocal::current_fpga_proc == NULL); + ThreadLocal::current_fpga_proc = fpga_proc_; + log_fpga.info() << "execute_internal_task"; + T::execute_internal_task(task); + assert(ThreadLocal::current_fpga_proc == fpga_proc_); + ThreadLocal::current_fpga_proc = NULL; + } + + FPGAProcessor::FPGAProcessor(FPGADevice *fpga_device, Processor me, Realm::CoreReservationSet &crs) + : LocalTaskProcessor(me, Processor::FPGA_PROC) + { + log_fpga.info() << "FPGAProcessor()"; + this->fpga_device = fpga_device; + Realm::CoreReservationParameters params; + params.set_num_cores(1); + params.set_alu_usage(params.CORE_USAGE_SHARED); + params.set_fpu_usage(params.CORE_USAGE_SHARED); + params.set_ldst_usage(params.CORE_USAGE_SHARED); + params.set_max_stack_size(2 << 20); + std::string name = stringbuilder() << "fpga proc " << me; + core_rsrv_ = new Realm::CoreReservation(name, crs, params); + +#ifdef REALM_USE_USER_THREADS + UserThreadTaskScheduler *sched = new FPGATaskScheduler(me, *core_rsrv_, this); +#else + KernelThreadTaskScheduler *sched = new FPGATaskScheduler(me, *core_rsrv_, this); +#endif + set_scheduler(sched); + } + + FPGAProcessor::~FPGAProcessor(void) + { + delete core_rsrv_; + } + + FPGAProcessor *FPGAProcessor::get_current_fpga_proc(void) + { + return ThreadLocal::current_fpga_proc; + } + + FPGADeviceMemory::FPGADeviceMemory(Memory memory, FPGADevice *device, void *base_ptr_sys, size_t size) + : LocalManagedMemory(memory, size, MKIND_FPGA, 512, Memory::FPGA_MEM, NULL), base_ptr_sys(base_ptr_sys) + { + } + + FPGADeviceMemory::~FPGADeviceMemory(void) + { + // this function is invoked before ~FPGADevice + if (base_ptr_sys != nullptr) + { + free(base_ptr_sys); + base_ptr_sys = nullptr; + } + } + + void FPGADeviceMemory::get_bytes(off_t src_offset, void *dst, size_t size) + { + + FPGACompletionEvent n; // TODO: fix me + void *src = (void *)((uint8_t *)(base_ptr_sys) + src_offset); + off_t dst_offset = (uint8_t *)dst - (uint8_t *)(base_ptr_sys); + get_device()->copy_from_fpga(dst, src, dst_offset, src_offset, size, &n); + n.request_completed(); + } + + void FPGADeviceMemory::put_bytes(off_t dst_offset, const void *src, size_t size) + { + FPGACompletionEvent n; // TODO: fix me + void *dst = (void *)((uint8_t *)(base_ptr_sys) + dst_offset); + off_t src_offset = (uint8_t *)src - (uint8_t *)(base_ptr_sys); + get_device()->copy_to_fpga(dst, src, dst_offset, src_offset, size, &n); + n.request_completed(); + } + + void *FPGADeviceMemory::get_direct_ptr(off_t offset, size_t size) + { + return (void *)((uint8_t *)base_ptr_sys + offset); + } + + void FPGACompletionEvent::request_completed(void) + { + log_fpga.info() << "request_completed " << req; + req->xd->notify_request_read_done(req); + req->xd->notify_request_write_done(req); + } + + FPGAXferDes::FPGAXferDes(uintptr_t _dma_op, Channel *_channel, + NodeID _launch_node, XferDesID _guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int _priority, XferDesKind kind) + : XferDes(_dma_op, _channel, _launch_node, _guid, + inputs_info, outputs_info, + _priority, 0, 0) + { + if ((inputs_info.size() >= 1) && + (input_ports[0].mem->kind == MemoryImpl::MKIND_FPGA)) + { + // all input ports should agree on which fpga they target + src_fpga = ((FPGADeviceMemory *)(input_ports[0].mem))->device; + for (size_t i = 1; i < input_ports.size(); i++) + { + // exception: control and indirect ports should be readable from cpu + if ((int(i) == input_control.control_port_idx) || + (int(i) == output_control.control_port_idx) || + input_ports[i].is_indirect_port) + { + assert((input_ports[i].mem->kind == MemoryImpl::MKIND_SYSMEM)); + continue; + } + assert(input_ports[i].mem == input_ports[0].mem); + } + } + else + { + src_fpga = 0; + } + + if ((outputs_info.size() >= 1) && + (output_ports[0].mem->kind == MemoryImpl::MKIND_FPGA)) + { + // all output ports should agree on which adev they target + dst_fpga = ((FPGADeviceMemory *)(output_ports[0].mem))->device; + for (size_t i = 1; i < output_ports.size(); i++) + assert(output_ports[i].mem == output_ports[0].mem); + } + else + { + dst_fpga = 0; + } + + // if we're doing a multi-hop copy, we'll dial down the request + // sizes to improve pipelining + bool multihop_copy = false; + for (size_t i = 1; i < input_ports.size(); i++) + if (input_ports[i].peer_guid != XFERDES_NO_GUID) + multihop_copy = true; + for (size_t i = 1; i < output_ports.size(); i++) + if (output_ports[i].peer_guid != XFERDES_NO_GUID) + multihop_copy = true; + + log_fpga.info() << "create FPGAXferDes " << kind; + this->kind = kind; + switch (kind) + { + case XFER_FPGA_TO_DEV: + if (multihop_copy) + max_req_size = 4 << 20; + break; + case XFER_FPGA_FROM_DEV: + if (multihop_copy) + max_req_size = 4 << 20; + break; + case XFER_FPGA_IN_DEV: + max_req_size = 1 << 30; + break; + case XFER_FPGA_PEER_DEV: + max_req_size = 256 << 20; + break; + case XFER_FPGA_COMP: + if (multihop_copy) + max_req_size = 4 << 20; + break; + default: + break; + } + const int max_nr = 10; // TODO:FIXME + for (int i = 0; i < max_nr; i++) + { + FPGARequest *fpga_req = new FPGARequest; + fpga_req->xd = this; + fpga_req->event.req = fpga_req; + available_reqs.push(fpga_req); + } + } + + long FPGAXferDes::get_requests(Request **requests, long nr) + { + FPGARequest **reqs = (FPGARequest **)requests; + // no do allow 2D and 3D copies + // unsigned flags = (TransferIterator::LINES_OK | + // TransferIterator::PLANES_OK); + unsigned flags = 0; + long new_nr = default_get_requests(requests, nr, flags); + for (long i = 0; i < new_nr; i++) + { + switch (kind) + { + case XFER_FPGA_TO_DEV: + { + reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes); + reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes); + assert(reqs[i]->src_base != 0); + assert(reqs[i]->dst_base != 0); + break; + } + case XFER_FPGA_FROM_DEV: + { + reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes); + reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes); + assert(reqs[i]->src_base != 0); + assert(reqs[i]->dst_base != 0); + break; + } + case XFER_FPGA_IN_DEV: + { + reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes); + reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes); + assert(reqs[i]->src_base != 0); + assert(reqs[i]->dst_base != 0); + break; + } + case XFER_FPGA_PEER_DEV: + { + reqs[i]->dst_fpga = dst_fpga; + break; + } + case XFER_FPGA_COMP: + { + reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes); + reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes); + assert(reqs[i]->src_base != 0); + assert(reqs[i]->dst_base != 0); + break; + } + default: + assert(0); + } + } + return new_nr; + } + + bool FPGAXferDes::progress_xd(FPGAChannel *channel, TimeLimit work_until) + { + Request *rq; + bool did_work = false; + do + { + long count = get_requests(&rq, 1); + if (count > 0) + { + channel->submit(&rq, count); + did_work = true; + } + else + break; + } while (!work_until.is_expired()); + return did_work; + } + + void FPGAXferDes::notify_request_read_done(Request *req) + { + default_notify_request_read_done(req); + } + + void FPGAXferDes::notify_request_write_done(Request *req) + { + default_notify_request_write_done(req); + } + + void FPGAXferDes::flush() + { + } + + FPGAChannel::FPGAChannel(FPGADevice *_src_fpga, XferDesKind _kind, BackgroundWorkManager *bgwork) + : SingleXDQChannel(bgwork, _kind, "FPGA channel") + { + log_fpga.info() << "FPGAChannel(): " << (int)_kind; + src_fpga = _src_fpga; + + Memory temp_fpga_mem = src_fpga->fpga_mem->me; + Memory temp_fpga_ib_mem = src_fpga->fpga_ib->me; + Memory temp_sys_mem = src_fpga->local_sysmem->me; + Memory temp_rdma_mem = src_fpga->local_ibmem->me; + + switch (_kind) + { + case XFER_FPGA_TO_DEV: + { + unsigned bw = 10; // TODO + unsigned latency = 0; + unsigned frag_overhead = 0; + add_path(temp_sys_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV); + add_path(temp_rdma_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV); + add_path(temp_sys_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV); + break; + } + case XFER_FPGA_FROM_DEV: + { + unsigned bw = 10; // TODO + unsigned latency = 0; + unsigned frag_overhead = 0; + add_path(temp_fpga_mem, temp_sys_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV); + add_path(temp_fpga_mem, temp_rdma_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV); + add_path(temp_fpga_ib_mem, temp_sys_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV); + break; + } + case XFER_FPGA_IN_DEV: + { + // self-path + unsigned bw = 10; // TODO + unsigned latency = 0; + unsigned frag_overhead = 0; + add_path(temp_fpga_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_IN_DEV); + add_path(temp_fpga_ib_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_IN_DEV); + break; + } + case XFER_FPGA_PEER_DEV: + { + // just do paths to peers - they'll do the other side + assert(0 && "not implemented"); + break; + } + case XFER_FPGA_COMP: + { + unsigned bw = 10; // TODO + unsigned latency = 1000; + unsigned frag_overhead = 0; + add_path(temp_fpga_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_COMP); + add_path(temp_fpga_ib_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_COMP); + break; + } + default: + assert(0); + } + } + + FPGAChannel::~FPGAChannel() + { + } + + XferDes *FPGAChannel::create_xfer_des(uintptr_t dma_op, + NodeID launch_node, + XferDesID guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int priority, + XferDesRedopInfo redop_info, + const void *fill_data, size_t fill_size) + { + assert(redop_info.id == 0); + assert(fill_size == 0); + return new FPGAXferDes(dma_op, this, launch_node, guid, + inputs_info, outputs_info, + priority, kind); + } + + long FPGAChannel::submit(Request **requests, long nr) + { + for (long i = 0; i < nr; i++) + { + FPGARequest *req = (FPGARequest *)requests[i]; + // no serdez support + assert(req->xd->input_ports[req->src_port_idx].serdez_op == 0); + assert(req->xd->output_ports[req->dst_port_idx].serdez_op == 0); + + // empty transfers don't need to bounce off the ADevice + if (req->nbytes == 0) + { + req->xd->notify_request_read_done(req); + req->xd->notify_request_write_done(req); + continue; + } + + switch (req->dim) + { + case Request::DIM_1D: + { + switch (kind) + { + case XFER_FPGA_TO_DEV: + src_fpga->copy_to_fpga(req->dst_base, req->src_base, + req->dst_off, req->src_off, + req->nbytes, &req->event); + break; + case XFER_FPGA_FROM_DEV: + src_fpga->copy_from_fpga(req->dst_base, req->src_base, + req->dst_off, req->src_off, + req->nbytes, &req->event); + break; + case XFER_FPGA_IN_DEV: + src_fpga->copy_within_fpga(req->dst_base, req->src_base, + req->dst_off, req->src_off, + req->nbytes, &req->event); + break; + case XFER_FPGA_PEER_DEV: + src_fpga->copy_to_peer(req->dst_fpga, + req->dst_base, req->src_base, + req->dst_off, req->src_off, + req->nbytes, &req->event); + break; + case XFER_FPGA_COMP: + src_fpga->comp(req->dst_base, req->src_base, + req->dst_off, req->src_off, + req->nbytes, &req->event); + break; + default: + assert(0); + } + break; + } + + case Request::DIM_2D: + { + switch (kind) + { + case XFER_FPGA_TO_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_FROM_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_IN_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_PEER_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_COMP: + assert(0 && "not implemented"); + break; + default: + assert(0); + } + break; + } + + case Request::DIM_3D: + { + switch (kind) + { + case XFER_FPGA_TO_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_FROM_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_IN_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_PEER_DEV: + assert(0 && "not implemented"); + break; + case XFER_FPGA_COMP: + assert(0 && "not implemented"); + break; + default: + assert(0); + } + break; + } + + default: + assert(0); + } + } + + return nr; + } + + FPGAfillXferDes::FPGAfillXferDes(uintptr_t _dma_op, Channel *_channel, + NodeID _launch_node, XferDesID _guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int _priority, + const void *_fill_data, size_t _fill_size) + : XferDes(_dma_op, _channel, _launch_node, _guid, + inputs_info, outputs_info, + _priority, _fill_data, _fill_size) + { + kind = XFER_FPGA_IN_DEV; + + // no direct input data for us + assert(input_control.control_port_idx == -1); + input_control.current_io_port = -1; + } + + long FPGAfillXferDes::get_requests(Request **requests, long nr) + { + // unused + assert(0); + return 0; + } + + bool FPGAfillXferDes::progress_xd(FPGAfillChannel *channel, + TimeLimit work_until) + { + bool did_work = false; + ReadSequenceCache rseqcache(this, 2 << 20); + WriteSequenceCache wseqcache(this, 2 << 20); + + while (true) + { + size_t min_xfer_size = 4096; // TODO: make controllable + size_t max_bytes = get_addresses(min_xfer_size, &rseqcache); + if (max_bytes == 0) + break; + + XferPort *out_port = 0; + size_t out_span_start = 0; + if (output_control.current_io_port >= 0) + { + out_port = &output_ports[output_control.current_io_port]; + out_span_start = out_port->local_bytes_total; + } + + size_t total_bytes = 0; + if (out_port != 0) + { + // input and output both exist - transfer what we can + log_fpga.info() << "memfill chunk: min=" << min_xfer_size + << " max=" << max_bytes; + + uintptr_t out_base = reinterpret_cast(out_port->mem->get_direct_ptr(0, 0)); + uintptr_t initial_out_offset = out_port->addrcursor.get_offset(); + while (total_bytes < max_bytes) + { + AddressListCursor &out_alc = out_port->addrcursor; + + uintptr_t out_offset = out_alc.get_offset(); + + // the reported dim is reduced for partially consumed address + // ranges - whatever we get can be assumed to be regular + int out_dim = out_alc.get_dim(); + + size_t bytes = 0; + size_t bytes_left = max_bytes - total_bytes; + // memfills don't need to be particularly big to achieve + // peak efficiency, so trim to something that takes + // 10's of us to be responsive to the time limit + // NOTE: have to be a little careful and make sure the limit + // is a multiple of the fill size - we'll make it a power-of-2 + const size_t TARGET_CHUNK_SIZE = 256 << 10; // 256KB + if (bytes_left > TARGET_CHUNK_SIZE) + { + size_t max_chunk = fill_size; + while (max_chunk < TARGET_CHUNK_SIZE) + max_chunk <<= 1; + bytes_left = std::min(bytes_left, max_chunk); + } + + if (out_dim > 0) + { + size_t ocount = out_alc.remaining(0); + + // contig bytes is always the first dimension + size_t contig_bytes = std::min(ocount, bytes_left); + + // catch simple 1D case first + if ((contig_bytes == bytes_left) || + ((contig_bytes == ocount) && (out_dim == 1))) + { + bytes = contig_bytes; + // we only have one element worth of data, so fill + // multiple elements by using a "2d" copy with a + // source stride of 0 + size_t repeat_count = contig_bytes / fill_size; +#ifdef DEBUG_REALM + assert((contig_bytes % fill_size) == 0); +#endif + fpga_memcpy_2d(out_base + out_offset, fill_size, + reinterpret_cast(fill_data), 0, + fill_size, repeat_count); + out_alc.advance(0, bytes); + } + else + { + // grow to a 2D fill + assert(0 && "FPGA 2D fill not implemented"); + } + } + else + { + // scatter adddress list + assert(0); + } + +#ifdef DEBUG_REALM + assert(bytes <= bytes_left); +#endif + total_bytes += bytes; + + // stop if it's been too long, but make sure we do at least the + // minimum number of bytes + if ((total_bytes >= min_xfer_size) && work_until.is_expired()) + break; + } + for (size_t i = 0; i < 50; i++) + { + printf("%d ", ((int *)(channel->fpga->fpga_mem->base_ptr_sys))[i]); + } + printf("\n"); + + log_fpga.info() << "before write buffer, initial_out_offset " << initial_out_offset << " total_bytes " << total_bytes; + + cl_int err = 0; + OCL_CHECK(err, err = channel->fpga->command_queue.enqueueWriteBuffer(channel->fpga->buff, // buffer on the FPGA + CL_TRUE, // blocking call + initial_out_offset, // buffer offset in bytes + total_bytes, // Size in bytes + (void *)((uint64_t)(channel->fpga->fpga_mem->base_ptr_sys) + initial_out_offset), // Pointer to the data to copy + nullptr, nullptr)); + log_fpga.info() << "after write buffer"; + } + else + { + // fill with no output, so just count the bytes + total_bytes = max_bytes; + } + + // mem fill is always immediate, so handle both skip and copy with + // the same code + wseqcache.add_span(output_control.current_io_port, + out_span_start, total_bytes); + out_span_start += total_bytes; + + bool done = record_address_consumption(total_bytes, total_bytes); + + did_work = true; + + if (done || work_until.is_expired()) + break; + } + + rseqcache.flush(); + wseqcache.flush(); + + return did_work; + } + + FPGAfillChannel::FPGAfillChannel(FPGADevice *_fpga, BackgroundWorkManager *bgwork) + : SingleXDQChannel(bgwork, + XFER_GPU_IN_FB, + "FPGA fill channel"), + fpga(_fpga) + { + Memory temp_fpga_mem = fpga->fpga_mem->me; + + unsigned bw = 10; // TODO + unsigned latency = 0; + unsigned frag_overhead = 0; + add_path(Memory::NO_MEMORY, temp_fpga_mem, + bw, latency, frag_overhead, XFER_FPGA_IN_DEV); + + xdq.add_to_manager(bgwork); + } + + XferDes *FPGAfillChannel::create_xfer_des(uintptr_t dma_op, + NodeID launch_node, + XferDesID guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int priority, + XferDesRedopInfo redop_info, + const void *fill_data, size_t fill_size) + { + assert(redop_info.id == 0); + return new FPGAfillXferDes(dma_op, this, launch_node, guid, + inputs_info, outputs_info, + priority, + fill_data, fill_size); + } + + long FPGAfillChannel::submit(Request **requests, long nr) + { + // unused + assert(0); + return 0; + } + + }; // namespace FPGA +}; // namespace Realm diff --git a/runtime/realm/fpga/fpga_module.h b/runtime/realm/fpga/fpga_module.h new file mode 100644 index 0000000000..f29c3df99c --- /dev/null +++ b/runtime/realm/fpga/fpga_module.h @@ -0,0 +1,395 @@ +#ifndef REALM_FPGA_H +#define REALM_FPGA_H + +#include "realm/module.h" +#include "realm/proc_impl.h" +#include "realm/mem_impl.h" +#include "realm/runtime_impl.h" +#include "realm/transfer/channel.h" +#include "realm/circ_queue.h" +#include "realm/transfer/ib_memory.h" + +// OpenCL utility layer +#include "xcl2.hpp" + +namespace Realm +{ + namespace FPGA + { + class FPGAQueue; + class FPGADevice; + class FPGADeviceMemcpy; + class FPGARequest; + + class FPGACompletionNotification + { + public: + virtual ~FPGACompletionNotification(void) {} + + virtual void request_completed(void) = 0; + }; + + + class FPGACompletionEvent : public FPGACompletionNotification + { + public: + void request_completed(void); + + FPGARequest *req; + }; + + class FPGAWorker : public BackgroundWorkItem + { + public: + FPGAWorker(void); + virtual ~FPGAWorker(void); + // adds a stream that has work to be done + void add_queue(FPGAQueue *queue); + // processes work on queues, + // optionally sleeping for work to show up + // returns true if work remains to be done + bool process_queues(bool sleep_on_empty); + void start_background_thread(Realm::CoreReservationSet & + crs, + size_t stack_size); + void shutdown_background_thread(void); + bool do_work(TimeLimit work_until); + + public: + void thread_main(void); + + protected: + Mutex lock; + Mutex::CondVar condvar; + typedef Realm::CircularQueue ActiveQueue; + ActiveQueue active_queues; + // used by the background thread (if any) + Realm::CoreReservation *core_rsrv; + Realm::Thread *worker_thread; + bool thread_sleeping; + atomic worker_shutdown_requested; + }; + + + class FPGAWorkFence : public Realm::Operation::AsyncWorkItem + { + public: + FPGAWorkFence(Realm::Operation *op); + virtual void request_cancellation(void); + void enqueue(FPGAQueue *queue); + virtual void print(std::ostream &os) const; + }; + + class FPGAQueue + { + public: + FPGAQueue(FPGADevice *fpga_device, FPGAWorker *fpga_worker, cl::CommandQueue &command_queue); + ~FPGAQueue(void); + cl::CommandQueue &get_command_queue() const; + void add_fence(FPGAWorkFence *fence); + void add_notification(FPGACompletionNotification *notification); + bool reap_events(TimeLimit work_until); + void add_copy(FPGADeviceMemcpy *copy); + bool issue_copies(TimeLimit work_until); + void add_event(cl::Event opencl_event, + FPGAWorkFence *fence, + FPGACompletionNotification *n = 0); + + protected: + // may only be tested with lock held + bool has_work(void) const; + FPGADevice *fpga_device; + FPGAWorker *fpga_worker; + cl::CommandQueue &command_queue; + Mutex mutex; + struct PendingEvent + { + cl::Event opencl_event; + FPGAWorkFence *fence; + FPGACompletionNotification *notification; + }; + std::deque pending_events; + std::deque pending_copies; + }; + + class FPGADeviceMemory; + + class FPGADevice + { + public: + std::string name; + cl::Device device; + cl::Buffer buff; + cl::Buffer ib_buff; + cl::Context context; + cl::CommandQueue command_queue; + cl::Program program; + FPGADevice(cl::Device &device, std::string name, std::string xclbin, FPGAWorker *fpga_worker, size_t fpga_coprocessor_num_cu, std::string fpga_coprocessor_kernel); + ~FPGADevice(); + void create_fpga_mem(RuntimeImpl *runtime, size_t size); + void create_fpga_ib(RuntimeImpl *runtime, size_t size); + void create_dma_channels(RuntimeImpl *runtime); + void create_fpga_queues(); + void copy_to_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification); + void copy_from_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification); + void copy_within_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification); + void copy_to_peer(FPGADevice *dst_dev, void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification); + void comp(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification); + bool is_in_buff(void *ptr); + bool is_in_ib_buff(void *ptr); + FPGADeviceMemory *fpga_mem; + IBMemory *fpga_ib; + MemoryImpl *local_sysmem; + IBMemory *local_ibmem; + FPGAWorker *fpga_worker; + FPGAQueue *fpga_queue; + size_t fpga_coprocessor_num_cu; + std::string fpga_coprocessor_kernel; + }; + + enum FPGAMemcpyKind + { + FPGA_MEMCPY_HOST_TO_DEVICE, + FPGA_MEMCPY_DEVICE_TO_HOST, + FPGA_MEMCPY_DEVICE_TO_DEVICE, + FPGA_MEMCPY_PEER_TO_PEER, + }; + + // An abstract base class for all FPGA memcpy operations + class FPGADeviceMemcpy + { + public: + FPGADeviceMemcpy(FPGADevice *fpga_device, FPGAMemcpyKind kind, FPGACompletionNotification *notification); + virtual ~FPGADeviceMemcpy(void) {} + + public: + virtual void execute(FPGAQueue *queue) = 0; + + public: + FPGADevice *const fpga_device; + + protected: + FPGAMemcpyKind kind; + FPGACompletionNotification *notification; + }; + + class FPGADeviceMemcpy1D : public FPGADeviceMemcpy + { + public: + FPGADeviceMemcpy1D(FPGADevice *fpga_device, + void *dst, + const void *src, + size_t bytes, + off_t buff_offset, + FPGAMemcpyKind kind, + FPGACompletionNotification *notification); + + virtual ~FPGADeviceMemcpy1D(void); + + public: + virtual void execute(FPGAQueue *q); + + protected: + void *dst; + const void *src; + size_t elmt_size; + off_t buff_offset; + + private: + void do_span(off_t pos, size_t len); + FPGAQueue *fpga_queue; + }; + + class FPGAProcessor : public LocalTaskProcessor + { + public: + FPGAProcessor(FPGADevice *fpga_device, Processor me, Realm::CoreReservationSet &crs); + virtual ~FPGAProcessor(void); + static FPGAProcessor *get_current_fpga_proc(void); + FPGADevice *fpga_device; + + protected: + Realm::CoreReservation *core_rsrv_; + }; + + class FPGADeviceMemory : public LocalManagedMemory + { + public: + FPGADeviceMemory(Memory memory, FPGADevice *device, void *base_ptr_sys, size_t size); + virtual ~FPGADeviceMemory(void); + virtual void get_bytes(off_t offset, void *dst, size_t size); + virtual void put_bytes(off_t offset, const void *src, size_t size); + virtual void *get_direct_ptr(off_t offset, size_t size); + + FPGADevice *get_device() const { return device; }; + void *get_mem_base_sys() const { return base_ptr_sys; }; + FPGADevice *device; + void *base_ptr_sys; + }; + + class FPGARequest : public Request + { + public: + const void *src_base; + void *dst_base; + FPGADevice *dst_fpga; + FPGACompletionEvent event; + }; + + class FPGAChannel; + + class FPGAXferDes : public XferDes + { + public: + FPGAXferDes(uintptr_t _dma_op, Channel *_channel, + NodeID _launch_node, XferDesID _guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int _priority, XferDesKind kind); + + ~FPGAXferDes() + { + while (!available_reqs.empty()) + { + FPGARequest *fpga_req = (FPGARequest *)available_reqs.front(); + available_reqs.pop(); + delete fpga_req; + } + } + + long default_get_requests_tentative(Request **requests, long nr, unsigned flags); + long get_requests(Request **requests, long nr); + void notify_request_read_done(Request *req); + void notify_request_write_done(Request *req); + void flush(); + + bool progress_xd(FPGAChannel *channel, TimeLimit work_until); + + private: + FPGADevice *src_fpga; + FPGADevice *dst_fpga; + }; + + class FPGAChannel : public SingleXDQChannel + { + public: + FPGAChannel(FPGADevice *_src_fpga, XferDesKind _kind, + BackgroundWorkManager *bgwork); + ~FPGAChannel(); + + // TODO: multiple concurrent copies not ok for now + static const bool is_ordered = true; + + virtual XferDes *create_xfer_des(uintptr_t dma_op, + NodeID launch_node, + XferDesID guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int priority, + XferDesRedopInfo redop_info, + const void *fill_data, size_t fill_size); + + long submit(Request **requests, long nr); + + private: + FPGADevice *src_fpga; + }; + + class FPGAfillChannel; + + class FPGAfillXferDes : public XferDes + { + public: + FPGAfillXferDes(uintptr_t _dma_op, Channel *_channel, + NodeID _launch_node, XferDesID _guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int _priority, + const void *_fill_data, size_t _fill_size); + + long get_requests(Request **requests, long nr); + + bool progress_xd(FPGAfillChannel *channel, TimeLimit work_until); + + protected: + size_t reduced_fill_size; + }; + + class FPGAfillChannel : public SingleXDQChannel + { + public: + FPGAfillChannel(FPGADevice *_fpga, BackgroundWorkManager *bgwork); + + // TODO: multiple concurrent fills not ok for now + static const bool is_ordered = true; + + virtual XferDes *create_xfer_des(uintptr_t dma_op, + NodeID launch_node, + XferDesID guid, + const std::vector &inputs_info, + const std::vector &outputs_info, + int priority, + XferDesRedopInfo redop_info, + const void *fill_data, size_t fill_size); + + long submit(Request **requests, long nr); + + protected: + friend class FPGAfillXferDes; + FPGADevice *fpga; + }; + + class FPGAModule : public Module + { + protected: + FPGAModule(void); + + public: + virtual ~FPGAModule(void); + + static Module *create_module(RuntimeImpl *runtime, std::vector &cmdline); + + // do any general initialization - this is called after all configuration is + // complete + virtual void initialize(RuntimeImpl *runtime); + + // create any memories provided by this module (default == do nothing) + // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id) + virtual void create_memories(RuntimeImpl *runtime); + + // create any processors provided by the module (default == do nothing) + // (each new ProcessorImpl should use a Processor from + // RuntimeImpl::next_local_processor_id) + virtual void create_processors(RuntimeImpl *runtime); + + // create any DMA channels provided by the module (default == do nothing) + virtual void create_dma_channels(RuntimeImpl *runtime); + + // create any code translators provided by the module (default == do nothing) + virtual void create_code_translators(RuntimeImpl *runtime); + + // clean up any common resources created by the module - this will be called + // after all memories/processors/etc. have been shut down and destroyed + virtual void cleanup(void); + + public: + size_t cfg_num_fpgas; + bool cfg_use_worker_threads; + bool cfg_use_shared_worker; + size_t cfg_fpga_mem_size; + size_t cfg_fpga_ib_size; + FPGAWorker *shared_worker; + std::map dedicated_workers; + std::string cfg_fpga_xclbin; + std::vector fpga_devices; + size_t cfg_fpga_coprocessor_num_cu; + std::string cfg_fpga_coprocessor_kernel; + + protected: + std::vector fpga_procs_; + }; + + }; // namespace FPGA +}; // namespace Realm + +#endif diff --git a/runtime/realm/fpga/fpga_utils.cc b/runtime/realm/fpga/fpga_utils.cc new file mode 100644 index 0000000000..6da890e233 --- /dev/null +++ b/runtime/realm/fpga/fpga_utils.cc @@ -0,0 +1,64 @@ +#include "realm/fpga/fpga_module.h" +#include "realm/fpga/fpga_utils.h" +#include "realm/logging.h" + +namespace Realm +{ + namespace FPGA + { + extern Logger log_fpga; + } +} + +extern "C" +{ + using namespace Realm; + using namespace Realm::FPGA; + + REALM_PUBLIC_API cl::Device FPGAGetCurrentDevice(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + cl::Device ret = p->fpga_device->device; + log_fpga.info() << "FPGAGetCurrentDevice()"; + return ret; + } + + REALM_PUBLIC_API cl::Context FPGAGetCurrentContext(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + cl::Context ret = p->fpga_device->context; + log_fpga.info() << "FPGAGetCurrentContext()"; + return ret; + } + + REALM_PUBLIC_API cl::Program FPGAGetCurrentProgram(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + cl::Program ret = p->fpga_device->program; + log_fpga.info() << "FPGAGetCurrentProgram()"; + return ret; + } + + REALM_PUBLIC_API cl::Buffer FPGAGetCurrentBuffer(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + cl::Buffer ret = p->fpga_device->buff; + log_fpga.info() << "FPGAGetCurrentBuffer()"; + return ret; + } + + REALM_PUBLIC_API cl::CommandQueue FPGAGetCurrentCommandQueue(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + cl::CommandQueue ret = p->fpga_device->command_queue; + log_fpga.info() << "FPGAGetCurrentCommandQueue()"; + return ret; + } + + REALM_PUBLIC_API void *FPGAGetBasePtrSys(void) + { + FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc(); + void *ret = p->fpga_device->fpga_mem->base_ptr_sys; + return ret; + } +} diff --git a/runtime/realm/fpga/fpga_utils.h b/runtime/realm/fpga/fpga_utils.h new file mode 100644 index 0000000000..45b53ead53 --- /dev/null +++ b/runtime/realm/fpga/fpga_utils.h @@ -0,0 +1,14 @@ +#ifndef FPGA_UTILS_H +#define FPGA_UTILS_H + +#include "xcl2.hpp" +extern "C" +{ + cl::Device FPGAGetCurrentDevice(void); + cl::Context FPGAGetCurrentContext(void); + cl::Program FPGAGetCurrentProgram(void); + cl::Buffer FPGAGetCurrentBuffer(void); + cl::CommandQueue FPGAGetCurrentCommandQueue(void); + void *FPGAGetBasePtrSys(void); +} +#endif // FPGA_UTILS_H diff --git a/runtime/realm/fpga/xcl2.cc b/runtime/realm/fpga/xcl2.cc new file mode 100644 index 0000000000..fdc4f8a27b --- /dev/null +++ b/runtime/realm/fpga/xcl2.cc @@ -0,0 +1,157 @@ +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ + +#include "xcl2.hpp" +#include +#include +#include +#include +#include +#if defined(_WINDOWS) +#include +#else +#include +#endif + +namespace xcl { +std::vector get_devices(const std::string& vendor_name) { + size_t i; + cl_int err; + std::vector platforms; + OCL_CHECK(err, err = cl::Platform::get(&platforms)); + cl::Platform platform; + for (i = 0; i < platforms.size(); i++) { + platform = platforms[i]; + OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); + if (!(platformName.compare(vendor_name))) { + std::cout << "Found Platform" << std::endl; + std::cout << "Platform Name: " << platformName.c_str() << std::endl; + break; + } + } + if (i == platforms.size()) { + std::cout << "Error: Failed to find Xilinx platform" << std::endl; + std::cout << "Found the following platforms : " << std::endl; + for (size_t j = 0; j < platforms.size(); j++) { + platform = platforms[j]; + OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); + std::cout << "Platform Name: " << platformName.c_str() << std::endl; + } + exit(EXIT_FAILURE); + } + // Getting ACCELERATOR Devices and selecting 1st such device + std::vector devices; + OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); + return devices; +} + +std::vector get_xil_devices() { + return get_devices("Xilinx"); +} + +cl::Device find_device_bdf(const std::vector& devices, const std::string& bdf) { + char device_bdf[20]; + cl_int err; + cl::Device device; + int cnt = 0; + for (uint32_t i = 0; i < devices.size(); i++) { + OCL_CHECK(err, err = devices[i].getInfo(CL_DEVICE_PCIE_BDF, &device_bdf)); + if (bdf == device_bdf) { + device = devices[i]; + cnt++; + break; + } + } + if (cnt == 0) { + std::cout << "Invalid device bdf. Please check and provide valid bdf\n"; + exit(EXIT_FAILURE); + } + return device; +} +std::vector read_binary_file(const std::string& xclbin_file_name) { + std::cout << "INFO: Reading " << xclbin_file_name << std::endl; + FILE* fp; + if ((fp = fopen(xclbin_file_name.c_str(), "r")) == nullptr) { + printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); + exit(EXIT_FAILURE); + } + // Loading XCL Bin into char buffer + std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; + std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); + bin_file.seekg(0, bin_file.end); + auto nb = bin_file.tellg(); + bin_file.seekg(0, bin_file.beg); + std::vector buf; + buf.resize(nb); + bin_file.read(reinterpret_cast(buf.data()), nb); + return buf; +} + +bool is_emulation() { + bool ret = false; + char* xcl_mode = getenv("XCL_EMULATION_MODE"); + if (xcl_mode != nullptr) { + ret = true; + } + return ret; +} + +bool is_hw_emulation() { + bool ret = false; + char* xcl_mode = getenv("XCL_EMULATION_MODE"); + if ((xcl_mode != nullptr) && !strcmp(xcl_mode, "hw_emu")) { + ret = true; + } + return ret; +} +double round_off(double n) { + double d = n * 100.0; + int i = d + 0.5; + d = i / 100.0; + return d; +} + +std::string convert_size(size_t size) { + static const char* SIZES[] = {"B", "KB", "MB", "GB"}; + uint32_t div = 0; + size_t rem = 0; + + while (size >= 1024 && div < (sizeof SIZES / sizeof *SIZES)) { + rem = (size % 1024); + div++; + size /= 1024; + } + + double size_d = (float)size + (float)rem / 1024.0; + double size_val = round_off(size_d); + + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << size_val; + std::string size_str = stream.str(); + std::string result = size_str + " " + SIZES[div]; + return result; +} + +bool is_xpr_device(const char* device_name) { + const char* output = strstr(device_name, "xpr"); + + if (output == nullptr) { + return false; + } else { + return true; + } +} +}; // namespace xcl diff --git a/runtime/realm/fpga/xcl2.hpp b/runtime/realm/fpga/xcl2.hpp new file mode 100644 index 0000000000..aa52839dc4 --- /dev/null +++ b/runtime/realm/fpga/xcl2.hpp @@ -0,0 +1,137 @@ +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ + +#pragma once + +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS + +// OCL_CHECK doesn't work if call has templatized function call +#define OCL_CHECK(error, call) \ + call; \ + if (error != CL_SUCCESS) { \ + printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \ + exit(EXIT_FAILURE); \ + } + +#include +#include +#include +#include +// When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the +// hood +// User ptr is used if and only if it is properly aligned (page aligned). When +// not +// aligned, runtime has no choice but to create its own host side buffer that +// backs +// user ptr. This in turn implies that all operations that move data to and from +// device incur an extra memcpy to move data to/from runtime's own host buffer +// from/to user pointer. So it is recommended to use this allocator if user wish +// to +// Create Buffer/Memory Object with CL_MEM_USE_HOST_PTR to align user buffer to +// the +// page boundary. It will ensure that user buffer will be used when user create +// Buffer/Mem Object with CL_MEM_USE_HOST_PTR. +template +struct aligned_allocator { + using value_type = T; + + aligned_allocator() {} + + aligned_allocator(const aligned_allocator&) {} + + template + aligned_allocator(const aligned_allocator&) {} + + T* allocate(std::size_t num) { + void* ptr = nullptr; + +#if defined(_WINDOWS) + { + ptr = _aligned_malloc(num * sizeof(T), 4096); + if (ptr == nullptr) { + std::cout << "Failed to allocate memory" << std::endl; + exit(EXIT_FAILURE); + } + } +#else + { + if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc(); + } +#endif + return reinterpret_cast(ptr); + } + void deallocate(T* p, std::size_t num) { +#if defined(_WINDOWS) + _aligned_free(p); +#else + free(p); +#endif + } +}; + +namespace xcl { +std::vector get_xil_devices(); +std::vector get_devices(const std::string& vendor_name); +cl::Device find_device_bdf(const std::vector& devices, const std::string& bdf); +std::string convert_size(size_t size); +std::vector read_binary_file(const std::string& xclbin_file_name); +bool is_emulation(); +bool is_hw_emulation(); +bool is_xpr_device(const char* device_name); +class Stream { + public: + static decltype(&clCreateStream) createStream; + static decltype(&clReleaseStream) releaseStream; + static decltype(&clReadStream) readStream; + static decltype(&clWriteStream) writeStream; + static decltype(&clPollStreams) pollStreams; + static void init(const cl_platform_id& platform) { + void* bar = clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream"); + createStream = (decltype(&clCreateStream))bar; + bar = clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream"); + releaseStream = (decltype(&clReleaseStream))bar; + bar = clGetExtensionFunctionAddressForPlatform(platform, "clReadStream"); + readStream = (decltype(&clReadStream))bar; + bar = clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream"); + writeStream = (decltype(&clWriteStream))bar; + bar = clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams"); + pollStreams = (decltype(&clPollStreams))bar; + } +}; +class P2P { + public: + static decltype(&xclGetMemObjectFd) getMemObjectFd; + static decltype(&xclGetMemObjectFromFd) getMemObjectFromFd; + static void init(const cl_platform_id& platform) { + void* bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetMemObjectFd"); + getMemObjectFd = (decltype(&xclGetMemObjectFd))bar; + bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetMemObjectFromFd"); + getMemObjectFromFd = (decltype(&xclGetMemObjectFromFd))bar; + } +}; +class Ext { + public: + static decltype(&xclGetComputeUnitInfo) getComputeUnitInfo; + static void init(const cl_platform_id& platform) { + void* bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetComputeUnitInfo"); + getComputeUnitInfo = (decltype(&xclGetComputeUnitInfo))bar; + } +}; +} diff --git a/runtime/realm/machine_impl.cc b/runtime/realm/machine_impl.cc index ccb1fdf9da..a2956eca31 100644 --- a/runtime/realm/machine_impl.cc +++ b/runtime/realm/machine_impl.cc @@ -409,6 +409,7 @@ namespace Realm { // these can be the source of a RemoteWrite, but it's non-ideal case Memory::SYSTEM_MEM: case Memory::Z_COPY_MEM: + case Memory::FPGA_MEM: { send_ok = true; recv_ok = true; diff --git a/runtime/realm/mem_impl.h b/runtime/realm/mem_impl.h index 1a21fdcca2..149f9170d7 100644 --- a/runtime/realm/mem_impl.h +++ b/runtime/realm/mem_impl.h @@ -61,6 +61,9 @@ namespace Realm { MKIND_FILE, // file memory accessible by owner node #ifdef REALM_USE_HDF5 MKIND_HDF, // HDF memory accessible by owner node +#endif +#ifdef REALM_USE_FPGA + MKIND_FPGA, // FPGA device memory #endif }; diff --git a/runtime/realm/module.cc b/runtime/realm/module.cc index c068693847..f1fdc0362d 100644 --- a/runtime/realm/module.cc +++ b/runtime/realm/module.cc @@ -78,6 +78,11 @@ REGISTER_REALM_MODULE_STATIC(Realm::LLVMJit::LLVMJitModule); REGISTER_REALM_MODULE_STATIC(Realm::HDF5::HDF5Module); #endif +#if defined REALM_USE_FPGA +#include "realm/fpga/fpga_module.h" +REGISTER_REALM_MODULE_STATIC(Realm::FPGA::FPGAModule); +#endif + #ifdef REALM_USE_GASNET1 #include "realm/gasnet1/gasnet1_module.h" REGISTER_REALM_NETWORK_MODULE_STATIC(Realm::GASNet1Module); diff --git a/runtime/realm/realm_c.h b/runtime/realm/realm_c.h index 5600b88d01..1e059b88b0 100644 --- a/runtime/realm/realm_c.h +++ b/runtime/realm/realm_c.h @@ -51,7 +51,8 @@ typedef unsigned long long realm_barrier_timestamp_t; __op__(PROC_GROUP, "Processor group") \ __op__(PROC_SET, "Set of Processors for OpenMP/Kokkos etc.") \ __op__(OMP_PROC, "OpenMP (or similar) thread pool") \ - __op__(PY_PROC, "Python interpreter") + __op__(PY_PROC, "Python interpreter") \ + __op__(FPGA_PROC, "FPGA") typedef enum realm_processor_kind_t { #define C_ENUMS(name, desc) name, @@ -74,7 +75,8 @@ typedef enum realm_processor_kind_t { __op__(LEVEL3_CACHE, "CPU L3 Visible to all processors on the node, better performance to processors on same socket") \ __op__(LEVEL2_CACHE, "CPU L2 Visible to all processors on the node, better performance to one processor") \ __op__(LEVEL1_CACHE, "CPU L1 Visible to all processors on the node, better performance to one processor") \ - __op__(GPU_MANAGED_MEM, "Managed memory that can be cached by either host or GPU") + __op__(GPU_MANAGED_MEM, "Managed memory that can be cached by either host or GPU") \ + __op__(FPGA_MEM, "FPGA device memory") typedef enum realm_memory_kind_t { #define C_ENUMS(name, desc) name, diff --git a/runtime/realm/transfer/channel.h b/runtime/realm/transfer/channel.h index 88393d6b28..4642b4a5bc 100644 --- a/runtime/realm/transfer/channel.h +++ b/runtime/realm/transfer/channel.h @@ -72,6 +72,11 @@ namespace Realm { XFER_FILE_WRITE, XFER_ADDR_SPLIT, XFER_MEM_FILL, + XFER_FPGA_TO_DEV, + XFER_FPGA_FROM_DEV, + XFER_FPGA_IN_DEV, + XFER_FPGA_PEER_DEV, + XFER_FPGA_COMP, }; class Request { diff --git a/runtime/runtime.mk b/runtime/runtime.mk index 78d3eb27eb..7e8cc48167 100644 --- a/runtime/runtime.mk +++ b/runtime/runtime.mk @@ -697,6 +697,25 @@ ifeq ($(strip $(USE_HDF)), 1) endif endif +# Realm doesn't use FPGA by default +USE_FPGA ?= 0 +ifeq ($(strip $(USE_FPGA)), 1) + REALM_CC_FLAGS += -DREALM_USE_FPGA + LEGION_CC_FLAGS += -DLEGION_USE_FPGA + # provide this for backward-compatibility in applications + CC_FLAGS += -DUSE_FPGA -I$(XILINX_VIVADO)/include -I$(XILINX_XRT)/include + FC_FLAGS += -DUSE_FPGA + LEGION_LD_FLAGS += -L$(XILINX_XRT)/lib -lOpenCL -pthread + ifdef FPGA_ROOT + CC_FLAGS += -I$(FPGA_ROOT)/include + FC_FLAGS += -I$(FPGA_ROOT)/include + LD_FLAGS += -L$(FPGA_ROOT)/lib + else + CC_FLAGS += -I/usr/include + FC_FLAGS += -I/usr/include + endif +endif + SKIP_MACHINES= titan% daint% excalibur% cori% # use mpi{cc,cxx,f90} compiler wrappers if USE_MPI=1 ifeq ($(strip $(USE_MPI)),1) @@ -935,6 +954,12 @@ REALM_SRC += $(LG_RT_DIR)/realm/hdf5/hdf5_module.cc \ $(LG_RT_DIR)/realm/hdf5/hdf5_internal.cc \ $(LG_RT_DIR)/realm/hdf5/hdf5_access.cc endif +ifeq ($(strip $(USE_FPGA)),1) +REALM_SRC += $(LG_RT_DIR)/realm/fpga/fpga_module.cc \ + $(LG_RT_DIR)/realm/fpga/fpga_utils.cc \ + $(LG_RT_DIR)/realm/fpga/xcl2.cc +endif + REALM_SRC += $(LG_RT_DIR)/realm/activemsg.cc \ $(LG_RT_DIR)/realm/nodeset.cc \ $(LG_RT_DIR)/realm/network.cc