From 2ffb0c11141448c38dc17bd8014c381eca707af8 Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Fri, 6 Sep 2024 14:04:59 +0200 Subject: [PATCH] build based on 7273a14 --- dev/custom_xclbin/index.html | 2 +- dev/examples/high_level_basics/index.html | 15 +++++++++------ dev/examples/stream/index.html | 2 +- dev/examples/xrt_api/index.html | 2 +- dev/high_level/index.html | 2 +- dev/index.html | 2 +- dev/refs/api/index.html | 17 ++++++++--------- dev/search/index.html | 2 +- dev/search_index.js | 2 +- 9 files changed, 24 insertions(+), 22 deletions(-) diff --git a/dev/custom_xclbin/index.html b/dev/custom_xclbin/index.html index d30f096..88d00c7 100644 --- a/dev/custom_xclbin/index.html +++ b/dev/custom_xclbin/index.html @@ -11,4 +11,4 @@ "clocks" => Any[Object{Nothing, String}("port_name"=>"ap_clk", "id"=>"0", "requested_frequency"=>0, "achieved_frequency"=>0)] "reset_port_names" => Any["ap_rst_n"] "slr_resources" => Any[Object{Nothing, String}("slr_name"=>"SLR0", "resource_utilization"=>Any[Object{Nothing, String}("resource_name"=>"LUT", "used"=>"1328", "available"=>"439680"), Object{Nothing, String}("resource_name"=>"LUTAsLogic", "used"=>"975", "availabl… - + diff --git a/dev/examples/high_level_basics/index.html b/dev/examples/high_level_basics/index.html index c2a3d33..d63f2bb 100644 --- a/dev/examples/high_level_basics/index.html +++ b/dev/examples/high_level_basics/index.html @@ -1,5 +1,5 @@ -Auto-generate Kernel Interface · XRT.jl

Example: Auto-generated Kernel Interfaces

This example executes a kernel on the FPGA that takes one buffer as output and two scalar values as input. The prepare_bitstream function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data. Buffer synchronization is handled automatically by XRT.jl. An example code for the execution of a kernel dummyKernel like this:

void dummyKernel(char* a, char validate, int count) {
+Auto-generate Kernel Interface · XRT.jl

Example: Auto-generated Kernel Interfaces

This example executes a kernel on the FPGA that takes one buffer as output and two scalar values as input. The @prepare_bitstream macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its metadata. Buffer synchronization is handled automatically by XRT.jl. For more information, refer to High Level Abstractions for Kernel Executions.

An example code for the execution of a kernel dummyKernel like this:

void dummyKernel(char* a, char validate, int count) {
     for (int i=0; i<count; i++) {
         a[i] = validate;
     }
@@ -9,12 +9,15 @@
 # Allocate an output array
 a = Array{UInt8}(MemAlign(4096),1)
 
-# Load the bitstream to the FPGA and generate functions 
-# for each kernel
-bs = XRT.prepare_bitstream("communication_PCIE.xclbin")
+# Create a module that should contain the generated functions 
+# of the bitstream
+module Bitstream
+    using XRT
+    @prepare_bitstream("communication_PCIE.xclbin")
+end
 
 # execute the dummyKernel kernel
-bs.dummyKernel!(a, UInt8(1),1)
+Bitstream.dummyKernel!(a, UInt8(1),1)
 
 # validate the execution results
-@assert all(a .== UInt8(1))
+@assert all(a .== UInt8(1))
diff --git a/dev/examples/stream/index.html b/dev/examples/stream/index.html index cd8d82a..45dd6d2 100644 --- a/dev/examples/stream/index.html +++ b/dev/examples/stream/index.html @@ -17,4 +17,4 @@ [ Info: Done device process sw_emu_device done Kernel Name: k1, CU Number: 0, Status: Shutdown -Kernel Name: k2, CU Number: 1, Status: Shutdown

Note, that the measured bandwidth is relatively low because software emulation is used. To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly by updating the bitstream() function.

+Kernel Name: k2, CU Number: 1, Status: Shutdown

Note that the measured bandwidth is relatively low because software emulation is used. To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly in the @prepare_bitstream line.

diff --git a/dev/examples/xrt_api/index.html b/dev/examples/xrt_api/index.html index c3eb3fc..cea3c13 100644 --- a/dev/examples/xrt_api/index.html +++ b/dev/examples/xrt_api/index.html @@ -25,4 +25,4 @@ wait(r) # Read back and validate output data sync!(xa, XRT.XCL_BO_SYNC_BO_FROM_DEVICE) -@assert all(xa .== UInt8(1)) +@assert all(xa .== UInt8(1)) diff --git a/dev/high_level/index.html b/dev/high_level/index.html index 8954833..1a9ab60 100644 --- a/dev/high_level/index.html +++ b/dev/high_level/index.html @@ -1,2 +1,2 @@ -High Level Execution · XRT.jl

High Level Abstractions for Kernel Executions

Based on our Custom XCLBIN Parser, XRT.jl provides a method to generate ready-to-use functions for the execution of individual kernels in the bitstream: prepare_bitstream(path; device).

This function will create a new module with a function for each kernel in the provided bitstream and load the bitstream on an FPGA device. Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an AbstractArray, it will be automatically copied to the FPGA memory before execution and back after execution.

See Example: Auto-generated Kernel Interfaces and Example: STREAM Benchmark for examples, how this approach can be used to execute compute kernels on the FPGA.

+High Level Execution · XRT.jl

High Level Abstractions for Kernel Executions

Based on our Custom XCLBIN Parser, XRT.jl provides a macro to generate ready-to-use functions for the execution of individual kernels in the bitstream: @prepare_bitstream.

This macro will generate a function for each kernel in the provided bitstream. Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an AbstractArray, it will be automatically copied to the FPGA memory before execution and back after execution. All generated functions come with a keyword parameter device which can be used to specify the device the kernel should be executed on. If the bitstream is not already programmed on the device, this will be done automatically before executing the kernel.

See Example: Auto-generated Kernel Interfaces and Example: STREAM Benchmark for examples of how this approach can be used to execute compute kernels on the FPGA.

diff --git a/dev/index.html b/dev/index.html index 5051ea3..19dfcaf 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,2 +1,2 @@ -Installation · XRT.jl

XRT.jl

Installation

Note: Only Linux and Windows x86_64 systems are supported!

The package is not registered. You can use

] add https://github.com/pc2/XRT.jl

to add the package to your Julia environment.

The following dependencies have to be installed to use XRT.jl:

  • Xilinx Vitis for features like software or hardware emulation

XRT is contained in the xrt_jll package in version 2.17. If a native installation of XRT should be used, set the XILINX_XRT environment variable to the path of the local installation. XRT with the native C++ interface +2.14 are supported.

Known Issues

  • The build in XRT implementation is unable to find a device even when Vitis HLS is installed and the XCL_EMULATION_MODE variable is set.
+Installation · XRT.jl

XRT.jl

Installation

Note: Only Linux and Windows x86_64 systems are supported!

The package is not registered. You can use

] add https://github.com/pc2/XRT.jl

to add the package to your Julia environment.

The following dependencies have to be installed to use XRT.jl:

  • Xilinx Vitis for features like software or hardware emulation

XRT is contained in the xrt_jll package in version 2.17. If a native installation of XRT should be used, set the XILINX_XRT environment variable to the path of the local installation. XRT versions 2.14 and newer with the native C++ interface are supported.

Known Issues

  • The built-in XRT implementation is unable to find a device even when Vitis HLS is installed and the XCL_EMULATION_MODE variable is set.
diff --git a/dev/refs/api/index.html b/dev/refs/api/index.html index 6cb407d..56aaafe 100644 --- a/dev/refs/api/index.html +++ b/dev/refs/api/index.html @@ -1,10 +1,9 @@ -API · XRT.jl

API

Index

References

XRT.XRTWrap.KernelMethod
Kernel(device, uuid, name)
-

Create a new kernel instance using a device, bitstream uuid, and kernel name.

source
XRT.XRTWrap.RunMethod
Run(kernel, arg1, args; autostart)
-

Execute a kernel with the given arguments. To automatically start the execution, set autostart to true. Otherwise, the execution has to be explicitly started by calling start(run::Run)

source
Base.waitMethod
wait(run)
-

Wait for a given Run object to complete execution. The method will return as soon as the execution is completed.

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
-

Set the argument for a kernel at the given index. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
-

Set the argument for a kernel at the given index to a given BOArray. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
-

Set the argument for a kernel at the given index to a given BO. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.BOArrayMethod
BOArray(device, userdata, mem; flags)
-

Array data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.

source
XRT.BOArrayMethod

Array data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.

source
XRT.get_kernel_infoMethod
get_kernel_info(xclbin_path::String)

Get information about contained kernels, instances, arguments and their register offsets...

source
XRT.get_section_stringMethod
get_section_string(xclbin_path::String, type::SectionType)

Get the specified raw section from the bitstream file xclbin_path.

source
XRT.get_system_infoMethod
get_system_info(xclbin_path::String)

Get information about resource utilization and connectivity

source
XRT.prepare_bitstreamMethod
prepare_bitstream(path; device)
-

Load a bitstream to an FPGA and generate interfaces for the included kernels. Returns a module with functions representing the kernels of the bitstream.

path: Path to the bitstream file (xclbin) device: XRT device to write the bitstream to

source
+API · XRT.jl

API

Index

References

XRT.XRTWrap.KernelMethod
Kernel(device, uuid, name)
+

Create a new kernel instance using a device, bitstream uuid, and kernel name.

source
XRT.XRTWrap.RunMethod
Run(kernel, arg1, args; autostart)
+

Execute a kernel with the given arguments. To automatically start the execution, set autostart to true. Otherwise, the execution has to be explicitly started by calling start(run::Run)

source
Base.waitMethod
wait(run)
+

Wait for a given Run object to complete execution. The method will return as soon as the execution is completed.

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
+

Set the argument for a kernel at the given index. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
+

Set the argument for a kernel at the given index to a given BOArray. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.XRTWrap.set_arg!Method
set_arg!(run, index, val)
+

Set the argument for a kernel at the given index to a given BO. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!

source
XRT.BOArrayMethod
BOArray(device, userdata, mem; flags)
+

Array data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.

source
XRT.BOArrayMethod

Array data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.

source
XRT.get_kernel_infoMethod
get_kernel_info(xclbin_path::String)

Get information about contained kernels, instances, arguments and their register offsets...

source
XRT.get_section_stringMethod
get_section_string(xclbin_path::String, type::SectionType)

Get the specified raw section from the bitstream file xclbin_path.

source
XRT.get_system_infoMethod
get_system_info(xclbin_path::String)

Get information about resource utilization and connectivity

source
XRT.@prepare_bitstreamMacro

Parse a bitstream and generate functions for the included kernels. The functions will automatically copy all relevant buffers to the FPGA memory, and execute the Kernel.

It is recommended to generate the kernel functions in a separate module like this:

Julia module DummyBitstream using XRT @prepare_bitstream("my_bitstream.xclbin") end

Afterwards, you find the functions for each kernel in the module. To execute the kernel on a specific device, use the device keyword parameter:

Julia DummyBitstream.kernel_name!(args...; device=XRT.Device(0))

source
diff --git a/dev/search/index.html b/dev/search/index.html index 0e82ba9..c5a703d 100644 --- a/dev/search/index.html +++ b/dev/search/index.html @@ -1,2 +1,2 @@ -Search · XRT.jl

Loading search...

    +Search · XRT.jl

    Loading search...

      diff --git a/dev/search_index.js b/dev/search_index.js index 4f3dc9e..f70d9bb 100644 --- a/dev/search_index.js +++ b/dev/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"examples/high_level_basics/#Example:-Auto-generated-Kernel-Interfaces","page":"Auto-generate Kernel Interface","title":"Example: Auto-generated Kernel Interfaces","text":"","category":"section"},{"location":"examples/high_level_basics/","page":"Auto-generate Kernel Interface","title":"Auto-generate Kernel Interface","text":"This example executes a kernel on the FPGA that takes one buffer as output and two scalar values as input. The prepare_bitstream function can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data. Buffer synchronization is handled automatically by XRT.jl. An example code for the execution of a kernel dummyKernel like this:","category":"page"},{"location":"examples/high_level_basics/","page":"Auto-generate Kernel Interface","title":"Auto-generate Kernel Interface","text":"void dummyKernel(char* a, char validate, int count) {\n for (int i=0; i 0, also the second input array in2 will be processed and added to the result of scaling array in1. The final result will be stored in the array out.","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"In our example code, we will execute the triad kernel of the STREAM benchmark and measure the total execution time of the kernel. Since we are using the high-level wrapper to create our kernel method, all buffers will be read and written to and from the FPGA before the actual kernel execution. 
","category":"page"},{"location":"examples/stream/#Building","page":"STREAM TRIAD Example","title":"Building","text":"","category":"section"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"To execute this example, Vitis HLS needs to be installed on the system. The FPGA bitstream first has to be build using the provided Makefile. A emulation target can be specified using the TARGET parameter to build for software emulation (sw_emu), hardware emulation (hw_emu), or hardware (hw). To build the bitstream for software emulation for the Alveo U280, run the following command:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"make all TARGET=sw_emu","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"After successful build, the julia code can be executed:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"env XCL_EMULATION_MODE=sw_emu julia --project stream_fpga.jl","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"The output should look similar to this:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"Kernel Name: k1, CU Number: 0, Thread creation status: success\nKernel Name: k2, CU Number: 1, Thread creation status: success\n[ Info: Execute kernel test run\nKernel Name: k1, CU Number: 0, State: Start\nKernel Name: k1, CU Number: 0, State: Running\nKernel Name: k1, CU Number: 0, State: Idle\n[ Info: Execute full kernel run TRIAD\nKernel Name: k1, CU Number: 0, State: Start\nKernel Name: k1, CU Number: 0, State: Running\nKernel Name: k1, CU Number: 0, State: Idle\n[ Info: Execution time: 0.148555856 seconds\n[ Info: Measured bandwidth: 0.5082093296948187 GB/s\n[ Info: Validate 
output\n[ Info: Done\ndevice process sw_emu_device done\nKernel Name: k1, CU Number: 0, Status: Shutdown\nKernel Name: k2, CU Number: 1, Status: Shutdown","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"Note, that the measured bandwidth is relatively low because software emulation is used. To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly by updating the bitstream() function.","category":"page"},{"location":"refs/api/#API","page":"API","title":"API","text":"","category":"section"},{"location":"refs/api/#Index","page":"API","title":"Index","text":"","category":"section"},{"location":"refs/api/","page":"API","title":"API","text":"Pages = [\"api.md\"]\nOrder = [:function, :type]","category":"page"},{"location":"refs/api/#References","page":"API","title":"References","text":"","category":"section"},{"location":"refs/api/","page":"API","title":"API","text":"Modules = [XRT]\nPages = [\"XRT.jl\", \"xrt_kernel.jl\", \"xrt_bo.jl\", \"custom_xclbin.jl\", \"hl_execution.jl\"]","category":"page"},{"location":"refs/api/#XRT.XRTWrap.Kernel-Tuple{XRT.XRTWrap.Device, XRT.XRTWrap.UUID, String}","page":"API","title":"XRT.XRTWrap.Kernel","text":"Kernel(device, uuid, name)\n\n\nCreate a new kernel instance using a device, bitstream uuid, and kernel name.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.Run-Tuple{XRT.XRTWrap.Kernel, Any, Vararg{Any}}","page":"API","title":"XRT.XRTWrap.Run","text":"Run(kernel, arg1, args; autostart)\n\n\nExecute a kernel with the given arguments. To automatically start the execution, set autostart to true. Otherwise, the execution has to be explicitly started by calling start(run::Run)\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#Base.wait-Tuple{XRT.XRTWrap.Run}","page":"API","title":"Base.wait","text":"wait(run)\n\n\nWait for a given Run object to complete execution. 
The method will return as soon as the execution is completed.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, Any}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, XRT.BOArray}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index to a given BOArray. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, XRT.XRTWrap.BO}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index to a given BO. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.BOArray-Union{Tuple{N}, Tuple{T}, Tuple{XRT.XRTWrap.Device, AbstractArray{T, N}, Any}} where {T, N}","page":"API","title":"XRT.BOArray","text":"BOArray(device, userdata, mem; flags)\n\n\nArray data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.BOArray-Union{Tuple{N}, Tuple{T}, Tuple{XRT.XRTWrap.Device, Any, Any}} where {T, N}","page":"API","title":"XRT.BOArray","text":"Array data type usable with XRT. 
Can be used like BO but supports indexing and automatic alignment of host buffers.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_kernel_info-Tuple{String}","page":"API","title":"XRT.get_kernel_info","text":"get_kernel_info(xclbin_path::String)\n\nGet information about contained kernels, instances, arguments and their register offsets...\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_section_string-Tuple{String, XRT.SectionType}","page":"API","title":"XRT.get_section_string","text":"get_section_string(xclbin_path::String, type::SectionType)\n\nGet the specified raw section from the bitstream file xclbin_path.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_system_info-Tuple{String}","page":"API","title":"XRT.get_system_info","text":"get_system_info(xclbin_path::String)\n\nGet information about resource utilization and connectivity\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.prepare_bitstream-Tuple{String}","page":"API","title":"XRT.prepare_bitstream","text":"prepare_bitstream(path; device)\n\n\nLoad a bitstream to an FPGA and generate interfaces for the included kernels. 
Returns a module with functions representing the kernels of the bitstream.\n\npath: Path to the bitstream file (xclbin) device: XRT device to write the bitstream to\n\n\n\n\n\n","category":"method"},{"location":"high_level/#High-Level-Abstractions-for-Kernel-Executions","page":"High Level Execution","title":"High Level Abstractions for Kernel Executions","text":"","category":"section"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"Based on our Custom XCLBIN Parser, XRT.jl provides a method to generate ready-to-use functions for the execution of individual kernels in the bitstream: prepare_bitstream(path; device).","category":"page"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"This function will create a new module with a function for each kernel in the provided bitstream and load the bitstream on an FPGA device. Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an AbstractArray, it will be automatically copied to the FPGA memory before execution and back after execution.","category":"page"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"See Example: Auto-generated Kernel Interfaces and Example: STREAM Benchmark for examples, how this approach can be used to execute compute kernels on the FPGA.","category":"page"},{"location":"custom_xclbin/#Custom-XCLBIN-Parser","page":"XCLBIN Parsing","title":"Custom XCLBIN Parser","text":"","category":"section"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"XRT.jl comes with functions to parse the bitstream container format xclbin. The most important functions provided are get_kernel_info(path) and get_system_info(path) which both take as input a path to the bitstream. 
The functions extract the JSON data which is encoded in the bitstream and return it as a LazyJSON data structure.","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"get_kernel_info(path) returns data about the implemented compute kernel, such as input parameters, compute instances, memory addresses, and offest. get_system_info(path) returns information about resource utilization of individual compute units and the available resources on the system.","category":"page"},{"location":"custom_xclbin/#Example","page":"XCLBIN Parsing","title":"Example","text":"","category":"section"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"For our simple dummy kernel that is also used in the examples, we can get the resource utilization of the kernel like this:","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"js = XRT.get_system_info(\"communication_PCIE.xclbin\")\njs[1][\"compute_units\"][1]","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"Results in the following LazyJSON object:","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"LazyJSON.Object{Nothing, String}(...):\n \"id\" => \"0\"\n \"kernel_name\" => \"dummyKernel\"\n \"cu_name\" => \"dummyKernel\"\n \"base_address\" => \"0x800000\"\n \"actual_resources\" => Any[Object{Nothing, String}(\"design_state\"=>\"routed\", \"LUT\"=>\"1328\", \"REG\"=>\"1439\", \"BRAM\"=>\"0\", \"DSP\"=>\"0\", \"URAM\"=>\"0\"), Object{Nothing, String}(\"design_state\"=>\"synthesized\", \"LUT\"=>\"1497\", \"REG\"=>\"1586\", \"BRAM\"=>\"0\", \"DSP\"=>\"0\", \"URAM\"=>\"0…\n \"clock_name\" => \"\"\n \"clock_id\" => 0\n \"clocks\" => Any[Object{Nothing, String}(\"port_name\"=>\"ap_clk\", \"id\"=>\"0\", \"requested_frequency\"=>0, \"achieved_frequency\"=>0)]\n \"reset_port_names\" => 
Any[\"ap_rst_n\"]\n \"slr_resources\" => Any[Object{Nothing, String}(\"slr_name\"=>\"SLR0\", \"resource_utilization\"=>Any[Object{Nothing, String}(\"resource_name\"=>\"LUT\", \"used\"=>\"1328\", \"available\"=>\"439680\"), Object{Nothing, String}(\"resource_name\"=>\"LUTAsLogic\", \"used\"=>\"975\", \"availabl…\n","category":"page"},{"location":"#XRT.jl","page":"Installation","title":"XRT.jl","text":"","category":"section"},{"location":"#Installation","page":"Installation","title":"Installation","text":"","category":"section"},{"location":"","page":"Installation","title":"Installation","text":"Note: Only Linux and Windows x86_64 systems are supported!","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"The package is not registered. You can use","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"] add https://github.com/pc2/XRT.jl","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"to add the package to your Julia environment.","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"The following dependencies have to be installed to use XRT.jl:","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"Xilinx Vitis for features like software or hardware emulation","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"XRT is contained in the xrt_jll package in version 2.17. If a native installation of XRT should be used, set the XILINX_XRT environment variable to the path of the local installation. 
XRT with the native C++ interface +2.14 are supported.","category":"page"},{"location":"#Known-Issues","page":"Installation","title":"Known Issues","text":"","category":"section"},{"location":"","page":"Installation","title":"Installation","text":"The build in XRT implementation is unable to find a device even when Vitis HLS is installed and the XCL_EMULATION_MODE variable is set.","category":"page"}] +[{"location":"examples/high_level_basics/#Example:-Auto-generated-Kernel-Interfaces","page":"Auto-generate Kernel Interface","title":"Example: Auto-generated Kernel Interfaces","text":"","category":"section"},{"location":"examples/high_level_basics/","page":"Auto-generate Kernel Interface","title":"Auto-generate Kernel Interface","text":"This example executes a kernel on the FPGA that takes one buffer as output and two scalar values as input. The @prepare_bitstream macro can be used to generate Julia functions for all kernels implemented in the bitstream by parsing its meta data. Buffer synchronization is handled automatically by XRT.jl. For more information refer to High Level Abstractions for Kernel Executions.","category":"page"},{"location":"examples/high_level_basics/","page":"Auto-generate Kernel Interface","title":"Auto-generate Kernel Interface","text":"An example code for the execution of a kernel dummyKernel like this:","category":"page"},{"location":"examples/high_level_basics/","page":"Auto-generate Kernel Interface","title":"Auto-generate Kernel Interface","text":"void dummyKernel(char* a, char validate, int count) {\n for (int i=0; i 0, also the second input array in2 will be processed and added to the result of scaling array in1. The final result will be stored in the array out.","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"In our example code, we will execute the triad kernel of the STREAM benchmark and measure the total execution time of the kernel. 
Since we are using the high-level wrapper to create our kernel method, all buffers will be read and written to and from the FPGA before the actual kernel execution. ","category":"page"},{"location":"examples/stream/#Building","page":"STREAM TRIAD Example","title":"Building","text":"","category":"section"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"To execute this example, Vitis HLS needs to be installed on the system. The FPGA bitstream first has to be build using the provided Makefile. A emulation target can be specified using the TARGET parameter to build for software emulation (sw_emu), hardware emulation (hw_emu), or hardware (hw). To build the bitstream for software emulation for the Alveo U280, run the following command:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"make all TARGET=sw_emu","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"After successful build, the julia code can be executed:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"env XCL_EMULATION_MODE=sw_emu julia --project stream_fpga.jl","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"The output should look similar to this:","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"Kernel Name: k1, CU Number: 0, Thread creation status: success\nKernel Name: k2, CU Number: 1, Thread creation status: success\n[ Info: Execute kernel test run\nKernel Name: k1, CU Number: 0, State: Start\nKernel Name: k1, CU Number: 0, State: Running\nKernel Name: k1, CU Number: 0, State: Idle\n[ Info: Execute full kernel run TRIAD\nKernel Name: k1, CU Number: 0, State: Start\nKernel Name: k1, CU Number: 0, State: 
Running\nKernel Name: k1, CU Number: 0, State: Idle\n[ Info: Execution time: 0.148555856 seconds\n[ Info: Measured bandwidth: 0.5082093296948187 GB/s\n[ Info: Validate output\n[ Info: Done\ndevice process sw_emu_device done\nKernel Name: k1, CU Number: 0, Status: Shutdown\nKernel Name: k2, CU Number: 1, Status: Shutdown","category":"page"},{"location":"examples/stream/","page":"STREAM TRIAD Example","title":"STREAM TRIAD Example","text":"Note, that the measured bandwidth is relatively low because software emulation is used. To execute the stream benchmark on hardware, the path to the bitstream has to be changed accordingly in the @prepare_bitstream line.","category":"page"},{"location":"refs/api/#API","page":"API","title":"API","text":"","category":"section"},{"location":"refs/api/#Index","page":"API","title":"Index","text":"","category":"section"},{"location":"refs/api/","page":"API","title":"API","text":"Pages = [\"api.md\"]\nOrder = [:function, :type]","category":"page"},{"location":"refs/api/#References","page":"API","title":"References","text":"","category":"section"},{"location":"refs/api/","page":"API","title":"API","text":"Modules = [XRT]\nPages = [\"XRT.jl\", \"xrt_kernel.jl\", \"xrt_bo.jl\", \"custom_xclbin.jl\", \"hl_execution.jl\"]","category":"page"},{"location":"refs/api/#XRT.XRTWrap.Kernel-Tuple{XRT.XRTWrap.Device, XRT.XRTWrap.UUID, String}","page":"API","title":"XRT.XRTWrap.Kernel","text":"Kernel(device, uuid, name)\n\n\nCreate a new kernel instance using a device, bitstream uuid, and kernel name.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.Run-Tuple{XRT.XRTWrap.Kernel, Any, Vararg{Any}}","page":"API","title":"XRT.XRTWrap.Run","text":"Run(kernel, arg1, args; autostart)\n\n\nExecute a kernel with the given arguments. To automatically start the execution, set autostart to true. 
Otherwise, the execution has to be explicitly started by calling start(run::Run)\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#Base.wait-Tuple{XRT.XRTWrap.Run}","page":"API","title":"Base.wait","text":"wait(run)\n\n\nWait for a given Run object to complete execution. The method will return as soon as the execution is completed.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, Any}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, XRT.BOArray}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index to a given BOArray. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.XRTWrap.set_arg!-Tuple{XRT.XRTWrap.Run, Any, XRT.XRTWrap.BO}","page":"API","title":"XRT.XRTWrap.set_arg!","text":"set_arg!(run, index, val)\n\n\nSet the argument for a kernel at the given index to a given BO. Note, that this is a thin wrapper to the C++ API, so the indices start at 0!\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.BOArray-Union{Tuple{N}, Tuple{T}, Tuple{XRT.XRTWrap.Device, AbstractArray{T, N}, Any}} where {T, N}","page":"API","title":"XRT.BOArray","text":"BOArray(device, userdata, mem; flags)\n\n\nArray data type usable with XRT. Can be used like BO but supports indexing and automatic alignment of host buffers.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.BOArray-Union{Tuple{N}, Tuple{T}, Tuple{XRT.XRTWrap.Device, Any, Any}} where {T, N}","page":"API","title":"XRT.BOArray","text":"Array data type usable with XRT. 
Can be used like BO but supports indexing and automatic alignment of host buffers.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_kernel_info-Tuple{String}","page":"API","title":"XRT.get_kernel_info","text":"get_kernel_info(xclbin_path::String)\n\nGet information about contained kernels, instances, arguments and their register offsets...\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_section_string-Tuple{String, XRT.SectionType}","page":"API","title":"XRT.get_section_string","text":"get_section_string(xclbin_path::String, type::SectionType)\n\nGet the specified raw section from the bitstream file xclbin_path.\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.get_system_info-Tuple{String}","page":"API","title":"XRT.get_system_info","text":"get_system_info(xclbin_path::String)\n\nGet information about resource utilization and connectivity\n\n\n\n\n\n","category":"method"},{"location":"refs/api/#XRT.@prepare_bitstream-Tuple{String}","page":"API","title":"XRT.@prepare_bitstream","text":"Parse a bitstream and generate functions for the included kernels. The functions will automatically copy all relevant buffers to the FPGA memory, and execute the Kernel.\n\nIt is recommended to generate the kernel functions in a separate module like this:\n\nJulia module DummyBitstream using XRT @prepare_bitstream(\"my_bitstream.xclbin\") end\n\nAfterwards, you find the functions for each kernel in the module. 
To execute the kernel on a specific device, use the device keyword parameter:\n\nJulia DummyBitstream.kernel_name!(args...; device=XRT.Device(0))\n\n\n\n\n\n","category":"macro"},{"location":"high_level/#High-Level-Abstractions-for-Kernel-Executions","page":"High Level Execution","title":"High Level Abstractions for Kernel Executions","text":"","category":"section"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"Based on our Custom XCLBIN Parser, XRT.jl provides a macro to generate ready-to-use functions for the execution of individual kernels in the bitstream: @prepare_bitstream.","category":"page"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"This function will create a new module with a function for each kernel in the provided bitstream. Kernels can then be executed by calling the function with the required input parameters. If the input parameter is an AbstractArray, it will be automatically copied to the FPGA memory before execution and back after execution. All generated functions come with a keyword parameter device which can be used to specify the device the kernel should be executed on. If the bitstream is not already programmed on the device, this will be done automatically before executing the kernel.","category":"page"},{"location":"high_level/","page":"High Level Execution","title":"High Level Execution","text":"See Example: Auto-generated Kernel Interfaces and Example: STREAM Benchmark for examples of how this approach can be used to execute compute kernels on the FPGA.","category":"page"},{"location":"custom_xclbin/#Custom-XCLBIN-Parser","page":"XCLBIN Parsing","title":"Custom XCLBIN Parser","text":"","category":"section"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"XRT.jl comes with functions to parse the bitstream container format xclbin. 
The most important functions provided are get_kernel_info(path) and get_system_info(path) which both take as input a path to the bitstream. The functions extract the JSON data which is encoded in the bitstream and return it as a LazyJSON data structure.","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"get_kernel_info(path) returns data about the implemented compute kernel, such as input parameters, compute instances, memory addresses, and offsets. get_system_info(path) returns information about resource utilization of individual compute units and the available resources on the system.","category":"page"},{"location":"custom_xclbin/#Example","page":"XCLBIN Parsing","title":"Example","text":"","category":"section"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"For our simple dummy kernel that is also used in the examples, we can get the resource utilization of the kernel like this:","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"js = XRT.get_system_info(\"communication_PCIE.xclbin\")\njs[1][\"compute_units\"][1]","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"Results in the following LazyJSON object:","category":"page"},{"location":"custom_xclbin/","page":"XCLBIN Parsing","title":"XCLBIN Parsing","text":"LazyJSON.Object{Nothing, String}(...):\n \"id\" => \"0\"\n \"kernel_name\" => \"dummyKernel\"\n \"cu_name\" => \"dummyKernel\"\n \"base_address\" => \"0x800000\"\n \"actual_resources\" => Any[Object{Nothing, String}(\"design_state\"=>\"routed\", \"LUT\"=>\"1328\", \"REG\"=>\"1439\", \"BRAM\"=>\"0\", \"DSP\"=>\"0\", \"URAM\"=>\"0\"), Object{Nothing, String}(\"design_state\"=>\"synthesized\", \"LUT\"=>\"1497\", \"REG\"=>\"1586\", \"BRAM\"=>\"0\", \"DSP\"=>\"0\", \"URAM\"=>\"0…\n \"clock_name\" => \"\"\n \"clock_id\" => 0\n \"clocks\" => Any[Object{Nothing, 
String}(\"port_name\"=>\"ap_clk\", \"id\"=>\"0\", \"requested_frequency\"=>0, \"achieved_frequency\"=>0)]\n \"reset_port_names\" => Any[\"ap_rst_n\"]\n \"slr_resources\" => Any[Object{Nothing, String}(\"slr_name\"=>\"SLR0\", \"resource_utilization\"=>Any[Object{Nothing, String}(\"resource_name\"=>\"LUT\", \"used\"=>\"1328\", \"available\"=>\"439680\"), Object{Nothing, String}(\"resource_name\"=>\"LUTAsLogic\", \"used\"=>\"975\", \"availabl…\n","category":"page"},{"location":"#XRT.jl","page":"Installation","title":"XRT.jl","text":"","category":"section"},{"location":"#Installation","page":"Installation","title":"Installation","text":"","category":"section"},{"location":"","page":"Installation","title":"Installation","text":"Note: Only Linux and Windows x86_64 systems are supported!","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"The package is not registered. You can use","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"] add https://github.com/pc2/XRT.jl","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"to add the package to your Julia environment.","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"The following dependencies have to be installed to use XRT.jl:","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"Xilinx Vitis for features like software or hardware emulation","category":"page"},{"location":"","page":"Installation","title":"Installation","text":"XRT is contained in the xrt_jll package in version 2.17. If a native installation of XRT should be used, set the XILINX_XRT environment variable to the path of the local installation. 
XRT versions 2.14 and newer with the native C++ interface are supported.","category":"page"},{"location":"#Known-Issues","page":"Installation","title":"Known Issues","text":"","category":"section"},{"location":"","page":"Installation","title":"Installation","text":"The built-in XRT implementation is unable to find a device even when Vitis HLS is installed and the XCL_EMULATION_MODE variable is set.","category":"page"}] }