Skip to content

Commit 8847d31

Browse files
AWSaallurikristopk
authored andcommitted
Release candidate for v1_4_1 (#417)
* Releae candidate for v1_4_1 Release candidate for RC_v1_4_1. Porting changes to public with new branch. * updates to 3rd party example for v1.4.1 * delta updatres V1.4.1 * Document update
1 parent 2fdf23f commit 8847d31

File tree

113 files changed

+7809
-3891
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+7809
-3891
lines changed

FAQs.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -382,14 +382,14 @@ FPGA Direct is FPGA to FPGA low latency high throughput peer communication throu
382382

383383
**Q: What is FPGA Link and how fast is it?**
384384

385-
FPGA Link is based on 4 x 100Gbps links on each FPGA card. The FPGA Link is organized as a ring, with 2 x 100Gbps links to each adjacent card. This enables each FPGA card to send/receive data from an adjacent card at 200Gbps speeds. Details on the FPGA Link interface will be provided in the Shell Interface specification when available.
385+
FPGA Link is based on 4 x 100Gbps links on each FPGA card. The FPGA Link is organized as a ring, with 2 x 100Gbps links to each adjacent card. This enables each FPGA card to send/receive data from an adjacent card at 200Gbps speeds. This is a unsupported feature planned for future release. Details on the FPGA Link interface will be provided in the Shell Interface specification when available.
386386

387387

388388
**Q: What protocol is used for FPGA link?**
389389

390390
The FPGA link is a generic raw streaming interface, no transport protocol is provided for it by AWS. It is expected that developers would take advantage of standard PCIe protocol, Ethernet protocol, or Xilinx's (reliable) Aurora protocol layer for this interface.
391391

392-
Details on the Shell Interface to the FPGA Link IP blocks are provided in the [Shell Interface specification](./hdk/docs/AWS_Shell_Interface_Specification.md) when available.
392+
This is a unsupported feature planned for future release. Details on the Shell Interface to the FPGA Link IP blocks are provided in the [Shell Interface specification](./hdk/docs/AWS_Shell_Interface_Specification.md) when available.
393393

394394

395395
**Q: What clock speed does the FPGA utilize?**

Jenkinsfile

+28-7
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def dcp_recipe_scenarios = [
6060
'A1-B2-C0-TIMING',
6161
'A1-B2-C0-CONGESTION',
6262
]
63-
def fdf_test_names = ['cl_dram_dma[A0-B0-C0-DEFAULT]', 'cl_hello_world[A0-B0-C0-DEFAULT]', 'cl_hello_world_vhdl',
63+
def fdf_test_names = ['cl_dram_dma[A1-B0-C0-DEFAULT]', 'cl_hello_world[A0-B0-C0-DEFAULT]', 'cl_hello_world_vhdl',
6464
'cl_uram_example[2]', 'cl_uram_example[3]', 'cl_uram_example[4]']
6565

6666
boolean debug_dcp_gen = params.get('debug_dcp_gen')
@@ -853,12 +853,6 @@ if (test_helloworld_sdaccel_example_fdf || test_all_sdaccel_examples_fdf) {
853853
}
854854
}
855855

856-
boolean test_sw_emu_supported = true
857-
858-
if(test_key =~ '_Debug') {
859-
test_sw_emu_supported = false
860-
}
861-
862856
// dsa = [ 4DDR: 4ddr ]
863857
for ( def dsa in entrySet(dsa_map_for_test) ) {
864858

@@ -880,6 +874,33 @@ if (test_helloworld_sdaccel_example_fdf || test_all_sdaccel_examples_fdf) {
880874
String create_afi_report_file = "sdaccel_create_afi_${e.key}_${dsa.value}_${xilinx_version}.xml"
881875
String run_example_report_file = "sdaccel_run_${e.key}_${dsa.value}_${xilinx_version}.xml"
882876

877+
String description_file = "${example_path}/description.json"
878+
def description_json = ["targets":["hw","hw_emu","sw_emu"]]
879+
880+
try {
881+
description_json = readJSON file: description_file
882+
}
883+
catch (exc) {
884+
echo "Could not read the file: ${description_file}"
885+
throw exc
886+
}
887+
888+
boolean test_sw_emu_supported = true
889+
890+
if(description_json["targets"]) {
891+
if(description_json["targets"].contains("sw_emu")) {
892+
test_sw_emu_supported = true
893+
echo "Description file ${description_file} has target sw_emu"
894+
}
895+
else {
896+
test_sw_emu_supported = false
897+
echo "Description file ${description_file} does not have target sw_emu"
898+
}
899+
}
900+
else {
901+
echo "Description json did not have a 'target' key"
902+
}
903+
883904
sdaccel_build_stages[build_name] = {
884905
if(test_sw_emu_supported) {
885906
stage(sw_emu_stage_name) {

RELEASE_NOTES.md

+14
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,20 @@
2626
* 1 DDR controller implemented in the SH (always available)
2727
* 3 DDR controllers implemented in the CL (configurable number of implemented controllers allowed)
2828

29+
## Release 1.4.1 (See [ERRATA](./ERRATA.md) for unsupported features)
30+
* Simulation performance Improvements
31+
* DDR Behavioural Model- Hardware simulations use an AXI memory model to run 4X faster by skipping DDR initialization. Please refer to this [README](./hdk/cl/examples/cl_dram_dma/verif/README.md) on how to use this feature in your simulation.
32+
* DDR Backdoor Loading- Hardware simulation time is reduced by pre-loading data directly into memory models. Please refer to this [README](./hdk/cl/examples/cl_dram_dma/verif/README.md#ddr-backdoor-loading) for example tests that demonstrate this feature.
33+
* Fixed Issues
34+
* XOCL Driver update to address synchronization issues.
35+
* Fixed XOCL driver issues when using ubuntu distribution for Linux OS.
36+
* Improved Performance for [cl_dram_dma Public AFI](./hdk/cl/examples/cl_dram_dma/README.md#metadata).
37+
* SDAccel 3rd party examples updated to use Shell V1.4 DSA.
38+
* Fixed AFI Manifest generation in IPI flow.
39+
* HLX button fixed in IPI
40+
* [FPGA Library update](./sdk/userspace/README.md)
41+
42+
2943
## Release 1.4.0 (See [ERRATA](./ERRATA.md) for unsupported features)
3044
* [New Shell Stable: v04261818](./hdk/common/shell_stable). Starting with release v1.4.0, the AWS FPGA shell stable has been updated and only supports Xilinx 2017.4 SDx/Vivado. All previous versions of tools and shells are not supported with this developer kit shell release.
3145
* [Shell Release Notes](./hdk/docs/AWS_Shell_RELEASE_NOTES.md)

SDAccel/Makefile

+7
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ else
3434
$(info XILINX_SDX = $(XILINX_SDX))
3535
endif
3636

37+
OS=$(shell lsb_release -si)
38+
$(info OS is $(OS))
3739

3840
MODULE :=
3941
ifeq ($(RELEASE_VER),2017.4)
@@ -42,7 +44,12 @@ ifeq ($(RELEASE_VER),2017.4)
4244
XRT_HAL_LIB = libxrt-aws.so
4345
EXE = awssak2
4446
MODULE = xocl
47+
ifeq ($(OS),Ubuntu)
48+
GLIBCPP_PATH = lib/lnx64.o/Ubuntu
49+
else
4550
GLIBCPP_PATH = lib/lnx64.o/Default
51+
endif # OS check
52+
4653
else
4754
$(error Environment variable RELEASE_VER not recognized: $(RELEASE_VER))
4855
endif

SDAccel/examples/3rd_party/README.md

-1
This file was deleted.

SDAccel/examples/3rd_party/common/inc/AOCLUtils/opencl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ extern void cleanup();
3636

3737
namespace aocl_utils {
3838

39-
static const char *const VERSION_STR = "xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0";
39+
static const char *const VERSION_STR = "xilinx_aws-vu9p-f1-04261818_dynamic_5_0";
4040

4141
// Host allocation functions
4242
void *alignedMalloc(size_t size);

SDAccel/examples/3rd_party/fft1d/Makefile

+5-9
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,21 @@
1515
COMMON_REPO := $(SDACCEL_DIR)/examples/xilinx
1616

1717
include $(COMMON_REPO)/utility/boards.mk
18-
include $(COMMON_REPO)/libs/xcl/xcl.mk
18+
include $(COMMON_REPO)/libs/xcl2/xcl2.mk
1919
include $(COMMON_REPO)/libs/opencl/opencl.mk
2020
include $(COMMON_REPO)/libs/oclHelper/oclHelper.mk
2121

2222
# Host Application
23-
main_SRCS=$(wildcard host/src/*.cpp ../common/src/AOCLUtils/*.cpp) $(xcl_SRCS) $(oclHelper_SRCS)
24-
main_HDRS=$(xcl_HDRS)
25-
main_CXXFLAGS=$(xcl_CXXFLAGS) $(opencl_CXXFLAGS) -Ihost/inc -I../common/inc/ $(oclHelper_CXXFLAGS)
23+
main_SRCS=$(wildcard host/src/*.cpp ../common/src/AOCLUtils/*.cpp) $(xcl2_SRCS) $(oclHelper_SRCS)
24+
main_HDRS=$(xcl2_HDRS)
25+
main_CXXFLAGS=$(xcl2_CXXFLAGS) $(opencl_CXXFLAGS) -Ihost/inc -I../common/inc/ $(oclHelper_CXXFLAGS)
2626
main_LDFLAGS=$(opencl_LDFLAGS) -lrt
2727

2828
EXES=main
2929

3030
# Kernel
3131
fft1d_SRCS=./device/fft1d.cl
32-
fft1d_CLFLAGS= -optimizequick
32+
#fft1d_CLFLAGS= -optimizequick
3333
#Specifying Fifo depth for Dataflow
3434
##fft1d_CLFLAGS+=--xp "param:compiler.xclDataflowFifoDepth=32"
3535

@@ -46,11 +46,7 @@ check_XCLBINS=fft1d
4646

4747
CHECKS=check
4848

49-
ifeq ($(DEBUG),1)
5049
CXXFLAGS += -g
51-
else
52-
CXXFLAGS += -O2
53-
endif
5450

5551
#CXX := g++
5652

Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
1-
--- third_party/fft1d/device/fft1d.cl 2017-05-09 22:47:43.000000000 +0000
2-
+++ sdaccel/fft1d/device/fft1d.cl 2017-09-12 19:21:02.120000000 +0000
3-
@@ -49 +49 @@
1+
--- device/fft1d.cl 2018-02-12 17:54:56.000000000 +0000
2+
+++ device/fft1d.cl 2018-07-23 20:57:12.414000000 +0000
3+
@@ -46,7 +46,7 @@
4+
// Include source code for an engine that produces 8 points each step
5+
#include "fft_8.cl"
6+
47
-#pragma OPENCL EXTENSION cl_intel_channels : enable
58
+//#pragma OPENCL EXTENSION cl_intel_channels : enable
6-
@@ -64 +64,9 @@
9+
10+
#include "../host/inc/fft_config.h"
11+
12+
@@ -66,11 +66,19 @@
13+
#define CONT_FACTOR (1 << LOG_CONT_FACTOR)
14+
15+
// Need some depth to our channels to accomodate their bursty filling.
716
-channel float2 chanin[8] __attribute__((depth(CONT_FACTOR*8)));
817
+
918
+pipe float2 chanin0 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
@@ -14,52 +23,88 @@
1423
+pipe float2 chanin5 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
1524
+pipe float2 chanin6 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
1625
+pipe float2 chanin7 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
17-
@@ -68 +76 @@
18-
- #pragma unroll
19-
+ __attribute__((opencl_unroll_hint()))
20-
@@ -137,2 +145,2 @@
26+
27+
uint bit_reversed(uint x, uint bits) {
28+
uint y = 0;
29+
- #pragma unroll
30+
+ __attribute__((opencl_unroll_hint()))
31+
for (uint i = 0; i < bits; i++) {
32+
y <<= 1;
33+
y |= x & 1;
34+
@@ -139,15 +147,15 @@
35+
}
36+
37+
// group dimension (N/(8*CONT_FACTOR), num_iterations)
2138
-__attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
2239
-kernel void fetch (global float2 * restrict src) {
2340
+kernel __attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
2441
+void fetch (global float2 * restrict src) {
25-
@@ -145 +153 @@
42+
43+
const int N = (1 << LOGN);
44+
// Each thread will fetch POINTS points. Need POINTS times to pass to FFT.
45+
const int BUF_SIZE = 1 << (LOG_CONT_FACTOR + LOGPOINTS + LOGPOINTS);
46+
47+
// Local memory for CONT_FACTOR * POINTS points
2648
- local float2 buf[BUF_SIZE];
27-
+ local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1)));
28-
@@ -156,2 +164,2 @@
49+
+ __local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1)));
50+
51+
uint iteration = get_global_id(1);
52+
uint group_per_iter = get_global_id(0);
53+
@@ -158,17 +166,17 @@
54+
uint lid = get_local_id(0);
55+
uint local_addr = lid << LOGPOINTS;
56+
2957
- #pragma unroll
3058
- for (uint k = 0; k < POINTS; k++) {
3159
+ __attribute__((opencl_unroll_hint()))
3260
+ for (uint k = 0; k < POINTS; k+=2) {
33-
@@ -160 +167,0 @@
34-
-
35-
@@ -163,4 +170,4 @@
61+
buf[local_addr + k] = src[global_addr + k];
62+
}
63+
64+
barrier (CLK_LOCAL_MEM_FENCE);
65+
3666
- #pragma unroll
3767
- for (uint k = 0; k < POINTS; k++) {
3868
- uint buf_addr = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
3969
- write_channel_intel (chanin[k], buf[buf_addr]);
4070
+ uint buf_addr[8];
4171
+ __attribute__((opencl_unroll_hint()))
42-
+ for(uint k=0;k<8;k++) {
72+
+ for (uint k = 0; k < 8; k++) {
4373
+ buf_addr[k] = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
44-
@@ -167,0 +175,12 @@
74+
}
75+
}
76+
77+
@@ -181,9 +189,24 @@
78+
* 'count' represents the number of 4k sets to process
79+
* 'inverse' toggles between the direct and the inverse transform
80+
*/
4581
+ // bit_reversed reverses the bit locations of the value given.
4682
+ // The second parameter is the width of the number (in bits) to reverse.
4783
+ // Only the non-symmetric numbers are changed. E.g. 001,011,100,110 -> 100,110,100,110
48-
+ write_pipe (chanin0, &buf[buf_addr[0]]);
49-
+ write_pipe (chanin1, &buf[buf_addr[1]]);
50-
+ write_pipe (chanin2, &buf[buf_addr[2]]);
51-
+ write_pipe (chanin3, &buf[buf_addr[3]]);
52-
+ write_pipe (chanin4, &buf[buf_addr[4]]);
53-
+ write_pipe (chanin5, &buf[buf_addr[5]]);
54-
+ write_pipe (chanin6, &buf[buf_addr[6]]);
55-
+ write_pipe (chanin7, &buf[buf_addr[7]]);
5684
+
57-
@@ -180,2 +199,2 @@
85+
+
86+
+
87+
+ write_pipe(chanin0, &buf[buf_addr[0]]);
88+
+ write_pipe(chanin1, &buf[buf_addr[1]]);
89+
+ write_pipe(chanin2, &buf[buf_addr[2]]);
90+
+ write_pipe(chanin3, &buf[buf_addr[3]]);
91+
+ write_pipe(chanin4, &buf[buf_addr[4]]);
92+
+ write_pipe(chanin5, &buf[buf_addr[5]]);
93+
+ write_pipe(chanin6, &buf[buf_addr[6]]);
94+
+ write_pipe(chanin7, &buf[buf_addr[7]]);
95+
+
96+
5897
-__attribute((task))
5998
-kernel void fft1d(global float2 * restrict dest,
6099
+kernel __attribute((reqd_work_group_size(1, 1, 1))) //task))
61100
+void fft1d(global float2 * restrict dest,
62-
@@ -218,8 +237,9 @@
101+
int count, int inverse) {
102+
103+
const int N = (1 << LOGN);
104+
@@ -220,14 +243,14 @@
105+
float2x8 data;
106+
// Perform memory transfers only when reading data in range
107+
if (i < count * (N / 8)) {
63108
- data.i0 = read_channel_intel(chanin[0]);
64109
- data.i1 = read_channel_intel(chanin[1]);
65110
- data.i2 = read_channel_intel(chanin[2]);
@@ -68,7 +113,6 @@
68113
- data.i5 = read_channel_intel(chanin[5]);
69114
- data.i6 = read_channel_intel(chanin[6]);
70115
- data.i7 = read_channel_intel(chanin[7]);
71-
+
72116
+ read_pipe(chanin0,&data.i0);
73117
+ read_pipe(chanin1,&data.i1);
74118
+ read_pipe(chanin2,&data.i2);
@@ -77,3 +121,6 @@
77121
+ read_pipe(chanin5,&data.i5);
78122
+ read_pipe(chanin6,&data.i6);
79123
+ read_pipe(chanin7,&data.i7);
124+
} else {
125+
data.i0 = data.i1 = data.i2 = data.i3 =
126+
data.i4 = data.i5 = data.i6 = data.i7 = 0;

0 commit comments

Comments
 (0)