This repository has been archived by the owner on Aug 11, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
/
bsg_exh.py
1450 lines (1290 loc) · 74.9 KB
/
bsg_exh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import math
import numpy as np
from itertools import combinations
from itertools import permutations
import time
import copy
import bsgutils as su
import classes as cls
import sys
import pickle
from collections import Counter
'''
The blocking scheme generator generates the optimal blocking schemes given the memory hierarchy
The sequence of operations that are done are
1. blocking scheme generation,
2. data reuse cleanup
3. loop order combinations
The first function to be called is bsg, followed by data_reuse_cleanup and finally loop_order_combinations
The output of the loop_order_combinations function is a list of temporal loops ready to be used by the cost model
'''
def data_reuse_cleanup(layer_loop_info, list_LPF_schemes, spatial_unrolling, mem_size, precision):
'''
@param layer_loop_info: The size of the layer loops
@type layer_loop_info: dict
@param list_LPF_schemes: A list of blocking schemes
@param spatial_unrolling: The spatial unrolling given by the memory scheme
@return: a list_LPF_schemes without all the schemes that exhibited data reuse equal to 1 in intermediate levels
Function steps
==============
1. define the data reuse of WEIGHTS AND OUTPUTS. INPUTs are not considered as of now, will be implemented later
2. look for intermediate levels (threfore excluding level 0 and last level) that have data reuse == 1. If it is the case, the scheme is discarded
3. if no data reuse == 1 in intermediate levels, append the scheme to the good_list list
'''
good_list = []
spatial_loop = cls.SpatialLoop.extract_loop_info(spatial_unrolling, layer_loop_info)
layer = cls.Layer.extract_layer_info(layer_loop_info)
# i_tot = layer.total_data_size['I']
# w_tot = layer.total_data_size['W']
# o_tot = layer.total_data_size['O']
data_reuse_pruning_levels = {'W':[],'I':[],'O':[]}
for op in ['W','I','O']:
for level in mem_size[op]:
if level <= precision[op] or level >= layer.total_data_size[op] * precision[op]:
data_reuse_pruning_levels[op].append(False)
else:
data_reuse_pruning_levels[op].append(True)
for bs in list_LPF_schemes:
temporal_loop = cls.TemporalLoop.extract_loop_info(layer, bs, spatial_loop)
loop = cls.Loop.extract_loop_info(layer, temporal_loop, spatial_loop, precision, False)
# DATA REUSE CLEANING IS DONE ONLY FOR WEIGHT AND OUTPUT
discard = False
for operand in ['W', 'O']:
for ii_dr, dr in enumerate(loop.data_reuse[operand][1:]):
if data_reuse_pruning_levels[operand][ii_dr]:
if dr == 1 and np.prod([d for d in loop.data_reuse[operand][:ii_dr+1]])/layer.total_data_reuse[operand] != 1:
discard = True
break
if discard:
break
# for level, level_reuse in enumerate(loop.data_reuse[operand][1:-1]):
# if level_reuse == 1:
# discard = True
# break
# if discard:
# break
if not discard:
good_list.append(bs)
# print('\n |-> data reuse cleanup:', len(list_LPF_schemes), ' to ', len(good_list))
return good_list
def even_uneven(LPF_scheme):
'''
@param LPF_scheme: Single LPF scheme
@type LPF_scheme : dict
@return: a list of valid permutations of the LPF scheme
Function steps
==============
Iteratively split the LPF scheme in virtual memory level, starting from the innermost set of LPFs
1. Find innermost set of common LPFs among different operands
2. Generate possible permuations for the found set of LPFs
3. For each permutation append it to the partial scheme. For the first iteration the partial scheme is a single empty one
4. Each partial scheme is appended to a list of partial schemes. The set of LPFs appended is removed from the original LPF scheme
- For the subsequent iterations, each virtual level is appended on top of the previous partial schemes.
- Steps 1 to 4 are repeated until all common sets of LPFs (virtual levels) have been appended.
'''
bs_next = []
finished = False
# tmp_LPF_scheme = copy.deepcopy(LPF_scheme)
tmp_LPF_scheme = {'W': [], 'I': [], 'O': []}
for op in LPF_scheme:
for ii_lev, lev in enumerate(LPF_scheme[op]):
tmp_LPF_scheme[op].append([])
for pf in lev:
tmp_LPF_scheme[op][ii_lev].append(tuple([pf[0], pf[1], ii_lev]))
lo_ok = True
operand_irrelevant = {'W': [3, 4, 7], 'I': [6], 'O': [1, 2, 5]}
length_bs = {
0: len(LPF_scheme['W']),
1: len(LPF_scheme['I']),
2: len(LPF_scheme['O']),
}
bsx = {
'W': [[]],
'I': [[]],
'O': [[]]
}
# bs_next is a list of partial schemes that gets updated after each iteration
# bsx is an empty scheme
bs_next.append(bsx)
even = True
# The while loop is looped through until all virtual levels are assigned and the schemes are complete
while not finished:
# The first step is the identification of the virtual memory level
# Basically corresponds to comparing for each operand what is the number of LPFs in the innermost level of the
# *TMP_LPF_SCHEME* (which is different from the LPF scheme which is initially fed to the function and is a copy of the
# former that gets updated after each iteration by removing the assigned the virtual level) and setting as
# virtual memory level the one which has the lowest number of LPFs
if any(tmp_LPF_scheme[op] == [[]] for op in tmp_LPF_scheme):
min_virtual_roof = 0
else:
min_virtual_roof = min([len(tmp_LPF_scheme[op][0]) for op in tmp_LPF_scheme])
min_virtual_roof_list = []
for operand in tmp_LPF_scheme:
if tmp_LPF_scheme[operand]:
if len(tmp_LPF_scheme[operand][0]) == min_virtual_roof:
min_virtual_roof_list.append([operand, len(tmp_LPF_scheme[operand][0])])
virtual_level = {'W': [], 'I': [], 'O': []}
for operand in virtual_level:
for ii_pf in range(0, min_virtual_roof):
virtual_level[operand].append(tmp_LPF_scheme[operand][0][ii_pf])
pf = list(zip(*[virtual_level['W'], virtual_level['I'], virtual_level['O']]))
# print()
for pf_op in pf:
for ii_pfx, pfx in enumerate(pf_op):
op_in = {0: 'W', 1: 'I', 2: 'O'}
if any([pfx[2] < pfy[2] and pfx[2] < length_bs[ii_pfx] - 1 for pfy in pf_op]):
# print()
# for op in ['W','I','O']:
# print(LPF_scheme[op])
# print(length_bs)
# print(virtual_level)
# print(pf)
even = False
break
if not even:
break
if not even:
break
for operand in tmp_LPF_scheme:
for pf in virtual_level[operand]:
tmp_LPF_scheme[operand][0].remove(pf)
# Merge LPFs in the virtual memory level with the same loop type
# EG (6, 2), (3, 3), (6, 5) -> (6, 10), (3, 3)
# Remove the virtual level from tmp_LPF_scheme
for operand in ['W', 'I', 'O']:
if tmp_LPF_scheme[operand]:
if not tmp_LPF_scheme[operand][0]:
tmp_LPF_scheme[operand].remove(tmp_LPF_scheme[operand][0])
# The assignment process of different virtual levels terminates when tmp_LPF_scheme is an empty scheme,
# having had all its virtual levels removed in previous iterations
if all(not tmp_LPF_scheme[op] for op in tmp_LPF_scheme):
finished = True
# print(virtual_level)
# print(pf)
return even
def st_loop_orders_gen(tmp_virtual_level_aux, O_st, I_st, W_st):
vl_order_list = []
for st_list in O_st + I_st + W_st:
order = []
for l in tmp_virtual_level_aux:
order.append(st_list[l[0]])
vl = [x for _, x in sorted(zip(order, tmp_virtual_level_aux))]
if vl not in vl_order_list:
vl_order_list.append(vl)
return vl_order_list
def loop_order_combinations_stationary_v2(LPF_scheme):
'''
@param LPF_scheme: Single LPF scheme
@type LPF_scheme : dict
@return: a list of valid permutations of the LPF scheme
Function steps
==============
Iteratively split the LPF scheme in virtual memory level, starting from the innermost set of LPFs
1. Find innermost set of common LPFs among different operands
2. Generate possible permuations for the found set of LPFs
3. For each permutation append it to the partial scheme. For the first iteration the partial scheme is a single empty one
4. Each partial scheme is appended to a list of partial schemes. The set of LPFs appended is removed from the original LPF scheme
- For the subsequent iterations, each virtual level is appended on top of the previous partial schemes.
- Steps 1 to 4 are repeated until all common sets of LPFs (virtual levels) have been appended.
'''
bs_next = []
finished = False
tmp_LPF_scheme = copy.deepcopy(LPF_scheme)
lo_ok = True
operand_irrelevant = {'W': [3, 4, 7], 'I': [6], 'O': [1, 2, 5]}
O_st = [
[None, 1, 2, 4, 5, 3, 6, 7],
[None, 2, 1, 4, 5, 3, 6, 7]
]
I_st = [
[None, 2, 3, 4, 5, 6, 1, 7],
[None, 3, 2, 4, 5, 6, 1, 7],
[None, 3, 4, 2, 5, 6, 1, 7],
[None, 3, 4, 5, 2, 6, 1, 7]
]
W_st = [
[None, 4, 5, 1, 2, 6, 7, 3],
[None, 4, 5, 2, 1, 6, 7, 3]
]
length_bs = {
'W': len(LPF_scheme['W']),
'I': len(LPF_scheme['I']),
'O': len(LPF_scheme['O']),
}
bsx = {
'W': [[]],
'I': [[]],
'O': [[]]
}
# bs_next is a list of partial schemes that gets updated after each iteration
# bsx is an empty scheme
bs_next.append(bsx)
# The while loop is looped through until all virtual levels are assigned and the schemes are complete
while not finished:
# The first step is the identification of the virtual memory level
# Basically corresponds to comparing for each operand what is the number of LPFs in the innermost level of the
# *TMP_LPF_SCHEME* (which is different from the LPF scheme which is initially fed to the function and is a copy of the
# former that gets updated after each iteration by removing the assigned the virtual level) and setting as
# virtual memory level the one which has the lowest number of LPFs
if any(tmp_LPF_scheme[op] == [[]] for op in tmp_LPF_scheme):
min_virtual_roof = 0
else:
try:
min_virtual_roof = min(
[len(tmp_LPF_scheme['W'][0]), len(tmp_LPF_scheme['I'][0]), len(tmp_LPF_scheme['O'][0])])
except IndexError:
print('tmp_LPF_scheme', tmp_LPF_scheme)
min_virtual_roof = 0
min_virtual_roof_list = []
for operand in ['W', 'I', 'O']:
if tmp_LPF_scheme[operand]:
if len(tmp_LPF_scheme[operand][0]) == min_virtual_roof:
min_virtual_roof_list.append([operand, len(tmp_LPF_scheme[operand][0])])
virtual_level = copy.deepcopy(tmp_LPF_scheme[min_virtual_roof_list[0][0]][0])
for operand in ['W', 'I', 'O']:
for pf in virtual_level:
try:
tmp_LPF_scheme[operand][0].remove(pf)
except IndexError:
lo_ok = False
return lo_ok, bs_next
except ValueError:
lo_ok = False
return lo_ok, bs_next
# Merge LPFs in the virtual memory level with the same loop type
# EG (6, 2), (3, 3), (6, 5) -> (6, 10), (3, 3)
tmp_virtual_level = []
for pf in virtual_level:
if pf[0] not in [x[0] for x in tmp_virtual_level]:
c = np.prod([x[1] for x in virtual_level if x[0] == pf[0]])
tmp_virtual_level.append(tuple([pf[0], c]))
bs_old = copy.deepcopy(bs_next)
bs_next = []
if tmp_virtual_level == []:
tmp_bs = copy.deepcopy(bs_old[0])
for op in ['W', 'I', 'O']:
if op in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[op]) != length_bs[op]:
tmp_bs[op].append([])
bs_next.append(tmp_bs)
else:
tmp_virtual_level_aux = copy.deepcopy(tmp_virtual_level)
for bs in bs_old:
tmp_virtual_level_fully_pmt = st_loop_orders_gen(tmp_virtual_level_aux, O_st, I_st, W_st)
for vl in tmp_virtual_level_fully_pmt:
tmp_bs = copy.deepcopy(bs)
for op in ['W', 'I', 'O']:
tmp_bs[op][len(bs[op]) - 1] += list(vl)
if op in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[op]) != length_bs[op]:
tmp_bs[op].append([])
if tmp_bs not in bs_next:
bs_next.append(tmp_bs)
# Remove the virtual level from tmp_LPF_scheme
for operand in tmp_LPF_scheme:
if tmp_LPF_scheme[operand]:
if not tmp_LPF_scheme[operand][0]:
tmp_LPF_scheme[operand].remove(tmp_LPF_scheme[operand][0])
# The assignement process of different virtual levels terminates when tmp_LPF_scheme is an empty scheme,
# having had all its virtual levels removed in previous iterations
if tmp_LPF_scheme['W'] or tmp_LPF_scheme['I'] or tmp_LPF_scheme['O']:
finished = False
else:
finished = True
return lo_ok, bs_next
def loop_order_combinations_stationary(LPF_scheme):
'''
@param LPF_scheme: Single LPF scheme
@type LPF_scheme : dict
@return: a list of valid permutations of the LPF scheme
Function steps
==============
Iteratively split the LPF scheme in virtual memory level, starting from the innermost set of LPFs
1. Find innermost set of common LPFs among different operands
2. Generate possible permuations for the found set of LPFs
3. For each permutation append it to the partial scheme. For the first iteration the partial scheme is a single empty one
4. Each partial scheme is appended to a list of partial schemes. The set of LPFs appended is removed from the original LPF scheme
- For the subsequent iterations, each virtual level is appended on top of the previous partial schemes.
- Steps 1 to 4 are repeated until all common sets of LPFs (virtual levels) have been appended.
'''
bs_next = []
finished = False
tmp_LPF_scheme = copy.deepcopy(LPF_scheme)
lo_ok = True
operand_irrelevant = {'W': [3, 4, 7], 'I': [6], 'O': [1, 2, 5]}
length_bs = {
'W': len(LPF_scheme['W']),
'I': len(LPF_scheme['I']),
'O': len(LPF_scheme['O']),
}
bsx = {
'W': [[]],
'I': [[]],
'O': [[]]
}
# bs_next is a list of partial schemes that gets updated after each iteration
# bsx is an empty scheme
bs_next.append(bsx)
# The while loop is looped through until all virtual levels are assigned and the schemes are complete
while not finished:
# The first step is the identification of the virtual memory level
# Basically corresponds to comparing for each operand what is the number of LPFs in the innermost level of the
# *TMP_LPF_SCHEME* (which is different from the LPF scheme which is initially fed to the function and is a copy of the
# former that gets updated after each iteration by removing the assigned the virtual level) and setting as
# virtual memory level the one which has the lowest number of LPFs
if any(tmp_LPF_scheme[op] == [[]] for op in tmp_LPF_scheme):
min_virtual_roof = 0
else:
try:
min_virtual_roof = min(
[len(tmp_LPF_scheme['W'][0]), len(tmp_LPF_scheme['I'][0]), len(tmp_LPF_scheme['O'][0])])
except IndexError:
print('tmp_LPF_scheme', tmp_LPF_scheme)
min_virtual_roof = 0
min_virtual_roof_list = []
for operand in ['W', 'I', 'O']:
if tmp_LPF_scheme[operand]:
if len(tmp_LPF_scheme[operand][0]) == min_virtual_roof:
min_virtual_roof_list.append([operand, len(tmp_LPF_scheme[operand][0])])
virtual_level = copy.deepcopy(tmp_LPF_scheme[min_virtual_roof_list[0][0]][0])
for operand in ['W', 'I', 'O']:
for pf in virtual_level:
try:
tmp_LPF_scheme[operand][0].remove(pf)
except IndexError:
lo_ok = False
return lo_ok, bs_next
except ValueError:
lo_ok = False
return lo_ok, bs_next
# Merge LPFs in the virtual memory level with the same loop type
# EG (6, 2), (3, 3), (6, 5) -> (6, 10), (3, 3)
tmp_virtual_level = []
for pf in virtual_level:
if pf[0] not in [x[0] for x in tmp_virtual_level]:
c = np.prod([x[1] for x in virtual_level if x[0] == pf[0]])
tmp_virtual_level.append(tuple([pf[0], c]))
bs_old = copy.deepcopy(bs_next)
bs_next = []
if tmp_virtual_level == []:
tmp_bs = copy.deepcopy(bs_old[0])
bs_next.append(tmp_bs)
for op in ['W', 'I', 'O']:
if op in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[op]) != length_bs[op]:
tmp_bs[op].append([])
else:
tmp_virtual_level_aux = copy.deepcopy(tmp_virtual_level)
tmp_virtual_level_rel = {
'W': [lpf for lpf in tmp_virtual_level_aux if lpf[0] not in operand_irrelevant['W']],
'I': [lpf for lpf in tmp_virtual_level_aux if lpf[0] not in operand_irrelevant['I']],
'O': [lpf for lpf in tmp_virtual_level_aux if lpf[0] not in operand_irrelevant['O']]}
operand_short_cut = None
for operand in ['W', 'I', 'O']:
if not tmp_virtual_level_rel[operand]:
operand_short_cut = operand
break
for bs in bs_old:
# --- shortcut for fully permutation when all loop sets at current vm level are ir for one operand.
if operand_short_cut:
tmp_virtual_level_irrel = tmp_virtual_level_aux
virtual_level_irrel_perm = list(permutations(tmp_virtual_level_irrel))
for vl in virtual_level_irrel_perm:
tmp_bs = copy.deepcopy(bs)
for op in ['W', 'I', 'O']:
tmp_bs[op][len(bs[op]) - 1] += list(vl)
if op in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[op]) != length_bs[op]:
tmp_bs[op].append([])
if tmp_bs not in bs_next:
bs_next.append(tmp_bs)
# ------------------------------------------------------------------------------------------------
else:
for operand in ['W', 'I', 'O']:
tmp_virtual_level_irrel = [lpf for lpf in tmp_virtual_level_aux if
lpf[0] in operand_irrelevant[operand]]
virtual_level_irrel_perm = list(permutations(tmp_virtual_level_irrel))
for vl in virtual_level_irrel_perm:
tmp_bs = copy.deepcopy(bs)
for op in ['W', 'I', 'O']:
tmp_bs[op][len(bs[op]) - 1] += list(vl) + tmp_virtual_level_rel[operand]
if op in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[op]) != length_bs[op]:
tmp_bs[op].append([])
if tmp_bs not in bs_next:
bs_next.append(tmp_bs)
# Remove the virtual level from tmp_LPF_scheme
for operand in tmp_LPF_scheme:
if tmp_LPF_scheme[operand]:
if not tmp_LPF_scheme[operand][0]:
tmp_LPF_scheme[operand].remove(tmp_LPF_scheme[operand][0])
# The assignement process of different virtual levels terminates when tmp_LPF_scheme is an empty scheme,
# having had all its virtual levels removed in previous iterations
if tmp_LPF_scheme['W'] or tmp_LPF_scheme['I'] or tmp_LPF_scheme['O']:
finished = False
else:
finished = True
return lo_ok, bs_next
def loop_order_combinations_exhaustive(blocking_scheme):
"""
@param LPF_scheme: Single LPF scheme
@type LPF_scheme : dict
@return: a list of valid permutations of the LPF scheme
Function steps
==============
Iteratively split the LPF scheme in virtual memory level, starting from the innermost set of LPFs
1. Find innermost set of common LPFs among different operands
2. Generate possible permuations for the found set of LPFs
3. For each permutation append it to the partial scheme. For the first iteration the partial scheme is a single empty one
4. Each partial scheme is appended to a list of partial schemes. The set of LPFs appended is removed from the original LPF scheme
- For the subsequent iterations, each virtual level is appended on top of the previous partial schemes.
- Steps 1 to 4 are repeated until all common sets of LPFs (virtual levels) have been appended.
"""
bs_next = []
finished = False
tmp_blocking_scheme = copy.deepcopy(blocking_scheme)
lo_ok = True
length_bs = {
'W': len(blocking_scheme['W']),
'I': len(blocking_scheme['I']),
'O': len(blocking_scheme['O']),
}
bsx = {
'W': [[]],
'I': [[]],
'O': [[]]
}
# bs_next is a list of partial schemes that gets updated after each iteration
# bsx is an empty scheme
bs_next.append(bsx)
# The while loop is looped through until all virtual levels are assigned and the schemes are complete
while not finished:
if any(tmp_blocking_scheme[op] == [[]] for op in tmp_blocking_scheme):
min_virtual_roof = 0
else:
try:
min_virtual_roof = min([len(tmp_blocking_scheme[op][0]) for op in tmp_blocking_scheme])
except IndexError:
print(tmp_blocking_scheme)
min_virtual_roof = 0
min_virtual_roof_list = []
# Find lowest memory level to be virtualized
for operand in tmp_blocking_scheme:
if tmp_blocking_scheme[operand]:
if len(tmp_blocking_scheme[operand][0]) == min_virtual_roof:
min_virtual_roof_list.append([operand, len(tmp_blocking_scheme[operand][0])])
virtual_level = copy.deepcopy(tmp_blocking_scheme[min_virtual_roof_list[0][0]][0])
for operand in tmp_blocking_scheme:
for pf in virtual_level:
try:
tmp_blocking_scheme[operand][0].remove(pf)
except IndexError:
# print(blocking_scheme)
lo_ok = False
return lo_ok, bs_next
except ValueError:
# print(blocking_scheme)
lo_ok = False
return lo_ok, bs_next
# Merge LPFs in the virtual memory level with the same loop type
# EG (6, 2), (3, 3), (6, 5) -> (6, 10), (3, 3)
tmp_virtual_level = []
for pf in virtual_level:
if pf[0] not in [x[0] for x in tmp_virtual_level]:
c = np.prod([x[1] for x in virtual_level if x[0] == pf[0]])
tmp_virtual_level.append(tuple([pf[0], c]))
# Generate all possible permutations within the virtual level
virtual_level_comb = list(permutations(tmp_virtual_level))
# Append the permutations found in the virtual level to the partial schemes found previously, contained in bs_next
# Create a new bs_next list that contains the new schemes
bs_old = copy.deepcopy(bs_next)
bs_next = []
for bs in bs_old:
for vl in virtual_level_comb:
tmp_bs = copy.deepcopy(bs)
for operand in ['W', 'I', 'O']:
tmp_bs[operand][len(bs[operand]) - 1] += vl
if operand in [opx[0] for opx in min_virtual_roof_list]:
if len(tmp_bs[operand]) != length_bs[operand]:
tmp_bs[operand].append([])
# if tmp_bs not in bs_next:
# bs_next.append(tmp_bs)
bs_next.append(tmp_bs)
for operand in tmp_blocking_scheme:
if tmp_blocking_scheme[operand]:
if not tmp_blocking_scheme[operand][0]:
tmp_blocking_scheme[operand].remove(tmp_blocking_scheme[operand][0])
# The assignment process of different virtual levels terminates when tmp_LPF_scheme is an empty scheme,
# having had all its virtual levels removed in previous iterations
if all(not tmp_blocking_scheme[op] for op in tmp_blocking_scheme):
finished = True
return lo_ok, bs_next
def bsg(mem_size, mem_share, precision, utilization_rate, layer_loop_info, layer_index, spatial_unrolling, drc_enabled,
stationary_enable, tmg_scheme_hint=[]):
operand_irrelevant = {
'W': [3, 4, 7],
'O': [1, 2, 5],
'I': [6]
}
# loops_pf contains prime factors for each loop type.
# After each loop assignment the relative list is updated
loops_pf = {
7: [],
6: [],
5: [],
4: [],
3: [],
2: [],
1: []
}
# For each operand and memory level defines the effective size of blockings that it can contain
mem_block_size = {
'W': [],
'O': [],
'I': []
}
m_size = max([len(mem_size['I']), len(mem_size['W']), len(mem_size['O'])])
# Auxiliary term that stores the temporary roof values.
# For each operand it is defined to what memory level the roof belongs and how much space is still left to be assigned
# 'operand' : [memory_level, roof_value]
roof = {
'O': [0, 0],
'I': [0, 0],
'W': [0, 0]
}
# Init blocking scheme
bs = {
'W': [[] for i in range(len(mem_size['W']))],
'O': [[] for i in range(len(mem_size['O']))],
'I': [[] for i in range(len(mem_size['I']))],
}
# Append those LPFs that are relative to the spatial unrollings to the memory level where they have to be stored
for op in ['W', 'I', 'O']:
for level in range(0, len(spatial_unrolling[op])):
if spatial_unrolling[op][level]:
for unroll in range(0, len(spatial_unrolling[op][level])):
sp = su.prime_factors(spatial_unrolling[op][level][unroll][1])
bs[op][level] += [tuple([spatial_unrolling[op][level][unroll][0], j]) for j in sp]
# Assign prime factors for each loop type
ll = {1: 'FX', 2: 'FY', 3: 'OX', 4: 'OY', 5: 'C', 6: 'K', 7: 'B'}
for loop_type in loops_pf:
loops_pf[loop_type] = su.prime_factors(layer_loop_info[ll[loop_type]])
loops_pf_irrelevant_unrolled = {'W': [], 'I': [], 'O': []}
# Remove the LPFs relative to the spatial unrollings from loops_pf
for level in range(0, len(spatial_unrolling['W'])):
for unroll in range(0, len(spatial_unrolling['W'][level])):
sp = su.prime_factors(spatial_unrolling['W'][level][unroll][1])
for j in sp:
try:
loops_pf[spatial_unrolling['W'][level][unroll][0]].remove(j)
except:
print('in bsg, spatial_unrolling', spatial_unrolling)
print(j)
# Assign relative memory sizes for each operand at each memory level
for operand in ['W', 'I', 'O']:
for level in range(0, len(mem_size[operand])):
mem_block_size[operand].append(math.floor(mem_size[operand][level] / precision[operand]))
# Initialize roof values
for operand in ['W', 'I', 'O']:
roof[operand][1] = mem_block_size[operand][0]
roof = su.update_roof(bs, spatial_unrolling, [], roof, mem_share, mem_size, precision, operand_irrelevant, loops_pf,
layer_loop_info)
r_op, r_lev = roof.keys(), roof.values()
r_lev, r_size = zip(*r_lev)
min_roof_aux = ['', 0, max(r_size)]
next_min_roof = ['', 0, max(r_size)]
last_roof = ['', 0, max(r_size)]
# The root_node is a SchedulerNode object. It corresponds to the very initial partial scheme, which is empty and with no
# temporal LPF assigned (only the spatial LPF are present in bs). Each SchedulerNode object is defined by
# - Its partial scheme of LPFs (bs in this case)
# - The set of LPFs still to be assigned (loops_pf)
# - The roof variable which identifies at which memory level in the hierarchy for each operand the assignments must be carried out
root_node = su.SchedulerNode(bs, roof, loops_pf)
next_partial_LPF_schemes_list = [root_node]
final_LPF_schemes_list = []
finished = False
old_clean_LPF_schemes_list = []
rep = 0
# finished = True
while not finished:
# finished = True
# Check if all the nodes in next_partial_LPF_schemes_list are leaf nodes
# The cleaner eliminates duplicate partial schemes
clean_LPF_schemes_list = su.cleaner(next_partial_LPF_schemes_list)
# The following if condition checks whether the previous assignment step was unsuccessful.
# If one (originally three) consecutive LPF assignments are unsuccessful, the assignment process is terminated (finished = True),
# and only those schemes that are finished (leaf.over == True) are considered for successive loop ordering step
if clean_LPF_schemes_list:
if all(any(old_node == node for old_node in old_clean_LPF_schemes_list) for node in
clean_LPF_schemes_list):
rep += 1
if rep == 1: # rep == 3
final_LPF_schemes_list = [bn for bn in clean_LPF_schemes_list if bn.leaf_over == True]
# if not final_LPF_schemes_list:
# print('no fitting found!')
finished = True
# print('case2')
continue
else:
rep = 0
old_clean_LPF_schemes_list = copy.deepcopy(clean_LPF_schemes_list)
# The following if condition checks whether all the partial schemes in the cleaned list of schemes are finished.
# If all partial schemes are finished (leaf_over == True) then the while loop is broken (finished = True) and the
# scheme are considered for the reordering step
if all(nodes.leaf_over == True for nodes in clean_LPF_schemes_list) and clean_LPF_schemes_list.__len__() > 0:
final_LPF_schemes_list = [bn for bn in clean_LPF_schemes_list if bn.leaf_over == True]
finished = True
# print('case3', clean_LPF_schemes_list.__len__())
continue
partial_LPF_schemes_list = copy.deepcopy(clean_LPF_schemes_list)
next_partial_LPF_schemes_list = []
# partial_LPF_schemes_list is a copy of the cleaned list of partial schemes obtained from previous assignments.
# Each partial scheme is analyzed *individually*: a set of fitting LPF combinations will be found that can be stacked
# for each partial scheme. Each of these sets will contribute to a new partial scheme, that will be appended to
# next_partial_LPF_schemes_list
for z in range(0, len(partial_LPF_schemes_list)):
# The following if condition checks whether the partial scheme is in fact a complete scheme
# A complete scheme is a scheme where all the LPF have been assigned
# If it is a complete scheme (leaf_over == True), no assignement is needed and the scheme is appended to next_partial_LPF_schemes_list
if partial_LPF_schemes_list[z].leaf_over:
leaf_node = su.SchedulerNode(partial_LPF_schemes_list[z].LPF_scheme, partial_LPF_schemes_list[z].roof,
partial_LPF_schemes_list[z].loops_pf)
leaf_node.set_leaf_over()
next_partial_LPF_schemes_list.append(leaf_node)
continue
roof = copy.deepcopy(partial_LPF_schemes_list[z].roof)
LPF_scheme = copy.deepcopy(partial_LPF_schemes_list[z].LPF_scheme)
loops_pf = copy.deepcopy(partial_LPF_schemes_list[z].loops_pf)
r_op, r_lev = roof.keys(), roof.values()
r_lev, r_size = zip(*r_lev)
min_roof_aux = ['', m_size, max(r_size)]
tmp_min_roof = ['', m_size, max(r_size)]
# Given the partial scheme and its relative roof value, a list of minimum roofs is found.
# A minimum roof correspond to a roof value that has the lowest level in the mem hierarchy and/or
# the smallest amount of blockings space available if equivalent level in the hierarchy
# EG between W : [0, 5] and I : [2, 245] the min roof will be the 'W' one, since it belongs to a lower level in the hierarchy
# EG between W : [0, 5] and I : [0, 2] the min roof will be the 'I' one, since it has less blockings space available
min_roof_list = []
for operand in roof:
if any((roof[roof_op][0] < roof[operand][0]) for roof_op in roof) and \
not all([roof[opx][0] == len(mem_size[opx]) - 1 for opx in ['W', 'I', 'O']]):
if roof[operand][0] == (len(mem_size[operand]) - 1):
continue
if (roof[operand][0] == (len(mem_size[operand]) - 1) and
(roof[operand][1] == 1 or roof[operand][1] == 0)):
continue
else:
if roof[operand][1] <= min_roof_aux[2]:
min_roof_aux[0] = operand
min_roof_aux[1] = roof[operand][0]
min_roof_aux[2] = roof[operand][1]
for operand in roof:
if roof[operand][0] == min_roof_aux[1] and roof[operand][1] == min_roof_aux[2]:
tmp_min_roof = ['', m_size, r_size]
tmp_min_roof[0] = operand
tmp_min_roof[1] = roof[operand][0]
tmp_min_roof[2] = roof[operand][1]
min_roof_list.append(tmp_min_roof)
mr_list_operand = [mr[0] for mr in min_roof_list]
# The assignment process is carried out for *each min roof separately*
# Each min roof will have a different set of
for min_roof in reversed(min_roof_list):
# Check if min roof belongs to a shared level of memory. If so, create a list with all shared memory levels related to min roof
shared_min_roof_levels = [tuple([min_roof[0], min_roof[1]])]
for shared_set in mem_share:
if tuple([min_roof[0], min_roof[1]]) in mem_share[shared_set]:
shared_min_roof_levels = mem_share[shared_set]
fitting_combination = []
min_list_fitting_combinations = []
loop_blocks = []
# List LPFs that can be assigned to min roof operand
for loop_type in loops_pf:
for i in range(0, len(loops_pf[loop_type])):
loop_blocks.append(tuple([loop_type, loops_pf[loop_type][i]]))
roof_list = [[rf, roof[rf][0], roof[rf][1]] for rf in roof]
# Definition of k_range: Each LPF combination corresponds to a combination of LPFs drawn from loop_block
# Normally this combinations can have range (defined by k) from 0 to the amount of LPFs to be assigned.
# If the level where the min roof belongs to is the last level in the hierarchy then the k should be equal only
# to all the LPF still to be assigned, in order to avoid redundant computations.
if all([roof[rf][0] == len(mem_size[rf]) - 1 for rf in roof]):
k_range = range(len(loop_blocks), len(loop_blocks) + 1)
else:
k_range = range(0, len(loop_blocks) + 1)
# For each k in k_range:
# 1. Generate all possible combinations of loop blocks with length k
# 2. For each combination generated check if they fit within the roof (check_comb_fit function)
# 3. If they fit, append the combination to fitting_combination
for k in k_range:
tmp_comb = combinations(loop_blocks, k)
comb = []
for x in tmp_comb:
if sorted(x) not in comb:
comb.append(sorted(x))
mlft = len(fitting_combination)
for j in range(0, len(comb)):
is_fit = True
for r in roof_list:
is_min_roof = False
if r[0] == min_roof[0]: # in mr_list_operand:
is_min_roof = True
is_fit = su.check_comb_fit(LPF_scheme, spatial_unrolling, comb[j], r, mem_size, mem_share,
utilization_rate, precision, operand_irrelevant, is_min_roof,
layer_loop_info)
if not is_fit:
break
if not is_fit:
continue
fitting_combination.append(comb[j])
if len(fitting_combination) > 0 and len(fitting_combination) == mlft:
break
# Clean up all duplicate combinations from fitting_combination (order is not important yet)
# The sanitized list of fitting combinations of LPF within the roof is contained in min_list_fitting_combinations
# ------------------------------------------------------------------------------Here--------------------
# fitting_combination.append([])
for ii_fit_comb in range(0, len(fitting_combination)):
s = sorted(fitting_combination[ii_fit_comb])
if s not in min_list_fitting_combinations:
min_list_fitting_combinations.append(s)
# remove all fitting combs that are subsets
if all([roof[op][0] == (len(mem_block_size[op]) - 1) for op in ['W', 'I', 'O']]):
for lfc in reversed(min_list_fitting_combinations):
if any([all(x in mlfc for x in lfc) for mlfc in min_list_fitting_combinations if mlfc != lfc]):
min_list_fitting_combinations.remove(lfc)
# The following if condition is for the case when, given a specific roof, *no LPF combination* is found to fit
# In this case the only operation to be done is to update the roof value by going up one level in the min roof
# The loops_pf will remain the same (no comb is found and therefore no LPF has to be removed from loops_pf)
# The LPF_scheme will remain the same (no comb is found that can be stacked on top of the previous one)
# The roof value is updated (update level of current min_roof and then call update_roof)
if not min_list_fitting_combinations:
# No need to update loops_pf
new_loops_pf = copy.deepcopy(loops_pf)
# tt = sum([len(x) for x in new_loops_pf.values()])
# Update roof level of current min_roof.
# If min roof is shared, go one memory level up for all roofs that are shared with min_roof
new_tmp_roof = copy.deepcopy(roof)
for p in range(0, len(shared_min_roof_levels)):
level_up = 1
if shared_min_roof_levels[p][1] == len(mem_block_size[shared_min_roof_levels[p][0]]) - 1:
level_up = 0
new_tmp_roof[shared_min_roof_levels[p][0]][0] = shared_min_roof_levels[p][1] + level_up
new_roof = su.update_roof(LPF_scheme, spatial_unrolling, [], new_tmp_roof, mem_share, mem_size,
precision,
operand_irrelevant, new_loops_pf, layer_loop_info)
isgood = su.check_node(LPF_scheme, mem_size, operand_irrelevant, mem_share, precision,
layer_loop_info, utilization_rate)
# No need to update LPF scheme
new_LPF_scheme = copy.deepcopy(LPF_scheme)
# Generate new node in the blocking scheme tree, add it to the list
blocking_node = su.SchedulerNode(new_LPF_scheme, new_roof, new_loops_pf)
if isgood:
next_partial_LPF_schemes_list.append(blocking_node)
# print('\r bs',f'{tt:3d}','/',f'{lpf2a:3d}',' ', f'{(z+1)/len(partial_LPF_schemes_list)*100:3.0f}','%, : f ',finished_lpf_scheme,' r ',len(next_partial_LPF_schemes_list), end='')
# If there list of fitting combinations of LPFs within the roof is NOT empty, proceed in creating new partial schemes
# For each fitting combination a new SchedulerNode object is created that contains:
# - An updated loops_pf in which the LPF in the fitting combination are removed
# - An updated roof value
# - A new partial scheme with the fitting combination stacked on top of the previous partial scheme
else:
for k in range(0, len(min_list_fitting_combinations)):
# Generate different tmp roof and tmp loops pf since for each min roof fitting combination different roofs and loops pf will be defined
new_loops_pf = copy.deepcopy(loops_pf)
# Given remaining loop prime factors, remove those assigned in this combination to min roof
for i in range(0, len(min_list_fitting_combinations[k])):
new_loops_pf[min_list_fitting_combinations[k][i][0]].remove(
min_list_fitting_combinations[k][i][1])
# tt = sum([len(x) for x in new_loops_pf.values()])
# Set temporary roof with selected fitting combination
# This temporary roof is only used for updating the level in the min roof
# It is not the one that is ultimately saved in the SchedulerNode object
tmp_roof = su.update_roof(LPF_scheme, spatial_unrolling, min_list_fitting_combinations[k], roof,
mem_share,
mem_size, precision, operand_irrelevant, new_loops_pf,
layer_loop_info)
new_tmp_roof = copy.deepcopy(tmp_roof)
new_LPF_scheme = copy.deepcopy(LPF_scheme)
level0 = copy.deepcopy(min_list_fitting_combinations[k])
for rf in roof:
new_LPF_scheme[rf][roof[rf][0]] += copy.deepcopy(level0)
for p in range(0, len(shared_min_roof_levels)):
level_up = 1
if shared_min_roof_levels[p][1] == len(
mem_block_size[shared_min_roof_levels[p][0]]) - 1:
level_up = 0
new_tmp_roof[shared_min_roof_levels[p][0]][0] = shared_min_roof_levels[p][1] + level_up
# The new_roof variable contains the final updated value of the new roof, with the updated blocking space available
# for each operand in the roof
new_roof = su.update_roof(new_LPF_scheme, spatial_unrolling, [], new_tmp_roof, mem_share,
mem_size,
precision,
operand_irrelevant, new_loops_pf, layer_loop_info)
isgood = su.check_node(new_LPF_scheme, mem_size, operand_irrelevant, mem_share, precision,
layer_loop_info, utilization_rate)
if isgood:
# Generate new node in the blocking scheme tree, add it to the list
blocking_node = su.SchedulerNode(new_LPF_scheme, new_roof, new_loops_pf)
# The following if condition checks whether all the LPFs have been assigned.
# If so, the partial scheme is considere a complete scheme and the SchedulerNode will have leaf_over == True
over = False
if all(new_loops_pf[loop_types] == [] for loop_types in new_loops_pf):
over = True
blocking_node.set_leaf_over()
next_partial_LPF_schemes_list.append(blocking_node)
else:
# No need to update loops_pf
new_loops_pf = copy.deepcopy(loops_pf)
# tt = sum([len(x) for x in new_loops_pf.values()])
# Update roof level of current min_roof.
# If min roof is shared, go one memory level up for all roofs that are shared with min_roof
new_tmp_roof = copy.deepcopy(roof)
for p in range(0, len(shared_min_roof_levels)):
level_up = 1
if shared_min_roof_levels[p][1] == len(
mem_block_size[shared_min_roof_levels[p][0]]) - 1:
level_up = 0
new_tmp_roof[shared_min_roof_levels[p][0]][0] = shared_min_roof_levels[p][1] + level_up
new_roof = su.update_roof(LPF_scheme, spatial_unrolling, [], new_tmp_roof, mem_share,
mem_size,
precision,
operand_irrelevant, new_loops_pf, layer_loop_info)
isgood = su.check_node(LPF_scheme, mem_size, operand_irrelevant, mem_share, precision,
layer_loop_info, utilization_rate)
if isgood:
# No need to update LPF scheme
new_LPF_scheme = copy.deepcopy(LPF_scheme)
# Generate new node in the blocking scheme tree, add it to the list
blocking_node = su.SchedulerNode(new_LPF_scheme, new_roof, new_loops_pf)
next_partial_LPF_schemes_list.append(blocking_node)
list_LPF_schemes = [scheme_node.LPF_scheme for scheme_node in final_LPF_schemes_list]
# Remove the LPFs which correspond to the spatial unrollings that were previously assigned
# for each LPF scheme in list_LPF_schemes
for bs in list_LPF_schemes:
for op in spatial_unrolling:
for level in range(0, len(spatial_unrolling[op])):
if spatial_unrolling[op][level]:
for unroll in range(0, len(spatial_unrolling[op][level])):
sp = su.prime_factors(spatial_unrolling[op][level][unroll][1])
for i in range(0, len(sp)):
try:
bs[op][level].remove(tuple([spatial_unrolling[op][level][unroll][0], sp[i]]))
except IndexError:
continue
# Data reuse cleanup for the LPF schemes
# If you want this cleanup to be disabled, set drc_enabled = 0 in the input file
if drc_enabled:
list_LPF_schemes = data_reuse_cleanup(layer_loop_info, list_LPF_schemes, spatial_unrolling, mem_size, precision)
# print('\n |-> drc: ', len(list_LPF_schemes),'/',len(list_scheme_nodes))
# print('Loop blocking finished. Loop ordering starts.')
total = []
for ii_bs, bs in enumerate(list_LPF_schemes):
# print('\r |-> ordering: ', ii_bs, '/', len(list_LPF_schemes), end='', flush=True)
if stationary_enable is True:
lo_ok, lo = loop_order_combinations_stationary(bs)
# lo_ok, lo = loop_order_combinations_stationary_v2(bs)
else:
lo_ok, lo = loop_order_combinations_exhaustive(bs)
if lo_ok:
total.extend(lo)
# print(' |-> loop order combinations: ', len(total))