@@ -46,65 +46,159 @@ void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, F
46
46
i_out += a * d + b * c;
47
47
}
48
48
49
+ /* *
50
+ Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
51
+
52
+ Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
53
+
54
+ If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
55
+ t)` coordinates to block indices. Also it will check that each block is accessed the proper number
56
+ of times, that is `soalen` for spinors and `veclen` for clover blocks.
57
+
58
+ \param[out] out Output spinor
59
+ \param[in] in Input spinor
60
+ \param[in] clover Clover block
61
+ \param[in] geom Geometry object holding the dimension of clover and spinor
62
+ */
49
63
template <typename FT, int veclen, int soalen, bool compress12>
50
64
void clover_product (
51
65
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
52
66
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
53
- typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock *local_clover ,
67
+ typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock *clover ,
54
68
::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
55
69
::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
56
70
71
+ #ifdef PRINT_MAPPING
72
+ std::vector<int > spin_touches (geom.getPxyz () * geom.Nt (), 0 );
73
+ std::vector<int > clover_touches (geom.getPxyz () * geom.Nt () * soalen / veclen, 0 );
74
+ #endif
75
+
76
+ #ifdef PRINT_MAPPING
77
+ std::cout << std::setw (3 ) << " x" << std::setw (3 ) << " y" << std::setw (3 ) << " z" << std::setw (3 )
78
+ << " t"
79
+ << " :" << std::setw (5 ) << " spin" << std::setw (5 ) << " clov"
80
+ << " \n " ;
81
+ #endif
82
+
57
83
// Iterate through all the block.
58
- auto const num_blocks = get_num_blocks (geom);
59
- for (auto block = 0u ; block < num_blocks; ++block) {
60
- // The clover term is block-diagonal in spin. Therefore we need
61
- // to iterate over the two blocks of spin.
62
- for (auto s_block : {0 , 1 }) {
63
- // Extract the diagonal and triangular parts.
64
- auto const &diag_in = s_block == 0 ? local_clover[block].diag1 : local_clover[block].diag2 ;
65
- auto const &off_diag_in = s_block == 0 ? local_clover[block].off_diag1 : local_clover[block].off_diag1 ;
66
- // Input two-spinor component.
67
- for (auto two_s_in : {0 , 1 }) {
68
- // Reconstruct four spinor index.
69
- auto const four_s_in = 2 * s_block + two_s_in;
70
- // Output two-spinor component.
71
- for (auto two_s_out : {0 , 1 }) {
72
- // Reconstruct four spinor index.
73
- auto const four_s_out = 2 * s_block + two_s_out;
74
- // Input color.
75
- for (auto c_in : {0 , 1 , 2 }) {
76
- // Spin-color index (0, ..., 5).
77
- auto const sc_in = 3 * two_s_in + c_in;
78
- // Output color.
79
- for (auto c_out : {0 , 1 , 2 }) {
80
- // Spin-color index (0, ..., 5).
81
- auto const sc_out = 3 * two_s_out + c_out;
82
- // SIMD vector.
83
- for (auto v = 0 ; v < veclen; ++v) {
84
- if (sc_out == sc_in) {
85
- cplx_mul_acc (out[block][c_out][four_s_out][re][v],
86
- out[block][c_out][four_s_out][im][v],
87
- diag_in[sc_in][v],
88
- 0.0 ,
89
- in[block][c_in][four_s_in][re][v],
90
- in[block][c_in][four_s_in][im][v]);
91
- }
92
- else if (sc_out < sc_in) {
93
- auto const idx15 = sc_in * (sc_in - 1 ) / 2 + sc_out;
94
- cplx_mul_acc (out[block][c_out][four_s_out][re][v],
95
- out[block][c_out][four_s_out][im][v],
96
- off_diag_in[idx15][re][v],
97
- off_diag_in[idx15][im][v],
98
- in[block][c_in][four_s_in][re][v],
99
- in[block][c_in][four_s_in][im][v]);
84
+ for (int t = 0 ; t < geom.Nt (); ++t) {
85
+ for (int z = 0 ; z < geom.Nz (); ++z) {
86
+ for (int y = 0 ; y < geom.Ny (); ++y) {
87
+ for (int x = 0 ; x < geom.Nxh (); ++x) {
88
+ // First element in the current XY plane at desired Z and T.
89
+ auto const xyBase = t * geom.getPxyz () + z * geom.getPxy ();
90
+ // Index of the SoA along the X direction.
91
+ auto const xb = x / soalen;
92
+ // Index within the SoA.
93
+ auto const xi = x % soalen;
94
+ // Global spin block index.
95
+ auto const spin_block_idx = xb + geom.Nxh () / soalen * y + xyBase;
96
+ // Global clover/gauge block index.
97
+ auto const clov_block_idx =
98
+ xb + (y / geom.nGY ()) * geom.Nxh () / soalen + xyBase / geom.nGY ();
99
+ // Index of the SoA structure within the current tile.
100
+ // auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
101
+ auto const tile = y % geom.nGY ();
102
+ // Vector index for clover/gauge. The SoA index only runs to
103
+ // `soalen`, this index needs to run to `veclen`, that is across the
104
+ // various SoA within the tile.
105
+ auto const veclen_idx = soalen * tile + xi;
106
+
107
+ #ifdef PRINT_MAPPING
108
+ ++spin_touches[spin_block_idx];
109
+ ++clover_touches[clov_block_idx];
110
+
111
+ std::cout << std::setw (3 ) << x << std::setw (3 ) << y << std::setw (3 ) << z << std::setw (3 )
112
+ << t << " :" << std::setw (5 ) << spin_block_idx << std::setw (5 ) << clov_block_idx
113
+ << " \n " ;
114
+ #endif
115
+
116
+ assert (xi + xb * soalen == x);
117
+
118
+ // References to the objects at desired block.
119
+ auto const &clov_block = clover[clov_block_idx];
120
+ auto const &spinor_in = in[spin_block_idx];
121
+ auto &spinor_out = out[spin_block_idx];
122
+
123
+ // The clover term is block-diagonal in spin. Therefore we need
124
+ // to iterate over the two blocks of spin.
125
+ for (auto s_block : {0 , 1 }) {
126
+ // Extract the diagonal and triangular parts.
127
+ auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2 ;
128
+ auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2 ;
129
+ // Input two-spinor component.
130
+ for (auto two_s_in : {0 , 1 }) {
131
+ // Reconstruct four spinor index.
132
+ auto const four_s_in = 2 * s_block + two_s_in;
133
+ // Output two-spinor component.
134
+ for (auto two_s_out : {0 , 1 }) {
135
+ // Reconstruct four spinor index.
136
+ auto const four_s_out = 2 * s_block + two_s_out;
137
+ // Input color.
138
+ for (auto c_in : {0 , 1 , 2 }) {
139
+ // Spin-color index (0, ..., 5).
140
+ auto const sc_in = 3 * two_s_in + c_in;
141
+ // Output color.
142
+ for (auto c_out : {0 , 1 , 2 }) {
143
+ // Spin-color index (0, ..., 5).
144
+ auto const sc_out = 3 * two_s_out + c_out;
145
+
146
+ // See `qphix-codegen` file `dslash_common.cc` function
147
+ // `clover_term` for the index manipulations done here.
148
+
149
+ // Using separate loops over the actual indices is probably
150
+ // faster than the branching in the innermost loop.
151
+
152
+ if (sc_out == sc_in) {
153
+ cplx_mul_acc (
154
+ spinor_out[c_out][four_s_out][re][xi],
155
+ spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx], FT{0 },
156
+ spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
157
+ } else if (sc_out < sc_in) {
158
+ auto const idx15 = sc_in * (sc_in - 1 ) / 2 + sc_out;
159
+ cplx_mul_acc (
160
+ spinor_out[c_out][four_s_out][re][xi],
161
+ spinor_out[c_out][four_s_out][im][xi], off_diag_in[idx15][re][veclen_idx],
162
+ -off_diag_in[idx15][im][veclen_idx], spinor_in[c_in][four_s_in][re][xi],
163
+ spinor_in[c_in][four_s_in][im][xi]);
164
+ } else {
165
+ auto const idx15 = sc_out * (sc_out - 1 ) / 2 + sc_in;
166
+ cplx_mul_acc (
167
+ spinor_out[c_out][four_s_out][re][xi],
168
+ spinor_out[c_out][four_s_out][im][xi], off_diag_in[idx15][re][veclen_idx],
169
+ off_diag_in[idx15][im][veclen_idx], spinor_in[c_in][four_s_in][re][xi],
170
+ spinor_in[c_in][four_s_in][im][xi]);
171
+ }
100
172
}
173
+ }
101
174
}
102
175
}
103
176
}
104
177
}
105
178
}
106
179
}
107
180
}
181
+
182
+ #ifdef PRINT_MAPPING
183
+ std::cout << std::flush;
184
+
185
+ // Make sure that each block got touched the correct number of times.
186
+ for (int i = 0 ; i != spin_touches.size (); ++i) {
187
+ if (spin_touches[i] != soalen) {
188
+ std::cout << " Spin missmatch: Block " << std::setw (4 ) << i << " accessed " << std::setw (4 )
189
+ << spin_touches[i] << " times instead of " << soalen << " \n " ;
190
+ }
191
+ }
192
+
193
+ for (int i = 0 ; i != clover_touches.size (); ++i) {
194
+ if (clover_touches[i] != veclen) {
195
+ std::cout << " Clover missmatch: Block " << std::setw (4 ) << i << " accessed " << std::setw (4 )
196
+ << clover_touches[i] << " times instead of " << veclen << " \n " ;
197
+ }
198
+ }
199
+
200
+ std::cout << std::flush;
201
+ #endif
108
202
}
109
203
110
204
template <typename FT, int veclen, int soalen, bool compress12>
0 commit comments