Skip to content

Commit ca7781b

Browse files
committed
Add clover product
1 parent 443d29d commit ca7781b

File tree

2 files changed

+139
-45
lines changed

2 files changed

+139
-45
lines changed

.clang-format

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ AlwaysBreakAfterDefinitionReturnType: None
1818
AlwaysBreakAfterReturnType: None
1919
AlwaysBreakBeforeMultilineStrings: true
2020
AlwaysBreakTemplateDeclarations: true
21-
BinPackArguments: false
22-
BinPackParameters: false
21+
BinPackArguments: true
22+
BinPackParameters: true
2323
BraceWrapping:
2424
AfterClass: false
2525
AfterControlStatement: false

qphix_base_classes.hpp

+137-43
Original file line numberDiff line numberDiff line change
@@ -46,65 +46,159 @@ void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, F
4646
i_out += a * d + b * c;
4747
}
4848

49+
/**
50+
Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
51+
52+
Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
53+
54+
If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
55+
t)` coordinates to block indices. It will also check that each block is accessed the proper number
56+
of times, that is `soalen` for spinors and `veclen` for clover blocks.
57+
58+
\param[out] out Output spinor
59+
\param[in] in Input spinor
60+
\param[in] clover Clover block
61+
\param[in] geom Geometry object holding the dimensions of the clover term and the spinor
62+
*/
4963
template <typename FT, int veclen, int soalen, bool compress12>
5064
void clover_product(
5165
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
5266
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
53-
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock *local_clover,
67+
typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock *clover,
5468
::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
5569
::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
5670

71+
#ifdef PRINT_MAPPING
72+
std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
73+
std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
74+
#endif
75+
76+
#ifdef PRINT_MAPPING
77+
std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
78+
<< "t"
79+
<< ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
80+
<< "\n";
81+
#endif
82+
5783
// Iterate through all the blocks.
58-
auto const num_blocks = get_num_blocks(geom);
59-
for (auto block = 0u; block < num_blocks; ++block) {
60-
// The clover term is block-diagonal in spin. Therefore we need
61-
// to iterate over the two blocks of spin.
62-
for (auto s_block : {0, 1}) {
63-
// Extract the diagonal and triangular parts.
64-
auto const &diag_in = s_block == 0 ? local_clover[block].diag1 : local_clover[block].diag2;
65-
auto const &off_diag_in = s_block == 0 ? local_clover[block].off_diag1 : local_clover[block].off_diag1;
66-
// Input two-spinor component.
67-
for (auto two_s_in : {0, 1}) {
68-
// Reconstruct four spinor index.
69-
auto const four_s_in = 2 * s_block + two_s_in;
70-
// Output two-spinor component.
71-
for (auto two_s_out : {0, 1}) {
72-
// Reconstruct four spinor index.
73-
auto const four_s_out = 2 * s_block + two_s_out;
74-
// Input color.
75-
for (auto c_in : {0, 1, 2}) {
76-
// Spin-color index (0, ..., 5).
77-
auto const sc_in = 3 * two_s_in + c_in;
78-
// Output color.
79-
for (auto c_out : {0, 1, 2}) {
80-
// Spin-color index (0, ..., 5).
81-
auto const sc_out = 3 * two_s_out + c_out;
82-
// SIMD vector.
83-
for (auto v = 0; v < veclen; ++v) {
84-
if (sc_out == sc_in) {
85-
cplx_mul_acc(out[block][c_out][four_s_out][re][v],
86-
out[block][c_out][four_s_out][im][v],
87-
diag_in[sc_in][v],
88-
0.0,
89-
in[block][c_in][four_s_in][re][v],
90-
in[block][c_in][four_s_in][im][v]);
91-
}
92-
else if (sc_out < sc_in) {
93-
auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
94-
cplx_mul_acc(out[block][c_out][four_s_out][re][v],
95-
out[block][c_out][four_s_out][im][v],
96-
off_diag_in[idx15][re][v],
97-
off_diag_in[idx15][im][v],
98-
in[block][c_in][four_s_in][re][v],
99-
in[block][c_in][four_s_in][im][v]);
84+
for (int t = 0; t < geom.Nt(); ++t) {
85+
for (int z = 0; z < geom.Nz(); ++z) {
86+
for (int y = 0; y < geom.Ny(); ++y) {
87+
for (int x = 0; x < geom.Nxh(); ++x) {
88+
// First element in the current XY plane at desired Z and T.
89+
auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
90+
// Index of the SoA along the X direction.
91+
auto const xb = x / soalen;
92+
// Index within the SoA.
93+
auto const xi = x % soalen;
94+
// Global spin block index.
95+
auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
96+
// Global clover/gauge block index.
97+
auto const clov_block_idx =
98+
xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
99+
// Index of the SoA structure within the current tile.
100+
// auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
101+
auto const tile = y % geom.nGY();
102+
// Vector index for clover/gauge. The SoA index only runs to
103+
// `soalen`, this index needs to run to `veclen`, that is across the
104+
// various SoA within the tile.
105+
auto const veclen_idx = soalen * tile + xi;
106+
107+
#ifdef PRINT_MAPPING
108+
++spin_touches[spin_block_idx];
109+
++clover_touches[clov_block_idx];
110+
111+
std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
112+
<< t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
113+
<< "\n";
114+
#endif
115+
116+
assert(xi + xb * soalen == x);
117+
118+
// References to the objects at desired block.
119+
auto const &clov_block = clover[clov_block_idx];
120+
auto const &spinor_in = in[spin_block_idx];
121+
auto &spinor_out = out[spin_block_idx];
122+
123+
// The clover term is block-diagonal in spin. Therefore we need
124+
// to iterate over the two blocks of spin.
125+
for (auto s_block : {0, 1}) {
126+
// Extract the diagonal and triangular parts.
127+
auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
128+
auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
129+
// Input two-spinor component.
130+
for (auto two_s_in : {0, 1}) {
131+
// Reconstruct four spinor index.
132+
auto const four_s_in = 2 * s_block + two_s_in;
133+
// Output two-spinor component.
134+
for (auto two_s_out : {0, 1}) {
135+
// Reconstruct four spinor index.
136+
auto const four_s_out = 2 * s_block + two_s_out;
137+
// Input color.
138+
for (auto c_in : {0, 1, 2}) {
139+
// Spin-color index (0, ..., 5).
140+
auto const sc_in = 3 * two_s_in + c_in;
141+
// Output color.
142+
for (auto c_out : {0, 1, 2}) {
143+
// Spin-color index (0, ..., 5).
144+
auto const sc_out = 3 * two_s_out + c_out;
145+
146+
// See `qphix-codegen` file `dslash_common.cc` function
147+
// `clover_term` for the index manipulations done here.
148+
149+
// Using separate loops over the actual indices is probably
150+
// faster than the branching in the innermost loop.
151+
152+
if (sc_out == sc_in) {
153+
cplx_mul_acc(
154+
spinor_out[c_out][four_s_out][re][xi],
155+
spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx], FT{0},
156+
spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
157+
} else if (sc_out < sc_in) {
158+
auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
159+
cplx_mul_acc(
160+
spinor_out[c_out][four_s_out][re][xi],
161+
spinor_out[c_out][four_s_out][im][xi], off_diag_in[idx15][re][veclen_idx],
162+
-off_diag_in[idx15][im][veclen_idx], spinor_in[c_in][four_s_in][re][xi],
163+
spinor_in[c_in][four_s_in][im][xi]);
164+
} else {
165+
auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
166+
cplx_mul_acc(
167+
spinor_out[c_out][four_s_out][re][xi],
168+
spinor_out[c_out][four_s_out][im][xi], off_diag_in[idx15][re][veclen_idx],
169+
off_diag_in[idx15][im][veclen_idx], spinor_in[c_in][four_s_in][re][xi],
170+
spinor_in[c_in][four_s_in][im][xi]);
171+
}
100172
}
173+
}
101174
}
102175
}
103176
}
104177
}
105178
}
106179
}
107180
}
181+
182+
#ifdef PRINT_MAPPING
183+
std::cout << std::flush;
184+
185+
// Make sure that each block got touched the correct number of times.
186+
for (int i = 0; i != spin_touches.size(); ++i) {
187+
if (spin_touches[i] != soalen) {
188+
std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
189+
<< spin_touches[i] << " times instead of " << soalen << "\n";
190+
}
191+
}
192+
193+
for (int i = 0; i != clover_touches.size(); ++i) {
194+
if (clover_touches[i] != veclen) {
195+
std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
196+
<< clover_touches[i] << " times instead of " << veclen << "\n";
197+
}
198+
}
199+
200+
std::cout << std::flush;
201+
#endif
108202
}
109203

110204
template <typename FT, int veclen, int soalen, bool compress12>

0 commit comments

Comments
 (0)