
Commit 8b8b447

committed on Nov 11, 2015
mixed_cg_merge: 32 bit bgq stuff
1 parent: c9aca78 · commit: 8b8b447

3 files changed (+342, -2 lines)
 

bgq.h (+44, -2)
@@ -16,6 +16,11 @@
   r1 = vec_ld(32L, (double*) &(phi).c0); \
   r2 = vec_ld(64L, (double*) &(phi).c0);
 
+#define _vec_load_halfspinor_32(r0, r1, r2, phi) \
+  r0 = vec_ld(0L, (float*) &(phi).c0); \
+  r1 = vec_ld(16L, (float*) &(phi).c0); \
+  r2 = vec_ld(32L, (float*) &(phi).c0);
+
 
 #define _vec_store_spinor(phi, r0, r1, r2, r3, r4, r5) \
   vec_st(r0, 0L, (double*) &(phi).c0); \
@@ -40,6 +45,11 @@
   r0 = vec_ld(0L, (double*) &(phi).c0); \
   r1 = vec_ld2(0L, (double*) &(phi).c2);
 
+#define _vec_load_32(r0, r1, phi) \
+  r0 = vec_ld(0L, (float*) &(phi).c0); \
+  r1 = vec_ld2(0L, (float*) &(phi).c2);
+
+
 // works also with 16 byte alignement of phi
 #define _vec_load16(r0, r1, phi, tmp) \
   r0 = vec_ld2(0L, (double*) &(phi).c0); \
@@ -49,6 +59,15 @@
   tmp = vec_gpci(02301); \
   r1 = vec_perm(r1, r0, tmp);
 
+#define _vec_load16_32(r0, r1, phi, tmp) \
+  r0 = vec_ld2(0L, (float*) &(phi).c0); \
+  r1 = vec_ld(0L, (float*) &(phi).c1); \
+  tmp = vec_gpci(00145); \
+  r0 = vec_perm(r0, r1, tmp); \
+  tmp = vec_gpci(02301); \
+  r1 = vec_perm(r1, r0, tmp);
+
+
 // alternative
 #define _vec_load16c(r0, r1, phi, tmp) \
   r0 = vec_ld2(0L, (double*) &(phi).c0); \
@@ -61,20 +80,43 @@
 #define _vec_store(phi, r0, r1) \
   vec_st((r0), 0L, (double*) &(phi).c0); \
   vec_st2((r1), 0L, (double*) &(phi).c2);
-
+
+
+// requires 16 byte alignment of phi
+#define _vec_store_32(phi, r0, r1) \
+  vec_st((r0), 0L, (float*) &(phi).c0); \
+  vec_st2((r1), 0L, (float*) &(phi).c2);
+
+
 // requires 16 (and must not be 32) byte alignment of phi
 #define _vec_store16(phi, r0, r1, tmp) \
   vec_st2((r0), 0L, (double*) &(phi).c0); \
   tmp = vec_gpci(02345); \
   r0 = vec_perm(r0, r1, tmp); \
   vec_st((r0), 0L, (double *) &(phi).c1);
-
+
+
+// requires 8 (and must not be 16) byte alignment of phi
+#define _vec_store16_32(phi, r0, r1, tmp) \
+  vec_st2((r0), 0L, (float*) &(phi).c0); \
+  tmp = vec_gpci(02345); \
+  r0 = vec_perm(r0, r1, tmp); \
+  vec_st((r0), 0L, (float *) &(phi).c1);
+
+
 // requires 32 byte alignment of phi
 #define _vec_store_halfspinor(phi, r0, r1, r2) \
   vec_st((r0), 0L, (double*) &(phi).c0); \
   vec_st((r1), 32L, (double*) &(phi).c0); \
   vec_st((r2), 64L, (double*) &(phi).c0);
 
+// requires 16 byte alignment of phi
+#define _vec_store_halfspinor_32(phi, r0, r1, r2) \
+  vec_st((r0), 0L, (float*) &(phi).c0); \
+  vec_st((r1), 16L, (float*) &(phi).c0); \
+  vec_st((r2), 32L, (float*) &(phi).c0);
+
+
 #define _vec_add(rs0, rs1, r0, r1, s0, s1) \
   rs0 = vec_add(r0, s0); \
   rs1 = vec_add(r1, s1);
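
For context: one natural use of the new single-precision load/store macros in a mixed-precision CG is converting a colour vector between double and float on the fly, since the QPX registers always hold doubles internally and a store through a float pointer rounds to single on the way out. A minimal sketch, assuming IBM XL C for BG/Q (vector4double and the vec_* built-ins) and illustrative stand-in structs, not the project's actual types:

/* Illustrative stand-in colour vectors (three complex entries c0..c2);
 * the project's real types and their alignment guarantees may differ. */
typedef struct { _Complex double c0, c1, c2; } __attribute__((aligned(32))) su3_vector;
typedef struct { _Complex float  c0, c1, c2; } __attribute__((aligned(16))) su3_vector32;

/* Promote one colour vector from single back to double precision by a
 * round trip through the QPX registers: _vec_load_32 reads floats
 * (converted to double in the register), _vec_store writes doubles.   */
static inline void promote_su3_vector(su3_vector *out, su3_vector32 *in) {
  vector4double r0, r1;
  _vec_load_32(r0, r1, *in);   /* 16-byte aligned source                */
  _vec_store(*out, r0, r1);    /* 32-byte aligned target, to be safe    */
}
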

bgq2.h (+280, -0)
@@ -334,6 +334,139 @@
   r10= vec_xxnpmadd(U7, r5, r10); \
   r11= vec_xxnpmadd(U1, r5, r11);
 
+
+//same as _vec_su3_multiply_double2 but loading a 32bit gauge field
+#define _vec_su3_multiply_double2_32(u) \
+  U0 = vec_ld2(0, (float*) &(u)->c00); \
+  U3 = vec_ld2(0, (float*) &(u)->c01); \
+  U6 = vec_ld2(0, (float*) &(u)->c02); \
+  U1 = vec_ld2(0, (float*) &(u)->c10); \
+  U4 = vec_ld2(0, (float*) &(u)->c11); \
+  U7 = vec_ld2(0, (float*) &(u)->c12); \
+  U2 = vec_ld2(0, (float*) &(u)->c20); \
+  r6 = vec_xmul(r0, U0); \
+  r7 = vec_xmul(r0, U1); \
+  r8 = vec_xmul(r0, U2); \
+  r9 = vec_xmul(r3, U0); \
+  r10= vec_xmul(r3, U1); \
+  r11= vec_xmul(r3, U2); \
+  \
+  r6 = vec_xxnpmadd(U0, r0, r6); \
+  r7 = vec_xxnpmadd(U1, r0, r7); \
+  r8 = vec_xxnpmadd(U2, r0, r8); \
+  r9 = vec_xxnpmadd(U0, r3, r9); \
+  r10= vec_xxnpmadd(U1, r3, r10); \
+  r11= vec_xxnpmadd(U2, r3, r11); \
+  U0 = vec_ld2(0, (float*) &(u)->c21); \
+  \
+  r6 = vec_xmadd(r1, U3, r6); \
+  r7 = vec_xmadd(r1, U4, r7); \
+  r8 = vec_xmadd(r1, U0, r8); \
+  r9 = vec_xmadd(r4, U3, r9); \
+  r10= vec_xmadd(r4, U4, r10); \
+  r11= vec_xmadd(r4, U0, r11); \
+  \
+  r6 = vec_xxnpmadd(U3, r1, r6); \
+  r7 = vec_xxnpmadd(U4, r1, r7); \
+  r8 = vec_xxnpmadd(U0, r1, r8); \
+  r9 = vec_xxnpmadd(U3, r4, r9); \
+  r10= vec_xxnpmadd(U4, r4, r10); \
+  r11= vec_xxnpmadd(U0, r4, r11); \
+  U1 = vec_ld2(0, (float*) &(u)->c22); \
+  \
+  r6 = vec_xmadd(r2, U6, r6); \
+  r7 = vec_xmadd(r2, U7, r7); \
+  r8 = vec_xmadd(r2, U1, r8); \
+  r9 = vec_xmadd(r5, U6, r9); \
+  r10= vec_xmadd(r5, U7, r10); \
+  r11= vec_xmadd(r5, U1, r11); \
+  \
+  r6 = vec_xxnpmadd(U6, r2, r6); \
+  r7 = vec_xxnpmadd(U7, r2, r7); \
+  r8 = vec_xxnpmadd(U1, r2, r8); \
+  r9 = vec_xxnpmadd(U6, r5, r9); \
+  r10= vec_xxnpmadd(U7, r5, r10); \
+  r11= vec_xxnpmadd(U1, r5, r11);
+
+
+
+
+#define _vec_su3_multiply(u) \
+  U0 = vec_ld2(0, (double*) &(u)->c00); \
+  U3 = vec_ld2(0, (double*) &(u)->c01); \
+  U6 = vec_ld2(0, (double*) &(u)->c02); \
+  U1 = vec_ld2(0, (double*) &(u)->c10); \
+  U4 = vec_ld2(0, (double*) &(u)->c11); \
+  U7 = vec_ld2(0, (double*) &(u)->c12); \
+  U2 = vec_ld2(0, (double*) &(u)->c20); \
+  r6 = vec_xmul(r0, U0); \
+  r7 = vec_xmul(r0, U1); \
+  r8 = vec_xmul(r0, U2); \
+  \
+  r6 = vec_xxnpmadd(U0, r0, r6); \
+  r7 = vec_xxnpmadd(U1, r0, r7); \
+  r8 = vec_xxnpmadd(U2, r0, r8); \
+  U0 = vec_ld2(0, (double*) &(u)->c21); \
+  \
+  r6 = vec_xmadd(r1, U3, r6); \
+  r7 = vec_xmadd(r1, U4, r7); \
+  r8 = vec_xmadd(r1, U0, r8); \
+  \
+  r6 = vec_xxnpmadd(U3, r1, r6); \
+  r7 = vec_xxnpmadd(U4, r1, r7); \
+  r8 = vec_xxnpmadd(U0, r1, r8); \
+  U1 = vec_ld2(0, (double*) &(u)->c22); \
+  \
+  r6 = vec_xmadd(r2, U6, r6); \
+  r7 = vec_xmadd(r2, U7, r7); \
+  r8 = vec_xmadd(r2, U1, r8); \
+  \
+  r6 = vec_xxnpmadd(U6, r2, r6); \
+  r7 = vec_xxnpmadd(U7, r2, r7); \
+  r8 = vec_xxnpmadd(U1, r2, r8); \
+
+
+
+#define _vec_su3_inverse_multiply(u) \
+  U0 = vec_ld2(0, (double*) &(u)->c00); \
+  U1 = vec_ld2(0, (double*) &(u)->c01); \
+  U2 = vec_ld2(0, (double*) &(u)->c02); \
+  \
+  r6 = vec_xmul(U0, r0); \
+  r7 = vec_xmul(U1, r0); \
+  r8 = vec_xmul(U2, r0); \
+  \
+  r6 = vec_xxcpnmadd(r0, U0, r6); \
+  r7 = vec_xxcpnmadd(r0, U1, r7); \
+  r8 = vec_xxcpnmadd(r0, U2, r8); \
+  \
+  U3 = vec_ld2(0, (double*) &(u)->c10); \
+  U4 = vec_ld2(0, (double*) &(u)->c11); \
+  U6 = vec_ld2(0, (double*) &(u)->c12); \
+  \
+  r6 = vec_xmadd(U3, r1, r6); \
+  r7 = vec_xmadd(U4, r1, r7); \
+  r8 = vec_xmadd(U6, r1, r8); \
+  \
+  r6 = vec_xxcpnmadd(r1, U3, r6); \
+  r7 = vec_xxcpnmadd(r1, U4, r7); \
+  r8 = vec_xxcpnmadd(r1, U6, r8); \
+  \
+  U0 = vec_ld2(0, (double*) &(u)->c20); \
+  U1 = vec_ld2(0, (double*) &(u)->c21); \
+  U2 = vec_ld2(0, (double*) &(u)->c22); \
+  \
+  r6 = vec_xmadd(U0, r2, r6); \
+  r7 = vec_xmadd(U1, r2, r7); \
+  r8 = vec_xmadd(U2, r2, r8); \
+  \
+  r6 = vec_xxcpnmadd(r2, U0, r6); \
+  r7 = vec_xxcpnmadd(r2, U1, r7); \
+  r8 = vec_xxcpnmadd(r2, U2, r8); \
+
+
+
+
 // expects the spinor to act on in
 // r0, r1 -> s0
 // r2, r3 -> s1
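
For orientation, the *_double2 macros above keep their whole register file in the caller's scope: r0-r2 and r3-r5 hold the colour components of the two input vectors, r6-r8 and r9-r11 receive the two products, and U0-U7 (U5 is not used by this macro) are scratch for the link entries. A hedged sketch of a call site; su3_32 is an illustrative stand-in for the project's single-precision gauge struct, not its actual definition:

/* Illustrative single-precision SU(3) link: nine _Complex float
 * entries c00..c22, matching the (u)->cXY accesses above.        */
typedef struct {
  _Complex float c00, c01, c02;
  _Complex float c10, c11, c12;
  _Complex float c20, c21, c22;
} su3_32;

void multiply_two_colour_vectors_32(su3_32 *u) {
  vector4double r0, r1, r2, r3, r4, r5;      /* input: two colour vectors */
  vector4double r6, r7, r8, r9, r10, r11;    /* output: u applied to each */
  vector4double U0, U1, U2, U3, U4, U6, U7;  /* gauge entries (scratch)   */
  r0 = r1 = r2 = r3 = r4 = r5 = vec_splats(0.0);  /* placeholder input    */
  _vec_su3_multiply_double2_32(u);
  /* r6..r8 and r9..r11 now hold the two products; they would normally be
   * combined and stored back with the bgq.h macros.                      */
  (void) r6; (void) r7; (void) r8; (void) r9; (void) r10; (void) r11;
}
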
@@ -376,6 +509,49 @@
   r5 = vec_xxnpmadd(U7, r7, r5); \
   r6 = vec_xxnpmadd(U1, r7, r6);
 
+
+
+#define _vec_su3_multiply_double2c_32(u) \
+  r8 = vec_gpci(00145); \
+  r9 = vec_gpci(02367); \
+  U0 = vec_ld2(0, (float*) &(u)->c00); \
+  U3 = vec_ld2(0, (float*) &(u)->c01); \
+  U6 = vec_ld2(0, (float*) &(u)->c02); \
+  U1 = vec_ld2(0, (float*) &(u)->c10); \
+  r7 = vec_perm(r0, r2, r8); \
+  U4 = vec_ld2(0, (float*) &(u)->c11); \
+  U7 = vec_ld2(0, (float*) &(u)->c12); \
+  U2 = vec_ld2(0, (float*) &(u)->c20); \
+  r4 = vec_xmul(r7, U0); \
+  r5 = vec_xmul(r7, U1); \
+  r6 = vec_xmul(r7, U2); \
+  \
+  r4 = vec_xxnpmadd(U0, r7, r4); \
+  r5 = vec_xxnpmadd(U1, r7, r5); \
+  r6 = vec_xxnpmadd(U2, r7, r6); \
+  r7 = vec_perm(r0, r2, r9); \
+  U0 = vec_ld2(0, (float*) &(u)->c21); \
+  \
+  r4 = vec_xmadd(r7, U3, r4); \
+  r5 = vec_xmadd(r7, U4, r5); \
+  r6 = vec_xmadd(r7, U0, r6); \
+  \
+  r4 = vec_xxnpmadd(U3, r7, r4); \
+  r5 = vec_xxnpmadd(U4, r7, r5); \
+  r6 = vec_xxnpmadd(U0, r7, r6); \
+  r7 = vec_perm(r1, r3, r8); \
+  U1 = vec_ld2(0, (float*) &(u)->c22); \
+  \
+  r4 = vec_xmadd(r7, U6, r4); \
+  r5 = vec_xmadd(r7, U7, r5); \
+  r6 = vec_xmadd(r7, U1, r6); \
+  \
+  r4 = vec_xxnpmadd(U6, r7, r4); \
+  r5 = vec_xxnpmadd(U7, r7, r5); \
+  r6 = vec_xxnpmadd(U1, r7, r6);
+
+
+
 #define _vec_su3_multiply_double2ct(u) \
   r8 = vec_gpci(00167); \
   U0 = vec_ld2(0, (double*) &(u)->c00); \
#define _vec_su3_multiply_double2ct(u) \
380556
r8 = vec_gpci(00167); \
381557
U0 = vec_ld2(0, (double*) &(u)->c00); \
@@ -478,6 +654,64 @@
   r11= vec_xxcpnmadd(r5, U2, r11);
 
 
+//same as _vec_su3_inverse_multiply_double2 but for 32bit gauge field
+#define _vec_su3_inverse_multiply_double2_32(u) \
+  U0 = vec_ld2(0, (float*) &(u)->c00); \
+  U1 = vec_ld2(0, (float*) &(u)->c01); \
+  U2 = vec_ld2(0, (float*) &(u)->c02); \
+  \
+  r6 = vec_xmul(U0, r0); \
+  r7 = vec_xmul(U1, r0); \
+  r8 = vec_xmul(U2, r0); \
+  r9 = vec_xmul(U0, r3); \
+  r10= vec_xmul(U1, r3); \
+  r11= vec_xmul(U2, r3); \
+  \
+  r6 = vec_xxcpnmadd(r0, U0, r6); \
+  r7 = vec_xxcpnmadd(r0, U1, r7); \
+  r8 = vec_xxcpnmadd(r0, U2, r8); \
+  r9 = vec_xxcpnmadd(r3, U0, r9); \
+  r10= vec_xxcpnmadd(r3, U1, r10); \
+  r11= vec_xxcpnmadd(r3, U2, r11); \
+  \
+  U3 = vec_ld2(0, (float*) &(u)->c10); \
+  U4 = vec_ld2(0, (float*) &(u)->c11); \
+  U6 = vec_ld2(0, (float*) &(u)->c12); \
+  \
+  r6 = vec_xmadd(U3, r1, r6); \
+  r7 = vec_xmadd(U4, r1, r7); \
+  r8 = vec_xmadd(U6, r1, r8); \
+  r9 = vec_xmadd(U3, r4, r9); \
+  r10= vec_xmadd(U4, r4, r10); \
+  r11= vec_xmadd(U6, r4, r11); \
+  \
+  r6 = vec_xxcpnmadd(r1, U3, r6); \
+  r7 = vec_xxcpnmadd(r1, U4, r7); \
+  r8 = vec_xxcpnmadd(r1, U6, r8); \
+  r9 = vec_xxcpnmadd(r4, U3, r9); \
+  r10= vec_xxcpnmadd(r4, U4, r10); \
+  r11= vec_xxcpnmadd(r4, U6, r11); \
+  \
+  U0 = vec_ld2(0, (float*) &(u)->c20); \
+  U1 = vec_ld2(0, (float*) &(u)->c21); \
+  U2 = vec_ld2(0, (float*) &(u)->c22); \
+  \
+  r6 = vec_xmadd(U0, r2, r6); \
+  r7 = vec_xmadd(U1, r2, r7); \
+  r8 = vec_xmadd(U2, r2, r8); \
+  r9 = vec_xmadd(U0, r5, r9); \
+  r10= vec_xmadd(U1, r5, r10); \
+  r11= vec_xmadd(U2, r5, r11); \
+  \
+  r6 = vec_xxcpnmadd(r2, U0, r6); \
+  r7 = vec_xxcpnmadd(r2, U1, r7); \
+  r8 = vec_xxcpnmadd(r2, U2, r8); \
+  r9 = vec_xxcpnmadd(r5, U0, r9); \
+  r10= vec_xxcpnmadd(r5, U1, r10); \
+  r11= vec_xxcpnmadd(r5, U2, r11);
+
+
+
 #define _vec_su3_inverse_multiply_double2c(u) \
   U0 = vec_ld2(0, (double*) &(u)->c00); \
   r8 = vec_gpci(00145); \
@@ -520,6 +754,52 @@
   r5 = vec_xxcpnmadd(r7, U1, r5); \
   r6 = vec_xxcpnmadd(r7, U2, r6);
 
+
+#define _vec_su3_inverse_multiply_double2c_32(u) \
+  U0 = vec_ld2(0, (float*) &(u)->c00); \
+  r8 = vec_gpci(00145); \
+  r9 = vec_gpci(02367); \
+  U1 = vec_ld2(0, (float*) &(u)->c01); \
+  r7 = vec_perm(r0, r2, r8); \
+  U2 = vec_ld2(0, (float*) &(u)->c02); \
+  \
+  r4 = vec_xmul(U0, r7); \
+  r5 = vec_xmul(U1, r7); \
+  r6 = vec_xmul(U2, r7); \
+  \
+  r4 = vec_xxcpnmadd(r7, U0, r4); \
+  r5 = vec_xxcpnmadd(r7, U1, r5); \
+  r6 = vec_xxcpnmadd(r7, U2, r6); \
+  \
+  r7 = vec_perm(r0, r2, r9); \
+  U3 = vec_ld2(0, (float*) &(u)->c10); \
+  U4 = vec_ld2(0, (float*) &(u)->c11); \
+  U6 = vec_ld2(0, (float*) &(u)->c12); \
+  \
+  r4 = vec_xmadd(U3, r7, r4); \
+  r5 = vec_xmadd(U4, r7, r5); \
+  r6 = vec_xmadd(U6, r7, r6); \
+  \
+  r4 = vec_xxcpnmadd(r7, U3, r4); \
+  r5 = vec_xxcpnmadd(r7, U4, r5); \
+  r6 = vec_xxcpnmadd(r7, U6, r6); \
+  \
+  r7 = vec_perm(r1, r3, r8); \
+  U0 = vec_ld2(0, (float*) &(u)->c20); \
+  U1 = vec_ld2(0, (float*) &(u)->c21); \
+  U2 = vec_ld2(0, (float*) &(u)->c22); \
+  \
+  r4 = vec_xmadd(U0, r7, r4); \
+  r5 = vec_xmadd(U1, r7, r5); \
+  r6 = vec_xmadd(U2, r7, r6); \
+  \
+  r4 = vec_xxcpnmadd(r7, U0, r4); \
+  r5 = vec_xxcpnmadd(r7, U1, r5); \
+  r6 = vec_xxcpnmadd(r7, U2, r6);
+
+
+
+
 #define _vec_su3_inverse_multiply_double2ct(u) \
   U0 = vec_ld2(0, (double*) &(u)->c00); \
   r8 = vec_gpci(00167); \
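
One detail worth noting about the permuting "c" variants just above: besides r0-r3 (input) and r4-r6 (result), they also write r7 (the permuted input) and r8, r9 (the vec_gpci patterns), so none of those can carry live data across the macro. Since the gauge links are unitary, the "inverse" multiply amounts to applying the adjoint link (u-dagger). A hedged sketch of a call site, reusing the illustrative su3_32 stand-in from the earlier sketch:

void inverse_multiply_double2c_32(su3_32 *u) {
  vector4double r0, r1, r2, r3;          /* packed input vectors                 */
  vector4double r4, r5, r6;              /* result                               */
  vector4double r7, r8, r9;              /* clobbered: permuted input + patterns */
  vector4double U0, U1, U2, U3, U4, U6;  /* gauge entries (scratch)              */
  r0 = r1 = r2 = r3 = vec_splats(0.0);   /* placeholder input                    */
  _vec_su3_inverse_multiply_double2c_32(u);
  /* r4..r6 now hold the adjoint link applied to the permuted inputs. */
  (void) r4; (void) r5; (void) r6;
}
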

xlc_prefetch.h (+18, -0)
@@ -28,10 +28,22 @@
 
 #ifdef XLC
 
+#define _prefetch_halfspinor(addr) \
+  __dcbt(((char*)((unsigned long int)(addr))));
+
 #define _prefetch_spinor(addr) \
   __dcbt(((char*)((unsigned long int)(addr)))); \
   __dcbt(((char*)((unsigned long int)(addr)))+128);
 
+#define _prefetch_spinor_32(addr) \
+  __dcbt(((char*)((unsigned long int)(addr))));
+//#define _prefetch_spinor_32(addr)
+
+
+#define _prefetch_su3_32(addr) \
+  __dcbt(((char*)((unsigned long int)(addr))));
+//#define _prefetch_su3_32(addr)
+
 #define _prefetch_su3(addr) \
   __dcbt(((char*)((unsigned long int)(addr)))); \
   __dcbt(((char*)((unsigned long int)(addr)))+128);
@@ -54,10 +66,16 @@ __prefetch_by_load((void*)(addr2));
 
 #else
 
+#define _prefetch_halfspinor(addr)
+
 #define _prefetch_spinor(addr)
 
 #define _prefetch_su3(addr)
 
+#define _prefetch_spinor_32(addr)
+
+#define _prefetch_su3_32(addr)
+
 #endif
 
 #endif
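
A hedged sketch of how the new single-precision prefetch macros might be used: touch the data for the next site while the current one is processed. The loop and the spinor32 type below are illustrative (reusing the su3_32 and su3_vector32 stand-ins from the sketches above); the real hopping-term kernel's data structures and prefetch distance will differ. Note that the _32 variants issue a single __dcbt, whereas _prefetch_spinor and _prefetch_su3 also touch a second cache line at +128 bytes.

/* Illustrative single-precision spinor: four colour vectors. */
typedef struct { su3_vector32 s0, s1, s2, s3; } spinor32;

void sweep_32(su3_32 *g, spinor32 *p, int N) {
  for (int i = 0; i < N; i++) {
    if (i + 1 < N) {                    /* prefetch one site ahead */
      _prefetch_su3_32(&g[i + 1]);
      _prefetch_spinor_32(&p[i + 1]);
    }
    /* ... apply the hopping term for site i with the _32 load,
     * multiply and store macros ... */
  }
}
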
