From f671c669f8cae460911c1e016e9e44297e817d79 Mon Sep 17 00:00:00 2001 From: Anthony Scemama Date: Mon, 17 Jun 2024 14:57:48 +0200 Subject: [PATCH] Use less memory in Cholesky --- src/ao_two_e_ints/cholesky.irp.f | 374 +++++++------------------- src/hartree_fock/fock_matrix_hf.irp.f | 21 +- 2 files changed, 117 insertions(+), 278 deletions(-) diff --git a/src/ao_two_e_ints/cholesky.irp.f b/src/ao_two_e_ints/cholesky.irp.f index d731ef042..a680e7eee 100644 --- a/src/ao_two_e_ints/cholesky.irp.f +++ b/src/ao_two_e_ints/cholesky.irp.f @@ -31,14 +31,14 @@ integer*8 :: ndim8 integer :: rank double precision :: tau, tau2 - double precision, pointer :: L(:,:) + double precision, pointer :: L(:,:), Delta(:,:) double precision :: s double precision :: dscale, dscale_tmp - double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:), Delta(:,:) + double precision, allocatable :: D(:), Ltmp_p(:,:), Ltmp_q(:,:), D_sorted(:), Delta_col(:) integer, allocatable :: addr1(:), addr2(:) - integer*8, allocatable :: Lset(:), Dset(:), addr3(:) + integer*8, allocatable :: Lset(:), Dset(:) logical, allocatable :: computed(:) integer :: i,j,k,m,p,q, dj, p2, q2, ii, jj @@ -64,11 +64,8 @@ type(c_ptr) :: c_pointer(2) integer :: fd(2) - logical :: delta_on_disk - integer :: dgemm_block_size, nqq - double precision, allocatable :: dgemm_buffer1(:,:), dgemm_buffer2(:,:), dgemm_buffer3(:,:) - PROVIDE nproc + PROVIDE nproc ao_cholesky_threshold do_direct_integrals qp_max_mem PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) @@ -88,19 +85,8 @@ else - PROVIDE nucl_coord ao_two_e_integral_schwartz call set_multiple_levels_omp(.False.) - call resident_memory(mem0) - - rank_max = min(ndim8,(qp_max_mem*1024_8*1024_8*1024_8/8_8)/ndim8) - call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) - call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) - ! Deleting the file while it is open makes the file invisible on the filesystem, - ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') - close(iunit,status='delete') - if (do_direct_integrals) then if (ao_two_e_integral(1,1,1,1) < huge(1.d0)) then ! Trigger providers inside ao_two_e_integral @@ -113,8 +99,12 @@ tau = ao_cholesky_threshold tau2 = tau*tau - mem = 6.d0 * memory_of_double8(ndim8) + 6.d0 * memory_of_int8(ndim8) - call check_mem(mem, irp_here) + rank = 0 + + allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) + allocate( addr1(ndim8), addr2(ndim8), Delta_col(ndim8) ) + + call resident_memory(mem0) call print_memory_usage() @@ -127,46 +117,35 @@ print *, '============ =============' - rank = 0 - - allocate( D(ndim8), Lset(ndim8), Dset(ndim8), D_sorted(ndim8)) - allocate( addr1(ndim8), addr2(ndim8), addr3(ndim8) ) -!print *, 'allocate : (D(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Lset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (Dset(ndim8))', memory_of_int8(ndim8) -!print *, 'allocate : (4,addr(ndim8))', memory_of_int8(4_8*ndim8) - ! 1. - k=0 + i8=0 do j=1,ao_num do i=1,ao_num - k = k+1 - addr1(k) = i - addr2(k) = j - addr3(k) = (i-1)*ao_num + j + i8 = i8+1 + addr1(i8) = i + addr2(i8) = j enddo enddo if (do_direct_integrals) then - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = ao_two_e_integral(addr1(i8), addr2(i8), & addr1(i8), addr2(i8)) enddo !$OMP END PARALLEL DO else - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,16) + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i8) SCHEDULE(dynamic,21) do i8=ndim8,1,-1 D(i8) = get_ao_two_e_integral(addr1(i8), addr1(i8), & - addr2(i8), addr2(i8), & - ao_integrals_map) + addr2(i8), addr2(i8), ao_integrals_map) enddo !$OMP END PARALLEL DO endif + D_sorted(:) = -D(:) call dsort_noidx_big(D_sorted,ndim8) - D_sorted(:) = dabs(D_sorted(:)) - + D_sorted(:) = -D_sorted(:) Dmax = D_sorted(1) ! 2. @@ -174,12 +153,24 @@ dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( dscale_tmp*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif enddo np = np8 + if (np <= 0) stop 'np<=0' + if (np > ndim8) stop 'np>ndim8' + + rank_max = min(np,20*elec_num*elec_num) + call mmap(trim(ezfio_work_dir)//'cholesky_ao_tmp', (/ ndim8, rank_max /), 8, fd(1), .False., .True., c_pointer(1)) + call c_f_pointer(c_pointer(1), L, (/ ndim8, rank_max /)) + + ! Deleting the file while it is open makes the file invisible on the filesystem, + ! and automatically deleted, even if the program crashes + iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_ao_tmp', 'R') + close(iunit,status='delete') + ! 3. N = 0 @@ -187,85 +178,59 @@ ! 4. i = 0 + mem = memory_of_double(np) & ! Delta(np,nq) + + (np+1)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + +! call check_mem(mem) + ! 5. - do while ( (Dmax > tau).and.(rank*1_8 < min(ndim8,rank_max)) ) + do while ( (Dmax > tau).and.(np > 0) ) ! a. i = i+1 - ! Inrease s until the arrays fit in memory - s = 0.01d0 block_size = max(N,24) - do while (.True.) - ! b. - Dmin = max(s*Dmax,tau) + ! Determine nq so that Delta fits in memory - ! c. - nq=0 - do p=1,np - if ( D(Lset(p)) > Dmin ) then - nq = nq+1 - Dset(nq) = Lset(p) - endif - enddo + s = 0.1d0 + Dmin = max(s*Dmax,tau) + do nq=2,np-1 + if (D_sorted(nq) < Dmin) exit + enddo + do while (.True.) - mem = mem0 & - + np*memory_of_double(nq) + mem = mem0 & + + np*memory_of_double(nq) & ! Delta(np,nq) + + (np+nq)*memory_of_double(block_size) & ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) + + memory_of_int(nq) ! computed(nq) -!print *, 'mem = ', mem - if (mem > qp_max_mem/2) then - s = s*2.d0 + if (mem > qp_max_mem*0.5d0) then + nq = nq/2 else exit endif - if ((s > 1.d0).or.(nq == 0)) then - call print_memory_usage() - print *, 'Required peak memory: ', mem, 'Gb' - call resident_memory(mem) - print *, 'Already used memory: ', mem, 'Gb' - print *, 'Not enough memory. Reduce cholesky threshold' - stop -1 - endif - - if (s > 0.3d0) then - exit - endif - enddo - ! d., e. - mem = mem0 & - + memory_of_int(nq) &! computed(nq) - + np*memory_of_int(nq) &! computed(nq) - + memory_of_double(np) &! Delta_col(np) - + 7*memory_of_double8(ndim8) &! D, Lset, Dset, D_sorted, addr[1-3] - + np*memory_of_double(nq) &! Delta(np,nq) - + (np+nq)*memory_of_double(block_size) ! Ltmp_p(np,block_size) + Ltmp_q(nq,block_size) - - if (1.1*mem > qp_max_mem) then -! call mmap(trim(ezfio_work_dir)//'cholesky_delta', (/ np*1_8, nq*1_8 /), 8, fd(2), .False., .True., c_pointer(2)) -! call c_f_pointer(c_pointer(2), Delta, (/ np, nq /)) - -! ! Deleting the file while it is open makes the file invisible on the filesystem, -! ! and automatically deleted, even if the program crashes - iunit = getUnitAndOpen(trim(ezfio_work_dir)//'cholesky_delta', 'R') - close(iunit,status='delete') - open(unit=iunit, access='DIRECT', form='UNFORMATTED', RECL=(np+1)*8, & - ASYNCHRONOUS='YES', file=trim(ezfio_work_dir)//'cholesky_delta') - delta_on_disk = .True. - else - allocate(Delta(np,nq)) - delta_on_disk = .False. + if (nq <= 0) then + print *, nq + stop 'bug in cholesky: nq <= 0' endif -!print *, delta_on_disk - allocate(Delta_col(np)) + Dmin = D_sorted(nq) + nq=0 + do p=1,np + if ( D(Lset(p)) >= Dmin ) then + nq = nq+1 + Dset(nq) = Lset(p) + endif + enddo + + allocate(Delta(np,nq)) allocate(Ltmp_p(np,block_size), stat=ierr) -!print *, 'allocate : Ltmp_p(np,block_size)', memory_of_double8(np*block_size*1_8), np, block_size if (ierr /= 0) then call print_memory_usage() @@ -274,7 +239,6 @@ endif allocate(Ltmp_q(nq,block_size), stat=ierr) -!print *, 'allocate : Ltmp_q(nq,block_size)', memory_of_double8(nq*block_size*1_8), nq, block_size if (ierr /= 0) then call print_memory_usage() @@ -287,7 +251,6 @@ computed(:) = .False. -!print *, 'N, rank, block_size', N, rank, block_size !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(k,p,q) do k=1,N !$OMP DO @@ -305,81 +268,18 @@ !$OMP BARRIER !$OMP END PARALLEL - PROVIDE nproc - - if (delta_on_disk) then - - dgemm_block_size = nproc*4 - - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer3(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,N)) - - if (N>0) then - ! Blocking improves I/O performance - - - do jj=1,nq,dgemm_block_size - - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,ii) - do ii=1,N - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END PARALLEL DO - -print *, np, nq, jj, nqq, N - call dgemm('N', 'T', np, nqq, N, -1.d0, & - Ltmp_p, np, dgemm_buffer2, dgemm_block_size, 0.d0, dgemm_buffer1, np) - - wait(iunit) - dgemm_buffer3(:,:) = dgemm_buffer1(:,:) -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(1:np, q-jj+1) - enddo -! !$OMP END PARALLEL DO - - enddo -print *, 'ok1' - - else - - - dgemm_buffer1(1:np,1) = 0.d0 - -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=1,nq - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer1(1:np, 1) - enddo -! !$OMP END PARALLEL DO - - endif - - deallocate(dgemm_buffer1, dgemm_buffer2) - if (delta_on_disk) wait(iunit) - deallocate(dgemm_buffer3) - - - else - - if (N>0) then + if (N>0) then call dgemm('N', 'T', np, nq, N, -1.d0, & Ltmp_p(1,1), np, Ltmp_q(1,1), nq, 0.d0, Delta, np) - else - - !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) - do q=1,nq - Delta(:,q) = 0.d0 - enddo - !$OMP END PARALLEL DO + else - endif + !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q,j) + do q=1,nq + Delta(:,q) = 0.d0 + enddo + !$OMP END PARALLEL DO endif @@ -395,64 +295,20 @@ do j=1,nq if ( (Qmax <= Dmin).or.(N+j*1_8 > ndim8) ) exit + ! i. rank = N+j + if (rank == rank_max) then + print *, 'cholesky: rank_max reached' + exit + endif if (iblock == block_size) then - if (delta_on_disk) then - ! Blocking improves I/O performance - - dgemm_block_size = nproc*4 - allocate (dgemm_buffer1(np,dgemm_block_size)) - allocate (dgemm_buffer2(dgemm_block_size,block_size)) - - do jj=1,nq,dgemm_block_size - nqq = min(nq, jj+dgemm_block_size-1) - jj + 1 - - !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(q,ii) - !$OMP DO - do ii=1,block_size - do q=jj,jj+nqq-1 - dgemm_buffer2(q-jj+1,ii) = Ltmp_q(q,ii) - enddo - enddo - !$OMP END DO - !$OMP END PARALLEL - -! !$OMP DO - do q=jj,jj+nqq-1 - read(iunit, rec=q) dgemm_buffer1(1:np, q-jj+1) - enddo -! !$OMP END DO - -print *, np, nq, jj, nqq, block_size - call dgemm('N', 'T', np, nqq, block_size, -1.d0, & - Ltmp_p(1,1), np, dgemm_buffer2, dgemm_block_size, 1.d0, dgemm_buffer1, np) - - wait(iunit) - dgemm_buffer3 = dgemm_buffer1 - -! !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(q) - do q=jj,jj+nqq-1 - write(iunit, ASYNCHRONOUS='YES', rec=q) dgemm_buffer3(:, q-jj+1) - enddo -! !$OMP END PARALLEL DO - - enddo -print *, 'ok' - deallocate(dgemm_buffer1, dgemm_buffer2) - wait(iunit) - deallocate(dgemm_buffer3) - - else - - call dgemm('N','T',np,nq,block_size,-1.d0, & + call dgemm('N','T',np,nq,block_size,-1.d0, & Ltmp_p, np, Ltmp_q, nq, 1.d0, Delta, np) - endif - - iblock = 0 + iblock = 0 endif @@ -469,71 +325,47 @@ enddo iblock = iblock+1 - - if (delta_on_disk) then - - read(iunit,rec=dj) Ltmp_p(1:np,iblock) - - else - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Delta(p,dj) - enddo - !$OMP END PARALLEL DO - - endif + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Delta(p,dj) + enddo + !$OMP END PARALLEL DO if (.not.computed(dj)) then m = dj if (do_direct_integrals) then !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & ao_two_e_integral(addr1(Lset(k)), addr2(Lset(k)),& addr1(Dset(m)), addr2(Dset(m))) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO else + PROVIDE ao_integrals_map !$OMP PARALLEL DO PRIVATE(k) SCHEDULE(dynamic,21) do k=1,np + Delta_col(k) = 0.d0 if (.not.ao_two_e_integral_zero( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)) ) ) then Delta_col(k) = & get_ao_two_e_integral( addr1(Lset(k)), addr1(Dset(m)),& addr2(Lset(k)), addr2(Dset(m)), ao_integrals_map) - else - Delta_col(k) = 0.d0 endif enddo !$OMP END PARALLEL DO endif - if (delta_on_disk) then - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) - enddo - !$OMP END PARALLEL DO - - write(iunit, ASYNCHRONOUS='YES', rec=dj) Ltmp_p(1:np,iblock) - - else - - !$OMP PARALLEL DO PRIVATE(p) - do p=1,np - Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) - Delta(p,dj) = Ltmp_p(p,iblock) - enddo - !$OMP END PARALLEL DO - - endif + !$OMP PARALLEL DO PRIVATE(p) + do p=1,np + Ltmp_p(p,iblock) = Ltmp_p(p,iblock) + Delta_col(p) + Delta(p,dj) = Ltmp_p(p,iblock) + enddo + !$OMP END PARALLEL DO computed(dj) = .True. endif @@ -572,30 +404,26 @@ print '(I10, 4X, ES12.3)', rank, Qmax - deallocate(Delta_col) deallocate(Ltmp_p) deallocate(Ltmp_q) deallocate(computed) - if (delta_on_disk) then - close(iunit, status='delete') - else - deallocate(Delta) - endif + deallocate(Delta) ! i. N = rank ! j. - Dmax = D(Lset(1)) - do p=1,np - Dmax = max(Dmax, D(Lset(p))) - enddo + D_sorted(:) = -D(:) + call dsort_noidx_big(D_sorted,ndim8) + D_sorted(:) = -D_sorted(:) + + Dmax = D_sorted(1) dscale = 1.d0 dscale_tmp = dscale*dscale*Dmax np8=0_8 do p8=1,ndim8 - if ( dscale_tmp*D(p8) > tau2 ) then + if ( dscale_tmp*D(p8) >= tau2 ) then np8 = np8+1_8 Lset(np8) = p8 endif @@ -609,7 +437,6 @@ print *, '' allocate(cholesky_ao(ao_num,ao_num,rank), stat=ierr) -!print *, 'allocate : cholesky_ao(ao_num,ao_num,rank)', memory_of_double8(ao_num*ao_num*rank*1_8) if (ierr /= 0) then call print_memory_usage() @@ -621,7 +448,7 @@ !$OMP PARALLEL DO PRIVATE(k,j) do k=1,rank do j=1,ao_num - cholesky_ao(1:ao_num,j,k) = L((j-1)*ao_num+1:j*ao_num,k) + cholesky_ao(1:ao_num,j,k) = L((j-1_8)*ao_num+1_8:1_8*j*ao_num,k) enddo enddo !$OMP END PARALLEL DO @@ -646,5 +473,6 @@ call wall_time(wall1) print*,'Time to provide AO cholesky vectors = ',(wall1-wall0)/60.d0, ' min' + END_PROVIDER diff --git a/src/hartree_fock/fock_matrix_hf.irp.f b/src/hartree_fock/fock_matrix_hf.irp.f index 65b3d63cf..6d917322c 100644 --- a/src/hartree_fock/fock_matrix_hf.irp.f +++ b/src/hartree_fock/fock_matrix_hf.irp.f @@ -194,17 +194,28 @@ endif - double precision :: rss + double precision :: rss, mem0, mem double precision :: memory_of_double integer :: iblock - integer, parameter :: block_size = 32 + integer :: block_size + + call resident_memory(mem0) + + block_size = 1024 + + rss = memory_of_double(2.d0*ao_num*ao_num) + do + mem = mem0 + block_size*rss + if ( (block_size < 2).or.(mem < qp_max_mem) ) exit + block_size = block_size/2 + enddo + + call check_mem(block_size*rss, irp_here) - rss = memory_of_double(ao_num*ao_num) - call check_mem(2.d0*block_size*rss, irp_here) allocate(X2(ao_num,ao_num,block_size,2)) allocate(X3(ao_num,block_size,ao_num,2)) - + ! ao_two_e_integral_alpha_chol (l,s) -= cholesky_ao(l,m,j) * SCF_density_matrix_ao_beta (m,n) * cholesky_ao(n,s,j) do iblock=1,cholesky_ao_num,block_size