From 995f05020e08ebdc2a55c2cd5da5d4b5c5545052 Mon Sep 17 00:00:00 2001
From: Dom Heinzeller
Date: Fri, 2 Jun 2017 13:11:01 +0200
Subject: [PATCH 1/3] Speedup of mpas_binary_search, making use of the fact
 that the list to search is sorted

---
 src/framework/mpas_sort.F | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/framework/mpas_sort.F b/src/framework/mpas_sort.F
index 2b5d28cd52..d27a0b12f9 100644
--- a/src/framework/mpas_sort.F
+++ b/src/framework/mpas_sort.F
@@ -412,10 +412,14 @@ end subroutine mpas_quicksort_2dreal!}}}
 !  integer function mpas_binary_search
 !
 !> \brief   MPAS Binary search routine
-!> \author  Michael Duda
-!> \date    03/27/13
+!> \author  Michael Duda, modified by Dom Heinzeller
+!> \date    03/27/13, modified 05/29/17
 !> \details
-!>  This routine performs a binary search in array for the key. It either returns the index of the key within array, or n2+1 if the key is not found.
+!>  This routine performs a binary search in array for the key. It either
+!>  returns the index of the key within array, or n2+1 if the key is not
+!>  found. Like every binary search, it requires the array to be sorted.
+!>  This makes it possible to skip the search entirely if the key is
+!>  smaller than the first item in the array or larger than the last item.
 !
 !-----------------------------------------------------------------------
 integer function mpas_binary_search(array, d1, n1, n2, key)!{{{
@@ -429,6 +433,10 @@ integer function mpas_binary_search(array, d1, n1, n2, key)!{{{
 
    mpas_binary_search = n2+1
 
+   if (key < array(1,n1) .or. key > array(1,n2)) then
+      return
+   end if
+
    l = n1
    u = n2
    k = (l+u)/2

From 8ca3843a1cc17441d06ccecd640d4253595966a9 Mon Sep 17 00:00:00 2001
From: Dom Heinzeller
Date: Fri, 2 Jun 2017 13:13:35 +0200
Subject: [PATCH 2/3] Threading of binary searches in the stream manager

---
 src/framework/mpas_stream_manager.F | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/framework/mpas_stream_manager.F b/src/framework/mpas_stream_manager.F
index cc96e77d28..c7be37129c 100644
--- a/src/framework/mpas_stream_manager.F
+++ b/src/framework/mpas_stream_manager.F
@@ -5245,6 +5245,7 @@ subroutine postread_reindex(allFields, streamFields)   !{{{
          call mpas_quicksort(indexSpaceDim, sortedID)
 
          ! Reindex the field
+!$OMP parallel do default(none) shared(outerDim, innerDim, sortedID, indexSpaceDim, int2DField) private(i,j,k)
          do i = 1, outerDim
             do j = 1, innerDim
               k = mpas_binary_search(sortedID, 2, 1, indexSpaceDim, int2DField % array(j,i))
@@ -5255,6 +5256,7 @@ subroutine postread_reindex(allFields, streamFields)   !{{{
               end if
            end do
         end do
+!$OMP end parallel do
 
         deallocate(sortedID)
         int2DField => int2DField % next

From 53372d106980618eb678c8e710750cfc1f7df803 Mon Sep 17 00:00:00 2001
From: Dom Heinzeller
Date: Fri, 2 Jun 2017 13:33:20 +0200
Subject: [PATCH 3/3] Improved runtime performance in the key routine
 mpas_dmpar_get_exch_list: use OpenMP threading to speed up calls to
 mpas_binary_search

---
 src/framework/mpas_dmpar.F | 56 ++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/src/framework/mpas_dmpar.F b/src/framework/mpas_dmpar.F
index 118fdf82c6..2113b32c48 100644
--- a/src/framework/mpas_dmpar.F
+++ b/src/framework/mpas_dmpar.F
@@ -1497,7 +1497,8 @@ subroutine mpas_dmpar_get_exch_list(haloLayer, ownedListField, neededListField,
       type (hashtable) :: neededHash
       integer :: nUniqueNeededList, threadNum
       integer, dimension(:,:), pointer :: uniqueSortedNeededList
-
+      ! For threading of binary search blocks
+      integer, dimension(:), allocatable :: karray
 
       !
       ! *** NB: This code assumes that block % blockID values are local block IDs and are in the range [1, numBlocks]
@@ -1756,28 +1757,34 @@ subroutine mpas_dmpar_get_exch_list(haloLayer, ownedListField, neededListField,
          totalSent = 0
         currentProc = mod(dminfo % my_proc_id + dminfo % nprocs - i + 1, dminfo % nprocs)
 
+        allocate(karray(1:nMesgRecv))
+        karray = nOwnedList + 1
+!$OMP parallel do default(shared) private(j)
         do j=1,nMesgRecv
            if (ownerListIn(j) > 0) then
-              k = mpas_binary_search(ownedListSorted, 2, 1, nOwnedList, ownerListIn(j))
-              if (k <= nOwnedList) then
-                 iBlock = ownedBlock(ownedListSorted(2,k)) + 1
-                 numToSend(iBlock) = numToSend(iBlock) + 1
-                 totalSent = totalSent + 1
+              karray(j) = mpas_binary_search(ownedListSorted, 2, 1, nOwnedList, ownerListIn(j))
+           end if
+        end do
+!$OMP end parallel do
+        do j=1,nMesgRecv
+           if (karray(j) <= nOwnedList) then
+              iBlock = ownedBlock(ownedListSorted(2,karray(j))) + 1
+              numToSend(iBlock) = numToSend(iBlock) + 1
+              totalSent = totalSent + 1
 
-                 ! recipientList(1,:) represents the index in the srcList to place this data
-                 recipientList(1,ownedListSorted(2,k)) = numToSend(iBlock)
-                 ! recipientList(2,:) represnets the index in the buffer to place this data
-                 recipientList(2,ownedListSorted(2,k)) = totalSent
+              ! recipientList(1,:) represents the index in the srcList to place this data
+              recipientList(1,ownedListSorted(2,karray(j))) = numToSend(iBlock)
+              ! recipientList(2,:) represents the index in the buffer to place this data
+              recipientList(2,ownedListSorted(2,karray(j))) = totalSent
 
              ownerListOut(j) = -1 * dminfo % my_proc_id
           else
              ownerListOut(j) = ownerListIn(j)
           end if
-          else
-             ownerListOut(j) = ownerListIn(j)
-          end if
        end do
 
+        deallocate(karray)
+
        fieldCursor => ownedListField
        do while (associated(fieldCursor))
           iBlock = fieldCursor % block % localBlockID + 1
@@ -1854,15 +1861,24 @@ subroutine mpas_dmpar_get_exch_list(haloLayer, ownedListField, neededListField,
 
        fieldCursor => neededListField
        do while (associated(fieldCursor))
+
+         allocate(karray(1:fieldCursor % dimSizes(1)))
+!$OMP parallel do default(shared) private(j)
          do j = 1, fieldCursor % dimSizes(1)
-           k = mpas_binary_search(uniqueSortedNeededList, 2, 1, nUniqueNeededList, fieldCursor % array(j))
-           if(k <= nUniqueNeededList) then
-              if(ownerListIn(k) == -i) then
-                 iBlock = fieldCursor % block % localBlockID + 1
-                 numToRecv(iBlock) = numToRecv(iBlock) + 1
-              end if
+           karray(j) = mpas_binary_search(uniqueSortedNeededList, 2, 1, nUniqueNeededList, fieldCursor % array(j))
+        end do
+!$OMP end parallel do
+        do j = 1, fieldCursor % dimSizes(1)
+           if(karray(j) <= nUniqueNeededList) then
+              if(ownerListIn(karray(j)) == -i) then
+                 iBlock = fieldCursor % block % localBlockID + 1
+                 numToRecv(iBlock) = numToRecv(iBlock) + 1
+              end if
            end if
        end do
+
+        deallocate(karray)
+
        fieldCursor => fieldCursor % next
     end do