diff --git a/detect_faces.py b/detect_faces.py index 9271fe6..04a53a9 100755 --- a/detect_faces.py +++ b/detect_faces.py @@ -11,7 +11,6 @@ import argparse import numpy as np -import net_s3fd from bbox import decode, nms def detect_faces(net:nn.Module, img:np.ndarray, minscale:int=3, ovr_threshhold:float=0.3, @@ -36,7 +35,6 @@ def detect(net:nn.Module, img:np.ndarray, minscale:int=3) -> torch.Tensor: This will have LOTS of similar/overlapping regions. Need to call bbox.nms to reconcile them. Setting minscale to 0 finds the smallest faces, but takes the longest. """ - start_time = time.time() img = img - np.array([104,117,123]) img = img.transpose(2, 0, 1) img = img.reshape((1,)+img.shape) @@ -44,26 +42,25 @@ def detect(net:nn.Module, img:np.ndarray, minscale:int=3) -> torch.Tensor: img = Variable(torch.from_numpy(img).float()).cuda() BB,CC,HH,WW = img.size() olist = net(img) - print(f"Running CNN took {1000*(time.time() - start_time):.1f}ms") bboxlist = [] - for i in range(len(olist)//2): - olist[i*2] = F.softmax(olist[i*2], dim=1) for i in range(minscale, len(olist)//2): - #print(f"Going through olist {i} at {1000*(time.time() - start_time):.1f}ms. bboxlist has {len(bboxlist)} entries") - ocls,oreg = olist[i*2].data,olist[i*2+1].data + ocls = F.softmax(olist[i*2], dim=1).data + oreg = olist[i*2+1].data FB,FC,FH,FW = ocls.size() # feature map size stride = 2**(i+2) # 4,8,16,32,64,128 anchor = stride*4 - for Findex in range(FH*FW): # Run a sliding window over the whole thing... - windex,hindex = Findex%FW,Findex//FW - score = ocls[0,1,hindex,windex] - if score<0.05: - continue + # this workload is small enough that it's faster on CPU than GPU (~55ms vs ~65ms) + # but most of that time (40ms) is spent moving the data from GPU to CPU. 
+ all_scores = ocls[0,1,:,:].cpu() + oreg = oreg.cpu() + # instead of running a sliding window, first find the places where score is big enough to bother + bigenough = torch.nonzero(all_scores > 0.05) + for hindex, windex in bigenough: + score = all_scores[hindex,windex] loc = oreg[0,:,hindex,windex].contiguous().view(1,4) axc,ayc = stride/2+windex*stride,stride/2+hindex*stride - priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]]).cuda() - priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]]).cuda() + priors = torch.Tensor([[axc/1.0,ayc/1.0,stride*4/1.0,stride*4/1.0]]) variances = [0.1,0.2] box = decode(loc,priors,variances) x1,y1,x2,y2 = box[0]*1.0