CIS565-Fall-2016 · reddeupenn · Nov 15, 2016
diff --git a/README.md b/README.md
@@ -3,13 +3,25 @@ Vulkan Flocking: compute and shading in one pipeline!
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 6**
 
-* (TODO) YOUR NAME HERE
-  Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Rony Edde (redde)
+  Windows 10, i7-6700k @ 4.00GHz 64GB, GTX 980M 8GB (Personal Laptop)
 
-  ### (TODO: Your README)
+  * ![capture](./vulkan_boids.gif)  
+
+* This is a 2D flocking engine using Vulkan.
+
+* Vulkan expects explicit descriptors because it's optimized in such a way that the commands expect a framework with a certain ordering of variables and layout locations.
+
+* Multiple descriptor sets can be used for different stages of a pipeline, such as post processing, compute or additional processes.
+
+* Increasing the number of Vulkan queues can create a bottleneck.  The simple reason is synchronization between queues, which can involve locking.  When multithreading queues, the more granular the queues, the more synchronization is needed for dependent queues. Queues of the same family should reduce the bottleneck that queues from different families could have.  There are, however, specialized vendor-specific dedicated transfer queues that can also be used and are worth trying out.
+
+* Compute commands can be very powerful when they can share data with a rendering pipeline.  Data transfer is usually a bottleneck, but once the data is shared, it's readily accessible, which removes the delays introduced when accessing and overriding the data using standard copying.
+
+* Performance.
+  * Performance-wise, without optimization, increasing the number of boids dramatically reduces performance.  Attempting to use an AABB calculation seems to lead to poorer performance.  This is possibly due to the additional checks for every boid.  This improvement would be more beneficial if computed prior to sending all the boids to the compute shader.  Here are the results:  
+  * ![results](./performance.png)  
 
-  Include screenshots, analysis, etc. (Remember, this is public, so don't put
-  anything here that you don't want to share with the world.)
 
 ### Credits
 

diff --git a/data/shaders/computeparticles/particle.comp b/data/shaders/computeparticles/particle.comp
@@ -5,8 +5,8 @@
 
 struct Particle
 {
-	vec2 pos;
-	vec2 vel;
+    vec2 pos;
+    vec2 vel;
 };
 
 // LOOK: These bindings correspond to the DescriptorSetLayouts and
@@ -15,13 +15,13 @@ struct Particle
 // Binding 0 : Particle storage buffer (read)
 layout(std140, binding = 0) buffer ParticlesA
 {
-   Particle particlesA[ ];
+    Particle particlesA[ ];
 };
 
 // Binding 1 : Particle storage buffer (write)
 layout(std140, binding = 1) buffer ParticlesB
 {
-   Particle particlesB[ ];
+    Particle particlesB[ ];
 };
 
 layout (local_size_x = 16, local_size_y = 16) in;
@@ -31,44 +31,117 @@ layout (local_size_x = 16, local_size_y = 16) in;
 // frame rate.
 layout (binding = 2) uniform UBO
 {
-	float deltaT;
-	float rule1Distance;
-	float rule2Distance;
-	float rule3Distance;
-	float rule1Scale;
-	float rule2Scale;
-	float rule3Scale;
-	int particleCount;
+    float deltaT;
+    float rule1Distance;
+    float rule2Distance;
+    float rule3Distance;
+    float rule1Scale;
+    float rule2Scale;
+    float rule3Scale;
+    int particleCount;
 } ubo;
 
 void main()
 {
-		// LOOK: This is very similar to a CUDA kernel.
-		// Right now, the compute shader only advects the particles with their
-		// velocity and handles wrap-around.
-		// TODO: implement flocking behavior.
+    // LOOK: This is very similar to a CUDA kernel.
+    // Right now, the compute shader only advects the particles with their
+    // velocity and handles wrap-around.
+    // TODO: implement flocking behavior.
 
     // Current SSBO index
     uint index = gl_GlobalInvocationID.x;
-	// Don't try to write beyond particle count
+    // Don't try to write beyond particle count
     if (index >= ubo.particleCount)
-		return;
+	    return;
+
+
+    // loop through particles
+    uint numNeighborsCenter = 0;
+    uint numNeighborsVelocity = 0;
+
+    vec2 center = vec2(0.0, 0.0);
+    vec2 velocity= vec2(0.0, 0.0);
+    vec2 prox = vec2(0.0, 0.0);
+
+    float maxSpeed = 1.0;
+
+    /*
+    // for AABB optimization on large number of particles
+    float maxDist = ubo.rule1Distance;
+    if (maxDist < ubo.rule2Distance)
+        maxDist =  ubo.rule2Distance;
+    if (maxDist < ubo.rule3Distance)
+        maxDist =  ubo.rule3Distance;
+    */
 
     // Read position and velocity
-		vec2 vPos = particlesA[index].pos.xy;
+	vec2 vPos = particlesA[index].pos.xy;
     vec2 vVel = particlesA[index].vel.xy;
 
-		// clamp velocity for a more pleasing simulation.
-		vVel = normalize(vVel) * clamp(length(vVel), 0.0, 0.1);
 
-		// kinematic update
-		vPos += vVel * ubo.deltaT;
+    for (int i = 0; i < ubo.particleCount; i++)
+	{
+		if (i != index)
+		{
+            /*
+            // check bounds first SLOWER
+            if (abs(particlesA[i].pos.x - vPos.x) > maxDist || 
+                abs(particlesA[i].pos.y - vPos.y) > maxDist)
+                continue;
+            else*/
+            {
+			    float dist = distance(particlesA[i].pos, vPos);
+			    //printf("\ndist: %f", dist);
+			    if (dist < ubo.rule1Distance)
+			    {
+				    center = center + particlesA[i].pos;
+				    numNeighborsCenter++;
+			    }
+			    if (dist < ubo.rule2Distance)
+			    {
+				    prox = prox - (particlesA[i].pos - vPos);
+			    }
+			    if (dist < ubo.rule3Distance)
+			    {
+				    velocity = velocity + particlesA[i].vel;
+				    numNeighborsVelocity++;
+			    }
+            }
+		}
+	}
+
+
+	if (numNeighborsCenter != 0)
+	{
+		center /= numNeighborsCenter;
+		vVel += (center - vPos) * ubo.rule1Scale;
+	}
+
+	if (numNeighborsVelocity != 0)
+	{
+		//velocity = velocity / (float)numNeighborsVelocity;
+		vVel += velocity * ubo.rule3Scale;
+	}
+
+	vVel += prox * ubo.rule2Scale;
+
+	if (length(vVel) > maxSpeed)
+	{
+		vVel = normalize(vVel) * maxSpeed;
+	}
+
+	// clamp velocity for a more pleasing simulation.
+	vVel = normalize(vVel) * clamp(length(vVel), 0.0, 0.1);
+
+	// kinematic update
+	vPos += vVel * ubo.deltaT;
 
     // Wrap around boundary
-		if (vPos.x < -1.0) vPos.x = 1.0;
-		if (vPos.x > 1.0) vPos.x = -1.0;
-		if (vPos.y < -1.0) vPos.y = 1.0;
-		if (vPos.y > 1.0) vPos.y = -1.0;
+	if (vPos.x < -1.0) vPos.x = 1.0;
+	if (vPos.x > 1.0) vPos.x = -1.0;
+	if (vPos.y < -1.0) vPos.y = 1.0;
+	if (vPos.y > 1.0) vPos.y = -1.0;
+
 
     particlesB[index].pos.xy = vPos;
 

diff --git a/data/shaders/computeparticles/particle.comp.spv b/data/shaders/computeparticles/particle.comp.spv
diff --git a/data/shaders/computeparticles/particle.frag b/data/shaders/computeparticles/particle.frag
@@ -11,4 +11,5 @@ layout (location = 0) out vec4 outFragColor;
 void main ()
 {
 	outFragColor.rgb = vec3(inColor.x, abs(inColor.y), -inColor.x) * 10.0;
+	//outFragColor.rgb = vec3(1.0, 0.0, 0.0) * 10.0;
 }
diff --git a/data/shaders/computeparticles/particle.vert b/data/shaders/computeparticles/particle.vert
@@ -23,4 +23,5 @@ void main ()
   gl_PointSize = 2.0;
   outColor = inVel;
   gl_Position = vec4(inPos.xy, 1.0, 1.0);
+  //outColor.rgb = vec3(1.0, 0.0, 0.0);
 }
diff --git a/perf.py b/perf.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# a bar plot with errorbars
+import numpy as np
+import matplotlib.pyplot as plt
+
+'''
+num particles  x1024  : 1    2     4     8   12  16   20     
+AABB optimisation off : 1    3.5   10.5  39  84  148  232
+AABB optimisation on  : 1.2  3.8   12    44  96 230  264   
+'''
+
+
+N = 7
+defaultMeans = (1, 3.5, 10.5, 39, 84, 148, 232)
+
+ind = np.arange(N)  # the x locations for the groups
+width = 0.35       # the width of the bars
+
+fig, ax = plt.subplots()
+rects1 = ax.bar(ind, defaultMeans, width, color='#cc1111')
+
+scissorMeans = ( 1.2, 3.8, 12, 44, 96, 230, 264)
+rects2 = ax.bar(ind + width, scissorMeans, width, color='#0088cc')
+
+# add some text for labels, title and axes ticks
+ax.set_ylabel('time (ms)')
+ax.set_title('simulation time')
+ax.set_xticks(ind + width)
+ax.set_xticklabels(('1x1024', '2x1024', '4x1024', '8x1024', '12x1024', '16x1024', '20x1024'))
+
+ax.legend((rects1[0], rects2[0]), ('AABB test off', 'AABB test on'), loc=2)
+ax.axis((0,7,0,290))
+
+
+def autolabel(rects):
+    # attach some text labels
+    for rect in rects:
+        height = rect.get_height()
+        ax.text(rect.get_x() + rect.get_width()/2., 1.01*height,
+                '%.1f' % float(height),
+                ha='center', va='bottom')
+
+autolabel(rects1)
+autolabel(rects2)
+
+plt.show()
diff --git a/performance.png b/performance.png
diff --git a/vulkanBoids/vulkanBoids.cpp b/vulkanBoids/vulkanBoids.cpp
@@ -158,6 +158,7 @@ class VulkanExample : public VulkanExampleBase
 		{
 			particle.pos = glm::vec2(rDistribution(rGenerator), rDistribution(rGenerator));
 			// TODO: add randomized velocities with a slight scale here, something like 0.1f.
+            particle.vel = glm::vec2(rDistribution(rGenerator)*0.1f, rDistribution(rGenerator)*0.1f);
 		}
 
 		VkDeviceSize storageBufferSize = particleBuffer.size() * sizeof(Particle);
@@ -244,7 +245,7 @@ class VulkanExample : public VulkanExampleBase
 			VERTEX_BUFFER_BIND_ID,
 			1,
 			VK_FORMAT_R32G32_SFLOAT,
-			offsetof(Particle, pos)); // TODO: change this so that we can color the particles based on velocity.
+			offsetof(Particle, vel)); // TODO: change this so that we can color the particles based on velocity.
 
 		// vertices.inputState encapsulates everything we need for these particular buffers to
 		// interface with the graphics pipeline.
@@ -522,33 +523,54 @@ class VulkanExample : public VulkanExampleBase
 			// which in turn corresponds with something like `layout(std140, binding = 0)` in `particle.comp`.
 
 			// Binding 0 : Particle position storage buffer
-			vkTools::initializers::writeDescriptorSet(
-			compute.descriptorSets[0], // LOOK: which descriptor set to write to?
-			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-			0, // LOOK: which binding in the descriptor set Layout?
-			&compute.storageBufferA.descriptor), // LOOK: which SSBO?
-
-			// Binding 1 : Particle position storage buffer
-			vkTools::initializers::writeDescriptorSet(
-			compute.descriptorSets[0],
-			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-			1,
-			&compute.storageBufferB.descriptor),
-
-			// Binding 2 : Uniform buffer
-			vkTools::initializers::writeDescriptorSet(
-			compute.descriptorSets[0],
-			VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-			2,
-			&compute.uniformBuffer.descriptor)
-
-			// TODO: write the second descriptorSet, using the top for reference.
-			// We want the descriptorSets to be used for flip-flopping:
-			// on one frame, we use one descriptorSet with the compute pass,
-			// on the next frame, we use the other.
-			// What has to be different about how the second descriptorSet is written here?
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[0], // LOOK: which descriptor set to write to?
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            0, // LOOK: which binding in the descriptor set Layout?
+            &compute.storageBufferA.descriptor), // LOOK: which SSBO?
+
+            // Binding 1 : Particle position storage buffer
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[0],
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            1,
+            &compute.storageBufferB.descriptor),
+
+            // Binding 2 : Uniform buffer
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[0],
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            2,
+            &compute.uniformBuffer.descriptor),
+
+            // TODO: write the second descriptorSet, using the top for reference.
+            // We want the descriptorSets to be used for flip-flopping:
+            // on one frame, we use one descriptorSet with the compute pass,
+            // on the next frame, we use the other.
+            // What has to be different about how the second descriptorSet is written here?
+            // Binding 0 : Particle position storage buffer
+
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[1], // LOOK: which descriptor set to write to?
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            0, // LOOK: which binding in the descriptor set Layout?
+            &compute.storageBufferB.descriptor), // LOOK: which SSBO?
+
+            // Binding 1 : Particle position storage buffer
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[1],
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            1,
+            &compute.storageBufferA.descriptor),
+
+            // Binding 2 : Uniform buffer
+            vkTools::initializers::writeDescriptorSet(
+            compute.descriptorSets[1],
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            2,
+            &compute.uniformBuffer.descriptor)
+
 		};
-
 		vkUpdateDescriptorSets(device, static_cast<uint32_t>(computeWriteDescriptorSets.size()), computeWriteDescriptorSets.data(), 0, NULL);
 	}
 
@@ -590,6 +612,8 @@ class VulkanExample : public VulkanExampleBase
 		// We also want to flip what SSBO we draw with in the next
 		// pass through the graphics pipeline.
 		// Feel free to use std::swap here. You should need it twice.
+        std::swap(compute.descriptorSets[0], compute.descriptorSets[1]);
+
 	}
 
 	// Record command buffers for drawing using the graphics pipeline

diff --git a/vulkan_boids.gif b/vulkan_boids.gif