Commit

got various GPU bugs sorted -- only 1 real bug, in NewStateNeuron -- filed issue #234
rcoreilly committed Jun 2, 2023
1 parent ed3de63 commit 1a14a31
Showing 14 changed files with 157 additions and 31 deletions.
62 changes: 54 additions & 8 deletions axon/gpu.go
@@ -203,7 +203,8 @@ func (gp *GPU) Config(ctx *Context, net *Network) {
gp.Sys.NewComputePipelineEmbed("SynCa", content, "shaders/gpu_synca.spv")
gp.Sys.NewComputePipelineEmbed("CyclePost", content, "shaders/gpu_cyclepost.spv")

gp.Sys.NewComputePipelineEmbed("NewState", content, "shaders/gpu_newstate.spv")
gp.Sys.NewComputePipelineEmbed("NewStatePool", content, "shaders/gpu_newstate_pool.spv")
gp.Sys.NewComputePipelineEmbed("NewStateNeuron", content, "shaders/gpu_newstate_neuron.spv")
gp.Sys.NewComputePipelineEmbed("MinusPool", content, "shaders/gpu_minuspool.spv")
gp.Sys.NewComputePipelineEmbed("MinusNeuron", content, "shaders/gpu_minusneuron.spv")
gp.Sys.NewComputePipelineEmbed("PlusStart", content, "shaders/gpu_plusstart.spv")
@@ -216,6 +217,7 @@ func (gp *GPU) Config(ctx *Context, net *Network) {
gp.Sys.NewComputePipelineEmbed("ApplyExts", content, "shaders/gpu_applyext.spv")

gp.Sys.NewEvent("MemCopyTo")
gp.Sys.NewEvent("MemCopyTo2")
gp.Sys.NewEvent("MemCopyFm")
gp.Sys.NewEvent("CycleEnd")
gp.Sys.NewEvent("CycleInc")
@@ -460,6 +462,19 @@ func (gp *GPU) SyncStateToGPU() {
gp.SyncMemToGPU()
}

// SyncStateGBufToGPU copies LayVals, Pools, Neurons, GBuf state to GPU
// this is typically sufficient for most syncing --
// only missing the Synapses which must be copied separately.
// Calls SyncMemToGPU -- use when this is the only copy taking place.
func (gp *GPU) SyncStateGBufToGPU() {
if !gp.On {
return
}
gp.CopyStateToStaging()
gp.CopyGBufToStaging()
gp.SyncMemToGPU()
}

// SyncAllToGPU copies LayerVals, Pools, Neurons, Synapses to GPU.
// Calls SyncMemToGPU -- use when this is the only copy taking place.
func (gp *GPU) SyncAllToGPU() {
@@ -497,17 +512,23 @@ func (gp *GPU) SyncSynapsesToGPU() {
gp.SyncMemToGPU()
}

// SyncGBufToGPU copies the GBuf and GSyns memory to the GPU.
// This is a temporary measure to be replaced with a simple kernel to init gbuf,
// needed for InitActs.
func (gp *GPU) SyncGBufToGPU() {
// CopyGBufToStaging copies the GBuf and GSyns memory to staging.
func (gp *GPU) CopyGBufToStaging() {
if !gp.On {
return
}
_, gbv, _ := gp.Syns.ValByIdxTry("GBuf", 0)
gbv.CopyFromBytes(unsafe.Pointer(&gp.Net.PrjnGBuf[0]))
_, gsv, _ := gp.Syns.ValByIdxTry("GSyns", 0)
gsv.CopyFromBytes(unsafe.Pointer(&gp.Net.PrjnGSyns[0]))
}

// SyncGBufToGPU copies the GBuf and GSyns memory to the GPU.
func (gp *GPU) SyncGBufToGPU() {
if !gp.On {
return
}
gp.CopyGBufToStaging()
gp.SyncMemToGPU()
}
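
Note on the refactor above: splitting CopyGBufToStaging out of SyncGBufToGPU lets callers stage several regions and then push them in a single SyncMemToGPU transfer, which is what the new SyncStateGBufToGPU does. An illustrative comparison, using only methods visible in this file (editorial sketch, not part of the commit):

// Separate syncs -- each call presumably issues its own SyncMemToGPU transfer:
gp.SyncStateToGPU()
gp.SyncGBufToGPU()

// Staged together (the pattern SyncStateGBufToGPU uses) -- one transfer:
gp.CopyStateToStaging()
gp.CopyGBufToStaging()
gp.SyncMemToGPU()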

@@ -816,8 +837,8 @@ func (gp *GPU) RunApplyExtsCmd() vk.CommandBuffer {
glr := gp.SyncRegionStruct("Globals")
gp.StartRunCmd(cmd)
gp.Sys.ComputeCmdCopyToGPUCmd(cmd, exr, cxr, glr)
gp.Sys.ComputeSetEventCmd(cmd, "MemCopyTo")
gp.RunPipelineCmd(cmd, "ApplyExts", neurDataN, "MemCopyTo", "")
gp.Sys.ComputeSetEventCmd(cmd, "MemCopyTo2")
gp.RunPipelineCmd(cmd, "ApplyExts", neurDataN, "MemCopyTo2", "")
gp.Sys.ComputeCmdEndCmd(cmd)
return cmd
}
@@ -1005,8 +1026,33 @@ func (gp *GPU) RunCycleSeparateFuns() {
// ThetaCycle trial.
// The caller must check the On flag before running this, to use CPU vs. GPU
func (gp *GPU) RunNewState() {
// todo: we're not actually calling this now, due to bug in NewStateNeuron
cmd := gp.RunNewStateCmd()
gnm := "GPU:NewState"
gp.Net.FunTimerStart(gnm)
gp.Sys.ComputeSubmitWaitCmd(cmd)
gp.Net.FunTimerStop(gnm)
}

// RunNewStateCmd returns the commands to
// run the NewState shader to update variables
// at the start of a new trial.
func (gp *GPU) RunNewStateCmd() vk.CommandBuffer {
cnm := "RunNewState"
cmd, err := gp.Sys.CmdBuffByNameTry(cnm)
if err == nil {
return cmd
}
cmd = gp.Sys.NewCmdBuff(cnm)

neurDataN := int(gp.Net.NNeurons) * int(gp.Net.MaxData)
poolDataN := len(gp.Net.Pools)
gp.RunPipelineWait("NewState", poolDataN)

gp.StartRunCmd(cmd)
gp.RunPipelineCmd(cmd, "NewStatePool", poolDataN, "", "PoolGi")
gp.RunPipelineCmd(cmd, "NewStateNeuron", neurDataN, "PoolGi", "") // todo: this has NrnV read = 0 bug
gp.Sys.ComputeCmdEndCmd(cmd)
return cmd
}
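
Note on the event chaining in RunNewStateCmd: the "PoolGi" event orders the two dispatches -- NewStatePool signals it and NewStateNeuron waits on it, presumably because the neuron-level pass reads layer/pool values that the pool-level pass updates. The same two calls again, with the wait/signal arguments annotated (comments are editorial, not part of the commit):

gp.RunPipelineCmd(cmd, "NewStatePool", poolDataN, "", "PoolGi")   // waits on nothing, signals PoolGi
gp.RunPipelineCmd(cmd, "NewStateNeuron", neurDataN, "PoolGi", "") // waits on PoolGi, signals nothing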

// RunMinusPhase runs the MinusPhase shader to update snapshot variables
2 changes: 1 addition & 1 deletion axon/gpu_hlsl/gpu_cycleinc.hlsl
@@ -27,7 +27,7 @@
// Set 2: main network structs and vals -- all are writable
[[vk::binding(0, 2)]] RWStructuredBuffer<Context> Ctx; // [0]

[numthreads(1, 1, 1)]
[numthreads(64, 1, 1)]
void main(uint3 idx : SV_DispatchThreadID) { // over Context
if(idx.x == 0) {
Ctx[0].CycleInc();
4 changes: 2 additions & 2 deletions axon/gpu_hlsl/gpu_cyclepost.hlsl
@@ -98,8 +98,8 @@ void CyclePost(inout Context ctx, in LayerParams ly, int li, uint di) {
CyclePost2(ctx, ly, uint(li), di, LayVals[ly.Idxs.ValsIdx(di)], Pools[ly.Idxs.PoolIdx(0, di)]);
}

[numthreads(1, 1, 1)]
void main(uint3 idx : SV_DispatchThreadID) { // todo: iterate over global Data parallel
[numthreads(64, 1, 1)]
void main(uint3 idx : SV_DispatchThreadID) {
if (idx.x >= Ctx[0].NetIdxs.NData) {
return;
}
57 changes: 57 additions & 0 deletions axon/gpu_hlsl/gpu_newstate_neuron.hlsl
@@ -0,0 +1,57 @@
// Copyright (c) 2022, The Emergent Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// does NewState Update on each Neuron
// note: anything *reading* from neuron level must be called at neuron level!

#include "synmem.hlsl"

// note: all must be visible always because accessor methods refer to them
[[vk::binding(0, 1)]] StructuredBuffer<uint> NeuronIxs; // [Neurons][Idxs]
[[vk::binding(1, 1)]] StructuredBuffer<uint> SynapseIxs; // [Layer][SendPrjns][SendNeurons][Syns]
[[vk::binding(1, 2)]] RWStructuredBuffer<float> Neurons; // [Neurons][Vars][Data]
[[vk::binding(2, 2)]] RWStructuredBuffer<float> NeuronAvgs; // [Neurons][Vars]
[[vk::binding(5, 2)]] RWStructuredBuffer<float> Globals; // [NGlobals]
[[vk::binding(0, 3)]] RWStructuredBuffer<SynMemBlock> Synapses; // [Layer][SendPrjns][SendNeurons][Syns]
[[vk::binding(1, 3)]] RWStructuredBuffer<SynMemBlock> SynapseCas; // [Layer][SendPrjns][SendNeurons][Syns][Data]

#include "context.hlsl"
#include "layerparams.hlsl"

// note: binding is var, set

// Set 0: uniform layer params -- could not have prjns also be uniform..
[[vk::binding(0, 0)]] StructuredBuffer<LayerParams> Layers; // [Layer]

// Set 1: effectively uniform prjn params as structured buffers in storage

// Set 2: main network structs and vals -- all are writable
[[vk::binding(0, 2)]] StructuredBuffer<Context> Ctx; // [0]
[[vk::binding(3, 2)]] RWStructuredBuffer<Pool> Pools; // [Layer][Pools][Data]
[[vk::binding(4, 2)]] RWStructuredBuffer<LayerVals> LayVals; // [Layer][Data]


void NewStateNeuron2(in Context ctx, in LayerParams ly, uint ni, uint di) {
ly.NewStateNeuron(ctx, ni, di, LayVals[ly.Idxs.ValsIdx(di)]);
}

void NewStateNeuron(in Context ctx, uint ni, uint di) {
uint li = NrnI(ctx, ni, NrnLayIdx);
NewStateNeuron2(ctx, Layers[li], ni, di);
}

[numthreads(64, 1, 1)]
void main(uint3 idx : SV_DispatchThreadID) { // over Neurons * Data
uint ni = Ctx[0].NetIdxs.ItemIdx(idx.x);
if (!Ctx[0].NetIdxs.NeurIdxIsValid(ni)) {
return;
}
uint di = Ctx[0].NetIdxs.DataIdx(idx.x);
if (!Ctx[0].NetIdxs.DataIdxIsValid(di)) {
return;
}
NewStateNeuron(Ctx[0], ni, di);
}


@@ -49,24 +49,15 @@ void InitPrjnGBuffs(in Context ctx, in PrjnParams pj) {
}
}

void NewStateNeuron(in Context ctx, in LayerParams ly, uint ni, uint di, in LayerVals vals) {
ly.NewStateNeuron(ctx, ni, di, vals);
}

void NewState2(in Context ctx, in LayerParams ly, uint di, inout Pool pl, inout LayerVals vals) {
ly.NewStatePool(ctx, pl);
if (pl.IsLayPool == 0) {
return;
}
ly.NewStateLayer(ctx, pl, vals);
for (uint lni = pl.StIdx; lni < pl.EdIdx; lni++) {
NewStateNeuron(ctx, ly, lni + ly.Idxs.NeurSt, di, vals);
}
// if (ly.Act.Decay.Glong != 0) { // clear pipeline of incoming spikes, assuming time has passed
for (uint pi = 0; pi < ly.Idxs.RecvN; pi++) {
InitPrjnGBuffs(ctx, Prjns[ly.Idxs.RecvSt + pi]);
}
// }
}

void NewState(in Context ctx, uint di, inout Pool pl) {
13 changes: 13 additions & 0 deletions axon/layer_compute.go
@@ -318,6 +318,19 @@ func (ly *Layer) NewState(ctx *Context) {
ly.InitPrjnGBuffs(ctx)
}

// NewStateNeurons only calls the neurons part of new state -- for misbehaving GPU
func (ly *Layer) NewStateNeurons(ctx *Context) {
nn := ly.NNeurons
for di := uint32(0); di < ctx.NetIdxs.NData; di++ {
vals := ly.LayerVals(di)
for lni := uint32(0); lni < nn; lni++ {
ni := ly.NeurStIdx + lni
// note: this calls the basic neuron-level DecayState
ly.Params.NewStateNeuron(ctx, ni, di, vals)
}
}
}

// DecayState decays activation state by given proportion
// (default decay values are ly.Params.Acts.Decay.Act, Glong)
func (ly *Layer) DecayState(ctx *Context, di uint32, decay, glong, ahp float32) {
11 changes: 7 additions & 4 deletions axon/network.go
@@ -86,16 +86,19 @@ func (nt *Network) UpdateParams() {
// properly prior to calling this and subsequent Cycle methods.
func (nt *Network) NewState(ctx *Context) {
nt.NData = ctx.NetIdxs.NData
if nt.GPU.On {
nt.GPU.RunNewState()
return
}
// if nt.GPU.On { // todo: this has a bug in neuron-level access in updating SpkPrv
// nt.GPU.RunNewState()
// return
// }
for _, ly := range nt.Layers {
if ly.IsOff() {
continue
}
ly.NewState(ctx)
}
if nt.GPU.On {
nt.GPU.SyncStateGBufToGPU()
}
}
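
With the GPU NewState kernel disabled pending issue #234, a GPU run now computes NewState on the CPU and pushes the result with SyncStateGBufToGPU before cycling resumes on the GPU. A minimal sketch of the resulting per-trial calling pattern (editorial illustration only -- the bare 200-cycle loop stands in for the looper machinery that actually drives these calls):

nt.NewState(ctx) // CPU NewState for all layers; syncs state + GBuf to GPU when GPU is on
for cyc := 0; cyc < 200; cyc++ {
    nt.Cycle(ctx) // cycle updates still run on the GPU
}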

// Cycle runs one cycle of activation updating using threading methods.
Binary file modified axon/shaders/gpu_cycleinc.spv
Binary file modified axon/shaders/gpu_cyclepost.spv
Binary file removed axon/shaders/gpu_newstate.spv
Binary file added axon/shaders/gpu_newstate_neuron.spv
Binary file added axon/shaders/gpu_newstate_pool.spv
2 changes: 1 addition & 1 deletion examples/boa/approach_env.go
@@ -81,7 +81,7 @@ func (ev *Approach) Defaults() {

// Config configures the world
func (ev *Approach) Config() {
// ev.Rand.NewRand(ev.RndSeed)
ev.Rand.NewRand(ev.RndSeed)
ev.CSTot = ev.NDrives * ev.CSPerDrive
ev.ActMap = make(map[string]int)
for i, act := range ev.Acts {
28 changes: 22 additions & 6 deletions examples/boa/boa.go
@@ -74,9 +74,9 @@ type SimParams struct {
// Defaults sets default params
func (ss *SimParams) Defaults() {
ss.NData = 1
ss.EnvSameSeed = false
ss.EnvSameSeed = false // set to true to test ndata
ss.PctCortexMax = 1.0
ss.PctCortexStEpc = 5
ss.PctCortexStEpc = 10
ss.PctCortexNEpc = 5
ss.PctCortexInterval = 1
ss.PCAInterval = 10
@@ -329,6 +329,7 @@ func (ss *Sim) ConfigNet(net *axon.Network) {
return
}
net.Defaults()
net.SetNThreads(4)
ss.Params.SetObject("Network")
ss.InitWts(net)
}
@@ -480,6 +481,14 @@ func (ss *Sim) ConfigLoops() {
} else {
axon.LooperUpdtNetView(man, &ss.ViewUpdt, ss.Net)
axon.LooperUpdtPlots(man, &ss.GUI)
for _, m := range man.Stacks {
m.Loops[etime.Cycle].OnEnd.Prepend("GUI:CounterUpdt", func() {
ss.NetViewCounters()
})
m.Loops[etime.Trial].OnEnd.Prepend("GUI:CounterUpdt", func() {
ss.NetViewCounters()
})
}
}

if Debug {
@@ -693,10 +702,16 @@ func (ss *Sim) StatCounters(di int) {
ss.Stats.SetFloat32("CS", float32(ev.CS))
ss.Stats.SetFloat32("US", float32(ev.US))
ss.Stats.SetFloat32("HasRew", axon.GlbV(ctx, uint32(di), axon.GvHasRew))
ss.Stats.SetString("TrialName", "trl")
if di == 0 {
ss.ViewUpdt.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Cycle", "NetAction", "Instinct", "ActAction", "ActMatch", "JustGated", "Should", "Rew"})
ss.Stats.SetString("TrialName", "trl") // todo: could have dist, US etc
}

func (ss *Sim) NetViewCounters() {
if ss.GUI.ViewUpdt.View == nil {
return
}
di := ss.GUI.ViewUpdt.View.Di
ss.StatCounters(di)
ss.ViewUpdt.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Cycle", "NetAction", "Instinct", "ActAction", "ActMatch", "JustGated", "Should", "Rew"})
}

// TrialStats computes the trial-level statistics.
@@ -997,7 +1012,8 @@ func (ss *Sim) Log(mode etime.Modes, time etime.Times) {

switch {
case time == etime.Cycle:
row = ss.Stats.Int("Cycle")
return // not doing cycle-level logging -- too slow for GPU in general
// row = ss.Stats.Int("Cycle")
case time == etime.Trial:
if mode == etime.Train {
trl := ss.Loops.GetLoop(mode, etime.Trial).Counter.Cur
