From 4f870e131e1983db718c50cebdb19df3e5e86c9a Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 5 Sep 2024 14:21:28 +0800 Subject: [PATCH] [rtl] refactor mask unit. --- t1/src/Bundles.scala | 90 +- t1/src/Lane.scala | 94 ++- t1/src/T1.scala | 1031 +++-------------------- t1/src/decoder/Decoder.scala | 43 +- t1/src/decoder/attribute/isSwrite.scala | 64 -- t1/src/decoder/attribute/topUop.scala | 263 +++++- t1/src/laneStage/LaneStage3.scala | 61 +- t1/src/laneStage/MaskExchangeUnit.scala | 68 ++ t1/src/laneStage/SlotTokenManager.scala | 27 +- t1/src/lsu/LSU.scala | 2 +- t1/src/lsu/StoreUnit.scala | 9 +- t1/src/mask/BitLevelMaskWrite.scala | 91 ++ t1/src/mask/MaskCompress.scala | 185 ++++ t1/src/mask/MaskExtend.scala | 75 ++ t1/src/mask/MaskReduce.scala | 201 +++++ t1/src/mask/MaskUnit.scala | 914 ++++++++++++++++++++ t1/src/mask/MaskUnitReadCrossBar.scala | 48 ++ t1/src/package.scala | 21 + t1/src/sequencer/T1TokenManager.scala | 49 +- t1/src/vrf/VRF.scala | 11 +- 20 files changed, 2213 insertions(+), 1134 deletions(-) create mode 100644 t1/src/laneStage/MaskExchangeUnit.scala create mode 100644 t1/src/mask/BitLevelMaskWrite.scala create mode 100644 t1/src/mask/MaskCompress.scala create mode 100644 t1/src/mask/MaskExtend.scala create mode 100644 t1/src/mask/MaskReduce.scala create mode 100644 t1/src/mask/MaskUnit.scala create mode 100644 t1/src/mask/MaskUnitReadCrossBar.scala diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index fd833f07e..0722008e6 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -66,7 +66,7 @@ class InstructionState extends Bundle { val idle: Bool = Bool() /** used for mask unit, schedule mask unit to execute. */ - val sMaskUnitExecution: Bool = Bool() + val wMaskUnitLast: Bool = Bool() /** wait for vrf write finish. 
*/ val wVRFWrite: Bool = Bool() @@ -698,3 +698,91 @@ class T1Retire(xLen: Int) extends Bundle { val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle) } + +class MaskUnitExecuteState(parameter: T1Parameter) extends Bundle { + val groupReadState: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val elementValid: UInt = UInt(parameter.laneNumber.W) + val readOffset: UInt = UInt((parameter.laneNumber * parameter.laneParam.vrfOffsetBits).W) + val accessLane: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W)) + // 3: log2Ceil(8); 8: Use up to 8 registers + val vsGrowth: Vec[UInt] = Vec(parameter.laneNumber, UInt(3.W)) + val groupCount: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val executeIndex: UInt = UInt(2.W) + val readDataOffset: UInt = UInt((log2Ceil(parameter.datapathWidth / 8) * parameter.laneNumber).W) + val last: Bool = Bool() +} + +class MaskUnitInstReq(parameter: T1Parameter) extends Bundle { + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) + val readFromScala: UInt = UInt(parameter.datapathWidth.W) + val sew: UInt = UInt(2.W) + val vlmul: UInt = UInt(3.W) + val maskType: Bool = Bool() + val vxrm: UInt = UInt(3.W) + val vs2: UInt = UInt(5.W) + val vs1: UInt = UInt(5.W) + val vd: UInt = UInt(5.W) + val vl: UInt = UInt(parameter.laneParam.vlMaxBits.W) +} + +class MaskUnitExeReq(parameter: LaneParameter) extends Bundle { + // source1, read vs + val source1: UInt = UInt(parameter.datapathWidth.W) + // source2, read offset + val source2: UInt = UInt(parameter.datapathWidth.W) + val index: UInt = UInt(parameter.instructionIndexBits.W) +} + +class MaskUnitExeResponse(parameter: LaneParameter) extends Bundle { + val ffoByOther: Bool = Bool() + val writeData = new MaskUnitWriteBundle(parameter) + val index: UInt = 
UInt(parameter.instructionIndexBits.W) +} + +class MaskUnitReadReq(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Read which lane + val readLane: UInt = UInt(log2Ceil(parameter.laneNumber).W) + // from which request + val requestIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + // data position in data path + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitReadQueue(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Which channel will this read request be written to? + val writeIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitWaitReadQueue(parameter: T1Parameter) extends Bundle { + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val executeIndex: UInt = UInt(2.W) + val sourceValid: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val last: Bool = Bool() +} + +class MaskUnitWriteBundle(parameter: LaneParameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + val vd: UInt = UInt(5.W) +} + +class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle { + val indexSize: Int = log2Ceil(parameter.vLen * 8 / parameter.datapathWidth / parameter.laneNumber) + val dataValid: Bool = Bool() + val requestSend: Bool = Bool() + val sendToExecution: Bool = Bool() + val data: UInt = UInt(parameter.datapathWidth.W) + val readIndex: UInt = UInt(indexSize.W) + val laneIndex: UInt = UInt(parameter.laneNumber.W) +} diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 7df91efa9..be11a1f5a 100644 --- a/t1/src/Lane.scala +++ 
b/t1/src/Lane.scala @@ -103,6 +103,7 @@ case class LaneParameter( decoderParam: DecoderParam, vfuInstantiateParameter: VFUInstantiateParameter) extends SerializableModuleParameter { + val maskUnitVefWriteQueueSize: Int = 8 /** 1 in MSB for instruction order. */ val instructionIndexBits: Int = log2Ceil(chainingSize) + 1 @@ -235,13 +236,17 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: ValidIO[LaneResponse] = IO(Valid(new LaneResponse(parameter))) + val maskUnitRequest: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter))) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ @public - val laneResponseFeedback: ValidIO[LaneResponseFeedback] = IO(Flipped(Valid(new LaneResponseFeedback(parameter)))) + val maskRequestToLSU: Bool = IO(Output(Bool())) + + @public + val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter)))) + + @public + val maskResponseRelease: Bool = IO(Output(Bool())) /** for LSU and V accessing lane, this is not a part of ring, but a direct connection. 
*/ @public @@ -569,14 +574,26 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ slotCanShift(index) := true.B } - val laneState: LaneState = Wire(new LaneState(parameter)) - val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) - val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) - val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) - val executionUnit: Instance[LaneExecutionBridge] = Instantiate( + val laneState: LaneState = Wire(new LaneState(parameter)) + val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) + val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) + val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) + val executionUnit: Instance[LaneExecutionBridge] = Instantiate( new LaneExecutionBridge(parameter, isLastSlot, index) ) - val stage3: Instance[LaneStage3] = Instantiate(new LaneStage3(parameter, isLastSlot)) + val maskStage: Option[Instance[MaskExchangeUnit]] = + Option.when(isLastSlot)(Instantiate(new MaskExchangeUnit(parameter))) + val stage3: Instance[LaneStage3] = Instantiate(new LaneStage3(parameter, isLastSlot)) + val stage3EnqWire: DecoupledIO[LaneStage3Enqueue] = Wire(Decoupled(new LaneStage3Enqueue(parameter, isLastSlot))) + val stage3EnqSelect: DecoupledIO[LaneStage3Enqueue] = maskStage.map { mask => + mask.enqueue <> stage3EnqWire + maskUnitRequest <> mask.maskReq + maskRequestToLSU <> mask.maskRequestToLSU + mask.maskUnitResponse := maskUnitResponse + maskResponseRelease := mask.maskResponseRelease + mask.dequeue + }.getOrElse(stage3EnqWire) + stage3.enqueue <> stage3EnqSelect // slot state laneState.vSew1H := vSew1H @@ -758,50 +775,47 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ 0.U(parameter.chainingSize.W) ) AssertProperty(BoolSequence(!executionUnit.dequeue.valid || 
stage2.dequeue.valid)) - stage3.enqueue.valid := executionUnit.dequeue.valid - executionUnit.dequeue.ready := stage3.enqueue.ready + stage3EnqWire.valid := executionUnit.dequeue.valid + executionUnit.dequeue.ready := stage3EnqWire.ready stage2.dequeue.ready := executionUnit.dequeue.fire if (!isLastSlot) { - stage3.enqueue.bits := DontCare + stage3EnqWire.bits := DontCare } // pipe state from stage0 - stage3.enqueue.bits.decodeResult := stage2.dequeue.bits.decodeResult - stage3.enqueue.bits.instructionIndex := stage2.dequeue.bits.instructionIndex - stage3.enqueue.bits.loadStore := stage2.dequeue.bits.loadStore - stage3.enqueue.bits.vd := stage2.dequeue.bits.vd - stage3.enqueue.bits.ffoByOtherLanes := ffoRecord.ffoByOtherLanes - stage3.enqueue.bits.groupCounter := stage2.dequeue.bits.groupCounter - stage3.enqueue.bits.mask := stage2.dequeue.bits.mask + stage3EnqWire.bits.decodeResult := stage2.dequeue.bits.decodeResult + stage3EnqWire.bits.instructionIndex := stage2.dequeue.bits.instructionIndex + stage3EnqWire.bits.loadStore := stage2.dequeue.bits.loadStore + stage3EnqWire.bits.vd := stage2.dequeue.bits.vd + stage3EnqWire.bits.ffoByOtherLanes := ffoRecord.ffoByOtherLanes + stage3EnqWire.bits.groupCounter := stage2.dequeue.bits.groupCounter + stage3EnqWire.bits.mask := stage2.dequeue.bits.mask if (isLastSlot) { - stage3.enqueue.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get - stage3.enqueue.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get - stage3.enqueue.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => + stage3EnqWire.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get + stage3EnqWire.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get + stage3EnqWire.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => sink := source } } - stage3.enqueue.bits.data := executionUnit.dequeue.bits.data - stage3.enqueue.bits.pipeData := 
stage2.dequeue.bits.pipeData.getOrElse(DontCare) - stage3.enqueue.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex - executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := data) - stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _) - executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3.enqueue.bits.ffoSuccess := _) + stage3EnqWire.bits.data := executionUnit.dequeue.bits.data + stage3EnqWire.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) + stage3EnqWire.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex + executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3EnqWire.bits.crossWriteData := data) + stage2.dequeue.bits.sSendResponse.foreach(_ => stage3EnqWire.bits.sSendResponse := _) + executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3EnqWire.bits.ffoSuccess := _) if (isLastSlot) { - when(laneResponseFeedback.valid) { - when(laneResponseFeedback.bits.complete) { + when(maskUnitResponse.valid) { + when(maskUnitResponse.bits.ffoByOther) { ffoRecord.ffoByOtherLanes := true.B } } - when(stage3.enqueue.fire) { + when(stage3EnqWire.fire) { executionUnit.dequeue.bits.ffoSuccess.foreach(ffoRecord.selfCompleted := _) // This group found means the next group ended early ffoRecord.ffoByOtherLanes := ffoRecord.ffoByOtherLanes || ffoRecord.selfCompleted } - - laneResponse <> stage3.laneResponse.get - stage3.laneResponseFeedback.get <> laneResponseFeedback } // --- stage 3 end & stage 4 start --- @@ -1174,10 +1188,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } // todo: add mask unit write token - tokenManager.responseReport.valid := laneResponse.valid - tokenManager.responseReport.bits := laneResponse.bits.instructionIndex - tokenManager.responseFeedbackReport.valid := laneResponseFeedback.valid - 
tokenManager.responseFeedbackReport.bits := laneResponseFeedback.bits.instructionIndex + tokenManager.responseReport.valid := maskUnitRequest.valid + tokenManager.responseReport.bits := maskUnitRequest.bits.index + tokenManager.responseFeedbackReport.valid := maskUnitResponse.valid + tokenManager.responseFeedbackReport.bits := maskUnitResponse.bits.index val instInSlot: UInt = slotControl .zip(slotOccupied) .map { case (slotState, occupied) => @@ -1210,6 +1224,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + tokenManager.maskUnitLastReport := lsuLastReport + layer.block(layers.Verification) { val probeWire = Wire(new LaneProbe(parameter)) define(laneProbe, ProbeValue(probeWire)) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 6a78a2b7c..847af6193 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -252,6 +252,8 @@ case class T1Parameter( val vrfReadLatency = 2 + val maskUnitVefWriteQueueSize: Int = 8 + // each element: Each lane will be connected to the other two lanes, // and the values are their respective delays. 
val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) @@ -395,8 +397,9 @@ class T1(val parameter: T1Parameter) /** the LSU Module */ - val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) - val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) + val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter)) omInstance.decoderIn := Property(decode.om.asAnyClassType) val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter)) @@ -504,35 +507,9 @@ class T1(val parameter: T1Parameter) Fill(8, imm(4) && (vSew1H(1) || vSew1H(2) || src1IsSInt)) ## Fill(3, imm(4)) ## imm - /** duplicate v0 for mask */ - val v0: Vec[UInt] = RegInit( - VecInit(Seq.fill(parameter.vLen / parameter.datapathWidth)(0.U(parameter.datapathWidth.W))) - ) - // TODO: uarch doc for the regroup - val regroupV0: Seq[UInt] = Seq(4, 2, 1).map { groupSize => - VecInit( - cutUInt(v0.asUInt, groupSize) - .grouped(parameter.laneNumber) - .toSeq - .transpose - .map(seq => VecInit(seq).asUInt) - ).asUInt - } - /** which slot the instruction is entering */ val instructionToSlotOH: UInt = Wire(UInt(parameter.chainingSize.W)) - /** synchronize signal from each lane, for mask units.(ffo) */ - val laneSynchronize: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - - /** all lanes are synchronized. */ - val synchronized: Bool = WireDefault(false.B) - - /** for mask unit that need to access VRF from lanes, use this signal to indicate it is finished access VRF(but - * instruction might not finish). - */ - val maskUnitReadOnlyFinish: Bool = WireDefault(false.B) - /** last slot is committing. 
*/ val lastSlotCommit: Bool = Wire(Bool()) @@ -543,11 +520,6 @@ class T1(val parameter: T1Parameter) val vxsatReportVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val vxsatReport = vxsatReportVec.reduce(_ | _) - // todo: 把lsu也放decode里去 - val maskUnitType: Bool = decodeResult(Decoder.maskUnit) && requestRegDequeue.bits.instruction(6) - val maskDestination = decodeResult(Decoder.maskDestination) - val unOrderType: Bool = decodeResult(Decoder.unOrderWrite) - /** Special instructions which will be allocate to the last slot. * - mask unit * - Lane <-> Top has data exchange(top might forward to LSU.) TODO: move to normal slots(add `offset` fields) @@ -558,99 +530,44 @@ class T1(val parameter: T1Parameter) val dataInWritePipeVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val dataInWritePipe: UInt = dataInWritePipeVec.reduce(_ | _) - /** designed for unordered instruction(slide), it doesn't go to lane, it has RAW hazzard. - */ - val instructionRAWReady: Bool = Wire(Bool()) - val allSlotFree: Bool = Wire(Bool()) - val existMaskType: Bool = Wire(Bool()) - - // mask Unit 与lane交换数据 - val writeType: VRFWriteRequest = new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfParam.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - val maskUnitWrite: ValidIO[VRFWriteRequest] = Wire(Valid(writeType)) - val maskUnitWriteVec: Vec[ValidIO[VRFWriteRequest]] = Wire(Vec(3, Valid(writeType))) - val maskWriteLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - // 默认是head - val maskUnitWriteSelect: UInt = Mux1H(maskUnitWriteVec.map(_.valid), maskWriteLaneSelect) - maskUnitWriteVec.foreach(_ := DontCare) - maskUnitWrite := Mux1H(maskUnitWriteVec.map(_.valid), maskUnitWriteVec) - val writeSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitWriteReady: Bool = writeSelectMaskUnit.asUInt.orR + // todo: instructionRAWReady -> v0 write token + 
val allSlotFree: Bool = Wire(Bool()) + val existMaskType: Bool = Wire(Bool()) // read - val readType: VRFReadRequest = new VRFReadRequest( + val readType: VRFReadRequest = new VRFReadRequest( parameter.vrfParam.regNumBits, parameter.vrfParam.vrfOffsetBits, parameter.instructionIndexBits ) - val maskUnitRead: ValidIO[VRFReadRequest] = Wire(Valid(readType)) - val maskUnitReadVec: Vec[ValidIO[VRFReadRequest]] = Wire(Vec(3, Valid(readType))) - val maskReadLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - val maskUnitReadSelect: UInt = Mux1H(maskUnitReadVec.map(_.valid), maskReadLaneSelect) - maskUnitRead := Mux1H(maskUnitReadVec.map(_.valid), maskUnitReadVec) - val readSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitReadReady = readSelectMaskUnit.asUInt.orR - val laneReadResult: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val WARRedResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - // mask unit 最后的写 - val maskUnitFlushVrf: Bool = WireDefault(false.B) + // todo: ix type gather read // gather read state - val gatherOverlap: Bool = Wire(Bool()) - val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && + val gatherOverlap: Bool = Wire(Bool()) + val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && !decodeResult(Decoder.vtype) && !gatherOverlap + val gatherData: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val gatherReadRequest: DecoupledIO[VRFReadRequest] = Wire(Decoupled(readType)) + val gatherReadLaneSelect: UInt = Wire(UInt(parameter.laneNumber.W)) + val gatherReadResultFire = Pipe(gatherReadRequest.fire, gatherReadLaneSelect, parameter.vrfReadLatency).valid val gatherReadFinish: Bool = RegEnable( !requestRegDequeue.fire, false.B, - (RegNext(RegNext(maskUnitReadReady)) && gatherNeedRead) || requestRegDequeue.fire + (gatherReadResultFire && gatherNeedRead) || requestRegDequeue.fire ) 
val gatherReadDataOffset: UInt = Wire(UInt(5.W)) - val gatherData: UInt = Mux(gatherOverlap, 0.U, (WARRedResult.bits >> gatherReadDataOffset).asUInt) - /** data that need to be compute at top. */ - val data: Vec[ValidIO[UInt]] = RegInit( - VecInit(Seq.fill(parameter.laneNumber)(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W))))) - ) - val flotReduceValid: Seq[Option[Bool]] = Seq.tabulate(parameter.laneNumber) { _ => - Option.when(parameter.fpuEnable)(RegInit(false.B)) - } - val maskDataForCompress: UInt = RegInit(0.U(parameter.datapathWidth.W)) - // clear the previous set of data from lane - val dataClear: Bool = WireDefault(false.B) - val completedVec: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.laneNumber)(false.B))) - // ffoIndexReg.valid: Already found the first one - val ffoIndexReg: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.xLen.W)))) - val ffoType: Bool = Wire(Bool()) + // todo + gatherReadRequest.valid := DontCare + gatherReadRequest.bits := DontCare + gatherReadRequest.ready := DontCare + gatherOverlap := DontCare + gatherReadLaneSelect := DontCare + gatherReadDataOffset := DontCare /** for find first one, need to tell the lane with higher index `1` . 
*/ - val completedLeftOr: UInt = (scanLeftOr(completedVec.asUInt) << 1).asUInt(parameter.laneNumber - 1, 0) - // 按指定的sew拼成 {laneNumer * dataPathWidth} bit, 然后根据sew选择出来 - val sortedData: UInt = Mux1H( - vSewOHForMask, - Seq(4, 2, 1).map { groupSize => - VecInit(data.map { element => - element.bits.asBools // [x] * 32 eg: sew = 1 - .grouped(groupSize) // [x, x] * 16 - .toSeq - .map(VecInit(_).asUInt) // [xx] * 16 - }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256 - } - ) - // 把已经排过序的数据重新分给各个lane - val regroupData: Vec[UInt] = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => - sortedData( - laneIndex * parameter.datapathWidth + parameter.datapathWidth - 1, - laneIndex * parameter.datapathWidth - ) - }) - val dataResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - - val executeForLastLaneFire: Bool = WireDefault(false.B) + val dataResult: UInt = RegInit(0.U.asTypeOf(UInt(parameter.datapathWidth.W))) /** state machine register for each instruction. */ val slots: Seq[InstructionControl] = Seq.tabulate(parameter.chainingSize) { index => @@ -661,8 +578,6 @@ class T1(val parameter: T1Parameter) .asTypeOf(new InstructionControl(parameter.instructionIndexBits, parameter.laneNumber)) ) - val mvToVRF: Option[Bool] = Option.when(index == parameter.chainingSize - 1)(RegInit(false.B)) - /** the execution is finished. (but there might still exist some data in the ring.) 
*/ val laneAndLSUFinish: Bool = control.endTag.asUInt.andR @@ -688,6 +603,7 @@ class T1(val parameter: T1Parameter) control.state.wLast := false.B control.state.sCommit := false.B control.state.wVRFWrite := !requestReg.bits.decodeResult(Decoder.maskUnit) + control.state.wMaskUnitLast := !requestReg.bits.decodeResult(Decoder.maskUnit) control.vxsat := false.B // two different initial states for endTag: // for load/store instruction, use the last bit to indicate whether it is the last instruction @@ -696,11 +612,14 @@ class T1(val parameter: T1Parameter) } // state machine starts here .otherwise { + when(maskUnit.lastReport.orR) { + control.state.wMaskUnitLast := true.B + } when(laneAndLSUFinish && v0WriteFinish) { control.state.wLast := true.B } - when(control.state.wLast && control.state.sMaskUnitExecution && !dataInWritePipeCheck) { + when(control.state.wLast && control.state.wMaskUnitLast && !dataInWritePipeCheck) { control.state.wVRFWrite := true.B } @@ -708,7 +627,7 @@ class T1(val parameter: T1Parameter) control.state.sCommit := true.B } - when(control.state.sCommit && control.state.wVRFWrite && control.state.sMaskUnitExecution) { + when(control.state.sCommit && control.state.wVRFWrite && control.state.wMaskUnitLast) { control.state.idle := true.B } @@ -720,735 +639,22 @@ class T1(val parameter: T1Parameter) control.vxsat := true.B } } - // logic like mask&reduce will be put to last slot - // TODO: review later if (index == (parameter.chainingSize - 1)) { - val feedBack: UInt = RegInit(0.U(parameter.laneNumber.W)) - val executeCounter: UInt = RegInit(0.U((log2Ceil(parameter.laneNumber) + 1).W)) - // mask destination时这两count都是以写vrf为视角 - val writeBackCounter: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W)) - val groupCounter: UInt = RegInit(0.U(parameter.groupNumberMaxBits.W)) - val iotaCount: UInt = RegInit(0.U((parameter.laneParam.vlMaxBits - 1).W)) - val maskTypeInstruction = RegInit(false.B) - val vd = RegInit(0.U(5.W)) - val vs1 = RegInit(0.U(5.W)) - 
val vs2 = RegInit(0.U(5.W)) - val rs1 = RegInit(0.U(parameter.xLen.W)) - val vm = RegInit(false.B) - val executeFinishReg = RegInit(true.B) - val unOrderTypeInstruction = RegInit(false.B) - val decodeResultReg = RegInit(0.U.asTypeOf(decodeResult)) - val gather: Bool = decodeResultReg(Decoder.gather) - // for slid - val elementIndexCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val compressWriteCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val nextElementIndex: UInt = elementIndexCount + 1.U - val firstElement = elementIndexCount === 0.U - val lastElement: Bool = nextElementIndex === csrRegForMaskUnit.vl - val updateMaskIndex = WireDefault(false.B) - when(updateMaskIndex) { elementIndexCount := nextElementIndex } - // 特殊的指令,会阻止 wLast 后把 sExecute 拉回来, 因为需要等待读后才写 - val mixedUnit: Bool = Wire(Bool()) - // slid & gather & extend - val slidUnitIdle: Bool = RegInit(true.B) - // compress & iota - val iotaUnitIdle: Bool = RegInit(true.B) - val orderedReduceGroupCount: Option[UInt] = Option.when(parameter.fpuEnable)( - RegInit(0.U(log2Ceil(parameter.vLen / parameter.laneNumber).W)) - ) - val orderedReduceIdle: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(true.B)) - val maskUnitIdle = (Seq(slidUnitIdle, iotaUnitIdle) ++ orderedReduceIdle).reduce(_ && _) - val reduce = decodeResultReg(Decoder.red) - val orderedReduce: Bool = if (parameter.fpuEnable) decodeResultReg(Decoder.orderReduce) else false.B - val popCount = decodeResultReg(Decoder.popCount) - val extend = decodeResultReg(Decoder.extend) - // first type instruction - val firstLane = ffo(completedVec.asUInt) - val firstLaneIndex: UInt = OHToUInt(firstLane)(log2Ceil(parameter.laneNumber) - 1, 0) - io.retire.rd.valid := lastSlotCommit && decodeResultReg(Decoder.targetRd) + val writeRD = RegInit(false.B) + val float: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(false.B)) + val vd = RegInit(0.U(5.W)) + when(instructionToSlotOH(index)) { + writeRD := decodeResult(Decoder.targetRd) + 
float.foreach(_ := decodeResult(Decoder.float)) + vd := requestRegDequeue.bits.instruction(11, 7) + } + io.retire.rd.valid := lastSlotCommit && writeRD io.retire.rd.bits.rdAddress := vd if (parameter.fpuEnable) { - io.retire.rd.bits.isFp := decodeResultReg(Decoder.float) + io.retire.rd.bits.isFp := float.getOrElse(false.B) } else { io.retire.rd.bits.isFp := false.B } - when(requestRegDequeue.fire) { - ffoIndexReg.valid := false.B - ffoIndexReg.bits := -1.S(parameter.xLen.W).asUInt - }.elsewhen(synchronized && completedVec.asUInt.orR && !ffoIndexReg.valid) { - ffoIndexReg.valid := true.B - ffoIndexReg.bits := Mux1H( - firstLane, - // 3: firstLaneIndex.width - data.map(i => i.bits(parameter.xLen - 1 - 3, 5) ## firstLaneIndex ## i.bits(4, 0)) - ) - } - ffoType := decodeResultReg(Decoder.ffo) - - /** vlmax = vLen * (2**lmul) / (2 ** sew * 8) \= (vLen / 8) * 2 ** (lmul - sew) \= vlb * 2 ** (lmul - sew) lmul <- - * (-3, -2, -1, 0 ,1, 2, 3) sew <- (0, 1, 2) lmul - sew <- [-5, 3] 选择信号 +5 -> lmul - sew + 5 <- [0, 8] - */ - def largeThanVLMax(source: UInt, advance: Bool = false.B, lmul: UInt, sew: UInt): Bool = { - val vlenLog2 = log2Ceil(parameter.vLen) // 10 - val cut = - if (source.getWidth >= vlenLog2) source(vlenLog2 - 1, vlenLog2 - 9) - else (0.U(vlenLog2.W) ## source)(vlenLog2 - 1, vlenLog2 - 9) - // 9: lmul - sew 的可能值的个数 - val largeList: Vec[Bool] = Wire(Vec(9, Bool())) - cut.asBools.reverse.zipWithIndex.foldLeft(advance) { case (a, (b, i)) => - largeList(i) := a - a || b - } - val extendVlmul = lmul(2) ## lmul - val selectWire = UIntToOH(5.U(4.W) + extendVlmul - sew)(8, 0).asBools.reverse - Mux1H(selectWire, largeList) - } - // 算req上面的分开吧 - val gatherWire = - Mux(decodeResult(Decoder.itype), requestRegDequeue.bits.instruction(19, 15), requestRegDequeue.bits.rs1Data) - val gatherAdvance = (gatherWire >> log2Ceil(parameter.vLen)).asUInt.orR - gatherOverlap := largeThanVLMax( - gatherWire, - gatherAdvance, - T1Issue.vlmul(requestReg.bits.issue), - 
T1Issue.vsew(requestReg.bits.issue) - ) - val slotValid = !control.state.idle - val storeAfterSlide = isStoreType && (requestRegDequeue.bits.instruction(11, 7) === vd) - instructionRAWReady := !((unOrderTypeInstruction && slotValid && - // slid 类的会比执行得慢的指令快(div),会修改前面的指令的source - ((vd === requestRegDequeue.bits.instruction(24, 20)) || - (vd === requestRegDequeue.bits.instruction(19, 15)) || - storeAfterSlide || - // slid 类的会比执行得快的指令慢(mv),会被后来的指令修改 source2 - (vs2 === requestRegDequeue.bits.instruction(11, 7))) || - (unOrderType && !allSlotFree) || - (requestReg.bits.vdIsV0 && existMaskType)) || - (vd === 0.U && maskType && slotValid)) - when(instructionToSlotOH(index)) { - writeBackCounter := 0.U - groupCounter := 0.U - executeCounter := 0.U - elementIndexCount := 0.U - compressWriteCount := 0.U - iotaCount := 0.U - slidUnitIdle := !((decodeResult(Decoder.slid) || (decodeResult(Decoder.gather) && decodeResult(Decoder.vtype)) - || decodeResult(Decoder.extend)) && instructionValid) - iotaUnitIdle := !((decodeResult(Decoder.compress) || decodeResult(Decoder.iota)) && instructionValid) - orderedReduceIdle.foreach(_ := !(decodeResult(Decoder.orderReduce) && instructionValid)) - orderedReduceGroupCount.foreach(_ := 0.U) - vd := requestRegDequeue.bits.instruction(11, 7) - vs1 := requestRegDequeue.bits.instruction(19, 15) - vs2 := requestRegDequeue.bits.instruction(24, 20) - vm := requestRegDequeue.bits.instruction(25) - executeFinishReg := false.B - rs1 := requestRegDequeue.bits.rs1Data - decodeResultReg := decodeResult - csrRegForMaskUnit := requestRegCSR - // todo: decode need execute - control.state.sMaskUnitExecution := !maskUnitType - maskTypeInstruction := maskType && !decodeResult(Decoder.maskSource) - completedVec.foreach(_ := false.B) - WARRedResult.valid := false.B - unOrderTypeInstruction := unOrderType - dataResult := 0.U.asTypeOf(dataResult) - }.elsewhen(control.state.wLast && maskUnitIdle) { - // 如果真需要执行的lane会wScheduler,不会提前发出last确认 - when(!mixedUnit) { - 
control.state.sMaskUnitExecution := true.B - } - maskUnitFlushVrf := !control.state.idle - } - when(laneSynchronize.asUInt.orR) { - feedBack := feedBack | laneSynchronize.asUInt - }.elsewhen(lastSlotCommit) { - feedBack := 0.U - } - // 执行 - // mask destination write - /** 对于mask destination 类型的指令需要特别注意两种不对齐 第一种是我们以 32(dataPatWidth) * 8(laneNumber) 为一个组, 但是我们vl可能不对齐一整个组 第二种是 - * 32(dataPatWidth) 的时候对不齐 vl假设最大1024,相应的会有11位的vl xxx xxx xxxxx - */ - val dataPathMisaligned = csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0).orR - val groupMisaligned = - if (parameter.laneNumber > 1) - csrRegForMaskUnit - .vl(parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, parameter.dataPathWidthBits) - .orR - else false.B - - /** 我们需要计算最后一次写的 [[writeBackCounter]] & [[groupCounter]] lastGroupCounter = vl(10, 8) - !([[dataPathMisaligned]] - * \|| [[groupMisaligned]]) lastExecuteCounter = vl(7, 5) - ![[dataPathMisaligned]] - */ - val lastGroupCounter: UInt = - csrRegForMaskUnit.vl( - parameter.laneParam.vlMaxBits - 1, - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - ) - !(dataPathMisaligned || groupMisaligned) - val lastExecuteCounter: UInt = if (parameter.laneNumber > 1) { - csrRegForMaskUnit.vl( - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, - parameter.dataPathWidthBits - ) - !dataPathMisaligned - } else 0.U - val lastGroup = groupCounter === lastGroupCounter - val lastExecute = lastGroup && writeBackCounter === lastExecuteCounter - val lastExecuteForGroup = writeBackCounter.andR - // 计算正写的这个lane是不是在边界上 - val endOH = UIntToOH(csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0)) - val border = lastExecute && dataPathMisaligned && - !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.gather)) - val lastGroupMask = scanRightOr(endOH(parameter.datapathWidth - 1, 1)) - val mvType = decodeResultReg(Decoder.mv) - val readMv = mvType && decodeResultReg(Decoder.targetRd) - val writeMv = mvType && 
!decodeResultReg(Decoder.targetRd) && - csrRegForMaskUnit.vl > csrRegForMaskUnit.vStart - mvToVRF.foreach(d => when(requestRegDequeue.fire) { d := writeMv }) - // 读后写中的读 - val needWAR = (maskTypeInstruction || border || reduce || readMv) && !popCount - val skipLaneData: Bool = decodeResultReg(Decoder.mv) - mixedUnit := writeMv || readMv - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskWriteLaneSelect.head := maskReadLaneSelect.head - maskUnitReadVec.head.valid := false.B - maskUnitReadVec.head.bits.vs := Mux(readMv, vs2, Mux(reduce, vs1, vd)) - maskUnitReadVec.head.bits.readSource := Mux(readMv, 1.U, Mux(reduce, 0.U, 2.U)) - maskUnitReadVec.head.bits.offset := groupCounter - maskUnitRead.bits.instructionIndex := control.record.instructionIndex - val readResultSelectResult = Mux1H( - Pipe(true.B, maskUnitReadSelect, parameter.vrfReadLatency).bits, - laneReadResult - ) - // 把mask选出来 - val maskSelect = v0(groupCounter ## writeBackCounter) - val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt - - /** 正常全1 mask:[[maskSelect]] border: [[lastGroupMask]] mask && border: [[maskSelect]] & [[lastGroupMask]] - */ - val maskCorrect: UInt = Mux(maskTypeInstruction, maskSelect, fullMask) & - Mux(border, lastGroupMask, fullMask) - // mask - val sew1HCorrect = Mux(decodeResultReg(Decoder.widenReduce), vSewOHForMask ## false.B, vSewOHForMask) - // 写的data - val writeData = (WARRedResult.bits & (~maskCorrect).asUInt) | (regroupData(writeBackCounter) & maskCorrect) - val writeMask = Mux(sew1HCorrect(2) || !reduce, 15.U, Mux(sew1HCorrect(1), 3.U, 1.U)) - maskUnitWriteVec.head.valid := false.B - maskUnitWriteVec.head.bits.vd := vd - maskUnitWriteVec.head.bits.offset := groupCounter - maskUnitWriteVec.head.bits.data := Mux(writeMv, rs1, Mux(reduce, dataResult.bits, writeData)) - maskUnitWriteVec.head.bits.last := control.state.wLast || reduce - maskUnitWriteVec.head.bits.instructionIndex := 
control.record.instructionIndex - - val waitReadResult: Bool = Wire(Bool()) - val maskUnitReadVrf = maskUnitReadReady && maskUnitReadVec.map(_.valid).reduce(_ || _) && !waitReadResult - val readNext = RegNext(maskUnitReadVrf) - waitReadResult := RegNext(readNext) || readNext - when(Pipe(maskUnitReadVrf, false.B, parameter.vrfReadLatency).valid) { - WARRedResult.bits := readResultSelectResult - WARRedResult.valid := true.B - } - // alu start - val aluInput1 = Mux( - (Seq(executeCounter === 0.U) ++ orderedReduceGroupCount.map(_ === 0.U)).reduce(_ && _), - Mux( - needWAR, - WARRedResult.bits & FillInterleaved(8, writeMask), - 0.U - ), - dataResult.bits - ) - val aluInput2 = Mux1H(UIntToOH(executeCounter), data.map(d => Mux(d.valid, d.bits, 0.U))) - val skipFlotReduce: Bool = !Mux1H(UIntToOH(executeCounter), flotReduceValid.map(_.getOrElse(false.B))) - // red alu instance - val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth)) - val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth)) - // option unit for flot reduce - val floatAdder: Option[Instance[FloatAdder]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24))) - val flotCompare: Option[Instance[FloatCompare]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24))) - - val sign = !decodeResultReg(Decoder.unsigned1) - adder.request.src := VecInit( - Seq( - (aluInput1(parameter.datapathWidth - 1) && sign) ## aluInput1, - (aluInput2(parameter.datapathWidth - 1) && sign) ## aluInput2 - ) - ) - // popCount 在top视为reduce add - adder.request.opcode := Mux(popCount, 0.U, decodeResultReg(Decoder.uop)) - adder.request.sign := sign - adder.request.vSew := Mux(popCount, 2.U, OHToUInt(sew1HCorrect)) - - floatAdder.foreach { fAdder => - fAdder.io.a := aluInput1 - fAdder.io.b := aluInput2 - fAdder.io.roundingMode := csrRegForMaskUnit.vxrm - } - - flotCompare.foreach { fCompare => - fCompare.io.a := aluInput1 - fCompare.io.b := 
aluInput2 - // max -> 12, min -> 8 - fCompare.io.isMax := decodeResultReg(Decoder.uop)(2) - } - - logicUnit.req.src := VecInit(Seq(aluInput1, aluInput2)) - logicUnit.req.opcode := decodeResultReg(Decoder.uop) - - // reduce resultSelect - val intReduceResult = Mux( - decodeResultReg(Decoder.adder) || popCount, - adder.response.data, - logicUnit.resp - ) - val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)( - Mux( - skipFlotReduce, - aluInput1, - Mux(decodeResultReg(Decoder.fpExecutionType) === 0.U, floatAdder.get.io.out, flotCompare.get.io.out) - ) - ) - val aluOutPut = Mux1H( - Seq(if (parameter.fpuEnable) reduce && !decodeResultReg(Decoder.float) else reduce) ++ - Option.when(parameter.fpuEnable)(reduce && decodeResultReg(Decoder.float)), - Seq(intReduceResult) ++ flotReduceResult - ) - // slid & gather unit - val slideUp = decodeResultReg(Decoder.topUop)(1) - val slide1 = decodeResultReg(Decoder.topUop)(0) && decodeResultReg(Decoder.slid) - - /** special uop 里面编码了extend的信息: specialUop(1,0): 倍率 specialUop(2):是否是符号 - */ - val extendSourceSew: Bool = (csrRegForMaskUnit.vSew >> decodeResultReg(Decoder.topUop)(1, 0))(0) - val extendSign: Bool = decodeResultReg(Decoder.topUop)(2) - // gather 相关的控制 - val gather16: Bool = decodeResultReg(Decoder.gather16) - val maskUnitEEW = Mux(gather16, 1.U, Mux(extend, extendSourceSew, csrRegForMaskUnit.vSew)) - val maskUnitEEW1H: UInt = UIntToOH(maskUnitEEW) - val maskUnitByteEnable = maskUnitEEW1H(2) ## maskUnitEEW1H(2) ## maskUnitEEW1H(2, 1).orR ## true.B - val maskUnitBitEnable = FillInterleaved(8, maskUnitByteEnable) - maskUnitWriteVec.head.bits.mask := Mux(writeMv, maskUnitByteEnable, writeMask) - // log2(dataWidth * laneNumber / 8) - val maskUnitDataOffset = - (elementIndexCount << maskUnitEEW).asUInt( - log2Ceil(parameter.datapathWidth * parameter.laneNumber / 8) - 1, - 0 - ) ## 0.U(3.W) - val maskUnitData = ((VecInit(data.map(_.bits)).asUInt >> maskUnitDataOffset).asUInt & maskUnitBitEnable)( - 
parameter.datapathWidth - 1, - 0 - ) - - val compareWire = Mux(decodeResultReg(Decoder.slid), rs1, maskUnitData) - val compareAdvance: Bool = (compareWire >> log2Ceil(parameter.vLen)).asUInt.orR - val compareResult: Bool = - largeThanVLMax(compareWire, compareAdvance, csrRegForMaskUnit.vlmul, csrRegForMaskUnit.vSew) - // 正在被gather使用的数据在data的那个组里 - val gatherDataSelect = - UIntToOH((false.B ## maskUnitDataOffset)(5 + (log2Ceil(parameter.laneNumber).max(1)) - 1, 5)) - val dataTail = Mux1H(UIntToOH(maskUnitEEW)(1, 0), Seq(3.U(2.W), 2.U(2.W))) - val lastElementForData = gatherDataSelect.asBools.last && maskUnitDataOffset(4, 3) === dataTail - val lastElementForCompressMask = elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0).andR - val maskUnitDataReady: Bool = (gatherDataSelect & VecInit(data.map(_.valid)).asUInt).orR - // 正在被gather使用的数据是否就绪了 - val isSlide = !(gather || extend) - val slidUnitDataReady: Bool = maskUnitDataReady || isSlide - val compressDataReady = maskUnitDataReady || !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.iota)) - // slid 先用状态机 - val idle :: sRead :: sWrite :: Nil = Enum(3) - val slideState = RegInit(idle) - val readState = slideState === sRead - - // slid 的立即数是0扩展的 - val slidSize = Mux(slide1, 1.U, Mux(decodeResultReg(Decoder.itype), vs1, rs1)) - // todo: 这里是否有更好的处理方式 - val slidSizeLSB = slidSize(parameter.laneParam.vlMaxBits - 1, 0) - // down + - // up - - val directionSelection = Mux(slideUp, (~slidSizeLSB).asUInt, slidSizeLSB) - val slideReadIndex = elementIndexCount + directionSelection + slideUp - val readIndex: UInt = Mux( - !maskUnitIdle, - Mux( - decodeResultReg(Decoder.slid), - slideReadIndex, - maskUnitData - ), - gatherWire - ) - - def indexAnalysis(elementIndex: UInt, csrInput: CSRInterface = csrRegForMaskUnit) = { - val sewInput = csrInput.vSew - val sewOHInput = UIntToOH(csrInput.vSew)(2, 0) - val intLMULInput: UInt = (1.U << csrInput.vlmul(1, 0)).asUInt - val dataPosition = 
(elementIndex(parameter.laneParam.vlMaxBits - 2, 0) << sewInput) - .asUInt(parameter.laneParam.vlMaxBits - 2, 0) - val accessMask = Mux1H( - sewOHInput(2, 0), - Seq( - UIntToOH(dataPosition(1, 0)), - FillInterleaved(2, UIntToOH(dataPosition(1))), - 15.U(4.W) - ) - ) - // 数据起始位置在32bit(暂时只32)中的偏移,由于数据会有跨lane的情况,融合的优化时再做 - val dataOffset = (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) ## 0.U(3.W) - val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) - // 32 bit / group - val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt - val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits - val offset = dataGroup(offsetWidth - 1, 0) - val accessRegGrowth = (dataGroup >> offsetWidth).asUInt - val decimalProportion = offset ## accessLane - // 1/8 register - val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) - - /** elementIndex 需要与vlMax比较, vLen * lmul /sew 这个计算太复杂了 我们可以换一个角度,计算读寄存器的增量与lmul比较,就能知道下标是否超vlMax了 vlmul - * 需要区分整数与浮点 - */ - val overlap = - (csrInput.vlmul(2) && decimal >= intLMULInput(3, 1)) || - (!csrInput.vlmul(2) && accessRegGrowth >= intLMULInput) - accessRegGrowth >= csrInput.vlmul - val reallyGrowth = accessRegGrowth(2, 0) - (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) - } - val srcOverlap: Bool = !decodeResultReg(Decoder.itype) && (rs1 >= csrRegForMaskUnit.vl) - // rs1 >= vlMax - val srcOversize = !decodeResultReg(Decoder.itype) && !slide1 && compareResult - val signBit = Mux1H( - vSewOHForMask, - readIndex(parameter.laneParam.vlMaxBits - 1, parameter.laneParam.vlMaxBits - 3).asBools.reverse - ) - // 对于up来说小于offset的element是不变得的 - val slideUpUnderflow = slideUp && !slide1 && (signBit || srcOverlap) - val elementActive: Bool = v0.asUInt(elementIndexCount) || vm - val slidActive = elementActive && (!slideUpUnderflow || !decodeResultReg(Decoder.slid)) - // index >= 
vlMax 是写0 - val overlapVlMax: Bool = !slideUp && (signBit || srcOversize) - // select csr - val csrSelect = Mux(control.state.idle, requestRegCSR, csrRegForMaskUnit) - // slid read - val (_, readDataOffset, readLane, readOffset, readGrowth, lmulOverlap) = indexAnalysis(readIndex, csrSelect) - gatherReadDataOffset := readDataOffset - val readOverlap = lmulOverlap || overlapVlMax - val skipRead = readOverlap || (gather && compareResult) || extend - val maskUnitWriteVecFire1 = maskUnitReadVec(1).valid && maskUnitReadReady - val readFireNext1: Bool = RegNext(maskUnitWriteVecFire1) - val readFireNextNext1: Bool = RegNext(readFireNext1) - val port1WaitForResult: Bool = readFireNext1 || readFireNextNext1 - val gatherTryToRead = - gatherNeedRead && !VecInit(lsu.vrfReadDataPorts.map(_.valid)).asUInt.orR && !gatherReadFinish - maskUnitReadVec(1).valid := (readState || gatherTryToRead) && !port1WaitForResult - maskUnitReadVec(1).bits.vs := Mux(readState, vs2, requestRegDequeue.bits.instruction(24, 20)) + readGrowth - maskUnitReadVec(1).bits.readSource := 1.U - maskUnitReadVec(1).bits.offset := readOffset - maskReadLaneSelect(1) := UIntToOH(readLane) - // slid write, vlXXX: 用element index 算出来的 - val (vlMask, vlDataOffset, vlLane, vlOffset, vlGrowth, _) = indexAnalysis(elementIndexCount) - val writeState = slideState === sWrite - // 处理数据,先硬移位吧 - val slidReadData: UInt = ((WARRedResult.bits >> readDataOffset) << vlDataOffset) - .asUInt(parameter.datapathWidth - 1, 0) - val selectRS1 = slide1 && ((slideUp && firstElement) || (!slideUp && lastElement)) - // extend 类型的扩展和移位 - val extendData: UInt = (Mux( - extendSourceSew, - Fill(parameter.datapathWidth - 16, extendSign && maskUnitData(15)) ## maskUnitData(15, 0), - Fill(parameter.datapathWidth - 8, extendSign && maskUnitData(7)) ## maskUnitData(7, 0) - ) << vlDataOffset).asUInt(parameter.xLen - 1, 0) - - /** vd 的值有4种: - * 1. 用readIndex读出来的vs2的值 - * 1. 0 - * 1. slide1 时插进来的rs1 - * 1. 
extend 的值 - */ - val slidWriteData = Mux1H( - Seq((!(readOverlap || selectRS1 || extend)) || (gather && !compareResult), selectRS1, extend), - Seq(slidReadData, (rs1 << vlDataOffset).asUInt(parameter.xLen - 1, 0), extendData) - ) - maskUnitWriteVec(1).valid := writeState && slidActive - maskUnitWriteVec(1).bits.vd := vd + vlGrowth - maskUnitWriteVec(1).bits.offset := vlOffset - maskUnitWriteVec(1).bits.mask := vlMask - maskUnitWriteVec(1).bits.data := slidWriteData - maskUnitWriteVec(1).bits.last := lastElement - maskUnitWriteVec(1).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(1) := UIntToOH(vlLane) - // slid 跳状态机 - when(slideState === idle) { - when((!slidUnitIdle) && slidUnitDataReady) { - when(skipRead) { - slideState := sWrite - }.otherwise { - slideState := sRead - } - } - } - when(readState) { - // 不需要valid,因为这个状态下一定是valid的 - when(readFireNextNext1) { - slideState := sWrite - } - } - when(writeState) { - when(maskUnitWriteReady || !slidActive) { - when(lastElement) { - slideState := idle - slidUnitIdle := true.B - when(gather || extend) { - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - } - }.otherwise { - when(lastElementForData && (gather || extend)) { - synchronized := true.B - dataClear := true.B - slideState := idle - }.otherwise { - // todo: skip read - slideState := sRead - } - updateMaskIndex := true.B - } - } - } - - // compress & iota - val idle1 :: sReadMask :: sWrite1 :: Nil = Enum(3) - val compressState = RegInit(idle1) - val compressStateIdle = compressState === idle1 - val compressStateRead = compressState === sReadMask - val compressStateWrite = compressState === sWrite1 - - // compress 用vs1当mask,需要先读vs1 - val readCompressMaskNext = Pipe(maskUnitReadReady && compressStateRead, false.B, parameter.vrfReadLatency).valid - when(readCompressMaskNext) { - maskDataForCompress := readResultSelectResult - } - - // 处理 iota - val iotaDataOffset: UInt = 
elementIndexCount(log2Ceil(parameter.datapathWidth * parameter.laneNumber) - 1, 0) - val lastDataForIota: Bool = iotaDataOffset.andR - val iotaData = VecInit(data.map(_.bits)).asUInt(iotaDataOffset) - val iota = decodeResultReg(Decoder.iota) - - val maskUnitReadFire2: Bool = maskUnitReadVec(2).valid && maskUnitReadReady - val readFireNext2 = RegNext(maskUnitReadFire2) - val readFireNextNext2 = RegNext(readFireNext2) - val port2WaitForResult = readFireNextNext2 || readFireNext2 - - /** 计算需要读的mask的相关 elementIndexCount -> 11bit 只会访问单寄存器 elementIndexCount(4, 0)做为32bit内的offset elementIndexCount(7, - * 5)作为lane的选择 elementIndexCount(9, 8)作为offset - */ - // compress read - maskUnitReadVec(2).valid := compressStateRead && !port2WaitForResult - maskUnitReadVec(2).bits.vs := vs1 - maskUnitReadVec(2).bits.readSource := 0.U - maskUnitReadVec(2).bits.offset := elementIndexCount( - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) + - parameter.laneParam.vrfParam.vrfOffsetBits - 1, - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) - ) - maskReadLaneSelect(2) := UIntToOH( - elementIndexCount( - log2Ceil(parameter.datapathWidth) + ((log2Ceil(parameter.laneNumber) - 1).max(0)), - log2Ceil(parameter.datapathWidth) - ) - ) - // val lastElementForMask: Bool = elementIndexCount(4, 0).andR - val maskForCompress: Bool = maskDataForCompress(elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0)) - - // compress vm=0 是保留的 - val skipWrite = !Mux(decodeResultReg(Decoder.compress), maskForCompress, elementActive) - val dataGroupTailForCompressUnit: Bool = Mux(iota, lastDataForIota, lastElementForData) - - // 计算compress write的位置信息 - val (compressMask, compressDataOffset, compressLane, compressOffset, compressGrowth, _) = - indexAnalysis(compressWriteCount) - val compressWriteData = (maskUnitData << compressDataOffset).asUInt - val iotaWriteData = (iotaCount << vlDataOffset).asUInt - // compress write - maskUnitWriteVec(2).valid := compressStateWrite && 
!skipWrite - maskUnitWriteVec(2).bits.vd := vd + Mux(iota, vlGrowth, compressGrowth) - maskUnitWriteVec(2).bits.offset := Mux(iota, vlOffset, compressOffset) - maskUnitWriteVec(2).bits.mask := Mux(iota, vlMask, compressMask) - maskUnitWriteVec(2).bits.data := Mux(iota, iotaWriteData, compressWriteData) - maskUnitWriteVec(2).bits.last := lastElement - maskUnitWriteVec(2).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(2) := UIntToOH(Mux(iota, vlLane, compressLane)) - - // 跳状态机 - // compress每组数据先读mask - val firstState = Mux(iota, sWrite1, sReadMask) - when(compressStateIdle && (!iotaUnitIdle) && compressDataReady) { - compressState := firstState - } - - when(compressStateRead && readFireNextNext2) { - compressState := sWrite1 - } - - when(compressStateWrite) { - when(maskUnitWriteReady || skipWrite) { - when(!skipWrite) { - compressWriteCount := compressWriteCount + 1.U - iotaCount := iotaCount + iotaData - } - when(lastElement) { - compressState := idle - iotaUnitIdle := true.B - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - }.otherwise { - when(lastElementForCompressMask) { - // update vs1 as mask for compress - compressState := sRead - } - when(dataGroupTailForCompressUnit) { - synchronized := true.B - dataClear := true.B - compressState := idle - } - updateMaskIndex := true.B - } - } - } - // for small vl & reduce - val accessByte = (csrRegForMaskUnit.vl << csrRegForMaskUnit.vSew).asUInt - // vl < row(vl) - val smallVL = accessByte < (parameter.datapathWidth * parameter.laneNumber / 8).U - val byteSizePerDataPathBits = log2Ceil(parameter.datapathWidth / 8) - val lastExecuteCounterForReduce: UInt = if (parameter.laneNumber > 1) { - accessByte( - byteSizePerDataPathBits + log2Ceil(parameter.laneNumber) - 1, - byteSizePerDataPathBits - ) - !accessByte(byteSizePerDataPathBits - 1, 0).orR - } else 0.U - val lastGroupDataWaitMaskForRed: UInt = scanRightOr(UIntToOH(lastExecuteCounterForReduce)) - // alu 
end - val maskOperation = - decodeResultReg(Decoder.maskLogic) || - decodeResultReg(Decoder.maskDestination) || - decodeResultReg(Decoder.ffo) - // How many data path(32 bit) will used by maskDestination instruction. - val maskDestinationByteSize: Bits = - csrRegForMaskUnit.vl(log2Ceil(parameter.dLen) - 1, 0) << csrRegForMaskUnit.vSew - val maskDestinationUseDataPathSize = - (maskDestinationByteSize >> 2).asUInt + maskDestinationByteSize(1, 0).orR - val lastGroupCountForThisGroup: UInt = maskDestinationUseDataPathSize(log2Ceil(parameter.laneNumber) - 1, 0) - val counterForMaskDestination: UInt = if (parameter.laneNumber > 1) { - (lastGroupCountForThisGroup - 1.U) | - Fill( - log2Ceil(parameter.laneNumber), - (maskDestinationUseDataPathSize >> log2Ceil(parameter.laneNumber)).asUInt.orR - ) - } else 0.U - - val waitSourceDataCounter = - Mux(decodeResultReg(Decoder.maskDestination), counterForMaskDestination, lastExecuteCounter) - val lastGroupDataWaitMask = scanRightOr(UIntToOH(waitSourceDataCounter)) - // todo: other ways - val lastOrderedGroup: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0 - .U(log2Ceil(parameter.laneNumber).W) + -1.S(log2Ceil(parameter.laneNumber).W).asUInt) >= csrRegForMaskUnit.vl - ) - val misalignedOrdered: Bool = if (parameter.fpuEnable) { - lastOrderedGroup.get && csrRegForMaskUnit.vl(log2Ceil(parameter.laneNumber) - 1, 0).orR && decodeResultReg( - Decoder.float - ) - } else false.B - val dataMask = - Mux( - maskOperation && lastGroup, - lastGroupDataWaitMask, - Mux( - reduce && (smallVL || misalignedOrdered), - lastGroupDataWaitMaskForRed, - -1.S(parameter.laneNumber.W).asUInt - ) - ) - val dataReady = ((~dataMask).asUInt | VecInit(data.map(_.valid)).asUInt).andR || skipLaneData - when( - // data ready - dataReady && - // state check - !control.state.sMaskUnitExecution - ) { - // 读 - when(needWAR && !WARRedResult.valid) { - maskUnitReadVec.head.valid := true.B - } - // 可能有的计算 - val nextExecuteIndex: UInt = executeCounter 
+ 1.U - val isLastExecuteForGroup: Bool = executeCounter(log2Ceil(parameter.laneNumber) - 1, 0).andR - val lastExecuteForInstruction: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0.U(log2Ceil(parameter.laneNumber).W) + nextExecuteIndex) === csrRegForMaskUnit.vl - ) - val readFinish = WARRedResult.valid || !needWAR - val readDataSign = - Mux1H(vSewOHForMask(2, 0), Seq(WARRedResult.bits(7), WARRedResult.bits(15), WARRedResult.bits(31))) - when(readFinish && !executeFinishReg) { - when(readMv) { - control.state.sMaskUnitExecution := true.B - // signExtend for vmv.x.s - dataResult.bits := Mux(vSewOHForMask(2), WARRedResult.bits(31, 16), Fill(16, readDataSign)) ## - Mux(vSewOHForMask(0), Fill(8, readDataSign), WARRedResult.bits(15, 8)) ## - WARRedResult.bits(7, 0) - - }.otherwise { - executeCounter := nextExecuteIndex - when(executeCounter =/= csrRegForMaskUnit.vl) { - dataResult.bits := aluOutPut - } - if (parameter.fpuEnable) { - when(!orderedReduceIdle.get) { - when(lastExecuteForInstruction.get) { - orderedReduceIdle.get := true.B - }.elsewhen(isLastExecuteForGroup) { - synchronized := true.B - executeCounter := 0.U - dataClear := true.B - orderedReduceGroupCount.foreach(d => d := d + 1.U) - } - } - } - } - } - // for vfredmax - val lastReduceCounter = - executeCounter === csrRegForMaskUnit.vl || executeCounter(log2Ceil(parameter.laneNumber)) - dontTouch(lastReduceCounter) - val executeFinish: Bool = - (lastReduceCounter || !(reduce || popCount) || orderedReduce) && maskUnitIdle - val schedulerWrite = decodeResultReg(Decoder.maskDestination) || (reduce && !popCount) || writeMv - val groupSync = decodeResultReg(Decoder.ffo) - // 写回 - when(readFinish && (executeFinish || writeMv || executeFinishReg)) { - maskUnitWriteVec.head.valid := schedulerWrite - executeFinishReg := true.B - when(maskUnitWriteReady || !schedulerWrite) { - WARRedResult.valid := false.B - writeBackCounter := writeBackCounter + schedulerWrite - when(lastExecuteForGroup || 
lastExecute || reduce || groupSync || writeMv || popCount) { - synchronized := true.B - dataClear := true.B - when(lastExecuteForGroup || groupSync) { - executeForLastLaneFire := true.B - groupCounter := groupCounter + 1.U - } - when(lastExecute || reduce || writeMv || popCount) { - control.state.sMaskUnitExecution := true.B - } - } - } - } - } } control } @@ -1487,7 +693,7 @@ class T1(val parameter: T1Parameter) Mux(decodeResult(Decoder.gather), gatherData, Mux(decodeResult(Decoder.itype), immSignExtend, source1Extend)) // data eew for extend type - val extendDataEEW: Bool = (csrRegForMaskUnit.vSew >> decodeResult(Decoder.topUop)(1, 0))(0) + val extendDataEEW: Bool = (T1Issue.vsew(requestReg.bits.issue) - decodeResult(Decoder.topUop)(2, 1))(0) val gather16: Bool = decodeResult(Decoder.gather16) val vSewSelect: UInt = Mux( isLoadStoreType, @@ -1555,58 +761,37 @@ class T1(val parameter: T1Parameter) lane.csrInterface.vl := evlForLane lane.laneIndex := index.U - // - LSU request next offset of group - // - all lane are synchronized - // - the index type of instruction is finished. - lane.laneResponseFeedback.valid := lsu.lsuOffsetRequest || synchronized || completeIndexInstruction - // - the index type of instruction is finished. - // - for find first one. 
- lane.laneResponseFeedback.bits.complete := - completeIndexInstruction || - completedLeftOr(index) || - maskUnitReadOnlyFinish - // tell lane which - lane.laneResponseFeedback.bits.instructionIndex := slots.last.record.instructionIndex - // lsu 优先会有死锁: // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read // 读 lane - lane.vrfReadAddressChannel.valid := lsu.vrfReadDataPorts(index).valid || - (maskUnitRead.valid && maskUnitReadSelect(index)) + lane.vrfReadAddressChannel.valid := lsu.vrfReadDataPorts(index).valid || maskUnit.readChannel(index).valid lane.vrfReadAddressChannel.bits := - Mux(maskUnitRead.valid, maskUnitRead.bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitRead.valid - readSelectMaskUnit(index) := - lane.vrfReadAddressChannel.ready && maskUnitReadSelect(index) - laneReadResult(index) := lane.vrfReadDataChannel + Mux(maskUnit.readChannel(index).valid, maskUnit.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) + lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnit.readChannel(index).valid + maskUnit.readChannel(index).ready := lane.vrfReadAddressChannel.ready + maskUnit.readResult(index) := lane.vrfReadDataChannel lsu.vrfReadResults(index) := lane.vrfReadDataChannel - // 写lane - lane.vrfWriteChannel.valid := vrfWrite(index).valid || (maskUnitWrite.valid && maskUnitWriteSelect(index)) - lane.vrfWriteChannel.bits := - Mux(vrfWrite(index).valid, vrfWrite(index).bits, maskUnitWrite.bits) + // lsu & mask unit write lane + lane.vrfWriteChannel.valid := vrfWrite(index).valid + lane.vrfWriteChannel.bits := vrfWrite(index).bits vrfWrite(index).ready := lane.vrfWriteChannel.ready - writeSelectMaskUnit(index) := - lane.vrfWriteChannel.ready && !vrfWrite(index).valid && maskUnitWriteSelect(index) - lsu.offsetReadResult(index).valid := lane.laneResponse.valid && lane.laneResponse.bits.toLSU - lsu.offsetReadResult(index).bits := 
lane.laneResponse.bits.data - lsu.offsetReadIndex(index) := lane.laneResponse.bits.instructionIndex + lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU + lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 + lsu.offsetReadIndex(index) := lane.maskUnitRequest.bits.index instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => d := (UIntToOH(f(parameter.instructionIndexBits - 2, 0)) & lane.instructionFinished).orR } - vxsatReportVec(index) := lane.vxsatReport - val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) - val v0SelectBySew = Mux1H(UIntToOH(lane.maskSelectSew)(2, 0), v0ForThisLane) - lane.maskInput := cutUInt(v0SelectBySew, parameter.datapathWidth)(lane.maskSelect) - lane.lsuLastReport := lsu.lastReport | - Mux( - maskUnitFlushVrf, - indexToOH(slots.last.record.instructionIndex, parameter.chainingSize), - 0.U - ) + vxsatReportVec(index) := lane.vxsatReport + lane.maskInput := maskUnit.laneMaskInput(index) + maskUnit.laneMaskSelect(index) := lane.maskSelect + maskUnit.laneMaskSewSelect(index) := lane.maskSelectSew + maskUnit.v0UpdateVec(index) <> lane.v0Update + + lane.lsuLastReport := lsu.lastReport | maskUnit.lastReport lane.lsuMaskGroupChange := lsu.lsuMaskGroupChange lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index) @@ -1616,18 +801,7 @@ class T1(val parameter: T1Parameter) (requestReg.bits.writeByte >> rowWith).asUInt + (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) - // 处理lane的mask类型请求 - laneSynchronize(index) := lane.laneResponse.valid && !lane.laneResponse.bits.toLSU - when(laneSynchronize(index)) { - data(index).valid := true.B - data(index).bits := lane.laneResponse.bits.data - completedVec(index) := lane.laneResponse.bits.ffoSuccess - flotReduceValid(index).foreach(d => d := lane.laneResponse.bits.fpReduceValid.get) - } - // token manager - 
tokenManager.writeV0(index).valid := lane.vrfWriteChannel.fire && (lane.vrfWriteChannel.bits.vd === 0.U) - tokenManager.writeV0(index).bits := lane.vrfWriteChannel.bits.instructionIndex tokenManager.instructionFinish(index) := lane.instructionFinished lane @@ -1651,13 +825,45 @@ class T1(val parameter: T1Parameter) lsu.request.bits.instructionInformation.isStore := isStoreType lsu.request.bits.instructionInformation.maskedLoadStore := maskType - lsu.maskInput.zip(lsu.maskSelect).foreach { case (data, index) => - data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) + maskUnit.lsuMaskSelect := lsu.maskSelect + lsu.maskInput := maskUnit.lsuMaskInput + lsu.csrInterface := requestRegCSR + lsu.csrInterface.vl := evlForLsu + lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR + lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR + + // connect mask unit + maskUnit.instReq.valid := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit) + maskUnit.instReq.bits.instructionIndex := requestReg.bits.instructionIndex + maskUnit.instReq.bits.decodeResult := decodeResult + maskUnit.instReq.bits.readFromScala := source1Select + maskUnit.instReq.bits.sew := T1Issue.vsew(requestReg.bits.issue) + maskUnit.instReq.bits.maskType := maskType + maskUnit.instReq.bits.vxrm := requestReg.bits.issue.vcsr(2, 1) + maskUnit.instReq.bits.vlmul := requestReg.bits.issue.vtype(2, 0) + maskUnit.instReq.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) + maskUnit.instReq.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) + maskUnit.instReq.bits.vd := requestRegDequeue.bits.instruction(11, 7) + maskUnit.instReq.bits.vl := requestReg.bits.issue.vl + + maskUnit.exeReq.zip(laneVec).foreach { case (maskInput, lane) => + maskInput <> lane.maskUnitRequest + } + maskUnit.exeResp.zip(laneVec).foreach { case (maskOutput, lane) => + lane.maskUnitResponse <> maskOutput + } + maskUnit.maskResponseRelease.zip(laneVec).foreach { case 
(release, lane) => + release := lane.maskResponseRelease + } + + val gatherResultSelect: UInt = Mux1H( + gatherReadLaneSelect, + laneVec.map(_.vrfReadDataChannel) + ) + // gather read result + when(gatherReadResultFire) { + gatherData := Mux(gatherOverlap, 0.U, (gatherResultSelect >> gatherReadDataOffset).asUInt) } - lsu.csrInterface := requestRegCSR - lsu.csrInterface.vl := evlForLsu - lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR - lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR // 连lane的环 parameter.crossLaneConnectCycles.zipWithIndex.foreach { case (cycles, index) => @@ -1724,15 +930,22 @@ class T1(val parameter: T1Parameter) // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || gatherReadFinish) && - instructionRAWReady && instructionIndexFree && vrfAllocate + tokenManager.issueAllow && instructionIndexFree && vrfAllocate instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) + tokenManager.instructionIssue.valid := requestRegDequeue.fire + tokenManager.instructionIssue.bits.instructionIndex := requestReg.bits.instructionIndex + tokenManager.instructionIssue.bits.writeV0 := + (!requestReg.bits.decodeResult(Decoder.targetRd) && !isStoreType) && requestReg.bits.vdIsV0 + tokenManager.instructionIssue.bits.useV0AsMask := maskType + tokenManager.instructionIssue.bits.isLoadStore := !requestRegDequeue.bits.instruction(6) + // instruction commit { val slotCommit: Vec[Bool] = VecInit(slots.map { inst => // mask unit finish - inst.state.sMaskUnitExecution && + inst.state.wMaskUnitLast && // lane|lsu finish inst.state.wLast && // mask unit write finish @@ -1743,7 +956,7 @@ class T1(val parameter: T1Parameter) inst.record.instructionIndex === responseCounter }) retire := slotCommit.asUInt.orR - io.retire.rd.bits.rdData := Mux(ffoType, 
ffoIndexReg.bits, dataResult.bits) + io.retire.rd.bits.rdData := dataResult // TODO: csr retire. io.retire.csr.bits.vxsat := (slotCommit.asUInt & VecInit(slots.map(_.vxsat)).asUInt).orR io.retire.csr.bits.fflag := DontCare @@ -1752,25 +965,6 @@ class T1(val parameter: T1Parameter) lastSlotCommit := slotCommit.last } - // write v0(mask) - v0.zipWithIndex.foreach { case (data, index) => - // 属于哪个lane - val laneIndex: Int = index % parameter.laneNumber - // 取出写的端口 - val v0Write = laneVec(laneIndex).v0Update - // offset - val offset: Int = index / parameter.laneNumber - val maskExt = FillInterleaved(8, v0Write.bits.mask) - when(v0Write.valid && v0Write.bits.offset === offset.U) { - data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) - } - } - when(dataClear) { - data.foreach(_.valid := false.B) - } - // don't care有可能会导致先读后写失败 - maskUnitReadVec.foreach(_.bits.instructionIndex := slots.last.record.instructionIndex) - layer.block(layers.Verification) { /** Probes @@ -1784,11 +978,12 @@ class T1(val parameter: T1Parameter) probeWire.requestReg := requestReg probeWire.requestRegReady := requestRegDequeue.ready // maskUnitWrite maskUnitWriteReady - probeWire.writeQueueEnq.valid := maskUnitWrite.valid && maskUnitWriteReady - probeWire.writeQueueEnq.bits := maskUnitWrite.bits.instructionIndex - probeWire.writeQueueEnqMask := maskUnitWrite.bits.mask + // todo + probeWire.writeQueueEnq.valid := DontCare + probeWire.writeQueueEnq.bits := DontCare + probeWire.writeQueueEnqMask := DontCare probeWire.instructionValid := maskAnd( - !slots.last.state.sMaskUnitExecution && !slots.last.state.idle, + !slots.last.state.wMaskUnitLast && !slots.last.state.idle, indexToOH(slots.last.record.instructionIndex, parameter.chainingSize * 2) ).asUInt probeWire.responseCounter := responseCounter diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 4072ae589..dd27a1657 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -33,7 +33,7 
@@ trait T1UopField extends T1DecodeFiled[UInt] with FieldName { } trait T1TopUopField extends T1DecodeFiled[UInt] with FieldName { - def chiselType: UInt = UInt(3.W) + def chiselType: UInt = UInt(5.W) } trait T1fpExecutionTypeUopField extends T1DecodeFiled[UInt] with FieldName { @@ -227,14 +227,39 @@ object Decoder { object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { - case _: TopT0.type => BitPat("b000") - case _: TopT1.type => BitPat("b001") - case _: TopT2.type => BitPat("b010") - case _: TopT3.type => BitPat("b011") - case _: TopT5.type => BitPat("b101") - case _: TopT6.type => BitPat("b110") - case _: TopT7.type => BitPat("b111") - case _ => BitPat.dontCare(3) + case _: TopT0.type => BitPat("b00000") + case _: TopT1.type => BitPat("b00001") + case _: TopT2.type => BitPat("b00010") + case _: TopT3.type => BitPat("b00011") + case _: TopT4.type => BitPat("b00100") + case _: TopT5.type => BitPat("b00101") + case _: TopT6.type => BitPat("b00110") + case _: TopT7.type => BitPat("b00111") + case _: TopT8.type => BitPat("b01000") + case _: TopT9.type => BitPat("b01001") + case _: TopT10.type => BitPat("b01010") + case _: TopT11.type => BitPat("b01011") + case _: TopT12.type => BitPat("b01100") + case _: TopT13.type => BitPat("b01101") + case _: TopT14.type => BitPat("b01110") + case _: TopT15.type => BitPat("b01111") + case _: TopT16.type => BitPat("b10000") + case _: TopT17.type => BitPat("b10001") + case _: TopT18.type => BitPat("b10010") + case _: TopT19.type => BitPat("b10011") + case _: TopT20.type => BitPat("b10100") + case _: TopT21.type => BitPat("b10101") + case _: TopT22.type => BitPat("b10110") + case _: TopT23.type => BitPat("b10111") + case _: TopT24.type => BitPat("b11000") + case _: TopT25.type => BitPat("b11001") + case _: TopT26.type => BitPat("b11010") + case _: TopT27.type => BitPat("b11011") + case _: TopT28.type => BitPat("b11100") + case _: TopT29.type => BitPat("b11101") 
+ case _: TopT30.type => BitPat("b11110") + case _: TopT31.type => BitPat("b11111") + case _ => BitPat.dontCare(5) } } diff --git a/t1/src/decoder/attribute/isSwrite.scala b/t1/src/decoder/attribute/isSwrite.scala index f14bad4c0..921f29dc5 100644 --- a/t1/src/decoder/attribute/isSwrite.scala +++ b/t1/src/decoder/attribute/isSwrite.scala @@ -17,17 +17,9 @@ object isSwrite { def y(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched = Seq( - "vcompress.vm", "vcpop.m", "vfirst.m", "vfmv.f.s", - "vfredmax.vs", - "vfredmin.vs", - "vfredosum.vs", - "vfredusum.vs", - "vfwredosum.vs", - "vfwredusum.vs", - "viota.m", "vl1re16.v", "vl1re32.v", "vl1re64.v", @@ -85,57 +77,7 @@ object isSwrite { "vluxei512.v", "vluxei64.v", "vluxei8.v", - "vmadc.vi", - "vmadc.vim", - "vmadc.vv", - "vmadc.vvm", - "vmadc.vx", - "vmadc.vxm", - "vmfeq.vf", - "vmfeq.vv", - "vmfge.vf", - "vmfgt.vf", - "vmfle.vf", - "vmfle.vv", - "vmflt.vf", - "vmflt.vv", - "vmfne.vf", - "vmfne.vv", - "vmsbc.vv", - "vmsbc.vvm", - "vmsbc.vx", - "vmsbc.vxm", - "vmseq.vi", - "vmseq.vv", - "vmseq.vx", - "vmsgt.vi", - "vmsgt.vx", - "vmsgtu.vi", - "vmsgtu.vx", - "vmsle.vi", - "vmsle.vv", - "vmsle.vx", - "vmsleu.vi", - "vmsleu.vv", - "vmsleu.vx", - "vmslt.vv", - "vmslt.vx", - "vmsltu.vv", - "vmsltu.vx", - "vmsne.vi", - "vmsne.vv", - "vmsne.vx", "vmv.x.s", - "vredand.vs", - "vredmax.vs", - "vredmaxu.vs", - "vredmin.vs", - "vredminu.vs", - "vredor.vs", - "vredsum.vs", - "vredxor.vs", - "vrgather.vv", - "vrgatherei16.vv", "vs1r.v", "vs2r.v", "vs4r.v", @@ -148,9 +90,6 @@ object isSwrite { "vse512.v", "vse64.v", "vse8.v", - "vsext.vf2", - "vsext.vf4", - "vsext.vf8", "vsm.v", "vsoxei1024.v", "vsoxei128.v", @@ -207,9 +146,6 @@ object isSwrite { "vwsubu.vx", "vwsubu.wv", "vwsubu.wx", - "vzext.vf2", - "vzext.vf4", - "vzext.vf8", // rv_zvbb "vwsll.vv", "vwsll.vx", diff --git a/t1/src/decoder/attribute/topUop.scala b/t1/src/decoder/attribute/topUop.scala index ae8beeca1..04cabdfdf 100644 --- 
a/t1/src/decoder/attribute/topUop.scala +++ b/t1/src/decoder/attribute/topUop.scala @@ -10,77 +10,280 @@ object TopT0 extends TopUopType object TopT1 extends TopUopType object TopT2 extends TopUopType object TopT3 extends TopUopType +object TopT4 extends TopUopType object TopT5 extends TopUopType object TopT6 extends TopUopType object TopT7 extends TopUopType +object TopT8 extends TopUopType +object TopT9 extends TopUopType +object TopT10 extends TopUopType +object TopT11 extends TopUopType +object TopT12 extends TopUopType +object TopT13 extends TopUopType +object TopT14 extends TopUopType +object TopT15 extends TopUopType +object TopT16 extends TopUopType +object TopT17 extends TopUopType +object TopT18 extends TopUopType +object TopT19 extends TopUopType +object TopT20 extends TopUopType +object TopT21 extends TopUopType +object TopT22 extends TopUopType +object TopT23 extends TopUopType +object TopT24 extends TopUopType +object TopT25 extends TopUopType +object TopT26 extends TopUopType +object TopT27 extends TopUopType +object TopT28 extends TopUopType +object TopT29 extends TopUopType +object TopT30 extends TopUopType +object TopT31 extends TopUopType object TopUop { def apply(t1DecodePattern: T1DecodePattern): TopUop = { Seq( - t0 _ -> TopT0, - t1 _ -> TopT1, - t2 _ -> TopT2, - t3 _ -> TopT3, - t5 _ -> TopT5, - t6 _ -> TopT6, - t7 _ -> TopT7 + t0 _ -> TopT0, + t1 _ -> TopT1, + t2 _ -> TopT2, + t3 _ -> TopT3, + t4 _ -> TopT4, + t5 _ -> TopT5, + t6 _ -> TopT6, + t7 _ -> TopT7, + t8 _ -> TopT8, + t9 _ -> TopT9, + t10 _ -> TopT10, + t11 _ -> TopT11, + t12 _ -> TopT12, + t13 _ -> TopT13, + t14 _ -> TopT14, + t15 _ -> TopT15, + t16 _ -> TopT16, + t17 _ -> TopT17, + t18 _ -> TopT18, + t19 _ -> TopT19, + t20 _ -> TopT20, + t21 _ -> TopT21, + t22 _ -> TopT22, + t23 _ -> TopT23, + t24 _ -> TopT24, + t25 _ -> TopT25, + t26 _ -> TopT26, + t27 _ -> TopT27, + t28 _ -> TopT28, + t29 _ -> TopT29, + t30 _ -> TopT30, + t31 _ -> TopT31 ).collectFirst { case (fn, tpe) if 
fn(t1DecodePattern) => TopUop(tpe) }.getOrElse(TopUop(TopT0)) } def t0(t1DecodePattern: T1DecodePattern): Boolean = { - val allMatched = t1DecodePattern.param.allInstructions.filter(i => - !(t1(t1DecodePattern) - || t2(t1DecodePattern) - || t3(t1DecodePattern) - || t5(t1DecodePattern) - || t6(t1DecodePattern) - || t7(t1DecodePattern)) + val allMatched: Seq[String] = Seq( + "vslidedown.vi", + "vslidedown.vx" ) - allMatched.contains(t1DecodePattern.instruction) + allMatched.contains(t1DecodePattern.instruction.name) } def t1(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vfslide1down.vf", - "vslide1down.vx", - "vzext.vf2" + "vslideup.vi", + "vslideup.vx" ) allMatched.contains(t1DecodePattern.instruction.name) } def t2(t1DecodePattern: T1DecodePattern): Boolean = { - val allMatched: Seq[String] = Seq( - "vslideup.vi", - "vslideup.vx", - "vzext.vf4" - ) + val allMatched: Seq[String] = Seq("vslide1down.vx") allMatched.contains(t1DecodePattern.instruction.name) } def t3(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vslide1up.vx") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t4(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vfslide1up.vf", - "vslide1up.vx", - "vzext.vf8" + "vrgather.vv" ) allMatched.contains(t1DecodePattern.instruction.name) } def t5(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf2" + "vrgatherei16.vv" ) allMatched.contains(t1DecodePattern.instruction.name) } def t6(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t7(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t8(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("viota.m") + 
allMatched.contains(t1DecodePattern.instruction.name) + } + def t9(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vcompress.vm") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t10(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf4" + "vfmv.s.f", + "vmv.s.x" ) allMatched.contains(t1DecodePattern.instruction.name) } - def t7(t1DecodePattern: T1DecodePattern): Boolean = { + def t11(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vfmv.f.s", + "vmv.x.s" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t12(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t13(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t14(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vmsbf.m", + "vmsif.m", + "vmsof.m" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t15(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfirst.m") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t16(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vcpop.m", + "vredmax.vs", + "vredmaxu.vs", + "vredmin.vs", + "vredminu.vs", + "vredsum.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t17(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vwredsum.vs", + "vwredsumu.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t18(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vredand.vs", + "vredor.vs", + "vredxor.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def 
t19(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf8" + "vfredmax.vs", + "vfredmin.vs" ) allMatched.contains(t1DecodePattern.instruction.name) } + def t20(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfredusum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t21(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfredosum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t22(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfwredusum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t23(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfwredosum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t24(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vmadc.vi", + "vmadc.vim", + "vmadc.vv", + "vmadc.vvm", + "vmadc.vx", + "vmadc.vxm", + "vmfeq.vf", + "vmfeq.vv", + "vmfge.vf", + "vmfgt.vf", + "vmfle.vf", + "vmfle.vv", + "vmflt.vf", + "vmflt.vv", + "vmfne.vf", + "vmfne.vv", + "vmsbc.vv", + "vmsbc.vvm", + "vmsbc.vx", + "vmsbc.vxm", + "vmseq.vi", + "vmseq.vv", + "vmseq.vx", + "vmsgt.vi", + "vmsgt.vx", + "vmsgtu.vi", + "vmsgtu.vx", + "vmsle.vi", + "vmsle.vv", + "vmsle.vx", + "vmsleu.vi", + "vmsleu.vv", + "vmsleu.vx", + "vmslt.vv", + "vmslt.vx", + "vmsltu.vv", + "vmsltu.vx", + "vmsne.vi", + "vmsne.vv", + "vmsne.vx" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t25(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t26(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf2") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t27(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: 
Seq[String] = Seq("vsext.vf2") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t28(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf4") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t29(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vsext.vf4") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t30(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf8") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t31(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vsext.vf8") + allMatched.contains(t1DecodePattern.instruction.name) + } } case class TopUop(value: TopUopType) extends UopDecodeAttribute[TopUopType] { diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index 46427ade9..2522ee581 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -47,18 +47,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val pipeEnqueue: Option[LaneStage3Enqueue] = Option.when(isLastSlot)(RegInit(0.U.asTypeOf(enqueue.bits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: Option[ValidIO[LaneResponse]] = Option.when(isLastSlot)(IO(Valid(new LaneResponse(parameter)))) - @public - val stageValid: Bool = IO(Output(Bool())) + val stageValid: Bool = IO(Output(Bool())) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ - @public - val laneResponseFeedback: Option[ValidIO[LaneResponseFeedback]] = - Option.when(isLastSlot)(IO(Flipped(Valid(new LaneResponseFeedback(parameter))))) @public - val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = 
Option.when(isLastSlot)(RegInit(false.B)) @@ -69,28 +62,17 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** schedule cross lane write MSB */ val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // state for response to scheduler - /** schedule send [[LaneResponse]] to scheduler */ - val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - - /** wait scheduler send [[LaneResponseFeedback]] */ - val wResponseFeedback: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // update register when(enqueue.fire) { pipeEnqueue.foreach(_ := enqueue.bits) (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite)) - (sSendResponse ++ wResponseFeedback).foreach( - _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse - ) } // Used to cut off back pressure forward - val vrfWriteQueue: QueueIO[VRFWriteRequest] = - Queue.io(vrfWriteBundle, entries = 4, pipe = false, flow = false) + val vrfWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteBundle, 4) // The load of the pointer is a bit large, copy one - val vrfPtrReplica: QueueIO[UInt] = - Queue.io(UInt(parameter.vrfParam.vrfOffsetBits.W), entries = 4, pipe = false, flow = false) + val offsetBit: Int = 1.max(parameter.vrfParam.vrfOffsetBits) + val vrfPtrReplica: QueueIO[UInt] = Queue.io(UInt(offsetBit.W), 4) vrfPtrReplica.enq.valid := vrfWriteQueue.enq.valid vrfPtrReplica.enq.bits := vrfWriteQueue.enq.bits.offset vrfPtrReplica.deq.ready := vrfWriteQueue.deq.ready @@ -112,43 +94,18 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } - // scheduler synchronization - val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) val dataSelect: Option[UInt] = Option.when(isLastSlot) { Mux( pipeEnqueue.get.decodeResult(Decoder.nr) || - (enqueue.bits.ffoByOtherLanes && 
pipeEnqueue.get.decodeResult(Decoder.ffo)) || - pipeEnqueue.get.decodeResult(Decoder.dontNeedExecuteInLane), + (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)), pipeEnqueue.get.pipeData, pipeEnqueue.get.data ) } - // mask request - laneResponse.head.valid := stageValidReg.get && !sSendResponse.get - laneResponse.head.bits.data := Mux( - pipeEnqueue.get.decodeResult(Decoder.ffo), - pipeEnqueue.get.ffoIndex, - dataSelect.get - ) - laneResponse.head.bits.toLSU := pipeEnqueue.get.loadStore - laneResponse.head.bits.instructionIndex := pipeEnqueue.get.instructionIndex - laneResponse.head.bits.ffoSuccess := pipeEnqueue.get.ffoSuccess - laneResponse.head.bits.fpReduceValid.zip(pipeEnqueue.get.fpReduceValid).foreach { case (s, f) => s := f } - - sSendResponse.foreach(state => - when(laneResponse.head.valid) { - state := true.B - } - ) - wResponseFeedback.foreach(state => - when(laneResponseFeedback.head.valid) { - state := true.B - } - ) // enqueue write for last slot - vrfWriteQueue.enq.valid := stageValidReg.get && schedulerFinish && !pipeEnqueue.get.decodeResult(Decoder.sWrite) + vrfWriteQueue.enq.valid := stageValidReg.get && !pipeEnqueue.get.decodeResult(Decoder.sWrite) // UInt(5.W) + UInt(3.W), use `+` here vrfWriteQueue.enq.bits.vd := pipeEnqueue.get.vd + pipeEnqueue.get.groupCounter( @@ -166,8 +123,8 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** Cross-lane writing is over */ val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _) - enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) - val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady + enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && vrfWriteReady) + val dequeueFire = stageValidReg.get && CrossLaneWriteOver && vrfWriteReady stageValidReg.foreach { data => when(dequeueFire ^ enqueue.fire) { data := enqueue.fire diff --git 
a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala new file mode 100644 index 000000000..1108128c0 --- /dev/null +++ b/t1/src/laneStage/MaskExchangeUnit.scala @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl._ +import org.chipsalliance.t1.rtl.decoder.Decoder +import org.chipsalliance.dwbb.stdlib.queue.{Queue, QueueIO} + +@instantiable +class MaskExchangeUnit(parameter: LaneParameter) extends Module { + @public + val enqueue: DecoupledIO[LaneStage3Enqueue] = + IO(Flipped(Decoupled(new LaneStage3Enqueue(parameter, true)))) + + @public + val dequeue: DecoupledIO[LaneStage3Enqueue] = + IO(Decoupled(new LaneStage3Enqueue(parameter, true))) + + @public + val maskReq: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter))) + + @public + val maskRequestToLSU: Bool = IO(Output(Bool())) + + @public + val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter)))) + + @public + val maskResponseRelease: Bool = IO(Output(Bool())) + + val maskUnitWriteQueue: QueueIO[MaskUnitExeResponse] = + Queue.io(new MaskUnitExeResponse(parameter), parameter.maskUnitVefWriteQueueSize) + + // todo: sSendResponse -> sendResponse + val enqIsMaskRequest: Bool = !enqueue.bits.sSendResponse + val enqSendToDeq: Bool = !enqueue.bits.decodeResult(Decoder.maskUnit) + + // todo: connect mask request & response + maskReq.valid := enqIsMaskRequest && enqueue.valid + maskReq.bits.source1 := enqueue.bits.pipeData + maskReq.bits.source2 := enqueue.bits.data + maskReq.bits.index := enqueue.bits.instructionIndex + + maskRequestToLSU := enqueue.bits.loadStore + + // type change MaskUnitExeResponse -> LaneStage3Enqueue + val maskUnitResponsePipeType: LaneStage3Enqueue = 
WireDefault(enqueue.bits) + maskUnitResponsePipeType.groupCounter := maskUnitWriteQueue.deq.bits.writeData.groupCounter + maskUnitResponsePipeType.data := maskUnitWriteQueue.deq.bits.writeData.data + maskUnitResponsePipeType.mask := maskUnitWriteQueue.deq.bits.writeData.mask + maskUnitResponsePipeType.vd := maskUnitWriteQueue.deq.bits.writeData.vd + maskUnitResponsePipeType.instructionIndex := maskUnitWriteQueue.deq.bits.index + maskUnitResponsePipeType.ffoByOtherLanes := enqueue.bits.ffoByOtherLanes + + maskUnitWriteQueue.enq.valid := maskUnitResponse.valid + maskUnitWriteQueue.enq.bits := maskUnitResponse.bits + + val enqWantToSend: Bool = enqueue.valid && enqSendToDeq + dequeue.valid := (enqueue.valid && enqSendToDeq) || maskUnitWriteQueue.deq.valid + dequeue.bits := Mux(enqWantToSend, enqueue.bits, maskUnitResponsePipeType) + enqueue.ready := dequeue.ready + maskUnitWriteQueue.deq.ready := dequeue.ready && !enqWantToSend + maskResponseRelease := maskUnitWriteQueue.deq.fire +} diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index 61212900f..37da14da5 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -102,6 +102,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { @public val dataInWritePipe: UInt = IO(Output(UInt(parameter.chainingSize.W))) + @public + val maskUnitLastReport: UInt = IO(Input(UInt(parameter.chainingSize.W))) + def tokenUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) @@ -115,12 +118,15 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { } // todo: Precise feedback - def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { + def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt, clear: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) val d = deqWire(i) + val 
c = clear(i) val change = Mux(e, 1.U(tokenWith.W), -1.S(tokenWith.W).asUInt) - when((e ^ d) && (e || t =/= 0.U)) { + when(c) { + t := 0.U + }.elsewhen((e ^ d) && (e || t =/= 0.U)) { t := t + change } } @@ -133,7 +139,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val enqOH = indexToOH(enqReport.bits.instructionIndex, parameter.chainingSize) val writeDoEnq: UInt = - maskAnd(enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite), enqOH).asUInt + maskAnd( + enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite) && + !enqReport.bits.decodeResult(Decoder.maskUnit), + enqOH + ).asUInt val writeDoDeq: UInt = maskAnd( @@ -141,7 +151,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { indexToOH(slotWriteReport(slotIndex).bits, parameter.chainingSize) ).asUInt - val pendingSlotWrite = tokenUpdate(writeToken, writeDoEnq, writeDoDeq) + val writeEnqSelect: UInt = Wire(UInt(parameter.chainingSize.W)) + + val pendingSlotWrite = tokenUpdate(writeToken, writeEnqSelect, writeDoDeq) if (slotIndex == 0) { val responseToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) @@ -182,13 +194,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { responseFeedbackReport.bits ) val feedbackDoDeq: UInt = - maskAnd(responseFeedbackReport.valid, indexToOH(feedbackIndexSelect, parameter.chainingSize)).asUInt + maskAnd(responseFeedbackReport.valid, indexToOH(responseFeedbackReport.bits, parameter.chainingSize)).asUInt + + writeEnqSelect := writeDoEnq | feedbackDoDeq val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq) // todo: Precise feedback - val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq) + val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq, maskUnitLastReport) pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback } else { + writeEnqSelect := 
writeDoEnq pendingSlotWrite } }.reduce(_ | _) diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 6d1b86b1e..acf53874e 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -58,7 +58,7 @@ case class LSUParameter( */ val vLenBits: Int = log2Ceil(vLen) + 1 - val sourceQueueSize: Int = vLen * 8 / (transferSize * 8) + val sourceQueueSize: Int = 32.min(vLen * 8 / (transferSize * 8)) def mshrParam: MSHRParam = MSHRParam(chainingSize, datapathWidth, vLen, laneNumber, paWidth, transferSize, vrfReadLatency) diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index a5afb5a05..1fb72096b 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -248,8 +248,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { } // 连接 alignedDequeue - val needSendTail: Bool = bufferBaseCacheLineIndex === cacheLineNumberReg - memRequest.valid := bufferValid || (canSendTail && needSendTail) + val needSendTail: Bool = bufferBaseCacheLineIndex === cacheLineNumberReg + val addressQueueFree: Bool = Wire(Bool()) + memRequest.valid := (bufferValid || (canSendTail && needSendTail)) && addressQueueFree // aligned memRequest.bits.data := multiShifter(right = false, multiSize = 8)(dataBuffer.head ## cacheLineTemp, initOffset) >> cacheLineTemp.getWidth @@ -263,12 +264,14 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { 0.U(param.cacheLineBits.W) memRequest.bits.address := alignedDequeueAddress - val addressQueueSize: Int = (param.vLen * 8) / (param.datapathWidth * param.laneNumber) + 1 + // todo: param outstanding + val addressQueueSize: Int = 32.min((param.vLen * 8) / (param.datapathWidth * param.laneNumber) + 1) // address Wait For Response val addressQueue: QueueIO[UInt] = Queue.io(UInt(param.paWidth.W), addressQueueSize) addressQueue.enq.valid := memRequest.fire addressQueue.enq.bits := alignedDequeueAddress addressQueue.deq.ready := storeResponse + addressQueueFree := 
addressQueue.enq.ready status.idle := !bufferValid && !readStageValid && readQueueClear && !bufferFull && !addressQueue.deq.valid val idleNext: Bool = RegNext(status.idle, true.B) diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala new file mode 100644 index 000000000..a50cc849c --- /dev/null +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.dwbb.stdlib.queue.{Queue, QueueIO} + +class BitLevelWriteRequest(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val bitMask: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) +} + +class BitLevelMaskWrite(parameter: T1Parameter) extends Module { + // todo + val readVRFLatency: Int = 2 + + val needWAR: Bool = IO(Input(Bool())) + val vd: UInt = IO(Input(UInt(5.W))) + + val in: Seq[DecoupledIO[BitLevelWriteRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Decoupled(new BitLevelWriteRequest(parameter)))) + } + + val out: Seq[DecoupledIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Decoupled(new MaskUnitExeResponse(parameter.laneParam))) + } + + val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Decoupled( + new VRFReadRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits + ) + ) + ) + } + + val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(UInt(parameter.datapathWidth.W))) + } + + val stageClear: Bool = IO(Output(Bool())) + + val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) => + val reqQueue: QueueIO[BitLevelWriteRequest] = 
Queue.io(chiselTypeOf(req.bits), 4) + val readPort = readChannel(index) + val readData = readResult(index) + val res = out(index) + + val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency) + val readReady = !needWAR || readPort.ready + + reqQueue.enq <> req + WaitReadQueue.enq.valid := reqQueue.deq.valid && readReady + WaitReadQueue.enq.bits := reqQueue.deq.bits + reqQueue.deq.ready := WaitReadQueue.enq.ready && readReady + + readPort.valid := reqQueue.deq.valid && needWAR + readPort.bits := DontCare + readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt + readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) + + val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid + val readResultValid = !needWAR || readValidPipe + + val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | + (readData & (~WaitReadQueue.deq.bits.bitMask).asUInt) + + res.valid := WaitReadQueue.deq.valid && readResultValid + WaitReadQueue.deq.ready := res.ready && readResultValid + res.bits := DontCare + res.bits.writeData.data := Mux(needWAR, WARData, WaitReadQueue.deq.bits.data) + res.bits.writeData.mask := maskEnable(!needWAR, WaitReadQueue.deq.bits.mask) + res.bits.writeData.groupCounter := WaitReadQueue.deq.bits.groupCounter + + // valid token + val counter = RegInit(0.U(3.W)) + val counterChange = Mux(req.fire, 1.U(3.W), 7.U(3.W)) + when(req.fire ^ res.fire) { + counter := counter + counterChange + } + counter === 0.U + } + stageClear := stageClearVec.reduce(_ && _) +} diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala new file mode 100644 index 000000000..83ef055de --- /dev/null +++ b/t1/src/mask/MaskCompress.scala @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ 
+
+class CompressInput(parameter: T1Parameter) extends Bundle { // request payload consumed by MaskCompress
+  val maskType: Bool = Bool()
+  val eew: UInt = UInt(2.W) // decoded one-hot (eew1H) in MaskCompress to pick the 1 / 2 / 4 byte variants
+  val uop: UInt = UInt(3.W) // MaskCompress decodes "b001" as compress, "b000" as viota, "b101" as mv
+  val readFromScalar: UInt = UInt(parameter.datapathWidth.W) // driven onto out.data when uop is mv
+  val source1: UInt = UInt(parameter.datapathWidth.W) // ANDed with mask to form the per-element select bits
+  val mask: UInt = UInt(parameter.datapathWidth.W)
+  val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) // data that compress selects from
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) // forwarded to out.groupCounter unless compress
+  val lastCompress: Bool = Bool() // a fire with this bit set raises compressTailValid when uop is compress
+}
+
+class CompressOutput(parameter: T1Parameter) extends Bundle { // combinational result bundle of MaskCompress
+  val data: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
+  val mask: UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) // one bit per byte of data
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W)
+  val compressValid: Bool = Bool()
+}
+
+class MaskCompress(parameter: T1Parameter) extends Module { // compress / viota / mv datapath, selected by in.bits.uop
+  val in: ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter))))
+  val out: CompressOutput = IO(Output(new CompressOutput(parameter)))
+  val newInstruction: Bool = IO(Input(Bool())) // clears compressInit and compressWriteGroupCount
+
+  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 // bytes in one laneNumber * datapathWidth group
+
+  val compress = in.bits.uop === "b001".U
+  val viota = in.bits.uop === "b000".U
+  val mv = in.bits.uop === "b101".U // NOTE(review): header table in MaskUnit.scala lists vmv as "01 01 x" — confirm "b101"
+
+  val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0)
+  val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) // running count carried from request to request
+  val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) // compressVec(i): count before element i
+  val compressMaskVec: Seq[Bool] = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools
+  val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) =>
+    compressVec(index) := pre // record the count seen so far, then add this element's select bit
+    pre + mask
+  }
+
+  when(newInstruction) {
+    compressInit := 0.U
+  }
+
+  val countSplit: Seq[(Bool, UInt)] = Seq(0, 1, 2).map { sewInt => // per sew: (any count bit above countWidth, resized count)
+    val dataByte = 1 << sewInt
+    val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte
+    val countWidth = log2Ceil(elementSizePerSet)
+    val compressDeqValid = (compressCount >> countWidth).asUInt.orR
+    val compressUpdate = changeUIntSize(compressCount, countWidth)
+    (compressDeqValid, compressUpdate)
+  }
+
+  val compressDeqValid: Bool = Mux1H(eew1H, countSplit.map(_._1)) || !compress // forced true when uop is not compress
+  val compressCountSelect: UInt = Mux1H(eew1H, countSplit.map(_._2))
+
+  when(in.fire) {
+    when(viota) {
+      compressInit := compressCount // viota keeps the full running count
+    }.otherwise {
+      // every other uop stores the count resized to the eew-selected countWidth (compressCountSelect)
+      compressInit := compressCountSelect
+    }
+  }
+
+  val viotaResult: UInt = Mux1H(
+    eew1H,
+    Seq(1, 2, 4).map { eew =>
+      VecInit(Seq.tabulate(parameter.laneNumber) { index =>
+        // element width is eew * 8 bits; NOTE(review): 4 / eew hard-codes a 32-bit data path — confirm
+        val dataSize = 4 / eew
+        val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
+          changeUIntSize(compressVec(dataSize * index + i), eew * 8)
+        }
+        // concatenated elements of one lane's data path
+        VecInit(res).asUInt
+      }).asUInt
+    }
+  )
+  val viotaMask: UInt = Mux1H(
+    eew1H,
+    Seq(1, 2, 4).map { eew =>
+      VecInit(Seq.tabulate(parameter.laneNumber) { index =>
+        val dataSize = 4 / eew
+        val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
+          val maskIndex: Int = (parameter.datapathWidth - 1).min(dataSize * index + i) // clamped to the width of in.bits.mask
+          Fill(eew, in.bits.mask(maskIndex))
+        }
+        // 4 bit byte mask for one lane (dataSize elements * eew bytes)
+        VecInit(res).asUInt
+      }).asUInt
+    }
+  )
+
+  val tailCount: UInt = { // compressInit resized to the count width of the 1-byte element case
+    val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8
+    val maxCountWidth = log2Ceil(minElementSizePerSet)
+    changeUIntSize(compressInit, maxCountWidth)
+  }
+
+  val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) // half of compressResult kept on in.fire
+  val compressTailValid: Bool = RegInit(false.B)
+  val compressWriteGroupCount: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W))
+  val compressDataVec = Seq(1, 2, 4).map { dataByte =>
+    val dataBit = dataByte * 8
+    val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte
+    VecInit(Seq.tabulate(elementSizePerSet * 2) { index => // two sets worth of output slots
+      val hitReq =
+        Seq.tabulate(elementSizePerSet)(maskIndex => compressMaskVec(maskIndex) && compressVec(maskIndex) === index.U)
+      val selectReqData = Mux1H(
+        hitReq,
+        cutUInt(in.bits.source2, dataBit)
+      )
+      if (index < elementSizePerSet) { // first set: slots below tailCount are taken from compressDataReg
+        val useTail = index.U < tailCount
+        val tailData = cutUInt(compressDataReg, dataBit)(index)
+        Mux(useTail, tailData, selectReqData)
+      } else {
+        selectReqData
+      }
+    }).asUInt
+  }
+  val compressResult: UInt = Mux1H(eew1H, compressDataVec)
+  val lastCompressEnq: Bool = in.fire && in.bits.lastCompress
+  when(newInstruction || lastCompressEnq || out.compressValid) {
+    compressTailValid := lastCompressEnq && compress
+  }
+
+  when(newInstruction || out.compressValid) {
+    compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U)
+  }
+
+  val splitCompressResult: Vec[UInt] = cutUIntBySize(compressResult, 2)
+  when(in.fire) {
+    compressDataReg := Mux(compressDeqValid, splitCompressResult(1), splitCompressResult(0))
+  }
+
+  // todo: connect & update compressInit
+  val compressMask = Wire(UInt(out.mask.getWidth.W))
+  // todo: optimization
+  val compressTailMask: UInt = Mux1H(
+    eew1H,
+    Seq(0, 1, 2).map { sewInt =>
+      val dataByte = 1 << sewInt
+      val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte
+      VecInit(Seq.tabulate(elementSizePerSet) { elementIndex =>
+        val elementValid = elementIndex.U < tailCount // only elements below tailCount are written
+        val elementMask = Fill(dataByte, elementValid)
+        elementMask
+      }).asUInt
+    }
+  )
+  compressMask := Mux(compressTailValid, compressTailMask, (-1.S(out.mask.getWidth.W)).asUInt) // all ones unless writing the tail
+
+  val mvMask = Mux1H(eew1H, Seq(1.U, 3.U, 15.U)) // 1 / 2 / 4 low mask bits set for eew 0 / 1 / 2
+  val mvData = in.bits.readFromScalar
+
+  out.data := Mux1H(
+    Seq(
+      compress -> compressResult,
+      viota -> viotaResult,
+      mv -> mvData
+    )
+  )
+
+  // todo: compressMask
+  out.mask := Mux1H(
+    Seq(
+      compress -> compressMask,
+      viota -> viotaMask,
+      mv -> mvMask
+    )
+  )
+
+  // todo
+  out.compressValid := compressTailValid || (compressDeqValid && in.fire)
+  out.groupCounter := Mux(compress, compressWriteGroupCount, in.bits.groupCounter)
+}
diff --git a/t1/src/mask/MaskExtend.scala b/t1/src/mask/MaskExtend.scala
new file mode 100644
index 000000000..5df5ed2ee
--- /dev/null
+++ b/t1/src/mask/MaskExtend.scala
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.util._
+
+class ExtendInput(parameter: T1Parameter) extends Bundle { // input bundle of MaskExtend
+  val eew: UInt = UInt(2.W)
+  val uop: UInt = UInt(3.W) // bit 0: sign, bit 2: ratio (0: vf2, 1: vf4); all zero selects the mask-destination path
+  val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) // low bits pick the slice of source2 to extend
+}
+
+class MaskExtend(parameter: T1Parameter) extends Module { // combinational sign/zero extend and mask-destination regroup
+  val in: ExtendInput = IO(Input(new ExtendInput(parameter)))
+  val out: UInt = IO(Output(UInt((parameter.laneNumber * parameter.datapathWidth).W)))
+
+  val eew1H: UInt = UIntToOH(in.eew)(2, 0)
+
+  val isMaskDestination: Bool = !in.uop(2, 0).orR // true only when uop is "b000"
+  val sourceDataVec: Vec[UInt] = cutUInt(in.source2, parameter.datapathWidth)
+  val maskDestinationResult: UInt =
+    Mux1H(
+      eew1H,
+      Seq(4, 2, 1).map { groupSize =>
+        VecInit(sourceDataVec.map { element =>
+          element.asBools // [x] * 32 eg: sew = 1
+            .grouped(groupSize) // [x, x] * 16
+            .toSeq
+            .map(VecInit(_).asUInt) // [xx] * 16
+        }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256
+      }
+    )
+
+  // extend: uop(0) = 1 replicates the source sign bit, uop(0) = 0 fills with zero
+  val sign: Bool = in.uop(0)
+  // extend ratio
+  // todo: Currently only vf2 and vf4
+  // uop(2, 1): 0b10 -> 4, 0b01 -> 2, so uop(2) alone distinguishes them
+  val extendRatio: Bool = in.uop(2)
+
+  // select source2
+  // extendRatio: 0 -> vf2; 1-> vf4
+  val source2: UInt = Mux(
+    extendRatio,
+    Mux1H(
+      UIntToOH(in.groupCounter(1, 0)),
+      cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 4)
+    ),
+    Mux1H(
+      UIntToOH(in.groupCounter(0)),
+      cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 2)
+    )
+  )
+
+  val extendResult: UInt = Mux1H(
+    eew1H(2, 1), // only the 2-byte and 4-byte result widths are generated
+    Seq(2, 4).map { dataWidth =>
+      Mux1H(
+        UIntToOH(extendRatio),
+        Seq(2, 4).map { ratio =>
+          val resWidth = dataWidth * 8
+          val sourceWidth = resWidth / ratio
+          VecInit(cutUInt(source2, sourceWidth).map { sourceData =>
+            Fill(resWidth - sourceWidth, sourceData(sourceWidth - 1) && sign) ## sourceData
+          }).asUInt
+        }
+      )
+    }
+  )
+
+  out := Mux(isMaskDestination, maskDestinationResult, extendResult)
+}
diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala
new file mode 100644
index 000000000..c2242f701
--- /dev/null
+++ b/t1/src/mask/MaskReduce.scala
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.experimental.hierarchy.{Instance, Instantiate}
+import chisel3.util._
+
+class ReduceInput(parameter: T1Parameter) extends Bundle { // request payload consumed by MaskReduce
+  val maskType: Bool = Bool()
+  val eew: UInt = UInt(2.W)
+  val uop: UInt = UInt(3.W) // MaskReduce decodes "b101" as ordered reduce and "b001" as widening reduce
+  val readVS1: UInt = UInt(parameter.datapathWidth.W) // loaded (masked) into reduceInit on newInstruction
+  val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) // one datapathWidth slice per lane
+  val sourceValid: UInt = UInt(parameter.laneNumber.W) // one bit per lane slice of source2
+  val lastGroup: Bool = Bool()
+  val vxrm: UInt = UInt(3.W) // wired to the float adder rounding mode
+  val aluUop: UInt = UInt(4.W) // opcode of the logic unit; bit 2 also drives adder opcode and fCompare isMax
+  val sign: Bool = Bool()
+}
+
+class ReduceOutput(parameter: T1Parameter) extends Bundle { // result of MaskReduce: one data path word plus byte mask
+  val data: UInt = UInt(parameter.datapathWidth.W)
+  val mask: UInt = UInt((parameter.datapathWidth / 8).W)
+}
+
+class MaskReduce(parameter: T1Parameter) extends Module { // state machine folding source2 into reduceInit
+  val in: DecoupledIO[ReduceInput] = IO(Flipped(Decoupled(new ReduceInput(parameter))))
+  val out: ValidIO[ReduceOutput] = IO(Valid(new ReduceOutput(parameter)))
+  val newInstruction: Bool = IO(Input(Bool())) // reloads reduceInit, crossFoldCount and lastFoldCount
+  val validInst: Bool = IO(Input(Bool())) // gates out.bits.mask
+
+  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8
+
+  // todo: uop decode
+  val order: Bool = in.bits.uop === "b101".U
+  val reqWiden: Bool = in.bits.uop === "b001".U
+
+  val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0)
+  val nextFoldCount: Bool = eew1H(0) && !reqWiden // initial lastFoldCount: set for eew = 0 without widening
+
+  // reduce function unit
+  val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth))
+  val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth))
+  // optional units for float reduce, instantiated only when parameter.fpuEnable is set
+  val floatAdder: Option[Instance[FloatAdder]] =
+    Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24)))
+  val flotCompare: Option[Instance[FloatCompare]] =
+    Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24)))
+
+  // init reg
+  val reduceInit: UInt = RegInit(0.U(parameter.datapathWidth.W)) // accumulator
+  val reduceResult: UInt = Wire(UInt(parameter.datapathWidth.W))
+  val crossFoldCount: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W)) // selects the lane slice of reqReg.source2
+  val lastFoldCount: Bool = RegInit(false.B)
+  val updateResult: Bool = Wire(Bool())
+  val sourceValid: Bool = Wire(Bool())
+
+  val reqReg = RegEnable(in.bits, 0.U.asTypeOf(in.bits), in.fire) // request latched on in.fire
+  // todo: handle reqReg.sourceValid
+  val groupLastReduce = crossFoldCount.andR // every bit of crossFoldCount is set
+  val lastFoldEnd = !lastFoldCount
+  val outValid: Bool = WireDefault(false.B)
+  // todo: skip float reduce
+  val skipFlotReduce: Bool = WireDefault(false.B)
+
+  val eew1HReg: UInt = UIntToOH(reqReg.eew)(2, 0)
+  val floatType: Bool = reqReg.uop(2)
+  val NotAdd: Bool = reqReg.uop(1)
+  val widen: Bool = reqReg.uop === "b001".U || reqReg.uop(2, 1) === "b11".U
+  val needFold: Bool = eew1HReg(0) || (eew1HReg(1) && !widen)
+  val writeEEW: UInt = reqReg.eew + widen
+  val writeEEW1H: UInt = UIntToOH(writeEEW)(2, 0)
+  val writeMask: UInt = Fill(2, writeEEW1H(2)) ## !writeEEW1H(0) ## true.B // 4 bit byte mask: 0001 / 0011 / 1111
+
+  // crossFold: reduce between lane
+  // lastFold: reduce in data path
+  // orderRed: order reduce
+  val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4)
+  val state: UInt = RegInit(idle)
+
+  val stateIdle: Bool = state === idle
+  val stateCross: Bool = state === crossFold
+  val stateLast: Bool = state === lastFold
+  val stateOrder: Bool = state === orderRed
+
+  updateResult :=
+    stateLast || ((stateCross || stateOrder) && sourceValid)
+
+  // state update: a request is accepted only in idle
+  in.ready := stateIdle
+  when(stateIdle) {
+    when(in.valid) {
+      state := Mux(order, orderRed, crossFold)
+    }
+  }
+
+  when(stateCross) {
+    when(groupLastReduce) {
+      state := Mux(reqReg.lastGroup && needFold, lastFold, idle)
+      outValid := reqReg.lastGroup && !needFold
+    }
+  }
+
+  when(stateOrder) {
+    when(groupLastReduce) {
+      state := idle
+      outValid := reqReg.lastGroup
+    }
+  }
+
+  when(stateLast) {
+    when(lastFoldEnd) {
+      state := idle
+      outValid := true.B
+    }
+  }
+
+  val updateInitMask: UInt = FillInterleaved(8, writeMask) // writeMask expanded from bytes to bits
+  when(newInstruction) {
+    // todo: update reduceInit when first in.fire
+    reduceInit := in.bits.readVS1 & updateInitMask
+    crossFoldCount := 0.U
+    lastFoldCount := nextFoldCount
+  }
+
+  // count update
+  // todo: stateCross <=> stateOrder ??
+  when(stateCross || stateOrder || in.fire) {
+    crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U)
+  }
+
+  // result update; NOTE(review): updateInitMask follows the write EEW in every state — confirm no byte a later lastFold needs is cleared
+  when(updateResult) {
+    reduceInit := reduceResult & updateInitMask
+  }
+
+  when(stateLast) {
+    lastFoldCount := false.B
+  }
+
+  val selectLaneResult: UInt = Mux1H(
+    UIntToOH(crossFoldCount),
+    cutUInt(reqReg.source2, parameter.datapathWidth)
+  )
+  sourceValid := Mux1H(
+    UIntToOH(crossFoldCount),
+    reqReg.sourceValid.asBools
+  )
+  val reduceDataVec = cutUInt(reduceInit, 8)
+  // lastFoldCount = false => abcd -> xxab | xxcd -> mask 0011
+  // lastFoldCount = true => abcd -> xaxc | xbxd -> mask 0101
+  val lastFoldSource1: UInt = Mux(
+    lastFoldCount,
+    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(1),
+    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(2)
+  )
+  val source2Select: UInt = Mux(stateCross || stateOrder, selectLaneResult, lastFoldSource1)
+
+  // popCount is treated as a reduce add at the top level
+  adder.request.src := VecInit(Seq(reduceInit, source2Select))
+  // todo: pop
+  adder.request.opcode := reqReg.aluUop(2)
+  adder.request.sign := reqReg.sign
+  adder.request.vSew := reqReg.eew
+
+  floatAdder.foreach { fAdder =>
+    fAdder.io.a := reduceInit
+    fAdder.io.b := source2Select
+    fAdder.io.roundingMode := reqReg.vxrm
+  }
+
+  flotCompare.foreach { fCompare =>
+    fCompare.io.a := reduceInit
+    fCompare.io.b := source2Select
+    // aluUop: max -> 12, min -> 8, so bit 2 tells max from min
+    fCompare.io.isMax := reqReg.aluUop(2)
+  }
+
+  logicUnit.req.src := VecInit(Seq(reduceInit, source2Select))
+  logicUnit.req.opcode := reqReg.aluUop
+
+  val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)(
+    Mux(
+      skipFlotReduce,
+      reduceInit,
+      Mux(NotAdd, flotCompare.get.io.out, floatAdder.get.io.out)
+    )
+  )
+  // select result; without an FPU the float path falls back to the integer adder result
+  reduceResult := Mux(
+    floatType,
+    flotReduceResult.getOrElse(adder.response.data),
+    Mux(NotAdd, logicUnit.resp, adder.response.data)
+  )
+
+  out.valid := outValid
+  out.bits.data := reduceResult
+  out.bits.mask := writeMask & Fill(4, validInst) // NOTE(review): 4 bit mask assumes datapathWidth = 32 — confirm
+}
diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala
new file mode 100644
index 000000000..8ee19f63a
--- /dev/null
+++ b/t1/src/mask/MaskUnit.scala
@@ -0,0 +1,914 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.experimental.hierarchy.{instantiable, public}
+import chisel3.util._
+import org.chipsalliance.t1.rtl.decoder.Decoder
+import org.chipsalliance.dwbb.stdlib.queue.{Queue, QueueIO}
+
+// top uop decode
+// uu ii x -> uu: unit index; ii: internal encoding; x: additional encode
+
+// slide & gather unit, need read vrf in mask unit(00)
+// 00 00 x -> slide; x? up: down
+// 00 01 x -> slide1; x? up: down
+// 00 10 x -> gather; x? 16 : sew todo:(multi address check/ index -> data cache?)
+
+// compress & viota unit & vmv(01)
+// These instructions cannot extend their execution width indefinitely.
+// 01 00 x -> x ? compress : viota
+// 01 01 x -> vmv; x: write rd ?
+
+// reduce unit(10) n + 8 + m -> n + 3 + m // Folded into datapath, then folded into sew
+// The Reduce instruction folds the data.
+// Considering the sequential addition, a state machine is needed to control it. +// 10 00 x -> adder; x: widen reduce? +// 10 01 x -> logic; x: dc +// 10 10 x -> floatAdder; x: order? +// 10 11 x -> flotCompare; x: dc + +// extend unit & maskdestination(11) +// These instructions write an entire data path each time they are executed. +// 11 mm x -> s(z)ext; mm: multiple(00, 01, 10); x ? sign : zero +// 11 11 1 -> maskdestination +@instantiable +class MaskUnit(parameter: T1Parameter) extends Module { + // todo: param + val readQueueSize: Int = 4 + val readVRFLatency: Int = 2 + val maskUnitWriteQueueSize: Int = 8 + + @public + val instReq: ValidIO[MaskUnitInstReq] = IO(Flipped(Valid(new MaskUnitInstReq(parameter)))) + + @public + val exeReq: Seq[DecoupledIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Decoupled(new MaskUnitExeReq(parameter.laneParam)))) + } + + @public + val exeResp: Seq[ValidIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Valid(new MaskUnitExeResponse(parameter.laneParam))) + } + + @public + val maskResponseRelease: Seq[Bool] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(Bool())) + } + + @public + val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Decoupled( + new VRFReadRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits + ) + ) + ) + } + + @public + val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(UInt(parameter.datapathWidth.W))) + } + + @public + val writeRD: ValidIO[UInt] = IO(Valid(UInt(parameter.datapathWidth.W))) + + @public + val lastReport: UInt = IO(Output(UInt(parameter.chainingSize.W))) + + // mask + @public + val lsuMaskInput: Vec[UInt] = IO(Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W)))) + + @public + val lsuMaskSelect: Vec[UInt] = + IO(Input(Vec(parameter.lsuMSHRSize, 
UInt(parameter.lsuParameters.maskGroupSizeBits.W)))) + + // mask + @public + val laneMaskInput: Vec[UInt] = IO(Output(Vec(parameter.laneNumber, UInt(parameter.maskGroupWidth.W)))) + + @public + val laneMaskSelect: Vec[UInt] = + IO(Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W)))) + + @public + val laneMaskSewSelect: Vec[UInt] = IO(Input(Vec(parameter.laneNumber, UInt(2.W)))) + + @public + val v0UpdateVec = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(new V0Update(parameter.laneParam)))) + } + + /** duplicate v0 for mask */ + val v0: Vec[UInt] = RegInit( + VecInit(Seq.fill(parameter.vLen / parameter.datapathWidth)(0.U(parameter.datapathWidth.W))) + ) + + // write v0(mask) + v0.zipWithIndex.foreach { case (data, index) => + // 属于哪个lane + val laneIndex: Int = index % parameter.laneNumber + // 取出写的端口 + val v0Write = v0UpdateVec(laneIndex) + // offset + val offset: Int = index / parameter.laneNumber + val maskExt = FillInterleaved(8, v0Write.bits.mask) + when(v0Write.valid && v0Write.bits.offset === offset.U) { + data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) + } + } + + // mask update & select + // lane + // TODO: uarch doc for the regroup + val regroupV0: Seq[UInt] = Seq(4, 2, 1).map { groupSize => + VecInit( + cutUInt(v0.asUInt, groupSize) + .grouped(parameter.laneNumber) + .toSeq + .transpose + .map(seq => VecInit(seq).asUInt) + ).asUInt + } + laneMaskInput.zipWithIndex.foreach { case (input, index) => + val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) + val v0SelectBySew = Mux1H(UIntToOH(laneMaskSewSelect(index))(2, 0), v0ForThisLane) + input := cutUInt(v0SelectBySew, parameter.datapathWidth)(laneMaskSelect(index)) + } + + // lsu + lsuMaskInput.zip(lsuMaskSelect).foreach { case (data, index) => + data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) + } + + val maskedWrite: BitLevelMaskWrite = Module(new BitLevelMaskWrite(parameter)) 
+ + val instReg: MaskUnitInstReq = RegEnable(instReq.bits, 0.U.asTypeOf(instReq.bits), instReq.valid) + // viota mask read vs2. Also pretending to be reading vs1 + val viotaReq: Bool = instReq.bits.decodeResult(Decoder.topUop) === "b01000".U + when(instReq.valid && viotaReq) { instReg.vs1 := instReq.bits.vs2 } + // register for read vs1 + val readVS1Reg: MaskUnitReadVs1 = RegInit(0.U.asTypeOf(new MaskUnitReadVs1(parameter))) + val sew1H: UInt = UIntToOH(instReg.sew)(2, 0) + // request for read vs1 + val readVS1Req: MaskUnitReadReq = WireDefault(0.U.asTypeOf(new MaskUnitReadReq(parameter))) + + when(instReq.valid) { + readVS1Reg.requestSend := false.B + readVS1Reg.dataValid := false.B + readVS1Reg.sendToExecution := false.B + readVS1Reg.readIndex := 0.U + } + + // from decode + val unitType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(4, 3)) + val subType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(2, 1)) + val readType: Bool = unitType(0) + val gather16: Bool = instReg.decodeResult(Decoder.topUop) === "b00101".U + val maskDestinationType: Bool = instReg.decodeResult(Decoder.topUop) === "b11000".U + val compress: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b0100?") + val viota: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b01000") + val orderReduce: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b101?1") + val extendType: Bool = unitType(3) && (subType(2) || subType(1)) + + val allGroupExecute: Bool = maskDestinationType || unitType(2) || compress + val useDefaultSew: Bool = unitType(0) + // todo: decode ? 
+ // Indicates how many times a set of data will be executed + // 0 -> 4 times + // 1 -> 2 times + // 3 -> 1 times + val dataSplitSew: UInt = Mux1H( + Seq( + useDefaultSew -> instReg.sew, + // extend + (unitType(3) && subType(2)) -> 0.U, + (unitType(3) && subType(1)) -> 1.U, + allGroupExecute -> 2.U + ) + ) + + // Indicates that an element will use the width of the original data + val sourceDataUseDefaultSew: Bool = !(unitType(3) || gather16) + val sourceDataEEW: UInt = Mux1H( + Seq( + sourceDataUseDefaultSew -> instReg.sew, + // extend + unitType(3) -> (instReg.sew >> subType(2, 1)).asUInt, + gather16 -> 1.U + ) + ) + + // ExecuteIndex is only related to how many times it will be executed, so use [dataSplitSew] + val lastExecuteIndex: UInt = Mux1H(UIntToOH(dataSplitSew), Seq(3.U(2.W), 2.U(2.W), 0.U(2.W))) + + // calculate last group + val sourceDataEEW1H: UInt = UIntToOH(sourceDataEEW)(2, 0) + val lastElementIndex: UInt = (instReg.vl - instReg.vl.orR)(parameter.laneParam.vlMaxBits - 2, 0) + val laneNumberBits: Int = 1.max(log2Ceil(parameter.laneNumber)) + + /** For an instruction, the last group is not executed by all lanes, here is the last group of the instruction xxxxx + * xxx xx -> vsew = 0 xxxxxx xxx x -> vsew = 1 xxxxxxx xxx -> vsew = 2 + */ + val lastGroupForOther: UInt = Mux1H( + sourceDataEEW1H, + Seq( + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 2), + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 1), + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits) + ) + ) + + val groupSizeForMaskDestination: Int = parameter.laneNumber * parameter.datapathWidth + val lastGroupForMaskDestination: UInt = (lastElementIndex >> log2Ceil(groupSizeForMaskDestination)).asUInt + val lastGroupForInstruction: UInt = Mux(maskDestinationType, lastGroupForMaskDestination, lastGroupForOther) + + /** Which lane the last element is in. 
*/ + val lastLaneIndex: UInt = Mux1H( + sourceDataEEW1H, + Seq( + lastElementIndex(laneNumberBits + 2 - 1, 2), + lastElementIndex(laneNumberBits + 1 - 1, 1), + lastElementIndex(laneNumberBits - 1, 0) + ) + ) + val lastGroupDataNeedForOther: UInt = scanRightOr(UIntToOH(lastLaneIndex)) + + val elementTailForMaskDestination = lastElementIndex(log2Ceil(groupSizeForMaskDestination) - 1, 0) + // xxx -> widthForLaneIndex + // .. -> widthForDataPath + // .. -> tailMsB + // 0 -> ..xxx.. + // 1 -> ...xxx. + // 2 -> ....xxx + val lastGroupDataNeedForMaskDestination: UInt = Mux1H( + sourceDataEEW1H, + VecInit(Seq(0, 1, 2).map { sewInt => + val widthForDataPath = 2 - sewInt + val widthForLaneIndex = log2Ceil(parameter.laneNumber) + val tailMsB = elementTailForMaskDestination >> (widthForDataPath + widthForLaneIndex) + val allDataNeed = tailMsB.asUInt.orR + scanRightOr(UIntToOH(elementTailForMaskDestination(widthForDataPath + widthForLaneIndex - 1, widthForDataPath))) | + Fill(parameter.laneNumber, allDataNeed) + }) + ) + + val lastGroupDataNeed: UInt = Mux(maskDestinationType, lastGroupDataNeedForMaskDestination, lastGroupDataNeedForOther) + + val exeRequestQueue: Seq[QueueIO[MaskUnitExeReq]] = exeReq.map { req => + // todo: max or token? 
+ val queue: QueueIO[MaskUnitExeReq] = Queue.io(chiselTypeOf(req.bits), 64, flow = true) + queue.enq.valid := req.valid + req.ready := queue.enq.ready + queue.enq.bits := req.bits + queue + } + + val exeReqReg: Seq[ValidIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + RegInit(0.U.asTypeOf(Valid(new MaskUnitExeReq(parameter.laneParam)))) + } + val requestCounter: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W)) + val executeGroupCounter: UInt = Wire(UInt(parameter.laneParam.groupNumberBits.W)) + + val counterValid: Bool = requestCounter <= lastGroupForInstruction + val lastGroup: Bool = + requestCounter === lastGroupForInstruction || (!orderReduce && unitType(2)) + + val viotaCounterAdd: Bool = Wire(Bool()) + val groupCounterAdd: Bool = Mux(viota, viotaCounterAdd, exeRequestQueue.head.deq.fire) + when(instReq.valid || groupCounterAdd) { + requestCounter := Mux(instReq.valid, 0.U, requestCounter + 1.U) + } + + // todo: mask + val groupDataNeed: UInt = Mux(lastGroup, lastGroupDataNeed, (-1.S(parameter.laneNumber.W)).asUInt) + // For read type, only sew * laneNumber data will be consumed each time + // There will be a maximum of (dataPath * laneNumber) / (sew * laneNumber) times + val executeIndex: UInt = RegInit(0.U(2.W)) + // The status of an execution + // Each execution ends with executeIndex + 1 + val readIssueStageState: MaskUnitExecuteState = RegInit(0.U.asTypeOf(new MaskUnitExecuteState(parameter))) + val readIssueStageValid: Bool = RegInit(false.B) + + def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Option[Bool] = None): Seq[UInt] = { + val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt + val positionSize = parameter.laneParam.vlMaxBits - 1 + val dataPosition = (changeUIntSize(elementIndex, positionSize) << sewInt).asUInt(positionSize - 1, 0) + val accessMask: UInt = Seq( + UIntToOH(dataPosition(1, 0)), + FillInterleaved(2, UIntToOH(dataPosition(1))), + 15.U(4.W) + )(sewInt) + // The offset of the data 
starting position in 32 bits (currently only 32). + // Since the data may cross lanes, it will be optimized during fusion. + // (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) + val dataOffset: UInt = + (if (sewInt < 2) dataPosition(1) else false.B) ## + (if (sewInt == 0) dataPosition(0) else false.B) + val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) + // 32 bit / group + val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt + val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits + val offset = dataGroup(offsetWidth - 1, 0) + val accessRegGrowth = (dataGroup >> offsetWidth).asUInt + val decimalProportion = offset ## accessLane + // 1/8 register + val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) + + /** elementIndex needs to be compared with vlMax(vLen * lmul /sew) This calculation is too complicated We can change + * the angle. Calculate the increment of the read register and compare it with lmul to know whether the index + * exceeds vlMax. 
vlmul needs to distinguish between integers and floating points + */ + val overlap = + (vlmul(2) && decimal >= intLMULInput(3, 1)) || + (!vlmul(2) && accessRegGrowth >= intLMULInput) || + (elementIndex >> log2Ceil(parameter.vLen)).asUInt.orR + val elementValid = valid.getOrElse(true.B) + val notNeedRead = overlap || !elementValid + val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3) + Seq(accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid) + } + + // datapath bit per mask group + // laneNumber bit per execute group + val executeGroup: UInt = Mux1H( + sew1H, + Seq( + requestCounter ## executeIndex, + requestCounter ## executeIndex(1), + requestCounter + ) + ) + + // read vl boundary + val executeSizeBit: Int = log2Ceil(parameter.laneNumber) + val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR + val lastexecuteGroup: UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign + val isVlBoundary: Bool = executeGroup === lastexecuteGroup + val validExecuteGroup: Bool = executeGroup <= lastexecuteGroup + val vlBoundaryCorrection: UInt = Mux( + vlMisAlign && isVlBoundary, + (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt, + -1.S(parameter.laneNumber.W).asUInt + ) & Fill(parameter.laneNumber, validExecuteGroup) + + // handle mask + val readMaskSelect: UInt = + (executeGroup >> log2Ceil(parameter.datapathWidth / parameter.laneNumber)).asUInt + val readMaskInput: UInt = cutUInt(v0.asUInt, parameter.maskGroupWidth)(readMaskSelect) + val selectReadStageMask: UInt = cutUIntBySize(readMaskInput, 4)(executeGroup(1, 0)) + val readMaskCorrection: UInt = + Mux(instReg.maskType, selectReadStageMask, -1.S(parameter.laneNumber.W).asUInt) & + vlBoundaryCorrection + + // write mask for normal execute + val maskSplit = Seq(0, 1, 2).map { sewInt => + // byte / element + val dataByte = 1 << sewInt + val rowElementSize: Int = parameter.laneNumber * parameter.datapathWidth / dataByte / 8 + val maskSelect = cutUInt(v0.asUInt, 
rowElementSize)(executeGroupCounter) + + val executeSizeBit: Int = log2Ceil(rowElementSize) + val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR + val lastexecuteGroup: UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign + val isVlBoundary: Bool = executeGroupCounter === lastexecuteGroup + val validExecuteGroup: Bool = executeGroupCounter <= lastexecuteGroup + val vlBoundaryCorrection: UInt = maskEnable( + vlMisAlign && isVlBoundary, + (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt + ) & Fill(rowElementSize, validExecuteGroup) + val elementMask = maskEnable(instReg.maskType, maskSelect) & vlBoundaryCorrection + val byteMask = FillInterleaved(dataByte, elementMask) + (byteMask, elementMask) + } + val executeByteMask: UInt = Mux1H(sew1H, maskSplit.map(_._1)) + val executeElementMask: UInt = Mux1H(sew1H, maskSplit.map(_._2)) + + // mask for destination + val maskForDestination: UInt = cutUInt(v0.asUInt, groupSizeForMaskDestination)(requestCounter) + val lastGroupMask: UInt = scanRightOr(UIntToOH(elementTailForMaskDestination)) + val currentMaskGroupForDestination: UInt = maskEnable(lastGroup, lastGroupMask) & + maskEnable(instReg.maskType, maskForDestination) + + val checkVec: Seq[Seq[UInt]] = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + // All data of this group + val groupSourceData: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + val groupSourceValid: UInt = VecInit(exeReqReg.map(_.valid)).asUInt + // Single use length + val singleWidth = dataByte * 8 * parameter.laneNumber + // How many times will a set of data be executed? + val executeTimes = (parameter.datapathWidth / 8) / dataByte + // Which part is selected as the source data this time? 
+ val executeDataSelect1H: UInt = if (sewInt == 0) { + UIntToOH(executeIndex) + } else if (sewInt == 1) { + UIntToOH(executeIndex(1)) + } else { + true.B + } + // Select source data + val sourceSelect = Mux1H( + executeDataSelect1H, + cutUInt(groupSourceData, singleWidth) + ) + val validSelect: UInt = Mux1H( + executeDataSelect1H, + cutUInt(groupSourceValid, singleWidth / parameter.datapathWidth) + ) + + // The length of an element + val dataWidth = 8 * dataByte + // Split into elements + val source = cutUInt(sourceSelect, dataWidth) + val validVec = FillInterleaved(parameter.datapathWidth / dataWidth, validSelect) & readMaskCorrection + // read index check + // (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) + val checkResultVec: Seq[Seq[UInt]] = source.zipWithIndex.map { case (s, i) => + indexAnalysis(sewInt)(s, instReg.vlmul, Some(validVec(i))) + } + val checkResult = checkResultVec.transpose.map(a => VecInit(a).asUInt) + checkResult + } + val sewCorrection1H: UInt = sourceDataEEW1H + val dataOffsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(1))) + val accessLaneSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(2))) + val offsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(3))) + val growthSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(4))) + val notReadSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(5))) + val elementValidSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(6))) + + val readCrossBar: MaskUnitReadCrossBar = Module(new MaskUnitReadCrossBar(parameter)) + + // The queue waiting to read data. This queue contains other information about this group. + // 64: todo: max or token? 
+ val readWaitQueue: QueueIO[MaskUnitWaitReadQueue] = Queue.io(new MaskUnitWaitReadQueue(parameter), 64) + + // s0 pipe request from lane + val laseExecuteGroupDeq: Bool = Wire(Bool()) + exeRequestQueue.zip(exeReqReg).foreach { case (req, reg) => + req.deq.ready := !reg.valid || laseExecuteGroupDeq || viota + when(req.deq.fire) { + reg.bits := req.deq.bits + } + when(req.deq.fire ^ laseExecuteGroupDeq) { + reg.valid := req.deq.fire && !viota + } + } + + val isLastExecuteGroup: Bool = executeIndex === lastExecuteIndex + val allDataValid: Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid || !groupDataNeed(i) }.reduce(_ && _) + val anyDataValid: Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid }.reduce(_ || _) + + // try to read vs1 + val readVs1Valid: Bool = (unitType(2) || compress) && !readVS1Reg.requestSend + readVS1Req.vs := instReg.vs1 + when(compress) { + val logLaneNumber = log2Ceil(parameter.laneNumber) + readVS1Req.vs := instReg.vs1 + (readVS1Reg.readIndex >> (parameter.laneParam.vrfOffsetBits + logLaneNumber)) + readVS1Req.offset := readVS1Reg.readIndex >> logLaneNumber + readVS1Req.readLane := changeUIntSize(readVS1Reg.readIndex, logLaneNumber) + } + + // select execute group + val selectExecuteReq: Seq[ValidIO[MaskUnitReadReq]] = exeReqReg.zipWithIndex.map { case (_, index) => + val res: ValidIO[MaskUnitReadReq] = WireInit(0.U.asTypeOf(Valid(new MaskUnitReadReq(parameter)))) + res.bits.vs := instReg.vs2 + readIssueStageState.vsGrowth(index) + if (parameter.laneParam.vrfOffsetBits > 0) { + res.bits.offset := readIssueStageState.readOffset(index) + } + res.bits.readLane := readIssueStageState.accessLane(index) + res.bits.dataOffset := cutUIntBySize(readIssueStageState.readDataOffset, parameter.laneNumber)(index) + res.bits.requestIndex := index.U + res.valid := readIssueStageValid && !readIssueStageState.groupReadState(index) && + readIssueStageState.needRead(index) && unitType(0) + if (index == 0) { + when(readVs1Valid) { + res.valid := 
true.B + res.bits := readVS1Req + } + } + res + } + + when(readCrossBar.input.head.fire) { + readVS1Reg.requestSend := true.B + } + + // read arbitration + readCrossBar.input.zip(selectExecuteReq).foreach { case (cross, req) => + cross.valid := req.valid + cross.bits := req.bits + } + + // read control register update + val readFire: UInt = VecInit(readCrossBar.input.map(_.fire)).asUInt + val anyReadFire: Bool = readFire.orR + val readStateUpdate: UInt = readFire | readIssueStageState.groupReadState + val groupReadFinish: Bool = readStateUpdate === readIssueStageState.needRead + val readTypeRequestDeq: Bool = + (anyReadFire && groupReadFinish) || (readIssueStageValid && readIssueStageState.needRead === 0.U) + + val viotaValid: Bool = viota && counterValid && instReg.vl.orR + val vs1DataValid: Bool = readVS1Reg.dataValid || !(unitType(2) || compress) + val executeReady: Bool = Wire(Bool()) + // todo: remove, it will do nothing when vl=0 + val sendInitData: Bool = unitType(2) && !readVS1Reg.sendToExecution && instReg.vl === 0.U + val otherTypeRequestDeq: Bool = (allDataValid || sendInitData || viotaValid) && vs1DataValid + val requestStageDeq: Bool = Mux(readType, readTypeRequestDeq, otherTypeRequestDeq && executeReady) + val readIssueStageEnq: Bool = (allDataValid && readTypeRequestDeq) || !readIssueStageValid + when(anyReadFire) { + readIssueStageState.groupReadState := readStateUpdate + } + + when(readTypeRequestDeq ^ readIssueStageEnq) { + readIssueStageValid := readIssueStageEnq + } + + val executeIndexGrowth: UInt = (1.U << dataSplitSew).asUInt + when(requestStageDeq && anyDataValid) { + executeIndex := executeIndex + executeIndexGrowth + } + when(readIssueStageEnq) { + readIssueStageState.groupReadState := 0.U + readIssueStageState.needRead := (~notReadSelect).asUInt + readIssueStageState.elementValid := elementValidSelect + readIssueStageState.accessLane := cutUIntBySize(accessLaneSelect, parameter.laneNumber) + readIssueStageState.vsGrowth := 
cutUIntBySize(growthSelect, parameter.laneNumber) + readIssueStageState.readOffset := offsetSelect + readIssueStageState.groupCount := requestCounter + readIssueStageState.executeIndex := executeIndex + readIssueStageState.readDataOffset := dataOffsetSelect + readIssueStageState.last := isVlBoundary + } + + readWaitQueue.enq.valid := readTypeRequestDeq + readWaitQueue.enq.bits.groupCounter := readIssueStageState.groupCount + readWaitQueue.enq.bits.executeIndex := readIssueStageState.executeIndex + readWaitQueue.enq.bits.sourceValid := readIssueStageState.elementValid + readWaitQueue.enq.bits.needRead := readIssueStageState.needRead + readWaitQueue.enq.bits.last := readIssueStageState.last + + // last execute group in this request group dequeue + laseExecuteGroupDeq := requestStageDeq && isLastExecuteGroup + + // s1 read vrf + val write1HPipe: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W))) + + readCrossBar.output.zipWithIndex.foreach { case (request, index) => + val sourceLane = UIntToOH(request.bits.writeIndex) + readChannel(index).valid := request.valid + readChannel(index).bits.readSource := 2.U + readChannel(index).bits.vs := request.bits.vs + readChannel(index).bits.offset := request.bits.offset + readChannel(index).bits.instructionIndex := instReg.instructionIndex + request.ready := readChannel(index).ready + + maskedWrite.readChannel(index).ready := readChannel(index).ready + maskedWrite.readResult(index) := readResult(index) + when(maskDestinationType) { + readChannel(index).valid := maskedWrite.readChannel(index).valid + readChannel(index).bits.vs := maskedWrite.readChannel(index).bits.vs + readChannel(index).bits.offset := maskedWrite.readChannel(index).bits.offset + } + + // pipe read fire + val pipeRead = Pipe(readChannel(index).fire, sourceLane, readVRFLatency) + val pipeOffset = Pipe(readChannel(index).fire, 
request.bits.dataOffset, readVRFLatency) + write1HPipe(index) := Mux(pipeRead.valid, pipeRead.bits, 0.U(parameter.laneNumber.W)) + pipeDataOffset(index) := pipeOffset.bits + } + + // Processing read results + val readData: Seq[DecoupledIO[UInt]] = Seq.tabulate(parameter.laneNumber) { index => + // todo: assert enq.read & use token + val readDataQueue = Queue.io(UInt(parameter.datapathWidth.W), 4, flow = true) + val readResultSelect = VecInit(write1HPipe.map(_(index))).asUInt + val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) + readDataQueue.enq.valid := readResultSelect.orR + readDataQueue.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.deq + } + + /** todo: [[waiteReadDataPipeReg]] enq && [[readWaitQueue]] enq * */ + // reg before execute + val waiteReadDataPipeReg: MaskUnitWaitReadQueue = RegInit(0.U.asTypeOf(new MaskUnitWaitReadQueue(parameter))) + val waiteReadData: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => RegInit(0.U(parameter.datapathWidth.W)) } + val waiteReadSate: UInt = RegInit(0.U(parameter.laneNumber.W)) + val waiteReadStageValid: Bool = RegInit(false.B) + + // Process the data that needs to be written + val dlen: Int = parameter.datapathWidth * parameter.laneNumber + // Execute at most 4 times, each index represents 1/4 of dlen + val eachIndexSize = dlen / 4 + val writeDataVec = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + val data = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => + val dataElement: UInt = Wire(UInt((dataByte * 8).W)) + val dataIsRead = waiteReadDataPipeReg.needRead(laneIndex) + // todo: select vs1 when slide1 + dataElement := Mux(dataIsRead, waiteReadData(laneIndex), 0.U) + dataElement + }).asUInt + + val shifterData = (data << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize).W))).asUInt + // align + changeUIntSize(shifterData, dlen) + } + val writeData = Mux1H(sew1H, writeDataVec) + + val writeMaskVec: Seq[UInt] = Seq(0, 1, 
2).map { sewInt => + val MaskMagnification = 1 << sewInt + val mask = FillInterleaved(MaskMagnification, waiteReadDataPipeReg.sourceValid) + val shifterMask = (mask << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize / 8).W))).asUInt + // align + changeUIntSize(shifterMask, dlen / 8) + } + val writeMask = Mux1H(sew1H, writeMaskVec) + + val writeRequest: Seq[MaskUnitExeResponse] = Seq.tabulate(parameter.laneNumber) { laneIndex => + val res: MaskUnitExeResponse = Wire(new MaskUnitExeResponse(parameter.laneParam)) + res.ffoByOther := false.B + res.index := instReg.instructionIndex + res.writeData.groupCounter := waiteReadDataPipeReg.groupCounter + res.writeData.vd := instReg.vd + res.writeData.data := cutUIntBySize(writeData, parameter.laneNumber)(laneIndex) + res.writeData.mask := cutUIntBySize(writeMask, parameter.laneNumber)(laneIndex) + res + } + val WillWriteLane: UInt = VecInit(cutUIntBySize(writeMask, parameter.laneNumber).map(_.orR)).asUInt + + // update waite read stage + val waiteStageDeqValid: Bool = + waiteReadStageValid && + (waiteReadSate === waiteReadDataPipeReg.needRead || waiteReadDataPipeReg.needRead === 0.U) + val waiteStageDeqReady: Bool = Wire(Bool()) + val waiteStageDeqFire: Bool = waiteStageDeqValid && waiteStageDeqReady + + val waiteStageEnqReady: Bool = !waiteReadStageValid || waiteStageDeqFire + val waiteStageEnqFire: Bool = readWaitQueue.deq.valid && waiteStageEnqReady + + readWaitQueue.deq.ready := waiteStageEnqReady + + when(waiteStageEnqFire) { + waiteReadDataPipeReg := readWaitQueue.deq.bits + } + + when(waiteStageDeqFire ^ waiteStageEnqFire) { + waiteReadStageValid := waiteStageEnqFire + } + + waiteReadData.zipWithIndex.foreach { case (reg, index) => + val isWaiteForThisData = waiteReadDataPipeReg.needRead(index) && !waiteReadSate(index) && waiteReadStageValid + val read = readData(index) + read.ready := isWaiteForThisData + if (index == 0) { + read.ready := isWaiteForThisData || unitType(2) || compress + when(read.fire) 
{ + readVS1Reg.data := read.bits + readVS1Reg.dataValid := true.B + } + } + when(read.fire) { + reg := read.bits + } + } + val readResultValid: UInt = VecInit(readData.map(_.fire)).asUInt + when(waiteStageEnqFire && readResultValid.orR) { + waiteReadSate := readResultValid + }.elsewhen(readResultValid.orR) { + waiteReadSate := waiteReadSate | readResultValid + }.elsewhen(waiteStageEnqFire) { + waiteReadSate := 0.U + } + + // Determine whether the data is ready + val executeEnqValid: Bool = otherTypeRequestDeq && !readType + + // start execute + val compressUnit: MaskCompress = Module(new MaskCompress(parameter)) + val reduceUnit: MaskReduce = Module(new MaskReduce(parameter)) + val extendUnit: MaskExtend = Module(new MaskExtend(parameter)) + + // todo + val source2: UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt + val source1: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + + // compress data + // compress executes a whole set of data + val vs1Split: Seq[(UInt, Bool)] = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + // For compress, a set of data requires vs1Size bits of vs1 + val vs1Size = (parameter.datapathWidth / 8) * parameter.laneNumber / dataByte + // How many sets of vs1 can a dataPath have? 
+ val setSize = parameter.datapathWidth / vs1Size + val vs1SetIndex: UInt = + if (parameter.datapathWidth <= vs1Size) true.B + else + requestCounter(log2Ceil(setSize) - 1, 0) + val selectVS1: UInt = + if (parameter.datapathWidth <= vs1Size) readVS1Reg.data + else + cutUIntBySize(readVS1Reg.data, setSize)(vs1SetIndex) + val willChangeVS1Index = vs1SetIndex.andR + (selectVS1, willChangeVS1Index) + } + + val source1Data: UInt = Mux1H(sew1H, vs1Split.map(_._1)) + val source1Change: Bool = Mux1H(sew1H, vs1Split.map(_._2)) + when(source1Change && compressUnit.in.fire) { + readVS1Reg.dataValid := false.B + readVS1Reg.requestSend := false.B + readVS1Reg.readIndex := readVS1Reg.readIndex + 1.U + + } + viotaCounterAdd := compressUnit.in.fire + + compressUnit.in.valid := executeEnqValid && unitType(1) + compressUnit.in.bits.maskType := instReg.maskType + compressUnit.in.bits.eew := instReg.sew + compressUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + compressUnit.in.bits.readFromScalar := instReg.readFromScala + compressUnit.in.bits.source1 := source1Data + compressUnit.in.bits.mask := executeElementMask + compressUnit.in.bits.source2 := source2 + compressUnit.in.bits.groupCounter := requestCounter + compressUnit.in.bits.lastCompress := lastGroup + compressUnit.newInstruction := instReq.valid + + reduceUnit.in.valid := executeEnqValid && unitType(2) + reduceUnit.in.bits.maskType := instReg.maskType + reduceUnit.in.bits.eew := instReg.sew + reduceUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + reduceUnit.in.bits.readVS1 := readVS1Reg.data + reduceUnit.in.bits.source2 := source2 + reduceUnit.in.bits.sourceValid := VecInit(exeReqReg.map(_.valid)).asUInt + reduceUnit.in.bits.lastGroup := lastGroup + reduceUnit.in.bits.vxrm := instReg.vxrm + reduceUnit.in.bits.aluUop := instReg.decodeResult(Decoder.uop) + reduceUnit.in.bits.sign := !instReg.decodeResult(Decoder.unsigned1) + reduceUnit.newInstruction := !readVS1Reg.sendToExecution && reduceUnit.in.fire + 
reduceUnit.validInst := instReg.vl.orR + when(reduceUnit.in.fire) { + readVS1Reg.sendToExecution := true.B + } + + val extendGroupCount: UInt = Mux( + extendType, + Mux( + subType(2), + requestCounter ## executeIndex, + requestCounter ## executeIndex(1) + ), + requestCounter + ) + extendUnit.in.eew := instReg.sew + extendUnit.in.uop := instReg.decodeResult(Decoder.topUop) + extendUnit.in.source2 := source2 + extendUnit.in.groupCounter := extendGroupCount + + val executeResult: UInt = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.data, + reduceUnit.out.bits.data, + extendUnit.out + ) + ) + + // todo + executeReady := Mux1H( + unitType, + Seq( + true.B, // read type + true.B, // compress + reduceUnit.in.ready && readVS1Reg.dataValid, // reduce + executeEnqValid // extend unit + ) + ) + + val executeValid: Bool = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.compressValid, + false.B, + executeEnqValid + ) + ) + + executeGroupCounter := Mux1H( + unitType(3, 1), + Seq( + requestCounter, + requestCounter, + extendGroupCount + ) + ) + + val executeDeqGroupCounter: UInt = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.groupCounter, + requestCounter, + extendGroupCount + ) + ) + + val executeWriteByteMask: UInt = Mux(compress, compressUnit.out.mask, executeByteMask) + maskedWrite.needWAR := maskDestinationType + maskedWrite.vd := instReg.vd + maskedWrite.in.zipWithIndex.foreach { case (req, index) => + req.valid := executeValid + req.bits.mask := cutUIntBySize(executeWriteByteMask, parameter.laneNumber)(index) + req.bits.data := cutUInt(executeResult, parameter.datapathWidth)(index) + req.bits.bitMask := cutUInt(currentMaskGroupForDestination, parameter.datapathWidth)(index) + req.bits.groupCounter := executeDeqGroupCounter + if (index == 0) { + // reduce result + when(unitType(2)) { + req.valid := reduceUnit.out.valid + req.bits.mask := reduceUnit.out.bits.mask + req.bits.data := reduceUnit.out.bits.data + req.bits.groupCounter := 0.U + } + } + } + + // mask 
unit write queue + val writeQueue: Seq[QueueIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + Queue.io(new MaskUnitExeResponse(parameter.laneParam), maskUnitWriteQueueSize) + } + + writeQueue.zipWithIndex.foreach { case (queue, index) => + val readTypeWriteVrf: Bool = waiteStageDeqFire && WillWriteLane(index) + queue.enq.valid := maskedWrite.out(index).valid || readTypeWriteVrf + maskedWrite.out(index).ready := queue.enq.ready + queue.enq.bits := maskedWrite.out(index).bits + when(readTypeWriteVrf) { + queue.enq.bits := writeRequest(index) + } + queue.enq.bits.ffoByOther := false.B // todo + queue.enq.bits.index := instReg.instructionIndex + + // write token + val tokenCounter = RegInit(0.U(log2Ceil(parameter.maskUnitVefWriteQueueSize + 1).W)) + val tokenAllow: Bool = queue.deq.fire + val counterChange: UInt = Mux(tokenAllow, 1.U, -1.S(tokenCounter.getWidth.W).asUInt) + when(tokenAllow ^ maskResponseRelease(index)) { + tokenCounter := tokenCounter + counterChange + } + // write vrf + val writePort = exeResp(index) + queue.deq.ready := !tokenCounter.asBools.last + writePort.valid := tokenAllow + writePort.bits := queue.deq.bits + writePort.bits.writeData.vd := instReg.vd + } + waiteStageDeqReady := writeQueue.zipWithIndex.map { case (queue, index) => + !WillWriteLane(index) || queue.enq.ready + }.reduce(_ && _) + writeRD <> DontCare + + // todo: token + val waiteLastRequest: Bool = RegInit(false.B) + val waitQueueClear: Bool = RegInit(false.B) + val lastReportValid = waitQueueClear && !writeQueue.map(_.deq.valid).reduce(_ || _) + when(lastReportValid) { + waitQueueClear := false.B + waiteLastRequest := false.B + } + when(!readType && requestStageDeq && lastGroup) { + waiteLastRequest := true.B + } + val executeStageInvalid: Bool = Mux1H( + unitType(3, 1), + Seq( + !compressUnit.out.compressValid, + reduceUnit.in.ready, + true.B + ) + ) + val executeStageClean: Bool = Mux( + readType, + waiteStageDeqFire && waiteReadDataPipeReg.last, + 
waiteLastRequest && maskedWrite.stageClear && executeStageInvalid
+  )
+  val alwaysNeedExecute: Bool = WireInit(false.B) // todo: mv?
+  val invalidEnq: Bool = instReq.fire && !instReq.bits.vl && !alwaysNeedExecute
+  when(executeStageClean || invalidEnq) {
+    waitQueueClear := true.B
+  }
+  lastReport := maskAnd(
+    lastReportValid,
+    indexToOH(instReg.instructionIndex, parameter.chainingSize)
+  )
+}
diff --git a/t1/src/mask/MaskUnitReadCrossBar.scala b/t1/src/mask/MaskUnitReadCrossBar.scala
new file mode 100644
index 000000000..dab845d9e
--- /dev/null
+++ b/t1/src/mask/MaskUnitReadCrossBar.scala
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.util._
+
+class MaskUnitReadCrossBar(parameter: T1Parameter) extends Module { // laneNumber x laneNumber crossbar for read requests
+  val input: Seq[DecoupledIO[MaskUnitReadReq]] = Seq.tabulate(parameter.laneNumber)(_ => // one port per requester
+    IO(
+      Flipped(
+        Decoupled(
+          new MaskUnitReadReq(parameter)
+        )
+      )
+    )
+  )
+  val output: Seq[DecoupledIO[MaskUnitReadQueue]] = Seq.tabulate(parameter.laneNumber)(_ => // one port per target lane
+    IO(
+      Decoupled(
+        new MaskUnitReadQueue(parameter)
+      )
+    )
+  )
+
+  val inputSelect1H: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) // per input: one-hot of the claimed lane, 0 if none
+
+  input.zipWithIndex.foldLeft(0.U(parameter.laneNumber.W)) { case (laneOccupied, (req, index)) => // laneOccupied: lanes claimed by lower-index inputs
+    val requestReadLane = UIntToOH(req.bits.readLane)
+    // read lane free: the requested lane is not claimed by any lower-index input
+    val free: Bool = (requestReadLane & (~laneOccupied).asUInt).orR
+    val outReady: Bool = Mux1H(requestReadLane, output.map(_.ready)) // ready of the targeted output port
+    req.ready := free && outReady
+    inputSelect1H(index) := Mux(req.valid && free, requestReadLane, 0.U(parameter.laneNumber.W)) // claim ignores outReady: a stalled request still holds its lane
+    laneOccupied | inputSelect1H(index)
+  }
+
+  output.zipWithIndex.foreach { case (req, index) =>
+    val tryToRead: UInt = VecInit(inputSelect1H.map(_(index))).asUInt // bit i set: input i claimed this output
+    req.valid := tryToRead.orR
+    val selectReq: DecoupledIO[MaskUnitReadReq] = Mux1H(tryToRead, input)
+    req.bits.vs := selectReq.bits.vs
+    req.bits.offset := selectReq.bits.offset
+    req.bits.writeIndex := selectReq.bits.requestIndex // forward the requester's index with the read
+    req.bits.dataOffset := selectReq.bits.dataOffset
+  }
+}
diff --git a/t1/src/package.scala b/t1/src/package.scala
index b0afc12be..87b0e1eb0 100644
--- a/t1/src/package.scala
+++ b/t1/src/package.scala
@@ -10,6 +10,8 @@ import chisel3.util.experimental.decode.DecodeBundle
 import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator}
 import org.chipsalliance.t1.rtl.lane.Distributor
 
+import scala.jdk.CollectionConverters._
+
 package object rtl {
   def csa32(s: UInt, c: UInt, a: UInt): (UInt, UInt) = {
     val xor = s ^ c
@@ -41,6 +43,10 @@ package object rtl {
     Mux(mask, data, 0.U.asTypeOf(data))
   }
 
+  def maskEnable(enable: Bool, mask: UInt): UInt = { // `mask` when enabled, otherwise all ones of the same width
+    Mux(enable, mask, (-1.S(mask.getWidth.W)).asUInt.asTypeOf(mask))
+  }
+
   def indexToOH(index: UInt, chainingSize: Int): UInt = {
     UIntToOH(index(log2Ceil(chainingSize) - 1, 0))
   }
@@ -74,6 +80,21 @@ package object rtl {
     })
   }
 
+  def cutUIntBySize(data: UInt, size: Int): Vec[UInt] = { // split `data` into `size` equal-width pieces
+    require(data.getWidth % size == 0) // the width must divide evenly
+    val width: Int = data.getWidth / size
+    cutUInt(data, width) // NOTE(review): cutUInt is defined outside this hunk; presumably slices into `width`-bit chunks
+  }
+
+  def changeUIntSize(data: UInt, size: Int, sign: Boolean = false): UInt = { // resize `data` to exactly `size` bits
+    if (data.getWidth >= size) {
+      data(size - 1, 0) // truncate: keep the low `size` bits
+    } else {
+      val extend = if (sign) data(data.getWidth - 1) else false.B // fill bit: the MSB when `sign`, else zero
+      Fill(size - data.getWidth, extend) ## data // pad on the MSB side
+    }
+  }
+
   def calculateSegmentWriteMask(
     datapath: Int,
     laneNumber: Int,
diff --git a/t1/src/sequencer/T1TokenManager.scala b/t1/src/sequencer/T1TokenManager.scala
index b80ed2dcd..de19e9be0 100644
--- a/t1/src/sequencer/T1TokenManager.scala
+++ b/t1/src/sequencer/T1TokenManager.scala
@@ -7,10 +7,20 @@ import chisel3._
 import chisel3.experimental.hierarchy.{instantiable, public}
 import chisel3.util._
 
+class IssueToken(parameter: T1Parameter) extends Bundle { // payload of T1TokenManager.instructionIssue
+  val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W)
+  val writeV0: Bool = Bool()
+  val useV0AsMask: Bool = Bool()
+
val isLoadStore: Bool = Bool() // NOTE(review): not referenced by the token logic shown in this diff
+}
+
 @instantiable
 class T1TokenManager(parameter: T1Parameter) extends Module {
   @public
-  val writeV0 = IO(Vec(parameter.laneNumber, Flipped(Valid(UInt(parameter.instructionIndexBits.W)))))
+  val instructionIssue: ValidIO[IssueToken] = IO(Flipped(Valid(new IssueToken(parameter)))) // replaces the per-lane writeV0 ports
+
+  @public
+  val issueAllow: Bool = IO(Output(Bool())) // low while the presented instruction has a v0 conflict
 
   @public
   val instructionFinish: Vec[UInt] = IO(Vec(parameter.laneNumber, Input(UInt(parameter.chainingSize.W))))
@@ -18,19 +28,42 @@ class T1TokenManager(parameter: T1Parameter) extends Module {
   @public
   val v0WriteValid = IO(Output(UInt(parameter.chainingSize.W)))
 
-  // v0 write token
-  val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex =>
-    val update: ValidIO[UInt] = writeV0(laneIndex)
-    val clear: UInt = instructionFinish(laneIndex)
-    val updateOH = maskAnd(update.valid, indexToOH(update.bits, parameter.chainingSize)).asUInt
+  val issueIndex1H: UInt = indexToOH(instructionIssue.bits.instructionIndex, parameter.chainingSize) // one-hot slot of the issuing instruction
+
+  // Boolean type token clear & set: one register per chaining slot; when set and clear coincide, set wins
+  def updateBooleanToken(set: UInt, clear: UInt): UInt = {
     VecInit(Seq.tabulate(parameter.chainingSize) { chainingIndex =>
       val res = RegInit(false.B)
-      when(updateOH(chainingIndex) || clear(chainingIndex)) {
-        res := updateOH(chainingIndex)
+      when(set(chainingIndex) || clear(chainingIndex)) {
+        res := set(chainingIndex)
       }
       res
     }).asUInt
   }
 
+  // v0 write token: per lane, set when a v0-writing instruction issues, cleared by that lane's instructionFinish
+  val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex =>
+    val v0WriteIssue = instructionIssue.valid && instructionIssue.bits.writeV0
+    val clear: UInt = instructionFinish(laneIndex)
+    val updateOH = maskAnd(v0WriteIssue, issueIndex1H).asUInt
+    updateBooleanToken(updateOH, clear)
+  }
+
+  val useV0AsMaskToken: UInt = Seq // same per-lane scheme for instructions that use v0 as mask, OR-ed over lanes
+    .tabulate(parameter.laneNumber) { laneIndex =>
+      val useV0Issue = instructionIssue.valid && instructionIssue.bits.useV0AsMask
+      val clear: UInt = instructionFinish(laneIndex)
+      val updateOH = maskAnd(useV0Issue, issueIndex1H).asUInt
+      updateBooleanToken(updateOH, clear)
+    }
+    .reduce(_ | _)
+
   v0WriteValid := v0WriteValidVec.reduce(_ | _)
+
+  // v0 read-write conflict: a v0 writer is presented while a v0-mask token is set, or a v0-mask user while a v0-write token is set
+  val v0Conflict: Bool =
+    (instructionIssue.bits.writeV0 && useV0AsMaskToken.orR) ||
+      (instructionIssue.bits.useV0AsMask && v0WriteValid.orR)
+
+  issueAllow := !(v0Conflict) // NOTE(review): not qualified by instructionIssue.valid; presumably the issuer gates on valid
 }
diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala
index 9c8321fda..155b33f7e 100644
--- a/t1/src/vrf/VRF.scala
+++ b/t1/src/vrf/VRF.scala
@@ -531,7 +531,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
       val dataInLsuQueue = ohCheck(loadDataInLSUWriteQueue, record.bits.instIndex, parameter.chainingSize)
       // elementMask update by write
       val writeUpdateValidVec: Seq[Bool] =
-        writePort.map(p => p.fire && p.bits.instructionIndex === record.bits.instIndex && p.bits.mask(3))
+        writePort.map(p =>
+          p.fire && p.bits.instructionIndex === record.bits.instIndex &&
+            // Only index load will split the datapath into separate parts.
+ (p.bits.mask(3) || !record.bits.ls) + ) val writeUpdate1HVec: Seq[UInt] = writeOH.zip(writeUpdateValidVec).map { case (oh, v) => Mux(v, oh, 0.U) } // elementMask update by read of store instruction val loadUpdateValidVec = @@ -547,7 +551,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val waitLaneClear = record.bits.state.stFinish && record.bits.state.wWriteQueueClear && record.bits.state.wLaneLastReport && record.bits.state.wTopLastReport - val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear + val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear || record.bits.elementMask.andR when(topLastReport) { record.bits.state.stFinish := true.B @@ -609,7 +613,8 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar Mux(older, sourceVdEqSinkVs, sinkVdEqSourceVs) ) val rawForeStore = Mux(older, isStore.head && isSlow.last, isStore.last && isSlow.head) && samVd - (hazardForeLoad, rawForeStore) + // (hazardForeLoad, rawForeStore) todo: need check hazard? + (false.B, false.B) } } writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _)