From e5fb6752d613a6f85e2f93b4c01836ac59a8c90c Mon Sep 17 00:00:00 2001 From: Arthur Perais Date: Wed, 21 Dec 2016 15:04:06 -0600 Subject: cpu: Clarify meaning of cachePorts variable in lsq_unit.hh of O3 cachePorts currently constrains the number of store packets written to the D-Cache each cycle), but loads currently affect this variable. This leads to unexpected congestion (e.g., setting cachePorts to a realistic 1 will in fact allow a store to WB only if no loads have accessed the D-Cache this cycle). In the absence of arbitration, this patch decouples how many loads can be done per cycle from how many stores can be done per cycle. Signed-off-by: Jason Lowe-Power --- src/cpu/o3/O3CPU.py | 3 ++- src/cpu/o3/lsq_unit.hh | 20 ++++++++++++-------- src/cpu/o3/lsq_unit_impl.hh | 14 +++++++------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py index 92f96a3b6..cab2cf34f 100644 --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -52,7 +52,8 @@ class DerivO3CPU(BaseCPU): activity = Param.Unsigned(0, "Initial count") - cachePorts = Param.Unsigned(200, "Cache Ports") + cacheStorePorts = Param.Unsigned(200, "Cache Ports. " + "Constrains stores only. Loads are constrained by load FUs.") decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay") renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay") diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index b1b0aae3a..10d4966e8 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -113,7 +113,7 @@ class LSQUnit { * @todo: Move the number of used ports up to the LSQ level so it can * be shared by all LSQ units. */ - void tick() { usedPorts = 0; } + void tick() { usedStorePorts = 0; } /** Inserts an instruction. */ void insert(DynInstPtr &inst); @@ -429,11 +429,11 @@ class LSQUnit { int storeTail; /// @todo Consider moving to a more advanced model with write vs read ports - /** The number of cache ports available each cycle. */ - int cachePorts; + /** The number of cache ports available each cycle (stores only). */ + int cacheStorePorts; - /** The number of used cache ports in this cycle. */ - int usedPorts; + /** The number of used cache ports in this cycle by stores. */ + int usedStorePorts; //list mshrSeqNums; @@ -765,8 +765,6 @@ LSQUnit::read(Request *req, Request *sreqLow, Request *sreqHigh, load_inst->memData = new uint8_t[req->getSize()]; } - ++usedPorts; - // if we the cache is not blocked, do cache access bool completedFirst = false; PacketPtr data_pkt = Packet::createRead(req); @@ -800,6 +798,11 @@ LSQUnit::read(Request *req, Request *sreqLow, Request *sreqHigh, state->mainPkt = data_pkt; } + // For now, load throughput is constrained by the number of + // load FUs only, and loads do not consume a cache port (only + // stores do). + // @todo We should account for cache port contention + // and arbitrate between loads and stores. bool successful_load = true; if (!dcachePort->sendTimingReq(fst_data_pkt)) { successful_load = false; @@ -811,7 +814,8 @@ LSQUnit::read(Request *req, Request *sreqLow, Request *sreqHigh, // load will be squashed, so indicate this to the state object. // The first packet will return in completeDataAccess and be // handled there. - ++usedPorts; + // @todo We should also account for cache port contention + // here. if (!dcachePort->sendTimingReq(snd_data_pkt)) { // The main packet will be deleted in completeDataAccess. state->complete(); diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 73be5e56f..56f12cbb4 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -176,7 +176,7 @@ LSQUnit::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params, depCheckShift = params->LSQDepCheckShift; checkLoads = params->LSQCheckLoads; - cachePorts = params->cachePorts; + cacheStorePorts = params->cacheStorePorts; needsTSO = params->needsTSO; resetState(); @@ -193,7 +193,7 @@ LSQUnit::resetState() storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; + usedStorePorts = 0; retryPkt = NULL; memDepViolator = NULL; @@ -792,7 +792,7 @@ LSQUnit::writebackStores() storeQueue[storeWBIdx].inst && storeQueue[storeWBIdx].canWB && ((!needsTSO) || (!storeInFlight)) && - usedPorts < cachePorts) { + usedStorePorts < cacheStorePorts) { if (isStoreBlocked) { DPRINTF(LSQUnit, "Unable to write back any more stores, cache" @@ -810,7 +810,7 @@ LSQUnit::writebackStores() continue; } - ++usedPorts; + ++usedStorePorts; if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { incrStIdx(storeWBIdx); @@ -950,8 +950,8 @@ LSQUnit::writebackStores() assert(snd_data_pkt); // Ensure there are enough ports to use. - if (usedPorts < cachePorts) { - ++usedPorts; + if (usedStorePorts < cacheStorePorts) { + ++usedStorePorts; if (sendStore(snd_data_pkt)) { storePostSend(snd_data_pkt); } else { @@ -975,7 +975,7 @@ LSQUnit::writebackStores() } // Not sure this should set it to 0. - usedPorts = 0; + usedStorePorts = 0; assert(stores >= 0 && storesToWB >= 0); } -- cgit v1.2.3