3 files changed, 180 insertions, 69 deletions
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 8c573ca3a..b06b8e7eb 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -156,11 +156,17 @@ class DRAMCtrl(AbstractMemory):
     # to be sent. It is 7.8 us for a 64ms refresh requirement
     tREFI = Param.Latency("Refresh command interval")
 
-    # write-to-read turn around penalty
-    tWTR = Param.Latency("Write to read switching time")
+    # write-to-read, same rank turnaround penalty
+    tWTR = Param.Latency("Write to read, same rank switching time")
 
-    # read-to-write turn around penalty, bus turnaround delay
-    tRTW = Param.Latency("Read to write switching time")
+    # read-to-write, same rank turnaround penalty
+    tRTW = Param.Latency("Read to write, same rank switching time")
+
+    # rank-to-rank bus delay penalty
+    # this does not correlate to a memory timing parameter and encompasses:
+    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
+    # different rank bus delay
+    tCS = Param.Latency("Rank to rank switching time")
 
     # minimum row activate to row activate delay time
     tRRD = Param.Latency("ACT to ACT delay")
@@ -221,9 +227,12 @@ class DDR3_1600_x64(DRAMCtrl):
     # Greater of 4 CK or 7.5 ns
     tRTP = '7.5ns'
 
-    # Default read-to-write bus around to 2 CK, @800 MHz = 2.5 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
     tRTW = '2.5ns'
 
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
     # <=85C, half for >85C
     tREFI = '7.8us'
 
@@ -296,9 +305,12 @@ class DDR4_2400_x64(DRAMCtrl):
     # Greater of 4 CK or 7.5 ns
     tRTP = '7.5ns'
 
-    # Default read-to-write bus around to 2 CK, @1200 MHz = 1.666 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
     tRTW = '1.666ns'
 
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
     # <=85C, half for >85C
     tREFI = '7.8us'
 
@@ -353,9 +365,12 @@ class DDR3_1333_x64_DRAMSim2(DRAMCtrl):
     # Greater of 4 CK or 7.5 ns, 4 CK @ 666.66 MHz = 6 ns
     tWTR = '7.5ns'
 
-    # Default read-to-write bus around to 2 CK, @666.66 MHz = 3 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @666.66 MHz = 3 ns
     tRTW = '3ns'
 
+    # Default different rank bus delay to 2 CK, @666.66 MHz = 3 ns
+    tCS = '3ns'
+
     tRRD = '6.0ns'
 
     tXAW = '30ns'
@@ -416,9 +431,12 @@ class LPDDR2_S4_1066_x32(DRAMCtrl):
     # Irrespective of speed grade, tWTR is 7.5 ns
     tWTR = '7.5ns'
 
-    # Default read-to-write bus around to 2 CK, @533 MHz = 3.75 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
     tRTW = '3.75ns'
 
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
     # Activate to activate irrespective of density and speed grade
     tRRD = '10.0ns'
 
@@ -473,9 +491,12 @@ class WideIO_200_x128(DRAMCtrl):
     # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
     tWTR = '15ns'
 
-    # Default read-to-write bus around to 2 CK, @200 MHz = 10 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
     tRTW = '10ns'
 
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
     # Activate to activate irrespective of density and speed grade
     tRRD = '10.0ns'
 
@@ -536,9 +557,12 @@ class LPDDR3_1600_x32(DRAMCtrl):
     # Irrespective of speed grade, tWTR is 7.5 ns
     tWTR = '7.5ns'
 
-    # Default read-to-write bus around to 2 CK, @800 MHz = 2.5 ns
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
     tRTW = '2.5ns'
 
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
     # Activate to activate irrespective of density and speed grade
     tRRD = '10.0ns'
 
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index 1d96e274c..ca562f4f7 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -76,7 +76,7 @@ DRAMCtrl::DRAMCtrl(const DRAMCtrlParams* p) :
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
     writesThisTime(0), readsThisTime(0),
-    tCK(p->tCK), tWTR(p->tWTR), tRTW(p->tRTW), tBURST(p->tBURST),
+    tCK(p->tCK), tWTR(p->tWTR), tRTW(p->tRTW), tCS(p->tCS), tBURST(p->tBURST),
     tRCD(p->tRCD), tCL(p->tCL), tRP(p->tRP), tRAS(p->tRAS), tWR(p->tWR),
     tRTP(p->tRTP), tRFC(p->tRFC), tREFI(p->tREFI), tRRD(p->tRRD),
     tXAW(p->tXAW), activationLimit(p->activation_limit),
@@ -87,7 +87,8 @@ DRAMCtrl::DRAMCtrl(const DRAMCtrlParams* p) :
     backendLatency(p->static_backend_latency),
     busBusyUntil(0), refreshDueAt(0), refreshState(REF_IDLE),
     pwrStateTrans(PWR_IDLE), pwrState(PWR_IDLE), prevArrival(0),
-    nextReqTime(0), pwrStateTick(0), numBanksActive(0)
+    nextReqTime(0), pwrStateTick(0), numBanksActive(0),
+    activeRank(0)
 {
     // create the bank states based on the dimensions of the ranks and
     // banks
@@ -683,7 +684,7 @@ DRAMCtrl::processRespondEvent()
 }
 
 void
-DRAMCtrl::chooseNext(std::deque<DRAMPacket*>& queue)
+DRAMCtrl::chooseNext(std::deque<DRAMPacket*>& queue, bool switched_cmd_type)
 {
     // This method does the arbitration between requests. The chosen
     // packet is simply moved to the head of the queue. The other
@@ -699,13 +700,13 @@ DRAMCtrl::chooseNext(std::deque<DRAMPacket*>& queue)
     if (memSchedPolicy == Enums::fcfs) {
         // Do nothing, since the correct request is already head
     } else if (memSchedPolicy == Enums::frfcfs) {
-        reorderQueue(queue);
+        reorderQueue(queue, switched_cmd_type);
     } else
         panic("No scheduling policy chosen\n");
 }
 
 void
-DRAMCtrl::reorderQueue(std::deque<DRAMPacket*>& queue)
+DRAMCtrl::reorderQueue(std::deque<DRAMPacket*>& queue, bool switched_cmd_type)
 {
     // Only determine this when needed
     uint64_t earliest_banks = 0;
@@ -713,6 +714,7 @@ DRAMCtrl::reorderQueue(std::deque<DRAMPacket*>& queue)
     // Search for row hits first, if no row hit is found then schedule the
     // packet to one of the earliest banks available
     bool found_earliest_pkt = false;
+    bool found_prepped_diff_rank_pkt = false;
     auto selected_pkt_it = queue.begin();
 
     for (auto i = queue.begin(); i != queue.end() ; ++i) {
@@ -720,25 +722,30 @@ DRAMCtrl::reorderQueue(std::deque<DRAMPacket*>& queue)
         const Bank& bank = dram_pkt->bankRef;
         // Check if it is a row hit
         if (bank.openRow == dram_pkt->row) {
-            // FCFS within the hits
-            DPRINTF(DRAM, "Row buffer hit\n");
-            selected_pkt_it = i;
-            break;
-        } else if (!found_earliest_pkt) {
-            // No row hit, go for first ready
+            if (dram_pkt->rank == activeRank || switched_cmd_type) {
+                // FCFS within the hits, giving priority to commands
+                // that access the same rank as the previous burst
+                // to minimize bus turnaround delays
+                // Only give rank prioity when command type is not changing
+                DPRINTF(DRAM, "Row buffer hit\n");
+                selected_pkt_it = i;
+                break;
+            } else if (!found_prepped_diff_rank_pkt) {
+                // found row hit for command on different rank than prev burst
+                selected_pkt_it = i;
+                found_prepped_diff_rank_pkt = true;
+            }
+        } else if (!found_earliest_pkt & !found_prepped_diff_rank_pkt) {
+            // No row hit and
+            // haven't found an entry with a row hit to a new rank
             if (earliest_banks == 0)
-                earliest_banks = minBankActAt(queue);
-
-            // simplistic approximation of when the bank can issue an
-            // activate, this is calculated in minBankActAt and could
-            // be cached
-            Tick act_at = bank.openRow == Bank::NO_ROW ?
-                bank.actAllowedAt :
-                std::max(bank.preAllowedAt, curTick()) + tRP;
-
-            // Bank is ready or is the first available bank
-            if (act_at <= curTick() ||
-                bits(earliest_banks, dram_pkt->bankId, dram_pkt->bankId)) {
+                // Determine entries with earliest bank prep delay
+                // Function will give priority to commands that access the
+                // same rank as previous burst and can prep the bank seamlessly
+                earliest_banks = minBankPrep(queue, switched_cmd_type);
+
+            // FCFS - Bank is first available bank
+            if (bits(earliest_banks, dram_pkt->bankId, dram_pkt->bankId)) {
                 // Remember the packet to be scheduled to one of the earliest
                 // banks available, FCFS amongst the earliest banks
                 selected_pkt_it = i;
@@ -983,6 +990,9 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt)
     // read/write (add a max with tCCD here)
     bank.colAllowedAt = cmd_at + tBURST;
 
+    // Save rank of current access
+    activeRank = dram_pkt->rank;
+
     // If this is a write, we also need to respect the write recovery
     // time before a precharge, in the case of a read, respect the
     // read to precharge constraint
@@ -1095,6 +1105,9 @@ DRAMCtrl::doDRAMAccess(DRAMPacket* dram_pkt)
 void
 DRAMCtrl::processNextReqEvent()
 {
+    // pre-emptively set to false.  Overwrite if in READ_TO_WRITE
+    // or WRITE_TO_READ state
+    bool switched_cmd_type = false;
     if (busState == READ_TO_WRITE) {
         DPRINTF(DRAM, "Switching to writes after %d reads with %d reads "
                 "waiting\n", readsThisTime, readQueue.size());
@@ -1106,6 +1119,7 @@ DRAMCtrl::processNextReqEvent()
 
         // now proceed to do the actual writes
         busState = WRITE;
+        switched_cmd_type = true;
     } else if (busState == WRITE_TO_READ) {
         DPRINTF(DRAM, "Switching to reads after %d writes with %d writes "
                 "waiting\n", writesThisTime, writeQueue.size());
@@ -1114,6 +1128,7 @@ DRAMCtrl::processNextReqEvent()
         writesThisTime = 0;
 
         busState = READ;
+        switched_cmd_type = true;
     }
 
     if (refreshState != REF_IDLE) {
@@ -1160,10 +1175,26 @@ DRAMCtrl::processNextReqEvent()
         } else {
             // Figure out which read request goes next, and move it to the
             // front of the read queue
-            chooseNext(readQueue);
+            chooseNext(readQueue, switched_cmd_type);
 
             DRAMPacket* dram_pkt = readQueue.front();
 
+            // here we get a bit creative and shift the bus busy time not
+            // just the tWTR, but also a CAS latency to capture the fact
+            // that we are allowed to prepare a new bank, but not issue a
+            // read command until after tWTR, in essence we capture a
+            // bubble on the data bus that is tWTR + tCL
+            if (switched_cmd_type) {
+                // add a bubble to the data bus for write-to-read turn around
+                // or tCS (different rank bus delay).
+                busBusyUntil += (dram_pkt->rank == activeRank) ? tWTR + tCL :
+                                                                 tCS;
+            } else if (dram_pkt->rank != activeRank) {
+                // add a bubble to the data bus, as defined by the
+                // tCS parameter for rank-to-rank delay
+                busBusyUntil += tCS;
+            }
+
             doDRAMAccess(dram_pkt);
 
             // At this point we're done dealing with the request
@@ -1197,21 +1228,23 @@ DRAMCtrl::processNextReqEvent()
         if (switch_to_writes) {
             // transition to writing
             busState = READ_TO_WRITE;
-
-            // add a bubble to the data bus, as defined by the
-            // tRTW parameter
-            busBusyUntil += tRTW;
-
-            // update the minimum timing between the requests,
-            // this shifts us back in time far enough to do any
-            // bank preparation
-            nextReqTime = busBusyUntil - (tRP + tRCD + tCL);
         }
     } else {
-        chooseNext(writeQueue);
+        chooseNext(writeQueue, switched_cmd_type);
         DRAMPacket* dram_pkt = writeQueue.front();
         // sanity check
         assert(dram_pkt->size <= burstSize);
+
+        if (switched_cmd_type) {
+            // add a bubble to the data bus, as defined by the
+            // tRTW or tCS parameter, depending on whether changing ranks
+            busBusyUntil += (dram_pkt->rank == activeRank) ? tRTW : tCS;
+        } else if (dram_pkt->rank != activeRank) {
+            // add a bubble to the data bus, as defined by the
+            // tCS parameter for rank-to-rank delay
+            busBusyUntil += tCS;
+        }
+
         doDRAMAccess(dram_pkt);
 
         writeQueue.pop_front();
@@ -1232,17 +1265,6 @@ DRAMCtrl::processNextReqEvent()
             // case, which eventually will check for any draining and
             // also pause any further scheduling if there is really
             // nothing to do
-
-            // here we get a bit creative and shift the bus busy time not
-            // just the tWTR, but also a CAS latency to capture the fact
-            // that we are allowed to prepare a new bank, but not issue a
-            // read command until after tWTR, in essence we capture a
-            // bubble on the data bus that is tWTR + tCL
-            busBusyUntil += tWTR + tCL;
-
-            // update the minimum timing between the requests, this shifts
-            // us back in time far enough to do any bank preparation
-            nextReqTime = busBusyUntil - (tRP + tRCD + tCL);
         }
     }
 
@@ -1259,12 +1281,19 @@ DRAMCtrl::processNextReqEvent()
 }
 
 uint64_t
-DRAMCtrl::minBankActAt(const deque<DRAMPacket*>& queue) const
+DRAMCtrl::minBankPrep(const deque<DRAMPacket*>& queue,
+                      bool switched_cmd_type) const
 {
     uint64_t bank_mask = 0;
     Tick min_act_at = MaxTick;
 
-    // deterimne if we have queued transactions targetting a
+    uint64_t bank_mask_same_rank = 0;
+    Tick min_act_at_same_rank = MaxTick;
+
+    // Give precedence to commands that access same rank as previous command
+    bool same_rank_match = false;
+
+    // determine if we have queued transactions targetting the
     // bank in question
     vector<bool> got_waiting(ranksPerChannel * banksPerRank, false);
     for (auto p = queue.begin(); p != queue.end(); ++p) {
@@ -1280,23 +1309,64 @@ DRAMCtrl::minBankActAt(const deque<DRAMPacket*>& queue) const
             if (got_waiting[bank_id]) {
                 // simplistic approximation of when the bank can issue
                 // an activate, ignoring any rank-to-rank switching
-                // cost
+                // cost in this calculation
                 Tick act_at = banks[i][j].openRow == Bank::NO_ROW ?
                     banks[i][j].actAllowedAt :
                     std::max(banks[i][j].preAllowedAt, curTick()) + tRP;
 
-                if (act_at <= min_act_at) {
-                    // reset bank mask if new minimum is found
-                    if (act_at < min_act_at)
-                        bank_mask = 0;
-                    // set the bit corresponding to the available bank
-                    replaceBits(bank_mask, bank_id, bank_id, 1);
-                    min_act_at = act_at;
+                // prioritize commands that access the
+                // same rank as previous burst
+                // Calculate bank mask separately for the case and
+                // evaluate after loop iterations complete
+                if (i == activeRank && ranksPerChannel > 1) {
+                    if (act_at <= min_act_at_same_rank) {
+                        // reset same rank bank mask if new minimum is found
+                        // and previous minimum could not immediately send ACT
+                        if (act_at < min_act_at_same_rank &&
+                            min_act_at_same_rank > curTick())
+                            bank_mask_same_rank = 0;
+
+                        // Set flag indicating that a same rank
+                        // opportunity was found
+                        same_rank_match = true;
+
+                        // set the bit corresponding to the available bank
+                        replaceBits(bank_mask_same_rank, bank_id, bank_id, 1);
+                        min_act_at_same_rank = act_at;
+                    }
+                } else {
+                    if (act_at <= min_act_at) {
+                        // reset bank mask if new minimum is found
+                        // and either previous minimum could not immediately send ACT
+                        if (act_at < min_act_at && min_act_at > curTick())
+                            bank_mask = 0;
+                        // set the bit corresponding to the available bank
+                        replaceBits(bank_mask, bank_id, bank_id, 1);
+                        min_act_at = act_at;
+                    }
                 }
             }
         }
     }
 
+    // Determine the earliest time when the next burst can issue based
+    // on the current busBusyUntil delay.
+    // Offset by tRCD to correlate with ACT timing variables
+    Tick min_cmd_at = busBusyUntil - tCL - tRCD;
+
+    // Prioritize same rank accesses that can issue B2B
+    // Only optimize for same ranks when the command type
+    // does not change; do not want to unnecessarily incur tWTR
+    //
+    // Resulting FCFS prioritization Order is:
+    // 1) Commands that access the same rank as previous burst
+    //    and can prep the bank seamlessly.
+    // 2) Commands (any rank) with earliest bank prep
+    if (!switched_cmd_type && same_rank_match &&
+        min_act_at_same_rank <= min_cmd_at) {
+        bank_mask = bank_mask_same_rank;
+    }
+
     return bank_mask;
 }
 
diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
index ef8b47a2e..8dcbe817b 100644
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -368,23 +368,36 @@ class DRAMCtrl : public AbstractMemory
      * The memory schduler/arbiter - picks which request needs to
      * go next, based on the specified policy such as FCFS or FR-FCFS
      * and moves it to the head of the queue.
+     * Prioritizes accesses to the same rank as previous burst unless
+     * controller is switching command type.
+     *
+     * @param queue Queued requests to consider
+     * @param switched_cmd_type Command type is changing
      */
-    void chooseNext(std::deque<DRAMPacket*>& queue);
+    void chooseNext(std::deque<DRAMPacket*>& queue, bool switched_cmd_type);
 
     /**
      * For FR-FCFS policy reorder the read/write queue depending on row buffer
      * hits and earliest banks available in DRAM
+     * Prioritizes accesses to the same rank as previous burst unless
+     * controller is switching command type.
+     *
+     * @param queue Queued requests to consider
+     * @param switched_cmd_type Command type is changing
      */
-    void reorderQueue(std::deque<DRAMPacket*>& queue);
+    void reorderQueue(std::deque<DRAMPacket*>& queue, bool switched_cmd_type);
 
     /**
      * Find which are the earliest banks ready to issue an activate
      * for the enqueued requests. Assumes maximum of 64 banks per DIMM
+     * Also checks if the bank is already prepped.
      *
-     * @param Queued requests to consider
+     * @param queue Queued requests to consider
+     * @param switched_cmd_type Command type is changing
      * @return One-hot encoded mask of bank indices
      */
-    uint64_t minBankActAt(const std::deque<DRAMPacket*>& queue) const;
+    uint64_t minBankPrep(const std::deque<DRAMPacket*>& queue,
+                         bool switched_cmd_type) const;
 
     /**
      * Keep track of when row activations happen, in order to enforce
@@ -475,6 +488,7 @@ class DRAMCtrl : public AbstractMemory
     const Tick M5_CLASS_VAR_USED tCK;
     const Tick tWTR;
     const Tick tRTW;
+    const Tick tCS;
     const Tick tBURST;
     const Tick tRCD;
     const Tick tCL;
@@ -664,6 +678,9 @@ class DRAMCtrl : public AbstractMemory
     // To track number of banks which are currently active
     unsigned int numBanksActive;
 
+    // Holds the value of the rank of burst issued
+    uint8_t activeRank;
+
     /** @todo this is a temporary workaround until the 4-phase code is
      * committed. upstream caches needs this packet until true is returned, so
      * hold onto it for deletion until a subsequent call