3 files changed, 390 insertions, 194 deletions
diff --git a/src/mem/SimpleDRAM.py b/src/mem/SimpleDRAM.py
index ec76542d8..b066b27de 100644
--- a/src/mem/SimpleDRAM.py
+++ b/src/mem/SimpleDRAM.py
@@ -10,6 +10,9 @@
 # unmodified and in its entirety in all distributions of the software,
 # modified or unmodified, in source code or in binary form.
 #
+# Copyright (c) 2013 Amin Farmahini-Farahani
+# All rights reserved.
+#
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met: redistributions of source code must retain the above copyright
@@ -118,7 +121,12 @@ class SimpleDRAM(AbstractMemory):
     static_backend_latency = Param.Latency("10ns", "Static backend latency")
 
     # the physical organisation of the DRAM
-    lines_per_rowbuffer = Param.Unsigned("Row buffer size in cache lines")
+    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
+                                      "device/chip")
+    burst_length = Param.Unsigned("Burst lenght (BL) in beats")
+    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
+                                           "device/chip")
+    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
     ranks_per_channel = Param.Unsigned("Number of ranks per channel")
     banks_per_rank = Param.Unsigned("Number of banks per rank")
     # only used for the address mapping as the controller by
@@ -141,9 +149,9 @@ class SimpleDRAM(AbstractMemory):
     # time to complete a burst transfer, typically the burst length
     # divided by two due to the DDR bus, but by making it a parameter
     # it is easier to also evaluate SDR memories like WideIO.
-    # This parameter has to account for bus width and burst length.
-    # Adjustment also necessary if cache line size is greater than
-    # data size read/written by one full burst.
+    # This parameter has to account for burst length.
+    # Read/Write requests with data size larger than one full burst are broken
+    # down into multiple requests in the SimpleDRAM controller
     tBURST = Param.Latency("Burst duration (for DDR burst length / 2 cycles)")
 
     # time taken to complete one refresh cycle (N rows in all banks)
@@ -170,15 +178,22 @@ class SimpleDRAM(AbstractMemory):
 
     # tRC  - assumed to be 4 * tRP
 
-    # burst length for an access derived from the cache line size
-
 # A single DDR3 x64 interface (one command and address bus), with
 # default timings based on DDR3-1600 4 Gbit parts in an 8x8
 # configuration, which would amount to 4 Gbyte of memory.
 class DDR3_1600_x64(SimpleDRAM):
-    # Assuming 64 byte cache lines, and a 1kbyte page size per module
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # DDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
     # (this depends on the memory density)
-    lines_per_rowbuffer = 128
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
 
     # Use two ranks
     ranks_per_channel = 2
@@ -191,8 +206,8 @@ class DDR3_1600_x64(SimpleDRAM):
     tCL = '13.75ns'
     tRP = '13.75ns'
 
-    # Assuming 64 byte cache lines, across an x64
-    # interface, translates to BL8, 4 clocks @ 800 MHz
+    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
     tBURST = '5ns'
 
     # DDR3, 4 Gbit has a tRFC of 240 CK and tCK = 1.25 ns
@@ -213,9 +228,18 @@ class DDR3_1600_x64(SimpleDRAM):
 # default timings based on a LPDDR2-1066 4 Gbit part in a 1x32
 # configuration.
 class LPDDR2_S4_1066_x32(SimpleDRAM):
-    # Assuming 64 byte cache lines, use a 1kbyte page size, this
-    # depends on the memory density
-    lines_per_rowbuffer = 16
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
 
     # Use a single rank
     ranks_per_channel = 1
@@ -232,10 +256,11 @@ class LPDDR2_S4_1066_x32(SimpleDRAM):
     # Pre-charge one bank 15 ns (all banks 18 ns)
     tRP = '15ns'
 
-    # Assuming 64 byte cache lines, across a x32 DDR interface
-    # translates to two BL8, 8 clocks @ 533 MHz. Note that this is a
-    # simplification
-    tBURST = '15ns'
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the SimpleDRAM controller
+    tBURST = '7.5ns'
 
     # LPDDR2-S4, 4 Gbit
     tRFC = '130ns'
@@ -251,9 +276,18 @@ class LPDDR2_S4_1066_x32(SimpleDRAM):
 # A single WideIO x128 interface (one command and address bus), with
 # default timings based on an estimated WIO-200 8 Gbit part.
 class WideIO_200_x128(SimpleDRAM):
-    # Assuming 64 byte cache lines, use a 4kbyte page size, this
-    # depends on the memory density
-    lines_per_rowbuffer = 64
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
 
     # Use one rank for a one-high die stack
     ranks_per_channel = 1
@@ -266,8 +300,8 @@ class WideIO_200_x128(SimpleDRAM):
     tCL = '18ns'
     tRP = '18ns'
 
-    # Assuming 64 byte cache lines, across an x128 SDR interface,
-    # translates to BL4, 4 clocks @ 200 MHz
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
     tBURST = '20ns'
 
     # WIO 8 Gb
@@ -287,9 +321,18 @@ class WideIO_200_x128(SimpleDRAM):
 # default timings based on a LPDDR3-1600 4 Gbit part in a 1x32
 # configuration
 class LPDDR3_1600_x32(SimpleDRAM):
-    # 4 Gbit and 8 Gbit devices use a 1 kByte page size, so ssuming 64
-    # byte cache lines, that is 16 lines
-    lines_per_rowbuffer = 16
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
 
     # Use a single rank
     ranks_per_channel = 1
@@ -306,9 +349,11 @@ class LPDDR3_1600_x32(SimpleDRAM):
     # Pre-charge one bank 15 ns (all banks 18 ns)
     tRP = '15ns'
 
-    # Assuming 64 byte cache lines, across a x32 DDR interface
-    # translates to two bursts of BL8, 8 clocks @ 800 MHz
-    tBURST = '10ns'
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the SimpleDRAM controller
+    tBURST = '5ns'
 
     # LPDDR3, 4 Gb
     tRFC = '130ns'
diff --git a/src/mem/simple_dram.cc b/src/mem/simple_dram.cc
index 9091288ec..faeedbb2b 100644
--- a/src/mem/simple_dram.cc
+++ b/src/mem/simple_dram.cc
@@ -11,6 +11,9 @@
  * unmodified and in its entirety in all distributions of the software,
  * modified or unmodified, in source code or in binary form.
  *
+ * Copyright (c) 2013 Amin Farmahini-Farahani
+ * All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met: redistributions of source code must retain the above copyright
@@ -54,8 +57,11 @@ SimpleDRAM::SimpleDRAM(const SimpleDRAMParams* p) :
     rowHitFlag(false), stopReads(false), actTicks(p->activation_limit, 0),
     writeEvent(this), respondEvent(this),
     refreshEvent(this), nextReqEvent(this), drainManager(NULL),
-    bytesPerCacheLine(0),
-    linesPerRowBuffer(p->lines_per_rowbuffer),
+    deviceBusWidth(p->device_bus_width), burstLength(p->burst_length),
+    deviceRowBufferSize(p->device_rowbuffer_size),
+    devicesPerRank(p->devices_per_rank),
+    burstSize((devicesPerRank * burstLength * deviceBusWidth) / 8),
+    rowBufferSize(devicesPerRank * deviceRowBufferSize),
     ranksPerChannel(p->ranks_per_channel),
     banksPerRank(p->banks_per_rank), channels(p->channels), rowsPerBank(0),
     readBufferSize(p->read_buffer_size),
@@ -93,22 +99,22 @@ SimpleDRAM::init()
         port.sendRangeChange();
     }
 
-    // get the burst size from the connected port as it is currently
-    // assumed to be equal to the cache line size
-    bytesPerCacheLine = _system->cacheLineSize();
-
     // we could deal with plenty options here, but for now do a quick
     // sanity check
-    if (bytesPerCacheLine != 64 && bytesPerCacheLine != 32)
-        panic("Unexpected burst size %d", bytesPerCacheLine);
+    DPRINTF(DRAM, "Burst size %d bytes\n", burstSize);
 
     // determine the rows per bank by looking at the total capacity
     uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
 
     DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
             AbstractMemory::size());
-    rowsPerBank = capacity / (bytesPerCacheLine * linesPerRowBuffer *
-                              banksPerRank * ranksPerChannel);
+
+    columnsPerRowBuffer = rowBufferSize / burstSize;
+
+    DPRINTF(DRAM, "Row buffer size %d bytes with %d columns per row buffer\n",
+            rowBufferSize, columnsPerRowBuffer);
+
+    rowsPerBank = capacity / (rowBufferSize * banksPerRank * ranksPerChannel);
 
     if (range.interleaved()) {
         if (channels != range.stripes())
@@ -116,18 +122,17 @@ SimpleDRAM::init()
                   name(), range.stripes(), channels);
 
         if (addrMapping == Enums::RaBaChCo) {
-            if (bytesPerCacheLine * linesPerRowBuffer !=
-                range.granularity()) {
+            if (rowBufferSize != range.granularity()) {
                 panic("Interleaving of %s doesn't match RaBaChCo address map\n",
                       name());
             }
         } else if (addrMapping == Enums::RaBaCoCh) {
-            if (bytesPerCacheLine != range.granularity()) {
+            if (burstSize != range.granularity()) {
                 panic("Interleaving of %s doesn't match RaBaCoCh address map\n",
                       name());
             }
         } else if (addrMapping == Enums::CoRaBaCh) {
-            if (bytesPerCacheLine != range.granularity())
+            if (burstSize != range.granularity())
                 panic("Interleaving of %s doesn't match CoRaBaCh address map\n",
                       name());
         }
@@ -162,24 +167,26 @@ SimpleDRAM::recvAtomic(PacketPtr pkt)
 }
 
 bool
-SimpleDRAM::readQueueFull() const
+SimpleDRAM::readQueueFull(unsigned int neededEntries) const
 {
-    DPRINTF(DRAM, "Read queue limit %d current size %d\n",
-            readBufferSize, readQueue.size() + respQueue.size());
+    DPRINTF(DRAM, "Read queue limit %d, current size %d, entries needed %d\n",
+            readBufferSize, readQueue.size() + respQueue.size(),
+            neededEntries);
 
-    return (readQueue.size() + respQueue.size()) == readBufferSize;
+    return
+        (readQueue.size() + respQueue.size() + neededEntries) > readBufferSize;
 }
 
 bool
-SimpleDRAM::writeQueueFull() const
+SimpleDRAM::writeQueueFull(unsigned int neededEntries) const
 {
-    DPRINTF(DRAM, "Write queue limit %d current size %d\n",
-            writeBufferSize, writeQueue.size());
-    return writeQueue.size() == writeBufferSize;
+    DPRINTF(DRAM, "Write queue limit %d, current size %d, entries needed %d\n",
+            writeBufferSize, writeQueue.size(), neededEntries);
+    return (writeQueue.size() + neededEntries) > writeBufferSize;
 }
 
 SimpleDRAM::DRAMPacket*
-SimpleDRAM::decodeAddr(PacketPtr pkt)
+SimpleDRAM::decodeAddr(PacketPtr pkt, Addr dramPktAddr, unsigned size)
 {
     // decode the address based on the address mapping scheme, with
     // Ra, Co, Ba and Ch denoting rank, column, bank and channel,
@@ -188,17 +195,15 @@ SimpleDRAM::decodeAddr(PacketPtr pkt)
     uint16_t bank;
     uint16_t row;
 
-    Addr addr = pkt->getAddr();
-
     // truncate the address to the access granularity
-    addr = addr / bytesPerCacheLine;
+    Addr addr = dramPktAddr / burstSize;
 
     // we have removed the lowest order address bits that denote the
-    // position within the cache line
+    // position within the column
     if (addrMapping == Enums::RaBaChCo) {
         // the lowest order bits denote the column to ensure that
         // sequential cache lines occupy the same row
-        addr = addr / linesPerRowBuffer;
+        addr = addr / columnsPerRowBuffer;
 
         // take out the channel part of the address
         addr = addr / channels;
@@ -221,7 +226,7 @@ SimpleDRAM::decodeAddr(PacketPtr pkt)
         addr = addr / channels;
 
         // next, the column
-        addr = addr / linesPerRowBuffer;
+        addr = addr / columnsPerRowBuffer;
 
         // after the column bits, we get the bank bits to interleave
         // over the banks
@@ -256,7 +261,7 @@ SimpleDRAM::decodeAddr(PacketPtr pkt)
 
         // next the column bits which we do not need to keep track of
         // and simply skip past
-        addr = addr / linesPerRowBuffer;
+        addr = addr / columnsPerRowBuffer;
 
         // lastly, get the row bits
         row = addr % rowsPerBank;
@@ -269,54 +274,98 @@ SimpleDRAM::decodeAddr(PacketPtr pkt)
     assert(row < rowsPerBank);
 
     DPRINTF(DRAM, "Address: %lld Rank %d Bank %d Row %d\n",
-            pkt->getAddr(), rank, bank, row);
+            dramPktAddr, rank, bank, row);
 
     // create the corresponding DRAM packet with the entry time and
     // ready time set to the current tick, the latter will be updated
     // later
-    return new DRAMPacket(pkt, rank, bank, row, pkt->getAddr(),
+    return new DRAMPacket(pkt, rank, bank, row, dramPktAddr, size,
                           banks[rank][bank]);
 }
 
 void
-SimpleDRAM::addToReadQueue(PacketPtr pkt)
+SimpleDRAM::addToReadQueue(PacketPtr pkt, unsigned int pktCount)
 {
     // only add to the read queue here. whenever the request is
     // eventually done, set the readyTime, and call schedule()
     assert(!pkt->isWrite());
 
-    // First check write buffer to see if the data is already at
-    // the controller
-    list<DRAMPacket*>::const_iterator i;
-    Addr addr = pkt->getAddr();
+    assert(pktCount != 0);
 
-    // @todo: add size check
-    for (i = writeQueue.begin(); i != writeQueue.end(); ++i) {
-        if ((*i)->addr == addr){
-            servicedByWrQ++;
-            DPRINTF(DRAM, "Read to %lld serviced by write queue\n", addr);
-            bytesRead += bytesPerCacheLine;
-            bytesConsumedRd += pkt->getSize();
-            accessAndRespond(pkt, frontendLatency);
-            return;
+    // if the request size is larger than burst size, the pkt is split into
+    // multiple DRAM packets
+    // Note if the pkt starting address is not aligned to burst size, the
+    // address of first DRAM packet is kept unaligned. Subsequent DRAM packets
+    // are aligned to burst size boundaries. This is to ensure we accurately
+    // check read packets against packets in write queue.
+    Addr addr = pkt->getAddr();
+    unsigned pktsServicedByWrQ = 0;
+    BurstHelper* burst_helper = NULL;
+    for (int cnt = 0; cnt < pktCount; ++cnt) {
+        unsigned size = std::min((addr | (burstSize - 1)) + 1,
+                        pkt->getAddr() + pkt->getSize()) - addr;
+        readPktSize[ceilLog2(size)]++;
+        readBursts++;
+
+        // First check write buffer to see if the data is already at
+        // the controller
+        bool foundInWrQ = false;
+        list<DRAMPacket*>::const_iterator i;
+        for (i = writeQueue.begin(); i != writeQueue.end(); ++i) {
+            if ((*i)->addr == addr && (*i)->size >= size){
+                foundInWrQ = true;
+                servicedByWrQ++;
+                pktsServicedByWrQ++;
+                DPRINTF(DRAM, "Read to addr %lld with size %d serviced by "
+                        "write queue\n", addr, size);
+                bytesRead += burstSize;
+                bytesConsumedRd += size;
+                break;
+            }
         }
-    }
 
-    DRAMPacket* dram_pkt = decodeAddr(pkt);
+        // If not found in the write q, make a DRAM packet and
+        // push it onto the read queue
+        if (!foundInWrQ) {
 
-    assert(readQueue.size() + respQueue.size() < readBufferSize);
-    rdQLenPdf[readQueue.size() + respQueue.size()]++;
+            // Make the burst helper for split packets
+            if (pktCount > 1 && burst_helper == NULL) {
+                DPRINTF(DRAM, "Read to addr %lld translates to %d "
+                        "dram requests\n", pkt->getAddr(), pktCount);
+                burst_helper = new BurstHelper(pktCount);
+            }
 
-    DPRINTF(DRAM, "Adding to read queue\n");
+            DRAMPacket* dram_pkt = decodeAddr(pkt, addr, size);
+            dram_pkt->burstHelper = burst_helper;
 
-    readQueue.push_back(dram_pkt);
+            assert(!readQueueFull(1));
+            rdQLenPdf[readQueue.size() + respQueue.size()]++;
 
-    // Update stats
-    uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank;
-    assert(bank_id < ranksPerChannel * banksPerRank);
-    perBankRdReqs[bank_id]++;
+            DPRINTF(DRAM, "Adding to read queue\n");
 
-    avgRdQLen = readQueue.size() + respQueue.size();
+            readQueue.push_back(dram_pkt);
+
+            // Update stats
+            uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank;
+            assert(bank_id < ranksPerChannel * banksPerRank);
+            perBankRdReqs[bank_id]++;
+
+            avgRdQLen = readQueue.size() + respQueue.size();
+        }
+
+        // Starting address of next dram pkt (aligned to burstSize boundary)
+        addr = (addr | (burstSize - 1)) + 1;
+    }
+
+    // If all packets are serviced by write queue, we send the response back
+    if (pktsServicedByWrQ == pktCount) {
+        accessAndRespond(pkt, frontendLatency);
+        return;
+    }
+
+    // Update how many split packets are serviced by write queue
+    if (burst_helper != NULL)
+        burst_helper->burstsServiced = pktsServicedByWrQ;
 
     // If we are not already scheduled to get the read request out of
     // the queue, do so now
@@ -364,7 +413,7 @@ SimpleDRAM::processWriteEvent()
             bank.openRow = dram_pkt->row;
             bank.freeAt = schedTime + tBURST + std::max(accessLat, tCL);
             busBusyUntil = bank.freeAt - tCL;
-            bank.bytesAccessed += bytesPerCacheLine;
+            bank.bytesAccessed += burstSize;
 
             if (!rowHitFlag) {
                 bank.tRASDoneAt = bank.freeAt + tRP;
@@ -385,7 +434,7 @@ SimpleDRAM::processWriteEvent()
                     "banks_id %d is %lld\n",
                     dram_pkt->rank * banksPerRank + dram_pkt->bank,
                     bank.freeAt);
-            bytesPerActivate.sample(bytesPerCacheLine);
+            bytesPerActivate.sample(burstSize);
         } else
             panic("Unknown page management policy chosen\n");
 
@@ -449,34 +498,49 @@ SimpleDRAM::triggerWrites()
 }
 
 void
-SimpleDRAM::addToWriteQueue(PacketPtr pkt)
+SimpleDRAM::addToWriteQueue(PacketPtr pkt, unsigned int pktCount)
 {
     // only add to the write queue here. whenever the request is
     // eventually done, set the readyTime, and call schedule()
     assert(pkt->isWrite());
 
-    DRAMPacket* dram_pkt = decodeAddr(pkt);
+    // if the request size is larger than burst size, the pkt is split into
+    // multiple DRAM packets
+    Addr addr = pkt->getAddr();
+    for (int cnt = 0; cnt < pktCount; ++cnt) {
+        unsigned size = std::min((addr | (burstSize - 1)) + 1,
+                        pkt->getAddr() + pkt->getSize()) - addr;
+        writePktSize[ceilLog2(size)]++;
+        writeBursts++;
 
-    assert(writeQueue.size() < writeBufferSize);
-    wrQLenPdf[writeQueue.size()]++;
+        DRAMPacket* dram_pkt = decodeAddr(pkt, addr, size);
 
-    DPRINTF(DRAM, "Adding to write queue\n");
+        assert(writeQueue.size() < writeBufferSize);
+        wrQLenPdf[writeQueue.size()]++;
 
-    writeQueue.push_back(dram_pkt);
+        DPRINTF(DRAM, "Adding to write queue\n");
 
-    // Update stats
-    uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank;
-    assert(bank_id < ranksPerChannel * banksPerRank);
-    perBankWrReqs[bank_id]++;
+        writeQueue.push_back(dram_pkt);
 
-    avgWrQLen = writeQueue.size();
+        // Update stats
+        uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank;
+        assert(bank_id < ranksPerChannel * banksPerRank);
+        perBankWrReqs[bank_id]++;
+
+        avgWrQLen = writeQueue.size();
+
+        bytesConsumedWr += dram_pkt->size;
+        bytesWritten += burstSize;
+
+        // Starting address of next dram pkt (aligned to burstSize boundary)
+        addr = (addr | (burstSize - 1)) + 1;
+    }
 
     // we do not wait for the writes to be send to the actual memory,
     // but instead take responsibility for the consistency here and
     // snoop the write queue for any upcoming reads
-
-    bytesConsumedWr += pkt->getSize();
-    bytesWritten += bytesPerCacheLine;
+    // @todo, if a pkt size is larger than burst size, we might need a
+    // different front end latency
     accessAndRespond(pkt, frontendLatency);
 
     // If your write buffer is starting to fill up, drain it!
@@ -491,15 +555,18 @@ SimpleDRAM::printParams() const
     // Sanity check print of important parameters
     DPRINTF(DRAM,
             "Memory controller %s physical organization\n"      \
-            "Bytes per cacheline  %d\n"                         \
-            "Lines per row buffer %d\n"                         \
-            "Rows  per bank       %d\n"                         \
-            "Banks per rank       %d\n"                         \
-            "Ranks per channel    %d\n"                         \
-            "Total mem capacity   %u\n",
-            name(), bytesPerCacheLine, linesPerRowBuffer, rowsPerBank,
-            banksPerRank, ranksPerChannel, bytesPerCacheLine *
-            linesPerRowBuffer * rowsPerBank * banksPerRank * ranksPerChannel);
+            "Number of devices per rank   %d\n"                 \
+            "Device bus width (in bits)   %d\n"                 \
+            "DRAM data bus burst          %d\n"                 \
+            "Row buffer size              %d\n"                 \
+            "Columns per row buffer       %d\n"                 \
+            "Rows    per bank             %d\n"                 \
+            "Banks   per rank             %d\n"                 \
+            "Ranks   per channel          %d\n"                 \
+            "Total mem capacity           %u\n",
+            name(), devicesPerRank, deviceBusWidth, burstSize, rowBufferSize,
+            columnsPerRowBuffer, rowsPerBank, banksPerRank, ranksPerChannel,
+            rowBufferSize * rowsPerBank * banksPerRank * ranksPerChannel);
 
     string scheduler =  memSchedPolicy == Enums::fcfs ? "FCFS" : "FR-FCFS";
     string address_mapping = addrMapping == Enums::RaBaChCo ? "RaBaChCo" :
@@ -560,7 +627,7 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt)
 
     // This is where we enter from the outside world
     DPRINTF(DRAM, "recvTimingReq: request %s addr %lld size %d\n",
-            pkt->cmdString(),pkt->getAddr(), pkt->getSize());
+            pkt->cmdString(), pkt->getAddr(), pkt->getSize());
 
     // simply drop inhibited packets for now
     if (pkt->memInhibitAsserted()) {
@@ -569,9 +636,6 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt)
         return true;
     }
 
-   if (pkt->getSize() == bytesPerCacheLine)
-       cpuReqs++;
-
    // Every million accesses, print the state of the queues
    if (numReqs % 1000000 == 0)
        printQs();
@@ -582,37 +646,39 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt)
     }
     prevArrival = curTick();
 
+
+    // Find out how many dram packets a pkt translates to
+    // If the burst size is equal or larger than the pkt size, then a pkt
+    // translates to only one dram packet. Otherwise, a pkt translates to
+    // multiple dram packets
     unsigned size = pkt->getSize();
-    if (size > bytesPerCacheLine)
-        panic("Request size %d is greater than burst size %d",
-              size, bytesPerCacheLine);
+    unsigned offset = pkt->getAddr() & (burstSize - 1);
+    unsigned int dram_pkt_count = divCeil(offset + size, burstSize);
 
     // check local buffers and do not accept if full
     if (pkt->isRead()) {
         assert(size != 0);
-        if (readQueueFull()) {
+        if (readQueueFull(dram_pkt_count)) {
             DPRINTF(DRAM, "Read queue full, not accepting\n");
             // remember that we have to retry this port
             retryRdReq = true;
             numRdRetry++;
             return false;
         } else {
-            readPktSize[ceilLog2(size)]++;
-            addToReadQueue(pkt);
+            addToReadQueue(pkt, dram_pkt_count);
             readReqs++;
             numReqs++;
         }
     } else if (pkt->isWrite()) {
         assert(size != 0);
-        if (writeQueueFull()) {
+        if (writeQueueFull(dram_pkt_count)) {
             DPRINTF(DRAM, "Write queue full, not accepting\n");
             // remember that we have to retry this port
             retryWrReq = true;
             numWrRetry++;
             return false;
         } else {
-            writePktSize[ceilLog2(size)]++;
-            addToWriteQueue(pkt);
+            addToWriteQueue(pkt, dram_pkt_count);
             writeReqs++;
             numReqs++;
         }
@@ -633,38 +699,54 @@ SimpleDRAM::processRespondEvent()
     DPRINTF(DRAM,
             "processRespondEvent(): Some req has reached its readyTime\n");
 
-     PacketPtr pkt = respQueue.front()->pkt;
-
-     // Actually responds to the requestor
-     bytesConsumedRd += pkt->getSize();
-     bytesRead += bytesPerCacheLine;
-     accessAndRespond(pkt, frontendLatency + backendLatency);
-
-     delete respQueue.front();
-     respQueue.pop_front();
-
-     // Update stats
-     avgRdQLen = readQueue.size() + respQueue.size();
-
-     if (!respQueue.empty()) {
-         assert(respQueue.front()->readyTime >= curTick());
-         assert(!respondEvent.scheduled());
-         schedule(respondEvent, respQueue.front()->readyTime);
-     } else {
-         // if there is nothing left in any queue, signal a drain
-         if (writeQueue.empty() && readQueue.empty() &&
-             drainManager) {
-             drainManager->signalDrainDone();
-             drainManager = NULL;
-         }
-     }
-
-     // We have made a location in the queue available at this point,
-     // so if there is a read that was forced to wait, retry now
-     if (retryRdReq) {
-         retryRdReq = false;
-         port.sendRetry();
-     }
+    DRAMPacket* dram_pkt = respQueue.front();
+
+    // Actually responds to the requestor
+    bytesConsumedRd += dram_pkt->size;
+    bytesRead += burstSize;
+    if (dram_pkt->burstHelper) {
+        // it is a split packet
+        dram_pkt->burstHelper->burstsServiced++;
+        if (dram_pkt->burstHelper->burstsServiced ==
+                                  dram_pkt->burstHelper->burstCount) {
+            // we have now serviced all children packets of a system packet
+            // so we can now respond to the requester
+            // @todo we probably want to have a different front end and back
+            // end latency for split packets
+            accessAndRespond(dram_pkt->pkt, frontendLatency + backendLatency);
+            delete dram_pkt->burstHelper;
+            dram_pkt->burstHelper = NULL;
+        }
+    } else {
+        // it is not a split packet
+        accessAndRespond(dram_pkt->pkt, frontendLatency + backendLatency);
+    }
+
+    delete respQueue.front();
+    respQueue.pop_front();
+
+    // Update stats
+    avgRdQLen = readQueue.size() + respQueue.size();
+
+    if (!respQueue.empty()) {
+        assert(respQueue.front()->readyTime >= curTick());
+        assert(!respondEvent.scheduled());
+        schedule(respondEvent, respQueue.front()->readyTime);
+    } else {
+        // if there is nothing left in any queue, signal a drain
+        if (writeQueue.empty() && readQueue.empty() &&
+            drainManager) {
+            drainManager->signalDrainDone();
+            drainManager = NULL;
+        }
+    }
+
+    // We have made a location in the queue available at this point,
+    // so if there is a read that was forced to wait, retry now
+    if (retryRdReq) {
+        retryRdReq = false;
+        port.sendRetry();
+    }
 }
 
 void
@@ -911,7 +993,7 @@ SimpleDRAM::doDRAMAccess(DRAMPacket* dram_pkt)
     if (pageMgmt == Enums::open) {
         bank.openRow = dram_pkt->row;
         bank.freeAt = curTick() + addDelay + accessLat;
-        bank.bytesAccessed += bytesPerCacheLine;
+        bank.bytesAccessed += burstSize;
 
         // If you activated a new row do to this access, the next access
         // will have to respect tRAS for this bank. Assume tRAS ~= 3 * tRP.
@@ -931,7 +1013,7 @@ SimpleDRAM::doDRAMAccess(DRAMPacket* dram_pkt)
         bank.freeAt = curTick() + addDelay + accessLat + tRP + tRP;
         recordActivate(bank.freeAt - tRP - tRP - tCL - tRCD); //essentially (freeAt - tRC)
         DPRINTF(DRAM,"doDRAMAccess::bank.freeAt is %lld\n",bank.freeAt);
-        bytesPerActivate.sample(bytesPerCacheLine);
+        bytesPerActivate.sample(burstSize);
     } else
         panic("No page management policy chosen\n");
 
@@ -1080,19 +1162,27 @@ SimpleDRAM::regStats()
 
     readReqs
         .name(name() + ".readReqs")
-        .desc("Total number of read requests seen");
+        .desc("Total number of read requests accepted by DRAM controller");
 
     writeReqs
         .name(name() + ".writeReqs")
-        .desc("Total number of write requests seen");
+        .desc("Total number of write requests accepted by DRAM controller");
+
+    readBursts
+        .name(name() + ".readBursts")
+        .desc("Total number of DRAM read bursts. "
+              "Each DRAM read request translates to either one or multiple "
+              "DRAM read bursts");
+
+    writeBursts
+        .name(name() + ".writeBursts")
+        .desc("Total number of DRAM write bursts. "
+              "Each DRAM write request translates to either one or multiple "
+              "DRAM write bursts");
 
     servicedByWrQ
         .name(name() + ".servicedByWrQ")
-        .desc("Number of read reqs serviced by write Q");
-
-    cpuReqs
-        .name(name() + ".cpureqs")
-        .desc("Reqs generatd by CPU via cache - shady");
+        .desc("Number of DRAM read bursts serviced by write Q");
 
     neitherReadNorWrite
         .name(name() + ".neitherReadNorWrite")
@@ -1139,28 +1229,28 @@ SimpleDRAM::regStats()
         .desc("Average queueing delay per request")
         .precision(2);
 
-    avgQLat = totQLat / (readReqs - servicedByWrQ);
+    avgQLat = totQLat / (readBursts - servicedByWrQ);
 
     avgBankLat
         .name(name() + ".avgBankLat")
         .desc("Average bank access latency per request")
         .precision(2);
 
-    avgBankLat = totBankLat / (readReqs - servicedByWrQ);
+    avgBankLat = totBankLat / (readBursts - servicedByWrQ);
 
     avgBusLat
         .name(name() + ".avgBusLat")
         .desc("Average bus latency per request")
         .precision(2);
 
-    avgBusLat = totBusLat / (readReqs - servicedByWrQ);
+    avgBusLat = totBusLat / (readBursts - servicedByWrQ);
 
     avgMemAccLat
         .name(name() + ".avgMemAccLat")
         .desc("Average memory access latency")
         .precision(2);
 
-    avgMemAccLat = totMemAccLat / (readReqs - servicedByWrQ);
+    avgMemAccLat = totMemAccLat / (readBursts - servicedByWrQ);
 
     numRdRetry
         .name(name() + ".numRdRetry")
@@ -1183,22 +1273,22 @@ SimpleDRAM::regStats()
         .desc("Row buffer hit rate for reads")
         .precision(2);
 
-    readRowHitRate = (readRowHits / (readReqs - servicedByWrQ)) * 100;
+    readRowHitRate = (readRowHits / (readBursts - servicedByWrQ)) * 100;
 
     writeRowHitRate
         .name(name() + ".writeRowHitRate")
         .desc("Row buffer hit rate for writes")
         .precision(2);
 
-    writeRowHitRate = (writeRowHits / writeReqs) * 100;
+    writeRowHitRate = (writeRowHits / writeBursts) * 100;
 
     readPktSize
-        .init(ceilLog2(bytesPerCacheLine) + 1)
+        .init(ceilLog2(burstSize) + 1)
         .name(name() + ".readPktSize")
         .desc("Categorize read packet sizes");
 
      writePktSize
-        .init(ceilLog2(bytesPerCacheLine) + 1)
+        .init(ceilLog2(burstSize) + 1)
         .name(name() + ".writePktSize")
         .desc("Categorize write packet sizes");
 
@@ -1213,7 +1303,7 @@ SimpleDRAM::regStats()
         .desc("What write queue length does an incoming req see");
 
      bytesPerActivate
-         .init(bytesPerCacheLine * linesPerRowBuffer)
+         .init(rowBufferSize)
          .name(name() + ".bytesPerActivate")
          .desc("Bytes accessed per row activation")
          .flags(nozero);
@@ -1267,7 +1357,7 @@ SimpleDRAM::regStats()
         .desc("Theoretical peak bandwidth in MB/s")
         .precision(2);
 
-    peakBW = (SimClock::Frequency / tBURST) * bytesPerCacheLine / 1000000;
+    peakBW = (SimClock::Frequency / tBURST) * burstSize / 1000000;
 
     busUtil
         .name(name() + ".busUtil")
diff --git a/src/mem/simple_dram.hh b/src/mem/simple_dram.hh
index e4d20163a..313ad067b 100644
--- a/src/mem/simple_dram.hh
+++ b/src/mem/simple_dram.hh
@@ -11,6 +11,9 @@
  * unmodified and in its entirety in all distributions of the software,
  * modified or unmodified, in source code or in binary form.
  *
+ * Copyright (c) 2013 Amin Farmahini-Farahani
+ * All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met: redistributions of source code must retain the above copyright
@@ -158,6 +161,27 @@ class SimpleDRAM : public AbstractMemory
     };
 
     /**
+     * A burst helper helps organize and manage a packet that is larger than
+     * the DRAM burst size. A system packet that is larger than the burst size
+     * is split into multiple DRAM packets and all those DRAM packets point to
+     * a single burst helper such that we know when the whole packet is served.
+     */
+    class BurstHelper {
+
+      public:
+
+        /** Number of DRAM bursts required for a system packet **/
+        const unsigned int burstCount;
+
+        /** Number of DRAM bursts serviced so far for a system packet **/
+        unsigned int burstsServiced;
+
+        BurstHelper(unsigned int _burstCount)
+            : burstCount(_burstCount), burstsServiced(0)
+            { }
+    };
+
+    /**
      * A DRAM packet stores packets along with the timestamp of when
      * the packet entered the queue, and also the decoded address.
      */
@@ -178,14 +202,34 @@ class SimpleDRAM : public AbstractMemory
         const uint8_t rank;
         const uint16_t bank;
         const uint16_t row;
+
+        /**
+         * The starting address of the DRAM packet.
+         * This address could be unaligned to burst size boundaries. The
+         * reason is to keep the address offset so we can accurately check
+         * incoming read packets with packets in the write queue.
+         */
         const Addr addr;
+
+        /**
+         * The size of this DRAM packet in bytes.
+         * It is always equal to or smaller than the DRAM burst size.
+         */
+        const unsigned int size;
+
+        /**
+         * A pointer to the BurstHelper if this DRAMPacket is a split packet.
+         * If not a split packet (common case), this is set to NULL.
+         */
+        BurstHelper* burstHelper;
         Bank& bank_ref;
 
-        DRAMPacket(PacketPtr _pkt, uint8_t _rank,
-                   uint16_t _bank, uint16_t _row, Addr _addr, Bank& _bank_ref)
+        DRAMPacket(PacketPtr _pkt, uint8_t _rank, uint16_t _bank,
+                   uint16_t _row, Addr _addr, unsigned int _size,
+                   Bank& _bank_ref)
             : entryTime(curTick()), readyTime(curTick()),
               pkt(_pkt), rank(_rank), bank(_bank), row(_row), addr(_addr),
-              bank_ref(_bank_ref)
+              size(_size), burstHelper(NULL), bank_ref(_bank_ref)
         { }
 
     };
@@ -212,28 +256,34 @@ class SimpleDRAM : public AbstractMemory
     /**
      * Check if the read queue has room for more entries
      *
+     * @param pktCount The number of entries needed in the read queue
      * @return true if read queue is full, false otherwise
      */
-    bool readQueueFull() const;
+    bool readQueueFull(unsigned int pktCount) const;
 
     /**
      * Check if the write queue has room for more entries
      *
+     * @param pktCount The number of entries needed in the write queue
      * @return true if write queue is full, false otherwise
      */
-    bool writeQueueFull() const;
+    bool writeQueueFull(unsigned int pktCount) const;
 
     /**
      * When a new read comes in, first check if the write q has a
      * pending request to the same address.\ If not, decode the
-     * address to populate rank/bank/row, create a "dram_pkt", and
-     * push it to the back of the read queue.\ If this is the only
+     * address to populate rank/bank/row, create one or multiple
+     * "dram_pkt", and push them to the back of the read queue.\
+     * If this is the only
      * read request in the system, schedule an event to start
      * servicing it.
      *
      * @param pkt The request packet from the outside world
+     * @param pktCount The number of DRAM bursts the pkt
+     * translates to. If pkt size is larger than one full burst,
+     * then pktCount is greater than one.
      */
-    void addToReadQueue(PacketPtr pkt);
+    void addToReadQueue(PacketPtr pkt, unsigned int pktCount);
 
     /**
      * Decode the incoming pkt, create a dram_pkt and push to the
@@ -242,8 +292,11 @@ class SimpleDRAM : public AbstractMemory
      * to get full, stop reads, and start draining writes.
      *
      * @param pkt The request packet from the outside world
+     * @param pktCount The number of DRAM bursts the pkt
+     * translates to. If pkt size is larger than one full burst,
+     * then pktCount is greater than one.
      */
-    void addToWriteQueue(PacketPtr pkt);
+    void addToWriteQueue(PacketPtr pkt, unsigned int pktCount);
 
     /**
      * Actually do the DRAM access - figure out the latency it
@@ -276,12 +329,16 @@ class SimpleDRAM : public AbstractMemory
 
     /**
      * Address decoder to figure out physical mapping onto ranks,
-     * banks, and rows.
+     * banks, and rows. This function is called multiple times on the same
+     * system packet if the packet is larger than the memory burst size. The
+     * dramPktAddr is used for the offset within the packet.
      *
      * @param pkt The packet from the outside world
+     * @param dramPktAddr The starting address of the DRAM packet
+     * @param size The size of the DRAM packet in bytes
      * @return A DRAMPacket pointer with the decoded information
      */
-    DRAMPacket* decodeAddr(PacketPtr pkt);
+    DRAMPacket* decodeAddr(PacketPtr pkt, Addr dramPktAddr, unsigned int size);
 
     /**
      * The memory schduler/arbiter - picks which read request needs to
@@ -376,18 +433,21 @@ class SimpleDRAM : public AbstractMemory
 
     /**
      * The following are basic design parameters of the memory
-     * controller, and are initialized based on parameter values. The
-     * bytesPerCacheLine is based on the neighbouring ports cache line
-     * size and thus determined outside the constructor. Similarly,
-     * the rowsPerBank is determined based on the capacity, number of
-     * ranks and banks, the cache line size, and the row buffer size.
-     */
-    uint32_t bytesPerCacheLine;
-    const uint32_t linesPerRowBuffer;
+     * controller, and are initialized based on parameter values.
+     * The rowsPerBank is determined based on the capacity, number of
+     * ranks and banks, the burst size, and the row buffer size.
+     */
+    const uint32_t deviceBusWidth;
+    const uint32_t burstLength;
+    const uint32_t deviceRowBufferSize;
+    const uint32_t devicesPerRank;
+    const uint32_t burstSize;
+    const uint32_t rowBufferSize;
     const uint32_t ranksPerChannel;
     const uint32_t banksPerRank;
     const uint32_t channels;
     uint32_t rowsPerBank;
+    uint32_t columnsPerRowBuffer;
     const uint32_t readBufferSize;
     const uint32_t writeBufferSize;
     const double writeThresholdPerc;
@@ -441,7 +501,8 @@ class SimpleDRAM : public AbstractMemory
     // All statistics that the model needs to capture
     Stats::Scalar readReqs;
     Stats::Scalar writeReqs;
-    Stats::Scalar cpuReqs;
+    Stats::Scalar readBursts;
+    Stats::Scalar writeBursts;
     Stats::Scalar bytesRead;
     Stats::Scalar bytesWritten;
     Stats::Scalar bytesConsumedRd;