33 files changed, 1296 insertions, 771 deletions
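The three serial-link address layouts introduced by this change ("same", "distributed", "mixed", described in the HMC.py header below) are driven by the new configs/example/hmctest.py. As a rough standalone sketch of what that script sets up — plain (start, end) tuples stand in for gem5's AddrRange, and the 4 GB / 16-vault sizing follows the constants in the script:

# Sketch only: mirrors the address-range setup in configs/example/hmctest.py,
# with plain (start, end) tuples instead of gem5 AddrRange objects so it runs
# without gem5.
oneGB = 1024 * 1024 * 1024
oneMB = 1024 * 1024
num_vaults = 16
num_serial_links = 4

# 16 vault controllers, 256 MB each -> 4 GB HMC device in total
vault_ranges = [(i * 256 * oneMB, (i + 1) * 256 * oneMB - 1)
                for i in range(num_vaults)]

def link_ranges(arch):
    if arch == "same":
        # every serial link sees the full 4 GB device
        return [(0, 4 * oneGB - 1)] * num_serial_links
    if arch == "distributed":
        # each link owns a disjoint 1 GB quarter (its local vaults only)
        return [(i * oneGB, (i + 1) * oneGB - 1)
                for i in range(num_serial_links)]
    if arch == "mixed":
        # links 0/1 own 1 GB each, links 2/3 can reach the whole device
        return [(0, oneGB - 1), (oneGB, 2 * oneGB - 1),
                (0, 4 * oneGB - 1), (0, 4 * oneGB - 1)]
    raise ValueError("unknown arch: %s" % arch)

print("vaults: %d x 256 MB" % len(vault_ranges))
for arch in ("same", "distributed", "mixed"):
    print("%s: %s" % (arch, link_ranges(arch)))

In the actual script these ranges are passed to MemConfig.config_mem() through options.ser_ranges and system.mem_ranges, which is where the host/device split in the diff below picks them up.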
diff --git a/configs/common/HMC.py b/configs/common/HMC.py index 130729f88..fcff94cc7 100644 --- a/configs/common/HMC.py +++ b/configs/common/HMC.py @@ -37,6 +37,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Authors: Erfan Azarkhish +# Abdul Mutaal Ahmad # A Simplified model of a complete HMC device. Based on: # [1] http://www.hybridmemorycube.org/specification-download/ @@ -48,6 +49,10 @@ # (G. Kim et. al) # [5] Near Data Processing, Are we there yet? (M. Gokhale) # http://www.cs.utah.edu/wondp/gokhale.pdf +# [6] openHMC - A Configurable Open-Source Hybrid Memory Cube Controller +# (J. Schmidt) +# [7] Hybrid Memory Cube performance characterization on data-centric +# workloads (M. Gokhale) # # This script builds a complete HMC device composed of vault controllers, # serial links, the main internal crossbar, and an external hmc controller. @@ -60,23 +65,62 @@ # This component is simply an instance of the NoncoherentXBar class, and its # parameters are tuned to [2]. # -# - SERIAL LINKS: +# - SERIAL LINKS CONTROLLER: # SerialLink is a simple variation of the Bridge class, with the ability to -# account for the latency of packet serialization. We assume that the -# serializer component at the transmitter side does not need to receive the -# whole packet to start the serialization. But the deserializer waits for -# the complete packet to check its integrity first. +# account for the latency of packet serialization and controller latency. We +# assume that the serializer component at the transmitter side does not need +# to receive the whole packet to start the serialization. But the +# deserializer waits for the complete packet to check its integrity first. +# # * Bandwidth of the serial links is not modeled in the SerialLink component -# itself. Instead bandwidth/port of the HMCController has been adjusted to -# reflect the bandwidth delivered by 1 serial link. +# itself. +# +# * Latency of serial link controller is composed of SerDes latency + link +# controller # -# - HMC CONTROLLER: -# Contains a large buffer (modeled with Bridge) to hide the access latency -# of the memory cube. Plus it simply forwards the packets to the serial -# links in a round-robin fashion to balance load among them. # * It is inferred from the standard [1] and the literature [3] that serial # links share the same address range and packets can travel over any of # them so a load distribution mechanism is required among them. +# +# ----------------------------------------- +# | Host/HMC Controller | +# | ---------------------- | +# | | Link Aggregator | opt | +# | ---------------------- | +# | ---------------------- | +# | | Serial Link + Ser | * 4 | +# | ---------------------- | +# |--------------------------------------- +# ----------------------------------------- +# | Device +# | ---------------------- | +# | | Xbar | * 4 | +# | ---------------------- | +# | ---------------------- | +# | | Vault Controller | * 16 | +# | ---------------------- | +# | ---------------------- | +# | | Memory | | +# | ---------------------- | +# |---------------------------------------| +# +# In this version we have present 3 different HMC archiecture along with +# alongwith their corresponding test script. +# +# same: It has 4 crossbars in HMC memory. All the crossbars are connected +# to each other, providing complete memory range. This archicture also covers +# the added latency for sending a request to non-local vault(bridge in b/t +# crossbars). All the 4 serial links can access complete memory. 
So each +# link can be connected to separate processor. +# +# distributed: It has 4 crossbars inside the HMC. Crossbars are not +# connected.Through each crossbar only local vaults can be accessed. But to +# support this architecture we need a crossbar between serial links and +# processor. +# +# mixed: This is a hybrid architecture. It has 4 crossbars inside the HMC. +# 2 Crossbars are connected to only local vaults. From other 2 crossbar, a +# request can be forwarded to any other vault. import optparse @@ -107,131 +151,277 @@ class HMCSystem(SubSystem): # FIFOs at the input and output of the inteconnect) xbar_response_latency = Param.Cycles(2, "Response latency of the XBar") - #*****************************SERIAL LINK PARAMETERS********************** - # Number of serial links [1] - num_serial_links = Param.Unsigned(4, "Number of serial links") + # number of cross which connects 16 Vaults to serial link[7] + number_mem_crossbar = Param.Unsigned(4, "Number of crossbar in HMC" + ) + + #*****************************SERIAL LINK PARAMETERS*********************** + # Number of serial links controllers [1] + num_links_controllers = Param.Unsigned(4, "Number of serial links") # Number of packets (not flits) to store at the request side of the serial # link. This number should be adjusted to achive required bandwidth - link_buffer_size_req = Param.Unsigned(16, "Number of packets to buffer " + link_buffer_size_req = Param.Unsigned(10, "Number of packets to buffer " "at the request side of the serial link") # Number of packets (not flits) to store at the response side of the serial # link. This number should be adjusted to achive required bandwidth - link_buffer_size_rsp = Param.Unsigned(16, "Number of packets to buffer " + link_buffer_size_rsp = Param.Unsigned(10, "Number of packets to buffer " "at the response side of the serial link") # Latency of the serial link composed by SER/DES latency (1.6ns [4]) plus # the PCB trace latency (3ns Estimated based on [5]) link_latency = Param.Latency('4.6ns', "Latency of the serial links") - # Header overhead of the serial links: Header size is 128bits in HMC [1], - # and we have 16 lanes, so the overhead is 8 cycles - link_overhead = Param.Cycles(8, "The number of cycles required to" - " transmit the packet header over the serial link") - - # Clock frequency of the serial links [1] + # Clock frequency of the each serial link(SerDes) [1] link_frequency = Param.Frequency('10GHz', "Clock Frequency of the serial" "links") - # Number of parallel lanes in each serial link [1] - num_lanes_per_link = Param.Unsigned(16, "Number of lanes per each link") + # Clock frequency of serial link Controller[6] + # clk_hmc[Mhz]= num_lanes_per_link * lane_speed [Gbits/s] / + # data_path_width * 10^6 + # clk_hmc[Mhz]= 16 * 10 Gbps / 256 * 10^6 = 625 Mhz + link_controller_frequency = Param.Frequency('625MHz', + "Clock Frequency of the link controller") - # Number of serial links [1] - num_serial_links = Param.Unsigned(4, "Number of serial links") + # Latency of the serial link controller to process the packets[1][6] + # (ClockDomain = 625 Mhz ) + # used here for calculations only + link_ctrl_latency = Param.Cycles(4, "The number of cycles required for the" + "controller to process the packet") - #*****************************HMC CONTROLLER PARAMETERS******************* - # Number of packets (not flits) to store at the HMC controller. 
This - # number should be high enough to be able to hide the high latency of HMC - ctrl_buffer_size_req = Param.Unsigned(256, "Number of packets to buffer " - "at the HMC controller (request side)") + # total_ctrl_latency = link_ctrl_latency + link_latency + # total_ctrl_latency = 4(Cycles) * 1.6 ns + 4.6 ns + total_ctrl_latency = Param.Latency('11ns', "The latency experienced by" + "every packet regardless of size of packet") - # Number of packets (not flits) to store at the response side of the HMC - # controller. - ctrl_buffer_size_rsp = Param.Unsigned(256, "Number of packets to buffer " - "at the HMC controller (response side)") + # Number of parallel lanes in each serial link [1] + num_lanes_per_link = Param.Unsigned( 16, "Number of lanes per each link") - # Latency of the HMC controller to process the packets - # (ClockDomain = Host clock domain) - ctrl_latency = Param.Cycles(4, "The number of cycles required for the " - " controller to process the packet") + # Number of serial links [1] + num_serial_links = Param.Unsigned(4, "Number of serial links") - # Wiring latency from the SoC crossbar to the HMC controller - ctrl_static_latency = Param.Latency('500ps', "Static latency of the HMC" - "controller") + # speed of each lane of serial link - SerDes serial interface 10 Gb/s + serial_link_speed = Param.UInt64(10, "Gbs/s speed of each lane of" + "serial link") - #*****************************PERFORMANCE MONITORING********************** + #*****************************PERFORMANCE MONITORING************************ # The main monitor behind the HMC Controller - enable_global_monitor = Param.Bool(True, "The main monitor behind the " + enable_global_monitor = Param.Bool(False, "The main monitor behind the " "HMC Controller") # The link performance monitors - enable_link_monitor = Param.Bool(True, "The link monitors") + enable_link_monitor = Param.Bool(False, "The link monitors" ) + + # link aggregator enable - put a cross between buffers & links + enable_link_aggr = Param.Bool(False, "The crossbar between port and " + "Link Controller") + + enable_buff_div = Param.Bool(True, "Memory Range of Buffer is" + "divided between total range") + + #*****************************HMC ARCHITECTURE ************************ + # Memory chunk for 16 vault - numbers of vault / number of crossbars + mem_chunk = Param.Unsigned(4, "Chunk of memory range for each cross bar " + "in arch 0") + + # size of req buffer within crossbar, used for modelling extra latency + # when the reuqest go to non-local vault + xbar_buffer_size_req = Param.Unsigned(10, "Number of packets to buffer " + "at the request side of the crossbar") + + # size of response buffer within crossbar, used for modelling extra latency + # when the response received from non-local vault + xbar_buffer_size_resp = Param.Unsigned(10, "Number of packets to buffer " + "at the response side of the crossbar") + +# configure host system with Serial Links +def config_host_hmc(options, system): + + system.hmc_host=HMCSystem() + + try: + system.hmc_host.enable_global_monitor = options.enable_global_monitor + except: + pass; + + try: + system.hmc_host.enable_link_monitor = options.enable_link_monitor + except: + pass; + + # Serial link Controller with 16 SerDes links at 10 Gbps + # with serial link ranges w.r.t to architecture + system.hmc_host.seriallink = [SerialLink(ranges = options.ser_ranges[i], + req_size=system.hmc_host.link_buffer_size_req, + resp_size=system.hmc_host.link_buffer_size_rsp, + num_lanes=system.hmc_host.num_lanes_per_link, + 
link_speed=system.hmc_host.serial_link_speed, + delay=system.hmc_host.total_ctrl_latency) + for i in xrange(system.hmc_host.num_serial_links)] + + # enable global monitor + if system.hmc_host.enable_global_monitor: + system.hmc_host.lmonitor = [ CommMonitor() + for i in xrange(system.hmc_host.num_serial_links)] + + # set the clock frequency for serial link + for i in xrange(system.hmc_host.num_serial_links): + system.hmc_host.seriallink[i].clk_domain = SrcClockDomain(clock=system. + hmc_host.link_controller_frequency, voltage_domain= + VoltageDomain(voltage = '1V')) + + # Connect membus/traffic gen to Serial Link Controller for differrent HMC + # architectures + if options.arch == "distributed": + for i in xrange(system.hmc_host.num_links_controllers): + if system.hmc_host.enable_global_monitor: + system.membus.master = system.hmc_host.lmonitor[i].slave + system.hmc_host.lmonitor[i].master = \ + system.hmc_host.seriallink[i].slave + else: + system.membus.master = system.hmc_host.seriallink[i].slave + if options.arch == "mixed": + if system.hmc_host.enable_global_monitor: + system.membus.master = system.hmc_host.lmonitor[0].slave + system.hmc_host.lmonitor[0].master = \ + system.hmc_host.seriallink[0].slave + + system.membus.master = system.hmc_host.lmonitor[1].slave + system.hmc_host.lmonitor[1].master = \ + system.hmc_host.seriallink[1].slave + + system.tgen[2].port = system.hmc_host.lmonitor[2].slave + system.hmc_host.lmonitor[2].master = \ + system.hmc_host.seriallink[2].slave + + system.tgen[3].port = system.hmc_host.lmonitor[3].slave + system.hmc_host.lmonitor[3].master = \ + system.hmc_host.seriallink[3].slave + else: + system.membus.master = system.hmc_host.seriallink[0].slave + system.membus.master = system.hmc_host.seriallink[1].slave + system.tgen[2].port = system.hmc_host.seriallink[2].slave + system.tgen[3].port = system.hmc_host.seriallink[3].slave + if options.arch == "same" : + for i in xrange(system.hmc_host.num_links_controllers): + if system.hmc_host.enable_global_monitor: + system.tgen[i].port = system.hmc_host.lmonitor[i].slave + system.hmc_host.lmonitor[i].master = \ + system.hmc_host.seriallink[i].slave + else: + system.tgen[i].port = system.hmc_host.seriallink[i].slave + + return system # Create an HMC device and attach it to the current system -def config_hmc(options, system): +def config_hmc(options, system, hmc_host): - system.hmc = HMCSystem() + # Create HMC device + system.hmc_dev = HMCSystem() - system.buffer = Bridge(ranges=system.mem_ranges, - req_size=system.hmc.ctrl_buffer_size_req, - resp_size=system.hmc.ctrl_buffer_size_rsp, - delay=system.hmc.ctrl_static_latency) + # Global monitor try: - system.hmc.enable_global_monitor = options.enable_global_monitor + system.hmc_dev.enable_global_monitor = options.enable_global_monitor except: pass; try: - system.hmc.enable_link_monitor = options.enable_link_monitor + system.hmc_dev.enable_link_monitor = options.enable_link_monitor except: pass; - system.membus.master = system.buffer.slave - - # The HMC controller (Clock domain is the same as the host) - system.hmccontroller = HMCController(width=(system.hmc.num_lanes_per_link. - value * system.hmc.num_serial_links/8), - frontend_latency=system.hmc.ctrl_latency, - forward_latency=system.hmc.link_overhead, - response_latency=system.hmc.link_overhead) - - system.hmccontroller.clk_domain = SrcClockDomain(clock=system.hmc. 
- link_frequency, voltage_domain = VoltageDomain(voltage = '1V')) - - # Serial Links - system.hmc.seriallink =[ SerialLink(ranges = system.mem_ranges, - req_size=system.hmc.link_buffer_size_req, - resp_size=system.hmc.link_buffer_size_rsp, - num_lanes=system.hmc.num_lanes_per_link, - delay=system.hmc.link_latency) - for i in xrange(system.hmc.num_serial_links)] - - if system.hmc.enable_link_monitor: - system.hmc.lmonitor = [ CommMonitor() - for i in xrange(system.hmc.num_serial_links)] - - # The HMC Crossbar located in its logic-base (LoB) - system.hmc.xbar = NoncoherentXBar(width = system.hmc.xbar_width, - frontend_latency=system.hmc.xbar_frontend_latency, - forward_latency=system.hmc.xbar_forward_latency, - response_latency=system.hmc.xbar_response_latency ) - system.hmc.xbar.clk_domain = SrcClockDomain(clock = - system.hmc.xbar_frequency, voltage_domain = - VoltageDomain(voltage = '1V')) - - if system.hmc.enable_global_monitor: - system.gmonitor = CommMonitor() - system.buffer.master = system.gmonitor.slave - system.gmonitor.master = system.hmccontroller.slave - else: - system.hmccontroller.slave = system.buffer.master - - for i in xrange(system.hmc.num_serial_links): - system.hmccontroller.master = system.hmc.seriallink[i].slave - system.hmc.seriallink[i].clk_domain = system.hmccontroller.clk_domain; - if system.hmc.enable_link_monitor: - system.hmc.seriallink[i].master = system.hmc.lmonitor[i].slave - system.hmc.lmonitor[i].master = system.hmc.xbar.slave + + if system.hmc_dev.enable_link_monitor: + system.hmc_dev.lmonitor = [ CommMonitor() + for i in xrange(system.hmc_dev.num_links_controllers)] + + # 4 HMC Crossbars located in its logic-base (LoB) + system.hmc_dev.xbar = [ NoncoherentXBar(width=system.hmc_dev.xbar_width, + frontend_latency=system.hmc_dev.xbar_frontend_latency, + forward_latency=system.hmc_dev.xbar_forward_latency, + response_latency=system.hmc_dev.xbar_response_latency ) + for i in xrange(system.hmc_host.number_mem_crossbar)] + + for i in xrange(system.hmc_dev.number_mem_crossbar): + system.hmc_dev.xbar[i].clk_domain = SrcClockDomain( + clock=system.hmc_dev.xbar_frequency,voltage_domain= + VoltageDomain(voltage='1V')) + + # Attach 4 serial link to 4 crossbar/s + for i in xrange(system.hmc_dev.num_serial_links): + if system.hmc_dev.enable_link_monitor: + system.hmc_host.seriallink[i].master = \ + system.hmc_dev.lmonitor[i].slave + system.hmc_dev.lmonitor[i].master = system.hmc_dev.xbar[i].slave else: - system.hmc.seriallink[i].master = system.hmc.xbar.slave + system.hmc_host.seriallink[i].master = system.hmc_dev.xbar[i].slave + + # Connecting xbar with each other for request arriving at the wrong xbar, + # then it will be forward to correct xbar. 
Bridge is used to connect xbars + if options.arch == "same": + numx = len(system.hmc_dev.xbar) + + # create a list of buffers + system.hmc_dev.buffers = [ Bridge( + req_size=system.hmc_dev.xbar_buffer_size_req, + resp_size=system.hmc_dev.xbar_buffer_size_resp) + for i in xrange(numx * (system.hmc_dev.mem_chunk - 1))] + + # Buffer iterator + it = iter(range(len(system.hmc_dev.buffers))) + + # necesarry to add system_port to one of the xbar + system.system_port = system.hmc_dev.xbar[3].slave + + # iterate over all the crossbars and connect them as required + for i in range(numx): + for j in range(numx): + # connect xbar to all other xbars except itself + if i != j: + # get the next index of buffer + index = it.next() + + # Change the default values for ranges of bridge + system.hmc_dev.buffers[index].ranges = system.mem_ranges[ + j * int(system.hmc_dev.mem_chunk): + (j + 1) * int(system.hmc_dev.mem_chunk)] + + # Connect the bridge between corssbars + system.hmc_dev.xbar[i].master = system.hmc_dev.buffers[ + index].slave + system.hmc_dev.buffers[ + index].master = system.hmc_dev.xbar[j].slave + else: + # Don't connect the xbar to itself + pass + + # Two crossbars are connected to all other crossbars-Other 2 vault + # can only direct traffic to it local vaults + if options.arch == "mixed": + + system.hmc_dev.buffer30 = Bridge(ranges=system.mem_ranges[0:4]) + system.hmc_dev.xbar[3].master = system.hmc_dev.buffer30.slave + system.hmc_dev.buffer30.master = system.hmc_dev.xbar[0].slave + + system.hmc_dev.buffer31 = Bridge(ranges=system.mem_ranges[4:8]) + system.hmc_dev.xbar[3].master = system.hmc_dev.buffer31.slave + system.hmc_dev.buffer31.master = system.hmc_dev.xbar[1].slave + + system.hmc_dev.buffer32 = Bridge(ranges=system.mem_ranges[8:12]) + system.hmc_dev.xbar[3].master = system.hmc_dev.buffer32.slave + system.hmc_dev.buffer32.master = system.hmc_dev.xbar[2].slave + + + system.hmc_dev.buffer20 = Bridge(ranges=system.mem_ranges[0:4]) + system.hmc_dev.xbar[2].master = system.hmc_dev.buffer20.slave + system.hmc_dev.buffer20.master = system.hmc_dev.xbar[0].slave + + system.hmc_dev.buffer21 = Bridge(ranges=system.mem_ranges[4:8]) + system.hmc_dev.xbar[2].master = system.hmc_dev.buffer21.slave + system.hmc_dev.buffer21.master = system.hmc_dev.xbar[1].slave + + system.hmc_dev.buffer23 = Bridge(ranges=system.mem_ranges[12:16]) + system.hmc_dev.xbar[2].master = system.hmc_dev.buffer23.slave + system.hmc_dev.buffer23.master = system.hmc_dev.xbar[3].slave + diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py index 4685cd5d1..71e3bf460 100644 --- a/configs/common/MemConfig.py +++ b/configs/common/MemConfig.py @@ -153,9 +153,10 @@ def config_mem(options, system): """ if ( options.mem_type == "HMC_2500_x32"): - HMC.config_hmc(options, system) - subsystem = system.hmc - xbar = system.hmc.xbar + HMChost = HMC.config_host_hmc(options, system) + HMC.config_hmc(options, system, HMChost.hmc_host) + subsystem = system.hmc_dev + xbar = system.hmc_dev.xbar else: subsystem = system xbar = system.membus @@ -222,4 +223,7 @@ def config_mem(options, system): # Connect the controllers to the membus for i in xrange(len(subsystem.mem_ctrls)): - subsystem.mem_ctrls[i].port = xbar.master + if (options.mem_type == "HMC_2500_x32"): + subsystem.mem_ctrls[i].port = xbar[i/4].master + else: + subsystem.mem_ctrls[i].port = xbar.master diff --git a/configs/example/hmctest.py b/configs/example/hmctest.py new file mode 100644 index 000000000..bd6ca24d1 --- /dev/null +++ b/configs/example/hmctest.py @@ -0,0 +1,170 
@@ +import optparse +import sys +import subprocess + +import m5 +from m5.objects import * +from m5.util import addToPath + +addToPath('../common') +import MemConfig +import HMC + +parser = optparse.OptionParser() + +# Use a HMC_2500_x32 by default +parser.add_option("--mem-type", type = "choice", default = "HMC_2500_x32", + choices = MemConfig.mem_names(), + help = "type of memory to use") + +parser.add_option("--ranks", "-r", type = "int", default = 1, + help = "Number of ranks to iterate across") + +parser.add_option("--rd_perc", type ="int", default=100, + help = "Percentage of read commands") + +parser.add_option("--mode", type ="choice", default ="DRAM", + choices = ["DRAM", "DRAM_ROTATE", "RANDOM"], + help = "DRAM: Random traffic; \ + DRAM_ROTATE: Traffic rotating across banks and ranks" + ) + +parser.add_option("--addr_map", type ="int", default = 1, + help = "0: RoCoRaBaCh; 1: RoRaBaCoCh/RoRaBaChCo") + +parser.add_option("--arch", type = "choice", default = "distributed", + choices = ["same", "distributed", "mixed"], + help = "same: HMC-4 links with same range\ + distributed: HMC-4 links with distributed range\ + mixed: mixed with same & distributed range") + +parser.add_option("--linkaggr", type = "int", default = 0, + help = "1: enable link crossbar, 0: disable link crossbar") + +parser.add_option("--num_cross", type = "int", default = 4, + help = "1: number of crossbar in HMC=1;\ + 4: number of crossbar = 4") + +parser.add_option("--tlm-memory", type = "string", + help="use external port for SystemC TLM cosimulation") + +parser.add_option("--elastic-trace-en", action ="store_true", + help = """Enable capture of data dependency and instruction + fetch traces using elastic trace probe.""") + +(options, args) = parser.parse_args() + +if args: + print "Error: script doesn't take any positional arguments" + sys.exit(1) + +system = System() +system.clk_domain = SrcClockDomain(clock='100GHz', + voltage_domain= + VoltageDomain(voltage = '1V')) +# Create additional crossbar for arch1 +if options.arch == "distributed" or options.arch == "mixed" : + system.membus = NoncoherentXBar( width=8 ) + system.membus.badaddr_responder = BadAddr() + system.membus.default = Self.badaddr_responder.pio + system.membus.width = 8 + system.membus.frontend_latency = 3 + system.membus.forward_latency = 4 + system.membus.response_latency = 2 + + system.membus.clk_domain = SrcClockDomain(clock='100GHz', voltage_domain= + VoltageDomain(voltage = '1V')) + +# we are considering 4GB HMC device with following parameters +# hmc_device_size = '4GB' +# hmc_num_vaults = 16 +# hmc_vault_size = '256MB' +# hmc_stack_size = 8 +# hmc_bank_in_stack = 2 +# hmc_bank_size = '16MB' +# hmc_bank_in_vault = 16 + +# determine the burst length in bytes +burst_size = 256 +num_serial_links = 4 +num_vault_ctrl = 16 +options.mem_channels = 1 +options.external_memory_system = 0 +options.mem_ranks=1 +stride_size = burst_size +system.cache_line_size = burst_size + +# Enable performance monitoring +options.enable_global_monitor = True +options.enable_link_monitor = False + +# Bytes used for calculations +oneGBytes = 1024 * 1024 * 1024 +oneMBytes = 1024 * 1024 + +# Memory ranges of 16 vault controller - Total_HMC_size / 16 +mem_range_vault = [ AddrRange(i * 256 * oneMBytes, ((i + 1) * 256 * oneMBytes) + - 1) + for i in range(num_vault_ctrl)] + +# Memmory ranges of serial link for arch-0 +# Same as the ranges of vault controllers - 4 vault - to - 1 serial link +if options.arch == "same": + ser_range = [ AddrRange(0, (4 * oneGBytes) - 1) + for 
i in range(num_serial_links)] + options.ser_ranges = ser_range + +# Memmory ranges of serial link for arch-1 +# Distributed range accross links +if options.arch == "distributed": + ser_range = [ AddrRange(i * oneGBytes, ((i + 1) * oneGBytes) - 1) + for i in range(num_serial_links)] + options.ser_ranges = ser_range + +# Memmory ranges of serial link for arch-2 +# "Mixed" address distribution over links +if options.arch == "mixed": + ser_range0 = AddrRange(0 , (1 * oneGBytes) - 1) + ser_range1 = AddrRange(1 * oneGBytes , (2 * oneGBytes) - 1) + ser_range2 = AddrRange(0 , (4 * oneGBytes) - 1) + ser_range3 = AddrRange(0 , (4 * oneGBytes) - 1) + options.ser_ranges = [ser_range0, ser_range1, ser_range2, ser_range3] + +# Assign ranges of vault controller to system ranges +system.mem_ranges = mem_range_vault + +# open traffic generator +cfg_file_name = "./tests/quick/se/70.tgen/traffic.cfg" +cfg_file = open(cfg_file_name, 'r') + +# number of traffic generator +np = 4 +# create a traffic generator, and point it to the file we just created +system.tgen = [ TrafficGen(config_file = cfg_file_name) for i in xrange(np)] + +# Config memory system with given HMC arch +MemConfig.config_mem(options, system) + +if options.arch == "distributed": + for i in xrange(np): + system.tgen[i].port = system.membus.slave + # connect the system port even if it is not used in this example + system.system_port = system.membus.slave + +if options.arch == "mixed": + for i in xrange(int(np/2)): + system.tgen[i].port = system.membus.slave + # connect the system port even if it is not used in this example + system.system_port = system.membus.slave + + +# run Forrest, run! +root = Root(full_system = False, system = system) +root.system.mem_mode = 'timing' + +m5.instantiate() +m5.simulate(10000000000) + +m5.stats.dump() + +print "Done!" diff --git a/ext/drampower/README.md b/ext/drampower/README.md index a43298b01..5d6eb6e82 100644 --- a/ext/drampower/README.md +++ b/ext/drampower/README.md @@ -252,8 +252,8 @@ The tool is based on the DRAM power model developed jointly by the Computer Engi **To cite the DRAMPower Tool:** ``` -[1] "DRAMPower: Open-source DRAM power & energy estimation tool" -Karthik Chandrasekar, Christian Weis, Yonghui Li, Benny Akesson, Norbert Wehn, and Kees Goossens +[1] DRAMPower: Open-source DRAM Power & Energy Estimation Tool +Karthik Chandrasekar, Christian Weis, Yonghui Li, Sven Goossens, Matthias Jung, Omar Naji, Benny Akesson, Norbert Wehn, and Kees Goossens URL: http://www.drampower.info ``` diff --git a/ext/drampower/src/CmdScheduler.cc b/ext/drampower/src/CmdScheduler.cc index bffc5d3bb..a4619b94e 100644 --- a/ext/drampower/src/CmdScheduler.cc +++ b/ext/drampower/src/CmdScheduler.cc @@ -31,7 +31,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * Authors: Karthik Chandrasekar + * Authors: Karthik Chandrasekar, Yonghui Li, Sven Goossens * */ #include "CmdScheduler.h" @@ -42,17 +42,20 @@ #include <algorithm> // For max +#define MILLION 1000000 + + using namespace std; using namespace Data; // Read the traces and get the transaction. Each transaction is executed by // scheduling a number of commands to the memory. Hence, the transactions are // translated into a sequence of commands which will be used for power analysis. 
-void cmdScheduler::transTranslation(MemorySpecification memSpec, +void cmdScheduler::transTranslation(const MemorySpecification& memSpec, ifstream& trans_trace, int grouping, int interleaving, int burst, int powerdown) { commands.open("commands.trace", ifstream::out); - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; nBanks = memArchSpec.nbrOfBanks; nColumns = memArchSpec.nbrOfColumns; burstLength = memArchSpec.burstLength; @@ -77,13 +80,14 @@ void cmdScheduler::transTranslation(MemorySpecification memSpec, } // cmdScheduler::transTranslation // initialize the variables and vectors for starting command scheduling. -void cmdScheduler::schedulingInitialization(MemorySpecification memSpec) +void cmdScheduler::schedulingInitialization(const MemorySpecification& memSpec) { - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - ACT.resize(2 * memSpec.memArchSpec.nbrOfBanks); - RDWR.resize(2 * memSpec.memArchSpec.nbrOfBanks); - PRE.resize(memSpec.memArchSpec.nbrOfBanks); + const size_t numBanks = static_cast<size_t>(memSpec.memArchSpec.nbrOfBanks); + ACT.resize(2 * numBanks); + RDWR.resize(2 * numBanks); + PRE.resize(numBanks); bankaccess = memSpec.memArchSpec.nbrOfBanks; if (!ACT.empty()) { ACT.erase(ACT.begin(), ACT.end()); @@ -96,14 +100,15 @@ void cmdScheduler::schedulingInitialization(MemorySpecification memSpec) } ///////////////initialization////////////// - for (unsigned i = 0; i < memSpec.memArchSpec.nbrOfBanks; i++) { + for (int64_t i = 0; i < memSpec.memArchSpec.nbrOfBanks; i++) { cmd.Type = PRECHARGE; - cmd.bank = i; + cmd.bank = static_cast<unsigned>(i); cmd.name = "PRE"; - if (memSpec.id == "WIDEIO_SDR") - cmd.time = 1 - static_cast<double>(memSpec.memTimingSpec.TAW); - else - cmd.time = 1 - static_cast<double>(memSpec.memTimingSpec.FAW); + if (memSpec.id == "WIDEIO_SDR") { + cmd.time = 1 - memSpec.memTimingSpec.TAW; + } else { + cmd.time = 1 - memSpec.memTimingSpec.FAW; + } PRE.push_back(cmd); @@ -114,7 +119,7 @@ void cmdScheduler::schedulingInitialization(MemorySpecification memSpec) cmd.Type = WRITE; cmd.name = "WRITE"; cmd.time = -1; - RDWR[i].push_back(cmd); + RDWR[static_cast<size_t>(i)].push_back(cmd); } tREF = memTimingSpec.REFI; transFinish.time = 0; @@ -130,14 +135,14 @@ void cmdScheduler::schedulingInitialization(MemorySpecification memSpec) // transactions are generated according to the information read from the traces. // Then the command scheduling function is triggered to generate commands and // schedule them to the memory according to the timing constraints. 
-void cmdScheduler::getTrans(std::ifstream& trans_trace, MemorySpecification memSpec) +void cmdScheduler::getTrans(std::ifstream& trans_trace, const MemorySpecification& memSpec) { std::string line; transTime = 0; - unsigned newtranstime; - unsigned transAddr; - unsigned transType = 1; + uint64_t newtranstime; + uint64_t transAddr; + int64_t transType = 1; trans TransItem; if (!transTrace.empty()) { @@ -147,12 +152,12 @@ void cmdScheduler::getTrans(std::ifstream& trans_trace, MemorySpecification memS while (getline(trans_trace, line)) { istringstream linestream(line); string item; - unsigned itemnum = 0; + uint64_t itemnum = 0; while (getline(linestream, item, ',')) { if (itemnum == 0) { stringstream timestamp(item); timestamp >> newtranstime; - transTime = transTime + newtranstime; + transTime = transTime + static_cast<int64_t>(newtranstime); } else if (itemnum == 1) { if (item == "write" || item == "WRITE") { transType = WRITE; @@ -191,33 +196,35 @@ void cmdScheduler::getTrans(std::ifstream& trans_trace, MemorySpecification memS // be scheduled until all the commands for the current one are scheduled. // After the scheduling, a sequence of commands are obtained and they are written // into commands.txt which will be used for power analysis. -void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) +void cmdScheduler::analyticalScheduling(const MemorySpecification& memSpec) { - int Bs = -1; - int transType = -1; - double timer = 0; - int bankGroupPointer = 0; - int bankGroupAddr = 0; + int64_t transType = -1; + int64_t timer = 0; + uint64_t bankGroupPointer = 0; + uint64_t bankGroupAddr = 0; bool collisionFound; physicalAddr PhysicalAddress; bool bankGroupSwitch = false; - std::vector<unsigned> bankPointer(nbrOfBankGroups, 0); - std::vector<int> bankAccessNum(nBanks, -1); - std::vector<bool> ACTSchedule(nBanks, false); - int bankAddr = -1; - double endTime = 0; - double tComing_REF = 0; + std::vector<uint64_t> bankPointer(static_cast<size_t>(nbrOfBankGroups), 0); + std::vector<int64_t> bankAccessNum(static_cast<size_t>(nBanks), -1); + std::vector<bool> ACTSchedule(static_cast<size_t>(nBanks), false); + uint64_t bankAddr = 0; + int64_t endTime = 0; + int64_t tComing_REF = 0; Inselfrefresh = 0; - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - for (unsigned t = 0; t < transTrace.size(); t++) { + for (uint64_t t = 0; t < transTrace.size(); t++) { cmdScheduling.erase(cmdScheduling.begin(), cmdScheduling.end()); - for (unsigned i = 0; i < nBanks; i++) { - ACTSchedule[i] = false; - bankAccessNum[i] = -1; + for (auto a : ACTSchedule) { + a = false; + } + + for (auto& b : bankAccessNum) { + b = -1; } timingsGet = false; @@ -225,13 +232,13 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) PhysicalAddress = memoryMap(transTrace[t], memSpec); - for (unsigned i = 0; i < nbrOfBankGroups; i++) { - bankPointer[i] = PhysicalAddress.bankAddr; // the bank pointer per group. + for (auto& b : bankPointer) { + b = PhysicalAddress.bankAddr; // the bank pointer per group. } bankGroupPointer = PhysicalAddress.bankGroupAddr; - endTime = max(transFinish.time, PRE[transFinish.bank].time + - static_cast<int>(memTimingSpec.RP)); + endTime = max(transFinish.time, PRE[static_cast<size_t>(transFinish.bank)].time + + static_cast<int>(memTimingSpec.RP)); // Before starting the scheduling for the next transaction, it has to // check whether it is necessary for implementing power down. 
@@ -244,14 +251,12 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) ///////////////Scheduling Refresh//////////////////////// if (((transFinish.time >= tREF) || (timer >= tREF))) { - for (double i = 0; i <= ((timer - tComing_REF) > 0 ? (timer - tComing_REF) / + for (int64_t i = 0; i <= ((timer - tComing_REF) > 0 ? (timer - tComing_REF) / memTimingSpec.REFI : 0); i++) { cmd.bank = 0; cmd.name = "REF"; - cmd.time = max(max(max(transFinish.time, PRE[transFinish.bank].time - + static_cast<int>(memTimingSpec.RP)), tREF), startTime); - if (((power_down == SELF_REFRESH) && !Inselfrefresh) || - (power_down != SELF_REFRESH)) { + cmd.time = max(max(max(transFinish.time, PRE[static_cast<size_t>(transFinish.bank)].time + memTimingSpec.RP), tREF), startTime); + if ((power_down == SELF_REFRESH && !Inselfrefresh) || power_down != SELF_REFRESH) { cmdScheduling.push_back(cmd); startTime = cmd.time + memTimingSpec.RFC; } @@ -262,7 +267,7 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) } } ///////////////Execution Transactions/////////////////// - Bs = PhysicalAddress.bankAddr; + uint64_t Bs = PhysicalAddress.bankAddr; transType = transTrace[t].type; tRWTP = getRWTP(transType, memSpec); @@ -280,9 +285,8 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) bankGroupSwitch = true; } // update to the current bank group address. - bankGroupAddr = PhysicalAddress.bankGroupAddr + j; - bankAddr = bankGroupAddr * nBanks / nbrOfBankGroups + - bankPointer[bankGroupAddr]; + bankGroupAddr = PhysicalAddress.bankGroupAddr + static_cast<uint64_t>(j); + bankAddr = bankGroupAddr * static_cast<uint64_t>(nBanks) / nbrOfBankGroups + bankPointer[bankGroupAddr]; } else { bankAddr = Bs + i; } @@ -312,7 +316,7 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) static_cast<int>(memTimingSpec.TAW)); } - if ((i == 0) && (j == 0)) { + if (i == 0 && j == 0) { cmd.time = max(cmd.time, PreRDWR.time + 1); cmd.time = max(cmd.time, timer); cmd.time = max(startTime, cmd.time); @@ -358,7 +362,7 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) } for (int ACTBank = static_cast<int>(ACT.size() - 1); ACTBank >= 0; ACTBank--) { - if (ACT[ACTBank].bank == bankAddr) { + if (ACT[ACTBank].bank == static_cast<int64_t>(bankAddr)) { cmd.time = max(PreRDWR.time + tSwitch_init, ACT.back().time + static_cast<int>(memTimingSpec.RCD)); break; @@ -392,7 +396,7 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) PRE[bankAddr].name = "PRE"; for (int ACTBank = static_cast<int>(ACT.size() - 1); ACTBank >= 0; ACTBank--) { - if (ACT[ACTBank].bank == bankAddr) { + if (ACT[ACTBank].bank == static_cast<int64_t>(bankAddr)) { PRE[bankAddr].time = max(ACT.back().time + static_cast<int>(memTimingSpec.RAS), PreRDWR.time + tRWTP); @@ -419,7 +423,7 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) /////////////Update Vector Length///////////////// // the vector length is reduced so that less memory is used for running // this tool. - if (ACT.size() >= memSpec.memArchSpec.nbrOfBanks) { + if (ACT.size() >= static_cast<size_t>(memSpec.memArchSpec.nbrOfBanks)) { for (int m = 0; m < BI * BGI; m++) { ACT.erase(ACT.begin()); RDWR[0].erase(RDWR[0].begin(), RDWR[0].end()); @@ -443,14 +447,14 @@ void cmdScheduler::analyticalScheduling(MemorySpecification memSpec) // to add the power down/up during the command scheduling for transactions. 
// It is called when the command scheduling for a transaction is finished, and it // is also called if there is a refresh. -void cmdScheduler::pdScheduling(double endTime, double timer, - MemorySpecification memSpec) +void cmdScheduler::pdScheduling(int64_t endTime, int64_t timer, + const MemorySpecification& memSpec) { - double ZERO = 0; - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + int64_t ZERO = 0; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; endTime = max(endTime, startTime); - double pdTime = max(ZERO, timer - endTime); + int64_t pdTime = max(ZERO, timer - endTime); if ((timer > (endTime + memTimingSpec.CKE)) && (power_down == POWER_DOWN)) { cmd.bank = 0; @@ -490,11 +494,11 @@ void cmdScheduler::pdScheduling(double endTime, double timer, // get the time when a precharge occurs after a read/write command is scheduled. // In addition, it copes with different kind of memories. -int cmdScheduler::getRWTP(int transType, MemorySpecification memSpec) +int64_t cmdScheduler::getRWTP(int64_t transType, const MemorySpecification& memSpec) { - int tRWTP_init = 0; - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + int64_t tRWTP_init = 0; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; if (transType == READ) { switch (memSpec.memoryType) { @@ -506,13 +510,13 @@ int cmdScheduler::getRWTP(int transType, MemorySpecification memSpec) case MemoryType::LPDDR2: case MemoryType::LPDDR3: tRWTP_init = memArchSpec.burstLength / memArchSpec.dataRate + - max(0, static_cast<int>(memTimingSpec.RTP - 2)); + max(int64_t(0), memTimingSpec.RTP - 2); break; case MemoryType::DDR2: tRWTP_init = memTimingSpec.AL + memArchSpec.burstLength / memArchSpec.dataRate + - max(static_cast<int>(memTimingSpec.RTP), 2) - 2; + max(memTimingSpec.RTP, int64_t(2)) - 2; break; case MemoryType::DDR3: @@ -525,10 +529,10 @@ int cmdScheduler::getRWTP(int transType, MemorySpecification memSpec) } else if (transType == WRITE) { if (memSpec.memoryType == MemoryType::WIDEIO_SDR) { tRWTP_init = memTimingSpec.WL + memArchSpec.burstLength / - memArchSpec.dataRate - 1 + memSpec.memTimingSpec.WR; + memArchSpec.dataRate - 1 + memTimingSpec.WR; } else { tRWTP_init = memTimingSpec.WL + memArchSpec.burstLength / - memArchSpec.dataRate + memSpec.memTimingSpec.WR; + memArchSpec.dataRate + memTimingSpec.WR; } if ((memSpec.memoryType == MemoryType::LPDDR2) || (memSpec.memoryType == MemoryType::LPDDR3)) { @@ -543,11 +547,11 @@ int cmdScheduler::getRWTP(int transType, MemorySpecification memSpec) // In particular, tSwitch_init is generally used to provide the timings for // scheduling a read/write command after a read/write command which have been // scheduled to any possible banks within any possible bank groups (DDR4). 
-void cmdScheduler::getTimingConstraints(bool BGSwitch, MemorySpecification memSpec, - int PreType, int CurrentType) +void cmdScheduler::getTimingConstraints(bool BGSwitch, const MemorySpecification& memSpec, + int64_t PreType, int64_t CurrentType) { - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; if (memSpec.memoryType != MemoryType::DDR4) { tRRD_init = memTimingSpec.RRD; @@ -586,7 +590,7 @@ void cmdScheduler::getTimingConstraints(bool BGSwitch, MemorySpecification memSp if (PreType == CurrentType) { tSwitch_init = tCCD_init; timingsGet = true; - } else if ((PreType == WRITE) && (CurrentType == READ)) { + } else if (PreType == WRITE && CurrentType == READ) { tSwitch_init = memTimingSpec.WL + memArchSpec.burstLength / memArchSpec.dataRate + tWTR_init; } @@ -601,59 +605,55 @@ void cmdScheduler::getTimingConstraints(bool BGSwitch, MemorySpecification memSp // The logical address of each transaction is translated into a physical address // which consists of bank group (for DDR4), bank, row and column addresses. cmdScheduler::physicalAddr cmdScheduler::memoryMap(trans Trans, - MemorySpecification memSpec) + const MemorySpecification& memSpec) { - int DecLogic; + int64_t DecLogic; physicalAddr PhysicalAddr; DecLogic = Trans.logicalAddress; // row-bank-column-BI-BC-BGI-BL - if ((BGI > 1) && (memSpec.memoryType == MemoryType::DDR4)) { - unsigned colBits = static_cast<unsigned>(log2(nColumns)); - unsigned bankShift = static_cast<unsigned>(colBits + ((BI > 1) ? log2(BI) : 0) - + ((BGI > 1) ? log2(BGI) : 0)); - unsigned bankMask = static_cast<unsigned>(nBanks / (BI * nbrOfBankGroups) - 1) - << bankShift; - unsigned bankAddr = (DecLogic & bankMask) >> - static_cast<unsigned>(colBits + ((BGI > 1) ? log2(BGI) : 0)); + if (BGI > 1 && memSpec.memoryType == MemoryType::DDR4) { + uint64_t colBits = uintLog2(nColumns); + uint64_t bankShift = colBits + ((BI > 1) ? uintLog2(BI) : 0) + ((BGI > 1) ? uintLog2(BGI) : 0); + uint64_t bankMask = (nBanks / (BI * nbrOfBankGroups) - 1) << bankShift; + uint64_t bankAddr = (DecLogic & bankMask) >> (colBits + ((BGI > 1) ? uintLog2(BGI) : 0)); PhysicalAddr.bankAddr = bankAddr; - unsigned bankGroupShift = static_cast<unsigned>(log2(burstLength)); - unsigned bankGroupMask = (nbrOfBankGroups / BGI - 1) << bankGroupShift; - unsigned bankGroupAddr = (DecLogic & bankGroupMask) >> bankGroupShift; + uint64_t bankGroupShift = uintLog2(burstLength); + uint64_t bankGroupMask = (nbrOfBankGroups / BGI - 1) << bankGroupShift; + uint64_t bankGroupAddr = (DecLogic & bankGroupMask) >> bankGroupShift; PhysicalAddr.bankGroupAddr = bankGroupAddr; - unsigned colShift = static_cast<unsigned>(log2(BC * burstLength) + - ((BI > 1) ? log2(BI) : 0) + ((BGI > 1) ? log2(BGI) : 0)); - unsigned colMask = static_cast<unsigned>(nColumns / (BC * burstLength) - 1) - << colShift; - unsigned colAddr = (DecLogic & colMask) >> - static_cast<unsigned>((colShift - log2(static_cast<unsigned>(BC) * burstLength))); + uint64_t colShift = uintLog2(BC * burstLength) + + ((BI > 1) ? uintLog2(BI) : 0) + ((BGI > 1) ? 
uintLog2(BGI) : 0); + uint64_t colMask = (nColumns / (BC * burstLength) - 1) << colShift; + uint64_t colAddr = (DecLogic & colMask) >> (colShift - uintLog2(static_cast<uint64_t>(BC) * burstLength)); PhysicalAddr.colAddr = colAddr; } else { - unsigned colBits = static_cast<unsigned>(log2(nColumns)); - unsigned bankShift = static_cast<unsigned>(colBits + ((BI > 1) ? log2(BI) : 0)); - unsigned bankMask = static_cast<unsigned>(nBanks / BI - 1) << bankShift; - unsigned bankAddr = (DecLogic & bankMask) >> colBits; + uint64_t colBits = uintLog2(nColumns); + uint64_t bankShift = colBits + ((BI > 1) ? uintLog2(BI) : 0); + uint64_t bankMask = (nBanks / BI - 1) << bankShift; + uint64_t bankAddr = (DecLogic & bankMask) >> colBits; PhysicalAddr.bankAddr = bankAddr; - unsigned colShift = static_cast<unsigned>(log2(BC * burstLength) + - ((BI > 1) ? log2(BI) : 0)); - unsigned colMask = static_cast<unsigned>(nColumns / (BC * burstLength) - 1) - << colShift; - unsigned colAddr = (DecLogic & colMask) >> - static_cast<unsigned>((colShift - log2(static_cast<unsigned>(BC) * burstLength))); + uint64_t colShift = (uintLog2(BC * burstLength) + ((BI > 1) ? uintLog2(BI) : 0)); + uint64_t colMask = (nColumns / (BC * burstLength) - 1) << colShift; + uint64_t colAddr = (DecLogic & colMask) >> (colShift - uintLog2(BC * burstLength)); PhysicalAddr.colAddr = colAddr; PhysicalAddr.bankGroupAddr = 0; } - unsigned rowShift = static_cast<unsigned>(log2(nColumns * nBanks)); - unsigned rowMask = static_cast<unsigned>(memSpec.memArchSpec.nbrOfRows - 1) - << rowShift; - unsigned rowAddr = (DecLogic & rowMask) >> rowShift; + uint64_t rowShift = uintLog2(nColumns * nBanks); + uint64_t rowMask = (memSpec.memArchSpec.nbrOfRows - 1) << rowShift; + uint64_t rowAddr = (DecLogic & rowMask) >> rowShift; PhysicalAddr.rowAddr = rowAddr; return PhysicalAddr; } // cmdScheduler::memoryMap + +uint64_t cmdScheduler::uintLog2(uint64_t in) +{ + return static_cast<uint64_t>(log2(in)); +}
\ No newline at end of file diff --git a/ext/drampower/src/CmdScheduler.h b/ext/drampower/src/CmdScheduler.h index 3c60ea886..58efd279b 100644 --- a/ext/drampower/src/CmdScheduler.h +++ b/ext/drampower/src/CmdScheduler.h @@ -59,9 +59,9 @@ class cmdScheduler { // the format of a transaction. class trans { public: - int type; - double timeStamp; - unsigned logicalAddress; + int64_t type; + int64_t timeStamp; + uint64_t logicalAddress; }; std::vector<trans> transTrace; // to store the transactions. @@ -69,18 +69,18 @@ class cmdScheduler { // the format of physical address. class physicalAddr { public: - unsigned rowAddr; - unsigned bankAddr; - unsigned bankGroupAddr; - unsigned colAddr; + uint64_t rowAddr; + uint64_t bankAddr; + uint64_t bankGroupAddr; + uint64_t colAddr; }; // the format of a command. class commandItem { public: - int Type; - int bank; - double time; + int64_t Type; + int64_t bank; + int64_t time; std::string name; physicalAddr PhysicalAddr; // sorting the commands according to their scheduling time. @@ -107,11 +107,11 @@ class cmdScheduler { std::vector<commandItem> cmdScheduling; std::vector<commandItem> cmdList; unsigned elements; - int BI, BC, BGI; + int64_t BI, BC, BGI; // the function used to translate a transaction into a sequence of // commands which are scheduled to the memory. - void transTranslation(Data::MemorySpecification memSpec, + void transTranslation(const MemorySpecification& memSpec, std::ifstream& trans_trace, int grouping, int interleaving, @@ -119,45 +119,47 @@ class cmdScheduler { int powerdown); // get the transactions by reading the traces. void getTrans(std::ifstream& pwr_trace, - MemorySpecification memSpec); + const MemorySpecification& memSpec); // the initialization function for scheduling. - void schedulingInitialization(MemorySpecification memSpec); + void schedulingInitialization(const MemorySpecification& memSpec); // the function used to schedule commands according to the timing constraints. - void analyticalScheduling(MemorySpecification memSpec); + void analyticalScheduling(const MemorySpecification& memSpec); // translate the logical address into physical address. physicalAddr memoryMap(trans Trans, - MemorySpecification memSpec); + const MemorySpecification& memSpec); // the power down and power up are scheduled by pdScheduling - void pdScheduling(double endTime, - double timer, - MemorySpecification memSpec); + void pdScheduling(int64_t endTime, + int64_t timer, + const MemorySpecification& memSpec); // get the timings for scheduling a precharge since a read or write command // is scheduled. - int getRWTP(int transType, - MemorySpecification memSpec); + int64_t getRWTP(int64_t transType, + const MemorySpecification& memSpec); // get different kind of timing constraints according to the used memory. void getTimingConstraints(bool BGSwitch, - MemorySpecification memSpec, - int PreType, - int CurrentType); + const MemorySpecification& memSpec, + int64_t PreType, + int64_t CurrentType); - double transTime; + uint64_t uintLog2(uint64_t in); + + int64_t transTime; // the flag for power down. 
- int power_down; - int Inselfrefresh; - int tRRD_init; - int tCCD_init; - int tWTR_init; - double tREF; - double tSwitch_init; - double tRWTP; - int bankaccess; - unsigned nBanks; - unsigned nColumns; - unsigned burstLength; - unsigned nbrOfBankGroups; + int64_t power_down; + int64_t Inselfrefresh; + int64_t tRRD_init; + int64_t tCCD_init; + int64_t tWTR_init; + int64_t tREF; + int64_t tSwitch_init; + int64_t tRWTP; + int64_t bankaccess; + int64_t nBanks; + int64_t nColumns; + int64_t burstLength; + int64_t nbrOfBankGroups; bool timingsGet; - double startTime; + int64_t startTime; // the scheduling results for all the transactions are written into // commands which will be used by the power analysis part. diff --git a/ext/drampower/src/CommandAnalysis.cc b/ext/drampower/src/CommandAnalysis.cc index 4dea5c101..e557c2920 100644 --- a/ext/drampower/src/CommandAnalysis.cc +++ b/ext/drampower/src/CommandAnalysis.cc @@ -45,13 +45,34 @@ using namespace Data; using namespace std; -CommandAnalysis::CommandAnalysis() +bool commandSorter(const MemCommand& i, const MemCommand& j) { + if (i.getTimeInt64() == j.getTimeInt64()) { + return i.getType() == MemCommand::PRE && j.getType() != MemCommand::PRE; + } else { + return i.getTimeInt64() < j.getTimeInt64(); + } } -CommandAnalysis::CommandAnalysis(const int nbrofBanks) +CommandAnalysis::CommandAnalysis(const int64_t nbrofBanks) { // Initializing all counters and variables + clearStats(0); + zero = 0; + + bankstate.resize(static_cast<size_t>(nbrofBanks), 0); + last_states.resize(static_cast<size_t>(nbrofBanks)); + mem_state = 0; + num_active_banks = 0; + + cmd_list.clear(); + cached_cmd.clear(); + activation_cycle.resize(static_cast<size_t>(nbrofBanks), 0); +} + +// function to clear counters +void CommandAnalysis::clearStats(const int64_t timestamp) +{ numberofacts = 0; numberofpres = 0; @@ -64,10 +85,6 @@ CommandAnalysis::CommandAnalysis(const int nbrofBanks) s_pre_pdns = 0; numberofsrefs = 0; - pop = 0; - init = 0; - zero = 0; - actcycles = 0; precycles = 0; f_act_pdcycles = 0; @@ -85,28 +102,29 @@ CommandAnalysis::CommandAnalysis(const int nbrofBanks) idlecycles_act = 0; idlecycles_pre = 0; + // reset count references to timestamp so that they are moved + // to start of next stats generation + first_act_cycle = timestamp; + last_pre_cycle = timestamp; + pdn_cycle = timestamp; + sref_cycle = timestamp; + end_act_op = timestamp; + end_read_op = timestamp; + end_write_op = timestamp; + latest_act_cycle = -1; - latest_pre_cycle = -1; latest_read_cycle = -1; latest_write_cycle = -1; - end_read_op = 0; - end_write_op = 0; - end_act_op = 0; - - first_act_cycle = 0; - last_pre_cycle = 0; - bankstate.resize(nbrofBanks, 0); - last_states.resize(nbrofBanks); - mem_state = 0; - - sref_cycle = 0; - pdn_cycle = 0; - - cmd_list.clear(); - full_cmd_list.resize(1, MemCommand::PRE); - cached_cmd.clear(); - activation_cycle.resize(nbrofBanks, 0); + if (timestamp == 0) { + // set to -1 at beginning of simulation + latest_pre_cycle = -1; + } else { + // NOTE: reference is adjusted by tRP (PRE delay) when updating counter + // could remove tRP to ensure counter starts at beginning of next block; + // currently simply setting to timestamp for simplicity + latest_pre_cycle = timestamp; + } } // function to clear all arrays @@ -114,7 +132,6 @@ void CommandAnalysis::clear() { cached_cmd.clear(); cmd_list.clear(); - full_cmd_list.clear(); last_states.clear(); bankstate.clear(); } @@ -125,132 +142,57 @@ void CommandAnalysis::clear() // issued command timestamp, when the 
auto-precharge would kick in void CommandAnalysis::getCommands(const Data::MemorySpecification& memSpec, - const int nbrofBanks, std::vector<MemCommand>& list, bool lastupdate) + std::vector<MemCommand>& list, bool lastupdate) { - for (vector<MemCommand>::const_iterator i = list.begin(); i != list.end(); ++i) { - const MemCommand& cmd = *i; - cmd_list.push_back(cmd); - + for (size_t i = 0; i < list.size(); ++i) { + MemCommand& cmd = list[i]; MemCommand::cmds cmdType = cmd.getType(); if (cmdType == MemCommand::ACT) { activation_cycle[cmd.getBank()] = cmd.getTimeInt64(); } else if (cmdType == MemCommand::RDA || cmdType == MemCommand::WRA) { // Remove auto-precharge flag from command - cmd_list.back().setType(cmd.typeWithoutAutoPrechargeFlag()); + cmd.setType(cmd.typeWithoutAutoPrechargeFlag()); // Add the auto precharge to the list of cached_cmds int64_t preTime = max(cmd.getTimeInt64() + cmd.getPrechargeOffset(memSpec, cmdType), activation_cycle[cmd.getBank()] + memSpec.memTimingSpec.RAS); - cached_cmd.push_back(MemCommand(MemCommand::PRE, cmd.getBank(), static_cast<double>(preTime))); + list.push_back(MemCommand(MemCommand::PRE, cmd.getBank(), preTime)); } } - pop = 0; - // Note: the extra pre-cmds at the end of the lists, and the cast to double - // of the size vector is probably not desirable. - cmd_list.push_back(MemCommand::PRE); - cached_cmd.push_back(MemCommand::PRE); - analyse_commands(nbrofBanks, memSpec, cmd_list.size()-1, - cached_cmd.size()-1, lastupdate); - cmd_list.clear(); - cached_cmd.clear(); -} // CommandAnalysis::getCommands - -// Checks the auto-precharge cached command list and inserts the explicit -// precharges with the appropriate timestamp in the original command list -// (by merging) based on their offset from the issuing command. Calls the -// evaluate function to analyse this expanded list of commands. + sort(list.begin(), list.end(), commandSorter); -void CommandAnalysis::analyse_commands(const int nbrofBanks, - Data::MemorySpecification memSpec, int64_t nCommands, int64_t nCached, bool lastupdate) -{ - full_cmd_list.resize(1, MemCommand::PRE); - unsigned mCommands = 0; - unsigned mCached = 0; - for (unsigned i = 0; i < nCommands + nCached + 1; i++) { - if (cached_cmd.size() > 1) { - if ((cmd_list[mCommands].getTime() > 1) && (init == 0)) { - full_cmd_list[i].setType(MemCommand::PREA); - init = 1; - pop = 1; - } else { - init = 1; - if ((cached_cmd[mCached].getTime() > 0) && (cmd_list. - at(mCommands).getTime() < cached_cmd[mCached]. - getTime()) && ((cmd_list[mCommands].getTime() > 0) || - ((cmd_list[mCommands].getTime() == 0) && (cmd_list[mCommands]. - getType() != MemCommand::PRE)))) { - full_cmd_list[i] = cmd_list[mCommands]; - mCommands++; - } else if ((cached_cmd[mCached].getTime() > 0) && (cmd_list[mCommands]. - getTime() >= cached_cmd[mCached].getTime())) { - full_cmd_list[i] = cached_cmd[mCached]; - mCached++; - } else if (cached_cmd[mCached].getTime() == 0) { - if ((cmd_list[mCommands].getTime() > 0) || ((cmd_list[mCommands]. - getTime() == 0) && (cmd_list[mCommands]. - getType() != MemCommand::PRE))) { - full_cmd_list[i] = cmd_list[mCommands]; - mCommands++; - } - } else if (cmd_list[mCommands].getTime() == 0) { - full_cmd_list[i] = cached_cmd[mCached]; - mCached++; - } - } - } else { - if ((cmd_list[mCommands].getTime() > 1) && (init == 0)) { - full_cmd_list[i].setType(MemCommand::PREA); - init = 1; - pop = 1; - } else { - init = 1; - if ((cmd_list[mCommands].getTime() > 0) || ((cmd_list. - at(mCommands).getTime() == 0) && (cmd_list[mCommands]. 
- getType() != MemCommand::PRE))) { - full_cmd_list[i] = cmd_list[mCommands]; - mCommands++; - } - } - } - full_cmd_list.resize(full_cmd_list.size() + 1, MemCommand::PRE); + if (lastupdate && list.empty() == false) { + // Add cycles at the end of the list + int64_t t = timeToCompletion(memSpec, list.back().getType()) + list.back().getTimeInt64() - 1; + list.push_back(MemCommand(MemCommand::NOP, 0, t)); } - full_cmd_list.pop_back(); - if (pop == 0) { - full_cmd_list.pop_back(); - } - if (lastupdate) { - full_cmd_list.resize(full_cmd_list.size() + 1, MemCommand::NOP); - full_cmd_list[full_cmd_list.size() - 1].setTime(full_cmd_list - [full_cmd_list.size() - 2].getTime() + timeToCompletion(memSpec, - full_cmd_list[full_cmd_list.size() - 2].getType()) - 1); - } + evaluate(memSpec, list); +} // CommandAnalysis::getCommands - evaluate(memSpec, full_cmd_list, nbrofBanks); -} // CommandAnalysis::analyse_commands // To get the time of completion of the issued command // Derived based on JEDEC specifications -int CommandAnalysis::timeToCompletion(const MemorySpecification& +int64_t CommandAnalysis::timeToCompletion(const MemorySpecification& memSpec, MemCommand::cmds type) { - int offset = 0; + int64_t offset = 0; const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; if (type == MemCommand::RD) { - offset = static_cast<int>(memTimingSpec.RL + + offset = memTimingSpec.RL + memTimingSpec.DQSCK + 1 + (memArchSpec.burstLength / - memArchSpec.dataRate)); + memArchSpec.dataRate); } else if (type == MemCommand::WR) { - offset = static_cast<int>(memTimingSpec.WL + + offset = memTimingSpec.WL + (memArchSpec.burstLength / memArchSpec.dataRate) + - memTimingSpec.WR); + memTimingSpec.WR; } else if (type == MemCommand::ACT) { - offset = static_cast<int>(memTimingSpec.RCD); + offset = memTimingSpec.RCD; } else if ((type == MemCommand::PRE) || (type == MemCommand::PREA)) { - offset = static_cast<int>(memTimingSpec.RP); + offset = memTimingSpec.RP; } return offset; } // CommandAnalysis::timeToCompletion @@ -258,38 +200,39 @@ int CommandAnalysis::timeToCompletion(const MemorySpecification& // Used to analyse a given list of commands and identify command timings // and memory state transitions void CommandAnalysis::evaluate(const MemorySpecification& memSpec, - vector<MemCommand>& cmd_list, int nbrofBanks) + vector<MemCommand>& cmd_list) { // for each command identify timestamp, type and bank - for (unsigned cmd_list_counter = 0; cmd_list_counter < cmd_list.size(); - cmd_list_counter++) { + for (auto cmd : cmd_list) { // For command type - int type = cmd_list[cmd_list_counter].getType(); + int type = cmd.getType(); // For command bank - int bank = cmd_list[cmd_list_counter].getBank(); + int bank = static_cast<int>(cmd.getBank()); // Command Issue timestamp in clock cycles (cc) - int64_t timestamp = cmd_list[cmd_list_counter].getTimeInt64(); + int64_t timestamp = cmd.getTimeInt64(); if (type == MemCommand::ACT) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is ACT - update number of acts, bank state of the // target bank, first and latest activation cycle and the memory // state. Update the number of precharged/idle-precharged cycles. 
numberofacts++; - if (bankstate[bank] == 1) { + if (bankstate[static_cast<size_t>(bank)] == 1) { printWarning("Bank is already active!", type, timestamp, bank); } - bankstate[bank] = 1; - if (mem_state == 0) { + bankstate[static_cast<size_t>(bank)] = 1; + if (num_active_banks == 0) { first_act_cycle = timestamp; precycles += max(zero, timestamp - last_pre_cycle); idle_pre_update(memSpec, timestamp, latest_pre_cycle); } latest_act_cycle = timestamp; - mem_state++; + num_active_banks++; } else if (type == MemCommand::RD) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is RD - update number of reads and read cycle. Check // for active idle cycles (if any). - if (bankstate[bank] == 0) { + if (bankstate[static_cast<size_t>(bank)] == 0) { printWarning("Bank is not active!", type, timestamp, bank); } numberofreads++; @@ -297,9 +240,10 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, latest_act_cycle, timestamp); latest_read_cycle = timestamp; } else if (type == MemCommand::WR) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is WR - update number of writes and write cycle. Check // for active idle cycles (if any). - if (bankstate[bank] == 0) { + if (bankstate[static_cast<size_t>(bank)] == 0) { printWarning("Bank is not active!", type, timestamp, bank); } numberofwrites++; @@ -307,6 +251,7 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, latest_act_cycle, timestamp); latest_write_cycle = timestamp; } else if (type == MemCommand::REF) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is REF - update number of refreshes, set bank state of // all banks to ACT, set the last PRE cycles at RFC-RP cycles from // timestamp, set the number of active cycles to RFC-RP and check @@ -321,56 +266,54 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, memSpec.memTimingSpec.RP; latest_pre_cycle = last_pre_cycle; actcycles += memSpec.memTimingSpec.RFC - memSpec.memTimingSpec.RP; - mem_state = 0; - for (int j = 0; j < nbrofBanks; j++) { - bankstate[j] = 0; + num_active_banks = 0; + for (auto& b : bankstate) { + b = 0; } } else if (type == MemCommand::PRE) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is explicit PRE - update number of precharges, bank // state of the target bank and last and latest precharge cycle. // Calculate the number of active cycles if the memory was in the // active state before, but there is a state transition to PRE now. // If not, update the number of precharged cycles and idle cycles. // Update memory state if needed. 
- if (bankstate[bank] == 1) { + if (bankstate[static_cast<size_t>(bank)] == 1) { numberofpres++; } - bankstate[bank] = 0; + bankstate[static_cast<size_t>(bank)] = 0; - if (mem_state == 1) { + if (num_active_banks == 1) { actcycles += max(zero, timestamp - first_act_cycle); last_pre_cycle = timestamp; idle_act_update(memSpec, latest_read_cycle, latest_write_cycle, latest_act_cycle, timestamp); - } else if (mem_state == 0) { + } else if (num_active_banks == 0) { precycles += max(zero, timestamp - last_pre_cycle); idle_pre_update(memSpec, timestamp, latest_pre_cycle); last_pre_cycle = timestamp; } latest_pre_cycle = timestamp; - if (mem_state > 0) { - mem_state--; + if (num_active_banks > 0) { + num_active_banks--; } else { - mem_state = 0; + num_active_banks = 0; } } else if (type == MemCommand::PREA) { + printWarningIfPoweredDown("Command issued while in power-down mode.", type, timestamp, bank); // If command is explicit PREA (precharge all banks) - update // number of precharges by the number of banks, update the bank // state of all banks to PRE and set the precharge cycle. // Calculate the number of active cycles if the memory was in the // active state before, but there is a state transition to PRE now. // If not, update the number of precharged cycles and idle cycles. - if (timestamp == 0) { - numberofpres += 0; - } else { - numberofpres += mem_state; - } + numberofpres += num_active_banks; - if (mem_state > 0) { + if (num_active_banks > 0) { actcycles += max(zero, timestamp - first_act_cycle); idle_act_update(memSpec, latest_read_cycle, latest_write_cycle, latest_act_cycle, timestamp); - } else if (mem_state == 0) { + } else if (num_active_banks == 0) { precycles += max(zero, timestamp - last_pre_cycle); idle_pre_update(memSpec, timestamp, latest_pre_cycle); } @@ -378,10 +321,10 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, latest_pre_cycle = timestamp; last_pre_cycle = timestamp; - mem_state = 0; + num_active_banks = 0; - for (int j = 0; j < nbrofBanks; j++) { - bankstate[j] = 0; + for (auto& b : bankstate) { + b = 0; } } else if (type == MemCommand::PDN_F_ACT) { // If command is fast-exit active power-down - update number of @@ -391,9 +334,7 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, // after powering-up. Update active and active idle cycles. printWarningIfNotActive("All banks are precharged! Incorrect use of Active Power-Down.", type, timestamp, bank); f_act_pdns++; - for (int j = 0; j < nbrofBanks; j++) { - last_states[j] = bankstate[j]; - } + last_states = bankstate; pdn_cycle = timestamp; actcycles += max(zero, timestamp - first_act_cycle); idle_act_update(memSpec, latest_read_cycle, latest_write_cycle, @@ -407,9 +348,7 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, // after powering-up. Update active and active idle cycles. printWarningIfNotActive("All banks are precharged! 
Incorrect use of Active Power-Down.", type, timestamp, bank); s_act_pdns++; - for (int j = 0; j < nbrofBanks; j++) { - last_states[j] = bankstate[j]; - } + last_states = bankstate; pdn_cycle = timestamp; actcycles += max(zero, timestamp - first_act_cycle); idle_act_update(memSpec, latest_read_cycle, latest_write_cycle, @@ -461,14 +400,14 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, memSpec.memTimingSpec.XPDLL - (2 * memSpec.memTimingSpec.RCD)); } - } else if ((mem_state != CommandAnalysis::MS_PDN_S_ACT) || (mem_state != - CommandAnalysis::MS_PDN_F_ACT)) { + } else if (mem_state != CommandAnalysis::MS_PDN_S_ACT || mem_state != CommandAnalysis::MS_PDN_F_ACT) { cerr << "Incorrect use of Active Power-Up!" << endl; } + num_active_banks = 0; mem_state = 0; - for (int j = 0; j < nbrofBanks; j++) { - bankstate[j] = last_states[j]; - mem_state += last_states[j]; + bankstate = last_states; + for (auto& a : last_states) { + num_active_banks += static_cast<unsigned int>(a); } first_act_cycle = timestamp; } else if (type == MemCommand::PUP_PRE) { @@ -493,11 +432,11 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, memSpec.memTimingSpec.XPDLL - memSpec.memTimingSpec.RCD - memSpec.memTimingSpec.RP); } - } else if ((mem_state != CommandAnalysis::MS_PDN_S_PRE) || (mem_state != - CommandAnalysis::MS_PDN_F_PRE)) { + } else if (mem_state != CommandAnalysis::MS_PDN_S_PRE || mem_state != CommandAnalysis::MS_PDN_F_PRE) { cerr << "Incorrect use of Precharged Power-Up!" << endl; } mem_state = 0; + num_active_banks = 0; last_pre_cycle = timestamp; } else if (type == MemCommand::SREN) { // If command is self-refresh - update number of self-refreshes, @@ -583,14 +522,15 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, } } mem_state = 0; - } else if ((type == MemCommand::END) || (type == MemCommand::NOP)) { + num_active_banks = 0; + } else if (type == MemCommand::END || type == MemCommand::NOP) { // May be optionally used at the end of memory trace for better accuracy // Update all counters based on completion of operations. 
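As a side note on the bookkeeping above: the overloaded mem_state bank counter has been split into an explicit num_active_banks count plus the per-bank bankstate vector, and a PREA now simply counts one precharge per currently active bank. A toy illustration of that accounting (warnings, cycle counters and power-down states are omitted, and unlike the real code this sketch does not count a repeated ACT to an already-active bank):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct BankTracker {
        std::vector<int> bankstate;        // 1 = active, 0 = precharged
        unsigned num_active_banks = 0;
        int64_t  numberofpres = 0;

        explicit BankTracker(size_t nbrOfBanks) : bankstate(nbrOfBanks, 0) {}

        void activate(size_t bank) {
            if (bankstate[bank] == 0) { bankstate[bank] = 1; ++num_active_banks; }
        }

        void precharge(size_t bank) {              // explicit PRE
            if (bankstate[bank] == 1) {
                bankstate[bank] = 0;
                ++numberofpres;
                --num_active_banks;
            }
        }

        void prechargeAll() {                      // PREA
            numberofpres += num_active_banks;      // one PRE per active bank
            std::fill(bankstate.begin(), bankstate.end(), 0);
            num_active_banks = 0;
        }
    };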
- if ((mem_state > 0) && (mem_state < 9)) { + if (num_active_banks > 0 && mem_state == 0) { actcycles += max(zero, timestamp - first_act_cycle); idle_act_update(memSpec, latest_read_cycle, latest_write_cycle, latest_act_cycle, timestamp); - } else if (mem_state == 0) { + } else if (num_active_banks == 0 && mem_state == 0) { precycles += max(zero, timestamp - last_pre_cycle); idle_pre_update(memSpec, timestamp, latest_pre_cycle); } else if (mem_state == CommandAnalysis::MS_PDN_F_ACT) { @@ -604,6 +544,9 @@ void CommandAnalysis::evaluate(const MemorySpecification& memSpec, } else if (mem_state == CommandAnalysis::MS_SREF) { sref_cycles += max(zero, timestamp - sref_cycle); } + } else { + printWarning("Unknown command given, exiting.", type, timestamp, bank); + exit(-1); } } } // CommandAnalysis::evaluate @@ -646,14 +589,21 @@ void CommandAnalysis::idle_pre_update(const MemorySpecification& memSpec, void CommandAnalysis::printWarningIfActive(const string& warning, int type, int64_t timestamp, int bank) { - if (mem_state != 0) { + if (num_active_banks != 0) { printWarning(warning, type, timestamp, bank); } } void CommandAnalysis::printWarningIfNotActive(const string& warning, int type, int64_t timestamp, int bank) { - if (mem_state == 0) { + if (num_active_banks == 0) { + printWarning(warning, type, timestamp, bank); + } +} + +void CommandAnalysis::printWarningIfPoweredDown(const string& warning, int type, int64_t timestamp, int bank) +{ + if (mem_state != 0) { printWarning(warning, type, timestamp, bank); } } diff --git a/ext/drampower/src/CommandAnalysis.h b/ext/drampower/src/CommandAnalysis.h index b5c7ac778..15261fb2f 100644 --- a/ext/drampower/src/CommandAnalysis.h +++ b/ext/drampower/src/CommandAnalysis.h @@ -58,10 +58,8 @@ class CommandAnalysis { MS_PDN_S_PRE = 13, MS_SREF = 14 }; - CommandAnalysis(); - // Returns number of reads, writes, acts, pres and refs in the trace - CommandAnalysis(const int nbrofBanks); + CommandAnalysis(const int64_t nbrofBanks); // Number of activate commands int64_t numberofacts; @@ -117,29 +115,25 @@ class CommandAnalysis { // Number of precharged auto-refresh cycles during self-refresh exit int64_t spup_ref_pre_cycles; + // function for clearing counters + void clearStats(const int64_t timestamp); + // function for clearing arrays void clear(); // To identify auto-precharges void getCommands(const MemorySpecification& memSpec, - const int - nbrofBanks, std::vector<MemCommand>& list, bool lastupdate); private: - unsigned init; int64_t zero; - unsigned pop; // Cached last read command from the file std::vector<MemCommand> cached_cmd; // Stores the memory commands for analysis std::vector<MemCommand> cmd_list; - // Stores all memory commands for analysis - std::vector<MemCommand> full_cmd_list; - // To save states of the different banks, before entering active // power-down mode (slow/fast-exit). 
std::vector<int> last_states; @@ -171,26 +165,20 @@ class CommandAnalysis { // Memory State unsigned mem_state; + unsigned num_active_banks; // Clock cycle of first activate command when memory state changes to ACT int64_t first_act_cycle; // Clock cycle of last precharge command when memory state changes to PRE int64_t last_pre_cycle; - // To collect and analyse all commands including auto-precharges - void analyse_commands(const int nbrofBanks, - Data::MemorySpecification - memSpec, - int64_t nCommands, - int64_t nCached, - bool lastupdate); + // To perform timing analysis of a given set of commands and update command counters void evaluate(const MemorySpecification& memSpec, - std::vector<MemCommand>& cmd_list, - int nbrofBanks); + std::vector<MemCommand>& cmd_list); // To calculate time of completion of any issued command - int timeToCompletion(const MemorySpecification& memSpec, + int64_t timeToCompletion(const MemorySpecification& memSpec, MemCommand::cmds type); // To update idle period information whenever active cycles may be idle @@ -207,6 +195,7 @@ class CommandAnalysis { void printWarningIfActive(const std::string& warning, int type, int64_t timestamp, int bank); void printWarningIfNotActive(const std::string& warning, int type, int64_t timestamp, int bank); + void printWarningIfPoweredDown(const std::string& warning, int type, int64_t timestamp, int bank); void printWarning(const std::string& warning, int type, int64_t timestamp, int bank); }; } diff --git a/ext/drampower/src/MemArchitectureSpec.h b/ext/drampower/src/MemArchitectureSpec.h index ca79edc91..49eddc8ac 100644 --- a/ext/drampower/src/MemArchitectureSpec.h +++ b/ext/drampower/src/MemArchitectureSpec.h @@ -31,13 +31,15 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* - * Authors: Karthik Chandrasekar + * Authors: Karthik Chandrasekar, Sven Goossens * */ #ifndef TOOLS_MEM_ARCHITECTURE_SPEC_H #define TOOLS_MEM_ARCHITECTURE_SPEC_H +#include <stdint.h> + #include "Parametrisable.h" namespace Data { @@ -46,14 +48,14 @@ class MemArchitectureSpec : public virtual Parametrisable { MemArchitectureSpec(); void processParameters(); - unsigned int burstLength; - unsigned nbrOfBanks; - unsigned nbrOfRanks; - unsigned dataRate; - unsigned nbrOfColumns; - unsigned nbrOfRows; - unsigned width; - unsigned nbrOfBankGroups; + int64_t burstLength; + int64_t nbrOfBanks; + int64_t nbrOfRanks; + int64_t dataRate; + int64_t nbrOfColumns; + int64_t nbrOfRows; + int64_t width; + int64_t nbrOfBankGroups; bool dll; bool twoVoltageDomains; bool termination; diff --git a/ext/drampower/src/MemCommand.cc b/ext/drampower/src/MemCommand.cc index 156716c2f..5e1115e05 100644 --- a/ext/drampower/src/MemCommand.cc +++ b/ext/drampower/src/MemCommand.cc @@ -44,15 +44,9 @@ using namespace Data; using namespace std; -MemCommand::MemCommand() : - type(MemCommand::PRE), - bank(0), - timestamp(0) -{ -} MemCommand::MemCommand(MemCommand::cmds type, - unsigned bank, double timestamp) : + unsigned bank, int64_t timestamp) : type(type), bank(bank), timestamp(timestamp) @@ -80,35 +74,35 @@ unsigned MemCommand::getBank() const } // For auto-precharge with read or write - to calculate cycle of precharge -int MemCommand::getPrechargeOffset(const MemorySpecification& memSpec, +int64_t MemCommand::getPrechargeOffset(const MemorySpecification& memSpec, MemCommand::cmds type) const { - int precharge_offset = 0; + int64_t precharge_offset = 0; - int BL(static_cast<int>(memSpec.memArchSpec.burstLength)); - int RTP(static_cast<int>(memSpec.memTimingSpec.RTP)); - int dataRate(static_cast<int>(memSpec.memArchSpec.dataRate)); - int AL(static_cast<int>(memSpec.memTimingSpec.AL)); - int WL(static_cast<int>(memSpec.memTimingSpec.WL)); - int WR(static_cast<int>(memSpec.memTimingSpec.WR)); - int B = BL/dataRate; + int64_t BL = memSpec.memArchSpec.burstLength; + int64_t RTP = memSpec.memTimingSpec.RTP; + int64_t dataRate = memSpec.memArchSpec.dataRate; + int64_t AL = memSpec.memTimingSpec.AL; + int64_t WL = memSpec.memTimingSpec.WL; + int64_t WR = memSpec.memTimingSpec.WR; + int64_t B = BL/dataRate; const MemoryType::MemoryType_t& memType = memSpec.memoryType; // Read with auto-precharge if (type == MemCommand::RDA) { if (memType == MemoryType::DDR2) { - precharge_offset = B + AL - 2 + max(RTP, 2); + precharge_offset = B + AL - 2 + max(RTP, int64_t(2)); } else if (memType == MemoryType::DDR3) { - precharge_offset = AL + max(RTP, 4); + precharge_offset = AL + max(RTP, int64_t(4)); } else if (memType == MemoryType::DDR4) { precharge_offset = AL + RTP; } else if (memType == MemoryType::LPDDR) { precharge_offset = B; } else if (memType == MemoryType::LPDDR2) { - precharge_offset = B + max(0, RTP - 2); + precharge_offset = B + max(int64_t(0), RTP - 2); } else if (memType == MemoryType::LPDDR3) { - precharge_offset = B + max(0, RTP - 4); + precharge_offset = B + max(int64_t(0), RTP - 4); } else if (memType == MemoryType::WIDEIO_SDR) { precharge_offset = B; } @@ -133,19 +127,14 @@ int MemCommand::getPrechargeOffset(const MemorySpecification& memSpec, return precharge_offset; } // MemCommand::getPrechargeOffset -void MemCommand::setTime(double _timestamp) +void MemCommand::setTime(int64_t _timestamp) { timestamp = _timestamp; } -double MemCommand::getTime() const -{ - return timestamp; -} - int64_t MemCommand::getTimeInt64() 
const { - return static_cast<int64_t>(timestamp); + return timestamp; } MemCommand::cmds MemCommand::typeWithoutAutoPrechargeFlag() const diff --git a/ext/drampower/src/MemCommand.h b/ext/drampower/src/MemCommand.h index ea7164577..9eb751088 100644 --- a/ext/drampower/src/MemCommand.h +++ b/ext/drampower/src/MemCommand.h @@ -86,17 +86,18 @@ class MemCommand { PUP_ACT = 14, SREN = 15, SREX = 16, - NOP = 17 + NOP = 17, + UNINITIALIZED = 18 }; - MemCommand(); +// MemCommand(); MemCommand( // Command Type - MemCommand::cmds type, + MemCommand::cmds type = UNINITIALIZED, // Target Bank unsigned bank = 0, // Command Issue Timestamp (in cc) - double timestamp = 0); + int64_t timestamp = 0L); // Get command type cmds getType() const; @@ -111,16 +112,15 @@ class MemCommand { unsigned getBank() const; // Set timestamp - void setTime(double _timestamp); + void setTime(int64_t _timestamp); // Get timestamp - double getTime() const; int64_t getTimeInt64() const; cmds typeWithoutAutoPrechargeFlag() const; // To calculate precharge offset after read or write with auto-precharge - int getPrechargeOffset(const MemorySpecification& memSpec, + int64_t getPrechargeOffset(const MemorySpecification& memSpec, MemCommand::cmds type) const; // To check for equivalence @@ -136,19 +136,35 @@ class MemCommand { } } - static const unsigned int nCommands = 18; + static const unsigned int nCommands = 19; static std::string* getCommandTypeStrings() { - static std::string type_map[nCommands] = { "ACT", "RD", "WR", "PRE", "REF", - "END", "RDA", "WRA", "PREA", "PDN_F_PRE","PDN_S_PRE", "PDN_F_ACT", - "PDN_S_ACT", "PUP_PRE", "PUP_ACT", "SREN", "SREX", "NOP" }; + static std::string type_map[nCommands] = { "ACT", + "RD", + "WR", + "PRE", + "REF", + "END", + "RDA", + "WRA", + "PREA", + "PDN_F_PRE", + "PDN_S_PRE", + "PDN_F_ACT", + "PDN_S_ACT", + "PUP_PRE", + "PUP_ACT", + "SREN", + "SREX", + "NOP", + "UNINITIALIZED" }; return type_map; } // To identify command type from name - static cmds getTypeFromName(const std::string name) + static cmds getTypeFromName(const std::string& name) { std::string* typeStrings = getCommandTypeStrings(); @@ -165,7 +181,7 @@ class MemCommand { private: MemCommand::cmds type; unsigned bank; - double timestamp; + int64_t timestamp; }; } #endif // ifndef MEMCOMMAND_H diff --git a/ext/drampower/src/MemTimingSpec.h b/ext/drampower/src/MemTimingSpec.h index 1c3a80c6e..104bf5c71 100644 --- a/ext/drampower/src/MemTimingSpec.h +++ b/ext/drampower/src/MemTimingSpec.h @@ -31,10 +31,12 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
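For reference, the read-with-auto-precharge offsets computed in getPrechargeOffset() above (now pure int64_t cycle arithmetic) reduce to the following per memory type, with B = burstLength / dataRate. This is only a restatement of the RDA cases visible in this hunk, not the library function itself:

    #include <algorithm>
    #include <cstdint>

    enum class Mem { DDR2, DDR3, DDR4, LPDDR, LPDDR2, LPDDR3, WIDEIO_SDR };

    // Cycles from an RDA command until its implicit precharge may be issued.
    int64_t rdaPrechargeOffset(Mem m, int64_t BL, int64_t dataRate,
                               int64_t AL, int64_t RTP)
    {
        const int64_t B = BL / dataRate;
        switch (m) {
          case Mem::DDR2:       return B + AL - 2 + std::max<int64_t>(RTP, 2);
          case Mem::DDR3:       return AL + std::max<int64_t>(RTP, 4);
          case Mem::DDR4:       return AL + RTP;
          case Mem::LPDDR:      return B;
          case Mem::LPDDR2:     return B + std::max<int64_t>(0, RTP - 2);
          case Mem::LPDDR3:     return B + std::max<int64_t>(0, RTP - 4);
          case Mem::WIDEIO_SDR: return B;
        }
        return 0;
    }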
* - * Authors: Karthik Chandrasekar + * Authors: Karthik Chandrasekar, Sven Goossens * */ +#include <stdint.h> + #include "Parametrisable.h" namespace Data { @@ -44,35 +46,35 @@ class MemTimingSpec : public virtual Parametrisable { void processParameters(); double clkMhz; - unsigned RC; - unsigned RCD; - unsigned CCD; - unsigned CCD_S; - unsigned CCD_L; - unsigned RRD; - unsigned RRD_S; - unsigned RRD_L; - unsigned FAW; - unsigned TAW; - unsigned WTR; - unsigned WTR_S; - unsigned WTR_L; - unsigned REFI; - unsigned RL; - unsigned RP; - unsigned RFC; - unsigned RAS; - unsigned WL; - unsigned AL; - unsigned DQSCK; - unsigned RTP; - unsigned WR; - unsigned XP; - unsigned XPDLL; - unsigned XS; - unsigned XSDLL; - unsigned CKE; - unsigned CKESR; + int64_t RC; + int64_t RCD; + int64_t CCD; + int64_t CCD_S; + int64_t CCD_L; + int64_t RRD; + int64_t RRD_S; + int64_t RRD_L; + int64_t FAW; + int64_t TAW; + int64_t WTR; + int64_t WTR_S; + int64_t WTR_L; + int64_t REFI; + int64_t RL; + int64_t RP; + int64_t RFC; + int64_t RAS; + int64_t WL; + int64_t AL; + int64_t DQSCK; + int64_t RTP; + int64_t WR; + int64_t XP; + int64_t XPDLL; + int64_t XS; + int64_t XSDLL; + int64_t CKE; + int64_t CKESR; double clkPeriod; }; } diff --git a/ext/drampower/src/MemoryPowerModel.cc b/ext/drampower/src/MemoryPowerModel.cc index 4817d1bb5..e020830e6 100644 --- a/ext/drampower/src/MemoryPowerModel.cc +++ b/ext/drampower/src/MemoryPowerModel.cc @@ -37,23 +37,24 @@ #include "MemoryPowerModel.h" -#include <cmath> // For pow - #include <stdint.h> +#include <cmath> // For pow +#include <iostream> // fmtflags + using namespace std; using namespace Data; // Calculate energy and average power consumption for the given command trace -void MemoryPowerModel::power_calc(MemorySpecification memSpec, - const CommandAnalysis& counters, +void MemoryPowerModel::power_calc(const MemorySpecification& memSpec, + const CommandAnalysis& c, int term) { - MemTimingSpec& t = memSpec.memTimingSpec; - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; - MemPowerSpec& mps = memSpec.memPowerSpec; + const MemTimingSpec& t = memSpec.memTimingSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const MemPowerSpec& mps = memSpec.memPowerSpec; energy.act_energy = 0.0; energy.pre_energy = 0.0; @@ -102,16 +103,16 @@ void MemoryPowerModel::power_calc(MemorySpecification memSpec, // 1 DQS and 1 DM pin is associated with every data byte int64_t dqPlusDqsPlusMaskBits = memArchSpec.width + memArchSpec.width / 8 + memArchSpec.width / 8; // Size of one clock period for the data bus. 
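A quick numeric illustration of the I/O pin-count and data-bus period terms used by power_calc() above, assuming (purely as an example) a x16 interface at double data rate with a 1.25 ns clock:

    #include <cstdint>
    #include <iostream>

    int main()
    {
        const int64_t width     = 16;    // DQ pins (example x16 device)
        const int64_t dataRate  = 2;     // transfers per clock (DDR)
        const double  clkPeriod = 1.25;  // ns (example)

        // One DQS and one DM pin per data byte, as in power_calc() above.
        const int64_t dqPlusDqsBits         = width + width / 8;             // 18
        const int64_t dqPlusDqsPlusMaskBits = width + width / 8 + width / 8; // 20

        // Size of one clock period for the data bus.
        const double ddrPeriod = clkPeriod / static_cast<double>(dataRate);  // 0.625 ns

        std::cout << dqPlusDqsBits << ' ' << dqPlusDqsPlusMaskBits << ' '
                  << ddrPeriod << std::endl;
        return 0;
    }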
- double ddrPeriod = t.clkPeriod / memArchSpec.dataRate; + double ddrPeriod = t.clkPeriod / static_cast<double>(memArchSpec.dataRate); // Read IO power is consumed by each DQ (data) and DQS (data strobe) pin - energy.read_io_energy = calcIoTermEnergy(counters.numberofreads * memArchSpec.burstLength, + energy.read_io_energy = calcIoTermEnergy(c.numberofreads * memArchSpec.burstLength, ddrPeriod, power.IO_power, dqPlusDqsBits); // Write ODT power is consumed by each DQ (data), DQS (data strobe) and DM - energy.write_term_energy = calcIoTermEnergy(counters.numberofwrites * memArchSpec.burstLength, + energy.write_term_energy = calcIoTermEnergy(c.numberofwrites * memArchSpec.burstLength, ddrPeriod, power.WR_ODT_power, dqPlusDqsPlusMaskBits); @@ -119,14 +120,14 @@ void MemoryPowerModel::power_calc(MemorySpecification memSpec, if (memArchSpec.nbrOfRanks > 1) { // Termination power consumed in the idle rank during reads on the active // rank by each DQ (data) and DQS (data strobe) pin. - energy.read_oterm_energy = calcIoTermEnergy(counters.numberofreads * memArchSpec.burstLength, + energy.read_oterm_energy = calcIoTermEnergy(c.numberofreads * memArchSpec.burstLength, ddrPeriod, power.TermRD_power, dqPlusDqsBits); // Termination power consumed in the idle rank during writes on the active // rank by each DQ (data), DQS (data strobe) and DM (data mask) pin. - energy.write_oterm_energy = calcIoTermEnergy(counters.numberofwrites * memArchSpec.burstLength, + energy.write_oterm_energy = calcIoTermEnergy(c.numberofwrites * memArchSpec.burstLength, ddrPeriod, power.TermWR_power, dqPlusDqsPlusMaskBits); @@ -137,101 +138,101 @@ void MemoryPowerModel::power_calc(MemorySpecification memSpec, + energy.read_oterm_energy + energy.write_oterm_energy; } - total_cycles = counters.actcycles + counters.precycles + - counters.f_act_pdcycles + counters.f_pre_pdcycles + - counters.s_act_pdcycles + counters.s_pre_pdcycles + counters.sref_cycles - + counters.sref_ref_act_cycles + counters.sref_ref_pre_cycles + - counters.spup_ref_act_cycles + counters.spup_ref_pre_cycles; + total_cycles = c.actcycles + c.precycles + + c.f_act_pdcycles + c.f_pre_pdcycles + + c.s_act_pdcycles + c.s_pre_pdcycles + c.sref_cycles + + c.sref_ref_act_cycles + c.sref_ref_pre_cycles + + c.spup_ref_act_cycles + c.spup_ref_pre_cycles; EnergyDomain vdd0Domain(mps.vdd, t.clkPeriod); - energy.act_energy = vdd0Domain.calcTivEnergy(counters.numberofacts * t.RAS , mps.idd0 - mps.idd3n); - energy.pre_energy = vdd0Domain.calcTivEnergy(counters.numberofpres * (t.RC - t.RAS) , mps.idd0 - mps.idd2n); - energy.read_energy = vdd0Domain.calcTivEnergy(counters.numberofreads * burstCc , mps.idd4r - mps.idd3n); - energy.write_energy = vdd0Domain.calcTivEnergy(counters.numberofwrites * burstCc , mps.idd4w - mps.idd3n); - energy.ref_energy = vdd0Domain.calcTivEnergy(counters.numberofrefs * t.RFC , mps.idd5 - mps.idd3n); - energy.pre_stdby_energy = vdd0Domain.calcTivEnergy(counters.precycles, mps.idd2n); - energy.act_stdby_energy = vdd0Domain.calcTivEnergy(counters.actcycles, mps.idd3n); + energy.act_energy = vdd0Domain.calcTivEnergy(c.numberofacts * t.RAS , mps.idd0 - mps.idd3n); + energy.pre_energy = vdd0Domain.calcTivEnergy(c.numberofpres * (t.RC - t.RAS) , mps.idd0 - mps.idd2n); + energy.read_energy = vdd0Domain.calcTivEnergy(c.numberofreads * burstCc , mps.idd4r - mps.idd3n); + energy.write_energy = vdd0Domain.calcTivEnergy(c.numberofwrites * burstCc , mps.idd4w - mps.idd3n); + energy.ref_energy = vdd0Domain.calcTivEnergy(c.numberofrefs * t.RFC , mps.idd5 - 
mps.idd3n); + energy.pre_stdby_energy = vdd0Domain.calcTivEnergy(c.precycles, mps.idd2n); + energy.act_stdby_energy = vdd0Domain.calcTivEnergy(c.actcycles, mps.idd3n); // Idle energy in the active standby clock cycles - energy.idle_energy_act = vdd0Domain.calcTivEnergy(counters.idlecycles_act, mps.idd3n); + energy.idle_energy_act = vdd0Domain.calcTivEnergy(c.idlecycles_act, mps.idd3n); // Idle energy in the precharge standby clock cycles - energy.idle_energy_pre = vdd0Domain.calcTivEnergy(counters.idlecycles_pre, mps.idd2n); + energy.idle_energy_pre = vdd0Domain.calcTivEnergy(c.idlecycles_pre, mps.idd2n); // fast-exit active power-down cycles energy - energy.f_act_pd_energy = vdd0Domain.calcTivEnergy(counters.f_act_pdcycles, mps.idd3p1); + energy.f_act_pd_energy = vdd0Domain.calcTivEnergy(c.f_act_pdcycles, mps.idd3p1); // fast-exit precharged power-down cycles energy - energy.f_pre_pd_energy = vdd0Domain.calcTivEnergy(counters.f_pre_pdcycles, mps.idd2p1); + energy.f_pre_pd_energy = vdd0Domain.calcTivEnergy(c.f_pre_pdcycles, mps.idd2p1); // slow-exit active power-down cycles energy - energy.s_act_pd_energy = vdd0Domain.calcTivEnergy(counters.s_act_pdcycles, mps.idd3p0); + energy.s_act_pd_energy = vdd0Domain.calcTivEnergy(c.s_act_pdcycles, mps.idd3p0); // slow-exit precharged power-down cycles energy - energy.s_pre_pd_energy = vdd0Domain.calcTivEnergy(counters.s_pre_pdcycles, mps.idd2p0); + energy.s_pre_pd_energy = vdd0Domain.calcTivEnergy(c.s_pre_pdcycles, mps.idd2p0); // self-refresh cycles energy including a refresh per self-refresh entry energy.sref_energy = engy_sref(mps.idd6, mps.idd3n, mps.idd5, mps.vdd, - static_cast<double>(counters.sref_cycles), static_cast<double>(counters.sref_ref_act_cycles), - static_cast<double>(counters.sref_ref_pre_cycles), static_cast<double>(counters.spup_ref_act_cycles), - static_cast<double>(counters.spup_ref_pre_cycles), t.clkPeriod); + static_cast<double>(c.sref_cycles), static_cast<double>(c.sref_ref_act_cycles), + static_cast<double>(c.sref_ref_pre_cycles), static_cast<double>(c.spup_ref_act_cycles), + static_cast<double>(c.spup_ref_pre_cycles), t.clkPeriod); // background energy during active auto-refresh cycles in self-refresh - energy.sref_ref_act_energy = vdd0Domain.calcTivEnergy(counters.sref_ref_act_cycles, mps.idd3p0); + energy.sref_ref_act_energy = vdd0Domain.calcTivEnergy(c.sref_ref_act_cycles, mps.idd3p0); // background energy during precharged auto-refresh cycles in self-refresh - energy.sref_ref_pre_energy = vdd0Domain.calcTivEnergy(counters.sref_ref_pre_cycles, mps.idd2p0); + energy.sref_ref_pre_energy = vdd0Domain.calcTivEnergy(c.sref_ref_pre_cycles, mps.idd2p0); // background energy during active auto-refresh cycles in self-refresh exit - energy.spup_ref_act_energy = vdd0Domain.calcTivEnergy(counters.spup_ref_act_cycles, mps.idd3n); + energy.spup_ref_act_energy = vdd0Domain.calcTivEnergy(c.spup_ref_act_cycles, mps.idd3n); // background energy during precharged auto-refresh cycles in self-refresh exit - energy.spup_ref_pre_energy = vdd0Domain.calcTivEnergy(counters.spup_ref_pre_cycles, mps.idd2n); + energy.spup_ref_pre_energy = vdd0Domain.calcTivEnergy(c.spup_ref_pre_cycles, mps.idd2n); // self-refresh power-up cycles energy -- included - energy.spup_energy = vdd0Domain.calcTivEnergy(counters.spup_cycles, mps.idd2n); + energy.spup_energy = vdd0Domain.calcTivEnergy(c.spup_cycles, mps.idd2n); // active power-up cycles energy - same as active standby -- included - energy.pup_act_energy = vdd0Domain.calcTivEnergy(counters.pup_act_cycles, 
mps.idd3n); + energy.pup_act_energy = vdd0Domain.calcTivEnergy(c.pup_act_cycles, mps.idd3n); // precharged power-up cycles energy - same as precharged standby -- included - energy.pup_pre_energy = vdd0Domain.calcTivEnergy(counters.pup_pre_cycles, mps.idd2n); + energy.pup_pre_energy = vdd0Domain.calcTivEnergy(c.pup_pre_cycles, mps.idd2n); // similar equations as before to support multiple voltage domains in LPDDR2 // and WIDEIO memories if (memArchSpec.twoVoltageDomains) { EnergyDomain vdd2Domain(mps.vdd2, t.clkPeriod); - energy.act_energy += vdd2Domain.calcTivEnergy(counters.numberofacts * t.RAS , mps.idd02 - mps.idd3n2); - energy.pre_energy += vdd2Domain.calcTivEnergy(counters.numberofpres * (t.RC - t.RAS) , mps.idd02 - mps.idd2n2); - energy.read_energy += vdd2Domain.calcTivEnergy(counters.numberofreads * burstCc , mps.idd4r2 - mps.idd3n2); - energy.write_energy += vdd2Domain.calcTivEnergy(counters.numberofwrites * burstCc , mps.idd4w2 - mps.idd3n2); - energy.ref_energy += vdd2Domain.calcTivEnergy(counters.numberofrefs * t.RFC , mps.idd52 - mps.idd3n2); - energy.pre_stdby_energy += vdd2Domain.calcTivEnergy(counters.precycles, mps.idd2n2); - energy.act_stdby_energy += vdd2Domain.calcTivEnergy(counters.actcycles, mps.idd3n2); + energy.act_energy += vdd2Domain.calcTivEnergy(c.numberofacts * t.RAS , mps.idd02 - mps.idd3n2); + energy.pre_energy += vdd2Domain.calcTivEnergy(c.numberofpres * (t.RC - t.RAS) , mps.idd02 - mps.idd2n2); + energy.read_energy += vdd2Domain.calcTivEnergy(c.numberofreads * burstCc , mps.idd4r2 - mps.idd3n2); + energy.write_energy += vdd2Domain.calcTivEnergy(c.numberofwrites * burstCc , mps.idd4w2 - mps.idd3n2); + energy.ref_energy += vdd2Domain.calcTivEnergy(c.numberofrefs * t.RFC , mps.idd52 - mps.idd3n2); + energy.pre_stdby_energy += vdd2Domain.calcTivEnergy(c.precycles, mps.idd2n2); + energy.act_stdby_energy += vdd2Domain.calcTivEnergy(c.actcycles, mps.idd3n2); // Idle energy in the active standby clock cycles - energy.idle_energy_act += vdd2Domain.calcTivEnergy(counters.idlecycles_act, mps.idd3n2); + energy.idle_energy_act += vdd2Domain.calcTivEnergy(c.idlecycles_act, mps.idd3n2); // Idle energy in the precharge standby clock cycles - energy.idle_energy_pre += vdd2Domain.calcTivEnergy(counters.idlecycles_pre, mps.idd2n2); + energy.idle_energy_pre += vdd2Domain.calcTivEnergy(c.idlecycles_pre, mps.idd2n2); // fast-exit active power-down cycles energy - energy.f_act_pd_energy += vdd2Domain.calcTivEnergy(counters.f_act_pdcycles, mps.idd3p12); + energy.f_act_pd_energy += vdd2Domain.calcTivEnergy(c.f_act_pdcycles, mps.idd3p12); // fast-exit precharged power-down cycles energy - energy.f_pre_pd_energy += vdd2Domain.calcTivEnergy(counters.f_pre_pdcycles, mps.idd2p12); + energy.f_pre_pd_energy += vdd2Domain.calcTivEnergy(c.f_pre_pdcycles, mps.idd2p12); // slow-exit active power-down cycles energy - energy.s_act_pd_energy += vdd2Domain.calcTivEnergy(counters.s_act_pdcycles, mps.idd3p02); + energy.s_act_pd_energy += vdd2Domain.calcTivEnergy(c.s_act_pdcycles, mps.idd3p02); // slow-exit precharged power-down cycles energy - energy.s_pre_pd_energy += vdd2Domain.calcTivEnergy(counters.s_pre_pdcycles, mps.idd2p02); + energy.s_pre_pd_energy += vdd2Domain.calcTivEnergy(c.s_pre_pdcycles, mps.idd2p02); energy.sref_energy += engy_sref(mps.idd62, mps.idd3n2, mps.idd52, mps.vdd2, - static_cast<double>(counters.sref_cycles), static_cast<double>(counters.sref_ref_act_cycles), - static_cast<double>(counters.sref_ref_pre_cycles), static_cast<double>(counters.spup_ref_act_cycles), - 
static_cast<double>(counters.spup_ref_pre_cycles), t.clkPeriod); + static_cast<double>(c.sref_cycles), static_cast<double>(c.sref_ref_act_cycles), + static_cast<double>(c.sref_ref_pre_cycles), static_cast<double>(c.spup_ref_act_cycles), + static_cast<double>(c.spup_ref_pre_cycles), t.clkPeriod); // background energy during active auto-refresh cycles in self-refresh - energy.sref_ref_act_energy += vdd2Domain.calcTivEnergy(counters.sref_ref_act_cycles, mps.idd3p02); + energy.sref_ref_act_energy += vdd2Domain.calcTivEnergy(c.sref_ref_act_cycles, mps.idd3p02); // background energy during precharged auto-refresh cycles in self-refresh - energy.sref_ref_pre_energy += vdd2Domain.calcTivEnergy(counters.sref_ref_pre_cycles, mps.idd2p02); + energy.sref_ref_pre_energy += vdd2Domain.calcTivEnergy(c.sref_ref_pre_cycles, mps.idd2p02); // background energy during active auto-refresh cycles in self-refresh exit - energy.spup_ref_act_energy += vdd2Domain.calcTivEnergy(counters.spup_ref_act_cycles, mps.idd3n2); + energy.spup_ref_act_energy += vdd2Domain.calcTivEnergy(c.spup_ref_act_cycles, mps.idd3n2); // background energy during precharged auto-refresh cycles in self-refresh exit - energy.spup_ref_pre_energy += vdd2Domain.calcTivEnergy(counters.spup_ref_pre_cycles, mps.idd2n2); + energy.spup_ref_pre_energy += vdd2Domain.calcTivEnergy(c.spup_ref_pre_cycles, mps.idd2n2); // self-refresh power-up cycles energy -- included - energy.spup_energy += vdd2Domain.calcTivEnergy(counters.spup_cycles, mps.idd2n2); + energy.spup_energy += vdd2Domain.calcTivEnergy(c.spup_cycles, mps.idd2n2); // active power-up cycles energy - same as active standby -- included - energy.pup_act_energy += vdd2Domain.calcTivEnergy(counters.pup_act_cycles, mps.idd3n2); + energy.pup_act_energy += vdd2Domain.calcTivEnergy(c.pup_act_cycles, mps.idd3n2); // precharged power-up cycles energy - same as precharged standby -- included - energy.pup_pre_energy += vdd2Domain.calcTivEnergy(counters.pup_pre_cycles, mps.idd2n2); + energy.pup_pre_energy += vdd2Domain.calcTivEnergy(c.pup_pre_cycles, mps.idd2n2); } // auto-refresh energy during self-refresh cycles @@ -244,7 +245,7 @@ void MemoryPowerModel::power_calc(MemorySpecification memSpec, // energy components for both ranks (in a dual-rank system) energy.total_energy = energy.act_energy + energy.pre_energy + energy.read_energy + energy.write_energy + energy.ref_energy + energy.io_term_energy + - memArchSpec.nbrOfRanks * (energy.act_stdby_energy + + static_cast<double>(memArchSpec.nbrOfRanks) * (energy.act_stdby_energy + energy.pre_stdby_energy + energy.sref_energy + energy.f_act_pd_energy + energy.f_pre_pd_energy + energy.s_act_pd_energy + energy.s_pre_pd_energy + energy.sref_ref_energy + energy.spup_ref_energy); @@ -253,130 +254,100 @@ void MemoryPowerModel::power_calc(MemorySpecification memSpec, power.average_power = energy.total_energy / (static_cast<double>(total_cycles) * t.clkPeriod); } // MemoryPowerModel::power_calc -void MemoryPowerModel::power_print(MemorySpecification memSpec, int term, const CommandAnalysis& counters) const +void MemoryPowerModel::power_print(const MemorySpecification& memSpec, int term, const CommandAnalysis& c) const { - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const uint64_t nRanks = static_cast<uint64_t>(memArchSpec.nbrOfRanks); + const char eUnit[] = " pJ"; + 
ios_base::fmtflags flags = cout.flags(); + streamsize precision = cout.precision(); cout.precision(0); - cout << "* Trace Details:" << endl; - cout << "Number of Activates: " << fixed << counters.numberofacts << endl; - cout << "Number of Reads: " << counters.numberofreads << endl; - cout << "Number of Writes: " << counters.numberofwrites << endl; - cout << "Number of Precharges: " << counters.numberofpres << endl; - cout << "Number of Refreshes: " << counters.numberofrefs << endl; - cout << "Number of Active Cycles: " << counters.actcycles << endl; - cout << " Number of Active Idle Cycles: " << counters.idlecycles_act << endl; - cout << " Number of Active Power-Up Cycles: " << counters.pup_act_cycles << endl; - cout << " Number of Auto-Refresh Active cycles during Self-Refresh " << - "Power-Up: " << counters.spup_ref_act_cycles << endl; - cout << "Number of Precharged Cycles: " << counters.precycles << endl; - cout << " Number of Precharged Idle Cycles: " << counters.idlecycles_pre << endl; - cout << " Number of Precharged Power-Up Cycles: " << counters.pup_pre_cycles - << endl; - cout << " Number of Auto-Refresh Precharged cycles during Self-Refresh" - << " Power-Up: " << counters.spup_ref_pre_cycles << endl; - cout << " Number of Self-Refresh Power-Up Cycles: " << counters.spup_cycles - << endl; - cout << "Total Idle Cycles (Active + Precharged): " << - counters.idlecycles_act + counters.idlecycles_pre << endl; - cout << "Number of Power-Downs: " << counters.f_act_pdns + - counters.s_act_pdns + counters.f_pre_pdns + counters.s_pre_pdns << endl; - cout << " Number of Active Fast-exit Power-Downs: " << counters.f_act_pdns - << endl; - cout << " Number of Active Slow-exit Power-Downs: " << counters.s_act_pdns - << endl; - cout << " Number of Precharged Fast-exit Power-Downs: " << - counters.f_pre_pdns << endl; - cout << " Number of Precharged Slow-exit Power-Downs: " << - counters.s_pre_pdns << endl; - cout << "Number of Power-Down Cycles: " << counters.f_act_pdcycles + - counters.s_act_pdcycles + counters.f_pre_pdcycles + counters.s_pre_pdcycles << endl; - cout << " Number of Active Fast-exit Power-Down Cycles: " << - counters.f_act_pdcycles << endl; - cout << " Number of Active Slow-exit Power-Down Cycles: " << - counters.s_act_pdcycles << endl; - cout << " Number of Auto-Refresh Active cycles during Self-Refresh: " << - counters.sref_ref_act_cycles << endl; - cout << " Number of Precharged Fast-exit Power-Down Cycles: " << - counters.f_pre_pdcycles << endl; - cout << " Number of Precharged Slow-exit Power-Down Cycles: " << - counters.s_pre_pdcycles << endl; - cout << " Number of Auto-Refresh Precharged cycles during Self-Refresh: " << - counters.sref_ref_pre_cycles << endl; - cout << "Number of Auto-Refresh Cycles: " << counters.numberofrefs * - memTimingSpec.RFC << endl; - cout << "Number of Self-Refreshes: " << counters.numberofsrefs << endl; - cout << "Number of Self-Refresh Cycles: " << counters.sref_cycles << endl; - cout << "----------------------------------------" << endl; - cout << "Total Trace Length (clock cycles): " << total_cycles << endl; - cout << "----------------------------------------" << endl; + cout << "* Trace Details:" << fixed << endl + << endl << "#ACT commands: " << c.numberofacts + << endl << "#RD + #RDA commands: " << c.numberofreads + << endl << "#WR + #WRA commands: " << c.numberofwrites + /* #PRE commands (precharge all counts a number of #PRE commands equal to the number of active banks) */ + << endl << "#PRE (+ PREA) commands: " << c.numberofpres + << 
endl << "#REF commands: " << c.numberofrefs + << endl << "#Active Cycles: " << c.actcycles + << endl << " #Active Idle Cycles: " << c.idlecycles_act + << endl << " #Active Power-Up Cycles: " << c.pup_act_cycles + << endl << " #Auto-Refresh Active cycles during Self-Refresh Power-Up: " << c.spup_ref_act_cycles + << endl << "#Precharged Cycles: " << c.precycles + << endl << " #Precharged Idle Cycles: " << c.idlecycles_pre + << endl << " #Precharged Power-Up Cycles: " << c.pup_pre_cycles + << endl << " #Auto-Refresh Precharged cycles during Self-Refresh Power-Up: " << c.spup_ref_pre_cycles + << endl << " #Self-Refresh Power-Up Cycles: " << c.spup_cycles + << endl << "Total Idle Cycles (Active + Precharged): " << c.idlecycles_act + c.idlecycles_pre + << endl << "#Power-Downs: " << c.f_act_pdns + c.s_act_pdns + c.f_pre_pdns + c.s_pre_pdns + << endl << " #Active Fast-exit Power-Downs: " << c.f_act_pdns + << endl << " #Active Slow-exit Power-Downs: " << c.s_act_pdns + << endl << " #Precharged Fast-exit Power-Downs: " << c.f_pre_pdns + << endl << " #Precharged Slow-exit Power-Downs: " << c.s_pre_pdns + << endl << "#Power-Down Cycles: " << c.f_act_pdcycles + c.s_act_pdcycles + c.f_pre_pdcycles + c.s_pre_pdcycles + << endl << " #Active Fast-exit Power-Down Cycles: " << c.f_act_pdcycles + << endl << " #Active Slow-exit Power-Down Cycles: " << c.s_act_pdcycles + << endl << " #Auto-Refresh Active cycles during Self-Refresh: " << c.sref_ref_act_cycles + << endl << " #Precharged Fast-exit Power-Down Cycles: " << c.f_pre_pdcycles + << endl << " #Precharged Slow-exit Power-Down Cycles: " << c.s_pre_pdcycles + << endl << " #Auto-Refresh Precharged cycles during Self-Refresh: " << c.sref_ref_pre_cycles + << endl << "#Auto-Refresh Cycles: " << c.numberofrefs * memTimingSpec.RFC + << endl << "#Self-Refreshes: " << c.numberofsrefs + << endl << "#Self-Refresh Cycles: " << c.sref_cycles + << endl << "----------------------------------------" + << endl << "Total Trace Length (clock cycles): " << total_cycles + << endl << "----------------------------------------" << endl; + cout.precision(2); + cout << endl << "* Trace Power and Energy Estimates:" << endl + << endl << "ACT Cmd Energy: " << energy.act_energy << eUnit + << endl << "PRE Cmd Energy: " << energy.pre_energy << eUnit + << endl << "RD Cmd Energy: " << energy.read_energy << eUnit + << endl << "WR Cmd Energy: " << energy.write_energy << eUnit; - cout << "\n* Trace Power and Energy Estimates:" << endl; - cout << "ACT Cmd Energy: " << energy.act_energy << " pJ" << endl; - cout << "PRE Cmd Energy: " << energy.pre_energy << " pJ" << endl; - cout << "RD Cmd Energy: " << energy.read_energy << " pJ" << endl; - cout << "WR Cmd Energy: " << energy.write_energy << " pJ" << endl; if (term) { - cout << "RD I/O Energy: " << energy.read_io_energy << " pJ" << endl; + cout << "RD I/O Energy: " << energy.read_io_energy << eUnit << endl; // No Termination for LPDDR/2/3 and DDR memories if (memSpec.memArchSpec.termination) { - cout << "WR Termination Energy: " << energy.write_term_energy << " pJ" << endl; + cout << "WR Termination Energy: " << energy.write_term_energy << eUnit << endl; } - if ((memArchSpec.nbrOfRanks > 1) && memSpec.memArchSpec.termination) { - cout << "RD Termination Energy (Idle rank): " << energy.read_oterm_energy - << " pJ" << endl; - cout << "WR Termination Energy (Idle rank): " << energy.write_oterm_energy - << " pJ" << endl; + if (nRanks > 1 && memSpec.memArchSpec.termination) { + cout << "RD Termination Energy (Idle rank): " << 
energy.read_oterm_energy << eUnit + << endl << "WR Termination Energy (Idle rank): " << energy.write_oterm_energy << eUnit << endl; } } - cout << "ACT Stdby Energy: " << memArchSpec.nbrOfRanks * energy.act_stdby_energy << - " pJ" << endl; - cout << " Active Idle Energy: " << memArchSpec.nbrOfRanks * energy.idle_energy_act << - " pJ" << endl; - cout << " Active Power-Up Energy: " << memArchSpec.nbrOfRanks * energy.pup_act_energy << - " pJ" << endl; - cout << " Active Stdby Energy during Auto-Refresh cycles in Self-Refresh" - << " Power-Up: " << memArchSpec.nbrOfRanks * energy.spup_ref_act_energy << - " pJ" << endl; - cout << "PRE Stdby Energy: " << memArchSpec.nbrOfRanks * energy.pre_stdby_energy << - " pJ" << endl; - cout << " Precharge Idle Energy: " << memArchSpec.nbrOfRanks * energy.idle_energy_pre << - " pJ" << endl; - cout << " Precharged Power-Up Energy: " << memArchSpec.nbrOfRanks * energy.pup_pre_energy << - " pJ" << endl; - cout << " Precharge Stdby Energy during Auto-Refresh cycles " << - "in Self-Refresh Power-Up: " << memArchSpec.nbrOfRanks * energy.spup_ref_pre_energy << - " pJ" << endl; - cout << " Self-Refresh Power-Up Energy: " << memArchSpec.nbrOfRanks * energy.spup_energy << - " pJ" << endl; - cout << "Total Idle Energy (Active + Precharged): " << memArchSpec.nbrOfRanks * - (energy.idle_energy_act + energy.idle_energy_pre) << " pJ" << endl; - cout << "Total Power-Down Energy: " << memArchSpec.nbrOfRanks * (energy.f_act_pd_energy + - energy.f_pre_pd_energy + energy.s_act_pd_energy + energy.s_pre_pd_energy) << " pJ" << endl; - cout << " Fast-Exit Active Power-Down Energy: " << memArchSpec.nbrOfRanks * - energy.f_act_pd_energy << " pJ" << endl; - cout << " Slow-Exit Active Power-Down Energy: " << memArchSpec.nbrOfRanks * - energy.s_act_pd_energy << " pJ" << endl; - cout << " Slow-Exit Active Power-Down Energy during Auto-Refresh cycles " - << "in Self-Refresh: " << memArchSpec.nbrOfRanks * energy.sref_ref_act_energy << - " pJ" << endl; - cout << " Fast-Exit Precharged Power-Down Energy: " << memArchSpec.nbrOfRanks * - energy.f_pre_pd_energy << " pJ" << endl; - cout << " Slow-Exit Precharged Power-Down Energy: " << memArchSpec.nbrOfRanks * - energy.s_pre_pd_energy << " pJ" << endl; - cout << " Slow-Exit Precharged Power-Down Energy during Auto-Refresh " << - "cycles in Self-Refresh: " << memArchSpec.nbrOfRanks * energy.sref_ref_pre_energy << - " pJ" << endl; - cout << "Auto-Refresh Energy: " << energy.ref_energy << " pJ" << endl; - cout << "Self-Refresh Energy: " << memArchSpec.nbrOfRanks * energy.sref_energy << - " pJ" << endl; - cout << "----------------------------------------" << endl; - cout << "Total Trace Energy: " << energy.total_energy << " pJ" << endl; - cout << "Average Power: " << power.average_power << " mW" << endl; - cout << "----------------------------------------" << endl; + + double nRanksDouble = static_cast<double>(nRanks); + + cout << "ACT Stdby Energy: " << nRanksDouble * energy.act_stdby_energy << eUnit + << endl << " Active Idle Energy: " << nRanksDouble * energy.idle_energy_act << eUnit + << endl << " Active Power-Up Energy: " << nRanksDouble * energy.pup_act_energy << eUnit + << endl << " Active Stdby Energy during Auto-Refresh cycles in Self-Refresh Power-Up: " << nRanksDouble * energy.spup_ref_act_energy << eUnit + << endl << "PRE Stdby Energy: " << nRanksDouble * energy.pre_stdby_energy << eUnit + << endl << " Precharge Idle Energy: " << nRanksDouble * energy.idle_energy_pre << eUnit + << endl << " Precharged Power-Up Energy: " << nRanksDouble * 
energy.pup_pre_energy << eUnit + << endl << " Precharge Stdby Energy during Auto-Refresh cycles in Self-Refresh Power-Up: " << nRanksDouble * energy.spup_ref_pre_energy << eUnit + << endl << " Self-Refresh Power-Up Energy: " << nRanksDouble * energy.spup_energy << eUnit + << endl << "Total Idle Energy (Active + Precharged): " << nRanksDouble * (energy.idle_energy_act + energy.idle_energy_pre) << eUnit + << endl << "Total Power-Down Energy: " << nRanksDouble * (energy.f_act_pd_energy + energy.f_pre_pd_energy + energy.s_act_pd_energy + energy.s_pre_pd_energy) << eUnit + << endl << " Fast-Exit Active Power-Down Energy: " << nRanksDouble * energy.f_act_pd_energy << eUnit + << endl << " Slow-Exit Active Power-Down Energy: " << nRanksDouble * energy.s_act_pd_energy << eUnit + << endl << " Slow-Exit Active Power-Down Energy during Auto-Refresh cycles in Self-Refresh: " << nRanksDouble * energy.sref_ref_act_energy << eUnit + << endl << " Fast-Exit Precharged Power-Down Energy: " << nRanksDouble * energy.f_pre_pd_energy << eUnit + << endl << " Slow-Exit Precharged Power-Down Energy: " << nRanksDouble * energy.s_pre_pd_energy << eUnit + << endl << " Slow-Exit Precharged Power-Down Energy during Auto-Refresh cycles in Self-Refresh: " << nRanksDouble * energy.sref_ref_pre_energy << eUnit + << endl << "Auto-Refresh Energy: " << energy.ref_energy << eUnit + << endl << "Self-Refresh Energy: " << nRanksDouble * energy.sref_energy << eUnit + << endl << "----------------------------------------" + << endl << "Total Trace Energy: " << energy.total_energy << eUnit + << endl << "Average Power: " << power.average_power << " mW" + << endl << "----------------------------------------" << endl; + + cout.flags(flags); + cout.precision(precision); } // MemoryPowerModel::power_print // Self-refresh active energy estimation (not including background energy) @@ -395,11 +366,11 @@ double MemoryPowerModel::engy_sref(double idd6, double idd3n, double idd5, // IO and Termination power calculation based on Micron Power Calculators // Absolute power measures are obtained from Micron Power Calculator (mentioned in mW) -void MemoryPowerModel::io_term_power(MemorySpecification memSpec) +void MemoryPowerModel::io_term_power(const MemorySpecification& memSpec) { - MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; - MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; - MemPowerSpec& memPowerSpec = memSpec.memPowerSpec; + const MemTimingSpec& memTimingSpec = memSpec.memTimingSpec; + const MemArchitectureSpec& memArchSpec = memSpec.memArchSpec; + const MemPowerSpec& memPowerSpec = memSpec.memPowerSpec; power.IO_power = memPowerSpec.ioPower; // in mW power.WR_ODT_power = memPowerSpec.wrOdtPower; // in mW diff --git a/ext/drampower/src/MemoryPowerModel.h b/ext/drampower/src/MemoryPowerModel.h index b894f67dd..2b2304989 100644 --- a/ext/drampower/src/MemoryPowerModel.h +++ b/ext/drampower/src/MemoryPowerModel.h @@ -46,9 +46,9 @@ class MemoryPowerModel { public: // Calculate energy and average power consumption for the given memory // command trace - void power_calc(MemorySpecification memSpec, - const CommandAnalysis& counters, - int term); + void power_calc(const MemorySpecification& memSpec, + const CommandAnalysis& c, + int term); // Used to calculate self-refresh active energy static double engy_sref(double idd6, @@ -145,12 +145,12 @@ class MemoryPowerModel { }; // Print the power and energy - void power_print(MemorySpecification memSpec, + void power_print(const MemorySpecification& memSpec, int term, - const 
CommandAnalysis& counters) const; + const CommandAnalysis& c) const; // To derive IO and Termination Power measures using DRAM specification - void io_term_power(MemorySpecification memSpec); + void io_term_power(const MemorySpecification& memSpec); Energy energy; Power power; diff --git a/ext/drampower/src/MemorySpecification.h b/ext/drampower/src/MemorySpecification.h index 149d41c28..16d77ef86 100644 --- a/ext/drampower/src/MemorySpecification.h +++ b/ext/drampower/src/MemorySpecification.h @@ -106,7 +106,8 @@ class MemoryType { return val == LPDDR || val == LPDDR2 || val == LPDDR3 || - val == WIDEIO_SDR; + val == WIDEIO_SDR || + val == DDR4; } bool isDDRFamily() const @@ -132,9 +133,11 @@ class MemoryType { double getCapacitance() const { - // LPDDR/2/3 and DDR memories only have IO Power (no ODT) - // Conservative estimates based on Micron Mobile LPDDR2 Power Calculator - // LPDDR/2/3 IO Capacitance in mF + // LPDDR1/2 memories only have IO Power (no ODT) + // LPDDR3 has optional ODT, but it is typically not used (reflections are eliminated by other means (layout)) + // The capacitance values are conservative and based on Micron Mobile LPDDR2 Power Calculator + + // LPDDR/2/3 IO Capacitance in mF if (val == LPDDR) { return 0.0000000045; } else if (val == LPDDR2) { diff --git a/ext/drampower/src/TraceParser.cc b/ext/drampower/src/TraceParser.cc index ec87f06da..2cf9a8572 100644 --- a/ext/drampower/src/TraceParser.cc +++ b/ext/drampower/src/TraceParser.cc @@ -42,14 +42,19 @@ using namespace Data; using namespace std; +TraceParser::TraceParser(int64_t nbrOfBanks) : + counters(nbrOfBanks) +{ +} + + Data::MemCommand TraceParser::parseLine(std::string line) { - MemCommand memcmd; + MemCommand memcmd(MemCommand::UNINITIALIZED, 0, 0); istringstream linestream(line); string item; - double item_val; + int64_t item_val; unsigned itemnum = 0; - MemCommand::cmds type = MemCommand::NOP; // Initialized to prevent warning while (getline(linestream, item, ',')) { if (itemnum == 0) { @@ -62,10 +67,8 @@ Data::MemCommand TraceParser::parseLine(std::string line) } else if (itemnum == 2) { stringstream bank(item); bank >> item_val; - memcmd.setType(type); memcmd.setBank(static_cast<unsigned>(item_val)); } - type = memcmd.getType(); itemnum++; } return memcmd; @@ -90,13 +93,13 @@ void TraceParser::parseFile(MemorySpecification memSpec, std::ifstream& trace, cmd_list.push_back(cmdline); nCommands++; if (nCommands == window) { - counters.getCommands(memSpec, memSpec.memArchSpec.nbrOfBanks, cmd_list, lastupdate); + counters.getCommands(memSpec, cmd_list, lastupdate); nCommands = 0; cmd_list.clear(); } } lastupdate = true; - counters.getCommands(memSpec, memSpec.memArchSpec.nbrOfBanks, cmd_list, lastupdate); + counters.getCommands(memSpec, cmd_list, lastupdate); cmd_list.clear(); pwr_trace.close(); } else { @@ -106,13 +109,13 @@ void TraceParser::parseFile(MemorySpecification memSpec, std::ifstream& trace, cmd_list.push_back(cmdline); nCommands++; if (nCommands == window) { - counters.getCommands(memSpec, memSpec.memArchSpec.nbrOfBanks, cmd_list, lastupdate); + counters.getCommands(memSpec, cmd_list, lastupdate); nCommands = 0; cmd_list.clear(); } } lastupdate = true; - counters.getCommands(memSpec, memSpec.memArchSpec.nbrOfBanks, cmd_list, lastupdate); + counters.getCommands(memSpec, cmd_list, lastupdate); cmd_list.clear(); } counters.clear(); diff --git a/ext/drampower/src/TraceParser.h b/ext/drampower/src/TraceParser.h index cabfcd395..9727b4800 100644 --- a/ext/drampower/src/TraceParser.h +++
b/ext/drampower/src/TraceParser.h @@ -48,6 +48,7 @@ class TraceParser { public: + TraceParser(int64_t nbrOfBanks); // list of parsed commands std::vector<Data::MemCommand> cmd_list; diff --git a/ext/drampower/src/Utils.h b/ext/drampower/src/Utils.h index 4aa8bb220..80f4390c7 100644 --- a/ext/drampower/src/Utils.h +++ b/ext/drampower/src/Utils.h @@ -41,9 +41,7 @@ #include <string> #include <sstream> #include <stdexcept> -#include <typeinfo> -#define MILLION 1000000 template<typename T> T fromString(const std::string& s, @@ -54,7 +52,7 @@ throw(std::runtime_error) T t; if (!(is >> f >> t)) { - throw std::runtime_error("fromString cannot convert " + s); + throw std::runtime_error("Cannot convert string"); } return t; diff --git a/ext/drampower/src/libdrampower/LibDRAMPower.cc b/ext/drampower/src/libdrampower/LibDRAMPower.cc index ac16f948b..47ed15a99 100644 --- a/ext/drampower/src/libdrampower/LibDRAMPower.cc +++ b/ext/drampower/src/libdrampower/LibDRAMPower.cc @@ -52,13 +52,13 @@ libDRAMPower::~libDRAMPower() void libDRAMPower::doCommand(MemCommand::cmds type, int bank, int64_t timestamp) { - MemCommand cmd(type, static_cast<unsigned>(bank), static_cast<double>(timestamp)); + MemCommand cmd(type, static_cast<unsigned>(bank), timestamp); cmdList.push_back(cmd); } void libDRAMPower::updateCounters(bool lastUpdate) { - counters.getCommands(memSpec, memSpec.memArchSpec.nbrOfBanks, cmdList, lastUpdate); + counters.getCommands(memSpec, cmdList, lastUpdate); cmdList.clear(); } @@ -72,6 +72,11 @@ void libDRAMPower::clearState() counters.clear(); } +void libDRAMPower::clearCounters(int64_t timestamp) +{ + counters.clearStats(timestamp); +} + const Data::MemoryPowerModel::Energy& libDRAMPower::getEnergy() const { return mpm.energy; diff --git a/ext/drampower/src/libdrampower/LibDRAMPower.h b/ext/drampower/src/libdrampower/LibDRAMPower.h index 9dea8b0f5..4d9ccefe5 100644 --- a/ext/drampower/src/libdrampower/LibDRAMPower.h +++ b/ext/drampower/src/libdrampower/LibDRAMPower.h @@ -56,6 +56,8 @@ class libDRAMPower { void updateCounters(bool lastUpdate); + void clearCounters(int64_t timestamp); + void clearState(); void calcEnergy(); diff --git a/ext/drampower/test/libdrampowertest/lib_test.cc b/ext/drampower/test/libdrampowertest/lib_test.cc index f382a727e..20d4d9ebf 100644 --- a/ext/drampower/test/libdrampowertest/lib_test.cc +++ b/ext/drampower/test/libdrampowertest/lib_test.cc @@ -79,7 +79,8 @@ int main(int argc, char* argv[]) test.doCommand(MemCommand::RDA,0,210); test.doCommand(MemCommand::ACT,4,232); test.doCommand(MemCommand::WRA,4,247); - test.doCommand(MemCommand::PDN_F_ACT,3,248); + // Need at least tWRAPDEN = AL + CWL + BL/2 + WR + 1 cycles between WR and PDN_F_PRE + test.doCommand(MemCommand::PDN_F_PRE,3,265); //set bool to true when this is the last update of the counters test.updateCounters(true); @@ -106,7 +107,7 @@ int main(int argc, char* argv[]) //test.getEnergy().act_stdby_energy std::cout << "ACT Std Energy" << "\t" << test.getEnergy().act_stdby_energy << endl; //total active standby energy for both ranks - std::cout << "ACT Std Energy total ranks" << "\t" << memSpec.memArchSpec.nbrOfRanks * + std::cout << "ACT Std Energy total ranks" << "\t" << static_cast<double>(memSpec.memArchSpec.nbrOfRanks) * test.getEnergy().act_stdby_energy << "\n" ; std::cout << "PRE Std Energy" << "\t" << test.getEnergy().pre_stdby_energy << endl; std::cout << "Total Energy" << "\t" << test.getEnergy().total_energy << endl; diff --git a/src/mem/SerialLink.py b/src/mem/SerialLink.py index f05f2872d..fd9b0ff6b 
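Taken together, the libDRAMPower changes above (int64_t timestamps, the bank-count constructors for TraceParser/CommandAnalysis, and the new clearCounters() hook) are used roughly as follows. This is a hedged sketch: constructing the Data::MemorySpecification and the second constructor argument of libDRAMPower are assumptions, not part of this patch; the doCommand/updateCounters/calcEnergy/getEnergy/clearCounters calls are the ones declared above.

    #include <iostream>
    #include "libdrampower/LibDRAMPower.h"

    using namespace Data;

    // spec: a MemorySpecification loaded elsewhere (e.g. from one of the
    // library's XML memory descriptions); its construction is not shown here.
    void reportEnergy(const MemorySpecification& spec)
    {
        // Second argument (include I/O and termination power) is an assumption.
        libDRAMPower power(spec, false);

        // Timestamps are now plain int64_t clock cycles (no double conversion).
        power.doCommand(MemCommand::ACT, 0, 35);
        power.doCommand(MemCommand::RDA, 0, 50);
        power.doCommand(MemCommand::ACT, 4, 232);
        power.doCommand(MemCommand::WRA, 4, 247);

        // 'true' marks the last update so trailing cycles are accounted for.
        power.updateCounters(true);
        power.calcEnergy();

        std::cout << "Total energy: " << power.getEnergy().total_energy
                  << " pJ" << std::endl;

        // New in this patch: reset the statistics at a given cycle (e.g. at a
        // periodic stats dump) without wiping the state as clearState() does.
        power.clearCounters(300);
    }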
100644 --- a/src/mem/SerialLink.py +++ b/src/mem/SerialLink.py @@ -61,3 +61,5 @@ class SerialLink(MemObject): # link belongs to and the number of lanes: num_lanes = Param.Unsigned(1, "Number of parallel lanes inside the serial" "link. (aka. lane width)") + link_speed = Param.UInt64(1, "Gb/s Speed of each parallel lane inside the" + "serial link. (aka. lane speed)") diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index 6cd72b266..f59528492 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -41,6 +41,7 @@ * Ani Udipi * Neha Agarwal * Omar Naji + * Matthias Jung */ /** @@ -862,7 +863,7 @@ class DRAMCtrl : public AbstractMemory */ static bool sortTime(const Data::MemCommand& m1, const Data::MemCommand& m2) { - return m1.getTime() < m2.getTime(); + return m1.getTimeInt64() < m2.getTimeInt64(); }; diff --git a/src/mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.py b/src/mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.py index c7833ee96..5a4f3026e 100644 --- a/src/mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.py +++ b/src/mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.py @@ -53,19 +53,20 @@ class GarnetIntLink_d(BasicIntLink): cxx_header = "mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.hh" # The detailed fixed pipeline bi-directional link include two main # forward links and two backward flow-control links, one per direction - nls = [] + _nls = [] # In uni-directional link - nls.append(NetworkLink_d()); + _nls.append(NetworkLink_d()); # Out uni-directional link - nls.append(NetworkLink_d()); - network_links = VectorParam.NetworkLink_d(nls, "forward links") + _nls.append(NetworkLink_d()); + network_links = VectorParam.NetworkLink_d(_nls, "forward links") - cls = [] + _cls = [] # In uni-directional link - cls.append(CreditLink_d()); + _cls.append(CreditLink_d()); # Out uni-directional link - cls.append(CreditLink_d()); - credit_links = VectorParam.CreditLink_d(cls, "backward flow-control links") + _cls.append(CreditLink_d()); + credit_links = VectorParam.CreditLink_d(_cls, + "backward flow-control links") # Exterior fixed pipeline links between a router and a controller class GarnetExtLink_d(BasicExtLink): @@ -73,16 +74,17 @@ class GarnetExtLink_d(BasicExtLink): cxx_header = "mem/ruby/network/garnet/fixed-pipeline/GarnetLink_d.hh" # The detailed fixed pipeline bi-directional link include two main # forward links and two backward flow-control links, one per direction - nls = [] + _nls = [] # In uni-directional link - nls.append(NetworkLink_d()); + _nls.append(NetworkLink_d()); # Out uni-directional link - nls.append(NetworkLink_d()); - network_links = VectorParam.NetworkLink_d(nls, "forward links") + _nls.append(NetworkLink_d()); + network_links = VectorParam.NetworkLink_d(_nls, "forward links") - cls = [] + _cls = [] # In uni-directional link - cls.append(CreditLink_d()); + _cls.append(CreditLink_d()); # Out uni-directional link - cls.append(CreditLink_d()); - credit_links = VectorParam.CreditLink_d(cls, "backward flow-control links") + _cls.append(CreditLink_d()); + credit_links = VectorParam.CreditLink_d(_cls, + "backward flow-control links") diff --git a/src/mem/serial_link.cc b/src/mem/serial_link.cc index b6cb097b7..25f5291bb 100644 --- a/src/mem/serial_link.cc +++ b/src/mem/serial_link.cc @@ -87,7 +87,9 @@ SerialLink::SerialLink(SerialLinkParams *p) ticksToCycles(p->delay), p->resp_size, p->ranges), masterPort(p->name + ".master", *this, slavePort, ticksToCycles(p->delay), p->req_size), - num_lanes(p->num_lanes) + num_lanes(p->num_lanes), + 
link_speed(p->link_speed) + { } @@ -153,8 +155,9 @@ SerialLink::SerialLinkMasterPort::recvTimingResp(PacketPtr pkt) // have to wait to receive the whole packet. So we only account for the // deserialization latency. Cycles cycles = delay; - cycles += Cycles(divCeil(pkt->getSize() * 8, serial_link.num_lanes)); - Tick t = serial_link.clockEdge(cycles); + cycles += Cycles(divCeil(pkt->getSize() * 8, serial_link.num_lanes + * serial_link.link_speed)); + Tick t = serial_link.clockEdge(cycles); //@todo: If the processor sends two uncached requests towards HMC and the // second one is smaller than the first one. It may happen that the second @@ -214,7 +217,7 @@ SerialLink::SerialLinkSlavePort::recvTimingReq(PacketPtr pkt) // only. Cycles cycles = delay; cycles += Cycles(divCeil(pkt->getSize() * 8, - serial_link.num_lanes)); + serial_link.num_lanes * serial_link.link_speed)); Tick t = serial_link.clockEdge(cycles); //@todo: If the processor sends two uncached requests towards HMC @@ -301,7 +304,7 @@ SerialLink::SerialLinkMasterPort::trySendTiming() // Make sure bandwidth limitation is met Cycles cycles = Cycles(divCeil(pkt->getSize() * 8, - serial_link.num_lanes)); + serial_link.num_lanes * serial_link.link_speed)); Tick t = serial_link.clockEdge(cycles); serial_link.schedule(sendEvent, std::max(next_req.tick, t)); } @@ -346,7 +349,7 @@ SerialLink::SerialLinkSlavePort::trySendTiming() // Make sure bandwidth limitation is met Cycles cycles = Cycles(divCeil(pkt->getSize() * 8, - serial_link.num_lanes)); + serial_link.num_lanes * serial_link.link_speed)); Tick t = serial_link.clockEdge(cycles); serial_link.schedule(sendEvent, std::max(next_resp.tick, t)); } diff --git a/src/mem/serial_link.hh b/src/mem/serial_link.hh index d4f6ca488..9fbcce335 100644 --- a/src/mem/serial_link.hh +++ b/src/mem/serial_link.hh @@ -312,6 +312,9 @@ class SerialLink : public MemObject /** Number of parallel lanes in this serial link */ unsigned num_lanes; + /** Speed of each link (Gb/s) in this serial link */ + uint64_t link_speed; + public: virtual BaseMasterPort& getMasterPort(const std::string& if_name, diff --git a/tests/quick/se/70.tgen/traffic.cfg b/tests/quick/se/70.tgen/traffic.cfg new file mode 100644 index 000000000..88e642553 --- /dev/null +++ b/tests/quick/se/70.tgen/traffic.cfg @@ -0,0 +1,7 @@ +STATE 0 10000 RANDOM 100 0 134217727 256 1000 1000 0 +STATE 1 1000000 TRACE tests/quick/se/70.tgen/tgen-simple-mem.trc 100 +STATE 2 1000 IDLE +INIT 0 +TRANSITION 0 1 1 +TRANSITION 1 2 1 +TRANSITION 2 0 1 diff --git a/util/systemc/main.cc b/util/systemc/main.cc index 75a77853b..c9fbd48a0 100644 --- a/util/systemc/main.cc +++ b/util/systemc/main.cc @@ -74,6 +74,9 @@ #include "sc_module.hh" #include "stats.hh" +// Defining global string variable declared in stats.hh +std::string filename; + void usage(const std::string &prog_name) { @@ -289,7 +292,7 @@ void SimControl::run() std::cerr << "Waiting for " << wait_period << "ps for" " SystemC to catch up to gem5\n"; - wait(sc_core::sc_time(wait_period, sc_core::SC_PS)); + wait(sc_core::sc_time::from_value(wait_period)); } config_manager->loadState(*checkpoint); @@ -383,7 +386,11 @@ sc_main(int argc, char **argv) { SimControl sim_control("gem5", argc, argv); + filename = "m5out/stats-systemc.txt"; + sc_core::sc_start(); + CxxConfig::statsDump(); + return EXIT_SUCCESS; } diff --git a/util/systemc/stats.cc b/util/systemc/stats.cc index ef5d9b5d3..54d149474 100644 --- a/util/systemc/stats.cc +++ b/util/systemc/stats.cc @@ -35,6 +35,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE. * * Authors: Andrew Bardsley + * Matthias Jung + * Abdul Mutaal Ahmad */ /** @@ -45,7 +47,9 @@ * Register with: Stats::registerHandlers(statsReset, statsDump) */ +#include "base/output.hh" #include "base/statistics.hh" +#include "base/stats/text.hh" #include "stats.hh" namespace CxxConfig @@ -56,45 +60,76 @@ void statsPrepare() std::list<Stats::Info *> stats = Stats::statsList(); /* gather_stats -> prepare */ - for (auto i = stats.begin(); i != stats.end(); ++i) - (*i)->prepare(); + for (auto i = stats.begin(); i != stats.end(); ++i){ + Stats::Info *stat = *i; + Stats::VectorInfo *vector = dynamic_cast<Stats::VectorInfo *>(stat); + if (vector){ + (dynamic_cast<Stats::VectorInfo *>(*i))->prepare(); + } + else { + (*i)->prepare(); + } + + } } void statsDump() { - std::cerr << "Stats dump\n"; + bool desc = true; + Stats::Output *output = Stats::initText(filename, desc); Stats::processDumpQueue(); std::list<Stats::Info *> stats = Stats::statsList(); + statsEnable(); statsPrepare(); + output->begin(); /* gather_stats -> convert_value */ for (auto i = stats.begin(); i != stats.end(); ++i) { Stats::Info *stat = *i; - Stats::ScalarInfo *scalar = dynamic_cast<Stats::ScalarInfo *>(stat); + const Stats::ScalarInfo *scalar = dynamic_cast<Stats::ScalarInfo + *>(stat); Stats::VectorInfo *vector = dynamic_cast<Stats::VectorInfo *>(stat); - - if (scalar) { - std::cerr << "SCALAR " << stat->name << ' ' - << scalar->value() << '\n'; - } else if (vector) { - Stats::VResult results = vector->value(); - - unsigned int index = 0; - for (auto e = results.begin(); e != results.end(); ++e) { - std::cerr << "VECTOR " << stat->name << '[' << index - << "] " << (*e) << '\n'; - index++; + const Stats::Vector2dInfo *vector2d = dynamic_cast<Stats::Vector2dInfo + *>(vector); + const Stats::DistInfo *dist = dynamic_cast<Stats::DistInfo *>(stat); + const Stats::VectorDistInfo *vectordist = + dynamic_cast<Stats::VectorDistInfo *>(stat); + const Stats::SparseHistInfo *sparse = + dynamic_cast<Stats::SparseHistInfo *>(stat); + const Stats::InfoProxy <Stats::Vector2d,Stats::Vector2dInfo> *info = + dynamic_cast<Stats::InfoProxy + <Stats::Vector2d,Stats::Vector2dInfo>*>(stat); + + if (vector) { + const Stats::FormulaInfo *formula = dynamic_cast<Stats::FormulaInfo + *>(vector); + if (formula){ + output->visit(*formula); + } else { + const Stats::VectorInfo *vector1 = vector; + output->visit(*vector1); } - std::cerr << "VTOTAL " << stat->name << ' ' - << vector->total() << '\n'; + } else if (vector2d) { + output->visit(*vector2d); + } else if (info){ + output->visit(*info); + } else if (vectordist){ + output->visit(*vectordist); + } else if (dist) { + output->visit(*dist); + } else if (sparse) { + output->visit(*sparse); + } else if (scalar) { + output->visit(*scalar); } else { - std::cerr << "?????? 
" << stat->name << '\n'; + warn("Stat not dumped: %s\n", stat->name); } } + output->end(); } void statsReset() @@ -108,8 +143,17 @@ void statsEnable() { std::list<Stats::Info *> stats = Stats::statsList(); - for (auto i = stats.begin(); i != stats.end(); ++i) - (*i)->enable(); + for (auto i = stats.begin(); i != stats.end(); ++i){ + Stats::Info *stat = *i; + Stats::VectorInfo *vector = dynamic_cast<Stats::VectorInfo *>(stat); + if (vector){ + (dynamic_cast<Stats::VectorInfo *>(*i))->enable(); + } + else { + (*i)->enable(); + } + + } } } diff --git a/util/systemc/stats.hh b/util/systemc/stats.hh index 360cb6293..9dac960ee 100644 --- a/util/systemc/stats.hh +++ b/util/systemc/stats.hh @@ -35,6 +35,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Authors: Andrew Bardsley + * Matthias Jung + * Abdul Mutaal Ahmad */ /** @@ -48,6 +50,8 @@ #ifndef __UTIL_CXX_CONFIG_STATS_H__ #define __UTIL_CXX_CONFIG_STATS_H__ +extern std::string filename; + namespace CxxConfig { diff --git a/util/tlm/README b/util/tlm/README index 126705296..fc620f145 100644 --- a/util/tlm/README +++ b/util/tlm/README @@ -94,3 +94,26 @@ The parameter -o specifies the begining of the memory region (0x80000000). The system should boot now. For conveniance a run_gem5.sh file holds all those commands + + +III. Elastic Trace Setup +======================== + +Elastic traces can also be replayed into the SystemC world. +For more information on elastic traces please refer to: + + - http://www.gem5.org/TraceCPU + + - Exploring System Performance using Elastic Traces: + Fast, Accurate and Portable + R. Jagtap, S. Diestelhorst, A. Hansson, M. Jung, N. Wehn. + IEEE International Conference on Embedded Computer Systems Architectures + Modeling and Simulation (SAMOS), July, 2016, Samos Island, Greece. + +Similar to I. the simulation can be set up with this command: + +> ../../build/ARM/gem5.opt ./tlm_elastic.py + +Then: + +> ./gem5.opt.sc m5out/config.ini diff --git a/util/tlm/main.cc b/util/tlm/main.cc index c06565603..bf442e02b 100644 --- a/util/tlm/main.cc +++ b/util/tlm/main.cc @@ -30,6 +30,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Authors: Matthias Jung + * Abdul Mutaal Ahmad */ /** @@ -67,6 +68,9 @@ #include "sim/system.hh" #include "stats.hh" +// Defining global string variable decalred in stats.hh +std::string filename; + void usage(const std::string &prog_name) { std::cerr << "Usage: " << prog_name << ( @@ -296,6 +300,8 @@ sc_main(int argc, char **argv) SimControl sim_control("gem5", argc, argv); Target *memory; + filename = "m5out/stats-tlm.txt"; + tlm::tlm_initiator_socket <> *mem_port = dynamic_cast<tlm::tlm_initiator_socket<> *>( sc_core::sc_find_object("gem5.memory") @@ -319,5 +325,7 @@ sc_main(int argc, char **argv) SC_REPORT_INFO("sc_main", "End of Simulation"); + CxxConfig::statsDump(); + return EXIT_SUCCESS; } diff --git a/util/tlm/tlm_elastic.py b/util/tlm/tlm_elastic.py new file mode 100644 index 000000000..3de0670c0 --- /dev/null +++ b/util/tlm/tlm_elastic.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016, University of Kaiserslautern +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Matthias Jung + +import m5 +import optparse + +from m5.objects import * +from m5.util import addToPath, fatal + +addToPath('../../configs/common/') + +from Caches import * + +# This configuration shows a simple setup of a Elastic Trace Player (eTraceCPU) +# and an external TLM port for SystemC co-simulation. +# +# We assume a DRAM size of 512MB and L1 cache sizes of 32KB. +# +# Base System Architecture: +# +# +-----------+ ^ +# +-------------+ | eTraceCPU | | +# | System Port | +-----+-----+ | +# +------+------+ | $D1 | $I1 | | +# | +--+--+--+--+ | +# | | | | gem5 World +# | | | | (see this file) +# | | | | +# +------v------------v-----v--+ | +# | Membus | v +# +----------------+-----------+ External Port (see sc_port.*) +# | ^ +# +---v---+ | TLM World +# | TLM | | (see sc_target.*) +# +-------+ v +# +# +# Create a system with a Crossbar and an Elastic Trace Player as CPU: + +# Setup System: +system = System(cpu=TraceCPU(cpu_id=0), + mem_mode='timing', + mem_ranges = [AddrRange('512MB')], + cache_line_size = 64) + +# Create a top-level voltage domain: +system.voltage_domain = VoltageDomain() + +# Create a source clock for the system. This is used as the clock period for +# xbar and memory: +system.clk_domain = SrcClockDomain(clock = '1GHz', + voltage_domain = system.voltage_domain) + +# Create a CPU voltage domain: +system.cpu_voltage_domain = VoltageDomain() + +# Create a separate clock domain for the CPUs. 
In the case of Trace CPUs, this clock +# is actually used only by the caches connected to the CPU: +system.cpu_clk_domain = SrcClockDomain(clock = '1GHz', + voltage_domain = system.cpu_voltage_domain) + +# Setup CPU and its L1 caches: +system.cpu.createInterruptController() +system.cpu.icache = L1_ICache(size="32kB") +system.cpu.dcache = L1_DCache(size="32kB") +system.cpu.icache.cpu_side = system.cpu.icache_port +system.cpu.dcache.cpu_side = system.cpu.dcache_port + +# Assign input trace files to the eTraceCPU: +system.cpu.instTraceFile="system.cpu.traceListener.inst.gz" +system.cpu.dataTraceFile="system.cpu.traceListener.data.gz" + +# Setting up L1 BUS: +system.membus = IOXBar(width = 16) +system.physmem = SimpleMemory() # This must be instantiated, even if not needed + +# Create an external TLM port: +system.tlm = ExternalSlave() +system.tlm.addr_ranges = [AddrRange('512MB')] +system.tlm.port_type = "tlm" +system.tlm.port_data = "memory" + +# Connect everything: +system.membus = SystemXBar() +system.system_port = system.membus.slave +system.cpu.icache.mem_side = system.membus.slave +system.cpu.dcache.mem_side = system.membus.slave +system.membus.master = system.tlm.port + +# Start the simulation: +root = Root(full_system = False, system = system) +root.system.mem_mode = 'timing' +m5.instantiate() +m5.simulate() # Simulation time specified later on the command line
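A quick sanity check of the new serial link bandwidth model: with this patch, serial_link.cc charges divCeil(pkt->getSize() * 8, num_lanes * link_speed) cycles of serialization delay per packet, where link_speed is the new per-lane speed in Gb/s added to SerialLink.py (default 1, which keeps the old divisor of num_lanes). A minimal sketch of that arithmetic, assuming purely illustrative values (16 lanes, 10 Gb/s per lane, one 64-byte packet) that are not taken from this patch:

def div_ceil(a, b):
    # Same rounding as gem5's divCeil().
    return (a + b - 1) // b

# Assumed example values, for illustration only:
num_lanes = 16        # SerialLink.num_lanes
link_speed = 10       # SerialLink.link_speed, Gb/s per lane
pkt_size_bytes = 64   # one cache-line sized packet

# Mirrors: Cycles(divCeil(pkt->getSize() * 8, num_lanes * link_speed))
cycles = div_ceil(pkt_size_bytes * 8, num_lanes * link_speed)
print(cycles)  # 4 cycles of serialization delay on top of the link controller delay

With the default link_speed of 1, the divisor reduces to num_lanes and existing configurations keep their previous serialization timing.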