Diffstat (limited to 'src/cpu/o3')
-rw-r--r--  src/cpu/o3/cpu.cc                    1
-rw-r--r--  src/cpu/o3/cpu.hh                   25
-rw-r--r--  src/cpu/o3/iew_impl.hh              16
-rw-r--r--  src/cpu/o3/inst_queue_impl.hh        7
-rw-r--r--  src/cpu/o3/lsq.hh                  773
-rw-r--r--  src/cpu/o3/lsq_impl.hh             537
-rw-r--r--  src/cpu/o3/lsq_unit.hh             852
-rw-r--r--  src/cpu/o3/lsq_unit_impl.hh        762
-rw-r--r--  src/cpu/o3/probe/elastic_trace.cc    2
9 files changed, 1903 insertions, 1072 deletions
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 600c89aa5..7261f0c9e 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -850,7 +850,6 @@ FullO3CPU<Impl>::insertThread(ThreadID tid)
//Reset ROB/IQ/LSQ Entries
commit.rob->resetEntries();
- iew.resetEntries();
}
template <class Impl>
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 90024bc84..1159850f8 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2013, 2016 ARM Limited
+ * Copyright (c) 2011-2013, 2016-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -125,6 +125,7 @@ class FullO3CPU : public BaseO3CPU
BaseTLB *itb;
BaseTLB *dtb;
+ using LSQRequest = typename LSQ<Impl>::LSQRequest;
/** Overall CPU status. */
Status _status;
@@ -733,21 +734,25 @@ class FullO3CPU : public BaseO3CPU
/** Available thread ids in the cpu*/
std::vector<ThreadID> tids;
+ /** CPU pushRequest function, forwards request to LSQ. */
+ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+ unsigned int size, Addr addr, Request::Flags flags,
+ uint64_t *res)
+ {
+ return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr,
+ flags, res);
+ }
+
/** CPU read function, forwards read to LSQ. */
- Fault read(const RequestPtr &req,
- RequestPtr &sreqLow, RequestPtr &sreqHigh,
- int load_idx)
+ Fault read(LSQRequest* req, int load_idx)
{
- return this->iew.ldstQueue.read(req, sreqLow, sreqHigh, load_idx);
+ return this->iew.ldstQueue.read(req, load_idx);
}
/** CPU write function, forwards write to LSQ. */
- Fault write(const RequestPtr &req,
- const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
- uint8_t *data, int store_idx)
+ Fault write(LSQRequest* req, uint8_t *data, int store_idx)
{
- return this->iew.ldstQueue.write(req, sreqLow, sreqHigh,
- data, store_idx);
+ return this->iew.ldstQueue.write(req, data, store_idx);
}
/** Used by the fetch unit to get a hold of the instruction port. */
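The two read/write forwarders above shrink because the CPU no longer builds, splits, and tracks Request objects itself: the new pushRequest() is the single entry point, and the LSQ owns request creation and translation from there on. A compilable toy model of the forwarding shape (ToyCPU/ToyLSQ are illustrative stand-ins, not the gem5 templates):

    #include <cstdio>

    // Stand-in for LSQ<Impl>: owns request building and translation.
    struct ToyLSQ {
        int pushRequest(bool is_load, unsigned size, unsigned long addr) {
            // The real LSQ builds a Single- or SplitDataRequest here and
            // initiates its translation; the toy just reports the access.
            std::printf("%s, %u bytes @ %#lx\n",
                        is_load ? "load" : "store", size, addr);
            return 0; // NoFault
        }
    };

    // Stand-in for FullO3CPU: pushRequest() is a thin forwarder.
    struct ToyCPU {
        ToyLSQ ldstQueue;
        int pushRequest(bool is_load, unsigned size, unsigned long addr) {
            return ldstQueue.pushRequest(is_load, size, addr);
        }
    };

    int main() {
        ToyCPU cpu;
        return cpu.pushRequest(true, 8, 0x1000); // a load, as initiateAcc would issue
    }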
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index e706b09a1..3d5d84886 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010-2013 ARM Limited
+ * Copyright (c) 2010-2013, 2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -744,14 +744,6 @@ DefaultIEW<Impl>::updateStatus()
}
template <class Impl>
-void
-DefaultIEW<Impl>::resetEntries()
-{
- instQueue.resetEntries();
- ldstQueue.resetEntries();
-}
-
-template <class Impl>
bool
DefaultIEW<Impl>::checkStall(ThreadID tid)
{
@@ -1353,7 +1345,7 @@ DefaultIEW<Impl>::executeInsts()
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s "
"[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n",
violator->pcState(), violator->seqNum,
- inst->pcState(), inst->seqNum, inst->physEffAddrLow);
+ inst->pcState(), inst->seqNum, inst->physEffAddr);
fetchRedirect[tid] = true;
@@ -1376,7 +1368,7 @@ DefaultIEW<Impl>::executeInsts()
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: "
"%s, inst PC: %s. Addr is: %#x.\n",
violator->pcState(), inst->pcState(),
- inst->physEffAddrLow);
+ inst->physEffAddr);
DPRINTF(IEW, "Violation will not be handled because "
"already squashing\n");
@@ -1460,6 +1452,8 @@ DefaultIEW<Impl>::tick()
wroteToTimeBuffer = false;
updatedQueues = false;
+ ldstQueue.tick();
+
sortInsts();
// Free function units marked as being freed this cycle.
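The new ldstQueue.tick() call pairs with the LSQ changes later in this patch: the per-cycle store-port budget moves from each LSQUnit (whose tick() is removed) up to the shared LSQ and is reset once per CPU cycle. A compilable sketch of that accounting, with illustrative names:

    #include <cassert>

    // Per-cycle store-port budget, now shared by all LSQ units.
    struct StorePorts {
        int cacheStorePorts;      // configured ports usable by stores
        int usedStorePorts = 0;   // ports consumed so far this cycle

        explicit StorePorts(int n) : cacheStorePorts(n) {}
        void tick() { usedStorePorts = 0; }   // once per CPU cycle
        bool available() const { return usedStorePorts < cacheStorePorts; }
        void busy() {
            ++usedStorePorts;
            assert(usedStorePorts <= cacheStorePorts);
        }
    };

    int main() {
        StorePorts ports(1);
        if (ports.available())
            ports.busy();             // first store this cycle takes the port
        assert(!ports.available());   // a second store must wait
        ports.tick();                 // next cycle: budget restored
        assert(ports.available());
    }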
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index a8895f8ff..4a55a91ea 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2014 ARM Limited
+ * Copyright (c) 2011-2014, 2017-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -1140,9 +1140,6 @@ template <class Impl>
void
InstructionQueue<Impl>::blockMemInst(const DynInstPtr &blocked_inst)
{
- blocked_inst->translationStarted(false);
- blocked_inst->translationCompleted(false);
-
blocked_inst->clearIssued();
blocked_inst->clearCanIssue();
blockedMemInsts.push_back(blocked_inst);
@@ -1285,9 +1282,9 @@ InstructionQueue<Impl>::doSquash(ThreadID tid)
squashed_inst);
}
-
++iqSquashedOperandsExamined;
}
+
} else if (!squashed_inst->isStoreConditional() ||
!squashed_inst->isCompleted()) {
NonSpecMapIt ns_inst_it =
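blockMemInst() stops rolling back the translation flags because a blocked memory instruction now keeps its LSQRequest: when it replays, LSQ::pushRequest() (in lsq_impl.hh below) finds inst->savedReq and reuses the completed translation instead of re-walking the TLB. A toy model of that replay path (illustrative names, not the gem5 classes):

    struct ToyReq { bool translated; };

    struct ToyInst {
        bool translationStarted = false;
        ToyReq *savedReq = nullptr;
    };

    // Mirrors the shape of LSQ::pushRequest()'s translation reuse.
    ToyReq *getRequest(ToyInst &inst)
    {
        if (inst.translationStarted)
            return inst.savedReq;          // replay: reuse prior translation
        inst.translationStarted = true;    // first attempt: translate now
        inst.savedReq = new ToyReq{true};
        return inst.savedReq;
    }

    int main()
    {
        ToyInst inst;
        ToyReq *first = getRequest(inst);   // initial issue
        ToyReq *replay = getRequest(inst);  // after being blocked and replayed
        bool same = (first == replay);      // same request both times
        delete first;
        return same ? 0 : 1;
    }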
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 2b2d39bf7..003726c7c 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2012, 2014 ARM Limited
+ * Copyright (c) 2011-2012, 2014, 2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -47,8 +47,9 @@
#include <map>
#include <queue>
-#include "cpu/o3/lsq_unit.hh"
+#include "arch/generic/tlb.hh"
#include "cpu/inst_seq.hh"
+#include "cpu/o3/lsq_unit.hh"
#include "enums/SMTQueuePolicy.hh"
#include "mem/port.hh"
#include "sim/sim_object.hh"
@@ -56,13 +57,659 @@
struct DerivO3CPUParams;
template <class Impl>
-class LSQ {
+class LSQ
+{
public:
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::CPUPol::IEW IEW;
typedef typename Impl::CPUPol::LSQUnit LSQUnit;
+ class LSQRequest;
+ /** Derived class to hold any sender state the LSQ needs. */
+ class LSQSenderState : public Packet::SenderState
+ {
+ protected:
+ /** The senderState needs to know the LSQRequest who owns it. */
+ LSQRequest* _request;
+
+ /** Default constructor. */
+ LSQSenderState(LSQRequest* request, bool isLoad_)
+ : _request(request), mainPkt(nullptr), pendingPacket(nullptr),
+ outstanding(0), isLoad(isLoad_), needWB(isLoad_), isSplit(false),
+ pktToSend(false), deleted(false)
+ { }
+ public:
+
+ /** Instruction which initiated the access to memory. */
+ DynInstPtr inst;
+ /** The main packet from a split load, used during writeback. */
+ PacketPtr mainPkt;
+ /** A second packet from a split store that needs sending. */
+ PacketPtr pendingPacket;
+ /** Number of outstanding packets to complete. */
+ uint8_t outstanding;
+ /** Whether or not it is a load. */
+ bool isLoad;
+ /** Whether or not the instruction will need to writeback. */
+ bool needWB;
+ /** Whether or not this access is split in two. */
+ bool isSplit;
+ /** Whether or not there is a packet that needs sending. */
+ bool pktToSend;
+ /** Has the request been deleted?
+         * LSQ entries can be squashed before the response comes back. In
+         * that case the SenderState knows.
+ */
+ bool deleted;
+ ContextID contextId() { return inst->contextId(); }
+
+ /** Completes a packet and returns whether the access is finished. */
+ inline bool isComplete() { return outstanding == 0; }
+ inline void deleteRequest() { deleted = true; }
+ inline bool alive() { return !deleted; }
+ LSQRequest* request() { return _request; }
+ virtual void complete() = 0;
+ void writebackDone() { _request->writebackDone(); }
+ };
+
+ /** Memory operation metadata.
+ * This class holds the information about a memory operation. It lives
+ * from initiateAcc to resource deallocation at commit or squash.
+ * LSQRequest objects are owned by the LQ/SQ Entry in the LSQUnit that
+ * holds the operation. It is also used by the LSQSenderState. In addition,
+     * the LSQRequest is a TranslationState; therefore, upon squash, there
+     * must be a well-defined transfer of ownership in case the LSQ
+     * resources are deallocated before the TLB is done using the
+     * TranslationState. If that happens, the LSQRequest becomes self-owned
+     * and is responsible for detecting that its services are no longer
+     * required and self-destructing.
+ *
+ * Lifetime of a LSQRequest:
+ * +--------------------+
+ * |LSQ creates and owns|
+ * +--------------------+
+ * |
+ * +--------------------+
+     * |Initiate translation|
+ * +--------------------+
+ * |
+ * ___^___
+ * ___/ \___
+ * ______/ Squashed? \
+ * | \___ ___/
+ * | \___ ___/
+ * | v
+ * | |
+ * | +--------------------+
+ * | | Translation done |
+ * | +--------------------+
+ * | |
+ * | +--------------------+
+ * | | Send packet |<------+
+ * | +--------------------+ |
+ * | | |
+ * | ___^___ |
+ * | ___/ \___ |
+ * | ____/ Squashed? \ |
+ * | | \___ ___/ |
+ * | | \___ ___/ |
+ * | | v |
+ * | | | |
+ * | | ___^___ |
+ * | | ___/ \___ |
+ * | | / Done? \__________|
+ * | | \___ ___/
+ * | | \___ ___/
+ * | | v
+ * | | |
+ * | | +--------------------+
+ * | | | Manage stuff |
+ * | | | Free resources |
+ * | | +--------------------+
+ * | |
+ * | | +--------------------+
+ * | | | senderState owns |
+ * | +->| onRecvTimingResp |
+ * | | free resources |
+ * | +--------------------+
+ * |
+ * | +----------------------+
+ * | | self owned (Trans) |
+ * +-->| on TranslationFinish |
+ * | free resources |
+ * +----------------------+
+ *
+ *
+ */
+ class LSQRequest : public BaseTLB::Translation
+ {
+ protected:
+ typedef uint32_t FlagsStorage;
+ typedef ::Flags<FlagsStorage> FlagsType;
+
+ enum Flag : FlagsStorage
+ {
+ IsLoad = 0x00000001,
+ /** True if this is a store that writes registers (SC). */
+ WbStore = 0x00000002,
+ Delayed = 0x00000004,
+ IsSplit = 0x00000008,
+ /** True if any translation has been sent to TLB. */
+ TranslationStarted = 0x00000010,
+            /** True if all outbound translations have been replied to. */
+ TranslationFinished = 0x00000020,
+ Sent = 0x00000040,
+ Retry = 0x00000080,
+ Complete = 0x00000100,
+ /** Ownership tracking flags. */
+ /** Translation squashed. */
+ TranslationSquashed = 0x00000200,
+ /** Request discarded */
+ Discarded = 0x00000400,
+ /** LSQ resources freed. */
+ LSQEntryFreed = 0x00000800,
+ /** Store written back. */
+ WritebackScheduled = 0x00001000,
+ WritebackDone = 0x00002000
+ };
+ FlagsType flags;
+
+ enum class State
+ {
+ NotIssued,
+ Translation,
+ Request,
+ Complete,
+ Squashed,
+ Fault,
+ };
+ State _state;
+ LSQSenderState* _senderState;
+ void setState(const State& newState) { _state = newState; }
+
+ uint32_t numTranslatedFragments;
+ uint32_t numInTranslationFragments;
+
+ /** LQ/SQ entry idx. */
+ uint32_t _entryIdx;
+
+ void markDelayed() { flags.set(Flag::Delayed); }
+ bool isDelayed() { return flags.isSet(Flag::Delayed); }
+
+ public:
+ LSQUnit& _port;
+ const DynInstPtr _inst;
+ uint32_t _taskId;
+ PacketDataPtr _data;
+ std::vector<PacketPtr> _packets;
+ std::vector<RequestPtr> _requests;
+ std::vector<Fault> _fault;
+ uint64_t* _res;
+ const Addr _addr;
+ const uint32_t _size;
+ const Request::Flags _flags;
+ uint32_t _numOutstandingPackets;
+ protected:
+ LSQUnit* lsqUnit() { return &_port; }
+ LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) :
+ _state(State::NotIssued), _senderState(nullptr),
+ _port(*port), _inst(inst), _data(nullptr),
+ _res(nullptr), _addr(0), _size(0), _flags(0),
+ _numOutstandingPackets(0)
+ {
+ flags.set(Flag::IsLoad, isLoad);
+ flags.set(Flag::WbStore, _inst->isStoreConditional());
+ install();
+ }
+ LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+ const Addr& addr, const uint32_t& size,
+ const Request::Flags& flags_,
+ PacketDataPtr data = nullptr, uint64_t* res = nullptr)
+ : _state(State::NotIssued), _senderState(nullptr),
+ numTranslatedFragments(0),
+ numInTranslationFragments(0),
+ _port(*port), _inst(inst), _data(data),
+ _res(res), _addr(addr), _size(size),
+ _flags(flags_),
+ _numOutstandingPackets(0)
+ {
+ flags.set(Flag::IsLoad, isLoad);
+ flags.set(Flag::WbStore, _inst->isStoreConditional());
+ install();
+ }
+
+ bool
+ isLoad() const
+ {
+ return flags.isSet(Flag::IsLoad);
+ }
+
+ /** Install the request in the LQ/SQ. */
+ void install()
+ {
+ if (isLoad()) {
+ _port.loadQueue[_inst->lqIdx].setRequest(this);
+ } else {
+ _port.storeQueue[_inst->sqIdx].setRequest(this);
+ }
+ }
+ virtual bool
+ squashed() const override
+ {
+ return _inst->isSquashed();
+ }
+
+ /**
+ * Test if the LSQRequest has been released, i.e. self-owned.
+ * An LSQRequest manages itself when the resources on the LSQ are freed
+ * but the translation is still going on and the LSQEntry was freed.
+ */
+ bool
+ isReleased()
+ {
+ return flags.isSet(Flag::LSQEntryFreed) ||
+ flags.isSet(Flag::Discarded);
+ }
+
+ /** Release the LSQRequest.
+         * Notify the sender state that the request it points to is no
+         * longer valid. If the request is an orphan (self-managed), mark
+         * it as freed; otherwise destroy it, as this is the end of its
+         * life cycle.
+         * An LSQRequest is an orphan when its LSQ resources have been
+         * released but it still has an in-flight translation request to
+         * the TLB or an access request to memory.
+ */
+ void release(Flag reason)
+ {
+ assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded);
+ if (!isAnyOutstandingRequest()) {
+ delete this;
+ } else {
+ if (_senderState) {
+ _senderState->deleteRequest();
+ }
+ flags.set(reason);
+ }
+ }
+
+ /** Destructor.
+ * The LSQRequest owns the request. If the packet has already been
+ * sent, the sender state will be deleted upon receiving the reply.
+ */
+ virtual ~LSQRequest()
+ {
+ assert(!isAnyOutstandingRequest());
+ _inst->savedReq = nullptr;
+ if (_senderState)
+ delete _senderState;
+
+ for (auto r: _packets)
+ delete r;
+ };
+
+
+ public:
+ /** Convenience getters/setters. */
+ /** @{ */
+ /** Set up Context numbers. */
+ void
+ setContext(const ContextID& context_id)
+ {
+ request()->setContext(context_id);
+ }
+
+ const DynInstPtr&
+ instruction()
+ {
+ return _inst;
+ }
+
+ /** Set up virtual request.
+         * For previously allocated Request objects.
+ */
+ void
+ setVirt(int asid, Addr vaddr, unsigned size, Request::Flags flags_,
+ MasterID mid, Addr pc)
+ {
+ request()->setVirt(asid, vaddr, size, flags_, mid, pc);
+ }
+
+ void
+ taskId(const uint32_t& v)
+ {
+ _taskId = v;
+ for (auto& r: _requests)
+ r->taskId(v);
+ }
+
+ uint32_t taskId() const { return _taskId; }
+ RequestPtr request(int idx = 0) { return _requests.at(idx); }
+
+ const RequestPtr
+ request(int idx = 0) const
+ {
+ return _requests.at(idx);
+ }
+
+ Addr getVaddr(int idx = 0) const { return request(idx)->getVaddr(); }
+ virtual void initiateTranslation() = 0;
+
+ PacketPtr packet(int idx = 0) { return _packets.at(idx); }
+
+ virtual PacketPtr
+ mainPacket()
+ {
+ assert (_packets.size() == 1);
+ return packet();
+ }
+
+ virtual RequestPtr
+ mainRequest()
+ {
+ assert (_requests.size() == 1);
+ return request();
+ }
+
+ void
+ senderState(LSQSenderState* st)
+ {
+ _senderState = st;
+ for (auto& pkt: _packets) {
+ if (pkt)
+ pkt->senderState = st;
+ }
+ }
+
+ const LSQSenderState*
+ senderState() const
+ {
+ return _senderState;
+ }
+
+ /**
+         * Mark senderState as discarded. This will cause response
+         * packets from the cache to be discarded.
+ */
+ void
+ discardSenderState()
+ {
+ assert(_senderState);
+ _senderState->deleteRequest();
+ }
+
+ /**
+ * Test if there is any in-flight translation or mem access request
+ */
+ bool
+ isAnyOutstandingRequest()
+ {
+ return numInTranslationFragments > 0 ||
+ _numOutstandingPackets > 0 ||
+ (flags.isSet(Flag::WritebackScheduled) &&
+ !flags.isSet(Flag::WritebackDone));
+ }
+
+ bool
+ isSplit() const
+ {
+ return flags.isSet(Flag::IsSplit);
+ }
+ /** @} */
+ virtual bool recvTimingResp(PacketPtr pkt) = 0;
+ virtual void sendPacketToCache() = 0;
+ virtual void buildPackets() = 0;
+
+ /**
+ * Memory mapped IPR accesses
+ */
+ virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt) = 0;
+ virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt) = 0;
+
+ /**
+ * Test if the request accesses a particular cache line.
+ */
+ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0;
+
+ /** Update the status to reflect that a packet was sent. */
+ void
+ packetSent()
+ {
+ flags.set(Flag::Sent);
+ }
+ /** Update the status to reflect that a packet was not sent.
+ * When a packet fails to be sent, we mark the request as needing a
+         * retry. Note that the Retry flag is sticky.
+ */
+ void
+ packetNotSent()
+ {
+ flags.set(Flag::Retry);
+ flags.clear(Flag::Sent);
+ }
+
+ void sendFragmentToTranslation(int i);
+ bool
+ isComplete()
+ {
+ return flags.isSet(Flag::Complete);
+ }
+
+ bool
+ isInTranslation()
+ {
+ return _state == State::Translation;
+ }
+
+ bool
+ isTranslationComplete()
+ {
+ return flags.isSet(Flag::TranslationStarted) &&
+ !isInTranslation();
+ }
+
+ bool
+ isTranslationBlocked()
+ {
+ return _state == State::Translation &&
+ flags.isSet(Flag::TranslationStarted) &&
+ !flags.isSet(Flag::TranslationFinished);
+ }
+
+ bool
+ isSent()
+ {
+ return flags.isSet(Flag::Sent);
+ }
+
+ /**
+ * The LSQ entry is cleared
+ */
+ void
+ freeLSQEntry()
+ {
+ release(Flag::LSQEntryFreed);
+ }
+
+ /**
+ * The request is discarded (e.g. partial store-load forwarding)
+ */
+ void
+ discard()
+ {
+ release(Flag::Discarded);
+ }
+
+ void
+ packetReplied()
+ {
+ assert(_numOutstandingPackets > 0);
+ _numOutstandingPackets--;
+ if (_numOutstandingPackets == 0 && isReleased())
+ delete this;
+ }
+
+ void
+ writebackScheduled()
+ {
+ assert(!flags.isSet(Flag::WritebackScheduled));
+ flags.set(Flag::WritebackScheduled);
+ }
+
+ void
+ writebackDone()
+ {
+ flags.set(Flag::WritebackDone);
+ /* If the lsq resources are already free */
+ if (isReleased()) {
+ delete this;
+ }
+ }
+
+ void
+ squashTranslation()
+ {
+ assert(numInTranslationFragments == 0);
+ flags.set(Flag::TranslationSquashed);
+ /* If we are on our own, self-destruct. */
+ if (isReleased()) {
+ delete this;
+ }
+ }
+
+ void
+ complete()
+ {
+ flags.set(Flag::Complete);
+ }
+ };
+
+ class SingleDataRequest : public LSQRequest
+ {
+ protected:
+        /* Because these classes are templates, derived classes must
+         * explicitly bring dependent names from the base class into
+         * scope. */
+ using Flag = typename LSQRequest::Flag;
+ using State = typename LSQRequest::State;
+ using LSQRequest::_fault;
+ using LSQRequest::_inst;
+ using LSQRequest::_packets;
+ using LSQRequest::_port;
+ using LSQRequest::_res;
+ using LSQRequest::_senderState;
+ using LSQRequest::_state;
+ using LSQRequest::flags;
+ using LSQRequest::isLoad;
+ using LSQRequest::isTranslationComplete;
+ using LSQRequest::lsqUnit;
+ using LSQRequest::request;
+ using LSQRequest::sendFragmentToTranslation;
+ using LSQRequest::setState;
+ using LSQRequest::numInTranslationFragments;
+ using LSQRequest::numTranslatedFragments;
+ using LSQRequest::_numOutstandingPackets;
+ public:
+ SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+ const Addr& addr, const uint32_t& size,
+ const Request::Flags& flags_,
+ PacketDataPtr data = nullptr,
+ uint64_t* res = nullptr) :
+ LSQRequest(port, inst, isLoad, addr, size, flags_, data, res)
+ {
+ LSQRequest::_requests.push_back(
+ std::make_shared<Request>(inst->getASID(), addr, size, flags_,
+ inst->masterId(), inst->instAddr(), inst->contextId()));
+ LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum);
+ }
+ inline virtual ~SingleDataRequest() {}
+ virtual void initiateTranslation();
+ virtual void finish(const Fault &fault, const RequestPtr &req,
+ ThreadContext* tc, BaseTLB::Mode mode);
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual void sendPacketToCache();
+ virtual void buildPackets();
+ virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
+ virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
+ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
+ };
+
+ class SplitDataRequest : public LSQRequest
+ {
+ protected:
+        /* Because these classes are templates, derived classes must
+         * explicitly bring dependent names from the base class into
+         * scope. */
+ using Flag = typename LSQRequest::Flag;
+ using State = typename LSQRequest::State;
+ using LSQRequest::_addr;
+ using LSQRequest::_data;
+ using LSQRequest::_fault;
+ using LSQRequest::_flags;
+ using LSQRequest::_inst;
+ using LSQRequest::_packets;
+ using LSQRequest::_port;
+ using LSQRequest::_requests;
+ using LSQRequest::_res;
+ using LSQRequest::_senderState;
+ using LSQRequest::_size;
+ using LSQRequest::_state;
+ using LSQRequest::_taskId;
+ using LSQRequest::flags;
+ using LSQRequest::isLoad;
+ using LSQRequest::isTranslationComplete;
+ using LSQRequest::lsqUnit;
+ using LSQRequest::numInTranslationFragments;
+ using LSQRequest::numTranslatedFragments;
+ using LSQRequest::request;
+ using LSQRequest::sendFragmentToTranslation;
+ using LSQRequest::setState;
+ using LSQRequest::_numOutstandingPackets;
+
+ uint32_t numFragments;
+ uint32_t numReceivedPackets;
+ RequestPtr mainReq;
+ PacketPtr _mainPacket;
+
+
+ public:
+ SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
+ const Addr& addr, const uint32_t& size,
+ const Request::Flags & flags_,
+ PacketDataPtr data = nullptr,
+ uint64_t* res = nullptr) :
+ LSQRequest(port, inst, isLoad, addr, size, flags_, data, res),
+ numFragments(0),
+ numReceivedPackets(0),
+ mainReq(nullptr),
+ _mainPacket(nullptr)
+ {
+ flags.set(Flag::IsSplit);
+ }
+ virtual ~SplitDataRequest()
+ {
+ if (mainReq) {
+ mainReq = nullptr;
+ }
+ if (_mainPacket) {
+ delete _mainPacket;
+ _mainPacket = nullptr;
+ }
+ }
+ virtual void finish(const Fault &fault, const RequestPtr &req,
+ ThreadContext* tc, BaseTLB::Mode mode);
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual void initiateTranslation();
+ virtual void sendPacketToCache();
+ virtual void buildPackets();
+
+ virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
+ virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
+ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
+
+ virtual RequestPtr mainRequest();
+ virtual PacketPtr mainPacket();
+ };
+
/** Constructs an LSQ with the given parameters. */
LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params);
~LSQ() { }
@@ -85,17 +732,9 @@ class LSQ {
/** Number of entries needed for the given amount of threads.*/
int entryAmount(ThreadID num_threads);
- void removeEntries(ThreadID tid);
- /** Reset the max entries for each thread. */
- void resetEntries();
- /** Resize the max entries for a thread. */
- void resizeEntries(unsigned size, ThreadID tid);
/** Ticks the LSQ. */
- void tick();
- /** Ticks a specific LSQ Unit. */
- void tick(ThreadID tid)
- { thread[tid].tick(); }
+ void tick() { usedStorePorts = 0; }
/** Inserts a load into the LSQ. */
void insertLoad(const DynInstPtr &load_inst);
@@ -112,13 +751,13 @@ class LSQ {
* Commits loads up until the given sequence number for a specific thread.
*/
void commitLoads(InstSeqNum &youngest_inst, ThreadID tid)
- { thread[tid].commitLoads(youngest_inst); }
+ { thread.at(tid).commitLoads(youngest_inst); }
/**
* Commits stores up until the given sequence number for a specific thread.
*/
void commitStores(InstSeqNum &youngest_inst, ThreadID tid)
- { thread[tid].commitStores(youngest_inst); }
+ { thread.at(tid).commitStores(youngest_inst); }
/**
* Attempts to write back stores until all cache ports are used or the
@@ -131,8 +770,11 @@ class LSQ {
/**
* Squash instructions from a thread until the specified sequence number.
*/
- void squash(const InstSeqNum &squashed_num, ThreadID tid)
- { thread[tid].squash(squashed_num); }
+ void
+ squash(const InstSeqNum &squashed_num, ThreadID tid)
+ {
+ thread.at(tid).squash(squashed_num);
+ }
/** Returns whether or not there was a memory ordering violation. */
bool violation();
@@ -140,50 +782,49 @@ class LSQ {
* Returns whether or not there was a memory ordering violation for a
* specific thread.
*/
- bool violation(ThreadID tid)
- { return thread[tid].violation(); }
+ bool violation(ThreadID tid) { return thread.at(tid).violation(); }
/** Gets the instruction that caused the memory ordering violation. */
- DynInstPtr getMemDepViolator(ThreadID tid)
- { return thread[tid].getMemDepViolator(); }
+ DynInstPtr
+ getMemDepViolator(ThreadID tid)
+ {
+ return thread.at(tid).getMemDepViolator();
+ }
/** Returns the head index of the load queue for a specific thread. */
- int getLoadHead(ThreadID tid)
- { return thread[tid].getLoadHead(); }
+ int getLoadHead(ThreadID tid) { return thread.at(tid).getLoadHead(); }
/** Returns the sequence number of the head of the load queue. */
- InstSeqNum getLoadHeadSeqNum(ThreadID tid)
+ InstSeqNum
+ getLoadHeadSeqNum(ThreadID tid)
{
- return thread[tid].getLoadHeadSeqNum();
+ return thread.at(tid).getLoadHeadSeqNum();
}
/** Returns the head index of the store queue. */
- int getStoreHead(ThreadID tid)
- { return thread[tid].getStoreHead(); }
+ int getStoreHead(ThreadID tid) { return thread.at(tid).getStoreHead(); }
/** Returns the sequence number of the head of the store queue. */
- InstSeqNum getStoreHeadSeqNum(ThreadID tid)
+ InstSeqNum
+ getStoreHeadSeqNum(ThreadID tid)
{
- return thread[tid].getStoreHeadSeqNum();
+ return thread.at(tid).getStoreHeadSeqNum();
}
/** Returns the number of instructions in all of the queues. */
int getCount();
/** Returns the number of instructions in the queues of one thread. */
- int getCount(ThreadID tid)
- { return thread[tid].getCount(); }
+ int getCount(ThreadID tid) { return thread.at(tid).getCount(); }
/** Returns the total number of loads in the load queue. */
int numLoads();
/** Returns the total number of loads for a single thread. */
- int numLoads(ThreadID tid)
- { return thread[tid].numLoads(); }
+ int numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
/** Returns the total number of stores in the store queue. */
int numStores();
/** Returns the total number of stores for a single thread. */
- int numStores(ThreadID tid)
- { return thread[tid].numStores(); }
+ int numStores(ThreadID tid) { return thread.at(tid).numStores(); }
/** Returns the number of free load entries. */
unsigned numFreeLoadEntries();
@@ -242,46 +883,39 @@ class LSQ {
/** Returns whether or not a specific thread has any stores to write back
* to memory.
*/
- bool hasStoresToWB(ThreadID tid)
- { return thread[tid].hasStoresToWB(); }
+ bool hasStoresToWB(ThreadID tid) { return thread.at(tid).hasStoresToWB(); }
/** Returns the number of stores a specific thread has to write back. */
- int numStoresToWB(ThreadID tid)
- { return thread[tid].numStoresToWB(); }
+ int numStoresToWB(ThreadID tid) { return thread.at(tid).numStoresToWB(); }
/** Returns if the LSQ will write back to memory this cycle. */
bool willWB();
/** Returns if the LSQ of a specific thread will write back to memory this
* cycle.
*/
- bool willWB(ThreadID tid)
- { return thread[tid].willWB(); }
+ bool willWB(ThreadID tid) { return thread.at(tid).willWB(); }
/** Debugging function to print out all instructions. */
void dumpInsts() const;
/** Debugging function to print out instructions from a specific thread. */
- void dumpInsts(ThreadID tid) const
- { thread[tid].dumpInsts(); }
+ void dumpInsts(ThreadID tid) const { thread.at(tid).dumpInsts(); }
/** Executes a read operation, using the load specified at the load
* index.
*/
- Fault read(const RequestPtr &req,
- RequestPtr &sreqLow, RequestPtr &sreqHigh,
- int load_idx);
+ Fault read(LSQRequest* req, int load_idx);
/** Executes a store operation, using the store specified at the store
* index.
*/
- Fault write(const RequestPtr &req,
- const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
- uint8_t *data, int store_idx);
+ Fault write(LSQRequest* req, uint8_t *data, int store_idx);
/**
* Retry the previous send that failed.
*/
void recvReqRetry();
+ void completeDataAccess(PacketPtr pkt);
/**
* Handles writing back and completing the load or store that has
* returned from memory.
@@ -292,13 +926,34 @@ class LSQ {
void recvTimingSnoopReq(PacketPtr pkt);
+ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+ unsigned int size, Addr addr, Request::Flags flags,
+ uint64_t *res);
+
/** The CPU pointer. */
O3CPU *cpu;
/** The IEW stage pointer. */
IEW *iewStage;
+ /** Is D-cache blocked? */
+ bool cacheBlocked() const;
+ /** Set D-cache blocked status */
+ void cacheBlocked(bool v);
+ /** Is any store port available to use? */
+ bool storePortAvailable() const;
+ /** Another store port is in use */
+ void storePortBusy();
+
protected:
+ /** D-cache is blocked */
+ bool _cacheBlocked;
+ /** The number of cache ports available each cycle (stores only). */
+ int cacheStorePorts;
+ /** The number of used cache ports in this cycle by stores. */
+ int usedStorePorts;
+
+
/** The LSQ policy for SMT mode. */
SMTQueuePolicy lsqPolicy;
@@ -307,8 +962,10 @@ class LSQ {
* and threshold, this function calculates how many resources each thread
* can occupy at most.
*/
- static uint32_t maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
- uint32_t numThreads, uint32_t SMTThreshold) {
+ static uint32_t
+ maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
+ uint32_t numThreads, uint32_t SMTThreshold)
+ {
if (pol == SMTQueuePolicy::Dynamic) {
return entries;
} else if (pol == SMTQueuePolicy::Partitioned) {
@@ -346,24 +1003,20 @@ class LSQ {
template <class Impl>
Fault
-LSQ<Impl>::read(const RequestPtr &req,
- RequestPtr &sreqLow, RequestPtr &sreqHigh,
- int load_idx)
+LSQ<Impl>::read(LSQRequest* req, int load_idx)
{
- ThreadID tid = cpu->contextToThread(req->contextId());
+ ThreadID tid = cpu->contextToThread(req->request()->contextId());
- return thread[tid].read(req, sreqLow, sreqHigh, load_idx);
+ return thread.at(tid).read(req, load_idx);
}
template <class Impl>
Fault
-LSQ<Impl>::write(const RequestPtr &req,
- const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
- uint8_t *data, int store_idx)
+LSQ<Impl>::write(LSQRequest* req, uint8_t *data, int store_idx)
{
- ThreadID tid = cpu->contextToThread(req->contextId());
+ ThreadID tid = cpu->contextToThread(req->request()->contextId());
- return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx);
+ return thread.at(tid).write(req, data, store_idx);
}
#endif // __CPU_O3_LSQ_HH__
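The heart of the class above is the ownership protocol from the lifetime diagram: if an entry is squashed or freed while a translation or memory packet is still in flight, release() does not destroy the request; it marks it as orphaned, and the last in-flight event to drain deletes it. A minimal self-contained model of that protocol (illustrative only, not the gem5 class):

    // Toy model of LSQRequest self-ownership.
    struct ToyRequest {
        int outstanding = 0;    // in-flight TLB fragments + memory packets
        bool released = false;  // LSQEntryFreed or Discarded was set

        // Counterpart of LSQRequest::release(): the LSQ gives up ownership.
        void release() {
            if (outstanding == 0)
                delete this;        // nothing in flight: die immediately
            else
                released = true;    // orphaned: wait for in-flight events
        }

        // Counterpart of packetReplied()/squashTranslation(): an in-flight
        // event completes after the LSQ has already let go.
        void eventDone() {
            if (--outstanding == 0 && released)
                delete this;        // last event drained: self-destruct
        }
    };

    int main()
    {
        auto *req = new ToyRequest;
        req->outstanding = 1;   // a packet is still in the memory system
        req->release();         // squash frees the LSQ entry; req survives
        req->eventDone();       // the response drains; req deletes itself
    }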
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index edc3f469b..8a221a8d5 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2012, 2014 ARM Limited
+ * Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -61,6 +61,8 @@ using namespace std;
template <class Impl>
LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
: cpu(cpu_ptr), iewStage(iew_ptr),
+ _cacheBlocked(false),
+ cacheStorePorts(params->cacheStorePorts), usedStorePorts(0),
lsqPolicy(params->smtLSQPolicy),
LQEntries(params->LQEntries),
SQEntries(params->SQEntries),
@@ -76,8 +78,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
//************ Handle SMT Parameters ***********/
//**********************************************/
- //Figure out fetch policy
- if (lsqPolicy == SMTQueuePolicy::Dynamic) {
+    /* Run SMT policy checks. */
+ if (lsqPolicy == SMTQueuePolicy::Dynamic) {
DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
} else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
@@ -85,8 +87,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
maxLQEntries,maxSQEntries);
} else if (lsqPolicy == SMTQueuePolicy::Threshold) {
- assert(params->smtLSQThreshold > LQEntries);
- assert(params->smtLSQThreshold > SQEntries);
+ assert(params->smtLSQThreshold > params->LQEntries);
+ assert(params->smtLSQThreshold > params->SQEntries);
DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
"%i entries per LQ | %i entries per SQ\n",
@@ -163,79 +165,41 @@ template <class Impl>
void
LSQ<Impl>::takeOverFrom()
{
+ usedStorePorts = 0;
+ _cacheBlocked = false;
+
for (ThreadID tid = 0; tid < numThreads; tid++) {
thread[tid].takeOverFrom();
}
}
-template <class Impl>
-int
-LSQ<Impl>::entryAmount(ThreadID num_threads)
-{
- if (lsqPolicy == SMTQueuePolicy::Partitioned) {
- return LQEntries / num_threads;
- } else {
- return 0;
- }
-}
-
-template <class Impl>
-void
-LSQ<Impl>::resetEntries()
+template<class Impl>
+bool
+LSQ<Impl>::cacheBlocked() const
{
- if (lsqPolicy != SMTQueuePolicy::Dynamic || numThreads > 1) {
- int active_threads = activeThreads->size();
-
- int maxEntries;
-
- if (lsqPolicy == SMTQueuePolicy::Partitioned) {
- maxEntries = LQEntries / active_threads;
- } else if (lsqPolicy == SMTQueuePolicy::Threshold &&
- active_threads == 1) {
- maxEntries = LQEntries;
- } else {
- maxEntries = LQEntries;
- }
-
- list<ThreadID>::iterator threads = activeThreads->begin();
- list<ThreadID>::iterator end = activeThreads->end();
-
- while (threads != end) {
- ThreadID tid = *threads++;
-
- resizeEntries(maxEntries, tid);
- }
- }
+ return _cacheBlocked;
}
template<class Impl>
void
-LSQ<Impl>::removeEntries(ThreadID tid)
+LSQ<Impl>::cacheBlocked(bool v)
{
- thread[tid].clearLQ();
- thread[tid].clearSQ();
+ _cacheBlocked = v;
}
template<class Impl>
-void
-LSQ<Impl>::resizeEntries(unsigned size, ThreadID tid)
+bool
+LSQ<Impl>::storePortAvailable() const
{
- thread[tid].resizeLQ(size);
- thread[tid].resizeSQ(size);
+ return usedStorePorts < cacheStorePorts;
}
template<class Impl>
void
-LSQ<Impl>::tick()
+LSQ<Impl>::storePortBusy()
{
- list<ThreadID>::iterator threads = activeThreads->begin();
- list<ThreadID>::iterator end = activeThreads->end();
-
- while (threads != end) {
- ThreadID tid = *threads++;
-
- thread[tid].tick();
- }
+ usedStorePorts++;
+ assert(usedStorePorts <= cacheStorePorts);
}
template<class Impl>
@@ -316,6 +280,7 @@ void
LSQ<Impl>::recvReqRetry()
{
iewStage->cacheUnblocked();
+ cacheBlocked(false);
for (ThreadID tid : *activeThreads) {
thread[tid].recvRetry();
@@ -323,6 +288,15 @@ LSQ<Impl>::recvReqRetry()
}
template <class Impl>
+void
+LSQ<Impl>::completeDataAccess(PacketPtr pkt)
+{
+ auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+ thread[cpu->contextToThread(senderState->contextId())]
+ .completeDataAccess(pkt);
+}
+
+template <class Impl>
bool
LSQ<Impl>::recvTimingResp(PacketPtr pkt)
{
@@ -330,8 +304,10 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt)
DPRINTF(LSQ, "Got error packet back for address: %#X\n",
pkt->getAddr());
- thread[cpu->contextToThread(pkt->req->contextId())]
- .completeDataAccess(pkt);
+ auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+ panic_if(!senderState, "Got packet back with unknown sender state\n");
+
+ thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt);
if (pkt->isInvalidate()) {
// This response also contains an invalidate; e.g. this can be the case
@@ -352,8 +328,9 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt)
thread[tid].checkSnoop(pkt);
}
}
+ // Update the LSQRequest state (this may delete the request)
+ senderState->request()->packetReplied();
- delete pkt;
return true;
}
@@ -681,4 +658,442 @@ LSQ<Impl>::dumpInsts() const
}
}
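+/** Offset of addr within its block; block_size must be a power of two. */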
+static Addr
+addrBlockOffset(Addr addr, unsigned int block_size)
+{
+ return addr & (block_size - 1);
+}
+
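+/** addr rounded down to the start of its block_size-byte block. */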
+static Addr
+addrBlockAlign(Addr addr, uint64_t block_size)
+{
+ return addr & ~(block_size - 1);
+}
+
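+/** True if [addr, addr + size) crosses a block boundary and must be split. */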
+static bool
+transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size)
+{
+ return (addrBlockOffset(addr, block_size) + size) > block_size;
+}
+
+template<class Impl>
+Fault
+LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
+ unsigned int size, Addr addr, Request::Flags flags,
+ uint64_t *res)
+{
+ ThreadID tid = cpu->contextToThread(inst->contextId());
+ auto cacheLineSize = cpu->cacheLineSize();
+ bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize);
+ LSQRequest* req = nullptr;
+
+ if (inst->translationStarted()) {
+ req = inst->savedReq;
+ assert(req);
+ } else {
+ if (needs_burst) {
+ req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
+ size, flags, data, res);
+ } else {
+ req = new SingleDataRequest(&thread[tid], inst, isLoad, addr,
+ size, flags, data, res);
+ }
+ assert(req);
+ inst->setRequest();
+ req->taskId(cpu->taskId());
+
+ req->initiateTranslation();
+ }
+
+    /* This is the place where instructions get the effAddr. */
+ if (req->isTranslationComplete()) {
+ if (inst->getFault() == NoFault) {
+ inst->effAddr = req->getVaddr();
+ inst->effSize = size;
+ inst->effAddrValid(true);
+
+ if (cpu->checker) {
+ inst->reqToVerify = std::make_shared<Request>(*req->request());
+ }
+ if (isLoad)
+ inst->getFault() = cpu->read(req, inst->lqIdx);
+ else
+ inst->getFault() = cpu->write(req, data, inst->sqIdx);
+ } else if (isLoad) {
+ // Commit will have to clean up whatever happened. Set this
+ // instruction as executed.
+ inst->setExecuted();
+ }
+ }
+
+ if (inst->traceData)
+ inst->traceData->setMem(addr, size, flags);
+
+ return inst->getFault();
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req,
+ ThreadContext* tc, BaseTLB::Mode mode)
+{
+ _fault.push_back(fault);
+ numInTranslationFragments = 0;
+ numTranslatedFragments = 1;
+    /* If the instruction has been squashed, let the request know
+ * as it may have to self-destruct. */
+ if (_inst->isSquashed()) {
+ this->squashTranslation();
+ } else {
+ _inst->strictlyOrdered(req->isStrictlyOrdered());
+
+ flags.set(Flag::TranslationFinished);
+ if (fault == NoFault) {
+ _inst->physEffAddr = req->getPaddr();
+ _inst->memReqFlags = req->getFlags();
+ if (req->isCondSwap()) {
+ assert(_res);
+ req->setExtraData(*_res);
+ }
+ setState(State::Request);
+ } else {
+ setState(State::Fault);
+ }
+
+ LSQRequest::_inst->fault = fault;
+ LSQRequest::_inst->translationCompleted(true);
+ }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
+ ThreadContext* tc, BaseTLB::Mode mode)
+{
+ _fault.push_back(fault);
+ assert(req == _requests[numTranslatedFragments] || this->isDelayed());
+
+ numInTranslationFragments--;
+ numTranslatedFragments++;
+
+ mainReq->setFlags(req->getFlags());
+
+ if (numTranslatedFragments == _requests.size()) {
+ if (_inst->isSquashed()) {
+ this->squashTranslation();
+ } else {
+ _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
+ flags.set(Flag::TranslationFinished);
+ auto fault_it = _fault.begin();
+            /* Fast-forward past NoFault entries to the first fault, if any. */
+ while (fault_it != _fault.end() && *fault_it == NoFault)
+ fault_it++;
+ /* If none of the fragments faulted: */
+ if (fault_it == _fault.end()) {
+ _inst->physEffAddr = request(0)->getPaddr();
+
+ _inst->memReqFlags = mainReq->getFlags();
+ if (mainReq->isCondSwap()) {
+ assert(_res);
+ mainReq->setExtraData(*_res);
+ }
+ setState(State::Request);
+ _inst->fault = NoFault;
+ } else {
+ setState(State::Fault);
+ _inst->fault = *fault_it;
+ }
+ _inst->translationCompleted(true);
+ }
+ }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::initiateTranslation()
+{
+ _inst->translationStarted(true);
+ setState(State::Translation);
+ flags.set(Flag::TranslationStarted);
+
+ _inst->savedReq = this;
+ sendFragmentToTranslation(0);
+}
+
+template<class Impl>
+PacketPtr
+LSQ<Impl>::SplitDataRequest::mainPacket()
+{
+ return _mainPacket;
+}
+
+template<class Impl>
+RequestPtr
+LSQ<Impl>::SplitDataRequest::mainRequest()
+{
+ return mainReq;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::initiateTranslation()
+{
+ _inst->translationStarted(true);
+ setState(State::Translation);
+ flags.set(Flag::TranslationStarted);
+
+ unsigned int cacheLineSize = _port.cacheLineSize();
+ Addr base_addr = _addr;
+ Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
+ Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
+ uint32_t size_so_far = 0;
+
+ mainReq = std::make_shared<Request>(_inst->getASID(), base_addr,
+ _size, _flags, _inst->masterId(),
+ _inst->instAddr(), _inst->contextId());
+
+ // Paddr is not used in mainReq. However, we will accumulate the flags
+ // from the sub requests into mainReq by calling setFlags() in finish().
+ // setFlags() assumes that paddr is set so flip the paddr valid bit here to
+ // avoid a potential assert in setFlags() when we call it from finish().
+ mainReq->setPaddr(0);
+
+ /* Get the pre-fix, possibly unaligned. */
+ _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr,
+ next_addr - base_addr, _flags, _inst->masterId(),
+ _inst->instAddr(), _inst->contextId()));
+ size_so_far = next_addr - base_addr;
+
+ /* We are block aligned now, reading whole blocks. */
+ base_addr = next_addr;
+ while (base_addr != final_addr) {
+ _requests.push_back(std::make_shared<Request>(_inst->getASID(),
+ base_addr, cacheLineSize, _flags, _inst->masterId(),
+ _inst->instAddr(), _inst->contextId()));
+ size_so_far += cacheLineSize;
+ base_addr += cacheLineSize;
+ }
+
+ /* Deal with the tail. */
+ if (size_so_far < _size) {
+ _requests.push_back(std::make_shared<Request>(_inst->getASID(),
+ base_addr, _size - size_so_far, _flags, _inst->masterId(),
+ _inst->instAddr(), _inst->contextId()));
+ }
+
+ /* Setup the requests and send them to translation. */
+ for (auto& r: _requests) {
+ r->setReqInstSeqNum(_inst->seqNum);
+ r->taskId(_taskId);
+ }
+ this->_inst->savedReq = this;
+ numInTranslationFragments = 0;
+ numTranslatedFragments = 0;
+
+ for (uint32_t i = 0; i < _requests.size(); i++) {
+ sendFragmentToTranslation(i);
+ }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i)
+{
+ numInTranslationFragments++;
+ _port.dTLB()->translateTiming(
+ this->request(i),
+ this->_inst->thread->getTC(), this,
+ this->isLoad() ? BaseTLB::Read : BaseTLB::Write);
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SingleDataRequest::recvTimingResp(PacketPtr pkt)
+{
+ assert(_numOutstandingPackets == 1);
+ auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+ setState(State::Complete);
+ flags.set(Flag::Complete);
+ state->outstanding--;
+ assert(pkt == _packets.front());
+ _port.completeDataAccess(pkt);
+ return true;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
+{
+ auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+ uint32_t pktIdx = 0;
+ while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
+ pktIdx++;
+ assert(pktIdx < _packets.size());
+ assert(pkt->req == _requests[pktIdx]);
+ assert(pkt == _packets[pktIdx]);
+ numReceivedPackets++;
+ state->outstanding--;
+ if (numReceivedPackets == _packets.size()) {
+ setState(State::Complete);
+ flags.set(Flag::Complete);
+ /* Assemble packets. */
+ PacketPtr resp = isLoad()
+ ? Packet::createRead(mainReq)
+ : Packet::createWrite(mainReq);
+ if (isLoad())
+ resp->dataStatic(_inst->memData);
+ else
+ resp->dataStatic(_data);
+ resp->senderState = _senderState;
+ _port.completeDataAccess(resp);
+ delete resp;
+ }
+ return true;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::buildPackets()
+{
+ assert(_senderState);
+ /* Retries do not create new packets. */
+ if (_packets.size() == 0) {
+ _packets.push_back(
+ isLoad()
+ ? Packet::createRead(request())
+ : Packet::createWrite(request()));
+ _packets.back()->dataStatic(_inst->memData);
+ _packets.back()->senderState = _senderState;
+ }
+ assert(_packets.size() == 1);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::buildPackets()
+{
+    /* 'offset' tracks each fragment's position in the instruction's data. */
+ ptrdiff_t offset = 0;
+ if (_packets.size() == 0) {
+        /* First attempt: build the packets. Retries reuse them. */
+ if (isLoad()) {
+ _mainPacket = Packet::createRead(mainReq);
+ _mainPacket->dataStatic(_inst->memData);
+ }
+ for (auto& r: _requests) {
+ PacketPtr pkt = isLoad() ? Packet::createRead(r)
+ : Packet::createWrite(r);
+ if (isLoad()) {
+ pkt->dataStatic(_inst->memData + offset);
+ } else {
+ uint8_t* req_data = new uint8_t[r->getSize()];
+ std::memcpy(req_data,
+ _inst->memData + offset,
+ r->getSize());
+ pkt->dataDynamic(req_data);
+ }
+ offset += r->getSize();
+ pkt->senderState = _senderState;
+ _packets.push_back(pkt);
+ }
+ }
+ assert(_packets.size() == _requests.size());
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::sendPacketToCache()
+{
+ assert(_numOutstandingPackets == 0);
+ if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0)))
+ _numOutstandingPackets = 1;
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::sendPacketToCache()
+{
+ /* Try to send the packets. */
+ while (numReceivedPackets + _numOutstandingPackets < _packets.size() &&
+ lsqUnit()->trySendPacket(isLoad(),
+ _packets.at(numReceivedPackets + _numOutstandingPackets))) {
+ _numOutstandingPackets++;
+ }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SingleDataRequest::handleIprWrite(ThreadContext *thread,
+ PacketPtr pkt)
+{
+ TheISA::handleIprWrite(thread, pkt);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::SplitDataRequest::handleIprWrite(ThreadContext *thread,
+ PacketPtr mainPkt)
+{
+ unsigned offset = 0;
+ for (auto r: _requests) {
+ PacketPtr pkt = new Packet(r, MemCmd::WriteReq);
+ pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
+ TheISA::handleIprWrite(thread, pkt);
+ offset += r->getSize();
+ delete pkt;
+ }
+}
+
+template<class Impl>
+Cycles
+LSQ<Impl>::SingleDataRequest::handleIprRead(ThreadContext *thread,
+ PacketPtr pkt)
+{
+ return TheISA::handleIprRead(thread, pkt);
+}
+
+template<class Impl>
+Cycles
+LSQ<Impl>::SplitDataRequest::handleIprRead(ThreadContext *thread,
+ PacketPtr mainPkt)
+{
+ Cycles delay(0);
+ unsigned offset = 0;
+
+ for (auto r: _requests) {
+ PacketPtr pkt = new Packet(r, MemCmd::ReadReq);
+ pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
+ Cycles d = TheISA::handleIprRead(thread, pkt);
+ if (d > delay)
+ delay = d;
+ offset += r->getSize();
+ delete pkt;
+ }
+ return delay;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+ return ( (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr);
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+ bool is_hit = false;
+ for (auto &r: _requests) {
+ if ((r->getPaddr() & blockMask) == blockAddr) {
+ is_hit = true;
+ break;
+ }
+ }
+ return is_hit;
+}
+
#endif//__CPU_O3_LSQ_IMPL_HH__
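SplitDataRequest::initiateTranslation() above carves an access that crosses a cache-line boundary into an unaligned prefix, zero or more whole lines, and an optional tail. The fragment arithmetic can be checked in isolation; a compilable sketch using the same block helpers (standalone, outside gem5):

    #include <cstdio>
    #include <utility>
    #include <vector>

    typedef unsigned long long Addr64;

    static Addr64 blockAlign(Addr64 a, Addr64 bs) { return a & ~(bs - 1); }
    static bool needsBurst(Addr64 a, Addr64 sz, Addr64 bs)
    { return (a & (bs - 1)) + sz > bs; }

    int main()
    {
        const Addr64 line = 64, addr = 0x3f8, size = 16;  // crosses 0x400
        std::vector<std::pair<Addr64, Addr64>> frags;     // (addr, size)

        if (!needsBurst(addr, size, line)) {
            frags.emplace_back(addr, size);               // single request
        } else {
            Addr64 next = blockAlign(addr + line, line);  // end of first line
            Addr64 last = blockAlign(addr + size, line);  // start of last fragment
            frags.emplace_back(addr, next - addr);        // unaligned prefix
            for (Addr64 b = next; b != last; b += line)   // whole lines
                frags.emplace_back(b, line);
            if (last - addr < size)                       // unaligned tail
                frags.emplace_back(last, size - (last - addr));
        }
        for (auto &f : frags)                             // prints two 8-byte fragments
            std::printf("fragment @%#llx, %llu bytes\n", f.first, f.second);
    }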
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 48a06b386..5b90da4f5 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012-2014,2017 ARM Limited
+ * Copyright (c) 2012-2014,2017-2018 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -62,6 +62,7 @@
#include "mem/port.hh"
+#include "base/circular_queue.hh"
struct DerivO3CPUParams;
/**
* Class that implements the actual LQ and SQ for each specific
@@ -76,7 +77,8 @@ struct DerivO3CPUParams;
* replayed.
*/
template <class Impl>
-class LSQUnit {
+class LSQUnit
+{
public:
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
@@ -84,6 +86,130 @@ class LSQUnit {
typedef typename Impl::CPUPol::LSQ LSQ;
typedef typename Impl::CPUPol::IssueStruct IssueStruct;
+ using LSQSenderState = typename LSQ::LSQSenderState;
+ using LSQRequest = typename Impl::CPUPol::LSQ::LSQRequest;
+ private:
+ class LSQEntry
+ {
+ private:
+ /** The instruction. */
+ DynInstPtr inst;
+ /** The request. */
+ LSQRequest* req;
+ /** The size of the operation. */
+ uint8_t _size;
+ /** Valid entry. */
+ bool _valid;
+ public:
+ /** Constructs an empty store queue entry. */
+ LSQEntry()
+ : inst(nullptr), req(nullptr), _size(0), _valid(false)
+ {
+ }
+
+ ~LSQEntry()
+ {
+ inst = nullptr;
+ if (req != nullptr) {
+ req->freeLSQEntry();
+ req = nullptr;
+ }
+ }
+
+ void
+ clear()
+ {
+ inst = nullptr;
+ if (req != nullptr) {
+ req->freeLSQEntry();
+ }
+ req = nullptr;
+ _valid = false;
+ _size = 0;
+ }
+
+ void
+ set(const DynInstPtr& inst)
+ {
+ assert(!_valid);
+ this->inst = inst;
+ _valid = true;
+ _size = 0;
+ }
+ LSQRequest* request() { return req; }
+ void setRequest(LSQRequest* r) { req = r; }
+ bool hasRequest() { return req != nullptr; }
+ /** Member accessors. */
+ /** @{ */
+ bool valid() const { return _valid; }
+ uint8_t& size() { return _size; }
+ const uint8_t& size() const { return _size; }
+ const DynInstPtr& instruction() const { return inst; }
+ /** @} */
+ };
+
+ class SQEntry : public LSQEntry
+ {
+ private:
+ /** The store data. */
+ char _data[64]; // TODO: 64 should become a parameter
+ /** Whether or not the store can writeback. */
+ bool _canWB;
+ /** Whether or not the store is committed. */
+ bool _committed;
+ /** Whether or not the store is completed. */
+ bool _completed;
+        /** Does this request write all zeros and thus carry no data?
+         * Used for cache-block-zero style instructions
+         * (ARM DC ZVA; ALPHA WH64).
+ */
+ bool _isAllZeros;
+ public:
+ static constexpr size_t DataSize = sizeof(_data);
+ /** Constructs an empty store queue entry. */
+ SQEntry()
+ : _canWB(false), _committed(false), _completed(false),
+ _isAllZeros(false)
+ {
+ std::memset(_data, 0, DataSize);
+ }
+
+ ~SQEntry()
+ {
+ }
+
+ void
+ set(const DynInstPtr& inst)
+ {
+ LSQEntry::set(inst);
+ }
+
+ void
+ clear()
+ {
+ LSQEntry::clear();
+ _canWB = _completed = _committed = _isAllZeros = false;
+ }
+ /** Member accessors. */
+ /** @{ */
+ bool& canWB() { return _canWB; }
+ const bool& canWB() const { return _canWB; }
+ bool& completed() { return _completed; }
+ const bool& completed() const { return _completed; }
+ bool& committed() { return _committed; }
+ const bool& committed() const { return _committed; }
+ bool& isAllZeros() { return _isAllZeros; }
+ const bool& isAllZeros() const { return _isAllZeros; }
+ char* data() { return _data; }
+ const char* data() const { return _data; }
+ /** @} */
+ };
+ using LQEntry = LSQEntry;
+
+ public:
+ using LoadQueue = CircularQueue<LQEntry>;
+ using StoreQueue = CircularQueue<SQEntry>;
+
public:
/** Constructs an LSQ unit. init() must be called prior to use. */
LSQUnit(uint32_t lqEntries, uint32_t sqEntries);
@@ -113,13 +239,6 @@ class LSQUnit {
/** Takes over from another CPU's thread. */
void takeOverFrom();
- /** Ticks the LSQ unit, which in this case only resets the number of
- * used cache ports.
- * @todo: Move the number of used ports up to the LSQ level so it can
- * be shared by all LSQ units.
- */
- void tick() { usedStorePorts = 0; }
-
/** Inserts an instruction. */
void insert(const DynInstPtr &inst);
/** Inserts a load instruction. */
@@ -133,7 +252,8 @@ class LSQUnit {
* @param load_idx index to start checking at
* @param inst the instruction to check
*/
- Fault checkViolations(int load_idx, const DynInstPtr &inst);
+ Fault checkViolations(typename LoadQueue::iterator& loadIt,
+ const DynInstPtr& inst);
/** Check if an incoming invalidate hits in the lsq on a load
     * that might have issued out of order wrt another load because
@@ -163,18 +283,6 @@ class LSQUnit {
* memory system. */
void completeDataAccess(PacketPtr pkt);
- /** Clears all the entries in the LQ. */
- void clearLQ();
-
- /** Clears all the entries in the SQ. */
- void clearSQ();
-
- /** Resizes the LQ to a given size. */
- void resizeLQ(unsigned size);
-
- /** Resizes the SQ to a given size. */
- void resizeSQ(unsigned size);
-
/** Squashes all instructions younger than a specific sequence number. */
void squash(const InstSeqNum &squashed_num);
@@ -205,10 +313,10 @@ class LSQUnit {
bool isEmpty() const { return lqEmpty() && sqEmpty(); }
/** Returns if the LQ is full. */
- bool lqFull() { return loads >= (LQEntries - 1); }
+ bool lqFull() { return loadQueue.full(); }
/** Returns if the SQ is full. */
- bool sqFull() { return stores >= (SQEntries - 1); }
+ bool sqFull() { return storeQueue.full(); }
/** Returns if the LQ is empty. */
bool lqEmpty() const { return loads == 0; }
@@ -226,13 +334,20 @@ class LSQUnit {
int numStoresToWB() { return storesToWB; }
/** Returns if the LSQ unit will writeback on this cycle. */
- bool willWB() { return storeQueue[storeWBIdx].canWB &&
- !storeQueue[storeWBIdx].completed &&
- !isStoreBlocked; }
+ bool
+ willWB()
+ {
+ return storeWBIt.dereferenceable() &&
+ storeWBIt->valid() &&
+ storeWBIt->canWB() &&
+ !storeWBIt->completed() &&
+ !isStoreBlocked;
+ }
/** Handles doing the retry. */
void recvRetry();
+ unsigned int cacheLineSize();
private:
/** Reset the LSQ state */
void resetState();
@@ -240,31 +355,31 @@ class LSQUnit {
/** Writes back the instruction, sending it to IEW. */
void writeback(const DynInstPtr &inst, PacketPtr pkt);
- /** Writes back a store that couldn't be completed the previous cycle. */
- void writebackPendingStore();
-
- /** Handles completing the send of a store to memory. */
- void storePostSend(PacketPtr pkt);
+ /** Try to finish a previously blocked write back attempt */
+ void writebackBlockedStore();
/** Completes the store at the specified index. */
- void completeStore(int store_idx);
-
- /** Attempts to send a store to the cache. */
- bool sendStore(PacketPtr data_pkt);
+ void completeStore(typename StoreQueue::iterator store_idx);
- /** Increments the given store index (circular queue). */
- inline void incrStIdx(int &store_idx) const;
- /** Decrements the given store index (circular queue). */
- inline void decrStIdx(int &store_idx) const;
- /** Increments the given load index (circular queue). */
- inline void incrLdIdx(int &load_idx) const;
- /** Decrements the given load index (circular queue). */
- inline void decrLdIdx(int &load_idx) const;
+ /** Handles completing the send of a store to memory. */
+ void storePostSend();
public:
+ /** Attempts to send a packet to the cache.
+ * Check if there are ports available. Return true if
+ * there are, false if there are not.
+ */
+ bool trySendPacket(bool isLoad, PacketPtr data_pkt);
+
+
/** Debugging function to dump instructions in the LSQ. */
void dumpInsts() const;
+ /** Schedule event for the cpu. */
+ void schedule(Event& ev, Tick when) { cpu->schedule(ev, when); }
+
+ BaseTLB* dTLB() { return cpu->dtb; }
+
private:
/** Pointer to the CPU. */
O3CPU *cpu;
@@ -278,44 +393,46 @@ class LSQUnit {
/** Pointer to the dcache port. Used only for sending. */
MasterPort *dcachePort;
- /** Derived class to hold any sender state the LSQ needs. */
- class LSQSenderState : public Packet::SenderState
+ /** Particularisation of the LSQSenderState to the LQ. */
+ class LQSenderState : public LSQSenderState
{
+ using LSQSenderState::alive;
public:
- /** Default constructor. */
- LSQSenderState()
- : mainPkt(NULL), pendingPacket(NULL), idx(0), outstanding(1),
- isLoad(false), noWB(false), isSplit(false),
- pktToSend(false), cacheBlocked(false)
- { }
-
- /** Instruction who initiated the access to memory. */
- DynInstPtr inst;
- /** The main packet from a split load, used during writeback. */
- PacketPtr mainPkt;
- /** A second packet from a split store that needs sending. */
- PacketPtr pendingPacket;
- /** The LQ/SQ index of the instruction. */
- uint8_t idx;
- /** Number of outstanding packets to complete. */
- uint8_t outstanding;
- /** Whether or not it is a load. */
- bool isLoad;
- /** Whether or not the instruction will need to writeback. */
- bool noWB;
- /** Whether or not this access is split in two. */
- bool isSplit;
- /** Whether or not there is a packet that needs sending. */
- bool pktToSend;
- /** Whether or not the second packet of this split load was blocked */
- bool cacheBlocked;
-
- /** Completes a packet and returns whether the access is finished. */
- inline bool complete() { return --outstanding == 0; }
+ LQSenderState(typename LoadQueue::iterator idx_)
+ : LSQSenderState(idx_->request(), true), idx(idx_) { }
+
+ /** The LQ index of the instruction. */
+ typename LoadQueue::iterator idx;
+ //virtual LSQRequest* request() { return idx->request(); }
+ virtual void
+ complete()
+ {
+ //if (alive())
+ // idx->request()->senderState(nullptr);
+ }
+ };
+
+ /** Particularisation of the LSQSenderState to the SQ. */
+ class SQSenderState : public LSQSenderState
+ {
+ using LSQSenderState::alive;
+ public:
+ SQSenderState(typename StoreQueue::iterator idx_)
+ : LSQSenderState(idx_->request(), false), idx(idx_) { }
+ /** The SQ index of the instruction. */
+ typename StoreQueue::iterator idx;
+ //virtual LSQRequest* request() { return idx->request(); }
+ virtual void
+ complete()
+ {
+ //if (alive())
+ // idx->request()->senderState(nullptr);
+ }
};
/** Writeback event, specifically for when stores forward data to loads. */
- class WritebackEvent : public Event {
+ class WritebackEvent : public Event
+ {
public:
/** Constructs a writeback event. */
WritebackEvent(const DynInstPtr &_inst, PacketPtr pkt,
@@ -339,72 +456,25 @@ class LSQUnit {
};
public:
- struct SQEntry {
- /** Constructs an empty store queue entry. */
- SQEntry()
- : inst(NULL), req(NULL), size(0),
- canWB(0), committed(0), completed(0)
- {
- std::memset(data, 0, sizeof(data));
- }
-
- ~SQEntry()
- {
- inst = NULL;
- }
-
- /** Constructs a store queue entry for a given instruction. */
- SQEntry(const DynInstPtr &_inst)
- : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0),
- isSplit(0), canWB(0), committed(0), completed(0), isAllZeros(0)
- {
- std::memset(data, 0, sizeof(data));
- }
- /** The store data. */
- char data[16];
- /** The store instruction. */
- DynInstPtr inst;
- /** The request for the store. */
- RequestPtr req;
- /** The split requests for the store. */
- RequestPtr sreqLow;
- RequestPtr sreqHigh;
- /** The size of the store. */
- uint8_t size;
- /** Whether or not the store is split into two requests. */
- bool isSplit;
- /** Whether or not the store can writeback. */
- bool canWB;
- /** Whether or not the store is committed. */
- bool committed;
- /** Whether or not the store is completed. */
- bool completed;
- /** Does this request write all zeros and thus doesn't
- * have any data attached to it. Used for cache block zero
- * style instructs (ARM DC ZVA; ALPHA WH64)
- */
- bool isAllZeros;
- };
+ /**
+ * Handles writing back and completing the load or store that has
+ * returned from memory.
+ *
+ * @param pkt Response packet from the memory sub-system
+ */
+ bool recvTimingResp(PacketPtr pkt);
private:
/** The LSQUnit thread id. */
ThreadID lsqID;
-
+ public:
/** The store queue. */
- std::vector<SQEntry> storeQueue;
+ CircularQueue<SQEntry> storeQueue;
/** The load queue. */
- std::vector<DynInstPtr> loadQueue;
-
- /** The number of LQ entries, plus a sentinel entry (circular queue).
- * @todo: Consider having var that records the true number of LQ entries.
- */
- unsigned LQEntries;
- /** The number of SQ entries, plus a sentinel entry (circular queue).
- * @todo: Consider having var that records the true number of SQ entries.
- */
- unsigned SQEntries;
+ LoadQueue loadQueue;
+ private:
/** The number of places to shift addresses in the LSQ before checking
* for dependency violations
*/
@@ -420,28 +490,10 @@ class LSQUnit {
/** The number of store instructions in the SQ waiting to writeback. */
int storesToWB;
- /** The index of the head instruction in the LQ. */
- int loadHead;
- /** The index of the tail instruction in the LQ. */
- int loadTail;
-
- /** The index of the head instruction in the SQ. */
- int storeHead;
/** The index of the first instruction that may be ready to be
* written back, and has not yet been written back.
*/
- int storeWBIdx;
- /** The index of the tail instruction in the SQ. */
- int storeTail;
-
- /// @todo Consider moving to a more advanced model with write vs read ports
- /** The number of cache ports available each cycle (stores only). */
- int cacheStorePorts;
-
- /** The number of used cache ports in this cycle by stores. */
- int usedStorePorts;
-
- //list<InstSeqNum> mshrSeqNums;
+ typename StoreQueue::iterator storeWBIt;
/** Address Mask for a cache block (e.g. ~(cache_block_size-1)) */
Addr cacheBlockMask;
@@ -472,10 +524,10 @@ class LSQUnit {
/** Whether or not there is a packet that couldn't be sent because of
* a lack of cache ports. */
- bool hasPendingPkt;
+ bool hasPendingRequest;
/** The packet that is pending free cache ports. */
- PacketPtr pendingPkt;
+ LSQRequest* pendingRequest;
/** Flag for memory model. */
bool needsTSO;
@@ -516,53 +568,51 @@ class LSQUnit {
public:
/** Executes the load at the given index. */
- Fault read(const RequestPtr &req,
- RequestPtr &sreqLow, RequestPtr &sreqHigh,
- int load_idx);
+ Fault read(LSQRequest *req, int load_idx);
/** Executes the store at the given index. */
- Fault write(const RequestPtr &req,
- const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
- uint8_t *data, int store_idx);
+ Fault write(LSQRequest *req, uint8_t *data, int store_idx);
/** Returns the index of the head load instruction. */
- int getLoadHead() { return loadHead; }
+ int getLoadHead() { return loadQueue.head(); }
+
/** Returns the sequence number of the head load instruction. */
- InstSeqNum getLoadHeadSeqNum()
+ InstSeqNum
+ getLoadHeadSeqNum()
{
- if (loadQueue[loadHead]) {
- return loadQueue[loadHead]->seqNum;
- } else {
- return 0;
- }
-
+ return loadQueue.front().valid()
+ ? loadQueue.front().instruction()->seqNum
+ : 0;
}
/** Returns the index of the head store instruction. */
- int getStoreHead() { return storeHead; }
+ int getStoreHead() { return storeQueue.head(); }
/** Returns the sequence number of the head store instruction. */
- InstSeqNum getStoreHeadSeqNum()
+ InstSeqNum
+ getStoreHeadSeqNum()
{
- if (storeQueue[storeHead].inst) {
- return storeQueue[storeHead].inst->seqNum;
- } else {
- return 0;
- }
-
+ return storeQueue.front().valid()
+ ? storeQueue.front().instruction()->seqNum
+ : 0;
}
/** Returns whether or not the LSQ unit is stalled. */
bool isStalled() { return stalled; }
+ public:
+ typedef typename CircularQueue<LQEntry>::iterator LQIterator;
+ typedef typename CircularQueue<SQEntry>::iterator SQIterator;
+ typedef CircularQueue<LQEntry> LQueue;
+ typedef CircularQueue<SQEntry> SQueue;
};
template <class Impl>
Fault
-LSQUnit<Impl>::read(const RequestPtr &req,
- RequestPtr &sreqLow, RequestPtr &sreqHigh,
- int load_idx)
+LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
{
- DynInstPtr load_inst = loadQueue[load_idx];
+ LQEntry& load_req = loadQueue[load_idx];
+ const DynInstPtr& load_inst = load_req.instruction();
+ load_req.setRequest(req);
assert(load_inst);
assert(!load_inst->isExecuted());
@@ -571,184 +621,188 @@ LSQUnit<Impl>::read(const RequestPtr &req,
// A bit of a hackish way to get strictly ordered accesses to work
// only if they're at the head of the LSQ and are ready to commit
// (at the head of the ROB too).
- if (req->isStrictlyOrdered() &&
- (load_idx != loadHead || !load_inst->isAtCommit())) {
+
+ if (req->mainRequest()->isStrictlyOrdered() &&
+ (load_idx != loadQueue.head() || !load_inst->isAtCommit())) {
+ // Tell IQ/mem dep unit that this instruction will need to be
+ // rescheduled eventually
iewStage->rescheduleMemInst(load_inst);
+ load_inst->clearIssued();
+ load_inst->effAddrValid(false);
++lsqRescheduledLoads;
DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n",
load_inst->seqNum, load_inst->pcState());
+ // Must delete request now that it wasn't handed off to
+ // memory. This is quite ugly. @todo: Figure out the proper
+ // place to really handle request deletes.
+ load_req.setRequest(nullptr);
+ req->discard();
return std::make_shared<GenericISA::M5PanicFault>(
"Strictly ordered load [sn:%llx] PC %s\n",
load_inst->seqNum, load_inst->pcState());
}
- // Check the SQ for any previous stores that might lead to forwarding
- int store_idx = load_inst->sqIdx;
-
- int store_size = 0;
-
DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
"storeHead: %i addr: %#x%s\n",
- load_idx, store_idx, storeHead, req->getPaddr(),
- sreqLow ? " split" : "");
+ load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1,
+ req->mainRequest()->getPaddr(), req->isSplit() ? " split" : "");
- if (req->isLLSC()) {
- assert(!sreqLow);
+ if (req->mainRequest()->isLLSC()) {
// Disable recording the result temporarily. Writing to misc
// regs normally updates the result, but this is not the
// desired behavior when handling store conditionals.
load_inst->recordResult(false);
- TheISA::handleLockedRead(load_inst.get(), req);
+ TheISA::handleLockedRead(load_inst.get(), req->mainRequest());
load_inst->recordResult(true);
}
- if (req->isMmappedIpr()) {
+ if (req->mainRequest()->isMmappedIpr()) {
assert(!load_inst->memData);
load_inst->memData = new uint8_t[64];
ThreadContext *thread = cpu->tcBase(lsqID);
- Cycles delay(0);
- PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
-
- data_pkt->dataStatic(load_inst->memData);
- if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
- delay = TheISA::handleIprRead(thread, data_pkt);
- } else {
- assert(sreqLow->isMmappedIpr() && sreqHigh->isMmappedIpr());
- PacketPtr fst_data_pkt = new Packet(sreqLow, MemCmd::ReadReq);
- PacketPtr snd_data_pkt = new Packet(sreqHigh, MemCmd::ReadReq);
-
- fst_data_pkt->dataStatic(load_inst->memData);
- snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
-
- delay = TheISA::handleIprRead(thread, fst_data_pkt);
- Cycles delay2 = TheISA::handleIprRead(thread, snd_data_pkt);
- if (delay2 > delay)
- delay = delay2;
-
- delete fst_data_pkt;
- delete snd_data_pkt;
- }
- WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+ PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq);
+
+ Cycles delay = req->handleIprRead(thread, main_pkt);
+
+ WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
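The removed block above open-coded the two-packet IPR case; the new code delegates that bookkeeping to the request object. A hedged sketch of what a split-aware handleIprRead could look like; SplitDataRequest and _packets are assumed names, not taken from this diff:

    template <class Impl>
    Cycles
    SplitDataRequest<Impl>::handleIprRead(ThreadContext *thread,
                                          PacketPtr mainPkt)
    {
        // mainPkt holds the merged view; merging of fragment data into it
        // is elided in this sketch.
        Cycles delay(0);
        // One IPR access per fragment; the access completes when the
        // slowest fragment does, matching the removed max-of-delays code.
        for (PacketPtr pkt : _packets) {
            Cycles d = TheISA::handleIprRead(thread, pkt);
            if (d > delay)
                delay = d;
        }
        return delay;
    }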
- while (store_idx != -1) {
- // End once we've reached the top of the LSQ
- if (store_idx == storeWBIdx) {
- break;
- }
-
+ // Check the SQ for any previous stores that might lead to forwarding
+ auto store_it = load_inst->sqIt;
+ assert(store_it >= storeWBIt);
+ // End once we've reached the top of the LSQ
+ while (store_it != storeWBIt) {
// Move the index to one younger
- if (--store_idx < 0)
- store_idx += SQEntries;
-
- assert(storeQueue[store_idx].inst);
-
- store_size = storeQueue[store_idx].size;
-
- if (!store_size || storeQueue[store_idx].inst->strictlyOrdered() ||
- (storeQueue[store_idx].req &&
- storeQueue[store_idx].req->isCacheMaintenance())) {
- // Cache maintenance instructions go down via the store
- // path but they carry no data and they shouldn't be
- // considered for forwarding
- continue;
- }
-
- assert(storeQueue[store_idx].inst->effAddrValid());
-
- // Check if the store data is within the lower and upper bounds of
- // addresses that the request needs.
- bool store_has_lower_limit =
- req->getVaddr() >= storeQueue[store_idx].inst->effAddr;
- bool store_has_upper_limit =
- (req->getVaddr() + req->getSize()) <=
- (storeQueue[store_idx].inst->effAddr + store_size);
- bool lower_load_has_store_part =
- req->getVaddr() < (storeQueue[store_idx].inst->effAddr +
- store_size);
- bool upper_load_has_store_part =
- (req->getVaddr() + req->getSize()) >
- storeQueue[store_idx].inst->effAddr;
-
- // If the store's data has all of the data needed and the load isn't
- // LLSC, we can forward.
- if (store_has_lower_limit && store_has_upper_limit && !req->isLLSC()) {
- // Get shift amount for offset into the store's data.
- int shift_amt = req->getVaddr() - storeQueue[store_idx].inst->effAddr;
-
- // Allocate memory if this is the first time a load is issued.
- if (!load_inst->memData) {
- load_inst->memData = new uint8_t[req->getSize()];
- }
- if (storeQueue[store_idx].isAllZeros)
- memset(load_inst->memData, 0, req->getSize());
- else
- memcpy(load_inst->memData,
- storeQueue[store_idx].data + shift_amt, req->getSize());
-
- DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
- "addr %#x\n", store_idx, req->getVaddr());
-
- PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
- data_pkt->dataStatic(load_inst->memData);
-
- WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
-
- // We'll say this has a 1 cycle load-store forwarding latency
- // for now.
- // @todo: Need to make this a parameter.
- cpu->schedule(wb, curTick());
-
- ++lsqForwLoads;
- return NoFault;
- } else if (
- (!req->isLLSC() &&
+ store_it--;
+ assert(store_it->valid());
+ assert(store_it->instruction()->seqNum < load_inst->seqNum);
+ int store_size = store_it->size();
+
+ // Cache maintenance instructions go down via the store
+ // path but they carry no data and they shouldn't be
+ // considered for forwarding
+ if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
+ !(store_it->request()->mainRequest() &&
+ store_it->request()->mainRequest()->isCacheMaintenance())) {
+ assert(store_it->instruction()->effAddrValid());
+
+ // Check if the store data is within the lower and upper bounds of
+ // addresses that the request needs.
+ auto req_s = req->mainRequest()->getVaddr();
+ auto req_e = req_s + req->mainRequest()->getSize();
+ auto st_s = store_it->instruction()->effAddr;
+ auto st_e = st_s + store_size;
+
+ bool store_has_lower_limit = req_s >= st_s;
+ bool store_has_upper_limit = req_e <= st_e;
+ bool lower_load_has_store_part = req_s < st_e;
+ bool upper_load_has_store_part = req_e > st_s;
+
+ // If the store's data has all of the data needed and the load
+ // isn't LLSC, we can forward.
+ if (store_has_lower_limit && store_has_upper_limit &&
+ !req->mainRequest()->isLLSC()) {
+
+ // Get shift amount for offset into the store's data.
+ int shift_amt = req->mainRequest()->getVaddr() -
+ store_it->instruction()->effAddr;
+
+ // Allocate memory if this is the first time a load is issued.
+ if (!load_inst->memData) {
+ load_inst->memData =
+ new uint8_t[req->mainRequest()->getSize()];
+ }
+ if (store_it->isAllZeros())
+ memset(load_inst->memData, 0,
+ req->mainRequest()->getSize());
+ else
+ memcpy(load_inst->memData,
+ store_it->data() + shift_amt,
+ req->mainRequest()->getSize());
+
+ DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
+ "addr %#x\n", store_it._idx,
+ req->mainRequest()->getVaddr());
+
+ PacketPtr data_pkt = new Packet(req->mainRequest(),
+ MemCmd::ReadReq);
+ data_pkt->dataStatic(load_inst->memData);
+
+ if (req->isAnyOutstandingRequest()) {
+ assert(req->_numOutstandingPackets > 0);
+ // There are memory request packets in flight already.
+ // This may happen if the store was not completed the
+ // first time this load got executed. Signal the
+ // senderState that response packets should be discarded.
+ req->discardSenderState();
+ }
+
+ WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
+ this);
+
+ // We'll say this has a 1 cycle load-store forwarding latency
+ // for now.
+ // @todo: Need to make this a parameter.
+ cpu->schedule(wb, curTick());
+
+ // Don't need to do anything special for split loads.
+ ++lsqForwLoads;
+
+ return NoFault;
+ } else if (
+ (!req->mainRequest()->isLLSC() &&
((store_has_lower_limit && lower_load_has_store_part) ||
(store_has_upper_limit && upper_load_has_store_part) ||
(lower_load_has_store_part && upper_load_has_store_part))) ||
- (req->isLLSC() &&
+ (req->mainRequest()->isLLSC() &&
((store_has_lower_limit || upper_load_has_store_part) &&
(store_has_upper_limit || lower_load_has_store_part)))) {
- // This is the partial store-load forwarding case where a store
- // has only part of the load's data and the load isn't LLSC or
- // the load is LLSC and the store has all or part of the load's
- // data
-
- // If it's already been written back, then don't worry about
- // stalling on it.
- if (storeQueue[store_idx].completed) {
- panic("Should not check one of these");
- continue;
+ // This is the partial store-load forwarding case, where
+ // either the store has only part of the load's data and
+ // the load isn't LLSC, or the load is LLSC and the store
+ // has all or part of the load's data.
+
+ // If it's already been written back, then don't worry about
+ // stalling on it.
+ if (store_it->completed()) {
+ panic("Should not check one of these");
+ continue;
+ }
+
+ // Must stall load and force it to retry, so long as it's the
+ // oldest load that needs to do so.
+ if (!stalled ||
+ (stalled &&
+ load_inst->seqNum <
+ loadQueue[stallingLoadIdx].instruction()->seqNum)) {
+ stalled = true;
+ stallingStoreIsn = store_it->instruction()->seqNum;
+ stallingLoadIdx = load_idx;
+ }
+
+ // Tell IQ/mem dep unit that this instruction will need to be
+ // rescheduled eventually
+ iewStage->rescheduleMemInst(load_inst);
+ load_inst->clearIssued();
+ load_inst->effAddrValid(false);
+ ++lsqRescheduledLoads;
+
+ // Do not generate a writeback event as this instruction is not
+ // complete.
+ DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
+ "Store idx %i to load addr %#x\n",
+ store_it._idx, req->mainRequest()->getVaddr());
+
+ // Must discard the request.
+ req->discard();
+ load_req.setRequest(nullptr);
+ return NoFault;
}
-
- // Must stall load and force it to retry, so long as it's the oldest
- // load that needs to do so.
- if (!stalled ||
- (stalled &&
- load_inst->seqNum <
- loadQueue[stallingLoadIdx]->seqNum)) {
- stalled = true;
- stallingStoreIsn = storeQueue[store_idx].inst->seqNum;
- stallingLoadIdx = load_idx;
- }
-
- // Tell IQ/mem dep unit that this instruction will need to be
- // rescheduled eventually
- iewStage->rescheduleMemInst(load_inst);
- load_inst->clearIssued();
- ++lsqRescheduledLoads;
-
- // Do not generate a writeback event as this instruction is not
- // complete.
- DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
- "Store idx %i to load addr %#x\n",
- store_idx, req->getVaddr());
-
- return NoFault;
}
}
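For non-LLSC loads the four bounds booleans above reduce to two questions about byte intervals: does the store fully cover the load (forward the data), or does it merely overlap it (stall and replay the load)? A standalone illustration of that predicate logic, assuming half-open intervals [start, end):

    #include <cassert>
    #include <cstdint>

    struct Interval { uint64_t s, e; };   // half-open [s, e)

    // Store fully covers the load: full forwarding is possible.
    bool covers(Interval st, Interval ld)
    { return ld.s >= st.s && ld.e <= st.e; }

    // Store overlaps the load without covering it: must stall and replay.
    bool partialOverlap(Interval st, Interval ld)
    { return !covers(st, ld) && ld.s < st.e && ld.e > st.s; }

    int main()
    {
        Interval store{0x100, 0x108};                           // 8-byte store
        assert(covers(store, Interval{0x104, 0x108}));          // inside: forward
        assert(partialOverlap(store, Interval{0x106, 0x10c})); // straddles: stall
        assert(!partialOverlap(store, Interval{0x110, 0x114})); // disjoint: neither
        return 0;
    }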
@@ -758,40 +812,7 @@ LSQUnit<Impl>::read(const RequestPtr &req,
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
- load_inst->memData = new uint8_t[req->getSize()];
- }
-
- // if we the cache is not blocked, do cache access
- bool completedFirst = false;
- PacketPtr data_pkt = Packet::createRead(req);
- PacketPtr fst_data_pkt = NULL;
- PacketPtr snd_data_pkt = NULL;
-
- data_pkt->dataStatic(load_inst->memData);
-
- LSQSenderState *state = new LSQSenderState;
- state->isLoad = true;
- state->idx = load_idx;
- state->inst = load_inst;
- data_pkt->senderState = state;
-
- if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
- // Point the first packet at the main data packet.
- fst_data_pkt = data_pkt;
- } else {
- // Create the split packets.
- fst_data_pkt = Packet::createRead(sreqLow);
- snd_data_pkt = Packet::createRead(sreqHigh);
-
- fst_data_pkt->dataStatic(load_inst->memData);
- snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
-
- fst_data_pkt->senderState = state;
- snd_data_pkt->senderState = state;
-
- state->isSplit = true;
- state->outstanding = 2;
- state->mainPkt = data_pkt;
+ load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
}
// For now, load throughput is constrained by the number of
@@ -799,97 +820,46 @@ LSQUnit<Impl>::read(const RequestPtr &req,
// stores do).
// @todo We should account for cache port contention
// and arbitrate between loads and stores.
- bool successful_load = true;
- if (!dcachePort->sendTimingReq(fst_data_pkt)) {
- successful_load = false;
- } else if (TheISA::HasUnalignedMemAcc && sreqLow) {
- completedFirst = true;
-
- // The first packet was sent without problems, so send this one
- // too. If there is a problem with this packet then the whole
- // load will be squashed, so indicate this to the state object.
- // The first packet will return in completeDataAccess and be
- // handled there.
- // @todo We should also account for cache port contention
- // here.
- if (!dcachePort->sendTimingReq(snd_data_pkt)) {
- // The main packet will be deleted in completeDataAccess.
- state->complete();
- // Signify to 1st half that the 2nd half was blocked via state
- state->cacheBlocked = true;
- successful_load = false;
- }
- }
-
- // If the cache was blocked, or has become blocked due to the access,
- // handle it.
- if (!successful_load) {
- if (!sreqLow) {
- // Packet wasn't split, just delete main packet info
- delete state;
- delete data_pkt;
- }
-
- if (TheISA::HasUnalignedMemAcc && sreqLow) {
- if (!completedFirst) {
- // Split packet, but first failed. Delete all state.
- delete state;
- delete data_pkt;
- delete fst_data_pkt;
- delete snd_data_pkt;
- sreqLow.reset();
- sreqHigh.reset();
- } else {
- // Can't delete main packet data or state because first packet
- // was sent to the memory system
- delete data_pkt;
- delete snd_data_pkt;
- sreqHigh.reset();
- }
- }
-
- ++lsqCacheBlocked;
-
- iewStage->blockMemInst(load_inst);
- // No fault occurred, even though the interface is blocked.
- return NoFault;
+ // If the cache is not blocked, do the cache access.
+ if (req->senderState() == nullptr) {
+ LQSenderState *state = new LQSenderState(
+ loadQueue.getIterator(load_idx));
+ state->isLoad = true;
+ state->inst = load_inst;
+ state->isSplit = req->isSplit();
+ req->senderState(state);
}
+ req->buildPackets();
+ req->sendPacketToCache();
+ if (!req->isSent())
+ iewStage->blockMemInst(load_inst);
return NoFault;
}
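buildPackets() is where a split access fans out into its two packets, absorbing the removed fst/snd packet construction above. A hedged sketch of its split-case shape; _requests, _packets, _senderState and _inst are assumed member names:

    template <class Impl>
    void
    SplitDataRequest<Impl>::buildPackets()
    {
        if (_packets.empty()) {
            Addr base = _requests.front()->getVaddr();
            for (const RequestPtr &r : _requests) {
                PacketPtr pkt = isLoad() ? Packet::createRead(r)
                                         : Packet::createWrite(r);
                // Each fragment views its slice of the instruction's buffer,
                // as the removed memData / memData + size code did.
                pkt->dataStatic(_inst->memData + (r->getVaddr() - base));
                pkt->senderState = _senderState;
                _packets.push_back(pkt);
            }
        }
    }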
template <class Impl>
Fault
-LSQUnit<Impl>::write(const RequestPtr &req,
- const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
- uint8_t *data, int store_idx)
+LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx)
{
- assert(storeQueue[store_idx].inst);
-
- DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x"
- " | storeHead:%i [sn:%i]\n",
- store_idx, req->getPaddr(), storeHead,
- storeQueue[store_idx].inst->seqNum);
-
- storeQueue[store_idx].req = req;
- storeQueue[store_idx].sreqLow = sreqLow;
- storeQueue[store_idx].sreqHigh = sreqHigh;
- unsigned size = req->getSize();
- storeQueue[store_idx].size = size;
- bool store_no_data = req->getFlags() & Request::STORE_NO_DATA;
- storeQueue[store_idx].isAllZeros = store_no_data;
- assert(size <= sizeof(storeQueue[store_idx].data) || store_no_data);
-
- // Split stores can only occur in ISAs with unaligned memory accesses. If
- // a store request has been split, sreqLow and sreqHigh will be non-null.
- if (TheISA::HasUnalignedMemAcc && sreqLow) {
- storeQueue[store_idx].isSplit = true;
- }
-
- if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO) && \
- !req->isCacheMaintenance())
- memcpy(storeQueue[store_idx].data, data, size);
+ assert(storeQueue[store_idx].valid());
+
+ DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
+ "[sn:%i]\n",
+ store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1,
+ storeQueue[store_idx].instruction()->seqNum);
+
+ storeQueue[store_idx].setRequest(req);
+ unsigned size = req->_size;
+ storeQueue[store_idx].size() = size;
+ bool store_no_data =
+ req->mainRequest()->getFlags() & Request::STORE_NO_DATA;
+ storeQueue[store_idx].isAllZeros() = store_no_data;
+ assert(size <= SQEntry::DataSize || store_no_data);
+
+ if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) &&
+ !req->request()->isCacheMaintenance())
+ memcpy(storeQueue[store_idx].data(), data, size);
// This function only writes the data to the store queue, so no fault
// can happen here.
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 13b148768..9756a9ef1 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2010-2014, 2017 ARM Limited
+ * Copyright (c) 2010-2014, 2017-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -66,6 +66,8 @@ LSQUnit<Impl>::WritebackEvent::WritebackEvent(const DynInstPtr &_inst,
: Event(Default_Pri, AutoDelete),
inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
{
+ assert(_inst->savedReq);
+ _inst->savedReq->writebackScheduled();
}
template<class Impl>
@@ -76,9 +78,8 @@ LSQUnit<Impl>::WritebackEvent::process()
lsqPtr->writeback(inst, pkt);
- if (pkt->senderState)
- delete pkt->senderState;
-
+ assert(inst->savedReq);
+ inst->savedReq->writebackDone();
delete pkt;
}
@@ -89,65 +90,61 @@ LSQUnit<Impl>::WritebackEvent::description() const
return "Store writeback";
}
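The constructor/process() pairing above (writebackScheduled() on construction, writebackDone() after writeback()) lets a request stay alive while a scheduled event still holds one of its packets. A hedged sketch of that pairing; the flag and release() are assumed names:

    void LSQRequest::writebackScheduled()
    {
        assert(!_writebackScheduled);
        _writebackScheduled = true;   // pin the request while the event is live
    }

    void LSQRequest::writebackDone()
    {
        _writebackScheduled = false;
        release();                    // the request may free itself now
    }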
+template <class Impl>
+bool
+LSQUnit<Impl>::recvTimingResp(PacketPtr pkt)
+{
+ auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+ LSQRequest* req = senderState->request();
+ assert(req != nullptr);
+ bool ret = true;
+ /* Check that the request is still alive before any further action. */
+ if (senderState->alive()) {
+ ret = req->recvTimingResp(pkt);
+ } else {
+ senderState->outstanding--;
+ }
+ return ret;
+}
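Responses reach this function from the LSQ's dcache port (wired up in lsq_impl.hh, outside this hunk). The alive() check is what makes discarded requests safe: a response for a squashed-and-discarded access only decrements the outstanding count and is dropped, never touching freed queue state. The port-side plumbing is roughly (assumed sketch):

    template <class Impl>
    bool
    LSQ<Impl>::recvTimingResp(PacketPtr pkt)
    {
        auto *state = dynamic_cast<LSQSenderState*>(pkt->senderState);
        // Route the response to the unit of the issuing thread.
        return thread[state->inst->threadNumber].recvTimingResp(pkt);
    }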
+
template<class Impl>
void
LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
{
LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
DynInstPtr inst = state->inst;
- DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
- DPRINTF(Activity, "Activity: Writeback event [sn:%lli].\n", inst->seqNum);
-
- if (state->cacheBlocked) {
- // This is the first half of a previous split load,
- // where the 2nd half blocked, ignore this response
- DPRINTF(IEW, "[sn:%lli]: Response from first half of earlier "
- "blocked split load recieved. Ignoring.\n", inst->seqNum);
- delete state;
- return;
- }
- // If this is a split access, wait until all packets are received.
- if (TheISA::HasUnalignedMemAcc && !state->complete()) {
- return;
- }
+ cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
+
+ /* Notify the sender state that the access is complete (for ownership
+ * tracking). */
+ state->complete();
assert(!cpu->switchedOut());
if (!inst->isSquashed()) {
- if (!state->noWB) {
+ if (state->needWB) {
// Only loads and store conditionals perform the writeback
// after receiving the response from the memory
assert(inst->isLoad() || inst->isStoreConditional());
- if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
- !state->isLoad) {
- writeback(inst, pkt);
- } else {
- writeback(inst, state->mainPkt);
+ writeback(inst, state->request()->mainPacket());
+ if (inst->isStore()) {
+ auto ss = dynamic_cast<SQSenderState*>(state);
+ ss->writebackDone();
+ completeStore(ss->idx);
}
- }
-
- if (inst->isStore()) {
- completeStore(state->idx);
+ } else if (inst->isStore()) {
+ completeStore(dynamic_cast<SQSenderState*>(state)->idx);
}
}
-
- if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) {
- delete state->mainPkt;
- }
-
- pkt->req->setAccessLatency();
- cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
-
- delete state;
}
template <class Impl>
LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
: lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1),
- LQEntries(lqEntries+1), SQEntries(sqEntries+1),
loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
- isStoreBlocked(false), storeInFlight(false), hasPendingPkt(false),
- pendingPkt(nullptr)
+ isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false),
+ pendingRequest(nullptr)
{
}
@@ -167,7 +164,6 @@ LSQUnit<Impl>::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params,
depCheckShift = params->LSQDepCheckShift;
checkLoads = params->LSQCheckLoads;
- cacheStorePorts = params->cacheStorePorts;
needsTSO = params->needsTSO;
resetState();
@@ -180,11 +176,8 @@ LSQUnit<Impl>::resetState()
{
loads = stores = storesToWB = 0;
- loadHead = loadTail = 0;
-
- storeHead = storeWBIdx = storeTail = 0;
- usedStorePorts = 0;
+ storeWBIt = storeQueue.begin();
retryPkt = NULL;
memDepViolator = NULL;
@@ -259,24 +252,10 @@ LSQUnit<Impl>::setDcachePort(MasterPort *dcache_port)
template<class Impl>
void
-LSQUnit<Impl>::clearLQ()
-{
- loadQueue.clear();
-}
-
-template<class Impl>
-void
-LSQUnit<Impl>::clearSQ()
-{
- storeQueue.clear();
-}
-
-template<class Impl>
-void
LSQUnit<Impl>::drainSanityCheck() const
{
- for (int i = 0; i < loadQueue.size(); ++i)
- assert(!loadQueue[i]);
+ for (int i = 0; i < loadQueue.capacity(); ++i)
+ assert(!loadQueue[i].valid());
assert(storesToWB == 0);
assert(!retryPkt);
@@ -289,44 +268,6 @@ LSQUnit<Impl>::takeOverFrom()
resetState();
}
-template<class Impl>
-void
-LSQUnit<Impl>::resizeLQ(unsigned size)
-{
- unsigned size_plus_sentinel = size + 1;
- assert(size_plus_sentinel >= LQEntries);
-
- if (size_plus_sentinel > LQEntries) {
- while (size_plus_sentinel > loadQueue.size()) {
- DynInstPtr dummy;
- loadQueue.push_back(dummy);
- LQEntries++;
- }
- } else {
- LQEntries = size_plus_sentinel;
- }
-
- assert(LQEntries <= 256);
-}
-
-template<class Impl>
-void
-LSQUnit<Impl>::resizeSQ(unsigned size)
-{
- unsigned size_plus_sentinel = size + 1;
- if (size_plus_sentinel > SQEntries) {
- while (size_plus_sentinel > storeQueue.size()) {
- SQEntry dummy;
- storeQueue.push_back(dummy);
- SQEntries++;
- }
- } else {
- SQEntries = size_plus_sentinel;
- }
-
- assert(SQEntries <= 256);
-}
-
template <class Impl>
void
LSQUnit<Impl>::insert(const DynInstPtr &inst)
@@ -348,44 +289,42 @@ template <class Impl>
void
LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
{
- assert((loadTail + 1) % LQEntries != loadHead);
- assert(loads < LQEntries);
+ assert(!loadQueue.full());
+ assert(loads < loadQueue.capacity());
DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n",
- load_inst->pcState(), loadTail, load_inst->seqNum);
+ load_inst->pcState(), loadQueue.tail(), load_inst->seqNum);
- load_inst->lqIdx = loadTail;
+ /* Grow the queue. */
+ loadQueue.advance_tail();
- if (stores == 0) {
- load_inst->sqIdx = -1;
- } else {
- load_inst->sqIdx = storeTail;
- }
+ load_inst->sqIt = storeQueue.end();
- loadQueue[loadTail] = load_inst;
-
- incrLdIdx(loadTail);
+ assert(!loadQueue.back().valid());
+ loadQueue.back().set(load_inst);
+ load_inst->lqIdx = loadQueue.tail();
+ load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx);
++loads;
}
template <class Impl>
void
-LSQUnit<Impl>::insertStore(const DynInstPtr &store_inst)
+LSQUnit<Impl>::insertStore(const DynInstPtr& store_inst)
{
// Make sure it is not full before inserting an instruction.
- assert((storeTail + 1) % SQEntries != storeHead);
- assert(stores < SQEntries);
+ assert(!storeQueue.full());
+ assert(stores < storeQueue.capacity());
DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n",
- store_inst->pcState(), storeTail, store_inst->seqNum);
-
- store_inst->sqIdx = storeTail;
- store_inst->lqIdx = loadTail;
+ store_inst->pcState(), storeQueue.tail(), store_inst->seqNum);
+ storeQueue.advance_tail();
- storeQueue[storeTail] = SQEntry(store_inst);
+ store_inst->sqIdx = storeQueue.tail();
+ store_inst->lqIdx = loadQueue.moduloAdd(loadQueue.tail(), 1);
+ store_inst->lqIt = loadQueue.end();
- incrStIdx(storeTail);
+ storeQueue.back().set(store_inst);
++stores;
}
@@ -407,8 +346,9 @@ LSQUnit<Impl>::numFreeLoadEntries()
{
//LQ has an extra dummy entry to differentiate
//empty/full conditions. Subtract 1 from the free entries.
- DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", LQEntries, loads);
- return LQEntries - loads - 1;
+ DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n",
+ 1 + loadQueue.capacity(), loads);
+ return loadQueue.capacity() - loads;
}
template <class Impl>
@@ -417,8 +357,9 @@ LSQUnit<Impl>::numFreeStoreEntries()
{
//SQ has an extra dummy entry to differentiate
//empty/full conditions. Subtract 1 from the free entries.
- DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", SQEntries, stores);
- return SQEntries - stores - 1;
+ DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n",
+ 1 + storeQueue.capacity(), stores);
+ return storeQueue.capacity() - stores;
}
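Both queues keep the classic N+1 sentinel convention that the comments above describe, now hidden inside CircularQueue (note the lqEntries+1 / sqEntries+1 construction in the LSQUnit constructor earlier in this file). A standalone illustration of the convention:

    #include <cstddef>
    #include <vector>

    template <typename T>
    class Ring
    {
        std::vector<T> buf;          // capacity() + 1 slots; one is a sentinel
        size_t head = 0, tail = 0;
      public:
        explicit Ring(size_t n) : buf(n + 1) { }
        size_t capacity() const { return buf.size() - 1; }
        bool empty() const { return head == tail; }
        // Full when advancing the tail would collide with the head.
        bool full() const { return (tail + 1) % buf.size() == head; }
        // Callers check full()/empty() first, as the LSQ asserts do.
        void push(const T &v) { buf[tail] = v; tail = (tail + 1) % buf.size(); }
        void pop() { head = (head + 1) % buf.size(); }
    };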
@@ -429,11 +370,8 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
// Should only ever get invalidations in here
assert(pkt->isInvalidate());
- int load_idx = loadHead;
DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr());
- // Only Invalidate packet calls checkSnoop
- assert(pkt->isInvalidate());
for (int x = 0; x < cpu->numContexts(); x++) {
ThreadContext *tc = cpu->getContext(x);
bool no_squash = cpu->thread[x]->noSquashFromTC;
@@ -442,44 +380,37 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
cpu->thread[x]->noSquashFromTC = no_squash;
}
- Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
+ if (loadQueue.empty())
+ return;
- DynInstPtr ld_inst = loadQueue[load_idx];
- if (ld_inst) {
- Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask;
- Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask;
+ auto iter = loadQueue.begin();
- // Check that this snoop didn't just invalidate our lock flag
- if (ld_inst->effAddrValid() && (load_addr_low == invalidate_addr
- || load_addr_high == invalidate_addr)
- && ld_inst->memReqFlags & Request::LLSC)
- TheISA::handleLockedSnoopHit(ld_inst.get());
- }
+ Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
- // If this is the only load in the LSQ we don't care
- if (load_idx == loadTail)
- return;
+ DynInstPtr ld_inst = iter->instruction();
+ assert(ld_inst);
+ LSQRequest *req = iter->request();
- incrLdIdx(load_idx);
+ // Check that this snoop didn't just invalidate our lock flag
+ if (ld_inst->effAddrValid() &&
+ req->isCacheBlockHit(invalidate_addr, cacheBlockMask)
+ && ld_inst->memReqFlags & Request::LLSC)
+ TheISA::handleLockedSnoopHit(ld_inst.get());
bool force_squash = false;
- while (load_idx != loadTail) {
- DynInstPtr ld_inst = loadQueue[load_idx];
-
- if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
- incrLdIdx(load_idx);
+ while (++iter != loadQueue.end()) {
+ ld_inst = iter->instruction();
+ assert(ld_inst);
+ req = iter->request();
+ if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered())
continue;
- }
-
- Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask;
- Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask;
- DPRINTF(LSQUnit, "-- inst [sn:%lli] load_addr: %#x to pktAddr:%#x\n",
- ld_inst->seqNum, load_addr_low, invalidate_addr);
+ DPRINTF(LSQUnit, "-- inst [sn:%lli] to pktAddr:%#x\n",
+ ld_inst->seqNum, invalidate_addr);
- if ((load_addr_low == invalidate_addr
- || load_addr_high == invalidate_addr) || force_squash) {
+ if (force_squash ||
+ req->isCacheBlockHit(invalidate_addr, cacheBlockMask)) {
if (needsTSO) {
// If we have a TSO system, as all loads must be ordered with
// all other loads, this load as well as *all* subsequent loads
@@ -508,14 +439,14 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
ld_inst->hitExternalSnoop(true);
}
}
- incrLdIdx(load_idx);
}
return;
}
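isCacheBlockHit() replaces the explicit physEffAddrLow/physEffAddrHigh masking that the removed lines performed, letting the request check all of its (possibly split) physical addresses. A hedged sketch of its likely shape; _requests is an assumed member name:

    bool
    LSQRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
    {
        // One request, or two when the access is split across cache lines.
        for (const auto &r : _requests)
            if ((r->getPaddr() & blockMask) == blockAddr)
                return true;
        return false;
    }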
template <class Impl>
Fault
-LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
+LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt,
+ const DynInstPtr& inst)
{
Addr inst_eff_addr1 = inst->effAddr >> depCheckShift;
Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift;
@@ -525,10 +456,10 @@ LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
* all instructions that will execute before the store writes back. Thus,
* like the implementation that came before it, we're overly conservative.
*/
- while (load_idx != loadTail) {
- DynInstPtr ld_inst = loadQueue[load_idx];
+ while (loadIt != loadQueue.end()) {
+ DynInstPtr ld_inst = loadIt->instruction();
if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
- incrLdIdx(load_idx);
+ ++loadIt;
continue;
}
@@ -585,7 +516,7 @@ LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
}
}
- incrLdIdx(load_idx);
+ ++loadIt;
}
return NoFault;
}
@@ -608,8 +539,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
load_fault = inst->initiateAcc();
- if (inst->isTranslationDelayed() &&
- load_fault == NoFault)
+ if (inst->isTranslationDelayed() && load_fault == NoFault)
return load_fault;
// If the instruction faulted or predicated false, then we need to send it
@@ -631,12 +561,13 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
iewStage->instToCommit(inst);
iewStage->activityThisCycle();
} else {
- assert(inst->effAddrValid());
- int load_idx = inst->lqIdx;
- incrLdIdx(load_idx);
+ if (inst->effAddrValid()) {
+ auto it = inst->lqIt;
+ ++it;
- if (checkLoads)
- return checkViolations(load_idx, inst);
+ if (checkLoads)
+ return checkViolations(it, inst);
+ }
}
return load_fault;
@@ -659,7 +590,7 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst)
// Check the recently completed loads to see if any match this store's
// address. If so, then we have a memory ordering violation.
- int load_idx = store_inst->lqIdx;
+ typename LoadQueue::iterator loadIt = store_inst->lqIt;
Fault store_fault = store_inst->initiateAcc();
@@ -674,7 +605,7 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst)
return store_fault;
}
- if (storeQueue[store_idx].size == 0) {
+ if (storeQueue[store_idx].size() == 0) {
DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n",
store_inst->pcState(), store_inst->seqNum);
@@ -686,12 +617,12 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst)
if (store_inst->isStoreConditional()) {
// Store conditionals need to set themselves as able to
// writeback if we haven't had a fault by here.
- storeQueue[store_idx].canWB = true;
+ storeQueue[store_idx].canWB() = true;
++storesToWB;
}
- return checkViolations(load_idx, store_inst);
+ return checkViolations(loadIt, store_inst);
}
@@ -699,14 +630,13 @@ template <class Impl>
void
LSQUnit<Impl>::commitLoad()
{
- assert(loadQueue[loadHead]);
+ assert(loadQueue.front().valid());
DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
- loadQueue[loadHead]->pcState());
-
- loadQueue[loadHead] = NULL;
+ loadQueue.front().instruction()->pcState());
- incrLdIdx(loadHead);
+ loadQueue.front().clear();
+ loadQueue.pop_front();
--loads;
}
@@ -715,9 +645,10 @@ template <class Impl>
void
LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
{
- assert(loads == 0 || loadQueue[loadHead]);
+ assert(loads == 0 || loadQueue.front().valid());
- while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) {
+ while (loads != 0 && loadQueue.front().instruction()->seqNum
+ <= youngest_inst) {
commitLoad();
}
}
@@ -726,45 +657,37 @@ template <class Impl>
void
LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
{
- assert(stores == 0 || storeQueue[storeHead].inst);
-
- int store_idx = storeHead;
+ assert(stores == 0 || storeQueue.front().valid());
- while (store_idx != storeTail) {
- assert(storeQueue[store_idx].inst);
+ /* Forward iterate the store queue (age order). */
+ for (auto& x : storeQueue) {
+ assert(x.valid());
// Mark any stores that are now committed and have not yet
// been marked as able to write back.
- if (!storeQueue[store_idx].canWB) {
- if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
+ if (!x.canWB()) {
+ if (x.instruction()->seqNum > youngest_inst) {
break;
}
DPRINTF(LSQUnit, "Marking store as able to write back, PC "
"%s [sn:%lli]\n",
- storeQueue[store_idx].inst->pcState(),
- storeQueue[store_idx].inst->seqNum);
+ x.instruction()->pcState(),
+ x.instruction()->seqNum);
- storeQueue[store_idx].canWB = true;
+ x.canWB() = true;
++storesToWB;
}
-
- incrStIdx(store_idx);
}
}
template <class Impl>
void
-LSQUnit<Impl>::writebackPendingStore()
+LSQUnit<Impl>::writebackBlockedStore()
{
- if (hasPendingPkt) {
- assert(pendingPkt != NULL);
-
- // If the cache is blocked, this will store the packet for retry.
- if (sendStore(pendingPkt)) {
- storePostSend(pendingPkt);
- }
- pendingPkt = NULL;
- hasPendingPkt = false;
+ assert(isStoreBlocked);
+ storeWBIt->request()->sendPacketToCache();
+ if (storeWBIt->request()->isSent()) {
+ storePostSend();
}
}
@@ -772,18 +695,17 @@ template <class Impl>
void
LSQUnit<Impl>::writebackStores()
{
- // First writeback the second packet from any split store that didn't
- // complete last cycle because there weren't enough cache ports available.
- if (TheISA::HasUnalignedMemAcc) {
- writebackPendingStore();
+ if (isStoreBlocked) {
+ DPRINTF(LSQUnit, "Writing back blocked store\n");
+ writebackBlockedStore();
}
while (storesToWB > 0 &&
- storeWBIdx != storeTail &&
- storeQueue[storeWBIdx].inst &&
- storeQueue[storeWBIdx].canWB &&
+ storeWBIt.dereferenceable() &&
+ storeWBIt->valid() &&
+ storeWBIt->canWB() &&
((!needsTSO) || (!storeInFlight)) &&
- usedStorePorts < cacheStorePorts) {
+ lsq->storePortAvailable()) {
if (isStoreBlocked) {
DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
@@ -793,188 +715,112 @@ LSQUnit<Impl>::writebackStores()
// Store didn't write any data so no need to write it back to
// memory.
- if (storeQueue[storeWBIdx].size == 0) {
- completeStore(storeWBIdx);
-
- incrStIdx(storeWBIdx);
-
+ if (storeWBIt->size() == 0) {
+ /* It is important that the increment happens at (or before)
+ * the call, as the code of completeStore checks storeWBIt. */
+ completeStore(storeWBIt++);
continue;
}
- ++usedStorePorts;
-
- if (storeQueue[storeWBIdx].inst->isDataPrefetch()) {
- incrStIdx(storeWBIdx);
-
+ if (storeWBIt->instruction()->isDataPrefetch()) {
+ storeWBIt++;
continue;
}
- assert(storeQueue[storeWBIdx].req);
- assert(!storeQueue[storeWBIdx].committed);
-
- if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) {
- assert(storeQueue[storeWBIdx].sreqLow);
- assert(storeQueue[storeWBIdx].sreqHigh);
- }
-
- DynInstPtr inst = storeQueue[storeWBIdx].inst;
-
- RequestPtr &req = storeQueue[storeWBIdx].req;
- const RequestPtr &sreqLow = storeQueue[storeWBIdx].sreqLow;
- const RequestPtr &sreqHigh = storeQueue[storeWBIdx].sreqHigh;
+ assert(storeWBIt->hasRequest());
+ assert(!storeWBIt->committed());
- storeQueue[storeWBIdx].committed = true;
+ DynInstPtr inst = storeWBIt->instruction();
+ LSQRequest* req = storeWBIt->request();
+ storeWBIt->committed() = true;
assert(!inst->memData);
- inst->memData = new uint8_t[req->getSize()];
+ inst->memData = new uint8_t[req->_size];
- if (storeQueue[storeWBIdx].isAllZeros)
- memset(inst->memData, 0, req->getSize());
+ if (storeWBIt->isAllZeros())
+ memset(inst->memData, 0, req->_size);
else
- memcpy(inst->memData, storeQueue[storeWBIdx].data, req->getSize());
+ memcpy(inst->memData, storeWBIt->data(), req->_size);
- PacketPtr data_pkt;
- PacketPtr snd_data_pkt = NULL;
- LSQSenderState *state = new LSQSenderState;
- state->isLoad = false;
- state->idx = storeWBIdx;
- state->inst = inst;
+ if (req->senderState() == nullptr) {
+ SQSenderState *state = new SQSenderState(storeWBIt);
+ state->isLoad = false;
+ state->needWB = false;
+ state->inst = inst;
- if (!TheISA::HasUnalignedMemAcc || !storeQueue[storeWBIdx].isSplit) {
-
- // Build a single data packet if the store isn't split.
- data_pkt = Packet::createWrite(req);
- data_pkt->dataStatic(inst->memData);
- data_pkt->senderState = state;
- } else {
- // Create two packets if the store is split in two.
- data_pkt = Packet::createWrite(sreqLow);
- snd_data_pkt = Packet::createWrite(sreqHigh);
-
- data_pkt->dataStatic(inst->memData);
- snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize());
-
- data_pkt->senderState = state;
- snd_data_pkt->senderState = state;
-
- state->isSplit = true;
- state->outstanding = 2;
-
- // Can delete the main request now.
- req = sreqLow;
+ req->senderState(state);
+ if (inst->isStoreConditional()) {
+ /* Only store conditionals need a writeback. */
+ state->needWB = true;
+ }
}
+ req->buildPackets();
DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s "
"to Addr:%#x, data:%#x [sn:%lli]\n",
- storeWBIdx, inst->pcState(),
- req->getPaddr(), (int)*(inst->memData),
+ storeWBIt.idx(), inst->pcState(),
+ req->request()->getPaddr(), (int)*(inst->memData),
inst->seqNum);
// @todo: Remove this SC hack once the memory system handles it.
if (inst->isStoreConditional()) {
- assert(!storeQueue[storeWBIdx].isSplit);
// Disable recording the result temporarily. Writing to
// misc regs normally updates the result, but this is not
// the desired behavior when handling store conditionals.
inst->recordResult(false);
- bool success = TheISA::handleLockedWrite(inst.get(), req, cacheBlockMask);
+ bool success = TheISA::handleLockedWrite(inst.get(),
+ req->request(), cacheBlockMask);
inst->recordResult(true);
+ req->packetSent();
if (!success) {
+ req->complete();
// Instantly complete this store.
DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. "
"Instantly completing it.\n",
inst->seqNum);
- WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+ PacketPtr new_pkt = new Packet(*req->packet());
+ WritebackEvent *wb = new WritebackEvent(inst,
+ new_pkt, this);
cpu->schedule(wb, curTick() + 1);
- completeStore(storeWBIdx);
- incrStIdx(storeWBIdx);
+ completeStore(storeWBIt);
+ if (!storeQueue.empty())
+ storeWBIt++;
+ else
+ storeWBIt = storeQueue.end();
continue;
}
- } else {
- // Non-store conditionals do not need a writeback.
- state->noWB = true;
}
- bool split =
- TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit;
-
- ThreadContext *thread = cpu->tcBase(lsqID);
-
- if (req->isMmappedIpr()) {
+ if (req->request()->isMmappedIpr()) {
assert(!inst->isStoreConditional());
- TheISA::handleIprWrite(thread, data_pkt);
- delete data_pkt;
- if (split) {
- assert(snd_data_pkt->req->isMmappedIpr());
- TheISA::handleIprWrite(thread, snd_data_pkt);
- delete snd_data_pkt;
- }
- delete state;
- completeStore(storeWBIdx);
- incrStIdx(storeWBIdx);
- } else if (!sendStore(data_pkt)) {
- DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
- "retry later\n",
- inst->seqNum);
+ ThreadContext *thread = cpu->tcBase(lsqID);
+ PacketPtr main_pkt = new Packet(req->mainRequest(),
+ MemCmd::WriteReq);
+ main_pkt->dataStatic(inst->memData);
+ req->handleIprWrite(thread, main_pkt);
+ delete main_pkt;
+ completeStore(storeWBIt);
+ storeWBIt++;
+ continue;
+ }
+ /* Send to cache */
+ req->sendPacketToCache();
- // Need to store the second packet, if split.
- if (split) {
- state->pktToSend = true;
- state->pendingPacket = snd_data_pkt;
- }
+ /* If successful, do the post send */
+ if (req->isSent()) {
+ storePostSend();
} else {
-
- // If split, try to send the second packet too
- if (split) {
- assert(snd_data_pkt);
-
- // Ensure there are enough ports to use.
- if (usedStorePorts < cacheStorePorts) {
- ++usedStorePorts;
- if (sendStore(snd_data_pkt)) {
- storePostSend(snd_data_pkt);
- } else {
- DPRINTF(IEW, "D-Cache became blocked when writing"
- " [sn:%lli] second packet, will retry later\n",
- inst->seqNum);
- }
- } else {
-
- // Store the packet for when there's free ports.
- assert(pendingPkt == NULL);
- pendingPkt = snd_data_pkt;
- hasPendingPkt = true;
- }
- } else {
-
- // Not a split store.
- storePostSend(data_pkt);
- }
+ DPRINTF(LSQUnit, "D-Cache became blocked when writing [sn:%lli], "
+ "will retry later\n",
+ inst->seqNum);
}
}
-
- // Not sure this should set it to 0.
- usedStorePorts = 0;
-
assert(stores >= 0 && storesToWB >= 0);
}
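Store-port accounting moves out of the unit: the per-unit cacheStorePorts/usedStorePorts counters (and the suspect per-call "usedStorePorts = 0" reset) removed above give way to storePortAvailable()/storePortBusy() on the shared LSQ, so all threads' units draw from one pool. A hedged sketch, with assumed member names:

    template <class Impl>
    bool
    LSQ<Impl>::storePortAvailable() const
    {
        return usedStorePorts < cacheStorePorts;
    }

    template <class Impl>
    void
    LSQ<Impl>::storePortBusy()
    {
        assert(usedStorePorts < cacheStorePorts);
        usedStorePorts++;   // presumably reset once per CPU tick, not per unit
    }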
-/*template <class Impl>
-void
-LSQUnit<Impl>::removeMSHR(InstSeqNum seqNum)
-{
- list<InstSeqNum>::iterator mshr_it = find(mshrSeqNums.begin(),
- mshrSeqNums.end(),
- seqNum);
-
- if (mshr_it != mshrSeqNums.end()) {
- mshrSeqNums.erase(mshr_it);
- DPRINTF(LSQUnit, "Removing MSHR. count = %i\n",mshrSeqNums.size());
- }
-}*/
-
template <class Impl>
void
LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
@@ -982,30 +828,26 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
"(Loads:%i Stores:%i)\n", squashed_num, loads, stores);
- int load_idx = loadTail;
- decrLdIdx(load_idx);
-
- while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
+ while (loads != 0 &&
+ loadQueue.back().instruction()->seqNum > squashed_num) {
DPRINTF(LSQUnit,"Load Instruction PC %s squashed, "
"[sn:%lli]\n",
- loadQueue[load_idx]->pcState(),
- loadQueue[load_idx]->seqNum);
+ loadQueue.back().instruction()->pcState(),
+ loadQueue.back().instruction()->seqNum);
- if (isStalled() && load_idx == stallingLoadIdx) {
+ if (isStalled() && loadQueue.tail() == stallingLoadIdx) {
stalled = false;
stallingStoreIsn = 0;
stallingLoadIdx = 0;
}
// Clear the smart pointer to make sure it is decremented.
- loadQueue[load_idx]->setSquashed();
- loadQueue[load_idx] = NULL;
- --loads;
+ loadQueue.back().instruction()->setSquashed();
+ loadQueue.back().clear();
- // Inefficient!
- loadTail = load_idx;
+ --loads;
- decrLdIdx(load_idx);
+ loadQueue.pop_back();
++lsqSquashedLoads;
}
@@ -1013,76 +855,63 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
memDepViolator = NULL;
}
- int store_idx = storeTail;
- decrStIdx(store_idx);
-
while (stores != 0 &&
- storeQueue[store_idx].inst->seqNum > squashed_num) {
+ storeQueue.back().instruction()->seqNum > squashed_num) {
// Instructions marked as can WB are already committed.
- if (storeQueue[store_idx].canWB) {
+ if (storeQueue.back().canWB()) {
break;
}
DPRINTF(LSQUnit,"Store Instruction PC %s squashed, "
"idx:%i [sn:%lli]\n",
- storeQueue[store_idx].inst->pcState(),
- store_idx, storeQueue[store_idx].inst->seqNum);
+ storeQueue.back().instruction()->pcState(),
+ storeQueue.tail(), storeQueue.back().instruction()->seqNum);
// I don't think this can happen. It should have been cleared
// by the stalling load.
if (isStalled() &&
- storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+ storeQueue.back().instruction()->seqNum == stallingStoreIsn) {
panic("Is stalled should have been cleared by stalling load!\n");
stalled = false;
stallingStoreIsn = 0;
}
// Clear the smart pointer to make sure it is decremented.
- storeQueue[store_idx].inst->setSquashed();
- storeQueue[store_idx].inst = NULL;
- storeQueue[store_idx].canWB = 0;
+ storeQueue.back().instruction()->setSquashed();
// Must delete request now that it wasn't handed off to
// memory. This is quite ugly. @todo: Figure out the proper
// place to really handle request deletes.
- storeQueue[store_idx].req.reset();
- if (TheISA::HasUnalignedMemAcc && storeQueue[store_idx].isSplit) {
- storeQueue[store_idx].sreqLow.reset();
- storeQueue[store_idx].sreqHigh.reset();
- }
-
+ storeQueue.back().clear();
--stores;
- // Inefficient!
- storeTail = store_idx;
-
- decrStIdx(store_idx);
+ storeQueue.pop_back();
++lsqSquashedStores;
}
}
template <class Impl>
void
-LSQUnit<Impl>::storePostSend(PacketPtr pkt)
+LSQUnit<Impl>::storePostSend()
{
if (isStalled() &&
- storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) {
+ storeWBIt->instruction()->seqNum == stallingStoreIsn) {
DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
"load idx:%i\n",
stallingStoreIsn, stallingLoadIdx);
stalled = false;
stallingStoreIsn = 0;
- iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+ iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction());
}
- if (!storeQueue[storeWBIdx].inst->isStoreConditional()) {
+ if (!storeWBIt->instruction()->isStoreConditional()) {
// The store is basically completed at this time. This
// only works so long as the checker doesn't try to
// verify the value in memory for stores.
- storeQueue[storeWBIdx].inst->setCompleted();
+ storeWBIt->instruction()->setCompleted();
if (cpu->checker) {
- cpu->checker->verify(storeQueue[storeWBIdx].inst);
+ cpu->checker->verify(storeWBIt->instruction());
}
}
@@ -1090,7 +919,7 @@ LSQUnit<Impl>::storePostSend(PacketPtr pkt)
storeInFlight = true;
}
- incrStIdx(storeWBIdx);
+ storeWBIt++;
}
template <class Impl>
@@ -1136,10 +965,10 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
template <class Impl>
void
-LSQUnit<Impl>::completeStore(int store_idx)
+LSQUnit<Impl>::completeStore(typename StoreQueue::iterator store_idx)
{
- assert(storeQueue[store_idx].inst);
- storeQueue[store_idx].completed = true;
+ assert(store_idx->valid());
+ store_idx->completed() = true;
--storesToWB;
// A bit conservative because a store completion may not free up entries,
// but hopefully avoids two store completions in one cycle from making
@@ -1147,39 +976,42 @@ LSQUnit<Impl>::completeStore(int store_idx)
cpu->wakeCPU();
cpu->activityThisCycle();
- if (store_idx == storeHead) {
+ /* We 'need' a copy here because we may clear the entry from the
+ * store queue. */
+ DynInstPtr store_inst = store_idx->instruction();
+ if (store_idx == storeQueue.begin()) {
do {
- incrStIdx(storeHead);
-
+ storeQueue.front().clear();
+ storeQueue.pop_front();
--stores;
- } while (storeQueue[storeHead].completed &&
- storeHead != storeTail);
+ } while (!storeQueue.empty() &&
+ storeQueue.front().completed());
iewStage->updateLSQNextCycle = true;
}
DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
"idx:%i\n",
- storeQueue[store_idx].inst->seqNum, store_idx, storeHead);
+ store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1);
#if TRACING_ON
if (DTRACE(O3PipeView)) {
- storeQueue[store_idx].inst->storeTick =
- curTick() - storeQueue[store_idx].inst->fetchTick;
+ store_idx->instruction()->storeTick =
+ curTick() - store_idx->instruction()->fetchTick;
}
#endif
if (isStalled() &&
- storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+ store_inst->seqNum == stallingStoreIsn) {
DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
"load idx:%i\n",
stallingStoreIsn, stallingLoadIdx);
stalled = false;
stallingStoreIsn = 0;
- iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+ iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction());
}
- storeQueue[store_idx].inst->setCompleted();
+ store_inst->setCompleted();
if (needsTSO) {
storeInFlight = false;
@@ -1188,95 +1020,62 @@ LSQUnit<Impl>::completeStore(int store_idx)
// Tell the checker we've completed this instruction. Some stores
// may get reported twice to the checker, but the checker can
// handle that case.
-
// Store conditionals cannot be sent to the checker yet, they have
// to update the misc registers first which should take place
// when they commit
- if (cpu->checker && !storeQueue[store_idx].inst->isStoreConditional()) {
- cpu->checker->verify(storeQueue[store_idx].inst);
+ if (cpu->checker && !store_inst->isStoreConditional()) {
+ cpu->checker->verify(store_inst);
}
}
template <class Impl>
bool
-LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
+LSQUnit<Impl>::trySendPacket(bool isLoad, PacketPtr data_pkt)
{
- if (!dcachePort->sendTimingReq(data_pkt)) {
- // Need to handle becoming blocked on a store.
- isStoreBlocked = true;
- ++lsqCacheBlocked;
- assert(retryPkt == NULL);
- retryPkt = data_pkt;
- return false;
+ bool ret = true;
+ bool cache_got_blocked = false;
+
+ auto state = dynamic_cast<LSQSenderState*>(data_pkt->senderState);
+
+ if (!lsq->cacheBlocked() && (isLoad || lsq->storePortAvailable())) {
+ if (!dcachePort->sendTimingReq(data_pkt)) {
+ ret = false;
+ cache_got_blocked = true;
+ }
+ } else {
+ ret = false;
}
- return true;
-}
-template <class Impl>
-void
-LSQUnit<Impl>::recvRetry()
-{
- if (isStoreBlocked) {
- DPRINTF(LSQUnit, "Receiving retry: store blocked\n");
- assert(retryPkt != NULL);
-
- LSQSenderState *state =
- dynamic_cast<LSQSenderState *>(retryPkt->senderState);
-
- if (dcachePort->sendTimingReq(retryPkt)) {
- // Don't finish the store unless this is the last packet.
- if (!TheISA::HasUnalignedMemAcc || !state->pktToSend ||
- state->pendingPacket == retryPkt) {
- state->pktToSend = false;
- storePostSend(retryPkt);
- }
- retryPkt = NULL;
+ if (ret) {
+ if (!isLoad) {
+ lsq->storePortBusy();
isStoreBlocked = false;
-
- // Send any outstanding packet.
- if (TheISA::HasUnalignedMemAcc && state->pktToSend) {
- assert(state->pendingPacket);
- if (sendStore(state->pendingPacket)) {
- storePostSend(state->pendingPacket);
- }
- }
- } else {
- // Still blocked!
+ }
+ state->outstanding++;
+ state->request()->packetSent();
+ } else {
+ if (cache_got_blocked) {
+ lsq->cacheBlocked(true);
++lsqCacheBlocked;
}
+ if (!isLoad) {
+ assert(state->request() == storeWBIt->request());
+ isStoreBlocked = true;
+ }
+ state->request()->packetNotSent();
}
-}
-
-template <class Impl>
-inline void
-LSQUnit<Impl>::incrStIdx(int &store_idx) const
-{
- if (++store_idx >= SQEntries)
- store_idx = 0;
-}
-
-template <class Impl>
-inline void
-LSQUnit<Impl>::decrStIdx(int &store_idx) const
-{
- if (--store_idx < 0)
- store_idx += SQEntries;
-}
-template <class Impl>
-inline void
-LSQUnit<Impl>::incrLdIdx(int &load_idx) const
-{
- if (++load_idx >= LQEntries)
- load_idx = 0;
+ return ret;
}
template <class Impl>
-inline void
-LSQUnit<Impl>::decrLdIdx(int &load_idx) const
+void
+LSQUnit<Impl>::recvRetry()
{
- if (--load_idx < 0)
- load_idx += LQEntries;
+ if (isStoreBlocked) {
+ DPRINTF(LSQUnit, "Receiving retry: blocked store\n");
+ writebackBlockedStore();
+ }
}
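The retry handshake is now symmetric with the send path: a failed store send latches isStoreBlocked and calls req->packetNotSent(); when the cache signals retry, recvRetry() simply replays the writeback through writebackBlockedStore(), which re-runs sendPacketToCache(). The port plumbing that invokes it is roughly (assumed sketch):

    template <class Impl>
    void
    LSQ<Impl>::recvReqRetry()
    {
        cacheBlocked(false);            // ports may be tried again
        for (ThreadID tid : *activeThreads)
            thread[tid].recvRetry();    // replays any blocked store
    }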
template <class Impl>
@@ -1287,29 +1086,28 @@ LSQUnit<Impl>::dumpInsts() const
cprintf("Load queue size: %i\n", loads);
cprintf("Load queue: ");
- int load_idx = loadHead;
-
- while (load_idx != loadTail && loadQueue[load_idx]) {
- const DynInstPtr &inst(loadQueue[load_idx]);
+ for (const auto& e: loadQueue) {
+ const DynInstPtr &inst(e.instruction());
cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum);
-
- incrLdIdx(load_idx);
}
cprintf("\n");
cprintf("Store queue size: %i\n", stores);
cprintf("Store queue: ");
- int store_idx = storeHead;
-
- while (store_idx != storeTail && storeQueue[store_idx].inst) {
- const DynInstPtr &inst(storeQueue[store_idx].inst);
+ for (const auto& e: storeQueue) {
+ const DynInstPtr &inst(e.instruction());
cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum);
-
- incrStIdx(store_idx);
}
cprintf("\n");
}
+template <class Impl>
+unsigned int
+LSQUnit<Impl>::cacheLineSize()
+{
+ return cpu->cacheLineSize();
+}
+
#endif//__CPU_O3_LSQ_UNIT_IMPL_HH__
diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc
index a4a201398..36d8297d1 100644
--- a/src/cpu/o3/probe/elastic_trace.cc
+++ b/src/cpu/o3/probe/elastic_trace.cc
@@ -409,7 +409,7 @@ ElasticTrace::addDepTraceRecord(const DynInstConstPtr& head_inst,
new_record->reqFlags = head_inst->memReqFlags;
new_record->virtAddr = head_inst->effAddr;
new_record->asid = head_inst->asid;
- new_record->physAddr = head_inst->physEffAddrLow;
+ new_record->physAddr = head_inst->physEffAddr;
// Currently the tracing does not support split requests.
new_record->size = head_inst->effSize;
new_record->pc = head_inst->instAddr();