summaryrefslogtreecommitdiff
path: root/src/cpu/o3/lsq_unit.hh
diff options
context:
space:
mode:
authorTimothy M. Jones <tjones1@inf.ed.ac.uk>2010-02-12 19:53:20 +0000
committerTimothy M. Jones <tjones1@inf.ed.ac.uk>2010-02-12 19:53:20 +0000
commit29e8bcead5700f638c4848d9b5710d0ebf18d64b (patch)
treee85dac6557f13146ae2cb119d3ea5b515f3f9e29 /src/cpu/o3/lsq_unit.hh
parent7fe9f92cfc73147a1a024c1632c9a7619c1779d1 (diff)
downloadgem5-29e8bcead5700f638c4848d9b5710d0ebf18d64b.tar.xz
O3CPU: Split loads and stores that cross cache line boundaries.
When each load or store is sent to the LSQ, we check whether it will cross a cache line boundary and, if so, split it in two. This creates two TLB translations and two memory requests. Care has to be taken if the first packet of a split load is sent but the second blocks the cache. Similarly, for a store, if the first packet cannot be sent, we must store the second one somewhere to retry later. This modifies the LSQSenderState class to record both packets in a split load or store. Finally, a new const variable, HasUnalignedMemAcc, is added to each ISA to indicate whether unaligned memory accesses are allowed. This is used throughout the changed code so that the compiler can optimise away code dealing with split requests for ISAs that don't need them.
Diffstat (limited to 'src/cpu/o3/lsq_unit.hh')
-rw-r--r--src/cpu/o3/lsq_unit.hh138
1 files changed, 123 insertions, 15 deletions
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 6ff36d929..cf51f8eab 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -216,12 +216,18 @@ class LSQUnit {
/** Writes back the instruction, sending it to IEW. */
void writeback(DynInstPtr &inst, PacketPtr pkt);
+ /** Writes back a store that couldn't be completed the previous cycle. */
+ void writebackPendingStore();
+
/** Handles completing the send of a store to memory. */
void storePostSend(PacketPtr pkt);
/** Completes the store at the specified index. */
void completeStore(int store_idx);
+ /** Attempts to send a store to the cache. */
+ bool sendStore(PacketPtr data_pkt);
+
/** Increments the given store index (circular queue). */
inline void incrStIdx(int &store_idx);
/** Decrements the given store index (circular queue). */
@@ -254,7 +260,8 @@ class LSQUnit {
public:
/** Default constructor. */
LSQSenderState()
- : noWB(false)
+ : noWB(false), isSplit(false), pktToSend(false), outstanding(1),
+ mainPkt(NULL), pendingPacket(NULL)
{ }
/** Instruction who initiated the access to memory. */
@@ -265,6 +272,19 @@ class LSQUnit {
int idx;
/** Whether or not the instruction will need to writeback. */
bool noWB;
+ /** Whether or not this access is split in two. */
+ bool isSplit;
+ /** Whether or not there is a packet that needs sending. */
+ bool pktToSend;
+ /** Number of outstanding packets to complete. */
+ int outstanding;
+ /** The main packet from a split load, used during writeback. */
+ PacketPtr mainPkt;
+ /** A second packet from a split store that needs sending. */
+ PacketPtr pendingPacket;
+
+ /** Completes a packet and returns whether the access is finished. */
+ inline bool complete() { return --outstanding == 0; }
};
/** Writeback event, specifically for when stores forward data to loads. */
@@ -302,8 +322,8 @@ class LSQUnit {
/** Constructs a store queue entry for a given instruction. */
SQEntry(DynInstPtr &_inst)
- : inst(_inst), req(NULL), size(0),
- canWB(0), committed(0), completed(0)
+ : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0),
+ isSplit(0), canWB(0), committed(0), completed(0)
{
std::memset(data, 0, sizeof(data));
}
@@ -312,10 +332,15 @@ class LSQUnit {
DynInstPtr inst;
/** The request for the store. */
RequestPtr req;
+ /** The split requests for the store. */
+ RequestPtr sreqLow;
+ RequestPtr sreqHigh;
/** The size of the store. */
int size;
/** The store data. */
char data[sizeof(IntReg)];
+ /** Whether or not the store is split into two requests. */
+ bool isSplit;
/** Whether or not the store can writeback. */
bool canWB;
/** Whether or not the store is committed. */
@@ -406,6 +431,13 @@ class LSQUnit {
/** The oldest load that caused a memory ordering violation. */
DynInstPtr memDepViolator;
+ /** Whether or not there is a packet that couldn't be sent because of
+ * a lack of cache ports. */
+ bool hasPendingPkt;
+
+ /** The packet that is pending free cache ports. */
+ PacketPtr pendingPkt;
+
// Will also need how many read/write ports the Dcache has. Or keep track
// of that in stage that is one level up, and only call executeLoad/Store
// the appropriate number of times.
@@ -443,11 +475,13 @@ class LSQUnit {
public:
/** Executes the load at the given index. */
template <class T>
- Fault read(Request *req, T &data, int load_idx);
+ Fault read(Request *req, Request *sreqLow, Request *sreqHigh, T &data,
+ int load_idx);
/** Executes the store at the given index. */
template <class T>
- Fault write(Request *req, T &data, int store_idx);
+ Fault write(Request *req, Request *sreqLow, Request *sreqHigh, T &data,
+ int store_idx);
/** Returns the index of the head load instruction. */
int getLoadHead() { return loadHead; }
@@ -482,7 +516,8 @@ class LSQUnit {
template <class Impl>
template <class T>
Fault
-LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
+LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh,
+ T &data, int load_idx)
{
DynInstPtr load_inst = loadQueue[load_idx];
@@ -503,6 +538,10 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
// memory. This is quite ugly. @todo: Figure out the proper
// place to really handle request deletes.
delete req;
+ if (TheISA::HasUnalignedMemAcc && sreqLow) {
+ delete sreqLow;
+ delete sreqHigh;
+ }
return TheISA::genMachineCheckFault();
}
@@ -512,10 +551,12 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
int store_size = 0;
DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
- "storeHead: %i addr: %#x\n",
- load_idx, store_idx, storeHead, req->getPaddr());
+ "storeHead: %i addr: %#x%s\n",
+ load_idx, store_idx, storeHead, req->getPaddr(),
+ sreqLow ? " split" : "");
if (req->isLLSC()) {
+ assert(!sreqLow);
// Disable recording the result temporarily. Writing to misc
// regs normally updates the result, but this is not the
// desired behavior when handling store conditionals.
@@ -587,6 +628,12 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
// @todo: Need to make this a parameter.
cpu->schedule(wb, curTick);
+ // Don't need to do anything special for split loads.
+ if (TheISA::HasUnalignedMemAcc && sreqLow) {
+ delete sreqLow;
+ delete sreqHigh;
+ }
+
++lsqForwLoads;
return NoFault;
} else if ((store_has_lower_limit && lower_load_has_store_part) ||
@@ -630,6 +677,10 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
// memory. This is quite ugly. @todo: Figure out the
// proper place to really handle request deletes.
delete req;
+ if (TheISA::HasUnalignedMemAcc && sreqLow) {
+ delete sreqLow;
+ delete sreqHigh;
+ }
return NoFault;
}
@@ -645,12 +696,14 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
++usedPorts;
// if we the cache is not blocked, do cache access
+ bool completedFirst = false;
if (!lsq->cacheBlocked()) {
- PacketPtr data_pkt =
- new Packet(req,
- (req->isLLSC() ?
- MemCmd::LoadLockedReq : MemCmd::ReadReq),
- Packet::Broadcast);
+ MemCmd command =
+ req->isLLSC() ? MemCmd::LoadLockedReq : MemCmd::ReadReq;
+ PacketPtr data_pkt = new Packet(req, command, Packet::Broadcast);
+ PacketPtr fst_data_pkt = NULL;
+ PacketPtr snd_data_pkt = NULL;
+
data_pkt->dataStatic(load_inst->memData);
LSQSenderState *state = new LSQSenderState;
@@ -659,18 +712,66 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
state->inst = load_inst;
data_pkt->senderState = state;
- if (!dcachePort->sendTiming(data_pkt)) {
+ if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
+
+ // Point the first packet at the main data packet.
+ fst_data_pkt = data_pkt;
+ } else {
+
+ // Create the split packets.
+ fst_data_pkt = new Packet(sreqLow, command, Packet::Broadcast);
+ snd_data_pkt = new Packet(sreqHigh, command, Packet::Broadcast);
+
+ fst_data_pkt->dataStatic(load_inst->memData);
+ snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
+
+ fst_data_pkt->senderState = state;
+ snd_data_pkt->senderState = state;
+
+ state->isSplit = true;
+ state->outstanding = 2;
+ state->mainPkt = data_pkt;
+ }
+
+ if (!dcachePort->sendTiming(fst_data_pkt)) {
// Delete state and data packet because a load retry
// initiates a pipeline restart; it does not retry.
delete state;
delete data_pkt->req;
delete data_pkt;
+ if (TheISA::HasUnalignedMemAcc && sreqLow) {
+ delete fst_data_pkt->req;
+ delete fst_data_pkt;
+ delete snd_data_pkt->req;
+ delete snd_data_pkt;
+ }
req = NULL;
// If the access didn't succeed, tell the LSQ by setting
// the retry thread id.
lsq->setRetryTid(lsqID);
+ } else if (TheISA::HasUnalignedMemAcc && sreqLow) {
+ completedFirst = true;
+
+ // The first packet was sent without problems, so send this one
+ // too. If there is a problem with this packet then the whole
+ // load will be squashed, so indicate this to the state object.
+ // The first packet will return in completeDataAccess and be
+ // handled there.
+ ++usedPorts;
+ if (!dcachePort->sendTiming(snd_data_pkt)) {
+
+ // The main packet will be deleted in completeDataAccess.
+ delete snd_data_pkt->req;
+ delete snd_data_pkt;
+
+ state->complete();
+
+ req = NULL;
+
+ lsq->setRetryTid(lsqID);
+ }
}
}
@@ -679,6 +780,10 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
if (lsq->cacheBlocked()) {
if (req)
delete req;
+ if (TheISA::HasUnalignedMemAcc && sreqLow && !completedFirst) {
+ delete sreqLow;
+ delete sreqHigh;
+ }
++lsqCacheBlocked;
@@ -703,7 +808,8 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
template <class Impl>
template <class T>
Fault
-LSQUnit<Impl>::write(Request *req, T &data, int store_idx)
+LSQUnit<Impl>::write(Request *req, Request *sreqLow, Request *sreqHigh,
+ T &data, int store_idx)
{
assert(storeQueue[store_idx].inst);
@@ -713,6 +819,8 @@ LSQUnit<Impl>::write(Request *req, T &data, int store_idx)
storeQueue[store_idx].inst->seqNum);
storeQueue[store_idx].req = req;
+ storeQueue[store_idx].sreqLow = sreqLow;
+ storeQueue[store_idx].sreqHigh = sreqHigh;
storeQueue[store_idx].size = sizeof(T);
assert(sizeof(T) <= sizeof(storeQueue[store_idx].data));