author     Giacomo Gabrielli <giacomo.gabrielli@arm.com>   2017-07-07 14:13:11 +0100
committer  Giacomo Gabrielli <giacomo.gabrielli@arm.com>   2019-05-11 12:48:58 +0000
commit     c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 (patch)
tree       7591abeb888d8c8e645332749bcaea627628f9bf /src/cpu/o3
parent     d0e4cdc9c36466a3dbef8c9f9f509cce8f1a6c34 (diff)
download   gem5-c58cb8c9dbeef377da180f1fdaaa1c0eadf85550.tar.xz
cpu,mem: Add support for partial loads/stores and wide mem. accesses
This changeset adds support for partial (or masked) loads/stores, i.e.
loads/stores that can disable accesses to individual bytes within the
target address range. In addition, this changeset extends the code to
crack memory accesses across most CPU models (TimingSimpleCPU still
TBD), so that arbitrarily wide memory accesses are supported. These
changes are required for supporting ISAs with wide vectors.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>
- Tiago Muck <tiago.muck@arm.com>

Change-Id: Ibad33541c258ad72925c0b1d5abc3e5e8bf92d92
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13518
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
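The byte-enable convention assumed throughout the patch is that an empty mask
denotes a full access, while a non-empty mask marks each byte in the target
range as enabled or disabled. A minimal standalone sketch of a store gated by
such a mask (illustrative only, not code from this commit):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Copy `size` bytes from `src` to `dst`, skipping bytes whose
    // byte-enable flag is false; an empty mask enables every byte.
    void maskedStore(uint8_t *dst, const uint8_t *src, unsigned size,
                     const std::vector<bool> &byteEnable)
    {
        if (byteEnable.empty()) {
            std::memcpy(dst, src, size);
            return;
        }
        for (unsigned i = 0; i < size; ++i) {
            if (byteEnable[i])
                dst[i] = src[i];
        }
    }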
Diffstat (limited to 'src/cpu/o3')
-rw-r--r--  src/cpu/o3/cpu.hh             7
-rw-r--r--  src/cpu/o3/lsq.hh            48
-rw-r--r--  src/cpu/o3/lsq_impl.hh      129
-rw-r--r--  src/cpu/o3/lsq_unit_impl.hh   3
4 files changed, 116 insertions, 71 deletions
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index c754fe8cf..db8fca20a 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -787,10 +787,13 @@ class FullO3CPU : public BaseO3CPU
/** CPU pushRequest function, forwards request to LSQ. */
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
- uint64_t *res, AtomicOpFunctor *amo_op = nullptr)
+ uint64_t *res, AtomicOpFunctor *amo_op = nullptr,
+ const std::vector<bool>& byteEnable =
+ std::vector<bool>())
+
{
return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr,
- flags, res, amo_op);
+ flags, res, amo_op, byteEnable);
}
/** CPU read function, forwards read to LSQ. */
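A hypothetical call site for the extended interface (not part of the patch;
inst, addr and flags are assumed to be in scope), issuing an eight-byte load
with only its low four bytes enabled:

    // Enable only bytes 0-3 of an 8-byte load; bytes 4-7 are masked off.
    std::vector<bool> byteEnable(8, false);
    for (int i = 0; i < 4; ++i)
        byteEnable[i] = true;

    Fault fault = pushRequest(inst, /* isLoad */ true, /* data */ nullptr,
                              /* size */ 8, addr, flags,
                              /* res */ nullptr, /* amo_op */ nullptr,
                              byteEnable);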
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index a6037b7f4..84f1411a5 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -50,6 +50,7 @@
#include "arch/generic/tlb.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/lsq_unit.hh"
+#include "cpu/utils.hh"
#include "enums/SMTQueuePolicy.hh"
#include "mem/port.hh"
#include "sim/sim_object.hh"
@@ -251,6 +252,7 @@ class LSQ
const Addr _addr;
const uint32_t _size;
const Request::Flags _flags;
+ std::vector<bool> _byteEnable;
uint32_t _numOutstandingPackets;
AtomicOpFunctor *_amo_op;
protected:
@@ -351,6 +353,28 @@ class LSQ
}
}
+ /** Helper function used to add a (sub)request, given its address
+ * `addr`, size `size` and byte-enable mask `byteEnable`.
+ *
+ * The request is only added if the mask is empty or if there is at
+ * least an active element in it.
+ */
+ void
+ addRequest(Addr addr, unsigned size,
+ const std::vector<bool>& byteEnable)
+ {
+ if (byteEnable.empty() ||
+ isAnyActiveElement(byteEnable.begin(), byteEnable.end())) {
+ auto request = std::make_shared<Request>(_inst->getASID(),
+ addr, size, _flags, _inst->masterId(),
+ _inst->instAddr(), _inst->contextId());
+ if (!byteEnable.empty()) {
+ request->setByteEnable(byteEnable);
+ }
+ _requests.push_back(request);
+ }
+ }
+
/** Destructor.
* The LSQRequest owns the request. If the packet has already been
* sent, the sender state will be deleted upon receiving the reply.
@@ -609,11 +633,17 @@ class LSQ
* declaration of the names in the parent class. */
using Flag = typename LSQRequest::Flag;
using State = typename LSQRequest::State;
+ using LSQRequest::_addr;
using LSQRequest::_fault;
+ using LSQRequest::_flags;
+ using LSQRequest::_size;
+ using LSQRequest::_byteEnable;
+ using LSQRequest::_requests;
using LSQRequest::_inst;
using LSQRequest::_packets;
using LSQRequest::_port;
using LSQRequest::_res;
+ using LSQRequest::_taskId;
using LSQRequest::_senderState;
using LSQRequest::_state;
using LSQRequest::flags;
@@ -635,14 +665,8 @@ class LSQ
uint64_t* res = nullptr,
AtomicOpFunctor* amo_op = nullptr) :
LSQRequest(port, inst, isLoad, addr, size, flags_, data, res,
- amo_op)
- {
- LSQRequest::_requests.push_back(
- std::make_shared<Request>(inst->getASID(), addr, size,
- flags_, inst->masterId(), inst->instAddr(),
- inst->contextId(), amo_op));
- LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum);
- }
+ amo_op) {}
+
inline virtual ~SingleDataRequest() {}
virtual void initiateTranslation();
virtual void finish(const Fault &fault, const RequestPtr &req,
@@ -671,6 +695,7 @@ class LSQ
using LSQRequest::_port;
using LSQRequest::_requests;
using LSQRequest::_res;
+ using LSQRequest::_byteEnable;
using LSQRequest::_senderState;
using LSQRequest::_size;
using LSQRequest::_state;
@@ -691,14 +716,14 @@ class LSQ
RequestPtr mainReq;
PacketPtr _mainPacket;
-
public:
SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size,
const Request::Flags & flags_,
PacketDataPtr data = nullptr,
uint64_t* res = nullptr) :
- LSQRequest(port, inst, isLoad, addr, size, flags_, data, res),
+ LSQRequest(port, inst, isLoad, addr, size, flags_, data, res,
+ nullptr),
numFragments(0),
numReceivedPackets(0),
mainReq(nullptr),
@@ -949,7 +974,8 @@ class LSQ
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
- uint64_t *res, AtomicOpFunctor *amo_op);
+ uint64_t *res, AtomicOpFunctor *amo_op,
+ const std::vector<bool>& byteEnable);
/** The CPU pointer. */
O3CPU *cpu;
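The new addRequest() helper only creates a (sub)request when the mask is empty
or contains at least one active byte; the check uses isAnyActiveElement() from
the newly included cpu/utils.hh. Assuming that utility is simply an any-of test
over the byte-enable iterators, its behaviour can be sketched as:

    #include <algorithm>
    #include <vector>

    // Sketch of the predicate guarding addRequest(): true if at least one
    // byte in the (sub)range is enabled, so fully masked-off fragments are
    // never turned into memory requests.
    template <class Iterator>
    bool anyActiveElement(Iterator first, Iterator last)
    {
        return std::any_of(first, last, [](bool b) { return b; });
    }

A fully disabled access therefore produces no requests at all, and
initiateTranslation() marks the instruction with setMemAccPredicate(false)
instead of sending anything to the TLB.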
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 732712029..70621a523 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -681,29 +681,12 @@ LSQ<Impl>::dumpInsts() const
}
}
-static Addr
-addrBlockOffset(Addr addr, unsigned int block_size)
-{
- return addr & (block_size - 1);
-}
-
-static Addr
-addrBlockAlign(Addr addr, uint64_t block_size)
-{
- return addr & ~(block_size - 1);
-}
-
-static bool
-transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size)
-{
- return (addrBlockOffset(addr, block_size) + size) > block_size;
-}
-
template<class Impl>
Fault
LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
- uint64_t *res, AtomicOpFunctor *amo_op)
+ uint64_t *res, AtomicOpFunctor *amo_op,
+ const std::vector<bool>& byteEnable)
{
// This comming request can be either load, store or atomic.
// Atomic request has a corresponding pointer to its atomic memory
@@ -735,6 +718,9 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
size, flags, data, res, amo_op);
}
assert(req);
+ if (!byteEnable.empty()) {
+ req->_byteEnable = byteEnable;
+ }
inst->setRequest();
req->taskId(cpu->taskId());
@@ -756,6 +742,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
else
inst->getFault() = cpu->write(req, data, inst->sqIdx);
} else if (isLoad) {
+ inst->setMemAccPredicate(false);
// Commit will have to clean up whatever happened. Set this
// instruction as executed.
inst->setExecuted();
@@ -848,14 +835,21 @@ template<class Impl>
void
LSQ<Impl>::SingleDataRequest::initiateTranslation()
{
- _inst->translationStarted(true);
- setState(State::Translation);
- flags.set(Flag::TranslationStarted);
+ assert(_requests.size() == 0);
- _inst->savedReq = this;
- sendFragmentToTranslation(0);
+ this->addRequest(_addr, _size, _byteEnable);
- if (isTranslationComplete()) {
+ if (_requests.size() > 0) {
+ _requests.back()->setReqInstSeqNum(_inst->seqNum);
+ _requests.back()->taskId(_taskId);
+ _inst->translationStarted(true);
+ setState(State::Translation);
+ flags.set(Flag::TranslationStarted);
+
+ _inst->savedReq = this;
+ sendFragmentToTranslation(0);
+ } else {
+ _inst->setMemAccPredicate(false);
}
}
@@ -877,11 +871,7 @@ template<class Impl>
void
LSQ<Impl>::SplitDataRequest::initiateTranslation()
{
- _inst->translationStarted(true);
- setState(State::Translation);
- flags.set(Flag::TranslationStarted);
-
- unsigned int cacheLineSize = _port.cacheLineSize();
+ auto cacheLineSize = _port.cacheLineSize();
Addr base_addr = _addr;
Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
@@ -890,6 +880,9 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation()
mainReq = std::make_shared<Request>(_inst->getASID(), base_addr,
_size, _flags, _inst->masterId(),
_inst->instAddr(), _inst->contextId());
+ if (!_byteEnable.empty()) {
+ mainReq->setByteEnable(_byteEnable);
+ }
// Paddr is not used in mainReq. However, we will accumulate the flags
// from the sub requests into mainReq by calling setFlags() in finish().
@@ -898,39 +891,63 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation()
mainReq->setPaddr(0);
/* Get the pre-fix, possibly unaligned. */
- _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr,
- next_addr - base_addr, _flags, _inst->masterId(),
- _inst->instAddr(), _inst->contextId()));
+ if (_byteEnable.empty()) {
+ this->addRequest(base_addr, next_addr - base_addr, _byteEnable);
+ } else {
+ auto it_start = _byteEnable.begin();
+ auto it_end = _byteEnable.begin() + (next_addr - base_addr);
+ this->addRequest(base_addr, next_addr - base_addr,
+ std::vector<bool>(it_start, it_end));
+ }
size_so_far = next_addr - base_addr;
/* We are block aligned now, reading whole blocks. */
base_addr = next_addr;
while (base_addr != final_addr) {
- _requests.push_back(std::make_shared<Request>(_inst->getASID(),
- base_addr, cacheLineSize, _flags, _inst->masterId(),
- _inst->instAddr(), _inst->contextId()));
+ if (_byteEnable.empty()) {
+ this->addRequest(base_addr, cacheLineSize, _byteEnable);
+ } else {
+ auto it_start = _byteEnable.begin() + size_so_far;
+ auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
+ this->addRequest(base_addr, cacheLineSize,
+ std::vector<bool>(it_start, it_end));
+ }
size_so_far += cacheLineSize;
base_addr += cacheLineSize;
}
/* Deal with the tail. */
if (size_so_far < _size) {
- _requests.push_back(std::make_shared<Request>(_inst->getASID(),
- base_addr, _size - size_so_far, _flags, _inst->masterId(),
- _inst->instAddr(), _inst->contextId()));
+ if (_byteEnable.empty()) {
+ this->addRequest(base_addr, _size - size_so_far, _byteEnable);
+ } else {
+ auto it_start = _byteEnable.begin() + size_so_far;
+ auto it_end = _byteEnable.end();
+ this->addRequest(base_addr, _size - size_so_far,
+ std::vector<bool>(it_start, it_end));
+ }
}
- /* Setup the requests and send them to translation. */
- for (auto& r: _requests) {
- r->setReqInstSeqNum(_inst->seqNum);
- r->taskId(_taskId);
- }
- this->_inst->savedReq = this;
- numInTranslationFragments = 0;
- numTranslatedFragments = 0;
+ if (_requests.size() > 0) {
+ /* Setup the requests and send them to translation. */
+ for (auto& r: _requests) {
+ r->setReqInstSeqNum(_inst->seqNum);
+ r->taskId(_taskId);
+ }
- for (uint32_t i = 0; i < _requests.size(); i++) {
- sendFragmentToTranslation(i);
+ _inst->translationStarted(true);
+ setState(State::Translation);
+ flags.set(Flag::TranslationStarted);
+ this->_inst->savedReq = this;
+ numInTranslationFragments = 0;
+ numTranslatedFragments = 0;
+ _fault.resize(_requests.size());
+
+ for (uint32_t i = 0; i < _requests.size(); i++) {
+ sendFragmentToTranslation(i);
+ }
+ } else {
+ _inst->setMemAccPredicate(false);
}
}
@@ -968,8 +985,6 @@ LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
pktIdx++;
assert(pktIdx < _packets.size());
- assert(pkt->req == _requests[pktIdx]);
- assert(pkt == _packets[pktIdx]);
numReceivedPackets++;
state->outstanding--;
if (numReceivedPackets == _packets.size()) {
@@ -1012,16 +1027,19 @@ void
LSQ<Impl>::SplitDataRequest::buildPackets()
{
/* Extra data?? */
- ptrdiff_t offset = 0;
+ Addr base_address = _addr;
+
if (_packets.size() == 0) {
/* New stuff */
if (isLoad()) {
_mainPacket = Packet::createRead(mainReq);
_mainPacket->dataStatic(_inst->memData);
}
- for (auto& r: _requests) {
+ for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
+ RequestPtr r = _requests[i];
PacketPtr pkt = isLoad() ? Packet::createRead(r)
- : Packet::createWrite(r);
+ : Packet::createWrite(r);
+ ptrdiff_t offset = r->getVaddr() - base_address;
if (isLoad()) {
pkt->dataStatic(_inst->memData + offset);
} else {
@@ -1031,12 +1049,11 @@ LSQ<Impl>::SplitDataRequest::buildPackets()
r->getSize());
pkt->dataDynamic(req_data);
}
- offset += r->getSize();
pkt->senderState = _senderState;
_packets.push_back(pkt);
}
}
- assert(_packets.size() == _requests.size());
+ assert(_packets.size() > 0);
}
template<class Impl>
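The cracking performed by SplitDataRequest::initiateTranslation() splits a wide
access into a possibly unaligned prefix up to the next cache-line boundary, a
run of whole cache lines, and a tail, slicing _byteEnable to match each
fragment. A standalone sketch of the address arithmetic (illustrative names;
the real code goes through addRequest() and also forwards the sliced mask):

    #include <cstdint>
    #include <vector>

    using Addr = uint64_t;

    // Equivalent of the addrBlockAlign() helper that this patch removes from
    // lsq_impl.hh in favour of a shared utility: align `addr` down to the
    // start of its block (block_size must be a power of two).
    static Addr blockAlign(Addr addr, Addr block_size)
    {
        return addr & ~(block_size - 1);
    }

    struct Fragment { Addr addr; unsigned size; };

    // Sketch of how [addr, addr + size) is cracked into a possibly unaligned
    // prefix, whole cache lines, and a tail.
    std::vector<Fragment> crack(Addr addr, unsigned size, unsigned lineSize)
    {
        std::vector<Fragment> frags;
        Addr next_addr = blockAlign(addr + lineSize, lineSize);
        Addr final_addr = blockAlign(addr + size, lineSize);
        unsigned size_so_far = next_addr - addr;

        frags.push_back({addr, size_so_far});              // prefix
        for (Addr base = next_addr; base != final_addr; base += lineSize) {
            frags.push_back({base, lineSize});             // whole blocks
            size_so_far += lineSize;
        }
        if (size_so_far < size)
            frags.push_back({final_addr, size - size_so_far});  // tail
        return frags;
    }

Since fragments whose byte-enable slice has no active byte are dropped,
buildPackets() can no longer accumulate a running offset across _requests; it
now derives each packet's offset into _inst->memData from the fragment's
virtual address (r->getVaddr() - base_address), and the assert that _packets
and _requests have the same size is relaxed to a non-empty check.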
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 9323e8634..21bed99fa 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -542,8 +542,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
load_fault = inst->initiateAcc();
- if (!inst->readMemAccPredicate()) {
- assert(load_fault == NoFault);
+ if (load_fault == NoFault && !inst->readMemAccPredicate()) {
assert(inst->readPredicate());
inst->setExecuted();
inst->completeAcc(nullptr);