diff options
Diffstat (limited to 'src/dev/net')
-rw-r--r-- | src/dev/net/Ethernet.py | 15 | ||||
-rw-r--r-- | src/dev/net/SConscript | 12 | ||||
-rw-r--r-- | src/dev/net/dist_etherlink.cc (renamed from src/dev/net/multi_etherlink.cc) | 139 | ||||
-rw-r--r-- | src/dev/net/dist_etherlink.hh (renamed from src/dev/net/multi_etherlink.hh) | 73 | ||||
-rw-r--r-- | src/dev/net/dist_iface.cc | 808 | ||||
-rw-r--r-- | src/dev/net/dist_iface.hh | 595 | ||||
-rw-r--r-- | src/dev/net/dist_packet.hh (renamed from src/dev/net/multi_packet.hh) | 90 | ||||
-rw-r--r-- | src/dev/net/etherpkt.cc | 16 | ||||
-rw-r--r-- | src/dev/net/etherpkt.hh | 12 | ||||
-rw-r--r-- | src/dev/net/multi_iface.cc | 622 | ||||
-rw-r--r-- | src/dev/net/multi_iface.hh | 492 | ||||
-rw-r--r-- | src/dev/net/multi_packet.cc | 100 | ||||
-rw-r--r-- | src/dev/net/tcp_iface.cc | 265 | ||||
-rw-r--r-- | src/dev/net/tcp_iface.hh | 77 |
14 files changed, 1835 insertions, 1481 deletions
diff --git a/src/dev/net/Ethernet.py b/src/dev/net/Ethernet.py index 9859857a0..5f878ea10 100644 --- a/src/dev/net/Ethernet.py +++ b/src/dev/net/Ethernet.py @@ -58,19 +58,22 @@ class EtherLink(EtherObject): speed = Param.NetworkBandwidth('1Gbps', "link speed") dump = Param.EtherDump(NULL, "dump object") -class MultiEtherLink(EtherObject): - type = 'MultiEtherLink' - cxx_header = "dev/net/multi_etherlink.hh" +class DistEtherLink(EtherObject): + type = 'DistEtherLink' + cxx_header = "dev/net/dist_etherlink.hh" int0 = SlavePort("interface 0") delay = Param.Latency('0us', "packet transmit delay") delay_var = Param.Latency('0ns', "packet transmit delay variability") speed = Param.NetworkBandwidth('1Gbps', "link speed") dump = Param.EtherDump(NULL, "dump object") - multi_rank = Param.UInt32('0', "Rank of the this gem5 process (multi run)") - sync_start = Param.Latency('5200000000000t', "first multi sync barrier") - sync_repeat = Param.Latency('10us', "multi sync barrier repeat") + dist_rank = Param.UInt32('0', "Rank of this gem5 process (dist run)") + dist_size = Param.UInt32('1', "Number of gem5 processes (dist run)") + sync_start = Param.Latency('5200000000000t', "first dist sync barrier") + sync_repeat = Param.Latency('10us', "dist sync barrier repeat") server_name = Param.String('localhost', "Message server name") server_port = Param.UInt32('2200', "Message server port") + is_switch = Param.Bool(False, "true if this a link in etherswitch") + num_nodes = Param.UInt32('2', "Number of simulate nodes") class EtherBus(EtherObject): type = 'EtherBus' diff --git a/src/dev/net/SConscript b/src/dev/net/SConscript index f529a1b2a..9ad8eec92 100644 --- a/src/dev/net/SConscript +++ b/src/dev/net/SConscript @@ -70,14 +70,14 @@ DebugFlag('EthernetIntr') DebugFlag('EthernetPIO') DebugFlag('EthernetSM') -# Multi gem5 -Source('multi_packet.cc') -Source('multi_iface.cc') -Source('multi_etherlink.cc') +# Dist gem5 +Source('dist_iface.cc') +Source('dist_etherlink.cc') Source('tcp_iface.cc') -DebugFlag('MultiEthernet') -DebugFlag('MultiEthernetPkt') +DebugFlag('DistEthernet') +DebugFlag('DistEthernetPkt') +DebugFlag('DistEthernetCmd') # Ethernet controllers Source('i8254xGBe.cc') diff --git a/src/dev/net/multi_etherlink.cc b/src/dev/net/dist_etherlink.cc index cf4300ddf..a793739f8 100644 --- a/src/dev/net/multi_etherlink.cc +++ b/src/dev/net/dist_etherlink.cc @@ -38,10 +38,10 @@ */ /* @file - * Device module for a full duplex ethernet link for multi gem5 simulations. + * Device module for a full duplex ethernet link for dist gem5 simulations. */ -#include "dev/net/multi_etherlink.hh" +#include "dev/net/dist_etherlink.hh" #include <arpa/inet.h> #include <sys/socket.h> @@ -54,15 +54,15 @@ #include "base/random.hh" #include "base/trace.hh" +#include "debug/DistEthernet.hh" +#include "debug/DistEthernetPkt.hh" #include "debug/EthernetData.hh" -#include "debug/MultiEthernet.hh" -#include "debug/MultiEthernetPkt.hh" +#include "dev/net/dist_iface.hh" #include "dev/net/etherdump.hh" #include "dev/net/etherint.hh" #include "dev/net/etherlink.hh" #include "dev/net/etherobject.hh" #include "dev/net/etherpkt.hh" -#include "dev/net/multi_iface.hh" #include "dev/net/tcp_iface.hh" #include "params/EtherLink.hh" #include "sim/core.hh" @@ -71,33 +71,45 @@ using namespace std; -MultiEtherLink::MultiEtherLink(const Params *p) - : EtherObject(p) +DistEtherLink::DistEtherLink(const Params *p) + : EtherObject(p), linkDelay(p->delay) { - DPRINTF(MultiEthernet,"MultiEtherLink::MultiEtherLink() " - "link delay:%llu\n", p->delay); + DPRINTF(DistEthernet,"DistEtherLink::DistEtherLink() " + "link delay:%llu ticksPerByte:%f\n", p->delay, p->speed); txLink = new TxLink(name() + ".link0", this, p->speed, p->delay_var, p->dump); rxLink = new RxLink(name() + ".link1", this, p->delay, p->dump); - // create the multi (TCP) interface to talk to the peer gem5 processes. - multiIface = new TCPIface(p->server_name, p->server_port, p->multi_rank, - p->sync_start, p->sync_repeat, this); + Tick sync_repeat; + if (p->sync_repeat != 0) { + if (p->sync_repeat != p->delay) + warn("DistEtherLink(): sync_repeat is %lu and linkdelay is %lu", + p->sync_repeat, p->delay); + sync_repeat = p->sync_repeat; + } else { + sync_repeat = p->delay; + } + + // create the dist (TCP) interface to talk to the peer gem5 processes. + distIface = new TCPIface(p->server_name, p->server_port, + p->dist_rank, p->dist_size, + p->sync_start, sync_repeat, this, p->is_switch, + p->num_nodes); - localIface = new LocalIface(name() + ".int0", txLink, rxLink, multiIface); + localIface = new LocalIface(name() + ".int0", txLink, rxLink, distIface); } -MultiEtherLink::~MultiEtherLink() +DistEtherLink::~DistEtherLink() { delete txLink; delete rxLink; delete localIface; - delete multiIface; + delete distIface; } EtherInt* -MultiEtherLink::getEthPort(const std::string &if_name, int idx) +DistEtherLink::getEthPort(const std::string &if_name, int idx) { if (if_name != "int0") { return nullptr; @@ -107,66 +119,55 @@ MultiEtherLink::getEthPort(const std::string &if_name, int idx) return localIface; } -void MultiEtherLink::memWriteback() -{ - DPRINTF(MultiEthernet,"MultiEtherLink::memWriteback() called\n"); - multiIface->drainDone(); -} - void -MultiEtherLink::serialize(CheckpointOut &cp) const +DistEtherLink::serialize(CheckpointOut &cp) const { - multiIface->serialize("multiIface", cp); - txLink->serialize("txLink", cp); - rxLink->serialize("rxLink", cp); + distIface->serializeSection(cp, "distIface"); + txLink->serializeSection(cp, "txLink"); + rxLink->serializeSection(cp, "rxLink"); } void -MultiEtherLink::unserialize(CheckpointIn &cp) +DistEtherLink::unserialize(CheckpointIn &cp) { - multiIface->unserialize("multiIface", cp); - txLink->unserialize("txLink", cp); - rxLink->unserialize("rxLink", cp); + distIface->unserializeSection(cp, "distIface"); + txLink->unserializeSection(cp, "txLink"); + rxLink->unserializeSection(cp, "rxLink"); } void -MultiEtherLink::init() +DistEtherLink::init() { - DPRINTF(MultiEthernet,"MultiEtherLink::init() called\n"); - multiIface->initRandom(); + DPRINTF(DistEthernet,"DistEtherLink::init() called\n"); + distIface->init(rxLink->doneEvent(), linkDelay); } void -MultiEtherLink::startup() +DistEtherLink::startup() { - DPRINTF(MultiEthernet,"MultiEtherLink::startup() called\n"); - multiIface->startPeriodicSync(); + DPRINTF(DistEthernet,"DistEtherLink::startup() called\n"); + distIface->startup(); } void -MultiEtherLink::RxLink::setMultiInt(MultiIface *m) +DistEtherLink::RxLink::setDistInt(DistIface *m) { - assert(!multiIface); - multiIface = m; - // Spawn a new receiver thread that will process messages - // coming in from peer gem5 processes. - // The receive thread will also schedule a (receive) doneEvent - // for each incoming data packet. - multiIface->spawnRecvThread(&doneEvent, linkDelay); + assert(!distIface); + distIface = m; } void -MultiEtherLink::RxLink::rxDone() +DistEtherLink::RxLink::rxDone() { assert(!busy()); // retrieve the packet that triggered the receive done event - packet = multiIface->packetIn(); + packet = distIface->packetIn(); if (dump) dump->dump(packet); - DPRINTF(MultiEthernetPkt, "MultiEtherLink::MultiLink::rxDone() " + DPRINTF(DistEthernetPkt, "DistEtherLink::DistLink::rxDone() " "packet received: len=%d\n", packet->length); DDUMP(EthernetData, packet->data, packet->length); @@ -176,7 +177,7 @@ MultiEtherLink::RxLink::rxDone() } void -MultiEtherLink::TxLink::txDone() +DistEtherLink::TxLink::txDone() { if (dump) dump->dump(packet); @@ -188,10 +189,10 @@ MultiEtherLink::TxLink::txDone() } bool -MultiEtherLink::TxLink::transmit(EthPacketPtr pkt) +DistEtherLink::TxLink::transmit(EthPacketPtr pkt) { if (busy()) { - DPRINTF(MultiEthernet, "packet not sent, link busy\n"); + DPRINTF(DistEthernet, "packet not sent, link busy\n"); return false; } @@ -201,8 +202,8 @@ MultiEtherLink::TxLink::transmit(EthPacketPtr pkt) delay += random_mt.random<Tick>(0, delayVar); // send the packet to the peers - assert(multiIface); - multiIface->packetOut(pkt, delay); + assert(distIface); + distIface->packetOut(pkt, delay); // schedule the send done event parent->schedule(doneEvent, curTick() + delay); @@ -211,56 +212,56 @@ MultiEtherLink::TxLink::transmit(EthPacketPtr pkt) } void -MultiEtherLink::Link::serialize(const string &base, CheckpointOut &cp) const +DistEtherLink::Link::serialize(CheckpointOut &cp) const { bool packet_exists = (packet != nullptr); - paramOut(cp, base + ".packet_exists", packet_exists); + SERIALIZE_SCALAR(packet_exists); if (packet_exists) - packet->serialize(base + ".packet", cp); + packet->serialize("packet", cp); bool event_scheduled = event->scheduled(); - paramOut(cp, base + ".event_scheduled", event_scheduled); + SERIALIZE_SCALAR(event_scheduled); if (event_scheduled) { Tick event_time = event->when(); - paramOut(cp, base + ".event_time", event_time); + SERIALIZE_SCALAR(event_time); } } void -MultiEtherLink::Link::unserialize(const string &base, CheckpointIn &cp) +DistEtherLink::Link::unserialize(CheckpointIn &cp) { bool packet_exists; - paramIn(cp, base + ".packet_exists", packet_exists); + UNSERIALIZE_SCALAR(packet_exists); if (packet_exists) { packet = make_shared<EthPacketData>(16384); - packet->unserialize(base + ".packet", cp); + packet->unserialize("packet", cp); } bool event_scheduled; - paramIn(cp, base + ".event_scheduled", event_scheduled); + UNSERIALIZE_SCALAR(event_scheduled); if (event_scheduled) { Tick event_time; - paramIn(cp, base + ".event_time", event_time); + UNSERIALIZE_SCALAR(event_time); parent->schedule(*event, event_time); } } -MultiEtherLink::LocalIface::LocalIface(const std::string &name, +DistEtherLink::LocalIface::LocalIface(const std::string &name, TxLink *tx, RxLink *rx, - MultiIface *m) : + DistIface *m) : EtherInt(name), txLink(tx) { tx->setLocalInt(this); rx->setLocalInt(this); - tx->setMultiInt(m); - rx->setMultiInt(m); + tx->setDistInt(m); + rx->setDistInt(m); } -MultiEtherLink * -MultiEtherLinkParams::create() +DistEtherLink * +DistEtherLinkParams::create() { - return new MultiEtherLink(this); + return new DistEtherLink(this); } diff --git a/src/dev/net/multi_etherlink.hh b/src/dev/net/dist_etherlink.hh index 0a3e39bd7..e8218941a 100644 --- a/src/dev/net/multi_etherlink.hh +++ b/src/dev/net/dist_etherlink.hh @@ -38,30 +38,30 @@ */ /* @file - * Device module for a full duplex ethernet link for multi gem5 simulations. + * Device module for a full duplex ethernet link for dist gem5 simulations. * - * See comments in dev/multi_iface.hh for a generic description of multi + * See comments in dev/net/dist_iface.hh for a generic description of dist * gem5 simulations. * * This class is meant to be a drop in replacement for the EtherLink class for - * multi gem5 runs. + * dist gem5 runs. * */ -#ifndef __DEV_NET_MULTIETHERLINK_HH__ -#define __DEV_NET_MULTIETHERLINK_HH__ +#ifndef __DEV_DIST_ETHERLINK_HH__ +#define __DEV_DIST_ETHERLINK_HH__ #include <iostream> #include "dev/net/etherlink.hh" -#include "params/MultiEtherLink.hh" +#include "params/DistEtherLink.hh" -class MultiIface; +class DistIface; class EthPacketData; /** * Model for a fixed bandwidth full duplex ethernet link. */ -class MultiEtherLink : public EtherObject +class DistEtherLink : public EtherObject { protected: class LocalIface; @@ -72,22 +72,22 @@ class MultiEtherLink : public EtherObject * The link will encapsulate and transfer Ethernet packets to/from * the message server. */ - class Link + class Link : public Serializable { protected: std::string objName; - MultiEtherLink *parent; + DistEtherLink *parent; LocalIface *localIface; EtherDump *dump; - MultiIface *multiIface; + DistIface *distIface; Event *event; EthPacketPtr packet; public: - Link(const std::string &name, MultiEtherLink *p, + Link(const std::string &name, DistEtherLink *p, EtherDump *d, Event *e) : objName(name), parent(p), localIface(nullptr), dump(d), - multiIface(nullptr), event(e) {} + distIface(nullptr), event(e) {} ~Link() {} @@ -95,8 +95,8 @@ class MultiEtherLink : public EtherObject bool busy() const { return (bool)packet; } void setLocalInt(LocalIface *i) { assert(!localIface); localIface=i; } - void serialize(const std::string &base, CheckpointOut &cp) const; - void unserialize(const std::string &base, CheckpointIn &cp); + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; }; /** @@ -123,17 +123,17 @@ class MultiEtherLink : public EtherObject DoneEvent doneEvent; public: - TxLink(const std::string &name, MultiEtherLink *p, + TxLink(const std::string &name, DistEtherLink *p, double invBW, Tick delay_var, EtherDump *d) : Link(name, p, d, &doneEvent), ticksPerByte(invBW), delayVar(delay_var), doneEvent(this) {} ~TxLink() {} /** - * Register the multi interface to be used to talk to the + * Register the dist interface to be used to talk to the * peer gem5 processes. */ - void setMultiInt(MultiIface *m) { assert(!multiIface); multiIface=m; } + void setDistInt(DistIface *m) { assert(!distIface); distIface=m; } /** * Initiate sending of a packet via this link. @@ -158,23 +158,27 @@ class MultiEtherLink : public EtherObject /** * Receive done callback method. Called from doneEvent. */ - void rxDone() ; + void rxDone(); typedef EventWrapper<RxLink, &RxLink::rxDone> DoneEvent; friend void DoneEvent::process(); - DoneEvent doneEvent; + DoneEvent _doneEvent; public: - RxLink(const std::string &name, MultiEtherLink *p, + RxLink(const std::string &name, DistEtherLink *p, Tick delay, EtherDump *d) : - Link(name, p, d, &doneEvent), - linkDelay(delay), doneEvent(this) {} + Link(name, p, d, &_doneEvent), + linkDelay(delay), _doneEvent(this) {} ~RxLink() {} /** - * Register our multi interface to talk to the peer gem5 processes. + * Register our dist interface to talk to the peer gem5 processes. + */ + void setDistInt(DistIface *m); + /** + * Done events will be scheduled by DistIface (so we need the accessor) */ - void setMultiInt(MultiIface *m); + const DoneEvent *doneEvent() const { return &_doneEvent; } }; /** @@ -187,7 +191,7 @@ class MultiEtherLink : public EtherObject public: LocalIface(const std::string &name, TxLink *tx, RxLink *rx, - MultiIface *m); + DistIface *m); bool recvPacket(EthPacketPtr pkt) { return txLink->transmit(pkt); } void sendDone() { peer->sendDone(); } @@ -199,7 +203,7 @@ class MultiEtherLink : public EtherObject /** * Interface to talk to the peer gem5 processes. */ - MultiIface *multiIface; + DistIface *distIface; /** * Send link */ @@ -210,10 +214,12 @@ class MultiEtherLink : public EtherObject RxLink *rxLink; LocalIface *localIface; + Tick linkDelay; + public: - typedef MultiEtherLinkParams Params; - MultiEtherLink(const Params *p); - ~MultiEtherLink(); + typedef DistEtherLinkParams Params; + DistEtherLink(const Params *p); + ~DistEtherLink(); const Params * params() const @@ -224,12 +230,11 @@ class MultiEtherLink : public EtherObject virtual EtherInt *getEthPort(const std::string &if_name, int idx) override; - void memWriteback() override; - void init() override; - void startup() override; + virtual void init() override; + virtual void startup() override; void serialize(CheckpointOut &cp) const override; void unserialize(CheckpointIn &cp) override; }; -#endif // __DEV_NET_MULTIETHERLINK_HH__ +#endif // __DEV_DIST_ETHERLINK_HH__ diff --git a/src/dev/net/dist_iface.cc b/src/dev/net/dist_iface.cc new file mode 100644 index 000000000..45ce651a9 --- /dev/null +++ b/src/dev/net/dist_iface.cc @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2015 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Gabor Dozsa + */ + +/* @file + * The interface class for dist-gem5 simulations. + */ + +#include "dev/net/dist_iface.hh" + +#include <queue> +#include <thread> + +#include "base/random.hh" +#include "base/trace.hh" +#include "debug/DistEthernet.hh" +#include "debug/DistEthernetPkt.hh" +#include "dev/net/etherpkt.hh" +#include "sim/sim_exit.hh" +#include "sim/sim_object.hh" + +using namespace std; +DistIface::Sync *DistIface::sync = nullptr; +DistIface::SyncEvent *DistIface::syncEvent = nullptr; +unsigned DistIface::distIfaceNum = 0; +unsigned DistIface::recvThreadsNum = 0; +DistIface *DistIface::master = nullptr; + +void +DistIface::Sync::init(Tick start_tick, Tick repeat_tick) +{ + if (start_tick < firstAt) { + firstAt = start_tick; + inform("Next dist synchronisation tick is changed to %lu.\n", nextAt); + } + + if (repeat_tick == 0) + panic("Dist synchronisation interval must be greater than zero"); + + if (repeat_tick < nextRepeat) { + nextRepeat = repeat_tick; + inform("Dist synchronisation interval is changed to %lu.\n", + nextRepeat); + } +} + +DistIface::SyncSwitch::SyncSwitch(int num_nodes) +{ + numNodes = num_nodes; + waitNum = num_nodes; + numExitReq = 0; + numCkptReq = 0; + doExit = false; + doCkpt = false; + firstAt = std::numeric_limits<Tick>::max(); + nextAt = 0; + nextRepeat = std::numeric_limits<Tick>::max(); +} + +DistIface::SyncNode::SyncNode() +{ + waitNum = 0; + needExit = ReqType::none; + needCkpt = ReqType::none; + doExit = false; + doCkpt = false; + firstAt = std::numeric_limits<Tick>::max(); + nextAt = 0; + nextRepeat = std::numeric_limits<Tick>::max(); +} + +void +DistIface::SyncNode::run(bool same_tick) +{ + std::unique_lock<std::mutex> sync_lock(lock); + Header header; + + assert(waitNum == 0); + waitNum = DistIface::recvThreadsNum; + // initiate the global synchronisation + header.msgType = MsgType::cmdSyncReq; + header.sendTick = curTick(); + header.syncRepeat = nextRepeat; + header.needCkpt = needCkpt; + if (needCkpt != ReqType::none) + needCkpt = ReqType::pending; + header.needExit = needExit; + if (needExit != ReqType::none) + needExit = ReqType::pending; + DistIface::master->sendCmd(header); + // now wait until all receiver threads complete the synchronisation + auto lf = [this]{ return waitNum == 0; }; + cv.wait(sync_lock, lf); + // global synchronisation is done + assert(!same_tick || (nextAt == curTick())); +} + + +void +DistIface::SyncSwitch::run(bool same_tick) +{ + std::unique_lock<std::mutex> sync_lock(lock); + Header header; + // Wait for the sync requests from the nodes + if (waitNum > 0) { + auto lf = [this]{ return waitNum == 0; }; + cv.wait(sync_lock, lf); + } + assert(waitNum == 0); + assert(!same_tick || (nextAt == curTick())); + waitNum = numNodes; + // Complete the global synchronisation + header.msgType = MsgType::cmdSyncAck; + header.sendTick = nextAt; + header.syncRepeat = nextRepeat; + if (doCkpt || numCkptReq == numNodes) { + doCkpt = true; + header.needCkpt = ReqType::immediate; + numCkptReq = 0; + } else { + header.needCkpt = ReqType::none; + } + if (doExit || numExitReq == numNodes) { + doExit = true; + header.needExit = ReqType::immediate; + } else { + header.needExit = ReqType::none; + } + DistIface::master->sendCmd(header); +} + +void +DistIface::SyncSwitch::progress(Tick send_tick, + Tick sync_repeat, + ReqType need_ckpt, + ReqType need_exit) +{ + std::unique_lock<std::mutex> sync_lock(lock); + assert(waitNum > 0); + + if (send_tick > nextAt) + nextAt = send_tick; + if (nextRepeat > sync_repeat) + nextRepeat = sync_repeat; + + if (need_ckpt == ReqType::collective) + numCkptReq++; + else if (need_ckpt == ReqType::immediate) + doCkpt = true; + if (need_exit == ReqType::collective) + numExitReq++; + else if (need_exit == ReqType::immediate) + doExit = true; + + waitNum--; + // Notify the simulation thread if the on-going sync is complete + if (waitNum == 0) { + sync_lock.unlock(); + cv.notify_one(); + } +} + +void +DistIface::SyncNode::progress(Tick max_send_tick, + Tick next_repeat, + ReqType do_ckpt, + ReqType do_exit) +{ + std::unique_lock<std::mutex> sync_lock(lock); + assert(waitNum > 0); + + nextAt = max_send_tick; + nextRepeat = next_repeat; + doCkpt = (do_ckpt != ReqType::none); + doExit = (do_exit != ReqType::none); + + waitNum--; + // Notify the simulation thread if the on-going sync is complete + if (waitNum == 0) { + sync_lock.unlock(); + cv.notify_one(); + } +} + +void +DistIface::SyncNode::requestCkpt(ReqType req) +{ + std::lock_guard<std::mutex> sync_lock(lock); + assert(req != ReqType::none); + if (needCkpt != ReqType::none) + warn("Ckpt requested multiple times (req:%d)\n", static_cast<int>(req)); + if (needCkpt == ReqType::none || req == ReqType::immediate) + needCkpt = req; +} + +void +DistIface::SyncNode::requestExit(ReqType req) +{ + std::lock_guard<std::mutex> sync_lock(lock); + assert(req != ReqType::none); + if (needExit != ReqType::none) + warn("Exit requested multiple times (req:%d)\n", static_cast<int>(req)); + if (needExit == ReqType::none || req == ReqType::immediate) + needExit = req; +} + +void +DistIface::Sync::drainComplete() +{ + if (doCkpt) { + // The first DistIface object called this right before writing the + // checkpoint. We need to drain the underlying physical network here. + // Note that other gem5 peers may enter this barrier at different + // ticks due to draining. + run(false); + // Only the "first" DistIface object has to perform the sync + doCkpt = false; + } +} + +void +DistIface::SyncNode::serialize(CheckpointOut &cp) const +{ + int need_exit = static_cast<int>(needExit); + SERIALIZE_SCALAR(need_exit); +} + +void +DistIface::SyncNode::unserialize(CheckpointIn &cp) +{ + int need_exit; + UNSERIALIZE_SCALAR(need_exit); + needExit = static_cast<ReqType>(need_exit); +} + +void +DistIface::SyncSwitch::serialize(CheckpointOut &cp) const +{ + SERIALIZE_SCALAR(numExitReq); +} + +void +DistIface::SyncSwitch::unserialize(CheckpointIn &cp) +{ + UNSERIALIZE_SCALAR(numExitReq); +} + +void +DistIface::SyncEvent::start() +{ + // Note that this may be called either from startup() or drainResume() + + // At this point, all DistIface objects has already called Sync::init() so + // we have a local minimum of the start tick and repeat for the periodic + // sync. + Tick firstAt = DistIface::sync->firstAt; + repeat = DistIface::sync->nextRepeat; + // Do a global barrier to agree on a common repeat value (the smallest + // one from all participating nodes + DistIface::sync->run(curTick() == 0); + + assert(!DistIface::sync->doCkpt); + assert(!DistIface::sync->doExit); + assert(DistIface::sync->nextAt >= curTick()); + assert(DistIface::sync->nextRepeat <= repeat); + + // if this is called at tick 0 then we use the config start param otherwise + // the maximum of the current tick of all participating nodes + if (curTick() == 0) { + assert(!scheduled()); + assert(DistIface::sync->nextAt == 0); + schedule(firstAt); + } else { + if (scheduled()) + reschedule(DistIface::sync->nextAt); + else + schedule(DistIface::sync->nextAt); + } + inform("Dist sync scheduled at %lu and repeats %lu\n", when(), + DistIface::sync->nextRepeat); +} + +void +DistIface::SyncEvent::process() +{ + // We may not start a global periodic sync while draining before taking a + // checkpoint. This is due to the possibility that peer gem5 processes + // may not hit the same periodic sync before they complete draining and + // that would make this periodic sync clash with sync called from + // DistIface::serialize() by other gem5 processes. + // We would need a 'distributed drain' solution to eliminate this + // restriction. + // Note that if draining was not triggered by checkpointing then we are + // fine since no extra global sync will happen (i.e. all peer gem5 will + // hit this periodic sync eventually). + panic_if(_draining && DistIface::sync->doCkpt, + "Distributed sync is hit while draining"); + /* + * Note that this is a global event so this process method will be called + * by only exactly one thread. + */ + /* + * We hold the eventq lock at this point but the receiver thread may + * need the lock to schedule new recv events while waiting for the + * dist sync to complete. + * Note that the other simulation threads also release their eventq + * locks while waiting for us due to the global event semantics. + */ + { + EventQueue::ScopedRelease sr(curEventQueue()); + // we do a global sync here that is supposed to happen at the same + // tick in all gem5 peers + DistIface::sync->run(true); + // global sync completed + } + if (DistIface::sync->doCkpt) + exitSimLoop("checkpoint"); + if (DistIface::sync->doExit) + exitSimLoop("exit request from gem5 peers"); + + // schedule the next periodic sync + repeat = DistIface::sync->nextRepeat; + schedule(curTick() + repeat); +} + +void +DistIface::RecvScheduler::init(Event *recv_done, Tick link_delay) +{ + // This is called from the receiver thread when it starts running. The new + // receiver thread shares the event queue with the simulation thread + // (associated with the simulated Ethernet link). + curEventQueue(eventManager->eventQueue()); + + recvDone = recv_done; + linkDelay = link_delay; +} + +Tick +DistIface::RecvScheduler::calcReceiveTick(Tick send_tick, + Tick send_delay, + Tick prev_recv_tick) +{ + Tick recv_tick = send_tick + send_delay + linkDelay; + // sanity check (we need atleast a send delay long window) + assert(recv_tick >= prev_recv_tick + send_delay); + panic_if(prev_recv_tick + send_delay > recv_tick, + "Receive window is smaller than send delay"); + panic_if(recv_tick <= curTick(), + "Simulators out of sync - missed packet receive by %llu ticks" + "(rev_recv_tick: %lu send_tick: %lu send_delay: %lu " + "linkDelay: %lu )", + curTick() - recv_tick, prev_recv_tick, send_tick, send_delay, + linkDelay); + + return recv_tick; +} + +void +DistIface::RecvScheduler::resumeRecvTicks() +{ + // Schedule pending packets asap in case link speed/delay changed when + // restoring from the checkpoint. + // This may be done during unserialize except that curTick() is unknown + // so we call this during drainResume(). + // If we are not restoring from a checkppint then link latency could not + // change so we just return. + if (!ckptRestore) + return; + + std::vector<Desc> v; + while (!descQueue.empty()) { + Desc d = descQueue.front(); + descQueue.pop(); + d.sendTick = curTick(); + d.sendDelay = d.packet->size(); // assume 1 tick/byte max link speed + v.push_back(d); + } + + for (auto &d : v) + descQueue.push(d); + + if (recvDone->scheduled()) { + assert(!descQueue.empty()); + eventManager->reschedule(recvDone, curTick()); + } else { + assert(descQueue.empty() && v.empty()); + } + ckptRestore = false; +} + +void +DistIface::RecvScheduler::pushPacket(EthPacketPtr new_packet, + Tick send_tick, + Tick send_delay) +{ + // Note : this is called from the receiver thread + curEventQueue()->lock(); + Tick recv_tick = calcReceiveTick(send_tick, send_delay, prevRecvTick); + + DPRINTF(DistEthernetPkt, "DistIface::recvScheduler::pushPacket " + "send_tick:%llu send_delay:%llu link_delay:%llu recv_tick:%llu\n", + send_tick, send_delay, linkDelay, recv_tick); + // Every packet must be sent and arrive in the same quantum + assert(send_tick > master->syncEvent->when() - + master->syncEvent->repeat); + // No packet may be scheduled for receive in the arrival quantum + assert(send_tick + send_delay + linkDelay > master->syncEvent->when()); + + // Now we are about to schedule a recvDone event for the new data packet. + // We use the same recvDone object for all incoming data packets. Packet + // descriptors are saved in the ordered queue. The currently scheduled + // packet is always on the top of the queue. + // NOTE: we use the event queue lock to protect the receive desc queue, + // too, which is accessed both by the receiver thread and the simulation + // thread. + descQueue.emplace(new_packet, send_tick, send_delay); + if (descQueue.size() == 1) { + assert(!recvDone->scheduled()); + eventManager->schedule(recvDone, recv_tick); + } else { + assert(recvDone->scheduled()); + panic_if(descQueue.front().sendTick + descQueue.front().sendDelay > recv_tick, + "Out of order packet received (recv_tick: %lu top(): %lu\n", + recv_tick, descQueue.front().sendTick + descQueue.front().sendDelay); + } + curEventQueue()->unlock(); +} + +EthPacketPtr +DistIface::RecvScheduler::popPacket() +{ + // Note : this is called from the simulation thread when a receive done + // event is being processed for the link. We assume that the thread holds + // the event queue queue lock when this is called! + EthPacketPtr next_packet = descQueue.front().packet; + descQueue.pop(); + + if (descQueue.size() > 0) { + Tick recv_tick = calcReceiveTick(descQueue.front().sendTick, + descQueue.front().sendDelay, + curTick()); + eventManager->schedule(recvDone, recv_tick); + } + prevRecvTick = curTick(); + return next_packet; +} + +void +DistIface::RecvScheduler::Desc::serialize(CheckpointOut &cp) const +{ + SERIALIZE_SCALAR(sendTick); + SERIALIZE_SCALAR(sendDelay); + packet->serialize("rxPacket", cp); +} + +void +DistIface::RecvScheduler::Desc::unserialize(CheckpointIn &cp) +{ + UNSERIALIZE_SCALAR(sendTick); + UNSERIALIZE_SCALAR(sendDelay); + packet = std::make_shared<EthPacketData>(16384); + packet->unserialize("rxPacket", cp); +} + +void +DistIface::RecvScheduler::serialize(CheckpointOut &cp) const +{ + SERIALIZE_SCALAR(prevRecvTick); + // serialize the receive desc queue + std::queue<Desc> tmp_queue(descQueue); + unsigned n_desc_queue = descQueue.size(); + assert(tmp_queue.size() == descQueue.size()); + SERIALIZE_SCALAR(n_desc_queue); + for (int i = 0; i < n_desc_queue; i++) { + tmp_queue.front().serializeSection(cp, csprintf("rxDesc_%d", i)); + tmp_queue.pop(); + } + assert(tmp_queue.empty()); +} + +void +DistIface::RecvScheduler::unserialize(CheckpointIn &cp) +{ + assert(descQueue.size() == 0); + assert(recvDone->scheduled() == false); + assert(ckptRestore == false); + + UNSERIALIZE_SCALAR(prevRecvTick); + // unserialize the receive desc queue + unsigned n_desc_queue; + UNSERIALIZE_SCALAR(n_desc_queue); + for (int i = 0; i < n_desc_queue; i++) { + Desc recv_desc; + recv_desc.unserializeSection(cp, csprintf("rxDesc_%d", i)); + descQueue.push(recv_desc); + } + ckptRestore = true; +} + +DistIface::DistIface(unsigned dist_rank, + unsigned dist_size, + Tick sync_start, + Tick sync_repeat, + EventManager *em, + bool is_switch, int num_nodes) : + syncStart(sync_start), syncRepeat(sync_repeat), + recvThread(nullptr), recvScheduler(em), + rank(dist_rank), size(dist_size) +{ + DPRINTF(DistEthernet, "DistIface() ctor rank:%d\n",dist_rank); + isMaster = false; + if (master == nullptr) { + assert(sync == nullptr); + assert(syncEvent == nullptr); + if (is_switch) + sync = new SyncSwitch(num_nodes); + else + sync = new SyncNode(); + syncEvent = new SyncEvent(); + master = this; + isMaster = true; + } + distIfaceId = distIfaceNum; + distIfaceNum++; +} + +DistIface::~DistIface() +{ + assert(recvThread); + delete recvThread; + if (this == master) { + assert(syncEvent); + delete syncEvent; + assert(sync); + delete sync; + master = nullptr; + } +} + +void +DistIface::packetOut(EthPacketPtr pkt, Tick send_delay) +{ + Header header; + + // Prepare a dist header packet for the Ethernet packet we want to + // send out. + header.msgType = MsgType::dataDescriptor; + header.sendTick = curTick(); + header.sendDelay = send_delay; + + header.dataPacketLength = pkt->size(); + + // Send out the packet and the meta info. + sendPacket(header, pkt); + + DPRINTF(DistEthernetPkt, + "DistIface::sendDataPacket() done size:%d send_delay:%llu\n", + pkt->size(), send_delay); +} + +void +DistIface::recvThreadFunc(Event *recv_done, Tick link_delay) +{ + EthPacketPtr new_packet; + DistHeaderPkt::Header header; + + // Initialize receive scheduler parameters + recvScheduler.init(recv_done, link_delay); + + // Main loop to wait for and process any incoming message. + for (;;) { + // recvHeader() blocks until the next dist header packet comes in. + if (!recvHeader(header)) { + // We lost connection to the peer gem5 processes most likely + // because one of them called m5 exit. So we stop here. + // Grab the eventq lock to stop the simulation thread + curEventQueue()->lock(); + exit_message("info", + 0, + "Message server closed connection, " + "simulation is exiting"); + } + + // We got a valid dist header packet, let's process it + if (header.msgType == MsgType::dataDescriptor) { + recvPacket(header, new_packet); + recvScheduler.pushPacket(new_packet, + header.sendTick, + header.sendDelay); + } else { + // everything else must be synchronisation related command + sync->progress(header.sendTick, + header.syncRepeat, + header.needCkpt, + header.needExit); + } + } +} + +void +DistIface::spawnRecvThread(const Event *recv_done, Tick link_delay) +{ + assert(recvThread == nullptr); + + recvThread = new std::thread(&DistIface::recvThreadFunc, + this, + const_cast<Event *>(recv_done), + link_delay); + recvThreadsNum++; +} + +DrainState +DistIface::drain() +{ + DPRINTF(DistEthernet,"DistIFace::drain() called\n"); + // This can be called multiple times in the same drain cycle. + if (this == master) + syncEvent->draining(true); + return DrainState::Drained; +} + +void +DistIface::drainResume() { + DPRINTF(DistEthernet,"DistIFace::drainResume() called\n"); + if (this == master) + syncEvent->draining(false); + recvScheduler.resumeRecvTicks(); +} + +void +DistIface::serialize(CheckpointOut &cp) const +{ + // Drain the dist interface before the checkpoint is taken. We cannot call + // this as part of the normal drain cycle because this dist sync has to be + // called exactly once after the system is fully drained. + sync->drainComplete(); + + unsigned rank_orig = rank, dist_iface_id_orig = distIfaceId; + + SERIALIZE_SCALAR(rank_orig); + SERIALIZE_SCALAR(dist_iface_id_orig); + + recvScheduler.serializeSection(cp, "recvScheduler"); + if (this == master) { + sync->serializeSection(cp, "Sync"); + } +} + +void +DistIface::unserialize(CheckpointIn &cp) +{ + unsigned rank_orig, dist_iface_id_orig; + UNSERIALIZE_SCALAR(rank_orig); + UNSERIALIZE_SCALAR(dist_iface_id_orig); + + panic_if(rank != rank_orig, "Rank mismatch at resume (rank=%d, orig=%d)", + rank, rank_orig); + panic_if(distIfaceId != dist_iface_id_orig, "Dist iface ID mismatch " + "at resume (distIfaceId=%d, orig=%d)", distIfaceId, + dist_iface_id_orig); + + recvScheduler.unserializeSection(cp, "recvScheduler"); + if (this == master) { + sync->unserializeSection(cp, "Sync"); + } +} + +void +DistIface::init(const Event *done_event, Tick link_delay) +{ + // Init hook for the underlaying message transport to setup/finalize + // communication channels + initTransport(); + + // Spawn a new receiver thread that will process messages + // coming in from peer gem5 processes. + // The receive thread will also schedule a (receive) doneEvent + // for each incoming data packet. + spawnRecvThread(done_event, link_delay); + + + // Adjust the periodic sync start and interval. Different DistIface + // might have different requirements. The singleton sync object + // will select the minimum values for both params. + assert(sync != nullptr); + sync->init(syncStart, syncRepeat); + + // Initialize the seed for random generator to avoid the same sequence + // in all gem5 peer processes + assert(master != nullptr); + if (this == master) + random_mt.init(5489 * (rank+1) + 257); +} + +void +DistIface::startup() +{ + DPRINTF(DistEthernet, "DistIface::startup() started\n"); + if (this == master) + syncEvent->start(); + DPRINTF(DistEthernet, "DistIface::startup() done\n"); +} + +bool +DistIface::readyToCkpt(Tick delay, Tick period) +{ + bool ret = true; + DPRINTF(DistEthernet, "DistIface::readyToCkpt() called, delay:%lu " + "period:%lu\n", delay, period); + if (master) { + if (delay == 0) { + inform("m5 checkpoint called with zero delay => triggering collaborative " + "checkpoint\n"); + sync->requestCkpt(ReqType::collective); + } else { + inform("m5 checkpoint called with non-zero delay => triggering immediate " + "checkpoint (at the next sync)\n"); + sync->requestCkpt(ReqType::immediate); + } + if (period != 0) + inform("Non-zero period for m5_ckpt is ignored in " + "distributed gem5 runs\n"); + ret = false; + } + return ret; +} + +bool +DistIface::readyToExit(Tick delay) +{ + bool ret = true; + DPRINTF(DistEthernet, "DistIface::readyToExit() called, delay:%lu\n", + delay); + if (master) { + if (delay == 0) { + inform("m5 exit called with zero delay => triggering collaborative " + "exit\n"); + sync->requestExit(ReqType::collective); + } else { + inform("m5 exit called with non-zero delay => triggering immediate " + "exit (at the next sync)\n"); + sync->requestExit(ReqType::immediate); + } + ret = false; + } + return ret; +} + +uint64_t +DistIface::rankParam() +{ + uint64_t val; + if (master) { + val = master->rank; + } else { + warn("Dist-rank parameter is queried in single gem5 simulation."); + val = 0; + } + return val; +} + +uint64_t +DistIface::sizeParam() +{ + uint64_t val; + if (master) { + val = master->size; + } else { + warn("Dist-size parameter is queried in single gem5 simulation."); + val = 1; + } + return val; +} diff --git a/src/dev/net/dist_iface.hh b/src/dev/net/dist_iface.hh new file mode 100644 index 000000000..e69211fb8 --- /dev/null +++ b/src/dev/net/dist_iface.hh @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2015 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Gabor Dozsa + */ + +/* @file + * The interface class for dist gem5 simulations. + * + * dist-gem5 is an extension to gem5 to enable parallel simulation of a + * distributed system (e.g. simulation of a pool of machines + * connected by Ethernet links). A dist gem5 run consists of seperate gem5 + * processes running in parallel. Each gem5 process executes + * the simulation of a component of the simulated distributed system. + * (An example component can be a dist-core board with an Ethernet NIC.) + * The DistIface class below provides services to transfer data and + * control messages among the gem5 processes. The main such services are + * as follows. + * + * 1. Send a data packet coming from a simulated Ethernet link. The packet + * will be transferred to (all) the target(s) gem5 processes. The send + * operation is always performed by the simulation thread, i.e. the gem5 + * thread that is processing the event queue associated with the simulated + * Ethernet link. + * + * 2. Spawn a receiver thread to process messages coming in from the + * from other gem5 processes. Each simulated Ethernet link has its own + * associated receiver thread. The receiver thread saves the incoming packet + * and schedule an appropriate receive event in the event queue. + * + * 3. Schedule a global barrier event periodically to keep the gem5 + * processes in sync. + * Periodic barrier event to keep peer gem5 processes in sync. The basic idea + * is that no gem5 process can go ahead further than the simulated link + * transmission delay to ensure that a corresponding receive event can always + * be scheduled for any message coming in from a peer gem5 process. + * + * + * + * This interface is an abstract class. It can work with various low level + * send/receive service implementations (e.g. TCP/IP, MPI,...). A TCP + * stream socket version is implemented in src/dev/net/tcp_iface.[hh,cc]. + */ +#ifndef __DEV_DIST_IFACE_HH__ +#define __DEV_DIST_IFACE_HH__ + +#include <array> +#include <mutex> +#include <queue> +#include <thread> +#include <utility> + +#include "dev/net/dist_packet.hh" +#include "dev/net/etherpkt.hh" +#include "sim/core.hh" +#include "sim/drain.hh" +#include "sim/global_event.hh" +#include "sim/serialize.hh" + +class EventManager; + +/** + * The interface class to talk to peer gem5 processes. + */ +class DistIface : public Drainable, public Serializable +{ + public: + typedef DistHeaderPkt::Header Header; + + protected: + typedef DistHeaderPkt::MsgType MsgType; + typedef DistHeaderPkt::ReqType ReqType; + + private: + class SyncEvent; + /** @class Sync + * This class implements global sync operations among gem5 peer processes. + * + * @note This class is used as a singleton object (shared by all DistIface + * objects). + */ + class Sync : public Serializable + { + protected: + /** + * The lock to protect access to the Sync object. + */ + std::mutex lock; + /** + * Condition variable for the simulation thread to wait on + * until all receiver threads completes the current global + * synchronisation. + */ + std::condition_variable cv; + /** + * Number of receiver threads that not yet completed the current global + * synchronisation. + */ + unsigned waitNum; + /** + * Flag is set if exit is permitted upon sync completion + */ + bool doExit; + /** + * Flag is set if taking a ckpt is permitted upon sync completion + */ + bool doCkpt; + /** + * The repeat value for the next periodic sync + */ + Tick nextRepeat; + /** + * Tick for the very first periodic sync + */ + Tick firstAt; + /** + * Tick for the next periodic sync (if the event is not scheduled yet) + */ + Tick nextAt; + + friend class SyncEvent; + + public: + /** + * Initialize periodic sync params. + * + * @param start Start tick for dist synchronisation + * @param repeat Frequency of dist synchronisation + * + */ + void init(Tick start, Tick repeat); + /** + * Core method to perform a full dist sync. + */ + virtual void run(bool same_tick) = 0; + /** + * Callback when the receiver thread gets a sync ack message. + */ + virtual void progress(Tick send_tick, + Tick next_repeat, + ReqType do_ckpt, + ReqType do_exit) = 0; + + virtual void requestCkpt(ReqType req) = 0; + virtual void requestExit(ReqType req) = 0; + + void drainComplete(); + + virtual void serialize(CheckpointOut &cp) const override = 0; + virtual void unserialize(CheckpointIn &cp) override = 0; + }; + + class SyncNode: public Sync + { + private: + /** + * Exit requested + */ + ReqType needExit; + /** + * Ckpt requested + */ + ReqType needCkpt; + + public: + + SyncNode(); + ~SyncNode() {} + void run(bool same_tick) override; + void progress(Tick max_req_tick, + Tick next_repeat, + ReqType do_ckpt, + ReqType do_exit) override; + + void requestCkpt(ReqType req) override; + void requestExit(ReqType req) override; + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + }; + + class SyncSwitch: public Sync + { + private: + /** + * Counter for recording exit requests + */ + unsigned numExitReq; + /** + * Counter for recording ckpt requests + */ + unsigned numCkptReq; + /** + * Number of connected simulated nodes + */ + unsigned numNodes; + + public: + SyncSwitch(int num_nodes); + ~SyncSwitch() {} + + void run(bool same_tick) override; + void progress(Tick max_req_tick, + Tick next_repeat, + ReqType do_ckpt, + ReqType do_exit) override; + + void requestCkpt(ReqType) override { + panic("Switch requested checkpoint"); + } + void requestExit(ReqType) override { + panic("Switch requested exit"); + } + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + }; + + /** + * The global event to schedule periodic dist sync. It is used as a + * singleton object. + * + * The periodic synchronisation works as follows. + * 1. A SyncEvent is scheduled as a global event when startup() is + * called. + * 2. The process() method of the SyncEvent initiates a new barrier + * for each simulated Ethernet link. + * 3. Simulation thread(s) then waits until all receiver threads + * complete the ongoing barrier. The global sync event is done. + */ + class SyncEvent : public GlobalSyncEvent + { + private: + /** + * Flag to set when the system is draining + */ + bool _draining; + public: + /** + * Only the firstly instantiated DistIface object will + * call this constructor. + */ + SyncEvent() : GlobalSyncEvent(Sim_Exit_Pri, 0), _draining(false) {} + + ~SyncEvent() {} + /** + * Schedule the first periodic sync event. + */ + void start(); + /** + * This is a global event so process() will only be called by + * exactly one simulation thread. (See further comments in the .cc + * file.) + */ + void process() override; + + bool draining() const { return _draining; } + void draining(bool fl) { _draining = fl; } + }; + /** + * Class to encapsulate information about data packets received. + + * @note The main purpose of the class to take care of scheduling receive + * done events for the simulated network link and store incoming packets + * until they can be received by the simulated network link. + */ + class RecvScheduler : public Serializable + { + private: + /** + * Received packet descriptor. This information is used by the receive + * thread to schedule receive events and by the simulation thread to + * process those events. + */ + struct Desc : public Serializable + { + EthPacketPtr packet; + Tick sendTick; + Tick sendDelay; + + Desc() : sendTick(0), sendDelay(0) {} + Desc(EthPacketPtr p, Tick s, Tick d) : + packet(p), sendTick(s), sendDelay(d) {} + Desc(const Desc &d) : + packet(d.packet), sendTick(d.sendTick), sendDelay(d.sendDelay) {} + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + }; + /** + * The queue to store the receive descriptors. + */ + std::queue<Desc> descQueue; + /** + * The tick when the most recent receive event was processed. + * + * @note This information is necessary to simulate possible receiver + * link contention when calculating the receive tick for the next + * incoming data packet (see the calcReceiveTick() method) + */ + Tick prevRecvTick; + /** + * The receive done event for the simulated Ethernet link. + * + * @note This object is constructed by the simulated network link. We + * schedule this object for each incoming data packet. + */ + Event *recvDone; + /** + * The link delay in ticks for the simulated Ethernet link. + * + * @note This value is used for calculating the receive ticks for + * incoming data packets. + */ + Tick linkDelay; + /** + * The event manager associated with the simulated Ethernet link. + * + * @note It is used to access the event queue for scheduling receive + * done events for the link. + */ + EventManager *eventManager; + /** + * Calculate the tick to schedule the next receive done event. + * + * @param send_tick The tick the packet was sent. + * @param send_delay The simulated delay at the sender side. + * @param prev_recv_tick Tick when the last receive event was + * processed. + * + * @note This method tries to take into account possible receiver link + * contention and adjust receive tick for the incoming packets + * accordingly. + */ + Tick calcReceiveTick(Tick send_tick, + Tick send_delay, + Tick prev_recv_tick); + + /** + * Flag to set if receive ticks for pending packets need to be + * recalculated due to changed link latencies at a resume + */ + bool ckptRestore; + + public: + /** + * Scheduler for the incoming data packets. + * + * @param em The event manager associated with the simulated Ethernet + * link. + */ + RecvScheduler(EventManager *em) : + prevRecvTick(0), recvDone(nullptr), linkDelay(0), + eventManager(em), ckptRestore(false) {} + + /** + * Initialize network link parameters. + * + * @note This method is called from the receiver thread (see + * recvThreadFunc()). + */ + void init(Event *recv_done, Tick link_delay); + /** + * Fetch the next packet that is to be received by the simulated network + * link. + * + * @note This method is called from the process() method of the receive + * done event associated with the network link. + */ + EthPacketPtr popPacket(); + /** + * Push a newly arrived packet into the desc queue. + */ + void pushPacket(EthPacketPtr new_packet, + Tick send_tick, + Tick send_delay); + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + /** + * Adjust receive ticks for pending packets when restoring from a + * checkpoint + * + * @note Link speed and delay parameters may change at resume. + */ + void resumeRecvTicks(); + }; + /** + * Tick to schedule the first dist sync event. + * This is just as optimization : we do not need any dist sync + * event until the simulated NIC is brought up by the OS. + */ + Tick syncStart; + /** + * Frequency of dist sync events in ticks. + */ + Tick syncRepeat; + /** + * Receiver thread pointer. + * Each DistIface object must have exactly one receiver thread. + */ + std::thread *recvThread; + /** + * Meta information about data packets received. + */ + RecvScheduler recvScheduler; + + protected: + /** + * The rank of this process among the gem5 peers. + */ + unsigned rank; + /** + * The number of gem5 processes comprising this dist simulation. + */ + unsigned size; + /** + * Number of DistIface objects (i.e. dist links in this gem5 process) + */ + static unsigned distIfaceNum; + /** + * Unique id for the dist link + */ + unsigned distIfaceId; + + bool isMaster; + + private: + /** + * Number of receiver threads (in this gem5 process) + */ + static unsigned recvThreadsNum; + /** + * The singleton Sync object to perform dist synchronisation. + */ + static Sync *sync; + /** + * The singleton SyncEvent object to schedule periodic dist sync. + */ + static SyncEvent *syncEvent; + /** + * The very first DistIface object created becomes the master. We need + * a master to co-ordinate the global synchronisation. + */ + static DistIface *master; + + private: + /** + * Send out a data packet to the remote end. + * @param header Meta info about the packet (which needs to be transferred + * to the destination alongside the packet). + * @param packet Pointer to the packet to send. + */ + virtual void sendPacket(const Header &header, const EthPacketPtr &packet) = 0; + /** + * Send out a control command to the remote end. + * @param header Meta info describing the command (e.g. sync request) + */ + virtual void sendCmd(const Header &header) = 0; + /** + * Receive a header (i.e. meta info describing a data packet or a control command) + * from the remote end. + * @param header The meta info structure to store the incoming header. + */ + virtual bool recvHeader(Header &header) = 0; + /** + * Receive a packet from the remote end. + * @param header Meta info about the incoming packet (obtanied by a previous + * call to the recvHedaer() method). + * @param Pointer to packet received. + */ + virtual void recvPacket(const Header &header, EthPacketPtr &packet) = 0; + /** + * Init hook for the underlaying transport + */ + virtual void initTransport() = 0; + /** + * spawn the receiver thread. + * @param recv_done The receive done event associated with the simulated + * Ethernet link. + * @param link_delay The link delay for the simulated Ethernet link. + */ + void spawnRecvThread(const Event *recv_done, Tick link_delay); + /** + * The function executed by a receiver thread. + */ + void recvThreadFunc(Event *recv_done, Tick link_delay); + + public: + + /** + * ctor + * @param dist_rank Rank of this gem5 process within the dist run + * @param sync_start Start tick for dist synchronisation + * @param sync_repeat Frequency for dist synchronisation + * @param em The event manager associated with the simulated Ethernet link + */ + DistIface(unsigned dist_rank, + unsigned dist_size, + Tick sync_start, + Tick sync_repeat, + EventManager *em, + bool is_switch, + int num_nodes); + + virtual ~DistIface(); + /** + * Send out an Ethernet packet. + * @param pkt The Ethernet packet to send. + * @param send_delay The delay in ticks for the send completion event. + */ + void packetOut(EthPacketPtr pkt, Tick send_delay); + /** + * Fetch the packet scheduled to be received next by the simulated + * network link. + * + * @note This method is called within the process() method of the link + * receive done event. It also schedules the next receive event if the + * receive queue is not empty. + */ + EthPacketPtr packetIn() { return recvScheduler.popPacket(); } + + DrainState drain() override; + void drainResume() override; + void init(const Event *e, Tick link_delay); + void startup(); + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + /** + * Initiate the exit from the simulation. + * @param delay Delay param from the m5 exit command. If Delay is zero + * then a collaborative exit is requested (i.e. all nodes have to call + * this method before the distributed simulation can exit). If Delay is + * not zero then exit is requested asap (and it will happen at the next + * sync tick). + * @return False if we are in distributed mode (i.e. exit can happen only + * at sync), True otherwise. + */ + static bool readyToExit(Tick delay); + /** + * Initiate taking a checkpoint + * @param delay Delay param from the m5 checkpoint command. If Delay is + * zero then a collaborative checkpoint is requested (i.e. all nodes have + * to call this method before the checkpoint can be taken). If Delay is + * not zero then a checkpoint is requested asap (and it will happen at the + * next sync tick). + * @return False if we are in dist mode (i.e. exit can happen only at + * sync), True otherwise. + */ + static bool readyToCkpt(Tick delay, Tick period); + /** + * Getter for the dist rank param. + */ + static uint64_t rankParam(); + /** + * Getter for the dist size param. + */ + static uint64_t sizeParam(); + }; + +#endif diff --git a/src/dev/net/multi_packet.hh b/src/dev/net/dist_packet.hh index 3d8e85dfa..4c079c44a 100644 --- a/src/dev/net/multi_packet.hh +++ b/src/dev/net/dist_packet.hh @@ -38,93 +38,69 @@ */ /* @file - * Header packet class for multi gem5 runs. + * Header packet class for dist-gem5 runs. * - * For a high level description about multi gem5 see comments in - * header file multi_iface.hh. + * For a high level description about dist-gem5 see comments in + * header file dist_iface.hh. * - * The MultiHeaderPkt class defines the format of message headers - * sent among gem5 processes during a multi gem5 simulation. A header packet + * The DistHeaderPkt class defines the format of message headers + * sent among gem5 processes during a dist gem5 simulation. A header packet * can either carry the description of data packet (i.e. a simulated Ethernet * packet) or a synchronisation related control command. In case of * data packet description, the corresponding data packet always follows * the header packet back-to-back. */ -#ifndef __DEV_NET_MULTI_PACKET_HH__ -#define __DEV_NET_MULTI_PACKET_HH__ +#ifndef __DEV_DIST_PACKET_HH__ +#define __DEV_DIST_PACKET_HH__ #include <cstring> #include "base/types.hh" -class MultiHeaderPkt +class DistHeaderPkt { private: - MultiHeaderPkt() {} - ~MultiHeaderPkt() {} + DistHeaderPkt() {} + ~DistHeaderPkt() {} public: + enum class ReqType { immediate, collective, pending, none }; /** - * Simply type to help with calculating space requirements for - * the corresponding header field. - */ - typedef uint8_t AddressType[6]; - - /** - * The msg type defines what informarion a multi header packet carries. + * The msg type defines what information a dist header packet carries. */ enum class MsgType { dataDescriptor, - cmdPeriodicSyncReq, - cmdPeriodicSyncAck, - cmdCkptSyncReq, - cmdCkptSyncAck, - cmdAtomicSyncReq, - cmdAtomicSyncAck, + cmdSyncReq, + cmdSyncAck, unknown }; struct Header { /** - * The msg type field is valid for all header packets. In case of - * a synchronisation control command this is the only valid field. + * The msg type field is valid for all header packets. + * + * @note senderRank is used with data packets while collFlags are used + * by sync ack messages to trigger collective ckpt or exit events. */ MsgType msgType; Tick sendTick; - Tick sendDelay; - /** - * Actual length of the simulated Ethernet packet. - */ - unsigned dataPacketLength; - /** - * Source MAC address. - */ - AddressType srcAddress; - /** - * Destination MAC address. - */ - AddressType dstAddress; + union { + Tick sendDelay; + Tick syncRepeat; + }; + union { + /** + * Actual length of the simulated Ethernet packet. + */ + unsigned dataPacketLength; + struct { + ReqType needCkpt; + ReqType needExit; + }; + }; }; - - static unsigned maxAddressLength(); - - /** - * Static functions for manipulating and comparing MAC addresses. - */ - static void clearAddress(AddressType &addr); - static bool isAddressEqual(const AddressType &addr1, - const AddressType &addr2); - static bool isAddressLess(const AddressType &addr1, - const AddressType &addr2); - - static void copyAddress(AddressType &dest, - const AddressType &src); - - static bool isUnicastAddress(const AddressType &addr); - static bool isMulticastAddress(const AddressType &addr); - static bool isBroadcastAddress(const AddressType &addr); }; -#endif // __DEV_NET_MULTI_PACKET_HH__ +#endif diff --git a/src/dev/net/etherpkt.cc b/src/dev/net/etherpkt.cc index a16f572c5..f06af3306 100644 --- a/src/dev/net/etherpkt.cc +++ b/src/dev/net/etherpkt.cc @@ -53,19 +53,3 @@ EthPacketData::unserialize(const string &base, CheckpointIn &cp) arrayParamIn(cp, base + ".data", data, length); } -void -EthPacketData::packAddress(uint8_t *src_addr, - uint8_t *dst_addr, - unsigned &nbytes) -{ - Net::EthHdr *hdr = (Net::EthHdr *)data; - assert(hdr->src().size() == hdr->dst().size()); - if (nbytes < hdr->src().size()) - panic("EthPacketData::packAddress() Buffer overflow"); - - memcpy(dst_addr, hdr->dst().bytes(), hdr->dst().size()); - memcpy(src_addr, hdr->src().bytes(), hdr->src().size()); - - nbytes = hdr->src().size(); -} - diff --git a/src/dev/net/etherpkt.hh b/src/dev/net/etherpkt.hh index 4119578c3..457563293 100644 --- a/src/dev/net/etherpkt.hh +++ b/src/dev/net/etherpkt.hh @@ -71,18 +71,6 @@ class EthPacketData ~EthPacketData() { if (data) delete [] data; } public: - /** - * This function pulls out the MAC source and destination addresses from - * the packet data and stores them in the caller specified buffers. - * - * @param src_addr The buffer to store the source MAC address. - * @param dst_addr The buffer to store the destination MAC address. - * @param length This is an inout parameter. The caller stores in this - * the size of the address buffers. On return, this will contain the - * actual address size stored in the buffers. (We assume that source - * address size is equal to that of the destination address.) - */ - void packAddress(uint8_t *src_addr, uint8_t *dst_addr, unsigned &length); void serialize(const std::string &base, CheckpointOut &cp) const; void unserialize(const std::string &base, CheckpointIn &cp); diff --git a/src/dev/net/multi_iface.cc b/src/dev/net/multi_iface.cc deleted file mode 100644 index 15f69f2ac..000000000 --- a/src/dev/net/multi_iface.cc +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright (c) 2015 ARM Limited - * All rights reserved - * - * The license below extends only to copyright in the software and shall - * not be construed as granting a license to any other intellectual - * property including but not limited to intellectual property relating - * to a hardware implementation of the functionality of the software - * licensed hereunder. You may use the software subject to the license - * terms below provided that you ensure that this notice is replicated - * unmodified and in its entirety in all distributions of the software, - * modified or unmodified, in source code or in binary form. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Authors: Gabor Dozsa - */ - -/* @file - * The interface class for multi gem5 simulations. - */ - -#include "dev/net/multi_iface.hh" - -#include <queue> -#include <thread> - -#include "base/random.hh" -#include "base/trace.hh" -#include "debug/MultiEthernet.hh" -#include "debug/MultiEthernetPkt.hh" -#include "dev/net/etherpkt.hh" -#include "sim/sim_exit.hh" -#include "sim/sim_object.hh" - - -MultiIface::Sync *MultiIface::sync = nullptr; -MultiIface::SyncEvent *MultiIface::syncEvent = nullptr; -unsigned MultiIface::recvThreadsNum = 0; -MultiIface *MultiIface::master = nullptr; - -bool -MultiIface::Sync::run(SyncTrigger t, Tick sync_tick) -{ - std::unique_lock<std::mutex> sync_lock(lock); - - trigger = t; - if (trigger != SyncTrigger::periodic) { - DPRINTF(MultiEthernet,"MultiIface::Sync::run() trigger:%d\n", - (unsigned)trigger); - } - - switch (state) { - case SyncState::asyncCkpt: - switch (trigger) { - case SyncTrigger::ckpt: - assert(MultiIface::syncEvent->interrupted == false); - state = SyncState::busy; - break; - case SyncTrigger::periodic: - if (waitNum == 0) { - // So all recv threads got an async checkpoint request already - // and a simExit is scheduled at the end of the current tick - // (i.e. it is a periodic sync scheduled at the same tick as - // the simExit). - state = SyncState::idle; - DPRINTF(MultiEthernet,"MultiIface::Sync::run() interrupted " - "due to async ckpt scheduled\n"); - return false; - } else { - // we still need to wait for some receiver thread to get the - // aysnc ckpt request. We are going to proceed as 'interrupted' - // periodic sync. - state = SyncState::interrupted; - DPRINTF(MultiEthernet,"MultiIface::Sync::run() interrupted " - "due to ckpt request is coming in\n"); - } - break; - case SyncTrigger::atomic: - assert(trigger != SyncTrigger::atomic); - } - break; - case SyncState::idle: - state = SyncState::busy; - break; - // Only one sync can be active at any time - case SyncState::interrupted: - case SyncState::busy: - assert(state != SyncState::interrupted); - assert(state != SyncState::busy); - break; - } - // Kick-off the sync unless we are in the middle of an interrupted - // periodic sync - if (state != SyncState::interrupted) { - assert(waitNum == 0); - waitNum = MultiIface::recvThreadsNum; - // initiate the global synchronisation - assert(MultiIface::master != nullptr); - MultiIface::master->syncRaw(triggerToMsg[(unsigned)trigger], sync_tick); - } - // now wait until all receiver threads complete the synchronisation - auto lf = [this]{ return waitNum == 0; }; - cv.wait(sync_lock, lf); - - // we are done - assert(state == SyncState::busy || state == SyncState::interrupted); - bool ret = (state != SyncState::interrupted); - state = SyncState::idle; - return ret; -} - -void -MultiIface::Sync::progress(MsgType msg) -{ - std::unique_lock<std::mutex> sync_lock(lock); - - switch (msg) { - case MsgType::cmdAtomicSyncAck: - assert(state == SyncState::busy && trigger == SyncTrigger::atomic); - break; - case MsgType::cmdPeriodicSyncAck: - assert(state == SyncState::busy && trigger == SyncTrigger::periodic); - break; - case MsgType::cmdCkptSyncAck: - assert(state == SyncState::busy && trigger == SyncTrigger::ckpt); - break; - case MsgType::cmdCkptSyncReq: - switch (state) { - case SyncState::busy: - if (trigger == SyncTrigger::ckpt) { - // We are already in a checkpoint sync but got another ckpt - // sync request. This may happen if two (or more) peer gem5 - // processes try to start a ckpt nearly at the same time. - // Incrementing waitNum here (before decrementing it below) - // effectively results in ignoring this new ckpt sync request. - waitNum++; - break; - } - assert (waitNum == recvThreadsNum); - state = SyncState::interrupted; - // we need to fall over here to handle "recvThreadsNum == 1" case - case SyncState::interrupted: - assert(trigger == SyncTrigger::periodic); - assert(waitNum >= 1); - if (waitNum == 1) { - exitSimLoop("checkpoint"); - } - break; - case SyncState::idle: - // There is no on-going sync so we got an async ckpt request. If we - // are the only receiver thread then we need to schedule the - // checkpoint. Otherwise, only change the state to 'asyncCkpt' and - // let the last receiver thread to schedule the checkpoint at the - // 'asyncCkpt' case. - // Note that a periodic or resume sync may start later and that can - // trigger a state change to 'interrupted' (so the checkpoint may - // get scheduled at 'interrupted' case finally). - assert(waitNum == 0); - state = SyncState::asyncCkpt; - waitNum = MultiIface::recvThreadsNum; - // we need to fall over here to handle "recvThreadsNum == 1" case - case SyncState::asyncCkpt: - assert(waitNum >= 1); - if (waitNum == 1) - exitSimLoop("checkpoint"); - break; - default: - panic("Unexpected state for checkpoint request message"); - break; - } - break; - default: - panic("Unknown msg type"); - break; - } - waitNum--; - assert(state != SyncState::idle); - // Notify the simultaion thread if there is an on-going sync. - if (state != SyncState::asyncCkpt) { - sync_lock.unlock(); - cv.notify_one(); - } -} - -void MultiIface::SyncEvent::start(Tick start, Tick interval) -{ - assert(!scheduled()); - if (interval == 0) - panic("Multi synchronisation period must be greater than zero"); - repeat = interval; - schedule(start); -} - -void -MultiIface::SyncEvent::adjust(Tick start_tick, Tick repeat_tick) -{ - // The new multi interface may require earlier start of the - // synchronisation. - assert(scheduled() == true); - if (start_tick < when()) - reschedule(start_tick); - // The new multi interface may require more frequent synchronisation. - if (repeat == 0) - panic("Multi synchronisation period must be greater than zero"); - if (repeat < repeat_tick) - repeat = repeat_tick; -} - -void -MultiIface::SyncEvent::process() -{ - /* - * Note that this is a global event so this process method will be called - * by only exactly one thread. - */ - // if we are draining the system then we must not start a periodic sync (as - // it is not sure that all peer gem5 will reach this tick before taking - // the checkpoint). - if (isDraining == true) { - assert(interrupted == false); - interrupted = true; - DPRINTF(MultiEthernet,"MultiIface::SyncEvent::process() interrupted " - "due to draining\n"); - return; - } - if (interrupted == false) - scheduledAt = curTick(); - /* - * We hold the eventq lock at this point but the receiver thread may - * need the lock to schedule new recv events while waiting for the - * multi sync to complete. - * Note that the other simulation threads also release their eventq - * locks while waiting for us due to the global event semantics. - */ - curEventQueue()->unlock(); - // we do a global sync here - interrupted = !MultiIface::sync->run(SyncTrigger::periodic, scheduledAt); - // Global sync completed or got interrupted. - // we are expected to exit with the eventq lock held - curEventQueue()->lock(); - // schedule the next global sync event if this one completed. Otherwise - // (i.e. this one was interrupted by a checkpoint request), we will - // reschedule this one after the draining is complete. - if (!interrupted) - schedule(scheduledAt + repeat); -} - -void MultiIface::SyncEvent::resume() -{ - Tick sync_tick; - assert(!scheduled()); - if (interrupted) { - assert(curTick() >= scheduledAt); - // We have to complete the interrupted periodic sync asap. - // Note that this sync might be interrupted now again with a checkpoint - // request from a peer gem5... - sync_tick = curTick(); - schedule(sync_tick); - } else { - // So we completed the last periodic sync, let's find out the tick for - // next one - assert(curTick() > scheduledAt); - sync_tick = scheduledAt + repeat; - if (sync_tick < curTick()) - panic("Cannot resume periodic synchronisation"); - schedule(sync_tick); - } - DPRINTF(MultiEthernet, - "MultiIface::SyncEvent periodic sync resumed at %lld " - "(curTick:%lld)\n", sync_tick, curTick()); -} - -void MultiIface::SyncEvent::serialize(const std::string &base, - CheckpointOut &cp) const -{ - // Save the periodic multi sync schedule information - paramOut(cp, base + ".periodicSyncRepeat", repeat); - paramOut(cp, base + ".periodicSyncInterrupted", interrupted); - paramOut(cp, base + ".periodicSyncAt", scheduledAt); -} - -void MultiIface::SyncEvent::unserialize(const std::string &base, - CheckpointIn &cp) -{ - paramIn(cp, base + ".periodicSyncRepeat", repeat); - paramIn(cp, base + ".periodicSyncInterrupted", interrupted); - paramIn(cp, base + ".periodicSyncAt", scheduledAt); -} - -MultiIface::MultiIface(unsigned multi_rank, - Tick sync_start, - Tick sync_repeat, - EventManager *em) : - syncStart(sync_start), syncRepeat(sync_repeat), - recvThread(nullptr), eventManager(em), recvDone(nullptr), - scheduledRecvPacket(nullptr), linkDelay(0), rank(multi_rank) -{ - DPRINTF(MultiEthernet, "MultiIface() ctor rank:%d\n",multi_rank); - if (master == nullptr) { - assert(sync == nullptr); - assert(syncEvent == nullptr); - sync = new Sync(); - syncEvent = new SyncEvent(); - master = this; - } -} - -MultiIface::~MultiIface() -{ - assert(recvThread); - delete recvThread; - if (this == master) { - assert(syncEvent); - delete syncEvent; - assert(sync); - delete sync; - } -} - -void -MultiIface::packetOut(EthPacketPtr pkt, Tick send_delay) -{ - MultiHeaderPkt::Header header_pkt; - unsigned address_length = MultiHeaderPkt::maxAddressLength(); - - // Prepare a multi header packet for the Ethernet packet we want to - // send out. - header_pkt.msgType = MsgType::dataDescriptor; - header_pkt.sendTick = curTick(); - header_pkt.sendDelay = send_delay; - - // Store also the source and destination addresses. - pkt->packAddress(header_pkt.srcAddress, header_pkt.dstAddress, - address_length); - - header_pkt.dataPacketLength = pkt->size(); - - // Send out the multi hedare packet followed by the Ethernet packet. - sendRaw(&header_pkt, sizeof(header_pkt), header_pkt.dstAddress); - sendRaw(pkt->data, pkt->size(), header_pkt.dstAddress); - DPRINTF(MultiEthernetPkt, - "MultiIface::sendDataPacket() done size:%d send_delay:%llu " - "src:0x%02x%02x%02x%02x%02x%02x " - "dst:0x%02x%02x%02x%02x%02x%02x\n", - pkt->size(), send_delay, - header_pkt.srcAddress[0], header_pkt.srcAddress[1], - header_pkt.srcAddress[2], header_pkt.srcAddress[3], - header_pkt.srcAddress[4], header_pkt.srcAddress[5], - header_pkt.dstAddress[0], header_pkt.dstAddress[1], - header_pkt.dstAddress[2], header_pkt.dstAddress[3], - header_pkt.dstAddress[4], header_pkt.dstAddress[5]); -} - -bool -MultiIface::recvHeader(MultiHeaderPkt::Header &header_pkt) -{ - // Blocking receive of an incoming multi header packet. - return recvRaw((void *)&header_pkt, sizeof(header_pkt)); -} - -void -MultiIface::recvData(const MultiHeaderPkt::Header &header_pkt) -{ - // We are here beacuse a header packet has been received implying - // that an Ethernet (data) packet is coming in next. - assert(header_pkt.msgType == MsgType::dataDescriptor); - // Allocate storage for the incoming Ethernet packet. - EthPacketPtr new_packet(new EthPacketData(header_pkt.dataPacketLength)); - // Now execute the blocking receive and store the incoming data directly - // in the new EthPacketData object. - if (! recvRaw((void *)(new_packet->data), header_pkt.dataPacketLength)) - panic("Missing data packet"); - - new_packet->length = header_pkt.dataPacketLength; - // Grab the event queue lock to schedule a new receive event for the - // data packet. - curEventQueue()->lock(); - // Compute the receive tick. It includes the send delay and the - // simulated link delay. - Tick recv_tick = header_pkt.sendTick + header_pkt.sendDelay + linkDelay; - DPRINTF(MultiEthernetPkt, "MultiIface::recvThread() packet receive, " - "send_tick:%llu send_delay:%llu link_delay:%llu recv_tick:%llu\n", - header_pkt.sendTick, header_pkt.sendDelay, linkDelay, recv_tick); - - if (recv_tick <= curTick()) { - panic("Simulators out of sync - missed packet receive by %llu ticks", - curTick() - recv_tick); - } - // Now we are about to schedule a recvDone event for the new data packet. - // We use the same recvDone object for all incoming data packets. If - // that is already scheduled - i.e. a receive event for a previous - // data packet is already pending - then we have to check whether the - // receive tick for the new packet is earlier than that of the currently - // pending event. Packets may arrive out-of-order with respect to - // simulated receive time. If that is the case, we need to re-schedule the - // recvDone event for the new packet. Otherwise, we save the packet - // pointer and the recv tick for the new packet in the recvQueue. See - // the implementation of the packetIn() method for comments on how this - // information is retrieved from the recvQueue by the simulation thread. - if (!recvDone->scheduled()) { - assert(recvQueue.size() == 0); - assert(scheduledRecvPacket == nullptr); - scheduledRecvPacket = new_packet; - eventManager->schedule(recvDone, recv_tick); - } else if (recvDone->when() > recv_tick) { - recvQueue.emplace(scheduledRecvPacket, recvDone->when()); - eventManager->reschedule(recvDone, recv_tick); - scheduledRecvPacket = new_packet; - } else { - recvQueue.emplace(new_packet, recv_tick); - } - curEventQueue()->unlock(); -} - -void -MultiIface::recvThreadFunc() -{ - EthPacketPtr new_packet; - MultiHeaderPkt::Header header; - - // The new receiver thread shares the event queue with the simulation - // thread (associated with the simulated Ethernet link). - curEventQueue(eventManager->eventQueue()); - // Main loop to wait for and process any incoming message. - for (;;) { - // recvHeader() blocks until the next multi header packet comes in. - if (!recvHeader(header)) { - // We lost connection to the peer gem5 processes most likely - // because one of them called m5 exit. So we stop here. - exit_message("info", 0, "Message server closed connection, " - "simulation is exiting"); - } - // We got a valid multi header packet, let's process it - if (header.msgType == MsgType::dataDescriptor) { - recvData(header); - } else { - // everything else must be synchronisation related command - sync->progress(header.msgType); - } - } -} - -EthPacketPtr -MultiIface::packetIn() -{ - // We are called within the process() method of the recvDone event. We - // return the packet that triggered the current receive event. - // If there is further packets in the recvQueue, we also have to schedule - // the recvEvent for the next packet with the smallest receive tick. - // The priority queue container ensures that smallest receive tick is - // always on the top of the queue. - assert(scheduledRecvPacket != nullptr); - EthPacketPtr next_packet = scheduledRecvPacket; - - if (! recvQueue.empty()) { - eventManager->schedule(recvDone, recvQueue.top().second); - scheduledRecvPacket = recvQueue.top().first; - recvQueue.pop(); - } else { - scheduledRecvPacket = nullptr; - } - - return next_packet; -} - -void -MultiIface::spawnRecvThread(Event *recv_done, Tick link_delay) -{ - assert(recvThread == nullptr); - // all receive thread must be spawned before simulation starts - assert(eventManager->eventQueue()->getCurTick() == 0); - - recvDone = recv_done; - linkDelay = link_delay; - - recvThread = new std::thread(&MultiIface::recvThreadFunc, this); - - recvThreadsNum++; -} - -DrainState -MultiIface::drain() -{ - DPRINTF(MultiEthernet,"MultiIFace::drain() called\n"); - - // This can be called multiple times in the same drain cycle. - if (master == this) { - syncEvent->isDraining = true; - } - - return DrainState::Drained; -} - -void MultiIface::drainDone() { - if (master == this) { - assert(syncEvent->isDraining == true); - syncEvent->isDraining = false; - // We need to resume the interrupted periodic sync here now that the - // draining is done. If the last periodic sync completed before the - // checkpoint then the next one is already scheduled. - if (syncEvent->interrupted) - syncEvent->resume(); - } -} - -void MultiIface::serialize(const std::string &base, CheckpointOut &cp) const -{ - // Drain the multi interface before the checkpoint is taken. We cannot call - // this as part of the normal drain cycle because this multi sync has to be - // called exactly once after the system is fully drained. - // Note that every peer will take a checkpoint but they may take it at - // different ticks. - // This sync request may interrupt an on-going periodic sync in some peers. - sync->run(SyncTrigger::ckpt, curTick()); - - // Save the periodic multi sync status - syncEvent->serialize(base, cp); - - unsigned n_rx_packets = recvQueue.size(); - if (scheduledRecvPacket != nullptr) - n_rx_packets++; - - paramOut(cp, base + ".nRxPackets", n_rx_packets); - - if (n_rx_packets > 0) { - assert(recvDone->scheduled()); - scheduledRecvPacket->serialize(base + ".rxPacket[0]", cp); - } - - for (unsigned i=1; i < n_rx_packets; i++) { - const RecvInfo recv_info = recvQueue.impl().at(i-1); - recv_info.first->serialize(base + csprintf(".rxPacket[%d]", i), cp); - Tick rx_tick = recv_info.second; - paramOut(cp, base + csprintf(".rxTick[%d]", i), rx_tick); - } -} - -void MultiIface::unserialize(const std::string &base, CheckpointIn &cp) -{ - assert(recvQueue.size() == 0); - assert(scheduledRecvPacket == nullptr); - assert(recvDone->scheduled() == false); - - // restore periodic sync info - syncEvent->unserialize(base, cp); - - unsigned n_rx_packets; - paramIn(cp, base + ".nRxPackets", n_rx_packets); - - if (n_rx_packets > 0) { - scheduledRecvPacket = std::make_shared<EthPacketData>(16384); - scheduledRecvPacket->unserialize(base + ".rxPacket[0]", cp); - // Note: receive event will be scheduled when the link is unserialized - } - - for (unsigned i=1; i < n_rx_packets; i++) { - EthPacketPtr rx_packet = std::make_shared<EthPacketData>(16384); - rx_packet->unserialize(base + csprintf(".rxPacket[%d]", i), cp); - Tick rx_tick = 0; - paramIn(cp, base + csprintf(".rxTick[%d]", i), rx_tick); - assert(rx_tick > 0); - recvQueue.emplace(rx_packet,rx_tick); - } -} - -void MultiIface::initRandom() -{ - // Initialize the seed for random generator to avoid the same sequence - // in all gem5 peer processes - assert(master != nullptr); - if (this == master) - random_mt.init(5489 * (rank+1) + 257); -} - -void MultiIface::startPeriodicSync() -{ - DPRINTF(MultiEthernet, "MultiIface:::initPeriodicSync started\n"); - // Do a global sync here to ensure that peer gem5 processes are around - // (actually this may not be needed...) - sync->run(SyncTrigger::atomic, curTick()); - - // Start the periodic sync if it is a fresh simulation from scratch - if (curTick() == 0) { - if (this == master) { - syncEvent->start(syncStart, syncRepeat); - inform("Multi synchronisation activated: start at %lld, " - "repeat at every %lld ticks.\n", - syncStart, syncRepeat); - } else { - // In case another multiIface object requires different schedule - // for periodic sync than the master does. - syncEvent->adjust(syncStart, syncRepeat); - } - } else { - // Schedule the next periodic sync if resuming from a checkpoint - if (this == master) - syncEvent->resume(); - } - DPRINTF(MultiEthernet, "MultiIface::initPeriodicSync done\n"); -} diff --git a/src/dev/net/multi_iface.hh b/src/dev/net/multi_iface.hh deleted file mode 100644 index f8ce2abf7..000000000 --- a/src/dev/net/multi_iface.hh +++ /dev/null @@ -1,492 +0,0 @@ -/* - * Copyright (c) 2015 ARM Limited - * All rights reserved - * - * The license below extends only to copyright in the software and shall - * not be construed as granting a license to any other intellectual - * property including but not limited to intellectual property relating - * to a hardware implementation of the functionality of the software - * licensed hereunder. You may use the software subject to the license - * terms below provided that you ensure that this notice is replicated - * unmodified and in its entirety in all distributions of the software, - * modified or unmodified, in source code or in binary form. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Authors: Gabor Dozsa - */ - -/* @file - * The interface class for multi gem5 simulations. - * - * Multi gem5 is an extension to gem5 to enable parallel simulation of a - * distributed system (e.g. simulation of a pool of machines - * connected by Ethernet links). A multi gem5 run consists of seperate gem5 - * processes running in parallel. Each gem5 process executes - * the simulation of a component of the simulated distributed system. - * (An example component can be a multi-core board with an Ethernet NIC.) - * The MultiIface class below provides services to transfer data and - * control messages among the gem5 processes. The main such services are - * as follows. - * - * 1. Send a data packet coming from a simulated Ethernet link. The packet - * will be transferred to (all) the target(s) gem5 processes. The send - * operation is always performed by the simulation thread, i.e. the gem5 - * thread that is processing the event queue associated with the simulated - * Ethernet link. - * - * 2. Spawn a receiver thread to process messages coming in from the - * from other gem5 processes. Each simulated Ethernet link has its own - * associated receiver thread. The receiver thread saves the incoming packet - * and schedule an appropriate receive event in the event queue. - * - * 3. Schedule a global barrier event periodically to keep the gem5 - * processes in sync. - * Periodic barrier event to keep peer gem5 processes in sync. The basic idea - * is that no gem5 process can go ahead further than the simulated link - * transmission delay to ensure that a corresponding receive event can always - * be scheduled for any message coming in from a peer gem5 process. - * - * - * - * This interface is an abstract class (sendRaw() and recvRaw() - * methods are pure virtual). It can work with various low level - * send/receive service implementations (e.g. TCP/IP, MPI,...). A TCP - * stream socket version is implemented in dev/src/tcp_iface.[hh,cc]. - */ -#ifndef __DEV_NET_MULTI_IFACE_HH__ -#define __DEV_NET_MULTI_IFACE_HH__ - -#include <array> -#include <mutex> -#include <queue> -#include <thread> -#include <utility> - -#include "dev/net/etherpkt.hh" -#include "dev/net/multi_packet.hh" -#include "sim/core.hh" -#include "sim/drain.hh" -#include "sim/global_event.hh" - -class EventManager; - -/** - * The interface class to talk to peer gem5 processes. - */ -class MultiIface : public Drainable -{ - public: - /*! - * The possible reasons a multi sync among gem5 peers is needed for. - */ - enum - class SyncTrigger { - periodic, /*!< Regular periodic sync. This can be interrupted by a - checkpoint sync request */ - ckpt, /*!< sync before taking a checkpoint */ - atomic /*!< sync that cannot be interrupted (e.g. sync at startup) */ - }; - - private: - typedef MultiHeaderPkt::MsgType MsgType; - - /** Sync State-Machine - \dot - digraph Sync { - node [shape=box, fontsize=10]; - idle -> busy - [ label="new trigger\n by run()" fontsize=8 ]; - busy -> busy - [ label="new message by progress():\n(msg == SyncAck &&\nwaitNum > 1) || \n(msg==CkptSyncReq &&\ntrigger == ckpt)" fontsize=8 ]; - busy -> idle - [ label="new message by progress():\n(msg == SyncAck &&\nwaitNum == 1)" fontsize=8 ]; - busy -> interrupted - [ label="new message by progress():\n(msg == CkptSyncReq &&\ntrigger == periodic)" fontsize=8 ]; - idle -> asyncCkpt - [ label="new message by progress():\nmsg == CkptSyncReq" fontsize=8 ]; - asyncCkpt -> asyncCkpt - [ label="new message by progress():\nmsg == CkptSyncReq" fontsize=8 ]; - asyncCkpt -> busy - [ label="new trigger by run():\ntrigger == ckpt" fontsize=8 ]; - asyncCkpt -> idle - [ label="new trigger by run():\n(trigger == periodic &&\nwaitNum == 0) " fontsize=8 ]; - asyncCkpt -> interrupted - [ label="new trigger by run():\n(trigger == periodic &&\nwaitNum > 0) " fontsize=8 ]; - interrupted -> interrupted - [ label="new message by progress():\n(msg == CkptSyncReq &&\nwaitNum > 1)" fontsize=8 ]; - interrupted -> idle - [ label="new message by progress():\n(msg == CkptSyncReq &&\nwaitNum == 1)" fontsize=8 ]; - } - \enddot - */ - /** @class Sync - * This class implements global sync operations among gem5 peer processes. - * - * @note This class is used as a singleton object (shared by all MultiIface - * objects). - */ - class Sync - { - private: - /*! - * Internal state of the sync singleton object. - */ - enum class SyncState { - busy, /*!< There is an on-going sync. */ - interrupted, /*!< An on-going periodic sync was interrupted. */ - asyncCkpt, /*!< A checkpoint (sim_exit) is already scheduled */ - idle /*!< There is no active sync. */ - }; - /** - * The lock to protect access to the MultiSync object. - */ - std::mutex lock; - /** - * Condition variable for the simulation thread to wait on - * until all receiver threads completes the current global - * synchronisation. - */ - std::condition_variable cv; - /** - * Number of receiver threads that not yet completed the current global - * synchronisation. - */ - unsigned waitNum; - /** - * The trigger for the most recent sync. - */ - SyncTrigger trigger; - /** - * Map sync triggers to request messages. - */ - std::array<MsgType, 3> triggerToMsg = {{ - MsgType::cmdPeriodicSyncReq, - MsgType::cmdCkptSyncReq, - MsgType::cmdAtomicSyncReq - }}; - - /** - * Current sync state. - */ - SyncState state; - - public: - /** - * Core method to perform a full multi sync. - * - * @param t Sync trigger. - * @param sync_tick The tick the sync was expected to happen at. - * @return true if the sync completed, false if it was interrupted. - * - * @note In case of an interrupted periodic sync, sync_tick can be less - * than curTick() when we resume (i.e. re-run) it - */ - bool run(SyncTrigger t, Tick sync_tick); - /** - * Callback when the receiver thread gets a sync message. - */ - void progress(MsgType m); - - Sync() : waitNum(0), state(SyncState::idle) {} - ~Sync() {} - }; - - - /** - * The global event to schedule peridic multi sync. It is used as a - * singleton object. - * - * The periodic synchronisation works as follows. - * 1. A MultisyncEvent is scheduled as a global event when startup() is - * called. - * 2. The progress() method of the MultisyncEvent initiates a new barrier - * for each simulated Ethernet links. - * 3. Simulation thread(s) then waits until all receiver threads - * completes the ongoing barrier. The global sync event is done. - */ - class SyncEvent : public GlobalSyncEvent - { - public: - /** - * Flag to indicate that the most recent periodic sync was interrupted - * (by a checkpoint request). - */ - bool interrupted; - /** - * The tick when the most recent periodic synchronisation was scheduled - * at. - */ - Tick scheduledAt; - /** - * Flag to indicate an on-going drain cycle. - */ - bool isDraining; - - public: - /** - * Only the firstly instanstiated MultiIface object will - * call this constructor. - */ - SyncEvent() : GlobalSyncEvent(Default_Pri, 0), interrupted(false), - scheduledAt(0), isDraining(false) {} - - ~SyncEvent() { assert (scheduled() == false); } - /** - * Schedule the first periodic sync event. - * - * @param start Start tick for multi synchronisation - * @param repeat Frequency of multi synchronisation - * - */ - void start(Tick start, Tick repeat); - /** - * Reschedule (if necessary) the periodic sync event. - * - * @param start Start tick for multi synchronisation - * @param repeat Frequency of multi synchronisation - * - * @note Useful if we have multiple MultiIface objects with - * different 'start' and 'repeat' values for global sync. - */ - void adjust(Tick start, Tick repeat); - /** - * This is a global event so process() will be called by each - * simulation threads. (See further comments in the .cc file.) - */ - void process() override; - /** - * Schedule periodic sync when resuming from a checkpoint. - */ - void resume(); - - void serialize(const std::string &base, CheckpointOut &cp) const; - void unserialize(const std::string &base, CheckpointIn &cp); - }; - - /** - * The receive thread needs to store the packet pointer and the computed - * receive tick for each incoming data packet. This information is used - * by the simulation thread when it processes the corresponding receive - * event. (See more comments at the implemetation of the recvThreadFunc() - * and RecvPacketIn() methods.) - */ - typedef std::pair<EthPacketPtr, Tick> RecvInfo; - - /** - * Comparison predicate for RecvInfo, needed by the recvQueue. - */ - struct RecvInfoCompare { - bool operator()(const RecvInfo &lhs, const RecvInfo &rhs) - { - return lhs.second > rhs.second; - } - }; - - /** - * Customized priority queue used to store incoming data packets info by - * the receiver thread. We need to expose the underlying container to - * enable iterator access for serializing. - */ - class RecvQueue : public std::priority_queue<RecvInfo, - std::vector<RecvInfo>, - RecvInfoCompare> - { - public: - std::vector<RecvInfo> &impl() { return c; } - const std::vector<RecvInfo> &impl() const { return c; } - }; - - /* - * The priority queue to store RecvInfo items ordered by receive ticks. - */ - RecvQueue recvQueue; - /** - * The singleton Sync object to perform multi synchronisation. - */ - static Sync *sync; - /** - * The singleton SyncEvent object to schedule periodic multi sync. - */ - static SyncEvent *syncEvent; - /** - * Tick to schedule the first multi sync event. - * This is just as optimization : we do not need any multi sync - * event until the simulated NIC is brought up by the OS. - */ - Tick syncStart; - /** - * Frequency of multi sync events in ticks. - */ - Tick syncRepeat; - /** - * Receiver thread pointer. - * Each MultiIface object must have exactly one receiver thread. - */ - std::thread *recvThread; - /** - * The event manager associated with the MultiIface object. - */ - EventManager *eventManager; - - /** - * The receive done event for the simulated Ethernet link. - * It is scheduled by the receiver thread for each incoming data - * packet. - */ - Event *recvDone; - - /** - * The packet that belongs to the currently scheduled recvDone event. - */ - EthPacketPtr scheduledRecvPacket; - - /** - * The link delay in ticks for the simulated Ethernet link. - */ - Tick linkDelay; - - /** - * The rank of this process among the gem5 peers. - */ - unsigned rank; - /** - * Total number of receiver threads (in this gem5 process). - * During the simulation it should be constant and equal to the - * number of MultiIface objects (i.e. simulated Ethernet - * links). - */ - static unsigned recvThreadsNum; - /** - * The very first MultiIface object created becomes the master. We need - * a master to co-ordinate the global synchronisation. - */ - static MultiIface *master; - - protected: - /** - * Low level generic send routine. - * @param buf buffer that holds the data to send out - * @param length number of bytes to send - * @param dest_addr address of the target (simulated NIC). This may be - * used by a subclass for optimization (e.g. optimize broadcast) - */ - virtual void sendRaw(void *buf, - unsigned length, - const MultiHeaderPkt::AddressType dest_addr) = 0; - /** - * Low level generic receive routine. - * @param buf the buffer to store the incoming message - * @param length buffer size (in bytes) - */ - virtual bool recvRaw(void *buf, unsigned length) = 0; - /** - * Low level request for synchronisation among gem5 processes. Only one - * MultiIface object needs to call this (in each gem5 process) to trigger - * a multi sync. - * - * @param sync_req Sync request command. - * @param sync_tick The tick when sync is expected to happen in the sender. - */ - virtual void syncRaw(MsgType sync_req, Tick sync_tick) = 0; - /** - * The function executed by a receiver thread. - */ - void recvThreadFunc(); - /** - * Receive a multi header packet. Called by the receiver thread. - * @param header the structure to store the incoming header packet. - * @return false if any error occured during the receive, true otherwise - * - * A header packet can carry a control command (e.g. 'barrier leave') or - * information about a data packet that is following the header packet - * back to back. - */ - bool recvHeader(MultiHeaderPkt::Header &header); - /** - * Receive a data packet. Called by the receiver thread. - * @param data_header The packet descriptor for the expected incoming data - * packet. - */ - void recvData(const MultiHeaderPkt::Header &data_header); - - public: - - /** - * ctor - * @param multi_rank Rank of this gem5 process within the multi run - * @param sync_start Start tick for multi synchronisation - * @param sync_repeat Frequency for multi synchronisation - * @param em The event manager associated with the simulated Ethernet link - */ - MultiIface(unsigned multi_rank, - Tick sync_start, - Tick sync_repeat, - EventManager *em); - - virtual ~MultiIface(); - /** - * Send out an Ethernet packet. - * @param pkt The Ethernet packet to send. - * @param send_delay The delay in ticks for the send completion event. - */ - void packetOut(EthPacketPtr pkt, Tick send_delay); - /** - * Fetch the next packet from the receive queue. - */ - EthPacketPtr packetIn(); - - /** - * spawn the receiver thread. - * @param recv_done The receive done event associated with the simulated - * Ethernet link. - * @param link_delay The link delay for the simulated Ethernet link. - */ - void spawnRecvThread(Event *recv_done, - Tick link_delay); - /** - * Initialize the random number generator with a different seed in each - * peer gem5 process. - */ - void initRandom(); - - DrainState drain() override; - - /** - * Callback when draining is complete. - */ - void drainDone(); - - /** - * Initialize the periodic synchronisation among peer gem5 processes. - */ - void startPeriodicSync(); - - void serialize(const std::string &base, CheckpointOut &cp) const; - void unserialize(const std::string &base, CheckpointIn &cp); - -}; - - -#endif // __DEV_NET_MULTI_IFACE_HH__ diff --git a/src/dev/net/multi_packet.cc b/src/dev/net/multi_packet.cc deleted file mode 100644 index 85f76b0c4..000000000 --- a/src/dev/net/multi_packet.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2015 ARM Limited - * All rights reserved - * - * The license below extends only to copyright in the software and shall - * not be construed as granting a license to any other intellectual - * property including but not limited to intellectual property relating - * to a hardware implementation of the functionality of the software - * licensed hereunder. You may use the software subject to the license - * terms below provided that you ensure that this notice is replicated - * unmodified and in its entirety in all distributions of the software, - * modified or unmodified, in source code or in binary form. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Authors: Gabor Dozsa - */ - -/* @file - * MultiHeaderPkt class to encapsulate multi-gem5 header packets - * - */ - -#include "dev/net/multi_packet.hh" - -#include <cstdint> -#include <cstring> - -#include "base/inet.hh" - -unsigned -MultiHeaderPkt::maxAddressLength() -{ - return sizeof(AddressType); -} - -void -MultiHeaderPkt::clearAddress(AddressType &addr) -{ - std::memset(addr, 0, sizeof(addr)); -} - -bool -MultiHeaderPkt::isAddressEqual(const AddressType &addr1, - const AddressType &addr2) -{ - return (std::memcmp(addr1, addr2, sizeof(addr1)) == 0); -} - -bool -MultiHeaderPkt::isAddressLess(const AddressType &addr1, - const AddressType &addr2) -{ - return (std::memcmp(addr1, addr2, sizeof(addr1)) < 0); -} - -void -MultiHeaderPkt::copyAddress(AddressType &dest, const AddressType &src) -{ - std::memcpy(dest, src, sizeof(dest)); -} - -bool -MultiHeaderPkt::isBroadcastAddress(const AddressType &addr) -{ - return ((Net::EthAddr *)&addr)->broadcast(); -} - -bool -MultiHeaderPkt::isMulticastAddress(const AddressType &addr) -{ - return ((Net::EthAddr *)&addr)->multicast(); -} - -bool -MultiHeaderPkt::isUnicastAddress(const AddressType &addr) -{ - return ((Net::EthAddr *)&addr)->unicast(); -} diff --git a/src/dev/net/tcp_iface.cc b/src/dev/net/tcp_iface.cc index 035ec8fd0..38fc7aef2 100644 --- a/src/dev/net/tcp_iface.cc +++ b/src/dev/net/tcp_iface.cc @@ -35,25 +35,35 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Authors: Gabor Dozsa + * Mohammad Alian */ /* @file - * TCP stream socket based interface class implementation for multi gem5 runs. + * TCP stream socket based interface class implementation for dist-gem5 runs. */ #include "dev/net/tcp_iface.hh" #include <arpa/inet.h> #include <netdb.h> +#include <netinet/tcp.h> #include <sys/socket.h> #include <sys/types.h> #include <unistd.h> #include <cerrno> #include <cstring> +#include <vector> #include "base/types.hh" -#include "debug/MultiEthernet.hh" +#include "debug/DistEthernet.hh" +#include "debug/DistEthernetCmd.hh" +#include "sim/sim_exit.hh" + +#if defined(__FreeBSD__) +#include <netinet/in.h> + +#endif // MSG_NOSIGNAL does not exists on OS X #if defined(__APPLE__) || defined(__MACH__) @@ -64,42 +74,181 @@ using namespace std; +std::vector<std::pair<TCPIface::NodeInfo, int> > TCPIface::nodes; vector<int> TCPIface::sockRegistry; +int TCPIface::fdStatic = -1; +bool TCPIface::anyListening = false; TCPIface::TCPIface(string server_name, unsigned server_port, - unsigned multi_rank, Tick sync_start, Tick sync_repeat, - EventManager *em) : - MultiIface(multi_rank, sync_start, sync_repeat, em) + unsigned dist_rank, unsigned dist_size, + Tick sync_start, Tick sync_repeat, + EventManager *em, bool is_switch, int num_nodes) : + DistIface(dist_rank, dist_size, sync_start, sync_repeat, em, + is_switch, num_nodes), serverName(server_name), + serverPort(server_port), isSwitch(is_switch), listening(false) { - struct addrinfo addr_hint, *addr_results; + if (is_switch && isMaster) { + while (!listen(serverPort)) { + DPRINTF(DistEthernet, "TCPIface(listen): Can't bind port %d\n", + serverPort); + serverPort++; + } + inform("tcp_iface listening on port %d", serverPort); + // Now accept the first connection requests from each compute node and + // store the node info. The compute nodes will then wait for ack + // messages. Ack messages will be sent by initTransport() in the + // appropriate order to make sure that every compute node is always + // connected to the same switch port. + NodeInfo ni; + for (int i = 0; i < size; i++) { + accept(); + DPRINTF(DistEthernet, "First connection, waiting for link info\n"); + if (!recvTCP(sock, &ni, sizeof(ni))) + panic("Failed to receive link info"); + nodes.push_back(make_pair(ni, sock)); + } + } +} + +bool +TCPIface::listen(int port) +{ + if (listening) + panic("Socket already listening!"); + + struct sockaddr_in sockaddr; int ret; - string port_str = to_string(server_port); + fdStatic = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + panic_if(fdStatic < 0, "socket() failed: %s", strerror(errno)); - sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); - panic_if(sock < 0, "socket() failed: %s", strerror(errno)); + sockaddr.sin_family = PF_INET; + sockaddr.sin_addr.s_addr = INADDR_ANY; + sockaddr.sin_port = htons(port); + // finally clear sin_zero + memset(&sockaddr.sin_zero, 0, sizeof(sockaddr.sin_zero)); + ret = ::bind(fdStatic, (struct sockaddr *)&sockaddr, sizeof (sockaddr)); - bzero(&addr_hint, sizeof(addr_hint)); - addr_hint.ai_family = AF_INET; - addr_hint.ai_socktype = SOCK_STREAM; - addr_hint.ai_protocol = IPPROTO_TCP; + if (ret != 0) { + if (ret == -1 && errno != EADDRINUSE) + panic("ListenSocket(listen): bind() failed!"); + return false; + } - ret = getaddrinfo(server_name.c_str(), port_str.c_str(), - &addr_hint, &addr_results); - panic_if(ret < 0, "getaddrinf() failed: %s", strerror(errno)); + if (::listen(fdStatic, 24) == -1) { + if (errno != EADDRINUSE) + panic("ListenSocket(listen): listen() failed!"); - DPRINTF(MultiEthernet, "Connecting to %s:%u\n", - server_name.c_str(), port_str.c_str()); + return false; + } - ret = ::connect(sock, (struct sockaddr *)(addr_results->ai_addr), - addr_results->ai_addrlen); - panic_if(ret < 0, "connect() failed: %s", strerror(errno)); + listening = true; + anyListening = true; + return true; +} - freeaddrinfo(addr_results); - // add our socket to the static registry +void +TCPIface::establishConnection() +{ + static unsigned cur_rank = 0; + static unsigned cur_id = 0; + NodeInfo ni; + + if (isSwitch) { + if (cur_id == 0) { // first connection accepted in the ctor already + auto const &iface0 = + find_if(nodes.begin(), nodes.end(), + [](const pair<NodeInfo, int> &cn) -> bool { + return cn.first.rank == cur_rank; + }); + assert(iface0 != nodes.end()); + assert(iface0->first.distIfaceId == 0); + sock = iface0->second; + ni = iface0->first; + } else { // additional connections from the same compute node + accept(); + DPRINTF(DistEthernet, "Next connection, waiting for link info\n"); + if (!recvTCP(sock, &ni, sizeof(ni))) + panic("Failed to receive link info"); + assert(ni.rank == cur_rank); + assert(ni.distIfaceId == cur_id); + } + inform("Link okay (iface:%d -> (node:%d, iface:%d))", + distIfaceId, ni.rank, ni.distIfaceId); + if (ni.distIfaceId < ni.distIfaceNum - 1) { + cur_id++; + } else { + cur_rank++; + cur_id = 0; + } + // send ack + ni.distIfaceId = distIfaceId; + ni.distIfaceNum = distIfaceNum; + sendTCP(sock, &ni, sizeof(ni)); + } else { // this is not a switch + connect(); + // send link info + ni.rank = rank; + ni.distIfaceId = distIfaceId; + ni.distIfaceNum = distIfaceNum; + sendTCP(sock, &ni, sizeof(ni)); + DPRINTF(DistEthernet, "Connected, waiting for ack (distIfaceId:%d\n", + distIfaceId); + if (!recvTCP(sock, &ni, sizeof(ni))) + panic("Failed to receive ack"); + assert(ni.rank == rank); + inform("Link okay (iface:%d -> switch iface:%d)", distIfaceId, + ni.distIfaceId); + } sockRegistry.push_back(sock); - // let the server know who we are - sendTCP(sock, &multi_rank, sizeof(multi_rank)); +} + +void +TCPIface::accept() +{ + struct sockaddr_in sockaddr; + socklen_t slen = sizeof (sockaddr); + sock = ::accept(fdStatic, (struct sockaddr *)&sockaddr, &slen); + if (sock != -1) { + int i = 1; + if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *)&i, + sizeof(i)) < 0) + warn("ListenSocket(accept): setsockopt() TCP_NODELAY failed!"); + } +} + +void +TCPIface::connect() +{ + struct addrinfo addr_hint, *addr_results; + int ret; + + string port_str = to_string(serverPort); + + sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + panic_if(sock < 0, "socket() failed: %s", strerror(errno)); + + int fl = 1; + if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *)&fl, sizeof(fl)) < 0) + warn("ConnectSocket(connect): setsockopt() TCP_NODELAY failed!"); + + bzero(&addr_hint, sizeof(addr_hint)); + addr_hint.ai_family = AF_INET; + addr_hint.ai_socktype = SOCK_STREAM; + addr_hint.ai_protocol = IPPROTO_TCP; + + ret = getaddrinfo(serverName.c_str(), port_str.c_str(), + &addr_hint, &addr_results); + panic_if(ret < 0, "getaddrinf() failed: %s", strerror(errno)); + + DPRINTF(DistEthernet, "Connecting to %s:%s\n", + serverName.c_str(), port_str.c_str()); + + ret = ::connect(sock, (struct sockaddr *)(addr_results->ai_addr), + addr_results->ai_addrlen); + panic_if(ret < 0, "connect() failed: %s", strerror(errno)); + + freeaddrinfo(addr_results); } TCPIface::~TCPIface() @@ -111,12 +260,20 @@ TCPIface::~TCPIface() } void -TCPIface::sendTCP(int sock, void *buf, unsigned length) +TCPIface::sendTCP(int sock, const void *buf, unsigned length) { ssize_t ret; ret = ::send(sock, buf, length, MSG_NOSIGNAL); - panic_if(ret < 0, "send() failed: %s", strerror(errno)); + if (ret < 0) { + if (errno == ECONNRESET || errno == EPIPE) { + inform("send(): %s", strerror(errno)); + exit_message("info", 0, "Message server closed connection, " + "simulation is exiting"); + } else { + panic("send() failed: %s", strerror(errno)); + } + } panic_if(ret != length, "send() failed"); } @@ -140,19 +297,47 @@ TCPIface::recvTCP(int sock, void *buf, unsigned length) } void -TCPIface::syncRaw(MultiHeaderPkt::MsgType sync_req, Tick sync_tick) +TCPIface::sendPacket(const Header &header, const EthPacketPtr &packet) +{ + sendTCP(sock, &header, sizeof(header)); + sendTCP(sock, packet->data, packet->length); +} + +void +TCPIface::sendCmd(const Header &header) { - /* - * Barrier is simply implemented by point-to-point messages to the server - * for now. This method is called by only one TCPIface object. - * The server will send back an 'ack' message when it gets the - * sync request from all clients. - */ - MultiHeaderPkt::Header header_pkt; - header_pkt.msgType = sync_req; - header_pkt.sendTick = sync_tick; - - for (auto s : sockRegistry) - sendTCP(s, (void *)&header_pkt, sizeof(header_pkt)); + DPRINTF(DistEthernetCmd, "TCPIface::sendCmd() type: %d\n", + static_cast<int>(header.msgType)); + // Global commands (i.e. sync request) are always sent by the master + // DistIface. The transfer method is simply implemented as point-to-point + // messages for now + for (auto s: sockRegistry) + sendTCP(s, (void*)&header, sizeof(header)); } +bool +TCPIface::recvHeader(Header &header) +{ + bool ret = recvTCP(sock, &header, sizeof(header)); + DPRINTF(DistEthernetCmd, "TCPIface::recvHeader() type: %d ret: %d\n", + static_cast<int>(header.msgType), ret); + return ret; +} + +void +TCPIface::recvPacket(const Header &header, EthPacketPtr &packet) +{ + packet = make_shared<EthPacketData>(header.dataPacketLength); + bool ret = recvTCP(sock, packet->data, header.dataPacketLength); + panic_if(!ret, "Error while reading socket"); + packet->length = header.dataPacketLength; +} + +void +TCPIface::initTransport() +{ + // We cannot setup the conections in the constructor because the number + // of dist interfaces (per process) is unknown until the (simobject) init + // phase. That information is necessary for global connection ordering. + establishConnection(); +} diff --git a/src/dev/net/tcp_iface.hh b/src/dev/net/tcp_iface.hh index 2eb2c1c07..97ae9cde0 100644 --- a/src/dev/net/tcp_iface.hh +++ b/src/dev/net/tcp_iface.hh @@ -35,17 +35,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Authors: Gabor Dozsa + * Mohammad Alian */ /* @file - * TCP stream socket based interface class for multi gem5 runs. + * TCP stream socket based interface class for dist-gem5 runs. * - * For a high level description about multi gem5 see comments in - * header file multi_iface.hh. + * For a high level description about dist-gem5 see comments in + * header file dist_iface.hh. * - * The TCP subclass of MultiIface uses a separate server process - * (see tcp_server.[hh,cc] under directory gem5/util/multi). Each gem5 - * process connects to the server via a stream socket. The server process + * Each gem5 process connects to the server (another gem5 process which + * simulates a switch box) via a stream socket. The server process * transfers messages and co-ordinates the synchronisation among the gem5 * peers. */ @@ -55,11 +55,11 @@ #include <string> -#include "dev/net/multi_iface.hh" +#include "dev/net/dist_iface.hh" class EventManager; -class TCPIface : public MultiIface +class TCPIface : public DistIface { private: /** @@ -67,8 +67,28 @@ class TCPIface : public MultiIface */ int sock; + std::string serverName; + int serverPort; + + bool isSwitch; + + bool listening; + static bool anyListening; + static int fdStatic; + + /** + * Compute node info and storage for the very first connection from each + * node (used by the switch) + */ + struct NodeInfo + { + unsigned rank; + unsigned distIfaceId; + unsigned distIfaceNum; + }; + static std::vector<std::pair<NodeInfo, int> > nodes; /** - * Registry for all sockets to the server opened by this gem5 process. + * Storage for all opened sockets */ static std::vector<int> sockRegistry; @@ -82,7 +102,7 @@ class TCPIface : public MultiIface * @param length Size of the message in bytes. */ void - sendTCP(int sock, void *buf, unsigned length); + sendTCP(int sock, const void *buf, unsigned length); /** * Receive the next incoming message through a TCP stream socket. @@ -92,24 +112,26 @@ class TCPIface : public MultiIface * @param length Exact size of the expected message in bytes. */ bool recvTCP(int sock, void *buf, unsigned length); - + bool listen(int port); + void accept(); + void connect(); + int getfdStatic() const { return fdStatic; } + bool islistening() const { return listening; } + bool anyislistening() const { return anyListening; } + void establishConnection(); protected: - virtual void - sendRaw(void *buf, unsigned length, - const MultiHeaderPkt::AddressType dest_addr=nullptr) override - { - sendTCP(sock, buf, length); - } + void sendPacket(const Header &header, + const EthPacketPtr &packet) override; - virtual bool recvRaw(void *buf, unsigned length) override - { - return recvTCP(sock, buf, length); - } + void sendCmd(const Header &header) override; + + bool recvHeader(Header &header) override; + + void recvPacket(const Header &header, EthPacketPtr &packet) override; - virtual void syncRaw(MultiHeaderPkt::MsgType sync_req, - Tick sync_tick) override; + void initTransport() override; public: /** @@ -118,14 +140,15 @@ class TCPIface : public MultiIface * server process. * @param server_port The port number the server listening for new * connections. - * @param sync_start The tick for the first multi synchronisation. - * @param sync_repeat The frequency of multi synchronisation. + * @param sync_start The tick for the first dist synchronisation. + * @param sync_repeat The frequency of dist synchronisation. * @param em The EventManager object associated with the simulated * Ethernet link. */ TCPIface(std::string server_name, unsigned server_port, - unsigned multi_rank, Tick sync_start, Tick sync_repeat, - EventManager *em); + unsigned dist_rank, unsigned dist_size, + Tick sync_start, Tick sync_repeat, EventManager *em, + bool is_switch, int num_nodes); ~TCPIface() override; }; |