src/gpu-compute/tlb_coalescer.hh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231

/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#ifndef __TLB_COALESCER_HH__
#define __TLB_COALESCER_HH__

#include <list>
#include <queue>
#include <string>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/isa.hh"
#include "arch/isa_traits.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/regs/segment.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/TLBCoalescer.hh"

class BaseTLB;
class Packet;
class ThreadContext;

/**
 * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of
 * each TLB. It receives packets and issues coalesced requests to the
 * TLB below it. It controls how requests are coalesced (the rules)
 * and the permitted number of TLB probes per cycle (i.e., how many
 * coalesced requests it feeds the TLB per cycle).
 */
class TLBCoalescer : public MemObject
{
   protected:
    // TLB clock: will inherit clock from shader's clock period in terms
    // of nuber of ticks of curTime (aka global simulation clock)
    // The assignment of TLB clock from shader clock is done in the
    // python config files.
    int clock;

  public:
    typedef TLBCoalescerParams Params;
    TLBCoalescer(const Params *p);
    ~TLBCoalescer() { }

    // Number of TLB probes per cycle. Parameterizable - default 2.
    int TLBProbesPerCycle;

    // Consider coalescing across that many ticks.
    // Paraemterizable - default 1.
    int coalescingWindow;

    // Each coalesced request consists of multiple packets
    // that all fall within the same virtual page
    typedef std::vector<PacketPtr> coalescedReq;

    // disables coalescing when true
    bool disableCoalescing;

    /*
     * This is a hash map with <tick_index> as a key.
     * It contains a vector of coalescedReqs per <tick_index>.
     * Requests are buffered here until they can be issued to
     * the TLB, at which point they are copied to the
     * issuedTranslationsTable hash map.
     *
     * In terms of coalescing, we coalesce requests in a given
     * window of x cycles by using tick_index = issueTime/x as a
     * key, where x = coalescingWindow. issueTime is the issueTime
     * of the pkt from the ComputeUnit's perspective, but another
     * option is to change it to curTick(), so we coalesce based
     * on the receive time.
     */
    typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;

    CoalescingFIFO coalescerFIFO;

    /*
     * issuedTranslationsTabler: a hash_map indexed by virtual page
     * address. Each hash_map entry has a vector of PacketPtr associated
     * with it denoting the different packets that share an outstanding
     * coalesced translation request for the same virtual page.
     *
     * The rules that determine which requests we can coalesce are
     * specified in the canCoalesce() method.
     */
    typedef std::unordered_map<Addr, coalescedReq> CoalescingTable;

    CoalescingTable issuedTranslationsTable;

    // number of packets the coalescer receives
    Stats::Scalar uncoalescedAccesses;
    // number packets the coalescer send to the TLB
    Stats::Scalar coalescedAccesses;

    // Number of cycles the coalesced requests spend waiting in
    // coalescerFIFO. For each packet the coalescer receives we take into
    // account the number of all uncoalesced requests this pkt "represents"
    Stats::Scalar queuingCycles;

    // On average how much time a request from the
    // uncoalescedAccesses that reaches the TLB
    // spends waiting?
    Stats::Scalar localqueuingCycles;
    // localqueuingCycles/uncoalescedAccesses
    Stats::Formula localLatency;

    bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
    void updatePhysAddresses(PacketPtr pkt);
    void regStats();

    // Clock related functions. Maps to-and-from
    // Simulation ticks and object clocks.
    Tick frequency() const { return SimClock::Frequency / clock; }
    Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
    Tick curCycle() const { return curTick() / clock; }
    Tick tickToCycles(Tick val) const { return val / clock;}

    class CpuSidePort : public SlavePort
    {
      public:
        CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
                    PortID _index)
            : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer),
              index(_index) { }

      protected:
        TLBCoalescer *coalescer;
        int index;

        virtual bool recvTimingReq(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt);
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        recvRespRetry()
        {
            fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
        }

        virtual AddrRangeList getAddrRanges() const;
    };

    class MemSidePort : public MasterPort
    {
      public:
        MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
                    PortID _index)
            : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer),
              index(_index) { }

        std::deque<PacketPtr> retries;

      protected:
        TLBCoalescer *coalescer;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt);
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        recvRespRetry()
        {
            fatal("recvRespRetry() not implemented in TLB coalescer");
        }
    };

    // Coalescer slave ports on the cpu Side
    std::vector<CpuSidePort*> cpuSidePort;
    // Coalescer master ports on the memory side
    std::vector<MemSidePort*> memSidePort;

    BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx);
    BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx);

    void processProbeTLBEvent();
    /// This event issues the TLB probes
    EventFunctionWrapper probeTLBEvent;

    void processCleanupEvent();
    /// The cleanupEvent is scheduled after a TLBEvent triggers
    /// in order to free memory and do the required clean-up
    EventFunctionWrapper cleanupEvent;

    // this FIFO queue keeps track of the virt. page
    // addresses that are pending cleanup
    std::queue<Addr> cleanupQueue;
};

#endif // __TLB_COALESCER_HH__