kvm: Don't handle IO and execute in the same tick

We currently execute instructions in the guest and then handle any IO request right after we break out of the virtualized environment. This has the effect of executing IO requests in the exact same tick as the first instruction in the sequence that was just run. There seem to be cases where this simplification upsets some timing-sensitive devices. This changeset splits execute and IO (and other services) across multiple ticks. This is implemented by adding a separate RunningService state to the CPU state machine. When a VM requires service, it enters into this state and pending IO is then serviced in the future instead of immediately. The delay between getting the request and servicing it depends on the number of cycles executed in the guest, which allows other components to catch up with the CPU.
author: Andreas Sandberg <andreas@sandberg.pp.se> 2013-06-11 09:24:51 +0200
committer: Andreas Sandberg <andreas@sandberg.pp.se> 2013-06-11 09:24:51 +0200
commit: 4f002930bc03125b9b886233985993291f5a4730 (patch)
tree: 43d8eb037db7316629ed109add8386ac94d7a5b2 /src/cpu/kvm/base.cc
parent: df059f45a0cbd230ad00f6da24cfc5d228430e16 (diff)
download: gem5-4f002930bc03125b9b886233985993291f5a4730.tar.xz
1 files changed, 312 insertions, 87 deletions
diff --git a/src/cpu/kvm/base.cc b/src/cpu/kvm/base.cc
index 6ffad82d7..3bfe44cf4 100644
--- a/src/cpu/kvm/base.cc
+++ b/src/cpu/kvm/base.cc
@@ -49,6 +49,7 @@
 #include "arch/utility.hh"
 #include "cpu/kvm/base.hh"
 #include "debug/Checkpoint.hh"
+#include "debug/Drain.hh"
 #include "debug/Kvm.hh"
 #include "debug/KvmIO.hh"
 #include "debug/KvmRun.hh"
@@ -56,6 +57,8 @@
 #include "sim/process.hh"
 #include "sim/system.hh"
 
+#include <signal.h>
+
 /* Used by some KVM macros */
 #define PAGE_SIZE pageSize
 
@@ -81,6 +84,7 @@ BaseKvmCPU::BaseKvmCPU(BaseKvmCPUParams *params)
       tickEvent(*this),
       perfControlledByTimer(params->usePerfOverflow),
       hostFactor(params->hostFactor),
+      drainManager(NULL),
       ctrInsts(0)
 {
     if (pageSize == -1)
@@ -94,7 +98,6 @@ BaseKvmCPU::BaseKvmCPU(BaseKvmCPUParams *params)
     threadContexts.push_back(tc);
 
     setupCounters();
-    setupSignalHandler();
 
     if (params->usePerfOverflow)
         runTimer.reset(new PerfKvmTimer(hwCycles,
@@ -151,6 +154,10 @@ BaseKvmCPU::startup()
     // point. Initialize virtual CPUs here instead.
     vcpuFD = vm.createVCPU(vcpuID);
 
+    // Setup signal handlers. This has to be done after the vCPU is
+    // created since it manipulates the vCPU signal mask.
+    setupSignalHandler();
+
     // Map the KVM run structure */
     vcpuMMapSize = kvm.getVCPUMMapSize();
     _kvmRun = (struct kvm_run *)mmap(0, vcpuMMapSize,
@@ -232,9 +239,6 @@ BaseKvmCPU::serializeThread(std::ostream &os, ThreadID tid)
         dump();
     }
 
-    // Update the thread context so we have something to serialize.
-    syncThreadContext();
-
     assert(tid == 0);
     assert(_status == Idle);
     thread->serialize(os);
@@ -258,15 +262,62 @@ BaseKvmCPU::drain(DrainManager *dm)
     if (switchedOut())
         return 0;
 
-    DPRINTF(Kvm, "drain\n");
+    DPRINTF(Drain, "BaseKvmCPU::drain\n");
+    switch (_status) {
+      case Running:
+        // The base KVM code is normally ready when it is in the
+        // Running state, but the architecture specific code might be
+        // of a different opinion. This may happen when the CPU has been
+        // notified of an event that hasn't been accepted by the vCPU
+        // yet.
+        if (!archIsDrained()) {
+            drainManager = dm;
+            return 1;
+        }
+
+        // The state of the CPU is consistent, so we don't need to do
+        // anything special to drain it. We simply de-schedule the
+        // tick event and enter the Idle state to prevent nasty things
+        // like MMIOs from happening.
+        if (tickEvent.scheduled())
+            deschedule(tickEvent);
+        _status = Idle;
 
-    // De-schedule the tick event so we don't insert any more MMIOs
-    // into the system while it is draining.
-    if (tickEvent.scheduled())
-        deschedule(tickEvent);
+        /** FALLTHROUGH */
+      case Idle:
+        // Idle, no need to drain
+        assert(!tickEvent.scheduled());
 
-    _status = Idle;
-    return 0;
+        // Sync the thread context here since we'll need it when we
+        // switch CPUs or checkpoint the CPU.
+        syncThreadContext();
+
+        return 0;
+
+      case RunningServiceCompletion:
+        // The CPU has just requested a service that was handled in
+        // the RunningService state, but the results have still not
+        // been reported to the CPU. Now, we /could/ probably just
+        // update the register state ourselves instead of letting KVM
+        // handle it, but that would be tricky. Instead, we enter KVM
+        // and let it do its stuff.
+        drainManager = dm;
+
+        DPRINTF(Drain, "KVM CPU is waiting for service completion, "
+                "requesting drain.\n");
+        return 1;
+
+      case RunningService:
+        // We need to drain since the CPU is waiting for service (e.g., MMIOs)
+        drainManager = dm;
+
+        DPRINTF(Drain, "KVM CPU is waiting for service, requesting drain.\n");
+        return 1;
+
+      default:
+        panic("KVM: Unhandled CPU state in drain()\n");
+        return 0;
+    }
 }
 
 void
@@ -297,10 +348,6 @@ BaseKvmCPU::switchOut()
 {
     DPRINTF(Kvm, "switchOut\n");
 
-    // Make sure to update the thread context in case, the new CPU
-    // will need to access it.
-    syncThreadContext();
-
     BaseCPU::switchOut();
 
     // We should have drained prior to executing a switchOut, which
@@ -324,9 +371,12 @@ BaseKvmCPU::takeOverFrom(BaseCPU *cpu)
     assert(_status == Idle);
     assert(threadContexts.size() == 1);
 
-    // The BaseCPU updated the thread context, make sure that we
-    // synchronize next time we enter start the CPU.
-    threadContextDirty = true;
+    // Force an update of the KVM state here instead of flagging the
+    // TC as dirty. This is not ideal from a performance point of
+    // view, but it makes debugging easier as it allows meaningful KVM
+    // state to be dumped before and after a takeover.
+    updateKvmState();
+    threadContextDirty = false;
 }
 
 void
@@ -436,25 +486,73 @@ BaseKvmCPU::dump()
 void
 BaseKvmCPU::tick()
 {
-    assert(_status == Running);
-
-    DPRINTF(KvmRun, "Entering KVM...\n");
-
-    Tick ticksToExecute(mainEventQueue.nextTick() - curTick());
-    Tick ticksExecuted(kvmRun(ticksToExecute));
-
-    Tick delay(ticksExecuted + handleKvmExit());
+    Tick delay(0);
+    assert(_status != Idle);
 
     switch (_status) {
-      case Running:
-        schedule(tickEvent, clockEdge(ticksToCycles(delay)));
+      case RunningService:
+        // handleKvmExit() will determine the next state of the CPU
+        delay = handleKvmExit();
+
+        if (tryDrain())
+            _status = Idle;
         break;
 
+      case RunningServiceCompletion:
+      case Running: {
+          Tick ticksToExecute(mainEventQueue.nextTick() - curTick());
+
+          // We might need to update the KVM state.
+          syncKvmState();
+
+          DPRINTF(KvmRun, "Entering KVM...\n");
+          if (drainManager) {
+              // Force an immediate exit from KVM after completing
+              // pending operations. The architecture-specific code
+              // takes care to run until it is in a state where it can
+              // safely be drained.
+              delay = kvmRunDrain();
+          } else {
+              delay = kvmRun(ticksToExecute);
+          }
+
+          // Entering into KVM implies that we'll have to reload the thread
+          // context from KVM if we want to access it. Flag the KVM state as
+          // dirty with respect to the cached thread context.
+          kvmStateDirty = true;
+
+          // Enter into the RunningService state unless the
+          // simulation was stopped by a timer.
+          if (_kvmRun->exit_reason !=  KVM_EXIT_INTR)
+              _status = RunningService;
+          else
+              _status = Running;
+
+          if (tryDrain())
+              _status = Idle;
+      } break;
+
       default:
-        /* The CPU is halted or waiting for an interrupt from a
-         * device. Don't start it. */
-        break;
+        panic("BaseKvmCPU entered tick() in an illegal state (%i)\n",
+              _status);
     }
+
+    // Schedule a new tick if we are still running
+    if (_status != Idle)
+        schedule(tickEvent, clockEdge(ticksToCycles(delay)));
+}
+
+Tick
+BaseKvmCPU::kvmRunDrain()
+{
+    // By default, the only thing we need to drain is a pending IO
+    // operation which assumes that we are in the
+    // RunningServiceCompletion state.
+    assert(_status == RunningServiceCompletion);
+
+    // Deliver the data from the pending IO operation and immediately
+    // exit.
+    return kvmRun(0);
 }
 
 uint64_t
@@ -466,68 +564,91 @@ BaseKvmCPU::getHostCycles() const
 Tick
 BaseKvmCPU::kvmRun(Tick ticks)
 {
-    // We might need to update the KVM state.
-    syncKvmState();
-    // Entering into KVM implies that we'll have to reload the thread
-    // context from KVM if we want to access it. Flag the KVM state as
-    // dirty with respect to the cached thread context.
-    kvmStateDirty = true;
-
-    if (ticks < runTimer->resolution()) {
-        DPRINTF(KvmRun, "KVM: Adjusting tick count (%i -> %i)\n",
-                ticks, runTimer->resolution());
-        ticks = runTimer->resolution();
-    }
-
+    Tick ticksExecuted;
     DPRINTF(KvmRun, "KVM: Executing for %i ticks\n", ticks);
     timerOverflowed = false;
 
-    // Get hardware statistics after synchronizing contexts. The KVM
-    // state update might affect guest cycle counters.
-    uint64_t baseCycles(getHostCycles());
-    uint64_t baseInstrs(hwInstructions.read());
-
-    // Arm the run timer and start the cycle timer if it isn't
-    // controlled by the overflow timer. Starting/stopping the cycle
-    // timer automatically starts the other perf timers as they are in
-    // the same counter group.
-    runTimer->arm(ticks);
-    if (!perfControlledByTimer)
-        hwCycles.start();
-
-    if (ioctl(KVM_RUN) == -1) {
-        if (errno != EINTR)
-            panic("KVM: Failed to start virtual CPU (errno: %i)\n",
-                  errno);
-    }
-
-    runTimer->disarm();
-    if (!perfControlledByTimer)
-        hwCycles.stop();
-
-
-    const uint64_t hostCyclesExecuted(getHostCycles() - baseCycles);
-    const uint64_t simCyclesExecuted(hostCyclesExecuted * hostFactor);
-    const uint64_t instsExecuted(hwInstructions.read() - baseInstrs);
-    const Tick ticksExecuted(runTimer->ticksFromHostCycles(hostCyclesExecuted));
-
-    if (ticksExecuted < ticks &&
-        timerOverflowed &&
-        _kvmRun->exit_reason == KVM_EXIT_INTR) {
-        // TODO: We should probably do something clever here...
-        warn("KVM: Early timer event, requested %i ticks but got %i ticks.\n",
-             ticks, ticksExecuted);
+    if (ticks == 0) {
+        // Setting ticks == 0 is a special case which causes an entry
+        // into KVM that finishes pending operations (e.g., IO) and
+        // then immediately exits.
+        DPRINTF(KvmRun, "KVM: Delivering IO without full guest entry\n");
+
+        // This signal is always masked while we are executing in gem5
+        // and gets unmasked temporarily as soon as we enter into
+        // KVM. See setSignalMask() and setupSignalHandler().
+        raise(KVM_TIMER_SIGNAL);
+
+        // Enter into KVM. KVM will check for signals after completing
+        // pending operations (IO). Since the KVM_TIMER_SIGNAL is
+        // pending, this forces an immediate exit into gem5 again. We
+        // don't bother to setup timers since this shouldn't actually
+        // execute any code in the guest.
+        ioctlRun();
+
+        // We always execute at least one cycle to prevent
+        // BaseKvmCPU::tick() from being rescheduled on the same tick
+        // twice.
+        ticksExecuted = clockPeriod();
+    } else {
+        if (ticks < runTimer->resolution()) {
+            DPRINTF(KvmRun, "KVM: Adjusting tick count (%i -> %i)\n",
+                    ticks, runTimer->resolution());
+            ticks = runTimer->resolution();
+        }
+
+        // Get hardware statistics after synchronizing contexts. The KVM
+        // state update might affect guest cycle counters.
+        uint64_t baseCycles(getHostCycles());
+        uint64_t baseInstrs(hwInstructions.read());
+
+        // Arm the run timer and start the cycle timer if it isn't
+        // controlled by the overflow timer. Starting/stopping the cycle
+        // timer automatically starts the other perf timers as they are in
+        // the same counter group.
+        runTimer->arm(ticks);
+        if (!perfControlledByTimer)
+            hwCycles.start();
+
+        ioctlRun();
+
+        runTimer->disarm();
+        if (!perfControlledByTimer)
+            hwCycles.stop();
+
+        // The timer signal may have been delivered after we exited
+        // from KVM. It will be pending in that case since it is
+        // masked when we aren't executing in KVM. Discard it to make
+        // sure we don't deliver it immediately next time we try to
+        // enter into KVM.
+        discardPendingSignal(KVM_TIMER_SIGNAL);
+
+        const uint64_t hostCyclesExecuted(getHostCycles() - baseCycles);
+        const uint64_t simCyclesExecuted(hostCyclesExecuted * hostFactor);
+        const uint64_t instsExecuted(hwInstructions.read() - baseInstrs);
+        ticksExecuted = runTimer->ticksFromHostCycles(hostCyclesExecuted);
+
+        if (ticksExecuted < ticks &&
+            timerOverflowed &&
+            _kvmRun->exit_reason == KVM_EXIT_INTR) {
+            // TODO: We should probably do something clever here...
+            warn("KVM: Early timer event, requested %i ticks but got %i ticks.\n",
+                 ticks, ticksExecuted);
+        }
+
+        /* Update statistics */
+        numCycles += simCyclesExecuted;;
+        numInsts += instsExecuted;
+        ctrInsts += instsExecuted;
+        system->totalNumInsts += instsExecuted;
+
+        DPRINTF(KvmRun,
+                "KVM: Executed %i instructions in %i cycles "
+                "(%i ticks, sim cycles: %i).\n",
+                instsExecuted, hostCyclesExecuted, ticksExecuted, simCyclesExecuted);
     }
 
-    /* Update statistics */
-    numCycles += simCyclesExecuted;;
     ++numVMExits;
-    numInsts += instsExecuted;
-    ctrInsts += instsExecuted;
-    system->totalNumInsts += instsExecuted;
-
-    DPRINTF(KvmRun, "KVM: Executed %i instructions in %i cycles (%i ticks, sim cycles: %i).\n",
-            instsExecuted, hostCyclesExecuted, ticksExecuted, simCyclesExecuted);
 
     return ticksExecuted + flushCoalescedMMIO();
 }
@@ -700,7 +821,11 @@ Tick
 BaseKvmCPU::handleKvmExit()
 {
     DPRINTF(KvmRun, "handleKvmExit (exit_reason: %i)\n", _kvmRun->exit_reason);
+    assert(_status == RunningService);
 
+    // Switch into the running state by default. Individual handlers
+    // can override this.
+    _status = Running;
     switch (_kvmRun->exit_reason) {
       case KVM_EXIT_UNKNOWN:
         return handleKvmExitUnknown();
@@ -709,6 +834,7 @@ BaseKvmCPU::handleKvmExit()
         return handleKvmExitException();
 
       case KVM_EXIT_IO:
+        _status = RunningServiceCompletion;
         ++numIO;
         return handleKvmExitIO();
 
@@ -728,6 +854,7 @@ BaseKvmCPU::handleKvmExit()
         return 0;
 
       case KVM_EXIT_MMIO:
+        _status = RunningServiceCompletion;
         /* Service memory mapped IO requests */
         DPRINTF(KvmIO, "KVM: Handling MMIO (w: %u, addr: 0x%x, len: %u)\n",
                 _kvmRun->mmio.is_write,
@@ -816,6 +943,27 @@ BaseKvmCPU::doMMIOAccess(Addr paddr, void *data, int size, bool write)
     return dataPort.sendAtomic(&pkt);
 }
 
+void
+BaseKvmCPU::setSignalMask(const sigset_t *mask)
+{
+    std::unique_ptr<struct kvm_signal_mask> kvm_mask;
+
+    if (mask) {
+        kvm_mask.reset((struct kvm_signal_mask *)operator new(
+                           sizeof(struct kvm_signal_mask) + sizeof(*mask)));
+        // The kernel and the user-space headers have different ideas
+        // about the size of sigset_t. This seems like a massive hack,
+        // but is actually what qemu does.
+        assert(sizeof(*mask) >= 8);
+        kvm_mask->len = 8;
+        memcpy(kvm_mask->sigset, mask, kvm_mask->len);
+    }
+
+    if (ioctl(KVM_SET_SIGNAL_MASK, (void *)kvm_mask.get()) == -1)
+        panic("KVM: Failed to set vCPU signal mask (errno: %i)\n",
+              errno);
+}
+
 int
 BaseKvmCPU::ioctl(int request, long p1) const
 {
@@ -862,6 +1010,50 @@ BaseKvmCPU::setupSignalHandler()
     sa.sa_flags = SA_SIGINFO | SA_RESTART;
     if (sigaction(KVM_TIMER_SIGNAL, &sa, NULL) == -1)
         panic("KVM: Failed to setup vCPU signal handler\n");
+
+    sigset_t sigset;
+    if (sigprocmask(SIG_BLOCK, NULL, &sigset) == -1)
+        panic("KVM: Failed get signal mask\n");
+
+    // Request KVM to setup the same signal mask as we're currently
+    // running with. We'll sometimes need to mask the KVM_TIMER_SIGNAL
+    // to cause immediate exits from KVM after servicing IO
+    // requests. See kvmRun().
+    setSignalMask(&sigset);
+
+    // Mask the KVM_TIMER_SIGNAL so it isn't delivered unless we're
+    // actually executing inside KVM.
+    sigaddset(&sigset, KVM_TIMER_SIGNAL);
+    if (sigprocmask(SIG_SETMASK, &sigset, NULL) == -1)
+        panic("KVM: Failed mask the KVM timer signal\n");
+}
+
+bool
+BaseKvmCPU::discardPendingSignal(int signum) const
+{
+    int discardedSignal;
+
+    // Setting the timeout to zero causes sigtimedwait to return
+    // immediately.
+    struct timespec timeout;
+    timeout.tv_sec = 0;
+    timeout.tv_nsec = 0;
+
+    sigset_t sigset;
+    sigemptyset(&sigset);
+    sigaddset(&sigset, signum);
+
+    do {
+        discardedSignal = sigtimedwait(&sigset, NULL, &timeout);
+    } while (discardedSignal == -1 && errno == EINTR);
+
+    if (discardedSignal == signum)
+        return true;
+    else if (discardedSignal == -1 && errno == EAGAIN)
+        return false;
+    else
+        panic("Unexpected return value from sigtimedwait: %i (errno: %i)\n",
+              discardedSignal, errno);
 }
 
 void
@@ -893,3 +1085,36 @@ BaseKvmCPU::setupCounters()
                           0, // TID (0 => currentThread)
                           hwCycles);
 }
+
+bool
+BaseKvmCPU::tryDrain()
+{
+    if (!drainManager)
+        return false;
+
+    if (!archIsDrained()) {
+        DPRINTF(Drain, "tryDrain: Architecture code is not ready.\n");
+        return false;
+    }
+
+    if (_status == Idle || _status == Running) {
+        DPRINTF(Drain,
+                "tryDrain: CPU transitioned into the Idle state, drain done\n");
+        drainManager->signalDrainDone();
+        drainManager = NULL;
+        return true;
+    } else {
+        DPRINTF(Drain, "tryDrain: CPU not ready.\n");
+        return false;
+    }
+}
+
+void
+BaseKvmCPU::ioctlRun()
+{
+    if (ioctl(KVM_RUN) == -1) {
+        if (errno != EINTR)
+            panic("KVM: Failed to start virtual CPU (errno: %i)\n",
+                  errno);
+    }
+}
author	Andreas Sandberg <andreas@sandberg.pp.se>	2013-06-11 09:24:51 +0200
committer	Andreas Sandberg <andreas@sandberg.pp.se>	2013-06-11 09:24:51 +0200
commit	4f002930bc03125b9b886233985993291f5a4730 (patch)
tree	43d8eb037db7316629ed109add8386ac94d7a5b2 /src/cpu/kvm/base.cc
parent	df059f45a0cbd230ad00f6da24cfc5d228430e16 (diff)
download	gem5-4f002930bc03125b9b886233985993291f5a4730.tar.xz