diff --git a/executor/dsdt.asl b/executor/dsdt.asl new file mode 100644 index 00000000..701c08ee --- /dev/null +++ b/executor/dsdt.asl @@ -0,0 +1,322 @@ +// ASL Example: DSDT describing a PCI host bridge (PCI0), four interrupt links (LNKA-LNKD) and four ejectable PCI slots (S01-S04) +DefinitionBlock ( + "dsdt.aml", // Output Filename + "DSDT", // Signature + 0x00, // DSDT Compliance Revision + "BAMM", // OEMID + "JONGE", // TABLE ID + 0x1 // OEM Revision + ) +{ + Scope(\_SB) { + Device(PCI0) { + // The following magic code stands for "PCI Host Bridge" + Name(_HID, EisaId("PNP0A03")) + Name(_ADR, 0) + Name(_UID, 0) + + // Hot Plug Parameters. Optional. + // Linux will complain and use standard parameters, + // if not given. + Name(_HPP, Package(){ + 0x08, // Cache line size in dwords + 0x40, // Latency timer in PCI clocks + 0x01, // Enable SERR line + 0x00 // Enable PERR line + }) + + // PCI Routing Table + // When defining as much ACPI information as + // needed for hotplug, we also have to define + // stuff like the following. + // Otherwise, Linux would complain. + Name(_PRT, Package() { + Package() { 0x1ffff, 0, LNKA, 0 }, + Package() { 0x1ffff, 1, LNKB, 0 }, + Package() { 0x1ffff, 2, LNKC, 0 }, + Package() { 0x1ffff, 3, LNKD, 0 }, + + Package() { 0x2ffff, 0, LNKA, 0 }, + Package() { 0x2ffff, 1, LNKB, 0 }, + Package() { 0x2ffff, 2, LNKC, 0 }, + Package() { 0x2ffff, 3, LNKD, 0 }, + + Package() { 0x3ffff, 0, LNKA, 0 }, + Package() { 0x3ffff, 1, LNKB, 0 }, + Package() { 0x3ffff, 2, LNKC, 0 }, + Package() { 0x3ffff, 3, LNKD, 0 }, + + Package() { 0x4ffff, 0, LNKA, 0 }, + Package() { 0x4ffff, 1, LNKB, 0 }, + Package() { 0x4ffff, 2, LNKC, 0 }, + Package() { 0x4ffff, 3, LNKD, 0 }, + }) + + // At boot, Linux will either scan the system for + // possible resources used by PCI cards or read + // ACPI tables to obtain this information. + // When providing as much ACPI data as needed + // for hotplugging, then this is not optional any longer. + // Linux would complain if all this was not provided here. 
+ Name (_CRS, ResourceTemplate () { + // Bus enumeration from _MIN to _MAX + WordBusNumber ( + ResourceProducer, + MinFixed, // _MIF + MaxFixed, // _MAF + , + 0x00, // _GRA + 0x00, // _MIN + 0xFF, // _MAX + 0x00, // _TRA + 0x100) // _LEN + // IO ports usable by PCI from _MIN to _MAX + WordIO ( + ResourceProducer, + MinFixed, // _MIF + MaxFixed, // _MAF + PosDecode, + EntireRange, + 0x0000, // _GRA + 0x0000, // _MIN + 0x7FFF, // _MAX + 0x00, // _TRA + 0x8000) // _LEN + // System memory for mapping BAR areas from _MIN to _MAX + // BAR = Base Address Register, every PCI card will + // usually have 2 of those. + DWordMemory ( + ResourceProducer, + PosDecode, + MinFixed, // _MIF + MaxFixed, // _MAF + NonCacheable, // _MEM + ReadWrite, // _RW + 0x00000000, // _GRA + 0xE0000000, // _MIN + 0xE0FFFFFF, // _MAX + 0x00, // _TRA + 0x01000000) // _LEN + }) + + // This introduces three named dword fields in IO space. + // The hotplug controller knows these IO ports. + // During hot plug/unplug, the guest and the host's hotplug + // controller will communicate over these. + OperationRegion(PCST, SystemIO, 0xae00, 12) + Field (PCST, DWordAcc, NoLock, WriteAsZeros) + { + PCIU, 32, // IO port 0xae00 + PCID, 32, // IO port 0xae04 + B0EJ, 32, // IO port 0xae08 + } + + // Status method. Statically returns "Everything is up and working" + // because the PCI root bus will always be there. + Method (_STA, 0) { Return (0xf) } + } + + // All this interrupt routing information is necessary. + // This defines the interrupts A, B, C, D, considered legacy + // nowadays. + // Hotplugging etc. will work without this anyway if the PCI device uses + // MSI for interrupting, but the kernel would complain with + // ugly error messages. + // These device definitions are kept as minimal as possible. 
+ Device(LNKA){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 1) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKB){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 2) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {10} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKC){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 3) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {9} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKD){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 4) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, 
NotSerialized) {} + } + + } + + Scope(\_SB.PCI0) { + // These are PCI slot definitions. + // They are necessary because every PCI card + // which shall be ejectable, needs an _EJ0 method. + Device (S01) { + Name (_ADR, 0x10000) + Name (_SUN, 0x01) // SUN: Slot User Number + + // This method is called by the operating system + // after unloading the device driver etc. + // _EJ0 = eject callback + Method (_EJ0, 1) { PCEJ(0x01) } + } + + Device (S02) { + Name (_ADR, 0x20000) + Name (_SUN, 0x02) + Method (_EJ0, 1) { PCEJ(0x02) } + } + + Device (S03) { + Name (_ADR, 0x30000) + Name (_SUN, 0x03) + Method (_EJ0, 1) { PCEJ(0x03) } + } + + Device (S04) { + Name (_ADR, 0x40000) + Name (_SUN, 0x04) + Method (_EJ0, 1) { PCEJ(0x04) } + } + + // Called by some PCI card's _EJ0 method, + // This tells the hypervisor to turn off the + // PCI device by writing (1 << PCI_ID) to the + // IO port associated with the B0EJ symbol. + Method (PCEJ, 1, NotSerialized) { + Store(ShiftLeft(1, Arg0), B0EJ) + Return (0x0) + } + + // PCNT = PCi NoTify + // PCNT(<slot number 1..4>, <1 = check for inserted device / 3 = eject requested>) + // The values 1 and 3 are defined in the ACPI spec + Method(PCNT, 2) { + If (LEqual(Arg0, 0x01)) { Notify(S01, Arg1) } + If (LEqual(Arg0, 0x02)) { Notify(S02, Arg1) } + If (LEqual(Arg0, 0x03)) { Notify(S03, Arg1) } + If (LEqual(Arg0, 0x04)) { Notify(S04, Arg1) } + } + + /* PCI hotplug notify method */ + Method(PCNF, 0) { + // Local0 = iterator (slot number; incremented before use, so slots 1..4 are checked) + Store (Zero, Local0) + + // These two fields contain bits mapped + // to PCI devices, like in the GPE bitmap. 
+ + // bit (1 << N) set here --> Device N was inserted + Store (PCIU, Local1) + // bit (1 << N) set here --> Device N has to be removed + Store (PCID, Local2) + + While (LLess(Local0, 4)) { + Increment(Local0) + If (And(Local1, ShiftLeft(1, Local0))) { + PCNT(Local0, 1) // 1 => DEVICE CHECK + } + If (And(Local2, ShiftLeft(1, Local0))) { + PCNT(Local0, 3) // 3 => EJECT REQUEST + } + } + Return(One) + } + } + + Scope (\_GPE) + { + Name(_HID, "ACPI0006") + + // These methods are wired to the according bits in the GPE bitmap. + // The hypervisor will raise bits and then send an interrupt 9. + // The ACPI code in the guest kernel will then dispatch one of these methods. NOTE(review): the AmlCode array in dsdt.h also encodes _E02, _E03 and _E04 handlers that are not defined here - the two files look out of sync; confirm before regenerating. + Method(_E01) { + \_SB.PCI0.PCNF() // PCI hotplug event + } + } + +} // end of definition block diff --git a/executor/dsdt.h b/executor/dsdt.h new file mode 100644 index 00000000..f8ee3611 --- /dev/null +++ b/executor/dsdt.h @@ -0,0 +1,179 @@ +/* + * To generate this file, download the iASL compiler from + * https://acpica.org/downloads (or install the "iasl" package, + * if available for your distro) and then run: + * iasl -tc dsdt.asl && mv dsdt.hex dsdt.h + */ + +/* + * + * Intel ACPI Component Architecture + * ASL Optimizing Compiler version 20130418-64 [May 8 2013] + * Copyright (c) 2000 - 2013 Intel Corporation + * + * Compilation of "dsdt.asl" - Thu Jun 20 15:28:32 2013 + * + * C source code output + * AML code block contains 0x4E8 bytes + * + */ +unsigned char AmlCode[] = +{ + 0x44,0x53,0x44,0x54,0xE8,0x04,0x00,0x00, /* 00000000 "DSDT...." */ + 0x00,0x31,0x42,0x41,0x4D,0x4D,0x00,0x00, /* 00000008 ".1BAMM.." */ + 0x4A,0x4F,0x4E,0x47,0x45,0x00,0x00,0x00, /* 00000010 "JONGE..." */ + 0x01,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C, /* 00000018 "....INTL" */ + 0x18,0x04,0x13,0x20,0x10,0x40,0x33,0x5F, /* 00000020 "... 
.@3_" */ + 0x53,0x42,0x5F,0x5B,0x82,0x4D,0x18,0x50, /* 00000028 "SB_[.M.P" */ + 0x43,0x49,0x30,0x08,0x5F,0x48,0x49,0x44, /* 00000030 "CI0._HID" */ + 0x0C,0x41,0xD0,0x0A,0x03,0x08,0x5F,0x41, /* 00000038 ".A...._A" */ + 0x44,0x52,0x00,0x08,0x5F,0x55,0x49,0x44, /* 00000040 "DR.._UID" */ + 0x00,0x08,0x5F,0x48,0x50,0x50,0x12,0x08, /* 00000048 ".._HPP.." */ + 0x04,0x0A,0x08,0x0A,0x40,0x01,0x00,0x08, /* 00000050 "....@..." */ + 0x5F,0x50,0x52,0x54,0x12,0x4B,0x0E,0x10, /* 00000058 "_PRT.K.." */ + 0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x01,0x00, /* 00000060 "........" */ + 0x00,0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0D, /* 00000068 ".LNKA..." */ + 0x04,0x0C,0xFF,0xFF,0x01,0x00,0x01,0x4C, /* 00000070 ".......L" */ + 0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C, /* 00000078 "NKB....." */ + 0xFF,0xFF,0x01,0x00,0x0A,0x02,0x4C,0x4E, /* 00000080 "......LN" */ + 0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF, /* 00000088 "KC......" */ + 0xFF,0x01,0x00,0x0A,0x03,0x4C,0x4E,0x4B, /* 00000090 ".....LNK" */ + 0x44,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF, /* 00000098 "D......." */ + 0x02,0x00,0x00,0x4C,0x4E,0x4B,0x41,0x00, /* 000000A0 "...LNKA." */ + 0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x02,0x00, /* 000000A8 "........" */ + 0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E, /* 000000B0 ".LNKB..." */ + 0x04,0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x02, /* 000000B8 "........" */ + 0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,0x04, /* 000000C0 "LNKC...." */ + 0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x03,0x4C, /* 000000C8 ".......L" */ + 0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,0x0C, /* 000000D0 "NKD....." */ + 0xFF,0xFF,0x03,0x00,0x00,0x4C,0x4E,0x4B, /* 000000D8 ".....LNK" */ + 0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF, /* 000000E0 "A......." */ + 0x03,0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00, /* 000000E8 "...LNKB." */ + 0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00, /* 000000F0 "........" */ + 0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12, /* 000000F8 "..LNKC.." */ + 0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A, /* 00000100 "........" 
*/ + 0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D, /* 00000108 ".LNKD..." */ + 0x04,0x0C,0xFF,0xFF,0x04,0x00,0x00,0x4C, /* 00000110 ".......L" */ + 0x4E,0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C, /* 00000118 "NKA....." */ + 0xFF,0xFF,0x04,0x00,0x01,0x4C,0x4E,0x4B, /* 00000120 ".....LNK" */ + 0x42,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF, /* 00000128 "B......." */ + 0x04,0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43, /* 00000130 "....LNKC" */ + 0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x04, /* 00000138 "........" */ + 0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00, /* 00000140 "...LNKD." */ + 0x08,0x5F,0x43,0x52,0x53,0x11,0x3F,0x0A, /* 00000148 "._CRS.?." */ + 0x3C,0x88,0x0D,0x00,0x02,0x0C,0x00,0x00, /* 00000150 "<......." */ + 0x00,0x00,0x00,0xFF,0x00,0x00,0x00,0x00, /* 00000158 "........" */ + 0x01,0x88,0x0D,0x00,0x01,0x0C,0x03,0x00, /* 00000160 "........" */ + 0x00,0x00,0x00,0xFF,0x7F,0x00,0x00,0x00, /* 00000168 "........" */ + 0x80,0x87,0x17,0x00,0x00,0x0C,0x01,0x00, /* 00000170 "........" */ + 0x00,0x00,0x00,0x00,0x00,0x00,0xE0,0xFF, /* 00000178 "........" */ + 0xFF,0xFF,0xE0,0x00,0x00,0x00,0x00,0x00, /* 00000180 "........" */ + 0x00,0x00,0x01,0x79,0x00,0x5B,0x80,0x50, /* 00000188 "...y.[.P" */ + 0x43,0x53,0x54,0x01,0x0B,0x00,0xAE,0x0A, /* 00000190 "CST....." */ + 0x0C,0x5B,0x81,0x15,0x50,0x43,0x53,0x54, /* 00000198 ".[..PCST" */ + 0x43,0x50,0x43,0x49,0x55,0x20,0x50,0x43, /* 000001A0 "CPCIU PC" */ + 0x49,0x44,0x20,0x42,0x30,0x45,0x4A,0x20, /* 000001A8 "ID B0EJ " */ + 0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4, /* 000001B0 ".._STA.." */ + 0x0A,0x0F,0x5B,0x82,0x44,0x06,0x4C,0x4E, /* 000001B8 "..[.D.LN" */ + 0x4B,0x41,0x08,0x5F,0x48,0x49,0x44,0x0C, /* 000001C0 "KA._HID." 
*/ + 0x41,0xD0,0x0C,0x0F,0x08,0x5F,0x55,0x49, /* 000001C8 "A...._UI" */ + 0x44,0x01,0x14,0x09,0x5F,0x53,0x54,0x41, /* 000001D0 "D..._STA" */ + 0x00,0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43, /* 000001D8 "......_C" */ + 0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46, /* 000001E0 "RS..BUFF" */ + 0x11,0x09,0x0A,0x06,0x23,0x20,0x00,0x18, /* 000001E8 "....# .." */ + 0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14, /* 000001F0 "y..BUFF." */ + 0x1A,0x5F,0x50,0x52,0x53,0x00,0x08,0x42, /* 000001F8 "._PRS..B" */ + 0x55,0x46,0x46,0x11,0x09,0x0A,0x06,0x23, /* 00000200 "UFF....#" */ + 0x20,0x06,0x18,0x79,0x00,0xA4,0x42,0x55, /* 00000208 " ..y..BU" */ + 0x46,0x46,0x14,0x06,0x5F,0x53,0x52,0x53, /* 00000210 "FF.._SRS" */ + 0x01,0x14,0x06,0x5F,0x44,0x49,0x53,0x00, /* 00000218 "..._DIS." */ + 0x5B,0x82,0x45,0x06,0x4C,0x4E,0x4B,0x42, /* 00000220 "[.E.LNKB" */ + 0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0, /* 00000228 "._HID.A." */ + 0x0C,0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A, /* 00000230 "..._UID." */ + 0x02,0x14,0x09,0x5F,0x53,0x54,0x41,0x00, /* 00000238 "..._STA." */ + 0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52, /* 00000240 "....._CR" */ + 0x53,0x00,0x08,0x42,0x55,0x46,0x46,0x11, /* 00000248 "S..BUFF." */ + 0x09,0x0A,0x06,0x23,0x00,0x04,0x18,0x79, /* 00000250 "...#...y" */ + 0x00,0xA4,0x42,0x55,0x46,0x46,0x14,0x1A, /* 00000258 "..BUFF.." */ + 0x5F,0x50,0x52,0x53,0x00,0x08,0x42,0x55, /* 00000260 "_PRS..BU" */ + 0x46,0x46,0x11,0x09,0x0A,0x06,0x23,0x20, /* 00000268 "FF....# " */ + 0x06,0x18,0x79,0x00,0xA4,0x42,0x55,0x46, /* 00000270 "..y..BUF" */ + 0x46,0x14,0x06,0x5F,0x53,0x52,0x53,0x01, /* 00000278 "F.._SRS." */ + 0x14,0x06,0x5F,0x44,0x49,0x53,0x00,0x5B, /* 00000280 ".._DIS.[" */ + 0x82,0x45,0x06,0x4C,0x4E,0x4B,0x43,0x08, /* 00000288 ".E.LNKC." */ + 0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C, /* 00000290 "_HID.A.." */ + 0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A,0x03, /* 00000298 ".._UID.." */ + 0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4, /* 000002A0 ".._STA.." 
*/ + 0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53, /* 000002A8 "...._CRS" */ + 0x00,0x08,0x42,0x55,0x46,0x46,0x11,0x09, /* 000002B0 "..BUFF.." */ + 0x0A,0x06,0x23,0x00,0x02,0x18,0x79,0x00, /* 000002B8 "..#...y." */ + 0xA4,0x42,0x55,0x46,0x46,0x14,0x1A,0x5F, /* 000002C0 ".BUFF.._" */ + 0x50,0x52,0x53,0x00,0x08,0x42,0x55,0x46, /* 000002C8 "PRS..BUF" */ + 0x46,0x11,0x09,0x0A,0x06,0x23,0x20,0x06, /* 000002D0 "F....# ." */ + 0x18,0x79,0x00,0xA4,0x42,0x55,0x46,0x46, /* 000002D8 ".y..BUFF" */ + 0x14,0x06,0x5F,0x53,0x52,0x53,0x01,0x14, /* 000002E0 ".._SRS.." */ + 0x06,0x5F,0x44,0x49,0x53,0x00,0x5B,0x82, /* 000002E8 "._DIS.[." */ + 0x45,0x06,0x4C,0x4E,0x4B,0x44,0x08,0x5F, /* 000002F0 "E.LNKD._" */ + 0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F, /* 000002F8 "HID.A..." */ + 0x08,0x5F,0x55,0x49,0x44,0x0A,0x04,0x14, /* 00000300 "._UID..." */ + 0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,0x0A, /* 00000308 "._STA..." */ + 0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53,0x00, /* 00000310 "..._CRS." */ + 0x08,0x42,0x55,0x46,0x46,0x11,0x09,0x0A, /* 00000318 ".BUFF..." */ + 0x06,0x23,0x20,0x00,0x18,0x79,0x00,0xA4, /* 00000320 ".# ..y.." */ + 0x42,0x55,0x46,0x46,0x14,0x1A,0x5F,0x50, /* 00000328 "BUFF.._P" */ + 0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46, /* 00000330 "RS..BUFF" */ + 0x11,0x09,0x0A,0x06,0x23,0x20,0x06,0x18, /* 00000338 "....# .." */ + 0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14, /* 00000340 "y..BUFF." */ + 0x06,0x5F,0x53,0x52,0x53,0x01,0x14,0x06, /* 00000348 "._SRS..." */ + 0x5F,0x44,0x49,0x53,0x00,0x10,0x44,0x12, /* 00000350 "_DIS..D." */ + 0x2E,0x5F,0x53,0x42,0x5F,0x50,0x43,0x49, /* 00000358 "._SB_PCI" */ + 0x30,0x5B,0x82,0x21,0x53,0x30,0x31,0x5F, /* 00000360 "0[.!S01_" */ + 0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00, /* 00000368 "._ADR..." */ + 0x01,0x00,0x08,0x5F,0x53,0x55,0x4E,0x01, /* 00000370 "..._SUN." 
*/ + 0x14,0x0B,0x5F,0x45,0x4A,0x30,0x01,0x50, /* 00000378 ".._EJ0.P" */ + 0x43,0x45,0x4A,0x01,0x5B,0x82,0x23,0x53, /* 00000380 "CEJ.[.#S" */ + 0x30,0x32,0x5F,0x08,0x5F,0x41,0x44,0x52, /* 00000388 "02_._ADR" */ + 0x0C,0x00,0x00,0x02,0x00,0x08,0x5F,0x53, /* 00000390 "......_S" */ + 0x55,0x4E,0x0A,0x02,0x14,0x0C,0x5F,0x45, /* 00000398 "UN...._E" */ + 0x4A,0x30,0x01,0x50,0x43,0x45,0x4A,0x0A, /* 000003A0 "J0.PCEJ." */ + 0x02,0x5B,0x82,0x23,0x53,0x30,0x33,0x5F, /* 000003A8 ".[.#S03_" */ + 0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00, /* 000003B0 "._ADR..." */ + 0x03,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A, /* 000003B8 "..._SUN." */ + 0x03,0x14,0x0C,0x5F,0x45,0x4A,0x30,0x01, /* 000003C0 "..._EJ0." */ + 0x50,0x43,0x45,0x4A,0x0A,0x03,0x5B,0x82, /* 000003C8 "PCEJ..[." */ + 0x23,0x53,0x30,0x34,0x5F,0x08,0x5F,0x41, /* 000003D0 "#S04_._A" */ + 0x44,0x52,0x0C,0x00,0x00,0x04,0x00,0x08, /* 000003D8 "DR......" */ + 0x5F,0x53,0x55,0x4E,0x0A,0x04,0x14,0x0C, /* 000003E0 "_SUN...." */ + 0x5F,0x45,0x4A,0x30,0x01,0x50,0x43,0x45, /* 000003E8 "_EJ0.PCE" */ + 0x4A,0x0A,0x04,0x14,0x11,0x50,0x43,0x45, /* 000003F0 "J....PCE" */ + 0x4A,0x01,0x70,0x79,0x01,0x68,0x00,0x42, /* 000003F8 "J.py.h.B" */ + 0x30,0x45,0x4A,0xA4,0x00,0x14,0x35,0x50, /* 00000400 "0EJ...5P" */ + 0x43,0x4E,0x54,0x02,0xA0,0x0A,0x93,0x68, /* 00000408 "CNT....h" */ + 0x01,0x86,0x53,0x30,0x31,0x5F,0x69,0xA0, /* 00000410 "..S01_i." */ + 0x0B,0x93,0x68,0x0A,0x02,0x86,0x53,0x30, /* 00000418 "..h...S0" */ + 0x32,0x5F,0x69,0xA0,0x0B,0x93,0x68,0x0A, /* 00000420 "2_i...h." */ + 0x03,0x86,0x53,0x30,0x33,0x5F,0x69,0xA0, /* 00000428 "..S03_i." 
*/ + 0x0B,0x93,0x68,0x0A,0x04,0x86,0x53,0x30, /* 00000430 "..h...S0" */ + 0x34,0x5F,0x69,0x14,0x3E,0x50,0x43,0x4E, /* 00000438 "4_i.>PCN" */ + 0x46,0x00,0x70,0x00,0x60,0x70,0x50,0x43, /* 00000440 "F.p.`pPC" */ + 0x49,0x55,0x61,0x70,0x50,0x43,0x49,0x44, /* 00000448 "IUapPCID" */ + 0x62,0xA2,0x26,0x95,0x60,0x0A,0x04,0x75, /* 00000450 "b.&.`..u" */ + 0x60,0xA0,0x0E,0x7B,0x61,0x79,0x01,0x60, /* 00000458 "`..{ay.`" */ + 0x00,0x00,0x50,0x43,0x4E,0x54,0x60,0x01, /* 00000460 "..PCNT`." */ + 0xA0,0x0F,0x7B,0x62,0x79,0x01,0x60,0x00, /* 00000468 "..{by.`." */ + 0x00,0x50,0x43,0x4E,0x54,0x60,0x0A,0x03, /* 00000470 ".PCNT`.." */ + 0xA4,0x01,0x10,0x4D,0x06,0x5F,0x47,0x50, /* 00000478 "...M._GP" */ + 0x45,0x08,0x5F,0x48,0x49,0x44,0x0D,0x41, /* 00000480 "E._HID.A" */ + 0x43,0x50,0x49,0x30,0x30,0x30,0x36,0x00, /* 00000488 "CPI0006." */ + 0x14,0x15,0x5F,0x45,0x30,0x31,0x00,0x5C, /* 00000490 ".._E01.\" */ + 0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,0x43, /* 00000498 "/._SB_PC" */ + 0x49,0x30,0x50,0x43,0x4E,0x46,0x14,0x15, /* 000004A0 "I0PCNF.." */ + 0x5F,0x45,0x30,0x32,0x00,0x5C,0x2F,0x03, /* 000004A8 "_E02.\/." */ + 0x5F,0x53,0x42,0x5F,0x50,0x43,0x49,0x30, /* 000004B0 "_SB_PCI0" */ + 0x50,0x43,0x4E,0x46,0x14,0x15,0x5F,0x45, /* 000004B8 "PCNF.._E" */ + 0x30,0x33,0x00,0x5C,0x2F,0x03,0x5F,0x53, /* 000004C0 "03.\/._S" */ + 0x42,0x5F,0x50,0x43,0x49,0x30,0x50,0x43, /* 000004C8 "B_PCI0PC" */ + 0x4E,0x46,0x14,0x15,0x5F,0x45,0x30,0x34, /* 000004D0 "NF.._E04" */ + 0x00,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F, /* 000004D8 ".\/._SB_" */ + 0x50,0x43,0x49,0x30,0x50,0x43,0x4E,0x46 /* 000004E0 "PCI0PCNF" */ +}; diff --git a/executor/vbios_reset.cc b/executor/vbios_reset.cc index 0a875b10..e8e778d4 100644 --- a/executor/vbios_reset.cc +++ b/executor/vbios_reset.cc @@ -4,6 +4,8 @@ * Copyright (C) 2009-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -19,6 +21,11 @@ #include "nul/motherboard.h" #include "executor/bios.h" +/* This file contains the AML code of the DSDT in form + * of a string, which is available under the symbol name + * "AmlCode" */ +#include "dsdt.h" + bool use_x2apic_mode; PARAM_HANDLER(x2apic_mode, "x2apic_mode - enable x2apic mode in the LAPICs") @@ -144,6 +151,15 @@ class VirtualBiosReset : public StaticReceiver, public BiosCom // the ACPI IRQ is 9 discovery_write_dw("FACP", 46, 9, 2); + /* Initialize DSDT table. + * Its content is defined as AML bytecode in dsdt.h */ + discovery_write_st("DSDT", 0, "DSDT", 4); + + /* Initialize FACS table. + * The table is left empty. Linux demands its existence + * before switching to ACPI mode. */ + discovery_write_st("FACS", 0, "FACS", 4); + // store what remains on memory in KB discovery_write_dw("bda", 0x13, _mem_size >> 10, 2); return jmp_int(msg, 0x19); @@ -220,6 +236,28 @@ class VirtualBiosReset : public StaticReceiver, public BiosCom discovery_write_dw(name, 15, 0, 1); fix_acpi_checksum(_resources + index, 20, 8); } + else if (!strcmp("DSDT", name)) { + unsigned table; + check1(false, !(table = alloc(sizeof(AmlCode), 0x10)), + "allocate ACPI table failed"); + _resources[index] = Resource(name, table, sizeof(AmlCode), true); + + // FADT contains a pointer to the DSDT + discovery_write_dw("FACP", 40, table, 4); + + /* The DSDT is completely defined as AML bytecode in dsdt.h + * which was compiled from ASL by the Intel ASL compiler */ + memcpy(_mem_ptr + table, AmlCode, sizeof(AmlCode)); + } + else if (!strcmp("FACS", name)) { + unsigned table; + check1(false, !(table = alloc(36, 64)), "allocate ACPI table failed"); + _resources[index] = Resource(name, table, 36, true); + init_acpi_table(name); + + // FADT contains a pointer to the FACS + discovery_write_dw("FACP", 36, table, 4); + } else { // we create an ACPI table size_t table; diff --git a/host/migration.cc 
b/host/migration.cc new file mode 100644 index 00000000..85fb2002 --- /dev/null +++ b/host/migration.cc @@ -0,0 +1,764 @@ +/** + * Base migration code + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +/* Activate checksumming for debugging purposes + * of the received range after migrating. + * As this really makes the freeze gap larger, + * this should only be used for testing when + * the migration algorithm is changed. */ +//#define DO_CHECKSUMMING + + +#include // snprintf + +#include +#include + +#include +#include + +Migration::Migration(Motherboard *mb) +: _mb(mb), + _vcpu_utcb(new CpuState), // register snapshot buffer, released in the destructor +#if PORTED_TO_UNIX + _vcpu_blocked_sem(cap, true), + _vcpu_sem(cap+1, true), +#endif + _vcpu_should_block(false), + _socket(NULL), + _sendmem(0), _sendmem_total(0), + _freeze_timer(_mb->clock()) +{ +} + +Migration::~Migration() +{ + delete _vcpu_utcb; // allocated in the constructor +} + +void Migration::init_memrange_info() +{ + MessageHostOp msg(MessageHostOp::OP_GUEST_MEM, 0UL); + if (!_mb->bus_hostop.send(msg)) + Logging::panic("%s failed to get physical memory\n", + __PRETTY_FUNCTION__); + + _physmem_start = msg.ptr; + _physmem_size = msg.len; + + _dirtman = DirtManager(_physmem_size >> 12); +} + +void Migration::save_guestregs(CpuState *utcb) +{ + /* After Migration::freeze_vcpus() was called, the VCPU will + * arrive in the recall handler and call this method here. + * Its register states are saved and then it hangs in + * our lock. 
+ */ + if (!_vcpu_should_block) return; + + mword vcpu_bytes = reinterpret_cast(&utcb->id+1); + vcpu_bytes -= reinterpret_cast(&utcb->mtd); + + memcpy(&_vcpu_utcb->mtd, &utcb->mtd, vcpu_bytes); // copies the span from mtd up to and including id + +#if PORTED_TO_UNIX + // Release the waiting migration thread + _vcpu_blocked_sem.up(); + // Freeze VCPU + _vcpu_sem.downmulti(); +#endif +} + +/* This is used to print messages onto the screen + * just after the VMM has started and waits for incoming + * guest state data. + */ +bool Migration::puts_guestscreen(const char *str, bool reset_screen) +{ + MessageRestore msg(MessageRestore::VGA_DISPLAY_GUEST, + const_cast(str), reset_screen); + return _mb->bus_restore.send(msg, true); +} + +void Migration::print_welcomescreen() +{ + char welcome_msg[255]; + mword ip = IpHelper::instance().get_ip(); + + snprintf(welcome_msg, sizeof(welcome_msg), + " Waiting for guest to migrate. IP: %lu.%lu.%lu.%lu\n\n", + ip & 0xff, (ip >> 8) & 0xff, (ip >> 16) & 0xff, (ip >> 24) & 0xff); + puts_guestscreen(welcome_msg, true); +} + +void Migration::freeze_vcpus() +{ + Logging::printf("Stopping vcpu.\n"); + + _vcpu_should_block = true; + + CpuEvent smsg(VCpu::EVENT_RESUME); + for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) + vcpu->bus_event.send(smsg); + +#if PORTED_TO_UNIX + _vcpu_blocked_sem.downmulti(); +#endif + + _freeze_timer.start(); +} + +void Migration::unfreeze_vcpus() +{ + _vcpu_should_block = false; +#if PORTED_TO_UNIX + /* After releasing the VCPU it will continue + * through the rest of the recall handler. 
+ */ + _vcpu_sem.up(); +#endif +} + +bool Migration::chksum_page(unsigned page_nr, mword &their_chksum, bool compare) +{ + mword my_chksum = 0; + assert(page_nr < (_physmem_size >> 12)); + + mword *ptr = reinterpret_cast(_physmem_start + (page_nr << 12)); + + for (unsigned i=0; i < 4096 / sizeof(ptr[0]); ++i) + // checksum = sum over (address_i * value_i^2) + my_chksum += reinterpret_cast(ptr+1) * (ptr[i]) * (ptr[i]); + + // Use case one: return true if given memory range is correct + if (compare) return my_chksum == their_chksum; + + // Second use case: Provide a checksum for a given memory range + their_chksum = my_chksum; + return true; +} + +bool Migration::checksums(bool retrieve) +{ + mword pagenr = 0; + mword checksum; + mword magic = 0xfafab0b0; + bool success = true; + + if (retrieve) { + // Receiver. Check the existing checksum list against our memory + mword rec_magic; + + _socket->receive(&rec_magic, sizeof(rec_magic)); + _socket->receive(&pagenr, sizeof(pagenr)); + _socket->receive(&checksum, sizeof(checksum)); + + while (pagenr != ~0ul) { + assert(magic == rec_magic); + MessageMemRegion mmsg(pagenr); + assert(_mb->bus_memregion.send(mmsg, true)); + assert(mmsg.actual_physmem); + + bool area_success = chksum_page(mmsg.start_page, checksum, true); + success &= area_success; + + Logging::printf("Checksum of area [%8lx - %8lx) - %s\n", + reinterpret_cast(mmsg.start_page), + reinterpret_cast(mmsg.start_page + mmsg.count), + area_success ? "OK" : "Error"); + + _socket->receive(&rec_magic, sizeof(rec_magic)); + _socket->receive(&pagenr, sizeof(pagenr)); + _socket->receive(&checksum, sizeof(checksum)); + } + } + else { + // Sender. Make a list of checksums and send it away. + + while (pagenr < _physmem_size) { + MessageMemRegion mmsg(pagenr); + if (!_mb->bus_memregion.send(mmsg, true) || !mmsg.actual_physmem) { + // No one claims this region. do not check. 
+ ++pagenr; + continue; + } + + Logging::printf("Checksumming the area [%8lx - %8lx)\n", + reinterpret_cast(mmsg.start_page), + reinterpret_cast(mmsg.start_page + mmsg.count)); + + chksum_page(pagenr, checksum, false); + success &= _socket->send(&magic, sizeof(magic)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + success &= _socket->send(&checksum, sizeof(checksum)); + + pagenr += mmsg.count; + } + + pagenr = ~0ul; + success &= _socket->send(&magic, sizeof(magic)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + } + + return success; +} + + +/*********************************************************************** + * Guest receiving part + ***********************************************************************/ + +bool Migration::receive_ping() +{ + mword ping_msg = 0; + + _socket->receive(&ping_msg, sizeof(ping_msg)); + + if (ping_msg != 0xc0ffee) { + Logging::printf("Received bad ping message.\n"); + return false; + } + + ping_msg *= 3; + _socket->send(&ping_msg, sizeof(ping_msg)); + + return true; +} + +void Migration::receive_header() +{ + MigrationHeader mig_header; + + Logging::printf("Receiving guest information.\n"); + + _socket->receive(&mig_header, sizeof(mig_header)); + if (!mig_header.magic_string_check()) + Logging::panic("Magic string check failed: MigrationHeader\n"); + + MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, true); + vgamsg.bytes = mig_header.videomode; + _mb->bus_restore.send(vgamsg, true); +} + +void Migration::receive_memory() +{ + StopWatch watch(_mb->clock()); + Logging::printf("Receiving guest memory.\n"); + + Prd current; + unsigned long bytes = 0; + + watch.start(); + while (1) { + _socket->receive(¤t, sizeof(current)); + if (!current.value()) + // Receiving an empty range descriptor means "EOF" + break; + + _socket->receive(current.base() + _physmem_start, current.size()); + bytes += current.size(); + } + watch.stop(); + + Logging::printf("Received 
%lu MB. RX Rate: %u KB/s\n", + bytes / 1024 / 1024, watch.rate(bytes)); +} + +/* Being equipped with a pointer to the stopped VCPU's + * register state structure, its registers will be overwritten + * and devices restored. + */ +bool Migration::receive_guestdevices(CpuState *vcpu_utcb) +{ + Logging::printf("Receiving UTCB.\n"); + + CpuState *buf = new CpuState; + + mword utcb_end = reinterpret_cast(&buf->id+1); + mword utcb_start = reinterpret_cast(&buf->mtd); + mword utcb_bytes = utcb_end - utcb_start; + + _socket->receive(&buf->mtd, utcb_bytes); + + memcpy(&vcpu_utcb->mtd, &buf->mtd, utcb_bytes); + + delete buf; + + Logging::printf("Receiving Devices.\n"); + + // This works quite similarly to the device saving procedure + MessageRestore *rmsg = new MessageRestore(MessageRestore::RESTORE_RESTART, + NULL, false); + _mb->bus_restore.send_fifo(*rmsg); + + // no while(someone_responds_true) approach here because we know + // what we want to restore and how many. A message with devtype 0xdead ends the list. + bool ret; + while (1) { + _socket->receive(rmsg, sizeof(*rmsg)); + assert(rmsg->magic_string_check()); + + if (rmsg->devtype == 0xdead) + break; + + char *device_buffer = new char[rmsg->bytes]; + _socket->receive(device_buffer, rmsg->bytes); + + rmsg->space = device_buffer; + rmsg->write = false; + ret = _mb->bus_restore.send(*rmsg, true); + if (!ret) Logging::printf("No device replied on restore message!" + " VMM-Configuration mismatch?\n"); + + delete [] device_buffer; + } + + delete rmsg; + + /* Fix TSC offset. + * The guest would freeze for some time or skip some timesteps otherwise. 
+ */ + unsigned long long sender_rdtsc; + _socket->receive(&sender_rdtsc, sizeof(sender_rdtsc)); + + CpuMessage rdtsc_msg(CpuMessage::TYPE_ADD_TSC_OFF, NULL, 0); + rdtsc_msg.current_tsc_off = sender_rdtsc - Cpu::rdtsc(); + + for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) + vcpu->executor.send(rdtsc_msg); + + return true; +} + +bool Migration::listen(unsigned port, CpuState *vcpu_utcb) +{ + init_memrange_info(); + + print_welcomescreen(); + + _socket = IpHelper::instance().listen(port); + if (_socket == NULL) Logging::panic("Got no TCP receiver.\n"); + + receive_ping(); + + receive_header(); + + receive_memory(); + + receive_guestdevices(vcpu_utcb); + +#ifdef DO_CHECKSUMMING + // Checksumming really makes the migration gap larger + if (!checksums(true)) { + Logging::printf("Error while comparing checksums.\n"); + return false; + } +#endif + + _socket->close(); + + MessageRestore replug_msg(MessageRestore::PCI_PLUG, NULL, true); + _mb->bus_restore.send(replug_msg, false); + + Logging::printf("That's it. Waking up VCPUs.\n"); + unfreeze_vcpus(); + + return true; +} + +/*********************************************************************** + * Guest sending part + ***********************************************************************/ + +unsigned Migration::negotiate_port() +{ + char *cmdline = NULL; + + MessageHostOp msg(MessageHostOp::OP_GET_CONFIG_STRING, 0ul); + if (!_mb->bus_hostop.send(msg)) + return 0; + assert(msg.obj != NULL); + cmdline = reinterpret_cast(msg.obj); + + /* Send the listener service our configuration string. + * It will try to start an identically configured VMM + * instance and then tell us on what port it is waiting + * for state input. 
+     */
+    MigrationInit mig_init(strlen(cmdline));
+    MigrationAnswer mig_ans;
+    unsigned port = 0;
+
+    /* Every outcome funnels through the single exit below, so that the
+     * configuration string (which this function releases with delete[])
+     * is freed on the failure paths as well and not only on success.
+     * A return value of 0 keeps meaning "negotiation failed".
+     */
+    if (_socket->send(&mig_init, sizeof(mig_init)) &&
+        _socket->send(cmdline, mig_init.cmdlen)) {
+        /* A default-constructed MigrationAnswer already carries a valid
+         * magic string, so a failed receive has to be detected through the
+         * return value; the magic check alone would not catch it.
+         */
+        if (!_socket->receive(&mig_ans, sizeof(mig_ans)))
+            Logging::printf("Receiving MigrationAnswer failed.\n");
+        else if (!mig_ans.magic_string_check())
+            Logging::printf("Magic string check failed: MigrationAnswer\n");
+        else if (!mig_ans.success)
+            Logging::printf("Configuration is not suitable for target machine.\n");
+        else
+            port = mig_ans.port;
+    }
+
+    delete [] cmdline;
+    return port;
+}
+
+/* Queries the restore bus for the current video mode (the answer comes
+ * back in vgamsg.bytes) and sends it to the target inside a
+ * MigrationHeader. Returns the result of the blocking socket send.
+ */
+bool Migration::send_header()
+{
+    /* Sending the listening VMM the video mode setting will allow it
+     * to switch the framebuffer to the right setting before migration.
+     * The screen would flicker and display ugly symbols if the
+     * framebuffer state is restored, but the host doesn't display it
+     * the right way, otherwise.
+     */
+    MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, false);
+    _mb->bus_restore.send(vgamsg, true);
+
+    MigrationHeader mig_header(vgamsg.bytes);
+    return _socket->send(&mig_header, sizeof(mig_header));
+}
+
+/* Times one ping/pong exchange with the target.
+ * Returns the elapsed time reported by the StopWatch (milliseconds with
+ * its default frequency), or 0 if the reply is not 3 * ping_msg.
+ * NOTE(review): a round trip shorter than the timer resolution also
+ * yields 0, which Migration::send() reports as "Ping failed." - confirm
+ * whether that is intended on fast links.
+ */
+timevalue Migration::send_ping()
+{
+    StopWatch ping_timer(_mb->clock());
+
+    mword ping_msg = 0xc0ffee;
+    mword pong_msg = 0;
+
+    ping_timer.start();
+    // If either call fails, pong_msg stays 0 and the comparison below
+    // turns that into the error return.
+    _socket->send(&ping_msg, sizeof(ping_msg));
+    _socket->receive(&pong_msg, sizeof(pong_msg));
+    ping_timer.stop();
+
+    if (pong_msg != 3 * ping_msg) {
+        Logging::printf("Error during latency check\n");
+        return 0;
+    }
+
+    return ping_timer.delta();
+}
+
+/* Sends OP_NEXT_DIRTY_PAGE on the host-op bus and evaluates to msg.value.
+ * Uses a statement expression "({ ... })", which is a GNU extension.
+ */
+#define NEXT_DIRTY_PAGE() \
+({ \
+    MessageHostOp msg(MessageHostOp::OP_NEXT_DIRTY_PAGE, 0ul); \
+    _mb->bus_hostop.send(msg); \
+    msg.value; \
+})
+
+unsigned Migration::enqueue_all_dirty_pages(longrange_data &async_data)
+{
+    Prd *crds = async_data.crds;
+    unsigned crds_sent=0;
+
+    Prd first_crd, last_crd;
+
+    /* This loop will cycle through the memory space
+     * until it ends up without any new dirty regions
+     * or it has done a full cycle.
+ */ + while (1) { + Prd current(NEXT_DIRTY_PAGE()); + + if (!current.value() || // Nothing dirty + // Next round through the memspace + (first_crd.value() && current.base() == first_crd.base()) || + (last_crd.value() && current.base() == last_crd.base())) + break; + + /* These pages are just _marked_ dirty in another data structure, + * the dirt manager. + * This structure might be able to apply some smart optimizations + * in the future like e.g. "don't resend pages too often which are dirtied + * with high access-frequency to reduce traffic", etc. + */ + _dirtman.mark_dirty(current); + + if (!first_crd.value()) first_crd = current; + last_crd = current; + } + + unsigned pages_enqueued = 0; + while (_dirtman.dirty_pages() > 0 && crds_sent < async_data.crd_count) { + Prd current = crds[crds_sent] = _dirtman.next_dirty(); + if (!current.value()) + // That's it for now. + break; + + _dirtman.mark_clean(current); + + if (!_socket->send_nonblocking(&crds[crds_sent], sizeof(*crds)) || + !_socket->send_nonblocking(current.base() + _physmem_start, + current.size())) + return 0; + + ++crds_sent; + pages_enqueued += 1 << current.order(); + } + + return pages_enqueued; +} + +bool Migration::send_memory(longrange_data &async_data) +{ + StopWatch lap_time(_mb->clock()); + StopWatch last_lap(_mb->clock()); + + unsigned transfer_rate; + unsigned dirtying_rate; + + /* The underlying socket architecture works a little bit different than + * BSD sockets, where you stuff data to be sent into the send buffer + * until it replies with "buffer is full, wait a bit". + * These sockets here asynchronously manage lists of pointers to memory ranges + * and their size and will pick up this data when it is actually needed. + * And because of this we have to preserve all memory ranges to be sent + * until they are ACKed. 
+ */ + + const unsigned page_limit = 1000; + unsigned pages_transferred; + unsigned round = 0; + async_data.crds = new Prd[page_limit]; + async_data.crd_count = page_limit; + + MessageRestore unplug_msg(MessageRestore::PCI_PLUG, NULL, false); + _mb->bus_restore.send(unplug_msg, false); + + do { + last_lap = lap_time; + lap_time.start(); + + if (!(pages_transferred = enqueue_all_dirty_pages(async_data)) || + !_socket->wait_complete()) + return false; + + lap_time.stop(); + + transfer_rate = lap_time.rate(pages_transferred << 12); + dirtying_rate = last_lap.rate(pages_transferred << 12); + Logging::printf("RND %u PAGE_CNT %5u TX %5u KB/s DRT %5u KB/s DELTA" + " %llu START %llu\n", + round, pages_transferred, transfer_rate, dirtying_rate, + lap_time.delta(), lap_time.abs_start()); + + assert(pages_transferred); + + _sendmem_total += pages_transferred << 12; + if (_sendmem == 0) _sendmem = _sendmem_total; + ++round; + } while (transfer_rate >= dirtying_rate); + + // The last transfer round with a frozen guest system will follow now + freeze_vcpus(); + + unsigned freeze_pages = 0; + while ((freeze_pages = enqueue_all_dirty_pages(async_data)) > 0) { + if (!_socket->wait_complete()) return false; + pages_transferred += freeze_pages; + } + + static Prd end_of_crds; + if (!pages_transferred || + !_socket->send_nonblocking(&end_of_crds, sizeof(end_of_crds))) + return false; + + Logging::printf("Enqueued the last %u dirty pages\n", pages_transferred); + return true; +} + +bool Migration::send_devices(longrange_data dat) +{ + // Send VCPU state +#if PORTED_TO_UNIX + unsigned vcpu_bytes = reinterpret_cast(&_vcpu_utcb->id+1); + vcpu_bytes -= reinterpret_cast(&_vcpu_utcb->mtd); + + if (!_socket->send(&_vcpu_utcb->mtd, vcpu_bytes)) + return false; +#endif + + /* There are multiple RESTORE_xxx types of restore messages. + * For each kind of device there is one. + * So we throw messages of each type onto the bus. 
+     */
+    MessageRestore restart_msg(MessageRestore::RESTORE_RESTART, NULL, true);
+    _mb->bus_restore.send_fifo(restart_msg);
+
+    // Total byte count reported back in the RESTART message; it sizes the
+    // staging buffer and is cross-checked after all devices were saved.
+    mword restore_bytes = restart_msg.bytes;
+    mword restore_bytes_consumed = 0;
+    // One extra MessageRestore of slack: the loop below always prepares a
+    // header before it learns that no further device will answer.
+    dat.restore_buf = new char[restore_bytes + sizeof(MessageRestore)];
+
+    for (int i=MessageRestore::RESTORE_RESTART+1;
+         i < MessageRestore::RESTORE_LAST;
+         i++) {
+        /* A device will receive this message, write its state into it and
+         * return true. If it receives such a message again, it will return
+         * false. That's why we sent this RESTORE_RESTART message before.
+         * After the first time the bus returns false, we know that we saved
+         * all devices of this particular type.
+         */
+        while (1) {
+            // The header written below must still fit into the slack;
+            // catch an overrun before it scribbles past the buffer.
+            assert(restore_bytes_consumed <= restore_bytes);
+
+            char *msg_addr = dat.restore_buf + restore_bytes_consumed;
+            char *device_space = dat.restore_buf + restore_bytes_consumed
+                + sizeof(MessageRestore);
+
+            MessageRestore *rmsg = reinterpret_cast<MessageRestore *>(msg_addr);
+            memset(rmsg, 0, sizeof(*rmsg));
+
+            rmsg->devtype = i;
+            rmsg->write = true;
+            rmsg->space = device_space;
+            rmsg->magic_string = MessageRestore::MAGIC_STRING_DEVICE_DESC;
+
+            if (!_mb->bus_restore.send(*rmsg, true)) break;
+
+            restore_bytes_consumed += sizeof(*rmsg) + rmsg->bytes;
+        }
+    }
+    assert(restore_bytes == restore_bytes_consumed);
+
+    if (!_socket->send_nonblocking(dat.restore_buf, restore_bytes) ||
+        // Send "end of devices"
+        !_socket->send_nonblocking(&dat.end_of_devices,
+                                   sizeof(dat.end_of_devices)) ||
+        !_socket->wait_complete()) {
+        Logging::printf("Error sending device states.\n");
+        // The buffer is deliberately not freed here: after a failed
+        // transfer the socket may still hold a pointer into it.
+        return false;
+    }
+
+    /* Everything queued above has been ACKed, so the staging buffer is no
+     * longer needed. It has to be released here: `dat` is a by-value copy,
+     * hence the caller never sees this pointer and cannot free it.
+     */
+    delete [] dat.restore_buf;
+    dat.restore_buf = NULL;
+
+    // Restore current tsc offset at destination
+    dat.rdtsc = Cpu::rdtsc();
+    /* Compensate network latency.
+     * This was tested with cloning a VM displaying animations
+     * which were bound to TSC values. After migration,
+     * they only ran in sync when the following line was applied.
+ */ + dat.rdtsc += dat.latency * _mb->clock()->freq() / 1000; + + if (!_socket->send(&dat.rdtsc, sizeof(dat.rdtsc))) { + Logging::printf("Error sending RDTSC\n"); + return false; + } + + return true; +} + +bool Migration::send(unsigned long addr, unsigned long port) +{ + StopWatch migration_timer(_mb->clock()); + longrange_data async_data; + + init_memrange_info(); + + Logging::printf("Trying to connect...\n"); + _socket = IpHelper::instance().connect(addr, port); + if (_socket == NULL) { + Logging::printf("Quitting: Got no TCP connection.\n"); + return false; + } + + Logging::printf("Established connection.\n"); + + unsigned mig_port = negotiate_port(); + + _socket->close(); + + if (!mig_port) return false; + + Logging::printf("Connecting to waiting target VM.\n"); + _socket = IpHelper::instance().connect(addr, mig_port); + if (!_socket) { + Logging::printf("Error connecting to target VM.\n"); + return false; + } + Logging::printf("OK, starting the actual migration.\n"); + + migration_timer.start(); + + async_data.latency = send_ping(); + if (!async_data.latency) { + Logging::printf("Ping failed.\n"); + return false; + } + // Latency = round trip time / 2 + async_data.latency >>= 1; + Logging::printf("Connection has a latency of %lu ms * freq %llu kHz" + " = %llu ticks.\n", + async_data.latency, _mb->clock()->freq() / 1000, + async_data.latency * _mb->clock()->freq() / 1000); + + if (!send_header()) { + Logging::printf("Sending header failed.\n"); + return false; + } + if (!send_memory(async_data)) { + Logging::printf("Sending guest state failed.\n"); + return false; + } + + if (!send_devices(async_data)) { + Logging::printf("Sending guest devices failed.\n"); + return false; + } + +#ifdef DO_CHECKSUMMING + // Checksumming really makes the freeze gap larger + if (!checksums(false)) { + Logging::printf("Error while sending checksums.\n"); + return false; + } +#endif + + // Uncomment this to "clone" the VM instead of migrating it away. 
+ //unfreeze_vcpus(); + + _freeze_timer.stop(); + + _socket->close(); + + migration_timer.stop(); + + Logging::printf("Done. VM was frozen for %llu ms.\n", _freeze_timer.delta()); + Logging::printf("This migration took %llu seconds.\n", + migration_timer.delta() / 1000); + Logging::printf("%3lu%% (%lu MB) of guest memory resent due to change.\n", + 100u * (_sendmem_total - _sendmem) / _sendmem, + (_sendmem_total - _sendmem) / 1024u / 1024u); + + _dirtman.print_stats(); + + delete [] async_data.crds; + delete [] async_data.restore_buf; +#if PORTED_TO_UNIX + delete _vcpu_utcb; +#endif + + return true; +} + +PARAM_HANDLER(retrieve_guest, + "retrieve_guest: - Start a VMM instance which waits for guest", + " state input over network listening on ") +{ + MessageHostOp msg(MessageHostOp::OP_MIGRATION_RETRIEVE_INIT, argv[0]); + mb.bus_hostop.send(msg); +} diff --git a/include/nul/iphelper.h b/include/nul/iphelper.h new file mode 100644 index 00000000..417599ec --- /dev/null +++ b/include/nul/iphelper.h @@ -0,0 +1,146 @@ +/* + * IpHelper class + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + * + * This was previously used for network communication in the NUL userland + * when virtualizing with the NOVA microhypervisor. Functionality was not + * ported and rather the interface is described here to ease porting to the + * UNIX socket interface. 
+ */
+
+#ifndef __IPHELPER_H
+#define __IPHELPER_H
+
+#include
+
+/* Packs the dotted quad a.b.c.d into one value with `a` in the lowest byte.
+ * The 0xfful masks make every operand unsigned long before it is shifted,
+ * so an octet >= 0x80 in position `d` is no longer shifted into the sign
+ * bit of an int.
+ */
+#define IP_AS_UL(a, b, c, d) ((((d) & 0xfful) << 24) | (((c) & 0xfful) << 16) | (((b) & 0xfful) << 8) | ((a) & 0xfful))
+
+class IpHelper;
+
+/* Interface stub of a TCP connection end point.
+ * All operations are placeholders: they do nothing and report failure
+ * (false). Instances are created by IpHelper only and cannot be copied.
+ */
+class TcpSocket
+{
+  friend IpHelper;
+
+  private:
+
+    bool _outgoing;
+    unsigned short _local_port;
+    unsigned short _remote_port;
+
+    // Indicates if we are connected.
+    bool _connected;
+    // A socket can still be "connected" although closed, if there is still data to be sent.
+    // After sending this data, the socket will finally be marked as "closed"
+    bool _closed;
+
+    /* ... semaphores used to be initialized here */
+    /* ... buffers ... */
+
+    /* Only to be called by IpHelper.
+     * Every member gets a defined value; `caps` is unused in this stub. */
+    TcpSocket(unsigned caps)
+        : _outgoing(false), _local_port(0), _remote_port(0),
+          _connected(false), _closed(true)
+    { /* ... */ }
+
+    /* Forbidden and hence not implemented: */
+    TcpSocket(TcpSocket const&);
+    void operator=(TcpSocket const&);
+
+  public:
+    /*
+     * Methods for the end user!
+     */
+
+    bool block_until_connected() { return false; }
+
+    /* Close this socket. */
+    void close() {}
+
+    /* Blocking receive function. Difference to BSD sockets:
+     * Does _not_ return before it received the expected number of bytes. */
+    bool receive(void *data, unsigned bytes) { return false; }
+
+    /* Blocking send function. Difference to BSD sockets:
+     * Does _not_ return before the user ACKed all bytes. */
+    bool send(void *data, unsigned bytes) { return false; }
+
+    /* Nonblocking send function. Returns immediately.
+     * Call wait_complete after you pushed multiple send_nonblocking() calls. */
+    bool send_nonblocking(void *data, unsigned bytes) { return false; }
+
+    /* Wait until the receiver ACKed all packets sent from this socket. */
+    bool wait_complete() { return false; }
+};
+
+class IpHelper
+{
+  private:
+    /* ...
*/ + + unsigned long long _mac; + + mword _ip; + mword _netmask; + mword _gateway; + + TcpSocket *_sockets; + + IpHelper() : _mac(0), _ip(0), _netmask(0), _gateway(0), _sockets(NULL) + {}; + + + /* Forbidden, hence not implemented: */ + IpHelper(IpHelper const&); + void operator=(IpHelper const&); + + public: + /* This is a singleton */ + static IpHelper & instance() + { + static IpHelper instance; + return instance; + } + + /* === These methods are to be used from the network thread === */ + + /* Attach a KernelSemaphore to this and get notified on timeout events. + * You will better attach this to network events, too. */ + unsigned timer_sm() { return 0; /* This used to return a network timer semaphor capability */ } + + /* Call this after the semaphore let you through to reprogram for the next timeout */ + void check_timeout() {} + + /* Call this regularly to let sockets send */ + void sockets_send() {} + + /* Feed this method regularly with new incoming packets from the network. */ + void do_tcpip(unsigned char* data, unsigned size) {} + + /* === These methods are to be used by the actual end user === */ + + /* Call this once at the beginning to initialize everything. */ + bool init(/* ... */) { return false;} + + /* Block-wait until IpHelper gets an IP and return its value. */ + mword get_ip() { return 0; } + + /* Connect to port at given IP and return a working socket. */ + TcpSocket * connect(unsigned addr, unsigned port) { return NULL; } + + /* Make a socket listen on port and return a TcpSocket object when a connection + * was established */ + TcpSocket * listen(unsigned port) { return NULL; } +}; + +#endif /* __IPHELPER_H */ diff --git a/include/nul/message.h b/include/nul/message.h index bf0a31c3..be27d20d 100644 --- a/include/nul/message.h +++ b/include/nul/message.h @@ -6,6 +6,8 @@ * Copyright (C) 2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. 
+ * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -110,7 +112,8 @@ struct MessageMemRegion uintptr_t start_page; unsigned count; char * ptr; - MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0) {} + bool actual_physmem; + MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0), actual_physmem(false) {} }; @@ -254,6 +257,7 @@ struct MessageLegacy INTR, DEASS_INTR, INTA, + UNLOCK, } type; unsigned value; MessageLegacy(Type _type, unsigned _value=0) : type(_type), value(_value) {} @@ -449,6 +453,10 @@ struct MessageHostOp OP_VCPU_BLOCK, OP_VCPU_RELEASE, OP_WAIT_CHILD, + OP_NEXT_DIRTY_PAGE, + OP_GET_CONFIG_STRING, + OP_MIGRATION_RETRIEVE_INIT, + OP_MIGRATION_START, } type; union { unsigned long value; @@ -560,6 +568,23 @@ struct MessageAcpi MessageAcpi(unsigned _parent_bdf, unsigned _bdf, unsigned char _pin): type(ACPI_GET_IRQ), parent_bdf(_parent_bdf), bdf(_bdf), pin(_pin), gsi(~0u) {} }; +/** + * Virtual ACPI: Fixed and General Purpose Events + * can be triggered with these messages + */ +struct MessageAcpiEvent +{ + enum EventType { + ACPI_EVENT_FIXED, + ACPI_EVENT_GP, + ACPI_EVENT_HOT_UNPLUG, + ACPI_EVENT_HOT_REPLUG, + } type; + unsigned num; + + MessageAcpiEvent(EventType _type, unsigned _num) + : type(_type), num(_num) {}; +}; /** * Resource discovery between device models is done by the virtual @@ -748,4 +773,52 @@ struct MessageNetwork MessageNetwork(unsigned type, unsigned client) : type(type), mac(0), client(client) { } }; +struct MessageRestore +{ + enum networkStrings { + MAGIC_STRING_DEVICE_DESC = 0x8D06F00D + }; + + enum restoreTypes { + RESTORE_RESTART = 0, // RESTART is sent over the restore bus for initialization + RESTORE_TIMEOUTLIST, + RESTORE_PIC, + RESTORE_LAPIC, + RESTORE_PIT, + RESTORE_VGA, + RESTORE_NIC, + RESTORE_ACPI, + RESTORE_VCPU, + RESTORE_LAST, + // This one is acutally a restore device type: + // vga.cc will react on this, printing messages on the guest 
screen. + VGA_DISPLAY_GUEST, + VGA_VIDEOMODE, + // This is for pass-through devices. They will un-/replug themselves + // out of/into the guest before/after live migration + PCI_PLUG, + }; + unsigned long magic_string; + // Use these enums on devtype + unsigned devtype; + // The device will note down how many bytes of this structure it actually uses. + mword bytes; + // Two variables which every device type can use for identification + unsigned id1; + unsigned id2; + // write=true: Writing a device state onto disk. false: Reading back from disk + bool write; + + // Space for saving the device state + char *space; + + MessageRestore(unsigned _devtype, char *_space, bool _write) : + magic_string(MAGIC_STRING_DEVICE_DESC), devtype(_devtype), + bytes(0), id1(0), id2(0), write(_write), space(_space) + {} + bool magic_string_check() { return magic_string == MAGIC_STRING_DEVICE_DESC; } +}; + + + /* EOF */ diff --git a/include/nul/migration.h b/include/nul/migration.h new file mode 100644 index 00000000..67ca0cae --- /dev/null +++ b/include/nul/migration.h @@ -0,0 +1,278 @@ +/** + * Base migration code declarations + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include +#include +#include + +class Desc +{ + protected: + unsigned _value; + Desc(unsigned v) : _value(v) {} + public: + unsigned value() { return _value; } +}; + +/** + * A page range descriptor; + * Introduced, because NUL provided CRDs for this... 
+ **/
+class Prd
+{
+  protected:
+    // Layout as used by the accessors below:
+    // bits 12 and up: page offset, bits 7-11: order, bits 0-4: attributes.
+    unsigned _value;
+
+  public:
+    unsigned order() const { return ((_value >> 7) & 0x1f); }
+    // Size of the range in bytes: 2^order pages of 4 KiB each.
+    unsigned size() const { return 1u << (order() + 12); }
+    // Byte address of the first page (low 12 bits masked out).
+    unsigned base() const { return _value & ~0xfff; }
+    unsigned attr() const { return _value & 0x1f; }
+    unsigned cap() const { return _value >> 12; }
+    unsigned value() const { return _value; }
+
+    explicit Prd(unsigned offset, unsigned order, unsigned attr) : _value((offset << 12) | (order << 7) | attr) { }
+    explicit Prd(unsigned v) : _value(v) {}
+    // The default descriptor has value 0, which callers treat as "nothing".
+    explicit Prd() : Prd(0) {}
+};
+
+/* The DirtManager is fed with CRDs of dirty page regions.
+ * There's an internal bitmap which can be used for future resend-optimizations
+ * as well as generating resend-statistics.
+ *
+ * The object owns two heap arrays. Copying and assignment therefore make
+ * deep copies; a member-wise copy would leave two objects freeing the same
+ * arrays.
+ */
+class DirtManager
+{
+  private:
+    unsigned *_map;         // dirty bitmap, indexed by page number
+    unsigned _pages;        // number of pages tracked
+
+    unsigned char *_cnt;    // per page: how often it went from clean to dirty
+
+    unsigned _dirt_count;   // number of pages currently marked dirty
+
+    // Number of words allocated for a bitmap covering `pages` pages.
+    // This is the original sizing; it is more than one bit per page needs.
+    static unsigned map_words(unsigned pages)
+    {
+        return (pages + sizeof(unsigned) - 1) / sizeof(unsigned);
+    }
+
+    // Duplicates the arrays of `other`. Expects _map/_cnt to hold nothing.
+    void copy_from(const DirtManager &other)
+    {
+        _pages = other._pages;
+        _dirt_count = other._dirt_count;
+        if (other._map) {
+            _map = new unsigned[map_words(_pages)];
+            memcpy(_map, other._map, map_words(_pages) * sizeof(*_map));
+        }
+        if (other._cnt) {
+            _cnt = new unsigned char[_pages];
+            memcpy(_cnt, other._cnt, _pages * sizeof(*_cnt));
+        }
+    }
+
+    // Frees both arrays and returns to the empty state.
+    void release()
+    {
+        delete [] _map;
+        delete [] _cnt;
+        _map = NULL;
+        _cnt = NULL;
+        _pages = 0;
+        _dirt_count = 0;
+    }
+
+  public:
+    // Marks every page of the range described by `dirty`.
+    void mark_dirty(Prd dirty)
+    {
+        unsigned base = dirty.base() >> 12;
+        unsigned pages = 1u << dirty.order();
+        for (unsigned i=base; i < base + pages; ++i) mark_dirty(i);
+    }
+
+    // Marks one page; pages outside the tracked range are ignored.
+    void mark_dirty(unsigned page)
+    {
+        if (page >= _pages) return;
+
+        if (!Cpu::get_bit(_map, page)) {
+            ++_dirt_count;
+            // Saturate instead of wrapping back to 0 after 255 transitions.
+            if (_cnt[page] != 0xff) ++_cnt[page];
+        }
+        Cpu::set_bit(_map, page, true);
+    }
+
+    // Clears every page of the range described by `clean`.
+    void mark_clean(Prd clean)
+    {
+        unsigned base = clean.base() >> 12;
+        unsigned pages = 1u << clean.order();
+        for (unsigned i=base; i < base + pages; ++i) mark_clean(i);
+    }
+
+    // Clears one page. The counter only drops if the page really was
+    // dirty; otherwise cleaning a clean page would make it underflow.
+    void mark_clean(unsigned page)
+    {
+        if (page >= _pages) return;
+
+        if (Cpu::get_bit(_map, page)) --_dirt_count;
+        Cpu::set_bit(_map, page, false);
+    }
+
+    unsigned dirty_pages() const { return _dirt_count; }
+
+    /* Returns a descriptor for the first run of dirty pages, shortened to
+     * the largest power of two that fits (order = bsr(run length)), or an
+     * empty Prd if no page is dirty.
+     * NOTE(review): a run consisting of page 0 alone gives Prd(0, 0, 0),
+     * whose value is 0 and thus looks like "nothing dirty" - confirm that
+     * page 0 can never be the only dirty page.
+     */
+    Prd next_dirty() {
+        unsigned base = 0, len = 0;
+
+        for (; base < _pages; ++base) {
+            // Never look at bits beyond the last tracked page.
+            while (base + len < _pages && Cpu::get_bit(_map, base + len)) ++len;
+
+            if (len > 0) break;
+        }
+
+        if (len == 0) return Prd();
+
+        Prd ret(base, Cpu::bsr(len), 0);
+        return ret;
+    }
+
+    /* Maximum of in[pos - size .. pos + size], with the window clipped to
+     * the valid index range [0, limit - 1].
+     */
+    static inline unsigned char fir_max(unsigned char *in, unsigned limit, unsigned pos, int size)
+    {
+        int beg = static_cast<int>(pos) - size;
+        int end = static_cast<int>(pos) + size;
+        beg = VMM_MAX(beg, static_cast<int>(0));
+        end = VMM_MIN(end, static_cast<int>(limit - 1));
+
+        int width = end - beg;
+        // A window clipped down to a single element (width 0) is valid,
+        // e.g. when only one page is tracked.
+        assert(width >= 0);
+        assert(width < 2 * size + 1);
+
+        unsigned max = 0;
+        for (int i=beg; i <= end; ++i) max = VMM_MAX(max, in[i]);
+
+        return static_cast<unsigned char>(max);
+    }
+
+    /* Prints mean and variance of the per-page dirtying counters, each
+     * counter capped at `size`.
+     */
+    void print_stats()
+    {
+        const unsigned size = 20;
+
+        // Nothing tracked: report zeros rather than dividing by zero.
+        if (_pages == 0 || _cnt == NULL) {
+            Logging::printf("# avg = %u, var = %u\n", 0u, 0u);
+            return;
+        }
+
+        // `faults` can be equal to `size`, hence size + 1 buckets, all
+        // starting at zero.
+        unsigned bucket[size + 1];
+        memset(bucket, 0, sizeof(bucket));
+
+        unsigned sx = 0, sqx = 0;
+
+        unsigned char *smooth[3];
+
+        smooth[0] = new unsigned char[_pages];
+        smooth[1] = new unsigned char[_pages];
+        smooth[2] = new unsigned char[_pages];
+
+        for (unsigned i=0; i < _pages; ++i) {
+            unsigned faults = VMM_MIN(_cnt[i], size);
+            ++bucket[faults];
+
+            sx += faults;
+            sqx += faults * faults;
+
+            for (unsigned j=0; j < 3; ++j)
+                smooth[j][i] = fir_max(_cnt, _pages, i, j*50+1);
+        }
+
+        // Floating point division; var = E[x^2] - E[x]^2.
+        float avg = static_cast<float>(sx) / _pages;
+        float var = static_cast<float>(sqx) / _pages - avg * avg;
+        // Rounding may push a tiny variance below zero; the value is
+        // converted to unsigned for printing.
+        if (var < 0) var = 0;
+
+        Logging::printf("# avg = %u, var = %u\n",
+                        static_cast<unsigned>(avg), static_cast<unsigned>(var));
+
+#if 0
+        /* This generates a really long list needed for plotting
+         * statistics
+         */
+        Logging::printf("# Remaps per page:\n");
+        for (unsigned i = 0; i < _pages; ++i)
+            Logging::printf("REMAP %#x %u %u %u %u\n",
+                            i, _cnt[i], smooth[0][i], smooth[1][i], smooth[2][i]);
+#endif
+
+        delete [] smooth[0];
+        delete [] smooth[1];
+        delete [] smooth[2];
+    }
+
+    DirtManager() : _map(NULL), _pages(0), _cnt(NULL), _dirt_count(0) {}
+    DirtManager(unsigned pages) : _map(NULL), _pages(pages), _cnt(NULL), _dirt_count(0)
+    {
+        _map = new unsigned[map_words(pages)];
+        _cnt = new unsigned char[pages];
+        // Both arrays must start out zeroed: the bitmap decides whether a
+        // page counts as newly dirty.
+        memset(_map, 0, map_words(pages) * sizeof(*_map));
+        memset(_cnt, 0, pages * sizeof(*_cnt));
+    }
+    DirtManager(const DirtManager &other)
+        : _map(NULL), _pages(0), _cnt(NULL), _dirt_count(0)
+    {
+        copy_from(other);
+    }
+    DirtManager &operator=(const DirtManager &other)
+    {
+        if (this != &other) {
+            release();
+            copy_from(other);
+        }
+        return *this;
+    }
+    ~DirtManager()
+    {
+        delete [] _map;
+        delete [] _cnt;
+    }
+};
+
+class Migration : public StaticReceiver
+{
+    Motherboard *_mb;
+#if PORTED_TO_UNIX
+    Hip *_hip;
+    CapAllocator *_tls;
+#endif
+
+    char *_physmem_start;
+    unsigned long _physmem_size;
+
+    CpuState *_vcpu_utcb;
+#if PORTED_TO_UNIX
+    KernelSemaphore _vcpu_blocked_sem;
+    KernelSemaphore _vcpu_sem;
+#endif
+    bool _vcpu_should_block;
+
+    TcpSocket *_socket;
+
+    unsigned long
_sendmem; + unsigned long _sendmem_total; + + StopWatch _freeze_timer; + + /* Because of asynchronous send operations, all + * data to be send has to be preserved somewhere until + * it is ACKED. That's what this structure is for. + */ + struct longrange_data { + unsigned crd_count; + Prd *crds; + + timevalue rdtsc; + char *restore_buf; + MessageRestore end_of_devices; + + mword latency; + + longrange_data() : + crd_count(0), crds(NULL), + rdtsc(0), restore_buf(NULL), end_of_devices(0xdead, NULL, true), + latency(0) {} + }; + + DirtManager _dirtman; + + void init_memrange_info(); + void print_welcomescreen(); + bool puts_guestscreen(const char *str, bool reset_screen); + + void freeze_vcpus(); + void unfreeze_vcpus(); + + unsigned negotiate_port(); + bool send_header(); + timevalue send_ping(); + bool send_devices(longrange_data dat); + unsigned enqueue_all_dirty_pages(longrange_data &async_data); + bool send_memory(longrange_data &async_data); + + void receive_header(); + bool receive_ping(); + void receive_memory(); + bool receive_guestdevices(CpuState *vcpu_utcb); + + bool chksum_page(unsigned page_nr, mword &their_chksum, bool compare); + bool checksums(bool retrieve); + + public: + enum RestoreModes { + MODE_OFF = 0, + MODE_SEND, + MODE_RECEIVE + }; + + bool listen(unsigned port , CpuState *vcpu_utcb); + bool send(unsigned long addr, unsigned long port); + + // To be called from do_recall + void save_guestregs(CpuState *utcb); + + bool receive(MessageHostOp &msg); + + Migration(Motherboard *mb); + ~Migration(); +}; diff --git a/include/nul/migration_structs.h b/include/nul/migration_structs.h new file mode 100644 index 00000000..86893592 --- /dev/null +++ b/include/nul/migration_structs.h @@ -0,0 +1,115 @@ +/** + * Migration protocol structures + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. 
+ * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + + +struct MigrationInit { +#define MAGIC_STRING_MIGINIT 0xb00b00 + mword cmdlen; + mword magic_string; + + MigrationInit() : cmdlen(0), magic_string(MAGIC_STRING_MIGINIT) {} + MigrationInit(mword _cmdlen) : cmdlen(_cmdlen), magic_string(MAGIC_STRING_MIGINIT) {} + bool magic_string_check() { return magic_string == MAGIC_STRING_MIGINIT; } +}; + +struct MigrationAnswer { +#define MAGIC_STRING_MIGANSWER 0xfeeb1ed0 + mword success; + mword port; + mword magic_string; + + MigrationAnswer() : success(0), port(0), magic_string(MAGIC_STRING_MIGANSWER) {} + MigrationAnswer(unsigned _port) : success(1), port(_port), magic_string(MAGIC_STRING_MIGANSWER) {} + bool magic_string_check() { return magic_string == MAGIC_STRING_MIGANSWER; } +}; + +/* + * This is an index structure telling us how many memory pages and device pages + * are saved to the hard disk, enabling us to calculate offsets later. 
+ */
+struct RestoreIndex {
+    unsigned mem_pages;
+    unsigned dev_pages;
+    // Pads the structure to 0x1000 bytes in total.
+    char space[0x1000 - 2*sizeof(unsigned)];
+};
+
+/* Header carrying the video mode of the sending side.
+ * Every member is initialised by both constructors: the structure is
+ * transmitted as raw bytes, so an unset member would put indeterminate
+ * memory on the wire.
+ */
+struct MigrationHeader {
+#define MAGIC_STRING_HEADER 0xb0015366
+    mword magic_string;
+    mword version;
+    mword videomode;
+
+    MigrationHeader()
+        : magic_string(MAGIC_STRING_HEADER), version(0), videomode(0) {}
+    MigrationHeader(mword _videomode)
+        : magic_string(MAGIC_STRING_HEADER), version(0), videomode(_videomode) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_HEADER; }
+};
+
+struct AddressSpaceIndex {
+#define MAGIC_STRING_ADDR_SPACE 0xBADB0B
+    unsigned long magic_string;
+    unsigned long num_pages;
+
+    // Starts zeroed, so magic_string_check() fails until the magic is set.
+    AddressSpaceIndex() : magic_string(0), num_pages(0) {}
+    AddressSpaceIndex(unsigned long pages) : magic_string(MAGIC_STRING_ADDR_SPACE), num_pages(pages) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_ADDR_SPACE; }
+};
+
+struct PageTransferIndex {
+#define MAGIC_STRING_PAGE_INDEX 0x51CD06
+    unsigned long magic_string;
+    unsigned long desc_num;
+    unsigned long total_bytes;
+
+    // The counters start at zero instead of being left indeterminate.
+    PageTransferIndex()
+        : magic_string(MAGIC_STRING_PAGE_INDEX), desc_num(0), total_bytes(0) {}
+    PageTransferIndex(unsigned long descs, unsigned long bytes)
+        : magic_string(MAGIC_STRING_PAGE_INDEX), desc_num(descs), total_bytes(bytes) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_INDEX; }
+};
+
+static unsigned long checksum_pages(void *offset, unsigned long count)
+{
+    if (offset == 0) return 0;
+    assert(!
(reinterpret_cast(offset) & 0xfff) ); + + unsigned long chksum = 0; + unsigned long *ptr = reinterpret_cast(offset); + + for (unsigned i=0; i < count * 0x1000 / sizeof(unsigned long); i++) + chksum += ptr[i] * ptr[i]; + + return chksum; +} + +struct PageTransferDesc { +#define MAGIC_STRING_PAGE_DESC 0xDEADC0DE + unsigned long magic_string; + unsigned long offset; + unsigned long count; + unsigned long checksum; + + PageTransferDesc() {} + PageTransferDesc(unsigned long _offset, unsigned long _count) + : magic_string(MAGIC_STRING_PAGE_DESC), offset(_offset), count(_count), + checksum(checksum_pages(reinterpret_cast(_offset), _count)) { } + unsigned long recalculate_checksums() + { return (checksum = checksum_pages(reinterpret_cast(offset), count)); } + bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_DESC; } +}; + +#define MAGIC_STRING_PAGE_BORDER 0xC03DD00D diff --git a/include/nul/motherboard.h b/include/nul/motherboard.h index f4ce9b7b..6fb2f149 100644 --- a/include/nul/motherboard.h +++ b/include/nul/motherboard.h @@ -4,6 +4,8 @@ * Copyright (C) 2007-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -48,6 +50,7 @@ class Motherboard public: DBus bus_acpi; + DBus bus_acpi_event; DBus bus_ahcicontroller; DBus bus_apic; DBus bus_bios; @@ -79,6 +82,8 @@ class Motherboard DBus bus_timer; ///< Request for timers DBus bus_vesa; + DBus bus_restore; + VCpu *last_vcpu; Clock *clock() { return _clock; } Hip *hip() { return _hip; } diff --git a/include/nul/timer.h b/include/nul/timer.h index 0f4a821f..2bfd59c1 100644 --- a/include/nul/timer.h +++ b/include/nul/timer.h @@ -4,6 +4,8 @@ * Copyright (C) 2007-2008, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. 
+ * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -19,6 +21,10 @@ #include "service/cpu.h" #include "service/math.h" +#include +#include +#include + typedef unsigned long long timevalue; @@ -78,7 +84,7 @@ class Clock * Keeping track of the timeouts. */ template -class TimeoutList +class TimeoutList : public StaticReceiver> { class TimeoutEntry { @@ -91,6 +97,8 @@ class TimeoutList }; TimeoutEntry _entries[ENTRIES]; + + bool _restore_processed; public: /** * Alloc a new timeout object. @@ -187,5 +195,64 @@ class TimeoutList _entries[0]._timeout = ~0ULL; } - TimeoutList() { init(); } + TimeoutList() : _restore_processed(false) { init(); } + +#define REL_PTR(ptr, offset) ( \ + reinterpret_cast( \ + reinterpret_cast(ptr) - reinterpret_cast(offset)) \ +) +#define ABS_PTR(ptr, offset) ( \ + reinterpret_cast( \ + reinterpret_cast(ptr) + reinterpret_cast(offset)) \ +) + + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + - reinterpret_cast(_entries); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_TIMEOUTLIST || _restore_processed) return false; + + unsigned long long rdtsc = Cpu::rdtsc(); + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(_entries), bytes); + + // Do not mess around with timeout entries of the running guest, + // since we may want to let it continue after saving + TimeoutEntry *entries = reinterpret_cast(msg.space); + for (unsigned i=0; i < ENTRIES; i++) { + entries[i]._prev = REL_PTR(entries[i]._prev, _entries); + entries[i]._next = REL_PTR(entries[i]._next, _entries); + + if (i == 0) continue; + + if (entries[i]._timeout <= rdtsc) + entries[i]._timeout = 0; + else + entries[i]._timeout -= rdtsc; + } + } + else { + memcpy(reinterpret_cast(_entries), msg.space, bytes); + for 
(unsigned i=0; i < ENTRIES; i++) { + _entries[i]._prev = ABS_PTR(_entries[i]._prev, _entries); + _entries[i]._next = ABS_PTR(_entries[i]._next, _entries); + + if (i == 0) continue; + _entries[i]._timeout += rdtsc; + } + } + + //Logging::printf("%s Timeoutlist\n", msg.write ? "Saved" : "Restored"); + _restore_processed = true; + return true; + } }; diff --git a/include/nul/vcpu.h b/include/nul/vcpu.h index 145a0184..5591b285 100644 --- a/include/nul/vcpu.h +++ b/include/nul/vcpu.h @@ -4,6 +4,8 @@ * Copyright (C) 2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -36,7 +38,8 @@ struct CpuMessage { TYPE_WBINVD, TYPE_CHECK_IRQ, TYPE_CALC_IRQWINDOW, - TYPE_SINGLE_STEP + TYPE_SINGLE_STEP, + TYPE_ADD_TSC_OFF, } type; union { struct { @@ -122,7 +125,8 @@ class VCpu EVENT_DEBUG = 1 << 17, STATE_BLOCK = 1 << 18, STATE_WAKEUP = 1 << 19, - EVENT_HOST = 1 << 20 + EVENT_HOST = 1 << 20, + EVENT_RESUME = 1 << 21 }; unsigned long long inj_count; diff --git a/include/service/time.h b/include/service/time.h index ab0239be..5ce605cf 100644 --- a/include/service/time.h +++ b/include/service/time.h @@ -4,6 +4,8 @@ * Copyright (C) 2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -97,3 +99,29 @@ static inline void gmtime(timevalue seconds, struct tm_simple *tm) tm->mon = m + 1; tm->mday = days + 1; } + +class StopWatch +{ +private: + Clock *_clock; + unsigned _frequency; + timevalue _tic, _toc; + +public: + void start() { _tic = _clock->clock(_frequency); } + timevalue stop() { _toc = _clock->clock(_frequency); return delta(); } + timevalue delta() { return _toc - _tic; } + + timevalue abs_start() { return _tic; } + timevalue abs_stop() { return _toc; } + + // Returns B/ms, which is actually kB/s (if using default frequency) + unsigned rate(mword bytes) { + if (delta()) return bytes / delta(); + else return 0; + } + + StopWatch(Clock *clock, unsigned frequency = 1000 /* ms */) + : _clock(clock), _frequency(frequency), _tic(0), _toc(0) + {} +}; diff --git a/model/acpicontroller.cc b/model/acpicontroller.cc new file mode 100644 index 00000000..d293a112 --- /dev/null +++ b/model/acpicontroller.cc @@ -0,0 +1,346 @@ +/** + * ACPI controller model + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + + +#include +#include + +#include "nul/motherboard.h" +#include "executor/bios.h" + +#define CMD_ACPI_ENABLE 0xab +#define CMD_ACPI_DISABLE 0xba + +#define PORT_SMI_CMD 0xaeae + +/* The pm1 event register group is somewhat complicated. + * port numbers follow a partition rule of the register block. 
+ * see ACPI spec 4.7.3.1 + */ +#define PM1_EVT_LEN 4 +#define PORT_PM1A_EVENT_BLK 0xaea6 +#define PORT_PM1B_EVENT_BLK 0xaeaa +#define PORT_PM1A_EVENT_STATUS (PORT_PM1A_EVENT_BLK) +#define PORT_PM1A_EVENT_ENABLE (PORT_PM1A_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xa6 + 4/2 = 0xa8 +#define PORT_PM1B_EVENT_STATUS (PORT_PM1B_EVENT_BLK) +#define PORT_PM1B_EVENT_ENABLE (PORT_PM1B_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xaa + 4/2 = 0xac + +#define PM1_CNT_LEN 2 +#define PORT_PM1A_CONTROL 0xaeb0 +#define PORT_PM1B_CONTROL 0xaeb2 + +#define PORT_GPE0_STATUS 0xaeb4 +#define PORT_GPE1_STATUS 0xaeb5 +#define PORT_GPE0_ENABLE (PORT_GPE0_STATUS + 2) +#define PORT_GPE1_ENABLE (PORT_GPE1_STATUS + 2) + +#define PORT_PCIU 0xae00 +#define PORT_PCID 0xae04 +#define PORT_B0EJ 0xae08 + + +class AcpiController : public StaticReceiver, public BiosCommon +{ + private: + unsigned short _pm1a_status; + unsigned short _pm1a_enable; + unsigned short _pm1a_control; + + unsigned short _pm1b_status; + unsigned short _pm1b_enable; + unsigned short _pm1b_control; + + unsigned char _gpe0_sts; + unsigned char _gpe0_en; + unsigned char _gpe1_sts; + unsigned char _gpe1_en; + + unsigned _b0ej; // write-only register + unsigned _pciu; // read-only, REFRESH register (card plugged in) + unsigned _pcid; // read-only, DETACH register (card to be unplugged) + + bool _processed; + + StopWatch _watch; + + public: + void trigger_gpe(unsigned event_nr) + { + + // Latch the event: bits 0-7 go to the GPE0 status byte, bits 8-15 to the GPE1 status byte + _gpe0_sts |= 0x00ff & (1 << event_nr); + _gpe1_sts |= (0xff00 & (1 << event_nr)) >> 8; + + // No pending event is enabled by the guest: keep it latched but raise no SCI (same test as the DEASSERT path) + if (!(_gpe0_sts & _gpe0_en) && !(_gpe1_sts & _gpe1_en)) + return; + + // Send the guest an SCI + MessageIrqLines msg(MessageIrq::ASSERT_IRQ, 9); + _mb.bus_irqlines.send(msg); + } + + bool receive(MessageAcpiEvent &msg) { + switch (msg.type) { + case MessageAcpiEvent::ACPI_EVENT_GP: + trigger_gpe(msg.num); + break; + case
MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG: + _pciu |= (1 << msg.num); + trigger_gpe(1); + break; + case MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG: + _watch.start(); + _pcid |= (1 << msg.num); + trigger_gpe(1); + break; + + case MessageAcpiEvent::ACPI_EVENT_FIXED: + default: + return false; + } + + return true; + } + + bool receive(MessageDiscovery &msg) { + if (msg.type != MessageDiscovery::DISCOVERY) return false; + + /* The following FADT entries will tell the guest kernel + * how to interact with the system when receiving + * System Control Interrupts (SCI). + * Only the GPE part is important for hot plugging, but + * all the PM-stuff is mandatory for event management + * to work. + */ + discovery_write_dw("FACP", 56, PORT_PM1A_EVENT_BLK); + discovery_write_dw("FACP", 60, PORT_PM1B_EVENT_BLK); + discovery_write_dw("FACP", 64, PORT_PM1A_CONTROL); + discovery_write_dw("FACP", 68, PORT_PM1B_CONTROL); + discovery_write_dw("FACP", 88, PM1_EVT_LEN, 1); + discovery_write_dw("FACP", 89, PM1_CNT_LEN, 1); + + discovery_write_dw("FACP", 80, PORT_GPE0_STATUS, 4); // GPE0_BLK + discovery_write_dw("FACP", 84, PORT_GPE1_STATUS, 4); // GPE1_BLK + + discovery_write_dw("FACP", 92, 4, 1); // GPE0_BLK_LEN + discovery_write_dw("FACP", 93, 4, 1); // GPE1_BLK_LEN + discovery_write_dw("FACP", 94, 16, 1); // GPE1_BASE (offset) + + /* This is used at boot once. Linux will write + * CMD_ACPI_ENABLE via system IO using port PORT_SMI_CMD + * to tell the mainboard it wants to use ACPI. + * If CMD_ACPI_ENABLE was defined as 0x00, the guest kernel + * would think that ACPI was always on. Therefore, this is + * optional and one could just erase the next three lines. 
+ */ + discovery_write_dw("FACP", 48, PORT_SMI_CMD); + discovery_write_dw("FACP", 52, CMD_ACPI_ENABLE, 1); + discovery_write_dw("FACP", 53, CMD_ACPI_DISABLE, 1); + + return true; + } + + bool receive(MessageIOIn &msg) { + switch (msg.port) { + case PORT_PM1A_EVENT_STATUS: + //Logging::printf("In on port pm1a EVENT STATUS: %x len %u\n", _pm1a_status, msg.type); + msg.value = _pm1a_status; + return true; + case PORT_PM1A_EVENT_ENABLE: + //Logging::printf("In on port pm1a EVENT ENABLE: %x len %u\n", _pm1a_enable, msg.type); + msg.value = _pm1a_enable; + return true; + case PORT_PM1A_CONTROL: + //Logging::printf("In on port pm1a CONTROL %x len %u\n", _pm1a_control, msg.type); + msg.value = _pm1a_control; + return true; + + case PORT_PM1B_EVENT_STATUS: + //Logging::printf("In on port pm1b EVENT STATUS: %x len %u\n", _pm1b_status, msg.type); + msg.value = _pm1b_status; + return true; + case PORT_PM1B_EVENT_ENABLE: + //Logging::printf("In on port pm1b EVENT ENABLE: %x len %u\n", _pm1b_enable, msg.type); + msg.value = _pm1b_enable; + return true; + case PORT_PM1B_CONTROL: + //Logging::printf("In on port pm1b CONTROL %x len %u\n", _pm1b_control, msg.type); + msg.value = _pm1b_control; + return true; + + + case PORT_GPE0_STATUS: + //Logging::printf("In on port GPE0 STS: %x\n", _gpe0_sts); + msg.value = _gpe0_sts; + return true; + case PORT_GPE0_ENABLE: + //Logging::printf("In on port GPE0 EN %x\n", _gpe0_en); + msg.value = _gpe0_en; + return true; + case PORT_GPE1_STATUS: + //Logging::printf("In on port GPE1 STS: %x\n", _gpe1_sts); + msg.value = _gpe1_sts; + return true; + case PORT_GPE1_ENABLE: + //Logging::printf("In on port GPE1 EN %x\n", _gpe1_en); + msg.value = _gpe1_en; + return true; + + case PORT_PCIU: + //Logging::printf("--- In on PCIU\n"); + msg.value = _pciu; + return true; + case PORT_PCID: + //Logging::printf("--- In on PCID\n"); + msg.value = _pcid; + return true; + default:; + } + return false; + } + + bool receive(MessageIOOut &msg) { + switch (msg.port) { + 
case PORT_SMI_CMD: + /* During boot the guest kernel checks PORT_SMI_CMD + * in the ACPI FADT table. If SCI_EN is not set, + * the system is in legacy mode. Hence it sends the + * CMD_ACPI_ENABLE cmd it got from the FADT again to + * this port and then polls for SCI_EN until it is set. + * ACPI is then officially active. */ + if (msg.value == CMD_ACPI_ENABLE) { + Logging::printf("Enabling ACPI for guest.\n"); + _pm1a_control |= 1; // Setting SCI_EN bit + } + else if (msg.value == CMD_ACPI_DISABLE) { + Logging::printf("Disabling ACPI for guest.\n"); + _pm1a_control &= ~1U; + } + return true; + + case PORT_PM1A_EVENT_STATUS: + //Logging::printf("Out on port pm1a EVENT STATUS: %x len %u\n", msg.value, msg.type); + return true; + case PORT_PM1A_EVENT_ENABLE: + //Logging::printf("Out on port pm1a EVENT ENABLE: %x len %u\n", msg.value, msg.type); + _pm1a_enable = static_cast(msg.value); + return true; + case PORT_PM1A_CONTROL: + //Logging::printf("Out on port pm1a CONTROL %x len %u\n", msg.value, msg.type); + return true; + + + case PORT_PM1B_EVENT_STATUS: + //Logging::printf("Out on port pm1b EVENT STATUS: %x len %u\n", msg.value, msg.type); + return true; + case PORT_PM1B_EVENT_ENABLE: + //Logging::printf("Out on port pm1b EVENT ENABLE: %x len %u\n", msg.value, msg.type); + _pm1b_enable = static_cast(msg.value); // PM1b has its own enable register; must not clobber _pm1a_enable + return true; + case PORT_PM1B_CONTROL: + //Logging::printf("Out on port pm1b CONTROL %x len %u\n", msg.value, msg.type); + return true; + + case PORT_GPE0_STATUS: + //Logging::printf("Out on port GPE0 STS: %x len %u\n", msg.value, msg.type); + _gpe0_sts &= ~ static_cast(msg.value); + return true; + case PORT_GPE0_ENABLE: + //Logging::printf("Out on port GPE0 EN %x len %u\n", msg.value, msg.type); + _gpe0_en = static_cast(msg.value); + return true; + case PORT_GPE1_STATUS: + //Logging::printf("Out on port GPE1 STS: %x\n", msg.value); + _gpe1_sts &= ~ static_cast(msg.value); + return true; + case PORT_GPE1_ENABLE: + //Logging::printf("Out on port GPE1 EN %x\n",
msg.value); + _gpe1_en = static_cast(msg.value); + return true; + + case PORT_B0EJ: + _watch.stop(); + Logging::printf("PCI hot-unplug confirmed by guest " + "(Output on B0EJ: %x) after %llu ms\n", + msg.value, _watch.delta()); + _pcid &= ~msg.value; + //Logging::printf("PCIU: %x, PCID: %x\n", _pciu, _pcid); + return true; + default:; + } + + /* Deassert this IRQ if all enabled events were cleared by the guest. + * This interrupt is thrown again otherwise. */ + if (!(_pm1a_status & _pm1a_enable) && + !(_pm1b_status & _pm1b_enable) && + !(_gpe0_sts & _gpe0_en) && + !(_gpe1_sts & _gpe1_en)) { + MessageIrqLines msg(MessageIrq::DEASSERT_IRQ, 9); + _mb.bus_irqlines.send(msg); + } + + return false; + } + + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_processed) + -reinterpret_cast(&_pm1a_status); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_ACPI || _processed) return false; + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(&_pm1a_status), bytes); + } + else { + memcpy(reinterpret_cast(&_pm1a_status), msg.space, bytes); + } + + Logging::printf("%s ACPI controller\n", msg.write?"Saved":"Restored"); + + _processed = true; + return true; + } + + AcpiController(Motherboard &mb) + : BiosCommon(mb), + _pm1a_status(0), _pm1a_enable(0), _pm1a_control(0), + _pm1b_status(0), _pm1b_enable(0), _pm1b_control(0), + _gpe0_sts(0), _gpe0_en(0), _gpe1_sts(0), _gpe1_en(0), + _b0ej(0), _pciu(0), _pcid(0), + _processed(false), _watch(mb.clock()) + { } +}; + +PARAM_HANDLER(acpimodel, + "acpimodel - Capable of issuing ACPI events to the guest.") +{ + AcpiController * dev = new AcpiController(mb); + mb.bus_discovery .add(dev, AcpiController::receive_static); + mb.bus_ioin .add(dev, AcpiController::receive_static); + mb.bus_ioout .add(dev, AcpiController::receive_static); + 
mb.bus_acpi_event.add(dev, AcpiController::receive_static); + mb.bus_restore .add(dev, AcpiController::receive_static); +} diff --git a/model/intel82576vf.cc b/model/intel82576vf.cc index abe9655e..612356ac 100644 --- a/model/intel82576vf.cc +++ b/model/intel82576vf.cc @@ -5,6 +5,8 @@ * Copyright (C) 2010, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -578,6 +580,40 @@ class Model82576vf : public StaticReceiver uint32 raw[3*4]; } _msix; + unsigned _ip_address; + EthernetAddr _guest_uses_mac; + bool processed; + + void update_ip(unsigned char *packet, unsigned packet_len) + { + unsigned short packet_type = * reinterpret_cast(packet + 12); + if (packet_type == 0x0608) { + unsigned char *mac = packet + 14 + 8; // Source MAC address + unsigned char *ip = packet + 14 + 14; // Source IP address + + EthernetAddr ethaddr(mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + +#if 0 + Logging::printf("Sending packet type %x from MAC %08llx, IP %x\n", + static_cast(packet_type), + ethaddr.raw, *reinterpret_cast(ip)); +#endif + + _guest_uses_mac = ethaddr; + _ip_address = * reinterpret_cast(ip); + } + } + + + void arp_gratuitous(const EthernetAddr &addr, const bool request) + { + const arp_packet arp(_guest_uses_mac, addr, _ip_address, + request ? 
0x100 /* ARP_REQUEST */ : 0x200 /* ARP_REPLY */); + + MessageNetwork m(reinterpret_cast(&arp), sizeof(arp), 0); + _net.send(m); + } + uint32 VTFRTIMER_compute() { // XXX @@ -886,6 +922,62 @@ class Model82576vf : public StaticReceiver return false; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&processed) + -reinterpret_cast(&_mac); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + processed = false; + msg.bytes += bytes + 2 * 0x1000 + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_NIC || processed) return false; + + if (msg.write) { + msg.bytes = bytes + 2 * 0x1000; + memcpy(msg.space, reinterpret_cast(&_mac), bytes); + memcpy(msg.space + bytes, reinterpret_cast(_local_rx_regs), 0x1000); + memcpy(msg.space + bytes + 0x1000, reinterpret_cast(_local_tx_regs), 0x1000); + } + else { + uint32 *local_rx_regs = _local_rx_regs; + uint32 *local_tx_regs = _local_tx_regs; + Clock *clock = _clock; + + memcpy(reinterpret_cast(&_mac), msg.space, bytes); + + _local_rx_regs = local_rx_regs; + _local_tx_regs = local_tx_regs; + _clock = clock; + + memcpy(_local_rx_regs, msg.space + bytes, 0x1000); + memcpy(_local_tx_regs, msg.space + bytes + 0x1000, 0x1000); + + _rx_queues[0].parent = this; + _rx_queues[0].regs = local_rx_regs; + _rx_queues[1].parent = this; + _rx_queues[1].regs = local_rx_regs + 0x100/4; + _tx_queues[0].parent = this; + _tx_queues[0].regs = local_tx_regs; + _tx_queues[1].parent = this; + _tx_queues[1].regs = local_tx_regs + 0x100/4; + + if (_ip_address) { + Logging::printf("Trying to claim: MAC " MAC_FMT " IP %x\n", + MAC_SPLIT((&_guest_uses_mac)), _ip_address); + for (int i=0; i < 3; ++i) + arp_gratuitous(EthernetAddr(0xffffffffffffull), true); + } + } + + Logging::printf("%s NIC\n", msg.write?"Saved":"Restored"); + processed = true; + return true; + } + + Model82576vf(uint64 mac, DBus &net, DBus *bus_mem, DBus *bus_memregion, Clock *clock, DBus &timer, @@ -895,7 +987,8 @@ class 
Model82576vf : public StaticReceiver _clock(clock), _timer(timer), _mem_mmio(mem_mmio), _mem_msix(mem_msix), _txpoll_us(txpoll_us), _map_rx(map_rx), _bdf(bdf), - _promisc_default(promisc_default) + _promisc_default(promisc_default), _ip_address(0), _guest_uses_mac(0), + processed(false) { Logging::printf("Attached 82576VF model at %08x+0x4000, %08x+0x1000\n", mem_mmio, mem_msix); @@ -946,6 +1039,7 @@ PARAM_HANDLER(intel82576vf, mb.bus_network. add(dev, &Model82576vf::receive_static); mb.bus_timeout. add(dev, &Model82576vf::receive_static); mb.bus_legacy. add(dev, &Model82576vf::receive_static); + mb.bus_restore. add(dev, &Model82576vf::receive_static); } diff --git a/model/intel82576vf.h b/model/intel82576vf.h index b857d378..7da55770 100644 --- a/model/intel82576vf.h +++ b/model/intel82576vf.h @@ -5,6 +5,8 @@ * Copyright (C) 2010, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -44,4 +46,38 @@ class Mta { Mta() : _bits() { } }; +struct arp_packet { + unsigned char destination[6]; + unsigned char source[6]; + unsigned short eth_type; + unsigned short hw_type; + unsigned short protocol_type; + unsigned char hwaddr_len; + unsigned char protocoladdr_len; + unsigned short operation; + unsigned char sender_hwaddr[6]; + unsigned sender_ip; + unsigned char target_hwaddr[6]; + unsigned target_ip; + + arp_packet(EthernetAddr src, EthernetAddr dst, unsigned ip_addr, + unsigned short _operation) + : + eth_type(0x608), hw_type(0x100), protocol_type(0x8), hwaddr_len(6), + protocoladdr_len(4), operation(_operation), + sender_ip(ip_addr), target_ip(ip_addr) + { + memcpy(destination, dst.byte, 6); + memset(target_hwaddr, 0, 6); + memcpy(source, src.byte, 6); + memcpy(sender_hwaddr, src.byte, 6); + } + + bool source_is(const EthernetAddr &a) const + { + EthernetAddr my_addr(*reinterpret_cast(destination)); + return my_addr == a; + } +} __attribute__((packed)); + // EOF diff --git a/model/lapic.cc b/model/lapic.cc index bd89ea30..c99b066a 100644 --- a/model/lapic.cc +++ b/model/lapic.cc @@ -4,6 +4,8 @@ * Copyright (C) 2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -32,8 +34,11 @@ */ class Lapic : public DiscoveryHelper, public StaticReceiver { + int _regstart; #define VMM_REGBASE "../model/lapic.cc" #include "model/reg.h" + int _regend; + enum { MAX_FREQ = 200000000, LVT_MASK_BIT = 16, @@ -64,6 +69,7 @@ class Lapic : public DiscoveryHelper, public StaticReceiver bool _rirr[NUM_LVT]; unsigned _lowest_rr; + bool _restore_processed; bool sw_disabled() { return ~_SVR & 0x100; } bool hw_disabled() { return ~_msr & 0x800; } @@ -738,8 +744,40 @@ class Lapic : public DiscoveryHelper, public StaticReceiver } } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_timer); - Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer) : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer) + const mword bytes2 = reinterpret_cast(&_regend) - reinterpret_cast(&_regstart); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + bytes2 + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_LAPIC || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes + bytes2; + memcpy(msg.space, reinterpret_cast(&_timer), bytes); + memcpy(msg.space + bytes, reinterpret_cast(&_regstart), bytes2); + } + else { + memcpy(reinterpret_cast(&_timer), msg.space, bytes); + memcpy(reinterpret_cast(&_regstart), msg.space + bytes, bytes2); + } + + Logging::printf("%s LAPIC\n", msg.write?"Saved":"Restored"); + + _restore_processed = true; + return true; + } + + + Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer) + : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer), _restore_processed(false) { // find a FREQ that is not too high for (_timer_clock_shift=0; _timer_clock_shift < 32; _timer_clock_shift++) @@ -762,11 +800,12 @@ class Lapic : public 
DiscoveryHelper, public StaticReceiver mb.bus_apic.add(this, receive_static); mb.bus_timeout.add(this, receive_static); mb.bus_discovery.add(this,discover); + mb.bus_restore.add(this, receive_static); + vcpu->executor.add(this, receive_static); vcpu->mem.add(this, receive_static); vcpu->memregion.add(this, receive_static); vcpu->bus_lapic.add(this, receive_static); - } }; diff --git a/model/memorycontroller.cc b/model/memorycontroller.cc index c61119eb..d331ef5a 100644 --- a/model/memorycontroller.cc +++ b/model/memorycontroller.cc @@ -45,6 +45,7 @@ class MemoryController : public StaticReceiver msg.start_page = _start >> 12; msg.count = (_end - _start) >> 12; msg.ptr = _physmem + _start; + msg.actual_physmem = true; return true; } diff --git a/model/pcidirect.cc b/model/pcidirect.cc index e8daaeeb..7929860b 100644 --- a/model/pcidirect.cc +++ b/model/pcidirect.cc @@ -4,6 +4,8 @@ * Copyright (C) 2007-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -393,6 +395,19 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci return true; } + bool receive(MessageRestore &msg) { + if (msg.devtype != MessageRestore::PCI_PLUG) return false; + + unsigned slot = (_guestbdf >> 3) & 0x1f; + + MessageAcpiEvent amsg(msg.write ? 
+ MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG : + MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG, + slot); + + _mb.bus_acpi_event.send(amsg); + return true; + } DirectPciDevice(Motherboard &mb, unsigned hbdf, unsigned guestbdf, bool assign, @@ -461,6 +476,7 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci if (map_mode != MAP_MODE_DISABLED) mb.bus_memregion.add(this, DirectPciDevice::receive_static); mb.bus_hostirq.add(this, DirectPciDevice::receive_static); + mb.bus_restore.add(this, DirectPciDevice::receive_static); //mb.bus_irqnotify.add(this, DirectPciDevice::receive_static); } }; diff --git a/model/pic8259.cc b/model/pic8259.cc index 438bfd44..2b8adede 100644 --- a/model/pic8259.cc +++ b/model/pic8259.cc @@ -4,6 +4,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -67,6 +69,8 @@ class PicDevice : public StaticReceiver unsigned char _elcr; unsigned char _notify; + bool _restore_processed; + // helper functions bool is_slave() { return (_icw[ICW4] & ICW4_BUF) ? 
(~_icw[ICW4] & ICW4_MS) : _virq; } void rotate_prios() { _prio_lowest = (_prio_lowest+1) & 7; } @@ -351,11 +355,44 @@ class PicDevice : public StaticReceiver return false; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_base); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_PIC || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + msg.id1 = _base; + msg.id2 = _upstream_irq; + memcpy(msg.space, reinterpret_cast(&_base), bytes); + + } + else { + if (msg.id1 != _base || msg.id2 != _upstream_irq) return false; + + memcpy(reinterpret_cast(&_base), msg.space, bytes); + } + + //Logging::printf("%s PIC (base %x, IRQ %x)\n", msg.write?"Saved":"Restored", msg.id1, msg.id2); + _restore_processed = true; + return true; + } + + + PicDevice(DBus &bus_irq, DBus &bus_pic, DBus &bus_legacy, DBus &bus_notify, unsigned short base, unsigned char irq, unsigned short elcr_base, unsigned char virq) : _bus_irq(bus_irq), _bus_pic(bus_pic), _bus_legacy(bus_legacy), _bus_notify(bus_notify), - _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1) + _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1), _restore_processed(false) { _icw[ICW1] = 0; reset_values(); @@ -384,8 +421,10 @@ PARAM_HANDLER(pic, mb.bus_ioout. add(dev, PicDevice::receive_static); mb.bus_irqlines.add(dev, PicDevice::receive_static); mb.bus_pic. 
add(dev, PicDevice::receive_static); + mb.bus_restore.add(dev, PicDevice::receive_static); if (!virq) mb.bus_legacy.add(dev, PicDevice::receive_static); virq += 8; + } diff --git a/model/pit8254.cc b/model/pit8254.cc index 5853e77c..b67d11af 100644 --- a/model/pit8254.cc +++ b/model/pit8254.cc @@ -4,6 +4,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -370,6 +372,8 @@ class PitDevice : public StaticReceiver static const unsigned COUNTER = 3; PitCounter _c[COUNTER]; + bool _restore_processed; + public: bool receive(MessagePit &msg) @@ -421,9 +425,36 @@ class PitDevice : public StaticReceiver return true; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_base); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_PIT || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(&_base), bytes); + + } + else { + memcpy(reinterpret_cast(&_base), msg.space, bytes); + } + + //Logging::printf("%s PIT\n", msg.write?"Saved":"Restored"); + _restore_processed = true; + return true; + } + PitDevice(Motherboard &mb, unsigned short base, unsigned irq, unsigned pit) - : _base(base), _addr(pit*COUNTER) + : _base(base), _addr(pit*COUNTER), _restore_processed(false) { for (unsigned i=0; i < COUNTER; i++) { @@ -449,4 +480,5 @@ PARAM_HANDLER(pit, mb.bus_ioin.add(dev, PitDevice::receive_static); mb.bus_ioout.add(dev, PitDevice::receive_static); mb.bus_pit.add(dev, PitDevice::receive_static); + mb.bus_restore.add(dev, PitDevice::receive_static); } diff --git a/model/vcpu.cc b/model/vcpu.cc index 
c412c7ce..ce76685f 100644 --- a/model/vcpu.cc +++ b/model/vcpu.cc @@ -4,6 +4,8 @@ * Copyright (C) 2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -204,6 +206,12 @@ class VirtualCpu : public VCpu, public StaticReceiver msg.mtr_out |= MTD_STATE | MTD_INJ; if (!old_event) return; + + if (old_event & EVENT_RESUME) { + Cpu::atomic_and(&_event, ~(old_event & EVENT_RESUME)); + cpu->actv_state = 0; + } + if (old_event & (EVENT_DEBUG | EVENT_HOST)) { if (old_event & EVENT_DEBUG) dprintf("state %x event %8x eip %8x eax %x ebx %x edx %x esi %x\n", cpu->actv_state, old_event, cpu->eip, cpu->eax, cpu->ebx, cpu->edx, cpu->esi); @@ -316,7 +324,7 @@ class VirtualCpu : public VCpu, public StaticReceiver COUNTER_INC("EVENT"); if (value & DEASS_INTR) Cpu::atomic_and(&_event, ~EVENT_INTR); - if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))) return; + if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME))) return; // INIT or AP RESET - go to the wait-for-sipi state if ((value & EVENT_MASK) == EVENT_INIT) @@ -331,7 +339,7 @@ class VirtualCpu : public VCpu, public StaticReceiver */ if (Cpu::cmpxchg4b(&_sipi, 0, value)) return; - Cpu::atomic_or(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))); + Cpu::atomic_or(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME))); MessageHostOp msg(MessageHostOp::OP_VCPU_RELEASE, _hostop_id, _event & STATE_BLOCK); @@ -353,6 +361,11 @@ class VirtualCpu : public VCpu, public StaticReceiver return true; } + if (msg.type == MessageLegacy::UNLOCK) { + got_event(EVENT_RESUME); + return true; + } + // BSP receives only legacy signals if the LAPIC is disabled if (is_ap() || CPUID_EDX1 & (1 << 9)) return false; @@ -384,6 +397,11 @@ class VirtualCpu : public 
VCpu, public StaticReceiver bool receive(CpuMessage &msg) { + if (msg.type == CpuMessage::TYPE_ADD_TSC_OFF) { + _reset_tsc_off += msg.current_tsc_off; + return true; + } + // TSC drift compensation. if (msg.type != CpuMessage::TYPE_CPUID_WRITE && msg.mtr_in & MTD_TSC && ~msg.mtr_out & MTD_TSC) { COUNTER_INC("tsc adoption"); @@ -447,6 +465,7 @@ class VirtualCpu : public VCpu, public StaticReceiver case CpuMessage::TYPE_SINGLE_STEP: case CpuMessage::TYPE_WBINVD: case CpuMessage::TYPE_INVD: + case CpuMessage::TYPE_ADD_TSC_OFF: default: return false; } diff --git a/model/vga.cc b/model/vga.cc index dadfe0ed..44c9d3fa 100644 --- a/model/vga.cc +++ b/model/vga.cc @@ -4,6 +4,8 @@ * Copyright (C) 2007-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -45,6 +47,9 @@ class Vga : public StaticReceiver, public BiosCommon unsigned char _crt_index; unsigned _ebda_segment; unsigned _vbe_mode; + mword _last_videomode_request; + + bool _restore_processed; void puts_guest(const char *msg) { unsigned pos = _regs.cursor_pos - TEXT_OFFSET; @@ -174,6 +179,7 @@ class Vga : public StaticReceiver, public BiosCommon case 0x4f02: // set vbemode { ConsoleModeInfo info; + _last_videomode_request = cpu->ebx; unsigned index = get_vesa_mode(cpu->ebx & 0x0fff, &info); if (index != ~0u && info.attr & 1) { @@ -349,6 +355,12 @@ class Vga : public StaticReceiver, public BiosCommon return true; } + void set_videomode(mword videomode) + { + ConsoleModeInfo info; + _regs.mode = get_vesa_mode(videomode & 0x0fff, &info); + } + public: bool receive(MessageBios &msg) @@ -522,9 +534,55 @@ class Vga : public StaticReceiver, public BiosCommon return true; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_view); + + if (msg.devtype == 
MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype == MessageRestore::VGA_DISPLAY_GUEST) { + if (msg.write) memset(_framebuffer_ptr, 0, _framebuffer_size); + puts_guest(msg.space); + return true; + } + + if (msg.devtype == MessageRestore::VGA_VIDEOMODE) { + if (msg.write) { + set_videomode(msg.bytes); + MessageConsole cmsg(MessageConsole::TYPE_SWITCH_VIEW); + cmsg.view = _view; + _mb.bus_console.send(cmsg); + } + else + msg.bytes = _last_videomode_request; + return true; + } + + if (msg.devtype != MessageRestore::RESTORE_VGA || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(&_view), bytes); + + } + else { + memcpy(reinterpret_cast(&_view), msg.space, bytes); + set_videomode(_last_videomode_request); + } + + //Logging::printf("%s VGA\n", msg.write?"Saved":"Restored"); + _restore_processed = true; + return true; + } + Vga(Motherboard &mb, unsigned short iobase, char *framebuffer_ptr, uintptr_t framebuffer_phys, size_t framebuffer_size) - : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode() + : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode(), _last_videomode_request(), _restore_processed(false) { assert(!(framebuffer_phys & 0xfff)); assert(!(framebuffer_size & 0xfff)); @@ -576,5 +634,6 @@ PARAM_HANDLER(vga, mb.bus_mem .add(dev, Vga::receive_static); mb.bus_memregion.add(dev, Vga::receive_static); mb.bus_discovery.add(dev, Vga::receive_static); + mb.bus_restore.add(dev, Vga::receive_static); } diff --git a/unix/SConstruct b/unix/SConstruct index 3da784e8..5e64e833 100644 --- a/unix/SConstruct +++ b/unix/SConstruct @@ -123,9 +123,11 @@ sources = 
Glob('*.cc') + [ # Unix frontend '../model/pmtimer.cc', '../model/vcpu.cc', '../model/vbios.cc', + '../model/acpicontroller.cc', '../model/lapic.cc', '../model/msi.cc', '../host/hostkeyboard.cc', + '../host/migration.cc' ] # TODO not yet ported if target_arch == 'x86_32': diff --git a/unix/main.cc b/unix/main.cc index c4cd9a3f..530403dd 100644 --- a/unix/main.cc +++ b/unix/main.cc @@ -4,6 +4,8 @@ * Copyright (C) 2012, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Seoul. * * Seoul is free software: you can redistribute it and/or modify it @@ -48,6 +50,7 @@ #include #include +#include const char version_str[] = #include "version.inc" @@ -166,6 +169,17 @@ static std::vector disks; // Used to serialize all operations (for now). pthread_mutex_t irq_mtx; +// Relevant to live migration + +Migration *_migrator; +Migration::RestoreModes _restore_mode = Migration::MODE_OFF; +unsigned _migration_ip; +unsigned _migration_port; + +// the memory remapping procedure should only +// remap memory in page size granularity, if set +bool _track_page_usage = false; + static void skip_instruction(CpuMessage &msg) { // advance EIP @@ -231,8 +245,23 @@ static void *vcpu_thread_fn(void *arg) while (true) { pthread_mutex_lock(&irq_mtx); + + if (_restore_mode == Migration::MODE_RECEIVE) + // This will block until everything is restored + _migrator->listen(_migration_port, &cpu_state); + else if (_restore_mode == Migration::MODE_SEND) + // This will block if the last memory resend round is reached + _migrator->save_guestregs(&cpu_state); + handle_vcpu(false, CpuMessage::TYPE_SINGLE_STEP, vcpu, &cpu_state); // Logging::printf("eip %x\n", cpu_state.eip); + + if (_restore_mode == Migration::MODE_RECEIVE) { + _restore_mode = Migration::MODE_OFF; + delete _migrator; + _migrator = NULL; + cpu_state.mtd = MTD_ALL; + } pthread_mutex_unlock(&irq_mtx); } @@ -247,6 +276,31 @@ struct 
Vcpu_info { static std::vector vcpu_info; +static void *migration_thread_fn(void *) +{ + _migrator = new Migration(&mb); + _migrator->send(_migration_ip, _migration_port); + + delete _migrator; + _migrator = nullptr; + + return nullptr; +} + +static void start_migration_to(unsigned ip, unsigned port) +{ + _migration_ip = ip; + _migration_port = port; + _restore_mode = Migration::MODE_SEND; + + pthread_t migthread; + if (0 != pthread_create(&migthread, NULL, migration_thread_fn, NULL)) { + perror("pthread_create"); + return; + } + pthread_setname_np(migthread, "migration"); +} + static bool receive(Device *, MessageHostOp &msg) { bool res = true; @@ -316,6 +370,122 @@ static bool receive(Device *, MessageHostOp &msg) msg.mac = mac_prefix << 16 | mac_host; break; } + case MessageHostOp::OP_NEXT_DIRTY_PAGE: { + /* + * What this does when it is properly implemented: + * - There is a variable "pageptr" which points + * to a page number. + * - The user emits this message host op when + * he wants a dirty page region + * - pageptr is moved incrementally until + * a dirty page region is found. + * This page region is then remapped RO + * and returned to the user as a CRD description + * - pageptr wraps around if it exceeds guest mem size. + */ +#if PORTED_TO_UNIX + const unsigned physpages = _physsize >> 12; + static unsigned long pageptr = 0; + + _track_page_usage = true; + + Crd reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL)); + // There will be several mappings, but we want to see the ones + // which are set to "writable by the guest" + + unsigned increment = 0; + do { + if (increment >= physpages) { + // That's it for now. Come back later. + msg.value = 0; + return true; + } + MessageMemRegion mmsg(pageptr); + if (!_mb->bus_memregion.send(mmsg, true)) { + // No one claims this region. Do not track. + pageptr = (pageptr + 1) % physpages; + ++increment; + continue; + } + if (!mmsg.actual_physmem) { + // This is no physmem. 
+ pageptr += mmsg.count; + increment += mmsg.count; + if (pageptr > physpages) pageptr = 0; + continue; + } + reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL)); + if (!(reg.attr() & DESC_RIGHT_W)) { + // Not write-mapped, hence not dirty. + pageptr += 1 << reg.order(); + increment += 1 << reg.order(); + if (pageptr > physpages) pageptr = 0; + continue; + } + + break; + } while (1); + + // reg now describes a region which is guest-writable + // This means that the guest wrote to it before and it is now considered "dirty" + + // Tell the user "where" and "how many" + msg.phys = pageptr << 12; + msg.phys_len = reg.order(); + msg.value = reg.value(); + + // Make this page read-only for the guest, so it is considered "clean" now. + nova_revoke(Crd((reg.base() + _physmem) >> 12, reg.order(), + DESC_RIGHT_W | DESC_TYPE_MEM), false); + pageptr += 1 << reg.order(); + if (pageptr >= physpages) pageptr = 0; + +#endif + return true; + } + break; + case MessageHostOp::OP_GET_CONFIG_STRING: { + char *cmdline = NULL; + +#if PORTED_TO_UNIX + // Retrieve the command line string length from sigma0 + MessageConsole cmsg(MessageConsole::TYPE_START, cmdline); + cmsg.read = true; + cmsg.mem = 0; + unsigned ret = Sigma0Base::console(cmsg); + if (ret) { + Logging::printf("Error retrieving the command line" + " string length from sigma0.\n"); + return false; + } + + // Retrieve the command line itself + cmdline = new char[cmsg.mem+1]; + cmsg.mem += 1; + cmsg.cmdline = cmdline; + ret = Sigma0Base::console(cmsg); + if (ret) { + Logging::printf("Error retrieving the command line string sigma0.\n"); + return false; + } +#endif + + msg.obj = cmdline; + } + break; + + case MessageHostOp::OP_MIGRATION_RETRIEVE_INIT: { + _migration_port = msg.value; + _restore_mode = Migration::MODE_RECEIVE; + _migrator = new Migration(&mb); + } + break; + case MessageHostOp::OP_MIGRATION_START: { + start_migration_to(msg.value, 9000); + return true; + } + break; + default: Logging::panic("%s - unimplemented 
operation %#x\n", __PRETTY_FUNCTION__, msg.type); @@ -589,6 +759,8 @@ int main(int argc, char **argv) mb.bus_network.add(nullptr, receive); mb.bus_disk .add(nullptr, receive); + mb.bus_restore.add(&timeouts, TimeoutList<32, void>::receive_static); + // Synchronization initialization if (0 != pthread_mutex_init(&irq_mtx, nullptr)) { perror("pthread_mutex_init"); @@ -629,6 +801,15 @@ int main(int argc, char **argv) MessageLegacy msg2(MessageLegacy::RESET, 0); mb.bus_legacy.send_fifo(msg2); + if (_restore_mode != Migration::MODE_OFF) { + /* + * The following UNLOCK message helps the VCPU out of the lock + * it is blocked by and catches it into the recall handler. + */ + MessageLegacy msg3(MessageLegacy::UNLOCK, 0); + mb.bus_legacy.send_fifo(msg3); + } + pthread_t iothread; if (tap_fd) { Logging::printf("Starting background threads.\n"); diff --git a/unix/ncurses.cc b/unix/ncurses.cc index 24e33105..9cbe37c9 100644 --- a/unix/ncurses.cc +++ b/unix/ncurses.cc @@ -4,6 +4,8 @@ * Copyright (C) 2013, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Seoul. * * Seoul is free software: you can redistribute it and/or modify it @@ -147,6 +149,14 @@ class NcursesDisplay : public StaticReceiver { if (current_view < views.size() - 1) current_view ++; break; + + case KEY_BACKSPACE: { + /* Migration example start event. As soon as the user hits this event, + * the VM will be migrated to the hard coded destination host. */ + MessageHostOp msg(MessageHostOp::OP_MIGRATION_START, + /* destination ip, address: 192.168.0.1 */ 0xC0A80001ul); + mb.bus_hostop.send(msg); + } case ERR: default: break;