Discussion:
[RFC PATCH v3 01/19][SeaBIOS] Add ACPI_EXTRACT_DEVICE* macros
Vasilis Liaskovitis
2012-09-21 11:17:17 UTC
Permalink
This allows extracting the beginning, end, and name of a Device object.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
tools/acpi_extract.py | 28 ++++++++++++++++++++++++++++
1 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/tools/acpi_extract.py b/tools/acpi_extract.py
index 167a322..cb2540e 100755
--- a/tools/acpi_extract.py
+++ b/tools/acpi_extract.py
@@ -195,6 +195,28 @@ def aml_package_start(offset):
offset += 1
return offset + aml_pkglen_bytes(offset) + 1

+def aml_device_start(offset):
+ #0x5B 0x82 DeviceOp PkgLength NameString ProcID
+ if ((aml[offset] != 0x5B) or (aml[offset + 1] != 0x82)):
+ die( "Name offset 0x%x: expected 0x5B 0x83 actual 0x%x 0x%x" %
+ (offset, aml[offset], aml[offset + 1]));
+ return offset
+
+def aml_device_string(offset):
+ #0x5B 0x82 DeviceOp PkgLength NameString ProcID
+ start = aml_device_start(offset)
+ offset += 2
+ pkglenbytes = aml_pkglen_bytes(offset)
+ offset += pkglenbytes
+ return offset
+
+def aml_device_end(offset):
+ start = aml_device_start(offset)
+ offset += 2
+ pkglenbytes = aml_pkglen_bytes(offset)
+ pkglen = aml_pkglen(offset)
+ return offset + pkglen
+
lineno = 0
for line in fileinput.input():
# Strip trailing newline
@@ -279,6 +301,12 @@ for i in range(len(asl)):
offset = aml_processor_end(offset)
elif (directive == "ACPI_EXTRACT_PKG_START"):
offset = aml_package_start(offset)
+ elif (directive == "ACPI_EXTRACT_DEVICE_START"):
+ offset = aml_device_start(offset)
+ elif (directive == "ACPI_EXTRACT_DEVICE_STRING"):
+ offset = aml_device_string(offset)
+ elif (directive == "ACPI_EXTRACT_DEVICE_END"):
+ offset = aml_device_end(offset)
else:
die("Unsupported directive %s" % directive)
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:20 UTC
Permalink
The memory device generation is guided by qemu paravirt info. SeaBIOS
first uses the info to set up SRAT entries for the hotplug-able memory slots.
Afterwards, build_memssdt uses the created SRAT entries to generate
appropriate memory device objects. One memory device (and corresponding SRAT
entry) is generated for each hotplug-able qemu memslot. Currently no SSDT
memory device is created for initial system memory.

We only support up to 255 DIMMs for now: the PackageOp used for the MEON array can
only describe an array of at most 255 elements. VarPackageOp would be needed to
support more than 255 devices.

v1->v2:
Seabios reads mems_sts from qemu to build e820_map
SSDT size and some offsets are calculated with extraction macros.

v2->v3:
Minor name changes

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/acpi.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/src/acpi.c b/src/acpi.c
index 6d239fa..1223b52 100644
--- a/src/acpi.c
+++ b/src/acpi.c
@@ -13,6 +13,7 @@
#include "pci_regs.h" // PCI_INTERRUPT_LINE
#include "ioport.h" // inl
#include "paravirt.h" // qemu_cfg_irq0_override
+#include "memmap.h"

/****************************************************/
/* ACPI tables init */
@@ -416,11 +417,26 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes)
#define PCIHP_AML (ssdp_pcihp_aml + *ssdt_pcihp_start)
#define PCI_SLOTS 32

+/* 0x5B 0x82 DeviceOp PkgLength NameString DimmID */
+#define MEM_BASE 0xaf80
+#define MEM_AML (ssdm_mem_aml + *ssdt_mem_start)
+#define MEM_SIZEOF (*ssdt_mem_end - *ssdt_mem_start)
+#define MEM_OFFSET_HEX (*ssdt_mem_name - *ssdt_mem_start + 2)
+#define MEM_OFFSET_ID (*ssdt_mem_id - *ssdt_mem_start)
+#define MEM_OFFSET_PXM 31
+#define MEM_OFFSET_START 55
+#define MEM_OFFSET_END 63
+#define MEM_OFFSET_SIZE 79
+
+u64 nb_hp_memslots = 0;
+struct srat_memory_affinity *mem;
+
#define SSDT_SIGNATURE 0x54445353 // SSDT
#define SSDT_HEADER_LENGTH 36

#include "ssdt-susp.hex"
#include "ssdt-pcihp.hex"
+#include "ssdt-mem.hex"

#define PCI_RMV_BASE 0xae0c

@@ -472,6 +488,111 @@ static void patch_pcihp(int slot, u8 *ssdt_ptr, u32 eject)
}
}

+static void build_memdev(u8 *ssdt_ptr, int i, u64 mem_base, u64 mem_len, u8 node)
+{
+ memcpy(ssdt_ptr, MEM_AML, MEM_SIZEOF);
+ ssdt_ptr[MEM_OFFSET_HEX] = getHex(i >> 4);
+ ssdt_ptr[MEM_OFFSET_HEX+1] = getHex(i);
+ ssdt_ptr[MEM_OFFSET_ID] = i;
+ ssdt_ptr[MEM_OFFSET_PXM] = node;
+ *(u64*)(ssdt_ptr + MEM_OFFSET_START) = mem_base;
+ *(u64*)(ssdt_ptr + MEM_OFFSET_END) = mem_base + mem_len;
+ *(u64*)(ssdt_ptr + MEM_OFFSET_SIZE) = mem_len;
+}
+
+static void*
+build_memssdt(void)
+{
+ u64 mem_base;
+ u64 mem_len;
+ u8 node;
+ int i;
+ struct srat_memory_affinity *entry = mem;
+ u64 nb_memdevs = nb_hp_memslots;
+ u8 memslot_status, enabled;
+
+ int length = ((1+3+4)
+ + (nb_memdevs * MEM_SIZEOF)
+ + (1+2+5+(12*nb_memdevs))
+ + (6+2+1+(1*nb_memdevs)));
+ u8 *ssdt = malloc_high(sizeof(struct acpi_table_header) + length);
+ if (! ssdt) {
+ warn_noalloc();
+ return NULL;
+ }
+ u8 *ssdt_ptr = ssdt + sizeof(struct acpi_table_header);
+
+ // build Scope(_SB_) header
+ *(ssdt_ptr++) = 0x10; // ScopeOp
+ ssdt_ptr = encodeLen(ssdt_ptr, length-1, 3);
+ *(ssdt_ptr++) = '_';
+ *(ssdt_ptr++) = 'S';
+ *(ssdt_ptr++) = 'B';
+ *(ssdt_ptr++) = '_';
+
+ for (i = 0; i < nb_memdevs; i++) {
+ mem_base = (((u64)(entry->base_addr_high) << 32 )| entry->base_addr_low);
+ mem_len = (((u64)(entry->length_high) << 32 )| entry->length_low);
+ node = entry->proximity[0];
+ build_memdev(ssdt_ptr, i, mem_base, mem_len, node);
+ ssdt_ptr += MEM_SIZEOF;
+ entry++;
+ }
+
+ // build "Method(MTFY, 2) {If (LEqual(Arg0, 0x00)) {Notify(CM00, Arg1)} ...}"
+ *(ssdt_ptr++) = 0x14; // MethodOp
+ ssdt_ptr = encodeLen(ssdt_ptr, 2+5+(12*nb_memdevs), 2);
+ *(ssdt_ptr++) = 'M';
+ *(ssdt_ptr++) = 'T';
+ *(ssdt_ptr++) = 'F';
+ *(ssdt_ptr++) = 'Y';
+ *(ssdt_ptr++) = 0x02;
+ for (i=0; i<nb_memdevs; i++) {
+ *(ssdt_ptr++) = 0xA0; // IfOp
+ ssdt_ptr = encodeLen(ssdt_ptr, 11, 1);
+ *(ssdt_ptr++) = 0x93; // LEqualOp
+ *(ssdt_ptr++) = 0x68; // Arg0Op
+ *(ssdt_ptr++) = 0x0A; // BytePrefix
+ *(ssdt_ptr++) = i;
+ *(ssdt_ptr++) = 0x86; // NotifyOp
+ *(ssdt_ptr++) = 'M';
+ *(ssdt_ptr++) = 'P';
+ *(ssdt_ptr++) = getHex(i >> 4);
+ *(ssdt_ptr++) = getHex(i);
+ *(ssdt_ptr++) = 0x69; // Arg1Op
+ }
+
+ // build "Name(MEON, Package() { One, One, ..., Zero, Zero, ... })"
+ *(ssdt_ptr++) = 0x08; // NameOp
+ *(ssdt_ptr++) = 'M';
+ *(ssdt_ptr++) = 'E';
+ *(ssdt_ptr++) = 'O';
+ *(ssdt_ptr++) = 'N';
+ *(ssdt_ptr++) = 0x12; // PackageOp
+ ssdt_ptr = encodeLen(ssdt_ptr, 2+1+(1*nb_memdevs), 2);
+ *(ssdt_ptr++) = nb_memdevs;
+
+ entry = mem;
+ memslot_status = 0;
+
+ for (i = 0; i < nb_memdevs; i++) {
+ enabled = 0;
+ if (i % 8 == 0)
+ memslot_status = inb(MEM_BASE + i/8);
+ enabled = memslot_status & 1;
+ mem_base = (((u64)(entry->base_addr_high) << 32 )| entry->base_addr_low);
+ mem_len = (((u64)(entry->length_high) << 32 )| entry->length_low);
+ *(ssdt_ptr++) = enabled ? 0x01 : 0x00;
+ if (enabled)
+ add_e820(mem_base, mem_len, E820_RAM);
+ memslot_status = memslot_status >> 1;
+ entry++;
+ }
+ build_header((void*)ssdt, SSDT_SIGNATURE, ssdt_ptr - ssdt, 1);
+
+ return ssdt;
+}
+
static void*
build_ssdt(void)
{
@@ -644,9 +765,6 @@ build_srat(void)
{
int nb_numa_nodes = qemu_cfg_get_numa_nodes();

- if (nb_numa_nodes == 0)
- return NULL;
-
u64 *numadata = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numa_nodes));
if (!numadata) {
warn_noalloc();
@@ -655,10 +773,11 @@ build_srat(void)

qemu_cfg_get_numa_data(numadata, MaxCountCPUs + nb_numa_nodes);

+ qemu_cfg_get_numa_data(&nb_hp_memslots, 1);
struct system_resource_affinity_table *srat;
int srat_size = sizeof(*srat) +
sizeof(struct srat_processor_affinity) * MaxCountCPUs +
- sizeof(struct srat_memory_affinity) * (nb_numa_nodes + 2);
+ sizeof(struct srat_memory_affinity) * (nb_numa_nodes + nb_hp_memslots + 2);

srat = malloc_high(srat_size);
if (!srat) {
@@ -693,7 +812,7 @@ build_srat(void)
* from 640k-1M and possibly another one from 3.5G-4G.
*/
struct srat_memory_affinity *numamem = (void*)core;
- int slots = 0;
+ int slots = 0, node;
u64 mem_len, mem_base, next_base = 0;

acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
@@ -720,10 +839,36 @@ build_srat(void)
next_base += (1ULL << 32) - RamSize;
}
acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+
numamem++;
slots++;
+
}
- for (; slots < nb_numa_nodes + 2; slots++) {
+ mem = (void*)numamem;
+
+ if (nb_hp_memslots) {
+ u64 *hpmemdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots));
+ if (!hpmemdata) {
+ warn_noalloc();
+ free(hpmemdata);
+ free(numadata);
+ return NULL;
+ }
+
+ qemu_cfg_get_numa_data(hpmemdata, 3 * nb_hp_memslots);
+
+ for (i = 1; i < nb_hp_memslots + 1; ++i) {
+ mem_base = *hpmemdata++;
+ mem_len = *hpmemdata++;
+ node = *hpmemdata++;
+ acpi_build_srat_memory(numamem, mem_base, mem_len, node, 1);
+ numamem++;
+ slots++;
+ }
+ free(hpmemdata);
+ }
+
+ for (; slots < nb_numa_nodes + nb_hp_memslots + 2; slots++) {
acpi_build_srat_memory(numamem, 0, 0, 0, 0);
numamem++;
}
@@ -774,6 +919,7 @@ acpi_bios_init(void)
ACPI_INIT_TABLE(build_madt());
ACPI_INIT_TABLE(build_hpet());
ACPI_INIT_TABLE(build_srat());
+ ACPI_INIT_TABLE(build_memssdt());

u16 i, external_tables = qemu_cfg_acpi_additional_tables();
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:19 UTC
Permalink
Extend the DSDT to include methods for handling memory hot-add and hot-remove
notifications and memory device status requests. These functions are called
from the memory device SSDT methods.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/acpi-dsdt.dsl | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl
index 2060686..5d3e92b 100644
--- a/src/acpi-dsdt.dsl
+++ b/src/acpi-dsdt.dsl
@@ -737,6 +737,71 @@ DefinitionBlock (
}
Return(One)
}
+ /* Objects filled in by run-time generated SSDT */
+ External(MTFY, MethodObj)
+ External(MEON, PkgObj)
+
+ Method (CMST, 1, NotSerialized) {
+ // _STA method - return ON status of memdevice
+ // Local0 = MEON flag for this cpu
+ Store(DerefOf(Index(MEON, Arg0)), Local0)
+ If (Local0) { Return(0xF) } Else { Return(0x0) }
+ }
+
+ /* Memory hotplug notify array */
+ OperationRegion(MEST, SystemIO, 0xaf80, 32)
+ Field (MEST, ByteAcc, NoLock, Preserve)
+ {
+ MES, 256
+ }
+
+ /* Memory eject byte */
+ OperationRegion(MEMJ, SystemIO, 0xafa0, 1)
+ Field (MEMJ, ByteAcc, NoLock, Preserve)
+ {
+ MPE, 8
+ }
+
+ Method(MESC, 0) {
+ // Local5 = active memdevice bitmap
+ Store (MES, Local5)
+ // Local2 = last read byte from bitmap
+ Store (Zero, Local2)
+ // Local0 = memory device iterator
+ Store (Zero, Local0)
+ While (LLess(Local0, SizeOf(MEON))) {
+ // Local1 = MEON flag for this memory device
+ Store(DerefOf(Index(MEON, Local0)), Local1)
+ If (And(Local0, 0x07)) {
+ // Shift down previously read bitmap byte
+ ShiftRight(Local2, 1, Local2)
+ } Else {
+ // Read next byte from memdevice bitmap
+ Store(DerefOf(Index(Local5, ShiftRight(Local0, 3))), Local2)
+ }
+ // Local3 = active state for this memory device
+ Store(And(Local2, 1), Local3)
+
+ If (LNotEqual(Local1, Local3)) {
+ // State change - update MEON with new state
+ Store(Local3, Index(MEON, Local0))
+ // Do MEM notify
+ If (LEqual(Local3, 1)) {
+ MTFY(Local0, 1)
+ } Else {
+ MTFY(Local0, 3)
+ }
+ }
+ Increment(Local0)
+ }
+ Return(One)
+ }
+
+ Method (MPEJ, 2, NotSerialized) {
+ // _EJ0 method - eject callback
+ Store(Arg0, MPE)
+ Sleep(200)
+ }
}


@@ -759,8 +824,9 @@ DefinitionBlock (
// CPU hotplug event
Return(\_SB.PRSC())
}
- Method(_L03) {
- Return(0x01)
+ Method(_E03) {
+ // Memory hotplug event
+ Return(\_SB.MESC())
}
Method(_L04) {
Return(0x01)
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:18 UTC
Permalink
Define SSDT hotplug-able memory devices in _SB namespace. The dynamically
generated SSDT includes per memory device hotplug methods. These methods
just call methods defined in the DSDT. Also dynamically generate an MTFY
method and a MEON array of the online/available memory devices. ACPI
extraction macros are used to place the AML code in variables later used by
src/acpi.c. The design is taken from SSDT cpu generation.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
Makefile | 2 +-
src/ssdt-mem.dsl | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 66 insertions(+), 1 deletions(-)
create mode 100644 src/ssdt-mem.dsl

diff --git a/Makefile b/Makefile
index 5486f88..e82cfc9 100644
--- a/Makefile
+++ b/Makefile
@@ -233,7 +233,7 @@ $(OUT)%.hex: src/%.dsl ./tools/acpi_extract_preprocess.py ./tools/acpi_extract.p
$(Q)$(PYTHON) ./tools/acpi_extract.py $(OUT)$*.lst > $(OUT)$*.off
$(Q)cat $(OUT)$*.off > $@

-$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex
+$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex $(OUT)ssdt-mem.hex

################ Kconfig rules

diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl
new file mode 100644
index 0000000..ee322f0
--- /dev/null
+++ b/src/ssdt-mem.dsl
@@ -0,0 +1,65 @@
+/* This file is the basis for the ssdm_mem_aml[] variable in src/acpi.c.
+ * It is similar in design to the ssdt_proc variable.
+ * It defines the contents of the per-memory-device Device() object. At
+ * runtime, a dynamically generated SSDT will contain one copy of this
+ * AML snippet for every possible memory device in the system. The
+ * objects will be placed in the \_SB_ namespace.
+ *
+ * In addition to the aml code generated from this file, the
+ * src/acpi.c file creates a MTFY method with an entry for each memdevice:
+ * Method(MTFY, 2) {
+ * If (LEqual(Arg0, 0x00)) { Notify(MP00, Arg1) }
+ * If (LEqual(Arg0, 0x01)) { Notify(MP01, Arg1) }
+ * ...
+ * }
+ * and a MEON array with the list of active and inactive memory devices:
+ * Name(MEON, Package() { One, One, ..., Zero, Zero, ... })
+ */
+ACPI_EXTRACT_ALL_CODE ssdm_mem_aml
+
+DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1)
+/* v------------------ DO NOT EDIT ------------------v */
+{
+ ACPI_EXTRACT_DEVICE_START ssdt_mem_start
+ ACPI_EXTRACT_DEVICE_END ssdt_mem_end
+ ACPI_EXTRACT_DEVICE_STRING ssdt_mem_name
+ Device(MPAA) {
+ ACPI_EXTRACT_NAME_BYTE_CONST ssdt_mem_id
+ Name(ID, 0xAA)
+/* ^------------------ DO NOT EDIT ------------------^
+ *
+ * The src/acpi.c code requires the above layout so that it can update
+ * MPAA and 0xAA with the appropriate MEMDEVICE id (see
+ * MEM_OFFSET_HEX/MEM_OFFSET_ID). Don't change the above without
+ * also updating the C code.
+ */
+ Name(_HID, EISAID("PNP0C80"))
+ Name(_PXM, 0xAA)
+
+ External(CMST, MethodObj)
+ External(MPEJ, MethodObj)
+
+ Name(_CRS, ResourceTemplate() {
+ QwordMemory(
+ ResourceConsumer,
+ ,
+ MinFixed,
+ MaxFixed,
+ Cacheable,
+ ReadWrite,
+ 0x0,
+ 0xDEADBEEF,
+ 0xE6ADBEEE,
+ 0x00000000,
+ 0x08000000,
+ )
+ })
+ Method (_STA, 0) {
+ Return(CMST(ID))
+ }
+ Method (_EJ0, 1, NotSerialized) {
+ MPEJ(ID, Arg0)
+ }
+ }
+}
+
--
1.7.9
Vasilis Liaskovitis
2012-09-21 11:17:22 UTC
Permalink
Example:
"-dimm id=dimm0,size=512M,node=0,populated=off"
will define a 512M memory slot belonging to numa node 0.

When "populated=on", a DimmDevice is created and hot-plugged at system startup.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hw/Makefile.objs | 2 +-
qemu-config.c | 25 +++++++++++++++++++++++++
qemu-options.hx | 5 +++++
sysemu.h | 1 +
vl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 82 insertions(+), 1 deletions(-)

diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 6dfebd2..8c5c39a 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
hw-obj-$(CONFIG_PCSPK) += pcspk.o
hw-obj-$(CONFIG_PCKBD) += pckbd.o
hw-obj-$(CONFIG_FDC) += fdc.o
-hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o
+hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o
hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o
hw-obj-$(CONFIG_DMA) += dma.o
hw-obj-$(CONFIG_I82374) += i82374.o
diff --git a/qemu-config.c b/qemu-config.c
index eba977e..4022d64 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = {
},
};

+static QemuOptsList qemu_dimm_opts = {
+ .name = "dimm",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head),
+ .desc = {
+ {
+ .name = "id",
+ .type = QEMU_OPT_STRING,
+ .help = "id of this dimm device",
+ },{
+ .name = "size",
+ .type = QEMU_OPT_SIZE,
+ .help = "memory size for this dimm",
+ },{
+ .name = "populated",
+ .type = QEMU_OPT_BOOL,
+ .help = "populated for this dimm",
+ },{
+ .name = "node",
+ .type = QEMU_OPT_NUMBER,
+ .help = "NUMA node number (i.e. proximity) for this dimm",
+ },
+ { /* end of list */ }
+ },
+};
static QemuOptsList *vm_config_groups[32] = {
&qemu_drive_opts,
&qemu_chardev_opts,
@@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = {
&qemu_boot_opts,
&qemu_iscsi_opts,
&qemu_sandbox_opts,
+ &qemu_dimm_opts,
NULL,
};

diff --git a/qemu-options.hx b/qemu-options.hx
index 804a2d1..3687722 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. Insert new options before this line!
STEXI
@end table
ETEXI
+
+DEF("dimm", HAS_ARG, QEMU_OPTION_dimm,
+ "-dimm id=dimmid,size=sz,node=nd,populated=on|off\n"
+ "specify memory dimm device with name dimmid, size sz on node nd",
+ QEMU_ARCH_ALL)
diff --git a/sysemu.h b/sysemu.h
index 65552ac..7baf9c9 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock;
extern int nb_numa_nodes;
extern uint64_t node_mem[MAX_NODES];
extern unsigned long *node_cpumask[MAX_NODES];
+extern int nb_hp_dimms;

#define MAX_OPTION_ROMS 16
typedef struct QEMUOptionRom {
diff --git a/vl.c b/vl.c
index 7c577fa..af1745c 100644
--- a/vl.c
+++ b/vl.c
@@ -126,6 +126,7 @@ int main(int argc, char **argv)
#include "hw/xen.h"
#include "hw/qdev.h"
#include "hw/loader.h"
+#include "hw/dimm.h"
#include "bt-host.h"
#include "net.h"
#include "net/slirp.h"
@@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order
int nb_numa_nodes;
uint64_t node_mem[MAX_NODES];
unsigned long *node_cpumask[MAX_NODES];
+int nb_hp_dimms;

uint8_t qemu_uuid[16];

@@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy)
}
}

+static void configure_dimm(QemuOpts *opts)
+{
+ const char *id;
+ uint64_t size, node;
+ bool populated;
+ QemuOpts *devopts;
+ char buf[256];
+ if (nb_hp_dimms == MAX_DIMMS) {
+ fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n",
+ MAX_DIMMS);
+ exit(1);
+ }
+ id = qemu_opts_id(opts);
+ size = qemu_opt_get_size(opts, "size", DEFAULT_DIMMSIZE);
+ populated = qemu_opt_get_bool(opts, "populated", 0);
+ node = qemu_opt_get_number(opts, "node", 0);
+
+ dimm_config_create((char*)id, size, node, nb_hp_dimms, 0);
+
+ if (populated) {
+ devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL);
+ qemu_opt_set(devopts, "driver", "dimm");
+ snprintf(buf, sizeof(buf), "%lu", size);
+ qemu_opt_set(devopts, "size", buf);
+ snprintf(buf, sizeof(buf), "%lu", node);
+ qemu_opt_set(devopts, "node", buf);
+ qemu_opt_set(devopts, "bus", "membus");
+ }
+ nb_hp_dimms++;
+}
+
static void configure_rtc(QemuOpts *opts)
{
const char *value;
@@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp)
DisplayChangeListener *dcl;
int cyls, heads, secs, translation;
QemuOpts *hda_opts = NULL, *opts, *machine_opts;
+ QemuOpts *dimm_opts[MAX_DIMMS];
+ int nb_dimm_opts = 0;
QemuOptsList *olist;
int optind;
const char *optarg;
@@ -3288,6 +3323,18 @@ int main(int argc, char **argv, char **envp)
exit(0);
}
break;
+ case QEMU_OPTION_dimm:
+ if (nb_dimm_opts == MAX_DIMMS) {
+ fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n",
+ MAX_DIMMS);
+ }
+ dimm_opts[nb_dimm_opts] =
+ qemu_opts_parse(qemu_find_opts("dimm"), optarg, 0);
+ if (!dimm_opts[nb_dimm_opts]) {
+ exit(1);
+ }
+ nb_dimm_opts++;
+ break;
default:
os_parse_cmd_args(popt->index, optarg);
}
@@ -3611,6 +3658,9 @@ int main(int argc, char **argv, char **envp)
}
qemu_add_globals();

+ for (i = 0; i < nb_dimm_opts; i++)
+ configure_dimm(dimm_opts[i]);
+
qdev_machine_init();

machine->init(ram_size, boot_devices,
--
1.7.9
Blue Swirl
2012-09-22 13:46:57 UTC
Permalink
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
"-dimm id=dimm0,size=512M,node=0,populated=off"
There should not be a need to introduce a new top level option,
instead you should just use -device, like
-device dimm,base=0,id=dimm0,size=512M,node=0,populated=off

That would also specify the start address.
Post by Vasilis Liaskovitis
will define a 512M memory slot belonging to numa node 0.
When "populated=on", a DimmDevice is created and hot-plugged at system startup.
---
hw/Makefile.objs | 2 +-
qemu-config.c | 25 +++++++++++++++++++++++++
qemu-options.hx | 5 +++++
sysemu.h | 1 +
vl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 82 insertions(+), 1 deletions(-)
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 6dfebd2..8c5c39a 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
hw-obj-$(CONFIG_PCSPK) += pcspk.o
hw-obj-$(CONFIG_PCKBD) += pckbd.o
hw-obj-$(CONFIG_FDC) += fdc.o
-hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o
+hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o
hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o
hw-obj-$(CONFIG_DMA) += dma.o
hw-obj-$(CONFIG_I82374) += i82374.o
diff --git a/qemu-config.c b/qemu-config.c
index eba977e..4022d64 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = {
},
};
+static QemuOptsList qemu_dimm_opts = {
+ .name = "dimm",
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head),
+ .desc = {
+ {
+ .name = "id",
+ .type = QEMU_OPT_STRING,
+ .help = "id of this dimm device",
+ },{
+ .name = "size",
+ .type = QEMU_OPT_SIZE,
+ .help = "memory size for this dimm",
+ },{
+ .name = "populated",
+ .type = QEMU_OPT_BOOL,
+ .help = "populated for this dimm",
+ },{
+ .name = "node",
+ .type = QEMU_OPT_NUMBER,
+ .help = "NUMA node number (i.e. proximity) for this dimm",
+ },
+ { /* end of list */ }
+ },
+};
static QemuOptsList *vm_config_groups[32] = {
&qemu_drive_opts,
&qemu_chardev_opts,
@@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = {
&qemu_boot_opts,
&qemu_iscsi_opts,
&qemu_sandbox_opts,
+ &qemu_dimm_opts,
NULL,
};
diff --git a/qemu-options.hx b/qemu-options.hx
index 804a2d1..3687722 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. Insert new options before this line!
STEXI
@end table
ETEXI
+
+DEF("dimm", HAS_ARG, QEMU_OPTION_dimm,
+ "-dimm id=dimmid,size=sz,node=nd,populated=on|off\n"
+ "specify memory dimm device with name dimmid, size sz on node nd",
+ QEMU_ARCH_ALL)
diff --git a/sysemu.h b/sysemu.h
index 65552ac..7baf9c9 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock;
extern int nb_numa_nodes;
extern uint64_t node_mem[MAX_NODES];
extern unsigned long *node_cpumask[MAX_NODES];
+extern int nb_hp_dimms;
#define MAX_OPTION_ROMS 16
typedef struct QEMUOptionRom {
diff --git a/vl.c b/vl.c
index 7c577fa..af1745c 100644
--- a/vl.c
+++ b/vl.c
@@ -126,6 +126,7 @@ int main(int argc, char **argv)
#include "hw/xen.h"
#include "hw/qdev.h"
#include "hw/loader.h"
+#include "hw/dimm.h"
#include "bt-host.h"
#include "net.h"
#include "net/slirp.h"
@@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order
int nb_numa_nodes;
uint64_t node_mem[MAX_NODES];
unsigned long *node_cpumask[MAX_NODES];
+int nb_hp_dimms;
This counter (if needed) should be private to dimm.c.
Post by Vasilis Liaskovitis
uint8_t qemu_uuid[16];
@@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy)
}
}
+static void configure_dimm(QemuOpts *opts)
+{
+ const char *id;
+ uint64_t size, node;
+ bool populated;
+ QemuOpts *devopts;
+ char buf[256];
+ if (nb_hp_dimms == MAX_DIMMS) {
Why should there be any limit of DIMMS? Please use lists etc. to avoid
restrictions.
Post by Vasilis Liaskovitis
+ fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n",
+ MAX_DIMMS);
+ exit(1);
+ }
+ id = qemu_opts_id(opts);
+ size = qemu_opt_get_size(opts, "size", DEFAULT_DIMMSIZE);
+ populated = qemu_opt_get_bool(opts, "populated", 0);
+ node = qemu_opt_get_number(opts, "node", 0);
+
+ dimm_config_create((char*)id, size, node, nb_hp_dimms, 0);
+
+ if (populated) {
+ devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL);
+ qemu_opt_set(devopts, "driver", "dimm");
+ snprintf(buf, sizeof(buf), "%lu", size);
+ qemu_opt_set(devopts, "size", buf);
+ snprintf(buf, sizeof(buf), "%lu", node);
+ qemu_opt_set(devopts, "node", buf);
+ qemu_opt_set(devopts, "bus", "membus");
+ }
+ nb_hp_dimms++;
+}
+
static void configure_rtc(QemuOpts *opts)
{
const char *value;
@@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp)
DisplayChangeListener *dcl;
int cyls, heads, secs, translation;
QemuOpts *hda_opts = NULL, *opts, *machine_opts;
+ QemuOpts *dimm_opts[MAX_DIMMS];
+ int nb_dimm_opts = 0;
QemuOptsList *olist;
int optind;
const char *optarg;
@@ -3288,6 +3323,18 @@ int main(int argc, char **argv, char **envp)
exit(0);
}
break;
+ if (nb_dimm_opts == MAX_DIMMS) {
+ fprintf(stderr, "qemu: maximum number of DIMMs (%d) exceeded\n",
+ MAX_DIMMS);
+ }
+ dimm_opts[nb_dimm_opts] =
+ qemu_opts_parse(qemu_find_opts("dimm"), optarg, 0);
+ if (!dimm_opts[nb_dimm_opts]) {
+ exit(1);
+ }
+ nb_dimm_opts++;
+ break;
os_parse_cmd_args(popt->index, optarg);
}
@@ -3611,6 +3658,9 @@ int main(int argc, char **argv, char **envp)
}
qemu_add_globals();
+ for (i = 0; i < nb_dimm_opts; i++)
Missing braces, please read CODING_STYLE.
Post by Vasilis Liaskovitis
+ configure_dimm(dimm_opts[i]);
+
qdev_machine_init();
machine->init(ram_size, boot_devices,
--
1.7.9
Vasilis Liaskovitis
2012-09-24 10:42:43 UTC
Permalink
Post by Blue Swirl
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
"-dimm id=dimm0,size=512M,node=0,populated=off"
There should not be a need to introduce a new top level option,
instead you should just use -device, like
-device dimm,base=0,id=dimm0,size=512M,node=0,populated=off
That would also specify the start address.
What is "base"? the start address? I think the start address should be calculated by the
chipset / board, not by the user.

The "-dimm" option is supposed to specify the dimm/memory layout, and not create
any devices.

If we don't want this new option, I have a question:

A "-device/device_add" means we create a new qdev device at startup or as a
hotplug operation respectively. So, the semantics of
"-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me.

What does "-device dimm,populated=off" mean from a qdev perspective? There are 2
alternatives:

- The device is created on the dimmbus, but is not used/populated yet. Then the
activation/acpi-hotplug of the dimm may require a separate command (we used to have
"dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev
device, so this wouldn't fit this usecase, because the device already exists. In
this case, the actual "acpi hotplug" operation is decoupled from qdev device
creation.

- The dimmdevice is not created when "-device dimm,populated=off" (this would
require some ugly checking in normal -device argument handling). Only the dimm
layout is saved. The hotplug is triggered from a normal device_add later. So in
this case, the "acpi hotplug" happens at the same time as the qdev hotplug.

Do you see a simpler alternative without introducing a new option?

Using the "-dimm" option follows the second semantic and avoids changing the "-device"
semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev
hotplug coincides with acpi hotplug.

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-09-29 11:13:04 UTC
Permalink
On Mon, Sep 24, 2012 at 10:42 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
Post by Blue Swirl
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
"-dimm id=dimm0,size=512M,node=0,populated=off"
There should not be a need to introduce a new top level option,
instead you should just use -device, like
-device dimm,base=0,id=dimm0,size=512M,node=0,populated=off
That would also specify the start address.
What is "base"? the start address? I think the start address should be calculated by the
chipset / board, not by the user.
Yes.
Post by Vasilis Liaskovitis
The "-dimm" option is supposed to specify the dimm/memory layout, and not create
any devices.
A "-device/device_add" means we create a new qdev device at startup or as a
hotplug operation respectively. So, the semantics of
"-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me.
What does "-device dimm,populated=off" mean from a qdev perspective? There are 2
- The device is created on the dimmbus, but is not used/populated yet. Than the
activation/acpi-hotplug of the dimm may require a separate command (we used to have
"dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev
device, so this wouldn't fit this usecase, because the device already exists. In
this case, the actual "acpi hotplug" operation is decoupled from qdev device
creation.
The bus exists but the devices do not, device_add would add DIMMs to
the bus. This matches PCI bus created by the host bridge and PCI
device hotplug.

A more complex setup would be dimm bus, dimm slot devices and DIMM
devices. The intermediate slot device would contain one DIMM device if
plugged.
Post by Vasilis Liaskovitis
- The dimmdevice is not created when "-device dimm,populated=off" (this would
require some ugly checking in normal -device argument handling). Only the dimm
layout is saved. The hotplug is triggered from a normal device_add later. So in
this case, the "acpi hotplug" happens at the same time as the qdev hotplug.
Do you see a simpler alternative without introducing a new option?
Using the "-dimm" option follows the second semantic and avoids changing the "-device"
semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev
hotplug coincides with acpi hotplug.
Maybe even the dimmbus device shouldn't exist by itself after all, or
it should be pretty much invisible to users. On real HW, the memory
controller or south bridge handles the memory. For i440fx, it's part
of the same chipset. So I think we should just add qdev properties to
i440fx to specify the sizes, nodes etc. Then i440fx should create the
dimmbus device unconditionally using the properties. The default
properties should create a sane configuration, otherwise -global
i440fx.dimm_size=512M etc. could be used. Then the bus would be
populated as before or with device_add.
Post by Vasilis Liaskovitis
thanks,
- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-10-09 17:04:29 UTC
Permalink
Hi,

sorry for the delayed answer.
Post by Blue Swirl
Post by Vasilis Liaskovitis
The "-dimm" option is supposed to specify the dimm/memory layout, and not create
any devices.
A "-device/device_add" means we create a new qdev device at startup or as a
hotplug operation respectively. So, the semantics of
"-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me.
What does "-device dimm,populated=off" mean from a qdev perspective? There are 2
- The device is created on the dimmbus, but is not used/populated yet. Than the
activation/acpi-hotplug of the dimm may require a separate command (we used to have
"dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev
device, so this wouldn't fit this usecase, because the device already exists. In
this case, the actual "acpi hotplug" operation is decoupled from qdev device
creation.
The bus exists but the devices do not, device_add would add DIMMs to
the bus. This matches PCI bus created by the host bridge and PCI
device hotplug.
A more complex setup would be dimm bus, dimm slot devices and DIMM
devices. The intermediate slot device would contain one DIMM device if
plugged.
interesting, I haven't thought about this alternative. It does sound overly
complex, but a dimmslot / dimmdevice splitup could consolidate hotplug semantic
differences between populated=on/off. Something similar to the dimmslot device
is already present in v3 (dimmcfg structure), but it's not a qdev visible device.
I'd rather avoid the complication, but I might revisit this idea.
Post by Blue Swirl
Post by Vasilis Liaskovitis
- The dimmdevice is not created when "-device dimm,populated=off" (this would
require some ugly checking in normal -device argument handling). Only the dimm
layout is saved. The hotplug is triggered from a normal device_add later. So in
this case, the "acpi hotplug" happens at the same time as the qdev hotplug.
Do you see a simpler alternative without introducing a new option?
Using the "-dimm" option follows the second semantic and avoids changing the "-device"
semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev
hotplug coincides with acpi hotplug.
Maybe even the dimmbus device shouldn't exist by itself after all, or
it should be pretty much invisible to users. On real HW, the memory
controller or south bridge handles the memory. For i440fx, it's part
of the same chipset. So I think we should just add qdev properties to
i440fx to specify the sizes, nodes etc. Then i440fx should create the
dimmbus device unconditionally using the properties. The default
properties should create a sane configuration, otherwise -global
i440fx.dimm_size=512M etc. could be used. Then the bus would be
populated as before or with device_add.
hmm the problem with using only i440fx properties, is that size/nodes look
dimm specific to me, not chipset-memcontroller specific. Unless we only allow
uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as
properties of a qdev device?

Also if there is no dimmbus, and instead we have only links<> from i440fx to dimm-devices,
would the current qdev hotplug API be enough?

I am currently leaning towards this: i440fx unconditionally creates the dimmbus. Users
don't have to specify the bus (I assume this is what you mean by "dimmbus should
be invisible to the users")

We only use "-device dimm" to describe dimms. With "-device dimm,populated=off", only
the dimm config layout will be saved in the dimmbus. The hotplug is triggered from a normal
device_add later (same as pci hotplug).

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-10-13 08:57:19 UTC
Permalink
On Tue, Oct 9, 2012 at 5:04 PM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
Hi,
sorry for the delayed answer.
Post by Blue Swirl
Post by Vasilis Liaskovitis
The "-dimm" option is supposed to specify the dimm/memory layout, and not create
any devices.
A "-device/device_add" means we create a new qdev device at startup or as a
hotplug operation respectively. So, the semantics of
"-device dimm,id=dimm0,size=512M,node=0,populated=on" are clear to me.
What does "-device dimm,populated=off" mean from a qdev perspective? There are 2
- The device is created on the dimmbus, but is not used/populated yet. Than the
activation/acpi-hotplug of the dimm may require a separate command (we used to have
"dimm_add" in versions < 3). "device_add" handling always hotplugs a new qdev
device, so this wouldn't fit this usecase, because the device already exists. In
this case, the actual "acpi hotplug" operation is decoupled from qdev device
creation.
The bus exists but the devices do not, device_add would add DIMMs to
the bus. This matches PCI bus created by the host bridge and PCI
device hotplug.
A more complex setup would be dimm bus, dimm slot devices and DIMM
devices. The intermediate slot device would contain one DIMM device if
plugged.
interesting, I haven't thought about this alternative. It does sounds overly
complex, but a dimmslot / dimmdevice splitup could consolidate hotplug semantic
differences between populated=on/off. Something similar to the dimmslot device
is already present in v3 (dimmcfg structure), but it's not a qdev visible device.
I 'd rather avoid the complication, but i might revisit this idea.
The memory controller could be able to also enable and disable slots
independently to their population state.
Post by Vasilis Liaskovitis
Post by Blue Swirl
Post by Vasilis Liaskovitis
- The dimmdevice is not created when "-device dimm,populated=off" (this would
require some ugly checking in normal -device argument handling). Only the dimm
layout is saved. The hotplug is triggered from a normal device_add later. So in
this case, the "acpi hotplug" happens at the same time as the qdev hotplug.
Do you see a simpler alternative without introducing a new option?
Using the "-dimm" option follows the second semantic and avoids changing the "-device"
semantics. Dimm layout description is decoupled from dimmdevice creation, and qdev
hotplug coincides with acpi hotplug.
Maybe even the dimmbus device shouldn't exist by itself after all, or
it should be pretty much invisible to users. On real HW, the memory
controller or south bridge handles the memory. For i440fx, it's part
of the same chipset. So I think we should just add qdev properties to
i440fx to specify the sizes, nodes etc. Then i440fx should create the
dimmbus device unconditionally using the properties. The default
properties should create a sane configuration, otherwise -global
i440fx.dimm_size=512M etc. could be used. Then the bus would be
populated as before or with device_add.
hmm the problem with using only i440fx properties, is that size/nodes look
dimm specific to me, not chipset-memcontroller specific. Unless we only allow
uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as
properties of a qdev device?
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
Post by Vasilis Liaskovitis
Also if there is no dimmbus, and instead we have only links<> from i440fx to dimm-devices,
would the current qdev hotplug API be enough?
I'd just disable hotplug if there is no dimmbus (ISA PC?).
Post by Vasilis Liaskovitis
I am currently leaning towards this: i440fx unconditionally creates the dimmbus. Users
don't have to specify the bus (i assume this is what you mean by "dimmbus should
be invisible to the users")
We only use "-device dimm" to describe dimms. With "-device dimm,populated=off", only
the dimm config layout will be saved in the dimmbus. The hotplug is triggered from a normal
device_add later (same as pci hotplug).
OK.
Post by Vasilis Liaskovitis
thanks,
- Vasilis
Vasilis Liaskovitis
2012-10-17 09:19:23 UTC
Permalink
Post by Blue Swirl
On Tue, Oct 9, 2012 at 5:04 PM, Vasilis Liaskovitis
snip
Post by Blue Swirl
Post by Vasilis Liaskovitis
Post by Blue Swirl
Maybe even the dimmbus device shouldn't exist by itself after all, or
it should be pretty much invisible to users. On real HW, the memory
controller or south bridge handles the memory. For i440fx, it's part
of the same chipset. So I think we should just add qdev properties to
i440fx to specify the sizes, nodes etc. Then i440fx should create the
dimmbus device unconditionally using the properties. The default
properties should create a sane configuration, otherwise -global
i440fx.dimm_size=512M etc. could be used. Then the bus would be
populated as before or with device_add.
hmm the problem with using only i440fx properties, is that size/nodes look
dimm specific to me, not chipset-memcontroller specific. Unless we only allow
uniform size dimms. Is it possible to have a dynamic list of sizes/nodes pairs as
properties of a qdev device?
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?

The i440fx DRB registers only support up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).

I'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.

There was also an old discussion about i440fx modeling here:
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.

Possible solutions:

1) is there a newer and more flexible chipset that we could model?

2) model and document a generic (non-existent) i440fx that would support more
and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description
similar to the i440fx DRB registers, the registers would take up a lot of space.
In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how
many 8MB chunks are contained in DIMMs 0...i. So, the register values are
cumulative (and total described memory cannot exceed 256x8MB = 2GB)

We could for example model:
- an 8-bit non-cumulative register for each DIMM, denoting how many
128MB chunks it contains. This allows 32GB for each DIMM, and with 255 DIMMs we
describe a bit less than 8TB. These registers require 255 bytes.
- a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows
us to describe 8TB of memory (but the registers take up double the space, because
they describe cumulative memory amounts)

3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling
is not done (at least for i440fx, other machines could). This is the least precise
in terms of emulation. On the other hand, if we are not really trying to emulate
the real (too restrictive) hardware, does it matter?

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-17 10:03:51 UTC
Permalink
Post by Vasilis Liaskovitis
Post by Blue Swirl
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).
I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.
1) is there a newer and more flexible chipset that we could model?
Look for q35 on this list.
Post by Vasilis Liaskovitis
2) model and document
^--- the critical bit
Post by Vasilis Liaskovitis
a generic (non-existent) i440fx that would support more
and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description
similar to the i440fx DRB registers, the registers would take up a lot of space.
In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how
many 8MB chunks are contained in DIMMs 0...i. So, the register values are
cumulative (and total described memory cannot exceed 256x8MB = 2GB)
Our i440fx has already been extended by support for pci and cpu hotplug,
and I see no reason not to extend it for memory. We can allocate extra
mmio space for registers if needed. Usually I'm against this sort of
thing, but in this case we don't have much choice.
Post by Vasilis Liaskovitis
- an 8-bit non-cumulative register for each DIMM, denoting how many
128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we
describe a bit less than 8TB. These registers require 255 bytes.
- a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows
us to describe 8TB of memory (but the registers take up double the space, because
they describe cumulative memory amounts)
There is no reason to save space. Why not have two 64-bit registers per
DIMM, one describing the size and the other the base address, both in
bytes? Use a few low order bits for control.
Post by Vasilis Liaskovitis
3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling
is not done (at least for i440fx, other machines could). This is the least precise
in terms of emulation. On the other hand, if we are not really trying to emulate
the real (too restrictive) hardware, does it matter?
We could emulate base memory using the chipset, and extra memory using
the scheme above. This allows guests that are tied to the chipset to
work, and guests that have more awareness (seabios) to use the extra
features.
--
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-10-18 09:27:37 UTC
Permalink
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Blue Swirl
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).
I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.
1) is there a newer and more flexible chipset that we could model?
Look for q35 on this list.
thanks, I'll take a look. It sounds like the other options below are more
straightforward now, but let me know if you prefer q35 integration as a priority.
Post by Avi Kivity
Post by Vasilis Liaskovitis
2) model and document
^--- the critical bit
Post by Vasilis Liaskovitis
a generic (non-existent) i440fx that would support more
and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description
similar to the i440fx DRB registers, the registers would take up a lot of space.
In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how
many 8MB chunks are contained in DIMMs 0...i. So, the register values are
cumulative (and total described memory cannot exceed 256x8MB = 2GB)
Our i440fx has already been extended by support for pci and cpu hotplug,
and I see no reason not to extend it for memory. We can allocate extra
mmio space for registers if needed. Usually I'm against this sort of
thing, but in this case we don't have much choice.
ok
Post by Avi Kivity
Post by Vasilis Liaskovitis
- an 8-bit non-cumulative register for each DIMM, denoting how many
128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we
describe a bit less than 8TB. These registers require 255 bytes.
- a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows
us to describe 8TB of memory (but the registers take up double the space, because
they describe cumulative memory amounts)
There is no reason to save space. Why not have two 64-bit registers per
DIMM, one describing the size and the other the base address, both in
bytes? Use a few low order bits for control.
Do we want this generic scheme above to be tied into the i440fx/pc machine?
Or have it as a separate generic memory bus / pmc usable by others (e.g. in
hw/dimm.c)?
The 64-bit values you describe are already part of DimmDevice properties, but
they are not hardware registers described as part of a chipset.

In terms of control bits, did you want to mimic some other chipset registers? -
any examples would be useful.
Post by Avi Kivity
Post by Vasilis Liaskovitis
3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling
is not done (at least for i440fx, other machines could). This is the least precise
in terms of emulation. On the other hand, if we are not really trying to emulate
the real (too restrictive) hardware, does it matter?
We could emulate base memory using the chipset, and extra memory using
the scheme above. This allows guests that are tied to the chipset to
work, and guests that have more awareness (seabios) to use the extra
features.
But if we use the real i440fx pmc DRBs for base memory, this means base memory
would be <= 2GB, right?

Sounds like we'd need to change the DRBs anyway to describe useful amounts of
base memory (e.g. 512MB chunks and check against address lines [36:29] can
describe base memory up to 64GB, though that's still limiting for very large
VMs). But we'd be diverging from the real hardware again.

Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we
could only use DRB[0] (one DIMM describing all of base memory) or more.

DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only
(unless it makes sense to allow hotplug in the remaining pmc DRBs and
start using the generic scheme once we run out of emulated DRBs)

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-18 12:33:02 UTC
Permalink
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Blue Swirl
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).
I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.
1) is there a newer and more flexible chipset that we could model?
Look for q35 on this list.
thanks, I 'll take a look. It sounds like the other options below are more
straightforward now, but let me know if you prefer q35 integration as a priority.
At least validate that what you're doing fits with how q35 works.
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
- an 8-bit non-cumulative register for each DIMM, denoting how many
128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we
describe a bit less than 8TB. These registers require 255 bytes.
- a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows
us to describe 8TB of memory (but the registers take up double the space, because
they describe cumulative memory amounts)
There is no reason to save space. Why not have two 64-bit registers per
DIMM, one describing the size and the other the base address, both in
bytes? Use a few low order bits for control.
Do we want this generic scheme above to be tied into the i440fx/pc machine?
Yes. q35 should work according to its own specifications.
Post by Vasilis Liaskovitis
Or have it as a separate generic memory bus / pmc usable by others (e.g. in
hw/dimm.c)?
The 64-bit values you describe are already part of DimmDevice properties, but
they are not hardware registers described as part of a chipset.
In terms of control bits, did you want to mimic some other chipset registers? -
any examples would be useful.
I don't have any real requirements. Just make it simple and easily
accessible to ACPI code.
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling
is not done (at least for i440fx, other machines could). This is the least precise
in terms of emulation. On the other hand, if we are not really trying to emulate
the real (too restrictive) hardware, does it matter?
We could emulate base memory using the chipset, and extra memory using
the scheme above. This allows guests that are tied to the chipset to
work, and guests that have more awareness (seabios) to use the extra
features.
But if we use the real i440fx pmc DRBs for base memory, this means base memory
would be <= 2GB, right?
Sounds like we 'd need to change the DRBs anyway to describe useful amounts of
base memory (e.g. 512MB chunks and check against address lines [36:29] can
describe base memory up to 64GB, though that's still limiting for very large
VMs). But we'd be diverting from the real hardware again.
Then there's no point. Modelling real hardware allows guests written to
work against that hardware to function correctly. If you diverge, they
won't.
Post by Vasilis Liaskovitis
Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we
could only use DRB[0] (one DIMM describing all of base memory) or more.
DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only
(unless it makes sense to allow hotplug in the remaining pmc DRBs and
start using the generic scheme once we run out of emulated DRBs)
440fx seems a lost cause, so we can go wild and just implement pv dimms.
For q35 I'd like to stay within the spec.
--
error compiling committee.c: too many arguments to function
Blue Swirl
2012-10-19 17:48:09 UTC
Permalink
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Blue Swirl
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).
I 'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.
1) is there a newer and more flexible chipset that we could model?
Look for q35 on this list.
thanks, I 'll take a look. It sounds like the other options below are more
straightforward now, but let me know if you prefer q35 integration as a priority.
At least validate that what you're doing fits with how q35 works.
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
- an 8-bit non-cumulative register for each DIMM, denoting how many
128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 DIMMs we
describe a bit less than 8TB. These registers require 255 bytes.
- a 16-bit cumulative register for each DIMM again for 128MB chunks. This allows
us to describe 8TB of memory (but the registers take up double the space, because
they describe cumulative memory amounts)
There is no reason to save space. Why not have two 64-bit registers per
DIMM, one describing the size and the other the base address, both in
bytes? Use a few low order bits for control.
Do we want this generic scheme above to be tied into the i440fx/pc machine?
Yes. q35 should work according to its own specifications.
Post by Vasilis Liaskovitis
Or have it as a separate generic memory bus / pmc usable by others (e.g. in
hw/dimm.c)?
The 64-bit values you describe are already part of DimmDevice properties, but
they are not hardware registers described as part of a chipset.
In terms of control bits, did you want to mimic some other chipset registers? -
any examples would be useful.
I don't have any real requirements. Just make it simple and easily
accessible to ACPI code.
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
3) let everything be handled/abstracted by dimmbus - the chipset DRB modelling
is not done (at least for i440fx, other machines could). This is the least precise
in terms of emulation. On the other hand, if we are not really trying to emulate
the real (too restrictive) hardware, does it matter?
We could emulate base memory using the chipset, and extra memory using
the scheme above. This allows guests that are tied to the chipset to
work, and guests that have more awareness (seabios) to use the extra
features.
But if we use the real i440fx pmc DRBs for base memory, this means base memory
would be <= 2GB, right?
Sounds like we'd need to change the DRBs anyway to describe useful amounts of
base memory (e.g. 512MB chunks and check against address lines [36:29] can
describe base memory up to 64GB, though that's still limiting for very large
VMs). But we'd be diverting from the real hardware again.
Then there's no point. Modelling real hardware allows guests written to
work against that hardware to function correctly. If you diverge, they
won't.
The guest is also unlikely to want to reprogram the memory controller.
Post by Avi Kivity
Post by Vasilis Liaskovitis
Then we can model base memory with "tweaked" i440fx pmc's DRB registers - we
could only use DRB[0] (one DIMM describing all of base memory) or more.
DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only
(unless it makes sense to allow hotplug in the remaining pmc DRBs and
start using the generic scheme once we run out of emulated DRBs)
440fx seems a lost cause, so we can go wild and just implement pv dimms.
Maybe. But what would be a PV DIMM? Do we need any DIMM-like
granularity at all, instead the guest could be told to use a list of
RAM regions with arbitrary start and end addresses? Isn't ballooning
also related?
Post by Avi Kivity
For q35 I'd like to stay within the spec.
That may not last forever when machines have terabytes of memory.
Post by Avi Kivity
--
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-22 10:55:30 UTC
Permalink
Post by Blue Swirl
Post by Avi Kivity
Post by Vasilis Liaskovitis
DIMMs would be allowed to be hotplugged in the generic mem-controller scheme only
(unless it makes sense to allow hotplug in the remaining pmc DRBs and
start using the generic scheme once we run out of emulated DRBs)
440fx seems a lost cause, so we can go wild and just implement pv dimms.
Maybe. But what would be a PV DIMM? Do we need any DIMM-like
granularity at all, instead the guest could be told to use a list of
RAM regions with arbitrary start and end addresses?
Guests are likely to support something that has the same constraints as
real hardware. If we allow non-power-of-two DIMMs, we might find that
guests don't support them well.
Post by Blue Swirl
Isn't ballooning
also related?
It is related in that it is also a memory hotplug technology. But
ballooning is subtractive and fine-grained, whereas classic hotplug is
additive and coarse-grained. We can use both together, but I don't
think any work is needed at the qemu level.
Post by Blue Swirl
Post by Avi Kivity
For q35 I'd like to stay within the spec.
That may not last forever when machines have terabytes of memory.
At least there's work for chipset implementers. Or we can do PV-DIMMs
for q35 too.
--
error compiling committee.c: too many arguments to function
Vasilis Liaskovitis
2012-10-22 08:39:49 UTC
Permalink
Hi,
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Avi Kivity
Post by Vasilis Liaskovitis
Post by Blue Swirl
I don't think so, but probably there's a limit of DIMMs that real
controllers have, something like 8 max.
In the case of i440fx specifically, do you mean that we should model the DRB
(Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
The i440fx DRB registers only support up to 8 DRAM rows (let's say 1 row
maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
memory afaict (bit 31 and above is ignored).
I'd rather not model this part of the i440fx - having only 8 DIMMs seems too
restrictive. The rest of the patchset supports up to 255 DIMMs so it would be a
waste imho to model an old pc memory controller that only supports 8 DIMMs.
https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
the general direction was that i440fx is too old and we don't want to precisely
emulate the DRB registers, since they lack flexibility.
1) is there a newer and more flexible chipset that we could model?
Look for q35 on this list.
thanks, I'll take a look. It sounds like the other options below are more
straightforward now, but let me know if you prefer q35 integration as a priority.
At least validate that what you're doing fits with how q35 works.
In terms of pmc modeling, the q35 page http://wiki.qemu.org/Features/Q35
mentions:

Refactor i440fx to create i440fx-pmc class
ich9: model ICH9 Super I/O chip
ich9: make i440fx-pmc a generic PCNorthBridge class and add support for ich9
northbridge

is this still the plan? There was an old patchset creating i440fx-pmc here:
http://lists.gnu.org/archive/html/qemu-devel/2012-01/msg03501.html
but I am not sure if it has been dropped or worked on. v3 of the q35 patchset
doesn't include a pmc I think.

It would be good to know what the current plan regarding pmc modeling (for both
q35 and i440fx) is.

thanks,

- Vasilis

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:24 UTC
Permalink
Dimm physical address offsets are calculated automatically and memory map is
adjusted accordingly. If a DIMM can fit before the PCI_HOLE_START (currently
0xe0000000), it will be added normally, otherwise its physical address will be
above 4GB.

Also create memory bus on i440fx-pcihost device.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hw/pc.c | 41 +++++++++++++++++++++++++++++++++++++++++
hw/pc.h | 6 ++++++
hw/pc_piix.c | 20 ++++++++++++++------
vl.c | 1 +
4 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index 112739a..2c9664d 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -52,6 +52,7 @@
#include "arch_init.h"
#include "bitmap.h"
#include "vga-pci.h"
+#include "dimm.h"

/* output Bochs bios info messages */
//#define DEBUG_BIOS
@@ -93,6 +94,9 @@ struct e820_table {
static struct e820_table e820_table;
struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};

+ram_addr_t below_4g_hp_mem_size = 0;
+ram_addr_t above_4g_hp_mem_size = 0;
+extern target_phys_addr_t ram_hp_offset;
void gsi_handler(void *opaque, int n, int level)
{
GSIState *s = opaque;
@@ -1160,3 +1164,40 @@ void pc_pci_device_init(PCIBus *pci_bus)
pci_create_simple(pci_bus, -1, "lsi53c895a");
}
}
+
+
+/* Function to configure memory offsets of hotpluggable dimms */
+
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
diff --git a/hw/pc.h b/hw/pc.h
index e4db071..f3304fc 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -10,6 +10,7 @@
#include "memory.h"
#include "ioapic.h"

+#define PCI_HOLE_START 0xe0000000
/* PC-style peripherals (also used by other machines). */

/* serial.c */
@@ -214,6 +215,11 @@ static inline bool isa_ne2000_init(ISABus *bus, int base, int irq, NICInfo *nd)
/* pc_sysfw.c */
void pc_system_firmware_init(MemoryRegion *rom_memory);

+/* memory hotplug */
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size);
+extern ram_addr_t below_4g_hp_mem_size;
+extern ram_addr_t above_4g_hp_mem_size;
+
/* e820 types */
#define E820_RAM 1
#define E820_RESERVED 2
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 88ff041..d1fd276 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -43,6 +43,7 @@
#include "xen.h"
#include "memory.h"
#include "exec-memory.h"
+#include "dimm.h"
#ifdef CONFIG_XEN
# include <xen/hvm/hvm_info_table.h>
#endif
@@ -155,9 +156,9 @@ static void pc_init1(MemoryRegion *system_memory,
kvmclock_create();
}

- if (ram_size >= 0xe0000000 ) {
- above_4g_mem_size = ram_size - 0xe0000000;
- below_4g_mem_size = 0xe0000000;
+ if (ram_size >= PCI_HOLE_START ) {
+ above_4g_mem_size = ram_size - PCI_HOLE_START;
+ below_4g_mem_size = PCI_HOLE_START;
} else {
above_4g_mem_size = 0;
below_4g_mem_size = ram_size;
@@ -172,6 +173,9 @@ static void pc_init1(MemoryRegion *system_memory,
rom_memory = system_memory;
}

+ /* adjust memory map for hotplug dimms */
+ dimm_calc_offsets(pc_set_hp_memory_offset);
+
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
fw_cfg = pc_memory_init(system_memory,
@@ -192,9 +196,11 @@ static void pc_init1(MemoryRegion *system_memory,
if (pci_enabled) {
pci_bus = i440fx_init(&i440fx_state, &piix3_devfn, &isa_bus, gsi,
system_memory, system_io, ram_size,
- below_4g_mem_size,
- 0x100000000ULL - below_4g_mem_size,
- 0x100000000ULL + above_4g_mem_size,
+ below_4g_mem_size + below_4g_hp_mem_size,
+ 0x100000000ULL - below_4g_mem_size
+ - below_4g_hp_mem_size,
+ 0x100000000ULL + above_4g_mem_size
+ + above_4g_hp_mem_size,
(sizeof(target_phys_addr_t) == 4
? 0
: ((uint64_t)1 << 62)),
@@ -223,6 +229,8 @@ static void pc_init1(MemoryRegion *system_memory,
ioapic_init(gsi_state);
}

+ main_memory_bus_create(object_resolve_path("i440fx", NULL));
+
pc_register_ferr_irq(gsi[13]);

pc_vga_init(isa_bus, pci_enabled ? pci_bus : NULL);
diff --git a/vl.c b/vl.c
index af1745c..2282910 100644
--- a/vl.c
+++ b/vl.c
@@ -184,6 +184,7 @@ DisplayType display_type = DT_DEFAULT;
int display_remote = 0;
const char* keyboard_layout = NULL;
ram_addr_t ram_size;
+ram_addr_t ram_hp_offset;
const char *mem_path = NULL;
#ifdef MAP_POPULATE
int mem_prealloc = 0; /* force preallocation of physical target memory */
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-09-22 14:15:28 UTC
Permalink
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
Dimm physical address offsets are calculated automatically and memory map is
adjusted accordingly. If a DIMM can fit before the PCI_HOLE_START (currently
0xe0000000), it will be added normally, otherwise its physical address will be
above 4GB.
Also create memory bus on i440fx-pcihost device.
---
hw/pc.c | 41 +++++++++++++++++++++++++++++++++++++++++
hw/pc.h | 6 ++++++
hw/pc_piix.c | 20 ++++++++++++++------
vl.c | 1 +
4 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/hw/pc.c b/hw/pc.c
index 112739a..2c9664d 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -52,6 +52,7 @@
#include "arch_init.h"
#include "bitmap.h"
#include "vga-pci.h"
+#include "dimm.h"
/* output Bochs bios info messages */
//#define DEBUG_BIOS
@@ -93,6 +94,9 @@ struct e820_table {
static struct e820_table e820_table;
struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
+ram_addr_t below_4g_hp_mem_size = 0;
+ram_addr_t above_4g_hp_mem_size = 0;
+extern target_phys_addr_t ram_hp_offset;
extern declarations belong to headers only.
Post by Vasilis Liaskovitis
void gsi_handler(void *opaque, int n, int level)
{
GSIState *s = opaque;
@@ -1160,3 +1164,40 @@ void pc_pci_device_init(PCIBus *pci_bus)
pci_create_simple(pci_bus, -1, "lsi53c895a");
}
}
+
+
+/* Function to configure memory offsets of hotpluggable dimms */
+
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
} else if ...
Post by Vasilis Liaskovitis
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
} else {
Post by Vasilis Liaskovitis
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
But the function and use of lots of global variables is ugly. The dimm
devices should be just created in piix_pci.c (i440fx) directly with
correct offsets and sizes, so all below_4g_mem_size etc. calculations
should be moved there. That would implement the PMC part of i440fx.

For ISA PC, probably the board should create the DIMMs since there may
not be a memory controller. The >4G logic does not make sense there
anyway.
Post by Vasilis Liaskovitis
diff --git a/hw/pc.h b/hw/pc.h
index e4db071..f3304fc 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -10,6 +10,7 @@
#include "memory.h"
#include "ioapic.h"
+#define PCI_HOLE_START 0xe0000000
/* PC-style peripherals (also used by other machines). */
/* serial.c */
@@ -214,6 +215,11 @@ static inline bool isa_ne2000_init(ISABus *bus, int base, int irq, NICInfo *nd)
/* pc_sysfw.c */
void pc_system_firmware_init(MemoryRegion *rom_memory);
+/* memory hotplug */
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size);
+extern ram_addr_t below_4g_hp_mem_size;
+extern ram_addr_t above_4g_hp_mem_size;
+
/* e820 types */
#define E820_RAM 1
#define E820_RESERVED 2
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 88ff041..d1fd276 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -43,6 +43,7 @@
#include "xen.h"
#include "memory.h"
#include "exec-memory.h"
+#include "dimm.h"
#ifdef CONFIG_XEN
# include <xen/hvm/hvm_info_table.h>
#endif
@@ -155,9 +156,9 @@ static void pc_init1(MemoryRegion *system_memory,
kvmclock_create();
}
- if (ram_size >= 0xe0000000 ) {
- above_4g_mem_size = ram_size - 0xe0000000;
- below_4g_mem_size = 0xe0000000;
+ if (ram_size >= PCI_HOLE_START ) {
+ above_4g_mem_size = ram_size - PCI_HOLE_START;
+ below_4g_mem_size = PCI_HOLE_START;
} else {
above_4g_mem_size = 0;
below_4g_mem_size = ram_size;
@@ -172,6 +173,9 @@ static void pc_init1(MemoryRegion *system_memory,
rom_memory = system_memory;
}
+ /* adjust memory map for hotplug dimms */
+ dimm_calc_offsets(pc_set_hp_memory_offset);
+
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
fw_cfg = pc_memory_init(system_memory,
@@ -192,9 +196,11 @@ static void pc_init1(MemoryRegion *system_memory,
if (pci_enabled) {
pci_bus = i440fx_init(&i440fx_state, &piix3_devfn, &isa_bus, gsi,
system_memory, system_io, ram_size,
- below_4g_mem_size,
- 0x100000000ULL - below_4g_mem_size,
- 0x100000000ULL + above_4g_mem_size,
+ below_4g_mem_size + below_4g_hp_mem_size,
+ 0x100000000ULL - below_4g_mem_size
+ - below_4g_hp_mem_size,
+ 0x100000000ULL + above_4g_mem_size
+ + above_4g_hp_mem_size,
(sizeof(target_phys_addr_t) == 4
? 0
: ((uint64_t)1 << 62)),
@@ -223,6 +229,8 @@ static void pc_init1(MemoryRegion *system_memory,
ioapic_init(gsi_state);
}
+ main_memory_bus_create(object_resolve_path("i440fx", NULL));
+
pc_register_ferr_irq(gsi[13]);
pc_vga_init(isa_bus, pci_enabled ? pci_bus : NULL);
diff --git a/vl.c b/vl.c
index af1745c..2282910 100644
--- a/vl.c
+++ b/vl.c
@@ -184,6 +184,7 @@ DisplayType display_type = DT_DEFAULT;
int display_remote = 0;
const char* keyboard_layout = NULL;
ram_addr_t ram_size;
+ram_addr_t ram_hp_offset;
const char *mem_path = NULL;
#ifdef MAP_POPULATE
int mem_prealloc = 0; /* force preallocation of physical target memory */
--
1.7.9
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-24 15:27:20 UTC
Permalink
Post by Blue Swirl
Post by Vasilis Liaskovitis
+
+/* Function to configure memory offsets of hotpluggable dimms */
+
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
} else if ...
Post by Vasilis Liaskovitis
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
} else {
Post by Vasilis Liaskovitis
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
But the function and use of lots of global variables is ugly. The dimm
devices should be just created in piix_pci.c (i440fx) directly with
correct offsets and sizes, so all below_4g_mem_size etc. calculations
should be moved there. That would implement the PMC part of i440fx.
For ISA PC, probably the board should create the DIMMs since there may
not be a memory controller. The >4G logic does not make sense there
anyway.
What about moving the implementation to pc_piix.c?
Initial RAM and pci windows are already calculated in pc_init1, and then passed
to i440fx_init. The memory bus could be attached to i440fx for pci-enabled pc
and to isabus-bridge for isa-pc (isa-pc not tested yet).

Something like the following:

---
hw/pc.h | 1 +
hw/pc_piix.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/hw/pc.h b/hw/pc.h
index e4db071..d6cc43b 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -10,6 +10,7 @@
#include "memory.h"
#include "ioapic.h"

+#define PCI_HOLE_START 0xe0000000
/* PC-style peripherals (also used by other machines). */

/* serial.c */
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 88ff041..17db95a 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -43,6 +43,7 @@
#include "xen.h"
#include "memory.h"
#include "exec-memory.h"
+#include "dimm.h"
#ifdef CONFIG_XEN
# include <xen/hvm/hvm_info_table.h>
#endif
@@ -52,6 +53,8 @@
static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 };
static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 };
static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
+static ram_addr_t below_4g_hp_mem_size = 0;
+static ram_addr_t above_4g_hp_mem_size = 0;

static void kvm_piix3_setup_irq_routing(bool pci_enabled)
{
@@ -117,6 +120,41 @@ static void ioapic_init(GSIState *gsi_state)
}
}

+static target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+ static ram_addr_t ram_hp_offset = 0;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
+
/* PC hardware initialisation */
static void pc_init1(MemoryRegion *system_memory,
MemoryRegion *system_io,
@@ -155,9 +193,9 @@ static void pc_init1(MemoryRegion *system_memory,
kvmclock_create();
}

- if (ram_size >= 0xe0000000 ) {
- above_4g_mem_size = ram_size - 0xe0000000;
- below_4g_mem_size = 0xe0000000;
+ if (ram_size >= PCI_HOLE_START ) {
+ above_4g_mem_size = ram_size - PCI_HOLE_START;
+ below_4g_mem_size = PCI_HOLE_START;
} else {
above_4g_mem_size = 0;
below_4g_mem_size = ram_size;
@@ -172,6 +210,9 @@ static void pc_init1(MemoryRegion *system_memory,
rom_memory = system_memory;
}

+ /* adjust memory map for hotplug dimms */
+ dimm_calc_offsets(pc_set_hp_memory_offset);
+
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
fw_cfg = pc_memory_init(system_memory,
@@ -192,18 +233,22 @@ static void pc_init1(MemoryRegion *system_memory,
if (pci_enabled) {
pci_bus = i440fx_init(&i440fx_state, &piix3_devfn, &isa_bus, gsi,
system_memory, system_io, ram_size,
- below_4g_mem_size,
- 0x100000000ULL - below_4g_mem_size,
- 0x100000000ULL + above_4g_mem_size,
+ below_4g_mem_size + below_4g_hp_mem_size,
+ 0x100000000ULL - below_4g_mem_size
+ - below_4g_hp_mem_size,
+ 0x100000000ULL + above_4g_mem_size
+ + above_4g_hp_mem_size,
(sizeof(target_phys_addr_t) == 4
? 0
: ((uint64_t)1 << 62)),
pci_memory, ram_memory);
+ main_memory_bus_create(object_resolve_path("i440fx", NULL));
} else {
pci_bus = NULL;
i440fx_state = NULL;
isa_bus = isa_bus_new(NULL, system_io);
no_hpet = 1;
+ main_memory_bus_create(object_resolve_path("isabus-bridge", NULL));
}
isa_bus_irqs(isa_bus, gsi);
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-09-29 11:27:49 UTC
Permalink
On Mon, Sep 24, 2012 at 3:27 PM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
Post by Blue Swirl
Post by Vasilis Liaskovitis
+
+/* Function to configure memory offsets of hotpluggable dimms */
+
+target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
} else if ...
Post by Vasilis Liaskovitis
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
} else {
Post by Vasilis Liaskovitis
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
But the function and use of lots of global variables is ugly. The dimm
devices should be just created in piix_pci.c (i440fx) directly with
correct offsets and sizes, so all below_4g_mem_size etc. calculations
should be moved there. That would implement the PMC part of i440fx.
For ISA PC, probably the board should create the DIMMs since there may
not be a memory controller. The >4G logic does not make sense there
anyway.
What about moving the implementation to pc_piix.c?
Initial RAM and pci windows are already calculated in pc_init1, and then passed
to i440fx_init. The memory bus could be attached to i440fx for pci-enabled pc
and to isabus-bridge for isa-pc (isa-pc not tested yet).
I'd move the calculations also to i440fx, it (PMC) determines the
memory configuration on real HW too.
Post by Vasilis Liaskovitis
---
hw/pc.h | 1 +
hw/pc_piix.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/hw/pc.h b/hw/pc.h
index e4db071..d6cc43b 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -10,6 +10,7 @@
#include "memory.h"
#include "ioapic.h"
+#define PCI_HOLE_START 0xe0000000
/* PC-style peripherals (also used by other machines). */
/* serial.c */
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 88ff041..17db95a 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -43,6 +43,7 @@
#include "xen.h"
#include "memory.h"
#include "exec-memory.h"
+#include "dimm.h"
#ifdef CONFIG_XEN
# include <xen/hvm/hvm_info_table.h>
#endif
@@ -52,6 +53,8 @@
static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 };
static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 };
static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
+static ram_addr_t below_4g_hp_mem_size = 0;
+static ram_addr_t above_4g_hp_mem_size = 0;
static void kvm_piix3_setup_irq_routing(bool pci_enabled)
{
@@ -117,6 +120,41 @@ static void ioapic_init(GSIState *gsi_state)
}
}
+static target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
+{
+ target_phys_addr_t ret;
+ static ram_addr_t ram_hp_offset = 0;
+
+ /* on first call, initialize ram_hp_offset */
+ if (!ram_hp_offset) {
+ if (ram_size >= PCI_HOLE_START ) {
+ ram_hp_offset = 0x100000000LL + (ram_size - PCI_HOLE_START);
+ } else {
+ ram_hp_offset = ram_size;
+ }
+ }
+
+ if (ram_hp_offset >= 0x100000000LL) {
+ ret = ram_hp_offset;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (ram_hp_offset + size <= PCI_HOLE_START) {
+ ret = ram_hp_offset;
+ below_4g_hp_mem_size += size;
+ ram_hp_offset += size;
+ }
+ /* otherwise place it above 4GB */
+ else {
+ ret = 0x100000000LL;
+ above_4g_hp_mem_size += size;
+ ram_hp_offset = 0x100000000LL + size;
+ }
+
+ return ret;
+}
+
/* PC hardware initialisation */
static void pc_init1(MemoryRegion *system_memory,
MemoryRegion *system_io,
@@ -155,9 +193,9 @@ static void pc_init1(MemoryRegion *system_memory,
kvmclock_create();
}
- if (ram_size >= 0xe0000000 ) {
- above_4g_mem_size = ram_size - 0xe0000000;
- below_4g_mem_size = 0xe0000000;
+ if (ram_size >= PCI_HOLE_START ) {
+ above_4g_mem_size = ram_size - PCI_HOLE_START;
+ below_4g_mem_size = PCI_HOLE_START;
} else {
above_4g_mem_size = 0;
below_4g_mem_size = ram_size;
@@ -172,6 +210,9 @@ static void pc_init1(MemoryRegion *system_memory,
rom_memory = system_memory;
}
+ /* adjust memory map for hotplug dimms */
+ dimm_calc_offsets(pc_set_hp_memory_offset);
+
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
fw_cfg = pc_memory_init(system_memory,
@@ -192,18 +233,22 @@ static void pc_init1(MemoryRegion *system_memory,
if (pci_enabled) {
pci_bus = i440fx_init(&i440fx_state, &piix3_devfn, &isa_bus, gsi,
system_memory, system_io, ram_size,
- below_4g_mem_size,
- 0x100000000ULL - below_4g_mem_size,
- 0x100000000ULL + above_4g_mem_size,
+ below_4g_mem_size + below_4g_hp_mem_size,
+ 0x100000000ULL - below_4g_mem_size
+ - below_4g_hp_mem_size,
+ 0x100000000ULL + above_4g_mem_size
+ + above_4g_hp_mem_size,
(sizeof(target_phys_addr_t) == 4
? 0
: ((uint64_t)1 << 62)),
pci_memory, ram_memory);
+ main_memory_bus_create(object_resolve_path("i440fx", NULL));
} else {
pci_bus = NULL;
i440fx_state = NULL;
isa_bus = isa_bus_new(NULL, system_io);
no_hpet = 1;
+ main_memory_bus_create(object_resolve_path("isabus-bridge", NULL));
}
isa_bus_irqs(isa_bus, gsi);
--
1.7.9
Vasilis Liaskovitis
2012-09-21 11:17:27 UTC
Permalink
Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method.
This patch implements a tail queue to store guest notifications for memory
hot-add and hot-remove requests.

Guest responses to memory hotplug commands can be detected on a per-dimm basis
with the new hmp command "info memory-hotplug" or the new qmp command
"query-memory-hotplug".
Examples:

(qemu) device_add dimm,id=ram0
(qemu) info memory-hotplug
dimm: ram0 hot-add success
or
dimm: ram0 hot-add failure

(qemu) device_del ram3
(qemu) info memory-hotplug
dimm: ram3 hot-remove success
or
dimm: ram3 hot-remove failure

Results are removed from the queue once read.

This patch only queues _EJ events that signal hot-remove success.
For _OST event queuing, which covers the hot-remove failure and
hot-add success/failure cases, the _OST patches in this series are also
needed.

These notification items should probably be part of migration state (not yet
implemented).

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hmp-commands.hx | 2 +
hmp.c | 17 ++++++++++++++
hmp.h | 1 +
hw/dimm.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
hw/dimm.h | 2 +-
monitor.c | 7 ++++++
qapi-schema.json | 26 ++++++++++++++++++++++
qmp-commands.hx | 37 ++++++++++++++++++++++++++++++++
8 files changed, 152 insertions(+), 2 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index ed67e99..cfb1b67 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1462,6 +1462,8 @@ show device tree
show qdev device model list
@item info roms
show roms
+@item info memory-hotplug
+show memory-hotplug
@end table
ETEXI

diff --git a/hmp.c b/hmp.c
index ba6fbd3..4b3d63d 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1168,3 +1168,20 @@ void hmp_screen_dump(Monitor *mon, const QDict *qdict)
qmp_screendump(filename, &err);
hmp_handle_error(mon, &err);
}
+
+void hmp_info_memory_hotplug(Monitor *mon)
+{
+ MemHpInfoList *info;
+ MemHpInfoList *item;
+ MemHpInfo *dimm;
+
+ info = qmp_query_memory_hotplug(NULL);
+ for (item = info; item; item = item->next) {
+ dimm = item->value;
+ monitor_printf(mon, "dimm: %s %s %s\n", dimm->dimm,
+ dimm->request, dimm->result);
+ dimm->dimm = NULL;
+ }
+
+ qapi_free_MemHpInfoList(info);
+}
diff --git a/hmp.h b/hmp.h
index 48b9c59..986705a 100644
--- a/hmp.h
+++ b/hmp.h
@@ -73,5 +73,6 @@ void hmp_getfd(Monitor *mon, const QDict *qdict);
void hmp_closefd(Monitor *mon, const QDict *qdict);
void hmp_send_key(Monitor *mon, const QDict *qdict);
void hmp_screen_dump(Monitor *mon, const QDict *qdict);
+void hmp_info_memory_hotplug(Monitor *mon);

#endif
diff --git a/hw/dimm.c b/hw/dimm.c
index 288b997..fbd93a8 100644
--- a/hw/dimm.c
+++ b/hw/dimm.c
@@ -65,6 +65,7 @@ static void dimm_bus_initfn(Object *obj)
DimmBus *bus = DIMM_BUS(obj);
QTAILQ_INIT(&bus->dimmconfig_list);
QTAILQ_INIT(&bus->dimmlist);
+ QTAILQ_INIT(&bus->dimm_hp_result_queue);

QTAILQ_FOREACH_SAFE(dimm_cfg, &dimmconfig_list, nextdimmcfg, next_dimm_cfg) {
QTAILQ_REMOVE(&dimmconfig_list, dimm_cfg, nextdimmcfg);
@@ -236,20 +237,78 @@ void dimm_notify(uint32_t idx, uint32_t event)
{
DimmBus *bus = main_memory_bus;
DimmDevice *s;
+ DimmConfig *slotcfg;
+ struct dimm_hp_result *result;
+
s = dimm_find_from_idx(idx);
assert(s != NULL);
+ result = g_malloc0(sizeof(*result));
+ slotcfg = dimmcfg_find_from_name(DEVICE(s)->id);
+ result->dimmname = slotcfg->name;

switch(event) {
case DIMM_REMOVE_SUCCESS:
dimm_depopulate(s);
- qdev_simple_unplug_cb((DeviceState*)s);
QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
+ qdev_simple_unplug_cb((DeviceState*)s);
+ QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
break;
default:
+ g_free(result);
break;
}
}

+MemHpInfoList *qmp_query_memory_hotplug(Error **errp)
+{
+ DimmBus *bus = main_memory_bus;
+ MemHpInfoList *head = NULL, *cur_item = NULL, *info;
+ struct dimm_hp_result *item, *nextitem;
+
+ QTAILQ_FOREACH_SAFE(item, &bus->dimm_hp_result_queue, next, nextitem) {
+
+ info = g_malloc0(sizeof(*info));
+ info->value = g_malloc0(sizeof(*info->value));
+ info->value->dimm = g_malloc0(sizeof(char) * 32);
+ info->value->request = g_malloc0(sizeof(char) * 16);
+ info->value->result = g_malloc0(sizeof(char) * 16);
+ switch (item->ret) {
+ case DIMM_REMOVE_SUCCESS:
+ strcpy(info->value->request, "hot-remove");
+ strcpy(info->value->result, "success");
+ break;
+ case DIMM_REMOVE_FAIL:
+ strcpy(info->value->request, "hot-remove");
+ strcpy(info->value->result, "failure");
+ break;
+ case DIMM_ADD_SUCCESS:
+ strcpy(info->value->request, "hot-add");
+ strcpy(info->value->result, "success");
+ break;
+ case DIMM_ADD_FAIL:
+ strcpy(info->value->request, "hot-add");
+ strcpy(info->value->result, "failure");
+ break;
+ default:
+ break;
+ }
+ strcpy(info->value->dimm, item->dimmname);
+ /* XXX: waiting for the qapi to support GSList */
+ if (!cur_item) {
+ head = cur_item = info;
+ } else {
+ cur_item->next = info;
+ cur_item = info;
+ }
+
+ /* hotplug notification copied to qmp list, delete original item */
+ QTAILQ_REMOVE(&bus->dimm_hp_result_queue, item, next);
+ g_free(item);
+ }
+
+ return head;
+}
+
static int dimm_init(DeviceState *s)
{
DimmBus *bus = main_memory_bus;
@@ -286,6 +345,7 @@ static void dimm_class_init(ObjectClass *klass, void *data)

dc->props = dimm_properties;
dc->unplug = dimm_unplug_device;
+ dc->bus_type = TYPE_DIMM_BUS;
dc->init = dimm_init;
}

diff --git a/hw/dimm.h b/hw/dimm.h
index 5e991a6..95251ba 100644
--- a/hw/dimm.h
+++ b/hw/dimm.h
@@ -69,6 +69,7 @@ typedef struct DimmBus {
dimm_calcoffset_fn dimm_calcoffset;
DimmConfiglist dimmconfig_list;
QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
+ QTAILQ_HEAD(dimm_hp_result_head, dimm_hp_result) dimm_hp_result_queue;
} DimmBus;

struct dimm_hp_result {
@@ -86,5 +87,4 @@ void main_memory_bus_create(Object *parent);
void dimm_config_create(char *id, uint64_t size, uint64_t node,
uint32_t dimm_idx, uint32_t populated);

-
#endif
diff --git a/monitor.c b/monitor.c
index 67064e2..be9a1d9 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2740,6 +2740,13 @@ static mon_cmd_t info_cmds[] = {
.mhandler.info = do_trace_print_events,
},
{
+ .name = "memory-hotplug",
+ .args_type = "",
+ .params = "",
+ .help = "show memory hotplug status",
+ .mhandler.info = hmp_info_memory_hotplug,
+ },
+ {
.name = NULL,
},
};
diff --git a/qapi-schema.json b/qapi-schema.json
index a9f465a..3706a2a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2555,3 +2555,29 @@
# Since: 0.14.0
##
{ 'command': 'screendump', 'data': {'filename': 'str'} }
+
+##
+# @MemHpInfo:
+#
+# Information about status of a memory hotplug command
+#
+# @dimm: the Dimm associated with the result
+#
+# @result: the result of the hotplug command
+#
+# Since: 1.3
+#
+##
+{ 'type': 'MemHpInfo',
+ 'data': {'dimm': 'str', 'request': 'str', 'result': 'str'} }
+
+##
+# @query-memory-hotplug:
+#
+# Returns a list of information about pending hotplug commands
+#
+# Returns: a list of @MemHpInfo
+#
+# Since: 1.3
+##
+{ 'command': 'query-memory-hotplug', 'returns': ['MemHpInfo'] }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 6e21ddb..e50dcc2 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2539,3 +2539,40 @@ EQMP
.args_type = "",
.mhandler.cmd_new = qmp_marshal_input_query_target,
},
+ {
+ .name = "query-memory-hotplug",
+ .args_type = "",
+ .mhandler.cmd_new = qmp_marshal_input_query_memory_hotplug
+ },
+SQMP
+query-memory-hotplug
+----------
+
+Show memory hotplug command notifications.
+
+Return a json-array. Each DIMM that has a pending notification is represented
+by a json-object, which contains:
+
+- "dimm": Dimm name (json-str)
+- "request": type of hot request: hot-add or hot-remove (json-str)
+- "result": result of the hotplug request for this Dimm success or failure (json-str)
+
+Example:
+
+-> { "execute": "query-memory-hotplug" }
+<- {
+ "return":[
+ {
+ "result": "failure",
+ "request": "hot-remove",
+ "dimm": "dimm10"
+ },
+ {
+ "result": "success",
+ "request": "hot-add",
+ "dimm": "dimm3"
+ }
+ ]
+ }
+
+EQMP
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Blake
2012-09-21 22:03:26 UTC
Permalink
Post by Vasilis Liaskovitis
Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method.
This patch implements a tail queue to store guest notifications for memory
hot-add and hot-remove requests.
Guest responses for memory hotplug command on a per-dimm basis can be detected
with the new hmp command "info memhp" or the new qmp command "query-memhp"
Naming doesn't match the QMP code.
Post by Vasilis Liaskovitis
(qemu) device_add dimm,id=ram0
These notification items should probably be part of migration state (not yet
implemented).
In the case of libvirt driving migration, you already said in 10/19 that
libvirt has to start the destination with the populated=on|off fields
correct for each dimm according to the state it was in at the time the
host started the update. Can the host hot unplug memory after migration
has started?
Post by Vasilis Liaskovitis
+
+##
+#
+# Information about status of a memory hotplug command
+#
+#
+#
+# Since: 1.3
+#
+##
+{ 'type': 'MemHpInfo',
+ 'data': {'dimm': 'str', 'request': 'str', 'result': 'str'} }
Should 'result' be a bool (true for success, false for still pending) or
an enum, instead of a free-form string? Likewise, isn't 'request' going
to be exactly one of two values (plug or unplug)?
--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
Vasilis Liaskovitis
2012-09-24 14:45:02 UTC
Permalink
Hi,
Post by Eric Blake
Post by Vasilis Liaskovitis
Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method.
This patch implements a tail queue to store guest notifications for memory
hot-add and hot-remove requests.
Guest responses for memory hotplug command on a per-dimm basis can be detected
with the new hmp command "info memhp" or the new qmp command "query-memhp"
Naming doesn't match the QMP code.
will fix.
Post by Eric Blake
Post by Vasilis Liaskovitis
(qemu) device_add dimm,id=ram0
These notification items should probably be part of migration state (not yet
implemented).
In the case of libvirt driving migration, you already said in 10/19 that
libvirt has to start the destination with the populated=on|off fields
correct for each dimm according to the state it was in at the time the
That patch actually alleviates this restriction for the off->on direction i.e.
it allows for the target-VM to not have its args updated for dimm hot-add.
(e.g. Let's say the source was started with a dimm, initially off. The dimm is
hot-plugged, and then migrated. With patch 10/19, the populated arg doesn't
have to be updated on the target)
The other direction (on->off) still needs correct arg change.

If libvirt/management layers guarantee the dimm arguments are correctly changed,
I don't see that we need 10/19 patch eventually.

What I think is needed is another hmp/qmp command, that will report
which dimms are on/off at any given time e.g.

(monitor) info memory-hotplug

dimm0: off
dimm1: on
...
dimmN: off

This can be used on the source by libvirt / other layers to find out the
populated dimms, and construct the correct command line on the destination.
Does this make sense to you?

The current patch only deals with success/failure event notifications (not
on-off state of dimms) and should probably be renamed to
"query-memory-hotplug-events".
Post by Eric Blake
host started the update. Can the host hot unplug memory after migration
has started?
Good testcase. I would rather not allow any hotplug operations while the migration
is happening.

What do we do with pci hotplug during migration currently? I found a discussion
dating from a year ago, suggesting the same as the simplest solution, but I
don't know what's currently implemented.
http://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg01204.html
Post by Eric Blake
Post by Vasilis Liaskovitis
+
+##
+#
+# Information about status of a memory hotplug command
+#
+#
+#
+# Since: 1.3
+#
+##
+{ 'type': 'MemHpInfo',
+ 'data': {'dimm': 'str', 'request': 'str', 'result': 'str'} }
Should 'result' be a bool (true for success, false for still pending) or
an enum, instead of a free-form string? Likewise, isn't 'request' going
to be exactly one of two values (plug or unplug)?
agreed with 'request'.

For 'result' it is also a boolean, but with 'success' and 'failure' (rather than
'pending'). Items are only queued when the guest has given us a definite _OST
or _EJ result which is either success or fail. If an operation is pending, nothing
is queued here.

Perhaps queueing pending operations also has a usecase, but this isn't addressed
in this patch.

thanks,

- Vasilis

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi
2012-10-23 12:15:12 UTC
Permalink
Post by Vasilis Liaskovitis
+MemHpInfoList *qmp_query_memory_hotplug(Error **errp)
+{
+ DimmBus *bus = main_memory_bus;
+ MemHpInfoList *head = NULL, *cur_item = NULL, *info;
+ struct dimm_hp_result *item, *nextitem;
+
+ QTAILQ_FOREACH_SAFE(item, &bus->dimm_hp_result_queue, next, nextitem) {
+
+ info = g_malloc0(sizeof(*info));
+ info->value = g_malloc0(sizeof(*info->value));
+ info->value->dimm = g_malloc0(sizeof(char) * 32);
+ info->value->request = g_malloc0(sizeof(char) * 16);
+ info->value->result = g_malloc0(sizeof(char) * 16);
+ switch (item->ret) {
+ strcpy(info->value->request, "hot-remove");
+ strcpy(info->value->result, "success");
+ break;
+ strcpy(info->value->request, "hot-remove");
+ strcpy(info->value->result, "failure");
+ break;
+ strcpy(info->value->request, "hot-add");
+ strcpy(info->value->result, "success");
+ break;
+ strcpy(info->value->request, "hot-add");
+ strcpy(info->value->result, "failure");
+ break;
+ break;
+ }
Any reason to use fixed-size malloc + strcpy() instead of just
info->value->X = g_strdup("foo")?

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:33 UTC
Permalink
Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/acpi-dsdt.dsl | 15 +++++++++++++++
src/ssdt-mem.dsl | 4 ++++
2 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl
index 0d37bbc..8a18770 100644
--- a/src/acpi-dsdt.dsl
+++ b/src/acpi-dsdt.dsl
@@ -784,6 +784,13 @@ DefinitionBlock (
MIF, 8
}

+ /* Memory _PS3 byte */
+ OperationRegion(MPSB, SystemIO, 0xafa4, 1)
+ Field (MPSB, ByteAcc, NoLock, Preserve)
+ {
+ MPS, 8
+ }
+
Method(MESC, 0) {
// Local5 = active memdevice bitmap
Store (MES, Local5)
@@ -824,6 +831,14 @@ DefinitionBlock (
Store(Arg0, MPE)
Sleep(200)
}
+
+ Method (MPS3, 1, NotSerialized) {
+ // _PS3 method - power-off method
+ Store(Arg0, MPS)
+ Store(Zero, Index(MEON, Arg0))
+ Sleep(200)
+ }
+
Method (MOST, 3, Serialized) {
// _OST method - OS status indication
Switch (And(Arg0, 0xFF)) {
diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl
index 041d301..7423fc6 100644
--- a/src/ssdt-mem.dsl
+++ b/src/ssdt-mem.dsl
@@ -39,6 +39,7 @@ DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1)
External(CMST, MethodObj)
External(MPEJ, MethodObj)
External(MOST, MethodObj)
+ External(MPS3, MethodObj)

Name(_CRS, ResourceTemplate() {
QwordMemory(
@@ -64,6 +65,9 @@ DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1)
Method (_OST, 3) {
MOST(Arg0, Arg1, ID)
}
+ Method (_PS3, 0) {
+ MPS3(ID)
+ }
}
}
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:34 UTC
Permalink
This will allow us to update dimm state on OSPM-initiated eject operations e.g.
with "echo 1 > /sys/bus/acpi/devices/PNP0C80\:00/eject"

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
docs/specs/acpi_hotplug.txt | 7 +++++++
hw/acpi_piix4.c | 5 +++++
hw/dimm.c | 3 +++
hw/dimm.h | 3 ++-
4 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt
index 536da16..69868fe 100644
--- a/docs/specs/acpi_hotplug.txt
+++ b/docs/specs/acpi_hotplug.txt
@@ -45,3 +45,10 @@ insertion failed.
Written by ACPI memory device _OST method to notify qemu of failed
hot-add. Write-only.

+Memory Dimm _PS3 power-off initiated by OSPM (IO port 0xafa4, 1-byte access):
+---------------------------------------------------------------
+Dimm power-off (_PS3) initiated by OSPM. Byte value indicates Dimm slot which
+entered D3 state.
+
+Written by ACPI memory device _PS3 method to notify qemu of power-off state for
+the dimm. Write-only.
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index 8bf58a6..aad78ca 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -52,6 +52,7 @@
#define MEM_OST_REMOVE_FAIL 0xafa1
#define MEM_OST_ADD_SUCCESS 0xafa2
#define MEM_OST_ADD_FAIL 0xafa3
+#define MEM_PS3 0xafa4

#define PIIX4_MEM_HOTPLUG_STATUS 8
#define PIIX4_PCI_HOTPLUG_STATUS 2
@@ -545,6 +546,9 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
case MEM_OST_ADD_FAIL:
dimm_notify(val, DIMM_ADD_FAIL);
break;
+ case MEM_PS3:
+ dimm_notify(val, DIMM_OSPM_POWEROFF);
+ break;
default:
acpi_gpe_ioport_writeb(&s->ar, addr, val);
}
@@ -621,6 +625,7 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
register_ioport_write(MEM_OST_REMOVE_FAIL, 1, 1, gpe_writeb, s);
register_ioport_write(MEM_OST_ADD_SUCCESS, 1, 1, gpe_writeb, s);
register_ioport_write(MEM_OST_ADD_FAIL, 1, 1, gpe_writeb, s);
+ register_ioport_write(MEM_PS3, 1, 1, gpe_writeb, s);

for(i = 0; i < DIMM_BITMAP_BYTES; i++) {
s->gperegs.mems_sts[i] = 0;
diff --git a/hw/dimm.c b/hw/dimm.c
index b993668..08f66d5 100644
--- a/hw/dimm.c
+++ b/hw/dimm.c
@@ -319,6 +319,9 @@ void dimm_notify(uint32_t idx, uint32_t event)
qdev_simple_unplug_cb((DeviceState*)s);
QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
break;
+ case DIMM_OSPM_POWEROFF:
+ if (bus->dimm_revert)
+ bus->dimm_revert(bus->dimm_hotplug_qdev, s, 1);
default:
g_free(result);
break;
diff --git a/hw/dimm.h b/hw/dimm.h
index ce091fe..8d73b8f 100644
--- a/hw/dimm.h
+++ b/hw/dimm.h
@@ -15,7 +15,8 @@ typedef enum {
DIMM_REMOVE_SUCCESS = 0,
DIMM_REMOVE_FAIL = 1,
DIMM_ADD_SUCCESS = 2,
- DIMM_ADD_FAIL = 3
+ DIMM_ADD_FAIL = 3,
+ DIMM_OSPM_POWEROFF = 4
} dimm_hp_result_code;

typedef enum {
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:35 UTC
Permalink
pcimem_start and pcimem64_start are adjusted from srat entries. For this reason,
paravirt info (NUMA SRAT entries and number of cpus) need to be read before pci_setup.
Imho, this is an ugly code change since SRAT bios tables and number of
cpus have to be read earlier. But the advantage is that no new paravirt interface
is introduced. Suggestions to make the code change cleaner are welcome.

The alternative patch (will be sent as a reply to this patch) implements a
paravirt interface to read the starting values of pcimem_start and
pcimem64_start from QEMU.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/acpi.c | 82 ++++++++++++++++++++++++++++++++++++++++----------------
src/acpi.h | 3 ++
src/pciinit.c | 6 +++-
src/post.c | 3 ++
src/smp.c | 4 +++
5 files changed, 72 insertions(+), 26 deletions(-)

diff --git a/src/acpi.c b/src/acpi.c
index 1223b52..9e99aa7 100644
--- a/src/acpi.c
+++ b/src/acpi.c
@@ -428,7 +428,10 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes)
#define MEM_OFFSET_END 63
#define MEM_OFFSET_SIZE 79

-u64 nb_hp_memslots = 0;
+u64 nb_hp_memslots = 0, nb_numanodes;
+u64 *numa_data, *hp_memdata;
+u64 below_4g_hp_mem_size = 0;
+u64 above_4g_hp_mem_size = 0;
struct srat_memory_affinity *mem;

#define SSDT_SIGNATURE 0x54445353 // SSDT
@@ -763,17 +766,7 @@ acpi_build_srat_memory(struct srat_memory_affinity *numamem,
static void *
build_srat(void)
{
- int nb_numa_nodes = qemu_cfg_get_numa_nodes();
-
- u64 *numadata = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numa_nodes));
- if (!numadata) {
- warn_noalloc();
- return NULL;
- }
-
- qemu_cfg_get_numa_data(numadata, MaxCountCPUs + nb_numa_nodes);
-
- qemu_cfg_get_numa_data(&nb_hp_memslots, 1);
+ int nb_numa_nodes = nb_numanodes;
struct system_resource_affinity_table *srat;
int srat_size = sizeof(*srat) +
sizeof(struct srat_processor_affinity) * MaxCountCPUs +
@@ -782,7 +775,7 @@ build_srat(void)
srat = malloc_high(srat_size);
if (!srat) {
warn_noalloc();
- free(numadata);
+ free(numa_data);
return NULL;
}

@@ -791,6 +784,7 @@ build_srat(void)
struct srat_processor_affinity *core = (void*)(srat + 1);
int i;
u64 curnode;
+ u64 *numadata = numa_data;

for (i = 0; i < MaxCountCPUs; ++i) {
core->type = SRAT_PROCESSOR;
@@ -847,15 +841,7 @@ build_srat(void)
mem = (void*)numamem;

if (nb_hp_memslots) {
- u64 *hpmemdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots));
- if (!hpmemdata) {
- warn_noalloc();
- free(hpmemdata);
- free(numadata);
- return NULL;
- }
-
- qemu_cfg_get_numa_data(hpmemdata, 3 * nb_hp_memslots);
+ u64 *hpmemdata = hp_memdata;

for (i = 1; i < nb_hp_memslots + 1; ++i) {
mem_base = *hpmemdata++;
@@ -865,7 +851,7 @@ build_srat(void)
numamem++;
slots++;
}
- free(hpmemdata);
+ free(hp_memdata);
}

for (; slots < nb_numa_nodes + nb_hp_memslots + 2; slots++) {
@@ -875,10 +861,58 @@ build_srat(void)

build_header((void*)srat, SRAT_SIGNATURE, srat_size, 1);

- free(numadata);
+ free(numa_data);
return srat;
}

+/* QEMU paravirt SRAT entries need to be read in before pci initialization */
+void read_srat_early(void)
+{
+ int i;
+
+ nb_numanodes = qemu_cfg_get_numa_nodes();
+ u64 *hpmemdata;
+ u64 mem_len, mem_base;
+
+ numa_data = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numanodes));
+ if (!numa_data) {
+ warn_noalloc();
+ }
+
+ qemu_cfg_get_numa_data(numa_data, MaxCountCPUs + nb_numanodes);
+ qemu_cfg_get_numa_data(&nb_hp_memslots, 1);
+
+ if (nb_hp_memslots) {
+ hp_memdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots));
+ if (!hp_memdata) {
+ warn_noalloc();
+ free(hp_memdata);
+ free(numa_data);
+ }
+
+ qemu_cfg_get_numa_data(hp_memdata, 3 * nb_hp_memslots);
+ hpmemdata = hp_memdata;
+
+ for (i = 1; i < nb_hp_memslots + 1; ++i) {
+ mem_base = *hpmemdata++;
+ mem_len = *hpmemdata++;
+ hpmemdata++;
+ if (mem_base >= 0x100000000LL) {
+ above_4g_hp_mem_size += mem_len;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (mem_base + mem_len <= BUILD_PCIMEM_START) {
+ below_4g_hp_mem_size += mem_len;
+ }
+ /* otherwise place it above 4GB */
+ else {
+ above_4g_hp_mem_size += mem_len;
+ }
+ }
+
+ }
+}
+
static const struct pci_device_id acpi_find_tbl[] = {
/* PIIX4 Power Management device. */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, NULL),
diff --git a/src/acpi.h b/src/acpi.h
index cb21561..d29837f 100644
--- a/src/acpi.h
+++ b/src/acpi.h
@@ -5,6 +5,9 @@

void acpi_bios_init(void);
u32 find_resume_vector(void);
+void read_srat_early(void);
+extern u64 below_4g_hp_mem_size;
+extern u64 above_4g_hp_mem_size;

#define RSDP_SIGNATURE 0x2052545020445352LL // "RSD PTR "

diff --git a/src/pciinit.c b/src/pciinit.c
index 31115ee..c5a4b24 100644
--- a/src/pciinit.c
+++ b/src/pciinit.c
@@ -12,6 +12,7 @@
#include "ioport.h" // PORT_ATA1_CMD_BASE
#include "config.h" // CONFIG_*
#include "xen.h" // usingXen
+#include "acpi.h"

#define PCI_DEVICE_MEM_MIN 0x1000
#define PCI_BRIDGE_IO_MIN 0x1000
@@ -597,7 +598,7 @@ static void pci_region_map_entries(struct pci_bus *busses, struct pci_region *r)

static void pci_bios_map_devices(struct pci_bus *busses)
{
- pcimem_start = RamSize;
+ pcimem_start = RamSize + below_4g_hp_mem_size;

if (pci_bios_init_root_regions(busses)) {
struct pci_region r64_mem, r64_pref;
@@ -616,7 +617,8 @@ static void pci_bios_map_devices(struct pci_bus *busses)
u64 align_mem = pci_region_align(&r64_mem);
u64 align_pref = pci_region_align(&r64_pref);

- r64_mem.base = ALIGN(0x100000000LL + RamSizeOver4G, align_mem);
+ r64_mem.base = ALIGN(0x100000000LL + RamSizeOver4G +
+ above_4g_hp_mem_size, align_mem);
r64_pref.base = ALIGN(r64_mem.base + sum_mem, align_pref);
pcimem64_start = r64_mem.base;
pcimem64_end = r64_pref.base + sum_pref;
diff --git a/src/post.c b/src/post.c
index 924b311..c37730b 100644
--- a/src/post.c
+++ b/src/post.c
@@ -234,6 +234,9 @@ maininit(void)
// Initialize mtrr
mtrr_setup();

+ smp_get_ncpus();
+ read_srat_early();
+
// Initialize pci
pci_setup();
smm_init();
diff --git a/src/smp.c b/src/smp.c
index 4975412..3922776 100644
--- a/src/smp.c
+++ b/src/smp.c
@@ -138,7 +138,11 @@ smp_probe(void)

// Restore memory.
*(u64*)BUILD_AP_BOOT_ADDR = old;
+}

+void
+smp_get_ncpus(void)
+{
MaxCountCPUs = qemu_cfg_get_max_cpus();
if (!MaxCountCPUs || MaxCountCPUs < CountCPUs)
MaxCountCPUs = CountCPUs;
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:19:07 UTC
Permalink
Qemu already calculates the 32-bit and 64-bit PCI starting offsets based on
initial memory and hotplug-able dimms. This info needs to be passed to Seabios
for PCI initialization.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
docs/specs/fwcfg.txt | 9 +++++++++
hw/fw_cfg.h | 1 +
hw/pc_piix.c | 10 ++++++++++
3 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
index 55f96d9..d9fa215 100644
--- a/docs/specs/fwcfg.txt
+++ b/docs/specs/fwcfg.txt
@@ -26,3 +26,12 @@ Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms)
The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
the physical address offset, size (in bytes), and node proximity for the
respective dimm.
+
+FW_CFG_PCI_WINDOW paravirt info
+--------------------
+QEMU passes the starting address for the 32-bit and 64-bit PCI windows to BIOS.
+The following layouts are followed:
+
+--------------------------------
+pcimem32_start | pcimem64_start |
+--------------------------------
diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index 856bf91..6c8c151 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -27,6 +27,7 @@
#define FW_CFG_SETUP_SIZE 0x17
#define FW_CFG_SETUP_DATA 0x18
#define FW_CFG_FILE_DIR 0x19
+#define FW_CFG_PCI_WINDOW 0x1a

#define FW_CFG_FILE_FIRST 0x20
#define FW_CFG_FILE_SLOTS 0x10
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index d1fd276..034761f 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -44,6 +44,7 @@
#include "memory.h"
#include "exec-memory.h"
#include "dimm.h"
+#include "fw_cfg.h"
#ifdef CONFIG_XEN
# include <xen/hvm/hvm_info_table.h>
#endif
@@ -149,6 +150,7 @@ static void pc_init1(MemoryRegion *system_memory,
MemoryRegion *pci_memory;
MemoryRegion *rom_memory;
void *fw_cfg = NULL;
+ uint64_t *pci_window_fw_cfg;

pc_cpus_init(cpu_model);

@@ -205,6 +207,14 @@ static void pc_init1(MemoryRegion *system_memory,
? 0
: ((uint64_t)1 << 62)),
pci_memory, ram_memory);
+
+ pci_window_fw_cfg = g_malloc0(2 * 8);
+ pci_window_fw_cfg[0] = cpu_to_le64(below_4g_mem_size +
+ below_4g_hp_mem_size);
+ pci_window_fw_cfg[1] = cpu_to_le64(0x100000000ULL + above_4g_mem_size
+ + above_4g_hp_mem_size);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_PCI_WINDOW,
+ (uint8_t *)pci_window_fw_cfg, 2 * 8);
} else {
pci_bus = NULL;
i440fx_state = NULL;
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:20:06 UTC
Permalink
Initialize the 32-bit and 64-bit pci starting offsets from values passed in by
the qemu paravirt interface QEMU_CFG_PCI_WINDOW. Qemu calculates the starting
offsets based on initial memory and hotplug-able dimms.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/paravirt.c | 6 ++++++
src/paravirt.h | 2 ++
src/pciinit.c | 5 ++---
3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/paravirt.c b/src/paravirt.c
index 2a98d53..390ef30 100644
--- a/src/paravirt.c
+++ b/src/paravirt.c
@@ -346,3 +346,9 @@ void qemu_cfg_romfile_setup(void)
dprintf(3, "Found fw_cfg file: %s (size=%d)\n", file->name, file->size);
}
}
+
+void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start)
+{
+ qemu_cfg_read_entry(pcimem_start, QEMU_CFG_PCI_WINDOW, sizeof(u64));
+ qemu_cfg_read((u8*)(pcimem64_start), sizeof(u64));
+}
diff --git a/src/paravirt.h b/src/paravirt.h
index a284c41..b53ff88 100644
--- a/src/paravirt.h
+++ b/src/paravirt.h
@@ -35,6 +35,7 @@ static inline int kvm_para_available(void)
#define QEMU_CFG_BOOT_MENU 0x0e
#define QEMU_CFG_MAX_CPUS 0x0f
#define QEMU_CFG_FILE_DIR 0x19
+#define QEMU_CFG_PCI_WINDOW 0x1a
#define QEMU_CFG_ARCH_LOCAL 0x8000
#define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0)
#define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1)
@@ -65,5 +66,6 @@ struct e820_reservation {
u32 qemu_cfg_e820_entries(void);
void* qemu_cfg_e820_load_next(void *addr);
void qemu_cfg_romfile_setup(void);
+void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start);

#endif
diff --git a/src/pciinit.c b/src/pciinit.c
index 68f302a..64468a0 100644
--- a/src/pciinit.c
+++ b/src/pciinit.c
@@ -592,8 +592,7 @@ static void pci_region_map_entries(struct pci_bus *busses, struct pci_region *r)

static void pci_bios_map_devices(struct pci_bus *busses)
{
- pcimem_start = RamSize;
-
+ qemu_cfg_get_pci_offsets(&pcimem_start, &pcimem64_start);
if (pci_bios_init_root_regions(busses)) {
struct pci_region r64_mem, r64_pref;
r64_mem.list = NULL;
@@ -611,7 +610,7 @@ static void pci_bios_map_devices(struct pci_bus *busses)
u64 align_mem = pci_region_align(&r64_mem);
u64 align_pref = pci_region_align(&r64_pref);

- r64_mem.base = ALIGN(0x100000000LL + RamSizeOver4G, align_mem);
+ r64_mem.base = ALIGN(pcimem64_start, align_mem);
r64_pref.base = ALIGN(r64_mem.base + sum_mem, align_pref);
pcimem64_start = r64_mem.base;
pcimem64_end = r64_pref.base + sum_pref;
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wen Congyang
2012-09-24 06:35:30 UTC
Permalink
Post by Vasilis Liaskovitis
Initialize the 32-bit and 64-bit pci starting offsets from values passed in by
the qemu paravirt interface QEMU_CFG_PCI_WINDOW. Qemu calculates the starting
offsets based on initial memory and hotplug-able dimms.
This patch can't be applied if I apply the other patches for seabios. And I
don't find this patch in your tree.

Thanks
Wen Congyang
Post by Vasilis Liaskovitis
---
src/paravirt.c | 6 ++++++
src/paravirt.h | 2 ++
src/pciinit.c | 5 ++---
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/src/paravirt.c b/src/paravirt.c
index 2a98d53..390ef30 100644
--- a/src/paravirt.c
+++ b/src/paravirt.c
@@ -346,3 +346,9 @@ void qemu_cfg_romfile_setup(void)
dprintf(3, "Found fw_cfg file: %s (size=%d)\n", file->name, file->size);
}
}
+
+void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start)
+{
+ qemu_cfg_read_entry(pcimem_start, QEMU_CFG_PCI_WINDOW, sizeof(u64));
+ qemu_cfg_read((u8*)(pcimem64_start), sizeof(u64));
+}
diff --git a/src/paravirt.h b/src/paravirt.h
index a284c41..b53ff88 100644
--- a/src/paravirt.h
+++ b/src/paravirt.h
@@ -35,6 +35,7 @@ static inline int kvm_para_available(void)
#define QEMU_CFG_BOOT_MENU 0x0e
#define QEMU_CFG_MAX_CPUS 0x0f
#define QEMU_CFG_FILE_DIR 0x19
+#define QEMU_CFG_PCI_WINDOW 0x1a
#define QEMU_CFG_ARCH_LOCAL 0x8000
#define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0)
#define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1)
@@ -65,5 +66,6 @@ struct e820_reservation {
u32 qemu_cfg_e820_entries(void);
void* qemu_cfg_e820_load_next(void *addr);
void qemu_cfg_romfile_setup(void);
+void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start);
#endif
diff --git a/src/pciinit.c b/src/pciinit.c
index 68f302a..64468a0 100644
--- a/src/pciinit.c
+++ b/src/pciinit.c
@@ -592,8 +592,7 @@ static void pci_region_map_entries(struct pci_bus *busses, struct pci_region *r)
static void pci_bios_map_devices(struct pci_bus *busses)
{
- pcimem_start = RamSize;
-
+ qemu_cfg_get_pci_offsets(&pcimem_start, &pcimem64_start);
if (pci_bios_init_root_regions(busses)) {
struct pci_region r64_mem, r64_pref;
r64_mem.list = NULL;
@@ -611,7 +610,7 @@ static void pci_bios_map_devices(struct pci_bus *busses)
u64 align_mem = pci_region_align(&r64_mem);
u64 align_pref = pci_region_align(&r64_pref);
- r64_mem.base = ALIGN(0x100000000LL + RamSizeOver4G, align_mem);
+ r64_mem.base = ALIGN(pcimem64_start, align_mem);
r64_pref.base = ALIGN(r64_mem.base + sum_mem, align_pref);
pcimem64_start = r64_mem.base;
pcimem64_end = r64_pref.base + sum_pref;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-24 10:46:04 UTC
Permalink
Post by Wen Congyang
Post by Vasilis Liaskovitis
Initialize the 32-bit and 64-bit pci starting offsets from values passed in by
the qemu paravirt interface QEMU_CFG_PCI_WINDOW. Qemu calculates the starting
offsets based on initial memory and hotplug-able dimms.
This patch can't be applied if I apply the other patches for seabios. And I
don't find this patch in your tree.
to test these alternative patches, please try these trees:

https://github.com/vliaskov/seabios/commits/memhp-v3-alt
https://github.com/vliaskov/qemu-kvm/commits/memhp-v3-alt

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wen Congyang
2012-09-24 06:51:00 UTC
Permalink
pcimem_start and pcimem64_start are adjusted from srat entries. For this reason,
paravirt info (NUMA SRAT entries and number of cpus) need to be read before pci_setup.
Imho, this is an ugly code change since SRAT bios tables and number of
cpus have to be read earlier. But the advantage is that no new paravirt interface
is introduced. Suggestions to make the code change cleaner are welcome.

The alternative patch (will be sent as a reply to this patch) implements a
paravirt interface to read the starting values of pcimem_start and
pcimem64_start from QEMU.

com>
---
src/acpi.c | 82 ++++++++++++++++++++++++++++++++++++++++-------=
---------
src/acpi.h | 3 ++
src/pciinit.c | 6 +++-
src/post.c | 3 ++
src/smp.c | 4 +++
5 files changed, 72 insertions(+), 26 deletions(-)
=20
diff --git a/src/acpi.c b/src/acpi.c
index 1223b52..9e99aa7 100644
--- a/src/acpi.c
+++ b/src/acpi.c
@@ -428,7 +428,10 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes)
#define MEM_OFFSET_END 63
#define MEM_OFFSET_SIZE 79
=20
-u64 nb_hp_memslots =3D 0;
+u64 nb_hp_memslots =3D 0, nb_numanodes;
+u64 *numa_data, *hp_memdata;
+u64 below_4g_hp_mem_size =3D 0;
+u64 above_4g_hp_mem_size =3D 0;
struct srat_memory_affinity *mem;
=20
#define SSDT_SIGNATURE 0x54445353 // SSDT
@@ -763,17 +766,7 @@ acpi_build_srat_memory(struct srat_memory_affini=
ty *numamem,
static void *
build_srat(void)
{
- int nb_numa_nodes =3D qemu_cfg_get_numa_nodes();
-
- u64 *numadata =3D malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + n=
b_numa_nodes));
- if (!numadata) {
- warn_noalloc();
- return NULL;
- }
-
- qemu_cfg_get_numa_data(numadata, MaxCountCPUs + nb_numa_nodes);
-
- qemu_cfg_get_numa_data(&nb_hp_memslots, 1);
+ int nb_numa_nodes =3D nb_numanodes;
struct system_resource_affinity_table *srat;
int srat_size =3D sizeof(*srat) +
sizeof(struct srat_processor_affinity) * MaxCountCPUs +
@@ -782,7 +775,7 @@ build_srat(void)
srat =3D malloc_high(srat_size);
if (!srat) {
warn_noalloc();
- free(numadata);
+ free(numa_data);
return NULL;
}
=20
@@ -791,6 +784,7 @@ build_srat(void)
struct srat_processor_affinity *core =3D (void*)(srat + 1);
int i;
u64 curnode;
+ u64 *numadata =3D numa_data;
=20
for (i =3D 0; i < MaxCountCPUs; ++i) {
core->type =3D SRAT_PROCESSOR;
@@ -847,15 +841,7 @@ build_srat(void)
mem =3D (void*)numamem;
=20
if (nb_hp_memslots) {
- u64 *hpmemdata =3D malloc_tmphigh(sizeof(u64) * (3 * nb_hp_m=
emslots));
- if (!hpmemdata) {
- warn_noalloc();
- free(hpmemdata);
- free(numadata);
- return NULL;
- }
-
- qemu_cfg_get_numa_data(hpmemdata, 3 * nb_hp_memslots);
+ u64 *hpmemdata =3D hp_memdata;
=20
for (i =3D 1; i < nb_hp_memslots + 1; ++i) {
mem_base =3D *hpmemdata++;
@@ -865,7 +851,7 @@ build_srat(void)
numamem++;
slots++;
}
- free(hpmemdata);
+ free(hp_memdata);
}
=20
for (; slots < nb_numa_nodes + nb_hp_memslots + 2; slots++) {
@@ -875,10 +861,58 @@ build_srat(void)
=20
build_header((void*)srat, SRAT_SIGNATURE, srat_size, 1);
=20
- free(numadata);
+ free(numa_data);
return srat;
}
=20
+/* QEMU paravirt SRAT entries need to be read in before pci initiliz=
ation */
+void read_srat_early(void)
+{
+ int i;
+
+ nb_numanodes =3D qemu_cfg_get_numa_nodes();
+ u64 *hpmemdata;
+ u64 mem_len, mem_base;
+
+ numa_data =3D malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_nu=
manodes));
+ if (!numa_data) {
+ warn_noalloc();
+ }
+
+ qemu_cfg_get_numa_data(numa_data, MaxCountCPUs + nb_numanodes);
+ qemu_cfg_get_numa_data(&nb_hp_memslots, 1);
+
+ if (nb_hp_memslots) {
+ hp_memdata =3D malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memsl=
ots));
+ if (!hp_memdata) {
+ warn_noalloc();
+ free(hp_memdata);
+ free(numa_data);
+ }
+
+ qemu_cfg_get_numa_data(hp_memdata, 3 * nb_hp_memslots);
+ hpmemdata =3D hp_memdata;
+
+ for (i =3D 1; i < nb_hp_memslots + 1; ++i) {
+ mem_base =3D *hpmemdata++;
+ mem_len =3D *hpmemdata++;
+ hpmemdata++;
+ if (mem_base >=3D 0x100000000LL) {
+ above_4g_hp_mem_size +=3D mem_len;
+ }
+ /* if dimm fits before pci hole, append it normally */
+ else if (mem_base + mem_len <=3D BUILD_PCIMEM_START) {
+ below_4g_hp_mem_size +=3D mem_len;
+ }
+ /* otherwise place it above 4GB */
+ else {
+ above_4g_hp_mem_size +=3D mem_len;
+ }
+ }
+
+ }
+}
+
static const struct pci_device_id acpi_find_tbl[] =3D {
/* PIIX4 Power Management device. */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, N=
ULL),
diff --git a/src/acpi.h b/src/acpi.h
index cb21561..d29837f 100644
--- a/src/acpi.h
+++ b/src/acpi.h
@@ -5,6 +5,9 @@
=20
void acpi_bios_init(void);
u32 find_resume_vector(void);
+void read_srat_early(void);
+extern u64 below_4g_hp_mem_size;
+extern u64 above_4g_hp_mem_size;
=20
#define RSDP_SIGNATURE 0x2052545020445352LL // "RSD PTR "
=20
diff --git a/src/pciinit.c b/src/pciinit.c
index 31115ee..c5a4b24 100644
--- a/src/pciinit.c
+++ b/src/pciinit.c
@@ -12,6 +12,7 @@
#include "ioport.h" // PORT_ATA1_CMD_BASE
#include "config.h" // CONFIG_*
#include "xen.h" // usingXen
+#include "acpi.h"
=20
#define PCI_DEVICE_MEM_MIN 0x1000
#define PCI_BRIDGE_IO_MIN 0x1000
@@ -597,7 +598,7 @@ static void pci_region_map_entries(struct pci_bus=
*busses, struct pci_region *r)
=20
static void pci_bios_map_devices(struct pci_bus *busses)
{
- pcimem_start =3D RamSize;
+ pcimem_start =3D RamSize + below_4g_hp_mem_size;
=20
if (pci_bios_init_root_regions(busses)) {
struct pci_region r64_mem, r64_pref;
@@ -616,7 +617,8 @@ static void pci_bios_map_devices(struct pci_bus *=
busses)
u64 align_mem =3D pci_region_align(&r64_mem);
u64 align_pref =3D pci_region_align(&r64_pref);
=20
- r64_mem.base =3D ALIGN(0x100000000LL + RamSizeOver4G, align_=
mem);
+ r64_mem.base =3D ALIGN(0x100000000LL + RamSizeOver4G +
+ above_4g_hp_mem_size, align_mem);
r64_pref.base =3D ALIGN(r64_mem.base + sum_mem, align_pref);
pcimem64_start =3D r64_mem.base;
pcimem64_end =3D r64_pref.base + sum_pref;
diff --git a/src/post.c b/src/post.c
index 924b311..c37730b 100644
--- a/src/post.c
+++ b/src/post.c
@@ -234,6 +234,9 @@ maininit(void)
// Initialize mtrr
mtrr_setup();
=20
+ smp_get_ncpus();
+ read_srat_early();
+
// Initialize pci
pci_setup();
smm_init();
diff --git a/src/smp.c b/src/smp.c
index 4975412..3922776 100644
--- a/src/smp.c
+++ b/src/smp.c
@@ -138,7 +138,11 @@ smp_probe(void)
=20
// Restore memory.
*(u64*)BUILD_AP_BOOT_ADDR =3D old;
+}
=20
+void
+smp_get_ncpus(void)
You don't declare this function, but you use it in another file. It will break
the build:
src/post.c: In function 'maininit':
src/post.c:237: warning: implicit declaration of function 'smp_get_ncpus'
src/smp.c:144: note: previous definition of 'smp_get_ncpus' was here
src/post.c:237: error: incompatible implicit declaration of function 'smp_get_ncpus'
src/smp.c:144: note: previous definition of 'smp_get_ncpus' was here

Thanks
Wen Congyang
+{
MaxCountCPUs =3D qemu_cfg_get_max_cpus();
if (!MaxCountCPUs || MaxCountCPUs < CountCPUs)
MaxCountCPUs =3D CountCPUs;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:32 UTC
Permalink
In case of a hot-remove failure on a guest that does not implement _OST,
the dimm bitmaps in qemu and Seabios show the dimm as unplugged, but the dimm
is still present on the qdev/memory bus. To avoid this inconsistency, we set the
dimm state to active/hot-plugged on a reset of the associated acpi_pm device.
This way the dimm is still active after a VM reboot, and dimm visibility always
has the same behaviour, regardless of _OST support in the guest.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hw/acpi_piix4.c | 1 +
hw/dimm.c | 20 ++++++++++++++++++++
hw/dimm.h | 1 +
3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index f7220d4..8bf58a6 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -373,6 +373,7 @@ static void piix4_reset(void *opaque)
pci_conf[0x5B] = 0x02;
}
piix4_update_hotplug(s);
+ dimm_state_sync();
}

static void piix4_powerdown(void *opaque, int irq, int power_failing)
diff --git a/hw/dimm.c b/hw/dimm.c
index 1521462..b993668 100644
--- a/hw/dimm.c
+++ b/hw/dimm.c
@@ -182,6 +182,26 @@ static DimmDevice *dimm_find_from_idx(uint32_t idx)
return NULL;
}

+void dimm_state_sync(void)
+{
+ DimmBus *bus = main_memory_bus;
+ DimmDevice *slot;
+
+ /* if a hot-remove operation is pending on reset, it means the hot-remove
+ * operation has failed, but the guest hasn't notified us e.g. because the
+ * guest does not provide _OST notifications. The device is still present on
+ * the dimmbus, but the qemu and Seabios dimm bitmaps show this device as
+ * unplugged. To avoid this inconsistency, we set the dimm bits to active
+ * i.e. hot-plugged for each dimm present on the dimmbus.
+ */
+ QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
+ if (slot->pending == DIMM_REMOVE_PENDING) {
+ if (bus->dimm_revert)
+ bus->dimm_revert(bus->dimm_hotplug_qdev, slot, 0);
+ }
+ }
+}
+
/* used to create a dimm device, only on incoming migration of a hotplugged
* RAMBlock
*/
diff --git a/hw/dimm.h b/hw/dimm.h
index a6c6e6f..ce091fe 100644
--- a/hw/dimm.h
+++ b/hw/dimm.h
@@ -95,5 +95,6 @@ void main_memory_bus_create(Object *parent);
void dimm_config_create(char *id, uint64_t size, uint64_t node,
uint32_t dimm_idx, uint32_t populated);
uint64_t get_hp_memory_total(void);
+void dimm_state_sync(void);

#endif
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:31 UTC
Permalink
This allows qemu to receive notifications from the guest OS on success or
failure of a memory hotplug request. The guest OS needs to implement the _OST
functionality for this to work (linux-next: http://lkml.org/lkml/2012/6/25/321)

This patch also updates the dimm bitmap state and the hot-remove pending flag
on hot-remove failure. This allows failed hot operations to be retried at
any time. This only works for guests that use _OST notification.
Also adds new _OST registers in docs/specs/acpi_hotplug.txt

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
docs/specs/acpi_hotplug.txt | 25 +++++++++++++++++++++++++
hw/acpi_piix4.c | 35 ++++++++++++++++++++++++++++++++++-
hw/dimm.c | 28 +++++++++++++++++++++++++++-
hw/dimm.h | 10 +++++++++-
4 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt
index cf86242..536da16 100644
--- a/docs/specs/acpi_hotplug.txt
+++ b/docs/specs/acpi_hotplug.txt
@@ -20,3 +20,28 @@ ejected.

Written by ACPI memory device _EJ0 method to notify qemu of successfull
hot-removal. Write-only.
+
+Memory Dimm ejection failure notification (IO port 0xafa1, 1-byte access):
+---------------------------------------------------------------
+Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which
+ejection failed.
+
+Written by ACPI memory device _OST method to notify qemu of failed
+hot-removal. Write-only.
+
+Memory Dimm insertion success notification (IO port 0xafa2, 1-byte access):
+---------------------------------------------------------------
+Dimm hot-add _OST notification. Byte value indicates Dimm slot for which
+insertion succeeded.
+
+Written by ACPI memory device _OST method to notify qemu of successful
+hot-add. Write-only.
+
+Memory Dimm insertion failure notification (IO port 0xafa3, 1-byte access):
+---------------------------------------------------------------
+Dimm hot-add _OST notification. Byte value indicates Dimm slot for which
+insertion failed.
+
+Written by ACPI memory device _OST method to notify qemu of failed
+hot-add. Write-only.
+
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index 8776669..f7220d4 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -49,6 +49,9 @@
#define PCI_RMV_BASE 0xae0c
#define MEM_BASE 0xaf80
#define MEM_EJ_BASE 0xafa0
+#define MEM_OST_REMOVE_FAIL 0xafa1
+#define MEM_OST_ADD_SUCCESS 0xafa2
+#define MEM_OST_ADD_FAIL 0xafa3

#define PIIX4_MEM_HOTPLUG_STATUS 8
#define PIIX4_PCI_HOTPLUG_STATUS 2
@@ -87,6 +90,7 @@ typedef struct PIIX4PMState {
uint8_t s4_val;
} PIIX4PMState;

+static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add);
static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);

#define ACPI_ENABLE 0xf1
@@ -531,6 +535,15 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
case MEM_EJ_BASE:
dimm_notify(val, DIMM_REMOVE_SUCCESS);
break;
+ case MEM_OST_REMOVE_FAIL:
+ dimm_notify(val, DIMM_REMOVE_FAIL);
+ break;
+ case MEM_OST_ADD_SUCCESS:
+ dimm_notify(val, DIMM_ADD_SUCCESS);
+ break;
+ case MEM_OST_ADD_FAIL:
+ dimm_notify(val, DIMM_ADD_FAIL);
+ break;
default:
acpi_gpe_ioport_writeb(&s->ar, addr, val);
}
@@ -604,13 +617,16 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)

register_ioport_read(MEM_BASE, DIMM_BITMAP_BYTES, 1, gpe_readb, s);
register_ioport_write(MEM_EJ_BASE, 1, 1, gpe_writeb, s);
+ register_ioport_write(MEM_OST_REMOVE_FAIL, 1, 1, gpe_writeb, s);
+ register_ioport_write(MEM_OST_ADD_SUCCESS, 1, 1, gpe_writeb, s);
+ register_ioport_write(MEM_OST_ADD_FAIL, 1, 1, gpe_writeb, s);

for(i = 0; i < DIMM_BITMAP_BYTES; i++) {
s->gperegs.mems_sts[i] = 0;
}

pci_bus_hotplug(bus, piix4_device_hotplug, &s->dev.qdev);
- dimm_bus_hotplug(piix4_dimm_hotplug, &s->dev.qdev);
+ dimm_bus_hotplug(piix4_dimm_hotplug, piix4_dimm_revert, &s->dev.qdev);
}

static void enable_device(PIIX4PMState *s, int slot)
@@ -656,6 +672,23 @@ static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int
return 0;
}

+static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add)
+{
+ PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, qdev);
+ PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, pci_dev);
+ struct gpe_regs *g = &s->gperegs;
+ DimmDevice *slot = DIMM(dev);
+ int idx = slot->idx;
+
+ if (add) {
+ g->mems_sts[idx/8] &= ~(1 << (idx%8));
+ }
+ else {
+ g->mems_sts[idx/8] |= (1 << (idx%8));
+ }
+ return 0;
+}
+
static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
PCIHotplugState state)
{
diff --git a/hw/dimm.c b/hw/dimm.c
index 21626f6..1521462 100644
--- a/hw/dimm.c
+++ b/hw/dimm.c
@@ -126,12 +126,14 @@ void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t
QTAILQ_INSERT_TAIL(&dimmconfig_list, dimm_cfg, nextdimmcfg);
}

-void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev)
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, dimm_hotplug_fn revert,
+ DeviceState *qdev)
{
DimmBus *bus = main_memory_bus;
bus->qbus.allow_hotplug = 1;
bus->dimm_hotplug_qdev = qdev;
bus->dimm_hotplug = hotplug;
+ bus->dimm_revert = revert;
}

static void dimm_plug_device(DimmDevice *slot)
@@ -141,6 +143,7 @@ static void dimm_plug_device(DimmDevice *slot)
dimm_populate(slot);
if (bus->dimm_hotplug)
bus->dimm_hotplug(bus->dimm_hotplug_qdev, slot, 1);
+ slot->pending = DIMM_ADD_PENDING;
}

static int dimm_unplug_device(DeviceState *qdev)
@@ -149,6 +152,7 @@ static int dimm_unplug_device(DeviceState *qdev)

if (bus->dimm_hotplug)
bus->dimm_hotplug(bus->dimm_hotplug_qdev, DIMM(qdev), 0);
+ DIMM(qdev)->pending = DIMM_REMOVE_PENDING;
return 1;
}

@@ -266,12 +270,33 @@ void dimm_notify(uint32_t idx, uint32_t event)
result = g_malloc0(sizeof(*result));
slotcfg = dimmcfg_find_from_name(DEVICE(s)->id);
result->dimmname = slotcfg->name;
+ result->ret = event;

switch(event) {
case DIMM_REMOVE_SUCCESS:
dimm_depopulate(s);
QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
qdev_simple_unplug_cb((DeviceState*)s);
+ s->pending = DIMM_NO_PENDING;
+ QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
+ break;
+ case DIMM_REMOVE_FAIL:
+ s->pending = DIMM_NO_PENDING;
+ if (bus->dimm_revert)
+ bus->dimm_revert(bus->dimm_hotplug_qdev, s, 0);
+ QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
+ break;
+ case DIMM_ADD_SUCCESS:
+ s->pending = DIMM_NO_PENDING;
+ QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
+ break;
+ case DIMM_ADD_FAIL:
+ dimm_depopulate(s);
+ s->pending = DIMM_NO_PENDING;
+ if (bus->dimm_revert)
+ bus->dimm_revert(bus->dimm_hotplug_qdev, s, 1);
+ QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
+ qdev_simple_unplug_cb((DeviceState*)s);
QTAILQ_INSERT_TAIL(&bus->dimm_hp_result_queue, result, next);
break;
default:
@@ -352,6 +377,7 @@ static int dimm_init(DeviceState *s)
slot->start = slotcfg->start;
slot->size = slotcfg->size;
slot->node = slotcfg->node;
+ slot->pending = DIMM_NO_PENDING;

QTAILQ_INSERT_TAIL(&bus->dimmlist, slot, nextdimm);
dimm_plug_device(slot);
diff --git a/hw/dimm.h b/hw/dimm.h
index 21225be..4f696d8 100644
--- a/hw/dimm.h
+++ b/hw/dimm.h
@@ -18,6 +18,12 @@ typedef enum {
DIMM_ADD_FAIL = 3
} dimm_hp_result_code;

+typedef enum {
+ DIMM_NO_PENDING = 0,
+ DIMM_ADD_PENDING = 1,
+ DIMM_REMOVE_PENDING = 2,
+} dimm_hp_pending_code;
+
#define TYPE_DIMM "dimm"
#define DIMM(obj) \
OBJECT_CHECK(DimmDevice, (obj), TYPE_DIMM)
@@ -42,6 +48,7 @@ typedef struct DimmDevice {
ram_addr_t size;
uint32_t node; /* numa node proximity */
MemoryRegion *mr; /* MemoryRegion for this slot. !NULL only if populated */
+ dimm_hp_pending_code pending; /* indicates if a hot operation is pending for this dimm */
QTAILQ_ENTRY (DimmDevice) nextdimm;
} DimmDevice;

@@ -66,6 +73,7 @@ typedef struct DimmBus {
BusState qbus;
DeviceState *dimm_hotplug_qdev;
dimm_hotplug_fn dimm_hotplug;
+ dimm_hotplug_fn dimm_revert;
dimm_calcoffset_fn dimm_calcoffset;
DimmConfiglist dimmconfig_list;
QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
@@ -80,7 +88,7 @@ struct dimm_hp_result {

void dimm_calc_offsets(dimm_calcoffset_fn calcfn);
void dimm_notify(uint32_t idx, uint32_t event);
-void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev);
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, dimm_hotplug_fn revert, DeviceState *qdev);
void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots);
int dimm_add(char *id);
void main_memory_bus_create(Object *parent);
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:25 UTC
Permalink
The numa_fw_cfg paravirt interface is extended to include SRAT information for
all hotplug-able dimms. There are 3 words for each hotplug-able memory slot,
denoting start address, size and node proximity. The new info is appended after
existing numa info, so that the fw_cfg layout does not break. This information
is used by Seabios to build hotplug memory device objects at runtime.
nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info
to SeaBIOS.

v1->v2:
Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not
to break existing layout
Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++++
hw/pc.c | 14 ++++++++++++--
2 files changed, 40 insertions(+), 2 deletions(-)
create mode 100644 docs/specs/fwcfg.txt

diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
new file mode 100644
index 0000000..55f96d9
--- /dev/null
+++ b/docs/specs/fwcfg.txt
@@ -0,0 +1,28 @@
+QEMU<->BIOS Paravirt Documentation
+--------------------------------------
+
+This document describes paravirt data structures passed from QEMU to BIOS.
+
+FW_CFG_NUMA paravirt info
+--------------------
+The SRAT info passed from QEMU to BIOS has the following layout:
+
+-----------------------------------------------------------------------------------------------
+#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem
+
+-----------------------------------------------------------------------------------------------
+#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm
+
+Entry 0 contains the number of numa nodes (nb_numa_nodes).
+
+Entries 1..max_cpus: The next max_cpus entries describe node proximity for each
+one of the vCPUs in the system.
+
+Entries max_cpus+1..max_cpus+nb_numa_nodes: The next nb_numa_nodes entries
+describe the memory size for each one of the NUMA nodes in the system.
+
+Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms)
+
+The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
+the physical address offset, size (in bytes), and node proximity for the
+respective dimm.
diff --git a/hw/pc.c b/hw/pc.c
index 2c9664d..f2604ae 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -598,6 +598,7 @@ static void *bochs_bios_init(void)
uint8_t *smbios_table;
size_t smbios_len;
uint64_t *numa_fw_cfg;
+ uint64_t *hp_dimms_fw_cfg;
int i, j;

register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -632,8 +633,10 @@ static void *bochs_bios_init(void)
/* allocate memory for the NUMA channel: one (64bit) word for the number
* of nodes, one word for each VCPU->node and one word for each node to
* hold the amount of memory.
+ * Finally one word for the number of hotplug memory slots and three words
+ * for each hotplug memory slot (start address, size and node proximity).
*/
- numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+ numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
for (i = 0; i < max_cpus; i++) {
for (j = 0; j < nb_numa_nodes; j++) {
@@ -646,8 +649,15 @@ static void *bochs_bios_init(void)
for (i = 0; i < nb_numa_nodes; i++) {
numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
}
+
+ numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms);
+
+ hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+ if (nb_hp_dimms)
+ setup_fwcfg_hp_dimms(hp_dimms_fw_cfg);
+
fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
- (1 + max_cpus + nb_numa_nodes) * 8);
+ (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);

return fw_cfg;
}
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wen Congyang
2012-09-27 03:55:28 UTC
Permalink
Post by Vasilis Liaskovitis
The numa_fw_cfg paravirt interface is extended to include SRAT information for
all hotplug-able dimms. There are 3 words for each hotplug-able memory slot,
denoting start address, size and node proximity. The new info is appended after
existing numa info, so that the fw_cfg layout does not break. This information
is used by Seabios to build hotplug memory device objects at runtime.
nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info
to SeaBIOS.
You forgot to set nb_numa_nodes to 1...

Thanks
Wen Congyang
Post by Vasilis Liaskovitis
Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not
to break existing layout
Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt
---
docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++++
hw/pc.c | 14 ++++++++++++--
2 files changed, 40 insertions(+), 2 deletions(-)
create mode 100644 docs/specs/fwcfg.txt
diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
new file mode 100644
index 0000000..55f96d9
--- /dev/null
+++ b/docs/specs/fwcfg.txt
@@ -0,0 +1,28 @@
+QEMU<->BIOS Paravirt Documentation
+--------------------------------------
+
+This document describes paravirt data structures passed from QEMU to BIOS.
+
+FW_CFG_NUMA paravirt info
+--------------------
+
+-----------------------------------------------------------------------------------------------
+#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem
+
+-----------------------------------------------------------------------------------------------
+#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm
+
+Entry 0 contains the number of numa nodes (nb_numa_nodes).
+
+Entries 1..max_cpus: The next max_cpus entries describe node proximity for each
+one of the vCPUs in the system.
+
+Entries max_cpus+1..max_cpus+nb_numa_nodes+1: The next nb_numa_nodes entries
+describe the memory size for each one of the NUMA nodes in the system.
+
+Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms)
+
+The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains
+the physical address offset, size (in bytes), and node proximity for the
+respective dimm.
diff --git a/hw/pc.c b/hw/pc.c
index 2c9664d..f2604ae 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -598,6 +598,7 @@ static void *bochs_bios_init(void)
uint8_t *smbios_table;
size_t smbios_len;
uint64_t *numa_fw_cfg;
+ uint64_t *hp_dimms_fw_cfg;
int i, j;
register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -632,8 +633,10 @@ static void *bochs_bios_init(void)
/* allocate memory for the NUMA channel: one (64bit) word for the number
* of nodes, one word for each VCPU->node and one word for each node to
* hold the amount of memory.
+ * Finally one word for the number of hotplug memory slots and three words
+ * for each hotplug memory slot (start address, size and node proximity).
*/
- numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+ numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
for (i = 0; i < max_cpus; i++) {
for (j = 0; j < nb_numa_nodes; j++) {
@@ -646,8 +649,15 @@ static void *bochs_bios_init(void)
for (i = 0; i < nb_numa_nodes; i++) {
numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
}
+
+ numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms);
+
+ hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+ if (nb_hp_dimms)
+ setup_fwcfg_hp_dimms(hp_dimms_fw_cfg);
+
fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
- (1 + max_cpus + nb_numa_nodes) * 8);
+ (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
return fw_cfg;
}
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:26 UTC
Permalink
Live migration works after memory hot-add events, as long as the
qemu command line "-dimm" arguments are changed on the destination host
to specify "populated=on" for the dimms that have been hot-added.

If a command-line change has not occurred, the destination host does not yet
have the corresponding ramblock in its ram_list. Activate the dimm on the
destination during ram_load.

Perhaps several fields of the DimmDevice should be part of a
VMStateDescription to handle migration in a cleaner way. But the problem
is that ramblocks are checked before qdev vmstates.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
arch_init.c | 24 +++++++++++++++++++++---
1 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 5a1173e..b63caa7 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -45,6 +45,7 @@
#include "hw/pcspk.h"
#include "qemu/page_cache.h"
#include "qmp-commands.h"
+#include "hw/dimm.h"

#ifdef DEBUG_ARCH_INIT
#define DPRINTF(fmt, ...) \
@@ -740,10 +741,27 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
}

if (!block) {
- fprintf(stderr, "Unknown ramblock \"%s\", cannot "
+ /* this can happen if a dimm was hot-added at source host */
+ bool ramblock_found = false;
+ if (dimm_add(id)) {
+ fprintf(stderr, "Cannot add unknown ramblock \"%s\", "
+ "cannot accept migration\n", id);
+ ret = -EINVAL;
+ goto done;
+ }
+ /* rescan ram_list, verify ramblock is there now */
+ QLIST_FOREACH(block, &ram_list.blocks, next) {
+ if (!strncmp(id, block->idstr, sizeof(id))) {
+ ramblock_found = true;
+ break;
+ }
+ }
+ if (!ramblock_found) {
+ fprintf(stderr, "Unknown ramblock \"%s\", cannot "
"accept migration\n", id);
- ret = -EINVAL;
- goto done;
+ ret = -EINVAL;
+ goto done;
+ }
}

total_ram_bytes -= length;
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:30 UTC
Permalink
Add support for _OST method. _OST method will write into the correct I/O byte to
signal success / failure of hot-add or hot-remove to qemu.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
src/acpi-dsdt.dsl | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
src/ssdt-mem.dsl | 4 ++++
2 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl
index 5d3e92b..0d37bbc 100644
--- a/src/acpi-dsdt.dsl
+++ b/src/acpi-dsdt.dsl
@@ -762,6 +762,28 @@ DefinitionBlock (
MPE, 8
}

+
+ /* Memory hot-remove notify failure byte */
+ OperationRegion(MEEF, SystemIO, 0xafa1, 1)
+ Field (MEEF, ByteAcc, NoLock, Preserve)
+ {
+ MEF, 8
+ }
+
+ /* Memory hot-add notify success byte */
+ OperationRegion(MPIS, SystemIO, 0xafa2, 1)
+ Field (MPIS, ByteAcc, NoLock, Preserve)
+ {
+ MIS, 8
+ }
+
+ /* Memory hot-add notify failure byte */
+ OperationRegion(MPIF, SystemIO, 0xafa3, 1)
+ Field (MPIF, ByteAcc, NoLock, Preserve)
+ {
+ MIF, 8
+ }
+
Method(MESC, 0) {
// Local5 = active memdevice bitmap
Store (MES, Local5)
@@ -802,6 +824,34 @@ DefinitionBlock (
Store(Arg0, MPE)
Sleep(200)
}
+ Method (MOST, 3, Serialized) {
+ // _OST method - OS status indication
+ Switch (And(Arg0, 0xFF)) {
+ Case(0x3)
+ {
+ Switch(And(Arg1, 0xFF)) {
+ Case(0x1) {
+ Store(Arg2, MEF)
+ // Revert MEON flag for this memory device to one
+ Store(One, Index(MEON, Arg2))
+ }
+ }
+ }
+ Case(0x1)
+ {
+ Switch(And(Arg1, 0xFF)) {
+ Case(0x0) {
+ Store(Arg2, MIS)
+ }
+ Case(0x1) {
+ Store(Arg2, MIF)
+ // Revert MEON flag for this memory device to zero
+ Store(Zero, Index(MEON, Arg2))
+ }
+ }
+ }
+ }
+ }
}


diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl
index ee322f0..041d301 100644
--- a/src/ssdt-mem.dsl
+++ b/src/ssdt-mem.dsl
@@ -38,6 +38,7 @@ DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1)

External(CMST, MethodObj)
External(MPEJ, MethodObj)
+ External(MOST, MethodObj)

Name(_CRS, ResourceTemplate() {
QwordMemory(
@@ -60,6 +61,9 @@ DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1)
Method (_EJ0, 1, NotSerialized) {
MPEJ(ID, Arg0)
}
+ Method (_OST, 3) {
+ MOST(Arg0, Arg1, ID)
+ }
}
}
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:28 UTC
Permalink
Returns total physical memory available to guest in bytes, including hotplugged
memory. Note that the number reported here may be different from what the guest
sees e.g. if the guest has not logically onlined hotplugged memory.

This functionality is provided independently of a balloon device, since a
guest can be using ACPI memory hotplug without using a balloon device.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hmp-commands.hx | 2 ++
hmp.c | 7 +++++++
hmp.h | 1 +
hw/dimm.c | 21 +++++++++++++++++++++
hw/dimm.h | 1 +
monitor.c | 7 +++++++
qapi-schema.json | 11 +++++++++++
qmp-commands.hx | 20 ++++++++++++++++++++
8 files changed, 70 insertions(+), 0 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index cfb1b67..988d207 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1464,6 +1464,8 @@ show qdev device model list
show roms
@item info memory-hotplug
show memory-hotplug
+@item info memory-total
+show memory-total
@end table
ETEXI

diff --git a/hmp.c b/hmp.c
index 4b3d63d..cc31ddc 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1185,3 +1185,10 @@ void hmp_info_memory_hotplug(Monitor *mon)

qapi_free_MemHpInfoList(info);
}
+
+void hmp_info_memory_total(Monitor *mon)
+{
+ uint64_t ram_total;
+ ram_total = (uint64_t)qmp_query_memory_total(NULL);
+ monitor_printf(mon, "MemTotal: %lu \n", ram_total);
+}
diff --git a/hmp.h b/hmp.h
index 986705a..ab96dba 100644
--- a/hmp.h
+++ b/hmp.h
@@ -74,5 +74,6 @@ void hmp_closefd(Monitor *mon, const QDict *qdict);
void hmp_send_key(Monitor *mon, const QDict *qdict);
void hmp_screen_dump(Monitor *mon, const QDict *qdict);
void hmp_info_memory_hotplug(Monitor *mon);
+void hmp_info_memory_total(Monitor *mon);

#endif
diff --git a/hw/dimm.c b/hw/dimm.c
index fbd93a8..21626f6 100644
--- a/hw/dimm.c
+++ b/hw/dimm.c
@@ -28,6 +28,7 @@ static DimmBus *main_memory_bus;
/* the following list is used to hold dimm config info before machine
* initialization. After machine init, the list is emptied and not used anymore.*/
static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list);
+extern ram_addr_t ram_size;

static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent);
static char *dimmbus_get_fw_dev_path(DeviceState *dev);
@@ -233,6 +234,26 @@ void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots)
}
}

+uint64_t get_hp_memory_total(void)
+{
+ DimmBus *bus = main_memory_bus;
+ DimmDevice *slot;
+ uint64_t info = 0;
+
+ QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
+ info += slot->size;
+ }
+ return info;
+}
+
+int64_t qmp_query_memory_total(Error **errp)
+{
+ uint64_t info;
+ info = ram_size + get_hp_memory_total();
+
+ return (int64_t)info;
+}
+
void dimm_notify(uint32_t idx, uint32_t event)
{
DimmBus *bus = main_memory_bus;
diff --git a/hw/dimm.h b/hw/dimm.h
index 95251ba..21225be 100644
--- a/hw/dimm.h
+++ b/hw/dimm.h
@@ -86,5 +86,6 @@ int dimm_add(char *id);
void main_memory_bus_create(Object *parent);
void dimm_config_create(char *id, uint64_t size, uint64_t node,
uint32_t dimm_idx, uint32_t populated);
+uint64_t get_hp_memory_total(void);

#endif
diff --git a/monitor.c b/monitor.c
index be9a1d9..4f5ea60 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = {
.mhandler.info = hmp_info_memory_hotplug,
},
{
+ .name = "memory-total",
+ .args_type = "",
+ .params = "",
+ .help = "show total memory size",
+ .mhandler.info = hmp_info_memory_total,
+ },
+ {
.name = NULL,
},
};
diff --git a/qapi-schema.json b/qapi-schema.json
index 3706a2a..c1d2571 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2581,3 +2581,14 @@
# Since: 1.3
##
{ 'command': 'query-memory-hotplug', 'returns': ['MemHpInfo'] }
+
+##
+# @query-memory-total:
+#
+# Returns total memory in bytes, including hotplugged dimms
+#
+# Returns: int
+#
+# Since: 1.3
+##
+{ 'command': 'query-memory-total', 'returns': 'int' }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index e50dcc2..20b7eea 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2576,3 +2576,23 @@ Example:
}

EQMP
+
+ {
+ .name = "query-memory-total",
+ .args_type = "",
+ .mhandler.cmd_new = qmp_marshal_input_query_memory_total
+ },
+SQMP
+query-memory-total
+----------
+
+Return total memory in bytes, including hotplugged dimms
+
+Example:
+
+-> { "execute": "query-memory-total" }
+<- {
+ "return": 1073741824
+ }
+
+EQMP
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Blake
2012-09-21 22:36:25 UTC
Permalink
Post by Vasilis Liaskovitis
Returns total physical memory available to guest in bytes, including hotplugged
memory. Note that the number reported here may be different from what the guest
sees e.g. if the guest has not logically onlined hotplugged memory.
This functionality is provided independently of a balloon device, since a
guest can be using ACPI memory hotplug without using a balloon device.
+++ b/hmp-commands.hx
@@ -1464,6 +1464,8 @@ show qdev device model list
show roms
@item info memory-hotplug
show memory-hotplug
+show memory-total
+++ b/monitor.c
@@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = {
.mhandler.info = hmp_info_memory_hotplug,
},
{
+ .name = "memory-total",
We're split on HMP naming conventions ('-' vs. '_'); we have 'show
migrate_capabilities' but 'show block-jobs'. Oh well, that's life.

Reviewed-by: Eric Blake <***@redhat.com>
--
Eric Blake ***@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
Vasilis Liaskovitis
2012-09-21 11:17:29 UTC
Permalink
query-balloon and "info balloon" should report total memory available to the
guest.

Balloon inflate/deflate can also use all memory available to the guest (initial
+ hotplugged memory).

The balloon driver has been minimally tested with the patch; please review and test.

Caveat: if the guest does not online hotplugged-memory, it's easy for a balloon
inflate command to OOM a guest.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hw/virtio-balloon.c | 13 +++++++++----
1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
index dd1a650..bca21bc 100644
--- a/hw/virtio-balloon.c
+++ b/hw/virtio-balloon.c
@@ -22,6 +22,7 @@
#include "virtio-balloon.h"
#include "kvm.h"
#include "exec-memory.h"
+#include "dimm.h"

#if defined(__linux__)
#include <sys/mman.h>
@@ -147,10 +148,11 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
VirtIOBalloon *dev = to_virtio_balloon(vdev);
struct virtio_balloon_config config;
uint32_t oldactual = dev->actual;
+ uint64_t hotplugged_ram_size = get_hp_memory_total();
memcpy(&config, config_data, 8);
dev->actual = le32_to_cpu(config.actual);
if (dev->actual != oldactual) {
- qemu_balloon_changed(ram_size -
+ qemu_balloon_changed(ram_size + hotplugged_ram_size -
(dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
}
}
@@ -188,17 +190,20 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info)

info->actual = ram_size - ((uint64_t) dev->actual <<
VIRTIO_BALLOON_PFN_SHIFT);
+ info->actual += get_hp_memory_total();
}

static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
{
VirtIOBalloon *dev = opaque;
+ uint64_t hotplugged_ram_size = get_hp_memory_total();

- if (target > ram_size) {
- target = ram_size;
+ if (target > ram_size + hotplugged_ram_size) {
+ target = ram_size + hotplugged_ram_size;
}
if (target) {
- dev->num_pages = (ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
+ dev->num_pages = (ram_size + hotplugged_ram_size - target) >>
+ VIRTIO_BALLOON_PFN_SHIFT;
virtio_notify_config(&dev->vdev);
}
}
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:21 UTC
Permalink
Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached
to a new bus called DimmBus. This bus is introduced so that we no longer
depend on hotplug-capability of main system bus (the main bus does not allow
hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case
of the pc)

A hot-add operation for a particular dimm:
- creates a new DimmDevice and attaches it to the DimmBus
- creates a new MemoryRegion of the given physical address offset, size and
node proximity, and attaches it to main system memory as a sub_region.

A successful hot-remove operation detaches and frees the MemoryRegion from
system memory, and removes the DimmDevice from the DimmBus.

Hotplug operations are done through normal device_add /device_del commands.
Also add properties to DimmDevice.

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
hw/dimm.c | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
hw/dimm.h | 90 ++++++++++++++++++
2 files changed, 395 insertions(+), 0 deletions(-)
create mode 100644 hw/dimm.c
create mode 100644 hw/dimm.h

diff --git a/hw/dimm.c b/hw/dimm.c
new file mode 100644
index 0000000..288b997
--- /dev/null
+++ b/hw/dimm.c
@@ -0,0 +1,305 @@
+/*
+ * Dimm device for Memory Hotplug
+ *
+ * Copyright ProfitBricks GmbH 2012
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "trace.h"
+#include "qdev.h"
+#include "dimm.h"
+#include <time.h>
+#include "../exec-memory.h"
+#include "qmp-commands.h"
+
+/* the system-wide memory bus. */
+static DimmBus *main_memory_bus;
+/* the following list is used to hold dimm config info before machine
+ * initialization. After machine init, the list is emptied and not used anymore.*/
+static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list);
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent);
+static char *dimmbus_get_fw_dev_path(DeviceState *dev);
+
+static Property dimm_properties[] = {
+ DEFINE_PROP_UINT64("start", DimmDevice, start, 0),
+ DEFINE_PROP_UINT64("size", DimmDevice, size, DEFAULT_DIMMSIZE),
+ DEFINE_PROP_UINT32("node", DimmDevice, node, 0),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent)
+{
+}
+
+static char *dimmbus_get_fw_dev_path(DeviceState *dev)
+{
+ char path[40];
+
+ snprintf(path, sizeof(path), "%s", qdev_fw_name(dev));
+ return strdup(path);
+}
+
+static void dimm_bus_class_init(ObjectClass *klass, void *data)
+{
+ BusClass *k = BUS_CLASS(klass);
+
+ k->print_dev = dimmbus_dev_print;
+ k->get_fw_dev_path = dimmbus_get_fw_dev_path;
+}
+
+static void dimm_bus_initfn(Object *obj)
+{
+ DimmConfig *dimm_cfg, *next_dimm_cfg;
+ DimmBus *bus = DIMM_BUS(obj);
+ QTAILQ_INIT(&bus->dimmconfig_list);
+ QTAILQ_INIT(&bus->dimmlist);
+
+ QTAILQ_FOREACH_SAFE(dimm_cfg, &dimmconfig_list, nextdimmcfg, next_dimm_cfg) {
+ QTAILQ_REMOVE(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+ QTAILQ_INSERT_TAIL(&bus->dimmconfig_list, dimm_cfg, nextdimmcfg);
+ }
+}
+
+static const TypeInfo dimm_bus_info = {
+ .name = TYPE_DIMM_BUS,
+ .parent = TYPE_BUS,
+ .instance_size = sizeof(DimmBus),
+ .instance_init = dimm_bus_initfn,
+ .class_init = dimm_bus_class_init,
+};
+
+void main_memory_bus_create(Object *parent)
+{
+ main_memory_bus = g_malloc0(dimm_bus_info.instance_size);
+ main_memory_bus->qbus.glib_allocated = true;
+ qbus_create_inplace(&main_memory_bus->qbus, TYPE_DIMM_BUS, DEVICE(parent),
+ "membus");
+}
+
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
+
+void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t
+ dimm_idx, uint32_t populated)
+{
+ DimmConfig *dimm_cfg;
+ dimm_cfg = (DimmConfig*) g_malloc0(sizeof(DimmConfig));
+ dimm_cfg->name = id;
+ dimm_cfg->idx = dimm_idx;
+ dimm_cfg->start = 0;
+ dimm_cfg->size = size;
+ dimm_cfg->node = node;
+ dimm_cfg->populated = populated;
+
+ QTAILQ_INSERT_TAIL(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+}
+
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev)
+{
+ DimmBus *bus = main_memory_bus;
+ bus->qbus.allow_hotplug = 1;
+ bus->dimm_hotplug_qdev = qdev;
+ bus->dimm_hotplug = hotplug;
+}
+
+static void dimm_plug_device(DimmDevice *slot)
+{
+ DimmBus *bus = main_memory_bus;
+
+ dimm_populate(slot);
+ if (bus->dimm_hotplug)
+ bus->dimm_hotplug(bus->dimm_hotplug_qdev, slot, 1);
+}
+
+static int dimm_unplug_device(DeviceState *qdev)
+{
+ DimmBus *bus = main_memory_bus;
+
+ if (bus->dimm_hotplug)
+ bus->dimm_hotplug(bus->dimm_hotplug_qdev, DIMM(qdev), 0);
+ return 1;
+}
+
+static DimmConfig *dimmcfg_find_from_name(const char *name)
+{
+ DimmConfig *slot;
+ DimmBus *bus = main_memory_bus;
+
+ QTAILQ_FOREACH(slot, &bus->dimmconfig_list, nextdimmcfg) {
+ if (!strcmp(slot->name, name)) {
+ return slot;
+ }
+ }
+ return NULL;
+}
+
+static DimmDevice *dimm_find_from_idx(uint32_t idx)
+{
+ DimmDevice *slot;
+ DimmBus *bus = main_memory_bus;
+
+ QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
+ if (slot->idx == idx) {
+ return slot;
+ }
+ }
+ return NULL;
+}
+
+/* used to create a dimm device, only on incoming migration of a hotplugged
+ * RAMBlock
+ */
+int dimm_add(char *id)
+{
+ DimmConfig *slotcfg = NULL;
+ QemuOpts *devopts;
+ char buf[256];
+
+ if (!id) {
+ fprintf(stderr, "ERROR %s invalid id\n",__FUNCTION__);
+ return 1;
+ }
+
+ slotcfg = dimmcfg_find_from_name(id);
+
+ if (!slotcfg) {
+ fprintf(stderr, "%s no slot %s found\n", __FUNCTION__, id);
+ return 1;
+ }
+
+ devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL);
+ qemu_opt_set(devopts, "driver", "dimm");
+
+ snprintf(buf, sizeof(buf), "%lu", slotcfg->size);
+ qemu_opt_set(devopts, "size", buf);
+ snprintf(buf, sizeof(buf), "%u", slotcfg->node);
+ qemu_opt_set(devopts, "node", buf);
+ qdev_device_add(devopts);
+
+ return 0;
+}
+
+/* used to calculate physical address offsets for all dimms */
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn)
+{
+ DimmConfig *slot;
+ QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+ if (!slot->start) {
+ slot->start = calcfn(slot->size);
+ }
+ }
+}
+
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots)
+{
+ DimmConfig *slot;
+
+ QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+ assert(slot->start);
+ fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+ fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+ fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+ }
+}
+
+void dimm_notify(uint32_t idx, uint32_t event)
+{
+ DimmBus *bus = main_memory_bus;
+ DimmDevice *s;
+ s = dimm_find_from_idx(idx);
+ assert(s != NULL);
+
+ switch(event) {
+ case DIMM_REMOVE_SUCCESS:
+ dimm_depopulate(s);
+ qdev_simple_unplug_cb((DeviceState*)s);
+ QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
+ break;
+ default:
+ break;
+ }
+}
+
+static int dimm_init(DeviceState *s)
+{
+ DimmBus *bus = main_memory_bus;
+ DimmDevice *slot;
+ DimmConfig *slotcfg;
+
+ slot = DIMM(s);
+ slot->mr = NULL;
+
+ slotcfg = dimmcfg_find_from_name(s->id);
+
+ if (!slotcfg) {
+ fprintf(stderr, "%s no config for slot %s found\n",
+ __FUNCTION__, s->id);
+ return 1;
+ }
+
+ slot->idx = slotcfg->idx;
+ assert(slotcfg->start);
+ slot->start = slotcfg->start;
+ slot->size = slotcfg->size;
+ slot->node = slotcfg->node;
+
+ QTAILQ_INSERT_TAIL(&bus->dimmlist, slot, nextdimm);
+ dimm_plug_device(slot);
+
+ return 0;
+}
+
+
+static void dimm_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->props = dimm_properties;
+ dc->unplug = dimm_unplug_device;
+ dc->init = dimm_init;
+}
+
+static TypeInfo dimm_info = {
+ .name = TYPE_DIMM,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(DimmDevice),
+ .class_init = dimm_class_init,
+};
+
+static void dimm_register_types(void)
+{
+ type_register_static(&dimm_bus_info);
+ type_register_static(&dimm_info);
+}
+
+type_init(dimm_register_types)
diff --git a/hw/dimm.h b/hw/dimm.h
new file mode 100644
index 0000000..5e991a6
--- /dev/null
+++ b/hw/dimm.h
@@ -0,0 +1,90 @@
+#ifndef QEMU_DIMM_H
+#define QEMU_DIMM_H
+
+#include "qemu-common.h"
+#include "memory.h"
+#include "sysbus.h"
+#include "qapi-types.h"
+#include "qemu-queue.h"
+#include "cpus.h"
+#define MAX_DIMMS 255
+#define DIMM_BITMAP_BYTES (MAX_DIMMS + 7) / 8
+#define DEFAULT_DIMMSIZE 1024*1024*1024
+
+typedef enum {
+ DIMM_REMOVE_SUCCESS = 0,
+ DIMM_REMOVE_FAIL = 1,
+ DIMM_ADD_SUCCESS = 2,
+ DIMM_ADD_FAIL = 3
+} dimm_hp_result_code;
+
+#define TYPE_DIMM "dimm"
+#define DIMM(obj) \
+ OBJECT_CHECK(DimmDevice, (obj), TYPE_DIMM)
+#define DIMM_CLASS(klass) \
+ OBJECT_CLASS_CHECK(DimmDeviceClass, (klass), TYPE_DIMM)
+#define DIMM_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(DimmDeviceClass, (obj), TYPE_DIMM)
+
+typedef struct DimmDevice DimmDevice;
+typedef QTAILQ_HEAD(DimmConfiglist, DimmConfig) DimmConfiglist;
+
+typedef struct DimmDeviceClass {
+ DeviceClass parent_class;
+
+ int (*init)(DimmDevice *dev);
+} DimmDeviceClass;
+
+typedef struct DimmDevice {
+ DeviceState qdev;
+ uint32_t idx; /* index in memory hotplug register/bitmap */
+ ram_addr_t start; /* starting physical address */
+ ram_addr_t size;
+ uint32_t node; /* numa node proximity */
+ MemoryRegion *mr; /* MemoryRegion for this slot. !NULL only if populated */
+ QTAILQ_ENTRY (DimmDevice) nextdimm;
+} DimmDevice;
+
+typedef struct DimmConfig
+{
+ const char *name;
+ uint32_t idx; /* index in memory hotplug register/bitmap */
+ ram_addr_t start; /* starting physical address */
+ ram_addr_t size;
+ uint32_t node; /* numa node proximity */
+ uint32_t populated; /* 1 means device has been hotplugged. Default is 0. */
+ QTAILQ_ENTRY (DimmConfig) nextdimmcfg;
+} DimmConfig;
+
+typedef int (*dimm_hotplug_fn)(DeviceState *qdev, DimmDevice *dev, int add);
+typedef target_phys_addr_t (*dimm_calcoffset_fn)(uint64_t size);
+
+#define TYPE_DIMM_BUS "dimmbus"
+#define DIMM_BUS(obj) OBJECT_CHECK(DimmBus, (obj), TYPE_DIMM_BUS)
+
+typedef struct DimmBus {
+ BusState qbus;
+ DeviceState *dimm_hotplug_qdev;
+ dimm_hotplug_fn dimm_hotplug;
+ dimm_calcoffset_fn dimm_calcoffset;
+ DimmConfiglist dimmconfig_list;
+ QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
+} DimmBus;
+
+struct dimm_hp_result {
+ const char *dimmname;
+ dimm_hp_result_code ret;
+ QTAILQ_ENTRY (dimm_hp_result) next;
+};
+
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn);
+void dimm_notify(uint32_t idx, uint32_t event);
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev);
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots);
+int dimm_add(char *id);
+void main_memory_bus_create(Object *parent);
+void dimm_config_create(char *id, uint64_t size, uint64_t node,
+ uint32_t dimm_idx, uint32_t populated);
+
+
+#endif
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Wen Congyang
2012-09-24 06:02:20 UTC
Permalink
Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached
to a new bus called DimmBus. This bus is introduced so that we no longer
depend on hotplug-capability of main system bus (the main bus does not allow
hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case
of the pc)
=20
- creates a new DimmDevice and attaches it to the DimmBus
- creates a new MemoryRegion of the given physical address offset, size and
node proximity, and attaches it to main system memory as a sub_region.
=20
A successful hot-remove operation detaches and frees the MemoryRegion from
system memory, and removes the DimmDevice from the DimmBus.
=20
Hotplug operations are done through normal device_add /device_del commands.
Also add properties to DimmDevice.
=20
com>
---
hw/dimm.c | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++=
++++++++++
hw/dimm.h | 90 ++++++++++++++++++
2 files changed, 395 insertions(+), 0 deletions(-)
create mode 100644 hw/dimm.c
create mode 100644 hw/dimm.h
=20
diff --git a/hw/dimm.c b/hw/dimm.c
new file mode 100644
index 0000000..288b997
--- /dev/null
+++ b/hw/dimm.c
@@ -0,0 +1,305 @@
+/*
+ * Dimm device for Memory Hotplug
+ *
+ * Copyright ProfitBricks GmbH 2012
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/=
licenses/>
+ */
+
+#include "trace.h"
+#include "qdev.h"
+#include "dimm.h"
+#include <time.h>
+#include "../exec-memory.h"
+#include "qmp-commands.h"
+
+/* the system-wide memory bus. */
+static DimmBus *main_memory_bus;
+/* the following list is used to hold dimm config info before machin=
e
+ * initialization. After machine init, the list is emptied and not u=
sed anymore.*/
+static DimmConfiglist dimmconfig_list =3D QTAILQ_HEAD_INITIALIZER(di=
mmconfig_list);
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int in=
dent);
+static char *dimmbus_get_fw_dev_path(DeviceState *dev);
+
+static Property dimm_properties[] =3D {
+ DEFINE_PROP_UINT64("start", DimmDevice, start, 0),
+ DEFINE_PROP_UINT64("size", DimmDevice, size, DEFAULT_DIMMSIZE),
+ DEFINE_PROP_UINT32("node", DimmDevice, node, 0),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int in=
dent)
+{
+}
+
+static char *dimmbus_get_fw_dev_path(DeviceState *dev)
+{
+ char path[40];
+
+ snprintf(path, sizeof(path), "%s", qdev_fw_name(dev));
+ return strdup(path);
+}
+
+static void dimm_bus_class_init(ObjectClass *klass, void *data)
+{
+ BusClass *k =3D BUS_CLASS(klass);
+
+ k->print_dev =3D dimmbus_dev_print;
+ k->get_fw_dev_path =3D dimmbus_get_fw_dev_path;
+}
+
+static void dimm_bus_initfn(Object *obj)
+{
+ DimmConfig *dimm_cfg, *next_dimm_cfg;
+ DimmBus *bus =3D DIMM_BUS(obj);
+ QTAILQ_INIT(&bus->dimmconfig_list);
+ QTAILQ_INIT(&bus->dimmlist);
+
+ QTAILQ_FOREACH_SAFE(dimm_cfg, &dimmconfig_list, nextdimmcfg, nex=
t_dimm_cfg) {
+ QTAILQ_REMOVE(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+ QTAILQ_INSERT_TAIL(&bus->dimmconfig_list, dimm_cfg, nextdimm=
cfg);
+ }
+}
+
+static const TypeInfo dimm_bus_info =3D {
+ .name =3D TYPE_DIMM_BUS,
+ .parent =3D TYPE_BUS,
+ .instance_size =3D sizeof(DimmBus),
+ .instance_init =3D dimm_bus_initfn,
+ .class_init =3D dimm_bus_class_init,
+};
+
+void main_memory_bus_create(Object *parent)
+{
+ main_memory_bus =3D g_malloc0(dimm_bus_info.instance_size);
+ main_memory_bus->qbus.glib_allocated =3D true;
+ qbus_create_inplace(&main_memory_bus->qbus, TYPE_DIMM_BUS, DEVIC=
E(parent),
+ "membus");
+}
+
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev=3D (DeviceState*)s;
+ MemoryRegion *new =3D NULL;
+
+ new =3D g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr =3D new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr =3D NULL;
+}
+
+void dimm_config_create(char *id, uint64_t size, uint64_t node, uint=
32_t
+ dimm_idx, uint32_t populated)
+{
+ DimmConfig *dimm_cfg;
+ dimm_cfg =3D (DimmConfig*) g_malloc0(sizeof(DimmConfig));
+ dimm_cfg->name =3D id;
+ dimm_cfg->idx =3D dimm_idx;
+ dimm_cfg->start =3D 0;
+ dimm_cfg->size =3D size;
+ dimm_cfg->node =3D node;
+ dimm_cfg->populated =3D populated;
+
+ QTAILQ_INSERT_TAIL(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+}
+
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev)
+{
+ DimmBus *bus =3D main_memory_bus;
+ bus->qbus.allow_hotplug =3D 1;
+ bus->dimm_hotplug_qdev =3D qdev;
+ bus->dimm_hotplug =3D hotplug;
+}
+
+static void dimm_plug_device(DimmDevice *slot)
+{
+ DimmBus *bus =3D main_memory_bus;
+
+ dimm_populate(slot);
+ if (bus->dimm_hotplug)
+ bus->dimm_hotplug(bus->dimm_hotplug_qdev, slot, 1);
+}
+
+static int dimm_unplug_device(DeviceState *qdev)
+{
+ DimmBus *bus =3D main_memory_bus;
+
+ if (bus->dimm_hotplug)
+ bus->dimm_hotplug(bus->dimm_hotplug_qdev, DIMM(qdev), 0);
+ return 1;
+}
+
+static DimmConfig *dimmcfg_find_from_name(const char *name)
+{
+ DimmConfig *slot;
+ DimmBus *bus =3D main_memory_bus;
+
+ QTAILQ_FOREACH(slot, &bus->dimmconfig_list, nextdimmcfg) {
+ if (!strcmp(slot->name, name)) {
+ return slot;
+ }
+ }
+ return NULL;
+}
+
+static DimmDevice *dimm_find_from_idx(uint32_t idx)
+{
+ DimmDevice *slot;
+ DimmBus *bus =3D main_memory_bus;
+
+ QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
+ if (slot->idx =3D=3D idx) {
+ return slot;
+ }
+ }
+ return NULL;
+}
+
+/* used to create a dimm device, only on incoming migration of a hot=
plugged
+ * RAMBlock
+ */
+int dimm_add(char *id)
+{
+ DimmConfig *slotcfg =3D NULL;
+ QemuOpts *devopts;
+ char buf[256];
+
+ if (!id) {
+ fprintf(stderr, "ERROR %s invalid id\n",__FUNCTION__);
+ return 1;
+ }
+
+ slotcfg =3D dimmcfg_find_from_name(id);
+
+ if (!slotcfg) {
+ fprintf(stderr, "%s no slot %s found\n", __FUNCTION__, id);
+ return 1;
+ }
+
+ devopts =3D qemu_opts_create(qemu_find_opts("device"), id, 0, NU=
LL);
+ qemu_opt_set(devopts, "driver", "dimm");
+
+ snprintf(buf, sizeof(buf), "%lu", slotcfg->size);
+ qemu_opt_set(devopts, "size", buf);
+ snprintf(buf, sizeof(buf), "%u", slotcfg->node);
+ qemu_opt_set(devopts, "node", buf);
+ qdev_device_add(devopts);
+
+ return 0;
+}
+
+/* used to calculate physical address offsets for all dimms */
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn)
+{
+ DimmConfig *slot;
+ QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+ if (!slot->start) {
+ slot->start =3D calcfn(slot->size);
+ }
+ }
+}
+
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots)
+{
+ DimmConfig *slot;
+
+ QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+ assert(slot->start);
+ fw_cfg_slots[3 * slot->idx] =3D cpu_to_le64(slot->start);
+ fw_cfg_slots[3 * slot->idx + 1] =3D cpu_to_le64(slot->size);
+ fw_cfg_slots[3 * slot->idx + 2] =3D cpu_to_le64(slot->node);
+ }
+}
+
+void dimm_notify(uint32_t idx, uint32_t event)
+{
+ DimmBus *bus =3D main_memory_bus;
+ DimmDevice *s;
+ s =3D dimm_find_from_idx(idx);
+ assert(s !=3D NULL);
+
+ switch(event) {
+ dimm_depopulate(s);
+ qdev_simple_unplug_cb((DeviceState*)s);
+ QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
+ break;
+ break;
+ }
+}
+
+static int dimm_init(DeviceState *s)
+{
+ DimmBus *bus =3D main_memory_bus;
+ DimmDevice *slot;
+ DimmConfig *slotcfg;
+
+ slot =3D DIMM(s);
+ slot->mr =3D NULL;
+
+ slotcfg =3D dimmcfg_find_from_name(s->id);
+
+ if (!slotcfg) {
+ fprintf(stderr, "%s no config for slot %s found\n",
+ __FUNCTION__, s->id);
+ return 1;
+ }
+
+ slot->idx =3D slotcfg->idx;
+ assert(slotcfg->start);
+ slot->start =3D slotcfg->start;
+ slot->size =3D slotcfg->size;
+ slot->node =3D slotcfg->node;
+
+ QTAILQ_INSERT_TAIL(&bus->dimmlist, slot, nextdimm);
+ dimm_plug_device(slot);
+
+ return 0;
+}
+
+
+static void dimm_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc =3D DEVICE_CLASS(klass);
+
+ dc->props =3D dimm_properties;
+ dc->unplug =3D dimm_unplug_device;
+ dc->init =3D dimm_init;
+}
+
+static TypeInfo dimm_info =3D {
+ .name =3D TYPE_DIMM,
+ .parent =3D TYPE_DEVICE,
+ .instance_size =3D sizeof(DimmDevice),
+ .class_init =3D dimm_class_init,
+};
+
+static void dimm_register_types(void)
+{
+ type_register_static(&dimm_bus_info);
+ type_register_static(&dimm_info);
+}
+
+type_init(dimm_register_types)
diff --git a/hw/dimm.h b/hw/dimm.h
new file mode 100644
index 0000000..5e991a6
--- /dev/null
+++ b/hw/dimm.h
@@ -0,0 +1,90 @@
+#ifndef QEMU_DIMM_H
+#define QEMU_DIMM_H
+
+#include "qemu-common.h"
+#include "memory.h"
+#include "sysbus.h"
+#include "qapi-types.h"
+#include "qemu-queue.h"
+#include "cpus.h"
+#define MAX_DIMMS 255
+#define DIMM_BITMAP_BYTES (MAX_DIMMS + 7) / 8
+#define DEFAULT_DIMMSIZE 1024*1024*1024
+
+typedef enum {
+ DIMM_REMOVE_SUCCESS =3D 0,
+ DIMM_REMOVE_FAIL =3D 1,
+ DIMM_ADD_SUCCESS =3D 2,
+ DIMM_ADD_FAIL =3D 3
+} dimm_hp_result_code;
+
+#define TYPE_DIMM "dimm"
+#define DIMM(obj) \
+ OBJECT_CHECK(DimmDevice, (obj), TYPE_DIMM)
+#define DIMM_CLASS(klass) \
+ OBJECT_CLASS_CHECK(DimmDeviceClass, (klass), TYPE_DIMM)
+#define DIMM_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(DimmDeviceClass, (obj), TYPE_DIMM)
+
+typedef struct DimmDevice DimmDevice;
+typedef QTAILQ_HEAD(DimmConfiglist, DimmConfig) DimmConfiglist;
+
+typedef struct DimmDeviceClass {
+ DeviceClass parent_class;
+
+ int (*init)(DimmDevice *dev);
+} DimmDeviceClass;
+
+typedef struct DimmDevice {
typedef is unnecessary here, and it will break building:
CC hmp.o
In file included from /home/wency/source/qemu/hw/acpi_piix4.c:32:
/home/wency/source/qemu/hw/dimm.h:54: error: redefinition of typedef ‘DimmDevice’
/home/wency/source/qemu/hw/dimm.h:36: note: previous declaration of ‘DimmDevice’ was here
make[1]: *** [hw/acpi_piix4.o] Error 1
make[1]: *** Waiting for unfinished jobs....
CC audio/audio.o
make: *** [subdir-libhw64] Error 2
make: *** Waiting for unfinished jobs....

Thanks
Wen Congyang
+ DeviceState qdev;
+ uint32_t idx; /* index in memory hotplug register/bitmap */
+ ram_addr_t start; /* starting physical address */
+ ram_addr_t size;
+ uint32_t node; /* numa node proximity */
+ MemoryRegion *mr; /* MemoryRegion for this slot. !NULL only if p=
opulated */
+ QTAILQ_ENTRY (DimmDevice) nextdimm;
+} DimmDevice;
+
+typedef struct DimmConfig
+{
+ const char *name;
+ uint32_t idx; /* index in memory hotplug register/bitmap */
+ ram_addr_t start; /* starting physical address */
+ ram_addr_t size;
+ uint32_t node; /* numa node proximity */
+ uint32_t populated; /* 1 means device has been hotplugged. Defau=
lt is 0. */
+ QTAILQ_ENTRY (DimmConfig) nextdimmcfg;
+} DimmConfig;
+
+typedef int (*dimm_hotplug_fn)(DeviceState *qdev, DimmDevice *dev, i=
nt add);
+typedef target_phys_addr_t (*dimm_calcoffset_fn)(uint64_t size);
+
+#define TYPE_DIMM_BUS "dimmbus"
+#define DIMM_BUS(obj) OBJECT_CHECK(DimmBus, (obj), TYPE_DIMM_BUS)
+
+typedef struct DimmBus {
+ BusState qbus;
+ DeviceState *dimm_hotplug_qdev;
+ dimm_hotplug_fn dimm_hotplug;
+ dimm_calcoffset_fn dimm_calcoffset;
+ DimmConfiglist dimmconfig_list;
+ QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
+} DimmBus;
+
+struct dimm_hp_result {
+ const char *dimmname;
+ dimm_hp_result_code ret;
+ QTAILQ_ENTRY (dimm_hp_result) next;
+};
+
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn);
+void dimm_notify(uint32_t idx, uint32_t event);
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev);
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots);
+int dimm_add(char *id);
+void main_memory_bus_create(Object *parent);
+void dimm_config_create(char *id, uint64_t size, uint64_t node,
+ uint32_t dimm_idx, uint32_t populated);
+
+
+#endif
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi
2012-10-23 12:25:32 UTC
Permalink
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?

Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.

It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
liu ping fan
2012-10-24 08:06:26 UTC
Permalink
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those threads are already
out of the big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.

Regards,
pingfan
Post by Stefan Hajnoczi
Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi
2012-10-24 10:15:17 UTC
Permalink
Post by liu ping fan
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.
Here is the detailed scenario:

1. Emulated device does cpu_physical_memory_map() and gets a pointer
to guest RAM.
2. Return to vcpu or iothread, continue processing...
3. Hot unplug of RAM causes the guest RAM to disappear.
4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.

Any I/O device that does zero-copy I/O in QEMU faces this problem:
* The block layer is affected.
* The net layer is unaffected because it doesn't do zero-copy tx/rx
across returns to the main loop (#2 above).
* Not sure about other device classes (e.g. USB).

How should the MemoryListener callback work? For block I/O it may not
be possible to cancel pending I/O asynchronously - if you try to
cancel then your thread may block until the I/O completes.
Synchronous cancel behavior is not workable since it can lead to poor
latency or hangs in the guest.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-10-24 17:16:36 UTC
Permalink
Hi,
Post by Stefan Hajnoczi
Post by liu ping fan
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.
do we want bdrv_flush, or some kind of cancel request e.g. bdrv_aio_cancel?
Post by Stefan Hajnoczi
1. Emulated device does cpu_physical_memory_map() and gets a pointer
to guest RAM.
2. Return to vcpu or iothread, continue processing...
3. Hot unplug of RAM causes the guest RAM to disappear.
4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.
* The block layer is affected.
* The net layer is unaffected because it doesn't do zero-copy tx/rx
across returns to the main loop (#2 above).
* Not sure about other devices classes (e.g. USB).
How should the MemoryListener callback work? For block I/O it may not
be possible to cancel pending I/O asynchronously - if you try to
cancel then your thread may block until the I/O completes.
e.g. paio_cancel does this?
Is there already an API to asynchronously cancel all in-flight operations in a
BlockDriverState? AFAICT block_job_cancel refers to streaming jobs only and
doesn't help here.

Can we make the RAM unplug initiate async I/O cancellations, prevent further I/Os,
and only free the memory in a callback, after all DMA I/O to the associated memory
region has been cancelled or completed?

Also, IIUC, the MemoryListener should be registered by the users of
cpu_physical_memory_map(), e.g. hw/virtio.c.

By the way dimm_depopulate only frees the qemu memory on an ACPI _EJ request, which
means that a well-behaved guest will have already offlined the memory and is not
using it anymore. If the guest still uses the memory e.g. for a DMA buffer, the
logical memory offlining will fail and the _EJ/qemu memory freeing will never
happen.

But in theory a malicious acpi guest driver could trigger _EJ requests to do step
3 above.

Or perhaps the backing block driver can finish an I/O request for a zero-copy
block device that the guest doesn't care about anymore? I'll think about this a
bit more.
Post by Stefan Hajnoczi
Synchronous cancel behavior is not workable since it can lead to poor
latency or hangs in the guest.
ok

thanks,

- Vasilis

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
liu ping fan
2012-10-25 08:00:14 UTC
Permalink
On Thu, Oct 25, 2012 at 1:16 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
Hi,
Post by Stefan Hajnoczi
Post by liu ping fan
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.
do we want bdrv_flush, or some kind of cancel request e.g. bdrv_aio_cancel?
What I originally meant was to flush out the dangling pointer.
Post by Vasilis Liaskovitis
Post by Stefan Hajnoczi
1. Emulated device does cpu_physical_memory_map() and gets a pointer
to guest RAM.
2. Return to vcpu or iothread, continue processing...
3. Hot unplug of RAM causes the guest RAM to disappear.
4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.
* The block layer is affected.
* The net layer is unaffected because it doesn't do zero-copy tx/rx
across returns to the main loop (#2 above).
* Not sure about other devices classes (e.g. USB).
How should the MemoryListener callback work? For block I/O it may not
be possible to cancel pending I/O asynchronously - if you try to
cancel then your thread may block until the I/O completes.
For the current code, I think we should block in the listener and wait for
the flush to complete. But once the mr->ops ref/unref patchset is
accepted, we can release the reference on the RAM device after we are done
with it (this is a very rough idea and needs to be improved).
Post by Vasilis Liaskovitis
e.g. paio_cancel does this?
is there already an API to asynchronously cancel all in flight operations in a
BlockDriverState? Afaict block_job_cancel refers to streaming jobs only and
doesn't help here.
Can we make the RAM unplug initiate async I/O cancellations, prevent further I/Os,
and only free the memory in a callback, after all DMA I/O to the associated memory
region has been cancelled or completed?
Also iiuc the MemoryListener should be registered from users of
cpu_physical_memory_map e.g. hw/virtio.c
Yes.
Post by Vasilis Liaskovitis
By the way dimm_depopulate only frees the qemu memory on an ACPI _EJ request, which
means that a well-behaved guest will have already offlined the memory and is not
using it anymore. If the guest still uses the memory e.g. for a DMA buffer, the
logical memory offlining will fail and the _EJ/qemu memory freeing will never
happen.
Yes.
Post by Vasilis Liaskovitis
But in theory a malicious acpi guest driver could trigger _EJ requests to do step
3 above.
Or perhaps the backing block driver can finish an I/O request for a zero-copy
block device that the guest doesn't care for anymore? I 'll think about this a
bit more.
The guest is one user of the dimm device, and the block layer is another.

Regards,
pingfan
Post by Vasilis Liaskovitis
Post by Stefan Hajnoczi
Synchronous cancel behavior is not workable since it can lead to poor
latency or hangs in the guest.
ok
thanks,
- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-31 11:15:20 UTC
Permalink
Post by liu ping fan
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.
IMO we should use the same mechanism as proposed for other devices:
address_space_map() should grab a reference on the dimm device, and
address_space_unmap() can release it. This way device destruction will
be deferred until all devices have completed their I/O.

We will have to be careful with network receive buffers though, since
they can be held indefinitely.
--
error compiling committee.c: too many arguments to function
Stefan Hajnoczi
2012-10-31 12:18:16 UTC
Permalink
Post by Avi Kivity
Post by liu ping fan
Post by Stefan Hajnoczi
Post by Vasilis Liaskovitis
+static void dimm_populate(DimmDevice *s)
+{
+ DeviceState *dev= (DeviceState*)s;
+ MemoryRegion *new = NULL;
+
+ new = g_malloc(sizeof(MemoryRegion));
+ memory_region_init_ram(new, dev->id, s->size);
+ vmstate_register_ram_global(new);
+ memory_region_add_subregion(get_system_memory(), s->start, new);
+ s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+ assert(s);
+ vmstate_unregister_ram(s->mr, NULL);
+ memory_region_del_subregion(get_system_memory(), s->mr);
+ memory_region_destroy(s->mr);
+ s->mr = NULL;
+}
How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?
Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code. The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.
It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example. We also need to be robust against
malicious guests who abuse the hotplug lifecycle. QEMU should never be
left with dangling pointers.
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.
address_space_map() should grab a reference on the dimm device, and
address_space_unmap() can release it. This way device destruction will
be deferred as soon as all devices complete I/O.
We will have to be careful with network receive buffers though, since
they can be held indefinitely.
Network receive buffers aren't mapped. Net receive is not zero-copy.
For example, virtio-net does virtqueue_pop() inside
virtio_net_receive().

I don't see a problem with networking.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-31 12:34:02 UTC
Permalink
Post by Stefan Hajnoczi
Post by Avi Kivity
address_space_map() should grab a reference on the dimm device, and
address_space_unmap() can release it. This way device destruction will
be deferred as soon as all devices complete I/O.
We will have to be careful with network receive buffers though, since
they can be held indefinitely.
Network receive buffers aren't mapped. Net receive is not zero-copy.
For example, virtio-net does virtqueue_pop() inside
virtio_net_receive().
I don't see a problem with networking.
What about vhost-net? But that is managed separately with a MemoryListener.
--
error compiling committee.c: too many arguments to function
Stefan Hajnoczi
2012-10-31 12:34:49 UTC
Permalink
Post by Avi Kivity
Post by Stefan Hajnoczi
Post by Avi Kivity
address_space_map() should grab a reference on the dimm device, and
address_space_unmap() can release it. This way device destruction will
be deferred as soon as all devices complete I/O.
We will have to be careful with network receive buffers though, since
they can be held indefinitely.
Network receive buffers aren't mapped. Net receive is not zero-copy.
For example, virtio-net does virtqueue_pop() inside
virtio_net_receive().
I don't see a problem with networking.
What about vhost-net? But that is managed separately with a MemoryListener.
Yep. It should find out when memory regions change through its listener.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Vasilis Liaskovitis
2012-09-21 11:17:23 UTC
Permalink
A 32-byte register is used to present up to 256 hotplug-able memory devices
to BIOS and OSPM. Hot-add and hot-remove functions trigger an ACPI hotplug
event through these. Only reads are allowed from these registers.

An ACPI hot-remove event, however, needs to wait for OSPM to eject the device.
We use a single-byte register to know when OSPM has called the _EJ function
for a particular dimm. A write to this byte will depopulate the respective dimm.
Only writes are allowed to this byte.

v1->v2:
mems_sts address moved from 0xaf20 to 0xaf80 (to accommodate more space for
cpu-hotplugging in the future).
_EJ array is reduced to a single byte.
Add documentation in docs/specs/acpi_hotplug.txt

v2->v3:
minor name changes

Signed-off-by: Vasilis Liaskovitis <***@profitbricks.com>
---
docs/specs/acpi_hotplug.txt | 22 +++++++++++++
hw/acpi_piix4.c | 73 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 91 insertions(+), 4 deletions(-)
create mode 100644 docs/specs/acpi_hotplug.txt

diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt
new file mode 100644
index 0000000..cf86242
--- /dev/null
+++ b/docs/specs/acpi_hotplug.txt
@@ -0,0 +1,22 @@
+QEMU<->ACPI BIOS hotplug interface
+--------------------------------------
+This document describes the interface between QEMU and the ACPI BIOS for non-PCI
+space. For the PCI interface please look at docs/specs/acpi_pci_hotplug.txt
+
+QEMU<->ACPI BIOS memory hotplug interface
+--------------------------------------
+
+Memory Dimm status array (IO port 0xaf80-0xaf9f, 1-byte access):
+---------------------------------------------------------------
+Dimm hot-plug notification pending. One bit per slot.
+
+Read by ACPI BIOS GPE.3 handler to notify OS of memory hot-add or hot-remove
+events. Read-only.
+
+Memory Dimm ejection success notification (IO port 0xafa0, 1-byte access):
+---------------------------------------------------------------
+Dimm hot-remove _EJ0 notification. Byte value indicates Dimm slot that was
+ejected.
+
+Written by ACPI memory device _EJ0 method to notify qemu of successful
+hot-removal. Write-only.
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index c56220b..8776669 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -28,6 +28,8 @@
#include "range.h"
#include "ioport.h"
#include "fw_cfg.h"
+#include "sysbus.h"
+#include "dimm.h"

//#define DEBUG

@@ -45,9 +47,15 @@
#define PCI_DOWN_BASE 0xae04
#define PCI_EJ_BASE 0xae08
#define PCI_RMV_BASE 0xae0c
+#define MEM_BASE 0xaf80
+#define MEM_EJ_BASE 0xafa0

+#define PIIX4_MEM_HOTPLUG_STATUS 8
#define PIIX4_PCI_HOTPLUG_STATUS 2

+struct gpe_regs {
+ uint8_t mems_sts[DIMM_BITMAP_BYTES];
+};
struct pci_status {
uint32_t up; /* deprecated, maintained for migration compatibility */
uint32_t down;
@@ -69,6 +77,7 @@ typedef struct PIIX4PMState {
Notifier machine_ready;

/* for pci hotplug */
+ struct gpe_regs gperegs;
struct pci_status pci0_status;
uint32_t pci0_hotplug_enable;
uint32_t pci0_slot_device_present;
@@ -93,8 +102,8 @@ static void pm_update_sci(PIIX4PMState *s)
ACPI_BITMASK_POWER_BUTTON_ENABLE |
ACPI_BITMASK_GLOBAL_LOCK_ENABLE |
ACPI_BITMASK_TIMER_ENABLE)) != 0) ||
- (((s->ar.gpe.sts[0] & s->ar.gpe.en[0])
- & PIIX4_PCI_HOTPLUG_STATUS) != 0);
+ (((s->ar.gpe.sts[0] & s->ar.gpe.en[0]) &
+ (PIIX4_PCI_HOTPLUG_STATUS | PIIX4_MEM_HOTPLUG_STATUS)) != 0);

qemu_set_irq(s->irq, sci_level);
/* schedule a timer interruption if needed */
@@ -499,7 +508,16 @@ type_init(piix4_pm_register_types)
static uint32_t gpe_readb(void *opaque, uint32_t addr)
{
PIIX4PMState *s = opaque;
- uint32_t val = acpi_gpe_ioport_readb(&s->ar, addr);
+ uint32_t val = 0;
+ struct gpe_regs *g = &s->gperegs;
+
+ switch (addr) {
+ case MEM_BASE ... MEM_BASE+DIMM_BITMAP_BYTES:
+ val = g->mems_sts[addr - MEM_BASE];
+ break;
+ default:
+ val = acpi_gpe_ioport_readb(&s->ar, addr);
+ }

PIIX4_DPRINTF("gpe read %x == %x\n", addr, val);
return val;
@@ -509,7 +527,13 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
{
PIIX4PMState *s = opaque;

- acpi_gpe_ioport_writeb(&s->ar, addr, val);
+ switch (addr) {
+ case MEM_EJ_BASE:
+ dimm_notify(val, DIMM_REMOVE_SUCCESS);
+ break;
+ default:
+ acpi_gpe_ioport_writeb(&s->ar, addr, val);
+ }
pm_update_sci(s);

PIIX4_DPRINTF("gpe write %x <== %d\n", addr, val);
@@ -560,9 +584,11 @@ static uint32_t pcirmv_read(void *opaque, uint32_t addr)

static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
PCIHotplugState state);
+static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int add);

static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
{
+ int i = 0;

register_ioport_write(GPE_BASE, GPE_LEN, 1, gpe_writeb, s);
register_ioport_read(GPE_BASE, GPE_LEN, 1, gpe_readb, s);
@@ -576,7 +602,15 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)

register_ioport_read(PCI_RMV_BASE, 4, 4, pcirmv_read, s);

+ register_ioport_read(MEM_BASE, DIMM_BITMAP_BYTES, 1, gpe_readb, s);
+ register_ioport_write(MEM_EJ_BASE, 1, 1, gpe_writeb, s);
+
+ for(i = 0; i < DIMM_BITMAP_BYTES; i++) {
+ s->gperegs.mems_sts[i] = 0;
+ }
+
pci_bus_hotplug(bus, piix4_device_hotplug, &s->dev.qdev);
+ dimm_bus_hotplug(piix4_dimm_hotplug, &s->dev.qdev);
}

static void enable_device(PIIX4PMState *s, int slot)
@@ -591,6 +625,37 @@ static void disable_device(PIIX4PMState *s, int slot)
s->pci0_status.down |= (1U << slot);
}

+static void enable_mem_device(PIIX4PMState *s, int memdevice)
+{
+ struct gpe_regs *g = &s->gperegs;
+ s->ar.gpe.sts[0] |= PIIX4_MEM_HOTPLUG_STATUS;
+ g->mems_sts[memdevice/8] |= (1 << (memdevice%8));
+}
+
+static void disable_mem_device(PIIX4PMState *s, int memdevice)
+{
+ struct gpe_regs *g = &s->gperegs;
+ s->ar.gpe.sts[0] |= PIIX4_MEM_HOTPLUG_STATUS;
+ g->mems_sts[memdevice/8] &= ~(1 << (memdevice%8));
+}
+
+static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int
+ add)
+{
+ PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, qdev);
+ PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, pci_dev);
+ DimmDevice *slot = DIMM(dev);
+
+ if (add) {
+ enable_mem_device(s, slot->idx);
+ }
+ else {
+ disable_mem_device(s, slot->idx);
+ }
+ pm_update_sci(s);
+ return 0;
+}
+
static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
PCIHotplugState state)
{
--
1.7.9

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-09-22 13:49:05 UTC
Permalink
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
Post by Vasilis Liaskovitis
A 32-byte register is used to present up to 256 hotplug-able memory devices
to BIOS and OSPM. Hot-add and hot-remove functions trigger an ACPI hotplug
event through these. Only reads are allowed from these registers.
An ACPI hot-remove event but needs to wait for OSPM to eject the device.
We use a single-byte register to know when OSPM has called the _EJ function
for a particular dimm. A write to this byte will depopulate the respective dimm.
Only writes are allowed to this byte.
mems_sts address moved from 0xaf20 to 0xaf80 (to accomodate more space for
cpu-hotplugging in the future).
_EJ array is reduced to a single byte.
Add documentation in docs/specs/acpi_hotplug.txt
minor name changes
---
docs/specs/acpi_hotplug.txt | 22 +++++++++++++
hw/acpi_piix4.c | 73 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 91 insertions(+), 4 deletions(-)
create mode 100644 docs/specs/acpi_hotplug.txt
diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt
new file mode 100644
index 0000000..cf86242
--- /dev/null
+++ b/docs/specs/acpi_hotplug.txt
@@ -0,0 +1,22 @@
+QEMU<->ACPI BIOS hotplug interface
+--------------------------------------
+This document describes the interface between QEMU and the ACPI BIOS for non-PCI
+space. For the PCI interface please look at docs/specs/acpi_pci_hotplug.txt
+
+QEMU<->ACPI BIOS memory hotplug interface
+--------------------------------------
+
+---------------------------------------------------------------
+Dimm hot-plug notification pending. One bit per slot.
+
+Read by ACPI BIOS GPE.3 handler to notify OS of memory hot-add or hot-remove
+events. Read-only.
+
+---------------------------------------------------------------
+Dimm hot-remove _EJ0 notification. Byte value indicates Dimm slot that was
+ejected.
+
+Written by ACPI memory device _EJ0 method to notify qemu of successfull
+hot-removal. Write-only.
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index c56220b..8776669 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -28,6 +28,8 @@
#include "range.h"
#include "ioport.h"
#include "fw_cfg.h"
+#include "sysbus.h"
+#include "dimm.h"
//#define DEBUG
@@ -45,9 +47,15 @@
#define PCI_DOWN_BASE 0xae04
#define PCI_EJ_BASE 0xae08
#define PCI_RMV_BASE 0xae0c
+#define MEM_BASE 0xaf80
+#define MEM_EJ_BASE 0xafa0
+#define PIIX4_MEM_HOTPLUG_STATUS 8
#define PIIX4_PCI_HOTPLUG_STATUS 2
+struct gpe_regs {
GPERegs
Post by Vasilis Liaskovitis
+ uint8_t mems_sts[DIMM_BITMAP_BYTES];
+};
struct pci_status {
uint32_t up; /* deprecated, maintained for migration compatibility */
uint32_t down;
@@ -69,6 +77,7 @@ typedef struct PIIX4PMState {
Notifier machine_ready;
/* for pci hotplug */
+ struct gpe_regs gperegs;
struct pci_status pci0_status;
uint32_t pci0_hotplug_enable;
uint32_t pci0_slot_device_present;
@@ -93,8 +102,8 @@ static void pm_update_sci(PIIX4PMState *s)
ACPI_BITMASK_POWER_BUTTON_ENABLE |
ACPI_BITMASK_GLOBAL_LOCK_ENABLE |
ACPI_BITMASK_TIMER_ENABLE)) != 0) ||
- (((s->ar.gpe.sts[0] & s->ar.gpe.en[0])
- & PIIX4_PCI_HOTPLUG_STATUS) != 0);
+ (((s->ar.gpe.sts[0] & s->ar.gpe.en[0]) &
+ (PIIX4_PCI_HOTPLUG_STATUS | PIIX4_MEM_HOTPLUG_STATUS)) != 0);
qemu_set_irq(s->irq, sci_level);
/* schedule a timer interruption if needed */
@@ -499,7 +508,16 @@ type_init(piix4_pm_register_types)
static uint32_t gpe_readb(void *opaque, uint32_t addr)
{
PIIX4PMState *s = opaque;
- uint32_t val = acpi_gpe_ioport_readb(&s->ar, addr);
+ uint32_t val = 0;
+ struct gpe_regs *g = &s->gperegs;
+
+ switch (addr) {
+ val = g->mems_sts[addr - MEM_BASE];
+ break;
+ val = acpi_gpe_ioport_readb(&s->ar, addr);
+ }
PIIX4_DPRINTF("gpe read %x == %x\n", addr, val);
return val;
@@ -509,7 +527,13 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
{
PIIX4PMState *s = opaque;
- acpi_gpe_ioport_writeb(&s->ar, addr, val);
+ switch (addr) {
+ dimm_notify(val, DIMM_REMOVE_SUCCESS);
+ break;
+ acpi_gpe_ioport_writeb(&s->ar, addr, val);
+ }
pm_update_sci(s);
PIIX4_DPRINTF("gpe write %x <== %d\n", addr, val);
@@ -560,9 +584,11 @@ static uint32_t pcirmv_read(void *opaque, uint32_t addr)
static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
PCIHotplugState state);
+static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int add);
static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
{
+ int i = 0;
register_ioport_write(GPE_BASE, GPE_LEN, 1, gpe_writeb, s);
register_ioport_read(GPE_BASE, GPE_LEN, 1, gpe_readb, s);
@@ -576,7 +602,15 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
register_ioport_read(PCI_RMV_BASE, 4, 4, pcirmv_read, s);
+ register_ioport_read(MEM_BASE, DIMM_BITMAP_BYTES, 1, gpe_readb, s);
+ register_ioport_write(MEM_EJ_BASE, 1, 1, gpe_writeb, s);
+
+ for(i = 0; i < DIMM_BITMAP_BYTES; i++) {
+ s->gperegs.mems_sts[i] = 0;
+ }
+
pci_bus_hotplug(bus, piix4_device_hotplug, &s->dev.qdev);
+ dimm_bus_hotplug(piix4_dimm_hotplug, &s->dev.qdev);
}
static void enable_device(PIIX4PMState *s, int slot)
@@ -591,6 +625,37 @@ static void disable_device(PIIX4PMState *s, int slot)
s->pci0_status.down |= (1U << slot);
}
+static void enable_mem_device(PIIX4PMState *s, int memdevice)
+{
+ struct gpe_regs *g = &s->gperegs;
+ s->ar.gpe.sts[0] |= PIIX4_MEM_HOTPLUG_STATUS;
+ g->mems_sts[memdevice/8] |= (1 << (memdevice%8));
+}
+
+static void disable_mem_device(PIIX4PMState *s, int memdevice)
+{
+ struct gpe_regs *g = &s->gperegs;
+ s->ar.gpe.sts[0] |= PIIX4_MEM_HOTPLUG_STATUS;
+ g->mems_sts[memdevice/8] &= ~(1 << (memdevice%8));
+}
+
+static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int
+ add)
+{
+ PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, qdev);
+ PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, pci_dev);
+ DimmDevice *slot = DIMM(dev);
+
+ if (add) {
+ enable_mem_device(s, slot->idx);
+ }
+ else {
} else {
Post by Vasilis Liaskovitis
+ disable_mem_device(s, slot->idx);
+ }
+ pm_update_sci(s);
+ return 0;
+}
+
static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
PCIHotplugState state)
{
--
1.7.9
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Blue Swirl
2012-09-22 14:17:43 UTC
Permalink
On Fri, Sep 21, 2012 at 11:17 AM, Vasilis Liaskovitis
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported
for now.
Dimm device layout is modeled with a new qemu command line
"-dimm id=name,size=sz,node=pxm,populated=on|off"
The starting physical address for all dimms is calculated automatically from
top of memory, skipping the pci hole at [PCI_HOLE_START, 4G).
Node is defining numa proximity for this dimm. When not defined it defaults
to zero.
"-dimm id=dimm0,size=512M,node=0,populated=off"
will define a 512M memory slot belonging to numa node 0.
Hot-add syntax: "device_add dimm,id=mydimm0"
Hot-remove syntax: "dimm_del dimm,id=mydimm0"
Changes v2->v3
- qdev integration. Dimms are attached to a dimmbus. The dimmbus is a child
of i440fx device in the pc machine. Hot-add and hot-remove are done with normal
device_add / device_del operations on the dimmbus. New commands "dimm_add" and
"dimm_del" are obsolete. (In previous versions, dimms were always present on the
qdev tree, and dimm_add/del simply meant allocating or deallocating memory for
the devices. This version actually does hot-operations on the qdev tree)
- Add _PS3 method to allow OSPM-induced hot operations.
- pci-window calculation in Seabios takes dimms into account (for both 32-bit and
64-bit windows)
- rename new qmp commands: query-memory-total and query-memory-hotplug
- balloon driver can see the hotplugged memory
Changes v1->v2
- memory map is automatically calculated for hotplug dimms. Dimms are added from
top-of-memory skipping the pci hole at [PCI_HOLE_START, 4G).
- Renamed from "-memslot" to "-dimm". Commands changed to "dimm_add", "dimm_del".
- Seabios ejection array reduced to a byte. Use extraction macros for dimm ssdt.
- additional SRAT paravirt info does not break previous SRAT fw_cfg layout.
- Documentation of new acpi_piix4 registers and paravirt data.
- add ACPI _OST support for _OST enabled guests. This allows qemu to receive
notification for success / failure of memory hot-add and hot-remove operations.
Guest needs to support _OST (https://lkml.org/lkml/2012/6/25/321)
- add monitor info command to report total guest memory (initial + hot-added)
- add command line options and monitor commands for batch dimm
creation/population (obsolete from v3 onwards)
- A main blocker issue is windows guest functionality. The patchset does not work for
windows currently. My guess is the windows pnpmem driver does not like the
seabios dimm device implementation (or the seabios dimm implementation is not
fully ACPI-compliant). If someone can review the seabios patches or has any
ideas to debug this, let me know.
Testing on win2012 server RC or windows2008 consumer prerelease. When adding a
DIMM, the device shows up in DeviceManager but does not work.
" This device cannot start. (Code 10)
Device configured(memory.inf) (UserPnP eventID 400)
Device installed (memory.inf) ACPI/PNP0C80\2&daba3ff&1 was configured
Device not started(PNPMEM) (Kernel-PnP eventID 411, kernelID)
Device ACPI\PNP0C80\2&daba3ff&1 had a problem starting Driver Name: memory.inf
(c:\Windows\system32\DRIVERS\pnpmem.sys 6.2.8400 winmain_win8rc))
Memory range: 0x80000000 - 0x90000000 (Initial memory of VM is 2GB. The hotplugged DIMM
was 256MB, with a physical address range starting at 2GB)
Conflicting device list: No conflicts. "
Adding a 2nd or more dimms causes a crash (PNP_DETECTED_FATAL_ERROR with blue
screen of death) and makes windows reboot. After this, the VM keeps rebooting with
ACPI_BIOS_ERROR. The VM refuses to boot anymore once a 2nd (or more) extra dimm is
plugged-in.
- Is the dimmbus the correct way to go about integrating into qdev/qom? In a v1
comment, Anthony mentioned attaching dimms directly to an i440fx device as
children. Is this possible without a bus?
- Live migration works as long as the dimm layout (-dimm command line args) is
identical at the source and destination qemu command line. Patch 10/19
creates the DimmDevice that corresponds to the unknown incoming ramblock.
Ramblocks are migrated before qdev VMStates are migrated (the DimmDevice structure
currently does not define a VMStateDescription). So the DimmDevice is handled
differently than other devices. If this is not acceptable, any suggestions on
how it should be reworked?
- Hot-operation notification lists need to be added to migration state.
Please review. Could people state which other issues they consider blocker for
including this upstream?
Please check the patches with checkpatch.pl, there are plenty of
missing braces and uses of __FUNCTION__ etc.

I also have other comments to specific patches, especially the
architecture does not look correct.
Does this patchset need to wait for 1.4 or could this be considered for 1.3 (assuming
blockers are resolved)? The patchset has been revised every few months, but
I will provide quicker version updates onwards. I can also bring this up on a weekly
meeting agenda if needed.
series is based on uq/master for qemu-kvm, and master for seabios. Can be found
http://github.com/vliaskov/qemu-kvm/commits/memhp-v3
http://github.com/vliaskov/seabios/commits/memhp-v3
Implement dimm device abstraction
Implement "-dimm" command line option
acpi_piix4: Implement memory device hotplug registers
pc: calculate dimm physical addresses and adjust memory map
pc: Add dimm paravirt SRAT info
fix live-migration when "populated=on" is missing
Implement qmp and hmp commands for notification lists
Implement "info memory-total" and "query-memory-total"
balloon: update with hotplugged memory
Add _OST dimm support
Update dimm state on reset
Implement _PS3 for dimm
arch_init.c | 24 ++-
docs/specs/acpi_hotplug.txt | 54 ++++++
docs/specs/fwcfg.txt | 28 +++
hmp-commands.hx | 4 +
hmp.c | 24 +++
hmp.h | 2 +
hw/Makefile.objs | 2 +-
hw/acpi_piix4.c | 114 +++++++++++-
hw/dimm.c | 435 +++++++++++++++++++++++++++++++++++++++++++
hw/dimm.h | 101 ++++++++++
hw/pc.c | 55 ++++++-
hw/pc.h | 6 +
hw/pc_piix.c | 20 ++-
hw/virtio-balloon.c | 13 +-
monitor.c | 14 ++
qapi-schema.json | 37 ++++
qemu-config.c | 25 +++
qemu-options.hx | 5 +
qmp-commands.hx | 57 ++++++
sysemu.h | 1 +
vl.c | 51 +++++
21 files changed, 1051 insertions(+), 21 deletions(-)
create mode 100644 docs/specs/acpi_hotplug.txt
create mode 100644 docs/specs/fwcfg.txt
create mode 100644 hw/dimm.c
create mode 100644 hw/dimm.h
Add ACPI_EXTRACT_DEVICE* macros
Subject: [PATCH 02/18] Add SSDT memory device support
acpi-dsdt: Implement functions for memory hotplug
acpi: generate hotplug memory devices
Add _OST dimm method
Implement _PS3 method for memory device
Calculate pcimem_start and pcimem64_start from SRAT entries
Makefile | 2 +-
src/acpi-dsdt.dsl | 135 ++++++++++++++++++++++++++++++-
src/acpi.c | 216 ++++++++++++++++++++++++++++++++++++++++++++----
src/acpi.h | 3 +
src/pciinit.c | 6 +-
src/post.c | 3 +
src/smp.c | 4 +
src/ssdt-mem.dsl | 73 +++++++++++++++++
tools/acpi_extract.py | 28 +++++++
9 files changed, 447 insertions(+), 23 deletions(-)
create mode 100644 src/ssdt-mem.dsl
--
1.7.9
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi
2012-10-31 10:58:33 UTC
Permalink
On Fri, Sep 21, 2012 at 1:17 PM, Vasilis Liaskovitis
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported
for now.
Hi Vasilis,
Regarding the hot unplug issue we've been discussing, it's possible to
progress this patch series without fully solving that problem upfront.

Karen Noel suggested that the series could be rolled without the hot
unplug command, so that it's not possible to hit the unsafe case.
This would allow users to hot plug additional memory. They would have
to use virtio-balloon to reduce the memory footprint again. Later,
when the memory region referencing issue has been solved the hot
unplug command can be added.

Just wanted to mention Karen's idea in case you feel stuck right now.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity
2012-10-31 11:16:56 UTC
Permalink
Post by Stefan Hajnoczi
On Fri, Sep 21, 2012 at 1:17 PM, Vasilis Liaskovitis
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported
for now.
Hi Vasilis,
Regarding the hot unplug issue we've been discussing, it's possible to
progress this patch series without fully solving that problem upfront.
Karen Noel suggested that the series could be rolled without the hot
unplug command, so that it's not possible to hit the unsafe case.
This would allow users to hot plug additional memory. They would have
to use virtio-balloon to reduce the memory footprint again. Later,
when the memory region referencing issue has been solved the hot
unplug command can be added.
Just wanted to mention Karen's idea in case you feel stuck right now.
We could introduce hotunplug as an experimental feature so people can
test and play with it, and later graduate it to a fully supported feature.
--
error compiling committee.c: too many arguments to function
Vasilis Liaskovitis
2012-11-01 09:01:19 UTC
Permalink
Post by Avi Kivity
Post by Stefan Hajnoczi
On Fri, Sep 21, 2012 at 1:17 PM, Vasilis Liaskovitis
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported
for now.
Hi Vasilis,
Regarding the hot unplug issue we've been discussing, it's possible to
progress this patch series without fully solving that problem upfront.
Karen Noel suggested that the series could be rolled without the hot
unplug command, so that it's not possible to hit the unsafe case.
This would allow users to hot plug additional memory. They would have
to use virtio-balloon to reduce the memory footprint again. Later,
when the memory region referencing issue has been solved the hot
unplug command can be added.
Just wanted to mention Karen's idea in case you feel stuck right now.
We could introduce hotunplug as an experimental feature so people can
test and play with it, and later graduate it to a fully supported feature.
ok, I'll separate hotplug and hotunplug patches for the next version of the
patchseries (maybe even offer hotunplug in a separate series)

thanks,

- Vasilis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Loading...