Skip to content

Commit b6f69f7

Browse files
ldu4 authored and gregkh committed
mm: don't rely on system state to detect hot-plug operations
commit f85086f upstream. In register_mem_sect_under_node() the system_state's value is checked to detect whether the call is made during boot time or during a hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at SYSTEM_SCHEDULING state. In addition, memory hot-plug operations can be triggered in this system state by ACPI [1]. So checking against the system state is not enough. The consequence shows up on systems whose node ranges are interleaved, like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] This can be seen on PowerPC LPAR after multiple memory hot-plug and hot-unplug operations are done. At the next reboot the node's memory ranges can be interleaved and, since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node's id is not checked, and the sections are registered to multiple nodes: $ ls -l /sys/devices/system/memory/memory21/node* total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 In that case, the system is able to boot, but if later one of these memory blocks is hot-unplugged and then hot-plugged, the sysfs inconsistency is detected and this triggers a BUG_ON(): kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! 
Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() detailing whether the operation is due to a hot-plug operation. [1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state: $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \ -m size=$MEM,slots=255,maxmem=4294967296k \ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \ -object memory-backend-ram,id=memdimm6,size=134217728 -device 
pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \ Fixes: 4fbce63 ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Reviewed-by: David Hildenbrand <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: Greg Kroah-Hartman <[email protected]> Cc: "Rafael J. Wysocki" <[email protected]> Cc: Fenghua Yu <[email protected]> Cc: Nathan Lynch <[email protected]> Cc: Scott Cheloha <[email protected]> Cc: Tony Luck <[email protected]> Cc: <[email protected]> Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Linus Torvalds <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 25eaea1 commit b6f69f7

File tree

3 files changed

+63
-35
lines changed

3 files changed

+63
-35
lines changed

drivers/base/node.c

+54-30
Original file line numberDiff line numberDiff line change
@@ -403,10 +403,32 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
403403
return pfn_to_nid(pfn);
404404
}
405405

406+
static int do_register_memory_block_under_node(int nid,
407+
struct memory_block *mem_blk)
408+
{
409+
int ret;
410+
411+
/*
412+
* If this memory block spans multiple nodes, we only indicate
413+
* the last processed node.
414+
*/
415+
mem_blk->nid = nid;
416+
417+
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
418+
&mem_blk->dev.kobj,
419+
kobject_name(&mem_blk->dev.kobj));
420+
if (ret)
421+
return ret;
422+
423+
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
424+
&node_devices[nid]->dev.kobj,
425+
kobject_name(&node_devices[nid]->dev.kobj));
426+
}
427+
406428
/* register memory section under specified node if it spans that node */
407-
int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
429+
int register_mem_block_under_node_early(struct memory_block *mem_blk, void *arg)
408430
{
409-
int ret, nid = *(int *)arg;
431+
int nid = *(int *)arg;
410432
unsigned long pfn, sect_start_pfn, sect_end_pfn;
411433

412434
sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
@@ -426,38 +448,33 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
426448
}
427449

428450
/*
429-
* We need to check if page belongs to nid only for the boot
430-
* case, during hotplug we know that all pages in the memory
431-
* block belong to the same node.
432-
*/
433-
if (system_state == SYSTEM_BOOTING) {
434-
page_nid = get_nid_for_pfn(pfn);
435-
if (page_nid < 0)
436-
continue;
437-
if (page_nid != nid)
438-
continue;
439-
}
440-
441-
/*
442-
* If this memory block spans multiple nodes, we only indicate
443-
* the last processed node.
451+
* We need to check if page belongs to nid only at the boot
452+
* case because node's ranges can be interleaved.
444453
*/
445-
mem_blk->nid = nid;
446-
447-
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
448-
&mem_blk->dev.kobj,
449-
kobject_name(&mem_blk->dev.kobj));
450-
if (ret)
451-
return ret;
454+
page_nid = get_nid_for_pfn(pfn);
455+
if (page_nid < 0)
456+
continue;
457+
if (page_nid != nid)
458+
continue;
452459

453-
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
454-
&node_devices[nid]->dev.kobj,
455-
kobject_name(&node_devices[nid]->dev.kobj));
460+
return do_register_memory_block_under_node(nid, mem_blk);
456461
}
457462
/* mem section does not span the specified node */
458463
return 0;
459464
}
460465

466+
/*
467+
* During hotplug we know that all pages in the memory block belong to the same
468+
* node.
469+
*/
470+
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
471+
void *arg)
472+
{
473+
int nid = *(int *)arg;
474+
475+
return do_register_memory_block_under_node(nid, mem_blk);
476+
}
477+
461478
/*
462479
* Unregister a memory block device under the node it spans. Memory blocks
463480
* with multiple nodes cannot be offlined and therefore also never be removed.
@@ -473,10 +490,17 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
473490
kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
474491
}
475492

476-
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
493+
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
494+
enum meminit_context context)
477495
{
478-
return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
479-
register_mem_sect_under_node);
496+
walk_memory_blocks_func_t func;
497+
498+
if (context == MEMINIT_HOTPLUG)
499+
func = register_mem_block_under_node_hotplug;
500+
else
501+
func = register_mem_block_under_node_early;
502+
503+
return walk_memory_range(start_pfn, end_pfn, (void *)&nid, func);
480504
}
481505

482506
#ifdef CONFIG_HUGETLBFS

include/linux/node.h

+7-4
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ extern struct node *node_devices[];
3232
typedef void (*node_registration_func_t)(struct node *);
3333

3434
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
35-
extern int link_mem_sections(int nid, unsigned long start_pfn,
36-
unsigned long end_pfn);
35+
int link_mem_sections(int nid, unsigned long start_pfn,
36+
unsigned long end_pfn,
37+
enum meminit_context context);
3738
#else
3839
static inline int link_mem_sections(int nid, unsigned long start_pfn,
39-
unsigned long end_pfn)
40+
unsigned long end_pfn,
41+
enum meminit_context context)
4042
{
4143
return 0;
4244
}
@@ -61,7 +63,8 @@ static inline int register_one_node(int nid)
6163
if (error)
6264
return error;
6365
/* link memory sections under this node */
64-
error = link_mem_sections(nid, start_pfn, end_pfn);
66+
error = link_mem_sections(nid, start_pfn, end_pfn,
67+
MEMINIT_EARLY);
6568
}
6669

6770
return error;

mm/memory_hotplug.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1102,7 +1102,8 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
11021102
}
11031103

11041104
/* link memory sections under this node.*/
1105-
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
1105+
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
1106+
MEMINIT_HOTPLUG);
11061107
BUG_ON(ret);
11071108

11081109
/* create new memmap entry */

0 commit comments

Comments
 (0)