Skip to content

Commit f85086f

Browse files
ldu4 and torvalds
authored and committed
mm: don't rely on system state to detect hot-plug operations
In register_mem_sect_under_node() the system_state's value is checked to detect whether the call is made during boot time or during a hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at SYSTEM_SCHEDULING state. In addition, memory hot-plug operations can be triggered at this system state by ACPI [1]. So checking against the system state is not enough. The consequence can be seen on a system with interleaved node ranges like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] This can be seen on PowerPC LPAR after multiple memory hot-plug and hot-unplug operations are done. At the next reboot the node's memory ranges can be interleaved and since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node's id is not checked, and the sections are registered to multiple nodes: $ ls -l /sys/devices/system/memory/memory21/node* total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 In that case, the system is able to boot but if later one of these memory blocks is hot-unplugged and then hot-plugged, the sysfs inconsistency is detected and this triggers a BUG_ON(): kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! 
Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() detailing whether the operation is due to a hot-plug operation. [1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state: $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \ -m size=$MEM,slots=255,maxmem=4294967296k \ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \ -object memory-backend-ram,id=memdimm6,size=134217728 -device 
pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \ Fixes: 4fbce63 ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Reviewed-by: David Hildenbrand <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: Greg Kroah-Hartman <[email protected]> Cc: "Rafael J. Wysocki" <[email protected]> Cc: Fenghua Yu <[email protected]> Cc: Nathan Lynch <[email protected]> Cc: Scott Cheloha <[email protected]> Cc: Tony Luck <[email protected]> Cc: <[email protected]> Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Linus Torvalds <[email protected]>
1 parent c1d0da8 commit f85086f

File tree

3 files changed

+64
-35
lines changed

3 files changed

+64
-35
lines changed

drivers/base/node.c

+55-30
Original file line numberDiff line numberDiff line change
@@ -761,14 +761,36 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
761761
return pfn_to_nid(pfn);
762762
}
763763

764+
static int do_register_memory_block_under_node(int nid,
765+
struct memory_block *mem_blk)
766+
{
767+
int ret;
768+
769+
/*
770+
* If this memory block spans multiple nodes, we only indicate
771+
* the last processed node.
772+
*/
773+
mem_blk->nid = nid;
774+
775+
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
776+
&mem_blk->dev.kobj,
777+
kobject_name(&mem_blk->dev.kobj));
778+
if (ret)
779+
return ret;
780+
781+
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
782+
&node_devices[nid]->dev.kobj,
783+
kobject_name(&node_devices[nid]->dev.kobj));
784+
}
785+
764786
/* register memory section under specified node if it spans that node */
765-
static int register_mem_sect_under_node(struct memory_block *mem_blk,
766-
void *arg)
787+
static int register_mem_block_under_node_early(struct memory_block *mem_blk,
788+
void *arg)
767789
{
768790
unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
769791
unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
770792
unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
771-
int ret, nid = *(int *)arg;
793+
int nid = *(int *)arg;
772794
unsigned long pfn;
773795

774796
for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
@@ -785,38 +807,33 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
785807
}
786808

787809
/*
788-
* We need to check if page belongs to nid only for the boot
789-
* case, during hotplug we know that all pages in the memory
790-
* block belong to the same node.
791-
*/
792-
if (system_state == SYSTEM_BOOTING) {
793-
page_nid = get_nid_for_pfn(pfn);
794-
if (page_nid < 0)
795-
continue;
796-
if (page_nid != nid)
797-
continue;
798-
}
799-
800-
/*
801-
* If this memory block spans multiple nodes, we only indicate
802-
* the last processed node.
810+
* We need to check if page belongs to nid only at the boot
811+
* case because node's ranges can be interleaved.
803812
*/
804-
mem_blk->nid = nid;
805-
806-
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
807-
&mem_blk->dev.kobj,
808-
kobject_name(&mem_blk->dev.kobj));
809-
if (ret)
810-
return ret;
813+
page_nid = get_nid_for_pfn(pfn);
814+
if (page_nid < 0)
815+
continue;
816+
if (page_nid != nid)
817+
continue;
811818

812-
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
813-
&node_devices[nid]->dev.kobj,
814-
kobject_name(&node_devices[nid]->dev.kobj));
819+
return do_register_memory_block_under_node(nid, mem_blk);
815820
}
816821
/* mem section does not span the specified node */
817822
return 0;
818823
}
819824

825+
/*
826+
* During hotplug we know that all pages in the memory block belong to the same
827+
* node.
828+
*/
829+
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
830+
void *arg)
831+
{
832+
int nid = *(int *)arg;
833+
834+
return do_register_memory_block_under_node(nid, mem_blk);
835+
}
836+
820837
/*
821838
* Unregister a memory block device under the node it spans. Memory blocks
822839
* with multiple nodes cannot be offlined and therefore also never be removed.
@@ -832,11 +849,19 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
832849
kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
833850
}
834851

835-
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
852+
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
853+
enum meminit_context context)
836854
{
855+
walk_memory_blocks_func_t func;
856+
857+
if (context == MEMINIT_HOTPLUG)
858+
func = register_mem_block_under_node_hotplug;
859+
else
860+
func = register_mem_block_under_node_early;
861+
837862
return walk_memory_blocks(PFN_PHYS(start_pfn),
838863
PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
839-
register_mem_sect_under_node);
864+
func);
840865
}
841866

842867
#ifdef CONFIG_HUGETLBFS

include/linux/node.h

+7-4
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ extern struct node *node_devices[];
9999
typedef void (*node_registration_func_t)(struct node *);
100100

101101
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
102-
extern int link_mem_sections(int nid, unsigned long start_pfn,
103-
unsigned long end_pfn);
102+
int link_mem_sections(int nid, unsigned long start_pfn,
103+
unsigned long end_pfn,
104+
enum meminit_context context);
104105
#else
105106
static inline int link_mem_sections(int nid, unsigned long start_pfn,
106-
unsigned long end_pfn)
107+
unsigned long end_pfn,
108+
enum meminit_context context)
107109
{
108110
return 0;
109111
}
@@ -128,7 +130,8 @@ static inline int register_one_node(int nid)
128130
if (error)
129131
return error;
130132
/* link memory sections under this node */
131-
error = link_mem_sections(nid, start_pfn, end_pfn);
133+
error = link_mem_sections(nid, start_pfn, end_pfn,
134+
MEMINIT_EARLY);
132135
}
133136

134137
return error;

mm/memory_hotplug.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1080,7 +1080,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
10801080
}
10811081

10821082
/* link memory sections under this node.*/
1083-
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
1083+
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
1084+
MEMINIT_HOTPLUG);
10841085
BUG_ON(ret);
10851086

10861087
/* create new memmap entry */

0 commit comments

Comments
 (0)