CoCalc -- page

GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/page_alloc.c
¹⁷²⁸⁰ views
1
/*
2
 *  linux/mm/page_alloc.c
3
 *
4
 *  Manages the free list, the system allocates free pages here.
5
 *  Note that kmalloc() lives in slab.c
6
 *
7
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8
 *  Swap reorganised 29.12.95, Stephen Tweedie
9
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15
 */
16

17
#include <linux/stddef.h>
18
#include <linux/mm.h>
19
#include <linux/swap.h>
20
#include <linux/interrupt.h>
21
#include <linux/pagemap.h>
22
#include <linux/jiffies.h>
23
#include <linux/bootmem.h>
24
#include <linux/memblock.h>
25
#include <linux/compiler.h>
26
#include <linux/kernel.h>
27
#include <linux/kmemcheck.h>
28
#include <linux/module.h>
29
#include <linux/suspend.h>
30
#include <linux/pagevec.h>
31
#include <linux/blkdev.h>
32
#include <linux/slab.h>
33
#include <linux/ratelimit.h>
34
#include <linux/oom.h>
35
#include <linux/notifier.h>
36
#include <linux/topology.h>
37
#include <linux/sysctl.h>
38
#include <linux/cpu.h>
39
#include <linux/cpuset.h>
40
#include <linux/memory_hotplug.h>
41
#include <linux/nodemask.h>
42
#include <linux/vmalloc.h>
43
#include <linux/vmstat.h>
44
#include <linux/mempolicy.h>
45
#include <linux/stop_machine.h>
46
#include <linux/sort.h>
47
#include <linux/pfn.h>
48
#include <linux/backing-dev.h>
49
#include <linux/fault-inject.h>
50
#include <linux/page-isolation.h>
51
#include <linux/page_cgroup.h>
52
#include <linux/debugobjects.h>
53
#include <linux/kmemleak.h>
54
#include <linux/memory.h>
55
#include <linux/compaction.h>
56
#include <trace/events/kmem.h>
57
#include <linux/ftrace_event.h>
58
#include <linux/memcontrol.h>
59
#include <linux/prefetch.h>
60

61
#include <asm/tlbflush.h>
62
#include <asm/div64.h>
63
#include "internal.h"
64

65
/*
 * Per-CPU integer 'numa_node'; defined and exported only when the
 * configuration selects CONFIG_USE_PERCPU_NUMA_NODE_ID.
 */
#if defined(CONFIG_USE_PERCPU_NUMA_NODE_ID)
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif /* CONFIG_USE_PERCPU_NUMA_NODE_ID */

#if defined(CONFIG_HAVE_MEMORYLESS_NODES)
/*
 * The '_numa_mem_' per-CPU variable exists only under
 * CONFIG_HAVE_MEMORYLESS_NODES, so it must never be referenced directly.
 * Go through the accessors set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * declared in <linux/topology.h> instead.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif /* CONFIG_HAVE_MEMORYLESS_NODES */
80

81
/*
82
 * Array of node states.
83
 */
84
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
85
	[N_POSSIBLE] = NODE_MASK_ALL,
86
	[N_ONLINE] = { { [0] = 1UL } },
87
#ifndef CONFIG_NUMA
88
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
89
#ifdef CONFIG_HIGHMEM
90
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
91
#endif
92
	[N_CPU] = { { [0] = 1UL } },
93
#endif	/* NUMA */
94
};
95
EXPORT_SYMBOL(node_states);
96

97
unsigned long totalram_pages __read_mostly;
98
unsigned long totalreserve_pages __read_mostly;
99
int percpu_pagelist_fraction;
100
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
101

102
#if defined(CONFIG_PM_SLEEP)
/*
 * Suspend/hibernate helpers that temporarily narrow gfp_allowed_mask so
 * that memory allocations do not start I/O while devices are suspended.
 *
 * Both helpers must run with pm_mutex held.  Any other writer of
 * gfp_allowed_mask must hold pm_mutex too, unless it is guaranteed that
 * the suspend/hibernate code cannot run concurrently with it.
 */

/* Mask in force before pm_restrict_gfp_mask(); 0 means "nothing saved". */
static gfp_t saved_gfp_mask;

/*
 * Put back the mask stashed by pm_restrict_gfp_mask().  A no-op when no
 * mask is currently saved.
 */
void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));

	if (!saved_gfp_mask)
		return;

	gfp_allowed_mask = saved_gfp_mask;
	saved_gfp_mask = 0;
}

/*
 * Remember the current mask and strip the GFP_IOFS bits from it.  Warns
 * if a previously saved mask has not been restored yet.
 */
void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);

	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask = saved_gfp_mask & ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */
131

132
#if defined(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)
/* Only a real variable when the huge page size is selectable. */
int pageblock_order __read_mostly;
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
135

136
static void __free_pages_ok(struct page *page, unsigned int order);
137

138
/*
139
 * results with 256, 32 in the lowmem_reserve sysctl:
140
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
141
 *	1G machine -> (16M dma, 784M normal, 224M high)
142
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
143
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
144
 *	HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
145
 *
146
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
147
 * don't need any ZONE_NORMAL reservation
148
 */
149
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
150
#ifdef CONFIG_ZONE_DMA
151
	 256,
152
#endif
153
#ifdef CONFIG_ZONE_DMA32
154
	 256,
155
#endif
156
#ifdef CONFIG_HIGHMEM
157
	 32,
158
#endif
159
	 32,
160
};
161

162
EXPORT_SYMBOL(totalram_pages);
163

164
static char * const zone_names[MAX_NR_ZONES] = {
165
#ifdef CONFIG_ZONE_DMA
166
	 "DMA",
167
#endif
168
#ifdef CONFIG_ZONE_DMA32
169
	 "DMA32",
170
#endif
171
	 "Normal",
172
#ifdef CONFIG_HIGHMEM
173
	 "HighMem",
174
#endif
175
	 "Movable",
176
};
177

178
int min_free_kbytes = 1024;
179

180
static unsigned long __meminitdata nr_kernel_pages;
181
static unsigned long __meminitdata nr_all_pages;
182
static unsigned long __meminitdata dma_reserve;
183

184
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
185
  /*
186
   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
187
   * ranges of memory (RAM) that may be registered with add_active_range().
188
   * Ranges passed to add_active_range() will be merged if possible
189
   * so the number of times add_active_range() can be called is
190
   * related to the number of nodes and the number of holes
191
   */
192
  #ifdef CONFIG_MAX_ACTIVE_REGIONS
193
    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
194
    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
195
  #else
196
    #if MAX_NUMNODES >= 32
197
      /* If there can be many nodes, allow up to 50 holes per node */
198
      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
199
    #else
200
      /* By default, allow up to 256 distinct regions */
201
      #define MAX_ACTIVE_REGIONS 256
202
    #endif
203
  #endif
204

205
  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
206
  static int __meminitdata nr_nodemap_entries;
207
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
208
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
209
  static unsigned long __initdata required_kernelcore;
210
  static unsigned long __initdata required_movablecore;
211
  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
212

213
  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
214
  int movable_zone;
215
  EXPORT_SYMBOL(movable_zone);
216
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
217

218
#if MAX_NUMNODES > 1
219
int nr_node_ids __read_mostly = MAX_NUMNODES;
220
int nr_online_nodes __read_mostly = 1;
221
EXPORT_SYMBOL(nr_node_ids);
222
EXPORT_SYMBOL(nr_online_nodes);
223
#endif
224

225
int page_group_by_mobility_disabled __read_mostly;
226

227
static void set_pageblock_migratetype(struct page *page, int migratetype)
228
{
229

230
	if (unlikely(page_group_by_mobility_disabled))
231
		migratetype = MIGRATE_UNMOVABLE;
232

233
	set_pageblock_flags_group(page, (unsigned long)migratetype,
234
					PB_migrate, PB_migrate_end);
235
}
236

237
bool oom_killer_disabled __read_mostly;
238

239
#ifdef CONFIG_DEBUG_VM
240
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
241
{
242
	int ret = 0;
243
	unsigned seq;
244
	unsigned long pfn = page_to_pfn(page);
245

246
	do {
247
		seq = zone_span_seqbegin(zone);
248
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
249
			ret = 1;
250
		else if (pfn < zone->zone_start_pfn)
251
			ret = 1;
252
	} while (zone_span_seqretry(zone, seq));
253

254
	return ret;
255
}
256

257
/*
 * Return 1 when @page has a valid pfn and its zone, as reported by
 * page_zone(), is @zone; return 0 otherwise.  page_zone() is not
 * consulted for a page whose pfn is not valid.
 */
static int page_is_consistent(struct zone *zone, struct page *page)
{
	return pfn_valid_within(page_to_pfn(page)) &&
	       zone == page_zone(page);
}
266
/*
267
 * Temporary debugging check for pages not lying within a given zone.
268
 */
269
static int bad_range(struct zone *zone, struct page *page)
270
{
271
	if (page_outside_zone_boundaries(zone, page))
272
		return 1;
273
	if (!page_is_consistent(zone, page))
274
		return 1;
275

276
	return 0;
277
}
278
#else
279
static inline int bad_range(struct zone *zone, struct page *page)
280
{
281
	return 0;
282
}
283
#endif
284

285
static void bad_page(struct page *page)
286
{
287
	static unsigned long resume;
288
	static unsigned long nr_shown;
289
	static unsigned long nr_unshown;
290

291
	/* Don't complain about poisoned pages */
292
	if (PageHWPoison(page)) {
293
		reset_page_mapcount(page); /* remove PageBuddy */
294
		return;
295
	}
296

297
	/*
298
	 * Allow a burst of 60 reports, then keep quiet for that minute;
299
	 * or allow a steady drip of one report per second.
300
	 */
301
	if (nr_shown == 60) {
302
		if (time_before(jiffies, resume)) {
303
			nr_unshown++;
304
			goto out;
305
		}
306
		if (nr_unshown) {
307
			printk(KERN_ALERT
308
			      "BUG: Bad page state: %lu messages suppressed\n",
309
				nr_unshown);
310
			nr_unshown = 0;
311
		}
312
		nr_shown = 0;
313
	}
314
	if (nr_shown++ == 0)
315
		resume = jiffies + 60 * HZ;
316

317
	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
318
		current->comm, page_to_pfn(page));
319
	dump_page(page);
320

321
	dump_stack();
322
out:
323
	/* Leave bad fields for debug, except PageBuddy could make trouble */
324
	reset_page_mapcount(page); /* remove PageBuddy */
325
	add_taint(TAINT_BAD_PAGE);
326
}
327

328
/*
329
 * Higher-order pages are called "compound pages".  They are structured thusly:
330
 *
331
 * The first PAGE_SIZE page is called the "head page".
332
 *
333
 * The remaining PAGE_SIZE pages are called "tail pages".
334
 *
335
 * All pages have PG_compound set.  All pages have their ->private pointing at
336
 * the head page (even the head page has this).
337
 *
338
 * The first tail page's ->lru.next holds the address of the compound page's
339
 * put_page() function.  Its ->lru.prev holds the order of allocation.
340
 * This usage means that zero-order pages may not be compound.
341
 */
342

343
/*
 * Destructor installed by prep_compound_page(): frees the whole compound
 * page at the order recorded in it.
 */
static void free_compound_page(struct page *page)
{
	unsigned int order = compound_order(page);

	__free_pages_ok(page, order);
}
347

348
void prep_compound_page(struct page *page, unsigned long order)
349
{
350
	int i;
351
	int nr_pages = 1 << order;
352

353
	set_compound_page_dtor(page, free_compound_page);
354
	set_compound_order(page, order);
355
	__SetPageHead(page);
356
	for (i = 1; i < nr_pages; i++) {
357
		struct page *p = page + i;
358

359
		__SetPageTail(p);
360
		p->first_page = page;
361
	}
362
}
363

364
/* update __split_huge_page_refcount if you change this function */
365
static int destroy_compound_page(struct page *page, unsigned long order)
366
{
367
	int i;
368
	int nr_pages = 1 << order;
369
	int bad = 0;
370

371
	if (unlikely(compound_order(page) != order) ||
372
	    unlikely(!PageHead(page))) {
373
		bad_page(page);
374
		bad++;
375
	}
376

377
	__ClearPageHead(page);
378

379
	for (i = 1; i < nr_pages; i++) {
380
		struct page *p = page + i;
381

382
		if (unlikely(!PageTail(p) || (p->first_page != page))) {
383
			bad_page(page);
384
			bad++;
385
		}
386
		__ClearPageTail(p);
387
	}
388

389
	return bad;
390
}
391

392
/*
 * Zero all 2^@order pages starting at @page with clear_highpage().
 * @gfp_flags is only inspected by the debug assertion.
 */
static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int nr_pages = 1 << order;
	int idx;

	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());

	for (idx = 0; idx < nr_pages; idx++)
		clear_highpage(&page[idx]);
}
404

405
/*
 * Mark @page as the head of a free buddy block: the order goes into
 * page_private() and the PageBuddy marker is set.
 */
static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, (unsigned long)order);
	__SetPageBuddy(page);
}
410

411
/*
 * Inverse of set_page_order(): drop the PageBuddy marker and clear the
 * order stored in page_private().
 */
static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0UL);
}
416

417
/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 *
 * @page_idx: index of one block of the pair
 * @order:    order of that block
 *
 * Returns the index of the other block of the pair.  The mask is built
 * as an unsigned long so the shift is carried out in the width of the
 * result rather than in int, which would overflow (and sign-extend into
 * the upper bits) for an order of 31 or more.
 */
static inline unsigned long
__find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}
439

440
/*
 * Check whether @buddy is free and can be coalesced with @page.
 * A page and its buddy can be merged when
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) the page and its buddy have the same order &&
 * (d) the page and its buddy are in the same zone.
 *
 * Membership in the buddy system is recorded by setting ->_mapcount to -2.
 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
 *
 * The order of a free block is kept in page_private(page).
 *
 * Returns 1 when all conditions hold, 0 otherwise.
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)) ||
	    page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (!PageBuddy(buddy) || page_order(buddy) != order)
		return 0;

	/* a block sitting on the free lists must not be referenced */
	VM_BUG_ON(page_count(buddy) != 0);
	return 1;
}
468

469
/*
470
 * Freeing function for a buddy system allocator.
471
 *
472
 * The concept of a buddy system is to maintain direct-mapped table
473
 * (containing bit values) for memory blocks of various "orders".
474
 * The bottom level table contains the map for the smallest allocatable
475
 * units of memory (here, pages), and each level above it describes
476
 * pairs of units from the levels below, hence, "buddies".
477
 * At a high level, all that happens here is marking the table entry
478
 * at the bottom level available, and propagating the changes upward
479
 * as necessary, plus some accounting needed to play nicely with other
480
 * parts of the VM system.
481
 * At each level, we keep a list of pages, which are heads of continuous
482
 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
483
 * order is recorded in page_private(page) field.
484
 * So when we are allocating or freeing one, we can derive the state of the
485
 * other.  That is, if we allocate a small block, and both were   
486
 * free, the remainder of the region must be split into blocks.   
487
 * If a block is freed, and its buddy is also free, then this
488
 * triggers coalescing into a block of larger size.            
489
 *
490
 * -- wli
491
 */
492

493
static inline void __free_one_page(struct page *page,
494
		struct zone *zone, unsigned int order,
495
		int migratetype)
496
{
497
	unsigned long page_idx;
498
	unsigned long combined_idx;
499
	unsigned long uninitialized_var(buddy_idx);
500
	struct page *buddy;
501

502
	if (unlikely(PageCompound(page)))
503
		if (unlikely(destroy_compound_page(page, order)))
504
			return;
505

506
	VM_BUG_ON(migratetype == -1);
507

508
	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
509

510
	VM_BUG_ON(page_idx & ((1 << order) - 1));
511
	VM_BUG_ON(bad_range(zone, page));
512

513
	while (order < MAX_ORDER-1) {
514
		buddy_idx = __find_buddy_index(page_idx, order);
515
		buddy = page + (buddy_idx - page_idx);
516
		if (!page_is_buddy(page, buddy, order))
517
			break;
518

519
		/* Our buddy is free, merge with it and move up one order. */
520
		list_del(&buddy->lru);
521
		zone->free_area[order].nr_free--;
522
		rmv_page_order(buddy);
523
		combined_idx = buddy_idx & page_idx;
524
		page = page + (combined_idx - page_idx);
525
		page_idx = combined_idx;
526
		order++;
527
	}
528
	set_page_order(page, order);
529

530
	/*
531
	 * If this is not the largest possible page, check if the buddy
532
	 * of the next-highest order is free. If it is, it's possible
533
	 * that pages are being freed that will coalesce soon. In case,
534
	 * that is happening, add the free page to the tail of the list
535
	 * so it's less likely to be used soon and more likely to be merged
536
	 * as a higher order page
537
	 */
538
	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
539
		struct page *higher_page, *higher_buddy;
540
		combined_idx = buddy_idx & page_idx;
541
		higher_page = page + (combined_idx - page_idx);
542
		buddy_idx = __find_buddy_index(combined_idx, order + 1);
543
		higher_buddy = page + (buddy_idx - combined_idx);
544
		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
545
			list_add_tail(&page->lru,
546
				&zone->free_area[order].free_list[migratetype]);
547
			goto out;
548
		}
549
	}
550

551
	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
552
out:
553
	zone->free_area[order].nr_free++;
554
}
555

556
/*
557
 * free_page_mlock() -- clean up attempts to free and mlocked() page.
558
 * Page should not be on lru, so no need to fix that up.
559
 * free_pages_check() will verify...
560
 */
561
static inline void free_page_mlock(struct page *page)
562
{
563
	__dec_zone_page_state(page, NR_MLOCK);
564
	__count_vm_event(UNEVICTABLE_MLOCKFREED);
565
}
566

567
static inline int free_pages_check(struct page *page)
568
{
569
	if (unlikely(page_mapcount(page) |
570
		(page->mapping != NULL)  |
571
		(atomic_read(&page->_count) != 0) |
572
		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
573
		(mem_cgroup_bad_page_check(page)))) {
574
		bad_page(page);
575
		return 1;
576
	}
577
	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
578
		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
579
	return 0;
580
}
581

582
/*
583
 * Frees a number of pages from the PCP lists
584
 * Assumes all pages on list are in same zone, and of same order.
585
 * count is the number of pages to free.
586
 *
587
 * If the zone was previously in an "all pages pinned" state then look to
588
 * see if this freeing clears that state.
589
 *
590
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
591
 * pinned" detection logic.
592
 */
593
static void free_pcppages_bulk(struct zone *zone, int count,
594
					struct per_cpu_pages *pcp)
595
{
596
	int migratetype = 0;
597
	int batch_free = 0;
598
	int to_free = count;
599

600
	spin_lock(&zone->lock);
601
	zone->all_unreclaimable = 0;
602
	zone->pages_scanned = 0;
603

604
	while (to_free) {
605
		struct page *page;
606
		struct list_head *list;
607

608
		/*
609
		 * Remove pages from lists in a round-robin fashion. A
610
		 * batch_free count is maintained that is incremented when an
611
		 * empty list is encountered.  This is so more pages are freed
612
		 * off fuller lists instead of spinning excessively around empty
613
		 * lists
614
		 */
615
		do {
616
			batch_free++;
617
			if (++migratetype == MIGRATE_PCPTYPES)
618
				migratetype = 0;
619
			list = &pcp->lists[migratetype];
620
		} while (list_empty(list));
621

622
		/* This is the only non-empty list. Free them all. */
623
		if (batch_free == MIGRATE_PCPTYPES)
624
			batch_free = to_free;
625

626
		do {
627
			page = list_entry(list->prev, struct page, lru);
628
			/* must delete as __free_one_page list manipulates */
629
			list_del(&page->lru);
630
			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
631
			__free_one_page(page, zone, 0, page_private(page));
632
			trace_mm_page_pcpu_drain(page, 0, page_private(page));
633
		} while (--to_free && --batch_free && !list_empty(list));
634
	}
635
	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
636
	spin_unlock(&zone->lock);
637
}
638

639
static void free_one_page(struct zone *zone, struct page *page, int order,
640
				int migratetype)
641
{
642
	spin_lock(&zone->lock);
643
	zone->all_unreclaimable = 0;
644
	zone->pages_scanned = 0;
645

646
	__free_one_page(page, zone, order, migratetype);
647
	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
648
	spin_unlock(&zone->lock);
649
}
650

651
static bool free_pages_prepare(struct page *page, unsigned int order)
652
{
653
	int i;
654
	int bad = 0;
655

656
	trace_mm_page_free_direct(page, order);
657
	kmemcheck_free_shadow(page, order);
658

659
	if (PageAnon(page))
660
		page->mapping = NULL;
661
	for (i = 0; i < (1 << order); i++)
662
		bad += free_pages_check(page + i);
663
	if (bad)
664
		return false;
665

666
	if (!PageHighMem(page)) {
667
		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
668
		debug_check_no_obj_freed(page_address(page),
669
					   PAGE_SIZE << order);
670
	}
671
	arch_free_page(page, order);
672
	kernel_map_pages(page, 1 << order, 0);
673

674
	return true;
675
}
676

677
static void __free_pages_ok(struct page *page, unsigned int order)
678
{
679
	unsigned long flags;
680
	int wasMlocked = __TestClearPageMlocked(page);
681

682
	if (!free_pages_prepare(page, order))
683
		return;
684

685
	local_irq_save(flags);
686
	if (unlikely(wasMlocked))
687
		free_page_mlock(page);
688
	__count_vm_events(PGFREE, 1 << order);
689
	free_one_page(page_zone(page), page, order,
690
					get_pageblock_migratetype(page));
691
	local_irq_restore(flags);
692
}
693

694
/*
695
 * permit the bootmem allocator to evade page validation on high-order frees
696
 */
697
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
698
{
699
	if (order == 0) {
700
		__ClearPageReserved(page);
701
		set_page_count(page, 0);
702
		set_page_refcounted(page);
703
		__free_page(page);
704
	} else {
705
		int loop;
706

707
		prefetchw(page);
708
		for (loop = 0; loop < BITS_PER_LONG; loop++) {
709
			struct page *p = &page[loop];
710

711
			if (loop + 1 < BITS_PER_LONG)
712
				prefetchw(p + 1);
713
			__ClearPageReserved(p);
714
			set_page_count(p, 0);
715
		}
716

717
		set_page_refcounted(page);
718
		__free_pages(page, order);
719
	}
720
}
721

722

723
/*
724
 * The order of subdivision here is critical for the IO subsystem.
725
 * Please do not alter this order without good reasons and regression
726
 * testing. Specifically, as large blocks of memory are subdivided,
727
 * the order in which smaller blocks are delivered depends on the order
728
 * they're subdivided in this function. This is the primary factor
729
 * influencing the order in which pages are delivered to the IO
730
 * subsystem according to empirical testing, and this is also justified
731
 * by considering the behavior of a buddy system containing a single
732
 * large block of memory acted on by a series of small allocations.
733
 * This behavior is a critical factor in sglist merging's success.
734
 *
735
 * -- wli
736
 */
737
static inline void expand(struct zone *zone, struct page *page,
738
	int low, int high, struct free_area *area,
739
	int migratetype)
740
{
741
	unsigned long size = 1 << high;
742

743
	while (high > low) {
744
		area--;
745
		high--;
746
		size >>= 1;
747
		VM_BUG_ON(bad_range(zone, &page[size]));
748
		list_add(&page[size].lru, &area->free_list[migratetype]);
749
		area->nr_free++;
750
		set_page_order(&page[size], high);
751
	}
752
}
753

754
/*
755
 * This page is about to be returned from the page allocator
756
 */
757
static inline int check_new_page(struct page *page)
758
{
759
	if (unlikely(page_mapcount(page) |
760
		(page->mapping != NULL)  |
761
		(atomic_read(&page->_count) != 0)  |
762
		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
763
		(mem_cgroup_bad_page_check(page)))) {
764
		bad_page(page);
765
		return 1;
766
	}
767
	return 0;
768
}
769

770
/*
 * Get a freshly removed 2^@order block ready for its new owner.
 *
 * Each page is validated with check_new_page(); the first bad page aborts
 * the preparation and 1 is returned.  Otherwise page_private() is
 * cleared, the head page gets its reference, the arch and debug mapping
 * hooks run, and the block is zeroed (__GFP_ZERO) and/or turned into a
 * compound page (__GFP_COMP with a non-zero order) as requested.
 * Returns 0 on success.
 */
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	int nr_pages = 1 << order;
	int idx;

	for (idx = 0; idx < nr_pages; idx++) {
		if (unlikely(check_new_page(&page[idx])))
			return 1;
	}

	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, nr_pages, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}
794

795
/*
796
 * Go through the free lists for the given migratetype and remove
797
 * the smallest available page from the freelists
798
 */
799
static inline
800
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
801
						int migratetype)
802
{
803
	unsigned int current_order;
804
	struct free_area * area;
805
	struct page *page;
806

807
	/* Find a page of the appropriate size in the preferred list */
808
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
809
		area = &(zone->free_area[current_order]);
810
		if (list_empty(&area->free_list[migratetype]))
811
			continue;
812

813
		page = list_entry(area->free_list[migratetype].next,
814
							struct page, lru);
815
		list_del(&page->lru);
816
		rmv_page_order(page);
817
		area->nr_free--;
818
		expand(zone, page, order, current_order, area, migratetype);
819
		return page;
820
	}
821

822
	return NULL;
823
}
824

825

826
/*
827
 * This array describes the order lists are fallen back to when
828
 * the free lists for the desirable migrate type are depleted
829
 */
830
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
831
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
832
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
833
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
834
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
835
};
836

837
/*
838
 * Move the free pages in a range to the free lists of the requested type.
839
 * Note that start_page and end_pages are not aligned on a pageblock
840
 * boundary. If alignment is required, use move_freepages_block()
841
 */
842
static int move_freepages(struct zone *zone,
843
			  struct page *start_page, struct page *end_page,
844
			  int migratetype)
845
{
846
	struct page *page;
847
	unsigned long order;
848
	int pages_moved = 0;
849

850
#ifndef CONFIG_HOLES_IN_ZONE
851
	/*
852
	 * page_zone is not safe to call in this context when
853
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
854
	 * anyway as we check zone boundaries in move_freepages_block().
855
	 * Remove at a later date when no bug reports exist related to
856
	 * grouping pages by mobility
857
	 */
858
	BUG_ON(page_zone(start_page) != page_zone(end_page));
859
#endif
860

861
	for (page = start_page; page <= end_page;) {
862
		/* Make sure we are not inadvertently changing nodes */
863
		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
864

865
		if (!pfn_valid_within(page_to_pfn(page))) {
866
			page++;
867
			continue;
868
		}
869

870
		if (!PageBuddy(page)) {
871
			page++;
872
			continue;
873
		}
874

875
		order = page_order(page);
876
		list_move(&page->lru,
877
			  &zone->free_area[order].free_list[migratetype]);
878
		page += 1 << order;
879
		pages_moved += 1 << order;
880
	}
881

882
	return pages_moved;
883
}
884

885
static int move_freepages_block(struct zone *zone, struct page *page,
886
				int migratetype)
887
{
888
	unsigned long start_pfn, end_pfn;
889
	struct page *start_page, *end_page;
890

891
	start_pfn = page_to_pfn(page);
892
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
893
	start_page = pfn_to_page(start_pfn);
894
	end_page = start_page + pageblock_nr_pages - 1;
895
	end_pfn = start_pfn + pageblock_nr_pages - 1;
896

897
	/* Do not cross zone boundaries */
898
	if (start_pfn < zone->zone_start_pfn)
899
		start_page = page;
900
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
901
		return 0;
902

903
	return move_freepages(zone, start_page, end_page, migratetype);
904
}
905

906
static void change_pageblock_range(struct page *pageblock_page,
907
					int start_order, int migratetype)
908
{
909
	int nr_pageblocks = 1 << (start_order - pageblock_order);
910

911
	while (nr_pageblocks--) {
912
		set_pageblock_migratetype(pageblock_page, migratetype);
913
		pageblock_page += pageblock_nr_pages;
914
	}
915
}
916

917
/* Remove an element from the buddy allocator from the fallback list */
918
static inline struct page *
919
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
920
{
921
	struct free_area * area;
922
	int current_order;
923
	struct page *page;
924
	int migratetype, i;
925

926
	/* Find the largest possible block of pages in the other list */
927
	for (current_order = MAX_ORDER-1; current_order >= order;
928
						--current_order) {
929
		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
930
			migratetype = fallbacks[start_migratetype][i];
931

932
			/* MIGRATE_RESERVE handled later if necessary */
933
			if (migratetype == MIGRATE_RESERVE)
934
				continue;
935

936
			area = &(zone->free_area[current_order]);
937
			if (list_empty(&area->free_list[migratetype]))
938
				continue;
939

940
			page = list_entry(area->free_list[migratetype].next,
941
					struct page, lru);
942
			area->nr_free--;
943

944
			/*
945
			 * If breaking a large block of pages, move all free
946
			 * pages to the preferred allocation list. If falling
947
			 * back for a reclaimable kernel allocation, be more
948
			 * aggressive about taking ownership of free pages
949
			 */
950
			if (unlikely(current_order >= (pageblock_order >> 1)) ||
951
					start_migratetype == MIGRATE_RECLAIMABLE ||
952
					page_group_by_mobility_disabled) {
953
				unsigned long pages;
954
				pages = move_freepages_block(zone, page,
955
								start_migratetype);
956

957
				/* Claim the whole block if over half of it is free */
958
				if (pages >= (1 << (pageblock_order-1)) ||
959
						page_group_by_mobility_disabled)
960
					set_pageblock_migratetype(page,
961
								start_migratetype);
962

963
				migratetype = start_migratetype;
964
			}
965

966
			/* Remove the page from the freelists */
967
			list_del(&page->lru);
968
			rmv_page_order(page);
969

970
			/* Take ownership for orders >= pageblock_order */
971
			if (current_order >= pageblock_order)
972
				change_pageblock_range(page, current_order,
973
							start_migratetype);
974

975
			expand(zone, page, order, current_order, area, migratetype);
976

977
			trace_mm_page_alloc_extfrag(page, order, current_order,
978
				start_migratetype, migratetype);
979

980
			return page;
981
		}
982
	}
983

984
	return NULL;
985
}
986

987
/*
988
 * Do the hard work of removing an element from the buddy allocator.
989
 * Call me with the zone->lock already held.
990
 */
991
static struct page *__rmqueue(struct zone *zone, unsigned int order,
992
						int migratetype)
993
{
994
	struct page *page;
995

996
retry_reserve:
997
	page = __rmqueue_smallest(zone, order, migratetype);
998

999
	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1000
		page = __rmqueue_fallback(zone, order, migratetype);
1001

1002
		/*
1003
		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1004
		 * is used because __rmqueue_smallest is an inline function
1005
		 * and we want just one call site
1006
		 */
1007
		if (!page) {
1008
			migratetype = MIGRATE_RESERVE;
1009
			goto retry_reserve;
1010
		}
1011
	}
1012

1013
	trace_mm_page_alloc_zone_locked(page, order, migratetype);
1014
	return page;
1015
}
1016

1017
/* 
1018
 * Obtain a specified number of elements from the buddy allocator, all under
1019
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
1020
 * Returns the number of new pages which were placed at *list.
1021
 */
1022
static int rmqueue_bulk(struct zone *zone, unsigned int order, 
1023
			unsigned long count, struct list_head *list,
1024
			int migratetype, int cold)
1025
{
1026
	int i;
1027
	
1028
	spin_lock(&zone->lock);
1029
	for (i = 0; i < count; ++i) {
1030
		struct page *page = __rmqueue(zone, order, migratetype);
1031
		if (unlikely(page == NULL))
1032
			break;
1033

1034
		/*
1035
		 * Split buddy pages returned by expand() are received here
1036
		 * in physical page order. The page is added to the callers and
1037
		 * list and the list head then moves forward. From the callers
1038
		 * perspective, the linked list is ordered by page number in
1039
		 * some conditions. This is useful for IO devices that can
1040
		 * merge IO requests if the physical pages are ordered
1041
		 * properly.
1042
		 */
1043
		if (likely(cold == 0))
1044
			list_add(&page->lru, list);
1045
		else
1046
			list_add_tail(&page->lru, list);
1047
		set_page_private(page, migratetype);
1048
		list = &page->lru;
1049
	}
1050
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1051
	spin_unlock(&zone->lock);
1052
	return i;
1053
}
1054

1055
#ifdef CONFIG_NUMA
1056
/*
1057
 * Called from the vmstat counter updater to drain pagesets of this
1058
 * currently executing processor on remote nodes after they have
1059
 * expired.
1060
 *
1061
 * Note that this function must be called with the thread pinned to
1062
 * a single processor.
1063
 */
1064
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1065
{
1066
	unsigned long flags;
1067
	int to_drain;
1068

1069
	local_irq_save(flags);
1070
	if (pcp->count >= pcp->batch)
1071
		to_drain = pcp->batch;
1072
	else
1073
		to_drain = pcp->count;
1074
	free_pcppages_bulk(zone, to_drain, pcp);
1075
	pcp->count -= to_drain;
1076
	local_irq_restore(flags);
1077
}
1078
#endif
1079

1080
/*
1081
 * Drain pages of the indicated processor.
1082
 *
1083
 * The processor must either be the current processor and the
1084
 * thread pinned to the current processor or a processor that
1085
 * is not online.
1086
 */
1087
static void drain_pages(unsigned int cpu)
1088
{
1089
	unsigned long flags;
1090
	struct zone *zone;
1091

1092
	for_each_populated_zone(zone) {
1093
		struct per_cpu_pageset *pset;
1094
		struct per_cpu_pages *pcp;
1095

1096
		local_irq_save(flags);
1097
		pset = per_cpu_ptr(zone->pageset, cpu);
1098

1099
		pcp = &pset->pcp;
1100
		if (pcp->count) {
1101
			free_pcppages_bulk(zone, pcp->count, pcp);
1102
			pcp->count = 0;
1103
		}
1104
		local_irq_restore(flags);
1105
	}
1106
}
1107

1108
/*
 * Drain the per-cpu page lists of the processor this is running on.
 * 'arg' is unused.
 */
void drain_local_pages(void *arg)
{
	const unsigned int this_cpu = smp_processor_id();

	drain_pages(this_cpu);
}
1115

1116
/*
1117
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
1118
 */
1119
void drain_all_pages(void)
1120
{
1121
	on_each_cpu(drain_local_pages, NULL, 1);
1122
}
1123

1124
#ifdef CONFIG_HIBERNATION

/*
 * Recompute the swsusp "free" marks for every page of a zone while
 * holding zone->lock.
 *
 * First the free mark is cleared on each valid pfn spanned by the zone,
 * except for pages that are marked forbidden.  Then every page of every
 * block on the zone's buddy free lists is marked free.
 */
void mark_free_pages(struct zone *zone)
{
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	int order, t;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (!swsusp_page_is_forbidden(page))
			swsusp_unset_page_free(page);
	}

	for_each_migratetype_order(order, t) {
		struct list_head *head = &zone->free_area[order].free_list[t];
		struct page *buddy;

		list_for_each_entry(buddy, head, lru) {
			unsigned long offset;

			/* A free block of this order covers 2^order pages */
			pfn = page_to_pfn(buddy);
			for (offset = 0; offset < (1UL << order); offset++)
				swsusp_set_page_free(pfn_to_page(pfn + offset));
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_HIBERNATION */
1159

1160
/*
1161
 * Free a 0-order page
1162
 * cold == 1 ? free a cold page : free a hot page
1163
 */
1164
void free_hot_cold_page(struct page *page, int cold)
1165
{
1166
	struct zone *zone = page_zone(page);
1167
	struct per_cpu_pages *pcp;
1168
	unsigned long flags;
1169
	int migratetype;
1170
	int wasMlocked = __TestClearPageMlocked(page);
1171

1172
	if (!free_pages_prepare(page, 0))
1173
		return;
1174

1175
	migratetype = get_pageblock_migratetype(page);
1176
	set_page_private(page, migratetype);
1177
	local_irq_save(flags);
1178
	if (unlikely(wasMlocked))
1179
		free_page_mlock(page);
1180
	__count_vm_event(PGFREE);
1181

1182
	/*
1183
	 * We only track unmovable, reclaimable and movable on pcp lists.
1184
	 * Free ISOLATE pages back to the allocator because they are being
1185
	 * offlined but treat RESERVE as movable pages so we can get those
1186
	 * areas back if necessary. Otherwise, we may have to free
1187
	 * excessively into the page allocator
1188
	 */
1189
	if (migratetype >= MIGRATE_PCPTYPES) {
1190
		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1191
			free_one_page(zone, page, 0, migratetype);
1192
			goto out;
1193
		}
1194
		migratetype = MIGRATE_MOVABLE;
1195
	}
1196

1197
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
1198
	if (cold)
1199
		list_add_tail(&page->lru, &pcp->lists[migratetype]);
1200
	else
1201
		list_add(&page->lru, &pcp->lists[migratetype]);
1202
	pcp->count++;
1203
	if (pcp->count >= pcp->high) {
1204
		free_pcppages_bulk(zone, pcp->batch, pcp);
1205
		pcp->count -= pcp->batch;
1206
	}
1207

1208
out:
1209
	local_irq_restore(flags);
1210
}
1211

1212
/*
1213
 * split_page takes a non-compound higher-order page, and splits it into
1214
 * n (1<<order) sub-pages: page[0..n]
1215
 * Each sub-page must be freed individually.
1216
 *
1217
 * Note: this is probably too low level an operation for use in drivers.
1218
 * Please consult with lkml before using this in your driver.
1219
 */
1220
void split_page(struct page *page, unsigned int order)
1221
{
1222
	int i;
1223

1224
	VM_BUG_ON(PageCompound(page));
1225
	VM_BUG_ON(!page_count(page));
1226

1227
#ifdef CONFIG_KMEMCHECK
1228
	/*
1229
	 * Split shadow pages too, because free(page[0]) would
1230
	 * otherwise free the whole shadow.
1231
	 */
1232
	if (kmemcheck_page_is_tracked(page))
1233
		split_page(virt_to_page(page[0].shadow), order);
1234
#endif
1235

1236
	for (i = 1; i < (1 << order); i++)
1237
		set_page_refcounted(page + i);
1238
}
1239

1240
/*
1241
 * Similar to split_page except the page is already free. As this is only
1242
 * being used for migration, the migratetype of the block also changes.
1243
 * As this is called with interrupts disabled, the caller is responsible
1244
 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1245
 * are enabled.
1246
 *
1247
 * Note: this is probably too low level an operation for use in drivers.
1248
 * Please consult with lkml before using this in your driver.
1249
 */
1250
int split_free_page(struct page *page)
1251
{
1252
	unsigned int order;
1253
	unsigned long watermark;
1254
	struct zone *zone;
1255

1256
	BUG_ON(!PageBuddy(page));
1257

1258
	zone = page_zone(page);
1259
	order = page_order(page);
1260

1261
	/* Obey watermarks as if the page was being allocated */
1262
	watermark = low_wmark_pages(zone) + (1 << order);
1263
	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1264
		return 0;
1265

1266
	/* Remove page from free list */
1267
	list_del(&page->lru);
1268
	zone->free_area[order].nr_free--;
1269
	rmv_page_order(page);
1270
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1271

1272
	/* Split into individual pages */
1273
	set_page_refcounted(page);
1274
	split_page(page, order);
1275

1276
	if (order >= pageblock_order - 1) {
1277
		struct page *endpage = page + (1 << order) - 1;
1278
		for (; page < endpage; page += pageblock_nr_pages)
1279
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1280
	}
1281

1282
	return 1 << order;
1283
}
1284

1285
/*
1286
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
1287
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
1288
 * or two.
1289
 */
1290
static inline
1291
struct page *buffered_rmqueue(struct zone *preferred_zone,
1292
			struct zone *zone, int order, gfp_t gfp_flags,
1293
			int migratetype)
1294
{
1295
	unsigned long flags;
1296
	struct page *page;
1297
	int cold = !!(gfp_flags & __GFP_COLD);
1298

1299
again:
1300
	if (likely(order == 0)) {
1301
		struct per_cpu_pages *pcp;
1302
		struct list_head *list;
1303

1304
		local_irq_save(flags);
1305
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
1306
		list = &pcp->lists[migratetype];
1307
		if (list_empty(list)) {
1308
			pcp->count += rmqueue_bulk(zone, 0,
1309
					pcp->batch, list,
1310
					migratetype, cold);
1311
			if (unlikely(list_empty(list)))
1312
				goto failed;
1313
		}
1314

1315
		if (cold)
1316
			page = list_entry(list->prev, struct page, lru);
1317
		else
1318
			page = list_entry(list->next, struct page, lru);
1319

1320
		list_del(&page->lru);
1321
		pcp->count--;
1322
	} else {
1323
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1324
			/*
1325
			 * __GFP_NOFAIL is not to be used in new code.
1326
			 *
1327
			 * All __GFP_NOFAIL callers should be fixed so that they
1328
			 * properly detect and handle allocation failures.
1329
			 *
1330
			 * We most definitely don't want callers attempting to
1331
			 * allocate greater than order-1 page units with
1332
			 * __GFP_NOFAIL.
1333
			 */
1334
			WARN_ON_ONCE(order > 1);
1335
		}
1336
		spin_lock_irqsave(&zone->lock, flags);
1337
		page = __rmqueue(zone, order, migratetype);
1338
		spin_unlock(&zone->lock);
1339
		if (!page)
1340
			goto failed;
1341
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1342
	}
1343

1344
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
1345
	zone_statistics(preferred_zone, zone, gfp_flags);
1346
	local_irq_restore(flags);
1347

1348
	VM_BUG_ON(bad_range(zone, page));
1349
	if (prep_new_page(page, order, gfp_flags))
1350
		goto again;
1351
	return page;
1352

1353
failed:
1354
	local_irq_restore(flags);
1355
	return NULL;
1356
}
1357

1358
/*
 * The low ALLOC_WMARK_* bits of the allocation flags select which entry
 * of zone->watermark[] is used for the check.
 */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */

/* Mask that extracts the watermark index bits */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS - 1)

#define ALLOC_HARDER		0x10 /* try to alloc harder */
#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
1370

1371
#ifdef CONFIG_FAIL_PAGE_ALLOC
1372

1373
static struct fail_page_alloc_attr {
1374
	struct fault_attr attr;
1375

1376
	u32 ignore_gfp_highmem;
1377
	u32 ignore_gfp_wait;
1378
	u32 min_order;
1379

1380
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1381

1382
	struct dentry *ignore_gfp_highmem_file;
1383
	struct dentry *ignore_gfp_wait_file;
1384
	struct dentry *min_order_file;
1385

1386
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1387

1388
} fail_page_alloc = {
1389
	.attr = FAULT_ATTR_INITIALIZER,
1390
	.ignore_gfp_wait = 1,
1391
	.ignore_gfp_highmem = 1,
1392
	.min_order = 1,
1393
};
1394

1395
static int __init setup_fail_page_alloc(char *str)
1396
{
1397
	return setup_fault_attr(&fail_page_alloc.attr, str);
1398
}
1399
__setup("fail_page_alloc=", setup_fail_page_alloc);
1400

1401
/*
 * Decide whether fault injection should fail this page allocation.
 *
 * Returns 0 without consulting should_fail() when the request is exempt:
 * its order is below min_order, it is __GFP_NOFAIL, or it matches one of
 * the enabled ignore_gfp_* filters.  Otherwise returns the result of
 * should_fail() for an allocation of 1 << order pages.
 */
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order ||
	    (gfp_mask & __GFP_NOFAIL) ||
	    (fail_page_alloc.ignore_gfp_highmem &&
	     (gfp_mask & __GFP_HIGHMEM)) ||
	    (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)))
		return 0;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}
1414

1415
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1416

1417
/*
 * Create the debugfs entries for the page allocator fault-injection
 * settings: the generic fault_attr entries plus "ignore-gfp-wait",
 * "ignore-gfp-highmem" and "min-order".
 *
 * Returns 0 on success, the error from init_fault_attr_dentries(), or
 * -ENOMEM (after removing everything created) if any entry is missing.
 */
static int __init fail_page_alloc_debugfs(void)
{
	const mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct fail_page_alloc_attr *fpa = &fail_page_alloc;
	struct dentry *dir;
	int err;

	err = init_fault_attr_dentries(&fpa->attr, "fail_page_alloc");
	if (err)
		return err;

	dir = fpa->attr.dentries.dir;

	fpa->ignore_gfp_wait_file =
		debugfs_create_bool("ignore-gfp-wait", mode, dir,
				    &fpa->ignore_gfp_wait);
	fpa->ignore_gfp_highmem_file =
		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
				    &fpa->ignore_gfp_highmem);
	fpa->min_order_file =
		debugfs_create_u32("min-order", mode, dir,
				   &fpa->min_order);

	if (fpa->ignore_gfp_wait_file &&
	    fpa->ignore_gfp_highmem_file &&
	    fpa->min_order_file)
		return 0;

	/* At least one entry could not be created: undo all of them */
	debugfs_remove(fpa->ignore_gfp_wait_file);
	debugfs_remove(fpa->ignore_gfp_highmem_file);
	debugfs_remove(fpa->min_order_file);
	cleanup_fault_attr_dentries(&fpa->attr);

	return -ENOMEM;
}

late_initcall(fail_page_alloc_debugfs);
1454

1455
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1456

1457
#else /* CONFIG_FAIL_PAGE_ALLOC */
1458

1459
/* Without CONFIG_FAIL_PAGE_ALLOC no allocation is ever failed on purpose */
static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return 0;
}
1463

1464
#endif /* CONFIG_FAIL_PAGE_ALLOC */
1465

1466
/*
1467
 * Return true if free pages are above 'mark'. This takes into account the order
1468
 * of the allocation.
1469
 */
1470
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1471
		      int classzone_idx, int alloc_flags, long free_pages)
1472
{
1473
	/* free_pages my go negative - that's OK */
1474
	long min = mark;
1475
	int o;
1476

1477
	free_pages -= (1 << order) + 1;
1478
	if (alloc_flags & ALLOC_HIGH)
1479
		min -= min / 2;
1480
	if (alloc_flags & ALLOC_HARDER)
1481
		min -= min / 4;
1482

1483
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1484
		return false;
1485
	for (o = 0; o < order; o++) {
1486
		/* At the next order, this order's pages become unavailable */
1487
		free_pages -= z->free_area[o].nr_free << o;
1488

1489
		/* Require fewer higher order pages to be free */
1490
		min >>= 1;
1491

1492
		if (free_pages <= min)
1493
			return false;
1494
	}
1495
	return true;
1496
}
1497

1498
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1499
		      int classzone_idx, int alloc_flags)
1500
{
1501
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1502
					zone_page_state(z, NR_FREE_PAGES));
1503
}
1504

1505
bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1506
		      int classzone_idx, int alloc_flags)
1507
{
1508
	long free_pages = zone_page_state(z, NR_FREE_PAGES);
1509

1510
	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1511
		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1512

1513
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1514
								free_pages);
1515
}
1516

1517
#ifdef CONFIG_NUMA
1518
/*
1519
 * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
1520
 * skip over zones that are not allowed by the cpuset, or that have
1521
 * been recently (in last second) found to be nearly full.  See further
1522
 * comments in mmzone.h.  Reduces cache footprint of zonelist scans
1523
 * that have to skip over a lot of full or unallowed zones.
1524
 *
1525
 * If the zonelist cache is present in the passed in zonelist, then
1526
 * returns a pointer to the allowed node mask (either the current
1527
 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1528
 *
1529
 * If the zonelist cache is not available for this zonelist, does
1530
 * nothing and returns NULL.
1531
 *
1532
 * If the fullzones BITMAP in the zonelist cache is stale (more than
1533
 * a second since last zap'd) then we zap it out (clear its bits.)
1534
 *
1535
 * We hold off even calling zlc_setup, until after we've checked the
1536
 * first zone in the zonelist, on the theory that most allocations will
1537
 * be satisfied from that first zone, so best to examine that zone as
1538
 * quickly as we can.
1539
 */
1540
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1541
{
1542
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1543
	nodemask_t *allowednodes;	/* zonelist_cache approximation */
1544

1545
	zlc = zonelist->zlcache_ptr;
1546
	if (!zlc)
1547
		return NULL;
1548

1549
	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1550
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1551
		zlc->last_full_zap = jiffies;
1552
	}
1553

1554
	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1555
					&cpuset_current_mems_allowed :
1556
					&node_states[N_HIGH_MEMORY];
1557
	return allowednodes;
1558
}
1559

1560
/*
1561
 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1562
 * if it is worth looking at further for free memory:
1563
 *  1) Check that the zone isn't thought to be full (doesn't have its
1564
 *     bit set in the zonelist_cache fullzones BITMAP).
1565
 *  2) Check that the zones node (obtained from the zonelist_cache
1566
 *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1567
 * Return true (non-zero) if zone is worth looking at further, or
1568
 * else return false (zero) if it is not.
1569
 *
1570
 * This check -ignores- the distinction between various watermarks,
1571
 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
1572
 * found to be full for any variation of these watermarks, it will
1573
 * be considered full for up to one second by all requests, unless
1574
 * we are so low on memory on all allowed nodes that we are forced
1575
 * into the second scan of the zonelist.
1576
 *
1577
 * In the second scan we ignore this zonelist cache and exactly
1578
 * apply the watermarks to all zones, even it is slower to do so.
1579
 * We are low on memory in the second scan, and should leave no stone
1580
 * unturned looking for a free page.
1581
 */
1582
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1583
						nodemask_t *allowednodes)
1584
{
1585
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1586
	int i;				/* index of *z in zonelist zones */
1587
	int n;				/* node that zone *z is on */
1588

1589
	zlc = zonelist->zlcache_ptr;
1590
	if (!zlc)
1591
		return 1;
1592

1593
	i = z - zonelist->_zonerefs;
1594
	n = zlc->z_to_n[i];
1595

1596
	/* This zone is worth trying if it is allowed but not full */
1597
	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1598
}
1599

1600
/*
1601
 * Given 'z' scanning a zonelist, set the corresponding bit in
1602
 * zlc->fullzones, so that subsequent attempts to allocate a page
1603
 * from that zone don't waste time re-examining it.
1604
 */
1605
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1606
{
1607
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1608
	int i;				/* index of *z in zonelist zones */
1609

1610
	zlc = zonelist->zlcache_ptr;
1611
	if (!zlc)
1612
		return;
1613

1614
	i = z - zonelist->_zonerefs;
1615

1616
	set_bit(i, zlc->fullzones);
1617
}
1618

1619
#else	/* CONFIG_NUMA */
1620

1621
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1622
{
1623
	return NULL;
1624
}
1625

1626
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1627
				nodemask_t *allowednodes)
1628
{
1629
	return 1;
1630
}
1631

1632
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633
{
1634
}
1635
#endif	/* CONFIG_NUMA */
1636

1637
/*
1638
 * get_page_from_freelist goes through the zonelist trying to allocate
1639
 * a page.
1640
 */
1641
static struct page *
1642
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1643
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1644
		struct zone *preferred_zone, int migratetype)
1645
{
1646
	struct zoneref *z;
1647
	struct page *page = NULL;
1648
	int classzone_idx;
1649
	struct zone *zone;
1650
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1651
	int zlc_active = 0;		/* set if using zonelist_cache */
1652
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
1653

1654
	classzone_idx = zone_idx(preferred_zone);
1655
zonelist_scan:
1656
	/*
1657
	 * Scan zonelist, looking for a zone with enough free.
1658
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1659
	 */
1660
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
1661
						high_zoneidx, nodemask) {
1662
		if (NUMA_BUILD && zlc_active &&
1663
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
1664
				continue;
1665
		if ((alloc_flags & ALLOC_CPUSET) &&
1666
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
1667
				goto try_next_zone;
1668

1669
		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1671
			unsigned long mark;
1672
			int ret;
1673

1674
			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1675
			if (zone_watermark_ok(zone, order, mark,
1676
				    classzone_idx, alloc_flags))
1677
				goto try_this_zone;
1678

1679
			if (zone_reclaim_mode == 0)
1680
				goto this_zone_full;
1681

1682
			ret = zone_reclaim(zone, gfp_mask, order);
1683
			switch (ret) {
1684
			case ZONE_RECLAIM_NOSCAN:
1685
				/* did not scan */
1686
				goto try_next_zone;
1687
			case ZONE_RECLAIM_FULL:
1688
				/* scanned but unreclaimable */
1689
				goto this_zone_full;
1690
			default:
1691
				/* did we reclaim enough */
1692
				if (!zone_watermark_ok(zone, order, mark,
1693
						classzone_idx, alloc_flags))
1694
					goto this_zone_full;
1695
			}
1696
		}
1697

1698
try_this_zone:
1699
		page = buffered_rmqueue(preferred_zone, zone, order,
1700
						gfp_mask, migratetype);
1701
		if (page)
1702
			break;
1703
this_zone_full:
1704
		if (NUMA_BUILD)
1705
			zlc_mark_zone_full(zonelist, z);
1706
try_next_zone:
1707
		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708
			/*
1709
			 * we do zlc_setup after the first zone is tried but only
1710
			 * if there are multiple nodes make it worthwhile
1711
			 */
1712
			allowednodes = zlc_setup(zonelist, alloc_flags);
1713
			zlc_active = 1;
1714
			did_zlc_setup = 1;
1715
		}
1716
	}
1717

1718
	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1719
		/* Disable zlc cache for second zonelist scan */
1720
		zlc_active = 0;
1721
		goto zonelist_scan;
1722
	}
1723
	return page;
1724
}
1725

1726
/*
 * Large machines with many possible nodes (NODES_SHIFT > 8) should not
 * dump per-node meminfo from irq context.  On smaller configurations
 * the dump is never suppressed.
 */
static inline bool should_suppress_show_mem(void)
{
#if NODES_SHIFT > 8
	return in_interrupt();
#else
	return false;
#endif
}
1739

1740
static DEFINE_RATELIMIT_STATE(nopage_rs,
1741
		DEFAULT_RATELIMIT_INTERVAL,
1742
		DEFAULT_RATELIMIT_BURST);
1743

1744
void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745
{
1746
	va_list args;
1747
	unsigned int filter = SHOW_MEM_FILTER_NODES;
1748

1749
	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750
		return;
1751

1752
	/*
1753
	 * This documents exceptions given to allocations in certain
1754
	 * contexts that are allowed to allocate outside current's set
1755
	 * of allowed nodes.
1756
	 */
1757
	if (!(gfp_mask & __GFP_NOMEMALLOC))
1758
		if (test_thread_flag(TIF_MEMDIE) ||
1759
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760
			filter &= ~SHOW_MEM_FILTER_NODES;
1761
	if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762
		filter &= ~SHOW_MEM_FILTER_NODES;
1763

1764
	if (fmt) {
1765
		printk(KERN_WARNING);
1766
		va_start(args, fmt);
1767
		vprintk(fmt, args);
1768
		va_end(args);
1769
	}
1770

1771
	pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772
		   current->comm, order, gfp_mask);
1773

1774
	dump_stack();
1775
	if (!should_suppress_show_mem())
1776
		show_mem(filter);
1777
}
1778

1779
static inline int
1780
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1781
				unsigned long pages_reclaimed)
1782
{
1783
	/* Do not loop if specifically requested */
1784
	if (gfp_mask & __GFP_NORETRY)
1785
		return 0;
1786

1787
	/*
1788
	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1789
	 * means __GFP_NOFAIL, but that may not be true in other
1790
	 * implementations.
1791
	 */
1792
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
1793
		return 1;
1794

1795
	/*
1796
	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1797
	 * specified, then we retry until we no longer reclaim any pages
1798
	 * (above), or we've reclaimed an order of pages at least as
1799
	 * large as the allocation's order. In both cases, if the
1800
	 * allocation still fails, we stop retrying.
1801
	 */
1802
	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1803
		return 1;
1804

1805
	/*
1806
	 * Don't let big-order allocations loop unless the caller
1807
	 * explicitly requests that.
1808
	 */
1809
	if (gfp_mask & __GFP_NOFAIL)
1810
		return 1;
1811

1812
	return 0;
1813
}
1814

1815
static inline struct page *
1816
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1817
	struct zonelist *zonelist, enum zone_type high_zoneidx,
1818
	nodemask_t *nodemask, struct zone *preferred_zone,
1819
	int migratetype)
1820
{
1821
	struct page *page;
1822

1823
	/* Acquire the OOM killer lock for the zones in zonelist */
1824
	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1825
		schedule_timeout_uninterruptible(1);
1826
		return NULL;
1827
	}
1828

1829
	/*
1830
	 * Go through the zonelist yet one more time, keep very high watermark
1831
	 * here, this is only to catch a parallel oom killing, we must fail if
1832
	 * we're still under heavy pressure.
1833
	 */
1834
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1835
		order, zonelist, high_zoneidx,
1836
		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1837
		preferred_zone, migratetype);
1838
	if (page)
1839
		goto out;
1840

1841
	if (!(gfp_mask & __GFP_NOFAIL)) {
1842
		/* The OOM killer will not help higher order allocs */
1843
		if (order > PAGE_ALLOC_COSTLY_ORDER)
1844
			goto out;
1845
		/* The OOM killer does not needlessly kill tasks for lowmem */
1846
		if (high_zoneidx < ZONE_NORMAL)
1847
			goto out;
1848
		/*
1849
		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1850
		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
1851
		 * The caller should handle page allocation failure by itself if
1852
		 * it specifies __GFP_THISNODE.
1853
		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
1854
		 */
1855
		if (gfp_mask & __GFP_THISNODE)
1856
			goto out;
1857
	}
1858
	/* Exhausted what can be done so it's blamo time */
1859
	out_of_memory(zonelist, gfp_mask, order, nodemask);
1860

1861
out:
1862
	clear_zonelist_oom(zonelist, gfp_mask);
1863
	return page;
1864
}
1865

1866
#ifdef CONFIG_COMPACTION
1867
/* Try memory compaction for high-order allocations before reclaim */
1868
static struct page *
1869
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1870
	struct zonelist *zonelist, enum zone_type high_zoneidx,
1871
	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1872
	int migratetype, unsigned long *did_some_progress,
1873
	bool sync_migration)
1874
{
1875
	struct page *page;
1876

1877
	if (!order || compaction_deferred(preferred_zone))
1878
		return NULL;
1879

1880
	current->flags |= PF_MEMALLOC;
1881
	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1882
						nodemask, sync_migration);
1883
	current->flags &= ~PF_MEMALLOC;
1884
	if (*did_some_progress != COMPACT_SKIPPED) {
1885

1886
		/* Page migration frees to the PCP lists but we want merging */
1887
		drain_pages(get_cpu());
1888
		put_cpu();
1889

1890
		page = get_page_from_freelist(gfp_mask, nodemask,
1891
				order, zonelist, high_zoneidx,
1892
				alloc_flags, preferred_zone,
1893
				migratetype);
1894
		if (page) {
1895
			preferred_zone->compact_considered = 0;
1896
			preferred_zone->compact_defer_shift = 0;
1897
			count_vm_event(COMPACTSUCCESS);
1898
			return page;
1899
		}
1900

1901
		/*
1902
		 * It's bad if compaction run occurs and fails.
1903
		 * The most likely reason is that pages exist,
1904
		 * but not enough to satisfy watermarks.
1905
		 */
1906
		count_vm_event(COMPACTFAIL);
1907
		defer_compaction(preferred_zone);
1908

1909
		cond_resched();
1910
	}
1911

1912
	return NULL;
1913
}
1914
#else
1915
static inline struct page *
1916
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1917
	struct zonelist *zonelist, enum zone_type high_zoneidx,
1918
	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1919
	int migratetype, unsigned long *did_some_progress,
1920
	bool sync_migration)
1921
{
1922
	return NULL;
1923
}
1924
#endif /* CONFIG_COMPACTION */
1925

1926
/* The really slow allocator path where we enter direct reclaim */
1927
static inline struct page *
1928
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1929
	struct zonelist *zonelist, enum zone_type high_zoneidx,
1930
	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1931
	int migratetype, unsigned long *did_some_progress)
1932
{
1933
	struct page *page = NULL;
1934
	struct reclaim_state reclaim_state;
1935
	bool drained = false;
1936

1937
	cond_resched();
1938

1939
	/* We now go into synchronous reclaim */
1940
	cpuset_memory_pressure_bump();
1941
	current->flags |= PF_MEMALLOC;
1942
	lockdep_set_current_reclaim_state(gfp_mask);
1943
	reclaim_state.reclaimed_slab = 0;
1944
	current->reclaim_state = &reclaim_state;
1945

1946
	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1947

1948
	current->reclaim_state = NULL;
1949
	lockdep_clear_current_reclaim_state();
1950
	current->flags &= ~PF_MEMALLOC;
1951

1952
	cond_resched();
1953

1954
	if (unlikely(!(*did_some_progress)))
1955
		return NULL;
1956

1957
retry:
1958
	page = get_page_from_freelist(gfp_mask, nodemask, order,
1959
					zonelist, high_zoneidx,
1960
					alloc_flags, preferred_zone,
1961
					migratetype);
1962

1963
	/*
1964
	 * If an allocation failed after direct reclaim, it could be because
1965
	 * pages are pinned on the per-cpu lists. Drain them and try again
1966
	 */
1967
	if (!page && !drained) {
1968
		drain_all_pages();
1969
		drained = true;
1970
		goto retry;
1971
	}
1972

1973
	return page;
1974
}
1975

1976
/*
1977
 * This is called in the allocator slow-path if the allocation request is of
1978
 * sufficient urgency to ignore watermarks and take other desperate measures
1979
 */
1980
static inline struct page *
1981
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1982
	struct zonelist *zonelist, enum zone_type high_zoneidx,
1983
	nodemask_t *nodemask, struct zone *preferred_zone,
1984
	int migratetype)
1985
{
1986
	struct page *page;
1987

1988
	do {
1989
		page = get_page_from_freelist(gfp_mask, nodemask, order,
1990
			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1991
			preferred_zone, migratetype);
1992

1993
		if (!page && gfp_mask & __GFP_NOFAIL)
1994
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1995
	} while (!page && (gfp_mask & __GFP_NOFAIL));
1996

1997
	return page;
1998
}
1999

2000
static inline
2001
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2002
						enum zone_type high_zoneidx,
2003
						enum zone_type classzone_idx)
2004
{
2005
	struct zoneref *z;
2006
	struct zone *zone;
2007

2008
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2009
		wakeup_kswapd(zone, order, classzone_idx);
2010
}
2011

2012
static inline int
2013
gfp_to_alloc_flags(gfp_t gfp_mask)
2014
{
2015
	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2016
	const gfp_t wait = gfp_mask & __GFP_WAIT;
2017

2018
	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2019
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2020

2021
	/*
2022
	 * The caller may dip into page reserves a bit more if the caller
2023
	 * cannot run direct reclaim, or if the caller has realtime scheduling
2024
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
2025
	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2026
	 */
2027
	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2028

2029
	if (!wait) {
2030
		/*
2031
		 * Not worth trying to allocate harder for
2032
		 * __GFP_NOMEMALLOC even if it can't schedule.
2033
		 */
2034
		if  (!(gfp_mask & __GFP_NOMEMALLOC))
2035
			alloc_flags |= ALLOC_HARDER;
2036
		/*
2037
		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2038
		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2039
		 */
2040
		alloc_flags &= ~ALLOC_CPUSET;
2041
	} else if (unlikely(rt_task(current)) && !in_interrupt())
2042
		alloc_flags |= ALLOC_HARDER;
2043

2044
	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2045
		if (!in_interrupt() &&
2046
		    ((current->flags & PF_MEMALLOC) ||
2047
		     unlikely(test_thread_flag(TIF_MEMDIE))))
2048
			alloc_flags |= ALLOC_NO_WATERMARKS;
2049
	}
2050

2051
	return alloc_flags;
2052
}
2053

2054
static inline struct page *
2055
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2056
	struct zonelist *zonelist, enum zone_type high_zoneidx,
2057
	nodemask_t *nodemask, struct zone *preferred_zone,
2058
	int migratetype)
2059
{
2060
	const gfp_t wait = gfp_mask & __GFP_WAIT;
2061
	struct page *page = NULL;
2062
	int alloc_flags;
2063
	unsigned long pages_reclaimed = 0;
2064
	unsigned long did_some_progress;
2065
	bool sync_migration = false;
2066

2067
	/*
2068
	 * In the slowpath, we sanity check order to avoid ever trying to
2069
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2070
	 * be using allocators in order of preference for an area that is
2071
	 * too large.
2072
	 */
2073
	if (order >= MAX_ORDER) {
2074
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2075
		return NULL;
2076
	}
2077

2078
	/*
2079
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2080
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2081
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2082
	 * using a larger set of nodes after it has established that the
2083
	 * allowed per node queues are empty and that nodes are
2084
	 * over allocated.
2085
	 */
2086
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2087
		goto nopage;
2088

2089
restart:
2090
	if (!(gfp_mask & __GFP_NO_KSWAPD))
2091
		wake_all_kswapd(order, zonelist, high_zoneidx,
2092
						zone_idx(preferred_zone));
2093

2094
	/*
2095
	 * OK, we're below the kswapd watermark and have kicked background
2096
	 * reclaim. Now things get more complex, so set up alloc_flags according
2097
	 * to how we want to proceed.
2098
	 */
2099
	alloc_flags = gfp_to_alloc_flags(gfp_mask);
2100

2101
	/*
2102
	 * Find the true preferred zone if the allocation is unconstrained by
2103
	 * cpusets.
2104
	 */
2105
	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2106
		first_zones_zonelist(zonelist, high_zoneidx, NULL,
2107
					&preferred_zone);
2108

2109
rebalance:
2110
	/* This is the last chance, in general, before the goto nopage. */
2111
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2112
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2113
			preferred_zone, migratetype);
2114
	if (page)
2115
		goto got_pg;
2116

2117
	/* Allocate without watermarks if the context allows */
2118
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
2119
		page = __alloc_pages_high_priority(gfp_mask, order,
2120
				zonelist, high_zoneidx, nodemask,
2121
				preferred_zone, migratetype);
2122
		if (page)
2123
			goto got_pg;
2124
	}
2125

2126
	/* Atomic allocations - we can't balance anything */
2127
	if (!wait)
2128
		goto nopage;
2129

2130
	/* Avoid recursion of direct reclaim */
2131
	if (current->flags & PF_MEMALLOC)
2132
		goto nopage;
2133

2134
	/* Avoid allocations with no watermarks from looping endlessly */
2135
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2136
		goto nopage;
2137

2138
	/*
2139
	 * Try direct compaction. The first pass is asynchronous. Subsequent
2140
	 * attempts after direct reclaim are synchronous
2141
	 */
2142
	page = __alloc_pages_direct_compact(gfp_mask, order,
2143
					zonelist, high_zoneidx,
2144
					nodemask,
2145
					alloc_flags, preferred_zone,
2146
					migratetype, &did_some_progress,
2147
					sync_migration);
2148
	if (page)
2149
		goto got_pg;
2150
	sync_migration = true;
2151

2152
	/* Try direct reclaim and then allocating */
2153
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
2154
					zonelist, high_zoneidx,
2155
					nodemask,
2156
					alloc_flags, preferred_zone,
2157
					migratetype, &did_some_progress);
2158
	if (page)
2159
		goto got_pg;
2160

2161
	/*
2162
	 * If we failed to make any progress reclaiming, then we are
2163
	 * running out of options and have to consider going OOM
2164
	 */
2165
	if (!did_some_progress) {
2166
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2167
			if (oom_killer_disabled)
2168
				goto nopage;
2169
			page = __alloc_pages_may_oom(gfp_mask, order,
2170
					zonelist, high_zoneidx,
2171
					nodemask, preferred_zone,
2172
					migratetype);
2173
			if (page)
2174
				goto got_pg;
2175

2176
			if (!(gfp_mask & __GFP_NOFAIL)) {
2177
				/*
2178
				 * The oom killer is not called for high-order
2179
				 * allocations that may fail, so if no progress
2180
				 * is being made, there are no other options and
2181
				 * retrying is unlikely to help.
2182
				 */
2183
				if (order > PAGE_ALLOC_COSTLY_ORDER)
2184
					goto nopage;
2185
				/*
2186
				 * The oom killer is not called for lowmem
2187
				 * allocations to prevent needlessly killing
2188
				 * innocent tasks.
2189
				 */
2190
				if (high_zoneidx < ZONE_NORMAL)
2191
					goto nopage;
2192
			}
2193

2194
			goto restart;
2195
		}
2196
	}
2197

2198
	/* Check if we should retry the allocation */
2199
	pages_reclaimed += did_some_progress;
2200
	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2201
		/* Wait for some write requests to complete then retry */
2202
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2203
		goto rebalance;
2204
	} else {
2205
		/*
2206
		 * High-order allocations do not necessarily loop after
2207
		 * direct reclaim and reclaim/compaction depends on compaction
2208
		 * being called after reclaim so call directly if necessary
2209
		 */
2210
		page = __alloc_pages_direct_compact(gfp_mask, order,
2211
					zonelist, high_zoneidx,
2212
					nodemask,
2213
					alloc_flags, preferred_zone,
2214
					migratetype, &did_some_progress,
2215
					sync_migration);
2216
		if (page)
2217
			goto got_pg;
2218
	}
2219

2220
nopage:
2221
	warn_alloc_failed(gfp_mask, order, NULL);
2222
	return page;
2223
got_pg:
2224
	if (kmemcheck_enabled)
2225
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2226
	return page;
2227

2228
}
2229

2230
/*
2231
 * This is the 'heart' of the zoned buddy allocator.
2232
 */
2233
struct page *
2234
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2235
			struct zonelist *zonelist, nodemask_t *nodemask)
2236
{
2237
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2238
	struct zone *preferred_zone;
2239
	struct page *page;
2240
	int migratetype = allocflags_to_migratetype(gfp_mask);
2241

2242
	gfp_mask &= gfp_allowed_mask;
2243

2244
	lockdep_trace_alloc(gfp_mask);
2245

2246
	might_sleep_if(gfp_mask & __GFP_WAIT);
2247

2248
	if (should_fail_alloc_page(gfp_mask, order))
2249
		return NULL;
2250

2251
	/*
2252
	 * Check the zones suitable for the gfp_mask contain at least one
2253
	 * valid zone. It's possible to have an empty zonelist as a result
2254
	 * of GFP_THISNODE and a memoryless node
2255
	 */
2256
	if (unlikely(!zonelist->_zonerefs->zone))
2257
		return NULL;
2258

2259
	get_mems_allowed();
2260
	/* The preferred zone is used for statistics later */
2261
	first_zones_zonelist(zonelist, high_zoneidx,
2262
				nodemask ? : &cpuset_current_mems_allowed,
2263
				&preferred_zone);
2264
	if (!preferred_zone) {
2265
		put_mems_allowed();
2266
		return NULL;
2267
	}
2268

2269
	/* First allocation attempt */
2270
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2271
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
2272
			preferred_zone, migratetype);
2273
	if (unlikely(!page))
2274
		page = __alloc_pages_slowpath(gfp_mask, order,
2275
				zonelist, high_zoneidx, nodemask,
2276
				preferred_zone, migratetype);
2277
	put_mems_allowed();
2278

2279
	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2280
	return page;
2281
}
2282
EXPORT_SYMBOL(__alloc_pages_nodemask);
2283

2284
/*
2285
 * Common helper functions.
2286
 */
2287
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2288
{
2289
	struct page *page;
2290

2291
	/*
2292
	 * __get_free_pages() returns a 32-bit address, which cannot represent
2293
	 * a highmem page
2294
	 */
2295
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2296

2297
	page = alloc_pages(gfp_mask, order);
2298
	if (!page)
2299
		return 0;
2300
	return (unsigned long) page_address(page);
2301
}
2302
EXPORT_SYMBOL(__get_free_pages);
2303

2304
/*
 * Allocate a single page with __GFP_ZERO added to @gfp_mask.  Returns what
 * __get_free_pages() returns.
 */
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	const gfp_t zeroing_mask = gfp_mask | __GFP_ZERO;

	return __get_free_pages(zeroing_mask, 0);
}
EXPORT_SYMBOL(get_zeroed_page);
2309

2310
void __pagevec_free(struct pagevec *pvec)
2311
{
2312
	int i = pagevec_count(pvec);
2313

2314
	while (--i >= 0) {
2315
		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
2316
		free_hot_cold_page(pvec->pages[i], pvec->cold);
2317
	}
2318
}
2319

2320
/*
 * Drop one reference on @page and, if put_page_testzero() reports that it
 * was the last one, free the 2^order block.
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;

	/* Single pages take the hot/cold path; larger blocks do not */
	if (order == 0)
		free_hot_cold_page(page, 0);
	else
		__free_pages_ok(page, order);
}

EXPORT_SYMBOL(__free_pages);
2331

2332
/*
 * Address-based counterpart of __free_pages().  An @addr of 0 is silently
 * ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	void *vaddr = (void *)addr;

	if (addr == 0)
		return;

	VM_BUG_ON(!virt_addr_valid(vaddr));
	__free_pages(virt_to_page(vaddr), order);
}

EXPORT_SYMBOL(free_pages);
2341

2342
/*
 * Trim a 2^order allocation starting at @addr down to PAGE_ALIGN(@size)
 * bytes: the block is split into single pages and every page past the used
 * part is freed.  An @addr of 0 is passed through as NULL.
 */
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
	unsigned long block_end;
	unsigned long cur;

	if (!addr)
		return (void *)addr;

	block_end = addr + (PAGE_SIZE << order);

	split_page(virt_to_page((void *)addr), order);

	/* Give back the tail pages the caller did not ask for */
	for (cur = addr + PAGE_ALIGN(size); cur < block_end; cur += PAGE_SIZE)
		free_page(cur);

	return (void *)addr;
}
2356

2357
/**
2358
 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2359
 * @size: the number of bytes to allocate
2360
 * @gfp_mask: GFP flags for the allocation
2361
 *
2362
 * This function is similar to alloc_pages(), except that it allocates the
2363
 * minimum number of pages to satisfy the request.  alloc_pages() can only
2364
 * allocate memory in power-of-two pages.
2365
 *
2366
 * This function is also limited by MAX_ORDER.
2367
 *
2368
 * Memory allocated by this function must be released by free_pages_exact().
2369
 */
2370
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2371
{
2372
	unsigned int order = get_order(size);
2373
	unsigned long addr;
2374

2375
	addr = __get_free_pages(gfp_mask, order);
2376
	return make_alloc_exact(addr, order, size);
2377
}
2378
EXPORT_SYMBOL(alloc_pages_exact);
2379

2380
/**
2381
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2382
 *			   pages on a node.
2383
 * @nid: the preferred node ID where memory should be allocated
2384
 * @size: the number of bytes to allocate
2385
 * @gfp_mask: GFP flags for the allocation
2386
 *
2387
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2388
 * back.
2389
 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2390
 * but is not exact.
2391
 */
2392
void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2393
{
2394
	unsigned order = get_order(size);
2395
	struct page *p = alloc_pages_node(nid, gfp_mask, order);
2396
	if (!p)
2397
		return NULL;
2398
	return make_alloc_exact((unsigned long)page_address(p), order, size);
2399
}
2400
EXPORT_SYMBOL(alloc_pages_exact_nid);
2401

2402
/**
2403
 * free_pages_exact - release memory allocated via alloc_pages_exact()
2404
 * @virt: the value returned by alloc_pages_exact.
2405
 * @size: size of allocation, same value as passed to alloc_pages_exact().
2406
 *
2407
 * Release the memory allocated by a previous call to alloc_pages_exact.
2408
 */
2409
void free_pages_exact(void *virt, size_t size)
2410
{
2411
	unsigned long addr = (unsigned long)virt;
2412
	unsigned long end = addr + PAGE_ALIGN(size);
2413

2414
	while (addr < end) {
2415
		free_page(addr);
2416
		addr += PAGE_SIZE;
2417
	}
2418
}
2419
EXPORT_SYMBOL(free_pages_exact);
2420

2421
static unsigned int nr_free_zone_pages(int offset)
2422
{
2423
	struct zoneref *z;
2424
	struct zone *zone;
2425

2426
	/* Just pick one node, since fallback list is circular */
2427
	unsigned int sum = 0;
2428

2429
	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2430

2431
	for_each_zone_zonelist(zone, z, zonelist, offset) {
2432
		unsigned long size = zone->present_pages;
2433
		unsigned long high = high_wmark_pages(zone);
2434
		if (size > high)
2435
			sum += size - high;
2436
	}
2437

2438
	return sum;
2439
}
2440

2441
/*
2442
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2443
 */
2444
unsigned int nr_free_buffer_pages(void)
2445
{
2446
	return nr_free_zone_pages(gfp_zone(GFP_USER));
2447
}
2448
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2449

2450
/*
2451
 * Amount of free RAM allocatable within all zones
2452
 */
2453
unsigned int nr_free_pagecache_pages(void)
2454
{
2455
	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2456
}
2457

2458
static inline void show_node(struct zone *zone)
2459
{
2460
	if (NUMA_BUILD)
2461
		printk("Node %d ", zone_to_nid(zone));
2462
}
2463

2464
void si_meminfo(struct sysinfo *val)
2465
{
2466
	val->totalram = totalram_pages;
2467
	val->sharedram = 0;
2468
	val->freeram = global_page_state(NR_FREE_PAGES);
2469
	val->bufferram = nr_blockdev_pages();
2470
	val->totalhigh = totalhigh_pages;
2471
	val->freehigh = nr_free_highpages();
2472
	val->mem_unit = PAGE_SIZE;
2473
}
2474

2475
EXPORT_SYMBOL(si_meminfo);
2476

2477
#ifdef CONFIG_NUMA
/*
 * Per-node variant of si_meminfo(): fill @val with the totals of node @nid in
 * units of PAGE_SIZE.  Only totalram, freeram, totalhigh, freehigh and
 * mem_unit are written; the highmem fields are 0 without CONFIG_HIGHMEM.
 */
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
#ifdef CONFIG_HIGHMEM
	struct zone *highmem = &pgdat->node_zones[ZONE_HIGHMEM];
#endif

	val->totalram = pgdat->node_present_pages;
	val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
	val->totalhigh = highmem->present_pages;
	val->freehigh = zone_page_state(highmem, NR_FREE_PAGES);
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
	val->mem_unit = PAGE_SIZE;
}
#endif
2495

2496
/*
2497
 * Determine whether the node should be displayed or not, depending on whether
2498
 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2499
 */
2500
bool skip_free_areas_node(unsigned int flags, int nid)
2501
{
2502
	bool ret = false;
2503

2504
	if (!(flags & SHOW_MEM_FILTER_NODES))
2505
		goto out;
2506

2507
	get_mems_allowed();
2508
	ret = !node_isset(nid, cpuset_current_mems_allowed);
2509
	put_mems_allowed();
2510
out:
2511
	return ret;
2512
}
2513

2514
#define K(x) ((x) << (PAGE_SHIFT-10))
2515

2516
/*
2517
 * Show free area list (used inside shift_scroll-lock stuff)
2518
 * We also calculate the percentage fragmentation. We do this by counting the
2519
 * memory on each free list with the exception of the first item on the list.
2520
 * Suppresses nodes that are not allowed by current's cpuset if
2521
 * SHOW_MEM_FILTER_NODES is passed.
2522
 */
2523
void show_free_areas(unsigned int filter)
2524
{
2525
	int cpu;
2526
	struct zone *zone;
2527

2528
	for_each_populated_zone(zone) {
2529
		if (skip_free_areas_node(filter, zone_to_nid(zone)))
2530
			continue;
2531
		show_node(zone);
2532
		printk("%s per-cpu:\n", zone->name);
2533

2534
		for_each_online_cpu(cpu) {
2535
			struct per_cpu_pageset *pageset;
2536

2537
			pageset = per_cpu_ptr(zone->pageset, cpu);
2538

2539
			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2540
			       cpu, pageset->pcp.high,
2541
			       pageset->pcp.batch, pageset->pcp.count);
2542
		}
2543
	}
2544

2545
	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2546
		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2547
		" unevictable:%lu"
2548
		" dirty:%lu writeback:%lu unstable:%lu\n"
2549
		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2550
		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2551
		global_page_state(NR_ACTIVE_ANON),
2552
		global_page_state(NR_INACTIVE_ANON),
2553
		global_page_state(NR_ISOLATED_ANON),
2554
		global_page_state(NR_ACTIVE_FILE),
2555
		global_page_state(NR_INACTIVE_FILE),
2556
		global_page_state(NR_ISOLATED_FILE),
2557
		global_page_state(NR_UNEVICTABLE),
2558
		global_page_state(NR_FILE_DIRTY),
2559
		global_page_state(NR_WRITEBACK),
2560
		global_page_state(NR_UNSTABLE_NFS),
2561
		global_page_state(NR_FREE_PAGES),
2562
		global_page_state(NR_SLAB_RECLAIMABLE),
2563
		global_page_state(NR_SLAB_UNRECLAIMABLE),
2564
		global_page_state(NR_FILE_MAPPED),
2565
		global_page_state(NR_SHMEM),
2566
		global_page_state(NR_PAGETABLE),
2567
		global_page_state(NR_BOUNCE));
2568

2569
	for_each_populated_zone(zone) {
2570
		int i;
2571

2572
		if (skip_free_areas_node(filter, zone_to_nid(zone)))
2573
			continue;
2574
		show_node(zone);
2575
		printk("%s"
2576
			" free:%lukB"
2577
			" min:%lukB"
2578
			" low:%lukB"
2579
			" high:%lukB"
2580
			" active_anon:%lukB"
2581
			" inactive_anon:%lukB"
2582
			" active_file:%lukB"
2583
			" inactive_file:%lukB"
2584
			" unevictable:%lukB"
2585
			" isolated(anon):%lukB"
2586
			" isolated(file):%lukB"
2587
			" present:%lukB"
2588
			" mlocked:%lukB"
2589
			" dirty:%lukB"
2590
			" writeback:%lukB"
2591
			" mapped:%lukB"
2592
			" shmem:%lukB"
2593
			" slab_reclaimable:%lukB"
2594
			" slab_unreclaimable:%lukB"
2595
			" kernel_stack:%lukB"
2596
			" pagetables:%lukB"
2597
			" unstable:%lukB"
2598
			" bounce:%lukB"
2599
			" writeback_tmp:%lukB"
2600
			" pages_scanned:%lu"
2601
			" all_unreclaimable? %s"
2602
			"\n",
2603
			zone->name,
2604
			K(zone_page_state(zone, NR_FREE_PAGES)),
2605
			K(min_wmark_pages(zone)),
2606
			K(low_wmark_pages(zone)),
2607
			K(high_wmark_pages(zone)),
2608
			K(zone_page_state(zone, NR_ACTIVE_ANON)),
2609
			K(zone_page_state(zone, NR_INACTIVE_ANON)),
2610
			K(zone_page_state(zone, NR_ACTIVE_FILE)),
2611
			K(zone_page_state(zone, NR_INACTIVE_FILE)),
2612
			K(zone_page_state(zone, NR_UNEVICTABLE)),
2613
			K(zone_page_state(zone, NR_ISOLATED_ANON)),
2614
			K(zone_page_state(zone, NR_ISOLATED_FILE)),
2615
			K(zone->present_pages),
2616
			K(zone_page_state(zone, NR_MLOCK)),
2617
			K(zone_page_state(zone, NR_FILE_DIRTY)),
2618
			K(zone_page_state(zone, NR_WRITEBACK)),
2619
			K(zone_page_state(zone, NR_FILE_MAPPED)),
2620
			K(zone_page_state(zone, NR_SHMEM)),
2621
			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2622
			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2623
			zone_page_state(zone, NR_KERNEL_STACK) *
2624
				THREAD_SIZE / 1024,
2625
			K(zone_page_state(zone, NR_PAGETABLE)),
2626
			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2627
			K(zone_page_state(zone, NR_BOUNCE)),
2628
			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2629
			zone->pages_scanned,
2630
			(zone->all_unreclaimable ? "yes" : "no")
2631
			);
2632
		printk("lowmem_reserve[]:");
2633
		for (i = 0; i < MAX_NR_ZONES; i++)
2634
			printk(" %lu", zone->lowmem_reserve[i]);
2635
		printk("\n");
2636
	}
2637

2638
	for_each_populated_zone(zone) {
2639
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
2640

2641
		if (skip_free_areas_node(filter, zone_to_nid(zone)))
2642
			continue;
2643
		show_node(zone);
2644
		printk("%s: ", zone->name);
2645

2646
		spin_lock_irqsave(&zone->lock, flags);
2647
		for (order = 0; order < MAX_ORDER; order++) {
2648
			nr[order] = zone->free_area[order].nr_free;
2649
			total += nr[order] << order;
2650
		}
2651
		spin_unlock_irqrestore(&zone->lock, flags);
2652
		for (order = 0; order < MAX_ORDER; order++)
2653
			printk("%lu*%lukB ", nr[order], K(1UL) << order);
2654
		printk("= %lukB\n", K(total));
2655
	}
2656

2657
	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2658

2659
	show_swap_cache_info();
2660
}
2661

2662
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2663
{
2664
	zoneref->zone = zone;
2665
	zoneref->zone_idx = zone_idx(zone);
2666
}
2667

2668
/*
2669
 * Builds allocation fallback zone lists.
2670
 *
2671
 * Add all populated zones of a node to the zonelist.
2672
 */
2673
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2674
				int nr_zones, enum zone_type zone_type)
2675
{
2676
	struct zone *zone;
2677

2678
	BUG_ON(zone_type >= MAX_NR_ZONES);
2679
	zone_type++;
2680

2681
	do {
2682
		zone_type--;
2683
		zone = pgdat->node_zones + zone_type;
2684
		if (populated_zone(zone)) {
2685
			zoneref_set_zone(zone,
2686
				&zonelist->_zonerefs[nr_zones++]);
2687
			check_highest_zone(zone_type);
2688
		}
2689

2690
	} while (zone_type);
2691
	return nr_zones;
2692
}
2693

2694

2695
/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

/*
 * Zonelist order currently used by the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;

/* Printable name of each order, indexed by ZONELIST_ORDER_* */
static char zonelist_order_name[3][8] = {
	[ZONELIST_ORDER_DEFAULT] = "Default",
	[ZONELIST_ORDER_NODE]    = "Node",
	[ZONELIST_ORDER_ZONE]    = "Zone",
};
2713

2714

2715
#ifdef CONFIG_NUMA
2716
/* The value user specified ....changed by config */
2717
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2718
/* string for sysctl */
2719
#define NUMA_ZONELIST_ORDER_LEN	16
2720
char numa_zonelist_order[16] = "default";
2721

2722
/*
2723
 * interface for configure zonelist ordering.
2724
 * command line option "numa_zonelist_order"
2725
 *	= "[dD]efault	- default, automatic configuration.
2726
 *	= "[nN]ode 	- order by node locality, then by zone within node
2727
 *	= "[zZ]one      - order by zone, then by locality within zone
2728
 */
2729

2730
static int __parse_numa_zonelist_order(char *s)
2731
{
2732
	if (*s == 'd' || *s == 'D') {
2733
		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2734
	} else if (*s == 'n' || *s == 'N') {
2735
		user_zonelist_order = ZONELIST_ORDER_NODE;
2736
	} else if (*s == 'z' || *s == 'Z') {
2737
		user_zonelist_order = ZONELIST_ORDER_ZONE;
2738
	} else {
2739
		printk(KERN_WARNING
2740
			"Ignoring invalid numa_zonelist_order value:  "
2741
			"%s\n", s);
2742
		return -EINVAL;
2743
	}
2744
	return 0;
2745
}
2746

2747
static __init int setup_numa_zonelist_order(char *s)
2748
{
2749
	int ret;
2750

2751
	if (!s)
2752
		return 0;
2753

2754
	ret = __parse_numa_zonelist_order(s);
2755
	if (ret == 0)
2756
		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2757

2758
	return ret;
2759
}
2760
early_param("numa_zonelist_order", setup_numa_zonelist_order);
2761

2762
/*
2763
 * sysctl handler for numa_zonelist_order
2764
 */
2765
int numa_zonelist_order_handler(ctl_table *table, int write,
2766
		void __user *buffer, size_t *length,
2767
		loff_t *ppos)
2768
{
2769
	char saved_string[NUMA_ZONELIST_ORDER_LEN];
2770
	int ret;
2771
	static DEFINE_MUTEX(zl_order_mutex);
2772

2773
	mutex_lock(&zl_order_mutex);
2774
	if (write)
2775
		strcpy(saved_string, (char*)table->data);
2776
	ret = proc_dostring(table, write, buffer, length, ppos);
2777
	if (ret)
2778
		goto out;
2779
	if (write) {
2780
		int oldval = user_zonelist_order;
2781
		if (__parse_numa_zonelist_order((char*)table->data)) {
2782
			/*
2783
			 * bogus value.  restore saved string
2784
			 */
2785
			strncpy((char*)table->data, saved_string,
2786
				NUMA_ZONELIST_ORDER_LEN);
2787
			user_zonelist_order = oldval;
2788
		} else if (oldval != user_zonelist_order) {
2789
			mutex_lock(&zonelists_mutex);
2790
			build_all_zonelists(NULL);
2791
			mutex_unlock(&zonelists_mutex);
2792
		}
2793
	}
2794
out:
2795
	mutex_unlock(&zl_order_mutex);
2796
	return ret;
2797
}
2798

2799

2800
#define MAX_NODE_LOAD (nr_online_nodes)
2801
static int node_load[MAX_NUMNODES];
2802

2803
/**
2804
 * find_next_best_node - find the next node that should appear in a given node's fallback list
2805
 * @node: node whose fallback list we're appending
2806
 * @used_node_mask: nodemask_t of already used nodes
2807
 *
2808
 * We use a number of factors to determine which is the next node that should
2809
 * appear on a given node's fallback list.  The node should not have appeared
2810
 * already in @node's fallback list, and it should be the next closest node
2811
 * according to the distance array (which contains arbitrary distance values
2812
 * from each node to each node in the system), and should also prefer nodes
2813
 * with no CPUs, since presumably they'll have very little allocation pressure
2814
 * on them otherwise.
2815
 * It returns -1 if no node is found.
2816
 */
2817
static int find_next_best_node(int node, nodemask_t *used_node_mask)
2818
{
2819
	int n, val;
2820
	int min_val = INT_MAX;
2821
	int best_node = -1;
2822
	const struct cpumask *tmp = cpumask_of_node(0);
2823

2824
	/* Use the local node if we haven't already */
2825
	if (!node_isset(node, *used_node_mask)) {
2826
		node_set(node, *used_node_mask);
2827
		return node;
2828
	}
2829

2830
	for_each_node_state(n, N_HIGH_MEMORY) {
2831

2832
		/* Don't want a node to appear more than once */
2833
		if (node_isset(n, *used_node_mask))
2834
			continue;
2835

2836
		/* Use the distance array to find the distance */
2837
		val = node_distance(node, n);
2838

2839
		/* Penalize nodes under us ("prefer the next node") */
2840
		val += (n < node);
2841

2842
		/* Give preference to headless and unused nodes */
2843
		tmp = cpumask_of_node(n);
2844
		if (!cpumask_empty(tmp))
2845
			val += PENALTY_FOR_NODE_WITH_CPUS;
2846

2847
		/* Slight preference for less loaded node */
2848
		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
2849
		val += node_load[n];
2850

2851
		if (val < min_val) {
2852
			min_val = val;
2853
			best_node = n;
2854
		}
2855
	}
2856

2857
	if (best_node >= 0)
2858
		node_set(best_node, *used_node_mask);
2859

2860
	return best_node;
2861
}
2862

2863

2864
/*
2865
 * Build zonelists ordered by node and zones within node.
2866
 * This results in maximum locality--normal zone overflows into local
2867
 * DMA zone, if any--but risks exhausting DMA zone.
2868
 */
2869
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2870
{
2871
	int j;
2872
	struct zonelist *zonelist;
2873

2874
	zonelist = &pgdat->node_zonelists[0];
2875
	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2876
		;
2877
	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2878
							MAX_NR_ZONES - 1);
2879
	zonelist->_zonerefs[j].zone = NULL;
2880
	zonelist->_zonerefs[j].zone_idx = 0;
2881
}
2882

2883
/*
2884
 * Build gfp_thisnode zonelists
2885
 */
2886
static void build_thisnode_zonelists(pg_data_t *pgdat)
2887
{
2888
	int j;
2889
	struct zonelist *zonelist;
2890

2891
	zonelist = &pgdat->node_zonelists[1];
2892
	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2893
	zonelist->_zonerefs[j].zone = NULL;
2894
	zonelist->_zonerefs[j].zone_idx = 0;
2895
}
2896

2897
/*
2898
 * Build zonelists ordered by zone and nodes within zones.
2899
 * This results in conserving DMA zone[s] until all Normal memory is
2900
 * exhausted, but results in overflowing to remote node while memory
2901
 * may still exist in local DMA zone.
2902
 */
2903
static int node_order[MAX_NUMNODES];
2904

2905
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2906
{
2907
	int pos, j, node;
2908
	int zone_type;		/* needs to be signed */
2909
	struct zone *z;
2910
	struct zonelist *zonelist;
2911

2912
	zonelist = &pgdat->node_zonelists[0];
2913
	pos = 0;
2914
	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2915
		for (j = 0; j < nr_nodes; j++) {
2916
			node = node_order[j];
2917
			z = &NODE_DATA(node)->node_zones[zone_type];
2918
			if (populated_zone(z)) {
2919
				zoneref_set_zone(z,
2920
					&zonelist->_zonerefs[pos++]);
2921
				check_highest_zone(zone_type);
2922
			}
2923
		}
2924
	}
2925
	zonelist->_zonerefs[pos].zone = NULL;
2926
	zonelist->_zonerefs[pos].zone_idx = 0;
2927
}
2928

2929
static int default_zonelist_order(void)
2930
{
2931
	int nid, zone_type;
2932
	unsigned long low_kmem_size,total_size;
2933
	struct zone *z;
2934
	int average_size;
2935
	/*
2936
         * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
2937
	 * If they are really small and used heavily, the system can fall
2938
	 * into OOM very easily.
2939
	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
2940
	 */
2941
	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2942
	low_kmem_size = 0;
2943
	total_size = 0;
2944
	for_each_online_node(nid) {
2945
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2946
			z = &NODE_DATA(nid)->node_zones[zone_type];
2947
			if (populated_zone(z)) {
2948
				if (zone_type < ZONE_NORMAL)
2949
					low_kmem_size += z->present_pages;
2950
				total_size += z->present_pages;
2951
			} else if (zone_type == ZONE_NORMAL) {
2952
				/*
2953
				 * If any node has only lowmem, then node order
2954
				 * is preferred to allow kernel allocations
2955
				 * locally; otherwise, they can easily infringe
2956
				 * on other nodes when there is an abundance of
2957
				 * lowmem available to allocate from.
2958
				 */
2959
				return ZONELIST_ORDER_NODE;
2960
			}
2961
		}
2962
	}
2963
	if (!low_kmem_size ||  /* there are no DMA area. */
2964
	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
2965
		return ZONELIST_ORDER_NODE;
2966
	/*
2967
	 * look into each node's config.
2968
  	 * If there is a node whose DMA/DMA32 memory is very big area on
2969
 	 * local memory, NODE_ORDER may be suitable.
2970
         */
2971
	average_size = total_size /
2972
				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2973
	for_each_online_node(nid) {
2974
		low_kmem_size = 0;
2975
		total_size = 0;
2976
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2977
			z = &NODE_DATA(nid)->node_zones[zone_type];
2978
			if (populated_zone(z)) {
2979
				if (zone_type < ZONE_NORMAL)
2980
					low_kmem_size += z->present_pages;
2981
				total_size += z->present_pages;
2982
			}
2983
		}
2984
		if (low_kmem_size &&
2985
		    total_size > average_size && /* ignore small node */
2986
		    low_kmem_size > total_size * 70/100)
2987
			return ZONELIST_ORDER_NODE;
2988
	}
2989
	return ZONELIST_ORDER_ZONE;
2990
}
2991

2992
static void set_zonelist_order(void)
2993
{
2994
	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
2995
		current_zonelist_order = default_zonelist_order();
2996
	else
2997
		current_zonelist_order = user_zonelist_order;
2998
}
2999

3000
/*
 * NUMA variant: build the zonelists of @pgdat.
 *
 * Nodes are visited in the order handed out by find_next_best_node().
 * Depending on current_zonelist_order the zonelist is either extended node
 * by node as they are found, or the node order is recorded in node_order[]
 * and the zonelist is built in zone order afterwards.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int order = current_zonelist_order;
	int local_node = pgdat->node_id;
	int prev_node = local_node;
	int load = nr_online_nodes;
	int nr_ordered = 0;
	int node;
	enum zone_type idx;
	nodemask_t used_mask;
	struct zonelist *zl;

	/* Start every zonelist of this node out empty. */
	for (idx = 0; idx < MAX_ZONELISTS; idx++) {
		zl = pgdat->node_zonelists + idx;
		zl->_zonerefs[0].zone = NULL;
		zl->_zonerefs[0].zone_idx = 0;
	}

	/* NUMA-aware ordering of nodes */
	nodes_clear(used_mask);
	memset(node_order, 0, sizeof(node_order));

	for (;;) {
		int distance;

		node = find_next_best_node(local_node, &used_mask);
		if (node < 0)
			break;

		distance = node_distance(local_node, node);

		/*
		 * If another node is sufficiently far away then it is better
		 * to reclaim pages in a zone before going off node.
		 */
		if (distance > RECLAIM_DISTANCE)
			zone_reclaim_mode = 1;

		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (distance != node_distance(local_node, prev_node))
			node_load[node] = load;

		prev_node = node;
		load--;

		if (order == ZONELIST_ORDER_NODE)
			build_zonelists_in_node_order(pgdat, node);
		else
			node_order[nr_ordered++] = node;	/* remember order */
	}

	/* calculate node order -- i.e., DMA last! */
	if (order == ZONELIST_ORDER_ZONE)
		build_zonelists_in_zone_order(pgdat, nr_ordered);

	build_thisnode_zonelists(pgdat);
}
3058

3059
/* Construct the zonelist performance cache - see further mmzone.h */
3060
static void build_zonelist_cache(pg_data_t *pgdat)
3061
{
3062
	struct zonelist *zonelist;
3063
	struct zonelist_cache *zlc;
3064
	struct zoneref *z;
3065

3066
	zonelist = &pgdat->node_zonelists[0];
3067
	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3068
	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3069
	for (z = zonelist->_zonerefs; z->zone; z++)
3070
		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3071
}
3072

3073
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3074
/*
3075
 * Return node id of node used for "local" allocations.
3076
 * I.e., first node id of first zone in arg node's generic zonelist.
3077
 * Used for initializing percpu 'numa_mem', which is used primarily
3078
 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3079
 */
3080
int local_memory_node(int node)
3081
{
3082
	struct zone *zone;
3083

3084
	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3085
				   gfp_zone(GFP_KERNEL),
3086
				   NULL,
3087
				   &zone);
3088
	return zone->node;
3089
}
3090
#endif
3091

3092
#else	/* CONFIG_NUMA */
3093

3094
static void set_zonelist_order(void)
3095
{
3096
	current_zonelist_order = ZONELIST_ORDER_ZONE;
3097
}
3098

3099
/*
 * Non-NUMA variant: fill the first zonelist of @pgdat with the zones of the
 * local node followed by the zones of every other online node.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int local_node = pgdat->node_id;
	struct zonelist *zl = &pgdat->node_zonelists[0];
	enum zone_type nr_refs;
	int nid;

	nr_refs = build_zonelists_node(pgdat, zl, 0, MAX_NR_ZONES - 1);

	/*
	 * Now we build the zonelist so that it contains the zones
	 * of all the other nodes.
	 * We don't want to pressure a particular node, so when
	 * building the zones for node N, we make sure that the
	 * zones coming right after the local ones are those from
	 * node N+1 (modulo N)
	 */
	for (nid = local_node + 1; nid < MAX_NUMNODES; nid++) {
		if (node_online(nid))
			nr_refs = build_zonelists_node(NODE_DATA(nid), zl,
						nr_refs, MAX_NR_ZONES - 1);
	}
	for (nid = 0; nid < local_node; nid++) {
		if (node_online(nid))
			nr_refs = build_zonelists_node(NODE_DATA(nid), zl,
						nr_refs, MAX_NR_ZONES - 1);
	}

	/* Terminate the list. */
	zl->_zonerefs[nr_refs].zone = NULL;
	zl->_zonerefs[nr_refs].zone_idx = 0;
}
3134

3135
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3136
static void build_zonelist_cache(pg_data_t *pgdat)
3137
{
3138
	pgdat->node_zonelists[0].zlcache_ptr = NULL;
3139
}
3140

3141
#endif	/* CONFIG_NUMA */
3142

3143
/*
3144
 * Boot pageset table. One per cpu which is going to be used for all
3145
 * zones and all nodes. The parameters will be set in such a way
3146
 * that an item put on a list will immediately be handed over to
3147
 * the buddy list. This is safe since pageset manipulation is done
3148
 * with interrupts disabled.
3149
 *
3150
 * The boot_pagesets must be kept even after bootup is complete for
3151
 * unused processors and/or zones. They do play a role for bootstrapping
3152
 * hotplugged processors.
3153
 *
3154
 * zoneinfo_show() and maybe other functions do
3155
 * not check if the processor is online before following the pageset pointer.
3156
 * Other parts of the kernel may not check if the zone is available.
3157
 */
3158
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3159
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3160
static void setup_zone_pageset(struct zone *zone);
3161

3162
/*
3163
 * Global mutex to protect against size modification of zonelists
3164
 * as well as to serialize pageset setup for the new populated zone.
3165
 */
3166
DEFINE_MUTEX(zonelists_mutex);
3167

3168
/* return values int ....just for stop_machine() */
3169
static __init_refok int __build_all_zonelists(void *data)
3170
{
3171
	int nid;
3172
	int cpu;
3173

3174
#ifdef CONFIG_NUMA
3175
	memset(node_load, 0, sizeof(node_load));
3176
#endif
3177
	for_each_online_node(nid) {
3178
		pg_data_t *pgdat = NODE_DATA(nid);
3179

3180
		build_zonelists(pgdat);
3181
		build_zonelist_cache(pgdat);
3182
	}
3183

3184
	/*
3185
	 * Initialize the boot_pagesets that are going to be used
3186
	 * for bootstrapping processors. The real pagesets for
3187
	 * each zone will be allocated later when the per cpu
3188
	 * allocator is available.
3189
	 *
3190
	 * boot_pagesets are used also for bootstrapping offline
3191
	 * cpus if the system is already booted because the pagesets
3192
	 * are needed to initialize allocators on a specific cpu too.
3193
	 * F.e. the percpu allocator needs the page allocator which
3194
	 * needs the percpu allocator in order to allocate its pagesets
3195
	 * (a chicken-egg dilemma).
3196
	 */
3197
	for_each_possible_cpu(cpu) {
3198
		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3199

3200
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3201
		/*
3202
		 * We now know the "local memory node" for each node--
3203
		 * i.e., the node of the first zone in the generic zonelist.
3204
		 * Set up numa_mem percpu variable for on-line cpus.  During
3205
		 * boot, only the boot cpu should be on-line;  we'll init the
3206
		 * secondary cpus' numa_mem as they come on-line.  During
3207
		 * node/memory hotplug, we'll fixup all on-line cpus.
3208
		 */
3209
		if (cpu_online(cpu))
3210
			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3211
#endif
3212
	}
3213

3214
	return 0;
3215
}
3216

3217
/*
3218
 * Called with zonelists_mutex held always
3219
 * unless system_state == SYSTEM_BOOTING.
3220
 */
3221
void __ref build_all_zonelists(void *data)
3222
{
3223
	set_zonelist_order();
3224

3225
	if (system_state == SYSTEM_BOOTING) {
3226
		__build_all_zonelists(NULL);
3227
		mminit_verify_zonelist();
3228
		cpuset_init_current_mems_allowed();
3229
	} else {
3230
		/* we have to stop all cpus to guarantee there is no user
3231
		   of zonelist */
3232
#ifdef CONFIG_MEMORY_HOTPLUG
3233
		if (data)
3234
			setup_zone_pageset((struct zone *)data);
3235
#endif
3236
		stop_machine(__build_all_zonelists, NULL, NULL);
3237
		/* cpuset refresh routine should be here */
3238
	}
3239
	vm_total_pages = nr_free_pagecache_pages();
3240
	/*
3241
	 * Disable grouping by mobility if the number of pages in the
3242
	 * system is too low to allow the mechanism to work. It would be
3243
	 * more accurate, but expensive to check per-zone. This check is
3244
	 * made on memory-hotadd so a system can start with mobility
3245
	 * disabled and enable it later
3246
	 */
3247
	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3248
		page_group_by_mobility_disabled = 1;
3249
	else
3250
		page_group_by_mobility_disabled = 0;
3251

3252
	printk("Built %i zonelists in %s order, mobility grouping %s.  "
3253
		"Total pages: %ld\n",
3254
			nr_online_nodes,
3255
			zonelist_order_name[current_zonelist_order],
3256
			page_group_by_mobility_disabled ? "off" : "on",
3257
			vm_total_pages);
3258
#ifdef CONFIG_NUMA
3259
	printk("Policy zone: %s\n", zone_names[policy_zone]);
3260
#endif
3261
}
3262

3263
/*
3264
 * Helper functions to size the waitqueue hash table.
3265
 * Essentially these want to choose hash table sizes sufficiently
3266
 * large so that collisions trying to wait on pages are rare.
3267
 * But in fact, the number of active page waitqueues on typical
3268
 * systems is ridiculously low, less than 200. So this is even
3269
 * conservative, even though it seems large.
3270
 *
3271
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3272
 * waitqueues, i.e. the size of the waitq table given the number of pages.
3273
 */
3274
#define PAGES_PER_WAITQUEUE	256

#ifndef CONFIG_MEMORY_HOTPLUG
/*
 * Number of wait table entries for a zone of @pages pages: one entry per
 * PAGES_PER_WAITQUEUE pages, rounded up to a power of two and kept within
 * the range [4, 4096].
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	for (size = 1; size < wanted; size <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	if (size > 4096UL)
		size = 4096UL;
	if (size < 4UL)
		size = 4UL;

	return size;
}
#else
/*
 * A zone's size might be changed by hot-add, so it is not possible to determine
 * a suitable size for its wait_table.  So we use the maximum size now.
 *
 * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
 *
 *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
 *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
 *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
 *
 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
 * or more by the traditional way. (See above).  It equals:
 *
 *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
 *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
 *    powerpc (64K page size)             : =  (32G +16M)byte.
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	/* @pages is deliberately ignored: always use the maximum size. */
	return 4096UL;
}
#endif
3318

3319
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * Computed as ffz() of the complement of @size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
3328

3329
/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) \
	(((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
3330

3331
/*
 * Check if a pageblock contains reserved pages.
 *
 * Returns 1 as soon as a pfn in [start_pfn, end_pfn) fails
 * pfn_valid_within() or maps to a PageReserved page, 0 otherwise.
 */
static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long cur = start_pfn;

	while (cur < end_pfn) {
		if (!pfn_valid_within(cur))
			return 1;
		if (PageReserved(pfn_to_page(cur)))
			return 1;
		cur++;
	}

	return 0;
}
3344

3345
/*
3346
 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3347
 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3348
 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3349
 * higher will lead to a bigger reserve which will get freed as contiguous
3350
 * blocks as reclaim kicks in
3351
 */
3352
static void setup_zone_migrate_reserve(struct zone *zone)
3353
{
3354
	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3355
	struct page *page;
3356
	unsigned long block_migratetype;
3357
	int reserve;
3358

3359
	/* Get the start pfn, end pfn and the number of blocks to reserve */
3360
	start_pfn = zone->zone_start_pfn;
3361
	end_pfn = start_pfn + zone->spanned_pages;
3362
	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3363
							pageblock_order;
3364

3365
	/*
3366
	 * Reserve blocks are generally in place to help high-order atomic
3367
	 * allocations that are short-lived. A min_free_kbytes value that
3368
	 * would result in more than 2 reserve blocks for atomic allocations
3369
	 * is assumed to be in place to help anti-fragmentation for the
3370
	 * future allocation of hugepages at runtime.
3371
	 */
3372
	reserve = min(2, reserve);
3373

3374
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3375
		if (!pfn_valid(pfn))
3376
			continue;
3377
		page = pfn_to_page(pfn);
3378

3379
		/* Watch out for overlapping nodes */
3380
		if (page_to_nid(page) != zone_to_nid(zone))
3381
			continue;
3382

3383
		/* Blocks with reserved pages will never free, skip them. */
3384
		block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385
		if (pageblock_is_reserved(pfn, block_end_pfn))
3386
			continue;
3387

3388
		block_migratetype = get_pageblock_migratetype(page);
3389

3390
		/* If this block is reserved, account for it */
3391
		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
3392
			reserve--;
3393
			continue;
3394
		}
3395

3396
		/* Suitable for reserving if this block is movable */
3397
		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
3398
			set_pageblock_migratetype(page, MIGRATE_RESERVE);
3399
			move_freepages_block(zone, page, MIGRATE_RESERVE);
3400
			reserve--;
3401
			continue;
3402
		}
3403

3404
		/*
3405
		 * If the reserve is met and this is a previous reserved block,
3406
		 * take it back
3407
		 */
3408
		if (block_migratetype == MIGRATE_RESERVE) {
3409
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3410
			move_freepages_block(zone, page, MIGRATE_MOVABLE);
3411
		}
3412
	}
3413
}
3414

3415
/*
3416
 * Initially all pages are reserved - free ones are freed
3417
 * up by free_all_bootmem() once the early boot process is
3418
 * done. Non-atomic initialization, single-pass.
3419
 */
3420
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3421
		unsigned long start_pfn, enum memmap_context context)
3422
{
3423
	struct page *page;
3424
	unsigned long end_pfn = start_pfn + size;
3425
	unsigned long pfn;
3426
	struct zone *z;
3427

3428
	if (highest_memmap_pfn < end_pfn - 1)
3429
		highest_memmap_pfn = end_pfn - 1;
3430

3431
	z = &NODE_DATA(nid)->node_zones[zone];
3432
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3433
		/*
3434
		 * There can be holes in boot-time mem_map[]s
3435
		 * handed to this function.  They do not
3436
		 * exist on hotplugged memory.
3437
		 */
3438
		if (context == MEMMAP_EARLY) {
3439
			if (!early_pfn_valid(pfn))
3440
				continue;
3441
			if (!early_pfn_in_nid(pfn, nid))
3442
				continue;
3443
		}
3444
		page = pfn_to_page(pfn);
3445
		set_page_links(page, zone, nid, pfn);
3446
		mminit_verify_page_links(page, zone, nid, pfn);
3447
		init_page_count(page);
3448
		reset_page_mapcount(page);
3449
		SetPageReserved(page);
3450
		/*
3451
		 * Mark the block movable so that blocks are reserved for
3452
		 * movable at startup. This will force kernel allocations
3453
		 * to reserve their blocks rather than leaking throughout
3454
		 * the address space during boot when many long-lived
3455
		 * kernel allocations are made. Later some blocks near
3456
		 * the start are marked MIGRATE_RESERVE by
3457
		 * setup_zone_migrate_reserve()
3458
		 *
3459
		 * bitmap is created for zone's valid pfn range. but memmap
3460
		 * can be created for invalid pages (for alignment)
3461
		 * check here not to call set_pageblock_migratetype() against
3462
		 * pfn out of zone.
3463
		 */
3464
		if ((z->zone_start_pfn <= pfn)
3465
		    && (pfn < z->zone_start_pfn + z->spanned_pages)
3466
		    && !(pfn & (pageblock_nr_pages - 1)))
3467
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3468

3469
		INIT_LIST_HEAD(&page->lru);
3470
#ifdef WANT_PAGE_VIRTUAL
3471
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
3472
		if (!is_highmem_idx(zone))
3473
			set_page_address(page, __va(pfn << PAGE_SHIFT));
3474
#endif
3475
	}
3476
}
3477

3478
/*
 * Reset the free lists of @zone: every (order, migratetype) list becomes an
 * empty list head and each order's nr_free count is zeroed.
 */
static void __meminit zone_init_free_lists(struct zone *zone)
{
	int ord;
	int mtype;

	for_each_migratetype_order(ord, mtype) {
		zone->free_area[ord].nr_free = 0;
		INIT_LIST_HEAD(&zone->free_area[ord].free_list[mtype]);
	}
}
3486

3487
#ifndef __HAVE_ARCH_MEMMAP_INIT
/*
 * Default memmap_init(), used unless __HAVE_ARCH_MEMMAP_INIT is defined:
 * forwards to memmap_init_zone() with the MEMMAP_EARLY context.
 */
#define memmap_init(size, nid, zone, start_pfn)				\
	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
3491

3492
/*
 * Compute the per-cpu pageset batch size for @zone.
 *
 * Returns 0 when CONFIG_MMU is not set; otherwise a value of the form
 * 2^n - 1 derived from the zone's present_pages.
 */
static int zone_batchsize(struct zone *zone)
{
#ifndef CONFIG_MMU
	/* The deferral and batching of frees should be suppressed under NOMMU
	 * conditions.
	 *
	 * The problem is that NOMMU needs to be able to allocate large chunks
	 * of contiguous memory as there's no hardware page translation to
	 * assemble apparent contiguous memory from discontiguous pages.
	 *
	 * Queueing large contiguous runs of pages for batching, however,
	 * causes the pages to actually be freed in smaller chunks.  As there
	 * can be a significant delay between the individual batches being
	 * recycled, this leads to the once large chunks of space being
	 * fragmented and becoming unavailable for high-order allocations.
	 */
	return 0;
#else
	int nr;

	/*
	 * The per-cpu-pages pools are set to around 1000th of the
	 * size of the zone.  But no more than 1/2 of a meg.
	 *
	 * OK, so we don't know how big the cache is.  So guess.
	 */
	nr = zone->present_pages / 1024;
	if (nr * PAGE_SIZE > 512 * 1024)
		nr = (512 * 1024) / PAGE_SIZE;

	/* We effectively *= 4 below */
	nr /= 4;
	if (nr < 1)
		nr = 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 *
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	return rounddown_pow_of_two(nr + nr / 2) - 1;
#endif
}
3541

3542
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3543
{
3544
	struct per_cpu_pages *pcp;
3545
	int migratetype;
3546

3547
	memset(p, 0, sizeof(*p));
3548

3549
	pcp = &p->pcp;
3550
	pcp->count = 0;
3551
	pcp->high = 6 * batch;
3552
	pcp->batch = max(1UL, 1 * batch);
3553
	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3554
		INIT_LIST_HEAD(&pcp->lists[migratetype]);
3555
}
3556

3557
/*
3558
 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3559
 * to the value high for the pageset p.
3560
 */
3561

3562
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3563
				unsigned long high)
3564
{
3565
	struct per_cpu_pages *pcp;
3566

3567
	pcp = &p->pcp;
3568
	pcp->high = high;
3569
	pcp->batch = max(1UL, high/4);
3570
	if ((high/4) > (PAGE_SHIFT * 8))
3571
		pcp->batch = PAGE_SHIFT * 8;
3572
}
3573

3574
static void setup_zone_pageset(struct zone *zone)
3575
{
3576
	int cpu;
3577

3578
	zone->pageset = alloc_percpu(struct per_cpu_pageset);
3579

3580
	for_each_possible_cpu(cpu) {
3581
		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3582

3583
		setup_pageset(pcp, zone_batchsize(zone));
3584

3585
		if (percpu_pagelist_fraction)
3586
			setup_pagelist_highmark(pcp,
3587
				(zone->present_pages /
3588
					percpu_pagelist_fraction));
3589
	}
3590
}
3591

3592
/*
3593
 * Allocate per cpu pagesets and initialize them.
3594
 * Before this call only boot pagesets were available.
3595
 */
3596
void __init setup_per_cpu_pageset(void)
3597
{
3598
	struct zone *zone;
3599

3600
	for_each_populated_zone(zone)
3601
		setup_zone_pageset(zone);
3602
}
3603

3604
static noinline __init_refok
3605
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3606
{
3607
	int i;
3608
	struct pglist_data *pgdat = zone->zone_pgdat;
3609
	size_t alloc_size;
3610

3611
	/*
3612
	 * The per-page waitqueue mechanism uses hashed waitqueues
3613
	 * per zone.
3614
	 */
3615
	zone->wait_table_hash_nr_entries =
3616
		 wait_table_hash_nr_entries(zone_size_pages);
3617
	zone->wait_table_bits =
3618
		wait_table_bits(zone->wait_table_hash_nr_entries);
3619
	alloc_size = zone->wait_table_hash_nr_entries
3620
					* sizeof(wait_queue_head_t);
3621

3622
	if (!slab_is_available()) {
3623
		zone->wait_table = (wait_queue_head_t *)
3624
			alloc_bootmem_node_nopanic(pgdat, alloc_size);
3625
	} else {
3626
		/*
3627
		 * This case means that a zone whose size was 0 gets new memory
3628
		 * via memory hot-add.
3629
		 * But it may be the case that a new node was hot-added.  In
3630
		 * this case vmalloc() will not be able to use this new node's
3631
		 * memory - this wait_table must be initialized to use this new
3632
		 * node itself as well.
3633
		 * To use this new node's memory, further consideration will be
3634
		 * necessary.
3635
		 */
3636
		zone->wait_table = vmalloc(alloc_size);
3637
	}
3638
	if (!zone->wait_table)
3639
		return -ENOMEM;
3640

3641
	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3642
		init_waitqueue_head(zone->wait_table + i);
3643

3644
	return 0;
3645
}
3646

3647
static int __zone_pcp_update(void *data)
3648
{
3649
	struct zone *zone = data;
3650
	int cpu;
3651
	unsigned long batch = zone_batchsize(zone), flags;
3652

3653
	for_each_possible_cpu(cpu) {
3654
		struct per_cpu_pageset *pset;
3655
		struct per_cpu_pages *pcp;
3656

3657
		pset = per_cpu_ptr(zone->pageset, cpu);
3658
		pcp = &pset->pcp;
3659

3660
		local_irq_save(flags);
3661
		free_pcppages_bulk(zone, pcp->count, pcp);
3662
		setup_pageset(pset, batch);
3663
		local_irq_restore(flags);
3664
	}
3665
	return 0;
3666
}
3667

3668
void zone_pcp_update(struct zone *zone)
3669
{
3670
	stop_machine(__zone_pcp_update, zone, NULL);
3671
}
3672

3673
static __meminit void zone_pcp_init(struct zone *zone)
3674
{
3675
	/*
3676
	 * per cpu subsystem is not up at this point. The following code
3677
	 * relies on the ability of the linker to provide the
3678
	 * offset of a (static) per cpu variable into the per cpu area.
3679
	 */
3680
	zone->pageset = &boot_pageset;
3681

3682
	if (zone->present_pages)
3683
		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
3684
			zone->name, zone->present_pages,
3685
					 zone_batchsize(zone));
3686
}
3687

3688
__meminit int init_currently_empty_zone(struct zone *zone,
3689
					unsigned long zone_start_pfn,
3690
					unsigned long size,
3691
					enum memmap_context context)
3692
{
3693
	struct pglist_data *pgdat = zone->zone_pgdat;
3694
	int ret;
3695
	ret = zone_wait_table_init(zone, size);
3696
	if (ret)
3697
		return ret;
3698
	pgdat->nr_zones = zone_idx(zone) + 1;
3699

3700
	zone->zone_start_pfn = zone_start_pfn;
3701

3702
	mminit_dprintk(MMINIT_TRACE, "memmap_init",
3703
			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
3704
			pgdat->node_id,
3705
			(unsigned long)zone_idx(zone),
3706
			zone_start_pfn, (zone_start_pfn + size));
3707

3708
	zone_init_free_lists(zone);
3709

3710
	return 0;
3711
}
3712

3713
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3714
/*
3715
 * Basic iterator support. Return the first range of PFNs for a node
3716
 * Note: nid == MAX_NUMNODES returns first region regardless of node
3717
 */
3718
static int __meminit first_active_region_index_in_nid(int nid)
3719
{
3720
	int i;
3721

3722
	for (i = 0; i < nr_nodemap_entries; i++)
3723
		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3724
			return i;
3725

3726
	return -1;
3727
}
3728

3729
/*
3730
 * Basic iterator support. Return the next active range of PFNs for a node
3731
 * Note: nid == MAX_NUMNODES returns next region regardless of node
3732
 */
3733
static int __meminit next_active_region_index_in_nid(int index, int nid)
3734
{
3735
	for (index = index + 1; index < nr_nodemap_entries; index++)
3736
		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3737
			return index;
3738

3739
	return -1;
3740
}
3741

3742
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3743
/*
3744
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3745
 * Architectures may implement their own version but if add_active_range()
3746
 * was used and there are no special requirements, this is a convenient
3747
 * alternative
3748
 */
3749
int __meminit __early_pfn_to_nid(unsigned long pfn)
3750
{
3751
	int i;
3752

3753
	for (i = 0; i < nr_nodemap_entries; i++) {
3754
		unsigned long start_pfn = early_node_map[i].start_pfn;
3755
		unsigned long end_pfn = early_node_map[i].end_pfn;
3756

3757
		if (start_pfn <= pfn && pfn < end_pfn)
3758
			return early_node_map[i].nid;
3759
	}
3760
	/* This is a memory hole */
3761
	return -1;
3762
}
3763
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
3764

3765
/*
 * Node id for @pfn as reported by __early_pfn_to_nid(); a negative answer
 * from it is mapped to node 0.
 */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
	int nid = __early_pfn_to_nid(pfn);

	return nid < 0 ? 0 : nid;
}
3775

3776
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
3777
/*
 * Return false only when __early_pfn_to_nid() places @pfn on a node other
 * than @node; a pfn with no known node (negative result) counts as a match.
 */
bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	int found = __early_pfn_to_nid(pfn);

	return found < 0 || found == node;
}
3786
#endif
3787

3788
/*
 * Basic iterator support to walk early_node_map[]: @i takes, in ascending
 * order, the index of every entry belonging to @nid (every entry when
 * @nid == MAX_NUMNODES).
 */
#define for_each_active_range_index_in_nid(i, nid)			\
	for (i = first_active_region_index_in_nid(nid);		\
	     i != -1;							\
	     i = next_active_region_index_in_nid(i, nid))
3792

3793
/**
3794
 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3795
 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
3796
 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
3797
 *
3798
 * If an architecture guarantees that all ranges registered with
3799
 * add_active_ranges() contain no holes and may be freed, this
3800
 * this function may be used instead of calling free_bootmem() manually.
3801
 */
3802
void __init free_bootmem_with_active_regions(int nid,
3803
						unsigned long max_low_pfn)
3804
{
3805
	int i;
3806

3807
	for_each_active_range_index_in_nid(i, nid) {
3808
		unsigned long size_pages = 0;
3809
		unsigned long end_pfn = early_node_map[i].end_pfn;
3810

3811
		if (early_node_map[i].start_pfn >= max_low_pfn)
3812
			continue;
3813

3814
		if (end_pfn > max_low_pfn)
3815
			end_pfn = max_low_pfn;
3816

3817
		size_pages = end_pfn - early_node_map[i].start_pfn;
3818
		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3819
				PFN_PHYS(early_node_map[i].start_pfn),
3820
				size_pages << PAGE_SHIFT);
3821
	}
3822
}
3823

3824
#ifdef CONFIG_HAVE_MEMBLOCK
3825
/*
3826
 * Basic iterator support. Return the last range of PFNs for a node
3827
 * Note: nid == MAX_NUMNODES returns last region regardless of node
3828
 */
3829
static int __meminit last_active_region_index_in_nid(int nid)
3830
{
3831
	int i;
3832

3833
	for (i = nr_nodemap_entries - 1; i >= 0; i--)
3834
		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3835
			return i;
3836

3837
	return -1;
3838
}
3839

3840
/*
3841
 * Basic iterator support. Return the previous active range of PFNs for a node
3842
 * Note: nid == MAX_NUMNODES returns next region regardless of node
3843
 */
3844
static int __meminit previous_active_region_index_in_nid(int index, int nid)
3845
{
3846
	for (index = index - 1; index >= 0; index--)
3847
		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3848
			return index;
3849

3850
	return -1;
3851
}
3852

3853
/*
 * Walk early_node_map[] backwards: @i takes, in descending order, the index
 * of every entry belonging to @nid (every entry when @nid == MAX_NUMNODES).
 */
#define for_each_active_range_index_in_nid_reverse(i, nid)		\
	for (i = last_active_region_index_in_nid(nid);			\
	     i != -1;							\
	     i = previous_active_region_index_in_nid(i, nid))
3856

3857
/*
 * Find room for an allocation of @size bytes with alignment @align on @nid.
 *
 * The node's early_node_map[] ranges are tried from last to first.  Each is
 * clipped to [@goal, @limit) and handed to memblock_find_in_range(); the
 * first address found is returned.  Returns MEMBLOCK_ERROR if no range
 * yields one.
 */
u64 __init find_memory_core_early(int nid, u64 size, u64 align,
					u64 goal, u64 limit)
{
	int i;

	/* Need to go over early_node_map to find out good range for node */
	for_each_active_range_index_in_nid_reverse(i, nid) {
		u64 range_start = (u64)early_node_map[i].start_pfn << PAGE_SHIFT;
		u64 range_end = (u64)early_node_map[i].end_pfn << PAGE_SHIFT;
		u64 lo = max(range_start, goal);
		u64 hi = min(range_end, limit);
		u64 addr;

		/* Nothing left of this range after clipping. */
		if (lo >= hi)
			continue;

		addr = memblock_find_in_range(lo, hi, size, align);
		if (addr != MEMBLOCK_ERROR)
			return addr;
	}

	return MEMBLOCK_ERROR;
}
3889
#endif
3890

3891
/*
 * Feed the pfn range of every early_node_map[] entry of @nid to add_range()
 * for the @range array of capacity @az.  @nr_range is threaded through the
 * add_range() calls and its final value is returned.
 */
int __init add_from_early_node_map(struct range *range, int az,
				   int nr_range, int nid)
{
	int i;

	/* need to go over early_node_map to find out good range for node */
	for_each_active_range_index_in_nid(i, nid) {
		u64 first_pfn = early_node_map[i].start_pfn;
		u64 last_pfn = early_node_map[i].end_pfn;

		nr_range = add_range(range, az, nr_range, first_pfn, last_pfn);
	}

	return nr_range;
}
3905

3906
/*
 * Call @work_fn(start_pfn, end_pfn, @data) for each early_node_map[] entry
 * of @nid, stopping at the first call that returns non-zero.
 */
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
	int i;

	for_each_active_range_index_in_nid(i, nid) {
		if (work_fn(early_node_map[i].start_pfn,
			    early_node_map[i].end_pfn, data))
			return;
	}
}
3918
/**
3919
 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3920
 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
3921
 *
3922
 * If an architecture guarantees that all ranges registered with
3923
 * add_active_ranges() contain no holes and may be freed, this
3924
 * function may be used instead of calling memory_present() manually.
3925
 */
3926
void __init sparse_memory_present_with_active_regions(int nid)
3927
{
3928
	int i;
3929

3930
	for_each_active_range_index_in_nid(i, nid)
3931
		memory_present(early_node_map[i].nid,
3932
				early_node_map[i].start_pfn,
3933
				early_node_map[i].end_pfn);
3934
}
3935

3936
/**
3937
 * get_pfn_range_for_nid - Return the start and end page frames for a node
3938
 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3939
 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
3940
 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
3941
 *
3942
 * It returns the start and end page frame of a node based on information
3943
 * provided by an arch calling add_active_range(). If called for a node
3944
 * with no available memory, a warning is printed and the start and end
3945
 * PFNs will be 0.
3946
 */
3947
void __meminit get_pfn_range_for_nid(unsigned int nid,
3948
			unsigned long *start_pfn, unsigned long *end_pfn)
3949
{
3950
	int i;
3951
	*start_pfn = -1UL;
3952
	*end_pfn = 0;
3953

3954
	for_each_active_range_index_in_nid(i, nid) {
3955
		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
3956
		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
3957
	}
3958

3959
	if (*start_pfn == -1UL)
3960
		*start_pfn = 0;
3961
}
3962

3963
/*
3964
 * This finds a zone that can be used for ZONE_MOVABLE pages. The
3965
 * assumption is made that zones within a node are ordered in monotonic
3966
 * increasing memory addresses so that the "highest" populated zone is used
3967
 */
3968
static void __init find_usable_zone_for_movable(void)
3969
{
3970
	int zone_index;
3971
	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
3972
		if (zone_index == ZONE_MOVABLE)
3973
			continue;
3974

3975
		if (arch_zone_highest_possible_pfn[zone_index] >
3976
				arch_zone_lowest_possible_pfn[zone_index])
3977
			break;
3978
	}
3979

3980
	VM_BUG_ON(zone_index == -1);
3981
	movable_zone = zone_index;
3982
}
3983

3984
/*
3985
 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3986
 * because it is sized independent of architecture. Unlike the other zones,
3987
 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3988
 * in each node depending on the size of each node and how evenly kernelcore
3989
 * is distributed. This helper function adjusts the zone ranges
3990
 * provided by the architecture for a given node by using the end of the
3991
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3992
 * zones within a node are in order of monotonic increases memory addresses
3993
 */
3994
static void __meminit adjust_zone_range_for_zone_movable(int nid,
3995
					unsigned long zone_type,
3996
					unsigned long node_start_pfn,
3997
					unsigned long node_end_pfn,
3998
					unsigned long *zone_start_pfn,
3999
					unsigned long *zone_end_pfn)
4000
{
4001
	/* Only adjust if ZONE_MOVABLE is on this node */
4002
	if (zone_movable_pfn[nid]) {
4003
		/* Size ZONE_MOVABLE */
4004
		if (zone_type == ZONE_MOVABLE) {
4005
			*zone_start_pfn = zone_movable_pfn[nid];
4006
			*zone_end_pfn = min(node_end_pfn,
4007
				arch_zone_highest_possible_pfn[movable_zone]);
4008

4009
		/* Adjust for ZONE_MOVABLE starting within this range */
4010
		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4011
				*zone_end_pfn > zone_movable_pfn[nid]) {
4012
			*zone_end_pfn = zone_movable_pfn[nid];
4013

4014
		/* Check if this whole range is within ZONE_MOVABLE */
4015
		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
4016
			*zone_start_pfn = *zone_end_pfn;
4017
	}
4018
}
4019

4020
/*
4021
 * Return the number of pages a zone spans in a node, including holes
4022
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4023
 */
4024
static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4025
					unsigned long zone_type,
4026
					unsigned long *ignored)
4027
{
4028
	unsigned long node_start_pfn, node_end_pfn;
4029
	unsigned long zone_start_pfn, zone_end_pfn;
4030

4031
	/* Get the start and end of the node and zone */
4032
	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4033
	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4034
	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4035
	adjust_zone_range_for_zone_movable(nid, zone_type,
4036
				node_start_pfn, node_end_pfn,
4037
				&zone_start_pfn, &zone_end_pfn);
4038

4039
	/* Check that this node has pages within the zone's required range */
4040
	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4041
		return 0;
4042

4043
	/* Move the zone boundaries inside the node if necessary */
4044
	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4045
	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4046

4047
	/* Return the spanned pages */
4048
	return zone_end_pfn - zone_start_pfn;
4049
}
4050

4051
/*
4052
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4053
 * then all holes in the requested range will be accounted for.
4054
 */
4055
unsigned long __meminit __absent_pages_in_range(int nid,
4056
				unsigned long range_start_pfn,
4057
				unsigned long range_end_pfn)
4058
{
4059
	int i = 0;
4060
	unsigned long prev_end_pfn = 0, hole_pages = 0;
4061
	unsigned long start_pfn;
4062

4063
	/* Find the end_pfn of the first active range of pfns in the node */
4064
	i = first_active_region_index_in_nid(nid);
4065
	if (i == -1)
4066
		return 0;
4067

4068
	prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4069

4070
	/* Account for ranges before physical memory on this node */
4071
	if (early_node_map[i].start_pfn > range_start_pfn)
4072
		hole_pages = prev_end_pfn - range_start_pfn;
4073

4074
	/* Find all holes for the zone within the node */
4075
	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
4076

4077
		/* No need to continue if prev_end_pfn is outside the zone */
4078
		if (prev_end_pfn >= range_end_pfn)
4079
			break;
4080

4081
		/* Make sure the end of the zone is not within the hole */
4082
		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4083
		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
4084

4085
		/* Update the hole size cound and move on */
4086
		if (start_pfn > range_start_pfn) {
4087
			BUG_ON(prev_end_pfn > start_pfn);
4088
			hole_pages += start_pfn - prev_end_pfn;
4089
		}
4090
		prev_end_pfn = early_node_map[i].end_pfn;
4091
	}
4092

4093
	/* Account for ranges past physical memory on this node */
4094
	if (range_end_pfn > prev_end_pfn)
4095
		hole_pages += range_end_pfn -
4096
				max(range_start_pfn, prev_end_pfn);
4097

4098
	return hole_pages;
4099
}
4100

4101
/**
4102
 * absent_pages_in_range - Return number of page frames in holes within a range
4103
 * @start_pfn: The start PFN to start searching for holes
4104
 * @end_pfn: The end PFN to stop searching for holes
4105
 *
4106
 * It returns the number of pages frames in memory holes within a range.
4107
 */
4108
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4109
							unsigned long end_pfn)
4110
{
4111
	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4112
}
4113

4114
/* Return the number of page frames in holes in a zone on a node */
4115
static unsigned long __meminit zone_absent_pages_in_node(int nid,
4116
					unsigned long zone_type,
4117
					unsigned long *ignored)
4118
{
4119
	unsigned long node_start_pfn, node_end_pfn;
4120
	unsigned long zone_start_pfn, zone_end_pfn;
4121

4122
	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4123
	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
4124
							node_start_pfn);
4125
	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
4126
							node_end_pfn);
4127

4128
	adjust_zone_range_for_zone_movable(nid, zone_type,
4129
			node_start_pfn, node_end_pfn,
4130
			&zone_start_pfn, &zone_end_pfn);
4131
	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4132
}
4133

4134
#else
4135
/*
 * Variant used when the sizes come from the caller: the spanned size of a
 * zone is simply the caller's zones_size[] entry for it. nid is unused.
 */
static inline unsigned long __meminit
zone_spanned_pages_in_node(int nid, unsigned long zone_type,
			   unsigned long *zones_size)
{
	return zones_size[zone_type];
}
4141

4142
/*
 * Variant used when the hole sizes come from the caller: return the
 * zholes_size[] entry for the zone, or 0 when no hole array was supplied.
 * nid is unused.
 */
static inline unsigned long __meminit
zone_absent_pages_in_node(int nid, unsigned long zone_type,
			  unsigned long *zholes_size)
{
	return zholes_size ? zholes_size[zone_type] : 0;
}
4151

4152
#endif
4153

4154
/*
 * Fill in pgdat->node_spanned_pages (sum of the spanned pages of every
 * zone) and pgdat->node_present_pages (spanned pages minus the pages in
 * holes of every zone), then log the present page count.
 */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0, absent = 0, present;
	enum zone_type zt;

	for (zt = 0; zt < MAX_NR_ZONES; zt++)
		spanned += zone_spanned_pages_in_node(pgdat->node_id, zt,
								zones_size);
	pgdat->node_spanned_pages = spanned;

	for (zt = 0; zt < MAX_NR_ZONES; zt++)
		absent += zone_absent_pages_in_node(pgdat->node_id, zt,
								zholes_size);
	present = spanned - absent;
	pgdat->node_present_pages = present;

	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							present);
}
4174

4175
#ifndef CONFIG_SPARSEMEM
4176
/*
4177
 * Calculate the size of the zone->blockflags rounded to an unsigned long
4178
 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4179
 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4180
 * round what is now in bits to nearest long in bits, then return it in
4181
 * bytes.
4182
 */
4183
static unsigned long __init usemap_size(unsigned long zonesize)
4184
{
4185
	unsigned long usemapsize;
4186

4187
	usemapsize = roundup(zonesize, pageblock_nr_pages);
4188
	usemapsize = usemapsize >> pageblock_order;
4189
	usemapsize *= NR_PAGEBLOCK_BITS;
4190
	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4191

4192
	return usemapsize / 8;
4193
}
4194

4195
/*
 * Point zone->pageblock_flags at a bitmap sized by usemap_size() for
 * zonesize pages, obtained from alloc_bootmem_node_nopanic() on pgdat.
 * A zero-sized bitmap leaves the pointer NULL.
 */
static void __init setup_usemap(struct pglist_data *pgdat,
				struct zone *zone, unsigned long zonesize)
{
	unsigned long bytes = usemap_size(zonesize);

	if (bytes)
		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
								   bytes);
	else
		zone->pageblock_flags = NULL;
}
4204
#else
4205
/* CONFIG_SPARSEMEM variant: nothing is set up here. */
static inline void setup_usemap(struct pglist_data *pgdat,
				struct zone *zone, unsigned long zonesize)
{
}
4207
#endif /* CONFIG_SPARSEMEM */
4208

4209
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4210

4211
/* Return a sensible default order for the pageblock size. */
4212
static inline int pageblock_default_order(void)
4213
{
4214
	if (HPAGE_SHIFT > PAGE_SHIFT)
4215
		return HUGETLB_PAGE_ORDER;
4216

4217
	return MAX_ORDER-1;
4218
}
4219

4220
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4221
static inline void __init set_pageblock_order(unsigned int order)
4222
{
4223
	/* Check that pageblock_nr_pages has not already been setup */
4224
	if (pageblock_order)
4225
		return;
4226

4227
	/*
4228
	 * Assume the largest contiguous order of interest is a huge page.
4229
	 * This value may be variable depending on boot parameters on IA64
4230
	 */
4231
	pageblock_order = order;
4232
}
4233
#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4234

4235
/*
4236
 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4237
 * and pageblock_default_order() are unused as pageblock_order is set
4238
 * at compile-time. See include/linux/pageblock-flags.h for the values of
4239
 * pageblock_order based on the kernel config
4240
 */
4241
static inline int pageblock_default_order(unsigned int order)
4242
{
4243
	return MAX_ORDER-1;
4244
}
4245
#define set_pageblock_order(x)	do {} while (0)
4246

4247
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4248

4249
/*
4250
 * Set up the zone data structures:
4251
 *   - mark all pages reserved
4252
 *   - mark all memory queues empty
4253
 *   - clear the memory bitmaps
4254
 */
4255
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4256
		unsigned long *zones_size, unsigned long *zholes_size)
4257
{
4258
	enum zone_type j;
4259
	int nid = pgdat->node_id;
4260
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
4261
	int ret;
4262

4263
	pgdat_resize_init(pgdat);
4264
	pgdat->nr_zones = 0;
4265
	init_waitqueue_head(&pgdat->kswapd_wait);
4266
	pgdat->kswapd_max_order = 0;
4267
	pgdat_page_cgroup_init(pgdat);
4268
	
4269
	for (j = 0; j < MAX_NR_ZONES; j++) {
4270
		struct zone *zone = pgdat->node_zones + j;
4271
		unsigned long size, realsize, memmap_pages;
4272
		enum lru_list l;
4273

4274
		size = zone_spanned_pages_in_node(nid, j, zones_size);
4275
		realsize = size - zone_absent_pages_in_node(nid, j,
4276
								zholes_size);
4277

4278
		/*
4279
		 * Adjust realsize so that it accounts for how much memory
4280
		 * is used by this zone for memmap. This affects the watermark
4281
		 * and per-cpu initialisations
4282
		 */
4283
		memmap_pages =
4284
			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4285
		if (realsize >= memmap_pages) {
4286
			realsize -= memmap_pages;
4287
			if (memmap_pages)
4288
				printk(KERN_DEBUG
4289
				       "  %s zone: %lu pages used for memmap\n",
4290
				       zone_names[j], memmap_pages);
4291
		} else
4292
			printk(KERN_WARNING
4293
				"  %s zone: %lu pages exceeds realsize %lu\n",
4294
				zone_names[j], memmap_pages, realsize);
4295

4296
		/* Account for reserved pages */
4297
		if (j == 0 && realsize > dma_reserve) {
4298
			realsize -= dma_reserve;
4299
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
4300
					zone_names[0], dma_reserve);
4301
		}
4302

4303
		if (!is_highmem_idx(j))
4304
			nr_kernel_pages += realsize;
4305
		nr_all_pages += realsize;
4306

4307
		zone->spanned_pages = size;
4308
		zone->present_pages = realsize;
4309
#ifdef CONFIG_NUMA
4310
		zone->node = nid;
4311
		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4312
						/ 100;
4313
		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4314
#endif
4315
		zone->name = zone_names[j];
4316
		spin_lock_init(&zone->lock);
4317
		spin_lock_init(&zone->lru_lock);
4318
		zone_seqlock_init(zone);
4319
		zone->zone_pgdat = pgdat;
4320

4321
		zone_pcp_init(zone);
4322
		for_each_lru(l)
4323
			INIT_LIST_HEAD(&zone->lru[l].list);
4324
		zone->reclaim_stat.recent_rotated[0] = 0;
4325
		zone->reclaim_stat.recent_rotated[1] = 0;
4326
		zone->reclaim_stat.recent_scanned[0] = 0;
4327
		zone->reclaim_stat.recent_scanned[1] = 0;
4328
		zap_zone_vm_stats(zone);
4329
		zone->flags = 0;
4330
		if (!size)
4331
			continue;
4332

4333
		set_pageblock_order(pageblock_default_order());
4334
		setup_usemap(pgdat, zone, size);
4335
		ret = init_currently_empty_zone(zone, zone_start_pfn,
4336
						size, MEMMAP_EARLY);
4337
		BUG_ON(ret);
4338
		memmap_init(size, nid, j, zone_start_pfn);
4339
		zone_start_pfn += size;
4340
	}
4341
}
4342

4343
/*
 * Allocate the node's mem_map when CONFIG_FLAT_NODE_MEM_MAP is set and the
 * node does not have one yet; nodes spanning no pages are skipped. Without
 * CONFIG_NEED_MULTIPLE_NODES the global mem_map is derived from node 0's.
 *
 * NOTE(review): the result of the fallback allocation is not checked
 * before the offset is added to it; confirm what the allocator does on
 * failure.
 */
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		unsigned long aligned_start, aligned_end, bytes;
		struct page *map;

		/*
		 * The zone's endpoints aren't required to be MAX_ORDER
		 * aligned but the node_mem_map endpoints must be in order
		 * for the buddy allocator to function correctly.
		 */
		aligned_start = pgdat->node_start_pfn &
						~(MAX_ORDER_NR_PAGES - 1);
		aligned_end = ALIGN(pgdat->node_start_pfn +
						pgdat->node_spanned_pages,
				    MAX_ORDER_NR_PAGES);
		bytes = (aligned_end - aligned_start) * sizeof(struct page);

		/* Try alloc_remap() first and fall back to bootmem */
		map = alloc_remap(pgdat->node_id, bytes);
		if (!map)
			map = alloc_bootmem_node_nopanic(pgdat, bytes);

		/* Skip the entries added by aligning the start downwards */
		pgdat->node_mem_map = map +
				(pgdat->node_start_pfn - aligned_start);
	}
#ifndef CONFIG_NEED_MULTIPLE_NODES
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0)) {
		mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
	}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
4383

4384
/*
 * Initialise the pg_data_t of node nid: record its id and start PFN,
 * compute its spanned and present page totals, allocate its mem_map and
 * finally set up each of its zones.
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *node = NODE_DATA(nid);

	node->node_id = nid;
	node->node_start_pfn = node_start_pfn;
	calculate_node_totalpages(node, zones_size, zholes_size);

	alloc_node_mem_map(node);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)node,
		(unsigned long)node->node_mem_map);
#endif

	free_area_init_core(node, zones_size, zholes_size);
}
4402

4403
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
4404

4405
#if MAX_NUMNODES > 1
4406
/*
4407
 * Figure out the number of possible node ids.
4408
 */
4409
static void __init setup_nr_node_ids(void)
4410
{
4411
	unsigned int node;
4412
	unsigned int highest = 0;
4413

4414
	for_each_node_mask(node, node_possible_map)
4415
		highest = node;
4416
	nr_node_ids = highest + 1;
4417
}
4418
#else
4419
/* Variant for MAX_NUMNODES <= 1: there is nothing to compute. */
static inline void setup_nr_node_ids(void) { }
4422
#endif
4423

4424
/**
4425
 * add_active_range - Register a range of PFNs backed by physical memory
4426
 * @nid: The node ID the range resides on
4427
 * @start_pfn: The start PFN of the available physical memory
4428
 * @end_pfn: The end PFN of the available physical memory
4429
 *
4430
 * These ranges are stored in an early_node_map[] and later used by
4431
 * free_area_init_nodes() to calculate zone sizes and holes. If the
4432
 * range spans a memory hole, it is up to the architecture to ensure
4433
 * the memory is not freed by the bootmem allocator. If possible
4434
 * the range being registered will be merged with existing ranges.
4435
 */
4436
void __init add_active_range(unsigned int nid, unsigned long start_pfn,
4437
						unsigned long end_pfn)
4438
{
4439
	int i;
4440

4441
	mminit_dprintk(MMINIT_TRACE, "memory_register",
4442
			"Entering add_active_range(%d, %#lx, %#lx) "
4443
			"%d entries of %d used\n",
4444
			nid, start_pfn, end_pfn,
4445
			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
4446

4447
	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
4448

4449
	/* Merge with existing active regions if possible */
4450
	for (i = 0; i < nr_nodemap_entries; i++) {
4451
		if (early_node_map[i].nid != nid)
4452
			continue;
4453

4454
		/* Skip if an existing region covers this new one */
4455
		if (start_pfn >= early_node_map[i].start_pfn &&
4456
				end_pfn <= early_node_map[i].end_pfn)
4457
			return;
4458

4459
		/* Merge forward if suitable */
4460
		if (start_pfn <= early_node_map[i].end_pfn &&
4461
				end_pfn > early_node_map[i].end_pfn) {
4462
			early_node_map[i].end_pfn = end_pfn;
4463
			return;
4464
		}
4465

4466
		/* Merge backward if suitable */
4467
		if (start_pfn < early_node_map[i].start_pfn &&
4468
				end_pfn >= early_node_map[i].start_pfn) {
4469
			early_node_map[i].start_pfn = start_pfn;
4470
			return;
4471
		}
4472
	}
4473

4474
	/* Check that early_node_map is large enough */
4475
	if (i >= MAX_ACTIVE_REGIONS) {
4476
		printk(KERN_CRIT "More than %d memory regions, truncating\n",
4477
							MAX_ACTIVE_REGIONS);
4478
		return;
4479
	}
4480

4481
	early_node_map[i].nid = nid;
4482
	early_node_map[i].start_pfn = start_pfn;
4483
	early_node_map[i].end_pfn = end_pfn;
4484
	nr_nodemap_entries = i + 1;
4485
}
4486

4487
/**
4488
 * remove_active_range - Shrink an existing registered range of PFNs
4489
 * @nid: The node id the range is on that should be shrunk
4490
 * @start_pfn: The new PFN of the range
4491
 * @end_pfn: The new PFN of the range
4492
 *
4493
 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
4494
 * The map is kept near the end physical page range that has already been
4495
 * registered. This function allows an arch to shrink an existing registered
4496
 * range.
4497
 */
4498
void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
4499
				unsigned long end_pfn)
4500
{
4501
	int i, j;
4502
	int removed = 0;
4503

4504
	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
4505
			  nid, start_pfn, end_pfn);
4506

4507
	/* Find the old active region end and shrink */
4508
	for_each_active_range_index_in_nid(i, nid) {
4509
		if (early_node_map[i].start_pfn >= start_pfn &&
4510
		    early_node_map[i].end_pfn <= end_pfn) {
4511
			/* clear it */
4512
			early_node_map[i].start_pfn = 0;
4513
			early_node_map[i].end_pfn = 0;
4514
			removed = 1;
4515
			continue;
4516
		}
4517
		if (early_node_map[i].start_pfn < start_pfn &&
4518
		    early_node_map[i].end_pfn > start_pfn) {
4519
			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
4520
			early_node_map[i].end_pfn = start_pfn;
4521
			if (temp_end_pfn > end_pfn)
4522
				add_active_range(nid, end_pfn, temp_end_pfn);
4523
			continue;
4524
		}
4525
		if (early_node_map[i].start_pfn >= start_pfn &&
4526
		    early_node_map[i].end_pfn > end_pfn &&
4527
		    early_node_map[i].start_pfn < end_pfn) {
4528
			early_node_map[i].start_pfn = end_pfn;
4529
			continue;
4530
		}
4531
	}
4532

4533
	if (!removed)
4534
		return;
4535

4536
	/* remove the blank ones */
4537
	for (i = nr_nodemap_entries - 1; i > 0; i--) {
4538
		if (early_node_map[i].nid != nid)
4539
			continue;
4540
		if (early_node_map[i].end_pfn)
4541
			continue;
4542
		/* we found it, get rid of it */
4543
		for (j = i; j < nr_nodemap_entries - 1; j++)
4544
			memcpy(&early_node_map[j], &early_node_map[j+1],
4545
				sizeof(early_node_map[j]));
4546
		j = nr_nodemap_entries - 1;
4547
		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
4548
		nr_nodemap_entries--;
4549
	}
4550
}
4551

4552
/**
4553
 * remove_all_active_ranges - Remove all currently registered regions
4554
 *
4555
 * During discovery, it may be found that a table like SRAT is invalid
4556
 * and an alternative discovery method must be used. This function removes
4557
 * all currently registered regions.
4558
 */
4559
void __init remove_all_active_ranges(void)
4560
{
4561
	memset(early_node_map, 0, sizeof(early_node_map));
4562
	nr_nodemap_entries = 0;
4563
}
4564

4565
/* Compare two active node_active_regions */
4566
static int __init cmp_node_active_region(const void *a, const void *b)
4567
{
4568
	struct node_active_region *arange = (struct node_active_region *)a;
4569
	struct node_active_region *brange = (struct node_active_region *)b;
4570

4571
	/* Done this way to avoid overflows */
4572
	if (arange->start_pfn > brange->start_pfn)
4573
		return 1;
4574
	if (arange->start_pfn < brange->start_pfn)
4575
		return -1;
4576

4577
	return 0;
4578
}
4579

4580
/* sort the node_map by start_pfn */
4581
void __init sort_node_map(void)
4582
{
4583
	sort(early_node_map, (size_t)nr_nodemap_entries,
4584
			sizeof(struct node_active_region),
4585
			cmp_node_active_region, NULL);
4586
}
4587

4588
/* Find the lowest pfn for a node */
4589
static unsigned long __init find_min_pfn_for_node(int nid)
4590
{
4591
	int i;
4592
	unsigned long min_pfn = ULONG_MAX;
4593

4594
	/* Assuming a sorted map, the first range found has the starting pfn */
4595
	for_each_active_range_index_in_nid(i, nid)
4596
		min_pfn = min(min_pfn, early_node_map[i].start_pfn);
4597

4598
	if (min_pfn == ULONG_MAX) {
4599
		printk(KERN_WARNING
4600
			"Could not find start_pfn for node %d\n", nid);
4601
		return 0;
4602
	}
4603

4604
	return min_pfn;
4605
}
4606

4607
/**
4608
 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4609
 *
4610
 * It returns the minimum PFN based on information provided via
4611
 * add_active_range().
4612
 */
4613
unsigned long __init find_min_pfn_with_active_regions(void)
4614
{
4615
	return find_min_pfn_for_node(MAX_NUMNODES);
4616
}
4617

4618
/*
4619
 * early_calculate_totalpages()
4620
 * Sum pages in active regions for movable zone.
4621
 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4622
 */
4623
static unsigned long __init early_calculate_totalpages(void)
4624
{
4625
	int i;
4626
	unsigned long totalpages = 0;
4627

4628
	for (i = 0; i < nr_nodemap_entries; i++) {
4629
		unsigned long pages = early_node_map[i].end_pfn -
4630
						early_node_map[i].start_pfn;
4631
		totalpages += pages;
4632
		if (pages)
4633
			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
4634
	}
4635
  	return totalpages;
4636
}
4637

4638
/*
4639
 * Find the PFN the Movable zone begins in each node. Kernel memory
4640
 * is spread evenly between nodes as long as the nodes have enough
4641
 * memory. When they don't, some nodes will have more kernelcore than
4642
 * others
4643
 */
4644
static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4645
{
4646
	int i, nid;
4647
	unsigned long usable_startpfn;
4648
	unsigned long kernelcore_node, kernelcore_remaining;
4649
	/* save the state before borrow the nodemask */
4650
	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4651
	unsigned long totalpages = early_calculate_totalpages();
4652
	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4653

4654
	/*
4655
	 * If movablecore was specified, calculate what size of
4656
	 * kernelcore that corresponds so that memory usable for
4657
	 * any allocation type is evenly spread. If both kernelcore
4658
	 * and movablecore are specified, then the value of kernelcore
4659
	 * will be used for required_kernelcore if it's greater than
4660
	 * what movablecore would have allowed.
4661
	 */
4662
	if (required_movablecore) {
4663
		unsigned long corepages;
4664

4665
		/*
4666
		 * Round-up so that ZONE_MOVABLE is at least as large as what
4667
		 * was requested by the user
4668
		 */
4669
		required_movablecore =
4670
			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4671
		corepages = totalpages - required_movablecore;
4672

4673
		required_kernelcore = max(required_kernelcore, corepages);
4674
	}
4675

4676
	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
4677
	if (!required_kernelcore)
4678
		goto out;
4679

4680
	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4681
	find_usable_zone_for_movable();
4682
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4683

4684
restart:
4685
	/* Spread kernelcore memory as evenly as possible throughout nodes */
4686
	kernelcore_node = required_kernelcore / usable_nodes;
4687
	for_each_node_state(nid, N_HIGH_MEMORY) {
4688
		/*
4689
		 * Recalculate kernelcore_node if the division per node
4690
		 * now exceeds what is necessary to satisfy the requested
4691
		 * amount of memory for the kernel
4692
		 */
4693
		if (required_kernelcore < kernelcore_node)
4694
			kernelcore_node = required_kernelcore / usable_nodes;
4695

4696
		/*
4697
		 * As the map is walked, we track how much memory is usable
4698
		 * by the kernel using kernelcore_remaining. When it is
4699
		 * 0, the rest of the node is usable by ZONE_MOVABLE
4700
		 */
4701
		kernelcore_remaining = kernelcore_node;
4702

4703
		/* Go through each range of PFNs within this node */
4704
		for_each_active_range_index_in_nid(i, nid) {
4705
			unsigned long start_pfn, end_pfn;
4706
			unsigned long size_pages;
4707

4708
			start_pfn = max(early_node_map[i].start_pfn,
4709
						zone_movable_pfn[nid]);
4710
			end_pfn = early_node_map[i].end_pfn;
4711
			if (start_pfn >= end_pfn)
4712
				continue;
4713

4714
			/* Account for what is only usable for kernelcore */
4715
			if (start_pfn < usable_startpfn) {
4716
				unsigned long kernel_pages;
4717
				kernel_pages = min(end_pfn, usable_startpfn)
4718
								- start_pfn;
4719

4720
				kernelcore_remaining -= min(kernel_pages,
4721
							kernelcore_remaining);
4722
				required_kernelcore -= min(kernel_pages,
4723
							required_kernelcore);
4724

4725
				/* Continue if range is now fully accounted */
4726
				if (end_pfn <= usable_startpfn) {
4727

4728
					/*
4729
					 * Push zone_movable_pfn to the end so
4730
					 * that if we have to rebalance
4731
					 * kernelcore across nodes, we will
4732
					 * not double account here
4733
					 */
4734
					zone_movable_pfn[nid] = end_pfn;
4735
					continue;
4736
				}
4737
				start_pfn = usable_startpfn;
4738
			}
4739

4740
			/*
4741
			 * The usable PFN range for ZONE_MOVABLE is from
4742
			 * start_pfn->end_pfn. Calculate size_pages as the
4743
			 * number of pages used as kernelcore
4744
			 */
4745
			size_pages = end_pfn - start_pfn;
4746
			if (size_pages > kernelcore_remaining)
4747
				size_pages = kernelcore_remaining;
4748
			zone_movable_pfn[nid] = start_pfn + size_pages;
4749

4750
			/*
4751
			 * Some kernelcore has been met, update counts and
4752
			 * break if the kernelcore for this node has been
4753
			 * satisified
4754
			 */
4755
			required_kernelcore -= min(required_kernelcore,
4756
								size_pages);
4757
			kernelcore_remaining -= size_pages;
4758
			if (!kernelcore_remaining)
4759
				break;
4760
		}
4761
	}
4762

4763
	/*
4764
	 * If there is still required_kernelcore, we do another pass with one
4765
	 * less node in the count. This will push zone_movable_pfn[nid] further
4766
	 * along on the nodes that still have memory until kernelcore is
4767
	 * satisified
4768
	 */
4769
	usable_nodes--;
4770
	if (usable_nodes && required_kernelcore > usable_nodes)
4771
		goto restart;
4772

4773
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4774
	for (nid = 0; nid < MAX_NUMNODES; nid++)
4775
		zone_movable_pfn[nid] =
4776
			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4777

4778
out:
4779
	/* restore the node_state */
4780
	node_states[N_HIGH_MEMORY] = saved_node_state;
4781
}
4782

4783
/* Any regular memory on that node ? */
4784
static void check_for_regular_memory(pg_data_t *pgdat)
4785
{
4786
#ifdef CONFIG_HIGHMEM
4787
	enum zone_type zone_type;
4788

4789
	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4790
		struct zone *zone = &pgdat->node_zones[zone_type];
4791
		if (zone->present_pages)
4792
			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4793
	}
4794
#endif
4795
}
4796

4797
/**
4798
 * free_area_init_nodes - Initialise all pg_data_t and zone data
4799
 * @max_zone_pfn: an array of max PFNs for each zone
4800
 *
4801
 * This will call free_area_init_node() for each active node in the system.
4802
 * Using the page ranges provided by add_active_range(), the size of each
4803
 * zone in each node and their holes is calculated. If the maximum PFN
4804
 * between two adjacent zones match, it is assumed that the zone is empty.
4805
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4806
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4807
 * starts where the previous one ended. For example, ZONE_DMA32 starts
4808
 * at arch_max_dma_pfn.
4809
 */
4810
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4811
{
4812
	unsigned long nid;
4813
	int i;
4814

4815
	/* Sort early_node_map as initialisation assumes it is sorted */
4816
	sort_node_map();
4817

4818
	/* Record where the zone boundaries are */
4819
	memset(arch_zone_lowest_possible_pfn, 0,
4820
				sizeof(arch_zone_lowest_possible_pfn));
4821
	memset(arch_zone_highest_possible_pfn, 0,
4822
				sizeof(arch_zone_highest_possible_pfn));
4823
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4824
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4825
	for (i = 1; i < MAX_NR_ZONES; i++) {
4826
		if (i == ZONE_MOVABLE)
4827
			continue;
4828
		arch_zone_lowest_possible_pfn[i] =
4829
			arch_zone_highest_possible_pfn[i-1];
4830
		arch_zone_highest_possible_pfn[i] =
4831
			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4832
	}
4833
	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4834
	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4835

4836
	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
4837
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4838
	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
4839

4840
	/* Print out the zone ranges */
4841
	printk("Zone PFN ranges:\n");
4842
	for (i = 0; i < MAX_NR_ZONES; i++) {
4843
		if (i == ZONE_MOVABLE)
4844
			continue;
4845
		printk("  %-8s ", zone_names[i]);
4846
		if (arch_zone_lowest_possible_pfn[i] ==
4847
				arch_zone_highest_possible_pfn[i])
4848
			printk("empty\n");
4849
		else
4850
			printk("%0#10lx -> %0#10lx\n",
4851
				arch_zone_lowest_possible_pfn[i],
4852
				arch_zone_highest_possible_pfn[i]);
4853
	}
4854

4855
	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
4856
	printk("Movable zone start PFN for each node\n");
4857
	for (i = 0; i < MAX_NUMNODES; i++) {
4858
		if (zone_movable_pfn[i])
4859
			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
4860
	}
4861

4862
	/* Print out the early_node_map[] */
4863
	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
4864
	for (i = 0; i < nr_nodemap_entries; i++)
4865
		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
4866
						early_node_map[i].start_pfn,
4867
						early_node_map[i].end_pfn);
4868

4869
	/* Initialise every node */
4870
	mminit_verify_pageflags_layout();
4871
	setup_nr_node_ids();
4872
	for_each_online_node(nid) {
4873
		pg_data_t *pgdat = NODE_DATA(nid);
4874
		free_area_init_node(nid, NULL,
4875
				find_min_pfn_for_node(nid), NULL);
4876

4877
		/* Any memory on that node */
4878
		if (pgdat->node_present_pages)
4879
			node_set_state(nid, N_HIGH_MEMORY);
4880
		check_for_regular_memory(pgdat);
4881
	}
4882
}
4883

4884
/*
 * Common helper for the kernelcore= and movablecore= parsers.
 *
 * Runs memparse() on @p, converts the resulting byte count to pages and
 * stores it in *@core.  Returns -EINVAL when @p is NULL, 0 otherwise.
 */
static int __init cmdline_parse_core(char *p, unsigned long *core)
{
	unsigned long long nr_pages;

	if (p == NULL)
		return -EINVAL;

	nr_pages = memparse(p, &p) >> PAGE_SHIFT;
	*core = nr_pages;

	/* Paranoid check that an unsigned long can hold the page count */
	WARN_ON(nr_pages > ULONG_MAX);

	return 0;
}
4898

4899
/*
4900
 * kernelcore=size sets the amount of memory for use for allocations that
4901
 * cannot be reclaimed or migrated.
4902
 */
4903
static int __init cmdline_parse_kernelcore(char *p)
4904
{
4905
	return cmdline_parse_core(p, &required_kernelcore);
4906
}
4907

4908
/*
4909
 * movablecore=size sets the amount of memory for use for allocations that
4910
 * can be reclaimed or migrated.
4911
 */
4912
static int __init cmdline_parse_movablecore(char *p)
4913
{
4914
	return cmdline_parse_core(p, &required_movablecore);
4915
}
4916

4917
early_param("kernelcore", cmdline_parse_kernelcore);
4918
early_param("movablecore", cmdline_parse_movablecore);
4919

4920
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
4921

4922
/**
4923
 * set_dma_reserve - set the specified number of pages reserved in the first zone
4924
 * @new_dma_reserve: The number of pages to mark reserved
4925
 *
4926
 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4927
 * In the DMA zone, a significant percentage may be consumed by kernel image
4928
 * and other unfreeable allocations which can skew the watermarks badly. This
4929
 * function may optionally be used to account for unfreeable pages in the
4930
 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4931
 * smaller per-cpu batchsize.
4932
 */
4933
void __init set_dma_reserve(unsigned long new_dma_reserve)
4934
{
4935
	dma_reserve = new_dma_reserve;
4936
}
4937

4938
/*
 * Initialise node 0 with the given per-zone sizes, using the page frame
 * of PAGE_OFFSET's physical address as the node's start pfn.
 */
void __init free_area_init(unsigned long *zones_size)
{
	unsigned long node_start_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT;

	free_area_init_node(0, zones_size, node_start_pfn, NULL);
}
4943

4944
static int page_alloc_cpu_notify(struct notifier_block *self,
4945
				 unsigned long action, void *hcpu)
4946
{
4947
	int cpu = (unsigned long)hcpu;
4948

4949
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4950
		drain_pages(cpu);
4951

4952
		/*
4953
		 * Spill the event counters of the dead processor
4954
		 * into the current processors event counters.
4955
		 * This artificially elevates the count of the current
4956
		 * processor.
4957
		 */
4958
		vm_events_fold_cpu(cpu);
4959

4960
		/*
4961
		 * Zero the differential counters of the dead processor
4962
		 * so that the vm statistics are consistent.
4963
		 *
4964
		 * This is only okay since the processor is dead and cannot
4965
		 * race with what we are doing.
4966
		 */
4967
		refresh_cpu_vm_stats(cpu);
4968
	}
4969
	return NOTIFY_OK;
4970
}
4971

4972
/* Hook page_alloc_cpu_notify() into the CPU hotplug notifier chain. */
void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}
4976

4977
/*
4978
 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4979
 *	or min_free_kbytes changes.
4980
 */
4981
static void calculate_totalreserve_pages(void)
4982
{
4983
	struct pglist_data *pgdat;
4984
	unsigned long reserve_pages = 0;
4985
	enum zone_type i, j;
4986

4987
	for_each_online_pgdat(pgdat) {
4988
		for (i = 0; i < MAX_NR_ZONES; i++) {
4989
			struct zone *zone = pgdat->node_zones + i;
4990
			unsigned long max = 0;
4991

4992
			/* Find valid and maximum lowmem_reserve in the zone */
4993
			for (j = i; j < MAX_NR_ZONES; j++) {
4994
				if (zone->lowmem_reserve[j] > max)
4995
					max = zone->lowmem_reserve[j];
4996
			}
4997

4998
			/* we treat the high watermark as reserved pages. */
4999
			max += high_wmark_pages(zone);
5000

5001
			if (max > zone->present_pages)
5002
				max = zone->present_pages;
5003
			reserve_pages += max;
5004
		}
5005
	}
5006
	totalreserve_pages = reserve_pages;
5007
}
5008

5009
/*
5010
 * setup_per_zone_lowmem_reserve - called whenever
5011
 *	sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
5012
 *	has a correct pages reserved value, so an adequate number of
5013
 *	pages are left in the zone after a successful __alloc_pages().
5014
 */
5015
static void setup_per_zone_lowmem_reserve(void)
5016
{
5017
	struct pglist_data *pgdat;
5018
	enum zone_type j, idx;
5019

5020
	for_each_online_pgdat(pgdat) {
5021
		for (j = 0; j < MAX_NR_ZONES; j++) {
5022
			struct zone *zone = pgdat->node_zones + j;
5023
			unsigned long present_pages = zone->present_pages;
5024

5025
			zone->lowmem_reserve[j] = 0;
5026

5027
			idx = j;
5028
			while (idx) {
5029
				struct zone *lower_zone;
5030

5031
				idx--;
5032

5033
				if (sysctl_lowmem_reserve_ratio[idx] < 1)
5034
					sysctl_lowmem_reserve_ratio[idx] = 1;
5035

5036
				lower_zone = pgdat->node_zones + idx;
5037
				lower_zone->lowmem_reserve[j] = present_pages /
5038
					sysctl_lowmem_reserve_ratio[idx];
5039
				present_pages += lower_zone->present_pages;
5040
			}
5041
		}
5042
	}
5043

5044
	/* update totalreserve_pages */
5045
	calculate_totalreserve_pages();
5046
}
5047

5048
/**
5049
 * setup_per_zone_wmarks - called when min_free_kbytes changes
5050
 * or when memory is hot-{added|removed}
5051
 *
5052
 * Ensures that the watermark[min,low,high] values for each zone are set
5053
 * correctly with respect to min_free_kbytes.
5054
 */
5055
void setup_per_zone_wmarks(void)
5056
{
5057
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5058
	unsigned long lowmem_pages = 0;
5059
	struct zone *zone;
5060
	unsigned long flags;
5061

5062
	/* Calculate total number of !ZONE_HIGHMEM pages */
5063
	for_each_zone(zone) {
5064
		if (!is_highmem(zone))
5065
			lowmem_pages += zone->present_pages;
5066
	}
5067

5068
	for_each_zone(zone) {
5069
		u64 tmp;
5070

5071
		spin_lock_irqsave(&zone->lock, flags);
5072
		tmp = (u64)pages_min * zone->present_pages;
5073
		do_div(tmp, lowmem_pages);
5074
		if (is_highmem(zone)) {
5075
			/*
5076
			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5077
			 * need highmem pages, so cap pages_min to a small
5078
			 * value here.
5079
			 *
5080
			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5081
			 * deltas controls asynch page reclaim, and so should
5082
			 * not be capped for highmem.
5083
			 */
5084
			int min_pages;
5085

5086
			min_pages = zone->present_pages / 1024;
5087
			if (min_pages < SWAP_CLUSTER_MAX)
5088
				min_pages = SWAP_CLUSTER_MAX;
5089
			if (min_pages > 128)
5090
				min_pages = 128;
5091
			zone->watermark[WMARK_MIN] = min_pages;
5092
		} else {
5093
			/*
5094
			 * If it's a lowmem zone, reserve a number of pages
5095
			 * proportionate to the zone's size.
5096
			 */
5097
			zone->watermark[WMARK_MIN] = tmp;
5098
		}
5099

5100
		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
5101
		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5102
		setup_zone_migrate_reserve(zone);
5103
		spin_unlock_irqrestore(&zone->lock, flags);
5104
	}
5105

5106
	/* update totalreserve_pages */
5107
	calculate_totalreserve_pages();
5108
}
5109

5110
/*
5111
 * The inactive anon list should be small enough that the VM never has to
5112
 * do too much work, but large enough that each inactive page has a chance
5113
 * to be referenced again before it is swapped out.
5114
 *
5115
 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5116
 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5117
 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5118
 * the anonymous pages are kept on the inactive list.
5119
 *
5120
 * total     target    max
5121
 * memory    ratio     inactive anon
5122
 * -------------------------------------
5123
 *   10MB       1         5MB
5124
 *  100MB       1        50MB
5125
 *    1GB       3       250MB
5126
 *   10GB      10       0.9GB
5127
 *  100GB      31         3GB
5128
 *    1TB     101        10GB
5129
 *   10TB     320        32GB
5130
 */
5131
static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5132
{
5133
	unsigned int gb, ratio;
5134

5135
	/* Zone size in gigabytes */
5136
	gb = zone->present_pages >> (30 - PAGE_SHIFT);
5137
	if (gb)
5138
		ratio = int_sqrt(10 * gb);
5139
	else
5140
		ratio = 1;
5141

5142
	zone->inactive_ratio = ratio;
5143
}
5144

5145
/* Recompute inactive_ratio for every zone in the system. */
static void __meminit setup_per_zone_inactive_ratio(void)
{
	struct zone *zone;

	for_each_zone(zone) {
		calculate_zone_inactive_ratio(zone);
	}
}
5152

5153
/*
5154
 * Initialise min_free_kbytes.
5155
 *
5156
 * For small machines we want it small (128k min).  For large machines
5157
 * we want it large (64MB max).  But it is not linear, because network
5158
 * bandwidth does not increase linearly with machine size.  We use
5159
 *
5160
 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5161
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
5162
 *
5163
 * which yields
5164
 *
5165
 * 16MB:	512k
5166
 * 32MB:	724k
5167
 * 64MB:	1024k
5168
 * 128MB:	1448k
5169
 * 256MB:	2048k
5170
 * 512MB:	2896k
5171
 * 1024MB:	4096k
5172
 * 2048MB:	5792k
5173
 * 4096MB:	8192k
5174
 * 8192MB:	11584k
5175
 * 16384MB:	16384k
5176
 */
5177
int __meminit init_per_zone_wmark_min(void)
5178
{
5179
	unsigned long lowmem_kbytes;
5180

5181
	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5182

5183
	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5184
	if (min_free_kbytes < 128)
5185
		min_free_kbytes = 128;
5186
	if (min_free_kbytes > 65536)
5187
		min_free_kbytes = 65536;
5188
	setup_per_zone_wmarks();
5189
	refresh_zone_stat_thresholds();
5190
	setup_per_zone_lowmem_reserve();
5191
	setup_per_zone_inactive_ratio();
5192
	return 0;
5193
}
5194
module_init(init_per_zone_wmark_min)
5195

5196
/*
5197
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
5198
 *	that we can call two helper functions whenever min_free_kbytes
5199
 *	changes.
5200
 */
5201
int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
5202
	void __user *buffer, size_t *length, loff_t *ppos)
5203
{
5204
	proc_dointvec(table, write, buffer, length, ppos);
5205
	if (write)
5206
		setup_per_zone_wmarks();
5207
	return 0;
5208
}
5209

5210
#ifdef CONFIG_NUMA
5211
int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5212
	void __user *buffer, size_t *length, loff_t *ppos)
5213
{
5214
	struct zone *zone;
5215
	int rc;
5216

5217
	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5218
	if (rc)
5219
		return rc;
5220

5221
	for_each_zone(zone)
5222
		zone->min_unmapped_pages = (zone->present_pages *
5223
				sysctl_min_unmapped_ratio) / 100;
5224
	return 0;
5225
}
5226

5227
int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5228
	void __user *buffer, size_t *length, loff_t *ppos)
5229
{
5230
	struct zone *zone;
5231
	int rc;
5232

5233
	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5234
	if (rc)
5235
		return rc;
5236

5237
	for_each_zone(zone)
5238
		zone->min_slab_pages = (zone->present_pages *
5239
				sysctl_min_slab_ratio) / 100;
5240
	return 0;
5241
}
5242
#endif
5243

5244
/*
5245
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5246
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5247
 *	whenever sysctl_lowmem_reserve_ratio changes.
5248
 *
5249
 * The reserve ratio obviously has absolutely no relation with the
5250
 * minimum watermarks. The lowmem reserve ratio can only make sense
5251
 * if in function of the boot time zone sizes.
5252
 */
5253
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5254
	void __user *buffer, size_t *length, loff_t *ppos)
5255
{
5256
	proc_dointvec_minmax(table, write, buffer, length, ppos);
5257
	setup_per_zone_lowmem_reserve();
5258
	return 0;
5259
}
5260

5261
/*
5262
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5263
 * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
5264
 * can have before it gets flushed back to buddy allocator.
5265
 */
5266

5267
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5268
	void __user *buffer, size_t *length, loff_t *ppos)
5269
{
5270
	struct zone *zone;
5271
	unsigned int cpu;
5272
	int ret;
5273

5274
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5275
	if (!write || (ret == -EINVAL))
5276
		return ret;
5277
	for_each_populated_zone(zone) {
5278
		for_each_possible_cpu(cpu) {
5279
			unsigned long  high;
5280
			high = zone->present_pages / percpu_pagelist_fraction;
5281
			setup_pagelist_highmark(
5282
				per_cpu_ptr(zone->pageset, cpu), high);
5283
		}
5284
	}
5285
	return 0;
5286
}
5287

5288
/*
 * When non-zero, alloc_large_system_hash() takes non-early tables from
 * __vmalloc().  NUMA kernels let "hashdist=" on the command line set it.
 */
int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
/* Parse the "hashdist=" boot option; returns 1 once a value was consumed. */
static int __init set_hashdist(char *str)
{
	if (str) {
		hashdist = simple_strtoul(str, &str, 0);
		return 1;
	}
	return 0;
}
__setup("hashdist=", set_hashdist);
#endif
5300

5301
/*
5302
 * allocate a large system hash table from bootmem
5303
 * - it is assumed that the hash table must contain an exact power-of-2
5304
 *   quantity of entries
5305
 * - limit is the number of hash buckets, not the total allocation size
5306
 */
5307
void *__init alloc_large_system_hash(const char *tablename,
5308
				     unsigned long bucketsize,
5309
				     unsigned long numentries,
5310
				     int scale,
5311
				     int flags,
5312
				     unsigned int *_hash_shift,
5313
				     unsigned int *_hash_mask,
5314
				     unsigned long limit)
5315
{
5316
	unsigned long long max = limit;
5317
	unsigned long log2qty, size;
5318
	void *table = NULL;
5319

5320
	/* allow the kernel cmdline to have a say */
5321
	if (!numentries) {
5322
		/* round applicable memory size up to nearest megabyte */
5323
		numentries = nr_kernel_pages;
5324
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5325
		numentries >>= 20 - PAGE_SHIFT;
5326
		numentries <<= 20 - PAGE_SHIFT;
5327

5328
		/* limit to 1 bucket per 2^scale bytes of low memory */
5329
		if (scale > PAGE_SHIFT)
5330
			numentries >>= (scale - PAGE_SHIFT);
5331
		else
5332
			numentries <<= (PAGE_SHIFT - scale);
5333

5334
		/* Make sure we've got at least a 0-order allocation.. */
5335
		if (unlikely(flags & HASH_SMALL)) {
5336
			/* Makes no sense without HASH_EARLY */
5337
			WARN_ON(!(flags & HASH_EARLY));
5338
			if (!(numentries >> *_hash_shift)) {
5339
				numentries = 1UL << *_hash_shift;
5340
				BUG_ON(!numentries);
5341
			}
5342
		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5343
			numentries = PAGE_SIZE / bucketsize;
5344
	}
5345
	numentries = roundup_pow_of_two(numentries);
5346

5347
	/* limit allocation size to 1/16 total memory by default */
5348
	if (max == 0) {
5349
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5350
		do_div(max, bucketsize);
5351
	}
5352

5353
	if (numentries > max)
5354
		numentries = max;
5355

5356
	log2qty = ilog2(numentries);
5357

5358
	do {
5359
		size = bucketsize << log2qty;
5360
		if (flags & HASH_EARLY)
5361
			table = alloc_bootmem_nopanic(size);
5362
		else if (hashdist)
5363
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5364
		else {
5365
			/*
5366
			 * If bucketsize is not a power-of-two, we may free
5367
			 * some pages at the end of hash table which
5368
			 * alloc_pages_exact() automatically does
5369
			 */
5370
			if (get_order(size) < MAX_ORDER) {
5371
				table = alloc_pages_exact(size, GFP_ATOMIC);
5372
				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5373
			}
5374
		}
5375
	} while (!table && size > PAGE_SIZE && --log2qty);
5376

5377
	if (!table)
5378
		panic("Failed to allocate %s hash table\n", tablename);
5379

5380
	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5381
	       tablename,
5382
	       (1UL << log2qty),
5383
	       ilog2(size) - PAGE_SHIFT,
5384
	       size);
5385

5386
	if (_hash_shift)
5387
		*_hash_shift = log2qty;
5388
	if (_hash_mask)
5389
		*_hash_mask = (1 << log2qty) - 1;
5390

5391
	return table;
5392
}
5393

5394
/* Return a pointer to the bitmap storing bits affecting a block of pages */
5395
static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5396
							unsigned long pfn)
5397
{
5398
#ifdef CONFIG_SPARSEMEM
5399
	return __pfn_to_section(pfn)->pageblock_flags;
5400
#else
5401
	return zone->pageblock_flags;
5402
#endif /* CONFIG_SPARSEMEM */
5403
}
5404

5405
static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5406
{
5407
#ifdef CONFIG_SPARSEMEM
5408
	pfn &= (PAGES_PER_SECTION-1);
5409
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5410
#else
5411
	pfn = pfn - zone->zone_start_pfn;
5412
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5413
#endif /* CONFIG_SPARSEMEM */
5414
}
5415

5416
/**
 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @start_bitidx: The first bit of interest to retrieve
 * @end_bitidx: The last bit of interest
 *
 * Bit @start_bitidx of the block is reported in bit 0 of the result,
 * the following bit in bit 1, and so on up to @end_bitidx.
 *
 * returns pageblock_bits flags
 */
unsigned long get_pageblock_flags_group(struct page *page,
					int start_bitidx, int end_bitidx)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
	unsigned long base = pfn_to_bitidx(zone, pfn);
	unsigned long result = 0;
	unsigned long out_bit;
	int bit;

	for (bit = start_bitidx, out_bit = 1; bit <= end_bitidx;
	     bit++, out_bit <<= 1) {
		if (!test_bit(base + bit, bitmap))
			continue;
		result |= out_bit;
	}

	return result;
}
5443

5444
/**
5445
 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5446
 * @page: The page within the block of interest
5447
 * @start_bitidx: The first bit of interest
5448
 * @end_bitidx: The last bit of interest
5449
 * @flags: The flags to set
5450
 */
5451
void set_pageblock_flags_group(struct page *page, unsigned long flags,
5452
					int start_bitidx, int end_bitidx)
5453
{
5454
	struct zone *zone;
5455
	unsigned long *bitmap;
5456
	unsigned long pfn, bitidx;
5457
	unsigned long value = 1;
5458

5459
	zone = page_zone(page);
5460
	pfn = page_to_pfn(page);
5461
	bitmap = get_pageblock_bitmap(zone, pfn);
5462
	bitidx = pfn_to_bitidx(zone, pfn);
5463
	VM_BUG_ON(pfn < zone->zone_start_pfn);
5464
	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5465

5466
	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5467
		if (flags & value)
5468
			__set_bit(bitidx + start_bitidx, bitmap);
5469
		else
5470
			__clear_bit(bitidx + start_bitidx, bitmap);
5471
}
5472

5473
/*
5474
 * This is designed as sub function...plz see page_isolation.c also.
5475
 * set/clear page block's type to be ISOLATE.
5476
 * page allocater never alloc memory from ISOLATE block.
5477
 */
5478

5479
static int
5480
__count_immobile_pages(struct zone *zone, struct page *page, int count)
5481
{
5482
	unsigned long pfn, iter, found;
5483
	/*
5484
	 * For avoiding noise data, lru_add_drain_all() should be called
5485
	 * If ZONE_MOVABLE, the zone never contains immobile pages
5486
	 */
5487
	if (zone_idx(zone) == ZONE_MOVABLE)
5488
		return true;
5489

5490
	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5491
		return true;
5492

5493
	pfn = page_to_pfn(page);
5494
	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5495
		unsigned long check = pfn + iter;
5496

5497
		if (!pfn_valid_within(check))
5498
			continue;
5499

5500
		page = pfn_to_page(check);
5501
		if (!page_count(page)) {
5502
			if (PageBuddy(page))
5503
				iter += (1 << page_order(page)) - 1;
5504
			continue;
5505
		}
5506
		if (!PageLRU(page))
5507
			found++;
5508
		/*
5509
		 * If there are RECLAIMABLE pages, we need to check it.
5510
		 * But now, memory offline itself doesn't call shrink_slab()
5511
		 * and it still to be fixed.
5512
		 */
5513
		/*
5514
		 * If the page is not RAM, page_count()should be 0.
5515
		 * we don't need more check. This is an _used_ not-movable page.
5516
		 *
5517
		 * The problematic thing here is PG_reserved pages. PG_reserved
5518
		 * is set to both of a memory hole page and a _used_ kernel
5519
		 * page at boot.
5520
		 */
5521
		if (found > count)
5522
			return false;
5523
	}
5524
	return true;
5525
}
5526

5527
/*
 * Report whether the pageblock starting at @page contains no immobile
 * pages at all.  The zone lock is not taken here.
 */
bool is_pageblock_removable_nolock(struct page *page)
{
	return __count_immobile_pages(page_zone(page), page, 0);
}
5532

5533
int set_migratetype_isolate(struct page *page)
5534
{
5535
	struct zone *zone;
5536
	unsigned long flags, pfn;
5537
	struct memory_isolate_notify arg;
5538
	int notifier_ret;
5539
	int ret = -EBUSY;
5540

5541
	zone = page_zone(page);
5542

5543
	spin_lock_irqsave(&zone->lock, flags);
5544

5545
	pfn = page_to_pfn(page);
5546
	arg.start_pfn = pfn;
5547
	arg.nr_pages = pageblock_nr_pages;
5548
	arg.pages_found = 0;
5549

5550
	/*
5551
	 * It may be possible to isolate a pageblock even if the
5552
	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5553
	 * notifier chain is used by balloon drivers to return the
5554
	 * number of pages in a range that are held by the balloon
5555
	 * driver to shrink memory. If all the pages are accounted for
5556
	 * by balloons, are free, or on the LRU, isolation can continue.
5557
	 * Later, for example, when memory hotplug notifier runs, these
5558
	 * pages reported as "can be isolated" should be isolated(freed)
5559
	 * by the balloon driver through the memory notifier chain.
5560
	 */
5561
	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5562
	notifier_ret = notifier_to_errno(notifier_ret);
5563
	if (notifier_ret)
5564
		goto out;
5565
	/*
5566
	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5567
	 * We just check MOVABLE pages.
5568
	 */
5569
	if (__count_immobile_pages(zone, page, arg.pages_found))
5570
		ret = 0;
5571

5572
	/*
5573
	 * immobile means "not-on-lru" paes. If immobile is larger than
5574
	 * removable-by-driver pages reported by notifier, we'll fail.
5575
	 */
5576

5577
out:
5578
	if (!ret) {
5579
		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5580
		move_freepages_block(zone, page, MIGRATE_ISOLATE);
5581
	}
5582

5583
	spin_unlock_irqrestore(&zone->lock, flags);
5584
	if (!ret)
5585
		drain_all_pages();
5586
	return ret;
5587
}
5588

5589
void unset_migratetype_isolate(struct page *page)
5590
{
5591
	struct zone *zone;
5592
	unsigned long flags;
5593
	zone = page_zone(page);
5594
	spin_lock_irqsave(&zone->lock, flags);
5595
	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5596
		goto out;
5597
	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5598
	move_freepages_block(zone, page, MIGRATE_MOVABLE);
5599
out:
5600
	spin_unlock_irqrestore(&zone->lock, flags);
5601
}
5602

5603
#ifdef CONFIG_MEMORY_HOTREMOVE
5604
/*
5605
 * All pages in the range must be isolated before calling this.
5606
 */
5607
void
5608
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5609
{
5610
	struct page *page;
5611
	struct zone *zone;
5612
	int order, i;
5613
	unsigned long pfn;
5614
	unsigned long flags;
5615
	/* find the first valid pfn */
5616
	for (pfn = start_pfn; pfn < end_pfn; pfn++)
5617
		if (pfn_valid(pfn))
5618
			break;
5619
	if (pfn == end_pfn)
5620
		return;
5621
	zone = page_zone(pfn_to_page(pfn));
5622
	spin_lock_irqsave(&zone->lock, flags);
5623
	pfn = start_pfn;
5624
	while (pfn < end_pfn) {
5625
		if (!pfn_valid(pfn)) {
5626
			pfn++;
5627
			continue;
5628
		}
5629
		page = pfn_to_page(pfn);
5630
		BUG_ON(page_count(page));
5631
		BUG_ON(!PageBuddy(page));
5632
		order = page_order(page);
5633
#ifdef CONFIG_DEBUG_VM
5634
		printk(KERN_INFO "remove from free list %lx %d %lx\n",
5635
		       pfn, 1 << order, end_pfn);
5636
#endif
5637
		list_del(&page->lru);
5638
		rmv_page_order(page);
5639
		zone->free_area[order].nr_free--;
5640
		__mod_zone_page_state(zone, NR_FREE_PAGES,
5641
				      - (1UL << order));
5642
		for (i = 0; i < (1 << order); i++)
5643
			SetPageReserved((page+i));
5644
		pfn += (1 << order);
5645
	}
5646
	spin_unlock_irqrestore(&zone->lock, flags);
5647
}
5648
#endif
5649

5650
#ifdef CONFIG_MEMORY_FAILURE
5651
bool is_free_buddy_page(struct page *page)
5652
{
5653
	struct zone *zone = page_zone(page);
5654
	unsigned long pfn = page_to_pfn(page);
5655
	unsigned long flags;
5656
	int order;
5657

5658
	spin_lock_irqsave(&zone->lock, flags);
5659
	for (order = 0; order < MAX_ORDER; order++) {
5660
		struct page *page_head = page - (pfn & ((1 << order) - 1));
5661

5662
		if (PageBuddy(page_head) && page_order(page_head) >= order)
5663
			break;
5664
	}
5665
	spin_unlock_irqrestore(&zone->lock, flags);
5666

5667
	return order < MAX_ORDER;
5668
}
5669
#endif
5670

5671
static struct trace_print_flags pageflag_names[] = {
5672
	{1UL << PG_locked,		"locked"	},
5673
	{1UL << PG_error,		"error"		},
5674
	{1UL << PG_referenced,		"referenced"	},
5675
	{1UL << PG_uptodate,		"uptodate"	},
5676
	{1UL << PG_dirty,		"dirty"		},
5677
	{1UL << PG_lru,			"lru"		},
5678
	{1UL << PG_active,		"active"	},
5679
	{1UL << PG_slab,		"slab"		},
5680
	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
5681
	{1UL << PG_arch_1,		"arch_1"	},
5682
	{1UL << PG_reserved,		"reserved"	},
5683
	{1UL << PG_private,		"private"	},
5684
	{1UL << PG_private_2,		"private_2"	},
5685
	{1UL << PG_writeback,		"writeback"	},
5686
#ifdef CONFIG_PAGEFLAGS_EXTENDED
5687
	{1UL << PG_head,		"head"		},
5688
	{1UL << PG_tail,		"tail"		},
5689
#else
5690
	{1UL << PG_compound,		"compound"	},
5691
#endif
5692
	{1UL << PG_swapcache,		"swapcache"	},
5693
	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
5694
	{1UL << PG_reclaim,		"reclaim"	},
5695
	{1UL << PG_swapbacked,		"swapbacked"	},
5696
	{1UL << PG_unevictable,		"unevictable"	},
5697
#ifdef CONFIG_MMU
5698
	{1UL << PG_mlocked,		"mlocked"	},
5699
#endif
5700
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5701
	{1UL << PG_uncached,		"uncached"	},
5702
#endif
5703
#ifdef CONFIG_MEMORY_FAILURE
5704
	{1UL << PG_hwpoison,		"hwpoison"	},
5705
#endif
5706
	{-1UL,				NULL		},
5707
};
5708

5709
static void dump_page_flags(unsigned long flags)
5710
{
5711
	const char *delim = "";
5712
	unsigned long mask;
5713
	int i;
5714

5715
	printk(KERN_ALERT "page flags: %#lx(", flags);
5716

5717
	/* remove zone id */
5718
	flags &= (1UL << NR_PAGEFLAGS) - 1;
5719

5720
	for (i = 0; pageflag_names[i].name && flags; i++) {
5721

5722
		mask = pageflag_names[i].mask;
5723
		if ((flags & mask) != mask)
5724
			continue;
5725

5726
		flags &= ~mask;
5727
		printk("%s%s", delim, pageflag_names[i].name);
5728
		delim = "|";
5729
	}
5730

5731
	/* check for left over flags */
5732
	if (flags)
5733
		printk("%s%#lx", delim, flags);
5734

5735
	printk(")\n");
5736
}
5737

5738
void dump_page(struct page *page)
5739
{
5740
	printk(KERN_ALERT
5741
	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5742
		page, atomic_read(&page->_count), page_mapcount(page),
5743
		page->mapping, page->index);
5744
	dump_page_flags(page->flags);
5745
	mem_cgroup_print_bad_page(page);
5746
}
5747

5748
Product

Resources

Company