[PATCH -mm] mm: fine-grained dirty_ratio_pcm and dirty_background_ratio_pcm (v2)

Andrea Righi righi.andrea at gmail.com
Mon Nov 10 12:58:28 PST 2008


The current granularity of 5% of dirtyable memory for dirty pages writeback is
too coarse for large memory machines and this will get worse as
memory-size/disk-speed ratio continues to increase.

These large writebacks can be unpleasant for desktop or latency-sensitive
environments, where the time to complete each writeback can be perceived as a
lack of responsiveness by the whole system.

What follows is a solution similar to the one discussed in [1], but slightly
simplified: it provides the same functionality (in particular, it avoids
backward-compatibility problems) while reducing the amount of code that would
otherwise be needed to implement an in-kernel parser for percentages with
decimal digits.

The kernel provides the following parameters:
 - dirty_ratio, dirty_background_ratio in percentage (1 ... 100)
 - dirty_ratio_pcm, dirty_background_ratio_pcm in units of percent mille (1 ... 100,000)

Both dirty_ratio and dirty_ratio_pcm refer to the same vm_dirty_ratio variable;
only the interface used to read/write the value differs. The same holds for
dirty_background_ratio and dirty_background_ratio_pcm.

In this way it's possible to provide a fine-grained interface to configure the
writeback policy and at the same time preserve the compatibility with the old
dirty_ratio / dirty_background_ratio users.

Examples:
 # echo 5 > /proc/sys/vm/dirty_ratio
 # cat /proc/sys/vm/dirty_ratio
 5
 # cat /proc/sys/vm/dirty_ratio_pcm
 5000

 # echo 500 > /proc/sys/vm/dirty_ratio_pcm
 # cat /proc/sys/vm/dirty_ratio
 0
 # cat /proc/sys/vm/dirty_ratio_pcm
 500

 # echo 5500 > /proc/sys/vm/dirty_ratio_pcm
 # cat /proc/sys/vm/dirty_ratio
 5
 # cat /proc/sys/vm/dirty_ratio_pcm
 5500

Changelog: (v1 -> v2)

* fix overflow on 32-bit systems (calc_period_shift needs a u64)
* rebased onto (and tested on) 2.6.28-rc2-mm1

[1] http://lkml.org/lkml/2008/10/7/230

Signed-off-by: Andrea Righi <righi.andrea at gmail.com>
---
 Documentation/filesystems/proc.txt |   20 +++++++++
 include/linux/sysctl.h             |    7 +++
 kernel/sysctl.c                    |   80 +++++++++++++++++++++++++++++++++--
 kernel/sysctl_check.c              |    3 +
 mm/page-writeback.c                |   31 +++++++++++---
 5 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index bcceb99..38ed5bf 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1389,6 +1389,16 @@ pages + file cache, not including locked pages and HugePages), the number of
 pages at which the pdflush background writeback daemon will start writing out
 dirty data.
 
+dirty_background_ratio_pcm
+--------------------------
+
+A fine-grained interface to configure dirty_background_ratio.
+
+Contains, as a percentage in units of pcm (percent mille) of the dirtyable
+system memory (free pages + mapped pages + file cache, not including locked
+pages and HugePages), the number of pages at which the pdflush background
+writeback daemon will start writing out dirty data.
+
 dirty_ratio
 -----------------
 
@@ -1397,6 +1407,16 @@ pages + file cache, not including locked pages and HugePages), the number of
 pages at which a process which is generating disk writes will itself start
 writing out dirty data.
 
+dirty_ratio_pcm
+---------------
+
+A fine-grained interface to configure dirty_ratio.
+
+Contains, as a percentage in units of pcm (percent mille) of the dirtyable
+system memory (free pages + mapped pages + file cache, not including locked
+pages and HugePages), the number of pages at which a process which is
+generating disk writes will itself start writing out dirty data.
+
 dirty_writeback_centisecs
 -------------------------
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 39d471d..799594b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -32,6 +32,9 @@
 struct file;
 struct completion;
 
+#define PERCENT_PCM	1000
+#define ONE_HUNDRED_PCM (100 * PERCENT_PCM)
+
 #define CTL_MAXNAME 10		/* how many path components do we allow in a
 				   call to sysctl?   In other words, what is
 				   the largest acceptable value for the nlen
@@ -205,6 +208,8 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_DIRTY_BACKGROUND_PCM = 36, /* fine-grained dirty_background_ratio */
+	VM_DIRTY_RATIO_PCM = 37, /* fine-grained dirty_ratio */
 };
 
 
@@ -991,6 +996,8 @@ extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *,
 				    void __user *, size_t *, loff_t *);
+extern int proc_dointvec_pcm_minmax(struct ctl_table *, int, struct file *,
+				    void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d14953a..06ba902 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -88,9 +88,7 @@ extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
 static int one = 1;
-#endif
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
@@ -103,6 +101,7 @@ static int two = 2;
 
 static int zero;
 static int one_hundred = 100;
+static int one_hundred_pcm = ONE_HUNDRED_PCM;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -926,12 +925,23 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_background_ratio,
 		.maxlen		= sizeof(dirty_background_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_dointvec_pcm_minmax,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
+		.extra1		= &one,
 		.extra2		= &one_hundred,
 	},
 	{
+		.ctl_name	= VM_DIRTY_BACKGROUND_PCM,
+		.procname	= "dirty_background_ratio_pcm",
+		.data		= &dirty_background_ratio,
+		.maxlen		= sizeof(dirty_background_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred_pcm,
+	},
+	{
 		.ctl_name	= VM_DIRTY_RATIO,
 		.procname	= "dirty_ratio",
 		.data		= &vm_dirty_ratio,
@@ -939,10 +949,21 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &dirty_ratio_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
+		.extra1		= &one,
 		.extra2		= &one_hundred,
 	},
 	{
+		.ctl_name	= VM_DIRTY_RATIO_PCM,
+		.procname	= "dirty_ratio_pcm",
+		.data		= &vm_dirty_ratio,
+		.maxlen		= sizeof(vm_dirty_ratio),
+		.mode		= 0644,
+		.proc_handler	= &dirty_ratio_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &one_hundred_pcm,
+	},
+	{
 		.procname	= "dirty_writeback_centisecs",
 		.data		= &dirty_writeback_interval,
 		.maxlen		= sizeof(dirty_writeback_interval),
@@ -2525,6 +2546,35 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 				     lenp, ppos, HZ, 1000l);
 }
 
+static int do_proc_dointvec_pcm_minmax_conv(int *negp, unsigned long *lvalp,
+					 int *valp, int write, void *data)
+{
+	struct do_proc_dointvec_minmax_conv_param *param = data;
+	int val;
+
+	if (write) {
+		if (*lvalp > LONG_MAX / PERCENT_PCM)
+			return -EINVAL;
+		val = *negp ? -*lvalp : *lvalp;
+		if ((param->min && *param->min > val) ||
+		    (param->max && *param->max < val))
+			return -EINVAL;
+		*valp = val * PERCENT_PCM;
+	} else {
+		unsigned long lval;
+
+		val = *valp;
+		if (val < 0) {
+			*negp = -1;
+			lval = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			lval = (unsigned long)val;
+		}
+		*lvalp = lval / PERCENT_PCM;
+	}
+	return 0;
+}
 
 static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
 					 int *valp,
@@ -2663,6 +2713,19 @@ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *fi
 				do_proc_dointvec_ms_jiffies_conv, NULL);
 }
 
+int proc_dointvec_pcm_minmax(struct ctl_table *table, int write,
+			struct file *filp, void __user *buffer, size_t *lenp,
+			loff_t *ppos)
+{
+	struct do_proc_dointvec_minmax_conv_param param = {
+		.min = (int *)table->extra1,
+		.max = (int *)table->extra2,
+	};
+
+	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+				do_proc_dointvec_pcm_minmax_conv, &param);
+}
+
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2711,6 +2774,13 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
 	return -ENOSYS;
 }
 
+int proc_dointvec_pcm_minmax(struct ctl_table *table, int write,
+			struct file *filp, void __user *buffer, size_t *lenp,
+			loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23..83934a8 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,7 +111,9 @@ static const struct trans_ctl_table trans_vm_table[] = {
 	{ VM_OVERCOMMIT_MEMORY,		"overcommit_memory" },
 	{ VM_PAGE_CLUSTER,		"page-cluster" },
 	{ VM_DIRTY_BACKGROUND,		"dirty_background_ratio" },
+	{ VM_DIRTY_BACKGROUND_PCM,	"dirty_background_ratio_pcm" },
 	{ VM_DIRTY_RATIO,		"dirty_ratio" },
+	{ VM_DIRTY_RATIO_PCM,		"dirty_ratio_pcm" },
 	{ VM_DIRTY_WB_CS,		"dirty_writeback_centisecs" },
 	{ VM_DIRTY_EXPIRE_CS,		"dirty_expire_centisecs" },
 	{ VM_NR_PDFLUSH_THREADS,	"nr_pdflush_threads" },
@@ -1494,6 +1496,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 			    (table->proc_handler == proc_dostring) ||
 			    (table->proc_handler == proc_dointvec) ||
 			    (table->proc_handler == proc_dointvec_minmax) ||
+			    (table->proc_handler == proc_dointvec_pcm_minmax) ||
 			    (table->proc_handler == proc_dointvec_jiffies) ||
 			    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
 			    (table->proc_handler == proc_dointvec_ms_jiffies) ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b3584bf..e010a39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
 /*
  * Start background writeback (via pdflush) at this percentage
  */
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 5 * PERCENT_PCM;
 
 /*
  * free highmem will not be subtracted from the total free memory
@@ -77,7 +77,7 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 10 * PERCENT_PCM;
 
 /*
  * The interval between `kupdate'-style writebacks, in jiffies
@@ -133,9 +133,10 @@ static struct prop_descriptor vm_dirties;
  */
 static int calc_period_shift(void)
 {
-	unsigned long dirty_total;
+	u64 dirty_total;
 
-	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory())
+			/ ONE_HUNDRED_PCM;
 	return 2 + ilog2(dirty_total - 1);
 }
 
@@ -147,7 +148,23 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int old_ratio = vm_dirty_ratio;
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret;
+
+	switch (table->ctl_name) {
+	case VM_DIRTY_RATIO:
+		ret = proc_dointvec_pcm_minmax(table, write, filp, buffer,
+					lenp, ppos);
+		break;
+	case VM_DIRTY_RATIO_PCM:
+		ret = proc_dointvec_minmax(table, write, filp, buffer,
+					lenp, ppos);
+		break;
+	default:
+		ret = -EINVAL;
+		WARN_ON(1);
+		break;
+	}
+
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
 		int shift = calc_period_shift();
 		prop_change_shift(&vm_completions, shift);
@@ -380,8 +397,8 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 	if (background_ratio >= dirty_ratio)
 		background_ratio = dirty_ratio / 2;
 
-	background = (background_ratio * available_memory) / 100;
-	dirty = (dirty_ratio * available_memory) / 100;
+	background = (background_ratio * available_memory) / ONE_HUNDRED_PCM;
+	dirty = (dirty_ratio * available_memory) / ONE_HUNDRED_PCM;
 	tsk = current;
 	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
 		background += background / 4;


More information about the Containers mailing list