power: add unpriv. read of turbo % for pstate

Message ID 20220824192859.66095-1-markus.theil@tu-ilmenau.de (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series power: add unpriv. read of turbo % for pstate |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Markus Theil Aug. 24, 2022, 7:28 p.m. UTC
  If DPDK applications should be used with a minimal set of privileges,
using the msr kernel module on linux should not be necessary.

Since at least kernel 4.4 the rdmsr call to obtain the last non-turbo
boost frequency can be left out, if the sysfs interface is used.
Also RHEL 7 with recent kernel updates should include the sysfs interface
for this (I only looked this up for CentOS 7).

Signed-off-by: Markus Theil <markus.theil@tu-ilmenau.de>
---
 lib/power/power_pstate_cpufreq.c | 69 ++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 29 deletions(-)
  

Comments

Hunt, David Aug. 25, 2022, 3:09 p.m. UTC | #1
Hi Markus,

On 24/08/2022 20:28, Markus Theil wrote:
> If DPDK applications should be used with a minimal set of privileges,
> using the msr kernel module on linux should not be necessary.
>
> Since at least kernel 4.4 the rdmsr call to obtain the last non-turbo
> boost frequency can be left out, if the sysfs interface is used.
> Also RHEL 7 with recent kernel updates should include the sysfs interface
> for this (I only looked this up for CentOS 7).
>
> Signed-off-by: Markus Theil <markus.theil@tu-ilmenau.de>
> ---
>   lib/power/power_pstate_cpufreq.c | 69 ++++++++++++++++++--------------
>   1 file changed, 40 insertions(+), 29 deletions(-)
>
> diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c
> index 78c9197695..c3d66a8f68 100644
> --- a/lib/power/power_pstate_cpufreq.c
> +++ b/lib/power/power_pstate_cpufreq.c
> @@ -35,15 +35,9 @@
>   		"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
>   #define POWER_SYSFILE_BASE_FREQ  \
>   		"/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
> +#define POWER_SYSFILE_TURBO_PCT  \
> +		"/sys/devices/system/cpu/intel_pstate/turbo_pct"
>   #define POWER_PSTATE_DRIVER "intel_pstate"
> -#define POWER_MSR_PATH  "/dev/cpu/%u/msr"
> -
> -/*
> - * MSR related
> - */
> -#define PLATFORM_INFO     0x0CE
> -#define NON_TURBO_MASK    0xFF00
> -#define NON_TURBO_OFFSET  0x8
>   
>   
>   enum power_state {
> @@ -74,37 +68,33 @@ struct pstate_power_info {
>   static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
>   
>   /**
> - * It is to read the specific MSR.
> + * It is to read the turbo mode percentage from sysfs
>    */
> -
>   static int32_t
> -power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
> +power_read_turbo_pct(uint64_t *outVal)
>   {
>   	int fd, ret;
> -	char fullpath[PATH_MAX];
> +	char val[4] = {0};
>   
> -	snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
> -
> -	fd = open(fullpath, O_RDONLY);
> +	fd = open(POWER_SYSFILE_TURBO_PCT, O_RDONLY);
>   
>   	if (fd < 0) {
> -		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
> +		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", POWER_SYSFILE_TURBO_PCT,
>   				 strerror(errno));
>   		return fd;
>   	}
>   
> -	ret = pread(fd, val, sizeof(uint64_t), msr);
> +	ret = read(fd, val, sizeof(val));
>   
>   	if (ret < 0) {
> -		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
> +		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", POWER_SYSFILE_TURBO_PCT,
>   				 strerror(errno));
>   		goto out;
>   	}
>   
> -	POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
> -			fullpath, msr, lcore_id);
> +	*outVal = (uint64_t) atol(val);
>   

I'd recommend replacing atol with strtol, it's a safer implementation. 
It's more commonly found in DPDK code than atol.


> -	POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
> +	POWER_DEBUG_TRACE("power turbo pct: %"PRIu64"\n", *outVal);
>   
>   out:	close(fd);
>   	return ret;
> @@ -116,8 +106,9 @@ out:	close(fd);
>   static int
>   power_init_for_setting_freq(struct pstate_power_info *pi)
>   {
> -	FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
> -	uint32_t base_ratio, base_max_ratio;
> +	FILE *f_base = NULL, *f_base_min = NULL, *f_base_max = NULL,
> +	     *f_min = NULL, *f_max = NULL;
> +	uint32_t base_ratio, base_min_ratio, base_max_ratio;
>   	uint64_t max_non_turbo;
>   	int ret;
>   
> @@ -130,6 +121,14 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
>   		goto err;
>   	}
>   
> +	open_core_sysfs_file(&f_base_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
> +			pi->lcore_id);
> +	if (f_base_min == NULL) {
> +		RTE_LOG(ERR, POWER, "failed to open %s\n",
> +				POWER_SYSFILE_BASE_MIN_FREQ);
> +		goto err;
> +	}
> +
>   	open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
>   			pi->lcore_id);
>   	if (f_min == NULL) {
> @@ -158,6 +157,14 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
>   		goto err;
>   	}
>   
> +	/* read base min ratio */
> +	ret = read_core_sysfs_u32(f_base_min, &base_min_ratio);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, POWER, "Failed to read %s\n",
> +				POWER_SYSFILE_BASE_MIN_FREQ);
> +		goto err;
> +	}
> +
>   	/* base ratio may not exist */
>   	if (f_base != NULL) {
>   		ret = read_core_sysfs_u32(f_base, &base_ratio);
> @@ -170,20 +177,22 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
>   		base_ratio = 0;
>   	}
>   
> -	/* Add MSR read to detect turbo status */
> -	if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
> -		goto err;
> -	/* no errors after this point */
> -
>   	/* convert ratios to bins */
>   	base_max_ratio /= BUS_FREQ;
> +	base_min_ratio /= BUS_FREQ;
>   	base_ratio /= BUS_FREQ;
>   
>   	/* assign file handles */
>   	pi->f_cur_min = f_min;
>   	pi->f_cur_max = f_max;
>   
> -	max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
> +	/* try to get turbo from global sysfs entry for less privileges than from MSR */
> +	if (power_read_turbo_pct(&max_non_turbo) < 0)
> +		goto err;
> +	/* no errors after this point */
> +
> +	max_non_turbo = base_min_ratio
> +		      + (100 - max_non_turbo) * (base_max_ratio - base_min_ratio) / 100;
>   
>   	POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
>   
> @@ -220,6 +229,8 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
>   err:
>   	if (f_base != NULL)
>   		fclose(f_base);
> +	if (f_base_min != NULL)
> +		fclose(f_base_min);
>   	if (f_base_max != NULL)
>   		fclose(f_base_max);
>   	if (f_min != NULL)

Nice patch.

I've run the patched code and can confirm that the max_non_turbo value 
agrees with the relevant byte in the 0xCE MSR, so looks good.

Tested-By: David Hunt <david.hunt@intel.com>

Also, once the atol issue mentioned above is resolved:

Acked-By: David Hunt <david.hunt@intel.com>

Thanks!
  

Patch

diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c
index 78c9197695..c3d66a8f68 100644
--- a/lib/power/power_pstate_cpufreq.c
+++ b/lib/power/power_pstate_cpufreq.c
@@ -35,15 +35,9 @@ 
 		"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
 #define POWER_SYSFILE_BASE_FREQ  \
 		"/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
+#define POWER_SYSFILE_TURBO_PCT  \
+		"/sys/devices/system/cpu/intel_pstate/turbo_pct"
 #define POWER_PSTATE_DRIVER "intel_pstate"
-#define POWER_MSR_PATH  "/dev/cpu/%u/msr"
-
-/*
- * MSR related
- */
-#define PLATFORM_INFO     0x0CE
-#define NON_TURBO_MASK    0xFF00
-#define NON_TURBO_OFFSET  0x8
 
 
 enum power_state {
@@ -74,37 +68,33 @@  struct pstate_power_info {
 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
 
 /**
- * It is to read the specific MSR.
+ * It is to read the turbo mode percentage from sysfs
  */
-
 static int32_t
-power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
+power_read_turbo_pct(uint64_t *outVal)
 {
 	int fd, ret;
-	char fullpath[PATH_MAX];
+	char val[4] = {0};
 
-	snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
-
-	fd = open(fullpath, O_RDONLY);
+	fd = open(POWER_SYSFILE_TURBO_PCT, O_RDONLY);
 
 	if (fd < 0) {
-		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
+		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", POWER_SYSFILE_TURBO_PCT,
 				 strerror(errno));
 		return fd;
 	}
 
-	ret = pread(fd, val, sizeof(uint64_t), msr);
+	ret = read(fd, val, sizeof(val));
 
 	if (ret < 0) {
-		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
+		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", POWER_SYSFILE_TURBO_PCT,
 				 strerror(errno));
 		goto out;
 	}
 
-	POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
-			fullpath, msr, lcore_id);
+	*outVal = (uint64_t) atol(val);
 
-	POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
+	POWER_DEBUG_TRACE("power turbo pct: %"PRIu64"\n", *outVal);
 
 out:	close(fd);
 	return ret;
@@ -116,8 +106,9 @@  out:	close(fd);
 static int
 power_init_for_setting_freq(struct pstate_power_info *pi)
 {
-	FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
-	uint32_t base_ratio, base_max_ratio;
+	FILE *f_base = NULL, *f_base_min = NULL, *f_base_max = NULL,
+	     *f_min = NULL, *f_max = NULL;
+	uint32_t base_ratio, base_min_ratio, base_max_ratio;
 	uint64_t max_non_turbo;
 	int ret;
 
@@ -130,6 +121,14 @@  power_init_for_setting_freq(struct pstate_power_info *pi)
 		goto err;
 	}
 
+	open_core_sysfs_file(&f_base_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
+			pi->lcore_id);
+	if (f_base_min == NULL) {
+		RTE_LOG(ERR, POWER, "failed to open %s\n",
+				POWER_SYSFILE_BASE_MIN_FREQ);
+		goto err;
+	}
+
 	open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
 			pi->lcore_id);
 	if (f_min == NULL) {
@@ -158,6 +157,14 @@  power_init_for_setting_freq(struct pstate_power_info *pi)
 		goto err;
 	}
 
+	/* read base min ratio */
+	ret = read_core_sysfs_u32(f_base_min, &base_min_ratio);
+	if (ret < 0) {
+		RTE_LOG(ERR, POWER, "Failed to read %s\n",
+				POWER_SYSFILE_BASE_MIN_FREQ);
+		goto err;
+	}
+
 	/* base ratio may not exist */
 	if (f_base != NULL) {
 		ret = read_core_sysfs_u32(f_base, &base_ratio);
@@ -170,20 +177,22 @@  power_init_for_setting_freq(struct pstate_power_info *pi)
 		base_ratio = 0;
 	}
 
-	/* Add MSR read to detect turbo status */
-	if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
-		goto err;
-	/* no errors after this point */
-
 	/* convert ratios to bins */
 	base_max_ratio /= BUS_FREQ;
+	base_min_ratio /= BUS_FREQ;
 	base_ratio /= BUS_FREQ;
 
 	/* assign file handles */
 	pi->f_cur_min = f_min;
 	pi->f_cur_max = f_max;
 
-	max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
+	/* try to get turbo from global sysfs entry for less privileges than from MSR */
+	if (power_read_turbo_pct(&max_non_turbo) < 0)
+		goto err;
+	/* no errors after this point */
+
+	max_non_turbo = base_min_ratio
+		      + (100 - max_non_turbo) * (base_max_ratio - base_min_ratio) / 100;
 
 	POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
 
@@ -220,6 +229,8 @@  power_init_for_setting_freq(struct pstate_power_info *pi)
 err:
 	if (f_base != NULL)
 		fclose(f_base);
+	if (f_base_min != NULL)
+		fclose(f_base_min);
 	if (f_base_max != NULL)
 		fclose(f_base_max);
 	if (f_min != NULL)