@@ -107,6 +107,25 @@ User Cases
The power management mechanism is used to save power when performing L3 forwarding.
+PM QoS
+------
+
+The "/sys/devices/system/cpu/cpuX/power/pm_qos_resume_latency_us" sysfs
+interface is used by userspace to set and get the resume latency limit of
+cpuX. Each cpuidle governor in Linux selects which idle state to enter
+based on this CPU resume latency limit in its idle task.
+
+The deeper the idle state, the lower the power consumption, but the longer
+the resume time. Some services are latency sensitive and expect a low
+resume time, such as the interrupt packet receiving mode.
+
+Applications can set and get the CPU resume latency with
+``rte_power_qos_set_cpu_resume_latency()`` and ``rte_power_qos_get_cpu_resume_latency()``
+respectively. Applications can set a strict resume latency (zero value) with
+``rte_power_qos_set_cpu_resume_latency()`` to lower the resume latency and
+get better performance (at the cost of a possibly higher platform power consumption).
+
+
Ethernet PMD Power Management API
---------------------------------
@@ -237,6 +237,11 @@ New Features
This field is used to pass an extra configuration settings such as ability
to lookup IPv4 addresses in network byte order.
+* **Introduced per-CPU PM QoS interface.**
+
+  * Added per-CPU PM QoS interface to lower the resume latency when waking up
+    from idle state.
+
* **Added new API to register telemetry endpoint callbacks with private arguments.**
A new ``rte_telemetry_register_cmd_arg`` function is available to pass an opaque value to
@@ -23,12 +23,14 @@ sources = files(
'rte_power.c',
'rte_power_uncore.c',
'rte_power_pmd_mgmt.c',
+ 'rte_power_qos.c',
)
headers = files(
'rte_power.h',
'rte_power_guest_channel.h',
'rte_power_pmd_mgmt.h',
'rte_power_uncore.h',
+ 'rte_power_qos.h',
)
deps += ['timer', 'ethdev']
new file mode 100644
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 HiSilicon Limited
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_lcore.h>
+#include <rte_log.h>
+
+#include "power_common.h"
+#include "rte_power_qos.h"
+
+#define PM_QOS_SYSFILE_RESUME_LATENCY_US \
+	"/sys/devices/system/cpu/cpu%u/power/pm_qos_resume_latency_us" /* %u: CPU id */
+
+#define PM_QOS_CPU_RESUME_LATENCY_BUF_LEN 32 /* size of the sysfs value text buffer */
+
+/*
+ * Set the resume latency limit (us) of lcore_id's CPU through sysfs. Returns 0 on
+ * success, -EINVAL on a disabled lcore or negative latency, else a helper's error.
+ */
+int
+rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency)
+{
+	char buf[PM_QOS_CPU_RESUME_LATENCY_BUF_LEN];
+	uint32_t cpu_id;
+	FILE *f;
+	int ret;
+
+	if (!rte_lcore_is_enabled(lcore_id)) {
+		POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id);
+		return -EINVAL;
+	}
+	ret = power_get_lcore_mapped_cpu_id(lcore_id, &cpu_id);
+	if (ret != 0)
+		return ret;
+
+	if (latency < 0) {
+		POWER_LOG(ERR, "latency should be greater than or equal to 0");
+		return -EINVAL;
+	}
+
+	ret = open_core_sysfs_file(&f, "w", PM_QOS_SYSFILE_RESUME_LATENCY_US, cpu_id);
+	if (ret != 0) {
+		POWER_LOG(ERR, "Failed to open "PM_QOS_SYSFILE_RESUME_LATENCY_US" : %s",
+			cpu_id, strerror(errno));
+		return ret;
+	}
+
+	/*
+	 * Kernel encoding of this sysfs file: "n/a" requests a resume latency of 0,
+	 * "0" requests no constraint, any other number is the limit itself.
+	 */
+	if (latency == RTE_POWER_QOS_STRICT_LATENCY_VALUE)
+		snprintf(buf, sizeof(buf), "%s", "n/a");
+	else if (latency == RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT)
+		snprintf(buf, sizeof(buf), "%d", 0);
+	else
+		snprintf(buf, sizeof(buf), "%d", latency);
+
+	ret = write_core_sysfs_s(f, buf);
+	if (ret != 0)
+		POWER_LOG(ERR, "Failed to write "PM_QOS_SYSFILE_RESUME_LATENCY_US" : %s",
+			cpu_id, strerror(errno));
+
+	fclose(f);
+
+	return ret;
+}
+
+/*
+ * Read the resume latency limit (us) of lcore_id's CPU from sysfs, where "n/a"
+ * encodes a latency of 0 and "0" no constraint; malformed text gives -EINVAL.
+ */
+int
+rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id)
+{
+	char buf[PM_QOS_CPU_RESUME_LATENCY_BUF_LEN];
+	uint32_t cpu_id;
+	char *end;
+	long val;
+	FILE *f;
+	int ret;
+
+	if (!rte_lcore_is_enabled(lcore_id)) {
+		POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id);
+		return -EINVAL;
+	}
+	ret = power_get_lcore_mapped_cpu_id(lcore_id, &cpu_id);
+	if (ret != 0)
+		return ret;
+
+	ret = open_core_sysfs_file(&f, "r", PM_QOS_SYSFILE_RESUME_LATENCY_US, cpu_id);
+	if (ret != 0) {
+		POWER_LOG(ERR, "Failed to open "PM_QOS_SYSFILE_RESUME_LATENCY_US" : %s",
+			cpu_id, strerror(errno));
+		return ret;
+	}
+
+	ret = read_core_sysfs_s(f, buf, sizeof(buf));
+	if (ret != 0)
+		POWER_LOG(ERR, "Failed to read "PM_QOS_SYSFILE_RESUME_LATENCY_US" : %s",
+			cpu_id, strerror(errno));
+	fclose(f);
+	if (ret != 0)
+		return ret;
+
+	buf[strcspn(buf, "\n")] = '\0'; /* tolerate a trailing newline */
+	if (strcmp(buf, "n/a") == 0)
+		return RTE_POWER_QOS_STRICT_LATENCY_VALUE;
+
+	errno = 0;
+	val = strtol(buf, &end, 10);
+	if (end == buf || *end != '\0' || errno == ERANGE || val < 0 ||
+			val > RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT) {
+		POWER_LOG(ERR, "Invalid resume latency '%s' read for cpu %u", buf, cpu_id);
+		return -EINVAL;
+	}
+
+	return val == 0 ? RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT : (int)val;
+}
new file mode 100644
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 HiSilicon Limited
+ */
+
+#ifndef RTE_POWER_QOS_H
+#define RTE_POWER_QOS_H
+
+#include <stdint.h>
+
+#include <rte_compat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file rte_power_qos.h
+ *
+ * PM QoS API.
+ *
+ * The per-CPU resume latency limit constrains this CPU's idle
+ * state selection in each cpuidle governor.
+ * See the kernel documentation of the per-CPU PM QoS interface at the following link:
+ * https://www.kernel.org/doc/html/latest/admin-guide/abi-testing.html?highlight=pm_qos_resume_latency_us#abi-sys-devices-power-pm-qos-resume-latency-us
+ *
+ * The deeper the idle state, the lower the power consumption, but the
+ * longer the resume time. Some services are delay sensitive and expect a
+ * low resume time, like the interrupt packet receiving mode.
+ *
+ * In these cases, the per-CPU PM QoS API can be used to control this CPU's
+ * idle state selection and restrict it to the shallowest idle state, lowering
+ * the delay after sleep, by setting a strict resume latency (zero value).
+ */
+
+#define RTE_POWER_QOS_STRICT_LATENCY_VALUE 0 /**< Strict (zero) resume latency. */
+#define RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT INT32_MAX /**< No latency limit. */
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Set the resume latency limit of this logical core.
+ *
+ * @param lcore_id
+ *   Target logical core id.
+ * @param latency
+ *   Resume latency limit in microseconds; must be greater than or equal to zero.
+ * @return
+ *   0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the current resume latency limit of this logical core. The kernel
+ * default, if none was set, is RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT.
+ *
+ * @param lcore_id
+ *   Target logical core id.
+ * @return
+ *   Negative value on failure, otherwise the resume latency limit (>= 0).
+ */
+__rte_experimental
+int rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_POWER_QOS_H */
@@ -51,4 +51,8 @@ EXPERIMENTAL {
rte_power_set_uncore_env;
rte_power_uncore_freqs;
rte_power_unset_uncore_env;
+
+ # added in 24.11
+ rte_power_qos_get_cpu_resume_latency;
+ rte_power_qos_set_cpu_resume_latency;
};