[v11,1/4] lib: add generic support for reading PMU events

Message ID 20230216175502.3164820-2-tduszynski@marvell.com (mailing list archive)
State Changes Requested, archived
Delegated to: David Marchand
Headers
Series add support for self monitoring |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Tomasz Duszynski Feb. 16, 2023, 5:54 p.m. UTC
  Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated,
i.e. run dedicated tasks. In such cases one cannot use the standard
perf utility without sacrificing latency and performance.

Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 MAINTAINERS                            |   5 +
 app/test/meson.build                   |   2 +
 app/test/test_pmu.c                    |  62 ++++
 doc/api/doxy-api-index.md              |   3 +-
 doc/api/doxy-api.conf.in               |   1 +
 doc/guides/prog_guide/profile_app.rst  |  12 +
 doc/guides/rel_notes/release_23_03.rst |   7 +
 lib/meson.build                        |   1 +
 lib/pmu/meson.build                    |  13 +
 lib/pmu/pmu_private.h                  |  32 ++
 lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
 lib/pmu/rte_pmu.h                      | 212 ++++++++++++
 lib/pmu/version.map                    |  15 +
 13 files changed, 824 insertions(+), 1 deletion(-)
 create mode 100644 app/test/test_pmu.c
 create mode 100644 lib/pmu/meson.build
 create mode 100644 lib/pmu/pmu_private.h
 create mode 100644 lib/pmu/rte_pmu.c
 create mode 100644 lib/pmu/rte_pmu.h
 create mode 100644 lib/pmu/version.map
  

Comments

Konstantin Ananyev Feb. 16, 2023, 11:50 p.m. UTC | #1
16/02/2023 17:54, Tomasz Duszynski пишет:
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> i.e run dedicated tasks. In such cases one cannot use standard
> perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   MAINTAINERS                            |   5 +
>   app/test/meson.build                   |   2 +
>   app/test/test_pmu.c                    |  62 ++++
>   doc/api/doxy-api-index.md              |   3 +-
>   doc/api/doxy-api.conf.in               |   1 +
>   doc/guides/prog_guide/profile_app.rst  |  12 +
>   doc/guides/rel_notes/release_23_03.rst |   7 +
>   lib/meson.build                        |   1 +
>   lib/pmu/meson.build                    |  13 +
>   lib/pmu/pmu_private.h                  |  32 ++
>   lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
>   lib/pmu/rte_pmu.h                      | 212 ++++++++++++
>   lib/pmu/version.map                    |  15 +
>   13 files changed, 824 insertions(+), 1 deletion(-)
>   create mode 100644 app/test/test_pmu.c
>   create mode 100644 lib/pmu/meson.build
>   create mode 100644 lib/pmu/pmu_private.h
>   create mode 100644 lib/pmu/rte_pmu.c
>   create mode 100644 lib/pmu/rte_pmu.h
>   create mode 100644 lib/pmu/version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 3495946d0f..d37f242120 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
>   M: Pavan Nikhilesh <pbhagavatula@marvell.com>
>   F: lib/node/
>   
> +PMU - EXPERIMENTAL
> +M: Tomasz Duszynski <tduszynski@marvell.com>
> +F: lib/pmu/
> +F: app/test/test_pmu*
> +
>   
>   Test Applications
>   -----------------
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..6b61b7fc32 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -111,6 +111,7 @@ test_sources = files(
>           'test_reciprocal_division_perf.c',
>           'test_red.c',
>           'test_pie.c',
> +        'test_pmu.c',
>           'test_reorder.c',
>           'test_rib.c',
>           'test_rib6.c',
> @@ -239,6 +240,7 @@ fast_tests = [
>           ['kni_autotest', false, true],
>           ['kvargs_autotest', true, true],
>           ['member_autotest', true, true],
> +        ['pmu_autotest', true, true],
>           ['power_cpufreq_autotest', false, true],
>           ['power_autotest', true, true],
>           ['power_kvm_vm_autotest', false, true],
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..c257638e8b
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include "test.h"
> +
> +#ifndef RTE_EXEC_ENV_LINUX
> +
> +static int
> +test_pmu(void)
> +{
> +	printf("pmu_autotest only supported on Linux, skipping test\n");
> +	return TEST_SKIPPED;
> +}
> +
> +#else
> +
> +#include <rte_pmu.h>
> +
> +static int
> +test_pmu_read(void)
> +{
> +	const char *name = NULL;
> +	int tries = 10, event;
> +	uint64_t val = 0;
> +
> +	if (name == NULL) {
> +		printf("PMU not supported on this arch\n");
> +		return TEST_SKIPPED;
> +	}
> +
> +	if (rte_pmu_init() < 0)
> +		return TEST_SKIPPED;
> +
> +	event = rte_pmu_add_event(name);
> +	while (tries--)
> +		val += rte_pmu_read(event);
> +
> +	rte_pmu_fini();
> +
> +	return val ? TEST_SUCCESS : TEST_FAILED;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> +	.suite_name = "pmu autotest",
> +	.setup = NULL,
> +	.teardown = NULL,
> +	.unit_test_cases = {
> +		TEST_CASE(test_pmu_read),
> +		TEST_CASES_END()
> +	}
> +};
> +
> +static int
> +test_pmu(void)
> +{
> +	return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +#endif /* RTE_EXEC_ENV_LINUX */
> +
> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index 2deec7ea19..a8e04a195d 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -223,7 +223,8 @@ The public API headers are grouped by topics:
>     [log](@ref rte_log.h),
>     [errno](@ref rte_errno.h),
>     [trace](@ref rte_trace.h),
> -  [trace_point](@ref rte_trace_point.h)
> +  [trace_point](@ref rte_trace_point.h),
> +  [pmu](@ref rte_pmu.h)
>   
>   - **misc**:
>     [EAL config](@ref rte_eal.h),
> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
> index e859426099..350b5a8c94 100644
> --- a/doc/api/doxy-api.conf.in
> +++ b/doc/api/doxy-api.conf.in
> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
>                             @TOPDIR@/lib/pci \
>                             @TOPDIR@/lib/pdump \
>                             @TOPDIR@/lib/pipeline \
> +                          @TOPDIR@/lib/pmu \
>                             @TOPDIR@/lib/port \
>                             @TOPDIR@/lib/power \
>                             @TOPDIR@/lib/rawdev \
> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> index 14292d4c25..89e38cd301 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,18 @@ Profile Your Application
>   The following sections describe methods of profiling DPDK applications on
>   different architectures.
>   
> +Performance counter based profiling
> +-----------------------------------
> +
> +Majority of architectures support some performance monitoring unit (PMU).
> +Such unit provides programmable counters that monitor specific events.
> +
> +Different tools gather that information, like for example perf.
> +However, in some scenarios when CPU cores are isolated and run
> +dedicated tasks interrupting those tasks with perf may be undesirable.
> +
> +In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
> +
>   
>   Profiling on x86
>   ----------------
> diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
> index ab998a5357..20622efe58 100644
> --- a/doc/guides/rel_notes/release_23_03.rst
> +++ b/doc/guides/rel_notes/release_23_03.rst
> @@ -147,6 +147,13 @@ New Features
>     * Added support to capture packets at each graph node with packet metadata and
>       node name.
>   
> +* **Added PMU library.**
> +
> +  Added a new performance monitoring unit (PMU) library which allows applications
> +  to perform self monitoring activities without depending on external utilities like perf.
> +  After integration with :doc:`../prog_guide/trace_lib` data gathered from hardware counters
> +  can be stored in CTF format for further analysis.
> +
>   
>   Removed Items
>   -------------
> diff --git a/lib/meson.build b/lib/meson.build
> index 450c061d2b..8a42d45d20 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -11,6 +11,7 @@
>   libraries = [
>           'kvargs', # eal depends on kvargs
>           'telemetry', # basic info querying
> +        'pmu',
>           'eal', # everything depends on eal
>           'ring',
>           'rcu', # rcu depends on ring
> diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
> new file mode 100644
> index 0000000000..a4160b494e
> --- /dev/null
> +++ b/lib/pmu/meson.build
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2023 Marvell International Ltd.
> +
> +if not is_linux
> +    build = false
> +    reason = 'only supported on Linux'
> +    subdir_done()
> +endif
> +
> +includes = [global_inc]
> +
> +sources = files('rte_pmu.c')
> +headers = files('rte_pmu.h')
> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
> new file mode 100644
> index 0000000000..b9f8c1ddc8
> --- /dev/null
> +++ b/lib/pmu/pmu_private.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to syscall.
> + *
> + * @param config
> + *   Architecture specific event configuration. Consult kernel sources for available options.
> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..950f999cb7
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,460 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_spinlock.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"


I suppose that path (as the whole implementation) is Linux-specific?
If so, wouldn't it make sense to have it under a linux subdir?

> +
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +
> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +struct rte_pmu rte_pmu;

Do we really need the struct declaration here?


> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char path[PATH_MAX];
> +	char *config = NULL;
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	*num = *mask = 0;
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret)
> +			continue;
> +
> +		ret = do_perf_event_open(config, group->fds[0]);
> +		if (ret == -1) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> +	long page_size = sysconf(_SC_PAGE_SIZE);
> +	unsigned int i;
> +	void *addr;
> +	int ret;
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> +			ret = -EPERM;
> +			goto out;
> +		}
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], page_size);
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> +	unsigned int i;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	group->enabled = false;
> +}
> +
> +int
> +__rte_pmu_enable_group(void)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (rte_pmu.num_group_events == 0)
> +		return -ENODEV;
> +
> +	ret = open_events(group);
> +	if (ret)
> +		goto out;
> +
> +	ret = mmap_events(group);
> +	if (ret)
> +		goto out;
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	rte_spinlock_lock(&rte_pmu.lock);
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> +	rte_spinlock_unlock(&rte_pmu.lock);
> +	group->enabled = true;
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(group);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (dirp == NULL)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	if (dent) {
> +		rte_pmu.name = strdup(name);
> +		if (rte_pmu.name == NULL) {
> +			closedir(dirp);
> +
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	closedir(dirp);
> +
> +	return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +
> +	event = calloc(1, sizeof(*event));
> +	if (event == NULL)
> +		goto out;
> +
> +	event->name = strdup(name);
> +	if (event->name == NULL) {
> +		free(event);
> +		event = NULL;
> +	}
> +
> +out:
> +	return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> +	free(event->name);
> +	free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	if (rte_pmu.name == NULL)
> +		return -ENODEV;
> +
> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> +		return -ENOSPC;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = new_event(name);
> +	if (event == NULL)
> +		return -ENOMEM;
> +
> +	event->index = rte_pmu.num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> +	return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> +	int ret;
> +
> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> +	 * via command line but application doesn't care enough and performs init/fini again.
> +	 */
> +	if (rte_pmu.initialized != 0) {
> +		rte_pmu.initialized++;
> +		return 0;
> +	}
> +
> +	ret = scan_pmus();
> +	if (ret)
> +		goto out;
> +
> +	ret = pmu_arch_init();
> +	if (ret)
> +		goto out;
> +
> +	TAILQ_INIT(&rte_pmu.event_list);
> +	TAILQ_INIT(&rte_pmu.event_group_list);
> +	rte_spinlock_init(&rte_pmu.lock);
> +	rte_pmu.initialized = 1;
> +
> +	return 0;
> +out:
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +
> +	return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> +	struct rte_pmu_event_group *group, *tmp_group;
> +	struct rte_pmu_event *event, *tmp_event;
> +
> +	/* cleanup once init count drops to zero */
> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> +		return;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> +		free_event(event);
> +	}
> +
> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> +		cleanup_events(group);
> +	}
> +
> +	pmu_arch_fini();
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +	rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..6b664c3336
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,212 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_spinlock.h>
> +
> +/** Maximum number of events in a group */
> +#define MAX_NUM_GROUP_EVENTS 8
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> +} __rte_cache_aligned;
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /**< name of an event */
> +	unsigned int index; /**< event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> +	rte_spinlock_t lock; /**< serialize access to event group list */
> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> +	unsigned int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +	unsigned int initialized; /**< initialization counter */
> +};
> +
> +/** lcore event group */
> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read PMU counter.
> + *
> + * @warning This should be not called directly.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +static __rte_always_inline uint64_t
> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();

Are you sure that compiler_barrier() is enough here?
On some archs the CPU itself is free to re-order reads.
Or am I missing something obvious here?

> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		/* index set to 0 means that particular counter cannot be used */
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @warning This should be not called directly.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +__rte_pmu_enable_group(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @warning This should be not called directly.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (unlikely(!rte_pmu.initialized))
> +		return 0;
> +
> +	if (unlikely(!group->enabled)) {
> +		ret = __rte_pmu_enable_group();
> +		if (ret)
> +			return 0;
> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..39a4f279c1
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,15 @@
> +DPDK_23 {
> +	local: *;
> +};
> +
> +EXPERIMENTAL {
> +	global:
> +
> +	__rte_pmu_enable_group;
> +	per_lcore__event_group;
> +	rte_pmu;
> +	rte_pmu_add_event;
> +	rte_pmu_fini;
> +	rte_pmu_init;
> +	rte_pmu_read;
> +};
  
Tomasz Duszynski Feb. 17, 2023, 8:49 a.m. UTC | #2
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>Sent: Friday, February 17, 2023 12:51 AM
>To: dev@dpdk.org
>Subject: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>16/02/2023 17:54, Tomasz Duszynski пишет:
>> Add support for programming PMU counters and reading their values in
>> runtime bypassing kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated i.e
>> run dedicated tasks. In such cases one cannot use standard perf
>> utility without sacrificing latency and performance.
>>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> ---
>>   MAINTAINERS                            |   5 +
>>   app/test/meson.build                   |   2 +
>>   app/test/test_pmu.c                    |  62 ++++
>>   doc/api/doxy-api-index.md              |   3 +-
>>   doc/api/doxy-api.conf.in               |   1 +
>>   doc/guides/prog_guide/profile_app.rst  |  12 +
>>   doc/guides/rel_notes/release_23_03.rst |   7 +
>>   lib/meson.build                        |   1 +
>>   lib/pmu/meson.build                    |  13 +
>>   lib/pmu/pmu_private.h                  |  32 ++
>>   lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
>>   lib/pmu/rte_pmu.h                      | 212 ++++++++++++
>>   lib/pmu/version.map                    |  15 +
>>   13 files changed, 824 insertions(+), 1 deletion(-)
>>   create mode 100644 app/test/test_pmu.c
>>   create mode 100644 lib/pmu/meson.build
>>   create mode 100644 lib/pmu/pmu_private.h
>>   create mode 100644 lib/pmu/rte_pmu.c
>>   create mode 100644 lib/pmu/rte_pmu.h
>>   create mode 100644 lib/pmu/version.map
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS index 3495946d0f..d37f242120
>> 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>   M: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>   F: lib/node/
>>
>> +PMU - EXPERIMENTAL
>> +M: Tomasz Duszynski <tduszynski@marvell.com>
>> +F: lib/pmu/
>> +F: app/test/test_pmu*
>> +
>>
>>   Test Applications
>>   -----------------
>> diff --git a/app/test/meson.build b/app/test/meson.build index
>> f34d19e3c3..6b61b7fc32 100644
>> --- a/app/test/meson.build
>> +++ b/app/test/meson.build
>> @@ -111,6 +111,7 @@ test_sources = files(
>>           'test_reciprocal_division_perf.c',
>>           'test_red.c',
>>           'test_pie.c',
>> +        'test_pmu.c',
>>           'test_reorder.c',
>>           'test_rib.c',
>>           'test_rib6.c',
>> @@ -239,6 +240,7 @@ fast_tests = [
>>           ['kni_autotest', false, true],
>>           ['kvargs_autotest', true, true],
>>           ['member_autotest', true, true],
>> +        ['pmu_autotest', true, true],
>>           ['power_cpufreq_autotest', false, true],
>>           ['power_autotest', true, true],
>>           ['power_kvm_vm_autotest', false, true], diff --git
>> a/app/test/test_pmu.c b/app/test/test_pmu.c new file mode 100644 index
>> 0000000000..c257638e8b
>> --- /dev/null
>> +++ b/app/test/test_pmu.c
>> @@ -0,0 +1,62 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2023 Marvell International Ltd.
>> + */
>> +
>> +#include "test.h"
>> +
>> +#ifndef RTE_EXEC_ENV_LINUX
>> +
>> +static int
>> +test_pmu(void)
>> +{
>> +	printf("pmu_autotest only supported on Linux, skipping test\n");
>> +	return TEST_SKIPPED;
>> +}
>> +
>> +#else
>> +
>> +#include <rte_pmu.h>
>> +
>> +static int
>> +test_pmu_read(void)
>> +{
>> +	const char *name = NULL;
>> +	int tries = 10, event;
>> +	uint64_t val = 0;
>> +
>> +	if (name == NULL) {
>> +		printf("PMU not supported on this arch\n");
>> +		return TEST_SKIPPED;
>> +	}
>> +
>> +	if (rte_pmu_init() < 0)
>> +		return TEST_SKIPPED;
>> +
>> +	event = rte_pmu_add_event(name);
>> +	while (tries--)
>> +		val += rte_pmu_read(event);
>> +
>> +	rte_pmu_fini();
>> +
>> +	return val ? TEST_SUCCESS : TEST_FAILED; }
>> +
>> +static struct unit_test_suite pmu_tests = {
>> +	.suite_name = "pmu autotest",
>> +	.setup = NULL,
>> +	.teardown = NULL,
>> +	.unit_test_cases = {
>> +		TEST_CASE(test_pmu_read),
>> +		TEST_CASES_END()
>> +	}
>> +};
>> +
>> +static int
>> +test_pmu(void)
>> +{
>> +	return unit_test_suite_runner(&pmu_tests);
>> +}
>> +
>> +#endif /* RTE_EXEC_ENV_LINUX */
>> +
>> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
>> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
>> index 2deec7ea19..a8e04a195d 100644
>> --- a/doc/api/doxy-api-index.md
>> +++ b/doc/api/doxy-api-index.md
>> @@ -223,7 +223,8 @@ The public API headers are grouped by topics:
>>     [log](@ref rte_log.h),
>>     [errno](@ref rte_errno.h),
>>     [trace](@ref rte_trace.h),
>> -  [trace_point](@ref rte_trace_point.h)
>> +  [trace_point](@ref rte_trace_point.h),  [pmu](@ref rte_pmu.h)
>>
>>   - **misc**:
>>     [EAL config](@ref rte_eal.h),
>> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in index
>> e859426099..350b5a8c94 100644
>> --- a/doc/api/doxy-api.conf.in
>> +++ b/doc/api/doxy-api.conf.in
>> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
>>                             @TOPDIR@/lib/pci \
>>                             @TOPDIR@/lib/pdump \
>>                             @TOPDIR@/lib/pipeline \
>> +                          @TOPDIR@/lib/pmu \
>>                             @TOPDIR@/lib/port \
>>                             @TOPDIR@/lib/power \
>>                             @TOPDIR@/lib/rawdev \ diff --git
>> a/doc/guides/prog_guide/profile_app.rst
>> b/doc/guides/prog_guide/profile_app.rst
>> index 14292d4c25..89e38cd301 100644
>> --- a/doc/guides/prog_guide/profile_app.rst
>> +++ b/doc/guides/prog_guide/profile_app.rst
>> @@ -7,6 +7,18 @@ Profile Your Application
>>   The following sections describe methods of profiling DPDK applications on
>>   different architectures.
>>
>> +Performance counter based profiling
>> +-----------------------------------
>> +
>> +Majority of architectures support some performance monitoring unit (PMU).
>> +Such unit provides programmable counters that monitor specific events.
>> +
>> +Different tools gather that information, like for example perf.
>> +However, in some scenarios when CPU cores are isolated and run
>> +dedicated tasks interrupting those tasks with perf may be undesirable.
>> +
>> +In such cases, an application can use the PMU library to read such events via
>``rte_pmu_read()``.
>> +
>>
>>   Profiling on x86
>>   ----------------
>> diff --git a/doc/guides/rel_notes/release_23_03.rst
>> b/doc/guides/rel_notes/release_23_03.rst
>> index ab998a5357..20622efe58 100644
>> --- a/doc/guides/rel_notes/release_23_03.rst
>> +++ b/doc/guides/rel_notes/release_23_03.rst
>> @@ -147,6 +147,13 @@ New Features
>>     * Added support to capture packets at each graph node with packet metadata and
>>       node name.
>>
>> +* **Added PMU library.**
>> +
>> +  Added a new performance monitoring unit (PMU) library which allows
>> + applications  to perform self monitoring activities without depending on external utilities
>like perf.
>> +  After integration with :doc:`../prog_guide/trace_lib` data gathered
>> + from hardware counters  can be stored in CTF format for further analysis.
>> +
>>
>>   Removed Items
>>   -------------
>> diff --git a/lib/meson.build b/lib/meson.build index
>> 450c061d2b..8a42d45d20 100644
>> --- a/lib/meson.build
>> +++ b/lib/meson.build
>> @@ -11,6 +11,7 @@
>>   libraries = [
>>           'kvargs', # eal depends on kvargs
>>           'telemetry', # basic info querying
>> +        'pmu',
>>           'eal', # everything depends on eal
>>           'ring',
>>           'rcu', # rcu depends on ring diff --git
>> a/lib/pmu/meson.build b/lib/pmu/meson.build new file mode 100644 index
>> 0000000000..a4160b494e
>> --- /dev/null
>> +++ b/lib/pmu/meson.build
>> @@ -0,0 +1,13 @@
>> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(C) 2023 Marvell
>> +International Ltd.
>> +
>> +if not is_linux
>> +    build = false
>> +    reason = 'only supported on Linux'
>> +    subdir_done()
>> +endif
>> +
>> +includes = [global_inc]
>> +
>> +sources = files('rte_pmu.c')
>> +headers = files('rte_pmu.h')
>> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h new file
>> mode 100644 index 0000000000..b9f8c1ddc8
>> --- /dev/null
>> +++ b/lib/pmu/pmu_private.h
>> @@ -0,0 +1,32 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2023 Marvell
>> + */
>> +
>> +#ifndef _PMU_PRIVATE_H_
>> +#define _PMU_PRIVATE_H_
>> +
>> +/**
>> + * Architecture specific PMU init callback.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +int
>> +pmu_arch_init(void);
>> +
>> +/**
>> + * Architecture specific PMU cleanup callback.
>> + */
>> +void
>> +pmu_arch_fini(void);
>> +
>> +/**
>> + * Apply architecture specific settings to config before passing it to syscall.
>> + *
>> + * @param config
>> + *   Architecture specific event configuration. Consult kernel sources for available options.
>> + */
>> +void
>> +pmu_arch_fixup_config(uint64_t config[3]);
>> +
>> +#endif /* _PMU_PRIVATE_H_ */
>> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
>> 100644 index 0000000000..950f999cb7
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.c
>> @@ -0,0 +1,460 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2023 Marvell International Ltd.
>> + */
>> +
>> +#include <ctype.h>
>> +#include <dirent.h>
>> +#include <errno.h>
>> +#include <regex.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/queue.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_per_lcore.h>
>> +#include <rte_pmu.h>
>> +#include <rte_spinlock.h>
>> +#include <rte_tailq.h>
>> +
>> +#include "pmu_private.h"
>> +
>> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>
>
>I suppose that pass (as the whole implementation) is linux specific?
>If so, wouldn't it make sense to have it under linux subdir?
>

There are no plans to support this elsewhere at the moment, so a flat
directory structure is good enough. 

>> +
>> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
>> +(__builtin_ffsll(m) - 1)) & (m))
>> +
>> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +struct rte_pmu rte_pmu;
>
>Do we really need struct declaration here?
>

What’s the problem with this placement precisely?

>
>> +/*
>> + * Following __rte_weak functions provide default no-op.
>> +Architectures should override them if
>> + * necessary.
>> + */
>> +
>> +int
>> +__rte_weak pmu_arch_init(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fini(void)
>> +{
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3]) { }
>> +
>> +static int
>> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> +	char path[PATH_MAX];
>> +	char *config = NULL;
>> +	int high, low, ret;
>> +	FILE *fp;
>> +
>> +	*num = *mask = 0;
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
>> +	fp = fopen(path, "r");
>> +	if (fp == NULL)
>> +		return -errno;
>> +
>> +	errno = 0;
>> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> +	if (ret < 2) {
>> +		ret = -ENODATA;
>> +		goto out;
>> +	}
>> +	if (errno) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	if (ret == 2)
>> +		high = low;
>> +
>> +	*mask = GENMASK_ULL(high, low);
>> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
>> +	*num = config[strlen(config) - 1];
>> +	*num = isdigit(*num) ? *num - '0' : 0;
>> +
>> +	ret = 0;
>> +out:
>> +	free(config);
>> +	fclose(fp);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +parse_event(char *buf, uint64_t config[3]) {
>> +	char *token, *term;
>> +	int num, ret, val;
>> +	uint64_t mask;
>> +
>> +	config[0] = config[1] = config[2] = 0;
>> +
>> +	token = strtok(buf, ",");
>> +	while (token) {
>> +		errno = 0;
>> +		/* <term>=<value> */
>> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> +		if (ret < 1)
>> +			return -ENODATA;
>> +		if (errno)
>> +			return -errno;
>> +		if (ret == 1)
>> +			val = 1;
>> +
>> +		ret = get_term_format(term, &num, &mask);
>> +		free(term);
>> +		if (ret)
>> +			return ret;
>> +
>> +		config[num] |= FIELD_PREP(mask, val);
>> +		token = strtok(NULL, ",");
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +get_event_config(const char *name, uint64_t config[3]) {
>> +	char path[PATH_MAX], buf[BUFSIZ];
>> +	FILE *fp;
>> +	int ret;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
>> +	fp = fopen(path, "r");
>> +	if (fp == NULL)
>> +		return -errno;
>> +
>> +	ret = fread(buf, 1, sizeof(buf), fp);
>> +	if (ret == 0) {
>> +		fclose(fp);
>> +
>> +		return -EINVAL;
>> +	}
>> +	fclose(fp);
>> +	buf[ret] = '\0';
>> +
>> +	return parse_event(buf, config);
>> +}
>> +
>> +static int
>> +do_perf_event_open(uint64_t config[3], int group_fd) {
>> +	struct perf_event_attr attr = {
>> +		.size = sizeof(struct perf_event_attr),
>> +		.type = PERF_TYPE_RAW,
>> +		.exclude_kernel = 1,
>> +		.exclude_hv = 1,
>> +		.disabled = 1,
>> +	};
>> +
>> +	pmu_arch_fixup_config(config);
>> +
>> +	attr.config = config[0];
>> +	attr.config1 = config[1];
>> +	attr.config2 = config[2];
>> +
>> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
>> +
>> +static int
>> +open_events(struct rte_pmu_event_group *group) {
>> +	struct rte_pmu_event *event;
>> +	uint64_t config[3];
>> +	int num = 0, ret;
>> +
>> +	/* group leader gets created first, with fd = -1 */
>> +	group->fds[0] = -1;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +		ret = get_event_config(event->name, config);
>> +		if (ret)
>> +			continue;
>> +
>> +		ret = do_perf_event_open(config, group->fds[0]);
>> +		if (ret == -1) {
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->fds[event->index] = ret;
>> +		num++;
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (--num; num >= 0; num--) {
>> +		close(group->fds[num]);
>> +		group->fds[num] = -1;
>> +	}
>> +
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +mmap_events(struct rte_pmu_event_group *group) {
>> +	long page_size = sysconf(_SC_PAGE_SIZE);
>> +	unsigned int i;
>> +	void *addr;
>> +	int ret;
>> +
>> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> +		if (addr == MAP_FAILED) {
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->mmap_pages[i] = addr;
>> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
>> +			ret = -EPERM;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (; i; i--) {
>> +		munmap(group->mmap_pages[i - 1], page_size);
>> +		group->mmap_pages[i - 1] = NULL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static void
>> +cleanup_events(struct rte_pmu_event_group *group) {
>> +	unsigned int i;
>> +
>> +	if (group->fds[0] != -1)
>> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
>> +
>> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +		if (group->mmap_pages[i]) {
>> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> +			group->mmap_pages[i] = NULL;
>> +		}
>> +
>> +		if (group->fds[i] != -1) {
>> +			close(group->fds[i]);
>> +			group->fds[i] = -1;
>> +		}
>> +	}
>> +
>> +	group->enabled = false;
>> +}
>> +
>> +int
>> +__rte_pmu_enable_group(void)
>> +{
>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +	int ret;
>> +
>> +	if (rte_pmu.num_group_events == 0)
>> +		return -ENODEV;
>> +
>> +	ret = open_events(group);
>> +	if (ret)
>> +		goto out;
>> +
>> +	ret = mmap_events(group);
>> +	if (ret)
>> +		goto out;
>> +
>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	rte_spinlock_lock(&rte_pmu.lock);
>> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>> +	rte_spinlock_unlock(&rte_pmu.lock);
>> +	group->enabled = true;
>> +
>> +	return 0;
>> +
>> +out:
>> +	cleanup_events(group);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +scan_pmus(void)
>> +{
>> +	char path[PATH_MAX];
>> +	struct dirent *dent;
>> +	const char *name;
>> +	DIR *dirp;
>> +
>> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> +	if (dirp == NULL)
>> +		return -errno;
>> +
>> +	while ((dent = readdir(dirp))) {
>> +		name = dent->d_name;
>> +		if (name[0] == '.')
>> +			continue;
>> +
>> +		/* sysfs entry should either contain cpus or be a cpu */
>> +		if (!strcmp(name, "cpu"))
>> +			break;
>> +
>> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> +		if (access(path, F_OK) == 0)
>> +			break;
>> +	}
>> +
>> +	if (dent) {
>> +		rte_pmu.name = strdup(name);
>> +		if (rte_pmu.name == NULL) {
>> +			closedir(dirp);
>> +
>> +			return -ENOMEM;
>> +		}
>> +	}
>> +
>> +	closedir(dirp);
>> +
>> +	return rte_pmu.name ? 0 : -ENODEV;
>> +}
>> +
>> +static struct rte_pmu_event *
>> +new_event(const char *name)
>> +{
>> +	struct rte_pmu_event *event;
>> +
>> +	event = calloc(1, sizeof(*event));
>> +	if (event == NULL)
>> +		goto out;
>> +
>> +	event->name = strdup(name);
>> +	if (event->name == NULL) {
>> +		free(event);
>> +		event = NULL;
>> +	}
>> +
>> +out:
>> +	return event;
>> +}
>> +
>> +static void
>> +free_event(struct rte_pmu_event *event) {
>> +	free(event->name);
>> +	free(event);
>> +}
>> +
>> +int
>> +rte_pmu_add_event(const char *name)
>> +{
>> +	struct rte_pmu_event *event;
>> +	char path[PATH_MAX];
>> +
>> +	if (rte_pmu.name == NULL)
>> +		return -ENODEV;
>> +
>> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> +		return -ENOSPC;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
>> +	if (access(path, R_OK))
>> +		return -ENODEV;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +		if (!strcmp(event->name, name))
>> +			return event->index;
>> +		continue;
>> +	}
>> +
>> +	event = new_event(name);
>> +	if (event == NULL)
>> +		return -ENOMEM;
>> +
>> +	event->index = rte_pmu.num_group_events++;
>> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> +
>> +	return event->index;
>> +}
>> +
>> +int
>> +rte_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
>> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> +	 * via command line but application doesn't care enough and performs init/fini again.
>> +	 */
>> +	if (rte_pmu.initialized != 0) {
>> +		rte_pmu.initialized++;
>> +		return 0;
>> +	}
>> +
>> +	ret = scan_pmus();
>> +	if (ret)
>> +		goto out;
>> +
>> +	ret = pmu_arch_init();
>> +	if (ret)
>> +		goto out;
>> +
>> +	TAILQ_INIT(&rte_pmu.event_list);
>> +	TAILQ_INIT(&rte_pmu.event_group_list);
>> +	rte_spinlock_init(&rte_pmu.lock);
>> +	rte_pmu.initialized = 1;
>> +
>> +	return 0;
>> +out:
>> +	free(rte_pmu.name);
>> +	rte_pmu.name = NULL;
>> +
>> +	return ret;
>> +}
>> +
>> +void
>> +rte_pmu_fini(void)
>> +{
>> +	struct rte_pmu_event_group *group, *tmp_group;
>> +	struct rte_pmu_event *event, *tmp_event;
>> +
>> +	/* cleanup once init count drops to zero */
>> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
>> +		return;
>> +
>> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> +		free_event(event);
>> +	}
>> +
>> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> +		cleanup_events(group);
>> +	}
>> +
>> +	pmu_arch_fini();
>> +	free(rte_pmu.name);
>> +	rte_pmu.name = NULL;
>> +	rte_pmu.num_group_events = 0;
>> +}
>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
>> 100644 index 0000000000..6b664c3336
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.h
>> @@ -0,0 +1,212 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2023 Marvell
>> + */
>> +
>> +#ifndef _RTE_PMU_H_
>> +#define _RTE_PMU_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * PMU event tracing operations
>> + *
>> + * This file defines generic API and types necessary to setup PMU and
>> + * read selected counters in runtime.
>> + */
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#include <linux/perf_event.h>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_branch_prediction.h>
>> +#include <rte_common.h>
>> +#include <rte_compat.h>
>> +#include <rte_spinlock.h>
>> +
>> +/** Maximum number of events in a group */ #define
>> +MAX_NUM_GROUP_EVENTS 8
>> +
>> +/**
>> + * A structure describing a group of events.
>> + */
>> +struct rte_pmu_event_group {
>> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> +	bool enabled; /**< true if group was enabled on particular lcore */
>> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> +__rte_cache_aligned;
>> +
>> +/**
>> + * A structure describing an event.
>> + */
>> +struct rte_pmu_event {
>> +	char *name; /**< name of an event */
>> +	unsigned int index; /**< event index into fds/mmap_pages */
>> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> +
>> +/**
>> + * A PMU state container.
>> + */
>> +struct rte_pmu {
>> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> +	unsigned int num_group_events; /**< number of events in a group */
>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> +	unsigned int initialized; /**< initialization counter */ };
>> +
>> +/** lcore event group */
>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +
>> +/** PMU state container */
>> +extern struct rte_pmu rte_pmu;
>> +
>> +/** Each architecture supporting PMU needs to provide its own version
>> +*/ #ifndef rte_pmu_pmc_read #define rte_pmu_pmc_read(index) ({ 0; })
>> +#endif
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read PMU counter.
>> + *
>> + * @warning This should be not called directly.
>> + *
>> + * @param pc
>> + *   Pointer to the mmapped user page.
>> + * @return
>> + *   Counter value read from hardware.
>> + */
>> +static __rte_always_inline uint64_t
>> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>> +	uint64_t width, offset;
>> +	uint32_t seq, index;
>> +	int64_t pmc;
>> +
>> +	for (;;) {
>> +		seq = pc->lock;
>> +		rte_compiler_barrier();
>
>Are you sure that compiler_barrier() is enough here?
>On some archs CPU itself has freedom to re-order reads.
>Or I am missing something obvious here?
>

It's a matter of not keeping stale values cached in registers 
and making sure that there are two distinct reads of the lock. CPU reordering
won't do any harm here. 

>> +		index = pc->index;
>> +		offset = pc->offset;
>> +		width = pc->pmc_width;
>> +
>> +		/* index set to 0 means that particular counter cannot be used */
>> +		if (likely(pc->cap_user_rdpmc && index)) {
>> +			pmc = rte_pmu_pmc_read(index - 1);
>> +			pmc <<= 64 - width;
>> +			pmc >>= 64 - width;
>> +			offset += pmc;
>> +		}
>> +
>> +		rte_compiler_barrier();
>> +
>> +		if (likely(pc->lock == seq))
>> +			return offset;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Enable group of events on the calling lcore.
>> + *
>> + * @warning This should be not called directly.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +__rte_pmu_enable_group(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Initialize PMU library.
>> + *
>> + * @warning This should be not called directly.
>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +rte_pmu_init(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
>> + */
>> +__rte_experimental
>> +void
>> +rte_pmu_fini(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Add event to the group of enabled events.
>> + *
>> + * @param name
>> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> + * @return
>> + *   Event index in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +rte_pmu_add_event(const char *name);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read hardware counter configured to count occurrences of an event.
>> + *
>> + * @param index
>> + *   Index of an event to be read.
>> + * @return
>> + *   Event value read from register. In case of errors or lack of support
>> + *   0 is returned. In other words, stream of zeros in a trace file
>> + *   indicates problem with reading particular PMU event register.
>> + */
>> +__rte_experimental
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read(unsigned int index)
>> +{
>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +	int ret;
>> +
>> +	if (unlikely(!rte_pmu.initialized))
>> +		return 0;
>> +
>> +	if (unlikely(!group->enabled)) {
>> +		ret = __rte_pmu_enable_group();
>> +		if (ret)
>> +			return 0;
>> +	}
>> +
>> +	if (unlikely(index >= rte_pmu.num_group_events))
>> +		return 0;
>> +
>> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_PMU_H_ */
>> diff --git a/lib/pmu/version.map b/lib/pmu/version.map new file mode
>> 100644 index 0000000000..39a4f279c1
>> --- /dev/null
>> +++ b/lib/pmu/version.map
>> @@ -0,0 +1,15 @@
>> +DPDK_23 {
>> +	local: *;
>> +};
>> +
>> +EXPERIMENTAL {
>> +	global:
>> +
>> +	__rte_pmu_enable_group;
>> +	per_lcore__event_group;
>> +	rte_pmu;
>> +	rte_pmu_add_event;
>> +	rte_pmu_fini;
>> +	rte_pmu_init;
>> +	rte_pmu_read;
>> +};
  
Konstantin Ananyev Feb. 17, 2023, 10:14 a.m. UTC | #3
> >>
> >> This is especially useful in cases where CPU cores are isolated i.e
> >> run dedicated tasks. In such cases one cannot use standard perf
> >> utility without sacrificing latency and performance.
> >>
> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> >> ---
> >>   MAINTAINERS                            |   5 +
> >>   app/test/meson.build                   |   2 +
> >>   app/test/test_pmu.c                    |  62 ++++
> >>   doc/api/doxy-api-index.md              |   3 +-
> >>   doc/api/doxy-api.conf.in               |   1 +
> >>   doc/guides/prog_guide/profile_app.rst  |  12 +
> >>   doc/guides/rel_notes/release_23_03.rst |   7 +
> >>   lib/meson.build                        |   1 +
> >>   lib/pmu/meson.build                    |  13 +
> >>   lib/pmu/pmu_private.h                  |  32 ++
> >>   lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
> >>   lib/pmu/rte_pmu.h                      | 212 ++++++++++++
> >>   lib/pmu/version.map                    |  15 +
> >>   13 files changed, 824 insertions(+), 1 deletion(-)
> >>   create mode 100644 app/test/test_pmu.c
> >>   create mode 100644 lib/pmu/meson.build
> >>   create mode 100644 lib/pmu/pmu_private.h
> >>   create mode 100644 lib/pmu/rte_pmu.c
> >>   create mode 100644 lib/pmu/rte_pmu.h
> >>   create mode 100644 lib/pmu/version.map
> >>
> >> diff --git a/MAINTAINERS b/MAINTAINERS index 3495946d0f..d37f242120
> >> 100644
> >> --- a/MAINTAINERS
> >> +++ b/MAINTAINERS
> >> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
> >>   M: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >>   F: lib/node/
> >>
> >> +PMU - EXPERIMENTAL
> >> +M: Tomasz Duszynski <tduszynski@marvell.com>
> >> +F: lib/pmu/
> >> +F: app/test/test_pmu*
> >> +
> >>
> >>   Test Applications
> >>   -----------------
> >> diff --git a/app/test/meson.build b/app/test/meson.build index
> >> f34d19e3c3..6b61b7fc32 100644
> >> --- a/app/test/meson.build
> >> +++ b/app/test/meson.build
> >> @@ -111,6 +111,7 @@ test_sources = files(
> >>           'test_reciprocal_division_perf.c',
> >>           'test_red.c',
> >>           'test_pie.c',
> >> +        'test_pmu.c',
> >>           'test_reorder.c',
> >>           'test_rib.c',
> >>           'test_rib6.c',
> >> @@ -239,6 +240,7 @@ fast_tests = [
> >>           ['kni_autotest', false, true],
> >>           ['kvargs_autotest', true, true],
> >>           ['member_autotest', true, true],
> >> +        ['pmu_autotest', true, true],
> >>           ['power_cpufreq_autotest', false, true],
> >>           ['power_autotest', true, true],
> >>           ['power_kvm_vm_autotest', false, true], diff --git
> >> a/app/test/test_pmu.c b/app/test/test_pmu.c new file mode 100644 index
> >> 0000000000..c257638e8b
> >> --- /dev/null
> >> +++ b/app/test/test_pmu.c
> >> @@ -0,0 +1,62 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> + */
> >> +
> >> +#include "test.h"
> >> +
> >> +#ifndef RTE_EXEC_ENV_LINUX
> >> +
> >> +static int
> >> +test_pmu(void)
> >> +{
> >> +	printf("pmu_autotest only supported on Linux, skipping test\n");
> >> +	return TEST_SKIPPED;
> >> +}
> >> +
> >> +#else
> >> +
> >> +#include <rte_pmu.h>
> >> +
> >> +static int
> >> +test_pmu_read(void)
> >> +{
> >> +	const char *name = NULL;
> >> +	int tries = 10, event;
> >> +	uint64_t val = 0;
> >> +
> >> +	if (name == NULL) {
> >> +		printf("PMU not supported on this arch\n");
> >> +		return TEST_SKIPPED;
> >> +	}
> >> +
> >> +	if (rte_pmu_init() < 0)
> >> +		return TEST_SKIPPED;
> >> +
> >> +	event = rte_pmu_add_event(name);
> >> +	while (tries--)
> >> +		val += rte_pmu_read(event);
> >> +
> >> +	rte_pmu_fini();
> >> +
> >> +	return val ? TEST_SUCCESS : TEST_FAILED; }
> >> +
> >> +static struct unit_test_suite pmu_tests = {
> >> +	.suite_name = "pmu autotest",
> >> +	.setup = NULL,
> >> +	.teardown = NULL,
> >> +	.unit_test_cases = {
> >> +		TEST_CASE(test_pmu_read),
> >> +		TEST_CASES_END()
> >> +	}
> >> +};
> >> +
> >> +static int
> >> +test_pmu(void)
> >> +{
> >> +	return unit_test_suite_runner(&pmu_tests);
> >> +}
> >> +
> >> +#endif /* RTE_EXEC_ENV_LINUX */
> >> +
> >> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> >> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> >> index 2deec7ea19..a8e04a195d 100644
> >> --- a/doc/api/doxy-api-index.md
> >> +++ b/doc/api/doxy-api-index.md
> >> @@ -223,7 +223,8 @@ The public API headers are grouped by topics:
> >>     [log](@ref rte_log.h),
> >>     [errno](@ref rte_errno.h),
> >>     [trace](@ref rte_trace.h),
> >> -  [trace_point](@ref rte_trace_point.h)
> >> +  [trace_point](@ref rte_trace_point.h),  [pmu](@ref rte_pmu.h)
> >>
> >>   - **misc**:
> >>     [EAL config](@ref rte_eal.h),
> >> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in index
> >> e859426099..350b5a8c94 100644
> >> --- a/doc/api/doxy-api.conf.in
> >> +++ b/doc/api/doxy-api.conf.in
> >> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
> >>                             @TOPDIR@/lib/pci \
> >>                             @TOPDIR@/lib/pdump \
> >>                             @TOPDIR@/lib/pipeline \
> >> +                          @TOPDIR@/lib/pmu \
> >>                             @TOPDIR@/lib/port \
> >>                             @TOPDIR@/lib/power \
> >>                             @TOPDIR@/lib/rawdev \ diff --git
> >> a/doc/guides/prog_guide/profile_app.rst
> >> b/doc/guides/prog_guide/profile_app.rst
> >> index 14292d4c25..89e38cd301 100644
> >> --- a/doc/guides/prog_guide/profile_app.rst
> >> +++ b/doc/guides/prog_guide/profile_app.rst
> >> @@ -7,6 +7,18 @@ Profile Your Application
> >>   The following sections describe methods of profiling DPDK applications on
> >>   different architectures.
> >>
> >> +Performance counter based profiling
> >> +-----------------------------------
> >> +
> >> +Majority of architectures support some performance monitoring unit (PMU).
> >> +Such unit provides programmable counters that monitor specific events.
> >> +
> >> +Different tools gather that information, like for example perf.
> >> +However, in some scenarios when CPU cores are isolated and run
> >> +dedicated tasks interrupting those tasks with perf may be undesirable.
> >> +
> >> +In such cases, an application can use the PMU library to read such events via
> >``rte_pmu_read()``.
> >> +
> >>
> >>   Profiling on x86
> >>   ----------------
> >> diff --git a/doc/guides/rel_notes/release_23_03.rst
> >> b/doc/guides/rel_notes/release_23_03.rst
> >> index ab998a5357..20622efe58 100644
> >> --- a/doc/guides/rel_notes/release_23_03.rst
> >> +++ b/doc/guides/rel_notes/release_23_03.rst
> >> @@ -147,6 +147,13 @@ New Features
> >>     * Added support to capture packets at each graph node with packet metadata and
> >>       node name.
> >>
> >> +* **Added PMU library.**
> >> +
> >> +  Added a new performance monitoring unit (PMU) library which allows
> >> + applications  to perform self monitoring activities without depending on external utilities
> >like perf.
> >> +  After integration with :doc:`../prog_guide/trace_lib` data gathered
> >> + from hardware counters  can be stored in CTF format for further analysis.
> >> +
> >>
> >>   Removed Items
> >>   -------------
> >> diff --git a/lib/meson.build b/lib/meson.build index
> >> 450c061d2b..8a42d45d20 100644
> >> --- a/lib/meson.build
> >> +++ b/lib/meson.build
> >> @@ -11,6 +11,7 @@
> >>   libraries = [
> >>           'kvargs', # eal depends on kvargs
> >>           'telemetry', # basic info querying
> >> +        'pmu',
> >>           'eal', # everything depends on eal
> >>           'ring',
> >>           'rcu', # rcu depends on ring diff --git
> >> a/lib/pmu/meson.build b/lib/pmu/meson.build new file mode 100644 index
> >> 0000000000..a4160b494e
> >> --- /dev/null
> >> +++ b/lib/pmu/meson.build
> >> @@ -0,0 +1,13 @@
> >> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(C) 2023 Marvell
> >> +International Ltd.
> >> +
> >> +if not is_linux
> >> +    build = false
> >> +    reason = 'only supported on Linux'
> >> +    subdir_done()
> >> +endif
> >> +
> >> +includes = [global_inc]
> >> +
> >> +sources = files('rte_pmu.c')
> >> +headers = files('rte_pmu.h')
> >> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h new file
> >> mode 100644 index 0000000000..b9f8c1ddc8
> >> --- /dev/null
> >> +++ b/lib/pmu/pmu_private.h
> >> @@ -0,0 +1,32 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2023 Marvell
> >> + */
> >> +
> >> +#ifndef _PMU_PRIVATE_H_
> >> +#define _PMU_PRIVATE_H_
> >> +
> >> +/**
> >> + * Architecture specific PMU init callback.
> >> + *
> >> + * @return
> >> + *   0 in case of success, negative value otherwise.
> >> + */
> >> +int
> >> +pmu_arch_init(void);
> >> +
> >> +/**
> >> + * Architecture specific PMU cleanup callback.
> >> + */
> >> +void
> >> +pmu_arch_fini(void);
> >> +
> >> +/**
> >> + * Apply architecture specific settings to config before passing it to syscall.
> >> + *
> >> + * @param config
> >> + *   Architecture specific event configuration. Consult kernel sources for available options.
> >> + */
> >> +void
> >> +pmu_arch_fixup_config(uint64_t config[3]);
> >> +
> >> +#endif /* _PMU_PRIVATE_H_ */
> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
> >> 100644 index 0000000000..950f999cb7
> >> --- /dev/null
> >> +++ b/lib/pmu/rte_pmu.c
> >> @@ -0,0 +1,460 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> + */
> >> +
> >> +#include <ctype.h>
> >> +#include <dirent.h>
> >> +#include <errno.h>
> >> +#include <regex.h>
> >> +#include <stdlib.h>
> >> +#include <string.h>
> >> +#include <sys/ioctl.h>
> >> +#include <sys/mman.h>
> >> +#include <sys/queue.h>
> >> +#include <sys/syscall.h>
> >> +#include <unistd.h>
> >> +
> >> +#include <rte_atomic.h>
> >> +#include <rte_per_lcore.h>
> >> +#include <rte_pmu.h>
> >> +#include <rte_spinlock.h>
> >> +#include <rte_tailq.h>
> >> +
> >> +#include "pmu_private.h"
> >> +
> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> >
> >
> >I suppose that pass (as the whole implementation) is linux specific?
> >If so, wouldn't it make sense to have it under linux subdir?
> >
> 
> There aren't any plans to support this elsewhere currently, so a flat
> directory structure is good enough.
> 
> >> +
> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
> >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
> >> +(__builtin_ffsll(m) - 1)) & (m))
> >> +
> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> +struct rte_pmu rte_pmu;
> >
> >Do we really need struct declaration here?
> >
> 
> What’s the problem with this placement precisely?

Not a big deal, but it seems excessive to me.
As I understand it, you already have an include just above for the whole .h
that contains the definition of that struct anyway.
 
> 
> >
> >> +/*
> >> + * Following __rte_weak functions provide default no-op.
> >> +Architectures should override them if
> >> + * necessary.
> >> + */
> >> +
> >> +int
> >> +__rte_weak pmu_arch_init(void)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fini(void)
> >> +{
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3]) { }
> >> +
> >> +static int
> >> +get_term_format(const char *name, int *num, uint64_t *mask) {
> >> +	char path[PATH_MAX];
> >> +	char *config = NULL;
> >> +	int high, low, ret;
> >> +	FILE *fp;
> >> +
> >> +	*num = *mask = 0;
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	errno = 0;
> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> >> +	if (ret < 2) {
> >> +		ret = -ENODATA;
> >> +		goto out;
> >> +	}
> >> +	if (errno) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (ret == 2)
> >> +		high = low;
> >> +
> >> +	*mask = GENMASK_ULL(high, low);
> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> >> +	*num = config[strlen(config) - 1];
> >> +	*num = isdigit(*num) ? *num - '0' : 0;
> >> +
> >> +	ret = 0;
> >> +out:
> >> +	free(config);
> >> +	fclose(fp);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +parse_event(char *buf, uint64_t config[3]) {
> >> +	char *token, *term;
> >> +	int num, ret, val;
> >> +	uint64_t mask;
> >> +
> >> +	config[0] = config[1] = config[2] = 0;
> >> +
> >> +	token = strtok(buf, ",");
> >> +	while (token) {
> >> +		errno = 0;
> >> +		/* <term>=<value> */
> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> >> +		if (ret < 1)
> >> +			return -ENODATA;
> >> +		if (errno)
> >> +			return -errno;
> >> +		if (ret == 1)
> >> +			val = 1;
> >> +
> >> +		ret = get_term_format(term, &num, &mask);
> >> +		free(term);
> >> +		if (ret)
> >> +			return ret;
> >> +
> >> +		config[num] |= FIELD_PREP(mask, val);
> >> +		token = strtok(NULL, ",");
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int
> >> +get_event_config(const char *name, uint64_t config[3]) {
> >> +	char path[PATH_MAX], buf[BUFSIZ];
> >> +	FILE *fp;
> >> +	int ret;
> >> +
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	ret = fread(buf, 1, sizeof(buf), fp);
> >> +	if (ret == 0) {
> >> +		fclose(fp);
> >> +
> >> +		return -EINVAL;
> >> +	}
> >> +	fclose(fp);
> >> +	buf[ret] = '\0';
> >> +
> >> +	return parse_event(buf, config);
> >> +}
> >> +
> >> +static int
> >> +do_perf_event_open(uint64_t config[3], int group_fd) {
> >> +	struct perf_event_attr attr = {
> >> +		.size = sizeof(struct perf_event_attr),
> >> +		.type = PERF_TYPE_RAW,
> >> +		.exclude_kernel = 1,
> >> +		.exclude_hv = 1,
> >> +		.disabled = 1,
> >> +	};
> >> +
> >> +	pmu_arch_fixup_config(config);
> >> +
> >> +	attr.config = config[0];
> >> +	attr.config1 = config[1];
> >> +	attr.config2 = config[2];
> >> +
> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
> >> +
> >> +static int
> >> +open_events(struct rte_pmu_event_group *group) {
> >> +	struct rte_pmu_event *event;
> >> +	uint64_t config[3];
> >> +	int num = 0, ret;
> >> +
> >> +	/* group leader gets created first, with fd = -1 */
> >> +	group->fds[0] = -1;
> >> +
> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> +		ret = get_event_config(event->name, config);
> >> +		if (ret)
> >> +			continue;
> >> +
> >> +		ret = do_perf_event_open(config, group->fds[0]);
> >> +		if (ret == -1) {
> >> +			ret = -errno;
> >> +			goto out;
> >> +		}
> >> +
> >> +		group->fds[event->index] = ret;
> >> +		num++;
> >> +	}
> >> +
> >> +	return 0;
> >> +out:
> >> +	for (--num; num >= 0; num--) {
> >> +		close(group->fds[num]);
> >> +		group->fds[num] = -1;
> >> +	}
> >> +
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +mmap_events(struct rte_pmu_event_group *group) {
> >> +	long page_size = sysconf(_SC_PAGE_SIZE);
> >> +	unsigned int i;
> >> +	void *addr;
> >> +	int ret;
> >> +
> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> >> +		if (addr == MAP_FAILED) {
> >> +			ret = -errno;
> >> +			goto out;
> >> +		}
> >> +
> >> +		group->mmap_pages[i] = addr;
> >> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> >> +			ret = -EPERM;
> >> +			goto out;
> >> +		}
> >> +	}
> >> +
> >> +	return 0;
> >> +out:
> >> +	for (; i; i--) {
> >> +		munmap(group->mmap_pages[i - 1], page_size);
> >> +		group->mmap_pages[i - 1] = NULL;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void
> >> +cleanup_events(struct rte_pmu_event_group *group) {
> >> +	unsigned int i;
> >> +
> >> +	if (group->fds[0] != -1)
> >> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> >> +
> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> +		if (group->mmap_pages[i]) {
> >> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> >> +			group->mmap_pages[i] = NULL;
> >> +		}
> >> +
> >> +		if (group->fds[i] != -1) {
> >> +			close(group->fds[i]);
> >> +			group->fds[i] = -1;
> >> +		}
> >> +	}
> >> +
> >> +	group->enabled = false;
> >> +}
> >> +
> >> +int
> >> +__rte_pmu_enable_group(void)
> >> +{
> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> +	int ret;
> >> +
> >> +	if (rte_pmu.num_group_events == 0)
> >> +		return -ENODEV;
> >> +
> >> +	ret = open_events(group);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	ret = mmap_events(group);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	rte_spinlock_lock(&rte_pmu.lock);
> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> >> +	rte_spinlock_unlock(&rte_pmu.lock);
> >> +	group->enabled = true;
> >> +
> >> +	return 0;
> >> +
> >> +out:
> >> +	cleanup_events(group);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +scan_pmus(void)
> >> +{
> >> +	char path[PATH_MAX];
> >> +	struct dirent *dent;
> >> +	const char *name;
> >> +	DIR *dirp;
> >> +
> >> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> >> +	if (dirp == NULL)
> >> +		return -errno;
> >> +
> >> +	while ((dent = readdir(dirp))) {
> >> +		name = dent->d_name;
> >> +		if (name[0] == '.')
> >> +			continue;
> >> +
> >> +		/* sysfs entry should either contain cpus or be a cpu */
> >> +		if (!strcmp(name, "cpu"))
> >> +			break;
> >> +
> >> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> >> +		if (access(path, F_OK) == 0)
> >> +			break;
> >> +	}
> >> +
> >> +	if (dent) {
> >> +		rte_pmu.name = strdup(name);
> >> +		if (rte_pmu.name == NULL) {
> >> +			closedir(dirp);
> >> +
> >> +			return -ENOMEM;
> >> +		}
> >> +	}
> >> +
> >> +	closedir(dirp);
> >> +
> >> +	return rte_pmu.name ? 0 : -ENODEV;
> >> +}
> >> +
> >> +static struct rte_pmu_event *
> >> +new_event(const char *name)
> >> +{
> >> +	struct rte_pmu_event *event;
> >> +
> >> +	event = calloc(1, sizeof(*event));
> >> +	if (event == NULL)
> >> +		goto out;
> >> +
> >> +	event->name = strdup(name);
> >> +	if (event->name == NULL) {
> >> +		free(event);
> >> +		event = NULL;
> >> +	}
> >> +
> >> +out:
> >> +	return event;
> >> +}
> >> +
> >> +static void
> >> +free_event(struct rte_pmu_event *event) {
> >> +	free(event->name);
> >> +	free(event);
> >> +}
> >> +
> >> +int
> >> +rte_pmu_add_event(const char *name)
> >> +{
> >> +	struct rte_pmu_event *event;
> >> +	char path[PATH_MAX];
> >> +
> >> +	if (rte_pmu.name == NULL)
> >> +		return -ENODEV;
> >> +
> >> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> >> +		return -ENOSPC;
> >> +
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> >> +	if (access(path, R_OK))
> >> +		return -ENODEV;
> >> +
> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> +		if (!strcmp(event->name, name))
> >> +			return event->index;
> >> +		continue;
> >> +	}
> >> +
> >> +	event = new_event(name);
> >> +	if (event == NULL)
> >> +		return -ENOMEM;
> >> +
> >> +	event->index = rte_pmu.num_group_events++;
> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> >> +
> >> +	return event->index;
> >> +}
> >> +
> >> +int
> >> +rte_pmu_init(void)
> >> +{
> >> +	int ret;
> >> +
> >> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> >> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> >> +	 * via command line but application doesn't care enough and performs init/fini again.
> >> +	 */
> >> +	if (rte_pmu.initialized != 0) {
> >> +		rte_pmu.initialized++;
> >> +		return 0;
> >> +	}
> >> +
> >> +	ret = scan_pmus();
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	ret = pmu_arch_init();
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	TAILQ_INIT(&rte_pmu.event_list);
> >> +	TAILQ_INIT(&rte_pmu.event_group_list);
> >> +	rte_spinlock_init(&rte_pmu.lock);
> >> +	rte_pmu.initialized = 1;
> >> +
> >> +	return 0;
> >> +out:
> >> +	free(rte_pmu.name);
> >> +	rte_pmu.name = NULL;
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +void
> >> +rte_pmu_fini(void)
> >> +{
> >> +	struct rte_pmu_event_group *group, *tmp_group;
> >> +	struct rte_pmu_event *event, *tmp_event;
> >> +
> >> +	/* cleanup once init count drops to zero */
> >> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> >> +		return;
> >> +
> >> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> >> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> >> +		free_event(event);
> >> +	}
> >> +
> >> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> >> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> >> +		cleanup_events(group);
> >> +	}
> >> +
> >> +	pmu_arch_fini();
> >> +	free(rte_pmu.name);
> >> +	rte_pmu.name = NULL;
> >> +	rte_pmu.num_group_events = 0;
> >> +}
> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
> >> 100644 index 0000000000..6b664c3336
> >> --- /dev/null
> >> +++ b/lib/pmu/rte_pmu.h
> >> @@ -0,0 +1,212 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2023 Marvell
> >> + */
> >> +
> >> +#ifndef _RTE_PMU_H_
> >> +#define _RTE_PMU_H_
> >> +
> >> +/**
> >> + * @file
> >> + *
> >> + * PMU event tracing operations
> >> + *
> >> + * This file defines generic API and types necessary to setup PMU and
> >> + * read selected counters in runtime.
> >> + */
> >> +
> >> +#ifdef __cplusplus
> >> +extern "C" {
> >> +#endif
> >> +
> >> +#include <linux/perf_event.h>
> >> +
> >> +#include <rte_atomic.h>
> >> +#include <rte_branch_prediction.h>
> >> +#include <rte_common.h>
> >> +#include <rte_compat.h>
> >> +#include <rte_spinlock.h>
> >> +
> >> +/** Maximum number of events in a group */ #define
> >> +MAX_NUM_GROUP_EVENTS 8
> >> +
> >> +/**
> >> + * A structure describing a group of events.
> >> + */
> >> +struct rte_pmu_event_group {
> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
> >> +__rte_cache_aligned;
> >> +
> >> +/**
> >> + * A structure describing an event.
> >> + */
> >> +struct rte_pmu_event {
> >> +	char *name; /**< name of an event */
> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
> >> +
> >> +/**
> >> + * A PMU state container.
> >> + */
> >> +struct rte_pmu {
> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> +	unsigned int initialized; /**< initialization counter */ };
> >> +
> >> +/** lcore event group */
> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> +
> >> +/** PMU state container */
> >> +extern struct rte_pmu rte_pmu;
> >> +
> >> +/** Each architecture supporting PMU needs to provide its own version
> >> +*/ #ifndef rte_pmu_pmc_read #define rte_pmu_pmc_read(index) ({ 0; })
> >> +#endif
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Read PMU counter.
> >> + *
> >> + * @warning This should be not called directly.
> >> + *
> >> + * @param pc
> >> + *   Pointer to the mmapped user page.
> >> + * @return
> >> + *   Counter value read from hardware.
> >> + */
> >> +static __rte_always_inline uint64_t
> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
> >> +	uint64_t width, offset;
> >> +	uint32_t seq, index;
> >> +	int64_t pmc;
> >> +
> >> +	for (;;) {
> >> +		seq = pc->lock;
> >> +		rte_compiler_barrier();
> >
> >Are you sure that compiler_barrier() is enough here?
> >On some archs CPU itself has freedom to re-order reads.
> >Or I am missing something obvious here?
> >
> 
> It's a matter of not keeping old values cached in registers
> and making sure that we have two reads of the lock. CPU reordering
> won't do any harm here.

Sorry, I didn't get you here:
Suppose the CPU re-orders reads and reads the lock *after* the index or offset value.
Wouldn't that mean the index and/or offset could contain old/invalid values?
 
> 
> >> +		index = pc->index;
> >> +		offset = pc->offset;
> >> +		width = pc->pmc_width;
> >> +
> >> +		/* index set to 0 means that particular counter cannot be used */
> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> +			pmc <<= 64 - width;
> >> +			pmc >>= 64 - width;
> >> +			offset += pmc;
> >> +		}
> >> +
> >> +		rte_compiler_barrier();
> >> +
> >> +		if (likely(pc->lock == seq))
> >> +			return offset;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Enable group of events on the calling lcore.
> >> + *
> >> + * @warning This should be not called directly.
> >> + *
> >> + * @return
> >> + *   0 in case of success, negative value otherwise.
> >> + */
> >> +__rte_experimental
> >> +int
> >> +__rte_pmu_enable_group(void);
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Initialize PMU library.
> >> + *
> >> + * @warning This should be not called directly.
> >> + *
> >> + * @return
> >> + *   0 in case of success, negative value otherwise.
> >> + */
> >> +__rte_experimental
> >> +int
> >> +rte_pmu_init(void);
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> >> + */
> >> +__rte_experimental
> >> +void
> >> +rte_pmu_fini(void);
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Add event to the group of enabled events.
> >> + *
> >> + * @param name
> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> >> + * @return
> >> + *   Event index in case of success, negative value otherwise.
> >> + */
> >> +__rte_experimental
> >> +int
> >> +rte_pmu_add_event(const char *name);
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Read hardware counter configured to count occurrences of an event.
> >> + *
> >> + * @param index
> >> + *   Index of an event to be read.
> >> + * @return
> >> + *   Event value read from register. In case of errors or lack of support
> >> + *   0 is returned. In other words, stream of zeros in a trace file
> >> + *   indicates problem with reading particular PMU event register.
> >> + */
> >> +__rte_experimental
> >> +static __rte_always_inline uint64_t
> >> +rte_pmu_read(unsigned int index)
> >> +{
> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> +	int ret;
> >> +
> >> +	if (unlikely(!rte_pmu.initialized))
> >> +		return 0;
> >> +
> >> +	if (unlikely(!group->enabled)) {
> >> +		ret = __rte_pmu_enable_group();
> >> +		if (ret)
> >> +			return 0;
> >> +	}
> >> +
> >> +	if (unlikely(index >= rte_pmu.num_group_events))
> >> +		return 0;
> >> +
> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> >> +}
> >> +
> >> +#ifdef __cplusplus
> >> +}
> >> +#endif
> >> +
> >> +#endif /* _RTE_PMU_H_ */
> >> diff --git a/lib/pmu/version.map b/lib/pmu/version.map new file mode
> >> 100644 index 0000000000..39a4f279c1
> >> --- /dev/null
> >> +++ b/lib/pmu/version.map
> >> @@ -0,0 +1,15 @@
> >> +DPDK_23 {
> >> +	local: *;
> >> +};
> >> +
> >> +EXPERIMENTAL {
> >> +	global:
> >> +
> >> +	__rte_pmu_enable_group;
> >> +	per_lcore__event_group;
> >> +	rte_pmu;
> >> +	rte_pmu_add_event;
> >> +	rte_pmu_fini;
> >> +	rte_pmu_init;
> >> +	rte_pmu_read;
> >> +};
  
Tomasz Duszynski Feb. 19, 2023, 2:23 p.m. UTC | #4
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>Sent: Friday, February 17, 2023 11:15 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>;
>dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>
>
>> >>
>> >> This is especially useful in cases where CPU cores are isolated i.e
>> >> run dedicated tasks. In such cases one cannot use standard perf
>> >> utility without sacrificing latency and performance.
>> >>
>> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> >> ---
>> >>   MAINTAINERS                            |   5 +
>> >>   app/test/meson.build                   |   2 +
>> >>   app/test/test_pmu.c                    |  62 ++++
>> >>   doc/api/doxy-api-index.md              |   3 +-
>> >>   doc/api/doxy-api.conf.in               |   1 +
>> >>   doc/guides/prog_guide/profile_app.rst  |  12 +
>> >>   doc/guides/rel_notes/release_23_03.rst |   7 +
>> >>   lib/meson.build                        |   1 +
>> >>   lib/pmu/meson.build                    |  13 +
>> >>   lib/pmu/pmu_private.h                  |  32 ++
>> >>   lib/pmu/rte_pmu.c                      | 460 +++++++++++++++++++++++++
>> >>   lib/pmu/rte_pmu.h                      | 212 ++++++++++++
>> >>   lib/pmu/version.map                    |  15 +
>> >>   13 files changed, 824 insertions(+), 1 deletion(-)
>> >>   create mode 100644 app/test/test_pmu.c
>> >>   create mode 100644 lib/pmu/meson.build
>> >>   create mode 100644 lib/pmu/pmu_private.h
>> >>   create mode 100644 lib/pmu/rte_pmu.c
>> >>   create mode 100644 lib/pmu/rte_pmu.h
>> >>   create mode 100644 lib/pmu/version.map
>> >>
>> >> diff --git a/MAINTAINERS b/MAINTAINERS index 3495946d0f..d37f242120
>> >> 100644
>> >> --- a/MAINTAINERS
>> >> +++ b/MAINTAINERS
>> >> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
>> >>   M: Pavan Nikhilesh <pbhagavatula@marvell.com>
>> >>   F: lib/node/
>> >>
>> >> +PMU - EXPERIMENTAL
>> >> +M: Tomasz Duszynski <tduszynski@marvell.com>
>> >> +F: lib/pmu/
>> >> +F: app/test/test_pmu*
>> >> +
>> >>
>> >>   Test Applications
>> >>   -----------------
>> >> diff --git a/app/test/meson.build b/app/test/meson.build index
>> >> f34d19e3c3..6b61b7fc32 100644
>> >> --- a/app/test/meson.build
>> >> +++ b/app/test/meson.build
>> >> @@ -111,6 +111,7 @@ test_sources = files(
>> >>           'test_reciprocal_division_perf.c',
>> >>           'test_red.c',
>> >>           'test_pie.c',
>> >> +        'test_pmu.c',
>> >>           'test_reorder.c',
>> >>           'test_rib.c',
>> >>           'test_rib6.c',
>> >> @@ -239,6 +240,7 @@ fast_tests = [
>> >>           ['kni_autotest', false, true],
>> >>           ['kvargs_autotest', true, true],
>> >>           ['member_autotest', true, true],
>> >> +        ['pmu_autotest', true, true],
>> >>           ['power_cpufreq_autotest', false, true],
>> >>           ['power_autotest', true, true],
>> >>           ['power_kvm_vm_autotest', false, true], diff --git
>> >> a/app/test/test_pmu.c b/app/test/test_pmu.c new file mode 100644
>> >> index 0000000000..c257638e8b
>> >> --- /dev/null
>> >> +++ b/app/test/test_pmu.c
>> >> @@ -0,0 +1,62 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(C) 2023 Marvell International Ltd.
>> >> + */
>> >> +
>> >> +#include "test.h"
>> >> +
>> >> +#ifndef RTE_EXEC_ENV_LINUX
>> >> +
>> >> +static int
>> >> +test_pmu(void)
>> >> +{
>> >> +	printf("pmu_autotest only supported on Linux, skipping test\n");
>> >> +	return TEST_SKIPPED;
>> >> +}
>> >> +
>> >> +#else
>> >> +
>> >> +#include <rte_pmu.h>
>> >> +
>> >> +static int
>> >> +test_pmu_read(void)
>> >> +{
>> >> +	const char *name = NULL;
>> >> +	int tries = 10, event;
>> >> +	uint64_t val = 0;
>> >> +
>> >> +	if (name == NULL) {
>> >> +		printf("PMU not supported on this arch\n");
>> >> +		return TEST_SKIPPED;
>> >> +	}
>> >> +
>> >> +	if (rte_pmu_init() < 0)
>> >> +		return TEST_SKIPPED;
>> >> +
>> >> +	event = rte_pmu_add_event(name);
>> >> +	while (tries--)
>> >> +		val += rte_pmu_read(event);
>> >> +
>> >> +	rte_pmu_fini();
>> >> +
>> >> +	return val ? TEST_SUCCESS : TEST_FAILED; }
>> >> +
>> >> +static struct unit_test_suite pmu_tests = {
>> >> +	.suite_name = "pmu autotest",
>> >> +	.setup = NULL,
>> >> +	.teardown = NULL,
>> >> +	.unit_test_cases = {
>> >> +		TEST_CASE(test_pmu_read),
>> >> +		TEST_CASES_END()
>> >> +	}
>> >> +};
>> >> +
>> >> +static int
>> >> +test_pmu(void)
>> >> +{
>> >> +	return unit_test_suite_runner(&pmu_tests);
>> >> +}
>> >> +
>> >> +#endif /* RTE_EXEC_ENV_LINUX */
>> >> +
>> >> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
>> >> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
>> >> index 2deec7ea19..a8e04a195d 100644
>> >> --- a/doc/api/doxy-api-index.md
>> >> +++ b/doc/api/doxy-api-index.md
>> >> @@ -223,7 +223,8 @@ The public API headers are grouped by topics:
>> >>     [log](@ref rte_log.h),
>> >>     [errno](@ref rte_errno.h),
>> >>     [trace](@ref rte_trace.h),
>> >> -  [trace_point](@ref rte_trace_point.h)
>> >> +  [trace_point](@ref rte_trace_point.h),  [pmu](@ref rte_pmu.h)
>> >>
>> >>   - **misc**:
>> >>     [EAL config](@ref rte_eal.h),
>> >> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
>> >> index
>> >> e859426099..350b5a8c94 100644
>> >> --- a/doc/api/doxy-api.conf.in
>> >> +++ b/doc/api/doxy-api.conf.in
>> >> @@ -63,6 +63,7 @@ INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
>> >>                             @TOPDIR@/lib/pci \
>> >>                             @TOPDIR@/lib/pdump \
>> >>                             @TOPDIR@/lib/pipeline \
>> >> +                          @TOPDIR@/lib/pmu \
>> >>                             @TOPDIR@/lib/port \
>> >>                             @TOPDIR@/lib/power \
>> >>                             @TOPDIR@/lib/rawdev \ diff --git
>> >> a/doc/guides/prog_guide/profile_app.rst
>> >> b/doc/guides/prog_guide/profile_app.rst
>> >> index 14292d4c25..89e38cd301 100644
>> >> --- a/doc/guides/prog_guide/profile_app.rst
>> >> +++ b/doc/guides/prog_guide/profile_app.rst
>> >> @@ -7,6 +7,18 @@ Profile Your Application
>> >>   The following sections describe methods of profiling DPDK applications on
>> >>   different architectures.
>> >>
>> >> +Performance counter based profiling
>> >> +-----------------------------------
>> >> +
>> >> +Majority of architectures support some performance monitoring unit (PMU).
>> >> +Such unit provides programmable counters that monitor specific events.
>> >> +
>> >> +Different tools gather that information, like for example perf.
>> >> +However, in some scenarios when CPU cores are isolated and run
>> >> +dedicated tasks interrupting those tasks with perf may be undesirable.
>> >> +
>> >> +In such cases, an application can use the PMU library to read such
>> >> +events via
>> >``rte_pmu_read()``.
>> >> +
>> >>
>> >>   Profiling on x86
>> >>   ----------------
>> >> diff --git a/doc/guides/rel_notes/release_23_03.rst
>> >> b/doc/guides/rel_notes/release_23_03.rst
>> >> index ab998a5357..20622efe58 100644
>> >> --- a/doc/guides/rel_notes/release_23_03.rst
>> >> +++ b/doc/guides/rel_notes/release_23_03.rst
>> >> @@ -147,6 +147,13 @@ New Features
>> >>     * Added support to capture packets at each graph node with packet metadata and
>> >>       node name.
>> >>
>> >> +* **Added PMU library.**
>> >> +
>> >> +  Added a new performance monitoring unit (PMU) library which
>> >> + allows applications  to perform self monitoring activities
>> >> + without depending on external utilities
>> >like perf.
>> >> +  After integration with :doc:`../prog_guide/trace_lib` data
>> >> + gathered from hardware counters  can be stored in CTF format for further analysis.
>> >> +
>> >>
>> >>   Removed Items
>> >>   -------------
>> >> diff --git a/lib/meson.build b/lib/meson.build index
>> >> 450c061d2b..8a42d45d20 100644
>> >> --- a/lib/meson.build
>> >> +++ b/lib/meson.build
>> >> @@ -11,6 +11,7 @@
>> >>   libraries = [
>> >>           'kvargs', # eal depends on kvargs
>> >>           'telemetry', # basic info querying
>> >> +        'pmu',
>> >>           'eal', # everything depends on eal
>> >>           'ring',
>> >>           'rcu', # rcu depends on ring diff --git
>> >> a/lib/pmu/meson.build b/lib/pmu/meson.build new file mode 100644
>> >> index 0000000000..a4160b494e
>> >> --- /dev/null
>> >> +++ b/lib/pmu/meson.build
>> >> @@ -0,0 +1,13 @@
>> >> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(C) 2023
>> >> +Marvell International Ltd.
>> >> +
>> >> +if not is_linux
>> >> +    build = false
>> >> +    reason = 'only supported on Linux'
>> >> +    subdir_done()
>> >> +endif
>> >> +
>> >> +includes = [global_inc]
>> >> +
>> >> +sources = files('rte_pmu.c')
>> >> +headers = files('rte_pmu.h')
>> >> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h new file
>> >> mode 100644 index 0000000000..b9f8c1ddc8
>> >> --- /dev/null
>> >> +++ b/lib/pmu/pmu_private.h
>> >> @@ -0,0 +1,32 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(c) 2023 Marvell
>> >> + */
>> >> +
>> >> +#ifndef _PMU_PRIVATE_H_
>> >> +#define _PMU_PRIVATE_H_
>> >> +
>> >> +/**
>> >> + * Architecture specific PMU init callback.
>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >> +int
>> >> +pmu_arch_init(void);
>> >> +
>> >> +/**
>> >> + * Architecture specific PMU cleanup callback.
>> >> + */
>> >> +void
>> >> +pmu_arch_fini(void);
>> >> +
>> >> +/**
>> >> + * Apply architecture specific settings to config before passing it to syscall.
>> >> + *
>> >> + * @param config
>> >> + *   Architecture specific event configuration. Consult kernel sources for available options.
>> >> + */
>> >> +void
>> >> +pmu_arch_fixup_config(uint64_t config[3]);
>> >> +
>> >> +#endif /* _PMU_PRIVATE_H_ */
>> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
>> >> 100644 index 0000000000..950f999cb7
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.c
>> >> @@ -0,0 +1,460 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(C) 2023 Marvell International Ltd.
>> >> + */
>> >> +
>> >> +#include <ctype.h>
>> >> +#include <dirent.h>
>> >> +#include <errno.h>
>> >> +#include <regex.h>
>> >> +#include <stdlib.h>
>> >> +#include <string.h>
>> >> +#include <sys/ioctl.h>
>> >> +#include <sys/mman.h>
>> >> +#include <sys/queue.h>
>> >> +#include <sys/syscall.h>
>> >> +#include <unistd.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_per_lcore.h>
>> >> +#include <rte_pmu.h>
>> >> +#include <rte_spinlock.h>
>> >> +#include <rte_tailq.h>
>> >> +
>> >> +#include "pmu_private.h"
>> >> +
>> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> >
>> >
>> >I suppose that pass (as the whole implementation) is linux specific?
>> >If so, wouldn't it make sense to have it under linux subdir?
>> >
>>
>> There are not any plans to support that elsewhere currently so flat
>> directory structure is good enough.
>>
>> >> +
>> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
>> >> +(__builtin_ffsll(m) - 1)) & (m))
>> >> +
>> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +struct rte_pmu rte_pmu;
>> >
>> >Do we really need struct declaration here?
>> >
>>
>> What’s the problem with this placement precisely?
>
>Not a big deal, but It seems excessive for me.
>As I understand you do have include just above for the whole .h that contains definition of that
>struct anyway.
>
>>
>> >
>> >> +/*
>> >> + * Following __rte_weak functions provide default no-op.
>> >> +Architectures should override them if
>> >> + * necessary.
>> >> + */
>> >> +
>> >> +int
>> >> +__rte_weak pmu_arch_init(void)
>> >> +{
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fini(void)
>> >> +{
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
>> >> +{ }
>> >> +
>> >> +static int
>> >> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> >> +	char path[PATH_MAX];
>> >> +	char *config = NULL;
>> >> +	int high, low, ret;
>> >> +	FILE *fp;
>> >> +
>> >> +	*num = *mask = 0;
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name,
>name);
>> >> +	fp = fopen(path, "r");
>> >> +	if (fp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	errno = 0;
>> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> >> +	if (ret < 2) {
>> >> +		ret = -ENODATA;
>> >> +		goto out;
>> >> +	}
>> >> +	if (errno) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	if (ret == 2)
>> >> +		high = low;
>> >> +
>> >> +	*mask = GENMASK_ULL(high, low);
>> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
>> >> +	*num = config[strlen(config) - 1];
>> >> +	*num = isdigit(*num) ? *num - '0' : 0;
>> >> +
>> >> +	ret = 0;
>> >> +out:
>> >> +	free(config);
>> >> +	fclose(fp);
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +parse_event(char *buf, uint64_t config[3]) {
>> >> +	char *token, *term;
>> >> +	int num, ret, val;
>> >> +	uint64_t mask;
>> >> +
>> >> +	config[0] = config[1] = config[2] = 0;
>> >> +
>> >> +	token = strtok(buf, ",");
>> >> +	while (token) {
>> >> +		errno = 0;
>> >> +		/* <term>=<value> */
>> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> >> +		if (ret < 1)
>> >> +			return -ENODATA;
>> >> +		if (errno)
>> >> +			return -errno;
>> >> +		if (ret == 1)
>> >> +			val = 1;
>> >> +
>> >> +		ret = get_term_format(term, &num, &mask);
>> >> +		free(term);
>> >> +		if (ret)
>> >> +			return ret;
>> >> +
>> >> +		config[num] |= FIELD_PREP(mask, val);
>> >> +		token = strtok(NULL, ",");
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +static int
>> >> +get_event_config(const char *name, uint64_t config[3]) {
>> >> +	char path[PATH_MAX], buf[BUFSIZ];
>> >> +	FILE *fp;
>> >> +	int ret;
>> >> +
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> >> +	fp = fopen(path, "r");
>> >> +	if (fp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	ret = fread(buf, 1, sizeof(buf), fp);
>> >> +	if (ret == 0) {
>> >> +		fclose(fp);
>> >> +
>> >> +		return -EINVAL;
>> >> +	}
>> >> +	fclose(fp);
>> >> +	buf[ret] = '\0';
>> >> +
>> >> +	return parse_event(buf, config);
>> >> +}
>> >> +
>> >> +static int
>> >> +do_perf_event_open(uint64_t config[3], int group_fd) {
>> >> +	struct perf_event_attr attr = {
>> >> +		.size = sizeof(struct perf_event_attr),
>> >> +		.type = PERF_TYPE_RAW,
>> >> +		.exclude_kernel = 1,
>> >> +		.exclude_hv = 1,
>> >> +		.disabled = 1,
>> >> +	};
>> >> +
>> >> +	pmu_arch_fixup_config(config);
>> >> +
>> >> +	attr.config = config[0];
>> >> +	attr.config1 = config[1];
>> >> +	attr.config2 = config[2];
>> >> +
>> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
>> >> +
>> >> +static int
>> >> +open_events(struct rte_pmu_event_group *group) {
>> >> +	struct rte_pmu_event *event;
>> >> +	uint64_t config[3];
>> >> +	int num = 0, ret;
>> >> +
>> >> +	/* group leader gets created first, with fd = -1 */
>> >> +	group->fds[0] = -1;
>> >> +
>> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +		ret = get_event_config(event->name, config);
>> >> +		if (ret)
>> >> +			continue;
>> >> +
>> >> +		ret = do_perf_event_open(config, group->fds[0]);
>> >> +		if (ret == -1) {
>> >> +			ret = -errno;
>> >> +			goto out;
>> >> +		}
>> >> +
>> >> +		group->fds[event->index] = ret;
>> >> +		num++;
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	for (--num; num >= 0; num--) {
>> >> +		close(group->fds[num]);
>> >> +		group->fds[num] = -1;
>> >> +	}
>> >> +
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +mmap_events(struct rte_pmu_event_group *group) {
>> >> +	long page_size = sysconf(_SC_PAGE_SIZE);
>> >> +	unsigned int i;
>> >> +	void *addr;
>> >> +	int ret;
>> >> +
>> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> >> +		if (addr == MAP_FAILED) {
>> >> +			ret = -errno;
>> >> +			goto out;
>> >> +		}
>> >> +
>> >> +		group->mmap_pages[i] = addr;
>> >> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
>> >> +			ret = -EPERM;
>> >> +			goto out;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	for (; i; i--) {
>> >> +		munmap(group->mmap_pages[i - 1], page_size);
>> >> +		group->mmap_pages[i - 1] = NULL;
>> >> +	}
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static void
>> >> +cleanup_events(struct rte_pmu_event_group *group) {
>> >> +	unsigned int i;
>> >> +
>> >> +	if (group->fds[0] != -1)
>> >> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE,
>> >> +PERF_IOC_FLAG_GROUP);
>> >> +
>> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +		if (group->mmap_pages[i]) {
>> >> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> >> +			group->mmap_pages[i] = NULL;
>> >> +		}
>> >> +
>> >> +		if (group->fds[i] != -1) {
>> >> +			close(group->fds[i]);
>> >> +			group->fds[i] = -1;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	group->enabled = false;
>> >> +}
>> >> +
>> >> +int
>> >> +__rte_pmu_enable_group(void)
>> >> +{
>> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> +	int ret;
>> >> +
>> >> +	if (rte_pmu.num_group_events == 0)
>> >> +		return -ENODEV;
>> >> +
>> >> +	ret = open_events(group);
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	ret = mmap_events(group);
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	rte_spinlock_lock(&rte_pmu.lock);
>> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>> >> +	rte_spinlock_unlock(&rte_pmu.lock);
>> >> +	group->enabled = true;
>> >> +
>> >> +	return 0;
>> >> +
>> >> +out:
>> >> +	cleanup_events(group);
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +scan_pmus(void)
>> >> +{
>> >> +	char path[PATH_MAX];
>> >> +	struct dirent *dent;
>> >> +	const char *name;
>> >> +	DIR *dirp;
>> >> +
>> >> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> >> +	if (dirp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	while ((dent = readdir(dirp))) {
>> >> +		name = dent->d_name;
>> >> +		if (name[0] == '.')
>> >> +			continue;
>> >> +
>> >> +		/* sysfs entry should either contain cpus or be a cpu */
>> >> +		if (!strcmp(name, "cpu"))
>> >> +			break;
>> >> +
>> >> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> >> +		if (access(path, F_OK) == 0)
>> >> +			break;
>> >> +	}
>> >> +
>> >> +	if (dent) {
>> >> +		rte_pmu.name = strdup(name);
>> >> +		if (rte_pmu.name == NULL) {
>> >> +			closedir(dirp);
>> >> +
>> >> +			return -ENOMEM;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	closedir(dirp);
>> >> +
>> >> +	return rte_pmu.name ? 0 : -ENODEV; }
>> >> +
>> >> +static struct rte_pmu_event *
>> >> +new_event(const char *name)
>> >> +{
>> >> +	struct rte_pmu_event *event;
>> >> +
>> >> +	event = calloc(1, sizeof(*event));
>> >> +	if (event == NULL)
>> >> +		goto out;
>> >> +
>> >> +	event->name = strdup(name);
>> >> +	if (event->name == NULL) {
>> >> +		free(event);
>> >> +		event = NULL;
>> >> +	}
>> >> +
>> >> +out:
>> >> +	return event;
>> >> +}
>> >> +
>> >> +static void
>> >> +free_event(struct rte_pmu_event *event) {
>> >> +	free(event->name);
>> >> +	free(event);
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_add_event(const char *name) {
>> >> +	struct rte_pmu_event *event;
>> >> +	char path[PATH_MAX];
>> >> +
>> >> +	if (rte_pmu.name == NULL)
>> >> +		return -ENODEV;
>> >> +
>> >> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> >> +		return -ENOSPC;
>> >> +
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> >> +	if (access(path, R_OK))
>> >> +		return -ENODEV;
>> >> +
>> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +		if (!strcmp(event->name, name))
>> >> +			return event->index;
>> >> +		continue;
>> >> +	}
>> >> +
>> >> +	event = new_event(name);
>> >> +	if (event == NULL)
>> >> +		return -ENOMEM;
>> >> +
>> >> +	event->index = rte_pmu.num_group_events++;
>> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> >> +
>> >> +	return event->index;
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_init(void)
>> >> +{
>> >> +	int ret;
>> >> +
>> >> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
>> >> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> >> +	 * via command line but application doesn't care enough and performs init/fini again.
>> >> +	 */
>> >> +	if (rte_pmu.initialized != 0) {
>> >> +		rte_pmu.initialized++;
>> >> +		return 0;
>> >> +	}
>> >> +
>> >> +	ret = scan_pmus();
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	ret = pmu_arch_init();
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	TAILQ_INIT(&rte_pmu.event_list);
>> >> +	TAILQ_INIT(&rte_pmu.event_group_list);
>> >> +	rte_spinlock_init(&rte_pmu.lock);
>> >> +	rte_pmu.initialized = 1;
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	free(rte_pmu.name);
>> >> +	rte_pmu.name = NULL;
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +void
>> >> +rte_pmu_fini(void)
>> >> +{
>> >> +	struct rte_pmu_event_group *group, *tmp_group;
>> >> +	struct rte_pmu_event *event, *tmp_event;
>> >> +
>> >> +	/* cleanup once init count drops to zero */
>> >> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
>> >> +		return;
>> >> +
>> >> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> >> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> >> +		free_event(event);
>> >> +	}
>> >> +
>> >> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> >> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> >> +		cleanup_events(group);
>> >> +	}
>> >> +
>> >> +	pmu_arch_fini();
>> >> +	free(rte_pmu.name);
>> >> +	rte_pmu.name = NULL;
>> >> +	rte_pmu.num_group_events = 0;
>> >> +}
>> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
>> >> 100644 index 0000000000..6b664c3336
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.h
>> >> @@ -0,0 +1,212 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(c) 2023 Marvell
>> >> + */
>> >> +
>> >> +#ifndef _RTE_PMU_H_
>> >> +#define _RTE_PMU_H_
>> >> +
>> >> +/**
>> >> + * @file
>> >> + *
>> >> + * PMU event tracing operations
>> >> + *
>> >> + * This file defines generic API and types necessary to setup PMU
>> >> +and
>> >> + * read selected counters in runtime.
>> >> + */
>> >> +
>> >> +#ifdef __cplusplus
>> >> +extern "C" {
>> >> +#endif
>> >> +
>> >> +#include <linux/perf_event.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
>> >> +#include <rte_compat.h> #include <rte_spinlock.h>
>> >> +
>> >> +/** Maximum number of events in a group */ #define
>> >> +MAX_NUM_GROUP_EVENTS 8
>> >> +
>> >> +/**
>> >> + * A structure describing a group of events.
>> >> + */
>> >> +struct rte_pmu_event_group {
>> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages
>*/
>> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> +	bool enabled; /**< true if group was enabled on particular lcore */
>> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> >> +__rte_cache_aligned;
>> >> +
>> >> +/**
>> >> + * A structure describing an event.
>> >> + */
>> >> +struct rte_pmu_event {
>> >> +	char *name; /**< name of an event */
>> >> +	unsigned int index; /**< event index into fds/mmap_pages */
>> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> >> +
>> >> +/**
>> >> + * A PMU state container.
>> >> + */
>> >> +struct rte_pmu {
>> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> +	unsigned int num_group_events; /**< number of events in a group */
>> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> +	unsigned int initialized; /**< initialization counter */ };
>> >> +
>> >> +/** lcore event group */
>> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +
>> >> +/** PMU state container */
>> >> +extern struct rte_pmu rte_pmu;
>> >> +
>> >> +/** Each architecture supporting PMU needs to provide its own
>> >> +version */ #ifndef rte_pmu_pmc_read #define
>> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Read PMU counter.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >> + *
>> >> + * @param pc
>> >> + *   Pointer to the mmapped user page.
>> >> + * @return
>> >> + *   Counter value read from hardware.
>> >> + */
>> >> +static __rte_always_inline uint64_t __rte_pmu_read_userpage(struct
>> >> +perf_event_mmap_page *pc) {
>> >> +	uint64_t width, offset;
>> >> +	uint32_t seq, index;
>> >> +	int64_t pmc;
>> >> +
>> >> +	for (;;) {
>> >> +		seq = pc->lock;
>> >> +		rte_compiler_barrier();
>> >
>> >Are you sure that compiler_barrier() is enough here?
>> >On some archs CPU itself has freedom to re-order reads.
>> >Or I am missing something obvious here?
>> >
>>
>> It's a matter of not keeping old stuff cached in registers and making
>> sure that we have two reads of lock. CPU reordering won't do any harm
>> here.
>
>Sorry, I didn't get you here:
>Suppose CPU will re-order reads and will read lock *after* index or offset value.
>Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
>

This number is just an indicator of whether the kernel changed something or not.
If CPU reordering comes into play, it will not change anything from the point of view of this loop.
All we want is fresh data when needed and no compiler involvement when it comes to reordering
the code.

>>
>> >> +		index = pc->index;
>> >> +		offset = pc->offset;
>> >> +		width = pc->pmc_width;
>> >> +
>> >> +		/* index set to 0 means that particular counter cannot be used */
>> >> +		if (likely(pc->cap_user_rdpmc && index)) {
>> >> +			pmc = rte_pmu_pmc_read(index - 1);
>> >> +			pmc <<= 64 - width;
>> >> +			pmc >>= 64 - width;
>> >> +			offset += pmc;
>> >> +		}
>> >> +
>> >> +		rte_compiler_barrier();
>> >> +
>> >> +		if (likely(pc->lock == seq))
>> >> +			return offset;
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Enable group of events on the calling lcore.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >> +__rte_experimental
>> >> +int
>> >> +__rte_pmu_enable_group(void);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Initialize PMU library.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >> +__rte_experimental
>> >> +int
>> >> +rte_pmu_init(void);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
>> >> + */
>> >> +__rte_experimental
>> >> +void
>> >> +rte_pmu_fini(void);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Add event to the group of enabled events.
>> >> + *
>> >> + * @param name
>> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> >> + * @return
>> >> + *   Event index in case of success, negative value otherwise.
>> >> + */
>> >> +__rte_experimental
>> >> +int
>> >> +rte_pmu_add_event(const char *name);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Read hardware counter configured to count occurrences of an event.
>> >> + *
>> >> + * @param index
>> >> + *   Index of an event to be read.
>> >> + * @return
>> >> + *   Event value read from register. In case of errors or lack of support
>> >> + *   0 is returned. In other words, stream of zeros in a trace file
>> >> + *   indicates problem with reading particular PMU event register.
>> >> + */
>> >> +__rte_experimental
>> >> +static __rte_always_inline uint64_t rte_pmu_read(unsigned int
>> >> +index) {
>> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> +	int ret;
>> >> +
>> >> +	if (unlikely(!rte_pmu.initialized))
>> >> +		return 0;
>> >> +
>> >> +	if (unlikely(!group->enabled)) {
>> >> +		ret = __rte_pmu_enable_group();
>> >> +		if (ret)
>> >> +			return 0;
>> >> +	}
>> >> +
>> >> +	if (unlikely(index >= rte_pmu.num_group_events))
>> >> +		return 0;
>> >> +
>> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> >> +}
>> >> +
>> >> +#ifdef __cplusplus
>> >> +}
>> >> +#endif
>> >> +
>> >> +#endif /* _RTE_PMU_H_ */
>> >> diff --git a/lib/pmu/version.map b/lib/pmu/version.map new file
>> >> mode
>> >> 100644 index 0000000000..39a4f279c1
>> >> --- /dev/null
>> >> +++ b/lib/pmu/version.map
>> >> @@ -0,0 +1,15 @@
>> >> +DPDK_23 {
>> >> +	local: *;
>> >> +};
>> >> +
>> >> +EXPERIMENTAL {
>> >> +	global:
>> >> +
>> >> +	__rte_pmu_enable_group;
>> >> +	per_lcore__event_group;
>> >> +	rte_pmu;
>> >> +	rte_pmu_add_event;
>> >> +	rte_pmu_fini;
>> >> +	rte_pmu_init;
>> >> +	rte_pmu_read;
>> >> +};
  
Konstantin Ananyev Feb. 20, 2023, 2:31 p.m. UTC | #5
> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
> >> >> 100644 index 0000000000..6b664c3336
> >> >> --- /dev/null
> >> >> +++ b/lib/pmu/rte_pmu.h
> >> >> @@ -0,0 +1,212 @@
> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> >> + * Copyright(c) 2023 Marvell
> >> >> + */
> >> >> +
> >> >> +#ifndef _RTE_PMU_H_
> >> >> +#define _RTE_PMU_H_
> >> >> +
> >> >> +/**
> >> >> + * @file
> >> >> + *
> >> >> + * PMU event tracing operations
> >> >> + *
> >> >> + * This file defines generic API and types necessary to setup PMU
> >> >> +and
> >> >> + * read selected counters in runtime.
> >> >> + */
> >> >> +
> >> >> +#ifdef __cplusplus
> >> >> +extern "C" {
> >> >> +#endif
> >> >> +
> >> >> +#include <linux/perf_event.h>
> >> >> +
> >> >> +#include <rte_atomic.h>
> >> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
> >> >> +#include <rte_compat.h> #include <rte_spinlock.h>
> >> >> +
> >> >> +/** Maximum number of events in a group */ #define
> >> >> +MAX_NUM_GROUP_EVENTS 8
> >> >> +
> >> >> +/**
> >> >> + * A structure describing a group of events.
> >> >> + */
> >> >> +struct rte_pmu_event_group {
> >> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages
> >*/
> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
> >> >> +__rte_cache_aligned;
> >> >> +
> >> >> +/**
> >> >> + * A structure describing an event.
> >> >> + */
> >> >> +struct rte_pmu_event {
> >> >> +	char *name; /**< name of an event */
> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
> >> >> +
> >> >> +/**
> >> >> + * A PMU state container.
> >> >> + */
> >> >> +struct rte_pmu {
> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> >> +	unsigned int initialized; /**< initialization counter */ };
> >> >> +
> >> >> +/** lcore event group */
> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> >> +
> >> >> +/** PMU state container */
> >> >> +extern struct rte_pmu rte_pmu;
> >> >> +
> >> >> +/** Each architecture supporting PMU needs to provide its own
> >> >> +version */ #ifndef rte_pmu_pmc_read #define
> >> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Read PMU counter.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >> + *
> >> >> + * @param pc
> >> >> + *   Pointer to the mmapped user page.
> >> >> + * @return
> >> >> + *   Counter value read from hardware.
> >> >> + */
> >> >> +static __rte_always_inline uint64_t __rte_pmu_read_userpage(struct
> >> >> +perf_event_mmap_page *pc) {
> >> >> +	uint64_t width, offset;
> >> >> +	uint32_t seq, index;
> >> >> +	int64_t pmc;
> >> >> +
> >> >> +	for (;;) {
> >> >> +		seq = pc->lock;
> >> >> +		rte_compiler_barrier();
> >> >
> >> >Are you sure that compiler_barrier() is enough here?
> >> >On some archs CPU itself has freedom to re-order reads.
> >> >Or I am missing something obvious here?
> >> >
> >>
> >> It's a matter of not keeping old stuff cached in registers and making
> >> sure that we have two reads of lock. CPU reordering won't do any harm
> >> here.
> >
> >Sorry, I didn't get you here:
> >Suppose CPU will re-order reads and will read lock *after* index or offset value.
> >Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
> >
> 
> This number is just an indicator whether kernel did change something or not.
 
You are talking about pc->lock, right?
Yes, I do understand that it is a sort of seqlock.
That's why I am puzzled as to why we do not care about possible CPU read reordering.
The manual for perf_event_open() also has a code snippet with a compiler barrier only...

> If cpu reordering will come into play then this will not change anything from pov of this loop.
> All we want is fresh data when needed and no involvement of compiler when it comes to reordering
> code.

Ok, can you perhaps explain to me why the following could not happen:
T0:
pc->seqlock==0; pc->index==I1; pc->offset==O1;
T1:
      cpu #0 reads pmu (due to CPU read reordering, we get the index value before the seqlock):
       index=pc->index;  //index==I1;
 T2:
      cpu #1 kernel perf_event_update_userpage:
      pc->lock++; // pc->lock==1
      pc->index=I2;
      pc->offset=O2;
      ...
      pc->lock++; //pc->lock==2
T3:
      cpu #0 continue with read pmu:
      seq=pc->lock; //seq == 2
       offset=pc->offset; // offset == O2
       ....
       pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
       offset += pmc; //offset == O2 + pmcread(I1-1);       
       if (pc->lock == seq) // they are equal, return
             return offset;
  
Or, it can happen, but for some reason we don't care much?     

> >>
> >> >> +		index = pc->index;
> >> >> +		offset = pc->offset;
> >> >> +		width = pc->pmc_width;
> >> >> +
> >> >> +		/* index set to 0 means that particular counter cannot be used */
> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> >> +			pmc <<= 64 - width;
> >> >> +			pmc >>= 64 - width;
> >> >> +			offset += pmc;
> >> >> +		}
> >> >> +
> >> >> +		rte_compiler_barrier();
> >> >> +
> >> >> +		if (likely(pc->lock == seq))
> >> >> +			return offset;
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Enable group of events on the calling lcore.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +__rte_pmu_enable_group(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Initialize PMU library.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +rte_pmu_init(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +void
> >> >> +rte_pmu_fini(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Add event to the group of enabled events.
> >> >> + *
> >> >> + * @param name
> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> >> >> + * @return
> >> >> + *   Event index in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +rte_pmu_add_event(const char *name);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Read hardware counter configured to count occurrences of an event.
> >> >> + *
> >> >> + * @param index
> >> >> + *   Index of an event to be read.
> >> >> + * @return
> >> >> + *   Event value read from register. In case of errors or lack of support
> >> >> + *   0 is returned. In other words, stream of zeros in a trace file
> >> >> + *   indicates problem with reading particular PMU event register.
> >> >> + */

Another question - do we really need to have __rte_pmu_read_userpage()
and rte_pmu_read() as static inline functions in a public header?
As I understand, because of that we also have to make the 'struct rte_pmu_*'
definitions public.

> >> >> +__rte_experimental
> >> >> +static __rte_always_inline uint64_t rte_pmu_read(unsigned int
> >> >> +index) {
> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> >> +	int ret;
> >> >> +
> >> >> +	if (unlikely(!rte_pmu.initialized))
> >> >> +		return 0;
> >> >> +
> >> >> +	if (unlikely(!group->enabled)) {
> >> >> +		ret = __rte_pmu_enable_group();
> >> >> +		if (ret)
> >> >> +			return 0;
> >> >> +	}
> >> >> +
> >> >> +	if (unlikely(index >= rte_pmu.num_group_events))
> >> >> +		return 0;
> >> >> +
> >> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> >> >> +}
> >> >> +
> >> >> +#ifdef __cplusplus
> >> >> +}
> >> >> +#endif
> >> >> +
  
Tomasz Duszynski Feb. 20, 2023, 4:59 p.m. UTC | #6
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>Sent: Monday, February 20, 2023 3:31 PM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>;
>dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
>> >> >> 100644 index 0000000000..6b664c3336
>> >> >> --- /dev/null
>> >> >> +++ b/lib/pmu/rte_pmu.h
>> >> >> @@ -0,0 +1,212 @@
>> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> >> + * Copyright(c) 2023 Marvell
>> >> >> + */
>> >> >> +
>> >> >> +#ifndef _RTE_PMU_H_
>> >> >> +#define _RTE_PMU_H_
>> >> >> +
>> >> >> +/**
>> >> >> + * @file
>> >> >> + *
>> >> >> + * PMU event tracing operations
>> >> >> + *
>> >> >> + * This file defines generic API and types necessary to setup
>> >> >> +PMU and
>> >> >> + * read selected counters in runtime.
>> >> >> + */
>> >> >> +
>> >> >> +#ifdef __cplusplus
>> >> >> +extern "C" {
>> >> >> +#endif
>> >> >> +
>> >> >> +#include <linux/perf_event.h>
>> >> >> +
>> >> >> +#include <rte_atomic.h>
>> >> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
>> >> >> +#include <rte_compat.h> #include <rte_spinlock.h>
>> >> >> +
>> >> >> +/** Maximum number of events in a group */ #define
>> >> >> +MAX_NUM_GROUP_EVENTS 8
>> >> >> +
>> >> >> +/**
>> >> >> + * A structure describing a group of events.
>> >> >> + */
>> >> >> +struct rte_pmu_event_group {
>> >> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
>> >> >> +/**< array of user pages
>> >*/
>> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
>> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> >> >> +__rte_cache_aligned;
>> >> >> +
>> >> >> +/**
>> >> >> + * A structure describing an event.
>> >> >> + */
>> >> >> +struct rte_pmu_event {
>> >> >> +	char *name; /**< name of an event */
>> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
>> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> >> >> +
>> >> >> +/**
>> >> >> + * A PMU state container.
>> >> >> + */
>> >> >> +struct rte_pmu {
>> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> >> +	unsigned int num_group_events; /**< number of events in a group */
>> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> >> +	unsigned int initialized; /**< initialization counter */ };
>> >> >> +
>> >> >> +/** lcore event group */
>> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
>> >> >> +_event_group);
>> >> >> +
>> >> >> +/** PMU state container */
>> >> >> +extern struct rte_pmu rte_pmu;
>> >> >> +
>> >> >> +/** Each architecture supporting PMU needs to provide its own
>> >> >> +version */ #ifndef rte_pmu_pmc_read #define
>> >> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Read PMU counter.
>> >> >> + *
>> >> >> + * @warning This should be not called directly.
>> >> >> + *
>> >> >> + * @param pc
>> >> >> + *   Pointer to the mmapped user page.
>> >> >> + * @return
>> >> >> + *   Counter value read from hardware.
>> >> >> + */
>> >> >> +static __rte_always_inline uint64_t
>> >> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>> >> >> +	uint64_t width, offset;
>> >> >> +	uint32_t seq, index;
>> >> >> +	int64_t pmc;
>> >> >> +
>> >> >> +	for (;;) {
>> >> >> +		seq = pc->lock;
>> >> >> +		rte_compiler_barrier();
>> >> >
>> >> >Are you sure that compiler_barrier() is enough here?
>> >> >On some archs CPU itself has freedom to re-order reads.
>> >> >Or I am missing something obvious here?
>> >> >
>> >>
>> >> It's a matter of not keeping old stuff cached in registers and
>> >> making sure that we have two reads of lock. CPU reordering won't do
>> >> any harm here.
>> >
>> >Sorry, I didn't get you here:
>> >Suppose CPU will re-order reads and will read lock *after* index or offset value.
>> >Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
>> >
>>
>> This number is just an indicator whether kernel did change something or not.
>
>You are talking about pc->lock, right?
>Yes, I do understand that it is sort of seqlock.
>That's why I am puzzled why we do not care about possible cpu read-reordering.
>Manual for perf_event_open() also has a code snippet with compiler barrier only...
>
>> If cpu reordering will come into play then this will not change anything from pov of this loop.
>> All we want is fresh data when needed and no involvement of compiler
>> when it comes to reordering code.
>
>Ok, can you probably explain to me why the following could not happen:
>T0:
>pc->seqlock==0; pc->index==I1; pc->offset==O1;
>T1:
>      cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
>       index=pc->index;  //index==I1;
> T2:
>      cpu #1 kernel vent_update_userpage:
>      pc->lock++; // pc->lock==1
>      pc->index=I2;
>      pc->offset=O2;
>      ...
>      pc->lock++; //pc->lock==2
>T3:
>      cpu #0 continue with read pmu:
>      seq=pc->lock; //seq == 2
>       offset=pc->offset; // offset == O2
>       ....
>       pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
>       offset += pmc; //offset == O2 + pmcread(I1-1);
>       if (pc->lock == seq) // they are equal, return
>             return offset;
>
>Or, it can happen, but by some reason we don't care much?
>

This code does self-monitoring, and the user page (the whole group, actually) is per thread, running on the
current CPU. Hence I am not sure what you are trying to prove with that example.  

>> >>
>> >> >> +		index = pc->index;
>> >> >> +		offset = pc->offset;
>> >> >> +		width = pc->pmc_width;
>> >> >> +
>> >> >> +		/* index set to 0 means that particular counter cannot be used */
>> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
>> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
>> >> >> +			pmc <<= 64 - width;
>> >> >> +			pmc >>= 64 - width;
>> >> >> +			offset += pmc;
>> >> >> +		}
>> >> >> +
>> >> >> +		rte_compiler_barrier();
>> >> >> +
>> >> >> +		if (likely(pc->lock == seq))
>> >> >> +			return offset;
>> >> >> +	}
>> >> >> +
>> >> >> +	return 0;
>> >> >> +}
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Enable group of events on the calling lcore.
>> >> >> + *
>> >> >> + * @warning This should be not called directly.
>> >> >> + *
>> >> >> + * @return
>> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +__rte_pmu_enable_group(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Initialize PMU library.
>> >> >> + *
>> >> >> + * @warning This should be not called directly.
>> >> >> + *
>> >> >> + * @return
>> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +rte_pmu_init(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Finalize PMU library. This should be called after PMU counters are no longer being
>read.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +void
>> >> >> +rte_pmu_fini(void);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Add event to the group of enabled events.
>> >> >> + *
>> >> >> + * @param name
>> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> >> >> + * @return
>> >> >> + *   Event index in case of success, negative value otherwise.
>> >> >> + */
>> >> >> +__rte_experimental
>> >> >> +int
>> >> >> +rte_pmu_add_event(const char *name);
>> >> >> +
>> >> >> +/**
>> >> >> + * @warning
>> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> + *
>> >> >> + * Read hardware counter configured to count occurrences of an event.
>> >> >> + *
>> >> >> + * @param index
>> >> >> + *   Index of an event to be read.
>> >> >> + * @return
>> >> >> + *   Event value read from register. In case of errors or lack of support
>> >> >> + *   0 is returned. In other words, stream of zeros in a trace file
>> >> >> + *   indicates problem with reading particular PMU event register.
>> >> >> + */
>
>Another question - do we really need  to have __rte_pmu_read_userpage() and rte_pmu_read() as
>static inline functions in public header?
>As I understand, because of that we also have to make 'struct rte_pmu_*'
>definitions also public.
>
>> >> >> +__rte_experimental
>> >> >> +static __rte_always_inline uint64_t rte_pmu_read(unsigned int
>> >> >> +index) {
>> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> >> +	int ret;
>> >> >> +
>> >> >> +	if (unlikely(!rte_pmu.initialized))
>> >> >> +		return 0;
>> >> >> +
>> >> >> +	if (unlikely(!group->enabled)) {
>> >> >> +		ret = __rte_pmu_enable_group();
>> >> >> +		if (ret)
>> >> >> +			return 0;
>> >> >> +	}
>> >> >> +
>> >> >> +	if (unlikely(index >= rte_pmu.num_group_events))
>> >> >> +		return 0;
>> >> >> +
>> >> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> >> >> +}
>> >> >> +
>> >> >> +#ifdef __cplusplus
>> >> >> +}
>> >> >> +#endif
>> >> >> +
  
Konstantin Ananyev Feb. 20, 2023, 5:21 p.m. UTC | #7
> >> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
> >> >> >> 100644 index 0000000000..6b664c3336
> >> >> >> --- /dev/null
> >> >> >> +++ b/lib/pmu/rte_pmu.h
> >> >> >> @@ -0,0 +1,212 @@
> >> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> >> >> + * Copyright(c) 2023 Marvell
> >> >> >> + */
> >> >> >> +
> >> >> >> +#ifndef _RTE_PMU_H_
> >> >> >> +#define _RTE_PMU_H_
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @file
> >> >> >> + *
> >> >> >> + * PMU event tracing operations
> >> >> >> + *
> >> >> >> + * This file defines generic API and types necessary to setup
> >> >> >> +PMU and
> >> >> >> + * read selected counters in runtime.
> >> >> >> + */
> >> >> >> +
> >> >> >> +#ifdef __cplusplus
> >> >> >> +extern "C" {
> >> >> >> +#endif
> >> >> >> +
> >> >> >> +#include <linux/perf_event.h>
> >> >> >> +
> >> >> >> +#include <rte_atomic.h>
> >> >> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
> >> >> >> +#include <rte_compat.h> #include <rte_spinlock.h>
> >> >> >> +
> >> >> >> +/** Maximum number of events in a group */ #define
> >> >> >> +MAX_NUM_GROUP_EVENTS 8
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * A structure describing a group of events.
> >> >> >> + */
> >> >> >> +struct rte_pmu_event_group {
> >> >> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
> >> >> >> +/**< array of user pages
> >> >*/
> >> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
> >> >> >> +__rte_cache_aligned;
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * A structure describing an event.
> >> >> >> + */
> >> >> >> +struct rte_pmu_event {
> >> >> >> +	char *name; /**< name of an event */
> >> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * A PMU state container.
> >> >> >> + */
> >> >> >> +struct rte_pmu {
> >> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> >> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> >> >> +	unsigned int initialized; /**< initialization counter */ };
> >> >> >> +
> >> >> >> +/** lcore event group */
> >> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
> >> >> >> +_event_group);
> >> >> >> +
> >> >> >> +/** PMU state container */
> >> >> >> +extern struct rte_pmu rte_pmu;
> >> >> >> +
> >> >> >> +/** Each architecture supporting PMU needs to provide its own
> >> >> >> +version */ #ifndef rte_pmu_pmc_read #define
> >> >> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Read PMU counter.
> >> >> >> + *
> >> >> >> + * @warning This should be not called directly.
> >> >> >> + *
> >> >> >> + * @param pc
> >> >> >> + *   Pointer to the mmapped user page.
> >> >> >> + * @return
> >> >> >> + *   Counter value read from hardware.
> >> >> >> + */
> >> >> >> +static __rte_always_inline uint64_t
> >> >> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
> >> >> >> +	uint64_t width, offset;
> >> >> >> +	uint32_t seq, index;
> >> >> >> +	int64_t pmc;
> >> >> >> +
> >> >> >> +	for (;;) {
> >> >> >> +		seq = pc->lock;
> >> >> >> +		rte_compiler_barrier();
> >> >> >
> >> >> >Are you sure that compiler_barrier() is enough here?
> >> >> >On some archs CPU itself has freedom to re-order reads.
> >> >> >Or I am missing something obvious here?
> >> >> >
> >> >>
> >> >> It's a matter of not keeping old stuff cached in registers and
> >> >> making sure that we have two reads of lock. CPU reordering won't do
> >> >> any harm here.
> >> >
> >> >Sorry, I didn't get you here:
> >> >Suppose CPU will re-order reads and will read lock *after* index or offset value.
> >> >Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
> >> >
> >>
> >> This number is just an indicator whether kernel did change something or not.
> >
> >You are talking about pc->lock, right?
> >Yes, I do understand that it is sort of seqlock.
> >That's why I am puzzled why we do not care about possible cpu read-reordering.
> >Manual for perf_event_open() also has a code snippet with compiler barrier only...
> >
> >> If cpu reordering will come into play then this will not change anything from pov of this loop.
> >> All we want is fresh data when needed and no involvement of compiler
> >> when it comes to reordering code.
> >
> >Ok, can you probably explain to me why the following could not happen:
> >T0:
> >pc->seqlock==0; pc->index==I1; pc->offset==O1;
> >T1:
> >      cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
> >       index=pc->index;  //index==I1;
> > T2:
> >      cpu #1 kernel vent_update_userpage:
> >      pc->lock++; // pc->lock==1
> >      pc->index=I2;
> >      pc->offset=O2;
> >      ...
> >      pc->lock++; //pc->lock==2
> >T3:
> >      cpu #0 continue with read pmu:
> >      seq=pc->lock; //seq == 2
> >       offset=pc->offset; // offset == O2
> >       ....
> >       pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
> >       offset += pmc; //offset == O2 + pmcread(I1-1);
> >       if (pc->lock == seq) // they are equal, return
> >             return offset;
> >
> >Or, it can happen, but by some reason we don't care much?
> >
> 
> This code does self-monitoring and user page (whole group actually) is per thread running on
> current cpu. Hence I am not sure what are you trying to prove with that example.

I am not trying to prove anything so far.
I am asking whether such a situation is possible or not, and if not, why.
My current understanding (possibly wrong) is that after you have mmapped these pages,
the kernel can still asynchronously update them.
So, when reading the data from these pages you have to check the 'lock' value before and
after accessing the other data.
If so, why does possible CPU read-reordering not matter?    

Also, there was another question below, which you probably missed, so I copied it here:
Another question - do we really need to have __rte_pmu_read_userpage() and rte_pmu_read() as
static inline functions in a public header?
As I understand, because of that we also have to make the 'struct rte_pmu_*'
definitions public.

> 
> >> >>
> >> >> >> +		index = pc->index;
> >> >> >> +		offset = pc->offset;
> >> >> >> +		width = pc->pmc_width;
> >> >> >> +
> >> >> >> +		/* index set to 0 means that particular counter cannot be used */
> >> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> >> >> +			pmc <<= 64 - width;
> >> >> >> +			pmc >>= 64 - width;
> >> >> >> +			offset += pmc;
> >> >> >> +		}
> >> >> >> +
> >> >> >> +		rte_compiler_barrier();
> >> >> >> +
> >> >> >> +		if (likely(pc->lock == seq))
> >> >> >> +			return offset;
> >> >> >> +	}
> >> >> >> +
> >> >> >> +	return 0;
> >> >> >> +}
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Enable group of events on the calling lcore.
> >> >> >> + *
> >> >> >> + * @warning This should be not called directly.
> >> >> >> + *
> >> >> >> + * @return
> >> >> >> + *   0 in case of success, negative value otherwise.
> >> >> >> + */
> >> >> >> +__rte_experimental
> >> >> >> +int
> >> >> >> +__rte_pmu_enable_group(void);
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Initialize PMU library.
> >> >> >> + *
> >> >> >> + * @warning This should be not called directly.
> >> >> >> + *
> >> >> >> + * @return
> >> >> >> + *   0 in case of success, negative value otherwise.
> >> >> >> + */
> >> >> >> +__rte_experimental
> >> >> >> +int
> >> >> >> +rte_pmu_init(void);
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Finalize PMU library. This should be called after PMU counters are no longer being
> >read.
> >> >> >> + */
> >> >> >> +__rte_experimental
> >> >> >> +void
> >> >> >> +rte_pmu_fini(void);
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Add event to the group of enabled events.
> >> >> >> + *
> >> >> >> + * @param name
> >> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> >> >> >> + * @return
> >> >> >> + *   Event index in case of success, negative value otherwise.
> >> >> >> + */
> >> >> >> +__rte_experimental
> >> >> >> +int
> >> >> >> +rte_pmu_add_event(const char *name);
> >> >> >> +
> >> >> >> +/**
> >> >> >> + * @warning
> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> >> + *
> >> >> >> + * Read hardware counter configured to count occurrences of an event.
> >> >> >> + *
> >> >> >> + * @param index
> >> >> >> + *   Index of an event to be read.
> >> >> >> + * @return
> >> >> >> + *   Event value read from register. In case of errors or lack of support
> >> >> >> + *   0 is returned. In other words, stream of zeros in a trace file
> >> >> >> + *   indicates problem with reading particular PMU event register.
> >> >> >> + */
> >
> >Another question - do we really need  to have __rte_pmu_read_userpage() and rte_pmu_read() as
> >static inline functions in public header?
> >As I understand, because of that we also have to make 'struct rte_pmu_*'
> >definitions also public.
> >
> >> >> >> +__rte_experimental
> >> >> >> +static __rte_always_inline uint64_t rte_pmu_read(unsigned int
> >> >> >> +index) {
> >> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> >> >> +	int ret;
> >> >> >> +
> >> >> >> +	if (unlikely(!rte_pmu.initialized))
> >> >> >> +		return 0;
> >> >> >> +
> >> >> >> +	if (unlikely(!group->enabled)) {
> >> >> >> +		ret = __rte_pmu_enable_group();
> >> >> >> +		if (ret)
> >> >> >> +			return 0;
> >> >> >> +	}
> >> >> >> +
> >> >> >> +	if (unlikely(index >= rte_pmu.num_group_events))
> >> >> >> +		return 0;
> >> >> >> +
> >> >> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> >> >> >> +}
> >> >> >> +
> >> >> >> +#ifdef __cplusplus
> >> >> >> +}
> >> >> >> +#endif
> >> >> >> +
  
Tomasz Duszynski Feb. 20, 2023, 8:42 p.m. UTC | #8
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>Sent: Monday, February 20, 2023 6:21 PM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>;
>dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>
>> >> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file
>> >> >> >> mode
>> >> >> >> 100644 index 0000000000..6b664c3336
>> >> >> >> --- /dev/null
>> >> >> >> +++ b/lib/pmu/rte_pmu.h
>> >> >> >> @@ -0,0 +1,212 @@
>> >> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> >> >> + * Copyright(c) 2023 Marvell  */
>> >> >> >> +
>> >> >> >> +#ifndef _RTE_PMU_H_
>> >> >> >> +#define _RTE_PMU_H_
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @file
>> >> >> >> + *
>> >> >> >> + * PMU event tracing operations
>> >> >> >> + *
>> >> >> >> + * This file defines generic API and types necessary to
>> >> >> >> +setup PMU and
>> >> >> >> + * read selected counters in runtime.
>> >> >> >> + */
>> >> >> >> +
>> >> >> >> +#ifdef __cplusplus
>> >> >> >> +extern "C" {
>> >> >> >> +#endif
>> >> >> >> +
>> >> >> >> +#include <linux/perf_event.h>
>> >> >> >> +
>> >> >> >> +#include <rte_atomic.h>
>> >> >> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
>> >> >> >> +#include <rte_compat.h> #include <rte_spinlock.h>
>> >> >> >> +
>> >> >> >> +/** Maximum number of events in a group */ #define
>> >> >> >> +MAX_NUM_GROUP_EVENTS 8
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * A structure describing a group of events.
>> >> >> >> + */
>> >> >> >> +struct rte_pmu_event_group {
>> >> >> >> +	struct perf_event_mmap_page
>> >> >> >> +*mmap_pages[MAX_NUM_GROUP_EVENTS];
>> >> >> >> +/**< array of user pages
>> >> >*/
>> >> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
>> >> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> >> >> >> +__rte_cache_aligned;
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * A structure describing an event.
>> >> >> >> + */
>> >> >> >> +struct rte_pmu_event {
>> >> >> >> +	char *name; /**< name of an event */
>> >> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
>> >> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * A PMU state container.
>> >> >> >> + */
>> >> >> >> +struct rte_pmu {
>> >> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> >> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> >> >> +	unsigned int num_group_events; /**< number of events in a group */
>> >> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> >> >> +	unsigned int initialized; /**< initialization counter */ };
>> >> >> >> +
>> >> >> >> +/** lcore event group */
>> >> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
>> >> >> >> +_event_group);
>> >> >> >> +
>> >> >> >> +/** PMU state container */
>> >> >> >> +extern struct rte_pmu rte_pmu;
>> >> >> >> +
>> >> >> >> +/** Each architecture supporting PMU needs to provide its
>> >> >> >> +own version */ #ifndef rte_pmu_pmc_read #define
>> >> >> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Read PMU counter.
>> >> >> >> + *
>> >> >> >> + * @warning This should be not called directly.
>> >> >> >> + *
>> >> >> >> + * @param pc
>> >> >> >> + *   Pointer to the mmapped user page.
>> >> >> >> + * @return
>> >> >> >> + *   Counter value read from hardware.
>> >> >> >> + */
>> >> >> >> +static __rte_always_inline uint64_t
>> >> >> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>> >> >> >> +	uint64_t width, offset;
>> >> >> >> +	uint32_t seq, index;
>> >> >> >> +	int64_t pmc;
>> >> >> >> +
>> >> >> >> +	for (;;) {
>> >> >> >> +		seq = pc->lock;
>> >> >> >> +		rte_compiler_barrier();
>> >> >> >
>> >> >> >Are you sure that compiler_barrier() is enough here?
>> >> >> >On some archs CPU itself has freedom to re-order reads.
>> >> >> >Or I am missing something obvious here?
>> >> >> >
>> >> >>
>> >> >> It's a matter of not keeping old stuff cached in registers and
>> >> >> making sure that we have two reads of lock. CPU reordering won't
>> >> >> do any harm here.
>> >> >
>> >> >Sorry, I didn't get you here:
>> >> >Suppose CPU will re-order reads and will read lock *after* index or offset value.
>> >> >Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
>> >> >
>> >>
>> >> This number is just an indicator whether kernel did change something or not.
>> >
>> >You are talking about pc->lock, right?
>> >Yes, I do understand that it is sort of seqlock.
>> >That's why I am puzzled why we do not care about possible cpu read-reordering.
>> >Manual for perf_event_open() also has a code snippet with compiler barrier only...
>> >
>> >> If cpu reordering will come into play then this will not change anything from pov of this
>loop.
>> >> All we want is fresh data when needed and no involvement of
>> >> compiler when it comes to reordering code.
>> >
>> >Ok, can you probably explain to me why the following could not happen:
>> >T0:
>> >pc->seqlock==0; pc->index==I1; pc->offset==O1;
>> >T1:
>> >      cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
>> >       index=pc->index;  //index==I1;
>> > T2:
>> >      cpu #1 kernel vent_update_userpage:
>> >      pc->lock++; // pc->lock==1
>> >      pc->index=I2;
>> >      pc->offset=O2;
>> >      ...
>> >      pc->lock++; //pc->lock==2
>> >T3:
>> >      cpu #0 continue with read pmu:
>> >      seq=pc->lock; //seq == 2
>> >       offset=pc->offset; // offset == O2
>> >       ....
>> >       pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
>> >       offset += pmc; //offset == O2 + pmcread(I1-1);
>> >       if (pc->lock == seq) // they are equal, return
>> >             return offset;
>> >
>> >Or, it can happen, but by some reason we don't care much?
>> >
>>
>> This code does self-monitoring and user page (whole group actually) is
>> per thread running on current cpu. Hence I am not sure what are you trying to prove with that
>example.
>
>I am not trying to prove anything so far.
>I am asking is such situation possible or not, and if not, why?
>My current understanding (possibly wrong) is that after you mmaped these pages, kernel still can
>asynchronously update them.
>So, when reading the data from these pages you have to check 'lock' value before and after
>accessing other data.
>If so, why possible cpu read-reordering doesn't matter?
>

Look. I'll reiterate that.

1. That user page/group/PMU config is per process. Other processes do not access it.
   All this happens on the very same CPU where the current thread is running.
2. Suppose you've already read seq. Now, for some reason, the kernel updates the data in the page seq was read from. 
3. The kernel will enter a critical section during the update. seq changes along with the other data without the app knowing about it. 
   If you want the nitty-gritty details, consult the kernel sources. 
4. The app resumes and has some stale data but *WILL* read the new seq. The code loops again because the values do not match.  
5. Otherwise, the seq values match and the data is valid. 

>Also there was another question below, which you probably  missed, so I copied it here:
>Another question - do we really need  to have __rte_pmu_read_userpage() and rte_pmu_read() as
>static inline functions in public header?
>As I understand, because of that we also have to make 'struct rte_pmu_*'
>definitions also public.
>

These functions need to be inlined; otherwise, performance takes a hit. 

>>
>> >> >>
>> >> >> >> +		index = pc->index;
>> >> >> >> +		offset = pc->offset;
>> >> >> >> +		width = pc->pmc_width;
>> >> >> >> +
>> >> >> >> +		/* index set to 0 means that particular counter cannot be used */
>> >> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
>> >> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
>> >> >> >> +			pmc <<= 64 - width;
>> >> >> >> +			pmc >>= 64 - width;
>> >> >> >> +			offset += pmc;
>> >> >> >> +		}
>> >> >> >> +
>> >> >> >> +		rte_compiler_barrier();
>> >> >> >> +
>> >> >> >> +		if (likely(pc->lock == seq))
>> >> >> >> +			return offset;
>> >> >> >> +	}
>> >> >> >> +
>> >> >> >> +	return 0;
>> >> >> >> +}
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Enable group of events on the calling lcore.
>> >> >> >> + *
>> >> >> >> + * @warning This should be not called directly.
>> >> >> >> + *
>> >> >> >> + * @return
>> >> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> >> + */
>> >> >> >> +__rte_experimental
>> >> >> >> +int
>> >> >> >> +__rte_pmu_enable_group(void);
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Initialize PMU library.
>> >> >> >> + *
>> >> >> >> + * @warning This should be not called directly.
>> >> >> >> + *
>> >> >> >> + * @return
>> >> >> >> + *   0 in case of success, negative value otherwise.
>> >> >> >> + */
>> >> >> >> +__rte_experimental
>> >> >> >> +int
>> >> >> >> +rte_pmu_init(void);
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Finalize PMU library. This should be called after PMU
>> >> >> >> +counters are no longer being
>> >read.
>> >> >> >> + */
>> >> >> >> +__rte_experimental
>> >> >> >> +void
>> >> >> >> +rte_pmu_fini(void);
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Add event to the group of enabled events.
>> >> >> >> + *
>> >> >> >> + * @param name
>> >> >> >> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> >> >> >> + * @return
>> >> >> >> + *   Event index in case of success, negative value otherwise.
>> >> >> >> + */
>> >> >> >> +__rte_experimental
>> >> >> >> +int
>> >> >> >> +rte_pmu_add_event(const char *name);
>> >> >> >> +
>> >> >> >> +/**
>> >> >> >> + * @warning
>> >> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> >> >> + *
>> >> >> >> + * Read hardware counter configured to count occurrences of an event.
>> >> >> >> + *
>> >> >> >> + * @param index
>> >> >> >> + *   Index of an event to be read.
>> >> >> >> + * @return
>> >> >> >> + *   Event value read from register. In case of errors or lack of support
>> >> >> >> + *   0 is returned. In other words, stream of zeros in a trace file
>> >> >> >> + *   indicates problem with reading particular PMU event register.
>> >> >> >> + */
>> >
>> >Another question - do we really need  to have
>> >__rte_pmu_read_userpage() and rte_pmu_read() as static inline functions in public header?
>> >As I understand, because of that we also have to make 'struct rte_pmu_*'
>> >definitions also public.
>> >
>> >> >> >> +__rte_experimental
>> >> >> >> +static __rte_always_inline uint64_t rte_pmu_read(unsigned
>> >> >> >> +int
>> >> >> >> +index) {
>> >> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> >> >> +	int ret;
>> >> >> >> +
>> >> >> >> +	if (unlikely(!rte_pmu.initialized))
>> >> >> >> +		return 0;
>> >> >> >> +
>> >> >> >> +	if (unlikely(!group->enabled)) {
>> >> >> >> +		ret = __rte_pmu_enable_group();
>> >> >> >> +		if (ret)
>> >> >> >> +			return 0;
>> >> >> >> +	}
>> >> >> >> +
>> >> >> >> +	if (unlikely(index >= rte_pmu.num_group_events))
>> >> >> >> +		return 0;
>> >> >> >> +
>> >> >> >> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> >> >> >> +}
>> >> >> >> +
>> >> >> >> +#ifdef __cplusplus
>> >> >> >> +}
>> >> >> >> +#endif
>> >> >> >> +
  
Konstantin Ananyev Feb. 21, 2023, 12:48 a.m. UTC | #9
>>>>>>>>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file
>>>>>>>>> mode
>>>>>>>>> 100644 index 0000000000..6b664c3336
>>>>>>>>> --- /dev/null
>>>>>>>>> +++ b/lib/pmu/rte_pmu.h
>>>>>>>>> @@ -0,0 +1,212 @@
>>>>>>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>>>>>>> + * Copyright(c) 2023 Marvell  */
>>>>>>>>> +
>>>>>>>>> +#ifndef _RTE_PMU_H_
>>>>>>>>> +#define _RTE_PMU_H_
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @file
>>>>>>>>> + *
>>>>>>>>> + * PMU event tracing operations
>>>>>>>>> + *
>>>>>>>>> + * This file defines generic API and types necessary to
>>>>>>>>> +setup PMU and
>>>>>>>>> + * read selected counters in runtime.
>>>>>>>>> + */
>>>>>>>>> +
>>>>>>>>> +#ifdef __cplusplus
>>>>>>>>> +extern "C" {
>>>>>>>>> +#endif
>>>>>>>>> +
>>>>>>>>> +#include <linux/perf_event.h>
>>>>>>>>> +
>>>>>>>>> +#include <rte_atomic.h>
>>>>>>>>> +#include <rte_branch_prediction.h> #include <rte_common.h>
>>>>>>>>> +#include <rte_compat.h> #include <rte_spinlock.h>
>>>>>>>>> +
>>>>>>>>> +/** Maximum number of events in a group */ #define
>>>>>>>>> +MAX_NUM_GROUP_EVENTS 8
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * A structure describing a group of events.
>>>>>>>>> + */
>>>>>>>>> +struct rte_pmu_event_group {
>>>>>>>>> +	struct perf_event_mmap_page
>>>>>>>>> +*mmap_pages[MAX_NUM_GROUP_EVENTS];
>>>>>>>>> +/**< array of user pages
>>>>>> */
>>>>>>>>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>>>>>>>>> +	bool enabled; /**< true if group was enabled on particular lcore */
>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>>>>>>>>> +__rte_cache_aligned;
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * A structure describing an event.
>>>>>>>>> + */
>>>>>>>>> +struct rte_pmu_event {
>>>>>>>>> +	char *name; /**< name of an event */
>>>>>>>>> +	unsigned int index; /**< event index into fds/mmap_pages */
>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * A PMU state container.
>>>>>>>>> + */
>>>>>>>>> +struct rte_pmu {
>>>>>>>>> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>>>>>>>>> +	rte_spinlock_t lock; /**< serialize access to event group list */
>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>>>>>>>>> +	unsigned int num_group_events; /**< number of events in a group */
>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>>>>>>>>> +	unsigned int initialized; /**< initialization counter */ };
>>>>>>>>> +
>>>>>>>>> +/** lcore event group */
>>>>>>>>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
>>>>>>>>> +_event_group);
>>>>>>>>> +
>>>>>>>>> +/** PMU state container */
>>>>>>>>> +extern struct rte_pmu rte_pmu;
>>>>>>>>> +
>>>>>>>>> +/** Each architecture supporting PMU needs to provide its
>>>>>>>>> +own version */ #ifndef rte_pmu_pmc_read #define
>>>>>>>>> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Read PMU counter.
>>>>>>>>> + *
>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>> + *
>>>>>>>>> + * @param pc
>>>>>>>>> + *   Pointer to the mmapped user page.
>>>>>>>>> + * @return
>>>>>>>>> + *   Counter value read from hardware.
>>>>>>>>> + */
>>>>>>>>> +static __rte_always_inline uint64_t
>>>>>>>>> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>>>>>>>>> +	uint64_t width, offset;
>>>>>>>>> +	uint32_t seq, index;
>>>>>>>>> +	int64_t pmc;
>>>>>>>>> +
>>>>>>>>> +	for (;;) {
>>>>>>>>> +		seq = pc->lock;
>>>>>>>>> +		rte_compiler_barrier();
>>>>>>>>
>>>>>>>> Are you sure that compiler_barrier() is enough here?
>>>>>>>> On some archs CPU itself has freedom to re-order reads.
>>>>>>>> Or I am missing something obvious here?
>>>>>>>>
>>>>>>>
>>>>>>> It's a matter of not keeping old stuff cached in registers and
>>>>>>> making sure that we have two reads of lock. CPU reordering won't
>>>>>>> do any harm here.
>>>>>>
>>>>>> Sorry, I didn't get you here:
>>>>>> Suppose CPU will re-order reads and will read lock *after* index or offset value.
>>>>>> Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
>>>>>>
>>>>>
>>>>> This number is just an indicator whether kernel did change something or not.
>>>>
>>>> You are talking about pc->lock, right?
>>>> Yes, I do understand that it is sort of seqlock.
>>>> That's why I am puzzled why we do not care about possible cpu read-reordering.
>>>> Manual for perf_event_open() also has a code snippet with compiler barrier only...
>>>>
>>>>> If cpu reordering will come into play then this will not change anything from pov of this
>> loop.
>>>>> All we want is fresh data when needed and no involvement of
>>>>> compiler when it comes to reordering code.
>>>>
>>>> Ok, can you probably explain to me why the following could not happen:
>>>> T0:
>>>> pc->seqlock==0; pc->index==I1; pc->offset==O1;
>>>> T1:
>>>>       cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
>>>>        index=pc->index;  //index==I1;
>>>> T2:
>>>>       cpu #1 kernel vent_update_userpage:
>>>>       pc->lock++; // pc->lock==1
>>>>       pc->index=I2;
>>>>       pc->offset=O2;
>>>>       ...
>>>>       pc->lock++; //pc->lock==2
>>>> T3:
>>>>       cpu #0 continue with read pmu:
>>>>       seq=pc->lock; //seq == 2
>>>>        offset=pc->offset; // offset == O2
>>>>        ....
>>>>        pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
>>>>        offset += pmc; //offset == O2 + pmcread(I1-1);
>>>>        if (pc->lock == seq) // they are equal, return
>>>>              return offset;
>>>>
>>>> Or, it can happen, but by some reason we don't care much?
>>>>
>>>
>>> This code does self-monitoring and user page (whole group actually) is
>>> per thread running on current cpu. Hence I am not sure what are you trying to prove with that
>> example.
>>
>> I am not trying to prove anything so far.
>> I am asking is such situation possible or not, and if not, why?
>> My current understanding (possibly wrong) is that after you mmaped these pages, kernel still can
>> asynchronously update them.
>> So, when reading the data from these pages you have to check 'lock' value before and after
>> accessing other data.
>> If so, why possible cpu read-reordering doesn't matter?
>>
> 
> Look. I'll reiterate that.
> 
> 1. That user page/group/PMU config is per process. Other processes do not access that.

Ok, that's clear.


>     All this happens on the very same CPU where current thread is running.

Ok... but can't this page be updated by kernel thread running 
simultaneously on different CPU?


> 2. Suppose you've already read seq. Now for some reason kernel updates data in page seq was read from.
> 3. Kernel will enter critical section during update. seq changes along with other data without app knowing about it.
>     If you want nitty gritty details consult kernel sources.

Look, I don't have to beg you to answer these questions.
In fact, I expect the library author to document all such narrow things
clearly, either in the PG or in source code comments (ideally in both).
If not, then from my perspective the patch is not at a ready stage and
shouldn't be accepted.
I don't know whether a compiler barrier is enough here or not, but I think
it is definitely worth a clear explanation in the docs.
I suppose it wouldn't be only me who will get confused here.
So please take an effort and document it clearly why you believe there 
is no race-condition.

> 4. app resumes and has some stale data but *WILL* read new seq. Code loops again because values do not match.

If the kernel will always execute update for this page in the same 
thread context, then yes, - user code will always note the difference
after resume.
But why it can't happen that your user-thread reads this page on one 
CPU, while some kernel code on other CPU updates it simultaneously?


> 5. Otherwise seq values match and data is valid.
> 
>> Also there was another question below, which you probably  missed, so I copied it here:
>> Another question - do we really need  to have __rte_pmu_read_userpage() and rte_pmu_read() as
>> static inline functions in public header?
>> As I understand, because of that we also have to make 'struct rte_pmu_*'
>> definitions also public.
>>
> 
> These functions need to be inlined otherwise performance takes a hit.

I understand that performance might be affected, but how big is the hit?
I expect actual PMU read will not be free anyway, right?
If the diff is small, it might be worth going for such a change;
removing unneeded structures from public headers would help a lot in
the future in terms of ABI/API stability.



>>>
>>>>>>>
>>>>>>>>> +		index = pc->index;
>>>>>>>>> +		offset = pc->offset;
>>>>>>>>> +		width = pc->pmc_width;
>>>>>>>>> +
>>>>>>>>> +		/* index set to 0 means that particular counter cannot be used */
>>>>>>>>> +		if (likely(pc->cap_user_rdpmc && index)) {
>>>>>>>>> +			pmc = rte_pmu_pmc_read(index - 1);
>>>>>>>>> +			pmc <<= 64 - width;
>>>>>>>>> +			pmc >>= 64 - width;
>>>>>>>>> +			offset += pmc;
>>>>>>>>> +		}
>>>>>>>>> +
>>>>>>>>> +		rte_compiler_barrier();
>>>>>>>>> +
>>>>>>>>> +		if (likely(pc->lock == seq))
>>>>>>>>> +			return offset;
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>> +	return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Enable group of events on the calling lcore.
>>>>>>>>> + *
>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>> + *
>>>>>>>>> + * @return
>>>>>>>>> + *   0 in case of success, negative value otherwise.
>>>>>>>>> + */
>>>>>>>>> +__rte_experimental
>>>>>>>>> +int
>>>>>>>>> +__rte_pmu_enable_group(void);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Initialize PMU library.
>>>>>>>>> + *
>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>> + *
>>>>>>>>> + * @return
>>>>>>>>> + *   0 in case of success, negative value otherwise.
>>>>>>>>> + */
>>>>>>>>> +__rte_experimental
>>>>>>>>> +int
>>>>>>>>> +rte_pmu_init(void);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Finalize PMU library. This should be called after PMU
>>>>>>>>> +counters are no longer being
>>>> read.
>>>>>>>>> + */
>>>>>>>>> +__rte_experimental
>>>>>>>>> +void
>>>>>>>>> +rte_pmu_fini(void);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Add event to the group of enabled events.
>>>>>>>>> + *
>>>>>>>>> + * @param name
>>>>>>>>> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>>>>>>>>> + * @return
>>>>>>>>> + *   Event index in case of success, negative value otherwise.
>>>>>>>>> + */
>>>>>>>>> +__rte_experimental
>>>>>>>>> +int
>>>>>>>>> +rte_pmu_add_event(const char *name);
>>>>>>>>> +
>>>>>>>>> +/**
>>>>>>>>> + * @warning
>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>> + *
>>>>>>>>> + * Read hardware counter configured to count occurrences of an event.
>>>>>>>>> + *
>>>>>>>>> + * @param index
>>>>>>>>> + *   Index of an event to be read.
>>>>>>>>> + * @return
>>>>>>>>> + *   Event value read from register. In case of errors or lack of support
>>>>>>>>> + *   0 is returned. In other words, stream of zeros in a trace file
>>>>>>>>> + *   indicates problem with reading particular PMU event register.
>>>>>>>>> + */
>>>>
>>>> Another question - do we really need  to have
>>>> __rte_pmu_read_userpage() and rte_pmu_read() as static inline functions in public header?
>>>> As I understand, because of that we also have to make 'struct rte_pmu_*'
>>>> definitions also public.
>>>>
>>>>>>>>> +__rte_experimental
>>>>>>>>> +static __rte_always_inline uint64_t rte_pmu_read(unsigned
>>>>>>>>> +int
>>>>>>>>> +index) {
>>>>>>>>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>>>>>>>>> +	int ret;
>>>>>>>>> +
>>>>>>>>> +	if (unlikely(!rte_pmu.initialized))
>>>>>>>>> +		return 0;
>>>>>>>>> +
>>>>>>>>> +	if (unlikely(!group->enabled)) {
>>>>>>>>> +		ret = __rte_pmu_enable_group();
>>>>>>>>> +		if (ret)
>>>>>>>>> +			return 0;
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>> +	if (unlikely(index >= rte_pmu.num_group_events))
>>>>>>>>> +		return 0;
>>>>>>>>> +
>>>>>>>>> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +#ifdef __cplusplus
>>>>>>>>> +}
>>>>>>>>> +#endif
>>>>>>>>> +
>
  
Konstantin Ananyev Feb. 21, 2023, 2:17 a.m. UTC | #10
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> i.e run dedicated tasks. In such cases one cannot use standard
> perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>

Few more comments/questions below.


> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..950f999cb7
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,460 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_spinlock.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +
> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +struct rte_pmu rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char path[PATH_MAX];
> +	char *config = NULL;
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	*num = *mask = 0;
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	fp = fopen(path, "r");
> +	if (fp == NULL)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret)
> +			continue;
> +
> +		ret = do_perf_event_open(config, group->fds[0]);
> +		if (ret == -1) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> +	long page_size = sysconf(_SC_PAGE_SIZE);
> +	unsigned int i;
> +	void *addr;
> +	int ret;
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> +			ret = -EPERM;
> +			goto out;
> +		}
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], page_size);
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> +	unsigned int i;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	group->enabled = false;
> +}
> +
> +int
> +__rte_pmu_enable_group(void)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (rte_pmu.num_group_events == 0)
> +		return -ENODEV;
> +
> +	ret = open_events(group);
> +	if (ret)
> +		goto out;
> +
> +	ret = mmap_events(group);
> +	if (ret)
> +		goto out;
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	rte_spinlock_lock(&rte_pmu.lock);
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);

Hmm... so we insert a pointer to a TLS variable into the global list?
Wonder what would happen if that thread gets terminated?
Can memory from its TLS block get re-used (by another thread or for other
purposes)?


> +	rte_spinlock_unlock(&rte_pmu.lock);
> +	group->enabled = true;
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(group);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (dirp == NULL)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	if (dent) {
> +		rte_pmu.name = strdup(name);
> +		if (rte_pmu.name == NULL) {
> +			closedir(dirp);
> +
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	closedir(dirp);
> +
> +	return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +
> +	event = calloc(1, sizeof(*event));
> +	if (event == NULL)
> +		goto out;
> +
> +	event->name = strdup(name);
> +	if (event->name == NULL) {
> +		free(event);
> +		event = NULL;
> +	}
> +
> +out:
> +	return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> +	free(event->name);
> +	free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	if (rte_pmu.name == NULL)
> +		return -ENODEV;
> +
> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> +		return -ENOSPC;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = new_event(name);
> +	if (event == NULL)
> +		return -ENOMEM;
> +
> +	event->index = rte_pmu.num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> +	return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> +	int ret;
> +
> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> +	 * via command line but application doesn't care enough and performs init/fini again.
> +	 */
> +	if (rte_pmu.initialized != 0) {
> +		rte_pmu.initialized++;
> +		return 0;
> +	}
> +
> +	ret = scan_pmus();
> +	if (ret)
> +		goto out;
> +
> +	ret = pmu_arch_init();
> +	if (ret)
> +		goto out;
> +
> +	TAILQ_INIT(&rte_pmu.event_list);
> +	TAILQ_INIT(&rte_pmu.event_group_list);
> +	rte_spinlock_init(&rte_pmu.lock);
> +	rte_pmu.initialized = 1;
> +
> +	return 0;
> +out:
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +
> +	return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> +	struct rte_pmu_event_group *group, *tmp_group;
> +	struct rte_pmu_event *event, *tmp_event;
> +
> +	/* cleanup once init count drops to zero */
> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> +		return;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> +		free_event(event);
> +	}
> +
> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> +		cleanup_events(group);
> +	}
> +
> +	pmu_arch_fini();
> +	free(rte_pmu.name);
> +	rte_pmu.name = NULL;
> +	rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..6b664c3336
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,212 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_spinlock.h>
> +
> +/** Maximum number of events in a group */
> +#define MAX_NUM_GROUP_EVENTS 8

forgot RTE_ prefix.
In fact, do you really need number of events in group to be hard-coded?
Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?

> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> +} __rte_cache_aligned;
> +

Even if we'd decide to keep rte_pmu_read() as static inline (still not
sure it is a good idea),
why do the two structs below (rte_pmu_event and rte_pmu) have to be public?
I think both can be safely moved away from public headers.


> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /**< name of an event */
> +	unsigned int index; /**< event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> +};

> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> +	rte_spinlock_t lock; /**< serialize access to event group list */
> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> +	unsigned int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +	unsigned int initialized; /**< initialization counter */
> +};
> +
> +/** lcore event group */
> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read PMU counter.
> + *
> + * @warning This should be not called directly.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +static __rte_always_inline uint64_t
> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		/* index set to 0 means that particular counter cannot be used */
> +		if (likely(pc->cap_user_rdpmc && index)) {

In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
Do you need another check here? Or this capability can be disabled by 
kernel at run-time?


> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @warning This should be not called directly.

__rte_internal ?

> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +__rte_pmu_enable_group(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @warning This should be not called directly.

Hmm... then who should call it?
If it's not supposed to be called directly, why declare it here?

> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */

Probably worth mentioning that this function is not MT-safe.
Same for _fini_ and add_event.
Also worth mentioning that the control-path functions
(init/fini/add_event) and the data-path function (pmu_read) can't be called concurrently.

> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> +	int ret;
> +
> +	if (unlikely(!rte_pmu.initialized))
> +		return 0;
> +
> +	if (unlikely(!group->enabled)) {
> +		ret = __rte_pmu_enable_group();
> +		if (ret)
> +			return 0;
> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..39a4f279c1
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,15 @@
> +DPDK_23 {
> +	local: *;
> +};
> +
> +EXPERIMENTAL {
> +	global:
> +
> +	__rte_pmu_enable_group;
> +	per_lcore__event_group;
> +	rte_pmu;
> +	rte_pmu_add_event;
> +	rte_pmu_fini;
> +	rte_pmu_init;
> +	rte_pmu_read;
> +};
  
Konstantin Ananyev Feb. 21, 2023, 12:15 p.m. UTC | #11
> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
> >> 100644 index 0000000000..950f999cb7
> >> --- /dev/null
> >> +++ b/lib/pmu/rte_pmu.c
> >> @@ -0,0 +1,460 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> + */
> >> +
> >> +#include <ctype.h>
> >> +#include <dirent.h>
> >> +#include <errno.h>
> >> +#include <regex.h>
> >> +#include <stdlib.h>
> >> +#include <string.h>
> >> +#include <sys/ioctl.h>
> >> +#include <sys/mman.h>
> >> +#include <sys/queue.h>
> >> +#include <sys/syscall.h>
> >> +#include <unistd.h>
> >> +
> >> +#include <rte_atomic.h>
> >> +#include <rte_per_lcore.h>
> >> +#include <rte_pmu.h>
> >> +#include <rte_spinlock.h>
> >> +#include <rte_tailq.h>
> >> +
> >> +#include "pmu_private.h"
> >> +
> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> >
> >
> >I suppose that pass (as the whole implementation) is linux specific?
> >If so, wouldn't it make sense to have it under linux subdir?
> >
> 
> There are not any plans to support that elsewhere currently so flat
> directory structure is good enough.

Ok, I suppose then best choice is to ask freebsd and windows maintainers.
Guys, any thoughts here?
Thanks
Konstantin
  
Tomasz Duszynski Feb. 27, 2023, 8:12 a.m. UTC | #12
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>Sent: Tuesday, February 21, 2023 1:48 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Konstantin Ananyev <konstantin.ananyev@huawei.com>;
>dev@dpdk.org
>Subject: Re: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>
>>>>>>>>>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file
>>>>>>>>>> mode
>>>>>>>>>> 100644 index 0000000000..6b664c3336
>>>>>>>>>> --- /dev/null
>>>>>>>>>> +++ b/lib/pmu/rte_pmu.h
>>>>>>>>>> @@ -0,0 +1,212 @@
>>>>>>>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>>>>>>>> + * Copyright(c) 2023 Marvell  */
>>>>>>>>>> +
>>>>>>>>>> +#ifndef _RTE_PMU_H_
>>>>>>>>>> +#define _RTE_PMU_H_
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @file
>>>>>>>>>> + *
>>>>>>>>>> + * PMU event tracing operations
>>>>>>>>>> + *
>>>>>>>>>> + * This file defines generic API and types necessary to setup
>>>>>>>>>> +PMU and
>>>>>>>>>> + * read selected counters in runtime.
>>>>>>>>>> + */
>>>>>>>>>> +
>>>>>>>>>> +#ifdef __cplusplus
>>>>>>>>>> +extern "C" {
>>>>>>>>>> +#endif
>>>>>>>>>> +
>>>>>>>>>> +#include <linux/perf_event.h>
>>>>>>>>>> +
>>>>>>>>>> +#include <rte_atomic.h>
>>>>>>>>>> +#include <rte_branch_prediction.h> #include <rte_common.h>
>>>>>>>>>> +#include <rte_compat.h> #include <rte_spinlock.h>
>>>>>>>>>> +
>>>>>>>>>> +/** Maximum number of events in a group */ #define
>>>>>>>>>> +MAX_NUM_GROUP_EVENTS 8
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * A structure describing a group of events.
>>>>>>>>>> + */
>>>>>>>>>> +struct rte_pmu_event_group {
>>>>>>>>>> +	struct perf_event_mmap_page
>>>>>>>>>> +*mmap_pages[MAX_NUM_GROUP_EVENTS];
>>>>>>>>>> +/**< array of user pages
>>>>>>> */
>>>>>>>>>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>>>>>>>>>> +	bool enabled; /**< true if group was enabled on particular lcore */
>>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>>>>>>>>>> +__rte_cache_aligned;
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * A structure describing an event.
>>>>>>>>>> + */
>>>>>>>>>> +struct rte_pmu_event {
>>>>>>>>>> +	char *name; /**< name of an event */
>>>>>>>>>> +	unsigned int index; /**< event index into fds/mmap_pages */
>>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * A PMU state container.
>>>>>>>>>> + */
>>>>>>>>>> +struct rte_pmu {
>>>>>>>>>> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>>>>>>>>>> +	rte_spinlock_t lock; /**< serialize access to event group list */
>>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>>>>>>>>>> +	unsigned int num_group_events; /**< number of events in a group */
>>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>>>>>>>>>> +	unsigned int initialized; /**< initialization counter */ };
>>>>>>>>>> +
>>>>>>>>>> +/** lcore event group */
>>>>>>>>>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
>>>>>>>>>> +_event_group);
>>>>>>>>>> +
>>>>>>>>>> +/** PMU state container */
>>>>>>>>>> +extern struct rte_pmu rte_pmu;
>>>>>>>>>> +
>>>>>>>>>> +/** Each architecture supporting PMU needs to provide its own
>>>>>>>>>> +version */ #ifndef rte_pmu_pmc_read #define
>>>>>>>>>> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Read PMU counter.
>>>>>>>>>> + *
>>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>>> + *
>>>>>>>>>> + * @param pc
>>>>>>>>>> + *   Pointer to the mmapped user page.
>>>>>>>>>> + * @return
>>>>>>>>>> + *   Counter value read from hardware.
>>>>>>>>>> + */
>>>>>>>>>> +static __rte_always_inline uint64_t
>>>>>>>>>> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>>>>>>>>>> +	uint64_t width, offset;
>>>>>>>>>> +	uint32_t seq, index;
>>>>>>>>>> +	int64_t pmc;
>>>>>>>>>> +
>>>>>>>>>> +	for (;;) {
>>>>>>>>>> +		seq = pc->lock;
>>>>>>>>>> +		rte_compiler_barrier();
>>>>>>>>>
>>>>>>>>> Are you sure that compiler_barrier() is enough here?
>>>>>>>>> On some archs CPU itself has freedom to re-order reads.
>>>>>>>>> Or I am missing something obvious here?
>>>>>>>>>
>>>>>>>>
>>>>>>>> It's a matter of not keeping old stuff cached in registers and
>>>>>>>> making sure that we have two reads of lock. CPU reordering won't
>>>>>>>> do any harm here.
>>>>>>>
>>>>>>> Sorry, I didn't get you here:
>>>>>>> Suppose CPU will re-order reads and will read lock *after* index or offset value.
>>>>>>> Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
>>>>>>>
>>>>>>
>>>>>> This number is just an indicator whether kernel did change something or not.
>>>>>
>>>>> You are talking about pc->lock, right?
>>>>> Yes, I do understand that it is sort of seqlock.
>>>>> That's why I am puzzled why we do not care about possible cpu read-reordering.
>>>>> Manual for perf_event_open() also has a code snippet with compiler barrier only...
>>>>>
>>>>>> If cpu reordering will come into play then this will not change
>>>>>> anything from pov of this
>>> loop.
>>>>>> All we want is fresh data when needed and no involvement of
>>>>>> compiler when it comes to reordering code.
>>>>>
>>>>> Ok, can you probably explain to me why the following could not happen:
>>>>> T0:
>>>>> pc->seqlock==0; pc->index==I1; pc->offset==O1;
>>>>> T1:
>>>>>       cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
>>>>>        index=pc->index;  //index==I1;
>>>>> T2:
>>>>>       cpu #1 kernel vent_update_userpage:
>>>>>       pc->lock++; // pc->lock==1
>>>>>       pc->index=I2;
>>>>>       pc->offset=O2;
>>>>>       ...
>>>>>       pc->lock++; //pc->lock==2
>>>>> T3:
>>>>>       cpu #0 continue with read pmu:
>>>>>       seq=pc->lock; //seq == 2
>>>>>        offset=pc->offset; // offset == O2
>>>>>        ....
>>>>>        pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
>>>>>        offset += pmc; //offset == O2 + pmcread(I1-1);
>>>>>        if (pc->lock == seq) // they are equal, return
>>>>>              return offset;
>>>>>
>>>>> Or, it can happen, but by some reason we don't care much?
>>>>>
>>>>
>>>> This code does self-monitoring and user page (whole group actually)
>>>> is per thread running on current cpu. Hence I am not sure what are
>>>> you trying to prove with that
>>> example.
>>>
>>> I am not trying to prove anything so far.
>>> I am asking is such situation possible or not, and if not, why?
>>> My current understanding (possibly wrong) is that after you mmaped
>>> these pages, kernel still can asynchronously update them.
>>> So, when reading the data from these pages you have to check 'lock'
>>> value before and after accessing other data.
>>> If so, why possible cpu read-reordering doesn't matter?
>>>
>>
>> Look. I'll reiterate that.
>>
>> 1. That user page/group/PMU config is per process. Other processes do not access that.
>
>Ok, that's clear.
>
>
>>     All this happens on the very same CPU where current thread is running.
>
>Ok... but can't this page be updated by kernel thread running simultaneously on different CPU?
>

I already pointed out that event/counter configuration is bound to the current CPU. How could any other CPU
possibly update that configuration? This cannot work. 


If you think that there's some problem with the code (or is simply broken on your setup) and logic 
has obvious flaw and you can provide meaningful evidence of that then I'd be more than happy to 
apply that fix. Otherwise that discussion will get us nowhere. 

>
>> 2. Suppose you've already read seq. Now for some reason kernel updates data in page seq was read
>from.
>> 3. Kernel will enter critical section during update. seq changes along with other data without
>app knowing about it.
>>     If you want nitty gritty details consult kernel sources.
>
>Look, I don't have to beg you to answer these questions.
>In fact, I expect library author to document all such narrow things
>clearly either in in PG, or in source code comments (ideally in both).
>If not, then from my perspective the patch is not ready stage and
>shouldn't be accepted.
>I don't know is compiler-barrier is enough here or not, but I think it
>is definitely worth a clear explanation in the docs.
>I suppose it wouldn't be only me who will get confused here.
>So please take an effort and document it clearly why you believe there
>is no race-condition.
>
>> 4. app resumes and has some stale data but *WILL* read new seq. Code loops again because values
>do not match.
>
>If the kernel will always execute update for this page in the same
>thread context, then yes, - user code will always note the difference
>after resume.
>But why it can't happen that your user-thread reads this page on one
>CPU, while some kernel code on other CPU updates it simultaneously?
>
>
>> 5. Otherwise seq values match and data is valid.
>>
>>> Also there was another question below, which you probably  missed, so I copied it here:
>>> Another question - do we really need  to have __rte_pmu_read_userpage() and rte_pmu_read() as
>>> static inline functions in public header?
>>> As I understand, because of that we also have to make 'struct rte_pmu_*'
>>> definitions also public.
>>>
>>
>> These functions need to be inlined otherwise performance takes a hit.
>
>I understand that perfomance might be affected, but how big is hit?
>I expect actual PMU read will not be free anyway, right?
>If the diff is small, might be it is worth to go for such change,
>removing unneeded structures from public headers would help a lot in
>future in terms of ABI/API stability.
>
>
>
>>>>
>>>>>>>>
>>>>>>>>>> +		index = pc->index;
>>>>>>>>>> +		offset = pc->offset;
>>>>>>>>>> +		width = pc->pmc_width;
>>>>>>>>>> +
>>>>>>>>>> +		/* index set to 0 means that particular counter cannot be used */
>>>>>>>>>> +		if (likely(pc->cap_user_rdpmc && index)) {
>>>>>>>>>> +			pmc = rte_pmu_pmc_read(index - 1);
>>>>>>>>>> +			pmc <<= 64 - width;
>>>>>>>>>> +			pmc >>= 64 - width;
>>>>>>>>>> +			offset += pmc;
>>>>>>>>>> +		}
>>>>>>>>>> +
>>>>>>>>>> +		rte_compiler_barrier();
>>>>>>>>>> +
>>>>>>>>>> +		if (likely(pc->lock == seq))
>>>>>>>>>> +			return offset;
>>>>>>>>>> +	}
>>>>>>>>>> +
>>>>>>>>>> +	return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Enable group of events on the calling lcore.
>>>>>>>>>> + *
>>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>>> + *
>>>>>>>>>> + * @return
>>>>>>>>>> + *   0 in case of success, negative value otherwise.
>>>>>>>>>> + */
>>>>>>>>>> +__rte_experimental
>>>>>>>>>> +int
>>>>>>>>>> +__rte_pmu_enable_group(void);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Initialize PMU library.
>>>>>>>>>> + *
>>>>>>>>>> + * @warning This should be not called directly.
>>>>>>>>>> + *
>>>>>>>>>> + * @return
>>>>>>>>>> + *   0 in case of success, negative value otherwise.
>>>>>>>>>> + */
>>>>>>>>>> +__rte_experimental
>>>>>>>>>> +int
>>>>>>>>>> +rte_pmu_init(void);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Finalize PMU library. This should be called after PMU
>>>>>>>>>> +counters are no longer being
>>>>> read.
>>>>>>>>>> + */
>>>>>>>>>> +__rte_experimental
>>>>>>>>>> +void
>>>>>>>>>> +rte_pmu_fini(void);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Add event to the group of enabled events.
>>>>>>>>>> + *
>>>>>>>>>> + * @param name
>>>>>>>>>> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>>>>>>>>>> + * @return
>>>>>>>>>> + *   Event index in case of success, negative value otherwise.
>>>>>>>>>> + */
>>>>>>>>>> +__rte_experimental
>>>>>>>>>> +int
>>>>>>>>>> +rte_pmu_add_event(const char *name);
>>>>>>>>>> +
>>>>>>>>>> +/**
>>>>>>>>>> + * @warning
>>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>>>>>>>>> + *
>>>>>>>>>> + * Read hardware counter configured to count occurrences of an event.
>>>>>>>>>> + *
>>>>>>>>>> + * @param index
>>>>>>>>>> + *   Index of an event to be read.
>>>>>>>>>> + * @return
>>>>>>>>>> + *   Event value read from register. In case of errors or lack of support
>>>>>>>>>> + *   0 is returned. In other words, stream of zeros in a trace file
>>>>>>>>>> + *   indicates problem with reading particular PMU event register.
>>>>>>>>>> + */
>>>>>
>>>>> Another question - do we really need  to have
>>>>> __rte_pmu_read_userpage() and rte_pmu_read() as static inline functions in public header?
>>>>> As I understand, because of that we also have to make 'struct rte_pmu_*'
>>>>> definitions also public.
>>>>>
>>>>>>>>>> +__rte_experimental
>>>>>>>>>> +static __rte_always_inline uint64_t rte_pmu_read(unsigned
>>>>>>>>>> +int
>>>>>>>>>> +index) {
>>>>>>>>>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>>>>>>>>>> +	int ret;
>>>>>>>>>> +
>>>>>>>>>> +	if (unlikely(!rte_pmu.initialized))
>>>>>>>>>> +		return 0;
>>>>>>>>>> +
>>>>>>>>>> +	if (unlikely(!group->enabled)) {
>>>>>>>>>> +		ret = __rte_pmu_enable_group();
>>>>>>>>>> +		if (ret)
>>>>>>>>>> +			return 0;
>>>>>>>>>> +	}
>>>>>>>>>> +
>>>>>>>>>> +	if (unlikely(index >= rte_pmu.num_group_events))
>>>>>>>>>> +		return 0;
>>>>>>>>>> +
>>>>>>>>>> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +#ifdef __cplusplus
>>>>>>>>>> +}
>>>>>>>>>> +#endif
>>>>>>>>>> +
>>
  
Tomasz Duszynski Feb. 27, 2023, 9:19 a.m. UTC | #13
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>Sent: Tuesday, February 21, 2023 3:17 AM
>To: dev@dpdk.org
>Subject: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>
>> Add support for programming PMU counters and reading their values in
>> runtime bypassing kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated i.e
>> run dedicated tasks. In such cases one cannot use standard perf
>> utility without sacrificing latency and performance.
>>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>
>Few more comments/questions below.
>
>
>> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
>> 100644 index 0000000000..950f999cb7
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.c
>> @@ -0,0 +1,460 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2023 Marvell International Ltd.
>> + */
>> +
>> +#include <ctype.h>
>> +#include <dirent.h>
>> +#include <errno.h>
>> +#include <regex.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/queue.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_per_lcore.h>
>> +#include <rte_pmu.h>
>> +#include <rte_spinlock.h>
>> +#include <rte_tailq.h>
>> +
>> +#include "pmu_private.h"
>> +
>> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> +
>> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
>> +(__builtin_ffsll(m) - 1)) & (m))
>> +
>> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +struct rte_pmu rte_pmu;
>> +
>> +/*
>> + * Following __rte_weak functions provide default no-op.
>> +Architectures should override them if
>> + * necessary.
>> + */
>> +
>> +int
>> +__rte_weak pmu_arch_init(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fini(void)
>> +{
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3]) { }
>> +
>> +static int
>> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> +	char path[PATH_MAX];
>> +	char *config = NULL;
>> +	int high, low, ret;
>> +	FILE *fp;
>> +
>> +	*num = *mask = 0;
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
>> +	fp = fopen(path, "r");
>> +	if (fp == NULL)
>> +		return -errno;
>> +
>> +	errno = 0;
>> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> +	if (ret < 2) {
>> +		ret = -ENODATA;
>> +		goto out;
>> +	}
>> +	if (errno) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	if (ret == 2)
>> +		high = low;
>> +
>> +	*mask = GENMASK_ULL(high, low);
>> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
>> +	*num = config[strlen(config) - 1];
>> +	*num = isdigit(*num) ? *num - '0' : 0;
>> +
>> +	ret = 0;
>> +out:
>> +	free(config);
>> +	fclose(fp);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +parse_event(char *buf, uint64_t config[3]) {
>> +	char *token, *term;
>> +	int num, ret, val;
>> +	uint64_t mask;
>> +
>> +	config[0] = config[1] = config[2] = 0;
>> +
>> +	token = strtok(buf, ",");
>> +	while (token) {
>> +		errno = 0;
>> +		/* <term>=<value> */
>> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> +		if (ret < 1)
>> +			return -ENODATA;
>> +		if (errno)
>> +			return -errno;
>> +		if (ret == 1)
>> +			val = 1;
>> +
>> +		ret = get_term_format(term, &num, &mask);
>> +		free(term);
>> +		if (ret)
>> +			return ret;
>> +
>> +		config[num] |= FIELD_PREP(mask, val);
>> +		token = strtok(NULL, ",");
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +get_event_config(const char *name, uint64_t config[3]) {
>> +	char path[PATH_MAX], buf[BUFSIZ];
>> +	FILE *fp;
>> +	int ret;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
>> +	fp = fopen(path, "r");
>> +	if (fp == NULL)
>> +		return -errno;
>> +
>> +	ret = fread(buf, 1, sizeof(buf), fp);
>> +	if (ret == 0) {
>> +		fclose(fp);
>> +
>> +		return -EINVAL;
>> +	}
>> +	fclose(fp);
>> +	buf[ret] = '\0';
>> +
>> +	return parse_event(buf, config);
>> +}
>> +
>> +static int
>> +do_perf_event_open(uint64_t config[3], int group_fd) {
>> +	struct perf_event_attr attr = {
>> +		.size = sizeof(struct perf_event_attr),
>> +		.type = PERF_TYPE_RAW,
>> +		.exclude_kernel = 1,
>> +		.exclude_hv = 1,
>> +		.disabled = 1,
>> +	};
>> +
>> +	pmu_arch_fixup_config(config);
>> +
>> +	attr.config = config[0];
>> +	attr.config1 = config[1];
>> +	attr.config2 = config[2];
>> +
>> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
>> +
>> +static int
>> +open_events(struct rte_pmu_event_group *group) {
>> +	struct rte_pmu_event *event;
>> +	uint64_t config[3];
>> +	int num = 0, ret;
>> +
>> +	/* group leader gets created first, with fd = -1 */
>> +	group->fds[0] = -1;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +		ret = get_event_config(event->name, config);
>> +		if (ret)
>> +			continue;
>> +
>> +		ret = do_perf_event_open(config, group->fds[0]);
>> +		if (ret == -1) {
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->fds[event->index] = ret;
>> +		num++;
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (--num; num >= 0; num--) {
>> +		close(group->fds[num]);
>> +		group->fds[num] = -1;
>> +	}
>> +
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +mmap_events(struct rte_pmu_event_group *group) {
>> +	long page_size = sysconf(_SC_PAGE_SIZE);
>> +	unsigned int i;
>> +	void *addr;
>> +	int ret;
>> +
>> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> +		if (addr == MAP_FAILED) {
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->mmap_pages[i] = addr;
>> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
>> +			ret = -EPERM;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (; i; i--) {
>> +		munmap(group->mmap_pages[i - 1], page_size);
>> +		group->mmap_pages[i - 1] = NULL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static void
>> +cleanup_events(struct rte_pmu_event_group *group) {
>> +	unsigned int i;
>> +
>> +	if (group->fds[0] != -1)
>> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
>> +
>> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> +		if (group->mmap_pages[i]) {
>> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> +			group->mmap_pages[i] = NULL;
>> +		}
>> +
>> +		if (group->fds[i] != -1) {
>> +			close(group->fds[i]);
>> +			group->fds[i] = -1;
>> +		}
>> +	}
>> +
>> +	group->enabled = false;
>> +}
>> +
>> +int
>> +__rte_pmu_enable_group(void)
>> +{
>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +	int ret;
>> +
>> +	if (rte_pmu.num_group_events == 0)
>> +		return -ENODEV;
>> +
>> +	ret = open_events(group);
>> +	if (ret)
>> +		goto out;
>> +
>> +	ret = mmap_events(group);
>> +	if (ret)
>> +		goto out;
>> +
>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	rte_spinlock_lock(&rte_pmu.lock);
>> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>
>Hmm.. so we insert pointer to TLS variable into the global list?
>Wonder what would happen if that thread get terminated?

Nothing special. Any pointers to that thread-local storage in that thread are invalidated.

>Can memory from its TLS block get re-used (by other thread or for other purposes)?
>

Why would any other thread reuse that? Eventually main thread will need that data to do the cleanup. 

>
>> +	rte_spinlock_unlock(&rte_pmu.lock);
>> +	group->enabled = true;
>> +
>> +	return 0;
>> +
>> +out:
>> +	cleanup_events(group);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +scan_pmus(void)
>> +{
>> +	char path[PATH_MAX];
>> +	struct dirent *dent;
>> +	const char *name;
>> +	DIR *dirp;
>> +
>> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> +	if (dirp == NULL)
>> +		return -errno;
>> +
>> +	while ((dent = readdir(dirp))) {
>> +		name = dent->d_name;
>> +		if (name[0] == '.')
>> +			continue;
>> +
>> +		/* sysfs entry should either contain cpus or be a cpu */
>> +		if (!strcmp(name, "cpu"))
>> +			break;
>> +
>> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> +		if (access(path, F_OK) == 0)
>> +			break;
>> +	}
>> +
>> +	if (dent) {
>> +		rte_pmu.name = strdup(name);
>> +		if (rte_pmu.name == NULL) {
>> +			closedir(dirp);
>> +
>> +			return -ENOMEM;
>> +		}
>> +	}
>> +
>> +	closedir(dirp);
>> +
>> +	return rte_pmu.name ? 0 : -ENODEV;
>> +}
>> +
>> +static struct rte_pmu_event *
>> +new_event(const char *name)
>> +{
>> +	struct rte_pmu_event *event;
>> +
>> +	event = calloc(1, sizeof(*event));
>> +	if (event == NULL)
>> +		goto out;
>> +
>> +	event->name = strdup(name);
>> +	if (event->name == NULL) {
>> +		free(event);
>> +		event = NULL;
>> +	}
>> +
>> +out:
>> +	return event;
>> +}
>> +
>> +static void
>> +free_event(struct rte_pmu_event *event)
>> +{
>> +	free(event->name);
>> +	free(event);
>> +}
>> +
>> +int
>> +rte_pmu_add_event(const char *name)
>> +{
>> +	struct rte_pmu_event *event;
>> +	char path[PATH_MAX];
>> +
>> +	if (rte_pmu.name == NULL)
>> +		return -ENODEV;
>> +
>> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> +		return -ENOSPC;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
>> +	if (access(path, R_OK))
>> +		return -ENODEV;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> +		if (!strcmp(event->name, name))
>> +			return event->index;
>> +		continue;
>> +	}
>> +
>> +	event = new_event(name);
>> +	if (event == NULL)
>> +		return -ENOMEM;
>> +
>> +	event->index = rte_pmu.num_group_events++;
>> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> +
>> +	return event->index;
>> +}
>> +
>> +int
>> +rte_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
>> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> +	 * via command line but application doesn't care enough and performs init/fini again.
>> +	 */
>> +	if (rte_pmu.initialized != 0) {
>> +		rte_pmu.initialized++;
>> +		return 0;
>> +	}
>> +
>> +	ret = scan_pmus();
>> +	if (ret)
>> +		goto out;
>> +
>> +	ret = pmu_arch_init();
>> +	if (ret)
>> +		goto out;
>> +
>> +	TAILQ_INIT(&rte_pmu.event_list);
>> +	TAILQ_INIT(&rte_pmu.event_group_list);
>> +	rte_spinlock_init(&rte_pmu.lock);
>> +	rte_pmu.initialized = 1;
>> +
>> +	return 0;
>> +out:
>> +	free(rte_pmu.name);
>> +	rte_pmu.name = NULL;
>> +
>> +	return ret;
>> +}
>> +
>> +void
>> +rte_pmu_fini(void)
>> +{
>> +	struct rte_pmu_event_group *group, *tmp_group;
>> +	struct rte_pmu_event *event, *tmp_event;
>> +
>> +	/* cleanup once init count drops to zero */
>> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
>> +		return;
>> +
>> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> +		free_event(event);
>> +	}
>> +
>> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> +		cleanup_events(group);
>> +	}
>> +
>> +	pmu_arch_fini();
>> +	free(rte_pmu.name);
>> +	rte_pmu.name = NULL;
>> +	rte_pmu.num_group_events = 0;
>> +}
>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
>> new file mode 100644
>> index 0000000000..6b664c3336
>> --- /dev/null
>> +++ b/lib/pmu/rte_pmu.h
>> @@ -0,0 +1,212 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2023 Marvell
>> + */
>> +
>> +#ifndef _RTE_PMU_H_
>> +#define _RTE_PMU_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * PMU event tracing operations
>> + *
>> + * This file defines generic API and types necessary to setup PMU and
>> + * read selected counters in runtime.
>> + */
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#include <linux/perf_event.h>
>> +
>> +#include <rte_atomic.h>
>> +#include <rte_branch_prediction.h>
>> +#include <rte_common.h>
>> +#include <rte_compat.h>
>> +#include <rte_spinlock.h>
>> +
>> +/** Maximum number of events in a group */
>> +#define MAX_NUM_GROUP_EVENTS 8
>
>forgot RTE_ prefix.
>In fact, do you really need number of events in group to be hard-coded?
>Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
>

8 is a reasonable number, I think. x86/ARM actually have fewer than that (was it something like 4?). 
Moreover events are scheduled as a group so there must be enough hw counters available
for that to succeed. So this number should cover current needs.  

>> +
>> +/**
>> + * A structure describing a group of events.
>> + */
>> +struct rte_pmu_event_group {
>> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> +	bool enabled; /**< true if group was enabled on particular lcore */
>> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
>> +} __rte_cache_aligned;
>> +
>
>Even if we'd decide to keep rte_pmu_read() as static inline (still not
>sure it is a good idea),

We want to save as many CPU cycles as we possibly can, and inlining does help
in that regard.

>why these two struct below (rte_pmu_event and rte_pmu) have to be public?
>I think both can be safely moved away from public headers.
>

struct rte_pmu_event can be hidden I guess. 
struct rte_pmu is used in this header hence cannot be moved elsewhere. 

>
>> +/**
>> + * A structure describing an event.
>> + */
>> +struct rte_pmu_event {
>> +	char *name; /**< name of an event */
>> +	unsigned int index; /**< event index into fds/mmap_pages */
>> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
>> +};
>
>> +
>> +/**
>> + * A PMU state container.
>> + */
>> +struct rte_pmu {
>> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> +	unsigned int num_group_events; /**< number of events in a group */
>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> +	unsigned int initialized; /**< initialization counter */
>> +};
>> +
>> +/** lcore event group */
>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> +
>> +/** PMU state container */
>> +extern struct rte_pmu rte_pmu;
>> +
>> +/** Each architecture supporting PMU needs to provide its own version */
>> +#ifndef rte_pmu_pmc_read
>> +#define rte_pmu_pmc_read(index) ({ 0; })
>> +#endif
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read PMU counter.
>> + *
>> + * @warning This should be not called directly.
>> + *
>> + * @param pc
>> + *   Pointer to the mmapped user page.
>> + * @return
>> + *   Counter value read from hardware.
>> + */
>> +static __rte_always_inline uint64_t
>> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
>> +{
>> +	uint64_t width, offset;
>> +	uint32_t seq, index;
>> +	int64_t pmc;
>> +
>> +	for (;;) {
>> +		seq = pc->lock;
>> +		rte_compiler_barrier();
>> +		index = pc->index;
>> +		offset = pc->offset;
>> +		width = pc->pmc_width;
>> +
>> +		/* index set to 0 means that particular counter cannot be used */
>> +		if (likely(pc->cap_user_rdpmc && index)) {
>
>In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
>Do you need another check here? Or this capability can be disabled by
>kernel at run-time?
>

That extra check in mmap_event() may be removed actually. Some archs allow
disabling reading rdpmc (I think that on x86 one can do that) so this check needs to stay. 

>
>> +			pmc = rte_pmu_pmc_read(index - 1);
>> +			pmc <<= 64 - width;
>> +			pmc >>= 64 - width;
>> +			offset += pmc;
>> +		}
>> +
>> +		rte_compiler_barrier();
>> +
>> +		if (likely(pc->lock == seq))
>> +			return offset;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Enable group of events on the calling lcore.
>> + *
>> + * @warning This should be not called directly.
>
>__rte_internal ?
>

No this cannot be internal because that will make functions calling it 
internal as well hence apps won't be able to use that. This has
already been brought up by one of the reviewers. 

>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +__rte_pmu_enable_group(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Initialize PMU library.
>> + *
>> + * @warning This should be not called directly.
>
>Hmm.. then who should call it?
>If it not supposed to be called directly, why to declare it here?
>

This is inlined and has one caller i.e rte_pmu_read(). 

>> + *
>> + * @return
>> + *   0 in case of success, negative value otherwise.
>> + */
>
>Probably worth to mention that this function is not MT safe.
>Same for _fini_ and add_event.
>Also worth to mention that all control-path functions
>(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.
>

Yes they are meant to be called from main thread. 

>> +__rte_experimental
>> +int
>> +rte_pmu_init(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Finalize PMU library. This should be called after PMU counters are no longer being read.
>> + */
>> +__rte_experimental
>> +void
>> +rte_pmu_fini(void);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Add event to the group of enabled events.
>> + *
>> + * @param name
>> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
>> + * @return
>> + *   Event index in case of success, negative value otherwise.
>> + */
>> +__rte_experimental
>> +int
>> +rte_pmu_add_event(const char *name);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read hardware counter configured to count occurrences of an event.
>> + *
>> + * @param index
>> + *   Index of an event to be read.
>> + * @return
>> + *   Event value read from register. In case of errors or lack of support
>> + *   0 is returned. In other words, stream of zeros in a trace file
>> + *   indicates problem with reading particular PMU event register.
>> + */
>> +__rte_experimental
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read(unsigned int index)
>> +{
>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> +	int ret;
>> +
>> +	if (unlikely(!rte_pmu.initialized))
>> +		return 0;
>> +
>> +	if (unlikely(!group->enabled)) {
>> +		ret = __rte_pmu_enable_group();
>> +		if (ret)
>> +			return 0;
>> +	}
>> +
>> +	if (unlikely(index >= rte_pmu.num_group_events))
>> +		return 0;
>> +
>> +	return __rte_pmu_read_userpage(group->mmap_pages[index]);
>> +}
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_PMU_H_ */
>> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
>> new file mode 100644
>> index 0000000000..39a4f279c1
>> --- /dev/null
>> +++ b/lib/pmu/version.map
>> @@ -0,0 +1,15 @@
>> +DPDK_23 {
>> +	local: *;
>> +};
>> +
>> +EXPERIMENTAL {
>> +	global:
>> +
>> +	__rte_pmu_enable_group;
>> +	per_lcore__event_group;
>> +	rte_pmu;
>> +	rte_pmu_add_event;
>> +	rte_pmu_fini;
>> +	rte_pmu_init;
>> +	rte_pmu_read;
>> +};
  
Konstantin Ananyev Feb. 27, 2023, 8:53 p.m. UTC | #14
> >> Add support for programming PMU counters and reading their values in
> >> runtime bypassing kernel completely.
> >>
> >> This is especially useful in cases where CPU cores are isolated i.e
> >> run dedicated tasks. In such cases one cannot use standard perf
> >> utility without sacrificing latency and performance.
> >>
> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> >
> >Few more comments/questions below.
> >
> >
> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
> >> 100644 index 0000000000..950f999cb7
> >> --- /dev/null
> >> +++ b/lib/pmu/rte_pmu.c
> >> @@ -0,0 +1,460 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> + */
> >> +
> >> +#include <ctype.h>
> >> +#include <dirent.h>
> >> +#include <errno.h>
> >> +#include <regex.h>
> >> +#include <stdlib.h>
> >> +#include <string.h>
> >> +#include <sys/ioctl.h>
> >> +#include <sys/mman.h>
> >> +#include <sys/queue.h>
> >> +#include <sys/syscall.h>
> >> +#include <unistd.h>
> >> +
> >> +#include <rte_atomic.h>
> >> +#include <rte_per_lcore.h>
> >> +#include <rte_pmu.h>
> >> +#include <rte_spinlock.h>
> >> +#include <rte_tailq.h>
> >> +
> >> +#include "pmu_private.h"
> >> +
> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> >> +
> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
> >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
> >> +(__builtin_ffsll(m) - 1)) & (m))
> >> +
> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> +struct rte_pmu rte_pmu;
> >> +
> >> +/*
> >> + * Following __rte_weak functions provide default no-op.
> >> +Architectures should override them if
> >> + * necessary.
> >> + */
> >> +
> >> +int
> >> +__rte_weak pmu_arch_init(void)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fini(void)
> >> +{
> >> +}
> >> +
> >> +void
> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3]) { }
> >> +
> >> +static int
> >> +get_term_format(const char *name, int *num, uint64_t *mask) {
> >> +	char path[PATH_MAX];
> >> +	char *config = NULL;
> >> +	int high, low, ret;
> >> +	FILE *fp;
> >> +
> >> +	*num = *mask = 0;
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	errno = 0;
> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> >> +	if (ret < 2) {
> >> +		ret = -ENODATA;
> >> +		goto out;
> >> +	}
> >> +	if (errno) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (ret == 2)
> >> +		high = low;
> >> +
> >> +	*mask = GENMASK_ULL(high, low);
> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> >> +	*num = config[strlen(config) - 1];
> >> +	*num = isdigit(*num) ? *num - '0' : 0;
> >> +
> >> +	ret = 0;
> >> +out:
> >> +	free(config);
> >> +	fclose(fp);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +parse_event(char *buf, uint64_t config[3]) {
> >> +	char *token, *term;
> >> +	int num, ret, val;
> >> +	uint64_t mask;
> >> +
> >> +	config[0] = config[1] = config[2] = 0;
> >> +
> >> +	token = strtok(buf, ",");
> >> +	while (token) {
> >> +		errno = 0;
> >> +		/* <term>=<value> */
> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> >> +		if (ret < 1)
> >> +			return -ENODATA;
> >> +		if (errno)
> >> +			return -errno;
> >> +		if (ret == 1)
> >> +			val = 1;
> >> +
> >> +		ret = get_term_format(term, &num, &mask);
> >> +		free(term);
> >> +		if (ret)
> >> +			return ret;
> >> +
> >> +		config[num] |= FIELD_PREP(mask, val);
> >> +		token = strtok(NULL, ",");
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int
> >> +get_event_config(const char *name, uint64_t config[3]) {
> >> +	char path[PATH_MAX], buf[BUFSIZ];
> >> +	FILE *fp;
> >> +	int ret;
> >> +
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> >> +	fp = fopen(path, "r");
> >> +	if (fp == NULL)
> >> +		return -errno;
> >> +
> >> +	ret = fread(buf, 1, sizeof(buf), fp);
> >> +	if (ret == 0) {
> >> +		fclose(fp);
> >> +
> >> +		return -EINVAL;
> >> +	}
> >> +	fclose(fp);
> >> +	buf[ret] = '\0';
> >> +
> >> +	return parse_event(buf, config);
> >> +}
> >> +
> >> +static int
> >> +do_perf_event_open(uint64_t config[3], int group_fd) {
> >> +	struct perf_event_attr attr = {
> >> +		.size = sizeof(struct perf_event_attr),
> >> +		.type = PERF_TYPE_RAW,
> >> +		.exclude_kernel = 1,
> >> +		.exclude_hv = 1,
> >> +		.disabled = 1,
> >> +	};
> >> +
> >> +	pmu_arch_fixup_config(config);
> >> +
> >> +	attr.config = config[0];
> >> +	attr.config1 = config[1];
> >> +	attr.config2 = config[2];
> >> +
> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
> >> +
> >> +static int
> >> +open_events(struct rte_pmu_event_group *group) {
> >> +	struct rte_pmu_event *event;
> >> +	uint64_t config[3];
> >> +	int num = 0, ret;
> >> +
> >> +	/* group leader gets created first, with fd = -1 */
> >> +	group->fds[0] = -1;
> >> +
> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> +		ret = get_event_config(event->name, config);
> >> +		if (ret)
> >> +			continue;
> >> +
> >> +		ret = do_perf_event_open(config, group->fds[0]);
> >> +		if (ret == -1) {
> >> +			ret = -errno;
> >> +			goto out;
> >> +		}
> >> +
> >> +		group->fds[event->index] = ret;
> >> +		num++;
> >> +	}
> >> +
> >> +	return 0;
> >> +out:
> >> +	for (--num; num >= 0; num--) {
> >> +		close(group->fds[num]);
> >> +		group->fds[num] = -1;
> >> +	}
> >> +
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +mmap_events(struct rte_pmu_event_group *group) {
> >> +	long page_size = sysconf(_SC_PAGE_SIZE);
> >> +	unsigned int i;
> >> +	void *addr;
> >> +	int ret;
> >> +
> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> >> +		if (addr == MAP_FAILED) {
> >> +			ret = -errno;
> >> +			goto out;
> >> +		}
> >> +
> >> +		group->mmap_pages[i] = addr;
> >> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> >> +			ret = -EPERM;
> >> +			goto out;
> >> +		}
> >> +	}
> >> +
> >> +	return 0;
> >> +out:
> >> +	for (; i; i--) {
> >> +		munmap(group->mmap_pages[i - 1], page_size);
> >> +		group->mmap_pages[i - 1] = NULL;
> >> +	}
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void
> >> +cleanup_events(struct rte_pmu_event_group *group) {
> >> +	unsigned int i;
> >> +
> >> +	if (group->fds[0] != -1)
> >> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> >> +
> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> +		if (group->mmap_pages[i]) {
> >> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> >> +			group->mmap_pages[i] = NULL;
> >> +		}
> >> +
> >> +		if (group->fds[i] != -1) {
> >> +			close(group->fds[i]);
> >> +			group->fds[i] = -1;
> >> +		}
> >> +	}
> >> +
> >> +	group->enabled = false;
> >> +}
> >> +
> >> +int
> >> +__rte_pmu_enable_group(void)
> >> +{
> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> +	int ret;
> >> +
> >> +	if (rte_pmu.num_group_events == 0)
> >> +		return -ENODEV;
> >> +
> >> +	ret = open_events(group);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	ret = mmap_events(group);
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> >> +		ret = -errno;
> >> +		goto out;
> >> +	}
> >> +
> >> +	rte_spinlock_lock(&rte_pmu.lock);
> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> >
> >Hmm.. so we insert pointer to TLS variable into the global list?
> >Wonder what would happen if that thread get terminated?
> 
> Nothing special. Any pointers to that thread-local in that thread are invalidated.
> 
> >Can memory from its TLS block get re-used (by other thread or for other purposes)?
> >
> 
> Why would any other thread reuse that? 
> Eventually main thread will need that data to do the cleanup.

I understand that main thread would need to access that data.
I am not sure that it would be able to.
Imagine a thread calls rte_pmu_read(...) and then terminates, while the program continues to run.
As I understand address of its RTE_PER_LCORE(_event_group) will still remain in rte_pmu.event_group_list,
even if it is probably not valid any more. 

> >
> >> +	rte_spinlock_unlock(&rte_pmu.lock);
> >> +	group->enabled = true;
> >> +
> >> +	return 0;
> >> +
> >> +out:
> >> +	cleanup_events(group);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static int
> >> +scan_pmus(void)
> >> +{
> >> +	char path[PATH_MAX];
> >> +	struct dirent *dent;
> >> +	const char *name;
> >> +	DIR *dirp;
> >> +
> >> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> >> +	if (dirp == NULL)
> >> +		return -errno;
> >> +
> >> +	while ((dent = readdir(dirp))) {
> >> +		name = dent->d_name;
> >> +		if (name[0] == '.')
> >> +			continue;
> >> +
> >> +		/* sysfs entry should either contain cpus or be a cpu */
> >> +		if (!strcmp(name, "cpu"))
> >> +			break;
> >> +
> >> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> >> +		if (access(path, F_OK) == 0)
> >> +			break;
> >> +	}
> >> +
> >> +	if (dent) {
> >> +		rte_pmu.name = strdup(name);
> >> +		if (rte_pmu.name == NULL) {
> >> +			closedir(dirp);
> >> +
> >> +			return -ENOMEM;
> >> +		}
> >> +	}
> >> +
> >> +	closedir(dirp);
> >> +
> >> +	return rte_pmu.name ? 0 : -ENODEV;
> >> +}
> >> +
> >> +static struct rte_pmu_event *
> >> +new_event(const char *name)
> >> +{
> >> +	struct rte_pmu_event *event;
> >> +
> >> +	event = calloc(1, sizeof(*event));
> >> +	if (event == NULL)
> >> +		goto out;
> >> +
> >> +	event->name = strdup(name);
> >> +	if (event->name == NULL) {
> >> +		free(event);
> >> +		event = NULL;
> >> +	}
> >> +
> >> +out:
> >> +	return event;
> >> +}
> >> +
> >> +static void
> >> +free_event(struct rte_pmu_event *event)
> >> +{
> >> +	free(event->name);
> >> +	free(event);
> >> +}
> >> +
> >> +int
> >> +rte_pmu_add_event(const char *name)
> >> +{
> >> +	struct rte_pmu_event *event;
> >> +	char path[PATH_MAX];
> >> +
> >> +	if (rte_pmu.name == NULL)
> >> +		return -ENODEV;
> >> +
> >> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> >> +		return -ENOSPC;
> >> +
> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> >> +	if (access(path, R_OK))
> >> +		return -ENODEV;
> >> +
> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> +		if (!strcmp(event->name, name))
> >> +			return event->index;
> >> +		continue;
> >> +	}
> >> +
> >> +	event = new_event(name);
> >> +	if (event == NULL)
> >> +		return -ENOMEM;
> >> +
> >> +	event->index = rte_pmu.num_group_events++;
> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> >> +
> >> +	return event->index;
> >> +}
> >> +
> >> +int
> >> +rte_pmu_init(void)
> >> +{
> >> +	int ret;
> >> +
> >> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> >> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> >> +	 * via command line but application doesn't care enough and performs init/fini again.
> >> +	 */
> >> +	if (rte_pmu.initialized != 0) {
> >> +		rte_pmu.initialized++;
> >> +		return 0;
> >> +	}
> >> +
> >> +	ret = scan_pmus();
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	ret = pmu_arch_init();
> >> +	if (ret)
> >> +		goto out;
> >> +
> >> +	TAILQ_INIT(&rte_pmu.event_list);
> >> +	TAILQ_INIT(&rte_pmu.event_group_list);
> >> +	rte_spinlock_init(&rte_pmu.lock);
> >> +	rte_pmu.initialized = 1;
> >> +
> >> +	return 0;
> >> +out:
> >> +	free(rte_pmu.name);
> >> +	rte_pmu.name = NULL;
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +void
> >> +rte_pmu_fini(void)
> >> +{
> >> +	struct rte_pmu_event_group *group, *tmp_group;
> >> +	struct rte_pmu_event *event, *tmp_event;
> >> +
> >> +	/* cleanup once init count drops to zero */
> >> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> >> +		return;
> >> +
> >> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> >> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> >> +		free_event(event);
> >> +	}
> >> +
> >> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> >> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> >> +		cleanup_events(group);
> >> +	}
> >> +
> >> +	pmu_arch_fini();
> >> +	free(rte_pmu.name);
> >> +	rte_pmu.name = NULL;
> >> +	rte_pmu.num_group_events = 0;
> >> +}
> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> >> new file mode 100644
> >> index 0000000000..6b664c3336
> >> --- /dev/null
> >> +++ b/lib/pmu/rte_pmu.h
> >> @@ -0,0 +1,212 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2023 Marvell
> >> + */
> >> +
> >> +#ifndef _RTE_PMU_H_
> >> +#define _RTE_PMU_H_
> >> +
> >> +/**
> >> + * @file
> >> + *
> >> + * PMU event tracing operations
> >> + *
> >> + * This file defines generic API and types necessary to setup PMU and
> >> + * read selected counters in runtime.
> >> + */
> >> +
> >> +#ifdef __cplusplus
> >> +extern "C" {
> >> +#endif
> >> +
> >> +#include <linux/perf_event.h>
> >> +
> >> +#include <rte_atomic.h>
> >> +#include <rte_branch_prediction.h>
> >> +#include <rte_common.h>
> >> +#include <rte_compat.h>
> >> +#include <rte_spinlock.h>
> >> +
> >> +/** Maximum number of events in a group */
> >> +#define MAX_NUM_GROUP_EVENTS 8
> >
> >forgot RTE_ prefix.
> >In fact, do you really need number of events in group to be hard-coded?
> >Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
> >
> 
> 8 is reasonable number I think. X86/ARM have actually less that that (was that something like 4?).
> Moreover events are scheduled as a group so there must be enough hw counters available
> for that to succeed. So this number should cover current needs.

If you think 8 will be enough to cover all possible future cases - I am ok either way.
Still need RTE_ prefix for it.

> >> +
> >> +/**
> >> + * A structure describing a group of events.
> >> + */
> >> +struct rte_pmu_event_group {
> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> >> +} __rte_cache_aligned;
> >> +
> >
> >Even if we'd decide to keep rte_pmu_read() as static inline (still not
> >sure it is a good idea),
> 
> We want to save as many CPU cycles as we possibly can and inlining does help
> in that matter.

Ok, so asking the same question from a different thread: how many cycles will it save?
What is the difference in terms of performance when you have this function
inlined vs not inlined?
 
> >why these two struct below (rte_pmu_event and rte_pmu) have to be public?
> >I think both can be safely moved away from public headers.
> >
> 
> struct rte_pmu_event can be hidden I guess.
> struct rte_pmu is used in this header hence cannot be moved elsewhere.

Not sure why? 
Is that because you use it inside rte_pmu_read()?
But that check I think can be safely moved into __rte_pmu_enable_group()
or probably even into rte_pmu_add_event(). 

> >
> >> +/**
> >> + * A structure describing an event.
> >> + */
> >> +struct rte_pmu_event {
> >> +	char *name; /**< name of an event */
> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
> >> +};
> >
> >> +
> >> +/**
> >> + * A PMU state container.
> >> + */
> >> +struct rte_pmu {
> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> +	unsigned int initialized; /**< initialization counter */
> >> +};
> >> +
> >> +/** lcore event group */
> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> +
> >> +/** PMU state container */
> >> +extern struct rte_pmu rte_pmu;
> >> +
> >> +/** Each architecture supporting PMU needs to provide its own version */
> >> +#ifndef rte_pmu_pmc_read
> >> +#define rte_pmu_pmc_read(index) ({ 0; })
> >> +#endif
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Read PMU counter.
> >> + *
> >> + * @warning This should be not called directly.
> >> + *
> >> + * @param pc
> >> + *   Pointer to the mmapped user page.
> >> + * @return
> >> + *   Counter value read from hardware.
> >> + */
> >> +static __rte_always_inline uint64_t
> >> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> >> +{
> >> +	uint64_t width, offset;
> >> +	uint32_t seq, index;
> >> +	int64_t pmc;
> >> +
> >> +	for (;;) {
> >> +		seq = pc->lock;
> >> +		rte_compiler_barrier();
> >> +		index = pc->index;
> >> +		offset = pc->offset;
> >> +		width = pc->pmc_width;
> >> +
> >> +		/* index set to 0 means that particular counter cannot be used */
> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >
> >In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
> >Do you need another check here? Or this capability can be disabled by
> >kernel at run-time?
> >
> 
> That extra check in mmap_event() may be removed actually. Some archs allow
> disabling reading rdpmc (I think that on x86 one can do that) so this check needs to stay.
> 
> >
> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> +			pmc <<= 64 - width;
> >> +			pmc >>= 64 - width;
> >> +			offset += pmc;
> >> +		}
> >> +
> >> +		rte_compiler_barrier();
> >> +
> >> +		if (likely(pc->lock == seq))
> >> +			return offset;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Enable group of events on the calling lcore.
> >> + *
> >> + * @warning This should be not called directly.
> >
> >__rte_internal ?
> >
> 
> No this cannot be internal because that will make functions calling it
> internal as well hence apps won't be able to use that. This has
> already been brought up by one of the reviewers.

Ok, then we can probably mark it with the '@internal' tag in
formal comments?

> 
> >> + *
> >> + * @return
> >> + *   0 in case of success, negative value otherwise.
> >> + */
> >> +__rte_experimental
> >> +int
> >> +__rte_pmu_enable_group(void);
> >> +
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Initialize PMU library.
> >> + *
> >> + * @warning This should be not called directly.
> >
> >Hmm.. then who should call it?
> >If it not supposed to be called directly, why to declare it here?
> >
> 
> This is inlined and has one caller i.e rte_pmu_read().

I thought we are talking here about rte_pmu_init().
I don't see where it is inlined and still not clear why it can't be called directly.

> >> + *
> >> + * @return
> >> + *   0 in case of success, negative value otherwise.
> >> + */
> >
> >Probably worth to mention that this function is not MT safe.
> >Same for _fini_ and add_event.
> >Also worth to mention that all control-path functions
> >(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.
> >
> 
> Yes they are meant to be called from main thread.

Ok, then please add that into formal API comments. 
 
> >> +__rte_experimental
> >> +int
> >> +rte_pmu_init(void);
> >> +
  
Morten Brørup Feb. 28, 2023, 8:25 a.m. UTC | #15
> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Monday, 27 February 2023 21.53
> 
> > >> Add support for programming PMU counters and reading their values in
> > >> runtime bypassing kernel completely.
> > >>
> > >> This is especially useful in cases where CPU cores are isolated i.e
> > >> run dedicated tasks. In such cases one cannot use standard perf
> > >> utility without sacrificing latency and performance.
> > >>
> > >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> > >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > >

[...]

> > >> +int
> > >> +__rte_pmu_enable_group(void)
> > >> +{
> > >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> > >> +	int ret;
> > >> +
> > >> +	if (rte_pmu.num_group_events == 0)
> > >> +		return -ENODEV;
> > >> +
> > >> +	ret = open_events(group);
> > >> +	if (ret)
> > >> +		goto out;
> > >> +
> > >> +	ret = mmap_events(group);
> > >> +	if (ret)
> > >> +		goto out;
> > >> +
> > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -
> 1) {
> > >> +		ret = -errno;
> > >> +		goto out;
> > >> +	}
> > >> +
> > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) ==
> -1) {
> > >> +		ret = -errno;
> > >> +		goto out;
> > >> +	}
> > >> +
> > >> +	rte_spinlock_lock(&rte_pmu.lock);
> > >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> > >
> > >Hmm.. so we insert pointer to TLS variable into the global list?
> > >Wonder what would happen if that thread get terminated?
> >
> > Nothing special. Any pointers to that thread-local in that thread are
> invalided.
> >
> > >Can memory from its TLS block get re-used (by other thread or for other
> purposes)?
> > >
> >
> > Why would any other thread reuse that?
> > Eventually main thread will need that data to do the cleanup.
> 
> I understand that main thread would need to access that data.
> I am not sure that it would be able to.
> Imagine thread calls rte_pmu_read(...) and then terminates, while program
> continues to run.

Is the example you describe here (i.e. a thread terminating in the middle of doing something) really a scenario DPDK is supposed to support?

> As I understand address of its RTE_PER_LCORE(_event_group) will still remain
> in rte_pmu.event_group_list,
> even if it is probably not valid any more.

There should be a "destructor/done/finish" function available to remove this from the list.

[...]

> > >Even if we'd decide to keep rte_pmu_read() as static inline (still not
> > >sure it is a good idea),
> >
> > We want to save as many CPU cycles as we possibly can and inlining does
> > help in that matter.
> 
> Ok, so asking same question from different thread: how many cycles it will
> save?
> What is the difference in terms of performance when you have this function
> inlined vs not inlined?

We expect to use this in our in-house profiler library. For this reason, I have a very strong preference for absolute maximum performance.

Reading PMU events is for performance profiling, so I expect other potential users of the PMU library to share my opinion on this.
  
Tomasz Duszynski Feb. 28, 2023, 9:57 a.m. UTC | #16
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>Sent: Monday, February 27, 2023 9:53 PM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>;
>dev@dpdk.org
>Subject: RE: [EXT] Re: [PATCH v11 1/4] lib: add generic support for reading PMU events
>
>
>
>> >> Add support for programming PMU counters and reading their values
>> >> in runtime bypassing kernel completely.
>> >>
>> >> This is especially useful in cases where CPU cores are isolated i.e
>> >> run dedicated tasks. In such cases one cannot use standard perf
>> >> utility without sacrificing latency and performance.
>> >>
>> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> >
>> >Few more comments/questions below.
>> >
>> >
>> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
>> >> 100644 index 0000000000..950f999cb7
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.c
>> >> @@ -0,0 +1,460 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(C) 2023 Marvell International Ltd.
>> >> + */
>> >> +
>> >> +#include <ctype.h>
>> >> +#include <dirent.h>
>> >> +#include <errno.h>
>> >> +#include <regex.h>
>> >> +#include <stdlib.h>
>> >> +#include <string.h>
>> >> +#include <sys/ioctl.h>
>> >> +#include <sys/mman.h>
>> >> +#include <sys/queue.h>
>> >> +#include <sys/syscall.h>
>> >> +#include <unistd.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_per_lcore.h>
>> >> +#include <rte_pmu.h>
>> >> +#include <rte_spinlock.h>
>> >> +#include <rte_tailq.h>
>> >> +
>> >> +#include "pmu_private.h"
>> >> +
>> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> >> +
>> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
>> >> +(__builtin_ffsll(m) - 1)) & (m))
>> >> +
>> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +struct rte_pmu rte_pmu;
>> >> +
>> >> +/*
>> >> + * Following __rte_weak functions provide default no-op.
>> >> +Architectures should override them if
>> >> + * necessary.
>> >> + */
>> >> +
>> >> +int
>> >> +__rte_weak pmu_arch_init(void)
>> >> +{
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fini(void)
>> >> +{
>> >> +}
>> >> +
>> >> +void
>> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
>> >> +{ }
>> >> +
>> >> +static int
>> >> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> >> +	char path[PATH_MAX];
>> >> +	char *config = NULL;
>> >> +	int high, low, ret;
>> >> +	FILE *fp;
>> >> +
>> >> +	*num = *mask = 0;
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name,
>name);
>> >> +	fp = fopen(path, "r");
>> >> +	if (fp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	errno = 0;
>> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> >> +	if (ret < 2) {
>> >> +		ret = -ENODATA;
>> >> +		goto out;
>> >> +	}
>> >> +	if (errno) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	if (ret == 2)
>> >> +		high = low;
>> >> +
>> >> +	*mask = GENMASK_ULL(high, low);
>> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
>> >> +	*num = config[strlen(config) - 1];
>> >> +	*num = isdigit(*num) ? *num - '0' : 0;
>> >> +
>> >> +	ret = 0;
>> >> +out:
>> >> +	free(config);
>> >> +	fclose(fp);
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +parse_event(char *buf, uint64_t config[3]) {
>> >> +	char *token, *term;
>> >> +	int num, ret, val;
>> >> +	uint64_t mask;
>> >> +
>> >> +	config[0] = config[1] = config[2] = 0;
>> >> +
>> >> +	token = strtok(buf, ",");
>> >> +	while (token) {
>> >> +		errno = 0;
>> >> +		/* <term>=<value> */
>> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> >> +		if (ret < 1)
>> >> +			return -ENODATA;
>> >> +		if (errno)
>> >> +			return -errno;
>> >> +		if (ret == 1)
>> >> +			val = 1;
>> >> +
>> >> +		ret = get_term_format(term, &num, &mask);
>> >> +		free(term);
>> >> +		if (ret)
>> >> +			return ret;
>> >> +
>> >> +		config[num] |= FIELD_PREP(mask, val);
>> >> +		token = strtok(NULL, ",");
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +static int
>> >> +get_event_config(const char *name, uint64_t config[3]) {
>> >> +	char path[PATH_MAX], buf[BUFSIZ];
>> >> +	FILE *fp;
>> >> +	int ret;
>> >> +
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> >> +	fp = fopen(path, "r");
>> >> +	if (fp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	ret = fread(buf, 1, sizeof(buf), fp);
>> >> +	if (ret == 0) {
>> >> +		fclose(fp);
>> >> +
>> >> +		return -EINVAL;
>> >> +	}
>> >> +	fclose(fp);
>> >> +	buf[ret] = '\0';
>> >> +
>> >> +	return parse_event(buf, config);
>> >> +}
>> >> +
>> >> +static int
>> >> +do_perf_event_open(uint64_t config[3], int group_fd) {
>> >> +	struct perf_event_attr attr = {
>> >> +		.size = sizeof(struct perf_event_attr),
>> >> +		.type = PERF_TYPE_RAW,
>> >> +		.exclude_kernel = 1,
>> >> +		.exclude_hv = 1,
>> >> +		.disabled = 1,
>> >> +	};
>> >> +
>> >> +	pmu_arch_fixup_config(config);
>> >> +
>> >> +	attr.config = config[0];
>> >> +	attr.config1 = config[1];
>> >> +	attr.config2 = config[2];
>> >> +
>> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
>> >> +
>> >> +static int
>> >> +open_events(struct rte_pmu_event_group *group) {
>> >> +	struct rte_pmu_event *event;
>> >> +	uint64_t config[3];
>> >> +	int num = 0, ret;
>> >> +
>> >> +	/* group leader gets created first, with fd = -1 */
>> >> +	group->fds[0] = -1;
>> >> +
>> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +		ret = get_event_config(event->name, config);
>> >> +		if (ret)
>> >> +			continue;
>> >> +
>> >> +		ret = do_perf_event_open(config, group->fds[0]);
>> >> +		if (ret == -1) {
>> >> +			ret = -errno;
>> >> +			goto out;
>> >> +		}
>> >> +
>> >> +		group->fds[event->index] = ret;
>> >> +		num++;
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	for (--num; num >= 0; num--) {
>> >> +		close(group->fds[num]);
>> >> +		group->fds[num] = -1;
>> >> +	}
>> >> +
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +mmap_events(struct rte_pmu_event_group *group) {
>> >> +	long page_size = sysconf(_SC_PAGE_SIZE);
>> >> +	unsigned int i;
>> >> +	void *addr;
>> >> +	int ret;
>> >> +
>> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
>> >> +		if (addr == MAP_FAILED) {
>> >> +			ret = -errno;
>> >> +			goto out;
>> >> +		}
>> >> +
>> >> +		group->mmap_pages[i] = addr;
>> >> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
>> >> +			ret = -EPERM;
>> >> +			goto out;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	for (; i; i--) {
>> >> +		munmap(group->mmap_pages[i - 1], page_size);
>> >> +		group->mmap_pages[i - 1] = NULL;
>> >> +	}
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static void
>> >> +cleanup_events(struct rte_pmu_event_group *group) {
>> >> +	unsigned int i;
>> >> +
>> >> +	if (group->fds[0] != -1)
>> >> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE,
>> >> +PERF_IOC_FLAG_GROUP);
>> >> +
>> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
>> >> +		if (group->mmap_pages[i]) {
>> >> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
>> >> +			group->mmap_pages[i] = NULL;
>> >> +		}
>> >> +
>> >> +		if (group->fds[i] != -1) {
>> >> +			close(group->fds[i]);
>> >> +			group->fds[i] = -1;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	group->enabled = false;
>> >> +}
>> >> +
>> >> +int
>> >> +__rte_pmu_enable_group(void)
>> >> +{
>> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>> >> +	int ret;
>> >> +
>> >> +	if (rte_pmu.num_group_events == 0)
>> >> +		return -ENODEV;
>> >> +
>> >> +	ret = open_events(group);
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	ret = mmap_events(group);
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> >> +		ret = -errno;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >> +	rte_spinlock_lock(&rte_pmu.lock);
>> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>> >
>> >Hmm.. so we insert pointer to TLS variable into the global list?
>> >Wonder what would happen if that thread get terminated?
>>
>> Nothing special. Any pointers to that thread-local in that thread are invalided.
>>
>> >Can memory from its TLS block get re-used (by other thread or for other purposes)?
>> >
>>
>> Why would any other thread reuse that?
>> Eventually main thread will need that data to do the cleanup.
>
>I understand that main thread would need to access that data.
>I am not sure that it would be able to.
>Imagine thread calls rte_pmu_read(...) and then terminates, while program continues to run.
>As I understand address of its RTE_PER_LCORE(_event_group) will still remain in
>rte_pmu.event_group_list, even if it is probably not valid any more.
>

Okay, got your point. In DPDK that will not happen. We do not spawn/kill lcores at runtime.
In other scenarios such an approach will not work, because once a thread terminates its
per-thread data becomes invalid.

>> >
>> >> +	rte_spinlock_unlock(&rte_pmu.lock);
>> >> +	group->enabled = true;
>> >> +
>> >> +	return 0;
>> >> +
>> >> +out:
>> >> +	cleanup_events(group);
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +scan_pmus(void)
>> >> +{
>> >> +	char path[PATH_MAX];
>> >> +	struct dirent *dent;
>> >> +	const char *name;
>> >> +	DIR *dirp;
>> >> +
>> >> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> >> +	if (dirp == NULL)
>> >> +		return -errno;
>> >> +
>> >> +	while ((dent = readdir(dirp))) {
>> >> +		name = dent->d_name;
>> >> +		if (name[0] == '.')
>> >> +			continue;
>> >> +
>> >> +		/* sysfs entry should either contain cpus or be a cpu */
>> >> +		if (!strcmp(name, "cpu"))
>> >> +			break;
>> >> +
>> >> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> >> +		if (access(path, F_OK) == 0)
>> >> +			break;
>> >> +	}
>> >> +
>> >> +	if (dent) {
>> >> +		rte_pmu.name = strdup(name);
>> >> +		if (rte_pmu.name == NULL) {
>> >> +			closedir(dirp);
>> >> +
>> >> +			return -ENOMEM;
>> >> +		}
>> >> +	}
>> >> +
>> >> +	closedir(dirp);
>> >> +
>> >> +	return rte_pmu.name ? 0 : -ENODEV; }
>> >> +
>> >> +static struct rte_pmu_event *
>> >> +new_event(const char *name)
>> >> +{
>> >> +	struct rte_pmu_event *event;
>> >> +
>> >> +	event = calloc(1, sizeof(*event));
>> >> +	if (event == NULL)
>> >> +		goto out;
>> >> +
>> >> +	event->name = strdup(name);
>> >> +	if (event->name == NULL) {
>> >> +		free(event);
>> >> +		event = NULL;
>> >> +	}
>> >> +
>> >> +out:
>> >> +	return event;
>> >> +}
>> >> +
>> >> +static void
>> >> +free_event(struct rte_pmu_event *event) {
>> >> +	free(event->name);
>> >> +	free(event);
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_add_event(const char *name) {
>> >> +	struct rte_pmu_event *event;
>> >> +	char path[PATH_MAX];
>> >> +
>> >> +	if (rte_pmu.name == NULL)
>> >> +		return -ENODEV;
>> >> +
>> >> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
>> >> +		return -ENOSPC;
>> >> +
>> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
>name);
>> >> +	if (access(path, R_OK))
>> >> +		return -ENODEV;
>> >> +
>> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
>> >> +		if (!strcmp(event->name, name))
>> >> +			return event->index;
>> >> +		continue;
>> >> +	}
>> >> +
>> >> +	event = new_event(name);
>> >> +	if (event == NULL)
>> >> +		return -ENOMEM;
>> >> +
>> >> +	event->index = rte_pmu.num_group_events++;
>> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
>> >> +
>> >> +	return event->index;
>> >> +}
>> >> +
>> >> +int
>> >> +rte_pmu_init(void)
>> >> +{
>> >> +	int ret;
>> >> +
>> >> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
>> >> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
>> >> +	 * via command line but application doesn't care enough and performs init/fini again.
>> >> +	 */
>> >> +	if (rte_pmu.initialized != 0) {
>> >> +		rte_pmu.initialized++;
>> >> +		return 0;
>> >> +	}
>> >> +
>> >> +	ret = scan_pmus();
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	ret = pmu_arch_init();
>> >> +	if (ret)
>> >> +		goto out;
>> >> +
>> >> +	TAILQ_INIT(&rte_pmu.event_list);
>> >> +	TAILQ_INIT(&rte_pmu.event_group_list);
>> >> +	rte_spinlock_init(&rte_pmu.lock);
>> >> +	rte_pmu.initialized = 1;
>> >> +
>> >> +	return 0;
>> >> +out:
>> >> +	free(rte_pmu.name);
>> >> +	rte_pmu.name = NULL;
>> >> +
>> >> +	return ret;
>> >> +}
>> >> +
>> >> +void
>> >> +rte_pmu_fini(void)
>> >> +{
>> >> +	struct rte_pmu_event_group *group, *tmp_group;
>> >> +	struct rte_pmu_event *event, *tmp_event;
>> >> +
>> >> +	/* cleanup once init count drops to zero */
>> >> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
>> >> +		return;
>> >> +
>> >> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
>> >> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
>> >> +		free_event(event);
>> >> +	}
>> >> +
>> >> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
>> >> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
>> >> +		cleanup_events(group);
>> >> +	}
>> >> +
>> >> +	pmu_arch_fini();
>> >> +	free(rte_pmu.name);
>> >> +	rte_pmu.name = NULL;
>> >> +	rte_pmu.num_group_events = 0;
>> >> +}
>> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
>> >> 100644 index 0000000000..6b664c3336
>> >> --- /dev/null
>> >> +++ b/lib/pmu/rte_pmu.h
>> >> @@ -0,0 +1,212 @@
>> >> +/* SPDX-License-Identifier: BSD-3-Clause
>> >> + * Copyright(c) 2023 Marvell
>> >> + */
>> >> +
>> >> +#ifndef _RTE_PMU_H_
>> >> +#define _RTE_PMU_H_
>> >> +
>> >> +/**
>> >> + * @file
>> >> + *
>> >> + * PMU event tracing operations
>> >> + *
>> >> + * This file defines generic API and types necessary to setup PMU
>> >> +and
>> >> + * read selected counters in runtime.
>> >> + */
>> >> +
>> >> +#ifdef __cplusplus
>> >> +extern "C" {
>> >> +#endif
>> >> +
>> >> +#include <linux/perf_event.h>
>> >> +
>> >> +#include <rte_atomic.h>
>> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
>> >> +#include <rte_compat.h> #include <rte_spinlock.h>
>> >> +
>> >> +/** Maximum number of events in a group */ #define
>> >> +MAX_NUM_GROUP_EVENTS 8
>> >
>> >forgot RTE_ prefix.
>> >In fact, do you really need number of events in group to be hard-coded?
>> >Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
>> >
>>
>> 8 is reasonable number I think. X86/ARM have actually less that that (was that something like
>4?).
>> Moreover events are scheduled as a group so there must be enough hw
>> counters available for that to succeed. So this number should cover current needs.
>
>If you think 8 will be enough to cover all possible future cases - I am ok either way.
>Still need RTE_ prefix for it.
>

Okay that can be added. 

>> >> +
>> >> +/**
>> >> + * A structure describing a group of events.
>> >> + */
>> >> +struct rte_pmu_event_group {
>> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages
>*/
>> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> >> +	bool enabled; /**< true if group was enabled on particular lcore */
>> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
>> >> +__rte_cache_aligned;
>> >> +
>> >
>> >Even if we'd decide to keep rte_pmu_read() as static inline (still
>> >not sure it is a good idea),
>>
>> We want to save as much cpu cycles as we possibly can and inlining
>> does helps in that matter.
>
>Ok, so asking same question from different thread: how many cycles it will save?
>What is the difference in terms of performance when you have this function inlined vs not inlined?
>

On x86 setup which is not under load, no cpusets configured, etc *just* not inlining rte_pmu_read() 
decreases performance by roughly 24% (44 vs 58 cpu cycles). At least that is reported by 
trace_perf_autotest. 


>> >why these two struct below (rte_pmu_event and rte_pmu) have to be public?
>> >I think both can be safely moved away from public headers.
>> >
>>
>> struct rte_pmu_event can be hidden I guess.
>> struct rte_pmu is used in this header hence cannot be moved elsewhere.
>
>Not sure why?
>Is that because you use it inside rte_pmu_read()?
>But that check I think can be safely moved into __rte_pmu_enable_group() or probably even into
>rte_pmu_add_event().

No, we should not do that. Otherwise we'd need to call a function. Even though the check would happen
early on, the function prologue/epilogue would still execute. This takes cycles.

>
>> >
>> >> +/**
>> >> + * A structure describing an event.
>> >> + */
>> >> +struct rte_pmu_event {
>> >> +	char *name; /**< name of an event */
>> >> +	unsigned int index; /**< event index into fds/mmap_pages */
>> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
>> >
>> >> +
>> >> +/**
>> >> + * A PMU state container.
>> >> + */
>> >> +struct rte_pmu {
>> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
>> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
>> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
>> >> +	unsigned int num_group_events; /**< number of events in a group */
>> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
>> >> +	unsigned int initialized; /**< initialization counter */ };
>> >> +
>> >> +/** lcore event group */
>> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
>> >> +
>> >> +/** PMU state container */
>> >> +extern struct rte_pmu rte_pmu;
>> >> +
>> >> +/** Each architecture supporting PMU needs to provide its own
>> >> +version */ #ifndef rte_pmu_pmc_read #define
>> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Read PMU counter.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >> + *
>> >> + * @param pc
>> >> + *   Pointer to the mmapped user page.
>> >> + * @return
>> >> + *   Counter value read from hardware.
>> >> + */
>> >> +static __rte_always_inline uint64_t __rte_pmu_read_userpage(struct
>> >> +perf_event_mmap_page *pc) {
>> >> +	uint64_t width, offset;
>> >> +	uint32_t seq, index;
>> >> +	int64_t pmc;
>> >> +
>> >> +	for (;;) {
>> >> +		seq = pc->lock;
>> >> +		rte_compiler_barrier();
>> >> +		index = pc->index;
>> >> +		offset = pc->offset;
>> >> +		width = pc->pmc_width;
>> >> +
>> >> +		/* index set to 0 means that particular counter cannot be used */
>> >> +		if (likely(pc->cap_user_rdpmc && index)) {
>> >
>> >In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
>> >Do you need another check here? Or this capability can be disabled by
>> >kernel at run-time?
>> >
>>
>> That extra check in mmap_event() may be removed actually. Some archs
>> allow disabling reading rdpmc (I think that on x86 one can do that) so this check needs to stay.
>>
>> >
>> >> +			pmc = rte_pmu_pmc_read(index - 1);
>> >> +			pmc <<= 64 - width;
>> >> +			pmc >>= 64 - width;
>> >> +			offset += pmc;
>> >> +		}
>> >> +
>> >> +		rte_compiler_barrier();
>> >> +
>> >> +		if (likely(pc->lock == seq))
>> >> +			return offset;
>> >> +	}
>> >> +
>> >> +	return 0;
>> >> +}
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Enable group of events on the calling lcore.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >
>> >__rte_internal ?
>> >
>>
>> No this cannot be internal because that will make functions calling it
>> internal as well hence apps won't be able to use that. This has
>> already been brought up by one of the reviewers.
>
>Ok, then we probably can mark it with ' @internal' tag in formal comments?
>

I added a warning not to call that directly. Since the function is not internal (in DPDK parlance) per se,
I don't think we should add more confusion with that extra tag.

>>
>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >> +__rte_experimental
>> >> +int
>> >> +__rte_pmu_enable_group(void);
>> >> +
>> >> +/**
>> >> + * @warning
>> >> + * @b EXPERIMENTAL: this API may change without prior notice
>> >> + *
>> >> + * Initialize PMU library.
>> >> + *
>> >> + * @warning This should be not called directly.
>> >
>> >Hmm.. then who should call it?
>> >If it not supposed to be called directly, why to declare it here?
>> >
>>
>> This is inlined and has one caller i.e rte_pmu_read().
>
>I thought we are talking here about rte_pmu_init().
>I don't see where it is inlined and still not clear why it can't be called directly.
>

No, this cannot be called by init because groups are configured at runtime. That is why
__rte_pmu_enable_group() is called once from rte_pmu_read().

*Other* code should not call that directly. And yes, that is not inlined - my mistake. 

>> >> + *
>> >> + * @return
>> >> + *   0 in case of success, negative value otherwise.
>> >> + */
>> >
>> >Probably worth to mention that this function is not MT safe.
>> >Same for _fini_ and add_event.
>> >Also worth to mention that all control-path functions
>> >(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.
>> >
>>
>> Yes they are meant to be called from main thread.
>
>Ok, then please add that into formal API comments.
>
>> >> +__rte_experimental
>> >> +int
>> >> +rte_pmu_init(void);
>> >> +
  
Konstantin Ananyev Feb. 28, 2023, 11:35 a.m. UTC | #17
> >>>>>>>>>> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file
> >>>>>>>>>> mode
> >>>>>>>>>> 100644 index 0000000000..6b664c3336
> >>>>>>>>>> --- /dev/null
> >>>>>>>>>> +++ b/lib/pmu/rte_pmu.h
> >>>>>>>>>> @@ -0,0 +1,212 @@
> >>>>>>>>>> +/* SPDX-License-Identifier: BSD-3-Clause
> >>>>>>>>>> + * Copyright(c) 2023 Marvell  */
> >>>>>>>>>> +
> >>>>>>>>>> +#ifndef _RTE_PMU_H_
> >>>>>>>>>> +#define _RTE_PMU_H_
> >>>>>>>>>> +
> >>>>>>>>>> +/**
> >>>>>>>>>> + * @file
> >>>>>>>>>> + *
> >>>>>>>>>> + * PMU event tracing operations
> >>>>>>>>>> + *
> >>>>>>>>>> + * This file defines generic API and types necessary to setup
> >>>>>>>>>> +PMU and
> >>>>>>>>>> + * read selected counters in runtime.
> >>>>>>>>>> + */
> >>>>>>>>>> +
> >>>>>>>>>> +#ifdef __cplusplus
> >>>>>>>>>> +extern "C" {
> >>>>>>>>>> +#endif
> >>>>>>>>>> +
> >>>>>>>>>> +#include <linux/perf_event.h>
> >>>>>>>>>> +
> >>>>>>>>>> +#include <rte_atomic.h>
> >>>>>>>>>> +#include <rte_branch_prediction.h> #include <rte_common.h>
> >>>>>>>>>> +#include <rte_compat.h> #include <rte_spinlock.h>
> >>>>>>>>>> +
> >>>>>>>>>> +/** Maximum number of events in a group */ #define
> >>>>>>>>>> +MAX_NUM_GROUP_EVENTS 8
> >>>>>>>>>> +
> >>>>>>>>>> +/**
> >>>>>>>>>> + * A structure describing a group of events.
> >>>>>>>>>> + */
> >>>>>>>>>> +struct rte_pmu_event_group {
> >>>>>>>>>> +	struct perf_event_mmap_page
> >>>>>>>>>> +*mmap_pages[MAX_NUM_GROUP_EVENTS];
> >>>>>>>>>> +/**< array of user pages
> >>>>>>> */
> >>>>>>>>>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >>>>>>>>>> +	bool enabled; /**< true if group was enabled on particular lcore */
> >>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
> >>>>>>>>>> +__rte_cache_aligned;
> >>>>>>>>>> +
> >>>>>>>>>> +/**
> >>>>>>>>>> + * A structure describing an event.
> >>>>>>>>>> + */
> >>>>>>>>>> +struct rte_pmu_event {
> >>>>>>>>>> +	char *name; /**< name of an event */
> >>>>>>>>>> +	unsigned int index; /**< event index into fds/mmap_pages */
> >>>>>>>>>> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
> >>>>>>>>>> +
> >>>>>>>>>> +/**
> >>>>>>>>>> + * A PMU state container.
> >>>>>>>>>> + */
> >>>>>>>>>> +struct rte_pmu {
> >>>>>>>>>> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >>>>>>>>>> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >>>>>>>>>> +	unsigned int num_group_events; /**< number of events in a group */
> >>>>>>>>>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >>>>>>>>>> +	unsigned int initialized; /**< initialization counter */ };
> >>>>>>>>>> +
> >>>>>>>>>> +/** lcore event group */
> >>>>>>>>>> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group,
> >>>>>>>>>> +_event_group);
> >>>>>>>>>> +
> >>>>>>>>>> +/** PMU state container */
> >>>>>>>>>> +extern struct rte_pmu rte_pmu;
> >>>>>>>>>> +
> >>>>>>>>>> +/** Each architecture supporting PMU needs to provide its own
> >>>>>>>>>> +version */ #ifndef rte_pmu_pmc_read #define
> >>>>>>>>>> +rte_pmu_pmc_read(index) ({ 0; }) #endif
> >>>>>>>>>> +
> >>>>>>>>>> +/**
> >>>>>>>>>> + * @warning
> >>>>>>>>>> + * @b EXPERIMENTAL: this API may change without prior notice
> >>>>>>>>>> + *
> >>>>>>>>>> + * Read PMU counter.
> >>>>>>>>>> + *
> >>>>>>>>>> + * @warning This should be not called directly.
> >>>>>>>>>> + *
> >>>>>>>>>> + * @param pc
> >>>>>>>>>> + *   Pointer to the mmapped user page.
> >>>>>>>>>> + * @return
> >>>>>>>>>> + *   Counter value read from hardware.
> >>>>>>>>>> + */
> >>>>>>>>>> +static __rte_always_inline uint64_t
> >>>>>>>>>> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
> >>>>>>>>>> +	uint64_t width, offset;
> >>>>>>>>>> +	uint32_t seq, index;
> >>>>>>>>>> +	int64_t pmc;
> >>>>>>>>>> +
> >>>>>>>>>> +	for (;;) {
> >>>>>>>>>> +		seq = pc->lock;
> >>>>>>>>>> +		rte_compiler_barrier();
> >>>>>>>>>
> >>>>>>>>> Are you sure that compiler_barrier() is enough here?
> >>>>>>>>> On some archs CPU itself has freedom to re-order reads.
> >>>>>>>>> Or I am missing something obvious here?
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> It's a matter of not keeping old stuff cached in registers and
> >>>>>>>> making sure that we have two reads of lock. CPU reordering won't
> >>>>>>>> do any harm here.
> >>>>>>>
> >>>>>>> Sorry, I didn't get you here:
> >>>>>>> Suppose CPU will re-order reads and will read lock *after* index or offset value.
> >>>>>>> Wouldn't it mean that in that case index and/or offset can contain old/invalid values?
> >>>>>>>
> >>>>>>
> >>>>>> This number is just an indicator whether kernel did change something or not.
> >>>>>
> >>>>> You are talking about pc->lock, right?
> >>>>> Yes, I do understand that it is sort of seqlock.
> >>>>> That's why I am puzzled why we do not care about possible cpu read-reordering.
> >>>>> Manual for perf_event_open() also has a code snippet with compiler barrier only...
> >>>>>
> >>>>>> If cpu reordering will come into play then this will not change
> >>>>>> anything from pov of this
> >>> loop.
> >>>>>> All we want is fresh data when needed and no involvement of
> >>>>>> compiler when it comes to reordering code.
> >>>>>
> >>>>> Ok, can you probably explain to me why the following could not happen:
> >>>>> T0:
> >>>>> pc->seqlock==0; pc->index==I1; pc->offset==O1;
> >>>>> T1:
> >>>>>       cpu #0 read pmu (due to cpu read reorder, we get index value before seqlock):
> >>>>>        index=pc->index;  //index==I1;
> >>>>> T2:
> >>>>>       cpu #1 kernel vent_update_userpage:
> >>>>>       pc->lock++; // pc->lock==1
> >>>>>       pc->index=I2;
> >>>>>       pc->offset=O2;
> >>>>>       ...
> >>>>>       pc->lock++; //pc->lock==2
> >>>>> T3:
> >>>>>       cpu #0 continue with read pmu:
> >>>>>       seq=pc->lock; //seq == 2
> >>>>>        offset=pc->offset; // offset == O2
> >>>>>        ....
> >>>>>        pmc = rte_pmu_pmc_read(index - 1);  // Note that we read at I1, not I2
> >>>>>        offset += pmc; //offset == O2 + pmcread(I1-1);
> >>>>>        if (pc->lock == seq) // they are equal, return
> >>>>>              return offset;
> >>>>>
> >>>>> Or, it can happen, but by some reason we don't care much?
> >>>>>
> >>>>
> >>>> This code does self-monitoring and user page (whole group actually)
> >>>> is per thread running on current cpu. Hence I am not sure what are
> >>>> you trying to prove with that
> >>> example.
> >>>
> >>> I am not trying to prove anything so far.
> >>> I am asking is such situation possible or not, and if not, why?
> >>> My current understanding (possibly wrong) is that after you mmaped
> >>> these pages, kernel still can asynchronously update them.
> >>> So, when reading the data from these pages you have to check 'lock'
> >>> value before and after accessing other data.
> >>> If so, why possible cpu read-reordering doesn't matter?
> >>>
> >>
> >> Look. I'll reiterate that.
> >>
> >> 1. That user page/group/PMU config is per process. Other processes do not access that.
> >
> >Ok, that's clear.
> >
> >
> >>     All this happens on the very same CPU where current thread is running.
> >
> >Ok... but can't this page be updated by kernel thread running simultaneously on different CPU?
> >
> 
> I already pointed out that event/counter configuration is bound to the current cpu. How could another
> cpu possibly update that configuration? This cannot work.
 
Can you elaborate a bit on what you mean by 'event/counter configuration is bound to current cpu'?
If that means it can be updated only by code running on a given CPU - yes, that is clear.
But can this page be read by user-space from a different CPU?
Or do you just assume that your user-space thread will *always* be bound to one
particular CPU and will never switch?

> 
> 
> If you think that there's some problem with the code (or it is simply broken on your setup) and the logic
> has an obvious flaw, and you can provide meaningful evidence of that, then I'd be more than happy to
> apply that fix. Otherwise this discussion will get us nowhere.
> 

Yes, we are going in circles here.
I keep asking you the same questions about library function internals, and you keep refusing to explain things
to me, insisting that it is 'way too obvious'.
Well, sorry but it is not obvious to me.
So I still insist that a clearly documented internal design and expected usage is required for that patch
before it can be accepted.

Konstantin
  
Konstantin Ananyev Feb. 28, 2023, 11:58 a.m. UTC | #18
> >> >> Add support for programming PMU counters and reading their values
> >> >> in runtime bypassing kernel completely.
> >> >>
> >> >> This is especially useful in cases where CPU cores are isolated i.e
> >> >> run dedicated tasks. In such cases one cannot use standard perf
> >> >> utility without sacrificing latency and performance.
> >> >>
> >> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> >> >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> >> >
> >> >Few more comments/questions below.
> >> >
> >> >
> >> >> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c new file mode
> >> >> 100644 index 0000000000..950f999cb7
> >> >> --- /dev/null
> >> >> +++ b/lib/pmu/rte_pmu.c
> >> >> @@ -0,0 +1,460 @@
> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> >> + * Copyright(C) 2023 Marvell International Ltd.
> >> >> + */
> >> >> +
> >> >> +#include <ctype.h>
> >> >> +#include <dirent.h>
> >> >> +#include <errno.h>
> >> >> +#include <regex.h>
> >> >> +#include <stdlib.h>
> >> >> +#include <string.h>
> >> >> +#include <sys/ioctl.h>
> >> >> +#include <sys/mman.h>
> >> >> +#include <sys/queue.h>
> >> >> +#include <sys/syscall.h>
> >> >> +#include <unistd.h>
> >> >> +
> >> >> +#include <rte_atomic.h>
> >> >> +#include <rte_per_lcore.h>
> >> >> +#include <rte_pmu.h>
> >> >> +#include <rte_spinlock.h>
> >> >> +#include <rte_tailq.h>
> >> >> +
> >> >> +#include "pmu_private.h"
> >> >> +
> >> >> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> >> >> +
> >> >> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
> >> >> +((64 - 1 - (h))))) #define FIELD_PREP(m, v) (((uint64_t)(v) <<
> >> >> +(__builtin_ffsll(m) - 1)) & (m))
> >> >> +
> >> >> +RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> >> +struct rte_pmu rte_pmu;
> >> >> +
> >> >> +/*
> >> >> + * Following __rte_weak functions provide default no-op.
> >> >> +Architectures should override them if
> >> >> + * necessary.
> >> >> + */
> >> >> +
> >> >> +int
> >> >> +__rte_weak pmu_arch_init(void)
> >> >> +{
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +void
> >> >> +__rte_weak pmu_arch_fini(void)
> >> >> +{
> >> >> +}
> >> >> +
> >> >> +void
> >> >> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> >> >> +{ }
> >> >> +
> >> >> +static int
> >> >> +get_term_format(const char *name, int *num, uint64_t *mask) {
> >> >> +	char path[PATH_MAX];
> >> >> +	char *config = NULL;
> >> >> +	int high, low, ret;
> >> >> +	FILE *fp;
> >> >> +
> >> >> +	*num = *mask = 0;
> >> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name,
> >name);
> >> >> +	fp = fopen(path, "r");
> >> >> +	if (fp == NULL)
> >> >> +		return -errno;
> >> >> +
> >> >> +	errno = 0;
> >> >> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> >> >> +	if (ret < 2) {
> >> >> +		ret = -ENODATA;
> >> >> +		goto out;
> >> >> +	}
> >> >> +	if (errno) {
> >> >> +		ret = -errno;
> >> >> +		goto out;
> >> >> +	}
> >> >> +
> >> >> +	if (ret == 2)
> >> >> +		high = low;
> >> >> +
> >> >> +	*mask = GENMASK_ULL(high, low);
> >> >> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> >> >> +	*num = config[strlen(config) - 1];
> >> >> +	*num = isdigit(*num) ? *num - '0' : 0;
> >> >> +
> >> >> +	ret = 0;
> >> >> +out:
> >> >> +	free(config);
> >> >> +	fclose(fp);
> >> >> +
> >> >> +	return ret;
> >> >> +}
> >> >> +
> >> >> +static int
> >> >> +parse_event(char *buf, uint64_t config[3]) {
> >> >> +	char *token, *term;
> >> >> +	int num, ret, val;
> >> >> +	uint64_t mask;
> >> >> +
> >> >> +	config[0] = config[1] = config[2] = 0;
> >> >> +
> >> >> +	token = strtok(buf, ",");
> >> >> +	while (token) {
> >> >> +		errno = 0;
> >> >> +		/* <term>=<value> */
> >> >> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> >> >> +		if (ret < 1)
> >> >> +			return -ENODATA;
> >> >> +		if (errno)
> >> >> +			return -errno;
> >> >> +		if (ret == 1)
> >> >> +			val = 1;
> >> >> +
> >> >> +		ret = get_term_format(term, &num, &mask);
> >> >> +		free(term);
> >> >> +		if (ret)
> >> >> +			return ret;
> >> >> +
> >> >> +		config[num] |= FIELD_PREP(mask, val);
> >> >> +		token = strtok(NULL, ",");
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +static int
> >> >> +get_event_config(const char *name, uint64_t config[3]) {
> >> >> +	char path[PATH_MAX], buf[BUFSIZ];
> >> >> +	FILE *fp;
> >> >> +	int ret;
> >> >> +
> >> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
> >name);
> >> >> +	fp = fopen(path, "r");
> >> >> +	if (fp == NULL)
> >> >> +		return -errno;
> >> >> +
> >> >> +	ret = fread(buf, 1, sizeof(buf), fp);
> >> >> +	if (ret == 0) {
> >> >> +		fclose(fp);
> >> >> +
> >> >> +		return -EINVAL;
> >> >> +	}
> >> >> +	fclose(fp);
> >> >> +	buf[ret] = '\0';
> >> >> +
> >> >> +	return parse_event(buf, config);
> >> >> +}
> >> >> +
> >> >> +static int
> >> >> +do_perf_event_open(uint64_t config[3], int group_fd) {
> >> >> +	struct perf_event_attr attr = {
> >> >> +		.size = sizeof(struct perf_event_attr),
> >> >> +		.type = PERF_TYPE_RAW,
> >> >> +		.exclude_kernel = 1,
> >> >> +		.exclude_hv = 1,
> >> >> +		.disabled = 1,
> >> >> +	};
> >> >> +
> >> >> +	pmu_arch_fixup_config(config);
> >> >> +
> >> >> +	attr.config = config[0];
> >> >> +	attr.config1 = config[1];
> >> >> +	attr.config2 = config[2];
> >> >> +
> >> >> +	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0); }
> >> >> +
> >> >> +static int
> >> >> +open_events(struct rte_pmu_event_group *group) {
> >> >> +	struct rte_pmu_event *event;
> >> >> +	uint64_t config[3];
> >> >> +	int num = 0, ret;
> >> >> +
> >> >> +	/* group leader gets created first, with fd = -1 */
> >> >> +	group->fds[0] = -1;
> >> >> +
> >> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> >> +		ret = get_event_config(event->name, config);
> >> >> +		if (ret)
> >> >> +			continue;
> >> >> +
> >> >> +		ret = do_perf_event_open(config, group->fds[0]);
> >> >> +		if (ret == -1) {
> >> >> +			ret = -errno;
> >> >> +			goto out;
> >> >> +		}
> >> >> +
> >> >> +		group->fds[event->index] = ret;
> >> >> +		num++;
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +out:
> >> >> +	for (--num; num >= 0; num--) {
> >> >> +		close(group->fds[num]);
> >> >> +		group->fds[num] = -1;
> >> >> +	}
> >> >> +
> >> >> +
> >> >> +	return ret;
> >> >> +}
> >> >> +
> >> >> +static int
> >> >> +mmap_events(struct rte_pmu_event_group *group) {
> >> >> +	long page_size = sysconf(_SC_PAGE_SIZE);
> >> >> +	unsigned int i;
> >> >> +	void *addr;
> >> >> +	int ret;
> >> >> +
> >> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> >> +		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> >> >> +		if (addr == MAP_FAILED) {
> >> >> +			ret = -errno;
> >> >> +			goto out;
> >> >> +		}
> >> >> +
> >> >> +		group->mmap_pages[i] = addr;
> >> >> +		if (!group->mmap_pages[i]->cap_user_rdpmc) {
> >> >> +			ret = -EPERM;
> >> >> +			goto out;
> >> >> +		}
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +out:
> >> >> +	for (; i; i--) {
> >> >> +		munmap(group->mmap_pages[i - 1], page_size);
> >> >> +		group->mmap_pages[i - 1] = NULL;
> >> >> +	}
> >> >> +
> >> >> +	return ret;
> >> >> +}
> >> >> +
> >> >> +static void
> >> >> +cleanup_events(struct rte_pmu_event_group *group) {
> >> >> +	unsigned int i;
> >> >> +
> >> >> +	if (group->fds[0] != -1)
> >> >> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE,
> >> >> +PERF_IOC_FLAG_GROUP);
> >> >> +
> >> >> +	for (i = 0; i < rte_pmu.num_group_events; i++) {
> >> >> +		if (group->mmap_pages[i]) {
> >> >> +			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> >> >> +			group->mmap_pages[i] = NULL;
> >> >> +		}
> >> >> +
> >> >> +		if (group->fds[i] != -1) {
> >> >> +			close(group->fds[i]);
> >> >> +			group->fds[i] = -1;
> >> >> +		}
> >> >> +	}
> >> >> +
> >> >> +	group->enabled = false;
> >> >> +}
> >> >> +
> >> >> +int
> >> >> +__rte_pmu_enable_group(void)
> >> >> +{
> >> >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> >> >> +	int ret;
> >> >> +
> >> >> +	if (rte_pmu.num_group_events == 0)
> >> >> +		return -ENODEV;
> >> >> +
> >> >> +	ret = open_events(group);
> >> >> +	if (ret)
> >> >> +		goto out;
> >> >> +
> >> >> +	ret = mmap_events(group);
> >> >> +	if (ret)
> >> >> +		goto out;
> >> >> +
> >> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> >> >> +		ret = -errno;
> >> >> +		goto out;
> >> >> +	}
> >> >> +
> >> >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> >> >> +		ret = -errno;
> >> >> +		goto out;
> >> >> +	}
> >> >> +
> >> >> +	rte_spinlock_lock(&rte_pmu.lock);
> >> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> >> >
> >> >Hmm.. so we insert pointer to TLS variable into the global list?
> >> >Wonder what would happen if that thread get terminated?
> >>
> >> Nothing special. Any pointers to that thread-local in that thread are invalided.
> >>
> >> >Can memory from its TLS block get re-used (by other thread or for other purposes)?
> >> >
> >>
> >> Why would any other thread reuse that?
> >> Eventually main thread will need that data to do the cleanup.
> >
> >I understand that main thread would need to access that data.
> >I am not sure that it would be able to.
> >Imagine thread calls rte_pmu_read(...) and then terminates, while program continues to run.
> >As I understand address of its RTE_PER_LCORE(_event_group) will still remain in
> >rte_pmu.event_group_list, even if it is probably not valid any more.
> >
> 
> Okay, got your point. In DPDK that will not happen. We do not spawn/kill lcores at runtime.

Well, yes usually DPDK app doesn't do that, but in theory there is an API to register/unregister
non-eal threads as lcores: rte_thread_register()/rte_thread_unregister().
Also besides of lcores there are control threads, some house-keeping threads, plus user is free
to spawn/kill his own threads.
Are you saying that this library doesn't support any of them?
If so, then at least that should be very clearly documented.
Though I think the proper way is to handle this situation somehow -
either return an error from __rte_pmu_enable_group(), or change the code to allow it to work
properly from any thread. I don't think it is that hard.

> In other scenarios such an approach will not work because once a thread terminates its per-thread data
> becomes invalid.
> 
> >> >
> >> >> +	rte_spinlock_unlock(&rte_pmu.lock);
> >> >> +	group->enabled = true;
> >> >> +
> >> >> +	return 0;
> >> >> +
> >> >> +out:
> >> >> +	cleanup_events(group);
> >> >> +
> >> >> +	return ret;
> >> >> +}
> >> >> +
> >> >> +static int
> >> >> +scan_pmus(void)
> >> >> +{
> >> >> +	char path[PATH_MAX];
> >> >> +	struct dirent *dent;
> >> >> +	const char *name;
> >> >> +	DIR *dirp;
> >> >> +
> >> >> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> >> >> +	if (dirp == NULL)
> >> >> +		return -errno;
> >> >> +
> >> >> +	while ((dent = readdir(dirp))) {
> >> >> +		name = dent->d_name;
> >> >> +		if (name[0] == '.')
> >> >> +			continue;
> >> >> +
> >> >> +		/* sysfs entry should either contain cpus or be a cpu */
> >> >> +		if (!strcmp(name, "cpu"))
> >> >> +			break;
> >> >> +
> >> >> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> >> >> +		if (access(path, F_OK) == 0)
> >> >> +			break;
> >> >> +	}
> >> >> +
> >> >> +	if (dent) {
> >> >> +		rte_pmu.name = strdup(name);
> >> >> +		if (rte_pmu.name == NULL) {
> >> >> +			closedir(dirp);
> >> >> +
> >> >> +			return -ENOMEM;
> >> >> +		}
> >> >> +	}
> >> >> +
> >> >> +	closedir(dirp);
> >> >> +
> >> >> +	return rte_pmu.name ? 0 : -ENODEV; }
> >> >> +
> >> >> +static struct rte_pmu_event *
> >> >> +new_event(const char *name)
> >> >> +{
> >> >> +	struct rte_pmu_event *event;
> >> >> +
> >> >> +	event = calloc(1, sizeof(*event));
> >> >> +	if (event == NULL)
> >> >> +		goto out;
> >> >> +
> >> >> +	event->name = strdup(name);
> >> >> +	if (event->name == NULL) {
> >> >> +		free(event);
> >> >> +		event = NULL;
> >> >> +	}
> >> >> +
> >> >> +out:
> >> >> +	return event;
> >> >> +}
> >> >> +
> >> >> +static void
> >> >> +free_event(struct rte_pmu_event *event) {
> >> >> +	free(event->name);
> >> >> +	free(event);
> >> >> +}
> >> >> +
> >> >> +int
> >> >> +rte_pmu_add_event(const char *name) {
> >> >> +	struct rte_pmu_event *event;
> >> >> +	char path[PATH_MAX];
> >> >> +
> >> >> +	if (rte_pmu.name == NULL)
> >> >> +		return -ENODEV;
> >> >> +
> >> >> +	if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
> >> >> +		return -ENOSPC;
> >> >> +
> >> >> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name,
> >name);
> >> >> +	if (access(path, R_OK))
> >> >> +		return -ENODEV;
> >> >> +
> >> >> +	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> >> >> +		if (!strcmp(event->name, name))
> >> >> +			return event->index;
> >> >> +		continue;
> >> >> +	}
> >> >> +
> >> >> +	event = new_event(name);
> >> >> +	if (event == NULL)
> >> >> +		return -ENOMEM;
> >> >> +
> >> >> +	event->index = rte_pmu.num_group_events++;
> >> >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> >> >> +
> >> >> +	return event->index;
> >> >> +}
> >> >> +
> >> >> +int
> >> >> +rte_pmu_init(void)
> >> >> +{
> >> >> +	int ret;
> >> >> +
> >> >> +	/* Allow calling init from multiple contexts within a single thread. This simplifies
> >> >> +	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
> >> >> +	 * via command line but application doesn't care enough and performs init/fini again.
> >> >> +	 */
> >> >> +	if (rte_pmu.initialized != 0) {
> >> >> +		rte_pmu.initialized++;
> >> >> +		return 0;
> >> >> +	}
> >> >> +
> >> >> +	ret = scan_pmus();
> >> >> +	if (ret)
> >> >> +		goto out;
> >> >> +
> >> >> +	ret = pmu_arch_init();
> >> >> +	if (ret)
> >> >> +		goto out;
> >> >> +
> >> >> +	TAILQ_INIT(&rte_pmu.event_list);
> >> >> +	TAILQ_INIT(&rte_pmu.event_group_list);
> >> >> +	rte_spinlock_init(&rte_pmu.lock);
> >> >> +	rte_pmu.initialized = 1;
> >> >> +
> >> >> +	return 0;
> >> >> +out:
> >> >> +	free(rte_pmu.name);
> >> >> +	rte_pmu.name = NULL;
> >> >> +
> >> >> +	return ret;
> >> >> +}
> >> >> +
> >> >> +void
> >> >> +rte_pmu_fini(void)
> >> >> +{
> >> >> +	struct rte_pmu_event_group *group, *tmp_group;
> >> >> +	struct rte_pmu_event *event, *tmp_event;
> >> >> +
> >> >> +	/* cleanup once init count drops to zero */
> >> >> +	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
> >> >> +		return;
> >> >> +
> >> >> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> >> >> +		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> >> >> +		free_event(event);
> >> >> +	}
> >> >> +
> >> >> +	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
> >> >> +		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
> >> >> +		cleanup_events(group);
> >> >> +	}
> >> >> +
> >> >> +	pmu_arch_fini();
> >> >> +	free(rte_pmu.name);
> >> >> +	rte_pmu.name = NULL;
> >> >> +	rte_pmu.num_group_events = 0;
> >> >> +}
> >> >> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h new file mode
> >> >> 100644 index 0000000000..6b664c3336
> >> >> --- /dev/null
> >> >> +++ b/lib/pmu/rte_pmu.h
> >> >> @@ -0,0 +1,212 @@
> >> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> >> + * Copyright(c) 2023 Marvell
> >> >> + */
> >> >> +
> >> >> +#ifndef _RTE_PMU_H_
> >> >> +#define _RTE_PMU_H_
> >> >> +
> >> >> +/**
> >> >> + * @file
> >> >> + *
> >> >> + * PMU event tracing operations
> >> >> + *
> >> >> + * This file defines generic API and types necessary to setup PMU
> >> >> +and
> >> >> + * read selected counters in runtime.
> >> >> + */
> >> >> +
> >> >> +#ifdef __cplusplus
> >> >> +extern "C" {
> >> >> +#endif
> >> >> +
> >> >> +#include <linux/perf_event.h>
> >> >> +
> >> >> +#include <rte_atomic.h>
> >> >> +#include <rte_branch_prediction.h> #include <rte_common.h>
> >> >> +#include <rte_compat.h> #include <rte_spinlock.h>
> >> >> +
> >> >> +/** Maximum number of events in a group */ #define
> >> >> +MAX_NUM_GROUP_EVENTS 8
> >> >
> >> >forgot RTE_ prefix.
> >> >In fact, do you really need number of events in group to be hard-coded?
> >> >Couldn't mmap_pages[] and fds[] be allocated dynamically by enable_group()?
> >> >
> >>
> >> 8 is reasonable number I think. X86/ARM have actually less that that (was that something like
> >4?).
> >> Moreover events are scheduled as a group so there must be enough hw
> >> counters available for that to succeed. So this number should cover current needs.
> >
> >If you think 8 will be enough to cover all possible future cases - I am ok either way.
> >Still need RTE_ prefix for it.
> >
> 
> Okay that can be added.
> 
> >> >> +
> >> >> +/**
> >> >> + * A structure describing a group of events.
> >> >> + */
> >> >> +struct rte_pmu_event_group {
> >> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages
> >*/
> >> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> >> +	bool enabled; /**< true if group was enabled on particular lcore */
> >> >> +	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */ }
> >> >> +__rte_cache_aligned;
> >> >> +
> >> >
> >> >Even if we'd decide to keep rte_pmu_read() as static inline (still
> >> >not sure it is a good idea),
> >>
> >> We want to save as much cpu cycles as we possibly can and inlining
> >> does helps in that matter.
> >
> >Ok, so asking same question from different thread: how many cycles it will save?
> >What is the difference in terms of performance when you have this function inlined vs not inlined?
> >
> 
> On x86 setup which is not under load, no cpusets configured, etc *just* not inlining rte_pmu_read()
> decreases performance by roughly 24% (44 vs 58 cpu cycles). At least that is reported by
> trace_perf_autotest.

From my perspective 14 cycles is not that much...
Considering that the user will probably not call it very often, and that by enabling
measurements he is probably already prepared to take some performance hit. 

> 
> >> >why these two struct below (rte_pmu_event and rte_pmu) have to be public?
> >> >I think both can be safely moved away from public headers.
> >> >
> >>
> >> struct rte_pmu_event can be hidden I guess.
> >> struct rte_pmu is used in this header hence cannot be moved elsewhere.
> >
> >Not sure why?
> >Is that because you use it inside rte_pmu_read()?
> >But that check I think can be safely moved into __rte_pmu_enable_group() or probably even into
> >rte_pmu_add_event().
> 
> No, we should not do that. Otherwise we'll need to call function. Even though check will happen
> early on still function prologue/epilogue will happen. This takes cycles.

Not necessarily. You can store this value in pmu_group itself,
and use that value to decide whether the pmu and the group are initialized, etc.   
 
> >
> >> >
> >> >> +/**
> >> >> + * A structure describing an event.
> >> >> + */
> >> >> +struct rte_pmu_event {
> >> >> +	char *name; /**< name of an event */
> >> >> +	unsigned int index; /**< event index into fds/mmap_pages */
> >> >> +	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */ };
> >> >
> >> >> +
> >> >> +/**
> >> >> + * A PMU state container.
> >> >> + */
> >> >> +struct rte_pmu {
> >> >> +	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> >> >> +	rte_spinlock_t lock; /**< serialize access to event group list */
> >> >> +	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
> >> >> +	unsigned int num_group_events; /**< number of events in a group */
> >> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> >> >> +	unsigned int initialized; /**< initialization counter */ };
> >> >> +
> >> >> +/** lcore event group */
> >> >> +RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
> >> >> +
> >> >> +/** PMU state container */
> >> >> +extern struct rte_pmu rte_pmu;
> >> >> +
> >> >> +/** Each architecture supporting PMU needs to provide its own
> >> >> +version */ #ifndef rte_pmu_pmc_read #define
> >> >> +rte_pmu_pmc_read(index) ({ 0; }) #endif
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Read PMU counter.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >> + *
> >> >> + * @param pc
> >> >> + *   Pointer to the mmapped user page.
> >> >> + * @return
> >> >> + *   Counter value read from hardware.
> >> >> + */
> >> >> +static __rte_always_inline uint64_t __rte_pmu_read_userpage(struct
> >> >> +perf_event_mmap_page *pc) {
> >> >> +	uint64_t width, offset;
> >> >> +	uint32_t seq, index;
> >> >> +	int64_t pmc;
> >> >> +
> >> >> +	for (;;) {
> >> >> +		seq = pc->lock;
> >> >> +		rte_compiler_barrier();
> >> >> +		index = pc->index;
> >> >> +		offset = pc->offset;
> >> >> +		width = pc->pmc_width;
> >> >> +
> >> >> +		/* index set to 0 means that particular counter cannot be used */
> >> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> >
> >> >In mmap_events() you return EPERM if cap_user_rdpmc is not enabled.
> >> >Do you need another check here? Or this capability can be disabled by
> >> >kernel at run-time?
> >> >
> >>
> >> That extra check in mmap_event() may be removed actually. Some archs
> >> allow disabling reading rdpmc (I think that on x86 one can do that) so this check needs to stay.
> >>
> >> >
> >> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> >> +			pmc <<= 64 - width;
> >> >> +			pmc >>= 64 - width;
> >> >> +			offset += pmc;
> >> >> +		}
> >> >> +
> >> >> +		rte_compiler_barrier();
> >> >> +
> >> >> +		if (likely(pc->lock == seq))
> >> >> +			return offset;
> >> >> +	}
> >> >> +
> >> >> +	return 0;
> >> >> +}
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Enable group of events on the calling lcore.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >
> >> >__rte_internal ?
> >> >
> >>
> >> No this cannot be internal because that will make functions calling it
> >> internal as well hence apps won't be able to use that. This has
> >> already been brought up by one of the reviewers.
> >
> >Ok, then we probably can mark it with ' @internal' tag in formal comments?
> >
> 
> I added a warning not to call that directly. Since function is not internal (in DPDK parlance) per se
> I don’t think we should add more confusion that extra tag.

We are doing it in other places, so why not add it here as well? 
 
> >>
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >> +__rte_experimental
> >> >> +int
> >> >> +__rte_pmu_enable_group(void);
> >> >> +
> >> >> +/**
> >> >> + * @warning
> >> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> >> + *
> >> >> + * Initialize PMU library.
> >> >> + *
> >> >> + * @warning This should be not called directly.
> >> >
> >> >Hmm.. then who should call it?
> >> >If it not supposed to be called directly, why to declare it here?
> >> >
> >>
> >> This is inlined and has one caller i.e rte_pmu_read().
> >
> >I thought we are talking here about rte_pmu_init().
> >I don't see where it is inlined and still not clear why it can't be called directly.
> >
> 
> No this cannot be called by init because groups are configured in runtime. That is why
> __rte_pmu_enable_group() is called once in rte_pmu_read().
> 
> *Other* code should not call that directly. And yes, that is not inlined - my mistake.

Once again: we are discussing the comments for the rte_pmu_init() function.
Why can't it be called directly?
In test_pmu_read() you do call it directly.
 
> >> >> + *
> >> >> + * @return
> >> >> + *   0 in case of success, negative value otherwise.
> >> >> + */
> >> >
> >> >Probably worth to mention that this function is not MT safe.
> >> >Same for _fini_ and add_event.
> >> >Also worth to mention that all control-path functions
> >> >(init/fini/add_event) and data-path (pmu_read) can't be called concurrently.
> >> >
> >>
> >> Yes they are meant to be called from main thread.
> >
> >Ok, then please add that into formal API comments.
> >
> >> >> +__rte_experimental
> >> >> +int
> >> >> +rte_pmu_init(void);
> >> >> +
  
Konstantin Ananyev Feb. 28, 2023, 12:04 p.m. UTC | #19
> > > >> Add support for programming PMU counters and reading their values in
> > > >> runtime bypassing kernel completely.
> > > >>
> > > >> This is especially useful in cases where CPU cores are isolated i.e
> > > >> run dedicated tasks. In such cases one cannot use standard perf
> > > >> utility without sacrificing latency and performance.
> > > >>
> > > >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> > > >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > >
> 
> [...]
> 
> > > >> +int
> > > >> +__rte_pmu_enable_group(void)
> > > >> +{
> > > >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> > > >> +	int ret;
> > > >> +
> > > >> +	if (rte_pmu.num_group_events == 0)
> > > >> +		return -ENODEV;
> > > >> +
> > > >> +	ret = open_events(group);
> > > >> +	if (ret)
> > > >> +		goto out;
> > > >> +
> > > >> +	ret = mmap_events(group);
> > > >> +	if (ret)
> > > >> +		goto out;
> > > >> +
> > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -
> > 1) {
> > > >> +		ret = -errno;
> > > >> +		goto out;
> > > >> +	}
> > > >> +
> > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) ==
> > -1) {
> > > >> +		ret = -errno;
> > > >> +		goto out;
> > > >> +	}
> > > >> +
> > > >> +	rte_spinlock_lock(&rte_pmu.lock);
> > > >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> > > >
> > > >Hmm.. so we insert pointer to TLS variable into the global list?
> > > >Wonder what would happen if that thread get terminated?
> > >
> > > Nothing special. Any pointers to that thread-local in that thread are
> > invalided.
> > >
> > > >Can memory from its TLS block get re-used (by other thread or for other
> > purposes)?
> > > >
> > >
> > > Why would any other thread reuse that?
> > > Eventually main thread will need that data to do the cleanup.
> >
> > I understand that main thread would need to access that data.
> > I am not sure that it would be able to.
> > Imagine thread calls rte_pmu_read(...) and then terminates, while program
> > continues to run.
> 
> Is the example you describe here (i.e. a thread terminating in the middle of doing something) really a scenario DPDK is supposed to
> support?

I am not talking about some abnormal termination.
We do have the ability to spawn control threads, and the user can spawn his own threads; all these
threads can have a limited life-time.
Not to mention rte_thread_register()/rte_thread_unregister().
 
> > As I understand address of its RTE_PER_LCORE(_event_group) will still remain
> > in rte_pmu.event_group_list,
> > even if it is probably not valid any more.
> 
> There should be a "destructor/done/finish" function available to remove this from the list.
> 
> [...]
> 
> > > >Even if we'd decide to keep rte_pmu_read() as static inline (still not
> > > >sure it is a good idea),
> > >
> > > We want to save as much cpu cycles as we possibly can and inlining does
> > helps
> > > in that matter.
> >
> > Ok, so asking same question from different thread: how many cycles it will
> > save?
> > What is the difference in terms of performance when you have this function
> > inlined vs not inlined?
> 
> We expect to use this in our in-house profiler library. For this reason, I have a very strong preference for absolute maximum
> performance.
> 
> Reading PMU events is for performance profiling, so I expect other potential users of the PMU library to share my opinion on this.

Well, from my perspective 14 cycles are not that much...
Though yes, it would be good to hear more opinions here.
  
Morten Brørup Feb. 28, 2023, 1:15 p.m. UTC | #20
> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Tuesday, 28 February 2023 13.05
> 
> > > > >> Add support for programming PMU counters and reading their values in
> > > > >> runtime bypassing kernel completely.
> > > > >>
> > > > >> This is especially useful in cases where CPU cores are isolated i.e
> > > > >> run dedicated tasks. In such cases one cannot use standard perf
> > > > >> utility without sacrificing latency and performance.
> > > > >>
> > > > >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> > > > >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > >
> >
> > [...]
> >
> > > > >> +int
> > > > >> +__rte_pmu_enable_group(void)
> > > > >> +{
> > > > >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> > > > >> +	int ret;
> > > > >> +
> > > > >> +	if (rte_pmu.num_group_events == 0)
> > > > >> +		return -ENODEV;
> > > > >> +
> > > > >> +	ret = open_events(group);
> > > > >> +	if (ret)
> > > > >> +		goto out;
> > > > >> +
> > > > >> +	ret = mmap_events(group);
> > > > >> +	if (ret)
> > > > >> +		goto out;
> > > > >> +
> > > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET,
> PERF_IOC_FLAG_GROUP) == -
> > > 1) {
> > > > >> +		ret = -errno;
> > > > >> +		goto out;
> > > > >> +	}
> > > > >> +
> > > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE,
> PERF_IOC_FLAG_GROUP) ==
> > > -1) {
> > > > >> +		ret = -errno;
> > > > >> +		goto out;
> > > > >> +	}
> > > > >> +
> > > > >> +	rte_spinlock_lock(&rte_pmu.lock);
> > > > >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> > > > >
> > > > >Hmm.. so we insert pointer to TLS variable into the global list?
> > > > >Wonder what would happen if that thread get terminated?
> > > >
> > > > Nothing special. Any pointers to that thread-local in that thread are
> > > invalided.
> > > >
> > > > >Can memory from its TLS block get re-used (by other thread or for other
> > > purposes)?
> > > > >
> > > >
> > > > Why would any other thread reuse that?
> > > > Eventually main thread will need that data to do the cleanup.
> > >
> > > I understand that main thread would need to access that data.
> > > I am not sure that it would be able to.
> > > Imagine thread calls rte_pmu_read(...) and then terminates, while program
> > > continues to run.
> >
> > Is the example you describe here (i.e. a thread terminating in the middle of
> doing something) really a scenario DPDK is supposed to
> > support?
> 
> I am not talking about some abnormal termination.

Then I misunderstood your example; I thought you meant the thread was terminated while inside the rte_pmu_read() function.

> We do have ability to spawn control threads, user can spawn his own thread,
> all these
> threads can have limited life-time.
> Not to mention about  rte_thread_register()/rte_thread_unregister().
> 

I agree that normal thread termination should be supported.

> > > As I understand address of its RTE_PER_LCORE(_event_group) will still
> remain
> > > in rte_pmu.event_group_list,
> > > even if it is probably not valid any more.
> >
> > There should be a "destructor/done/finish" function available to remove this
> from the list.
> >
> > [...]
> >
> > > > >Even if we'd decide to keep rte_pmu_read() as static inline (still not
> > > > >sure it is a good idea),
> > > >
> > > > We want to save as much cpu cycles as we possibly can and inlining does
> > > helps
> > > > in that matter.
> > >
> > > Ok, so asking same question from different thread: how many cycles it will
> > > save?
> > > What is the difference in terms of performance when you have this function
> > > inlined vs not inlined?
> >
> > We expect to use this in our in-house profiler library. For this reason, I
> have a very strong preference for absolute maximum
> > performance.
> >
> > Reading PMU events is for performance profiling, so I expect other potential
> users of the PMU library to share my opinion on this.
> 
> Well, from my perspective 14 cycles are not that much...

For reference, the i40e testpmd per-core performance report shows that it uses 36 cycles per packet.

This is a total of 1152 cycles per burst of 32 packets. 14 cycles overhead per burst / 1152 cycles per burst = 1.2 % overhead.

But that is not all: If the application's pipeline has three stages, where the PMU counters are read for each stage, the per-invocation overhead of 14 cycles adds up, and the overhead per burst is now 3 * 14 / 1152 = 3.6 %.

Generalizing...

In my example here, the same function with 14 wasted cycles is called three times. It might as well be three individual libraries each wasting 14 cycles in its individual fast path processing function, due to a similarly relaxed attitude regarding wasting 14 cycles.

My point is:

Real applications do much more work than testpmd, so all this "insignificant" extra overhead in the libraries adds up!

Generally, I would like the DPDK Project to remain loyal to its original philosophy, where performance is considered a Key Performance Indicator, and overhead in the fast path is kept at an absolute minimum.

> Though yes, it would be good to hear more opinions here.
  
Morten Brørup Feb. 28, 2023, 4:22 p.m. UTC | #21
> From: Morten Brørup
> Sent: Tuesday, 28 February 2023 14.16
> 
> > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> > Sent: Tuesday, 28 February 2023 13.05
> >
> > > > > >> Add support for programming PMU counters and reading their values
> in
> > > > > >> runtime bypassing kernel completely.
> > > > > >>
> > > > > >> This is especially useful in cases where CPU cores are isolated i.e
> > > > > >> run dedicated tasks. In such cases one cannot use standard perf
> > > > > >> utility without sacrificing latency and performance.
> > > > > >>
> > > > > >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> > > > > >> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > >
> > >
> > > [...]
> > >
> > > > > >> +int
> > > > > >> +__rte_pmu_enable_group(void)
> > > > > >> +{
> > > > > >> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
> > > > > >> +	int ret;
> > > > > >> +
> > > > > >> +	if (rte_pmu.num_group_events == 0)
> > > > > >> +		return -ENODEV;
> > > > > >> +
> > > > > >> +	ret = open_events(group);
> > > > > >> +	if (ret)
> > > > > >> +		goto out;
> > > > > >> +
> > > > > >> +	ret = mmap_events(group);
> > > > > >> +	if (ret)
> > > > > >> +		goto out;
> > > > > >> +
> > > > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET,
> > PERF_IOC_FLAG_GROUP) == -
> > > > 1) {
> > > > > >> +		ret = -errno;
> > > > > >> +		goto out;
> > > > > >> +	}
> > > > > >> +
> > > > > >> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE,
> > PERF_IOC_FLAG_GROUP) ==
> > > > -1) {
> > > > > >> +		ret = -errno;
> > > > > >> +		goto out;
> > > > > >> +	}
> > > > > >> +
> > > > > >> +	rte_spinlock_lock(&rte_pmu.lock);
> > > > > >> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
> > > > > >
> > > > > >Hmm.. so we insert pointer to TLS variable into the global list?
> > > > > >Wonder what would happen if that thread get terminated?
> > > > >
> > > > > Nothing special. Any pointers to that thread-local in that thread are
> > > > invalided.
> > > > >
> > > > > >Can memory from its TLS block get re-used (by other thread or for
> other
> > > > purposes)?
> > > > > >
> > > > >
> > > > > Why would any other thread reuse that?
> > > > > Eventually main thread will need that data to do the cleanup.
> > > >
> > > > I understand that main thread would need to access that data.
> > > > I am not sure that it would be able to.
> > > > Imagine thread calls rte_pmu_read(...) and then terminates, while
> program
> > > > continues to run.
> > >
> > > Is the example you describe here (i.e. a thread terminating in the middle
> of
> > doing something) really a scenario DPDK is supposed to
> > > support?
> >
> > I am not talking about some abnormal termination.
> 
> Then I misunderstood your example; I thought you meant the tread was
> terminated while inside the rte_pmu_read() function.
> 
> > We do have ability to spawn control threads, user can spawn his own thread,
> > all these
> > threads can have limited life-time.
> > Not to mention about  rte_thread_register()/rte_thread_unregister().
> >
> 
> I agree that normal thread termination should be supported.
> 
> > > > As I understand address of its RTE_PER_LCORE(_event_group) will still
> > remain
> > > > in rte_pmu.event_group_list,
> > > > even if it is probably not valid any more.
> > >
> > > There should be a "destructor/done/finish" function available to remove
> this
> > from the list.
> > >
> > > [...]
> > >
> > > > > >Even if we'd decide to keep rte_pmu_read() as static inline (still
> not
> > > > > >sure it is a good idea),
> > > > >
> > > > > We want to save as much cpu cycles as we possibly can and inlining
> does
> > > > helps
> > > > > in that matter.
> > > >
> > > > Ok, so asking same question from different thread: how many cycles it
> will
> > > > save?
> > > > What is the difference in terms of performance when you have this
> function
> > > > inlined vs not inlined?
> > >
> > > We expect to use this in our in-house profiler library. For this reason, I
> > have a very strong preference for absolute maximum
> > > performance.
> > >
> > > Reading PMU events is for performance profiling, so I expect other
> potential
> > users of the PMU library to share my opinion on this.
> >
> > Well, from my perspective 14 cycles are not that much...
> 
> For reference, the i40e testpmd per-core performance report shows that it uses
> 36 cycles per packet.
> 
> This is a total of 1152 cycles per burst of 32 packets. 14 cycles overhead per
> burst / 1152 cycles per burst = 1.2 % overhead.
> 
> But that is not all: If the application's pipeline has three stages, where the
> PMU counters are read for each stage, the per-invocation overhead of 14 cycles
> adds up, and the overhead per burst is now 3 * 14 / 1152 = 3.6 %.

I was too fast on the keyboard here... If the application does more work than testpmd, it certainly also uses more than 1152 cycles to do that work. So please ignore the 3.6 % as a wild exaggeration from an invalid example, and just stick with the 1.2 % overhead - which I still consider significant, and thus worth avoiding.

> 
> Generalizing...
> 
> In my example here, the same function with 14 wasted cycles is called three
> times. It might as well be three individual libraries each wasting 14 cycles
> in its individual fast path processing function, due to a similarly relaxed
> attitude regarding wasting 14 cycles.
> 
> My point is:
> 
> Real applications do much more work than testpmd, so all this "insignificant"
> extra overhead in the libraries adds up!
> 
> Generally, I would like the DPDK Project to remain loyal to its original
> philosophy, where performance is considered a Key Performance Indicator, and
> overhead in the fast path is kept at an absolute minimum.
> 
> > Though yes, it would be good to hear more opinions here.
  
Konstantin Ananyev March 5, 2023, 4:30 p.m. UTC | #22
>>>>>>>> Add support for programming PMU counters and reading their values
>> in
>>>>>>>> runtime bypassing kernel completely.
>>>>>>>>
>>>>>>>> This is especially useful in cases where CPU cores are isolated i.e
>>>>>>>> run dedicated tasks. In such cases one cannot use standard perf
>>>>>>>> utility without sacrificing latency and performance.
>>>>>>>>
>>>>>>>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>>>>>>>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>>>>>>>
>>>>
>>>> [...]
>>>>
>>>>>>>> +int
>>>>>>>> +__rte_pmu_enable_group(void)
>>>>>>>> +{
>>>>>>>> +	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
>>>>>>>> +	int ret;
>>>>>>>> +
>>>>>>>> +	if (rte_pmu.num_group_events == 0)
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	ret = open_events(group);
>>>>>>>> +	if (ret)
>>>>>>>> +		goto out;
>>>>>>>> +
>>>>>>>> +	ret = mmap_events(group);
>>>>>>>> +	if (ret)
>>>>>>>> +		goto out;
>>>>>>>> +
>>>>>>>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET,
>>> PERF_IOC_FLAG_GROUP) == -
>>>>> 1) {
>>>>>>>> +		ret = -errno;
>>>>>>>> +		goto out;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE,
>>> PERF_IOC_FLAG_GROUP) ==
>>>>> -1) {
>>>>>>>> +		ret = -errno;
>>>>>>>> +		goto out;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	rte_spinlock_lock(&rte_pmu.lock);
>>>>>>>> +	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
>>>>>>>
>>>>>>> Hmm.. so we insert pointer to TLS variable into the global list?
>>>>>>> Wonder what would happen if that thread get terminated?
>>>>>>
>>>>>> Nothing special. Any pointers to that thread-local in that thread are
>>>>> invalided.
>>>>>>
>>>>>>> Can memory from its TLS block get re-used (by other thread or for
>> other
>>>>> purposes)?
>>>>>>>
>>>>>>
>>>>>> Why would any other thread reuse that?
>>>>>> Eventually main thread will need that data to do the cleanup.
>>>>>
>>>>> I understand that main thread would need to access that data.
>>>>> I am not sure that it would be able to.
>>>>> Imagine thread calls rte_pmu_read(...) and then terminates, while
>> program
>>>>> continues to run.
>>>>
>>>> Is the example you describe here (i.e. a thread terminating in the middle
>> of
>>> doing something) really a scenario DPDK is supposed to
>>>> support?
>>>
>>> I am not talking about some abnormal termination.
>>
>> Then I misunderstood your example; I thought you meant the tread was
>> terminated while inside the rte_pmu_read() function.
>>
>>> We do have ability to spawn control threads, user can spawn his own thread,
>>> all these
>>> threads can have limited life-time.
>>> Not to mention about  rte_thread_register()/rte_thread_unregister().
>>>
>>
>> I agree that normal thread termination should be supported.
>>
>>>>> As I understand address of its RTE_PER_LCORE(_event_group) will still
>>> remain
>>>>> in rte_pmu.event_group_list,
>>>>> even if it is probably not valid any more.
>>>>
>>>> There should be a "destructor/done/finish" function available to remove
>> this
>>> from the list.
>>>>
>>>> [...]
>>>>
>>>>>>> Even if we'd decide to keep rte_pmu_read() as static inline (still
>> not
>>>>>>> sure it is a good idea),
>>>>>>
>>>>>> We want to save as much cpu cycles as we possibly can and inlining
>> does
>>>>> helps
>>>>>> in that matter.
>>>>>
>>>>> Ok, so asking same question from different thread: how many cycles it
>> will
>>>>> save?
>>>>> What is the difference in terms of performance when you have this
>> function
>>>>> inlined vs not inlined?
>>>>
>>>> We expect to use this in our in-house profiler library. For this reason, I
>>> have a very strong preference for absolute maximum
>>>> performance.
>>>>
>>>> Reading PMU events is for performance profiling, so I expect other
>> potential
>>> users of the PMU library to share my opinion on this.
>>>
>>> Well, from my perspective 14 cycles are not that much...
>>
>> For reference, the i40e testpmd per-core performance report shows that it uses
>> 36 cycles per packet.
>>
>> This is a total of 1152 cycles per burst of 32 packets. 14 cycles overhead per
>> burst / 1152 cycles per burst = 1.2 % overhead.
>>
>> But that is not all: If the application's pipeline has three stages, where the
>> PMU counters are read for each stage, the per-invocation overhead of 14 cycles
>> adds up, and the overhead per burst is now 3 * 14 / 1152 = 3.6 %.
> 
> I was too fast on the keyboard here... If the application does more work than testpmd, it certainly also uses more than 1152 cycles to do that work. So please ignore the 3.6 % as a wild exaggeration from an invalid example, and just stick with the 1.2 % overhead - which I still consider significant, and thus worth avoiding.

I wonder whether we can do both - hide struct rte_pmu_event_group from the public API 
and still have an inline function to read PMU stats?
If we can add a separate function that allows the user to get a
struct perf_event_mmap_page * for a given event index (or event name),
then later the user can use
__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
directly.

> 
>>
>> Generalizing...
>>
>> In my example here, the same function with 14 wasted cycles is called three
>> times. It might as well be three individual libraries each wasting 14 cycles
>> in its individual fast path processing function, due to a similarly relaxed
>> attitude regarding wasting 14 cycles.
>>
>> My point is:
>>
>> Real applications do much more work than testpmd, so all this "insignificant"
>> extra overhead in the libraries adds up!
>>
>> Generally, I would like the DPDK Project to remain loyal to its original
>> philosophy, where performance is considered a Key Performance Indicator, and
>> overhead in the fast path is kept at an absolute minimum.
>>
>>> Though yes, it would be good to hear more opinions here.
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 3495946d0f..d37f242120 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1697,6 +1697,11 @@  M: Nithin Dabilpuram <ndabilpuram@marvell.com>
 M: Pavan Nikhilesh <pbhagavatula@marvell.com>
 F: lib/node/
 
+PMU - EXPERIMENTAL
+M: Tomasz Duszynski <tduszynski@marvell.com>
+F: lib/pmu/
+F: app/test/test_pmu*
+
 
 Test Applications
 -----------------
diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..6b61b7fc32 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -111,6 +111,7 @@  test_sources = files(
         'test_reciprocal_division_perf.c',
         'test_red.c',
         'test_pie.c',
+        'test_pmu.c',
         'test_reorder.c',
         'test_rib.c',
         'test_rib6.c',
@@ -239,6 +240,7 @@  fast_tests = [
         ['kni_autotest', false, true],
         ['kvargs_autotest', true, true],
         ['member_autotest', true, true],
+        ['pmu_autotest', true, true],
         ['power_cpufreq_autotest', false, true],
         ['power_autotest', true, true],
         ['power_kvm_vm_autotest', false, true],
diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
new file mode 100644
index 0000000000..c257638e8b
--- /dev/null
+++ b/app/test/test_pmu.c
@@ -0,0 +1,62 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include "test.h"
+
+#ifndef RTE_EXEC_ENV_LINUX
+
+static int
+test_pmu(void)
+{
+	/* The PMU library is Linux-only; report skip on other platforms. */
+	printf("pmu_autotest only supported on Linux, skipping test\n");
+	return TEST_SKIPPED;
+}
+
+#else
+
+#include <rte_pmu.h>
+
+static int
+test_pmu_read(void)
+{
+	const char *name = NULL;
+	int tries = 10, event;
+	uint64_t val = 0;
+
+	if (name == NULL) {
+		printf("PMU not supported on this arch\n");
+		return TEST_SKIPPED;
+	}
+
+	if (rte_pmu_init() < 0)
+		return TEST_SKIPPED;
+
+	event = rte_pmu_add_event(name);
+	while (tries--)
+		val += rte_pmu_read(event);
+
+	rte_pmu_fini();
+
+	return val ? TEST_SUCCESS : TEST_FAILED;
+}
+
+/* Single-case suite exercising the basic init/add/read/fini flow. */
+static struct unit_test_suite pmu_tests = {
+	.suite_name = "pmu autotest",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_pmu_read),
+		TEST_CASES_END()
+	}
+};
+
+/* Entry point registered with the test framework below. */
+static int
+test_pmu(void)
+{
+	return unit_test_suite_runner(&pmu_tests);
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 2deec7ea19..a8e04a195d 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -223,7 +223,8 @@  The public API headers are grouped by topics:
   [log](@ref rte_log.h),
   [errno](@ref rte_errno.h),
   [trace](@ref rte_trace.h),
-  [trace_point](@ref rte_trace_point.h)
+  [trace_point](@ref rte_trace_point.h),
+  [pmu](@ref rte_pmu.h)
 
 - **misc**:
   [EAL config](@ref rte_eal.h),
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
index e859426099..350b5a8c94 100644
--- a/doc/api/doxy-api.conf.in
+++ b/doc/api/doxy-api.conf.in
@@ -63,6 +63,7 @@  INPUT                   = @TOPDIR@/doc/api/doxy-api-index.md \
                           @TOPDIR@/lib/pci \
                           @TOPDIR@/lib/pdump \
                           @TOPDIR@/lib/pipeline \
+                          @TOPDIR@/lib/pmu \
                           @TOPDIR@/lib/port \
                           @TOPDIR@/lib/power \
                           @TOPDIR@/lib/rawdev \
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
index 14292d4c25..89e38cd301 100644
--- a/doc/guides/prog_guide/profile_app.rst
+++ b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,18 @@  Profile Your Application
 The following sections describe methods of profiling DPDK applications on
 different architectures.
 
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures provide some form of performance monitoring unit (PMU).
+Such unit provides programmable counters that monitor specific events.
+
+Different tools gather that information, like for example perf.
+However, in some scenarios, when CPU cores are isolated and run
+dedicated tasks, interrupting those tasks with perf may be undesirable.
+
+In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
+
 
 Profiling on x86
 ----------------
diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
index ab998a5357..20622efe58 100644
--- a/doc/guides/rel_notes/release_23_03.rst
+++ b/doc/guides/rel_notes/release_23_03.rst
@@ -147,6 +147,13 @@  New Features
   * Added support to capture packets at each graph node with packet metadata and
     node name.
 
+* **Added PMU library.**
+
+  Added a new performance monitoring unit (PMU) library which allows applications
+  to perform self monitoring activities without depending on external utilities like perf.
+  After integration with :doc:`../prog_guide/trace_lib` data gathered from hardware counters
+  can be stored in CTF format for further analysis.
+
 
 Removed Items
 -------------
diff --git a/lib/meson.build b/lib/meson.build
index 450c061d2b..8a42d45d20 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -11,6 +11,7 @@ 
 libraries = [
         'kvargs', # eal depends on kvargs
         'telemetry', # basic info querying
+        'pmu',
         'eal', # everything depends on eal
         'ring',
         'rcu', # rcu depends on ring
diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
new file mode 100644
index 0000000000..a4160b494e
--- /dev/null
+++ b/lib/pmu/meson.build
@@ -0,0 +1,13 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2023 Marvell International Ltd.
+
+if not is_linux
+    build = false
+    reason = 'only supported on Linux'
+    subdir_done()
+endif
+
+includes = [global_inc]
+
+sources = files('rte_pmu.c')
+headers = files('rte_pmu.h')
diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
new file mode 100644
index 0000000000..b9f8c1ddc8
--- /dev/null
+++ b/lib/pmu/pmu_private.h
@@ -0,0 +1,32 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ *
+ * @param config
+ *   Architecture specific event configuration. Consult kernel sources for available options.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
new file mode 100644
index 0000000000..950f999cb7
--- /dev/null
+++ b/lib/pmu/rte_pmu.c
@@ -0,0 +1,460 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_per_lcore.h>
+#include <rte_pmu.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+
+RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+struct rte_pmu rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-op. Architectures should override them if
+ * necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+	/* default: no architecture-specific setup required */
+	return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+	/* default: no architecture-specific teardown required */
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
+{
+	/* default: leave raw event config untouched */
+}
+
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+	char path[PATH_MAX];
+	char *config = NULL;
+	int high, low, ret;
+	FILE *fp;
+
+	/* Parse a sysfs format descriptor such as "config1:0-7" into the
+	 * config register number (*num) and a bitmask (*mask) covering the
+	 * given bit range.
+	 */
+	*num = *mask = 0;
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	errno = 0;
+	/* "%m[^:]" makes fscanf allocate config; freed in the out path. */
+	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+	if (ret < 2) {
+		ret = -ENODATA;
+		goto out;
+	}
+	if (errno) {
+		ret = -errno;
+		goto out;
+	}
+
+	/* Single-bit field, e.g. "config:5" - treat as a 1-bit range. */
+	if (ret == 2)
+		high = low;
+
+	*mask = GENMASK_ULL(high, low);
+	/* Last digit should be [012]. If last digit is missing 0 is implied. */
+	*num = config[strlen(config) - 1];
+	*num = isdigit(*num) ? *num - '0' : 0;
+
+	ret = 0;
+out:
+	free(config);
+	fclose(fp);
+
+	return ret;
+}
+
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+	char *token, *term;
+	int num, ret, val;
+	uint64_t mask;
+
+	/* Translate an event description of the form "term=val,term2=val2"
+	 * (the contents of a sysfs events file) into perf_event_attr
+	 * config/config1/config2 values.
+	 *
+	 * NOTE(review): strtok() keeps static state and mutates buf; fine
+	 * only if this runs from a single thread at a time - confirm all
+	 * callers are init-path only, or switch to strtok_r().
+	 */
+	config[0] = config[1] = config[2] = 0;
+
+	token = strtok(buf, ",");
+	while (token) {
+		errno = 0;
+		/* <term>=<value> */
+		ret = sscanf(token, "%m[^=]=%i", &term, &val);
+		if (ret < 1)
+			return -ENODATA;
+		if (errno)
+			return -errno;
+		/* a bare "term" with no "=value" means term=1 */
+		if (ret == 1)
+			val = 1;
+
+		ret = get_term_format(term, &num, &mask);
+		free(term);
+		if (ret)
+			return ret;
+
+		config[num] |= FIELD_PREP(mask, val);
+		token = strtok(NULL, ",");
+	}
+
+	return 0;
+}
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+	char path[PATH_MAX], buf[BUFSIZ];
+	FILE *fp;
+	size_t ret;
+
+	/* Read the sysfs event description for name and parse it into
+	 * perf_event_attr config values.
+	 */
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	/* Reserve one byte for the terminator: reading a full sizeof(buf)
+	 * would make the buf[ret] store below write one past the array end.
+	 */
+	ret = fread(buf, 1, sizeof(buf) - 1, fp);
+	fclose(fp);
+	if (ret == 0)
+		return -EINVAL;
+
+	buf[ret] = '\0';
+
+	return parse_event(buf, config);
+}
+
+static int
+do_perf_event_open(uint64_t config[3], int group_fd)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(struct perf_event_attr),
+		.type = PERF_TYPE_RAW,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+		.disabled = 1, /* started later via PERF_EVENT_IOC_ENABLE */
+	};
+
+	/* let the architecture tweak the raw config before the syscall */
+	pmu_arch_fixup_config(config);
+
+	attr.config = config[0];
+	attr.config1 = config[1];
+	attr.config2 = config[2];
+
+	/* pid = 0, cpu = -1: monitor the calling thread on any CPU */
+	return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
+}
+
+static int
+open_events(struct rte_pmu_event_group *group)
+{
+	struct rte_pmu_event *event;
+	uint64_t config[3];
+	unsigned int i;
+	int ret;
+
+	/* Pre-fill all slots with -1. This both makes the group leader get
+	 * created first (group_fd = -1) and lets the cleanup path tell
+	 * opened descriptors from untouched slots.
+	 */
+	for (i = 0; i < RTE_DIM(group->fds); i++)
+		group->fds[i] = -1;
+
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		/* Fail hard instead of skipping: a skipped event would leave
+		 * fds[event->index] unopened while num_group_events still
+		 * counts it, corrupting the group layout used by mmap/read.
+		 */
+		ret = get_event_config(event->name, config);
+		if (ret)
+			goto out;
+
+		ret = do_perf_event_open(config, group->fds[0]);
+		if (ret == -1) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->fds[event->index] = ret;
+	}
+
+	return 0;
+out:
+	/* close exactly the descriptors that were opened, by index */
+	for (i = 0; i < RTE_DIM(group->fds); i++) {
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	return ret;
+}
+
+static int
+mmap_events(struct rte_pmu_event_group *group)
+{
+	long page_size = sysconf(_SC_PAGE_SIZE);
+	unsigned int i;
+	void *addr;
+	int ret;
+
+	/* Map each event's perf userpage so counters can be read without
+	 * entering the kernel.
+	 */
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
+		if (addr == MAP_FAILED) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->mmap_pages[i] = addr;
+		if (!group->mmap_pages[i]->cap_user_rdpmc) {
+			/* Count this mapping too: the unwind loop below stops
+			 * at i - 1, so leaving i unchanged leaked the page
+			 * mapped just above.
+			 */
+			i++;
+			ret = -EPERM;
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	for (; i; i--) {
+		munmap(group->mmap_pages[i - 1], page_size);
+		group->mmap_pages[i - 1] = NULL;
+	}
+
+	return ret;
+}
+
+static void
+cleanup_events(struct rte_pmu_event_group *group)
+{
+	unsigned int i;
+
+	/* disable the whole group via its leader before tearing down */
+	if (group->fds[0] != -1)
+		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+	/* unmap userpages and close descriptors, resetting each slot */
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		if (group->mmap_pages[i]) {
+			munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
+			group->mmap_pages[i] = NULL;
+		}
+
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	group->enabled = false;
+}
+
+int
+__rte_pmu_enable_group(void)
+{
+	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+	int ret;
+
+	/* Open, mmap and start the calling lcore's event group. Invoked
+	 * lazily from rte_pmu_read() on the first read by a given lcore.
+	 */
+	if (rte_pmu.num_group_events == 0)
+		return -ENODEV;
+
+	ret = open_events(group);
+	if (ret)
+		goto out;
+
+	ret = mmap_events(group);
+	if (ret)
+		goto out;
+
+	/* zero and start all counters atomically through the group leader */
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	/* register the group so rte_pmu_fini() can clean it up later */
+	rte_spinlock_lock(&rte_pmu.lock);
+	TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
+	rte_spinlock_unlock(&rte_pmu.lock);
+	group->enabled = true;
+
+	return 0;
+
+out:
+	cleanup_events(group);
+
+	return ret;
+}
+
+static int
+scan_pmus(void)
+{
+	char path[PATH_MAX];
+	struct dirent *dent;
+	const char *name;
+	DIR *dirp;
+
+	/* Locate the core PMU under sysfs and remember its name; it is used
+	 * to build the events/format paths elsewhere in this file.
+	 */
+	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+	if (dirp == NULL)
+		return -errno;
+
+	while ((dent = readdir(dirp))) {
+		name = dent->d_name;
+		/* skip "." and ".." as well as hidden entries */
+		if (name[0] == '.')
+			continue;
+
+		/* sysfs entry should either contain cpus or be a cpu */
+		if (!strcmp(name, "cpu"))
+			break;
+
+		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+		if (access(path, F_OK) == 0)
+			break;
+	}
+
+	if (dent) {
+		rte_pmu.name = strdup(name);
+		if (rte_pmu.name == NULL) {
+			closedir(dirp);
+
+			return -ENOMEM;
+		}
+	}
+
+	closedir(dirp);
+
+	/* NULL name here means the directory scan found no matching PMU */
+	return rte_pmu.name ? 0 : -ENODEV;
+}
+
+static struct rte_pmu_event *
+new_event(const char *name)
+{
+	struct rte_pmu_event *event;
+
+	/* Allocate a zeroed event descriptor owning a copy of name. */
+	event = calloc(1, sizeof(*event));
+	if (event == NULL)
+		return NULL;
+
+	event->name = strdup(name);
+	if (event->name == NULL) {
+		free(event);
+		return NULL;
+	}
+
+	return event;
+}
+
+static void
+free_event(struct rte_pmu_event *event)
+{
+	/* counterpart of new_event(): frees the duplicated name, then the event */
+	free(event->name);
+	free(event);
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+	struct rte_pmu_event *event;
+	char path[PATH_MAX];
+
+	if (rte_pmu.name == NULL)
+		return -ENODEV;
+
+	/* Reject only when the group is already full. The previous check
+	 * "num + 1 >= MAX" was off by one and silently capped the group at
+	 * MAX_NUM_GROUP_EVENTS - 1 events.
+	 */
+	if (rte_pmu.num_group_events >= MAX_NUM_GROUP_EVENTS)
+		return -ENOSPC;
+
+	/* the event must be listed and readable under sysfs */
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	if (access(path, R_OK))
+		return -ENODEV;
+
+	/* adding the same event twice returns the existing index */
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		if (!strcmp(event->name, name))
+			return event->index;
+	}
+
+	event = new_event(name);
+	if (event == NULL)
+		return -ENOMEM;
+
+	event->index = rte_pmu.num_group_events++;
+	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+	return event->index;
+}
+
+int
+rte_pmu_init(void)
+{
+	int ret;
+
+	/* Allow calling init from multiple contexts within a single thread. This simplifies
+	 * resource management a bit e.g in case fast-path tracepoint has already been enabled
+	 * via command line but application doesn't care enough and performs init/fini again.
+	 *
+	 * NOTE(review): the counter is a plain unsigned int, so concurrent
+	 * init/fini from different threads would race - confirm that init is
+	 * required to happen on a single thread.
+	 */
+	if (rte_pmu.initialized != 0) {
+		rte_pmu.initialized++;
+		return 0;
+	}
+
+	ret = scan_pmus();
+	if (ret)
+		goto out;
+
+	ret = pmu_arch_init();
+	if (ret)
+		goto out;
+
+	TAILQ_INIT(&rte_pmu.event_list);
+	TAILQ_INIT(&rte_pmu.event_group_list);
+	rte_spinlock_init(&rte_pmu.lock);
+	rte_pmu.initialized = 1;
+
+	return 0;
+out:
+	/* scan_pmus() may have set the name before a later step failed */
+	free(rte_pmu.name);
+	rte_pmu.name = NULL;
+
+	return ret;
+}
+
+void
+rte_pmu_fini(void)
+{
+	struct rte_pmu_event_group *group, *tmp_group;
+	struct rte_pmu_event *event, *tmp_event;
+
+	/* cleanup once init count drops to zero */
+	if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
+		return;
+
+	/* free all registered event descriptors */
+	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
+		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+		free_event(event);
+	}
+
+	/* NOTE(review): groups on this list belong to per-lcore storage of
+	 * other lcores which could still be inside rte_pmu_read() - confirm
+	 * callers guarantee reads have stopped before fini.
+	 */
+	RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
+		TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
+		cleanup_events(group);
+	}
+
+	pmu_arch_fini();
+	free(rte_pmu.name);
+	rte_pmu.name = NULL;
+	rte_pmu.num_group_events = 0;
+}
diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
new file mode 100644
index 0000000000..6b664c3336
--- /dev/null
+++ b/lib/pmu/rte_pmu.h
@@ -0,0 +1,212 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines generic API and types necessary to setup PMU and
+ * read selected counters in runtime.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_spinlock.h>
+
+/** Maximum number of events in a group */
+#define MAX_NUM_GROUP_EVENTS 8
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
+	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+	bool enabled; /**< true if group was enabled on particular lcore */
+	TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
+} __rte_cache_aligned;
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+	char *name; /**< name of an event */
+	unsigned int index; /**< event index into fds/mmap_pages */
+	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+	rte_spinlock_t lock; /**< serialize access to event group list */
+	TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
+	unsigned int num_group_events; /**< number of events in a group */
+	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+	unsigned int initialized; /**< initialization counter */
+};
+
+/** lcore event group */
+RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+
+/** PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read PMU counter.
+ *
+ * @warning This should not be called directly.
+ *
+ * @param pc
+ *   Pointer to the mmapped user page.
+ * @return
+ *   Counter value read from hardware.
+ */
+static __rte_always_inline uint64_t
+__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+	uint64_t width, offset;
+	uint32_t seq, index;
+	int64_t pmc;
+
+	/* Seqlock-style retry loop per perf_event_open(2): retry if the
+	 * kernel updated the page (pc->lock changed) while the snapshot
+	 * below was being taken.
+	 *
+	 * NOTE(review): only compiler barriers are used here; on weakly
+	 * ordered CPUs a hardware read barrier may be needed - confirm
+	 * against the kernel's userpage read example.
+	 */
+	for (;;) {
+		seq = pc->lock;
+		rte_compiler_barrier();
+		index = pc->index;
+		offset = pc->offset;
+		width = pc->pmc_width;
+
+		/* index set to 0 means that particular counter cannot be used */
+		if (likely(pc->cap_user_rdpmc && index)) {
+			pmc = rte_pmu_pmc_read(index - 1);
+			/* sign-extend the width-bit counter value to 64 bits */
+			pmc <<= 64 - width;
+			pmc >>= 64 - width;
+			offset += pmc;
+		}
+
+		rte_compiler_barrier();
+
+		if (likely(pc->lock == seq))
+			return offset;
+	}
+
+	/* not reached - the loop above only exits via return */
+	return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable group of events on the calling lcore.
+ *
+ * @warning This should be not called directly.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+__rte_pmu_enable_group(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize PMU library.
+ *
+ * @warning This should be not called directly.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Finalize PMU library. This should be called after PMU counters are no longer being read.
+ */
+__rte_experimental
+void
+rte_pmu_fini(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ *   Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ *   Index of an event to be read.
+ * @return
+ *   Event value read from register. In case of errors or lack of support
+ *   0 is returned. In other words, stream of zeros in a trace file
+ *   indicates problem with reading particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+	struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+	int ret;
+
+	/* all failures are reported as 0 to keep the fast path branch-light */
+	if (unlikely(!rte_pmu.initialized))
+		return 0;
+
+	/* lazily open/mmap/enable this lcore's group on the first read */
+	if (unlikely(!group->enabled)) {
+		ret = __rte_pmu_enable_group();
+		if (ret)
+			return 0;
+	}
+
+	if (unlikely(index >= rte_pmu.num_group_events))
+		return 0;
+
+	return __rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/pmu/version.map b/lib/pmu/version.map
new file mode 100644
index 0000000000..39a4f279c1
--- /dev/null
+++ b/lib/pmu/version.map
@@ -0,0 +1,15 @@ 
+DPDK_23 {
+	local: *;
+};
+
+EXPERIMENTAL {
+	global:
+
+	__rte_pmu_enable_group;
+	per_lcore__event_group;
+	rte_pmu;
+	rte_pmu_add_event;
+	rte_pmu_fini;
+	rte_pmu_init;
+	rte_pmu_read;
+};