[v2,1/2] member: implement NitroSketch mode

Message ID 20220831064639.4163765-2-leyi.rong@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series introduce NitroSketch Mode into membership library

Checks

Context          Check      Description
ci/checkpatch    warning    coding style issues

Commit Message

Leyi Rong Aug. 31, 2022, 6:46 a.m. UTC
  Sketching algorithms provide high-fidelity approximate measurements and
appear as a promising alternative to traditional approaches such as
packet sampling.

NitroSketch [1] is a software sketching framework that optimizes
performance, provides accuracy guarantees, and supports a variety of
sketches.

This commit adds a new data structure called sketch into the
membership library. This new data structure is an efficient
way to profile the traffic for heavy hitters. A min-heap
structure is used to maintain the top-k flow keys.

[1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
https://dl.acm.org/doi/pdf/10.1145/3341302.3342076

Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
Signed-off-by: Leyi Rong <leyi.rong@intel.com>
---
 lib/member/meson.build                |  38 +-
 lib/member/rte_member.c               |  75 ++++
 lib/member/rte_member.h               | 151 ++++++-
 lib/member/rte_member_heap.h          | 424 ++++++++++++++++++
 lib/member/rte_member_sketch.c        | 594 ++++++++++++++++++++++++++
 lib/member/rte_member_sketch.h        |  97 +++++
 lib/member/rte_member_sketch_avx512.c |  69 +++
 lib/member/rte_member_sketch_avx512.h |  36 ++
 lib/member/rte_xxh64_avx512.h         | 117 +++++
 lib/member/version.map                |   3 +
 10 files changed, 1600 insertions(+), 4 deletions(-)
 create mode 100644 lib/member/rte_member_heap.h
 create mode 100644 lib/member/rte_member_sketch.c
 create mode 100644 lib/member/rte_member_sketch.h
 create mode 100644 lib/member/rte_member_sketch_avx512.c
 create mode 100644 lib/member/rte_member_sketch_avx512.h
 create mode 100644 lib/member/rte_xxh64_avx512.h
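
As a usage illustration (not part of the patch): a minimal sketch of how the
new sketch mode is driven through the public API, using the parameter fields
and functions added in rte_member.h below. The names and values here are only
examples and error handling is trimmed.

    #include <rte_member.h>

    static void heavy_hitter_example(void)
    {
        struct rte_member_parameters params = {
            .name = "HH_SKETCH",
            .type = RTE_MEMBER_TYPE_SKETCH,
            .key_len = 13,                  /* e.g. a 5-tuple flow key */
            .error_rate = 0.05f,            /* accuracy vs. memory trade-off */
            .sample_rate = 0.01f,           /* NitroSketch row sampling rate */
            .top_k = 100,                   /* heavy hitters kept in the min-heap */
            .extra_flag = RTE_MEMBER_SKETCH_ALWAYS_BOUNDED,
            .prim_hash_seed = 0x12345678,
            .sec_hash_seed = 0x87654321,
            .socket_id = 0,
        };
        struct rte_member_setsum *ss = rte_member_create(&params);
        uint8_t key[13] = {0};
        uint64_t cnt;
        void *keys[100];
        uint64_t counts[100];
        int n;

        if (ss == NULL)
            return;

        /* the set_id argument is ignored in sketch mode */
        rte_member_add(ss, key, 1);

        /* estimated packet (or byte) count for this key */
        rte_member_query_count(ss, key, &cnt);

        /* keys[0..n-1] / counts[0..n-1] hold the current heavy hitters */
        n = rte_member_report_heavyhitter(ss, keys, counts);
        (void)n;

        rte_member_free(ss);
    }
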
  

Comments

David Marchand Sept. 13, 2022, 2:56 p.m. UTC | #1
On Wed, Aug 31, 2022 at 8:47 AM Leyi Rong <leyi.rong@intel.com> wrote:
>
> Sketching algorithm provide high-fidelity approximate measurements and
> appears as a promising alternative to traditional approaches such as
> packet sampling.
>
> NitroSketch [1] is a software sketching framework that optimizes
> performance, provides accuracy guarantees, and supports a variety of
> sketches.
>
> This commit adds a new data structure called sketch into
> membership library. This new data structure is an efficient
> way to profile the traffic for heavy hitters. Also use min-heap
> structure to maintain the top-k flow keys.

http://mails.dpdk.org/archives/test-report/2022-August/304026.html

This patch adds new symbols in the API without going through the
experimental phase.
What is the rationale for skipping it?

>
> [1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
> Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
> Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
> https://dl.acm.org/doi/pdf/10.1145/3341302.3342076
>
> Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
> Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> ---
>  lib/member/meson.build                |  38 +-
>  lib/member/rte_member.c               |  75 ++++
>  lib/member/rte_member.h               | 151 ++++++-
>  lib/member/rte_member_heap.h          | 424 ++++++++++++++++++
>  lib/member/rte_member_sketch.c        | 594 ++++++++++++++++++++++++++
>  lib/member/rte_member_sketch.h        |  97 +++++
>  lib/member/rte_member_sketch_avx512.c |  69 +++
>  lib/member/rte_member_sketch_avx512.h |  36 ++
>  lib/member/rte_xxh64_avx512.h         | 117 +++++
>  lib/member/version.map                |   3 +
>  10 files changed, 1600 insertions(+), 4 deletions(-)
>  create mode 100644 lib/member/rte_member_heap.h
>  create mode 100644 lib/member/rte_member_sketch.c
>  create mode 100644 lib/member/rte_member_sketch.h
>  create mode 100644 lib/member/rte_member_sketch_avx512.c
>  create mode 100644 lib/member/rte_member_sketch_avx512.h
>  create mode 100644 lib/member/rte_xxh64_avx512.h
>
> diff --git a/lib/member/meson.build b/lib/member/meson.build
> index e06fddc240..9b3418c25c 100644
> --- a/lib/member/meson.build
> +++ b/lib/member/meson.build
> @@ -7,6 +7,42 @@ if is_windows
>      subdir_done()
>  endif
>
> -sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
> +sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c', 'rte_member_sketch.c')
>  headers = files('rte_member.h')
>  deps += ['hash']
> +includes += include_directories('../hash', '../ring')
> +
> +# compile AVX512 version if:
> +# we are building 64-bit binary AND binutils can generate proper code
> +if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
> +    # compile AVX512 version if either:
> +    # a. we have AVX512 supported in minimum instruction set
> +    #    baseline
> +    # b. it's not minimum instruction set, but supported by
> +    #    compiler
> +    #
> +    # in former case, just add avx512 C file to files list
> +    # in latter case, compile c file to static lib, using correct
> +    # compiler flags, and then have the .o file from static lib
> +    # linked into main lib.
> +    sketch_avx512_cpu_support = (
> +        cc.get_define('__AVX512F__', args: machine_args) != ''
> +    )
> +
> +    if sketch_avx512_cpu_support == true
> +       cflags += ['-DCC_AVX512_SUPPORT']
> +       if cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
> +           cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']

Pushing those flags in the cflags is probably wrong, as the rest of
the library objects will be compiled with those AVX512 flags.
If later this library code is run on a non supporting AVX512 system,
it will trigger a runtime error.


Please look at how other libraries integrated AVX512.
Thanks.
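
For reference, the approach used by other DPDK libraries is to compile only
rte_member_sketch_avx512.c with the AVX512 flags (the separate
static_library() already present in the elif branch of the meson change
below) and to enter that object only after a runtime CPU check. A minimal C
sketch of that runtime check, reusing the calls this patch already makes in
rte_member_create_sketch(), is shown here purely as an illustration:

    #include <rte_vect.h>
    #include <rte_cpuflags.h>

    /*
     * Return non-zero only when it is safe to call into the separately
     * compiled AVX512 object; otherwise the scalar ops are selected.
     */
    static int
    sketch_avx512_usable(void)
    {
    #ifdef CC_AVX512_SUPPORT
        return rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
               rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
               rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1;
    #else
        return 0;
    #endif
    }
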
  
Leyi Rong Sept. 14, 2022, 9:42 a.m. UTC | #2
Hi David,

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Tuesday, September 13, 2022 10:56 PM
> To: Rong, Leyi <leyi.rong@intel.com>
> Cc: ferruh.yigit@xilinx.com; suanmingm@nvidia.com; Wang, Yipeng1
> <yipeng1.wang@intel.com>; zaoxingliu@gmail.com; Gobriel, Sameh
> <sameh.gobriel@intel.com>; dev@dpdk.org; Richardson, Bruce
> <bruce.richardson@intel.com>; Aaron Conole <aconole@redhat.com>; Michael
> Santana <maicolgabriel@hotmail.com>; Lincoln Lavoie <lylavoie@iol.unh.edu>
> Subject: Re: [PATCH v2 1/2] member: implement NitroSketch mode
> 
> On Wed, Aug 31, 2022 at 8:47 AM Leyi Rong <leyi.rong@intel.com> wrote:
> >
> > Sketching algorithm provide high-fidelity approximate measurements and
> > appears as a promising alternative to traditional approaches such as
> > packet sampling.
> >
> > NitroSketch [1] is a software sketching framework that optimizes
> > performance, provides accuracy guarantees, and supports a variety of
> > sketches.
> >
> > This commit adds a new data structure called sketch into membership
> > library. This new data structure is an efficient way to profile the
> > traffic for heavy hitters. Also use min-heap structure to maintain the
> > top-k flow keys.
> 
> http://mails.dpdk.org/archives/test-report/2022-August/304026.html
> 
> This patch adds new symbols in the API without going through the experimental
> phase.
> What is the rationale for skipping it?
> 
Yes,  these new APIs should be marked as "experimental", will fix it.

> >
> > [1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
> > Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
> > Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
> > https://dl.acm.org/doi/pdf/10.1145/3341302.3342076
> >
> > Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
> > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> > ---
> >  lib/member/meson.build                |  38 +-
> >  lib/member/rte_member.c               |  75 ++++
> >  lib/member/rte_member.h               | 151 ++++++-
> >  lib/member/rte_member_heap.h          | 424 ++++++++++++++++++
> >  lib/member/rte_member_sketch.c        | 594 ++++++++++++++++++++++++++
> >  lib/member/rte_member_sketch.h        |  97 +++++
> >  lib/member/rte_member_sketch_avx512.c |  69 +++
> > lib/member/rte_member_sketch_avx512.h |  36 ++
> >  lib/member/rte_xxh64_avx512.h         | 117 +++++
> >  lib/member/version.map                |   3 +
> >  10 files changed, 1600 insertions(+), 4 deletions(-)  create mode
> > 100644 lib/member/rte_member_heap.h  create mode 100644
> > lib/member/rte_member_sketch.c  create mode 100644
> > lib/member/rte_member_sketch.h  create mode 100644
> > lib/member/rte_member_sketch_avx512.c
> >  create mode 100644 lib/member/rte_member_sketch_avx512.h
> >  create mode 100644 lib/member/rte_xxh64_avx512.h
> >
> > diff --git a/lib/member/meson.build b/lib/member/meson.build index
> > e06fddc240..9b3418c25c 100644
> > --- a/lib/member/meson.build
> > +++ b/lib/member/meson.build
> > @@ -7,6 +7,42 @@ if is_windows
> >      subdir_done()
> >  endif
> >
> > -sources = files('rte_member.c', 'rte_member_ht.c',
> > 'rte_member_vbf.c')
> > +sources = files('rte_member.c', 'rte_member_ht.c',
> > +'rte_member_vbf.c', 'rte_member_sketch.c')
> >  headers = files('rte_member.h')
> >  deps += ['hash']
> > +includes += include_directories('../hash', '../ring')
> > +
> > +# compile AVX512 version if:
> > +# we are building 64-bit binary AND binutils can generate proper code
> > +if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
> > +    # compile AVX512 version if either:
> > +    # a. we have AVX512 supported in minimum instruction set
> > +    #    baseline
> > +    # b. it's not minimum instruction set, but supported by
> > +    #    compiler
> > +    #
> > +    # in former case, just add avx512 C file to files list
> > +    # in latter case, compile c file to static lib, using correct
> > +    # compiler flags, and then have the .o file from static lib
> > +    # linked into main lib.
> > +    sketch_avx512_cpu_support = (
> > +        cc.get_define('__AVX512F__', args: machine_args) != ''
> > +    )
> > +
> > +    if sketch_avx512_cpu_support == true
> > +       cflags += ['-DCC_AVX512_SUPPORT']
> > +       if cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
> > +           cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']
> 
> Pushing those flags in the cflags is probably wrong, as the rest of the library
> objects will be compiled with those AVX512 flags.
> If later this library code is run on a non supporting AVX512 system, it will trigger
> a runtime error.
> 
Yes, thanks for the reminder, will fix it.

> 
> Please look at how other libraries integrated AVX512.
> Thanks.
> 
> --
> David Marchand
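
On the "experimental" marking agreed above, the usual DPDK convention (shown
only as an illustration of what a next revision would contain, not as part of
this patch) is to tag each new declaration in rte_member.h with
__rte_experimental and to list the symbols in an EXPERIMENTAL block of
lib/member/version.map rather than the DPDK_23 block:

    #include <rte_compat.h>   /* provides __rte_experimental */

    __rte_experimental
    int
    rte_member_query_count(const struct rte_member_setsum *setsum,
                           const void *key, uint64_t *count);
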
  

Patch

diff --git a/lib/member/meson.build b/lib/member/meson.build
index e06fddc240..9b3418c25c 100644
--- a/lib/member/meson.build
+++ b/lib/member/meson.build
@@ -7,6 +7,42 @@  if is_windows
     subdir_done()
 endif
 
-sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
+sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c', 'rte_member_sketch.c')
 headers = files('rte_member.h')
 deps += ['hash']
+includes += include_directories('../hash', '../ring')
+
+# compile AVX512 version if:
+# we are building 64-bit binary AND binutils can generate proper code
+if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
+    # compile AVX512 version if either:
+    # a. we have AVX512 supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+    sketch_avx512_cpu_support = (
+        cc.get_define('__AVX512F__', args: machine_args) != ''
+    )
+
+    if sketch_avx512_cpu_support == true
+	cflags += ['-DCC_AVX512_SUPPORT']
+	if cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
+	    cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']
+	endif
+	sources += files('rte_member_sketch_avx512.c')
+    elif cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
+	cflags += ['-DCC_AVX512_SUPPORT']
+	cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']
+	sketch_avx512_tmp = static_library('sketch_avx512_tmp',
+	    'rte_member_sketch_avx512.c',
+	    include_directories: includes,
+	    dependencies: static_rte_eal,
+	    c_args: cflags)
+	objs += sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c')
+    endif
+endif
diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c
index 7e1632e6b5..8f859f7fbd 100644
--- a/lib/member/rte_member.c
+++ b/lib/member/rte_member.c
@@ -9,10 +9,12 @@ 
 #include <rte_malloc.h>
 #include <rte_errno.h>
 #include <rte_tailq.h>
+#include <rte_ring_elem.h>
 
 #include "rte_member.h"
 #include "rte_member_ht.h"
 #include "rte_member_vbf.h"
+#include "rte_member_sketch.h"
 
 TAILQ_HEAD(rte_member_list, rte_tailq_entry);
 static struct rte_tailq_elem rte_member_tailq = {
@@ -72,6 +74,9 @@  rte_member_free(struct rte_member_setsum *setsum)
 	case RTE_MEMBER_TYPE_VBF:
 		rte_member_free_vbf(setsum);
 		break;
+	case RTE_MEMBER_TYPE_SKETCH:
+		rte_member_free_sketch(setsum);
+		break;
 	default:
 		break;
 	}
@@ -86,6 +91,8 @@  rte_member_create(const struct rte_member_parameters *params)
 	struct rte_member_list *member_list;
 	struct rte_member_setsum *setsum;
 	int ret;
+	char ring_name[RTE_RING_NAMESIZE];
+	struct rte_ring *sketch_key_ring = NULL;
 
 	if (params == NULL) {
 		rte_errno = EINVAL;
@@ -100,6 +107,16 @@  rte_member_create(const struct rte_member_parameters *params)
 		return NULL;
 	}
 
+	if (params->type == RTE_MEMBER_TYPE_SKETCH) {
+		snprintf(ring_name, sizeof(ring_name), "SK_%s", params->name);
+		sketch_key_ring = rte_ring_create_elem(ring_name, sizeof(uint32_t),
+				rte_align32pow2(params->top_k), params->socket_id, 0);
+		if (sketch_key_ring == NULL) {
+			RTE_MEMBER_LOG(ERR, "Sketch Ring Memory allocation failed\n");
+			return NULL;
+		}
+	}
+
 	member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list);
 
 	rte_mcfg_tailq_write_lock();
@@ -145,6 +162,9 @@  rte_member_create(const struct rte_member_parameters *params)
 	case RTE_MEMBER_TYPE_VBF:
 		ret = rte_member_create_vbf(setsum, params);
 		break;
+	case RTE_MEMBER_TYPE_SKETCH:
+		ret = rte_member_create_sketch(setsum, params, sketch_key_ring);
+		break;
 	default:
 		goto error_unlock_exit;
 	}
@@ -162,6 +182,7 @@  rte_member_create(const struct rte_member_parameters *params)
 error_unlock_exit:
 	rte_free(te);
 	rte_free(setsum);
+	rte_ring_free(sketch_key_ring);
 	rte_mcfg_tailq_write_unlock();
 	return NULL;
 }
@@ -178,6 +199,23 @@  rte_member_add(const struct rte_member_setsum *setsum, const void *key,
 		return rte_member_add_ht(setsum, key, set_id);
 	case RTE_MEMBER_TYPE_VBF:
 		return rte_member_add_vbf(setsum, key, set_id);
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_add_sketch(setsum, key, set_id);
+	default:
+		return -EINVAL;
+	}
+}
+
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+			  const void *key, uint32_t byte_count)
+{
+	if (setsum == NULL || key == NULL || byte_count == 0)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_add_sketch_byte_count(setsum, key, byte_count);
 	default:
 		return -EINVAL;
 	}
@@ -195,6 +233,8 @@  rte_member_lookup(const struct rte_member_setsum *setsum, const void *key,
 		return rte_member_lookup_ht(setsum, key, set_id);
 	case RTE_MEMBER_TYPE_VBF:
 		return rte_member_lookup_vbf(setsum, key, set_id);
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_lookup_sketch(setsum, key, set_id);
 	default:
 		return -EINVAL;
 	}
@@ -261,6 +301,36 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
 	}
 }
 
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+		       const void *key, uint64_t *output)
+{
+	if (setsum == NULL || key == NULL || output == NULL)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_query_sketch(setsum, key, output);
+	default:
+		return -EINVAL;
+	}
+}
+
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+				void **key, uint64_t *count)
+{
+	if (setsum == NULL || key == NULL || count == NULL)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_report_heavyhitter_sketch(setsum, key, count);
+	default:
+		return -EINVAL;
+	}
+}
+
 int
 rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
 			member_set_t set_id)
@@ -272,6 +342,8 @@  rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
 	case RTE_MEMBER_TYPE_HT:
 		return rte_member_delete_ht(setsum, key, set_id);
 	/* current vBF implementation does not support delete function */
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_delete_sketch(setsum, key);
 	case RTE_MEMBER_TYPE_VBF:
 	default:
 		return -EINVAL;
@@ -290,6 +362,9 @@  rte_member_reset(const struct rte_member_setsum *setsum)
 	case RTE_MEMBER_TYPE_VBF:
 		rte_member_reset_vbf(setsum);
 		return;
+	case RTE_MEMBER_TYPE_SKETCH:
+		rte_member_reset_sketch(setsum);
+		return;
 	default:
 		return;
 	}
diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h
index 2611015771..c133fa3ed7 100644
--- a/lib/member/rte_member.h
+++ b/lib/member/rte_member.h
@@ -39,6 +39,18 @@ 
  * |          |                     | not overwrite  |                         |
  * |          |                     | existing key.  |                         |
  * +----------+---------------------+----------------+-------------------------+
+ * +==========+=============================+
+ * |   type   |      sketch                 |
+ * +==========+=============================+
+ * |structure | counting bloom filter array |
+ * +----------+-----------------------------+
+ * |set id    | 1: heavy set, 0: light set  |
+ * |          |                             |
+ * +----------+-----------------------------+
+ * |usages &  | count size of a flow,       |
+ * |properties| used for heavy hitter       |
+ * |          | detection.                  |
+ * +----------+-----------------------------+
  * -->
  */
 
@@ -50,6 +62,8 @@  extern "C" {
 #endif
 
 #include <stdint.h>
+#include <stdbool.h>
+#include <inttypes.h>
 
 #include <rte_common.h>
 
@@ -65,6 +79,20 @@  typedef uint16_t member_set_t;
 #define RTE_MEMBER_BUCKET_ENTRIES 16
 /** Maximum number of characters in setsum name. */
 #define RTE_MEMBER_NAMESIZE 32
+/** Maximum value returned by the random number generator. */
+#define RTE_RAND_MAX      ~0LLU
+/**
+ * Since packets are skipped in the sampling-based algorithm, the accuracy
+ * of the accounting results is not guaranteed in the startup stage; a
+ * "convergence time" of receiving enough packets is needed before the
+ * required accuracy is achieved.
+ * For sketch, set this flag to prefer the always-bounded mode, which only
+ * starts sampling after enough packets have been received, so that the
+ * result accuracy is always bounded.
+ */
+#define RTE_MEMBER_SKETCH_ALWAYS_BOUNDED 0x01
+/** For sketch, set this flag to count packet bytes instead of packet count. */
+#define RTE_MEMBER_SKETCH_COUNT_BYTE 0x02
 
 /** @internal Hash function used by membership library. */
 #if defined(RTE_ARCH_X86) || defined(__ARM_FEATURE_CRC32)
@@ -104,6 +132,7 @@  struct rte_member_parameters;
 enum rte_member_setsum_type {
 	RTE_MEMBER_TYPE_HT = 0,  /**< Hash table based set summary. */
 	RTE_MEMBER_TYPE_VBF,     /**< Vector of bloom filters. */
+	RTE_MEMBER_TYPE_SKETCH,  /**< Count-min sketch based set summary. */
 	RTE_MEMBER_NUM_TYPE
 };
 
@@ -114,6 +143,19 @@  enum rte_member_sig_compare_function {
 	RTE_MEMBER_COMPARE_NUM
 };
 
+/* sketch update function with different implementations. */
+typedef void (*sketch_update_fn_t)(const struct rte_member_setsum *ss,
+				   const void *key,
+				   uint32_t count);
+
+/* sketch lookup function with different implementations. */
+typedef uint64_t (*sketch_lookup_fn_t)(const struct rte_member_setsum *ss,
+				       const void *key);
+
+/* sketch delete function with different implementations. */
+typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss,
+				   const void *key);
+
 /** @internal setsummary structure. */
 struct rte_member_setsum {
 	enum rte_member_setsum_type type; /* Type of the set summary. */
@@ -134,6 +176,21 @@  struct rte_member_setsum {
 	uint32_t bit_mask;	/* Bit mask to get bit location in bf. */
 	uint32_t num_hashes;	/* Number of hash values to index bf. */
 
+	/* Parameters for sketch */
+	float error_rate;       /* Sketch error rate requested by the user. */
+	float sample_rate;      /* Internal per-row sampling rate. */
+	uint32_t num_col;       /* Number of counters per sketch row. */
+	uint32_t num_row;       /* Number of sketch rows. */
+	int always_bounded;     /* Run a convergence stage before sampling. */
+	double converge_thresh; /* Packet count needed for convergence. */
+	uint32_t topk;          /* Number of heavy hitter keys to track. */
+	uint32_t count_byte;    /* Count bytes instead of packets. */
+	uint64_t *hash_seeds;   /* Per-row hash seeds. */
+	sketch_update_fn_t sketch_update; /* Pointer to the sketch update function */
+	sketch_lookup_fn_t sketch_lookup; /* Pointer to the sketch lookup function */
+	sketch_delete_fn_t sketch_delete; /* Pointer to the sketch delete function */
+
+	void *runtime_var;      /* Runtime state: min-heap, key slots, counters. */
 	uint32_t mul_shift;  /* vbf internal variable used during bit test. */
 	uint32_t div_shift;  /* vbf internal variable used during bit test. */
 
@@ -143,6 +200,9 @@  struct rte_member_setsum {
 	/* Second cache line should start here. */
 	uint32_t socket_id;          /* NUMA Socket ID for memory. */
 	char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */
+#ifdef RTE_ARCH_X86
+	bool use_avx512;
+#endif
 } __rte_cache_aligned;
 
 /**
@@ -261,8 +321,33 @@  struct rte_member_parameters {
 	 */
 	uint32_t sec_hash_seed;
 
+	/**
+	 * For the count-min sketch data structure, the error rate defines the
+	 * accuracy required by the user. A lower error rate needs more memory,
+	 * but the flow size is estimated more accurately.
+	 */
+	float error_rate;
+
+	/**
+	 * The sampling rate is the internal sample rate of the rows of the
+	 * count-min sketch. A lower sampling rate reduces CPU overhead, but the
+	 * data structure will require more time to converge statistically.
+	 */
+	float sample_rate;
+
+	/**
+	 * Number of top heavy hitters to be reported. The library internally
+	 * keeps the heavy hitter keys for the final report.
+	 */
+	uint32_t top_k;
+
+	/**
+	 * Extra flags that may be passed in by the user.
+	 */
+	uint32_t extra_flag;
+
 	int socket_id;			/**< NUMA Socket ID for memory. */
-};
+} __rte_cache_aligned;
 
 /**
  * @warning
@@ -418,7 +503,7 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
  *   RTE_MEMBER_NO_MATCH by default is set as 0.
  *   For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved.
  *   For vBF mode the set id is limited by the num_set parameter when create
- *   the set-summary.
+ *   the set-summary. For sketch mode, this id is ignored.
  * @return
  *   HT (cache mode) and vBF should never fail unless the set_id is not in the
  *   valid range. In such case -EINVAL is returned.
@@ -429,12 +514,72 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
  *   Return 0 for HT (cache mode) if the add does not cause
  *   eviction, return 1 otherwise. Return 0 for non-cache mode if success,
  *   -ENOSPC for full, and 1 if cuckoo eviction happens.
- *   Always returns 0 for vBF mode.
+ *   Always returns 0 for vBF mode and sketch.
  */
 int
 rte_member_add(const struct rte_member_setsum *setsum, const void *key,
 			member_set_t set_id);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add the packet byte size into the sketch.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be added.
+ * @param byte_count
+ *   Add the byte count of the packet into the sketch.
+ * @return
+ * Return -EINVAL for invalid parameters, otherwise return 0.
+ */
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+			  const void *key, uint32_t byte_count);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Query packet count for a certain flow-key.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be queried.
+ * @param count
+ *   The output packet count or byte count.
+ * @return
+ *   Return -EINVAL for invalid parameters, 0 on success.
+ */
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+		       const void *key, uint64_t *count);
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Report the heavy hitter flow-keys tracked by the set-summary (SS).
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param keys
+ *   Pointer of the output top-k key array.
+ * @param counts
+ *   Pointer of the output packet count or byte count array of the top-k keys.
+ * @return
+ *   Return -EINVAL for invalid parameters. Otherwise return a positive
+ *   integer indicating how many heavy hitters are reported.
+ */
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+			      void **keys, uint64_t *counts);
+
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
diff --git a/lib/member/rte_member_heap.h b/lib/member/rte_member_heap.h
new file mode 100644
index 0000000000..3ced34160a
--- /dev/null
+++ b/lib/member/rte_member_heap.h
@@ -0,0 +1,424 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
+ */
+
+#ifndef _RTE_MEMBER_HEAP_H_
+#define _RTE_MEMBER_HEAP_H_
+
+#include <rte_ring_elem.h>
+#include "rte_member.h"
+
+#define LCHILD(x) (2 * (x) + 1)
+#define RCHILD(x) (2 * (x) + 2)
+#define PARENT(x) (((x) - 1) / 2)
+
+#define HASH_BKT_SIZE 16
+#define HASH_HP_MULTI 4
+#define HASH_RESIZE_MULTI 2
+
+struct hash_bkt {
+	uint16_t sig[HASH_BKT_SIZE];
+	uint16_t idx[HASH_BKT_SIZE];
+};
+
+struct hash {
+	uint16_t bkt_cnt;
+	uint16_t num_item;
+	uint32_t seed;
+	struct hash_bkt buckets[];
+};
+
+struct node {
+	void *key;
+	uint64_t count;
+};
+
+struct minheap {
+	uint32_t key_len;
+	uint32_t size;
+	uint32_t socket;
+	struct hash *hashtable;
+	struct node *elem;
+};
+
+static int
+hash_table_insert(const void *key, int value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+	int i;
+
+	for (i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].idx[i] == 0) {
+			table->buckets[idx].idx[i] = value;
+			table->buckets[idx].sig[i] = sig;
+			table->num_item++;
+			return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int
+hash_table_update(const void *key, int old_value, int value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+	int i;
+
+	for (i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == old_value) {
+			table->buckets[idx].idx[i] = value;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+static int
+hash_table_del(const void *key, uint16_t value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+	int i;
+
+	for (i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == value) {
+			table->buckets[idx].idx[i] = 0;
+			table->num_item--;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+static int
+hash_table_lookup(const void *key, int key_len, struct minheap *hp)
+{
+	struct hash *table = hp->hashtable;
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+	int i;
+
+	for (i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] != 0) {
+			uint32_t hp_idx = table->buckets[idx].idx[i] - 1;
+
+			if (memcmp(hp->elem[hp_idx].key, key, hp->key_len) == 0)
+				return hp_idx;
+		}
+	}
+
+	return -ENOENT; /* key doesn't exist */
+}
+
+static int
+resize_hash_table(struct minheap *hp)
+{
+	uint32_t i;
+	uint32_t new_bkt_cnt;
+
+	while (1) {
+		new_bkt_cnt = hp->hashtable->bkt_cnt * HASH_RESIZE_MULTI;
+
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT load factor is [%f]\n",
+			hp->hashtable->num_item / ((float)hp->hashtable->bkt_cnt * HASH_BKT_SIZE));
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT resize happen!\n");
+		rte_free(hp->hashtable);
+		hp->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+						new_bkt_cnt * sizeof(struct hash_bkt),
+						RTE_CACHE_LINE_SIZE, hp->socket);
+
+		if (hp->hashtable == NULL) {
+			RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n");
+			return -ENOMEM;
+		}
+
+		hp->hashtable->bkt_cnt = new_bkt_cnt;
+
+		for (i = 0; i < hp->size; ++i) {
+			if (hash_table_insert(hp->elem[i].key,
+				i + 1, hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR,
+					"Sketch Minheap HT resize insert fail!\n");
+				break;
+			}
+		}
+		if (i == hp->size)
+			break;
+	}
+
+	return 0;
+}
+
+/* find the item in the given minheap */
+static int
+rte_member_minheap_find(struct minheap *hp, const void *key)
+{
+	int idx = hash_table_lookup(key, hp->key_len, hp);
+	return idx;
+}
+
+static int
+rte_member_minheap_init(struct minheap *heap, int size,
+			uint32_t socket, uint32_t seed)
+{
+	heap->elem = rte_zmalloc_socket(NULL, sizeof(struct node) * size,
+				RTE_CACHE_LINE_SIZE, socket);
+	if (heap->elem == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap elem allocation failed\n");
+		return -ENOMEM;
+	}
+
+	uint32_t hash_bkt_cnt = rte_align32pow2(size * HASH_HP_MULTI) / HASH_BKT_SIZE;
+
+	if (hash_bkt_cnt == 0)
+		hash_bkt_cnt = 1;
+
+	heap->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+					hash_bkt_cnt * sizeof(struct hash_bkt),
+					RTE_CACHE_LINE_SIZE, socket);
+
+	if (heap->hashtable == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n");
+		rte_free(heap->elem);
+		return -ENOMEM;
+	}
+
+	heap->hashtable->seed = seed;
+	heap->hashtable->bkt_cnt = hash_bkt_cnt;
+	heap->socket = socket;
+
+	return 0;
+}
+
+/* swap the minheap nodes */
+static __rte_always_inline void
+rte_member_heap_swap(struct node *n1, struct node *n2)
+{
+	struct node temp = *n1;
+	*n1 = *n2;
+	*n2 = temp;
+}
+
+/* heapify function */
+static void
+rte_member_heapify(struct minheap *hp, uint32_t idx, bool update_hash)
+{
+	uint32_t smallest;
+
+	if (LCHILD(idx) < hp->size &&
+			hp->elem[LCHILD(idx)].count < hp->elem[idx].count)
+		smallest = LCHILD(idx);
+	else
+		smallest = idx;
+
+	if (RCHILD(idx) < hp->size &&
+			hp->elem[RCHILD(idx)].count < hp->elem[smallest].count)
+		smallest = RCHILD(idx);
+
+	if (smallest != idx) {
+		rte_member_heap_swap(&(hp->elem[idx]), &(hp->elem[smallest]));
+
+		if (update_hash) {
+			if (hash_table_update(hp->elem[smallest].key, idx + 1, smallest + 1,
+					hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+				return;
+			}
+
+			if (hash_table_update(hp->elem[idx].key, smallest + 1, idx + 1,
+					hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+				return;
+			}
+		}
+		rte_member_heapify(hp, smallest, update_hash);
+	}
+}
+
+/* insert a node into the minheap */
+static int
+rte_member_minheap_insert_node(struct minheap *hp, const void *key,
+			       int counter, void *key_slot,
+			       struct rte_ring *free_key_slot)
+{
+	struct node nd;
+	uint32_t slot_id;
+
+	if (rte_ring_sc_dequeue_elem(free_key_slot, &slot_id, sizeof(uint32_t)) != 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap get empty keyslot failed\n");
+		return -1;
+	}
+
+	nd.count = counter;
+	nd.key = RTE_PTR_ADD(key_slot, slot_id * hp->key_len);
+
+	memcpy(nd.key, key, hp->key_len);
+
+	uint32_t i = (hp->size)++;
+
+	while (i && nd.count < hp->elem[PARENT(i)].count) {
+		hp->elem[i] = hp->elem[PARENT(i)];
+		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+				hp->key_len, hp->hashtable) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+			return -1;
+		}
+		i = PARENT(i);
+	}
+	hp->elem[i] = nd;
+
+	if (hash_table_insert(key, i + 1, hp->key_len, hp->hashtable) < 0) {
+		if (resize_hash_table(hp) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table resize failed\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* delete a key from the minheap */
+static int
+rte_member_minheap_delete_node(struct minheap *hp, const void *key,
+			       void *key_slot, struct rte_ring *free_key_slot)
+{
+	int idx = rte_member_minheap_find(hp, key);
+	uint32_t offset = RTE_PTR_DIFF(hp->elem[idx].key, key_slot) / hp->key_len;
+
+	if (hash_table_del(key, idx + 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+		return -1;
+	}
+
+	rte_ring_sp_enqueue_elem(free_key_slot, &offset, sizeof(uint32_t));
+
+	if (idx == (int)(hp->size - 1)) {
+		hp->size--;
+		return 0;
+	}
+
+	hp->elem[idx] = hp->elem[hp->size - 1];
+
+	if (hash_table_update(hp->elem[idx].key, hp->size, idx + 1,
+				hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+		return -1;
+	}
+	hp->size--;
+	rte_member_heapify(hp, idx, true);
+
+	return 0;
+}
+
+/* replace a min node with a new key. */
+static int
+rte_member_minheap_replace_node(struct minheap *hp,
+				const void *new_key,
+				int new_counter)
+{
+	struct node nd;
+	void *recycle_key = NULL;
+
+	recycle_key = hp->elem[0].key;
+
+	if (hash_table_del(recycle_key, 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+		return -1;
+	}
+
+	hp->elem[0] = hp->elem[hp->size - 1];
+
+	if (hash_table_update(hp->elem[0].key, hp->size, 1,
+				hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+		return -1;
+	}
+	hp->size--;
+
+	rte_member_heapify(hp, 0, true);
+
+	nd.count = new_counter;
+	nd.key = recycle_key;
+
+	memcpy(nd.key, new_key, hp->key_len);
+
+	uint32_t i = (hp->size)++;
+
+	while (i && nd.count < hp->elem[PARENT(i)].count) {
+		hp->elem[i] = hp->elem[PARENT(i)];
+		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+				hp->key_len, hp->hashtable) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+			return -1;
+		}
+		i = PARENT(i);
+	}
+
+	hp->elem[i] = nd;
+
+	if (hash_table_insert(new_key, i + 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace insert failed\n");
+		if (resize_hash_table(hp) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace resize failed\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* sort the heap into a descending array */
+static void
+rte_member_heapsort(struct minheap *hp, struct node *result_array)
+{
+	struct minheap new_hp;
+
+	/* build a new heap using the given array */
+	new_hp.size = hp->size;
+	new_hp.key_len = hp->key_len;
+	new_hp.elem = result_array;
+	memcpy(result_array, hp->elem, hp->size * sizeof(struct node));
+
+	/* sort the new heap */
+	while (new_hp.size > 1) {
+		rte_member_heap_swap(&(new_hp.elem[0]), &(new_hp.elem[new_hp.size - 1]));
+		new_hp.size--;
+		rte_member_heapify(&new_hp, 0, false);
+	}
+}
+
+static void
+rte_member_minheap_free(struct minheap *hp)
+{
+	if (hp == NULL)
+		return;
+
+	rte_free(hp->elem);
+	rte_free(hp->hashtable);
+}
+
+static void
+rte_member_minheap_reset(struct minheap *hp)
+{
+	if (hp == NULL)
+		return;
+
+	memset(hp->elem, 0, sizeof(struct node) * hp->size);
+	hp->size = 0;
+
+	memset((char *)hp->hashtable + sizeof(struct hash), 0,
+			hp->hashtable->bkt_cnt * sizeof(struct hash_bkt));
+	hp->hashtable->num_item = 0;
+}
+
+#endif /* _RTE_MEMBER_HEAP_H_ */
diff --git a/lib/member/rte_member_sketch.c b/lib/member/rte_member_sketch.c
new file mode 100644
index 0000000000..524ba77620
--- /dev/null
+++ b/lib/member/rte_member_sketch.c
@@ -0,0 +1,594 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_prefetch.h>
+#include <rte_ring_elem.h>
+
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+#include "rte_member_heap.h"
+
+#ifdef CC_AVX512_SUPPORT
+#include "rte_member_sketch_avx512.h"
+#endif /* CC_AVX512_SUPPORT */
+
+struct sketch_runtime {
+	uint64_t pkt_cnt;
+	uint32_t until_next;
+	int converged;
+	struct minheap heap;
+	struct node *report_array;
+	void *key_slots;
+	struct rte_ring *free_key_slots;
+} __rte_cache_aligned;
+
+/*
+ * Geometric sampling to calculate how many packets need to be
+ * skipped until the next update. This method mitigates the CPU
+ * overhead compared with per-packet coin-toss sampling.
+ */
+static uint32_t
+draw_geometric(const struct rte_member_setsum *ss)
+{
+	double rand = 1;
+
+	if (ss->sample_rate == 1)
+		return 1;
+
+	while (rand == 1 || rand == 0)
+		rand = (double) rte_rand() / (double)(RTE_RAND_MAX);
+
+	return (uint32_t)ceil(log(1 - rand) / log(1 - ss->sample_rate));
+}
+
+static void
+isort(uint64_t *array, int n)
+{
+	int i;
+
+	for (i = 1; i < n; i++) {
+		uint64_t t = array[i];
+		int j;
+
+		for (j = i - 1; j >= 0; j--) {
+			if (t < array[j])
+				array[j + 1] = array[j];
+			else
+				break;
+		}
+		array[j + 1] = t;
+	}
+}
+
+static __rte_always_inline void
+swap(uint64_t *a, uint64_t *b)
+{
+	uint64_t tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+static uint64_t
+medianof5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
+{
+	if (a > b)
+		swap(&a, &b);
+	if (c > d)
+		swap(&c, &d);
+	if (a > c) {
+		if (d > e)
+			swap(&c, &e);
+		else {
+			swap(&c, &d);
+			swap(&d, &e);
+		}
+	} else {
+		if (b > e)
+			swap(&a, &e);
+		else {
+			swap(&a, &b);
+			swap(&b, &e);
+		}
+	}
+
+	if (a > c)
+		return a > d ? d : a;
+	else
+		return b > c ? c : b;
+}
+
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+			 const struct rte_member_parameters *params,
+			 struct rte_ring *ring)
+{
+	struct sketch_runtime *runtime;
+	uint32_t num_col;
+	uint32_t i;
+
+	if (params->sample_rate == 0 || params->sample_rate > 1) {
+		rte_errno = EINVAL;
+		RTE_MEMBER_LOG(ERR,
+			"Membership Sketch created with invalid parameters\n");
+		return -EINVAL;
+	}
+
+	if (params->extra_flag & RTE_MEMBER_SKETCH_COUNT_BYTE)
+		ss->count_byte = 1;
+
+#ifdef RTE_ARCH_X86
+	if (ss->count_byte == 1 &&
+		rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1) {
+#ifdef CC_AVX512_SUPPORT
+		ss->use_avx512 = true;
+#else
+		ss->use_avx512 = false;
+#endif
+	}
+
+	if (ss->use_avx512 == true) {
+#ifdef CC_AVX512_SUPPORT
+		ss->num_row = NUM_ROW_VEC;
+		RTE_MEMBER_LOG(NOTICE,
+			"Membership Sketch AVX512 update/lookup/delete ops are selected\n");
+		ss->sketch_update = sketch_update_avx512;
+		ss->sketch_lookup = sketch_lookup_avx512;
+		ss->sketch_delete = sketch_delete_avx512;
+#endif
+	} else
+#endif
+	{
+		ss->num_row = NUM_ROW_SCALAR;
+		RTE_MEMBER_LOG(NOTICE,
+			"Membership Sketch SCALAR update/lookup/delete ops are selected\n");
+		ss->sketch_update = sketch_update_scalar;
+		ss->sketch_lookup = sketch_lookup_scalar;
+		ss->sketch_delete = sketch_delete_scalar;
+	}
+
+	ss->socket_id = params->socket_id;
+
+	if (ss->count_byte == 0)
+		num_col = 4.0 / params->error_rate / params->sample_rate;
+#ifdef RTE_ARCH_X86
+	else if (ss->use_avx512 == true)
+		num_col = rte_align32pow2(4.0 / params->error_rate);
+#endif
+	else
+		num_col = 4.0 / params->error_rate;
+
+	ss->table = rte_zmalloc_socket(NULL,
+			sizeof(uint64_t) * num_col * ss->num_row,
+			RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->table == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Table memory allocation failed\n");
+		return -ENOMEM;
+	}
+
+	ss->hash_seeds = rte_zmalloc_socket(NULL, sizeof(uint64_t) * ss->num_row,
+			RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->hash_seeds == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Hashseeds memory allocation failed\n");
+		return -ENOMEM;
+	}
+
+	ss->runtime_var = rte_zmalloc_socket(NULL, sizeof(struct sketch_runtime),
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->runtime_var == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Runtime memory allocation failed\n");
+		rte_free(ss);
+		return -ENOMEM;
+	}
+	runtime = ss->runtime_var;
+
+	ss->num_col = num_col;
+	ss->sample_rate = params->sample_rate;
+	ss->prim_hash_seed = params->prim_hash_seed;
+	ss->sec_hash_seed = params->sec_hash_seed;
+	ss->error_rate = params->error_rate;
+	ss->topk = params->top_k;
+	ss->key_len = params->key_len;
+	runtime->heap.key_len = ss->key_len;
+
+	runtime->key_slots = rte_zmalloc_socket(NULL, ss->key_len * ss->topk,
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (runtime->key_slots == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Key Slots allocation failed\n");
+		goto error;
+	}
+
+	runtime->free_key_slots = ring;
+	for (i = 0; i < ss->topk; i++)
+		rte_ring_sp_enqueue_elem(runtime->free_key_slots,
+					&i, sizeof(uint32_t));
+
+	if (rte_member_minheap_init(&(runtime->heap), params->top_k,
+			ss->socket_id, params->prim_hash_seed) < 0) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap allocation failed\n");
+		goto error_runtime;
+	}
+
+	runtime->report_array = rte_zmalloc_socket(NULL, sizeof(struct node) * ss->topk,
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (runtime->report_array == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Runtime Report Array allocation failed\n");
+		goto error_runtime;
+	}
+
+	rte_srand(ss->prim_hash_seed);
+	for (i = 0; i < ss->num_row; i++)
+		ss->hash_seeds[i] = rte_rand();
+
+	if (params->extra_flag & RTE_MEMBER_SKETCH_ALWAYS_BOUNDED)
+		ss->always_bounded = 1;
+
+	if (ss->always_bounded) {
+		double delta = 1.0 / (pow(2, ss->num_row));
+
+		ss->converge_thresh = 10 * pow(ss->error_rate, -2.0) * sqrt(log(1 / delta));
+	}
+
+	RTE_MEMBER_LOG(DEBUG, "Sketch created, "
+		"the total memory required is %u Bytes\n",  ss->num_col * ss->num_row * 8);
+
+	return 0;
+
+error_runtime:
+	rte_member_minheap_free(&runtime->heap);
+	rte_ring_free(runtime->free_key_slots);
+	rte_free(runtime->key_slots);
+error:
+	rte_free(runtime);
+	rte_free(ss);
+
+	return -ENOMEM;
+}
+
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col[ss->num_row];
+	uint64_t count_row[ss->num_row];
+	uint32_t cur_row;
+	uint64_t count;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+		rte_prefetch0(&count_array[cur_row * ss->num_col + col[cur_row]]);
+	}
+
+	/* for a regular count-min (sample rate 1) or byte mode, report the min */
+	if (ss->sample_rate == 1 || ss->count_byte == 1)
+		return count_min(ss, col);
+
+	memset(count_row, 0, sizeof(uint64_t) * ss->num_row);
+
+	/* otherwise we report the median number */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+		count_row[cur_row] = count_array[cur_row * ss->num_col + col[cur_row]];
+
+	if (ss->num_row == 5)
+		return medianof5(count_row[0], count_row[1],
+				count_row[2], count_row[3], count_row[4]);
+
+	isort(count_row, ss->num_row);
+
+	if (ss->num_row % 2 == 0) {
+		count = (count_row[ss->num_row / 2] + count_row[ss->num_row / 2 - 1]) / 2;
+		return count;
+	}
+	/* ss->num_row % 2 != 0 */
+	count = count_row[ss->num_row / 2];
+
+	return count;
+}
+
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+	uint64_t *count_array = ss->table;
+	uint32_t cur_row;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+		/* set corresponding counter to 0 */
+		count_array[cur_row * ss->num_col + col[cur_row]] = 0;
+	}
+}
+
+int
+rte_member_query_sketch(const struct rte_member_setsum *ss,
+			const void *key,
+			uint64_t *output)
+{
+	uint64_t count = ss->sketch_lookup(ss, key);
+	*output = count;
+
+	return 0;
+}
+
+void
+rte_member_update_heap(const struct rte_member_setsum *ss)
+{
+	uint32_t i;
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	for (i = 0; i < runtime_var->heap.size; i++) {
+		uint64_t count = ss->sketch_lookup(ss, runtime_var->heap.elem[i].key);
+
+		runtime_var->heap.elem[i].count = count;
+	}
+}
+
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+				     void **key,
+				     uint64_t *count)
+{
+	uint32_t i;
+	struct sketch_runtime *runtime_var = setsum->runtime_var;
+
+	rte_member_update_heap(setsum);
+	rte_member_heapsort(&(runtime_var->heap), runtime_var->report_array);
+
+	for (i = 0; i < runtime_var->heap.size; i++) {
+		key[i] = runtime_var->report_array[i].key;
+		count[i] = runtime_var->report_array[i].count;
+	}
+
+	return runtime_var->heap.size;
+}
+
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *ss,
+			 const void *key, member_set_t *set_id)
+{
+	uint64_t count = ss->sketch_lookup(ss, key);
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	if (runtime_var->heap.size > 0 && count >= runtime_var->heap.elem[0].count)
+		*set_id = 1;
+	else
+		*set_id = 0;
+
+	if (count == 0)
+		return 0;
+	else
+		return 1;
+}
+
+static void
+should_converge(const struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	/* For count min sketch - L1 norm */
+	if (runtime_var->pkt_cnt > ss->converge_thresh) {
+		runtime_var->converged = 1;
+		RTE_MEMBER_LOG(DEBUG, "Sketch converged, begin sampling "
+					"from key count %"PRIu64"\n",
+					runtime_var->pkt_cnt);
+	}
+}
+
+static void
+sketch_update_row(const struct rte_member_setsum *ss, const void *key,
+		  uint32_t count, uint32_t cur_row)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+	/* sketch counter update */
+	count_array[cur_row * ss->num_col + col] +=
+			ceil(count / (ss->sample_rate));
+}
+
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col;
+	uint32_t cur_row;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col = MEMBER_HASH_FUNC(key, ss->key_len,
+				ss->hash_seeds[cur_row]) % ss->num_col;
+		count_array[cur_row * ss->num_col + col] += count;
+	}
+}
+
+static void
+heap_update(const struct rte_member_setsum *ss, const void *key)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint64_t key_cnt = 0;
+	int found;
+
+	/* We also update the heap for this key */
+	key_cnt = ss->sketch_lookup(ss, key);
+	if (key_cnt > runtime_var->heap.elem[0].count) {
+		found = rte_member_minheap_find(&runtime_var->heap, key);
+		/* the key is found in the top-k heap */
+		if (found >= 0) {
+			if (runtime_var->heap.elem[found].count < key_cnt)
+				rte_member_heapify(&runtime_var->heap, found, true);
+
+			runtime_var->heap.elem[found].count = key_cnt;
+		} else if (runtime_var->heap.size < ss->topk) {
+			rte_member_minheap_insert_node(&runtime_var->heap, key,
+				key_cnt, runtime_var->key_slots, runtime_var->free_key_slots);
+		} else {
+			rte_member_minheap_replace_node(&runtime_var->heap, key, key_cnt);
+		}
+	} else if (runtime_var->heap.size < ss->topk) {
+		found = rte_member_minheap_find(&runtime_var->heap, key);
+		if (found >= 0) {
+			if (runtime_var->heap.elem[found].count < key_cnt)
+				rte_member_heapify(&runtime_var->heap, found, true);
+
+			runtime_var->heap.elem[found].count = key_cnt;
+		} else
+			rte_member_minheap_insert_node(&runtime_var->heap, key,
+				key_cnt, runtime_var->key_slots, runtime_var->free_key_slots);
+	}
+}
+
+/*
+ * Add a single packet into the sketch.
+ * The sketch value is measured by packet count in this mode.
+ */
+int
+rte_member_add_sketch(const struct rte_member_setsum *ss,
+		      const void *key,
+		      __rte_unused member_set_t set_id)
+{
+	uint32_t cur_row;
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint32_t *until_next = &(runtime_var->until_next);
+
+	/*
+	 * If sketch is measured by byte count,
+	 * the rte_member_add_sketch_byte_count routine should be used.
+	 */
+	if (ss->count_byte == 1) {
+		RTE_MEMBER_LOG(ERR, "Sketch is Byte Mode, "
+			"should use rte_member_add_byte_count()!\n");
+		return -EINVAL;
+	}
+
+	if (ss->sample_rate == 1) {
+		ss->sketch_update(ss, key, 1);
+		heap_update(ss, key);
+		return 0;
+	}
+
+	/* convergence stage if it's needed */
+	if (ss->always_bounded && !runtime_var->converged) {
+		ss->sketch_update(ss, key, 1);
+
+		if (!((++runtime_var->pkt_cnt) & (INTERVAL - 1)))
+			should_converge(ss);
+
+		heap_update(ss, key);
+		return 0;
+	}
+
+	/* should we skip this packet */
+	if (*until_next >= ss->num_row) {
+		*until_next -= ss->num_row;
+		return 0;
+	}
+	cur_row = *until_next;
+	do {
+		sketch_update_row(ss, key, 1, cur_row);
+		*until_next = draw_geometric(ss);
+		if (cur_row + *until_next >= ss->num_row)
+			break;
+		cur_row += *until_next;
+	} while (1);
+
+	*until_next -= (ss->num_row - cur_row);
+
+	heap_update(ss, key);
+
+	return 0;
+}
+
+/*
+ * Add the byte count of the packet into the sketch.
+ * The sketch value is measured by byte count in this mode.
+ */
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+				 const void *key,
+				 uint32_t byte_count)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint32_t *until_next = &(runtime_var->until_next);
+
+	/* should not call this API if not in count byte mode */
+	if (ss->count_byte == 0) {
+		RTE_MEMBER_LOG(ERR, "Sketch is Pkt Mode, "
+			"should use rte_member_add()!\n");
+		return -EINVAL;
+	}
+
+	/* there's specific optimization for the sketch update */
+	ss->sketch_update(ss, key, byte_count);
+
+	if (*until_next != 0) {
+		*until_next = *until_next - 1;
+		return 0;
+	}
+
+	*until_next = draw_geometric(ss) - 1;
+
+	heap_update(ss, key);
+
+	return 0;
+}
+
+int
+rte_member_delete_sketch(const struct rte_member_setsum *ss,
+			 const void *key)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	int found;
+
+	found = rte_member_minheap_find(&runtime_var->heap, key);
+	if (found < 0)
+		return -1;
+
+	ss->sketch_delete(ss, key);
+
+	return rte_member_minheap_delete_node
+		(&runtime_var->heap, key, runtime_var->key_slots, runtime_var->free_key_slots);
+}
+
+void
+rte_member_free_sketch(struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	rte_free(ss->table);
+	rte_member_minheap_free(&runtime_var->heap);
+	rte_free(runtime_var->key_slots);
+	rte_ring_free(runtime_var->free_key_slots);
+	rte_free(runtime_var);
+}
+
+void
+rte_member_reset_sketch(const struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint64_t *sketch = ss->table;
+	uint32_t i;
+
+	memset(sketch, 0, sizeof(uint64_t) * ss->num_col * ss->num_row);
+	rte_member_minheap_reset(&runtime_var->heap);
+	rte_ring_reset(runtime_var->free_key_slots);
+
+	for (i = 0; i < ss->topk; i++)
+		rte_ring_sp_enqueue_elem(runtime_var->free_key_slots, &i, sizeof(uint32_t));
+}
diff --git a/lib/member/rte_member_sketch.h b/lib/member/rte_member_sketch.h
new file mode 100644
index 0000000000..219323008b
--- /dev/null
+++ b/lib/member/rte_member_sketch.h
@@ -0,0 +1,97 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_H_
+#define _RTE_MEMBER_SKETCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_vect.h>
+#include <rte_ring_elem.h>
+
+#define NUM_ROW_SCALAR 5
+#define INTERVAL (1 << 15)
+
+#if !RTE_IS_POWER_OF_2(INTERVAL)
+#error sketch INTERVAL macro must be a power of 2
+#endif
+
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+			 const struct rte_member_parameters *params,
+			 struct rte_ring *r);
+
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *setsum,
+			 const void *key, member_set_t *set_id);
+
+int
+rte_member_add_sketch(const struct rte_member_setsum *setsum,
+		      const void *key,
+		      member_set_t set_id);
+
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+				 const void *key, uint32_t byte_count);
+
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count);
+
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss,
+		     const void *key);
+
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss,
+		     const void *key);
+
+int
+rte_member_delete_sketch(const struct rte_member_setsum *setsum,
+			 const void *key);
+
+int
+rte_member_query_sketch(const struct rte_member_setsum *setsum,
+			const void *key, uint64_t *output);
+
+void
+rte_member_free_sketch(struct rte_member_setsum *ss);
+
+void
+rte_member_reset_sketch(const struct rte_member_setsum *setsum);
+
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+				     void **key, uint64_t *count);
+
+void
+rte_member_update_heap(const struct rte_member_setsum *ss);
+
+static __rte_always_inline uint64_t
+count_min(const struct rte_member_setsum *ss, const uint32_t *hash_results)
+{
+	uint64_t *count_array = ss->table;
+	uint64_t count;
+	uint32_t cur_row;
+	uint64_t min = UINT64_MAX;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		uint64_t cnt = count_array[cur_row * ss->num_col + hash_results[cur_row]];
+
+		if (cnt < min)
+			min = cnt;
+	}
+	count = min;
+
+	return count;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_H_ */
diff --git a/lib/member/rte_member_sketch_avx512.c b/lib/member/rte_member_sketch_avx512.c
new file mode 100644
index 0000000000..c83f4b6fd1
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.c
@@ -0,0 +1,69 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "rte_member_sketch_avx512.h"
+
+__rte_always_inline void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t num_col = ss->num_col;
+	uint32_t key_len = ss->key_len;
+	__m256i v_row_base;
+	__m256i v_hash_result;
+	__m512i current_sketch;
+	__m512i updated_sketch;
+	__m512i v_count;
+
+	const __m256i v_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+	const __m256i v_col = _mm256_set1_epi32(num_col);
+
+	/* compute the hash results in parallel */
+	v_hash_result = rte_xxh64_sketch_avx512
+		(key, key_len, *(__m512i *)ss->hash_seeds, num_col);
+	v_row_base = _mm256_mullo_epi32(v_idx, v_col);
+	v_hash_result = _mm256_add_epi32(v_row_base, v_hash_result);
+
+	current_sketch =
+		_mm512_i32gather_epi64(v_hash_result, count_array, 8);
+	v_count = _mm512_set1_epi64(count);
+	updated_sketch = _mm512_add_epi64(current_sketch, v_count);
+	_mm512_i32scatter_epi64
+		((void *)count_array, v_hash_result, updated_sketch, 8);
+}
+
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+
+	/* currently only for sketch byte count mode */
+	__m256i v_hash_result = rte_xxh64_sketch_avx512
+		(key, ss->key_len, *(__m512i *)ss->hash_seeds, ss->num_col);
+	_mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+	return count_min(ss, col);
+}
+
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+	uint64_t *count_array = ss->table;
+	uint64_t min = UINT64_MAX;
+	uint32_t cur_row;
+
+	__m256i v_hash_result = rte_xxh64_sketch_avx512
+		(key, ss->key_len, *(__m512i *)ss->hash_seeds,
+		 RTE_ALIGN_FLOOR(ss->num_col, 32));
+	_mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+	min = count_min(ss, col);
+
+	/* subtract the min value from all the counters */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+		count_array[cur_row * ss->num_col + col[cur_row]] -= min;
+}
diff --git a/lib/member/rte_member_sketch_avx512.h b/lib/member/rte_member_sketch_avx512.h
new file mode 100644
index 0000000000..e7c25da643
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.h
@@ -0,0 +1,36 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_AVX512_H_
+#define _RTE_MEMBER_SKETCH_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_vect.h>
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+#include "rte_xxh64_avx512.h"
+
+#define NUM_ROW_VEC 8
+
+void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count);
+
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss,
+		     const void *key);
+
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss,
+		     const void *key);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_AVX512_H_ */
diff --git a/lib/member/rte_xxh64_avx512.h b/lib/member/rte_xxh64_avx512.h
new file mode 100644
index 0000000000..50ca1b52c7
--- /dev/null
+++ b/lib/member/rte_xxh64_avx512.h
@@ -0,0 +1,117 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_XXH64_AVX512_H_
+#define _RTE_XXH64_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <immintrin.h>
+
+/* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
+/* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
+/* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
+/* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
+/* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
+
+static __rte_always_inline  __m512i
+xxh64_round_avx512(__m512i hash, __m512i input)
+{
+	hash = _mm512_madd52lo_epu64(hash,
+			input,
+			_mm512_set1_epi64(PRIME64_2));
+
+	hash = _mm512_rol_epi64(hash, 31);
+
+	return hash;
+}
+
+static __rte_always_inline  __m512i
+xxh64_fmix_avx512(__m512i hash)
+{
+	hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));
+
+	return hash;
+}
+
+static __rte_always_inline __m256i
+rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
+			__m512i v_seed, uint32_t modulo)
+{
+	__m512i v_prime64_5, v_hash;
+	size_t remaining = key_len;
+	size_t offset = 0;
+	__m512i input;
+
+	v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
+	v_hash = _mm512_add_epi64
+			(_mm512_add_epi64(v_seed, v_prime64_5),
+			 _mm512_set1_epi64(key_len));
+
+	while (remaining >= 8) {
+		input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+				xxh64_round_avx512(_mm512_setzero_si512(), input));
+		v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
+				v_hash,
+				_mm512_set1_epi64(PRIME64_1));
+
+		remaining -= 8;
+		offset += 8;
+	}
+
+	if (remaining >= 4) {
+		input = _mm512_set1_epi64
+			(*(uint32_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+			_mm512_mullo_epi64(input,
+				_mm512_set1_epi64(PRIME64_1)));
+		v_hash = _mm512_madd52lo_epu64
+				(_mm512_set1_epi64(PRIME64_3),
+				_mm512_rol_epi64(v_hash, 23),
+				_mm512_set1_epi64(PRIME64_2));
+
+		offset += 4;
+		remaining -= 4;
+	}
+
+	while (remaining != 0) {
+		input = _mm512_set1_epi64
+			(*(uint8_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+			_mm512_mullo_epi64(input,
+				_mm512_set1_epi64(PRIME64_5)));
+		v_hash = _mm512_mullo_epi64
+			(_mm512_rol_epi64(v_hash, 11),
+			_mm512_set1_epi64(PRIME64_1));
+		offset++;
+		remaining--;
+	}
+
+	v_hash = xxh64_fmix_avx512(v_hash);
+
+	/*
+	 * Theoretically, such modulo operations could be replaced by
+	 * _mm512_rem_epi64(), but that intrinsic's availability depends on
+	 * the compiler (it is an SVML function). Hence the limitation here
+	 * that the modulo value must be a power of 2.
+	 */
+	__m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));
+
+	return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, v_hash_remainder));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_XXH64_AVX512_H_ */
diff --git a/lib/member/version.map b/lib/member/version.map
index 19469c6aba..35a811d0ac 100644
--- a/lib/member/version.map
+++ b/lib/member/version.map
@@ -2,6 +2,9 @@  DPDK_23 {
 	global:
 
 	rte_member_add;
+	rte_member_add_byte_count;
+	rte_member_query_count;
+	rte_member_report_heavyhitter;
 	rte_member_create;
 	rte_member_delete;
 	rte_member_find_existing;