[RFC,1/2] member: implement NitroSketch mode

Message ID 20220601082228.10158-2-leyi.rong@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series introduce NitroSketch Mode into membership library |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Leyi Rong June 1, 2022, 8:22 a.m. UTC
  Sketching algorithms provide high-fidelity approximate measurements and
appear as a promising alternative to traditional approaches such as
packet sampling.

NitroSketch [1] is a software sketching framework that optimizes
performance, provides accuracy guarantees, and supports a variety of
sketches.

This commit adds a new data structure called sketch into
membership library. This new data structure is an efficient
way to profile the traffic for heavy hitters. Also use min-heap
structure to maintain the top-k flow keys.

[1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.

Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
Signed-off-by: Leyi Rong <leyi.rong@intel.com>
---
 lib/member/meson.build                |  35 +-
 lib/member/rte_member.c               |  75 ++++
 lib/member/rte_member.h               | 141 ++++++-
 lib/member/rte_member_heap.h          | 449 ++++++++++++++++++++
 lib/member/rte_member_sketch.c        | 584 ++++++++++++++++++++++++++
 lib/member/rte_member_sketch.h        |  96 +++++
 lib/member/rte_member_sketch_avx512.c |  69 +++
 lib/member/rte_member_sketch_avx512.h |  36 ++
 lib/member/rte_xxh64_avx512.h         | 117 ++++++
 9 files changed, 1598 insertions(+), 4 deletions(-)
 create mode 100644 lib/member/rte_member_heap.h
 create mode 100644 lib/member/rte_member_sketch.c
 create mode 100644 lib/member/rte_member_sketch.h
 create mode 100644 lib/member/rte_member_sketch_avx512.c
 create mode 100644 lib/member/rte_member_sketch_avx512.h
 create mode 100644 lib/member/rte_xxh64_avx512.h
  

Comments

Wang, Yipeng1 July 31, 2022, 10:10 p.m. UTC | #1
> -----Original Message-----
> From: Rong, Leyi <leyi.rong@intel.com>
> Sent: Wednesday, June 1, 2022 1:22 AM
> To: Wang, Yipeng1 <yipeng1.wang@intel.com>; zaoxingliu@gmail.com; Gobriel,
> Sameh <sameh.gobriel@intel.com>
> Cc: dev@dpdk.org; Rong, Leyi <leyi.rong@intel.com>
> Subject: [RFC,1/2] member: implement NitroSketch mode
> 
> Sketching algorithms provide high-fidelity approximate measurements and
> appear as a promising alternative to traditional approaches such as
> packet sampling.
> 
> NitroSketch [1] is a software sketching framework that optimizes
> performance, provides accuracy guarantees, and supports a variety of
> sketches.
> 
> This commit adds a new data structure called sketch into
> membership library. This new data structure is an efficient
> way to profile the traffic for heavy hitters. Also use min-heap
> structure to maintain the top-k flow keys.
> 
> [1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
> Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
> Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
> 
> Signed-off-by: Alan Liu <zaoxingliu@gmail.com>
> Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> Signed-off-by: Leyi Rong <leyi.rong@intel.com>
> ---
>  lib/member/meson.build                |  35 +-
>  lib/member/rte_member.c               |  75 ++++
>  lib/member/rte_member.h               | 141 ++++++-
>  lib/member/rte_member_heap.h          | 449 ++++++++++++++++++++
>  lib/member/rte_member_sketch.c        | 584 ++++++++++++++++++++++++++
>  lib/member/rte_member_sketch.h        |  96 +++++
>  lib/member/rte_member_sketch_avx512.c |  69 +++
>  lib/member/rte_member_sketch_avx512.h |  36 ++
>  lib/member/rte_xxh64_avx512.h         | 117 ++++++
>  9 files changed, 1598 insertions(+), 4 deletions(-)
>  create mode 100644 lib/member/rte_member_heap.h
>  create mode 100644 lib/member/rte_member_sketch.c
>  create mode 100644 lib/member/rte_member_sketch.h
>  create mode 100644 lib/member/rte_member_sketch_avx512.c
>  create mode 100644 lib/member/rte_member_sketch_avx512.h
>  create mode 100644 lib/member/rte_xxh64_avx512.h
> 
> diff --git a/lib/member/meson.build b/lib/member/meson.build
> index e06fddc240..426c9891c2 100644
> --- a/lib/member/meson.build
> +++ b/lib/member/meson.build
> @@ -7,6 +7,39 @@ if is_windows
>      subdir_done()
>  endif
> 
> -sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
> +sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c',
> 'rte_member_sketch.c')
>  headers = files('rte_member.h')
>  deps += ['hash']
> +
> +# compile AVX512 version if:
> +# we are building 64-bit binary AND binutils can generate proper code
> +if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
> +    # compile AVX512 version if either:
> +    # a. we have AVX512 supported in minimum instruction set
> +    #    baseline
> +    # b. it's not minimum instruction set, but supported by
> +    #    compiler
> +    #
> +    # in former case, just add avx512 C file to files list
> +    # in latter case, compile c file to static lib, using correct
> +    # compiler flags, and then have the .o file from static lib
> +    # linked into main lib.
> +    sketch_avx512_cpu_support = (
> +        cc.get_define('__AVX512F__', args: machine_args) != ''
> +    )
> +
> +    if sketch_avx512_cpu_support == true
> +	cflags += ['-DCC_AVX512_SUPPORT']
> +	if cc.has_argument('-mavx512f')
> +	    cflags += ['-mavx', '-mavx2', '-mavx512f', '-mavx512ifma', '-
> march=icelake-server']
> +	endif
> +	sources += files('rte_member_sketch_avx512.c')
> +    elif cc.has_argument('-mavx512f')
> +	cflags += '-DCC_AVX512_SUPPORT'
> +	sketch_avx512_tmp = static_library('sketch_avx512_tmp',
> +	    'rte_member_sketch_avx512.c',
> +	    dependencies: static_rte_eal,
> +	    c_args: cflags + ['-mavx512f'])
> +	objs +=
> sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c')
> +    endif
> +endif
> diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c
> index 7e1632e6b5..8f859f7fbd 100644
> --- a/lib/member/rte_member.c
> +++ b/lib/member/rte_member.c
> @@ -9,10 +9,12 @@
>  #include <rte_malloc.h>
>  #include <rte_errno.h>
>  #include <rte_tailq.h>
> +#include <rte_ring_elem.h>
> 
>  #include "rte_member.h"
>  #include "rte_member_ht.h"
>  #include "rte_member_vbf.h"
> +#include "rte_member_sketch.h"
> 
>  TAILQ_HEAD(rte_member_list, rte_tailq_entry);
>  static struct rte_tailq_elem rte_member_tailq = {
> @@ -72,6 +74,9 @@ rte_member_free(struct rte_member_setsum *setsum)
>  	case RTE_MEMBER_TYPE_VBF:
>  		rte_member_free_vbf(setsum);
>  		break;
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		rte_member_free_sketch(setsum);
> +		break;
>  	default:
>  		break;
>  	}
> @@ -86,6 +91,8 @@ rte_member_create(const struct rte_member_parameters
> *params)
>  	struct rte_member_list *member_list;
>  	struct rte_member_setsum *setsum;
>  	int ret;
> +	char ring_name[RTE_RING_NAMESIZE];
> +	struct rte_ring *sketch_key_ring = NULL;
> 
>  	if (params == NULL) {
>  		rte_errno = EINVAL;
> @@ -100,6 +107,16 @@ rte_member_create(const struct
> rte_member_parameters *params)
>  		return NULL;
>  	}
> 
> +	if (params->type == RTE_MEMBER_TYPE_SKETCH) {
> +		snprintf(ring_name, sizeof(ring_name), "SK_%s", params-
> >name);
> +		sketch_key_ring = rte_ring_create_elem(ring_name,
> sizeof(uint32_t),
> +				rte_align32pow2(params->top_k), params-
> >socket_id, 0);
> +		if (sketch_key_ring == NULL) {
> +			RTE_MEMBER_LOG(ERR, "Sketch Ring Memory
> allocation failed\n");
> +			return NULL;
> +		}
> +	}
> +
>  	member_list = RTE_TAILQ_CAST(rte_member_tailq.head,
> rte_member_list);
> 
>  	rte_mcfg_tailq_write_lock();
> @@ -145,6 +162,9 @@ rte_member_create(const struct
> rte_member_parameters *params)
>  	case RTE_MEMBER_TYPE_VBF:
>  		ret = rte_member_create_vbf(setsum, params);
>  		break;
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		ret = rte_member_create_sketch(setsum, params,
> sketch_key_ring);
> +		break;
>  	default:
>  		goto error_unlock_exit;
>  	}
> @@ -162,6 +182,7 @@ rte_member_create(const struct
> rte_member_parameters *params)
>  error_unlock_exit:
>  	rte_free(te);
>  	rte_free(setsum);
> +	rte_ring_free(sketch_key_ring);
>  	rte_mcfg_tailq_write_unlock();
>  	return NULL;
>  }
> @@ -178,6 +199,23 @@ rte_member_add(const struct rte_member_setsum
> *setsum, const void *key,
>  		return rte_member_add_ht(setsum, key, set_id);
>  	case RTE_MEMBER_TYPE_VBF:
>  		return rte_member_add_vbf(setsum, key, set_id);
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_add_sketch(setsum, key, set_id);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +int
> +rte_member_add_byte_count(const struct rte_member_setsum *setsum,
> +			  const void *key, uint32_t byte_count)
> +{
> +	if (setsum == NULL || key == NULL || byte_count == 0)
> +		return -EINVAL;
> +
> +	switch (setsum->type) {
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_add_sketch_byte_count(setsum, key,
> byte_count);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -195,6 +233,8 @@ rte_member_lookup(const struct rte_member_setsum
> *setsum, const void *key,
>  		return rte_member_lookup_ht(setsum, key, set_id);
>  	case RTE_MEMBER_TYPE_VBF:
>  		return rte_member_lookup_vbf(setsum, key, set_id);
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_lookup_sketch(setsum, key, set_id);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -261,6 +301,36 @@ rte_member_lookup_multi_bulk(const struct
> rte_member_setsum *setsum,
>  	}
>  }
> 
> +int
> +rte_member_query_count(const struct rte_member_setsum *setsum,
> +		       const void *key, uint64_t *output)
> +{
> +	if (setsum == NULL || key == NULL || output == NULL)
> +		return -EINVAL;
> +
> +	switch (setsum->type) {
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_query_sketch(setsum, key, output);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +int
> +rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
> +				void **key, uint64_t *count)
> +{
> +	if (setsum == NULL || key == NULL || count == NULL)
> +		return -EINVAL;
> +
> +	switch (setsum->type) {
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_report_heavyhitter_sketch(setsum, key,
> count);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int
>  rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
>  			member_set_t set_id)
> @@ -272,6 +342,8 @@ rte_member_delete(const struct rte_member_setsum
> *setsum, const void *key,
>  	case RTE_MEMBER_TYPE_HT:
>  		return rte_member_delete_ht(setsum, key, set_id);
>  	/* current vBF implementation does not support delete function */
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		return rte_member_delete_sketch(setsum, key);
>  	case RTE_MEMBER_TYPE_VBF:
>  	default:
>  		return -EINVAL;
> @@ -290,6 +362,9 @@ rte_member_reset(const struct rte_member_setsum
> *setsum)
>  	case RTE_MEMBER_TYPE_VBF:
>  		rte_member_reset_vbf(setsum);
>  		return;
> +	case RTE_MEMBER_TYPE_SKETCH:
> +		rte_member_reset_sketch(setsum);
> +		return;
>  	default:
>  		return;
>  	}
> diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h
> index 567ee0c84b..6afeda542f 100644
> --- a/lib/member/rte_member.h
> +++ b/lib/member/rte_member.h
> @@ -39,6 +39,18 @@
>   * |          |                     | not overwrite  |                         |
>   * |          |                     | existing key.  |                         |
>   * +----------+---------------------+----------------+-------------------------+
> + * +==========+=============================+
> + * |   type   |      sketch                 |
> + * +==========+=============================+
> + * |structure | counting bloom filter array |
> + * +----------+-----------------------------+
> + * |set id    | 1: heavy set, 0: light set  |
> + * |          |                             |
> + * +----------+-----------------------------+
> + * |usages &  | count size of a flow,       |
> + * |properties| used for heavy hitter       |
> + * |          | detection.                  |
> + * +----------+-----------------------------+
>   * -->
>   */
> 
> @@ -50,6 +62,7 @@ extern "C" {
>  #endif
> 
>  #include <stdint.h>
> +#include <stdbool.h>
> 
>  #include <rte_common.h>
> 
> @@ -66,6 +79,11 @@ typedef uint16_t member_set_t;
>  /** Maximum number of characters in setsum name. */
>  #define RTE_MEMBER_NAMESIZE 32
> 
> +/** For sketch, use the flag if prefer always bounded mode. */
[Wang, Yipeng] Maybe a bit more information on what is always-bounded, I don't see
any explanation of this parameter.

> +#define RTE_MEMBER_SKETCH_ALWAYS_BOUNDED 0x01
> +/** For sketch, use the flag if to count packet size instead of packet count */
> +#define RTE_MEMBER_SKETCH_COUNT_BYTE 0x02
> +
>  /** @internal Hash function used by membership library. */
>  #if defined(RTE_ARCH_X86) || defined(__ARM_FEATURE_CRC32)
>  #include <rte_hash_crc.h>
> @@ -104,6 +122,7 @@ struct rte_member_parameters;
>  enum rte_member_setsum_type {
>  	RTE_MEMBER_TYPE_HT = 0,  /**< Hash table based set summary. */
>  	RTE_MEMBER_TYPE_VBF,     /**< Vector of bloom filters. */
> +	RTE_MEMBER_TYPE_SKETCH,
>  	RTE_MEMBER_NUM_TYPE
>  };
> 
> @@ -114,6 +133,19 @@ enum rte_member_sig_compare_function {
>  	RTE_MEMBER_COMPARE_NUM
>  };
> 
> +/* sketch update function with different implementations. */
> +typedef void (*sketch_update_fn_t)(const struct rte_member_setsum *ss,
> +				   const void *key,
> +				   uint32_t count);
> +
> +/* sketch lookup function with different implementations. */
> +typedef uint64_t (*sketch_lookup_fn_t)(const struct rte_member_setsum *ss,
> +				       const void *key);
> +
> +/* sketch delete function with different implementations. */
> +typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss,
> +				   const void *key);
> +
>  /** @internal setsummary structure. */
>  struct rte_member_setsum {
>  	enum rte_member_setsum_type type; /* Type of the set summary. */
> @@ -134,6 +166,21 @@ struct rte_member_setsum {
>  	uint32_t bit_mask;	/* Bit mask to get bit location in bf. */
>  	uint32_t num_hashes;	/* Number of hash values to index bf. */
> 
> +	/* Parameters for sketch */
> +	float error_rate;
> +	float sample_rate;
> +	uint32_t num_col;
> +	uint32_t num_row;
> +	int always_bounded;
> +	double converge_thresh;
> +	uint32_t topk;
> +	uint32_t count_byte;
> +	uint64_t *hash_seeds;
> +	sketch_update_fn_t sketch_update; /* Pointer to the sketch update
> function */
> +	sketch_lookup_fn_t sketch_lookup; /* Pointer to the sketch lookup
> function */
> +	sketch_delete_fn_t sketch_delete; /* Pointer to the sketch delete
> function */
> +
> +	void *runtime_var;
>  	uint32_t mul_shift;  /* vbf internal variable used during bit test. */
>  	uint32_t div_shift;  /* vbf internal variable used during bit test. */
> 
> @@ -143,6 +190,9 @@ struct rte_member_setsum {
>  	/* Second cache line should start here. */
>  	uint32_t socket_id;          /* NUMA Socket ID for memory. */
>  	char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */
> +#ifdef RTE_ARCH_X86
> +	bool use_avx512;
> +#endif
>  } __rte_cache_aligned;
> 
>  /**
> @@ -261,8 +311,33 @@ struct rte_member_parameters {
>  	 */
>  	uint32_t sec_hash_seed;
> 
> +	/**
> +	 * For count(min) sketch data structure, error rate defines the accuracy
> +	 * required by the user. Higher accuracy leads to more memory usage,
> but
> +	 * the flow size is estimated more accurately.
> +	 */
> +	float error_rate;
> +
> +	/**
> +	 * Sampling rate means the internal sample rate of the rows of the
> count
> +	 * min sketches. Lower sampling rate can reduce CPU overhead, but the
> +	 * data structure will require more time to converge statistically.
> +	 */
> +	float sample_rate;
> +
> +	/**
> +	 * How many top heavy hitter to be reported. The library will internally
> +	 * keep the keys of heavy hitters for final report.
> +	 */
> +	uint32_t top_k;
> +
> +	/**
> +	 * Extra flags that may be passed in by the user
> +	 */
> +	uint32_t extra_flag;
> +
>  	int socket_id;			/**< NUMA Socket ID for memory. */
> -};
> +} __rte_cache_aligned;
> 
>  /**
>   * @warning
> @@ -418,7 +493,7 @@ rte_member_lookup_multi_bulk(const struct
> rte_member_setsum *setsum,
>   *   RTE_MEMBER_NO_MATCH by default is set as 0.
>   *   For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved.
>   *   For vBF mode the set id is limited by the num_set parameter when create
> - *   the set-summary.
> + *   the set-summary. For sketch mode, this id is ignored.
>   * @return
>   *   HT (cache mode) and vBF should never fail unless the set_id is not in the
>   *   valid range. In such case -EINVAL is returned.
> @@ -429,12 +504,72 @@ rte_member_lookup_multi_bulk(const struct
> rte_member_setsum *setsum,
>   *   Return 0 for HT (cache mode) if the add does not cause
>   *   eviction, return 1 otherwise. Return 0 for non-cache mode if success,
>   *   -ENOSPC for full, and 1 if cuckoo eviction happens.
> - *   Always returns 0 for vBF mode.
> + *   Always returns 0 for vBF mode and sketch.
>   */
>  int
>  rte_member_add(const struct rte_member_setsum *setsum, const void *key,
>  			member_set_t set_id);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add the packet byte size into the sketch
> + *
> + * @param setsum
> + *   Pointer of a set-summary.
> + * @param key
> + *   Pointer of the key to be added.
> + * @param byte_count
> + *   Add the byte count of the packet into the sketch.
> + * @return
> + * Return -EINVAL for invalid parameters, otherwise return 0.
> + */
> +int
> +rte_member_add_byte_count(const struct rte_member_setsum *setsum,
> +			  const void *key, uint32_t byte_count);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * query packet count
[Wang, Yipeng] Query packet count for a certain flow-key.

> + *
> + * @param setsum
> + *   Pointer of a set-summary.
> + * @param key
> + *   Pointer of the key to be added.
> + * @param count
> + *   The output packet count or byte count.
> + * @return
> + *   Return -EINVAL for invalid parameters.
> + */
> +int
> +rte_member_query_count(const struct rte_member_setsum *setsum,
> +		       const void *key, uint64_t *count);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Insert key into set-summary (SS).
[Wang, Yipeng] This API is for reporting.  The comment for this API is wrong.
> + *
> + * @param setsum
> + *   Pointer of a set-summary.
> + * @param key
> + *   Pointer of the output top-k key array.
> + * @param count
> + *   Pointer of the output packet count or byte count array of the top-k keys.
> + * @return
> + *   Return -EINVAL for invalid parameters. Return a positive integer indicate
> + *   how many heavy hitters are reported.
> + */
> +int
> +rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
> +			      void **keys, uint64_t *counts);
> +
> +
>  /**
>   * @warning
>   * @b EXPERIMENTAL: this API may change without prior notice
> diff --git a/lib/member/rte_member_heap.h b/lib/member/rte_member_heap.h
[Wang, Yipeng] This is a software heap enhanced by a hash table.
I think there should be more comment describing the feature of this file, and the algorithm.

> new file mode 100644
> index 0000000000..3d265c793f
> --- /dev/null
> +++ b/lib/member/rte_member_heap.h
> @@ -0,0 +1,449 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
> + */
> +
> +#ifndef _RTE_MEMBER_HEAP_H_
> +#define _RTE_MEMBER_HEAP_H_
> +
> +#include <rte_ring_elem.h>
> +#include "rte_member.h"
> +
> +#define LCHILD(x) (2 * x + 1)
> +#define RCHILD(x) (2 * x + 2)
> +#define PARENT(x) ((x - 1) / 2)
> +
> +#define HASH_BKT_SIZE 16
> +#define HASH_HP_MULTI 4
> +#define HASH_RESIZE_MULTI 2
> +
> +#define MINHEAP_OPTIMIZED_HT
> +
> +struct hash_bkt {
> +	uint16_t sig[HASH_BKT_SIZE];
> +	uint16_t idx[HASH_BKT_SIZE];
> +};
> +
> +struct hash {
> +	uint16_t bkt_cnt;
> +	uint16_t num_item;
> +	uint32_t seed;
> +	struct hash_bkt buckets[0];
> +};
> +
> +struct node {
> +	void *key;
> +	uint64_t count;
> +};
> +
> +struct minheap {
> +	uint32_t key_len;
> +	uint32_t size;
> +	uint32_t socket;
> +	struct hash *hashtable;
> +	struct node *elem;
> +};
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
[Wang, Yipeng] Is this macro defined somewhere? Do we need both optimized and non-optimized code?

> +static int
> +hash_table_insert(const void *key, int value, int key_len, struct hash *table)
> +{
> +	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
> +	uint16_t idx = hash % table->bkt_cnt;
> +	uint16_t sig = hash >> 16;
> +
> +	for (int i = 0; i < HASH_BKT_SIZE; i++) {
> +		if (table->buckets[idx].idx[i] == 0) {
> +			table->buckets[idx].idx[i] = value;
> +			table->buckets[idx].sig[i] = sig;
> +			table->num_item++;
> +			return 0;
> +		}
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +static int
> +hash_table_update(const void *key, int old_value, int value, int key_len, struct
> hash *table)
> +{
> +	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
> +	uint16_t idx = hash % table->bkt_cnt;
> +	uint16_t sig = hash >> 16;
> +
> +	for (int i = 0; i < HASH_BKT_SIZE; i++) {
> +		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i]
> == old_value) {
> +			table->buckets[idx].idx[i] = value;
> +			return 0;
> +		}
> +	}
> +
> +	return -1;
> +}
> +
> +static int
> +hash_table_del(const void *key, uint16_t value, int key_len, struct hash *table)
> +{
> +	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
> +	uint16_t idx = hash % table->bkt_cnt;
> +	uint16_t sig = hash >> 16;
> +
> +	for (int i = 0; i < HASH_BKT_SIZE; i++) {
> +		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i]
> == value) {
> +			table->buckets[idx].idx[i] = 0;
> +			table->num_item--;
> +			return 0;
> +		}
> +	}
> +
> +	return -1;
> +}
> +
> +static int
> +hash_table_lookup(const void *key, int key_len, struct minheap *hp)
> +{
> +	struct hash *table = hp->hashtable;
> +	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
> +	uint16_t idx = hash % table->bkt_cnt;
> +	uint16_t sig = hash >> 16;
> +
> +	for (int i = 0; i < HASH_BKT_SIZE; i++) {
> +		if (table->buckets[idx].sig[i] == sig && table-
> >buckets[idx].idx[i] != 0) {
> +			uint32_t hp_idx = table->buckets[idx].idx[i] - 1;
> +
> +			if (memcmp(hp->elem[hp_idx].key, key, hp->key_len)
> == 0)
> +				return hp_idx;
> +		}
> +	}
> +
> +	return -ENOENT; /* key doesn't exist */
> +}
> +
> +static int
> +resize_hash_table(struct minheap *hp)
> +{
> +	uint32_t i;
> +	uint32_t new_bkt_cnt;
> +
> +	while (1) {
> +		new_bkt_cnt = hp->hashtable->bkt_cnt * HASH_RESIZE_MULTI;
> +
> +		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT load factor is
> [%f]\n",
> +			hp->hashtable->num_item / ((float)hp->hashtable-
> >bkt_cnt * HASH_BKT_SIZE));
> +		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT resize
> happen!\n");
> +		rte_free(hp->hashtable);
> +		hp->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
> +						new_bkt_cnt * sizeof(struct
> hash_bkt),
> +						RTE_CACHE_LINE_SIZE, hp-
> >socket);
> +
> +		if (hp->hashtable == NULL) {
> +			RTE_MEMBER_LOG(ERR, "Sketch Minheap HT
> allocation failed\n");
> +			return -ENOMEM;
> +		}
> +
> +		hp->hashtable->bkt_cnt = new_bkt_cnt;
> +
> +		for (i = 0; i < hp->size; ++i) {
> +			if (hash_table_insert(hp->elem[i].key,
> +				i + 1, hp->key_len, hp->hashtable) < 0) {
> +				RTE_MEMBER_LOG(ERR,
> +					"Sketch Minheap HT resize insert
> fail!\n");
> +				break;
> +			}
> +		}
> +		if (i == hp->size)
> +			break;
> +	}
> +
> +	return 0;
> +}
> +#endif /* MINHEAP_OPTIMIZED_HT */
> +
> +/* find the item in the given minheap */
> +static int
> +rte_member_minheap_find(struct minheap *hp, const void *key)
> +{
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	int idx = hash_table_lookup(key, hp->key_len, hp);
> +	return idx;
> +#else
> +	uint32_t idx;
> +
> +	for (idx = 0; idx < hp->size; ++idx) {
> +		if (memcmp(hp->elem[idx].key, key, hp->key_len) == 0)
> +			return idx;
> +	}
> +
> +	return -ENOENT; /* key doesn't exist */
> +#endif
> +}
> +
> +static int
> +rte_member_minheap_init(struct minheap *heap, int size,
> +			uint32_t socket, uint32_t seed)
> +{
> +	heap->elem = rte_zmalloc_socket(NULL, sizeof(struct node) * size,
> +				RTE_CACHE_LINE_SIZE, socket);
> +	if (heap->elem == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Minheap elem allocation
> failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	uint32_t hash_bkt_cnt = rte_align32pow2(size * HASH_HP_MULTI) /
> HASH_BKT_SIZE;
> +
> +	if (hash_bkt_cnt == 0)
> +		hash_bkt_cnt = 1;
> +
> +	heap->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
> +					hash_bkt_cnt * sizeof(struct hash_bkt),
> +					RTE_CACHE_LINE_SIZE, socket);
> +
> +	if (heap->hashtable == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation
> failed\n");
> +		rte_free(heap->elem);
> +		return -ENOMEM;
> +	}
> +
> +	heap->hashtable->seed = seed;
> +	heap->hashtable->bkt_cnt = hash_bkt_cnt;
> +	heap->socket = socket;
> +
> +	return 0;
> +}
> +
> +/* swap the minheap nodes */
> +static __rte_always_inline void
> +rte_member_heap_swap(struct node *n1, struct node *n2)
> +{
> +	struct node temp = *n1;
> +	*n1 = *n2;
> +	*n2 = temp;
> +}
> +
> +/* heapify function */
> +static void
> +rte_member_heapify(struct minheap *hp, uint32_t idx, bool update_hash)
> +{
> +	uint32_t smallest;
> +
> +	if (LCHILD(idx) < hp->size &&
> +			hp->elem[LCHILD(idx)].count < hp->elem[idx].count)
> +		smallest = LCHILD(idx);
> +	else
> +		smallest = idx;
> +
> +	if (RCHILD(idx) < hp->size &&
> +			hp->elem[RCHILD(idx)].count < hp-
> >elem[smallest].count)
> +		smallest = RCHILD(idx);
> +
> +	if (smallest != idx) {
> +		rte_member_heap_swap(&(hp->elem[idx]), &(hp-
> >elem[smallest]));
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +		if (update_hash) {
> +			if (hash_table_update(hp->elem[smallest].key, idx + 1,
> smallest + 1,
> +					hp->key_len, hp->hashtable) < 0) {
> +				RTE_MEMBER_LOG(ERR, "Minheap Hash Table
> update failed\n");
> +				return;
> +			}
> +
> +			if (hash_table_update(hp->elem[idx].key, smallest + 1,
> idx + 1,
> +					hp->key_len, hp->hashtable) < 0) {
> +				RTE_MEMBER_LOG(ERR, "Minheap Hash Table
> update failed\n");
> +				return;
> +			}
> +		}
> +#endif
> +		rte_member_heapify(hp, smallest, update_hash);
> +	}
> +}
> +
> +/* insert a node into the minheap */
> +static int
> +rte_member_minheap_insert_node(struct minheap *hp, const void *key,
> +			       int counter, void *key_slot,
> +			       struct rte_ring *free_key_slot)
> +{
> +	struct node nd;
> +	uint32_t slot_id;
> +
> +	if (rte_ring_sc_dequeue_elem(free_key_slot, &slot_id,
> sizeof(uint32_t)) != 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap get empty keyslot failed\n");
> +		return -1;
> +	}
> +
> +	nd.count = counter;
> +	nd.key = RTE_PTR_ADD(key_slot, slot_id * hp->key_len);
> +
> +	memcpy(nd.key, key, hp->key_len);
> +
> +	uint32_t i = (hp->size)++;
> +
> +	while (i && nd.count < hp->elem[PARENT(i)].count) {
> +		hp->elem[i] = hp->elem[PARENT(i)];
> +#ifdef MINHEAP_OPTIMIZED_HT
> +		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
> +				hp->key_len, hp->hashtable) < 0) {
> +			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update
> failed\n");
> +			return -1;
> +		}
> +#endif
> +		i = PARENT(i);
> +	}
> +	hp->elem[i] = nd;
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_insert(key, i + 1, hp->key_len, hp->hashtable) < 0) {
> +		if (resize_hash_table(hp) < 0) {
> +			RTE_MEMBER_LOG(ERR, "Minheap Hash Table resize
> failed\n");
> +			return -1;
> +		}
> +	}
> +#endif
> +	return 0;
> +}
> +
> +/* delete a key from the minheap */
> +static int
> +rte_member_minheap_delete_node(struct minheap *hp, const void *key,
> +			       void *key_slot, struct rte_ring *free_key_slot)
> +{
> +	int idx = rte_member_minheap_find(hp, key);
> +	uint32_t offset = RTE_PTR_DIFF(hp->elem[idx].key, key_slot) / hp-
> >key_len;
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_del(key, idx + 1, hp->key_len, hp->hashtable) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
> +		return -1;
> +	}
> +#endif
> +	rte_ring_sp_enqueue_elem(free_key_slot, &offset, sizeof(uint32_t));
> +
> +	if (idx == (int)(hp->size - 1)) {
> +		hp->size--;
> +		return 0;
> +	}
> +
> +	hp->elem[idx] = hp->elem[hp->size - 1];
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_update(hp->elem[idx].key, hp->size, idx + 1,
> +				hp->key_len, hp->hashtable) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update
> failed\n");
> +		return -1;
> +	}
> +#endif
> +	hp->size--;
> +	rte_member_heapify(hp, idx, true);
> +
> +	return 0;
> +}
> +
> +/* replace a min node with a new key. */
> +static int
> +rte_member_minheap_replace_node(struct minheap *hp,
> +				const void *new_key,
> +				int new_counter)
> +{
> +	struct node nd;
> +	void *recycle_key = NULL;
> +
> +	recycle_key = hp->elem[0].key;
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_del(recycle_key, 1, hp->key_len, hp->hashtable) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
> +		return -1;
> +	}
> +#endif
> +	hp->elem[0] = hp->elem[hp->size - 1];
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_update(hp->elem[0].key, hp->size, 1,
> +				hp->key_len, hp->hashtable) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update
> failed\n");
> +		return -1;
> +	}
> +#endif
> +	hp->size--;
> +
> +	rte_member_heapify(hp, 0, true);
> +
> +	nd.count = new_counter;
> +	nd.key = recycle_key;
> +
> +	memcpy(nd.key, new_key, hp->key_len);
> +
> +	uint32_t i = (hp->size)++;
> +
> +	while (i && nd.count < hp->elem[PARENT(i)].count) {
> +		hp->elem[i] = hp->elem[PARENT(i)];
> +#ifdef MINHEAP_OPTIMIZED_HT
> +		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
> +				hp->key_len, hp->hashtable) < 0) {
> +			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update
> failed\n");
> +			return -1;
> +		}
> +#endif
> +		i = PARENT(i);
> +	}
> +
> +	hp->elem[i] = nd;
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	if (hash_table_insert(new_key, i + 1, hp->key_len, hp->hashtable) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace insert
> failed\n");
> +		if (resize_hash_table(hp) < 0) {
> +			RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace
> resize failed\n");
> +			return -1;
> +		}
> +	}
> +#endif
> +	return 0;
> +}
> +
> +/* sort the heap into a descending array */
> +static void
> +rte_member_heapsort(struct minheap *hp, struct node *result_array)
> +{
> +	struct minheap new_hp;
> +
> +	/* build a new heap for using the given array */
> +	new_hp.size = hp->size;
> +	new_hp.key_len = hp->key_len;
> +	new_hp.elem = result_array;
> +	memcpy(result_array, hp->elem, hp->size * sizeof(struct node));
> +
> +	/* sort the new heap */
> +	while (new_hp.size > 1) {
> +		rte_member_heap_swap(&(new_hp.elem[0]),
> &(new_hp.elem[new_hp.size - 1]));
> +		new_hp.size--;
> +		rte_member_heapify(&new_hp, 0, false);
> +	}
> +}
> +
> +static void
> +rte_member_minheap_free(struct minheap *hp)
> +{
> +	if (hp == NULL)
> +		return;
> +
> +	rte_free(hp->elem);
> +	rte_free(hp->hashtable);
> +}
> +
> +static void
> +rte_member_minheap_reset(struct minheap *hp)
> +{
> +	if (hp == NULL)
> +		return;
> +
> +	memset(hp->elem, 0, sizeof(struct node) * hp->size);
> +	hp->size = 0;
> +
> +#ifdef MINHEAP_OPTIMIZED_HT
> +	memset((char *)hp->hashtable + sizeof(struct hash), 0,
> +			hp->hashtable->bkt_cnt * sizeof(struct hash_bkt));
> +	hp->hashtable->num_item = 0;
> +#endif
> +}
> +
> +#endif /* _RTE_MEMBER_HEAP_H_ */
> diff --git a/lib/member/rte_member_sketch.c
> b/lib/member/rte_member_sketch.c
> new file mode 100644
> index 0000000000..0c5a0bad99
> --- /dev/null
> +++ b/lib/member/rte_member_sketch.c
> @@ -0,0 +1,584 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
> + */
> +
> +#include <math.h>
> +#include <string.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_log.h>
> +#include <rte_random.h>
> +#include <rte_prefetch.h>
> +#include <rte_ring_elem.h>
> +
> +#include "rte_member.h"
> +#include "rte_member_sketch.h"
> +#include "rte_member_heap.h"
> +
> +#ifdef CC_AVX512_SUPPORT
> +#include "rte_member_sketch_avx512.h"
> +#endif /* CC_AVX512_SUPPORT */
> +
> +struct sketch_runtime {
> +	uint64_t pkt_cnt;
> +	uint32_t until_next;
> +	int converged;
> +	struct minheap heap;
> +	struct node *report_array;
> +	void *key_slots;
> +	struct rte_ring *free_key_slots;
> +} __rte_cache_aligned;
> +
> +static uint32_t
> +draw_geometric(const struct rte_member_setsum *ss)
> +{
> +	double rand = 1;
> +
> +	if (ss->sample_rate == 1)
> +		return 1;
> +
> +	while (rand == 1 || rand == 0)
> +		rand = (double) rte_rand() / (UINT64_MAX);
> +
> +	return (uint32_t)ceil(log(1 - rand) / log(1 - ss->sample_rate));
> +}
[Wang, Yipeng] log() might be an expensive operation. Is there an alternative, especially if it is called frequently?

> +
> +static void
> +isort(uint64_t *array, int n)
> +{
> +	for (int i = 1; i < n; i++) {
> +		uint64_t t = array[i];
> +		int j;
> +
> +		for (j = i - 1; j >= 0; j--) {
> +			if (t < array[j])
> +				array[j + 1] = array[j];
> +			else
> +				break;
> +		}
> +		array[j + 1] = t;
> +	}
> +}
[Wang, Yipeng] Is insertion sort the fastest way to sort fewer than 10 numbers?

> +
> +static __rte_always_inline void
> +swap(uint64_t *a, uint64_t *b)
> +{
> +	uint64_t tmp = *a;
> +	*a = *b;
> +	*b = tmp;
> +}
> +
> +static uint64_t
> +medianof5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
> +{
> +	if (a > b)
> +		swap(&a, &b);
> +	if (c > d)
> +		swap(&c, &d);
> +	if (a > c) {
> +		if (d > e)
> +			swap(&c, &e);
> +		else {
> +			swap(&c, &d);
> +			swap(&d, &e);
> +		}
> +	} else {
> +		if (b > e)
> +			swap(&a, &e);
> +		else {
> +			swap(&a, &b);
> +			swap(&b, &e);
> +		}
> +	}
> +
> +	if (a > c)
> +		return a > d ? d : a;
> +	else
> +		return b > c ? c : b;
> +}
> +
> +int
> +rte_member_create_sketch(struct rte_member_setsum *ss,
> +			 const struct rte_member_parameters *params,
> +			 struct rte_ring *ring)
> +{
> +	struct sketch_runtime *runtime;
> +	uint32_t num_col;
> +	uint32_t i;
> +
> +	if (params->sample_rate == 0 || params->sample_rate > 1) {
> +		rte_errno = EINVAL;
> +		RTE_MEMBER_LOG(ERR,
> +			"Membership Sketch created with invalid
> parameters\n");
> +		return -EINVAL;
> +	}
> +
> +	if (params->extra_flag & RTE_MEMBER_SKETCH_COUNT_BYTE)
> +		ss->count_byte = 1;
> +
> +	if (ss->count_byte == 1 &&
> +		rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512
> &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
> +		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1) {
> +#ifdef CC_AVX512_SUPPORT
> +		ss->use_avx512 = true;
> +#else
> +		ss->use_avx512 = false;
> +#endif
> +	}
> +
> +	if (ss->use_avx512 == true) {
> +		ss->num_row = NUM_ROW_VEC;
> +		RTE_MEMBER_LOG(NOTICE,
> +			"Membership Sketch AVX512 update/lookup/delete ops
> is selected\n");
> +		ss->sketch_update = sketch_update_avx512;
> +		ss->sketch_lookup = sketch_lookup_avx512;
> +		ss->sketch_delete = sketch_delete_avx512;
> +	} else {
> +		ss->num_row = NUM_ROW_SCALAR;
> +		RTE_MEMBER_LOG(NOTICE,
> +			"Membership Sketch SCALAR update/lookup/delete ops
> is selected\n");
> +		ss->sketch_update = sketch_update_scalar;
> +		ss->sketch_lookup = sketch_lookup_scalar;
> +		ss->sketch_delete = sketch_delete_scalar;
> +	}
> +
> +	ss->socket_id = params->socket_id;
> +
> +	if (ss->count_byte == 0)
> +		num_col = 4.0 / params->error_rate / params->sample_rate;
> +	else if (ss->use_avx512 == true)
> +		num_col = rte_align32pow2(4.0 / params->error_rate);
> +	else
> +		num_col = 4.0 / params->error_rate;
[Wang, Yipeng] Could there be a div/0 fault here? A pointer to the formula, or commenting the formula inline, would
help people understand the constants. The same applies to other places where algorithm-specific constants are used.

> +
> +	ss->table = rte_zmalloc_socket(NULL,
> +			sizeof(uint64_t) * num_col * ss->num_row,
> +			RTE_CACHE_LINE_SIZE, ss->socket_id);
> +	if (ss->table == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Table memory allocation
> failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	ss->hash_seeds = rte_zmalloc_socket(NULL, sizeof(uint64_t) * ss-
> >num_row,
> +			RTE_CACHE_LINE_SIZE, ss->socket_id);
> +	if (ss->hash_seeds == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Hashseeds memory allocation
> failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	ss->runtime_var = rte_zmalloc_socket(NULL, sizeof(struct
> sketch_runtime),
> +					RTE_CACHE_LINE_SIZE, ss->socket_id);
> +	if (ss->runtime_var == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Runtime memory allocation
> failed\n");
> +		rte_free(ss);
> +		return -ENOMEM;
> +	}
> +	runtime = ss->runtime_var;
> +
> +	ss->num_col = num_col;
> +	ss->sample_rate = params->sample_rate;
> +	ss->prim_hash_seed = params->prim_hash_seed;
> +	ss->sec_hash_seed = params->sec_hash_seed;
> +	ss->error_rate = params->error_rate;
> +	ss->topk = params->top_k;
> +	ss->key_len = params->key_len;
> +	runtime->heap.key_len = ss->key_len;
> +
> +	runtime->key_slots = rte_zmalloc_socket(NULL, ss->key_len * ss->topk,
> +					RTE_CACHE_LINE_SIZE, ss->socket_id);
> +	if (runtime->key_slots == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Key Slots allocation failed\n");
> +		goto error;
> +	}
> +
> +	runtime->free_key_slots = ring;
> +	for (i = 0; i < ss->topk; i++)
> +		rte_ring_sp_enqueue_elem(runtime->free_key_slots,
> +					&i, sizeof(uint32_t));
> +
> +	if (rte_member_minheap_init(&(runtime->heap), params->top_k,
> +			ss->socket_id, params->prim_hash_seed) < 0) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Minheap allocation failed\n");
> +		goto error_runtime;
> +	}
> +
> +	runtime->report_array = rte_zmalloc_socket(NULL, sizeof(struct node) *
> ss->topk,
> +					RTE_CACHE_LINE_SIZE, ss->socket_id);
> +	if (runtime->report_array == NULL) {
> +		RTE_MEMBER_LOG(ERR, "Sketch Runtime Report Array
> allocation failed\n");
> +		goto error_runtime;
> +	}
> +
> +	rte_srand(ss->prim_hash_seed);
> +	for (uint32_t i = 0; i < ss->num_row; i++)
> +		ss->hash_seeds[i] = rte_rand();
> +
> +	if (params->extra_flag & RTE_MEMBER_SKETCH_ALWAYS_BOUNDED)
> +		ss->always_bounded = 1;
> +
> +	if (ss->always_bounded) {
> +		double delta = 1.0 / (pow(2, ss->num_row));
> +
> +		ss->converge_thresh = 10 * pow(ss->error_rate, -2.0) *
> sqrt(log(1/delta));
> +	}
> +
> +	RTE_MEMBER_LOG(DEBUG, "Sketch created, "
> +		"the total memory required is %u Bytes\n",  ss->num_col * ss-
> >num_row * 8);
> +
> +	return 0;
> +
> +error_runtime:
> +	rte_member_minheap_free(&runtime->heap);
> +	rte_ring_free(runtime->free_key_slots);
> +	rte_free(runtime->key_slots);
> +error:
> +	rte_free(runtime);
> +	rte_free(ss);
> +
> +	return -ENOMEM;
> +}
> +
> +uint64_t
> +sketch_lookup_scalar(const struct rte_member_setsum *ss, const void *key)
> +{
> +	uint64_t *count_array = ss->table;
> +	uint32_t col[ss->num_row];
> +	uint64_t count_row[ss->num_row];
> +	uint32_t cur_row;
> +	uint64_t count;
> +
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
> +		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
> +			ss->hash_seeds[cur_row]) % ss->num_col;
> +
> +		rte_prefetch0(&count_array[cur_row * ss->num_col +
> col[cur_row]]);
> +	}
> +
> +	/* if sample rate is 1, it is a regular count-min, we report the min */
> +	if (ss->sample_rate == 1 || ss->count_byte == 1)
> +		return count_min(ss, col);
> +
> +	/* otherwise we report the median number */
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
> +		count_row[cur_row] = count_array[cur_row * ss->num_col +
> col[cur_row]];
> +
> +	if (ss->num_row == 5)
> +		return medianof5(count_row[0], count_row[1],
> +				count_row[2], count_row[3], count_row[4]);
> +
> +	isort(count_row, ss->num_row);
> +
> +	if (ss->num_row % 2 == 0) {
> +		count = (count_row[ss->num_row / 2] + count_row[ss-
> >num_row / 2 - 1]) / 2;
> +		return count;
> +	}
> +	/* ss->num_row % 2 != 0 */
> +	count = count_row[ss->num_row / 2];
> +
> +	return count;
> +}
> +
> +void
> +sketch_delete_scalar(const struct rte_member_setsum *ss, const void *key)
> +{
> +	uint32_t col[ss->num_row];
> +	uint64_t *count_array = ss->table;
> +	uint64_t min = UINT64_MAX;
> +	uint32_t cur_row;
> +
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
> +		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
> +			ss->hash_seeds[cur_row]) % ss->num_col;
> +
> +		rte_prefetch0(&count_array[cur_row * ss->num_col +
> col[cur_row]]);
> +	}
> +
> +	/* if sample rate is 1, it is a regular count-min, we report the min */
[Wang, Yipeng] For count-min, a key can be deleted, but if it is not count-min, is the delete function still supported?
When deleting a key, do you also need to delete it from the heap?

> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
> +		uint64_t cnt = count_array[cur_row * ss->num_col +
> col[cur_row]];
> +
> +		if (cnt < min)
> +			min = cnt;
> +	}
> +
> +	/* subtract the min value from all the counters */
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
> +		count_array[cur_row * ss->num_col + col[cur_row]] -= min;
> +}
> +
> +int
> +rte_member_query_sketch(const struct rte_member_setsum *ss,
> +			const void *key,
> +			uint64_t *output)
> +{
> +	uint64_t count = ss->sketch_lookup(ss, key);
> +	*output = count;
> +
> +	return 0;
> +}
> +
> +void
> +rte_member_update_heap(const struct rte_member_setsum *ss)
> +{
> +	uint32_t i;
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +
> +	for (i = 0; i < runtime_var->heap.size; i++) {
> +		uint64_t count = ss->sketch_lookup(ss, runtime_var-
> >heap.elem[i].key);
> +
> +		runtime_var->heap.elem[i].count = count;
> +	}
> +}
> +
> +int
> +rte_member_report_heavyhitter_sketch(const struct rte_member_setsum
> *setsum,
> +				     void **key,
> +				     uint64_t *count)
> +{
> +	uint32_t i;
> +	struct sketch_runtime *runtime_var = setsum->runtime_var;
> +
> +	rte_member_update_heap(setsum);
> +	rte_member_heapsort(&(runtime_var->heap), runtime_var-
> >report_array);
> +
> +	for (i = 0; i < runtime_var->heap.size; i++) {
> +		key[i] = runtime_var->report_array[i].key;
> +		count[i] = runtime_var->report_array[i].count;
> +	}
> +
> +	return runtime_var->heap.size;
> +}
> +
> +int
> +rte_member_lookup_sketch(const struct rte_member_setsum *ss,
> +			 const void *key, member_set_t *set_id)
> +{
> +	uint64_t count = ss->sketch_lookup(ss, key);
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +
> +	if (runtime_var->heap.size > 0 && count >= runtime_var-
> >heap.elem[0].count)
> +		*set_id = 1;
> +	else
> +		*set_id = 0;
> +
> +	if (count == 0)
> +		return 0;
> +	else
> +		return 1;
> +}
> +
> +static void
> +should_converge(const struct rte_member_setsum *ss)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +
> +	/* For count min sketch - L1 norm */
[Wang, Yipeng] When sample_rate is not 1, i.e. it is not a count-min, does it still apply?

> +	if (runtime_var->pkt_cnt > ss->converge_thresh) {
> +		runtime_var->converged = 1;
> +		RTE_MEMBER_LOG(DEBUG, "Sketch converged, begin sampling
> "
> +					"from key count %lu\n",
> +					runtime_var->pkt_cnt);
> +	}
> +}
> +
> +static void
> +sketch_update_row(const struct rte_member_setsum *ss, const void *key,
> +		  uint32_t count, uint32_t cur_row)
> +{
> +	uint64_t *count_array = ss->table;
> +	uint32_t col = MEMBER_HASH_FUNC(key, ss->key_len,
> +			ss->hash_seeds[cur_row]) % ss->num_col;
> +
> +	/* sketch counter update */
> +	count_array[cur_row * ss->num_col + col] +=
> +			ceil(count / (ss->sample_rate));
> +}
> +
> +void
> +sketch_update_scalar(const struct rte_member_setsum *ss,
> +		     const void *key,
> +		     uint32_t count)
> +{
> +	uint64_t *count_array = ss->table;
> +	uint32_t col;
> +	uint32_t cur_row;
> +
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
> +		col = MEMBER_HASH_FUNC(key, ss->key_len,
> +				ss->hash_seeds[cur_row]) % ss->num_col;
> +		count_array[cur_row * ss->num_col + col] += count;
> +	}
> +}
> +
> +static void
> +heap_update(const struct rte_member_setsum *ss, const void *key)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +	uint64_t key_cnt = 0;
> +	int found;
> +
> +	/* We also update the heap for this key */
> +	key_cnt = ss->sketch_lookup(ss, key);
> +	if (key_cnt > runtime_var->heap.elem[0].count) {
> +		found = rte_member_minheap_find(&runtime_var->heap, key);
> +		/* the key is found in the top-k heap */
> +		if (found >= 0) {
> +			if (runtime_var->heap.elem[found].count < key_cnt)
> +				rte_member_heapify(&runtime_var->heap,
> found, true);
> +
> +			runtime_var->heap.elem[found].count = key_cnt;
> +		} else if (runtime_var->heap.size < ss->topk) {
> +			rte_member_minheap_insert_node(&runtime_var-
> >heap, key,
> +				key_cnt, runtime_var->key_slots, runtime_var-
> >free_key_slots);
> +		} else {
> +			rte_member_minheap_replace_node(&runtime_var-
> >heap, key, key_cnt);
> +		}
> +	} else if (runtime_var->heap.size < ss->topk) {
> +		found = rte_member_minheap_find(&runtime_var->heap, key);
> +		if (found >= 0) {
> +			if (runtime_var->heap.elem[found].count < key_cnt)
> +				rte_member_heapify(&runtime_var->heap,
> found, true);
> +
> +			runtime_var->heap.elem[found].count = key_cnt;
> +		} else
> +			rte_member_minheap_insert_node(&runtime_var-
> >heap, key,
> +				key_cnt, runtime_var->key_slots, runtime_var-
> >free_key_slots);
> +	}
> +}
> +
> +/*
> + * Add a single packet into the sketch.
> + * Sketch value is measured by packet numbers in this mode.
> + */
> +int
> +rte_member_add_sketch(const struct rte_member_setsum *ss,
> +		      const void *key,
> +		      __rte_unused member_set_t set_id)
> +{
> +	uint32_t cur_row;
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +	uint32_t *until_next = &(runtime_var->until_next);
> +
> +	/*
> +	 * If sketch is measured by byte count,
> +	 * the rte_member_add_sketch_byte_count routine should be used.
> +	 */
> +	if (ss->count_byte == 1)
> +		return -EINVAL;
[Wang, Yipeng] Maybe log a message telling the user the reason for the error.
> +
> +	if (ss->sample_rate == 1) {
> +		ss->sketch_update(ss, key, 1);
> +		heap_update(ss, key);
> +		return 0;
> +	}
> +
> +	/* convergence stage if it's needed */
> +	if (ss->always_bounded && !runtime_var->converged) {
> +		ss->sketch_update(ss, key, 1);
> +
> +		if (!((++runtime_var->pkt_cnt) & (INTERVAL - 1)))
> +			should_converge(ss);
> +
> +		heap_update(ss, key);
> +		return 0;
> +	}
> +
> +	/* should we skip this packet */
> +	if (*until_next >= ss->num_row) {
> +		*until_next -= ss->num_row;
> +		return 0;
> +	}
> +	cur_row = *until_next;
> +	do {
> +		sketch_update_row(ss, key, 1, cur_row);
> +		*until_next = draw_geometric(ss);
> +		if (cur_row + *until_next >= ss->num_row)
> +			break;
> +		cur_row += *until_next;
> +	} while (1);
> +
> +	*until_next -= (ss->num_row - cur_row);
> +
> +	heap_update(ss, key);
> +
> +	return 0;
> +}
> +
> +/*
> + * Add the byte count of the packet into the sketch.
> + * Sketch value is measured by byte count numbers in this mode.
> + */
> +int
> +rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
> +				 const void *key,
> +				 uint32_t byte_count)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +	uint32_t *until_next = &(runtime_var->until_next);
> +
> +	/* should not call this API if not in count byte mode */
> +	if (ss->count_byte == 0)
> +		return -EINVAL;
 [Wang, Yipeng] Maybe log a message telling the user the reason for the error.
> +
> +	/* there's specific optimization for the sketch update */
> +	ss->sketch_update(ss, key, byte_count);
> +
> +	if (*until_next != 0) {
> +		*until_next = *until_next - 1;
> +		return 0;
> +	}
> +
> +	*until_next = draw_geometric(ss) - 1;
> +
> +	heap_update(ss, key);
> +
> +	return 0;
> +}
> +
> +int
> +rte_member_delete_sketch(const struct rte_member_setsum *ss,
> +			 const void *key)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +	int found;
> +
> +	found = rte_member_minheap_find(&runtime_var->heap, key);
> +	if (found < 0)
> +		return -1;
> +
> +	ss->sketch_delete(ss, key);
> +
> +	return rte_member_minheap_delete_node
> +		(&runtime_var->heap, key, runtime_var->key_slots,
> runtime_var->free_key_slots);
> +}
> +
> +void
> +rte_member_free_sketch(struct rte_member_setsum *ss)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +
> +	rte_free(ss->table);
> +	rte_member_minheap_free(&runtime_var->heap);
> +	rte_free(runtime_var->key_slots);
> +	rte_ring_free(runtime_var->free_key_slots);
> +	rte_free(runtime_var);
> +}
> +
> +void
> +rte_member_reset_sketch(const struct rte_member_setsum *ss)
> +{
> +	struct sketch_runtime *runtime_var = ss->runtime_var;
> +	uint64_t *sketch = ss->table;
> +	uint32_t i;
> +
> +	memset(sketch, 0, sizeof(uint64_t) * ss->num_col * ss->num_row);
> +	rte_member_minheap_reset(&runtime_var->heap);
> +	rte_ring_reset(runtime_var->free_key_slots);
> +
> +	for (i = 0; i < ss->topk; i++)
> +		rte_ring_sp_enqueue_elem(runtime_var->free_key_slots, &i,
> sizeof(uint32_t));
> +}
> diff --git a/lib/member/rte_member_sketch.h
> b/lib/member/rte_member_sketch.h
> new file mode 100644
> index 0000000000..a5e633a74e
> --- /dev/null
> +++ b/lib/member/rte_member_sketch.h
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2017 Intel Corporation
> + */
> +
> +#ifndef _RTE_MEMBER_SKETCH_H_
> +#define _RTE_MEMBER_SKETCH_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_ring_elem.h>
> +
> +#define NUM_ROW_SCALAR 5
> +#define INTERVAL (1 << 15)
> +
> +#if !RTE_IS_POWER_OF_2(INTERVAL)
> +#error sketch INTERVAL macro must be a power of 2
> +#endif
> +
> +int
> +rte_member_create_sketch(struct rte_member_setsum *ss,
> +			 const struct rte_member_parameters *params,
> +			 struct rte_ring *r);
> +
> +int
> +rte_member_lookup_sketch(const struct rte_member_setsum *setsum,
> +			 const void *key, member_set_t *set_id);
> +
> +int
> +rte_member_add_sketch(const struct rte_member_setsum *setsum,
> +		      const void *key,
> +		      member_set_t set_id);
> +
> +int
> +rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
> +				 const void *key, uint32_t byte_count);
> +
> +void
> +sketch_update_scalar(const struct rte_member_setsum *ss,
> +		     const void *key,
> +		     uint32_t count);
> +
> +uint64_t
> +sketch_lookup_scalar(const struct rte_member_setsum *ss,
> +		     const void *key);
> +
> +void
> +sketch_delete_scalar(const struct rte_member_setsum *ss,
> +		     const void *key);
> +
> +int
> +rte_member_delete_sketch(const struct rte_member_setsum *setsum,
> +			 const void *key);
> +
> +int
> +rte_member_query_sketch(const struct rte_member_setsum *setsum,
> +			const void *key, uint64_t *output);
> +
> +void
> +rte_member_free_sketch(struct rte_member_setsum *ss);
> +
> +void
> +rte_member_reset_sketch(const struct rte_member_setsum *setsum);
> +
> +int
> +rte_member_report_heavyhitter_sketch(const struct rte_member_setsum
> *setsum,
> +				     void **key, uint64_t *count);
> +
> +void
> +rte_member_update_heap(const struct rte_member_setsum *ss);
> +
> +static __rte_always_inline uint64_t
> +count_min(const struct rte_member_setsum *ss, const uint32_t *hash_results)
> +{
> +	uint64_t *count_array = ss->table;
> +	uint64_t count;
> +	uint32_t cur_row;
> +	uint64_t min = UINT64_MAX;
> +
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
> +		uint64_t cnt = count_array[cur_row * ss->num_col +
> hash_results[cur_row]];
> +
> +		if (cnt < min)
> +			min = cnt;
> +	}
> +	count = min;
> +
> +	return count;
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_MEMBER_SKETCH_H_ */
> diff --git a/lib/member/rte_member_sketch_avx512.c
> b/lib/member/rte_member_sketch_avx512.c
> new file mode 100644
> index 0000000000..c83f4b6fd1
> --- /dev/null
> +++ b/lib/member/rte_member_sketch_avx512.c
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#include "rte_member_sketch_avx512.h"
> +
> +__rte_always_inline void
> +sketch_update_avx512(const struct rte_member_setsum *ss,
> +		     const void *key,
> +		     uint32_t count)
> +{
> +	uint64_t *count_array = ss->table;
> +	uint32_t num_col = ss->num_col;
> +	uint32_t key_len = ss->key_len;
> +	__m256i v_row_base;
> +	__m256i v_hash_result;
> +	__m512i current_sketch;
> +	__m512i updated_sketch;
> +	__m512i v_count;
> +
> +	const __m256i v_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
> +	const __m256i v_col = _mm256_set1_epi32(num_col);
> +
> +	/* compute the hash result parallelly */
> +	v_hash_result = rte_xxh64_sketch_avx512
> +		(key, key_len, *(__m512i *)ss->hash_seeds, num_col);
> +	v_row_base = _mm256_mullo_epi32(v_idx, v_col);
> +	v_hash_result = _mm256_add_epi32(v_row_base, v_hash_result);
> +
> +	current_sketch =
> +		_mm512_i32gather_epi64(v_hash_result, count_array, 8);
> +	v_count = _mm512_set1_epi64(count);
> +	updated_sketch = _mm512_add_epi64(current_sketch, v_count);
> +	_mm512_i32scatter_epi64
> +		((void *)count_array, v_hash_result, updated_sketch, 8);
> +}
> +
> +uint64_t
> +sketch_lookup_avx512(const struct rte_member_setsum *ss, const void *key)
> +{
> +	uint32_t col[ss->num_row];
> +
> +	/* currently only for sketch byte count mode */
> +	__m256i v_hash_result = rte_xxh64_sketch_avx512
> +		(key, ss->key_len, *(__m512i *)ss->hash_seeds, ss->num_col);
> +	_mm256_storeu_si256((__m256i *)col, v_hash_result);
> +
> +	return count_min(ss, col);
> +}
> +
> +void
> +sketch_delete_avx512(const struct rte_member_setsum *ss, const void *key)
> +{
> +	uint32_t col[ss->num_row];
> +	uint64_t *count_array = ss->table;
> +	uint64_t min = UINT64_MAX;
> +	uint32_t cur_row;
> +
> +	__m256i v_hash_result = rte_xxh64_sketch_avx512
> +		(key, ss->key_len, *(__m512i *)ss->hash_seeds,
> +		 RTE_ALIGN_FLOOR(ss->num_col, 32));
> +	_mm256_storeu_si256((__m256i *)col, v_hash_result);
> +
> +	min = count_min(ss, col);
> +
> +	/* subtract the min value from all the counters */
> +	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
> +		count_array[cur_row * ss->num_col + col[cur_row]] -= min;
> +}
> diff --git a/lib/member/rte_member_sketch_avx512.h
> b/lib/member/rte_member_sketch_avx512.h
> new file mode 100644
> index 0000000000..e7c25da643
> --- /dev/null
> +++ b/lib/member/rte_member_sketch_avx512.h
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_MEMBER_SKETCH_AVX512_H_
> +#define _RTE_MEMBER_SKETCH_AVX512_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_vect.h>
> +#include "rte_member.h"
> +#include "rte_member_sketch.h"
> +#include "rte_xxh64_avx512.h"
> +
> +#define NUM_ROW_VEC 8
> +
> +void
> +sketch_update_avx512(const struct rte_member_setsum *ss,
> +		     const void *key,
> +		     uint32_t count);
> +
> +uint64_t
> +sketch_lookup_avx512(const struct rte_member_setsum *ss,
> +		     const void *key);
> +
> +void
> +sketch_delete_avx512(const struct rte_member_setsum *ss,
> +		     const void *key);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_MEMBER_SKETCH_AVX512_H_ */
> diff --git a/lib/member/rte_xxh64_avx512.h b/lib/member/rte_xxh64_avx512.h
> new file mode 100644
> index 0000000000..574748fc38
> --- /dev/null
> +++ b/lib/member/rte_xxh64_avx512.h
> @@ -0,0 +1,117 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_XXH64_AVX512_H_
> +#define _RTE_XXH64_AVX512_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <immintrin.h>
> +
> +/*
> 0b100111100011011101111001101100011000010111101011110010101000011
> 1 */
> +static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
> +/*
> 0b110000101011001010101110001111010010011111010100111010110100111
> 1 */
> +static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
> +/*
> 0b000101100101011001100111101100011001111000110111011110011111100
> 1 */
> +static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
> +/*
> 0b100001011110101111001010011101111100001010110010101011100110001
> 1 */
> +static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
> +/*
> 0b001001111101010011101011001011110001011001010110011001111100010
> 1 */
> +static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
> +
> +static __rte_always_inline  __m512i
> +xxh64_round_avx512(__m512i hash, __m512i input)
> +{
> +	hash = _mm512_madd52lo_epu64(hash,
> +			input,
> +			_mm512_set1_epi64(PRIME64_2));
> +
> +	hash = _mm512_rol_epi64(hash, 31);
> +
> +	return hash;
> +}
> +
> +static __rte_always_inline  __m512i
> +xxh64_fmix_avx512(__m512i hash)
> +{
> +	hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));
> +
> +	return hash;
> +}
> +
> +static __rte_always_inline __m256i
> +rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
> +			__m512i v_seed, uint32_t modulo)
> +{
> +	__m512i v_prime64_5, v_hash;
> +	size_t remaining = key_len;
> +	size_t offset = 0;
> +	__m512i input;
> +
> +	v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
> +	v_hash = _mm512_add_epi64
> +			(_mm512_add_epi64(v_seed, v_prime64_5),
> +			 _mm512_set1_epi64(key_len));
> +
> +	while (remaining >= 8) {
> +		input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key,
> offset));
> +		v_hash = _mm512_xor_epi64(v_hash,
> +				xxh64_round_avx512(_mm512_setzero_si512(),
> input));
> +		v_hash =
> _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
> +				_mm512_rol_epi64(v_hash, 27),
> +				_mm512_set1_epi64(PRIME64_1));
> +
> +		remaining -= 8;
> +		offset += 8;
> +	}
> +
> +	if (remaining >= 4) {
> +		input = _mm512_set1_epi64
> +			(*(uint32_t *)RTE_PTR_ADD(key, offset));
> +		v_hash = _mm512_xor_epi64(v_hash,
> +			_mm512_mullo_epi64(input,
> +				_mm512_set1_epi64(PRIME64_1)));
> +		v_hash = _mm512_madd52lo_epu64
> +				(_mm512_set1_epi64(PRIME64_3),
> +				_mm512_rol_epi64(v_hash, 23),
> +				_mm512_set1_epi64(PRIME64_2));
> +
> +		offset += 4;
> +		remaining -= 4;
> +	}
> +
> +	while (remaining != 0) {
> +		input = _mm512_set1_epi64
> +			(*(uint8_t *)RTE_PTR_ADD(key, offset));
> +		v_hash = _mm512_xor_epi64(v_hash,
> +			_mm512_mullo_epi64(input,
> +				_mm512_set1_epi64(PRIME64_5)));
> +		v_hash = _mm512_mullo_epi64
> +			(_mm512_rol_epi64(v_hash, 11),
> +			_mm512_set1_epi64(PRIME64_1));
> +		offset++;
> +		remaining--;
> +	}
> +
> +	v_hash = xxh64_fmix_avx512(v_hash);
> +
> +	/*
> +	 * Theoretically, such modulo operations can be replaced by
> +	 * _mm512_rem_epi64(), but it seems to depend on the compiler's
> +	 * implementation. So here the limitation is that the modulo
> +	 * value should be a power of 2.
> +	 */
> +	__m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));
> +
> +	return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash,
> v_hash_remainder));
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_XXH64_AVX512_H_ */
> --
> 2.25.1
[Wang, Yipeng] 
Leyi, thanks for drafting the code and posting the RFC. As I may understand sketch algorithms better than some of the other audience members here,
it would be helpful if you could add some use cases explaining the contribution (e.g. some real usages, details on why this algorithm is needed).
So that others could evaluate and comment on the patch set better.

Algorithm-wise, my major question is:
1. Is deletion supported for non-count-min case?
2. Byte-count estimation mode was not originally in the paper (as I remember); are there any caveats to using this mode? Alan may have a better idea.

Code generally looks OK, but you may want to add comments for people to review more easily. Without knowledge of the algorithm, it is hard to understand many of the function usages.
Several modes are also involved (e.g. byte-count mode, always-bounded mode, etc.) May be beneficial to know which is most useful and set it to default. Also explain them well to users and reviewers.
  

Patch

diff --git a/lib/member/meson.build b/lib/member/meson.build
index e06fddc240..426c9891c2 100644
--- a/lib/member/meson.build
+++ b/lib/member/meson.build
@@ -7,6 +7,39 @@  if is_windows
     subdir_done()
 endif
 
-sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
+sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c', 'rte_member_sketch.c')
 headers = files('rte_member.h')
 deps += ['hash']
+
+# compile AVX512 version if:
+# we are building 64-bit binary AND binutils can generate proper code
+if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
+    # compile AVX512 version if either:
+    # a. we have AVX512 supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+    sketch_avx512_cpu_support = (
+        cc.get_define('__AVX512F__', args: machine_args) != ''
+    )
+
+    if sketch_avx512_cpu_support == true
+	cflags += ['-DCC_AVX512_SUPPORT']
+	if cc.has_argument('-mavx512f')
+	    cflags += ['-mavx', '-mavx2', '-mavx512f', '-mavx512ifma', '-march=icelake-server']
+	endif
+	sources += files('rte_member_sketch_avx512.c')
+    elif cc.has_argument('-mavx512f')
+	cflags += '-DCC_AVX512_SUPPORT'
+	sketch_avx512_tmp = static_library('sketch_avx512_tmp',
+	    'rte_member_sketch_avx512.c',
+	    dependencies: static_rte_eal,
+	    c_args: cflags + ['-mavx512f'])
+	objs += sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c')
+    endif
+endif
diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c
index 7e1632e6b5..8f859f7fbd 100644
--- a/lib/member/rte_member.c
+++ b/lib/member/rte_member.c
@@ -9,10 +9,12 @@ 
 #include <rte_malloc.h>
 #include <rte_errno.h>
 #include <rte_tailq.h>
+#include <rte_ring_elem.h>
 
 #include "rte_member.h"
 #include "rte_member_ht.h"
 #include "rte_member_vbf.h"
+#include "rte_member_sketch.h"
 
 TAILQ_HEAD(rte_member_list, rte_tailq_entry);
 static struct rte_tailq_elem rte_member_tailq = {
@@ -72,6 +74,9 @@  rte_member_free(struct rte_member_setsum *setsum)
 	case RTE_MEMBER_TYPE_VBF:
 		rte_member_free_vbf(setsum);
 		break;
+	case RTE_MEMBER_TYPE_SKETCH:
+		rte_member_free_sketch(setsum);
+		break;
 	default:
 		break;
 	}
@@ -86,6 +91,8 @@  rte_member_create(const struct rte_member_parameters *params)
 	struct rte_member_list *member_list;
 	struct rte_member_setsum *setsum;
 	int ret;
+	char ring_name[RTE_RING_NAMESIZE];
+	struct rte_ring *sketch_key_ring = NULL;
 
 	if (params == NULL) {
 		rte_errno = EINVAL;
@@ -100,6 +107,16 @@  rte_member_create(const struct rte_member_parameters *params)
 		return NULL;
 	}
 
+	if (params->type == RTE_MEMBER_TYPE_SKETCH) {
+		snprintf(ring_name, sizeof(ring_name), "SK_%s", params->name);
+		sketch_key_ring = rte_ring_create_elem(ring_name, sizeof(uint32_t),
+				rte_align32pow2(params->top_k), params->socket_id, 0);
+		if (sketch_key_ring == NULL) {
+			RTE_MEMBER_LOG(ERR, "Sketch Ring Memory allocation failed\n");
+			return NULL;
+		}
+	}
+
 	member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list);
 
 	rte_mcfg_tailq_write_lock();
@@ -145,6 +162,9 @@  rte_member_create(const struct rte_member_parameters *params)
 	case RTE_MEMBER_TYPE_VBF:
 		ret = rte_member_create_vbf(setsum, params);
 		break;
+	case RTE_MEMBER_TYPE_SKETCH:
+		ret = rte_member_create_sketch(setsum, params, sketch_key_ring);
+		break;
 	default:
 		goto error_unlock_exit;
 	}
@@ -162,6 +182,7 @@  rte_member_create(const struct rte_member_parameters *params)
 error_unlock_exit:
 	rte_free(te);
 	rte_free(setsum);
+	rte_ring_free(sketch_key_ring);
 	rte_mcfg_tailq_write_unlock();
 	return NULL;
 }
@@ -178,6 +199,23 @@  rte_member_add(const struct rte_member_setsum *setsum, const void *key,
 		return rte_member_add_ht(setsum, key, set_id);
 	case RTE_MEMBER_TYPE_VBF:
 		return rte_member_add_vbf(setsum, key, set_id);
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_add_sketch(setsum, key, set_id);
+	default:
+		return -EINVAL;
+	}
+}
+
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+			  const void *key, uint32_t byte_count)
+{
+	if (setsum == NULL || key == NULL || byte_count == 0)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_add_sketch_byte_count(setsum, key, byte_count);
 	default:
 		return -EINVAL;
 	}
@@ -195,6 +233,8 @@  rte_member_lookup(const struct rte_member_setsum *setsum, const void *key,
 		return rte_member_lookup_ht(setsum, key, set_id);
 	case RTE_MEMBER_TYPE_VBF:
 		return rte_member_lookup_vbf(setsum, key, set_id);
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_lookup_sketch(setsum, key, set_id);
 	default:
 		return -EINVAL;
 	}
@@ -261,6 +301,36 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
 	}
 }
 
+/*
+ * Query the estimated count of a key.
+ * Dispatches to the sketch implementation; other set-summary types do
+ * not support count queries and return -EINVAL.
+ */
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+		       const void *key, uint64_t *output)
+{
+	if (setsum == NULL || key == NULL || output == NULL)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_query_sketch(setsum, key, output);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Report the tracked heavy-hitter keys and their counts.
+ * Only the sketch type maintains a top-k heap; other types return
+ * -EINVAL.
+ */
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+				void **key, uint64_t *count)
+{
+	if (setsum == NULL || key == NULL || count == NULL)
+		return -EINVAL;
+
+	switch (setsum->type) {
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_report_heavyhitter_sketch(setsum, key, count);
+	default:
+		return -EINVAL;
+	}
+}
+
 int
 rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
 			member_set_t set_id)
@@ -272,6 +342,8 @@  rte_member_delete(const struct rte_member_setsum *setsum, const void *key,
 	case RTE_MEMBER_TYPE_HT:
 		return rte_member_delete_ht(setsum, key, set_id);
 	/* current vBF implementation does not support delete function */
+	case RTE_MEMBER_TYPE_SKETCH:
+		return rte_member_delete_sketch(setsum, key);
 	case RTE_MEMBER_TYPE_VBF:
 	default:
 		return -EINVAL;
@@ -290,6 +362,9 @@  rte_member_reset(const struct rte_member_setsum *setsum)
 	case RTE_MEMBER_TYPE_VBF:
 		rte_member_reset_vbf(setsum);
 		return;
+	case RTE_MEMBER_TYPE_SKETCH:
+		rte_member_reset_sketch(setsum);
+		return;
 	default:
 		return;
 	}
diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h
index 567ee0c84b..6afeda542f 100644
--- a/lib/member/rte_member.h
+++ b/lib/member/rte_member.h
@@ -39,6 +39,18 @@ 
  * |          |                     | not overwrite  |                         |
  * |          |                     | existing key.  |                         |
  * +----------+---------------------+----------------+-------------------------+
+ * +==========+=============================+
+ * |   type   |      sketch                 |
+ * +==========+=============================+
+ * |structure | counting bloom filter array |
+ * +----------+-----------------------------+
+ * |set id    | 1: heavy set, 0: light set  |
+ * |          |                             |
+ * +----------+-----------------------------+
+ * |usages &  | count size of a flow,       |
+ * |properties| used for heavy hitter       |
+ * |          | detection.                  |
+ * +----------+-----------------------------+
  * -->
  */
 
@@ -50,6 +62,7 @@  extern "C" {
 #endif
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #include <rte_common.h>
 
@@ -66,6 +79,11 @@  typedef uint16_t member_set_t;
 /** Maximum number of characters in setsum name. */
 #define RTE_MEMBER_NAMESIZE 32
 
+/** For sketch, use the flag if prefer always bounded mode. */
+#define RTE_MEMBER_SKETCH_ALWAYS_BOUNDED 0x01
+/** For sketch, use this flag to count packet bytes instead of packet count. */
+#define RTE_MEMBER_SKETCH_COUNT_BYTE 0x02
+
 /** @internal Hash function used by membership library. */
 #if defined(RTE_ARCH_X86) || defined(__ARM_FEATURE_CRC32)
 #include <rte_hash_crc.h>
@@ -104,6 +122,7 @@  struct rte_member_parameters;
 enum rte_member_setsum_type {
 	RTE_MEMBER_TYPE_HT = 0,  /**< Hash table based set summary. */
 	RTE_MEMBER_TYPE_VBF,     /**< Vector of bloom filters. */
+	RTE_MEMBER_TYPE_SKETCH,
 	RTE_MEMBER_NUM_TYPE
 };
 
@@ -114,6 +133,19 @@  enum rte_member_sig_compare_function {
 	RTE_MEMBER_COMPARE_NUM
 };
 
+/* sketch update function with different implementations. */
+typedef void (*sketch_update_fn_t)(const struct rte_member_setsum *ss,
+				   const void *key,
+				   uint32_t count);
+
+/* sketch lookup function with different implementations. */
+typedef uint64_t (*sketch_lookup_fn_t)(const struct rte_member_setsum *ss,
+				       const void *key);
+
+/* sketch delete function with different implementations. */
+typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss,
+				   const void *key);
+
 /** @internal setsummary structure. */
 struct rte_member_setsum {
 	enum rte_member_setsum_type type; /* Type of the set summary. */
@@ -134,6 +166,21 @@  struct rte_member_setsum {
 	uint32_t bit_mask;	/* Bit mask to get bit location in bf. */
 	uint32_t num_hashes;	/* Number of hash values to index bf. */
 
+	/* Parameters for sketch */
+	float error_rate;
+	float sample_rate;
+	uint32_t num_col;
+	uint32_t num_row;
+	int always_bounded;
+	double converge_thresh;
+	uint32_t topk;
+	uint32_t count_byte;
+	uint64_t *hash_seeds;
+	sketch_update_fn_t sketch_update; /* Pointer to the sketch update function */
+	sketch_lookup_fn_t sketch_lookup; /* Pointer to the sketch lookup function */
+	sketch_delete_fn_t sketch_delete; /* Pointer to the sketch delete function */
+
+	void *runtime_var;
 	uint32_t mul_shift;  /* vbf internal variable used during bit test. */
 	uint32_t div_shift;  /* vbf internal variable used during bit test. */
 
@@ -143,6 +190,9 @@  struct rte_member_setsum {
 	/* Second cache line should start here. */
 	uint32_t socket_id;          /* NUMA Socket ID for memory. */
 	char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */
+#ifdef RTE_ARCH_X86
+	bool use_avx512;
+#endif
 } __rte_cache_aligned;
 
 /**
@@ -261,8 +311,33 @@  struct rte_member_parameters {
 	 */
 	uint32_t sec_hash_seed;
 
+	/**
+	 * For count(min) sketch data structure, error rate defines the accuracy
+	 * required by the user. Higher accuracy leads to more memory usage, but
+	 * the flow size is estimated more accurately.
+	 */
+	float error_rate;
+
+	/**
+	 * Sampling rate means the internal sample rate of the rows of the count
+	 * min sketches. Lower sampling rate can reduce CPU overhead, but the
+	 * data structure will require more time to converge statistically.
+	 */
+	float sample_rate;
+
+	/**
+	 * How many top heavy hitters are to be reported. The library will internally
+	 * keep the keys of heavy hitters for final report.
+	 */
+	uint32_t top_k;
+
+	/**
+	 * Extra flags that may passed in by user
+	 */
+	uint32_t extra_flag;
+
 	int socket_id;			/**< NUMA Socket ID for memory. */
-};
+} __rte_cache_aligned;
 
 /**
  * @warning
@@ -418,7 +493,7 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
  *   RTE_MEMBER_NO_MATCH by default is set as 0.
  *   For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved.
  *   For vBF mode the set id is limited by the num_set parameter when create
- *   the set-summary.
+ *   the set-summary. For sketch mode, this id is ignored.
  * @return
  *   HT (cache mode) and vBF should never fail unless the set_id is not in the
  *   valid range. In such case -EINVAL is returned.
@@ -429,12 +504,72 @@  rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum,
  *   Return 0 for HT (cache mode) if the add does not cause
  *   eviction, return 1 otherwise. Return 0 for non-cache mode if success,
  *   -ENOSPC for full, and 1 if cuckoo eviction happens.
- *   Always returns 0 for vBF mode.
+ *   Always returns 0 for vBF mode and sketch.
  */
 int
 rte_member_add(const struct rte_member_setsum *setsum, const void *key,
 			member_set_t set_id);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add the packet byte size into the sketch
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be added.
+ * @param byte_count
+ *   Add the byte count of the packet into the sketch.
+ * @return
+ * Return -EINVAL for invalid parameters, otherwise return 0.
+ */
+int
+rte_member_add_byte_count(const struct rte_member_setsum *setsum,
+			  const void *key, uint32_t byte_count);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Query the packet count or byte count of a key in the sketch.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the key to be queried.
+ * @param count
+ *   The output packet count or byte count.
+ * @return
+ *   Return -EINVAL for invalid parameters.
+ */
+int
+rte_member_query_count(const struct rte_member_setsum *setsum,
+		       const void *key, uint64_t *count);
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Report the top-k heavy hitter keys and their estimated counts.
+ *
+ * @param setsum
+ *   Pointer of a set-summary.
+ * @param key
+ *   Pointer of the output top-k key array.
+ * @param count
+ *   Pointer of the output packet count or byte count array of the top-k keys.
+ * @return
+ *   Return -EINVAL for invalid parameters. Return a positive integer indicating
+ *   how many heavy hitters are reported.
+ */
+int
+rte_member_report_heavyhitter(const struct rte_member_setsum *setsum,
+			      void **keys, uint64_t *counts);
+
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
diff --git a/lib/member/rte_member_heap.h b/lib/member/rte_member_heap.h
new file mode 100644
index 0000000000..3d265c793f
--- /dev/null
+++ b/lib/member/rte_member_heap.h
@@ -0,0 +1,449 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
+ */
+
+#ifndef _RTE_MEMBER_HEAP_H_
+#define _RTE_MEMBER_HEAP_H_
+
+#include <rte_ring_elem.h>
+#include "rte_member.h"
+
+#define LCHILD(x) (2 * x + 1)
+#define RCHILD(x) (2 * x + 2)
+#define PARENT(x) ((x - 1) / 2)
+
+#define HASH_BKT_SIZE 16
+#define HASH_HP_MULTI 4
+#define HASH_RESIZE_MULTI 2
+
+#define MINHEAP_OPTIMIZED_HT
+
+/*
+ * One hash bucket: parallel arrays of 16-bit key signatures and heap
+ * positions. idx[] stores heap position + 1 so that 0 can mark an
+ * empty slot.
+ */
+struct hash_bkt {
+	uint16_t sig[HASH_BKT_SIZE];
+	uint16_t idx[HASH_BKT_SIZE];
+};
+
+/* Signature-based hash table indexing keys to min-heap positions. */
+struct hash {
+	uint16_t bkt_cnt;	/* number of buckets */
+	uint16_t num_item;	/* number of occupied slots */
+	uint32_t seed;		/* hash seed */
+	/* C99 flexible array member; [0] is a GNU extension */
+	struct hash_bkt buckets[];
+};
+
+/* one heap element: pointer into the key-slot area plus its count */
+struct node {
+	void *key;
+	uint64_t count;
+};
+
+/* min-heap of the top-k keys, ordered by count */
+struct minheap {
+	uint32_t key_len;	/* length of each stored key in bytes */
+	uint32_t size;		/* current number of elements */
+	uint32_t socket;	/* NUMA socket used for allocations */
+	struct hash *hashtable;	/* key -> heap position index */
+	struct node *elem;	/* element array */
+};
+
+#ifdef MINHEAP_OPTIMIZED_HT
+/*
+ * Insert @value (heap position + 1, never 0) for @key into the first
+ * free slot of the key's bucket.
+ * Return 0 on success, -ENOMEM when the bucket is full.
+ */
+static int
+hash_table_insert(const void *key, int value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+
+	for (int i = 0; i < HASH_BKT_SIZE; i++) {
+		/* idx[i] == 0 marks an empty slot */
+		if (table->buckets[idx].idx[i] == 0) {
+			table->buckets[idx].idx[i] = value;
+			table->buckets[idx].sig[i] = sig;
+			table->num_item++;
+			return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Replace @old_value with @value in the bucket slot whose signature
+ * matches @key. Return 0 on success, -1 if no matching slot exists.
+ */
+static int
+hash_table_update(const void *key, int old_value, int value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+
+	for (int i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == old_value) {
+			table->buckets[idx].idx[i] = value;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Clear the bucket slot holding (@key's signature, @value).
+ * Return 0 on success, -1 if the entry is not present.
+ */
+static int
+hash_table_del(const void *key, uint16_t value, int key_len, struct hash *table)
+{
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+
+	for (int i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] == value) {
+			table->buckets[idx].idx[i] = 0;
+			table->num_item--;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Look up @key: candidate slots are filtered by the 16-bit signature,
+ * then confirmed by a full key compare against the referenced heap
+ * element. Return the heap index (0-based) or -ENOENT.
+ */
+static int
+hash_table_lookup(const void *key, int key_len, struct minheap *hp)
+{
+	struct hash *table = hp->hashtable;
+	uint32_t hash = MEMBER_HASH_FUNC(key, key_len, table->seed);
+	uint16_t idx = hash % table->bkt_cnt;
+	uint16_t sig = hash >> 16;
+
+	for (int i = 0; i < HASH_BKT_SIZE; i++) {
+		if (table->buckets[idx].sig[i] == sig && table->buckets[idx].idx[i] != 0) {
+			/* stored position is heap index + 1 */
+			uint32_t hp_idx = table->buckets[idx].idx[i] - 1;
+
+			if (memcmp(hp->elem[hp_idx].key, key, hp->key_len) == 0)
+				return hp_idx;
+		}
+	}
+
+	return -ENOENT; /* key doesn't exist */
+}
+
+/*
+ * Grow the hash table by HASH_RESIZE_MULTI and re-insert every key
+ * currently in the heap; keeps retrying with a larger table until all
+ * keys fit. Return 0 on success, -ENOMEM on allocation failure.
+ */
+static int
+resize_hash_table(struct minheap *hp)
+{
+	uint32_t i;
+	uint32_t new_bkt_cnt;
+	/* the zeroed replacement table must inherit the original seed,
+	 * otherwise all hashing silently falls back to seed 0
+	 */
+	uint32_t seed = hp->hashtable->seed;
+
+	while (1) {
+		new_bkt_cnt = hp->hashtable->bkt_cnt * HASH_RESIZE_MULTI;
+
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT load factor is [%f]\n",
+			hp->hashtable->num_item / ((float)hp->hashtable->bkt_cnt * HASH_BKT_SIZE));
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT resize happen!\n");
+		rte_free(hp->hashtable);
+		hp->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+						new_bkt_cnt * sizeof(struct hash_bkt),
+						RTE_CACHE_LINE_SIZE, hp->socket);
+
+		if (hp->hashtable == NULL) {
+			RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n");
+			return -ENOMEM;
+		}
+
+		hp->hashtable->seed = seed;
+		hp->hashtable->bkt_cnt = new_bkt_cnt;
+
+		for (i = 0; i < hp->size; ++i) {
+			if (hash_table_insert(hp->elem[i].key,
+				i + 1, hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR,
+					"Sketch Minheap HT resize insert fail!\n");
+				break;
+			}
+		}
+		/* all keys re-inserted: the new table is big enough */
+		if (i == hp->size)
+			break;
+	}
+
+	return 0;
+}
+#endif /* MINHEAP_OPTIMIZED_HT */
+
+/* Find the item in the given minheap.
+ * Returns the element index on success, -ENOENT when absent.
+ */
+static int
+rte_member_minheap_find(struct minheap *hp, const void *key)
+{
+#ifdef MINHEAP_OPTIMIZED_HT
+	int idx = hash_table_lookup(key, hp->key_len, hp);
+	return idx;
+#else
+	/* fallback: linear scan over the element array */
+	uint32_t idx;
+
+	for (idx = 0; idx < hp->size; ++idx) {
+		if (memcmp(hp->elem[idx].key, key, hp->key_len) == 0)
+			return idx;
+	}
+
+	return -ENOENT; /* key doesn't exist */
+#endif
+}
+
+/*
+ * Allocate the heap element array and its lookup hash table on the
+ * given NUMA socket. Return 0 on success, -ENOMEM on failure (the
+ * element array is released if the hash table allocation fails).
+ */
+static int
+rte_member_minheap_init(struct minheap *heap, int size,
+			uint32_t socket, uint32_t seed)
+{
+	heap->elem = rte_zmalloc_socket(NULL, sizeof(struct node) * size,
+				RTE_CACHE_LINE_SIZE, socket);
+	if (heap->elem == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap elem allocation failed\n");
+		return -ENOMEM;
+	}
+
+	/* size the table for HASH_HP_MULTI entries per heap element */
+	uint32_t hash_bkt_cnt = rte_align32pow2(size * HASH_HP_MULTI) / HASH_BKT_SIZE;
+
+	if (hash_bkt_cnt == 0)
+		hash_bkt_cnt = 1;
+
+	heap->hashtable = rte_zmalloc_socket(NULL, sizeof(struct hash) +
+					hash_bkt_cnt * sizeof(struct hash_bkt),
+					RTE_CACHE_LINE_SIZE, socket);
+
+	if (heap->hashtable == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap HT allocation failed\n");
+		rte_free(heap->elem);
+		return -ENOMEM;
+	}
+
+	heap->hashtable->seed = seed;
+	heap->hashtable->bkt_cnt = hash_bkt_cnt;
+	heap->socket = socket;
+
+	return 0;
+}
+
+/* swap the minheap nodes (by value: both key pointer and count) */
+static __rte_always_inline void
+rte_member_heap_swap(struct node *n1, struct node *n2)
+{
+	struct node temp = *n1;
+	*n1 = *n2;
+	*n2 = temp;
+}
+
+/* Heapify function: sift the element at @idx down until the min-heap
+ * property holds. When @update_hash is set, the key-to-position hash
+ * table is kept in sync after every swap (positions stored as
+ * index + 1).
+ */
+static void
+rte_member_heapify(struct minheap *hp, uint32_t idx, bool update_hash)
+{
+	uint32_t smallest;
+
+	/* pick the smallest of idx and its two children */
+	if (LCHILD(idx) < hp->size &&
+			hp->elem[LCHILD(idx)].count < hp->elem[idx].count)
+		smallest = LCHILD(idx);
+	else
+		smallest = idx;
+
+	if (RCHILD(idx) < hp->size &&
+			hp->elem[RCHILD(idx)].count < hp->elem[smallest].count)
+		smallest = RCHILD(idx);
+
+	if (smallest != idx) {
+		rte_member_heap_swap(&(hp->elem[idx]), &(hp->elem[smallest]));
+
+#ifdef MINHEAP_OPTIMIZED_HT
+		if (update_hash) {
+			if (hash_table_update(hp->elem[smallest].key, idx + 1, smallest + 1,
+					hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+				return;
+			}
+
+			if (hash_table_update(hp->elem[idx].key, smallest + 1, idx + 1,
+					hp->key_len, hp->hashtable) < 0) {
+				RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+				return;
+			}
+		}
+#endif
+		/* continue sifting down from the swapped position */
+		rte_member_heapify(hp, smallest, update_hash);
+	}
+}
+
+/* Insert a node into the minheap: take a free key slot from the ring,
+ * copy the key there, then sift the new node up to its position.
+ * A full index bucket triggers a hash-table resize.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+rte_member_minheap_insert_node(struct minheap *hp, const void *key,
+			       int counter, void *key_slot,
+			       struct rte_ring *free_key_slot)
+{
+	struct node nd;
+	uint32_t slot_id;
+
+	if (rte_ring_sc_dequeue_elem(free_key_slot, &slot_id, sizeof(uint32_t)) != 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap get empty keyslot failed\n");
+		return -1;
+	}
+
+	nd.count = counter;
+	nd.key = RTE_PTR_ADD(key_slot, slot_id * hp->key_len);
+
+	memcpy(nd.key, key, hp->key_len);
+
+	/* sift up: move parents down until nd's position is found */
+	uint32_t i = (hp->size)++;
+
+	while (i && nd.count < hp->elem[PARENT(i)].count) {
+		hp->elem[i] = hp->elem[PARENT(i)];
+#ifdef MINHEAP_OPTIMIZED_HT
+		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+				hp->key_len, hp->hashtable) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+			return -1;
+		}
+#endif
+		i = PARENT(i);
+	}
+	hp->elem[i] = nd;
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_insert(key, i + 1, hp->key_len, hp->hashtable) < 0) {
+		/* resize re-inserts every heap key, including this one */
+		if (resize_hash_table(hp) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table resize failed\n");
+			return -1;
+		}
+	}
+#endif
+	return 0;
+}
+
+/* Delete a key from the minheap: release its key slot back to the
+ * free-slot ring, move the last element into its place and re-heapify.
+ * Returns 0 on success, -1 if the key is absent or the index update
+ * fails.
+ */
+static int
+rte_member_minheap_delete_node(struct minheap *hp, const void *key,
+			       void *key_slot, struct rte_ring *free_key_slot)
+{
+	int idx = rte_member_minheap_find(hp, key);
+	uint32_t offset;
+
+	/* find() returns a negative errno when the key is absent; it must
+	 * not be used to index hp->elem[]
+	 */
+	if (idx < 0)
+		return -1;
+
+	offset = RTE_PTR_DIFF(hp->elem[idx].key, key_slot) / hp->key_len;
+
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_del(key, idx + 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+		return -1;
+	}
+#endif
+	rte_ring_sp_enqueue_elem(free_key_slot, &offset, sizeof(uint32_t));
+
+	/* deleting the last element needs no re-heapify */
+	if (idx == (int)(hp->size - 1)) {
+		hp->size--;
+		return 0;
+	}
+
+	hp->elem[idx] = hp->elem[hp->size - 1];
+
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_update(hp->elem[idx].key, hp->size, idx + 1,
+				hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+		return -1;
+	}
+#endif
+	hp->size--;
+	rte_member_heapify(hp, idx, true);
+
+	return 0;
+}
+
+/* Replace a min node with a new key: evict the current minimum,
+ * recycle its key slot for the new key, then re-insert with sift-up
+ * and refresh the hash index. Returns 0 on success, -1 on an index
+ * maintenance failure.
+ */
+static int
+rte_member_minheap_replace_node(struct minheap *hp,
+				const void *new_key,
+				int new_counter)
+{
+	struct node nd;
+	void *recycle_key = NULL;
+
+	/* the evicted minimum's key slot is reused for the new key */
+	recycle_key = hp->elem[0].key;
+
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_del(recycle_key, 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table delete failed\n");
+		return -1;
+	}
+#endif
+	hp->elem[0] = hp->elem[hp->size - 1];
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_update(hp->elem[0].key, hp->size, 1,
+				hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+		return -1;
+	}
+#endif
+	hp->size--;
+
+	rte_member_heapify(hp, 0, true);
+
+	nd.count = new_counter;
+	nd.key = recycle_key;
+
+	memcpy(nd.key, new_key, hp->key_len);
+
+	/* sift up: move parents down until nd's position is found */
+	uint32_t i = (hp->size)++;
+
+	while (i && nd.count < hp->elem[PARENT(i)].count) {
+		hp->elem[i] = hp->elem[PARENT(i)];
+#ifdef MINHEAP_OPTIMIZED_HT
+		if (hash_table_update(hp->elem[i].key, PARENT(i) + 1, i + 1,
+				hp->key_len, hp->hashtable) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table update failed\n");
+			return -1;
+		}
+#endif
+		i = PARENT(i);
+	}
+
+	hp->elem[i] = nd;
+
+#ifdef MINHEAP_OPTIMIZED_HT
+	if (hash_table_insert(new_key, i + 1, hp->key_len, hp->hashtable) < 0) {
+		RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace insert failed\n");
+		/* resize re-inserts every heap key, including this one */
+		if (resize_hash_table(hp) < 0) {
+			RTE_MEMBER_LOG(ERR, "Minheap Hash Table replace resize failed\n");
+			return -1;
+		}
+	}
+#endif
+	return 0;
+}
+
+/* Sort the heap into a descending array.
+ * @result_array must hold at least hp->size nodes; hp itself is not
+ * modified (the copy is sorted in place).
+ */
+static void
+rte_member_heapsort(struct minheap *hp, struct node *result_array)
+{
+	struct minheap new_hp;
+
+	/* build a new heap for using the given array */
+	new_hp.size = hp->size;
+	new_hp.key_len = hp->key_len;
+	new_hp.elem = result_array;
+	memcpy(result_array, hp->elem, hp->size * sizeof(struct node));
+
+	/* classic heapsort: repeatedly move the minimum to the tail */
+	while (new_hp.size > 1) {
+		rte_member_heap_swap(&(new_hp.elem[0]), &(new_hp.elem[new_hp.size - 1]));
+		new_hp.size--;
+		rte_member_heapify(&new_hp, 0, false);
+	}
+}
+
+/* free the heap's element array and hash table; safe on NULL input */
+static void
+rte_member_minheap_free(struct minheap *hp)
+{
+	if (hp == NULL)
+		return;
+
+	rte_free(hp->elem);
+	rte_free(hp->hashtable);
+}
+
+/* empty the heap and clear its index hash table; safe on NULL input */
+static void
+rte_member_minheap_reset(struct minheap *hp)
+{
+	if (hp == NULL)
+		return;
+
+	memset(hp->elem, 0, sizeof(struct node) * hp->size);
+	hp->size = 0;
+
+#ifdef MINHEAP_OPTIMIZED_HT
+	/* clear only the buckets; the header (seed, bkt_cnt) is kept */
+	memset((char *)hp->hashtable + sizeof(struct hash), 0,
+			hp->hashtable->bkt_cnt * sizeof(struct hash_bkt));
+	hp->hashtable->num_item = 0;
+#endif
+}
+
+#endif /* _RTE_MEMBER_HEAP_H_ */
diff --git a/lib/member/rte_member_sketch.c b/lib/member/rte_member_sketch.c
new file mode 100644
index 0000000000..0c5a0bad99
--- /dev/null
+++ b/lib/member/rte_member_sketch.c
@@ -0,0 +1,584 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ * Copyright(c) 2020, Alan Liu <zaoxingliu@gmail.com>
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_prefetch.h>
+#include <rte_ring_elem.h>
+
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+#include "rte_member_heap.h"
+
+#ifdef CC_AVX512_SUPPORT
+#include "rte_member_sketch_avx512.h"
+#endif /* CC_AVX512_SUPPORT */
+
+struct sketch_runtime {
+	uint64_t pkt_cnt;
+	uint32_t until_next;
+	int converged;
+	struct minheap heap;
+	struct node *report_array;
+	void *key_slots;
+	struct rte_ring *free_key_slots;
+} __rte_cache_aligned;
+
+/*
+ * Draw a geometrically distributed sample: how many updates to skip
+ * before the next sampled one, given the configured sample rate.
+ * Returns 1 when the sample rate is 1 (i.e. no sampling).
+ */
+static uint32_t
+draw_geometric(const struct rte_member_setsum *ss)
+{
+	double rand = 1;
+
+	if (ss->sample_rate == 1)
+		return 1;
+
+	/* uniform draw in the open interval (0, 1) */
+	while (rand == 1 || rand == 0)
+		rand = (double) rte_rand() / (UINT64_MAX);
+
+	/* inverse-CDF of the geometric distribution */
+	return (uint32_t)ceil(log(1 - rand) / log(1 - ss->sample_rate));
+}
+
+/* in-place insertion sort, ascending; intended for small n */
+static void
+isort(uint64_t *array, int n)
+{
+	for (int i = 1; i < n; i++) {
+		uint64_t t = array[i];
+		int j;
+
+		/* shift larger elements right to open a slot for t */
+		for (j = i - 1; j >= 0; j--) {
+			if (t < array[j])
+				array[j + 1] = array[j];
+			else
+				break;
+		}
+		array[j + 1] = t;
+	}
+}
+
+/* exchange two uint64_t values through pointers */
+static __rte_always_inline void
+swap(uint64_t *a, uint64_t *b)
+{
+	uint64_t tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*
+ * Return the median of five values using a fixed comparison network
+ * (at most six comparisons). Arguments are taken by value, so the
+ * internal swaps do not affect the caller.
+ */
+static uint64_t
+medianof5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
+{
+	if (a > b)
+		swap(&a, &b);
+	if (c > d)
+		swap(&c, &d);
+	if (a > c) {
+		if (d > e)
+			swap(&c, &e);
+		else {
+			swap(&c, &d);
+			swap(&d, &e);
+		}
+	} else {
+		if (b > e)
+			swap(&a, &e);
+		else {
+			swap(&a, &b);
+			swap(&b, &e);
+		}
+	}
+
+	if (a > c)
+		return a > d ? d : a;
+	else
+		return b > c ? c : b;
+}
+
+/*
+ * Create a NitroSketch set-summary.
+ * Selects AVX512 or scalar update/lookup/delete ops, sizes the counter
+ * table from the requested error rate, and allocates the runtime state
+ * (top-k heap, key slots, report array).
+ *
+ * Ownership: on failure every allocation made here is released and
+ * the corresponding ss fields are reset, but @ss itself and @ring are
+ * owned and freed by the caller (rte_member_create error path), so
+ * they are deliberately NOT freed here.
+ *
+ * Returns 0 on success or a negative errno value on error.
+ */
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+			 const struct rte_member_parameters *params,
+			 struct rte_ring *ring)
+{
+	struct sketch_runtime *runtime;
+	uint32_t num_col;
+	uint32_t i;
+
+	if (params->sample_rate == 0 || params->sample_rate > 1) {
+		rte_errno = EINVAL;
+		RTE_MEMBER_LOG(ERR,
+			"Membership Sketch created with invalid parameters\n");
+		return -EINVAL;
+	}
+
+	if (params->extra_flag & RTE_MEMBER_SKETCH_COUNT_BYTE)
+		ss->count_byte = 1;
+
+#ifdef CC_AVX512_SUPPORT
+	/* use_avx512 only exists on x86 builds; keep all references to
+	 * it inside the CC_AVX512_SUPPORT guard to avoid a build break
+	 * on other architectures
+	 */
+	if (ss->count_byte == 1 &&
+		rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512IFMA) == 1)
+		ss->use_avx512 = true;
+
+	if (ss->use_avx512 == true) {
+		ss->num_row = NUM_ROW_VEC;
+		RTE_MEMBER_LOG(NOTICE,
+			"Membership Sketch AVX512 update/lookup/delete ops is selected\n");
+		ss->sketch_update = sketch_update_avx512;
+		ss->sketch_lookup = sketch_lookup_avx512;
+		ss->sketch_delete = sketch_delete_avx512;
+	} else
+#endif
+	{
+		ss->num_row = NUM_ROW_SCALAR;
+		RTE_MEMBER_LOG(NOTICE,
+			"Membership Sketch SCALAR update/lookup/delete ops is selected\n");
+		ss->sketch_update = sketch_update_scalar;
+		ss->sketch_lookup = sketch_lookup_scalar;
+		ss->sketch_delete = sketch_delete_scalar;
+	}
+
+	ss->socket_id = params->socket_id;
+
+	/* number of columns is derived from the requested error rate */
+	if (ss->count_byte == 0)
+		num_col = 4.0 / params->error_rate / params->sample_rate;
+#ifdef CC_AVX512_SUPPORT
+	else if (ss->use_avx512 == true)
+		num_col = rte_align32pow2(4.0 / params->error_rate);
+#endif
+	else
+		num_col = 4.0 / params->error_rate;
+
+	ss->table = rte_zmalloc_socket(NULL,
+			sizeof(uint64_t) * num_col * ss->num_row,
+			RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->table == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Table memory allocation failed\n");
+		return -ENOMEM;
+	}
+
+	ss->hash_seeds = rte_zmalloc_socket(NULL, sizeof(uint64_t) * ss->num_row,
+			RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->hash_seeds == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Hashseeds memory allocation failed\n");
+		goto error_table;
+	}
+
+	ss->runtime_var = rte_zmalloc_socket(NULL, sizeof(struct sketch_runtime),
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (ss->runtime_var == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Runtime memory allocation failed\n");
+		goto error_hash_seeds;
+	}
+	runtime = ss->runtime_var;
+
+	ss->num_col = num_col;
+	ss->sample_rate = params->sample_rate;
+	ss->prim_hash_seed = params->prim_hash_seed;
+	ss->sec_hash_seed = params->sec_hash_seed;
+	ss->error_rate = params->error_rate;
+	ss->topk = params->top_k;
+	ss->key_len = params->key_len;
+	runtime->heap.key_len = ss->key_len;
+
+	runtime->key_slots = rte_zmalloc_socket(NULL, ss->key_len * ss->topk,
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (runtime->key_slots == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Key Slots allocation failed\n");
+		goto error_runtime;
+	}
+
+	/* publish all top_k key slots as free */
+	runtime->free_key_slots = ring;
+	for (i = 0; i < ss->topk; i++)
+		rte_ring_sp_enqueue_elem(runtime->free_key_slots,
+					&i, sizeof(uint32_t));
+
+	if (rte_member_minheap_init(&(runtime->heap), params->top_k,
+			ss->socket_id, params->prim_hash_seed) < 0) {
+		RTE_MEMBER_LOG(ERR, "Sketch Minheap allocation failed\n");
+		goto error_key_slots;
+	}
+
+	runtime->report_array = rte_zmalloc_socket(NULL, sizeof(struct node) * ss->topk,
+					RTE_CACHE_LINE_SIZE, ss->socket_id);
+	if (runtime->report_array == NULL) {
+		RTE_MEMBER_LOG(ERR, "Sketch Runtime Report Array allocation failed\n");
+		goto error_heap;
+	}
+
+	/* one independent hash seed per sketch row */
+	rte_srand(ss->prim_hash_seed);
+	for (i = 0; i < ss->num_row; i++)
+		ss->hash_seeds[i] = rte_rand();
+
+	if (params->extra_flag & RTE_MEMBER_SKETCH_ALWAYS_BOUNDED)
+		ss->always_bounded = 1;
+
+	if (ss->always_bounded) {
+		double delta = 1.0 / (pow(2, ss->num_row));
+
+		ss->converge_thresh = 10 * pow(ss->error_rate, -2.0) * sqrt(log(1/delta));
+	}
+
+	RTE_MEMBER_LOG(DEBUG, "Sketch created, "
+		"the total memory required is %u Bytes\n",  ss->num_col * ss->num_row * 8);
+
+	return 0;
+
+/* unwind in reverse allocation order; @ss and @ring belong to the
+ * caller and must not be freed here (double-free otherwise)
+ */
+error_heap:
+	rte_member_minheap_free(&runtime->heap);
+error_key_slots:
+	rte_free(runtime->key_slots);
+error_runtime:
+	rte_free(runtime);
+	ss->runtime_var = NULL;
+error_hash_seeds:
+	rte_free(ss->hash_seeds);
+	ss->hash_seeds = NULL;
+error_table:
+	rte_free(ss->table);
+	ss->table = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Scalar estimate of one key's count.
+ * With sample rate 1 (plain count-min) or byte counting the minimum
+ * row counter is reported; otherwise the median across rows is used.
+ */
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col[ss->num_row];
+	uint64_t count_row[ss->num_row];
+	uint32_t cur_row;
+	uint64_t count;
+
+	/* compute every row's column first so counters can be prefetched */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+		rte_prefetch0(&count_array[cur_row * ss->num_col + col[cur_row]]);
+	}
+
+	/* if sample rate is 1, it is a regular count-min, we report the min */
+	if (ss->sample_rate == 1 || ss->count_byte == 1)
+		return count_min(ss, col);
+
+	/* otherwise we report the median number */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+		count_row[cur_row] = count_array[cur_row * ss->num_col + col[cur_row]];
+
+	/* fast path: fixed comparison network for the common 5-row case */
+	if (ss->num_row == 5)
+		return medianof5(count_row[0], count_row[1],
+				count_row[2], count_row[3], count_row[4]);
+
+	isort(count_row, ss->num_row);
+
+	if (ss->num_row % 2 == 0) {
+		count = (count_row[ss->num_row / 2] + count_row[ss->num_row / 2 - 1]) / 2;
+		return count;
+	}
+	/* ss->num_row % 2 != 0 */
+	count = count_row[ss->num_row / 2];
+
+	return count;
+}
+
+/*
+ * Delete a key from the sketch: find the minimum of the key's counters
+ * across all rows and subtract that minimum from each of them.
+ */
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+	uint64_t *count_array = ss->table;
+	uint64_t min = UINT64_MAX;
+	uint32_t cur_row;
+
+	/* compute every row's column first so counters can be prefetched */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col[cur_row] = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+		rte_prefetch0(&count_array[cur_row * ss->num_col + col[cur_row]]);
+	}
+
+	/* if sample rate is 1, it is a regular count-min, we report the min */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		uint64_t cnt = count_array[cur_row * ss->num_col + col[cur_row]];
+
+		if (cnt < min)
+			min = cnt;
+	}
+
+	/* subtract the min value from all the counters */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+		count_array[cur_row * ss->num_col + col[cur_row]] -= min;
+}
+
+/* report the sketch's estimated count for @key via *output; always 0 */
+int
+rte_member_query_sketch(const struct rte_member_setsum *ss,
+			const void *key,
+			uint64_t *output)
+{
+	uint64_t count = ss->sketch_lookup(ss, key);
+	*output = count;
+
+	return 0;
+}
+
+/*
+ * Refresh the cached count of every key held in the top-k heap from
+ * the current sketch content (counts may have grown since insertion).
+ */
+void
+rte_member_update_heap(const struct rte_member_setsum *ss)
+{
+	uint32_t i;
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	for (i = 0; i < runtime_var->heap.size; i++) {
+		uint64_t count = ss->sketch_lookup(ss, runtime_var->heap.elem[i].key);
+
+		runtime_var->heap.elem[i].count = count;
+	}
+}
+
+/*
+ * Sort the top-k heap into the runtime report array and copy out the
+ * key pointers and counts. Returns the number of entries reported.
+ */
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+				     void **key,
+				     uint64_t *count)
+{
+	uint32_t i;
+	struct sketch_runtime *runtime_var = setsum->runtime_var;
+
+	/* refresh heap counts from the sketch before sorting */
+	rte_member_update_heap(setsum);
+	rte_member_heapsort(&(runtime_var->heap), runtime_var->report_array);
+
+	for (i = 0; i < runtime_var->heap.size; i++) {
+		key[i] = runtime_var->report_array[i].key;
+		count[i] = runtime_var->report_array[i].count;
+	}
+
+	return runtime_var->heap.size;
+}
+
+/*
+ * Look up a key in the sketch: *set_id is set to 1 ("heavy" set) when
+ * the estimate reaches the smallest top-k heap count, otherwise 0.
+ * Returns 1 if the estimated count is non-zero, 0 otherwise.
+ */
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *ss,
+			 const void *key, member_set_t *set_id)
+{
+	uint64_t count = ss->sketch_lookup(ss, key);
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	/* heap.elem[0] is the minimum of the tracked top-k counts */
+	if (runtime_var->heap.size > 0 && count >= runtime_var->heap.elem[0].count)
+		*set_id = 1;
+	else
+		*set_id = 0;
+
+	if (count == 0)
+		return 0;
+	else
+		return 1;
+}
+
+/*
+ * Mark the sketch as converged once the total update count exceeds
+ * the threshold computed at creation time; after convergence the
+ * caller may switch to geometric sampling.
+ */
+static void
+should_converge(const struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	/* For count min sketch - L1 norm */
+	if (runtime_var->pkt_cnt > ss->converge_thresh) {
+		runtime_var->converged = 1;
+		/* PRIu64, not %lu: pkt_cnt is uint64_t and %lu is only
+		 * 32 bits wide on 32-bit targets
+		 */
+		RTE_MEMBER_LOG(DEBUG, "Sketch converged, begin sampling "
+					"from key count %" PRIu64 "\n",
+					runtime_var->pkt_cnt);
+	}
+}
+
+/*
+ * Add @count, scaled up by the inverse sample rate to stay unbiased,
+ * to the key's counter in the single sketch row @cur_row.
+ */
+static void
+sketch_update_row(const struct rte_member_setsum *ss, const void *key,
+		  uint32_t count, uint32_t cur_row)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col = MEMBER_HASH_FUNC(key, ss->key_len,
+			ss->hash_seeds[cur_row]) % ss->num_col;
+
+	/* sketch counter update */
+	count_array[cur_row * ss->num_col + col] +=
+			ceil(count / (ss->sample_rate));
+}
+
+/* unsampled scalar update: add @count to the key's counter in every row */
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t col;
+	uint32_t cur_row;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		col = MEMBER_HASH_FUNC(key, ss->key_len,
+				ss->hash_seeds[cur_row]) % ss->num_col;
+		count_array[cur_row * ss->num_col + col] += count;
+	}
+}
+
+/*
+ * Maintain the top-k min-heap after a sketch update for @key:
+ * - If the key's new estimate beats the heap root (current k-th largest),
+ *   either refresh the key's existing entry, insert it if the heap has
+ *   room, or evict the root in its favor.
+ * - Otherwise, as long as the heap is not yet full, still insert/refresh
+ *   the key so the first k distinct keys are tracked.
+ * NOTE(review): when the heap is empty, heap.elem[0].count is read before
+ * any insert — presumably the heap storage is zero-initialized at create
+ * time so this compares against 0; confirm in rte_member_minheap init.
+ * NOTE(review): the found>=0 refresh logic is duplicated in both outer
+ * branches; a future cleanup could hoist it.
+ */
+static void
+heap_update(const struct rte_member_setsum *ss, const void *key)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint64_t key_cnt = 0;
+	int found;
+
+	/* We also update the heap for this key */
+	key_cnt = ss->sketch_lookup(ss, key);
+	if (key_cnt > runtime_var->heap.elem[0].count) {
+		found = rte_member_minheap_find(&runtime_var->heap, key);
+		/* the key is found in the top-k heap */
+		if (found >= 0) {
+			/* count grew: sift the entry to restore heap order */
+			if (runtime_var->heap.elem[found].count < key_cnt)
+				rte_member_heapify(&runtime_var->heap, found, true);
+
+			runtime_var->heap.elem[found].count = key_cnt;
+		} else if (runtime_var->heap.size < ss->topk) {
+			rte_member_minheap_insert_node(&runtime_var->heap, key,
+				key_cnt, runtime_var->key_slots, runtime_var->free_key_slots);
+		} else {
+			/* heap full and key beats the minimum: evict root */
+			rte_member_minheap_replace_node(&runtime_var->heap, key, key_cnt);
+		}
+	} else if (runtime_var->heap.size < ss->topk) {
+		found = rte_member_minheap_find(&runtime_var->heap, key);
+		if (found >= 0) {
+			if (runtime_var->heap.elem[found].count < key_cnt)
+				rte_member_heapify(&runtime_var->heap, found, true);
+
+			runtime_var->heap.elem[found].count = key_cnt;
+		} else
+			rte_member_minheap_insert_node(&runtime_var->heap, key,
+				key_cnt, runtime_var->key_slots, runtime_var->free_key_slots);
+	}
+}
+
+/*
+ * Add a single packet into the sketch.
+ * Sketch value is measured by packet numbers in this mode.
+ * Three paths, per the NitroSketch design:
+ * 1. sample_rate == 1: plain per-packet update of all rows.
+ * 2. always-bounded mode before convergence: update all rows and check
+ *    the convergence criterion every INTERVAL packets.
+ * 3. converged/sampling mode: update only geometrically-sampled rows,
+ *    carrying the skip distance across packets in until_next.
+ */
+int
+rte_member_add_sketch(const struct rte_member_setsum *ss,
+		      const void *key,
+		      __rte_unused member_set_t set_id)
+{
+	uint32_t cur_row;
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint32_t *until_next = &(runtime_var->until_next);
+
+	/*
+	 * If sketch is measured by byte count,
+	 * the rte_member_add_sketch_byte_count routine should be used.
+	 */
+	if (ss->count_byte == 1)
+		return -EINVAL;
+
+	if (ss->sample_rate == 1) {
+		ss->sketch_update(ss, key, 1);
+		heap_update(ss, key);
+		return 0;
+	}
+
+	/* convergence stage if it's needed */
+	if (ss->always_bounded && !runtime_var->converged) {
+		ss->sketch_update(ss, key, 1);
+
+		/* cheap modulo: INTERVAL is asserted to be a power of 2 */
+		if (!((++runtime_var->pkt_cnt) & (INTERVAL - 1)))
+			should_converge(ss);
+
+		heap_update(ss, key);
+		return 0;
+	}
+
+	/* should we skip this packet */
+	if (*until_next >= ss->num_row) {
+		*until_next -= ss->num_row;
+		return 0;
+	}
+	cur_row = *until_next;
+	/* walk geometrically-spaced rows within this packet's num_row rows */
+	do {
+		sketch_update_row(ss, key, 1, cur_row);
+		*until_next = draw_geometric(ss);
+		if (cur_row + *until_next >= ss->num_row)
+			break;
+		cur_row += *until_next;
+	} while (1);
+
+	/* carry the remaining skip distance over to the next packet */
+	*until_next -= (ss->num_row - cur_row);
+
+	heap_update(ss, key);
+
+	return 0;
+}
+
+/*
+ * Add the byte count of the packet into the sketch.
+ * Sketch value is measured by byte count numbers in this mode.
+ * The sketch counters are always updated (the vectorized update makes
+ * this cheap); only the heap maintenance is sampled geometrically via
+ * the until_next countdown.
+ */
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+				 const void *key,
+				 uint32_t byte_count)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint32_t *until_next = &(runtime_var->until_next);
+
+	/* should not call this API if not in count byte mode */
+	if (ss->count_byte == 0)
+		return -EINVAL;
+
+	/* there's specific optimization for the sketch update */
+	ss->sketch_update(ss, key, byte_count);
+
+	/* skip heap maintenance until the sampling countdown expires */
+	if (*until_next != 0) {
+		*until_next = *until_next - 1;
+		return 0;
+	}
+
+	*until_next = draw_geometric(ss) - 1;
+
+	heap_update(ss, key);
+
+	return 0;
+}
+
+/*
+ * Delete @key from the sketch and from the top-k heap.
+ * Only keys currently tracked in the heap can be deleted; returns -1
+ * when the key is not found, otherwise the heap-delete result.
+ */
+int
+rte_member_delete_sketch(const struct rte_member_setsum *ss,
+			 const void *key)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	int found;
+
+	found = rte_member_minheap_find(&runtime_var->heap, key);
+	if (found < 0)
+		return -1;
+
+	ss->sketch_delete(ss, key);
+
+	return rte_member_minheap_delete_node
+		(&runtime_var->heap, key, runtime_var->key_slots, runtime_var->free_key_slots);
+}
+
+/*
+ * Release all sketch-mode resources: the counter table, the top-k heap,
+ * the key-slot storage and its free-slot ring, and the runtime state
+ * itself. The setsum struct is owned and freed by the caller.
+ */
+void
+rte_member_free_sketch(struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+
+	rte_free(ss->table);
+	rte_member_minheap_free(&runtime_var->heap);
+	rte_free(runtime_var->key_slots);
+	rte_ring_free(runtime_var->free_key_slots);
+	rte_free(runtime_var);
+}
+
+/*
+ * Reset the sketch to its freshly-created state: zero all counters,
+ * empty the top-k heap, and refill the free-slot ring with all topk
+ * key-slot indices.
+ */
+void
+rte_member_reset_sketch(const struct rte_member_setsum *ss)
+{
+	struct sketch_runtime *runtime_var = ss->runtime_var;
+	uint64_t *sketch = ss->table;
+	uint32_t i;
+
+	memset(sketch, 0, sizeof(uint64_t) * ss->num_col * ss->num_row);
+	rte_member_minheap_reset(&runtime_var->heap);
+	rte_ring_reset(runtime_var->free_key_slots);
+
+	/* every key slot is free again; single-producer enqueue is safe here */
+	for (i = 0; i < ss->topk; i++)
+		rte_ring_sp_enqueue_elem(runtime_var->free_key_slots, &i, sizeof(uint32_t));
+}
diff --git a/lib/member/rte_member_sketch.h b/lib/member/rte_member_sketch.h
new file mode 100644
index 0000000000..a5e633a74e
--- /dev/null
+++ b/lib/member/rte_member_sketch.h
@@ -0,0 +1,96 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_H_
+#define _RTE_MEMBER_SKETCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_ring_elem.h>
+
+/* number of sketch rows used by the scalar implementation */
+#define NUM_ROW_SCALAR 5
+/*
+ * packet interval between convergence checks; must be a power of 2 so
+ * the hot path can test (pkt_cnt & (INTERVAL - 1)) instead of a modulo
+ */
+#define INTERVAL (1 << 15)
+
+#if !RTE_IS_POWER_OF_2(INTERVAL)
+#error sketch INTERVAL macro must be a power of 2
+#endif
+
+/* Create a sketch-mode set summary; @r is the ring used for key slots. */
+int
+rte_member_create_sketch(struct rte_member_setsum *ss,
+			 const struct rte_member_parameters *params,
+			 struct rte_ming *r);
+
+/* Look up a key's estimate; flags heavy-hitter status via set_id. */
+int
+rte_member_lookup_sketch(const struct rte_member_setsum *setsum,
+			 const void *key, member_set_t *set_id);
+
+/* Add one packet for @key (packet-count mode only). */
+int
+rte_member_add_sketch(const struct rte_member_setsum *setsum,
+		      const void *key,
+		      member_set_t set_id);
+
+/* Add a packet's byte count for @key (byte-count mode only). */
+int
+rte_member_add_sketch_byte_count(const struct rte_member_setsum *ss,
+				 const void *key, uint32_t byte_count);
+
+/* Scalar update/lookup/delete backends for ss->sketch_* callbacks. */
+void
+sketch_update_scalar(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count);
+
+uint64_t
+sketch_lookup_scalar(const struct rte_member_setsum *ss,
+		     const void *key);
+
+void
+sketch_delete_scalar(const struct rte_member_setsum *ss,
+		     const void *key);
+
+/* Remove a heap-tracked key from the sketch and the top-k heap. */
+int
+rte_member_delete_sketch(const struct rte_member_setsum *setsum,
+			 const void *key);
+
+/* Query a key's estimated count without heavy-hitter classification. */
+int
+rte_member_query_sketch(const struct rte_member_setsum *setsum,
+			const void *key, uint64_t *output);
+
+/* Free / reset all sketch-mode resources. */
+void
+rte_member_free_sketch(struct rte_member_setsum *ss);
+
+void
+rte_member_reset_sketch(const struct rte_member_setsum *setsum);
+
+/* Report the current top-k heavy hitters; returns the entry count. */
+int
+rte_member_report_heavyhitter_sketch(const struct rte_member_setsum *setsum,
+				     void **key, uint64_t *count);
+
+/* Refresh every heap entry's count from the sketch counters. */
+void
+rte_member_update_heap(const struct rte_member_setsum *ss);
+
+/*
+ * Count-min estimate: given one precomputed column index per row in
+ * @hash_results, return the minimum counter value across all rows.
+ */
+static __rte_always_inline uint64_t
+count_min(const struct rte_member_setsum *ss, const uint32_t *hash_results)
+{
+	uint64_t *count_array = ss->table;
+	uint64_t count;
+	uint32_t cur_row;
+	uint64_t min = UINT64_MAX;
+
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++) {
+		uint64_t cnt = count_array[cur_row * ss->num_col + hash_results[cur_row]];
+
+		if (cnt < min)
+			min = cnt;
+	}
+	count = min;
+
+	return count;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_H_ */
diff --git a/lib/member/rte_member_sketch_avx512.c b/lib/member/rte_member_sketch_avx512.c
new file mode 100644
index 0000000000..c83f4b6fd1
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.c
@@ -0,0 +1,69 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "rte_member_sketch_avx512.h"
+
+/*
+ * AVX512 sketch update: hash @key with 8 seeds in parallel, then
+ * gather the 8 per-row counters, add @count to each, and scatter them
+ * back. Each lane targets a distinct row (lane i is offset by
+ * i * num_col), so the scatter has no conflicting addresses.
+ * NOTE(review): assumes the vector path runs with 8 rows (NUM_ROW_VEC)
+ * — confirm against the create path.
+ */
+__rte_always_inline void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count)
+{
+	uint64_t *count_array = ss->table;
+	uint32_t num_col = ss->num_col;
+	uint32_t key_len = ss->key_len;
+	__m256i v_row_base;
+	__m256i v_hash_result;
+	__m512i current_sketch;
+	__m512i updated_sketch;
+	__m512i v_count;
+
+	const __m256i v_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+	const __m256i v_col = _mm256_set1_epi32(num_col);
+
+	/* compute the hash result parallelly */
+	v_hash_result = rte_xxh64_sketch_avx512
+		(key, key_len, *(__m512i *)ss->hash_seeds, num_col);
+	/* flat index for lane i: i * num_col + column hash */
+	v_row_base = _mm256_mullo_epi32(v_idx, v_col);
+	v_hash_result = _mm256_add_epi32(v_row_base, v_hash_result);
+
+	current_sketch =
+		_mm512_i32gather_epi64(v_hash_result, count_array, 8);
+	v_count = _mm512_set1_epi64(count);
+	updated_sketch = _mm512_add_epi64(current_sketch, v_count);
+	_mm512_i32scatter_epi64
+		((void *)count_array, v_hash_result, updated_sketch, 8);
+}
+
+/*
+ * AVX512 lookup: compute all row column-indices with one vector hash,
+ * then take the count-min over the rows.
+ * NOTE(review): the 256-bit store writes 8 lanes into col[], so this
+ * assumes ss->num_row == 8 (NUM_ROW_VEC) — confirm, otherwise the VLA
+ * would be overrun.
+ */
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+
+	/* currently only for sketch byte count mode */
+	__m256i v_hash_result = rte_xxh64_sketch_avx512
+		(key, ss->key_len, *(__m512i *)ss->hash_seeds, ss->num_col);
+	_mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+	return count_min(ss, col);
+}
+
+/*
+ * AVX512 delete: recompute the key's row columns, find its count-min
+ * estimate, and subtract that minimum from every row's counter.
+ * NOTE(review): the hash modulo here is RTE_ALIGN_FLOOR(num_col, 32)
+ * while sketch_lookup_avx512() uses num_col directly; these coincide
+ * only if num_col is a power of two >= 32 (which rte_xxh64_sketch_avx512
+ * requires anyway) — confirm the create path enforces that, otherwise
+ * delete would touch different columns than lookup/update.
+ * NOTE(review): same num_row == 8 assumption as sketch_lookup_avx512.
+ */
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss, const void *key)
+{
+	uint32_t col[ss->num_row];
+	uint64_t *count_array = ss->table;
+	uint64_t min = UINT64_MAX;
+	uint32_t cur_row;
+
+	__m256i v_hash_result = rte_xxh64_sketch_avx512
+		(key, ss->key_len, *(__m512i *)ss->hash_seeds,
+		 RTE_ALIGN_FLOOR(ss->num_col, 32));
+	_mm256_storeu_si256((__m256i *)col, v_hash_result);
+
+	min = count_min(ss, col);
+
+	/* subtract the min value from all the counters */
+	for (cur_row = 0; cur_row < ss->num_row; cur_row++)
+		count_array[cur_row * ss->num_col + col[cur_row]] -= min;
+}
diff --git a/lib/member/rte_member_sketch_avx512.h b/lib/member/rte_member_sketch_avx512.h
new file mode 100644
index 0000000000..e7c25da643
--- /dev/null
+++ b/lib/member/rte_member_sketch_avx512.h
@@ -0,0 +1,36 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_MEMBER_SKETCH_AVX512_H_
+#define _RTE_MEMBER_SKETCH_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_vect.h>
+#include "rte_member.h"
+#include "rte_member_sketch.h"
+#include "rte_xxh64_avx512.h"
+
+/* number of sketch rows processed per vector op (8 x 64-bit lanes) */
+#define NUM_ROW_VEC 8
+
+/* AVX512 backends for the ss->sketch_update/lookup/delete callbacks */
+void
+sketch_update_avx512(const struct rte_member_setsum *ss,
+		     const void *key,
+		     uint32_t count);
+
+uint64_t
+sketch_lookup_avx512(const struct rte_member_setsum *ss,
+		     const void *key);
+
+void
+sketch_delete_avx512(const struct rte_member_setsum *ss,
+		     const void *key);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMBER_SKETCH_AVX512_H_ */
diff --git a/lib/member/rte_xxh64_avx512.h b/lib/member/rte_xxh64_avx512.h
new file mode 100644
index 0000000000..574748fc38
--- /dev/null
+++ b/lib/member/rte_xxh64_avx512.h
@@ -0,0 +1,117 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_XXH64_AVX512_H_
+#define _RTE_XXH64_AVX512_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <immintrin.h>
+
+/* XXH64 prime constants, as in the reference xxHash implementation */
+/* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
+/* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
+/* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
+/* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
+/* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
+
+/*
+ * Vectorized XXH64 round.
+ * NOTE(review): uses VPMADD52 (low 52-bit multiply-accumulate, AVX512
+ * IFMA) rather than a full 64x64 multiply, so the output differs from
+ * canonical XXH64 — presumably an intentional speed/quality trade-off
+ * since the sketch only needs a uniform hash; confirm.
+ */
+static __rte_always_inline  __m512i
+xxh64_round_avx512(__m512i hash, __m512i input)
+{
+	hash = _mm512_madd52lo_epu64(hash,
+			input,
+			_mm512_set1_epi64(PRIME64_2));
+
+	hash = _mm512_rol_epi64(hash, 31);
+
+	return hash;
+}
+
+/*
+ * Final mix (avalanche) step.
+ * NOTE(review): this is only the first xor-shift of the canonical XXH64
+ * avalanche (which also multiplies by PRIME64_2/PRIME64_3 and shifts by
+ * 29 and 32) — appears to be a deliberate simplification; confirm the
+ * resulting distribution is acceptable for the sketch.
+ */
+static __rte_always_inline  __m512i
+xxh64_fmix_avx512(__m512i hash)
+{
+	hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));
+
+	return hash;
+}
+
+/*
+ * Compute 8 XXH64-style hashes of @key in parallel, one per 64-bit seed
+ * lane of @v_seed, and reduce each to a column index in [0, modulo).
+ * @modulo must be a power of two (see the masking note at the end).
+ * Returns the 8 column indices as 32-bit lanes.
+ */
+static __rte_always_inline __m256i
+rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
+			__m512i v_seed, uint32_t modulo)
+{
+	__m512i v_prime64_5, v_hash;
+	size_t remaining = key_len;
+	size_t offset = 0;
+	__m512i input;
+
+	/* seed-dependent initial state, per the XXH64 short-input path */
+	v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
+	v_hash = _mm512_add_epi64
+			(_mm512_add_epi64(v_seed, v_prime64_5),
+			 _mm512_set1_epi64(key_len));
+
+	/* consume the key 8 bytes at a time */
+	while (remaining >= 8) {
+		input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+				xxh64_round_avx512(_mm512_setzero_si512(), input));
+		v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
+				_mm512_rol_epi64(v_hash, 27),
+				_mm512_set1_epi64(PRIME64_1));
+
+		remaining -= 8;
+		offset += 8;
+	}
+
+	/* then a 4-byte chunk, if present */
+	if (remaining >= 4) {
+		input = _mm512_set1_epi64
+			(*(uint32_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+			_mm512_mullo_epi64(input,
+				_mm512_set1_epi64(PRIME64_1)));
+		v_hash = _mm512_madd52lo_epu64
+				(_mm512_set1_epi64(PRIME64_3),
+				_mm512_rol_epi64(v_hash, 23),
+				_mm512_set1_epi64(PRIME64_2));
+
+		offset += 4;
+		remaining -= 4;
+	}
+
+	/* finally, the remaining tail bytes one at a time */
+	while (remaining != 0) {
+		input = _mm512_set1_epi64
+			(*(uint8_t *)RTE_PTR_ADD(key, offset));
+		v_hash = _mm512_xor_epi64(v_hash,
+			_mm512_mullo_epi64(input,
+				_mm512_set1_epi64(PRIME64_5)));
+		v_hash = _mm512_mullo_epi64
+			(_mm512_rol_epi64(v_hash, 11),
+			_mm512_set1_epi64(PRIME64_1));
+		offset++;
+		remaining--;
+	}
+
+	v_hash = xxh64_fmix_avx512(v_hash);
+
+	/*
+	 * Theoretically, such modular operations can be replaced by
+	 * _mm512_rem_epi64(), but that seems to depend on the compiler's
+	 * implementation (it is an SVML function). Hence the limitation
+	 * that the modulo value must be a power of 2, so a bitwise AND
+	 * with (modulo - 1) can be used instead.
+	 */
+	__m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));
+
+	return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, v_hash_remainder));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_XXH64_AVX512_H_ */