[dpdk-dev,04/13] baseband/turbo_sw: memcpy changed or removed from driver

Message ID 20180426133008.12388-4-kamilx.chalupnik@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Pablo de Lara Guarch
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Kamil Chalupnik April 26, 2018, 1:29 p.m. UTC
  From: KamilX Chalupnik <kamilx.chalupnik@intel.com>

Optimization of Turbo Software driver:
- usage of memcpy changed or removed
- minor changes in defines definitions

Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
 drivers/baseband/turbo_sw/bbdev_turbo_software.c | 143 +++++++++++++----------
 lib/librte_bbdev/rte_bbdev_op.h                  |  18 ++-
 2 files changed, 100 insertions(+), 61 deletions(-)
  

Comments

De Lara Guarch, Pablo May 7, 2018, 12:26 p.m. UTC | #1
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Thursday, April 26, 2018 2:30 PM
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>; Chalupnik, KamilX
> <kamilx.chalupnik@intel.com>
> Subject: [PATCH 04/13] baseband/turbo_sw: memcpy changed or removed from
> driver
> 

It is not clear what this patch is trying to do, based on the title?
Minimizing memory copying? Please, modify the title to be more clear.

> From: KamilX Chalupnik <kamilx.chalupnik@intel.com>
> 
> Optimization of Turbo Software driver:
> - usage of memcpy changed or removed
> - minor changes in defines definitions

Could you split this patch in two patches? One with the macro changes
and the other one making changes on the memory copying?

Also, do those macros need to be in bbdev layer?
Will they be used in other PMDs?

> 
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---
>  drivers/baseband/turbo_sw/bbdev_turbo_software.c | 143 +++++++++++++-----
> -----
>  lib/librte_bbdev/rte_bbdev_op.h                  |  18 ++-
>  2 files changed, 100 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> index 26b8560..2728b30 100644
> --- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> +++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c

...

> -		/* copy the input to the temporary buffer to be able to extend
> -		 * it by 3 CRC bytes
> -		 */
> -		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
>  		crc_req.data = in;
>  		crc_req.len = (k - 24) >> 3;
> -		crc_resp.data = q->enc_in;
> -		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
> +		/* Check if there is a room for CRC bits if not use
> +		 * the temporary buffer.
> +		 */

Check if there is room for CRC bits. If not....

> +		if (rte_pktmbuf_append(m_in, 3) == NULL) {
> +			rte_memcpy(q->enc_in, in, (k - 24) >> 3);
> +			in = q->enc_in;
> +		} else {
> +			/* Store 3 first bytes of next CB as they will be
> +			 * overwritten by CRC bytes. If it is the last CB then
> +			 * there is no point to store 3 next bytes and this
> +			 * if..else branch will be omitted.
> +			 */
> +			first_3_bytes = *((uint64_t *)&in[(k - 32) >> 3]);
> 

...

> -		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
>  		crc_req.data = in;
>  		crc_req.len = (k - 24) >> 3;
> -		crc_resp.data = q->enc_in;
> -		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
> +		/* Check if there is a room for CRC bits if this is the last
> +		 * CB in TB. If not use temporary buffer.

Same as above.

> +		 */
  

Patch

diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 26b8560..2728b30 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -32,12 +32,6 @@  static int bbdev_turbo_sw_logtype;
 	rte_bbdev_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
 		##__VA_ARGS__)
 
-/* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */
-#define C_SUBBLOCK (32)
-#define MAX_TB_SIZE (391656)
-#define MAX_CB_SIZE (6144)
-#define MAX_KW (18528)
-
 /* private data structure */
 struct bbdev_private {
 	unsigned int max_nb_queues;  /**< Max number of queues */
@@ -90,7 +84,7 @@  compute_idx(uint16_t k)
 {
 	int32_t result = 0;
 
-	if (k < 40 || k > MAX_CB_SIZE)
+	if (k < RTE_BBDEV_MIN_CB_SIZE || k > RTE_BBDEV_MAX_CB_SIZE)
 		return -1;
 
 	if (k > 2048) {
@@ -234,7 +228,8 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->enc_out = rte_zmalloc_socket(name,
-			((MAX_TB_SIZE >> 3) + 3) * sizeof(*q->enc_out) * 3,
+			((RTE_BBDEV_MAX_TB_SIZE >> 3) + 3) *
+			sizeof(*q->enc_out) * 3,
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->enc_out == NULL) {
 		rte_bbdev_log(ERR,
@@ -253,7 +248,7 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->enc_in = rte_zmalloc_socket(name,
-			(MAX_CB_SIZE >> 3) * sizeof(*q->enc_in),
+			(RTE_BBDEV_MAX_CB_SIZE >> 3) * sizeof(*q->enc_in),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->enc_in == NULL) {
 		rte_bbdev_log(ERR,
@@ -271,7 +266,7 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->ag = rte_zmalloc_socket(name,
-			MAX_CB_SIZE * 10 * sizeof(*q->ag),
+			RTE_BBDEV_MAX_CB_SIZE * 10 * sizeof(*q->ag),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->ag == NULL) {
 		rte_bbdev_log(ERR,
@@ -308,7 +303,7 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->deint_input = rte_zmalloc_socket(name,
-			MAX_KW * sizeof(*q->deint_input),
+			RTE_BBDEV_MAX_KW * sizeof(*q->deint_input),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->deint_input == NULL) {
 		rte_bbdev_log(ERR,
@@ -327,7 +322,7 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->deint_output = rte_zmalloc_socket(NULL,
-			MAX_KW * sizeof(*q->deint_output),
+			RTE_BBDEV_MAX_KW * sizeof(*q->deint_output),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->deint_output == NULL) {
 		rte_bbdev_log(ERR,
@@ -346,7 +341,7 @@  q_setup(struct rte_bbdev *dev, uint16_t q_id,
 		return -ENAMETOOLONG;
 	}
 	q->adapter_output = rte_zmalloc_socket(NULL,
-			MAX_CB_SIZE * 6 * sizeof(*q->adapter_output),
+			RTE_BBDEV_MAX_CB_SIZE * 6 * sizeof(*q->adapter_output),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->adapter_output == NULL) {
 		rte_bbdev_log(ERR,
@@ -414,9 +409,9 @@  is_enc_input_valid(const uint16_t k, const int32_t k_idx,
 		return -1;
 	}
 
-	if (k > MAX_CB_SIZE) {
+	if (k > RTE_BBDEV_MAX_CB_SIZE) {
 		rte_bbdev_log(ERR, "CB size (%u) is too big, max: %d",
-				k, MAX_CB_SIZE);
+				k, RTE_BBDEV_MAX_CB_SIZE);
 		return -1;
 	}
 
@@ -441,9 +436,9 @@  is_dec_input_valid(int32_t k_idx, int16_t kw, int16_t in_length)
 		return -1;
 	}
 
-	if (kw > MAX_KW) {
+	if (kw > RTE_BBDEV_MAX_KW) {
 		rte_bbdev_log(ERR, "Input length (%u) is too big, max: %d",
-				kw, MAX_KW);
+				kw, RTE_BBDEV_MAX_KW);
 		return -1;
 	}
 
@@ -452,7 +447,7 @@  is_dec_input_valid(int32_t k_idx, int16_t kw, int16_t in_length)
 
 static inline void
 process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
-		uint8_t cb_idx, uint8_t c, uint16_t k, uint16_t ncb,
+		uint8_t r, uint8_t c, uint16_t k, uint16_t ncb,
 		uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out,
 		uint16_t in_offset, uint16_t out_offset, uint16_t total_left)
 {
@@ -460,6 +455,7 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	int16_t k_idx;
 	uint16_t m;
 	uint8_t *in, *out0, *out1, *out2, *tmp_out, *rm_out;
+	uint64_t first_3_bytes = 0;
 	struct rte_bbdev_op_turbo_enc *enc = &op->turbo_enc;
 	struct bblib_crc_request crc_req;
 	struct bblib_crc_response crc_resp;
@@ -479,16 +475,25 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 			return;
 		}
-		/* copy the input to the temporary buffer to be able to extend
-		 * it by 3 CRC bytes
-		 */
-		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
 		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		crc_resp.data = q->enc_in;
-		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
+		/* Check if there is a room for CRC bits if not use
+		 * the temporary buffer.
+		 */
+		if (rte_pktmbuf_append(m_in, 3) == NULL) {
+			rte_memcpy(q->enc_in, in, (k - 24) >> 3);
+			in = q->enc_in;
+		} else {
+			/* Store 3 first bytes of next CB as they will be
+			 * overwritten by CRC bytes. If it is the last CB then
+			 * there is no point to store 3 next bytes and this
+			 * if..else branch will be omitted.
+			 */
+			first_3_bytes = *((uint64_t *)&in[(k - 32) >> 3]);
+		}
 
-		in = q->enc_in;
+		crc_resp.data = in;
+		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
 		ret = is_enc_input_valid(k - 24, k_idx, total_left);
@@ -496,16 +501,25 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 			return;
 		}
-		/* copy the input to the temporary buffer to be able to extend
-		 * it by 3 CRC bytes
-		 */
-		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
 		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		crc_resp.data = q->enc_in;
-		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
+		/* Check if there is a room for CRC bits if this is the last
+		 * CB in TB. If not use temporary buffer.
+		 */
+		if ((c - r == 1) && (rte_pktmbuf_append(m_in, 3) == NULL)) {
+			rte_memcpy(q->enc_in, in, (k - 24) >> 3);
+			in = q->enc_in;
+		} else if (c - r > 1) {
+			/* Store 3 first bytes of next CB as they will be
+			 * overwritten by CRC bytes. If it is the last CB then
+			 * there is no point to store 3 next bytes and this
+			 * if..else branch will be omitted.
+			 */
+			first_3_bytes = *((uint64_t *)&in[(k - 32) >> 3]);
+		}
 
-		in = q->enc_in;
+		crc_resp.data = in;
+		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
 	} else {
 		ret = is_enc_input_valid(k, k_idx, total_left);
 		if (ret != 0) {
@@ -519,10 +533,32 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	/* Each bit layer output from turbo encoder is (k+4) bits long, i.e.
 	 * input length + 4 tail bits. That's (k/8) + 1 bytes after rounding up.
 	 * So dst_data's length should be 3*(k/8) + 3 bytes.
+	 * In Rate-matching bypass case outputs pointers passed to encoder
+	 * (out0, out1 and out2) can directly point to addresses of output from
+	 * turbo_enc entity.
 	 */
-	out0 = q->enc_out;
-	out1 = RTE_PTR_ADD(out0, (k >> 3) + 1);
-	out2 = RTE_PTR_ADD(out1, (k >> 3) + 1);
+	if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) {
+		out0 = q->enc_out;
+		out1 = RTE_PTR_ADD(out0, (k >> 3) + 1);
+		out2 = RTE_PTR_ADD(out1, (k >> 3) + 1);
+	} else {
+		out0 = (uint8_t *)rte_pktmbuf_append(m_out, (k >> 3) * 3 + 2);
+		if (out0 == NULL) {
+			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+			rte_bbdev_log(ERR,
+					"Too little space in output mbuf");
+			return;
+		}
+		enc->output.length += (k >> 3) * 3 + 2;
+		/* rte_bbdev_op_data.offset can be different than the
+		 * offset of the appended bytes
+		 */
+		out0 = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset);
+		out1 = rte_pktmbuf_mtod_offset(m_out, uint8_t *,
+				out_offset + (k >> 3) + 1);
+		out2 = rte_pktmbuf_mtod_offset(m_out, uint8_t *,
+				out_offset + 2 * ((k >> 3) + 1));
+	}
 
 	turbo_req.case_id = k_idx;
 	turbo_req.input_win = in;
@@ -536,6 +572,10 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		return;
 	}
 
+	/* Restore 3 first bytes of next CB if they were overwritten by CRC*/
+	if (first_3_bytes != 0)
+		*((uint64_t *)&in[(k - 32) >> 3]) = first_3_bytes;
+
 	/* Rate-matching */
 	if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) {
 		/* get output data starting address */
@@ -552,7 +592,7 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		rm_out = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset);
 
 		/* index of current code block */
-		rm_req.r = cb_idx;
+		rm_req.r = r;
 		/* total number of code block */
 		rm_req.C = c;
 		/* For DL - 1, UL - 0 */
@@ -613,23 +653,6 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 			tmp_out++;
 		}
 		*tmp_out = 0;
-
-		/* copy shifted output to turbo_enc entity */
-		out0 = (uint8_t *)rte_pktmbuf_append(m_out,
-				(k >> 3) * 3 + 2);
-		if (out0 == NULL) {
-			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
-			rte_bbdev_log(ERR,
-					"Too little space in output mbuf");
-			return;
-		}
-		enc->output.length += (k >> 3) * 3 + 2;
-		/* rte_bbdev_op_data.offset can be different than the
-		 * offset of the appended bytes
-		 */
-		out0 = rte_pktmbuf_mtod_offset(m_out, uint8_t *,
-				out_offset);
-		rte_memcpy(out0, q->enc_out, (k >> 3) * 3 + 2);
 	}
 }
 
@@ -649,9 +672,9 @@  enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
 	/* Clear op status */
 	op->status = 0;
 
-	if (total_left > MAX_TB_SIZE >> 3) {
+	if (total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) {
 		rte_bbdev_log(ERR, "TB size (%u) is too big, max: %d",
-				total_left, MAX_TB_SIZE);
+				total_left, RTE_BBDEV_MAX_TB_SIZE);
 		op->status = 1 << RTE_BBDEV_DATA_ERROR;
 		return;
 	}
@@ -736,11 +759,11 @@  remove_nulls_from_circular_buf(const uint8_t *in, uint8_t *out, uint16_t k,
 	const uint32_t d = k + 4;
 	const uint32_t kw = (ncb / 3);
 	const uint32_t nd = kw - d;
-	const uint32_t r_subblock = kw / C_SUBBLOCK;
+	const uint32_t r_subblock = kw / RTE_BBDEV_C_SUBBLOCK;
 	/* Inter-column permutation pattern */
-	const uint32_t P[C_SUBBLOCK] = {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10,
-			26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19,
-			11, 27, 7, 23, 15, 31};
+	const uint32_t P[RTE_BBDEV_C_SUBBLOCK] = {0, 16, 8, 24, 4, 20, 12, 28,
+			2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13,
+			29, 3, 19, 11, 27, 7, 23, 15, 31};
 	in_idx = 0;
 	out_idx = 0;
 
@@ -955,7 +978,7 @@  enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op)
 		 * where D is the size of each output from turbo encoder block
 		 * (k + 4).
 		 */
-		kw = RTE_ALIGN_CEIL(k + 4, C_SUBBLOCK) * 3;
+		kw = RTE_ALIGN_CEIL(k + 4, RTE_BBDEV_C_SUBBLOCK) * 3;
 
 		process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
 				out_offset, check_bit(dec->op_flags,
diff --git a/lib/librte_bbdev/rte_bbdev_op.h b/lib/librte_bbdev/rte_bbdev_op.h
index 9a80c64..1a80588 100644
--- a/lib/librte_bbdev/rte_bbdev_op.h
+++ b/lib/librte_bbdev/rte_bbdev_op.h
@@ -25,7 +25,23 @@  extern "C" {
 #include <rte_memory.h>
 #include <rte_mempool.h>
 
-#define RTE_BBDEV_MAX_CODE_BLOCKS 64
+/* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */
+#define RTE_BBDEV_C_SUBBLOCK (32)
+/* Maximum size of Transport Block (36.213, Table, Table 7.1.7.2.5-1) */
+#define RTE_BBDEV_MAX_TB_SIZE (391656)
+/* Maximum size of Code Block (36.212, Table 5.1.3-3) */
+#define RTE_BBDEV_MAX_CB_SIZE (6144)
+/* Minimum size of Code Block (36.212, Table 5.1.3-3) */
+#define RTE_BBDEV_MIN_CB_SIZE (40)
+/* Maximum size of circular buffer */
+#define RTE_BBDEV_MAX_KW (18528)
+/*
+ * Maximum number of Code Blocks in Transport Block. It is calculated based on
+ * maximum size of one Code Block and one Transport Block (considering CRC24A
+ * and CRC24B):
+ * (391656 + 24) / (6144 - 24) = 64
+ */
+#define RTE_BBDEV_MAX_CODE_BLOCKS (64)
 
 /** Flags for turbo decoder operation and capability structure */
 enum rte_bbdev_op_td_flag_bitmasks {