Patchwork [dpdk-dev] baseband: enhancement of offload cost test

login
register
mail settings
Submitter KamilX Chalupnik
Date Dec. 6, 2018, 12:03 p.m.
Message ID <20181206120317.16156-1-kamilx.chalupnik@intel.com>
Download mbox | patch
Permalink /patch/673963/
State New
Headers show

Comments

KamilX Chalupnik - Dec. 6, 2018, 12:03 p.m.
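The offload cost test was improved in order to collect
more accurate results:
- test-bbdev frees operations after they are dequeued instead of
  right after they are enqueued;
- the 'offload_time' statistic is renamed to 'acc_offload_cycles';
- the turbo_sw driver also accounts accelerator cycles on the
  decode path;
- RTE_BBDEV_OFFLOAD_COST is enabled by default.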

Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
 app/test-bbdev/test_bbdev_perf.c                 | 154 +++++++++++------------
 config/common_base                               |   2 +-
 drivers/baseband/turbo_sw/bbdev_turbo_software.c |  70 ++++++++---
 lib/librte_bbdev/rte_bbdev.h                     |   9 +-
 4 files changed, 136 insertions(+), 99 deletions(-)
Amr Mokhtar - Dec. 13, 2018, 8:12 p.m.
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Thursday 6 December 2018 12:03
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
> Subject: [PATCH] baseband: enhancement of offload cost test
> 
> Offload cost test was improved in order to collect
> more accurate results.
> 
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---

Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>

Patch

diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index fbe6cc9..21be574 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -88,19 +88,19 @@  struct thread_params {
 /* Stores time statistics */
 struct test_time_stats {
 	/* Stores software enqueue total working time */
-	uint64_t enq_sw_tot_time;
+	uint64_t enq_sw_total_time;
 	/* Stores minimum value of software enqueue working time */
 	uint64_t enq_sw_min_time;
 	/* Stores maximum value of software enqueue working time */
 	uint64_t enq_sw_max_time;
 	/* Stores turbo enqueue total working time */
-	uint64_t enq_tur_tot_time;
-	/* Stores minimum value of turbo enqueue working time */
-	uint64_t enq_tur_min_time;
-	/* Stores maximum value of turbo enqueue working time */
-	uint64_t enq_tur_max_time;
+	uint64_t enq_acc_total_time;
+	/* Stores minimum value of accelerator enqueue working time */
+	uint64_t enq_acc_min_time;
+	/* Stores maximum value of accelerator enqueue working time */
+	uint64_t enq_acc_max_time;
 	/* Stores dequeue total working time */
-	uint64_t deq_tot_time;
+	uint64_t deq_total_time;
 	/* Stores minimum value of dequeue working time */
 	uint64_t deq_min_time;
 	/* Stores maximum value of dequeue working time */
@@ -1200,12 +1200,15 @@  typedef int (test_case_function)(struct active_device *ad,
 	burst_sz = tp->op_params->burst_sz;
 	num_to_process = tp->op_params->num_to_process;
 
-	if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+	if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
 		deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
 				burst_sz);
-	else
+		rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+	} else {
 		deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
 				burst_sz);
+		rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+	}
 
 	if (deq < burst_sz) {
 		printf(
@@ -1316,8 +1319,6 @@  typedef int (test_case_function)(struct active_device *ad,
 
 		enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
 				num_to_enq);
-
-		rte_bbdev_dec_op_free_bulk(ops, num_to_enq);
 	}
 
 	if (allocs_failed > 0)
@@ -1380,8 +1381,6 @@  typedef int (test_case_function)(struct active_device *ad,
 
 		enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
 				num_to_enq);
-
-		rte_bbdev_enc_op_free_bulk(ops, num_to_enq);
 	}
 
 	if (allocs_failed > 0)
@@ -1575,13 +1574,14 @@  typedef int (test_case_function)(struct active_device *ad,
 	RTE_LCORE_FOREACH(lcore_id) {
 		if (iter++ >= used_cores)
 			break;
-		printf("\tlcore_id: %u, throughput: %.8lg MOPS, %.8lg Mbps\n",
-		lcore_id, t_params[lcore_id].mops, t_params[lcore_id].mbps);
+		printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
+				lcore_id, t_params[lcore_id].mops,
+				t_params[lcore_id].mbps);
 		total_mops += t_params[lcore_id].mops;
 		total_mbps += t_params[lcore_id].mbps;
 	}
 	printf(
-		"\n\tTotal stats for %u cores: throughput: %.8lg MOPS, %.8lg Mbps\n",
+		"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
 		used_cores, total_mops, total_mbps);
 }
 
@@ -1609,7 +1609,7 @@  typedef int (test_case_function)(struct active_device *ad,
 			test_vector.op_type);
 
 	printf(
-		"Throughput test: dev: %s, nb_queues: %u, burst size: %u, num ops: %u, num_lcores: %u, op type: %s, int mode: %s, GHz: %lg\n",
+		"\nThroughput test: dev: %s, nb_queues: %u, burst size: %u, num ops: %u, num_lcores: %u, op type: %s, int mode: %s, GHz: %lg\n",
 			info.dev_name, ad->nb_queues, op_params->burst_sz,
 			op_params->num_to_process, op_params->num_lcores,
 			op_type_str,
@@ -1882,7 +1882,7 @@  typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Validation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nValidation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -1899,10 +1899,10 @@  typedef int (test_case_function)(struct active_device *ad,
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\toperation latency:\n"
-			"\t\tavg latency: %lg cycles, %lg us\n"
-			"\t\tmin latency: %lg cycles, %lg us\n"
-			"\t\tmax latency: %lg cycles, %lg us\n",
+	printf("Operation latency:\n"
+			"\tavg latency: %lg cycles, %lg us\n"
+			"\tmin latency: %lg cycles, %lg us\n"
+			"\tmax latency: %lg cycles, %lg us\n",
 			(double)total_time / (double)iter,
 			(double)(total_time * 1000000) / (double)iter /
 			(double)rte_get_tsc_hz(), (double)min_time,
@@ -1930,7 +1930,7 @@  typedef int (test_case_function)(struct active_device *ad,
 	stats->dequeued_count = q_stats->dequeued_count;
 	stats->enqueue_err_count = q_stats->enqueue_err_count;
 	stats->dequeue_err_count = q_stats->dequeue_err_count;
-	stats->offload_time = q_stats->offload_time;
+	stats->acc_offload_cycles = q_stats->acc_offload_cycles;
 
 	return 0;
 }
@@ -1974,18 +1974,18 @@  typedef int (test_case_function)(struct active_device *ad,
 				queue_id, dev_id);
 
 		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
-				stats.offload_time;
+				stats.acc_offload_cycles;
 		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
 				enq_sw_last_time);
 		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
 				enq_sw_last_time);
-		time_st->enq_sw_tot_time += enq_sw_last_time;
+		time_st->enq_sw_total_time += enq_sw_last_time;
 
-		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
-				stats.offload_time);
-		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
-				stats.offload_time);
-		time_st->enq_tur_tot_time += stats.offload_time;
+		time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
@@ -2003,7 +2003,7 @@  typedef int (test_case_function)(struct active_device *ad,
 				deq_last_time);
 		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
 				deq_last_time);
-		time_st->deq_tot_time += deq_last_time;
+		time_st->deq_total_time += deq_last_time;
 
 		/* Dequeue remaining operations if needed*/
 		while (burst_sz != deq)
@@ -2055,18 +2055,18 @@  typedef int (test_case_function)(struct active_device *ad,
 				queue_id, dev_id);
 
 		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
-				stats.offload_time;
+				stats.acc_offload_cycles;
 		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
 				enq_sw_last_time);
 		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
 				enq_sw_last_time);
-		time_st->enq_sw_tot_time += enq_sw_last_time;
+		time_st->enq_sw_total_time += enq_sw_last_time;
 
-		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
-				stats.offload_time);
-		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
-				stats.offload_time);
-		time_st->enq_tur_tot_time += stats.offload_time;
+		time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
@@ -2084,7 +2084,7 @@  typedef int (test_case_function)(struct active_device *ad,
 				deq_last_time);
 		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
 				deq_last_time);
-		time_st->deq_tot_time += deq_last_time;
+		time_st->deq_total_time += deq_last_time;
 
 		while (burst_sz != deq)
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
@@ -2121,7 +2121,7 @@  typedef int (test_case_function)(struct active_device *ad,
 
 	memset(&time_st, 0, sizeof(struct test_time_stats));
 	time_st.enq_sw_min_time = UINT64_MAX;
-	time_st.enq_tur_min_time = UINT64_MAX;
+	time_st.enq_acc_min_time = UINT64_MAX;
 	time_st.deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2134,7 +2134,7 @@  typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Offload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nOffload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -2149,36 +2149,36 @@  typedef int (test_case_function)(struct active_device *ad,
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tenq offload cost latency:\n"
-			"\t\tsoftware avg %lg cycles, %lg us\n"
-			"\t\tsoftware min %lg cycles, %lg us\n"
-			"\t\tsoftware max %lg cycles, %lg us\n"
-			"\t\tturbo avg %lg cycles, %lg us\n"
-			"\t\tturbo min %lg cycles, %lg us\n"
-			"\t\tturbo max %lg cycles, %lg us\n",
-			(double)time_st.enq_sw_tot_time / (double)iter,
-			(double)(time_st.enq_sw_tot_time * 1000000) /
+	printf("Enqueue offload cost latency:\n"
+			"\tDriver offload avg %lg cycles, %lg us\n"
+			"\tDriver offload min %lg cycles, %lg us\n"
+			"\tDriver offload max %lg cycles, %lg us\n"
+			"\tAccelerator offload avg %lg cycles, %lg us\n"
+			"\tAccelerator offload min %lg cycles, %lg us\n"
+			"\tAccelerator offload max %lg cycles, %lg us\n",
+			(double)time_st.enq_sw_total_time / (double)iter,
+			(double)(time_st.enq_sw_total_time * 1000000) /
 			(double)iter / (double)rte_get_tsc_hz(),
 			(double)time_st.enq_sw_min_time,
 			(double)(time_st.enq_sw_min_time * 1000000) /
 			rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
 			(double)(time_st.enq_sw_max_time * 1000000) /
-			rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+			rte_get_tsc_hz(), (double)time_st.enq_acc_total_time /
 			(double)iter,
-			(double)(time_st.enq_tur_tot_time * 1000000) /
+			(double)(time_st.enq_acc_total_time * 1000000) /
 			(double)iter / (double)rte_get_tsc_hz(),
-			(double)time_st.enq_tur_min_time,
-			(double)(time_st.enq_tur_min_time * 1000000) /
-			rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
-			(double)(time_st.enq_tur_max_time * 1000000) /
+			(double)time_st.enq_acc_min_time,
+			(double)(time_st.enq_acc_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_acc_max_time,
+			(double)(time_st.enq_acc_max_time * 1000000) /
 			rte_get_tsc_hz());
 
-	printf("\tdeq offload cost latency - one op:\n"
-			"\t\tavg %lg cycles, %lg us\n"
-			"\t\tmin %lg cycles, %lg us\n"
-			"\t\tmax %lg cycles, %lg us\n",
-			(double)time_st.deq_tot_time / (double)iter,
-			(double)(time_st.deq_tot_time * 1000000) /
+	printf("Dequeue offload cost latency - one op:\n"
+			"\tavg %lg cycles, %lg us\n"
+			"\tmin %lg cycles, %lg us\n"
+			"\tmax %lg cycles, %lg us\n",
+			(double)time_st.deq_total_time / (double)iter,
+			(double)(time_st.deq_total_time * 1000000) /
 			(double)iter / (double)rte_get_tsc_hz(),
 			(double)time_st.deq_min_time,
 			(double)(time_st.deq_min_time * 1000000) /
@@ -2194,7 +2194,7 @@  typedef int (test_case_function)(struct active_device *ad,
 static int
 offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_total_time, uint64_t *deq_min_time,
 		uint64_t *deq_max_time)
 {
 	int i, deq_total;
@@ -2214,7 +2214,7 @@  typedef int (test_case_function)(struct active_device *ad,
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
 		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
-		*deq_tot_time += deq_last_time;
+		*deq_total_time += deq_last_time;
 	}
 
 	return i;
@@ -2223,7 +2223,7 @@  typedef int (test_case_function)(struct active_device *ad,
 static int
 offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_total_time, uint64_t *deq_min_time,
 		uint64_t *deq_max_time)
 {
 	int i, deq_total;
@@ -2242,7 +2242,7 @@  typedef int (test_case_function)(struct active_device *ad,
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
 		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
-		*deq_tot_time += deq_last_time;
+		*deq_total_time += deq_last_time;
 	}
 
 	return i;
@@ -2261,7 +2261,7 @@  typedef int (test_case_function)(struct active_device *ad,
 	return TEST_SKIPPED;
 #else
 	int iter;
-	uint64_t deq_tot_time, deq_min_time, deq_max_time;
+	uint64_t deq_total_time, deq_min_time, deq_max_time;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2269,7 +2269,7 @@  typedef int (test_case_function)(struct active_device *ad,
 	struct rte_bbdev_info info;
 	const char *op_type_str;
 
-	deq_tot_time = deq_max_time = 0;
+	deq_total_time = deq_max_time = 0;
 	deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2281,27 +2281,27 @@  typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Offload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nOffload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_tot_time,
+				num_to_process, burst_sz, &deq_total_time,
 				&deq_min_time, &deq_max_time);
 	else
 		iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_tot_time,
+				num_to_process, burst_sz, &deq_total_time,
 				&deq_min_time, &deq_max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tempty deq offload\n"
-			"\t\tavg. latency: %lg cycles, %lg us\n"
-			"\t\tmin. latency: %lg cycles, %lg us\n"
-			"\t\tmax. latency: %lg cycles, %lg us\n",
-			(double)deq_tot_time / (double)iter,
-			(double)(deq_tot_time * 1000000) / (double)iter /
+	printf("Empty dequeue offload\n"
+			"\tavg. latency: %lg cycles, %lg us\n"
+			"\tmin. latency: %lg cycles, %lg us\n"
+			"\tmax. latency: %lg cycles, %lg us\n",
+			(double)deq_total_time / (double)iter,
+			(double)(deq_total_time * 1000000) / (double)iter /
 			(double)rte_get_tsc_hz(), (double)deq_min_time,
 			(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
 			(double)deq_max_time, (double)(deq_max_time * 1000000) /
diff --git a/config/common_base b/config/common_base
index d12ae98..3ff98bb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -481,7 +481,7 @@  CONFIG_RTE_PMD_PACKET_PREFETCH=y
 #
 CONFIG_RTE_LIBRTE_BBDEV=y
 CONFIG_RTE_BBDEV_MAX_DEVS=128
-CONFIG_RTE_BBDEV_OFFLOAD_COST=n
+CONFIG_RTE_BBDEV_OFFLOAD_COST=y
 
 #
 # Compile PMD for NULL bbdev device
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 8ceb276..57f6ba1 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -510,9 +510,10 @@  struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		start_time = rte_rdtsc_precise();
 #endif
+		/* CRC24A generation */
 		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->offload_time += rte_rdtsc_precise() - start_time;
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
@@ -542,9 +543,10 @@  struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		start_time = rte_rdtsc_precise();
 #endif
+		/* CRC24B generation */
 		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->offload_time += rte_rdtsc_precise() - start_time;
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 	} else {
 		ret = is_enc_input_valid(k, k_idx, total_left);
@@ -596,15 +598,14 @@  struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 	start_time = rte_rdtsc_precise();
 #endif
-
+	/* Turbo encoding */
 	if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
 		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 		rte_bbdev_log(ERR, "Turbo Encoder failed");
 		return;
 	}
-
 #ifdef RTE_BBDEV_OFFLOAD_COST
-	q_stats->offload_time += rte_rdtsc_precise() - start_time;
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 
 	/* Restore 3 first bytes of next CB if they were overwritten by CRC*/
@@ -671,23 +672,21 @@  struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		start_time = rte_rdtsc_precise();
 #endif
-
+		/* Rate-Matching */
 		if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
 			op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 			rte_bbdev_log(ERR, "Rate matching failed");
 			return;
 		}
+#ifdef RTE_BBDEV_OFFLOAD_COST
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 
 		/* SW fills an entire last byte even if E%8 != 0. Clear the
 		 * superfluous data bits for consistency with HW device.
 		 */
 		mask_id = (e & 7) >> 1;
 		rm_out[out_len - 1] &= mask_out[mask_id];
-
-#ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->offload_time += rte_rdtsc_precise() - start_time;
-#endif
-
 		enc->output.length += rm_resp.OutputLen;
 	} else {
 		/* Rate matching is bypassed */
@@ -798,7 +797,7 @@  struct turbo_sw_queue {
 {
 	uint16_t i;
 #ifdef RTE_BBDEV_OFFLOAD_COST
-	queue_stats->offload_time = 0;
+	queue_stats->acc_offload_cycles = 0;
 #endif
 
 	for (i = 0; i < nb_ops; ++i)
@@ -905,7 +904,8 @@  struct turbo_sw_queue {
 process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
 		uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
 		struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
-		bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left)
+		bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+		struct rte_bbdev_stats *q_stats)
 {
 	int ret;
 	int32_t k_idx;
@@ -917,6 +917,11 @@  struct turbo_sw_queue {
 	struct bblib_turbo_decoder_request turbo_req;
 	struct bblib_turbo_decoder_response turbo_resp;
 	struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	uint64_t start_time;
+#else
+	RTE_SET_USED(q_stats);
+#endif
 
 	k_idx = compute_idx(k);
 
@@ -942,7 +947,14 @@  struct turbo_sw_queue {
 		deint_req.pharqbuffer = q->deint_input;
 		deint_req.ncb = ncb_without_null;
 		deint_resp.pinteleavebuffer = q->deint_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+		start_time = rte_rdtsc_precise();
+#endif
 		bblib_deinterleave_ul(&deint_req, &deint_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 	} else
 		move_padding_bytes(in, q->deint_output, k, ncb);
 
@@ -961,7 +973,15 @@  struct turbo_sw_queue {
 	adapter_req.ncb = ncb_without_null;
 	adapter_req.pinteleavebuffer = adapter_input;
 	adapter_resp.pharqout = q->adapter_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	start_time = rte_rdtsc_precise();
+#endif
+	/* Turbo decode adaptation */
 	bblib_turbo_adapter_ul(&adapter_req, &adapter_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 
 	out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
 	if (out == NULL) {
@@ -986,12 +1006,20 @@  struct turbo_sw_queue {
 	turbo_resp.ag_buf = q->ag;
 	turbo_resp.cb_buf = q->code_block;
 	turbo_resp.output = out;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	start_time = rte_rdtsc_precise();
+#endif
+	/* Turbo decode */
 	iter_cnt = bblib_turbo_decoder(&turbo_req, &turbo_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 	dec->hard_output.length += (k >> 3);
 
 	if (iter_cnt > 0) {
 		/* Temporary solution for returned iter_count from SDK */
-		iter_cnt = (iter_cnt - 1) / 2;
+		iter_cnt = (iter_cnt - 1) >> 1;
 		dec->iter_count = RTE_MAX(iter_cnt, dec->iter_count);
 	} else {
 		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
@@ -1001,7 +1029,8 @@  struct turbo_sw_queue {
 }
 
 static inline void
-enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op)
+enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+		struct rte_bbdev_stats *queue_stats)
 {
 	uint8_t c, r = 0;
 	uint16_t kw, k = 0;
@@ -1053,7 +1082,7 @@  struct turbo_sw_queue {
 		process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
 				out_offset, check_bit(dec->op_flags,
 				RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
-				total_left);
+				total_left, queue_stats);
 		/* To keep CRC24 attached to end of Code block, use
 		 * RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag as it
 		 * removed by default once verified.
@@ -1075,12 +1104,15 @@  struct turbo_sw_queue {
 
 static inline uint16_t
 enqueue_dec_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_dec_op **ops,
-		uint16_t nb_ops)
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
 {
 	uint16_t i;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	queue_stats->acc_offload_cycles = 0;
+#endif
 
 	for (i = 0; i < nb_ops; ++i)
-		enqueue_dec_one_op(q, ops[i]);
+		enqueue_dec_one_op(q, ops[i], queue_stats);
 
 	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops, nb_ops,
 			NULL);
@@ -1112,7 +1144,7 @@  struct turbo_sw_queue {
 	struct turbo_sw_queue *q = queue;
 	uint16_t nb_enqueued = 0;
 
-	nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops);
+	nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops, &q_data->queue_stats);
 
 	q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
 	q_data->queue_stats.enqueued_count += nb_enqueued;
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 25ef409..da8cf07 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,8 +239,13 @@  struct rte_bbdev_stats {
 	uint64_t enqueue_err_count;
 	/** Total error count on operations dequeued */
 	uint64_t dequeue_err_count;
-	/** Offload time */
-	uint64_t offload_time;
+	/** CPU cycles consumed by the (HW/SW) accelerator device to offload
+	 *  the enqueue request to its internal queues.
+	 *  - For a HW device, this is the number of cycles consumed by the
+	 *     MMIO write
+	 *  - For a SW (vdev) device, this is the number of cycles spent processing the
+	 */
+	uint64_t acc_offload_cycles;
 };
 
 /**