Browse Source

tests: scheduler queue benchmarks

Implements a set of tests designed to show how the performance of the
three scheduler queue implementations (DUMB, SCALABLE and MULTIQ)
varies with respect to the number of threads in the ready queue.

Signed-off-by: Peter Mitsis <peter.mitsis@intel.com>
pull/79524/head
Peter Mitsis 10 months ago committed by Anas Nashif
parent
commit
318b49570a
  1. 10
      kernel/sched.c
  2. 12
      tests/benchmarks/sched_queues/CMakeLists.txt
  3. 30
      tests/benchmarks/sched_queues/Kconfig
  4. 21
      tests/benchmarks/sched_queues/README.rst
  5. 31
      tests/benchmarks/sched_queues/prj.conf
  6. 4
      tests/benchmarks/sched_queues/prj.verbose.conf
  7. 328
      tests/benchmarks/sched_queues/src/main.c
  8. 54
      tests/benchmarks/sched_queues/src/utils.h
  9. 25
      tests/benchmarks/sched_queues/testcase.yaml

10
kernel/sched.c

@ -1599,3 +1599,13 @@ int z_sched_waitq_walk(_wait_q_t *wait_q, @@ -1599,3 +1599,13 @@ int z_sched_waitq_walk(_wait_q_t *wait_q,
return status;
}
/* This routine exists for benchmarking purposes. It is not used in
 * general production code.
 */
void z_unready_thread(struct k_thread *thread)
{
	/* Hold the scheduler spinlock so the ready queue cannot be
	 * modified concurrently while the thread is pulled off of it.
	 */
	K_SPINLOCK(&_sched_spinlock) {
		unready_thread(thread);
	}
}

12
tests/benchmarks/sched_queues/CMakeLists.txt

@ -0,0 +1,12 @@ @@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20.0)
find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
project(sched_queues)
FILE(GLOB app_sources src/*.c)
target_sources(app PRIVATE ${app_sources})
target_include_directories(app PRIVATE
${ZEPHYR_BASE}/kernel/include
${ZEPHYR_BASE}/arch/${ARCH}/include
)

30
tests/benchmarks/sched_queues/Kconfig

@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
# Copyright (c) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
mainmenu "Scheduler Queue Benchmark"
source "Kconfig.zephyr"
config BENCHMARK_NUM_ITERATIONS
int "Number of iterations to gather data"
default 1000
help
This option specifies the number of times each test will be executed
before calculating the average times for reporting.
config BENCHMARK_NUM_THREADS
int "Number of threads"
default 100
help
This option specifies the maximum number of threads that the test
will add to the ready queue. Increasing this value will place greater
stress on the ready queue and better highlight the performance
differences as the number of threads in the ready queue changes.
config BENCHMARK_VERBOSE
bool "Display detailed results"
default y
help
This option displays the average time of all the iterations done for
each thread in the tests. This generates large amounts of output. To
analyze it, it is recommended to redirect the output to a file.

21
tests/benchmarks/sched_queues/README.rst

@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
Scheduling Queue Measurements
#############################
A Zephyr application developer may choose between three different scheduling
algorithms--dumb, scalable and multiq. These different algorithms have
different performance characteristics--characteristics that vary as the
number of ready threads increases. This benchmark can be used to help
determine which scheduling algorithm may best suit the developer's application.
This benchmark measures the following:
* Time to add threads of increasing priority to the ready queue
* Time to add threads of decreasing priority to the ready queue
* Time to remove highest priority thread from a wait queue
* Time to remove lowest priority thread from a wait queue
By default, these tests show the minimum, maximum, and averages of the measured
times. However, if the verbose option is enabled then the set of measured
times will be displayed. The following will build this project with verbose
support:
EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> <path to project>

31
tests/benchmarks/sched_queues/prj.conf

@ -0,0 +1,31 @@ @@ -0,0 +1,31 @@
# Default base configuration file
CONFIG_TEST=y
# eliminate timer interrupts during the benchmark
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
# We use irq_offload(), enable it
CONFIG_IRQ_OFFLOAD=y
# Reduce memory/code footprint
CONFIG_BT=n
CONFIG_FORCE_NO_ASSERT=y
CONFIG_TEST_HW_STACK_PROTECTION=n
# Disable HW Stack Protection (see #28664)
CONFIG_HW_STACK_PROTECTION=n
CONFIG_COVERAGE=n
# Disable system power management
CONFIG_PM=n
CONFIG_TIMING_FUNCTIONS=y
CONFIG_HEAP_MEM_POOL_SIZE=2048
CONFIG_APPLICATION_DEFINED_SYSCALL=y
# Disable time slicing
CONFIG_TIMESLICING=n
CONFIG_SPEED_OPTIMIZATIONS=y

4
tests/benchmarks/sched_queues/prj.verbose.conf

@ -0,0 +1,4 @@ @@ -0,0 +1,4 @@
# Extra configuration file to enable verbose reporting
# Use with EXTRA_CONF_FILE
CONFIG_BENCHMARK_VERBOSE=y

328
tests/benchmarks/sched_queues/src/main.c

@ -0,0 +1,328 @@ @@ -0,0 +1,328 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* @file
* This file contains the main testing module that invokes all the tests.
*/
#include <zephyr/kernel.h>
#include <zephyr/timestamp.h>
#include "utils.h"
#include <zephyr/tc_util.h>
#include <ksched.h>
#define TEST_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)
#define BUSY_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)

/* NOTE(review): presumably consumed by the benchmark timing helpers
 * (bench_test_init() is called in main) — confirm against timestamp.h.
 */
uint32_t tm_off;

/*
 * Warning! Most of the created threads in this test use the same stack!
 * This is done to reduce the memory footprint as having unique stacks
 * for hundreds or thousands of threads would require substantial memory.
 * We can get away with this approach as the threads sharing the same
 * stack will not be executing, even though they will be ready to run.
 */
static K_THREAD_STACK_DEFINE(test_stack, TEST_STACK_SIZE);

/* One busy thread (and private stack) per CPU other than the one
 * executing the benchmark.
 */
K_THREAD_STACK_ARRAY_DEFINE(busy_stack, CONFIG_MP_MAX_NUM_CPUS - 1, BUSY_STACK_SIZE);
static struct k_thread busy_thread[CONFIG_MP_MAX_NUM_CPUS - 1];

static struct k_thread test_thread[CONFIG_BENCHMARK_NUM_THREADS];

/* Per-thread cycle counts, accumulated across all benchmark iterations */
static uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
static uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];

/* Benchmark-only hook implemented in kernel/sched.c */
extern void z_unready_thread(struct k_thread *thread);
/* Spin forever. Used to keep the other CPUs occupied so that only the
 * benchmarking CPU manipulates the ready queue.
 */
static void busy_entry(void *p1, void *p2, void *p3)
{
	ARG_UNUSED(p1);
	ARG_UNUSED(p2);
	ARG_UNUSED(p3);

	for (;;) {
		/* Deliberately empty */
	}
}
/**
* The test entry routine is not expected to execute.
*/
static void test_entry(void *p1, void *p2, void *p3)
{
ARG_UNUSED(p2);
ARG_UNUSED(p3);
printk("Thread %u unexpectedly executed\n",
(unsigned int)(uintptr_t)p1);
while (1) {
}
}
/**
 * @brief Create the benchmark's threads
 *
 * Creates one cooperative (priority -1) busy thread for each CPU other
 * than the benchmarking CPU, then creates @a num_threads preemptible
 * test threads spread evenly across the preemptible priority levels.
 *
 * Note that all test threads share a single stack; they are never
 * expected to execute (see the warning at the top of this file).
 *
 * @param num_threads Number of test threads to create
 */
static void start_threads(unsigned int num_threads)
{
	unsigned int i;
	unsigned int bucket_size;

	/* Start the busy threads to execute on the other processors */

	for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS - 1; i++) {
		k_thread_create(&busy_thread[i], busy_stack[i], BUSY_STACK_SIZE,
				busy_entry, NULL, NULL, NULL,
				-1, 0, K_NO_WAIT);
	}

	/* Each run of <bucket_size> consecutively created threads shares
	 * one preemptible priority level.
	 */
	bucket_size = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;

	/* Fix: iterate up to num_threads (the parameter), not
	 * CONFIG_BENCHMARK_NUM_THREADS, so the caller's requested thread
	 * count is honored consistently with the bucket_size computation
	 * above. (Identical behavior for the current caller, which passes
	 * CONFIG_BENCHMARK_NUM_THREADS.)
	 */
	for (i = 0; i < num_threads; i++) {
		k_thread_create(&test_thread[i], test_stack, TEST_STACK_SIZE,
				test_entry, (void *)(uintptr_t)i, NULL, NULL,
				i / bucket_size, 0, K_NO_WAIT);
	}
}
/* Zero the per-thread accumulated cycle counts before a new test pass. */
static void cycles_reset(unsigned int num_threads)
{
	for (unsigned int idx = 0; idx < num_threads; idx++) {
		add_cycles[idx] = 0ULL;
		remove_cycles[idx] = 0ULL;
	}
}
/**
 * @brief Time ready queue operations, processing threads by decreasing priority
 *
 * First drains the ready queue of test threads starting with the most
 * recently created thread (numerically largest priority value, i.e.
 * lowest priority), then refills it starting with test_thread[0]
 * (numerically smallest priority value, i.e. highest priority). The cycle
 * cost of each individual queue operation is accumulated into the
 * matching remove_cycles[]/add_cycles[] slot.
 *
 * @param num_threads Number of test threads to remove and re-add
 */
static void test_decreasing_priority(unsigned int num_threads)
{
	unsigned int i;
	timing_t start;
	timing_t finish;

	/* Remove the test threads, lowest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_unready_thread(&test_thread[i - 1]);
		finish = timing_counter_get();
		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
	}

	/* Re-add them, highest priority thread first; each subsequent
	 * thread is of equal or lower priority than those already queued.
	 */
	for (i = 0; i < num_threads; i++) {
		start = timing_counter_get();
		z_ready_thread(&test_thread[i]);
		finish = timing_counter_get();
		add_cycles[i] += timing_cycles_get(&start, &finish);
	}
}
/**
 * @brief Time ready queue operations, processing threads by increasing priority
 *
 * Mirror image of test_decreasing_priority(): drains the ready queue
 * starting with test_thread[0] (highest priority thread), then refills it
 * starting with the last-created thread (lowest priority thread), so each
 * re-added thread is of equal or higher priority than those already
 * queued. Results are accumulated so that index 0 of each stats array
 * corresponds to the first operation performed in each loop.
 *
 * @param num_threads Number of test threads to remove and re-add
 */
static void test_increasing_priority(unsigned int num_threads)
{
	unsigned int i;
	timing_t start;
	timing_t finish;

	/* Remove the test threads, highest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_unready_thread(&test_thread[num_threads - i]);
		finish = timing_counter_get();
		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
	}

	/* Re-add them, lowest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_ready_thread(&test_thread[i - 1]);
		finish = timing_counter_get();
		add_cycles[num_threads - i] += timing_cycles_get(&start, &finish);
	}
}
/**
 * @brief Integer square root of a 64-bit value
 *
 * Iterative digit-by-digit (base 4) method: walk a probe bit down from
 * the highest even bit position, folding it into the partial root
 * whenever the remaining value can absorb it.
 *
 * @param square Value whose square root is to be taken
 * @return floor(sqrt(square))
 */
static uint64_t sqrt_u64(uint64_t square)
{
	uint64_t root = 0;
	uint64_t bit = 1ULL << 62;

	/* Position the probe bit at or below the operand */
	while (bit > square) {
		bit >>= 2;
	}

	while (bit != 0) {
		if (square >= root + bit) {
			square -= root + bit;
			root = (root >> 1) + bit;
		} else {
			root >>= 1;
		}
		bit >>= 2;
	}

	return root;
}
/**
 * @brief Compute and display statistics for a set of cycle measurements
 *
 * Reports the minimum, maximum, average and (population) standard
 * deviation of the per-thread measurements, in both cycles and
 * nanoseconds. Each cycles[i] entry is a sum accumulated over
 * num_iterations runs, so values are divided by num_iterations before
 * reporting. All arithmetic is integer; sub-cycle precision is
 * deliberately dropped.
 *
 * @param num_threads    Number of entries in @a cycles
 * @param num_iterations Iterations over which each entry was accumulated
 * @param cycles         Array of accumulated cycle counts
 * @param str            Heading printed before the statistics
 */
static void compute_and_report_stats(unsigned int num_threads,
				     unsigned int num_iterations,
				     uint64_t *cycles,
				     const char *str)
{
	uint64_t minimum = cycles[0];
	uint64_t maximum = cycles[0];
	uint64_t total = cycles[0];
	uint64_t average;
	uint64_t std_dev = 0;
	uint64_t tmp;
	uint64_t diff;
	unsigned int i;

	/* Single pass for min, max and total of the raw accumulated sums */
	for (i = 1; i < num_threads; i++) {
		if (cycles[i] > maximum) {
			maximum = cycles[i];
		}
		if (cycles[i] < minimum) {
			minimum = cycles[i];
		}
		total += cycles[i];
	}

	/* Convert accumulated sums to per-iteration values */
	minimum /= (uint64_t)num_iterations;
	maximum /= (uint64_t)num_iterations;
	average = total / (num_threads * num_iterations);

	/* Population standard deviation of the per-thread averages */
	for (i = 0; i < num_threads; i++) {
		tmp = cycles[i] / num_iterations;
		diff = (average > tmp) ? (average - tmp) : (tmp - average);
		std_dev += (diff * diff);
	}
	std_dev /= num_threads;
	std_dev = sqrt_u64(std_dev);

	printk("%s\n", str);
	printk(" Minimum : %7llu cycles (%7u nsec)\n",
	       minimum, (uint32_t)timing_cycles_to_ns(minimum));
	printk(" Maximum : %7llu cycles (%7u nsec)\n",
	       maximum, (uint32_t)timing_cycles_to_ns(maximum));
	printk(" Average : %7llu cycles (%7u nsec)\n",
	       average, (uint32_t)timing_cycles_to_ns(average));
	printk(" Std Deviation: %7llu cycles (%7u nsec)\n",
	       std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
}
/**
 * @brief Benchmark entry point
 *
 * Runs two measurement passes over the scheduler's ready queue — one
 * processing the test threads in decreasing priority order, one in
 * increasing priority order — and reports summary statistics (plus
 * optional per-thread averages when CONFIG_BENCHMARK_VERBOSE is set) for
 * the add and remove operations of each pass.
 *
 * @return 0 always
 */
int main(void)
{
	unsigned int i;
	unsigned int freq;
#ifdef CONFIG_BENCHMARK_VERBOSE
	char description[120];
	char tag[50];
	struct k_thread *thread;
#endif

	timing_init();
	bench_test_init();

	freq = timing_freq_get_mhz();

	printk("Time Measurements for %s sched queues\n",
	       IS_ENABLED(CONFIG_SCHED_DUMB) ? "dumb" :
	       IS_ENABLED(CONFIG_SCHED_SCALABLE) ? "scalable" : "multiq");
	printk("Timing results: Clock frequency: %u MHz\n", freq);

	start_threads(CONFIG_BENCHMARK_NUM_THREADS);

	timing_start();

	/* Pass 1: remove/add the threads in decreasing priority order */

	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_decreasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.add.to.tail.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Add thread of priority (%u)",
			 tag, test_thread[i].base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.remove.from.head.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Remove thread of priority %u",
			 tag, test_thread[i].base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Pass 2: remove/add the threads in increasing priority order */

	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_increasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.add.to.head.%04u.waiters", i);
		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Add priority %u to readyq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Fix: heading previously read "...threads or increasing..." */
	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.remove.from.tail.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Remove lowest priority from readyq (%u)",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	/* Clean up the test threads before declaring success */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		k_thread_abort(&test_thread[i]);
	}

	timing_stop();

	TC_END_REPORT(0);

	return 0;
}

54
tests/benchmarks/sched_queues/src/utils.h

@ -0,0 +1,54 @@ @@ -0,0 +1,54 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef __BENCHMARK_SCHEDQ_UTILS_H
#define __BENCHMARK_SCHEDQ_UTILS_H
/*
* @brief This file contains macros used in the scheduler queue benchmarking.
*/
#include <zephyr/timing/timing.h>
#include <zephyr/sys/printk.h>
#include <stdio.h>
#ifdef CSV_FORMAT_OUTPUT
#define FORMAT_STR   "%-74s,%s,%s\n"
#define CYCLE_FORMAT "%8u"
#define NSEC_FORMAT  "%8u"
#else
#define FORMAT_STR   "%-74s:%s , %s\n"
#define CYCLE_FORMAT "%8u cycles"
#define NSEC_FORMAT  "%8u ns"
#endif

/**
 * @brief Display a line of statistics
 *
 * This macro displays the following:
 * 1. Test description summary
 * 2. Number of cycles
 * 3. Number of nanoseconds
 */
#define PRINT_F(summary, cycles, nsec)                                        \
	do {                                                                  \
		char cycle_str[32];                                           \
		char nsec_str[32];                                            \
									      \
		/* Fix: size the bounded writes from the buffers themselves   \
		 * rather than the magic number 30, which did not match the   \
		 * declared buffer size of 32.                                \
		 */                                                           \
		snprintk(cycle_str, sizeof(cycle_str), CYCLE_FORMAT, cycles); \
		snprintk(nsec_str, sizeof(nsec_str), NSEC_FORMAT, nsec);      \
		printk(FORMAT_STR, summary, cycle_str, nsec_str);             \
	} while (0)

/* Report a raw cycle count and its nanosecond equivalent */
#define PRINT_STATS(summary, value) \
	PRINT_F(summary, value,     \
		(uint32_t)timing_cycles_to_ns(value))

/* Report the per-iteration average of an accumulated cycle count */
#define PRINT_STATS_AVG(summary, value, counter) \
	PRINT_F(summary, value / counter,        \
		(uint32_t)timing_cycles_to_ns_avg(value, counter))
#endif

25
tests/benchmarks/sched_queues/testcase.yaml

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@
common:
tags:
- kernel
- benchmark
integration_platforms:
- qemu_x86
- qemu_cortex_a53
harness: console
harness_config:
type: one_line
regex:
- "PROJECT EXECUTION SUCCESSFUL"
tests:
benchmark.sched_queues.dumb:
extra_configs:
- CONFIG_SCHED_DUMB=y
benchmark.sched_queues.scalable:
extra_configs:
- CONFIG_SCHED_SCALABLE=y
benchmark.sched_queues.multiq:
extra_configs:
- CONFIG_SCHED_MULTIQ=y
Loading…
Cancel
Save