diff --git a/kernel/sched.c b/kernel/sched.c
index 6f57bd95589..d91a2d32cfb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1599,3 +1599,13 @@ int z_sched_waitq_walk(_wait_q_t *wait_q,
 
 	return status;
 }
+
+/* This routine exists for benchmarking purposes. It is not used in
+ * general production code.
+ */
+void z_unready_thread(struct k_thread *thread)
+{
+	K_SPINLOCK(&_sched_spinlock) {
+		unready_thread(thread);
+	}
+}
diff --git a/tests/benchmarks/sched_queues/CMakeLists.txt b/tests/benchmarks/sched_queues/CMakeLists.txt
new file mode 100644
index 00000000000..751e2e0233c
--- /dev/null
+++ b/tests/benchmarks/sched_queues/CMakeLists.txt
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20.0)
+find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
+project(sched_queues)
+
+FILE(GLOB app_sources src/*.c)
+target_sources(app PRIVATE ${app_sources})
+target_include_directories(app PRIVATE
+  ${ZEPHYR_BASE}/kernel/include
+  ${ZEPHYR_BASE}/arch/${ARCH}/include
+  )
diff --git a/tests/benchmarks/sched_queues/Kconfig b/tests/benchmarks/sched_queues/Kconfig
new file mode 100644
index 00000000000..f952fe24e1f
--- /dev/null
+++ b/tests/benchmarks/sched_queues/Kconfig
@@ -0,0 +1,30 @@
+# Copyright (c) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+mainmenu "Scheduler Queue Benchmark"
+
+source "Kconfig.zephyr"
+
+config BENCHMARK_NUM_ITERATIONS
+	int "Number of iterations to gather data"
+	default 1000
+	help
+	  This option specifies the number of times each test will be executed
+	  before calculating the average times for reporting.
+
+config BENCHMARK_NUM_THREADS
+	int "Number of threads"
+	default 100
+	help
+	  This option specifies the maximum number of threads that the test
+	  will add to the ready queue. Increasing this value will place greater
+	  stress on the ready queue and better highlight the performance
+	  differences as the number of threads in the ready queue changes.
+
+config BENCHMARK_VERBOSE
+	bool "Display detailed results"
+	default y
+	help
+	  This option displays the average time of all the iterations done for
+	  each thread in the tests. This generates large amounts of output. To
+	  analyze it, it is recommended to redirect the output to a file.
diff --git a/tests/benchmarks/sched_queues/README.rst b/tests/benchmarks/sched_queues/README.rst
new file mode 100644
index 00000000000..94bd45d41f6
--- /dev/null
+++ b/tests/benchmarks/sched_queues/README.rst
@@ -0,0 +1,21 @@
+Scheduling Queue Measurements
+#############################
+
+A Zephyr application developer may choose between three different scheduling
+algorithms--dumb, scalable and multiq. These different algorithms have
+different performance characteristics--characteristics that vary as the
+number of ready threads increases. This benchmark can be used to help
+determine which scheduling algorithm may best suit the developer's application.
+
+This benchmark measures the following:
+* Time to add threads of increasing priority to the ready queue
+* Time to add threads of decreasing priority to the ready queue
+* Time to remove the highest priority thread from the ready queue
+* Time to remove the lowest priority thread from the ready queue
+
+By default, these tests show the minimum, maximum, and averages of the measured
+times. However, if the verbose option is enabled then the set of measured
+times will be displayed.
+The following will build this project with verbose support:
+
+    EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> <path to project>
diff --git a/tests/benchmarks/sched_queues/prj.conf b/tests/benchmarks/sched_queues/prj.conf
new file mode 100644
index 00000000000..2840887ae5e
--- /dev/null
+++ b/tests/benchmarks/sched_queues/prj.conf
@@ -0,0 +1,31 @@
+# Default base configuration file
+
+CONFIG_TEST=y
+
+# eliminate timer interrupts during the benchmark
+CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
+
+# We use irq_offload(), enable it
+CONFIG_IRQ_OFFLOAD=y
+
+# Reduce memory/code footprint
+CONFIG_BT=n
+CONFIG_FORCE_NO_ASSERT=y
+
+CONFIG_TEST_HW_STACK_PROTECTION=n
+# Disable HW Stack Protection (see #28664)
+CONFIG_HW_STACK_PROTECTION=n
+CONFIG_COVERAGE=n
+
+# Disable system power management
+CONFIG_PM=n
+
+CONFIG_TIMING_FUNCTIONS=y
+
+CONFIG_HEAP_MEM_POOL_SIZE=2048
+CONFIG_APPLICATION_DEFINED_SYSCALL=y
+
+# Disable time slicing
+CONFIG_TIMESLICING=n
+
+CONFIG_SPEED_OPTIMIZATIONS=y
diff --git a/tests/benchmarks/sched_queues/prj.verbose.conf b/tests/benchmarks/sched_queues/prj.verbose.conf
new file mode 100644
index 00000000000..b6204397cea
--- /dev/null
+++ b/tests/benchmarks/sched_queues/prj.verbose.conf
@@ -0,0 +1,4 @@
+# Extra configuration file to enable verbose reporting
+# Use with EXTRA_CONF_FILE
+
+CONFIG_BENCHMARK_VERBOSE=y
diff --git a/tests/benchmarks/sched_queues/src/main.c b/tests/benchmarks/sched_queues/src/main.c
new file mode 100644
index 00000000000..669f0fd633e
--- /dev/null
+++ b/tests/benchmarks/sched_queues/src/main.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * @file
+ * This file contains the main testing module that invokes all the tests.
+ */
+
+/* NOTE(review): the original #include targets were lost in transit;
+ * restored from the symbols used below (printk, timing_*, TC_END_REPORT,
+ * z_ready_thread) -- verify against the original patch.
+ */
+#include <zephyr/kernel.h>
+#include <zephyr/timing/timing.h>
+#include "utils.h"
+#include <zephyr/tc_util.h>
+#include <ksched.h>
+
+#define TEST_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)
+#define BUSY_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)
+
+uint32_t tm_off;
+
+/*
+ * Warning!
+ * Most of the created threads in this test use the same stack!
+ * This is done to reduce the memory footprint as having unique stacks
+ * for hundreds or thousands of threads would require substantial memory.
+ * We can get away with this approach as the threads sharing the same
+ * stack will not be executing, even though they will be ready to run.
+ */
+
+static K_THREAD_STACK_DEFINE(test_stack, TEST_STACK_SIZE);
+
+/* One busy thread per secondary CPU -- presumably so that the ready test
+ * threads are never actually scheduled on another core (TODO confirm).
+ */
+K_THREAD_STACK_ARRAY_DEFINE(busy_stack, CONFIG_MP_MAX_NUM_CPUS - 1, BUSY_STACK_SIZE);
+static struct k_thread busy_thread[CONFIG_MP_MAX_NUM_CPUS - 1];
+
+static struct k_thread test_thread[CONFIG_BENCHMARK_NUM_THREADS];
+
+/* Per-thread cycle counts for adding to / removing from the ready queue,
+ * accumulated over all iterations.
+ */
+static uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
+static uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];
+
+extern void z_unready_thread(struct k_thread *thread);
+
+/* Endless loop executed by the busy threads on the other processors */
+static void busy_entry(void *p1, void *p2, void *p3)
+{
+	ARG_UNUSED(p1);
+	ARG_UNUSED(p2);
+	ARG_UNUSED(p3);
+
+	while (1) {
+	}
+}
+
+/**
+ * The test entry routine is not expected to execute.
+ */
+static void test_entry(void *p1, void *p2, void *p3)
+{
+	ARG_UNUSED(p2);
+	ARG_UNUSED(p3);
+
+	printk("Thread %u unexpectedly executed\n",
+	       (unsigned int)(uintptr_t)p1);
+
+	while (1) {
+	}
+}
+
+/* Create the busy threads (one per secondary CPU at cooperative priority)
+ * and <num_threads> test threads spread across the preemptible priorities
+ * (index 0 gets the numerically lowest, i.e. highest, priority).
+ */
+static void start_threads(unsigned int num_threads)
+{
+	unsigned int i;
+	unsigned int bucket_size;
+
+	/* Start the busy threads to execute on the other processors */
+
+	for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS - 1; i++) {
+		k_thread_create(&busy_thread[i], busy_stack[i], BUSY_STACK_SIZE,
+				busy_entry, NULL, NULL, NULL,
+				-1, 0, K_NO_WAIT);
+	}
+
+	bucket_size = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;
+
+	/* Honor the <num_threads> parameter instead of hard-coding
+	 * CONFIG_BENCHMARK_NUM_THREADS (identical for the current caller,
+	 * but keeps this routine consistent with its argument).
+	 */
+	for (i = 0; i < num_threads; i++) {
+		k_thread_create(&test_thread[i], test_stack, TEST_STACK_SIZE,
+				test_entry, (void *)(uintptr_t)i, NULL, NULL,
+				i / bucket_size, 0, K_NO_WAIT);
+	}
+}
+
+/* Zero the accumulated cycle counts */
+static void cycles_reset(unsigned int num_threads)
+{
+	unsigned int i;
+
+	for (i = 0; i < num_threads; i++) {
+		add_cycles[i] = 0ULL;
+		remove_cycles[i] = 0ULL;
+	}
+}
+
+/* Remove threads from the ready queue starting with the lowest priority
+ * thread, then re-add them starting with the highest priority thread so
+ * each addition goes to the tail of the ready queue.
+ */
+static void test_decreasing_priority(unsigned int num_threads)
+{
+	unsigned int i;
+	timing_t start;
+	timing_t finish;
+
+	for (i = num_threads; i > 0; i--) {
+		start = timing_counter_get();
+		z_unready_thread(&test_thread[i - 1]);
+		finish = timing_counter_get();
+		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		start = timing_counter_get();
+		z_ready_thread(&test_thread[i]);
+		finish = timing_counter_get();
+		add_cycles[i] += timing_cycles_get(&start, &finish);
+	}
+}
+
+/* Remove threads from the ready queue starting with the highest priority
+ * thread, then re-add them starting with the lowest priority thread so
+ * each addition goes to the head of the ready queue.
+ */
+static void test_increasing_priority(unsigned int num_threads)
+{
+	unsigned int i;
+	timing_t start;
+	timing_t finish;
+
+	for (i = num_threads; i > 0; i--) {
+		start = timing_counter_get();
+		z_unready_thread(&test_thread[num_threads - i]);
+		finish = timing_counter_get();
+		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
+	}
+
+	for (i = num_threads; i > 0; i--) {
+		start = timing_counter_get();
+		z_ready_thread(&test_thread[i - 1]);
+		finish = timing_counter_get();
+		add_cycles[num_threads - i] += timing_cycles_get(&start, &finish);
+	}
+}
+
+/* Integer square root of a 64-bit value (used for the std deviation) */
+static uint64_t sqrt_u64(uint64_t square)
+{
+	if (square > 1) {
+		uint64_t lo = sqrt_u64(square >> 2) << 1;
+		uint64_t hi = lo + 1;
+
+		return ((hi * hi) > square) ? lo : hi;
+	}
+
+	return square;
+}
+
+/* Compute and display the min, max, average and standard deviation of
+ * the per-thread cycle counts accumulated in <cycles>.
+ */
+static void compute_and_report_stats(unsigned int num_threads,
+				     unsigned int num_iterations,
+				     uint64_t *cycles,
+				     const char *str)
+{
+	uint64_t minimum = cycles[0];
+	uint64_t maximum = cycles[0];
+	uint64_t total = cycles[0];
+	uint64_t average;
+	uint64_t std_dev = 0;
+	uint64_t tmp;
+	uint64_t diff;
+	unsigned int i;
+
+	for (i = 1; i < num_threads; i++) {
+		if (cycles[i] > maximum) {
+			maximum = cycles[i];
+		}
+
+		if (cycles[i] < minimum) {
+			minimum = cycles[i];
+		}
+
+		total += cycles[i];
+	}
+
+	minimum /= (uint64_t)num_iterations;
+	maximum /= (uint64_t)num_iterations;
+
+	/* Multiply in 64 bits: <num_threads> * <num_iterations> could
+	 * overflow an unsigned int for large Kconfig values.
+	 */
+	average = total / ((uint64_t)num_threads * num_iterations);
+
+	for (i = 0; i < num_threads; i++) {
+		tmp = cycles[i] / num_iterations;
+		diff = (average > tmp) ? (average - tmp) : (tmp - average);
+
+		std_dev += (diff * diff);
+	}
+	std_dev /= num_threads;
+	std_dev = sqrt_u64(std_dev);
+
+	printk("%s\n", str);
+
+	printk(" Minimum : %7llu cycles (%7u nsec)\n",
+	       minimum, (uint32_t)timing_cycles_to_ns(minimum));
+	printk(" Maximum : %7llu cycles (%7u nsec)\n",
+	       maximum, (uint32_t)timing_cycles_to_ns(maximum));
+	printk(" Average : %7llu cycles (%7u nsec)\n",
+	       average, (uint32_t)timing_cycles_to_ns(average));
+	printk(" Std Deviation: %7llu cycles (%7u nsec)\n",
+	       std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
+}
+
+int main(void)
+{
+	unsigned int i;
+	unsigned int freq;
+#ifdef CONFIG_BENCHMARK_VERBOSE
+	char description[120];
+	char tag[50];
+	struct k_thread *thread;
+#endif
+
+	timing_init();
+
+	bench_test_init();
+
+	freq = timing_freq_get_mhz();
+
+	printk("Time Measurements for %s sched queues\n",
+	       IS_ENABLED(CONFIG_SCHED_DUMB) ?
+	       "dumb" :
+	       IS_ENABLED(CONFIG_SCHED_SCALABLE) ? "scalable" : "multiq");
+	printk("Timing results: Clock frequency: %u MHz\n", freq);
+
+	start_threads(CONFIG_BENCHMARK_NUM_THREADS);
+
+	timing_start();
+
+	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);
+
+	/* Phase 1: repeatedly empty and refill the ready queue such that
+	 * each added thread is of lower priority than the previous one.
+	 */
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
+		test_decreasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
+	}
+
+	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
+				 CONFIG_BENCHMARK_NUM_ITERATIONS,
+				 add_cycles,
+				 "Add threads of decreasing priority");
+
+#ifdef CONFIG_BENCHMARK_VERBOSE
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
+		snprintf(tag, sizeof(tag),
+			 "ReadyQ.add.to.tail.%04u.waiters", i);
+		snprintf(description, sizeof(description),
+			 "%-40s - Add thread of priority (%u)",
+			 tag, test_thread[i].base.prio);
+		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
+				CONFIG_BENCHMARK_NUM_ITERATIONS);
+	}
+#endif
+
+	printk("------------------------------------\n");
+
+	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
+				 CONFIG_BENCHMARK_NUM_ITERATIONS,
+				 remove_cycles,
+				 "Remove threads of decreasing priority");
+
+#ifdef CONFIG_BENCHMARK_VERBOSE
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
+		snprintf(tag, sizeof(tag),
+			 "ReadyQ.remove.from.head.%04u.waiters", i);
+		snprintf(description, sizeof(description),
+			 "%-40s - Remove thread of priority %u",
+			 tag, test_thread[i].base.prio);
+		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
+				CONFIG_BENCHMARK_NUM_ITERATIONS);
+	}
+#endif
+
+	printk("------------------------------------\n");
+
+	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);
+
+	/* Phase 2: repeatedly empty and refill the ready queue such that
+	 * each added thread is of higher priority than the previous one.
+	 */
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
+		test_increasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
+	}
+
+	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
+				 CONFIG_BENCHMARK_NUM_ITERATIONS,
+				 add_cycles,
+				 "Add threads of increasing priority");
+
+#ifdef CONFIG_BENCHMARK_VERBOSE
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
+		snprintf(tag, sizeof(tag),
+			 "ReadyQ.add.to.head.%04u.waiters", i);
+		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
+		snprintf(description, sizeof(description),
+			 "%-40s - Add priority %u to readyq",
+			 tag, thread->base.prio);
+		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
+				CONFIG_BENCHMARK_NUM_ITERATIONS);
+	}
+#endif
+
+	printk("------------------------------------\n");
+
+	/* Typo fix: banner read "Remove threads or increasing priority" */
+	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
+				 CONFIG_BENCHMARK_NUM_ITERATIONS,
+				 remove_cycles,
+				 "Remove threads of increasing priority");
+
+#ifdef CONFIG_BENCHMARK_VERBOSE
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
+		snprintf(tag, sizeof(tag),
+			 "ReadyQ.remove.from.tail.%04u.waiters",
+			 CONFIG_BENCHMARK_NUM_THREADS - i);
+		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
+		snprintf(description, sizeof(description),
+			 "%-40s - Remove lowest priority from readyq (%u)",
+			 tag, thread->base.prio);
+		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
+				CONFIG_BENCHMARK_NUM_ITERATIONS);
+	}
+#endif
+
+	/* Clean up the test threads (the busy threads are left spinning) */
+	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
+		k_thread_abort(&test_thread[i]);
+	}
+
+	timing_stop();
+
+	TC_END_REPORT(0);
+
+	return 0;
+}
diff --git a/tests/benchmarks/sched_queues/src/utils.h b/tests/benchmarks/sched_queues/src/utils.h
new file mode 100644
index 00000000000..cca95dfc02a
--- /dev/null
+++ b/tests/benchmarks/sched_queues/src/utils.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __BENCHMARK_SCHEDQ_UTILS_H
+#define __BENCHMARK_SCHEDQ_UTILS_H
+/*
+ * @brief This file contains macros used in the scheduler queue benchmarking.
+ */
+
+/* TODO(review): the original #include targets were lost in transit;
+ * restored from the symbols used below (printk, snprintk,
+ * timing_cycles_to_ns*) -- verify against the original patch.
+ */
+#include <zephyr/kernel.h>
+#include <zephyr/sys/printk.h>
+#include <zephyr/timing/timing.h>
+
+#ifdef CSV_FORMAT_OUTPUT
+#define FORMAT_STR   "%-74s,%s,%s\n"
+#define CYCLE_FORMAT "%8u"
+#define NSEC_FORMAT  "%8u"
+#else
+#define FORMAT_STR   "%-74s:%s , %s\n"
+#define CYCLE_FORMAT "%8u cycles"
+#define NSEC_FORMAT  "%8u ns"
+#endif
+
+/**
+ * @brief Display a line of statistics
+ *
+ * This macro displays the following:
+ *  1. Test description summary
+ *  2. Number of cycles
+ *  3. Number of nanoseconds
+ */
+#define PRINT_F(summary, cycles, nsec)                                   \
+	do {                                                             \
+		char cycle_str[32];                                      \
+		char nsec_str[32];                                       \
+									 \
+		snprintk(cycle_str, sizeof(cycle_str), CYCLE_FORMAT,     \
+			 cycles);                                        \
+		snprintk(nsec_str, sizeof(nsec_str), NSEC_FORMAT, nsec); \
+		printk(FORMAT_STR, summary, cycle_str, nsec_str);        \
+	} while (0)
+
+#define PRINT_STATS(summary, value)              \
+	PRINT_F(summary, value,                  \
+		(uint32_t)timing_cycles_to_ns(value))
+
+#define PRINT_STATS_AVG(summary, value, counter)            \
+	PRINT_F(summary, value / counter,                   \
+		(uint32_t)timing_cycles_to_ns_avg(value, counter))
+
+#endif /* __BENCHMARK_SCHEDQ_UTILS_H */
diff --git a/tests/benchmarks/sched_queues/testcase.yaml b/tests/benchmarks/sched_queues/testcase.yaml
new file mode 100644
index 00000000000..e3ecc3b3426
--- /dev/null
+++ b/tests/benchmarks/sched_queues/testcase.yaml
@@ -0,0 +1,25 @@
+common:
+  tags:
+    - kernel
+    - benchmark
+  integration_platforms:
+    - qemu_x86
+    - qemu_cortex_a53
+  harness: console
+  harness_config:
+    type: one_line
+    regex:
+      - "PROJECT EXECUTION SUCCESSFUL"
+
+tests:
+  benchmark.sched_queues.dumb:
+    extra_configs:
+      - CONFIG_SCHED_DUMB=y
+
+  benchmark.sched_queues.scalable:
+    extra_configs:
+      - CONFIG_SCHED_SCALABLE=y
+
+  benchmark.sched_queues.multiq:
+    extra_configs:
+      - CONFIG_SCHED_MULTIQ=y