Browse Source

tests: scheduler queue benchmarks

Implements a set of tests designed to show how the performance of the
three scheduler queue implementations (DUMB, SCALABLE and MULTIQ)
varies with respect to the number of threads in the ready queue.

Signed-off-by: Peter Mitsis <peter.mitsis@intel.com>
pull/79524/head
Peter Mitsis 10 months ago committed by Anas Nashif
parent
commit
318b49570a
  1. 10
      kernel/sched.c
  2. 12
      tests/benchmarks/sched_queues/CMakeLists.txt
  3. 30
      tests/benchmarks/sched_queues/Kconfig
  4. 21
      tests/benchmarks/sched_queues/README.rst
  5. 31
      tests/benchmarks/sched_queues/prj.conf
  6. 4
      tests/benchmarks/sched_queues/prj.verbose.conf
  7. 328
      tests/benchmarks/sched_queues/src/main.c
  8. 54
      tests/benchmarks/sched_queues/src/utils.h
  9. 25
      tests/benchmarks/sched_queues/testcase.yaml

10
kernel/sched.c

@ -1599,3 +1599,13 @@ int z_sched_waitq_walk(_wait_q_t *wait_q, @@ -1599,3 +1599,13 @@ int z_sched_waitq_walk(_wait_q_t *wait_q,
return status;
}
/* This routine exists for benchmarking purposes. It is not used in
 * general production code.
 */
void z_unready_thread(struct k_thread *thread)
{
	/* Hold the scheduler spinlock so the ready queue cannot be
	 * modified concurrently while the thread is pulled off of it.
	 */
	K_SPINLOCK(&_sched_spinlock) {
		unready_thread(thread);
	}
}

12
tests/benchmarks/sched_queues/CMakeLists.txt

@ -0,0 +1,12 @@ @@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20.0)
find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
project(sched_queues)
FILE(GLOB app_sources src/*.c)
target_sources(app PRIVATE ${app_sources})
target_include_directories(app PRIVATE
${ZEPHYR_BASE}/kernel/include
${ZEPHYR_BASE}/arch/${ARCH}/include
)

30
tests/benchmarks/sched_queues/Kconfig

@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
# Copyright (c) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
mainmenu "Scheduler Queue Benchmark"
source "Kconfig.zephyr"
config BENCHMARK_NUM_ITERATIONS
int "Number of iterations to gather data"
default 1000
help
This option specifies the number of times each test will be executed
before calculating the average times for reporting.
config BENCHMARK_NUM_THREADS
int "Number of threads"
default 100
help
This option specifies the maximum number of threads that the test
will add to the ready queue. Increasing this value will place greater
stress on the ready queue and better highlight the performance
differences as the number of threads in the ready queue changes.
config BENCHMARK_VERBOSE
bool "Display detailed results"
default y
help
This option displays the average time of all the iterations done for
each thread in the tests. This generates large amounts of output. To
analyze it, it is recommended to redirect the output to a file.

21
tests/benchmarks/sched_queues/README.rst

@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
Scheduling Queue Measurements
#############################
A Zephyr application developer may choose between three different scheduling
algorithms--dumb, scalable and multiq. These different algorithms have
different performance characteristics--characteristics that vary as the
number of ready threads increases. This benchmark can be used to help
determine which scheduling algorithm may best suit the developer's application.
This benchmark measures the following:
* Time to add threads of increasing priority to the ready queue
* Time to add threads of decreasing priority to the ready queue
* Time to remove highest priority thread from a wait queue
* Time to remove lowest priority thread from a wait queue
By default, these tests show the minimum, maximum, and averages of the measured
times. However, if the verbose option is enabled then the set of measured
times will be displayed. The following will build this project with verbose
support:
EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> <path to project>

31
tests/benchmarks/sched_queues/prj.conf

@ -0,0 +1,31 @@ @@ -0,0 +1,31 @@
# Default base configuration file
CONFIG_TEST=y
# eliminate timer interrupts during the benchmark
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
# We use irq_offload(), enable it
CONFIG_IRQ_OFFLOAD=y
# Reduce memory/code footprint
CONFIG_BT=n
CONFIG_FORCE_NO_ASSERT=y
CONFIG_TEST_HW_STACK_PROTECTION=n
# Disable HW Stack Protection (see #28664)
CONFIG_HW_STACK_PROTECTION=n
CONFIG_COVERAGE=n
# Disable system power management
CONFIG_PM=n
CONFIG_TIMING_FUNCTIONS=y
CONFIG_HEAP_MEM_POOL_SIZE=2048
CONFIG_APPLICATION_DEFINED_SYSCALL=y
# Disable time slicing
CONFIG_TIMESLICING=n
CONFIG_SPEED_OPTIMIZATIONS=y

4
tests/benchmarks/sched_queues/prj.verbose.conf

@ -0,0 +1,4 @@ @@ -0,0 +1,4 @@
# Extra configuration file to enable verbose reporting
# Use with EXTRA_CONF_FILE
CONFIG_BENCHMARK_VERBOSE=y

328
tests/benchmarks/sched_queues/src/main.c

@ -0,0 +1,328 @@ @@ -0,0 +1,328 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* @file
* This file contains the main testing module that invokes all the tests.
*/
#include <zephyr/kernel.h>
#include <zephyr/timestamp.h>
#include "utils.h"
#include <zephyr/tc_util.h>
#include <ksched.h>
#define TEST_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)
#define BUSY_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)

/* NOTE(review): presumably consumed by the benchmark timing helpers
 * (bench_test_init() is called in main) — confirm against timestamp.h.
 */
uint32_t tm_off;

/*
 * Warning! Most of the created threads in this test use the same stack!
 * This is done to reduce the memory footprint as having unique stacks
 * for hundreds or thousands of threads would require substantial memory.
 * We can get away with this approach as the threads sharing the same
 * stack will not be executing, even though they will be ready to run.
 */
static K_THREAD_STACK_DEFINE(test_stack, TEST_STACK_SIZE);

/* One busy thread (and private stack) per CPU other than the one
 * executing the benchmark.
 */
K_THREAD_STACK_ARRAY_DEFINE(busy_stack, CONFIG_MP_MAX_NUM_CPUS - 1, BUSY_STACK_SIZE);
static struct k_thread busy_thread[CONFIG_MP_MAX_NUM_CPUS - 1];

static struct k_thread test_thread[CONFIG_BENCHMARK_NUM_THREADS];

/* Per-thread cycle counts, accumulated across all benchmark iterations */
static uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
static uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];

/* Benchmark-only hook implemented in kernel/sched.c */
extern void z_unready_thread(struct k_thread *thread);
/* Spin forever. Used to keep the other CPUs occupied so that only the
 * benchmarking CPU manipulates the ready queue.
 */
static void busy_entry(void *p1, void *p2, void *p3)
{
	ARG_UNUSED(p1);
	ARG_UNUSED(p2);
	ARG_UNUSED(p3);

	for (;;) {
		/* Deliberately empty */
	}
}
/**
* The test entry routine is not expected to execute.
*/
static void test_entry(void *p1, void *p2, void *p3)
{
ARG_UNUSED(p2);
ARG_UNUSED(p3);
printk("Thread %u unexpectedly executed\n",
(unsigned int)(uintptr_t)p1);
while (1) {
}
}
/**
 * @brief Create the benchmark's threads
 *
 * Creates one cooperative (priority -1) busy thread for each CPU other
 * than the benchmarking CPU, then creates @a num_threads preemptible
 * test threads spread evenly across the preemptible priority levels.
 *
 * Note that all test threads share a single stack; they are never
 * expected to execute (see the warning at the top of this file).
 *
 * @param num_threads Number of test threads to create
 */
static void start_threads(unsigned int num_threads)
{
	unsigned int i;
	unsigned int bucket_size;

	/* Start the busy threads to execute on the other processors */

	for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS - 1; i++) {
		k_thread_create(&busy_thread[i], busy_stack[i], BUSY_STACK_SIZE,
				busy_entry, NULL, NULL, NULL,
				-1, 0, K_NO_WAIT);
	}

	/* Each run of <bucket_size> consecutively created threads shares
	 * one preemptible priority level.
	 */
	bucket_size = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;

	/* Fix: iterate up to num_threads (the parameter), not
	 * CONFIG_BENCHMARK_NUM_THREADS, so the caller's requested thread
	 * count is honored consistently with the bucket_size computation
	 * above. (Identical behavior for the current caller, which passes
	 * CONFIG_BENCHMARK_NUM_THREADS.)
	 */
	for (i = 0; i < num_threads; i++) {
		k_thread_create(&test_thread[i], test_stack, TEST_STACK_SIZE,
				test_entry, (void *)(uintptr_t)i, NULL, NULL,
				i / bucket_size, 0, K_NO_WAIT);
	}
}
/* Zero the per-thread accumulated cycle counts before a new test pass. */
static void cycles_reset(unsigned int num_threads)
{
	for (unsigned int idx = 0; idx < num_threads; idx++) {
		add_cycles[idx] = 0ULL;
		remove_cycles[idx] = 0ULL;
	}
}
/**
 * @brief Time ready queue operations, processing threads by decreasing priority
 *
 * First drains the ready queue of test threads starting with the most
 * recently created thread (numerically largest priority value, i.e.
 * lowest priority), then refills it starting with test_thread[0]
 * (numerically smallest priority value, i.e. highest priority). The cycle
 * cost of each individual queue operation is accumulated into the
 * matching remove_cycles[]/add_cycles[] slot.
 *
 * @param num_threads Number of test threads to remove and re-add
 */
static void test_decreasing_priority(unsigned int num_threads)
{
	unsigned int i;
	timing_t start;
	timing_t finish;

	/* Remove the test threads, lowest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_unready_thread(&test_thread[i - 1]);
		finish = timing_counter_get();
		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
	}

	/* Re-add them, highest priority thread first; each subsequent
	 * thread is of equal or lower priority than those already queued.
	 */
	for (i = 0; i < num_threads; i++) {
		start = timing_counter_get();
		z_ready_thread(&test_thread[i]);
		finish = timing_counter_get();
		add_cycles[i] += timing_cycles_get(&start, &finish);
	}
}
/**
 * @brief Time ready queue operations, processing threads by increasing priority
 *
 * Mirror image of test_decreasing_priority(): drains the ready queue
 * starting with test_thread[0] (highest priority thread), then refills it
 * starting with the last-created thread (lowest priority thread), so each
 * re-added thread is of equal or higher priority than those already
 * queued. Results are accumulated so that index 0 of each stats array
 * corresponds to the first operation performed in each loop.
 *
 * @param num_threads Number of test threads to remove and re-add
 */
static void test_increasing_priority(unsigned int num_threads)
{
	unsigned int i;
	timing_t start;
	timing_t finish;

	/* Remove the test threads, highest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_unready_thread(&test_thread[num_threads - i]);
		finish = timing_counter_get();
		remove_cycles[i - 1] += timing_cycles_get(&start, &finish);
	}

	/* Re-add them, lowest priority thread first */
	for (i = num_threads; i > 0; i--) {
		start = timing_counter_get();
		z_ready_thread(&test_thread[i - 1]);
		finish = timing_counter_get();
		add_cycles[num_threads - i] += timing_cycles_get(&start, &finish);
	}
}
/**
 * @brief Integer square root of a 64-bit value
 *
 * Iterative digit-by-digit (base 4) method: walk a probe bit down from
 * the highest even bit position, folding it into the partial root
 * whenever the remaining value can absorb it.
 *
 * @param square Value whose square root is to be taken
 * @return floor(sqrt(square))
 */
static uint64_t sqrt_u64(uint64_t square)
{
	uint64_t root = 0;
	uint64_t bit = 1ULL << 62;

	/* Position the probe bit at or below the operand */
	while (bit > square) {
		bit >>= 2;
	}

	while (bit != 0) {
		if (square >= root + bit) {
			square -= root + bit;
			root = (root >> 1) + bit;
		} else {
			root >>= 1;
		}
		bit >>= 2;
	}

	return root;
}
/**
 * @brief Compute and display statistics for a set of cycle measurements
 *
 * Reports the minimum, maximum, average and (population) standard
 * deviation of the per-thread measurements, in both cycles and
 * nanoseconds. Each cycles[i] entry is a sum accumulated over
 * num_iterations runs, so values are divided by num_iterations before
 * reporting. All arithmetic is integer; sub-cycle precision is
 * deliberately dropped.
 *
 * @param num_threads    Number of entries in @a cycles
 * @param num_iterations Iterations over which each entry was accumulated
 * @param cycles         Array of accumulated cycle counts
 * @param str            Heading printed before the statistics
 */
static void compute_and_report_stats(unsigned int num_threads,
				     unsigned int num_iterations,
				     uint64_t *cycles,
				     const char *str)
{
	uint64_t minimum = cycles[0];
	uint64_t maximum = cycles[0];
	uint64_t total = cycles[0];
	uint64_t average;
	uint64_t std_dev = 0;
	uint64_t tmp;
	uint64_t diff;
	unsigned int i;

	/* Single pass for min, max and total of the raw accumulated sums */
	for (i = 1; i < num_threads; i++) {
		if (cycles[i] > maximum) {
			maximum = cycles[i];
		}
		if (cycles[i] < minimum) {
			minimum = cycles[i];
		}
		total += cycles[i];
	}

	/* Convert accumulated sums to per-iteration values */
	minimum /= (uint64_t)num_iterations;
	maximum /= (uint64_t)num_iterations;
	average = total / (num_threads * num_iterations);

	/* Population standard deviation of the per-thread averages */
	for (i = 0; i < num_threads; i++) {
		tmp = cycles[i] / num_iterations;
		diff = (average > tmp) ? (average - tmp) : (tmp - average);
		std_dev += (diff * diff);
	}
	std_dev /= num_threads;
	std_dev = sqrt_u64(std_dev);

	printk("%s\n", str);
	printk(" Minimum : %7llu cycles (%7u nsec)\n",
	       minimum, (uint32_t)timing_cycles_to_ns(minimum));
	printk(" Maximum : %7llu cycles (%7u nsec)\n",
	       maximum, (uint32_t)timing_cycles_to_ns(maximum));
	printk(" Average : %7llu cycles (%7u nsec)\n",
	       average, (uint32_t)timing_cycles_to_ns(average));
	printk(" Std Deviation: %7llu cycles (%7u nsec)\n",
	       std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
}
/**
 * @brief Benchmark entry point
 *
 * Runs two measurement passes over the scheduler's ready queue — one
 * processing the test threads in decreasing priority order, one in
 * increasing priority order — and reports summary statistics (plus
 * optional per-thread averages when CONFIG_BENCHMARK_VERBOSE is set) for
 * the add and remove operations of each pass.
 *
 * @return 0 always
 */
int main(void)
{
	unsigned int i;
	unsigned int freq;
#ifdef CONFIG_BENCHMARK_VERBOSE
	char description[120];
	char tag[50];
	struct k_thread *thread;
#endif

	timing_init();
	bench_test_init();

	freq = timing_freq_get_mhz();

	printk("Time Measurements for %s sched queues\n",
	       IS_ENABLED(CONFIG_SCHED_DUMB) ? "dumb" :
	       IS_ENABLED(CONFIG_SCHED_SCALABLE) ? "scalable" : "multiq");
	printk("Timing results: Clock frequency: %u MHz\n", freq);

	start_threads(CONFIG_BENCHMARK_NUM_THREADS);

	timing_start();

	/* Pass 1: remove/add the threads in decreasing priority order */

	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_decreasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.add.to.tail.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Add thread of priority (%u)",
			 tag, test_thread[i].base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.remove.from.head.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Remove thread of priority %u",
			 tag, test_thread[i].base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Pass 2: remove/add the threads in increasing priority order */

	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_increasing_priority(CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.add.to.head.%04u.waiters", i);
		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Add priority %u to readyq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Fix: heading previously read "...threads or increasing..." */
	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "ReadyQ.remove.from.tail.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		thread = &test_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Remove lowest priority from readyq (%u)",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	/* Clean up the test threads before declaring success */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		k_thread_abort(&test_thread[i]);
	}

	timing_stop();

	TC_END_REPORT(0);

	return 0;
}

54
tests/benchmarks/sched_queues/src/utils.h

@ -0,0 +1,54 @@ @@ -0,0 +1,54 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef __BENCHMARK_SCHEDQ_UTILS_H
#define __BENCHMARK_SCHEDQ_UTILS_H
/*
* @brief This file contains macros used in the scheduler queue benchmarking.
*/
#include <zephyr/timing/timing.h>
#include <zephyr/sys/printk.h>
#include <stdio.h>
#ifdef CSV_FORMAT_OUTPUT
#define FORMAT_STR   "%-74s,%s,%s\n"
#define CYCLE_FORMAT "%8u"
#define NSEC_FORMAT  "%8u"
#else
#define FORMAT_STR   "%-74s:%s , %s\n"
#define CYCLE_FORMAT "%8u cycles"
#define NSEC_FORMAT  "%8u ns"
#endif

/**
 * @brief Display a line of statistics
 *
 * This macro displays the following:
 * 1. Test description summary
 * 2. Number of cycles
 * 3. Number of nanoseconds
 */
#define PRINT_F(summary, cycles, nsec)                                        \
	do {                                                                  \
		char cycle_str[32];                                           \
		char nsec_str[32];                                            \
									      \
		/* Fix: size the bounded writes from the buffers themselves   \
		 * rather than the magic number 30, which did not match the   \
		 * declared buffer size of 32.                                \
		 */                                                           \
		snprintk(cycle_str, sizeof(cycle_str), CYCLE_FORMAT, cycles); \
		snprintk(nsec_str, sizeof(nsec_str), NSEC_FORMAT, nsec);      \
		printk(FORMAT_STR, summary, cycle_str, nsec_str);             \
	} while (0)

/* Report a raw cycle count and its nanosecond equivalent */
#define PRINT_STATS(summary, value) \
	PRINT_F(summary, value,     \
		(uint32_t)timing_cycles_to_ns(value))

/* Report the per-iteration average of an accumulated cycle count */
#define PRINT_STATS_AVG(summary, value, counter) \
	PRINT_F(summary, value / counter,        \
		(uint32_t)timing_cycles_to_ns_avg(value, counter))
#endif

25
tests/benchmarks/sched_queues/testcase.yaml

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@
common:
tags:
- kernel
- benchmark
integration_platforms:
- qemu_x86
- qemu_cortex_a53
harness: console
harness_config:
type: one_line
regex:
- "PROJECT EXECUTION SUCCESSFUL"
tests:
benchmark.sched_queues.dumb:
extra_configs:
- CONFIG_SCHED_DUMB=y
benchmark.sched_queues.scalable:
extra_configs:
- CONFIG_SCHED_SCALABLE=y
benchmark.sched_queues.multiq:
extra_configs:
- CONFIG_SCHED_MULTIQ=y
Loading…
Cancel
Save