MetaIRQs are described in docs and exercised in tests, but there's no
sample explaining how they are intended to be used to perform interrupt
bottom half processing.  This simple tool spawns a set of worker threads
at different priorities (some cooperative) which process "messages"
(each requiring CPU time to handle) received from a fake "device"
implemented with timer interrupts.  The device hands off the events
directly to a MetaIRQ thread, which is expected to parse and dispatch
them to the worker threads.  The test demonstrates that no matter the
state of the system, the MetaIRQ thread always runs synchronously when
woken by the ISR and is able to preempt all worker threads to do its job.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
9 changed files with 523 additions and 0 deletions
@@ -0,0 +1,10 @@
.. _scheduler-samples:

Various Kernel and Scheduler Samples
####################################

.. toctree::
   :maxdepth: 1
   :glob:

   **/*
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

cmake_minimum_required(VERSION 3.13.1)

include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE)
project(metairq_dispatch)

target_sources(app PRIVATE src/main.c src/msgdev.c)
@@ -0,0 +1,63 @@
MetaIRQ Thread Priority Demonstration
#####################################

Overview
********

This sample demonstrates the use of a thread running at a MetaIRQ
priority level to implement "bottom half" style processing
synchronously with the end of a hardware ISR.  It implements a
simulated "device" that produces messages that need to be dispatched
to asynchronous queues feeding several worker threads, each running at
a different priority.  The dispatch is handled by a MetaIRQ thread fed
via a queue from the device ISR (really just a timer interrupt).
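
The dispatch pattern at the core of the sample is small.  A minimal
sketch (mirroring the code in ``src/main.c``, with latency bookkeeping
and error handling omitted) looks like this; with
``CONFIG_NUM_METAIRQ_PRIORITIES=1`` set in ``prj.conf``, the highest
cooperative priority becomes a MetaIRQ priority, so this thread can
preempt every other thread, cooperative or not:

.. code-block:: c

   static void metairq_fn(void *p1, void *p2, void *p3)
   {
           while (true) {
                   struct msg m;

                   /* Block until the "device" ISR hands us an event,
                    * then forward it to the queue of its target
                    * worker thread.
                    */
                   message_dev_fetch(&m);
                   k_msgq_put(&threads[m.target].msgq, &m, K_NO_WAIT);
           }
   }

   K_THREAD_DEFINE(metairq_thread, STACK_SIZE, metairq_fn,
                   NULL, NULL, NULL, K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);

The worker threads themselves are ordinary cooperative or preemptible
threads; only the dispatch step runs at the MetaIRQ priority.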

Each message has a random (and non-trivial) amount of processing that
must happen in the worker thread.  This implements a "bursty load"
environment where occasional spikes in load require preemption of
running threads and delay scheduling of lower priority threads.
Messages are accompanied by a timestamp that allows per-message
latencies to be computed at several points:

* The cycle time between message creation in the ISR and receipt by
  the MetaIRQ thread for dispatch.

* The time between ISR and receipt by the worker thread.

* The real time spent processing the message in the worker thread, for
  comparison with the required processing time.  This provides a way
  to measure preemption overhead where the thread is not scheduled.

Aspects to note in the results:

* On average, higher priority (lower numbered) threads have better
  latencies and lower processing delays, as expected.

* Cooperatively scheduled threads have significantly better processing
  delay behavior than preemptible ones, as they can only be preempted
  by the MetaIRQ thread.

* Because of queueing and the bursty load, all worker threads of any
  priority will experience some load-dependent delays, as the CPU
  occasionally has more work to do than time available.

* But, no matter the system load or thread configuration, the MetaIRQ
  thread always runs immediately after the ISR.  It shows reliable,
  constant latency under all circumstances because it can preempt all
  other threads, including cooperative ones that cannot normally be
  preempted.

Requirements
************

This sample should run well on any Zephyr platform that provides
preemption of running threads by interrupts, a working timer driver,
and working log output.  For precision reasons, it produces better
(and more) data on systems with a high timer tick rate (ideally 10+
kHz).

Note that because the test is fundamentally measuring thread
preemption behavior, it does not run without modification on
native_posix platforms.  In that emulation environment, threads will
not be preempted except at specific instrumentation points (e.g. in
k_busy_wait()) where they will voluntarily release the CPU.
@@ -0,0 +1,9 @@
CONFIG_ASSERT=y
CONFIG_NUM_METAIRQ_PRIORITIES=1
CONFIG_TEST_RANDOM_GENERATOR=y
CONFIG_LOG=y
CONFIG_LOG_MINIMAL=y

# We're testing delivery latency to threads on a single CPU; MP
# dispatch would mess up the statistics.
CONFIG_MP_NUM_CPUS=1
@@ -0,0 +1,19 @@
sample:
  description: Sample demonstrating dispatch of hardware events from a
    MetaIRQ thread
  name: MetaIRQ Dispatch
common:
  harness: console
  harness_config:
    type: one_line
    regex:
      - "MetaIRQ Test Complete"

# Note that native_posix architectures are filtered out; they require
# instrumentation (e.g. a k_busy_wait()) inside the worker threads'
# "busy" loops in order for the interrupts to fire on time, and the
# sample is designed to demonstrate completely arbitrary CPU work.
tests:
  sample.metairq_dispatch:
    tags: introduction
    filter: not CONFIG_ARCH_POSIX
@@ -0,0 +1,239 @@
/*
 * Copyright (c) 2020 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <zephyr.h>
#include "main.h"

#include <logging/log.h>
LOG_MODULE_REGISTER(main, LOG_LEVEL_INF);

#define STACK_SIZE 2048

/* How many messages can be queued for a single thread */
#define QUEUE_DEPTH 16

/* Array of worker threads, and their stacks */
static struct thread_rec {
	struct k_thread thread;
	struct k_msgq msgq;
	struct msg msgq_buf[QUEUE_DEPTH];
} threads[NUM_THREADS];

K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_THREADS, STACK_SIZE);

/* The static metairq thread we'll use for dispatch */
static void metairq_fn(void *p1, void *p2, void *p3);
K_THREAD_DEFINE(metairq_thread, STACK_SIZE, metairq_fn,
		NULL, NULL, NULL, K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);

/* Accumulated list of latencies, for a naive variance computation at
 * the end.
 */
struct {
	atomic_t num_mirq;
	u32_t mirq_latencies[MAX_EVENTS];
	struct {
		u32_t nevt;
		u32_t latencies[MAX_EVENTS * 2 / NUM_THREADS];
	} threads[NUM_THREADS];
} stats;

/* A semaphore with an initial count, used to allow only one thread to
 * log the final report.
 */
K_SEM_DEFINE(report_cookie, 1, 1);

static void metairq_fn(void *p1, void *p2, void *p3)
{
	ARG_UNUSED(p1);
	ARG_UNUSED(p2);
	ARG_UNUSED(p3);

	while (true) {
		/* Receive a message, immediately check a timestamp
		 * and compute a latency value, then dispatch it to
		 * the queue for its target thread
		 */
		struct msg m;

		message_dev_fetch(&m);
		m.metairq_latency = k_cycle_get_32() - m.timestamp;

		int ret = k_msgq_put(&threads[m.target].msgq, &m, K_NO_WAIT);

		if (ret) {
			LOG_INF("Thread %d queue full, message %d dropped",
				m.target, m.seq);
		}
	}
}

/* Simple recursive implementation of an integer square root, cribbed
 * from wikipedia
 */
static u32_t isqrt(u64_t n)
{
	if (n > 1) {
		u64_t lo = isqrt(n >> 2) << 1;
		u64_t hi = lo + 1;

		return (u32_t)(((hi * hi) > n) ? lo : hi);
	}
	return (u32_t) n;
}

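/* Compute the min, max, mean and (naive, integer) standard deviation
 * of an array of n latency samples, measured in cycles.
 */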
static void calc_stats(const u32_t *array, u32_t n,
		       u32_t *lo, u32_t *hi, u32_t *mean, u32_t *stdev)
{
	u64_t tot = 0, totsq = 0;

	*lo = INT_MAX;
	*hi = 0;
	for (int i = 0; i < n; i++) {
		*lo = MIN(*lo, array[i]);
		*hi = MAX(*hi, array[i]);
		tot += array[i];
	}

	*mean = (u32_t)((tot + (n / 2)) / n);

	for (int i = 0; i < n; i++) {
		s64_t d = (s32_t) (array[i] - *mean);

		totsq += d * d;
	}

	*stdev = isqrt((totsq + (n / 2)) / n);
}

static void record_latencies(struct msg *m, u32_t latency)
{
	/* Workaround: qemu emulation shows an erroneously high
	 * metairq latency for the very first event of 7-8us.  Maybe
	 * it needs to fault in our code pages in the host?
	 */
	if (IS_ENABLED(CONFIG_QEMU_TARGET) && m->seq == 0) {
		return;
	}

	int t = m->target;
	int lidx = stats.threads[t].nevt++;

	if (lidx < ARRAY_SIZE(stats.threads[t].latencies)) {
		stats.threads[t].latencies[lidx] = latency;
	}

	stats.mirq_latencies[atomic_inc(&stats.num_mirq)] = m->metairq_latency;

	/* Once we've logged our final event, print a report.  We use
	 * a semaphore with an initial count of 1 to ensure that only
	 * one thread gets to do this.  Also events can be processed
	 * out of order, so add a small sleep to let the queues
	 * finish.
	 */
	if (m->seq == MAX_EVENTS - 1) {
		u32_t hi, lo, mean, stdev, ret;

		ret = k_sem_take(&report_cookie, K_FOREVER);
		__ASSERT_NO_MSG(ret == 0);
		k_sleep(100);

		calc_stats(stats.mirq_latencies, stats.num_mirq,
			   &lo, &hi, &mean, &stdev);

		LOG_INF("        ---------- Latency (cyc) ----------");
		LOG_INF("            Best    Worst     Mean    Stdev");
		LOG_INF("MetaIRQ %8d %8d %8d %8d", lo, hi, mean, stdev);

		for (int i = 0; i < NUM_THREADS; i++) {
			if (stats.threads[i].nevt == 0) {
				LOG_WRN("No events for thread %d", i);
				continue;
			}

			calc_stats(stats.threads[i].latencies,
				   stats.threads[i].nevt,
				   &lo, &hi, &mean, &stdev);

			LOG_INF("Thread%d %8d %8d %8d %8d",
				i, lo, hi, mean, stdev);
		}

		LOG_INF("MetaIRQ Test Complete");
	}
}

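/* Worker thread: pull messages off this thread's queue, spin for the
 * requested number of cycles to simulate the "processing", then
 * record the measured latencies.
 */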
static void thread_fn(void *p1, void *p2, void *p3)
{
	ARG_UNUSED(p2);
	ARG_UNUSED(p3);
	int id = (long)p1;
	struct msg m;

	LOG_INF("Starting Thread%d at priority %d", id,
		k_thread_priority_get(k_current_get()));

	while (true) {
		int ret = k_msgq_get(&threads[id].msgq, &m, K_FOREVER);
		u32_t start = k_cycle_get_32();

		__ASSERT_NO_MSG(ret == 0);

		/* Spin on the CPU for the requested number of cycles
		 * doing the "work" required to "process" the event.
		 * Note the inner loop: hammering on k_cycle_get_32()
		 * on some platforms requires locking around the timer
		 * driver internals and can affect interrupt latency.
		 * Obviously we may be preempted as new events arrive
		 * and get queued.
		 */
		while (k_cycle_get_32() - start < m.proc_cyc) {
			for (volatile int i = 0; i < 100; i++) {
			}
		}

		u32_t dur = k_cycle_get_32() - start;

#ifdef LOG_EVERY_EVENT
		/* Log the message, its thread, and the following cycle values:
		 * 1. Receive it from the driver in the MetaIRQ thread
		 * 2. Begin processing it out of the queue in the worker thread
		 * 3. The requested processing time in the message
		 * 4. The actual time taken to process the message
		 *    (may be higher if the thread was preempted)
		 */
		LOG_INF("M%d T%d mirq %d disp %d proc %d real %d",
			m.seq, id, m.metairq_latency,
			start - m.timestamp, m.proc_cyc, dur);
#endif

		/* Collect the latency values in a big statistics array */
		record_latencies(&m, start - m.timestamp);
	}
}

void main(void)
{
	for (long i = 0; i < NUM_THREADS; i++) {
		/* Each thread gets a different priority.  Half should
		 * be at (negative) cooperative priorities.  Lower
		 * thread numbers get higher priorities (more negative
		 * values), e.g. thread 0 will be preempted only by
		 * the metairq.
		 */
		int prio = (-NUM_THREADS/2) + i;

		k_msgq_init(&threads[i].msgq, (char *)threads[i].msgq_buf,
			    sizeof(struct msg), QUEUE_DEPTH);

		k_thread_create(&threads[i].thread,
				thread_stacks[i], STACK_SIZE,
				thread_fn, (void *)i, NULL, NULL,
				prio, 0, K_NO_WAIT);
	}

	message_dev_init();
}
@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2020 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef SAMPLE_METAIRQ_MAIN_H
#define SAMPLE_METAIRQ_MAIN_H

/* Define this to enable logging of every event as it is processed to
 * carefully inspect behavior.  Note that at high event rates (see
 * MAX_EVENT_DELAY_TICKS) the log backend won't be able to keep up and
 * that the log processing will add CPU load that is not accounted for
 * in AVERAGE_LOAD_TARGET_PCT.  Disable for more accurate statistics
 * measurement.  Note that if you don't want the log subsystem
 * throttling to drop messages, you'll probably want to increase
 * MAX_EVENT_DELAY_TICKS too, to slow down the reporting.
 */
#define LOG_EVERY_EVENT 1

/* Number of worker threads, each at a separate priority, split evenly
 * between cooperative and preemptible priorities.
 */
#define NUM_THREADS 4

/* The proc_cyc duty cycle parameters are chosen to use approximately
 * this fraction of one CPU's available cycles.  Default 60%
 */
#define AVERAGE_LOAD_TARGET_PCT 60

/* "Hardware interrupts" for our fake device will arrive after a
 * random delay of between zero and this number of ticks.  The event
 * rate should be high enough to collect enough data but low enough
 * that it is not regular.
 */
#define MAX_EVENT_DELAY_TICKS 8

/* How many events will be sent before the test completes?  Note that
 * we keep a naive array of latencies to compute variance, so this
 * value has memory costs.
 */
#define MAX_EVENTS 40

/* The "messages" dispatched by our MetaIRQ thread */
struct msg {
	/* Sequence number */
	u32_t seq;

	/* Cycle time when the message was "received" */
	u32_t timestamp;

	/* Thread to which the message is targeted */
	u32_t target;

	/* Cycles of processing the message requires */
	u32_t proc_cyc;

	/* Cycles of latency before the MetaIRQ thread received the message */
	u32_t metairq_latency;
};

/* Initialize the fake "message" device; after this, messages will
 * begin arriving via message_dev_fetch().
 */
void message_dev_init(void);

/* Retrieve the next message from the "device", blocking until
 * delivery.
 */
void message_dev_fetch(struct msg *m);

#endif
@@ -0,0 +1,104 @@
/*
 * Copyright (c) 2020 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <timeout_q.h>
#include "main.h"

/* This file implements a fake device that creates and enqueues
 * "struct msg" messages for handling by the rest of the test.  It's
 * based on Zephyr kernel timeouts only.
 */

/* Note: we use the internal timeout API to get tick precision;
 * k_timer limits us to milliseconds.
 */
static struct _timeout timeout;

/* The "proc_cyc" parameter in the message, indicating how many cycles
 * the target thread should delay while "processing" the message, will
 * be a random number between zero and this value.
 */
u32_t max_duty_cyc;

u32_t msg_seq;

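/* Two-deep queue between the fake device "ISR" and the MetaIRQ
 * dispatch thread.
 */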
K_MSGQ_DEFINE(hw_msgs, sizeof(struct msg), 2, sizeof(u32_t));

static void timeout_reset(void);

/* Use a custom RNG for good statistics; sys_rand32_get() is just a
 * timer counter on some platforms.  Note that this is used only
 * inside the ISR and needs no locking for the otherwise non-atomic
 * state.
 */
static u32_t rand32(void)
{
	static u64_t state;

	if (!state) {
		state = ((u64_t)k_cycle_get_32()) << 16;
	}

	/* MMIX LCRNG parameters */
	state = state * 6364136223846793005ULL + 1442695040888963407ULL;
	return (u32_t)(state >> 32);
}

/* This acts as the "ISR" for our fake device.  It "reads from the
 * hardware" a single timestamped message which needs to be dispatched
 * (by the MetaIRQ) to a random thread, with a random argument
 * indicating how long the thread should "process" the message.
 */
static void dev_timer_expired(struct _timeout *t)
{
	__ASSERT_NO_MSG(t == &timeout);
	u32_t timestamp = k_cycle_get_32();
	struct msg m;

	m.seq = msg_seq++;
	m.timestamp = timestamp;
	m.target = rand32() % NUM_THREADS;
	m.proc_cyc = rand32() % max_duty_cyc;

	int ret = k_msgq_put(&hw_msgs, &m, K_NO_WAIT);

	if (ret != 0) {
		printk("ERROR: Queue full, event dropped!\n");
	}

	if (m.seq < MAX_EVENTS) {
		timeout_reset();
	}
}

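/* Schedule the next fake device interrupt after a random delay of
 * zero to MAX_EVENT_DELAY_TICKS-1 ticks.
 */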
static void timeout_reset(void)
{
	u32_t ticks = rand32() % MAX_EVENT_DELAY_TICKS;

	z_add_timeout(&timeout, dev_timer_expired, ticks);
}

void message_dev_init(void)
{
	/* Compute a bound for the proc_cyc message parameter such
	 * that on average we request a known percent of available
	 * CPU.  We want the load to sometimes back up and require
	 * queueing, but to be achievable over time.
	 */
	u64_t cyc_per_tick = k_ticks_to_cyc_near64(1);
	u64_t avg_ticks_per_event = MAX_EVENT_DELAY_TICKS / 2;
	u64_t avg_cyc_per_event = cyc_per_tick * avg_ticks_per_event;

	max_duty_cyc = (2 * avg_cyc_per_event * AVERAGE_LOAD_TARGET_PCT) / 100;

	z_add_timeout(&timeout, dev_timer_expired, K_NO_WAIT);
}

void message_dev_fetch(struct msg *m)
{
	int ret = k_msgq_get(&hw_msgs, m, K_FOREVER);

	__ASSERT_NO_MSG(ret == 0);
}