Browse Source
This sample showcases efficient utilization of an SMP system by processing independent, resource-hungry workloads. With no cross-dependencies between workers and no shared resources used during the heavy lifting itself, it demonstrates almost linear scaling of efficiency: 2 cores complete the same amount of calculation twice as fast as a single core, and 4 cores complete it twice as fast as 2 cores. Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
pull/20294/head
7 changed files with 200 additions and 0 deletions
@ -0,0 +1,10 @@
@@ -0,0 +1,10 @@
|
||||
.. _smp-samples: |
||||
|
||||
Various SMP Samples |
||||
################### |
||||
|
||||
.. toctree:: |
||||
:maxdepth: 1 |
||||
:glob: |
||||
|
||||
**/* |
@ -0,0 +1,8 @@
@@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0 |
||||
|
||||
cmake_minimum_required(VERSION 3.13.1) |
||||
|
||||
include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE) |
||||
project(smp_pi) |
||||
|
||||
target_sources(app PRIVATE src/main.c) |
@ -0,0 +1,46 @@
@@ -0,0 +1,46 @@
|
||||
.. _smp_pi: |
||||
|
||||
SMP Pi |
||||
########### |
||||
|
||||
Overview |
||||
******** |
||||
This sample application calculates Pi independently in many threads, and |
||||
demonstrates the benefit of multiple execution units (CPU cores) |
||||
when compute-intensive tasks can be run in parallel, with |
||||
no cross-dependencies or shared resources. |
||||
|
||||
By changing the value of CONFIG_MP_NUM_CPUS on SMP systems, you |
||||
can see that using more cores takes almost linearly less time |
||||
to complete the computational task. |
||||
|
||||
You can also edit the sample source code to change the |
||||
number of digits calculated (``DIGITS_NUM``), and the |
||||
number of threads to use (``THREADS_NUM``). |
||||
|
||||
Building and Running |
||||
******************** |
||||
|
||||
This project outputs the Pi value calculated by each thread and, at the end, the total time
required for all the calculations to complete. It can be built and executed
on QEMU x86_64 as follows:
||||
|
||||
.. zephyr-app-commands:: |
||||
:zephyr-app: samples/smp_pi |
||||
:host-os: unix |
||||
:board: qemu_x86_64 |
||||
:goals: run |
||||
:compact: |
||||
|
||||
Sample Output |
||||
============= |
||||
|
||||
.. code-block:: console |
||||
|
||||
Calculate first 240 digits of Pi independently by 16 threads. |
||||
Pi value calculated by thread #0: 3141592653589793238462643383279502884197... |
||||
Pi value calculated by thread #1: 3141592653589793238462643383279502884197... |
||||
... |
||||
Pi value calculated by thread #14: 314159265358979323846264338327950288419... |
||||
Pi value calculated by thread #15: 314159265358979323846264338327950288419... |
||||
All 16 threads executed by 4 cores in 28 msec |
@ -0,0 +1,5 @@
@@ -0,0 +1,5 @@
|
||||
# Allow worker threads to capture all resources |
||||
CONFIG_MAIN_THREAD_PRIORITY=11 |
||||
|
||||
# Enable SMP |
||||
CONFIG_SMP=y |
@ -0,0 +1,18 @@
@@ -0,0 +1,18 @@
|
||||
sample: |
||||
description: Calculation of Pi independently in |
||||
a number of threads |
||||
name: SMP Pi |
||||
common: |
||||
tags: introduction |
||||
harness: console |
||||
harness_config: |
||||
type: multi_line |
||||
ordered: yes |
||||
regex: |
||||
- "Calculate first [0-9]+ digits of Pi independently by [0-9]+ threads.(.*)" |
||||
- "Pi value calculated by thread #[0-9]+: [0-9]+(.*)" |
||||
- "All [0-9]+ threads executed by [0-9]+ cores in [0-9]+ msec(.*)" |
||||
tests: |
||||
sample.smp_pi: |
||||
tags: introduction |
||||
platform_whitelist: nsim_hs_smp qemu_x86_64 |
@ -0,0 +1,112 @@
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2019 Synopsys, Inc. |
||||
* |
||||
* SPDX-License-Identifier: Apache-2.0 |
||||
*/ |
||||
|
||||
#include <zephyr.h> |
||||
#include <stdio.h> |
||||
|
||||
/* Amount of execution threads to create and run */ |
||||
#define THREADS_NUM 16 |
||||
|
||||
/*
|
||||
 * Amount of digits of Pi to calculate, must be a multiple of 4,
 * as the algorithm used emits 4 digits on every iteration.
||||
*/ |
||||
#define DIGITS_NUM 240 |
||||
|
||||
#define LENGTH ((DIGITS_NUM / 4) * 14) |
||||
#define STACK_SIZE (LENGTH * sizeof(int) + 512) |
||||
|
||||
#ifdef CONFIG_SMP |
||||
#define CORES_NUM CONFIG_MP_NUM_CPUS |
||||
#else |
||||
#define CORES_NUM 1 |
||||
#endif |
||||
|
||||
static K_THREAD_STACK_ARRAY_DEFINE(tstack, THREADS_NUM, STACK_SIZE); |
||||
static struct k_thread tthread[THREADS_NUM]; |
||||
static char buffer[THREADS_NUM][DIGITS_NUM + 1]; |
||||
static atomic_t counter = THREADS_NUM; |
||||
|
||||
/*
 * Worker thread entry point: computes DIGITS_NUM decimal digits of Pi
 * into a caller-supplied buffer, then signals completion.
 *
 * arg1 - pointer to the shared atomic completion counter; decremented
 *        once when this worker finishes.
 * arg2 - per-thread output buffer (char array of at least DIGITS_NUM + 1
 *        bytes); receives the digits as an ASCII string.
 * arg3 - unused.
 *
 * Each worker touches only its own stack and its own buffer during the
 * computation, so workers run fully independently on SMP cores.
 */
void test_thread(void *arg1, void *arg2, void *arg3)
{
	atomic_t *counter = (atomic_t *)arg1;
	char *buffer = (char *)arg2;

	ARG_UNUSED(arg3);

	/*
	 * Adapted and improved (for an arbitrary number of digits) version
	 * of the Pi calculation program initially proposed by Dik T. Winter
	 * (a "spigot"-style algorithm producing 4 decimal digits per outer
	 * iteration):
	 * -------------------------------->8--------------------------------
	 * int a=10000,b,c=2800,d,e,f[2801],g;main(){for(;b-c;)f[b++]=a/5;
	 * for(;d=0,g=c*2;c-=14,printf("%.4d",e+d/a),e=d%a)for(b=c;d+=f[b]*a,
	 * f[b]=d%--g,d/=g--,--b;d*=b);}
	 * -------------------------------->8--------------------------------
	 */
#define NEW_BASE 10000
#define ARRAY_INIT 2000

	/*
	 * Working array lives on the thread's stack; STACK_SIZE is sized
	 * accordingly (LENGTH * sizeof(int) + 512).
	 */
	int array[LENGTH + 1] = {};
	int carry = 0;
	int i, j;

	/* Seed every cell with ARRAY_INIT (the "2000" of the original). */
	for (i = 0; i < LENGTH; i++)
		array[i] = ARRAY_INIT;

	/* Each outer iteration consumes 14 cells and emits 4 digits. */
	for (i = LENGTH; i > 0; i -= 14) {
		int sum = 0, value;

		/* Mixed-radix carry propagation from high index down to 1. */
		for (j = i; j > 0; --j) {
			sum = sum * j + NEW_BASE * array[j];
			array[j] = sum % (j * 2 - 1);
			sum /= j * 2 - 1;
		}

		/* Next 4 digits, corrected by the carry from last round. */
		value = carry + sum / NEW_BASE;
		carry = sum % NEW_BASE;

		/* Convert 4-digit int to string (zero-padded to width 4). */
		sprintf(buffer, "%.4d", value);
		buffer += 4;
	}

	/* Tell main() this worker is done. */
	atomic_dec(counter);
}
||||
|
||||
void main(void) |
||||
{ |
||||
u32_t start_time, stop_time, cycles_spent, nanoseconds_spent; |
||||
int i; |
||||
|
||||
printk("Calculate first %d digits of Pi independently by %d threads.\n", |
||||
DIGITS_NUM, THREADS_NUM); |
||||
|
||||
/* Capture initial time stamp */ |
||||
start_time = k_cycle_get_32(); |
||||
|
||||
for (i = 0; i < THREADS_NUM; i++) { |
||||
k_thread_create(&tthread[i], tstack[i], STACK_SIZE, |
||||
(k_thread_entry_t)test_thread, |
||||
(void *)&counter, (void *)buffer[i], NULL, |
||||
K_PRIO_COOP(10), 0, K_NO_WAIT); |
||||
} |
||||
|
||||
/* Wait for all workers to finish their calculations */ |
||||
while (counter) |
||||
k_sleep(1); |
||||
|
||||
/* Capture final time stamp */ |
||||
stop_time = k_cycle_get_32(); |
||||
|
||||
cycles_spent = stop_time - start_time; |
||||
nanoseconds_spent = SYS_CLOCK_HW_CYCLES_TO_NS(cycles_spent); |
||||
|
||||
for (i = 0; i < THREADS_NUM; i++) |
||||
printk("Pi value calculated by thread #%d: %s\n", i, buffer[i]); |
||||
|
||||
printk("All %d threads executed by %d cores in %d msec\n", THREADS_NUM, |
||||
CORES_NUM, nanoseconds_spent / 1000 / 1000); |
||||
} |
Loading…
Reference in new issue