Browse Source
This sample showcases efficient utilization of an SMP system by processing independent, resource-hungry workloads. With no cross-dependencies between workers and no shared resources used during the heavy lifting itself, it demonstrates almost linear scaling of efficiency: 2 cores complete the same amount of calculation twice as fast as a single core, and 4 cores complete it twice as fast as 2 cores. Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
pull/20294/head
7 changed files with 200 additions and 0 deletions
@ -0,0 +1,10 @@
@@ -0,0 +1,10 @@
|
||||
.. _smp-samples: |
||||
|
||||
Various SMP Samples |
||||
################### |
||||
|
||||
.. toctree:: |
||||
:maxdepth: 1 |
||||
:glob: |
||||
|
||||
**/* |
@ -0,0 +1,8 @@
@@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0 |
||||
|
||||
cmake_minimum_required(VERSION 3.13.1) |
||||
|
||||
include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE) |
||||
project(smp_pi) |
||||
|
||||
target_sources(app PRIVATE src/main.c) |
@ -0,0 +1,46 @@
@@ -0,0 +1,46 @@
|
||||
.. _smp_pi: |
||||
|
||||
SMP Pi |
||||
########### |
||||
|
||||
Overview |
||||
******** |
||||
This sample application calculates Pi independently in many threads, and |
||||
demonstrates the benefit of multiple execution units (CPU cores) |
||||
when compute-intensive tasks can be run in parallel, with |
||||
no cross-dependencies or shared resources. |
||||
|
||||
By changing the value of CONFIG_MP_NUM_CPUS on SMP systems, you |
||||
can see that using more cores takes almost linearly less time |
||||
to complete the computational task. |
||||
|
||||
You can also edit the sample source code to change the |
||||
number of digits calculated (``DIGITS_NUM``), and the |
||||
number of threads to use (``THREADS_NUM``). |
||||
|
||||
Building and Running |
||||
******************** |
||||
|
||||
This project outputs the Pi value calculated by each thread and, at the end, the total time
required for all the calculations to complete. It can be built and executed
on QEMU x86_64 as follows:
||||
|
||||
.. zephyr-app-commands:: |
||||
:zephyr-app: samples/smp_pi |
||||
:host-os: unix |
||||
:board: qemu_x86_64 |
||||
:goals: run |
||||
:compact: |
||||
|
||||
Sample Output |
||||
============= |
||||
|
||||
.. code-block:: console |
||||
|
||||
Calculate first 240 digits of Pi independently by 16 threads. |
||||
Pi value calculated by thread #0: 3141592653589793238462643383279502884197... |
||||
Pi value calculated by thread #1: 3141592653589793238462643383279502884197... |
||||
... |
||||
Pi value calculated by thread #14: 314159265358979323846264338327950288419... |
||||
Pi value calculated by thread #15: 314159265358979323846264338327950288419... |
||||
All 16 threads executed by 4 cores in 28 msec |
@ -0,0 +1,5 @@
@@ -0,0 +1,5 @@
|
||||
# Allow worker threads to capture all resources |
||||
CONFIG_MAIN_THREAD_PRIORITY=11 |
||||
|
||||
# Enable SMP |
||||
CONFIG_SMP=y |
@ -0,0 +1,18 @@
@@ -0,0 +1,18 @@
|
||||
sample: |
||||
description: Calculation of Pi independently in |
||||
a number of threads |
||||
name: SMP Pi |
||||
common: |
||||
tags: introduction |
||||
harness: console |
||||
harness_config: |
||||
type: multi_line |
||||
ordered: yes |
||||
regex: |
||||
- "Calculate first [0-9]+ digits of Pi independently by [0-9]+ threads.(.*)" |
||||
- "Pi value calculated by thread #[0-9]+: [0-9]+(.*)" |
||||
- "All [0-9]+ threads executed by [0-9]+ cores in [0-9]+ msec(.*)" |
||||
tests: |
||||
sample.smp_pi: |
||||
tags: introduction |
||||
platform_whitelist: nsim_hs_smp qemu_x86_64 |
@ -0,0 +1,112 @@
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2019 Synopsys, Inc. |
||||
* |
||||
* SPDX-License-Identifier: Apache-2.0 |
||||
*/ |
||||
|
||||
#include <zephyr.h> |
||||
#include <stdio.h> |
||||
|
||||
/* Amount of execution threads to create and run */ |
||||
#define THREADS_NUM 16 |
||||
|
||||
/*
|
||||
 * Amount of digits of Pi to calculate, must be a multiple of 4,
 * as the algorithm used emits 4 digits on every iteration.
||||
*/ |
||||
#define DIGITS_NUM 240 |
||||
|
||||
#define LENGTH ((DIGITS_NUM / 4) * 14) |
||||
#define STACK_SIZE (LENGTH * sizeof(int) + 512) |
||||
|
||||
#ifdef CONFIG_SMP |
||||
#define CORES_NUM CONFIG_MP_NUM_CPUS |
||||
#else |
||||
#define CORES_NUM 1 |
||||
#endif |
||||
|
||||
static K_THREAD_STACK_ARRAY_DEFINE(tstack, THREADS_NUM, STACK_SIZE); |
||||
static struct k_thread tthread[THREADS_NUM]; |
||||
static char buffer[THREADS_NUM][DIGITS_NUM + 1]; |
||||
static atomic_t counter = THREADS_NUM; |
||||
|
||||
/*
 * Worker thread entry point: computes DIGITS_NUM decimal digits of Pi
 * into a caller-supplied buffer, then signals completion.
 *
 * arg1 - pointer to the shared atomic completion counter; decremented
 *        once when this worker finishes.
 * arg2 - per-thread output buffer (char array of at least DIGITS_NUM + 1
 *        bytes); receives the digits as an ASCII string.
 * arg3 - unused.
 *
 * Each worker touches only its own stack and its own buffer during the
 * computation, so workers run fully independently on SMP cores.
 */
void test_thread(void *arg1, void *arg2, void *arg3)
{
	atomic_t *counter = (atomic_t *)arg1;
	char *buffer = (char *)arg2;

	ARG_UNUSED(arg3);

	/*
	 * Adapted and improved (for an arbitrary number of digits) version
	 * of the Pi calculation program initially proposed by Dik T. Winter
	 * (a "spigot"-style algorithm producing 4 decimal digits per outer
	 * iteration):
	 * -------------------------------->8--------------------------------
	 * int a=10000,b,c=2800,d,e,f[2801],g;main(){for(;b-c;)f[b++]=a/5;
	 * for(;d=0,g=c*2;c-=14,printf("%.4d",e+d/a),e=d%a)for(b=c;d+=f[b]*a,
	 * f[b]=d%--g,d/=g--,--b;d*=b);}
	 * -------------------------------->8--------------------------------
	 */
#define NEW_BASE 10000
#define ARRAY_INIT 2000

	/*
	 * Working array lives on the thread's stack; STACK_SIZE is sized
	 * accordingly (LENGTH * sizeof(int) + 512).
	 */
	int array[LENGTH + 1] = {};
	int carry = 0;
	int i, j;

	/* Seed every cell with ARRAY_INIT (the "2000" of the original). */
	for (i = 0; i < LENGTH; i++)
		array[i] = ARRAY_INIT;

	/* Each outer iteration consumes 14 cells and emits 4 digits. */
	for (i = LENGTH; i > 0; i -= 14) {
		int sum = 0, value;

		/* Mixed-radix carry propagation from high index down to 1. */
		for (j = i; j > 0; --j) {
			sum = sum * j + NEW_BASE * array[j];
			array[j] = sum % (j * 2 - 1);
			sum /= j * 2 - 1;
		}

		/* Next 4 digits, corrected by the carry from last round. */
		value = carry + sum / NEW_BASE;
		carry = sum % NEW_BASE;

		/* Convert 4-digit int to string (zero-padded to width 4). */
		sprintf(buffer, "%.4d", value);
		buffer += 4;
	}

	/* Tell main() this worker is done. */
	atomic_dec(counter);
}
||||
|
||||
void main(void) |
||||
{ |
||||
u32_t start_time, stop_time, cycles_spent, nanoseconds_spent; |
||||
int i; |
||||
|
||||
printk("Calculate first %d digits of Pi independently by %d threads.\n", |
||||
DIGITS_NUM, THREADS_NUM); |
||||
|
||||
/* Capture initial time stamp */ |
||||
start_time = k_cycle_get_32(); |
||||
|
||||
for (i = 0; i < THREADS_NUM; i++) { |
||||
k_thread_create(&tthread[i], tstack[i], STACK_SIZE, |
||||
(k_thread_entry_t)test_thread, |
||||
(void *)&counter, (void *)buffer[i], NULL, |
||||
K_PRIO_COOP(10), 0, K_NO_WAIT); |
||||
} |
||||
|
||||
/* Wait for all workers to finish their calculations */ |
||||
while (counter) |
||||
k_sleep(1); |
||||
|
||||
/* Capture final time stamp */ |
||||
stop_time = k_cycle_get_32(); |
||||
|
||||
cycles_spent = stop_time - start_time; |
||||
nanoseconds_spent = SYS_CLOCK_HW_CYCLES_TO_NS(cycles_spent); |
||||
|
||||
for (i = 0; i < THREADS_NUM; i++) |
||||
printk("Pi value calculated by thread #%d: %s\n", i, buffer[i]); |
||||
|
||||
printk("All %d threads executed by %d cores in %d msec\n", THREADS_NUM, |
||||
CORES_NUM, nanoseconds_spent / 1000 / 1000); |
||||
} |
Loading…
Reference in new issue