Compare revisions

5a04518a · 5a04518a · 5a04518a · 5a04518a · 5a04518a · 5a04518a
--- a/docs.it4i/src/ompi/hello_oshmem_c.c
+++ b/docs.it4i/src/ompi/hello_oshmem_c.c
+/*
+ * Copyright (c) 2014      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * Copyright (c) 2015 Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+#include "shmem.h"
+
+#if !defined(OSHMEM_SPEC_VERSION) || OSHMEM_SPEC_VERSION < 10200
+#error This application uses API 1.2 and up
+#endif
+
+/*
+ * Every PE prints one greeting line containing its PE number, the total
+ * number of PEs, and the name and version reported by the SHMEM library.
+ */
+int main(int argc, char* argv[])
+{
+    char lib_name[SHMEM_MAX_NAME_LEN];
+    int pe_count;
+    int my_index;
+    int ver_major;
+    int ver_minor;
+
+    shmem_init();
+
+    /* Who are we, and how many of us are there? */
+    pe_count = shmem_n_pes();
+    my_index = shmem_my_pe();
+
+    /* Library identification. */
+    shmem_info_get_name(lib_name);
+    shmem_info_get_version(&ver_major, &ver_minor);
+
+    printf("Hello, world, I am %d of %d: %s (version: %d.%d)\n",
+           my_index, pe_count, lib_name, ver_major, ver_minor);
+
+    shmem_finalize();
+    return 0;
+}
--- a/docs.it4i/src/ompi/hello_oshmem_cxx.cc
+++ b/docs.it4i/src/ompi/hello_oshmem_cxx.cc
+/*
+ * Copyright (c) 2014      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2017      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <iostream>
+#include "shmem.h"
+
+#if !defined(OSHMEM_SPEC_VERSION) || OSHMEM_SPEC_VERSION < 10200
+#error This application uses API 1.2 and up
+#endif
+
+//
+// Every PE writes one greeting line to std::cout containing its PE number,
+// the total number of PEs, and the name and version reported by the SHMEM
+// library.
+//
+int main(int argc, char* argv[])
+{
+    char lib_name[SHMEM_MAX_NAME_LEN];
+    int pe_count;
+    int my_index;
+    int ver_major;
+    int ver_minor;
+
+    shmem_init();
+
+    // Who are we, and how many of us are there?
+    pe_count = shmem_n_pes();
+    my_index = shmem_my_pe();
+
+    // Library identification.
+    shmem_info_get_name(lib_name);
+    shmem_info_get_version(&ver_major, &ver_minor);
+
+    std::cout << "Hello, world, I am " << my_index << " of " << pe_count
+              << ": " << lib_name
+              << " (version: " << ver_major << "." << ver_minor << ")"
+              << std::endl;
+
+    shmem_finalize();
+    return 0;
+}
--- a/docs.it4i/src/ompi/hello_oshmemfh.f90
+++ b/docs.it4i/src/ompi/hello_oshmemfh.f90
+!
+! Copyright (c) 2014      Mellanox Technologies, Inc.
+!                         All rights reserved.
+! Copyright (c) 2014-2015 Cisco Systems, Inc.  All rights reserved.
+! $COPYRIGHT$
+!
+! Additional copyrights may follow
+!
+! $HEADER$
+!
+! Each PE prints one greeting line with its PE number, the number of PEs
+! and the version numbers returned by SHMEM_INFO_GET_VERSION.
+program hello_oshmem
+    implicit none
+    include 'shmem.fh'
+
+    integer proc, nproc
+    ! Return types of the two query functions called below.
+    integer shmem_my_pe, shmem_n_pes
+    ! NOTE(review): len is declared but never referenced in this program.
+    integer major, minor, len
+    character(len=SHMEM_MAX_NAME_LEN) name
+
+    call SHMEM_INIT()
+    proc = SHMEM_MY_PE()
+    nproc = SHMEM_N_PES()
+    call SHMEM_INFO_GET_VERSION(major, minor)
+    ! NOTE(review): name is filled in here but is not part of the output
+    ! list of the write statement below; only the version is printed.
+    call SHMEM_INFO_GET_NAME(name)
+
+    write(*, '("Hello, world, I am ", i2, " of ", i2, ": (version: ", i0, ".", i0, ")")') proc, nproc, major, minor
+    call SHMEM_FINALIZE()
+
+end program hello_oshmem
--- a/docs.it4i/src/ompi/hello_usempi.f90
+++ b/docs.it4i/src/ompi/hello_usempi.f90
+!
+! Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+!                         University Research and Technology
+!                         Corporation.  All rights reserved.
+! Copyright (c) 2004-2005 The Regents of the University of California.
+!                         All rights reserved.
+! Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
+! $COPYRIGHT$
+!
+! Sample MPI "hello world" application using the Fortran mpi module
+! bindings.
+!
+! Each rank prints its rank, the size of MPI_COMM_WORLD and the library
+! version string.
+program main
+    use mpi
+    implicit none
+    integer :: ierr, rank, size, len
+    character(len=MPI_MAX_LIBRARY_VERSION_STRING) :: version
+
+    call MPI_INIT(ierr)
+    call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
+    call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
+    ! len receives the second output of this call; it is not used
+    ! afterwards, the whole version variable is written below.
+    call MPI_GET_LIBRARY_VERSION(version, len, ierr)
+
+    write(*, '("Hello, world, I am ", i2, " of ", i2, ": ", a)') &
+          rank, size, version
+
+    ! ierr is passed to every call above but never inspected.
+    call MPI_FINALIZE(ierr)
+end
--- a/docs.it4i/src/ompi/hello_usempif08.f90
+++ b/docs.it4i/src/ompi/hello_usempif08.f90
+! -*- f90 -*-
+!
+! Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+!                         University Research and Technology
+!                         Corporation.  All rights reserved.
+! Copyright (c) 2004-2005 The Regents of the University of California.
+!                         All rights reserved.
+! Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
+! Copyright (c) 2009-2012 Los Alamos National Security, LLC.
+!                         All rights reserved.
+! $COPYRIGHT$
+!
+! Sample MPI "hello world" application using the Fortran mpi_f08
+! module bindings.
+!
+! Each rank prints its rank, the size of MPI_COMM_WORLD and the library
+! version string.  All MPI calls are made without an error argument.
+program main
+    use mpi_f08
+    implicit none
+    integer :: rank, size, len
+    character(len=MPI_MAX_LIBRARY_VERSION_STRING) :: version
+
+    call MPI_INIT()
+    call MPI_COMM_RANK(MPI_COMM_WORLD, rank)
+    call MPI_COMM_SIZE(MPI_COMM_WORLD, size)
+    ! len receives the second output of this call; it is not used
+    ! afterwards, the whole version variable is written below.
+    call MPI_GET_LIBRARY_VERSION(version, len)
+
+    write(*, '("Hello, world, I am ", i2, " of ", i2, ": ", a)') &
+          rank, size, version
+
+    call MPI_FINALIZE()
+end
--- a/docs.it4i/src/ompi/ompi.tar.gz
+++ b/docs.it4i/src/ompi/ompi.tar.gz
--- a/docs.it4i/src/ompi/oshmem_circular_shift.c
+++ b/docs.it4i/src/ompi/oshmem_circular_shift.c
+/*
+ * Copyright (c) 2014-2016 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+#include <shmem.h>
+
+/*
+ * Every PE fetches one int from the PE that follows it in the ring (the
+ * last PE wraps around to PE 0), then all PEs meet at a barrier and exit.
+ */
+int main (void)
+{
+    /* Both variables are static: one is the local destination of the get,
+       the other is the object read on the peer. */
+    static int fetched;
+    static int exported;
+    int pe_count;
+    int self;
+    int neighbour;
+
+    shmem_init();
+
+    pe_count = shmem_n_pes();
+    self = shmem_my_pe();
+
+    /* Next PE in the ring, wrapping around at the end. */
+    neighbour = (self + 1) % pe_count;
+
+    printf("Process %d gets message from %d (%d processes in ring)\n", self, neighbour, pe_count);
+    shmem_int_get(&fetched, &exported, 1, neighbour);
+
+    shmem_barrier_all();
+    printf("Process %d exiting\n", self);
+    shmem_finalize();
+
+    return 0;
+}
+
--- a/docs.it4i/src/ompi/oshmem_max_reduction.c
+++ b/docs.it4i/src/ompi/oshmem_max_reduction.c
+/*
+ * Copyright (c) 2014-2016 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * reduce [0,1,2] + _my_pe() across 4 PEs with MAX()
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <shmem.h>
+
+#define N 3
+
+/* Number of elements the reduction work array must provide:
+   max(N/2 + 1, SHMEM_REDUCE_MIN_WRKDATA_SIZE). */
+#define PWRK_SIZE ((N / 2 + 1) > SHMEM_REDUCE_MIN_WRKDATA_SIZE ? \
+                   (N / 2 + 1) : SHMEM_REDUCE_MIN_WRKDATA_SIZE)
+
+/* File-scope buffers handed to the reduction call below. */
+long src[N];
+long dst[N];
+
+/* The sync array is used by a reduction, so it is sized with the
+   reduction sync size rather than the broadcast one. */
+long pSync[SHMEM_REDUCE_SYNC_SIZE];
+long pWrk[PWRK_SIZE];
+
+/*
+ * Each PE fills src with my_pe + i (i = 0..N-1), all PEs perform a MAX
+ * reduction over every PE, and each PE prints the resulting dst array.
+ */
+int  main(void)
+{
+    int i;
+    int my_pe, num_pes;
+
+    /* Every element of the sync array must hold the sync value before
+       the array is first used by a collective. */
+    for (i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i += 1) {
+        pSync[i] = SHMEM_SYNC_VALUE;
+    }
+
+    shmem_init();
+
+    my_pe = shmem_my_pe();
+    num_pes = shmem_n_pes();
+
+    for (i = 0; i < N; i += 1) {
+        src[i] = my_pe + i;
+    }
+
+    /* Make sure every PE has initialised src and pSync before reducing. */
+    shmem_barrier_all();
+
+    /* Active set: start at PE 0, stride 2^0, num_pes members. */
+    shmem_long_max_to_all(dst, src, N, 0, 0, num_pes, pWrk, pSync);
+
+    printf("%d/%d dst =", my_pe, num_pes);
+
+    for (i = 0; i < N; i+= 1) {
+        printf(" %ld", dst[i]);
+    }
+
+    printf("\n");
+    shmem_finalize();
+
+    return 0;
+}
+
--- a/docs.it4i/src/ompi/oshmem_shmalloc.c
+++ b/docs.it4i/src/ompi/oshmem_shmalloc.c
+/*
+ * Copyright (c) 2014-2016 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This sample allocates (shmalloc) symmetric memory (1 long integer),
+ * and then frees it. Success of allocation is not checked.
+ *
+ * Produces no output.
+ */
+
+#include <shmem.h>
+
+/*
+ * Allocates room for one long with shmem_malloc and immediately releases
+ * it again with shmem_free.  The result of the allocation is not checked.
+ * Returns 0 explicitly, like the other example programs.
+ */
+int main(void)
+{
+    long *x;
+
+    shmem_init();
+
+    x = (long *) shmem_malloc(sizeof(*x));
+
+    shmem_free(x);
+
+    shmem_finalize();
+
+    return 0;
+}
+
--- a/docs.it4i/src/ompi/oshmem_strided_puts.c
+++ b/docs.it4i/src/ompi/oshmem_strided_puts.c
+/*
+ * Copyright (c) 2014-2016 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This program is an adaptation of examples found in the man pages
+ * of SGI’s SHMEM implementation.
+ *
+ * In this program, iput is used to select 5 elements from array source separated by
+ * a stride of 2 and write them to array target using a stride of 1.
+ *
+ * Given the array source = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }
+ * iput will select 5 elements from array source on PE 0, using a stride of 2:
+ *
+ * selected elements = { 1, 3, 5, 7, 9 }
+ *
+ * These elements will then be written to the array target on PE 1 using a stride of 1:
+ *
+ * target = { 1, 3, 5, 7, 9 }
+ *
+ */
+
+#include <stdio.h>
+#include <shmem.h>
+
+/*
+ * PE 0 writes every second element of its local array source (5 elements
+ * in total) into consecutive elements of target on PE 1; PE 1 then prints
+ * the first five elements of target.  At least two PEs are needed: with
+ * fewer, the put is skipped and PE 0 prints a diagnostic instead.
+ */
+int main(void)
+{
+    /* source is an ordinary local array; target is static. */
+    short source[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    static short target[10];
+    int me;
+
+    shmem_init();
+    me = shmem_my_pe();
+
+    if (me == 0) {
+        if (shmem_n_pes() > 1) {
+            /* put 5 elements into target on PE 1:
+               target stride 1, source stride 2 */
+            shmem_short_iput(target, source, 1, 2, 5, 1);
+        } else {
+            /* There is no PE 1 to write to. */
+            fprintf(stderr, "This example needs at least 2 PEs\n");
+        }
+    }
+
+    shmem_barrier_all(); /* sync sender and receiver */
+
+    if (me == 1) {
+        printf("target on PE %d is %hd %hd %hd %hd %hd\n", me,
+        target[0], target[1], target[2],
+        target[3], target[4] );
+    }
+    shmem_barrier_all(); /* sync before exiting */
+    shmem_finalize();
+
+    return 0;
+}
--- a/docs.it4i/src/ompi/oshmem_symmetric_data.c
+++ b/docs.it4i/src/ompi/oshmem_symmetric_data.c
+/*
+ * Copyright (c) 2014-2016 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <stdio.h>
+#include <shmem.h>
+
+#define SIZE 16
+
+/*
+ * PE 0 fills a local array with 0..SIZE-1 and puts it into target on every
+ * other PE; each PE other than 0 then prints the contents of its target.
+ */
+int main(int argc, char* argv[])
+{
+    /* local, not symmetric */
+    short source[SIZE];
+    /* static makes it symmetric */
+    static short target[SIZE];
+    int i;
+    int num_pe, my_pe;
+
+    shmem_init();
+
+    num_pe = shmem_n_pes();
+    my_pe = shmem_my_pe();
+
+    if (my_pe == 0) {
+        /* initialize array */
+        for(i = 0; i < SIZE; i++) {
+            source[i] = i;
+        }
+        /* put SIZE elements into target on each PE except PE 0 itself
+           (the loop starts at 1) */
+        for(i = 1; i < num_pe; i++) {
+            shmem_short_put(target, source, SIZE, i);
+        }
+    }
+
+    shmem_barrier_all(); /* sync sender and receiver */
+
+    /* PE 0 never receives a put, so only the other PEs print. */
+    if (my_pe != 0) {
+        printf("Target on PE %d is \t", my_pe);
+
+        for(i = 0; i < SIZE; i++) {
+            printf("%hd \t", target[i]);
+        }
+        printf("\n");
+    }
+
+    shmem_barrier_all(); /* sync before exiting */
+    shmem_finalize();
+
+    return 0;
+}
--- a/docs.it4i/src/ompi/ring_c.c
+++ b/docs.it4i/src/ompi/ring_c.c
+/*
+ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+ *
+ * Simple ring test program in C.
+ */
+
+#include <stdio.h>
+#include "mpi.h"
+
+/*
+ * Passes an integer around a ring of MPI processes.  Rank 0 starts with
+ * the value 10 and decrements it each time the message comes back; every
+ * rank forwards what it receives and leaves the loop once it has
+ * forwarded a 0.
+ */
+int main(int argc, char *argv[])
+{
+    int rank, size, next, prev, message, tag = 201;
+
+    /* Start up MPI */
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    /* Calculate the rank of the next process in the ring.  Use the
+       modulus operator so that the last process "wraps around" to
+       rank zero.  Adding size before subtracting keeps the operand of
+       % non-negative for rank 0. */
+
+    next = (rank + 1) % size;
+    prev = (rank + size - 1) % size;
+
+    /* If we are the "master" process (i.e., MPI_COMM_WORLD rank 0),
+       put the number of times to go around the ring in the
+       message. */
+
+    if (0 == rank) {
+        message = 10;
+
+        printf("Process 0 sending %d to %d, tag %d (%d processes in ring)\n",
+               message, next, tag, size);
+        MPI_Send(&message, 1, MPI_INT, next, tag, MPI_COMM_WORLD);
+        printf("Process 0 sent to %d\n", next);
+    }
+
+    /* Pass the message around the ring.  The exit mechanism works as
+       follows: the message (a positive integer) is passed around the
+       ring.  Each time it passes rank 0, it is decremented.  When
+       each process receives a message containing a 0 value, it
+       passes the message on to the next process and then quits.  By
+       passing the 0 message first, every process gets the 0 message
+       and can quit normally. */
+
+    while (1) {
+        MPI_Recv(&message, 1, MPI_INT, prev, tag, MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+
+        if (0 == rank) {
+            --message;
+            printf("Process 0 decremented value: %d\n", message);
+        }
+
+        /* Forward first, test afterwards: the 0 must be sent on before
+           this process leaves the loop. */
+        MPI_Send(&message, 1, MPI_INT, next, tag, MPI_COMM_WORLD);
+        if (0 == message) {
+            printf("Process %d exiting\n", rank);
+            break;
+        }
+    }
+
+    /* The last process does one extra send to process 0, which needs
+       to be received before the program can exit */
+
+    if (0 == rank) {
+        MPI_Recv(&message, 1, MPI_INT, prev, tag, MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+    }
+
+    /* All done */
+
+    MPI_Finalize();
+    return 0;
+}
--- a/docs.it4i/src/ompi/ring_cxx.cc
+++ b/docs.it4i/src/ompi/ring_cxx.cc
+//
+// Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+//                         University Research and Technology
+//                         Corporation.  All rights reserved.
+// Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+//
+// Simple ring test program in C++.
+//
+// NOTE: The MPI C++ bindings were deprecated in MPI-2.2 and removed
+// from the standard in MPI-3.  Open MPI still provides C++ MPI
+// bindings, but they are no longer built by default (and may be
+// removed in a future version of Open MPI).  You must
+// --enable-mpi-cxx when configuring Open MPI to enable the MPI C++
+// bindings.
+//
+
+#include "mpi.h"
+#include <iostream>
+
+//
+// Passes an integer around a ring of MPI processes using the MPI C++
+// bindings.  Rank 0 starts with the value 10 and decrements it each time
+// the message comes back; every rank forwards what it receives and
+// leaves the loop once it has forwarded a 0.
+//
+int main(int argc, char *argv[])
+{
+    int rank, size, next, prev, message, tag = 201;
+
+    // Start up MPI
+
+    MPI::Init();
+    rank = MPI::COMM_WORLD.Get_rank();
+    size = MPI::COMM_WORLD.Get_size();
+
+    // Calculate the rank of the next process in the ring.  Use the
+    // modulus operator so that the last process "wraps around" to
+    // rank zero.  Adding size before subtracting keeps the operand of
+    // % non-negative for rank 0.
+
+    next = (rank + 1) % size;
+    prev = (rank + size - 1) % size;
+
+    // If we are the "master" process (i.e., MPI_COMM_WORLD rank 0),
+    // put the number of times to go around the ring in the message.
+
+    if (0 == rank) {
+        message = 10;
+
+        std::cout << "Process 0 sending " << message << " to " << next
+                  << ", tag " << tag << " (" << size << " processes in ring)"
+                  << std::endl;
+        MPI::COMM_WORLD.Send(&message, 1, MPI::INT, next, tag);
+        std::cout << "Process 0 sent to " << next << std::endl;
+    }
+
+    // Pass the message around the ring.  The exit mechanism works as
+    // follows: the message (a positive integer) is passed around the
+    // ring.  Each time it passes rank 0, it is decremented.  When
+    // each process receives a message containing a 0 value, it
+    // passes the message on to the next process and then quits.  By
+    // passing the 0 message first, every process gets the 0 message
+    // and can quit normally.
+
+    while (1) {
+        MPI::COMM_WORLD.Recv(&message, 1, MPI::INT, prev, tag);
+
+        if (0 == rank) {
+            --message;
+            std::cout << "Process 0 decremented value: " << message
+                      << std::endl;
+        }
+
+        // Forward first, test afterwards: the 0 must be sent on before
+        // this process leaves the loop.
+        MPI::COMM_WORLD.Send(&message, 1, MPI::INT, next, tag);
+        if (0 == message) {
+            std::cout << "Process " << rank << " exiting" << std::endl;
+            break;
+        }
+    }
+
+    // The last process does one extra send to process 0, which needs
+    // to be received before the program can exit
+
+    if (0 == rank) {
+        MPI::COMM_WORLD.Recv(&message, 1, MPI::INT, prev, tag);
+    }
+
+    // All done
+
+    MPI::Finalize();
+    return 0;
+}
--- a/docs.it4i/src/ompi/ring_mpifh.f
+++ b/docs.it4i/src/ompi/ring_mpifh.f
+C
+C Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+C                         University Research and Technology
+C                         Corporation.  All rights reserved.
+C Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
+C $COPYRIGHT$
+C
+C Simple ring test program using the mpif.h Fortran bindings.
+C
+C     Passes an integer around a ring of MPI processes.  Rank 0 starts
+C     with the value 10 and decrements it each time the message comes
+C     back; every rank forwards what it receives and jumps out of the
+C     receive/send loop (labels 10 and 20) once it has forwarded a 0.
+      program ring_f77
+      implicit none
+      include 'mpif.h'
+      integer rank, size, tag, next, from, message, ierr
+
+C     Start up MPI
+
+      call MPI_INIT(ierr)
+      call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
+      call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
+
+C     Calculate the rank of the next process in the ring.  Use the
+C     modulus operator so that the last process "wraps around" to rank
+C     zero.
+
+      tag = 201
+      next = mod((rank + 1), size)
+      from = mod((rank + size - 1), size)
+
+C     If we are the "master" process (i.e., MPI_COMM_WORLD rank 0), put
+C     the number of times to go around the ring in the message.
+
+      if (rank .eq. 0) then
+         message = 10
+
+         write(*, '("Process 0 sending ", i2, " to ", i2, " tag ",
+     &        i3, " (", i2, " processes in ring)")')
+     &        message, next, tag, size
+         call MPI_SEND(message, 1, MPI_INTEGER, next, tag,
+     &        MPI_COMM_WORLD, ierr)
+         write(*, '("Process 0 sent to ", i2)')
+     &        next
+      endif
+
+C     Pass the message around the ring.  The exit mechanism works as
+C     follows: the message (a positive integer) is passed around the
+C     ring.  Each time it passes rank 0, it is decremented.  When each
+C     process receives a message containing a 0 value, it passes the
+C     message on to the next process and then quits.  By passing the 0
+C     message first, every process gets the 0 message and can quit
+C     normally.
+
+C     Label 10 is the top of the receive/forward loop.
+ 10   call MPI_RECV(message, 1, MPI_INTEGER, from, tag,
+     &     MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+      if (rank .eq. 0) then
+         message = message - 1
+         write(*, '("Process 0 decremented value: ", i2)') message
+      endif
+
+      call MPI_SEND(message, 1, MPI_INTEGER, next, tag,
+     &     MPI_COMM_WORLD, ierr)
+
+C     The 0 has already been forwarded above, so it is safe to leave.
+      if (message .eq. 0) then
+         write(*, '("Process ", i2, " exiting")') rank
+         goto 20
+      endif
+      goto 10
+
+C     The last process does one extra send to process 0, which needs to
+C     be received before the program can exit
+
+ 20   if (rank .eq. 0) then
+         call MPI_RECV(message, 1, MPI_INTEGER, from, tag,
+     &        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+      endif
+
+C     All done
+
+      call MPI_FINALIZE(ierr)
+      end
+
--- a/docs.it4i/src/ompi/ring_oshmem_c.c
+++ b/docs.it4i/src/ompi/ring_oshmem_c.c
+/*
+ * Copyright (c) 2014      Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include <shmem.h>
+#include <stdio.h>
+
+#if !defined(OSHMEM_SPEC_VERSION) || OSHMEM_SPEC_VERSION < 10200
+#error This application uses API 1.2 and up
+#endif
+
+/*
+ * Passes a counter around a ring of PEs with one-sided puts.  Each PE
+ * waits until its static rbuf equals the value it expects next, forwards
+ * the (on PE 0: decremented) value to the next PE, and stops once its
+ * local message value has reached 0.
+ */
+int main (int argc, char * argv[])
+{
+    /* rbuf is the target of the remote puts; it starts at -1, a value
+       that no PE ever waits for. */
+    static int rbuf = -1;
+    int proc, nproc, next;
+    /* On every PE, message holds the value that PE expects to see next. */
+    int message = 10;
+
+    shmem_init();
+    nproc = shmem_n_pes();
+    proc = shmem_my_pe();
+
+    /* Calculate the PE number of the next process in the ring.  Use the
+       modulus operator so that the last process "wraps around" to PE 0. */
+
+    next = (proc + 1) % nproc;
+
+    if(proc == 0)
+    {
+        printf("Process 0 puts message %d to %d (%d processes in ring)\n", message, next, nproc);
+        shmem_int_put(&rbuf, &message, 1, next);
+    }
+
+    /* Pass the message around the ring.  The exit mechanism works as
+       follows: the message (a positive integer) is passed around the
+       ring.  Each time it passes PE 0, it is decremented.  When each
+       process receives a message containing a 0 value, it passes the
+       message on to the next process and then quits.  By passing the 0
+       message first, every process gets the 0 message and can quit
+       normally. */
+
+    while(message > 0) {
+        /* Block until the previous PE has written the expected value. */
+        shmem_int_wait_until(&rbuf, SHMEM_CMP_EQ, message);
+        if(proc == 0) {
+            /* PE 0 decrements before forwarding ... */
+            --message;
+            printf("Process 0 decremented value: %d\n", message);
+        }
+        shmem_int_put(&rbuf, &message, 1, next);
+        if(proc != 0) {
+            /* ... every other PE forwards the value unchanged and then
+               lowers the value it will wait for on the next lap. */
+            --message;
+        }
+    }
+    shmem_finalize();
+
+    /* All done; note that the exit line is printed after
+       shmem_finalize() has been called. */
+
+    printf("Process %d exiting\n", proc);
+
+    return 0;
+}
--- a/docs.it4i/src/ompi/ring_oshmemfh.f90
+++ b/docs.it4i/src/ompi/ring_oshmemfh.f90
+!
+! Copyright (c) 2014      Mellanox Technologies, Inc.
+!                         All rights reserved.
+! Copyright (c) 2014 Cisco Systems, Inc.  All rights reserved.
+! $COPYRIGHT$
+!
+! Additional copyrights may follow
+!
+! $HEADER$
+!
+
+! Passes an 8-byte integer counter around a ring of PEs with one-sided
+! puts.  Each PE waits until rbuf equals the value it expects next,
+! forwards the (on PE 0: decremented) value to the next PE, and stops
+! once its local message value has reached 0.
+program ring_oshmem
+    implicit none
+    include 'shmem.fh'
+
+    ! rbuf is the target of the remote puts.
+    integer*8, save   :: rbuf
+    ! On every PE, message holds the value that PE expects to see next.
+    integer*8         :: message
+    integer           :: proc, nproc, next
+    ! Return types of the two query functions called below.
+    integer           :: my_pe, num_pes
+
+    ! -1 is a value that no PE ever waits for.
+    rbuf = -1
+    message = 10
+
+    call start_pes(0)
+    proc = my_pe()
+    nproc = num_pes()
+
+!   Calculate the PE number of the next process in the ring.  Use the
+!   modulus operator so that the last process "wraps around" to PE 0.
+
+    next = mod((proc + 1), nproc)
+
+    if (proc .eq. 0) then
+        write(*, '("Process 0 sending ", i2, " to", i2, " (", i2, " processes in ring)")') message, next, nproc
+        call shmem_put8(rbuf, message, 1, next)
+        write(*, '("Process 0 sent to ", i2)') next
+    end if
+
+!   Pass the message around the ring.  The exit mechanism works as
+!   follows: the message (a positive integer) is passed around the
+!   ring.  Each time it passes PE 0, it is decremented.  When each
+!   process receives a message containing a 0 value, it passes the
+!   message on to the next process and then quits.  By passing the 0
+!   message first, every process gets the 0 message and can quit
+!   normally.
+
+    do while (message .gt. 0)
+        ! Block until the previous PE has written the expected value.
+        call shmem_int8_wait_until(rbuf, SHMEM_CMP_EQ, message)
+
+        ! PE 0 decrements before forwarding ...
+        if (proc .eq. 0) then
+            message = message - 1
+            write(*, '("Process 0 decremented value:", i2)') message
+        end if
+
+        call shmem_put8(rbuf, message, 1, next)
+
+        ! ... every other PE forwards the value unchanged and then
+        ! lowers the value it will wait for on the next lap.
+        if (proc .gt. 0) then
+            message = message - 1
+        end if
+    end do
+
+!     All done
+!     NOTE(review): no finalize routine is called before the program ends.
+
+    write(*, '("Process", i2," exiting.")') proc
+
+end program ring_oshmem
--- a/docs.it4i/src/ompi/ring_usempi.f90
+++ b/docs.it4i/src/ompi/ring_usempi.f90
+!
+! Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+!                         University Research and Technology
+!                         Corporation.  All rights reserved.
+! Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
+! $COPYRIGHT$
+!
+! Simple ring test program using the Fortran mpi module bindings.
+!
+! Passes an integer around a ring of MPI processes.  Rank 0 starts with
+! the value 10 and decrements it each time the message comes back; every
+! rank forwards what it receives and jumps out of the receive/send loop
+! (labels 10 and 20) once it has forwarded a 0.
+program ring
+  use mpi
+  implicit none
+  integer :: rank, size, tag, next, from, ierr, i, message
+
+! Start up MPI
+
+  call MPI_INIT(ierr)
+  call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
+  call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
+
+! Calculate the rank of the next process in the ring.  Use the modulus
+! operator so that the last process "wraps around" to rank zero.
+
+  tag = 201
+  next = mod((rank + 1), size)
+  from = mod((rank + size - 1), size)
+
+! If we are the "master" process (i.e., MPI_COMM_WORLD rank 0), put
+! the number of times to go around the ring in the message.
+
+  if (rank .eq. 0) then
+     message = 10
+
+      write(*, '("Process 0 sending ", i2, " to ", i2, " tag ", i3, " (", i2, " processes in ring)")') message, next, tag, size
+      call MPI_SEND(message, 1, MPI_INTEGER, next, tag, MPI_COMM_WORLD, ierr)
+      write(*, '("Process 0 sent to ", i2)') next
+  endif
+
+! Pass the message around the ring.  The exit mechanism works as
+! follows: the message (a positive integer) is passed around the ring.
+! Each time it passes rank 0, it is decremented.  When each process
+! receives a message containing a 0 value, it passes the message on to
+! the next process and then quits.  By passing the 0 message first,
+! every process gets the 0 message and can quit normally.
+
+! i is only used as the element count (1) of the receive at label 10,
+! which is the top of the receive/forward loop.
+   i = 1
+10 call MPI_Recv(message, i, MPI_INTEGER, from, tag, MPI_COMM_WORLD, &
+                 MPI_STATUS_IGNORE, ierr)
+
+  if (rank .eq. 0) then
+     message = message - 1
+     write(*, '("Process 0 decremented value: ", i2)') message
+  endif
+
+  call MPI_SEND(message, 1, MPI_INTEGER, next, tag, MPI_COMM_WORLD, ierr)
+
+! The 0 has already been forwarded above, so it is safe to leave.
+  if (message .eq. 0) then
+     write(*, '("Process ", i2, " exiting")') rank
+     goto 20
+  endif
+  goto 10
+
+! The last process does one extra send to process 0, which needs to be
+! received before the program can exit
+
+ 20 if (rank .eq. 0) then
+     call MPI_RECV(message, 1, MPI_INTEGER, from, tag, MPI_COMM_WORLD, &
+                   MPI_STATUS_IGNORE, ierr)
+  endif
+
+! All done
+
+  call MPI_FINALIZE(ierr)
+end program
+
--- a/docs.it4i/src/ompi/ring_usempif08.f90
+++ b/docs.it4i/src/ompi/ring_usempif08.f90
+! -*- f90 -*-
+!
+! Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+!                         University Research and Technology
+!                         Corporation.  All rights reserved.
+! Copyright (c) 2006-2015 Cisco Systems, Inc.  All rights reserved.
+! Copyright (c) 2009-2012 Los Alamos National Security, LLC.
+!                         All rights reserved.
+! $COPYRIGHT$
+!
+! Simple ring test program using the Fortran mpi_f08 module bindings.
+!
+! Passes an integer around a ring of MPI processes.  Rank 0 starts with
+! the value 10 and decrements it each time the message comes back; every
+! rank forwards what it receives and jumps out of the receive/send loop
+! (labels 10 and 20) once it has forwarded a 0.  All MPI calls are made
+! without an error argument.
+program ring
+  use mpi_f08
+  implicit none
+  integer :: rank, size, tag, next, from, i, message
+
+
+! Start up MPI
+
+  call MPI_INIT()
+  call MPI_COMM_RANK(MPI_COMM_WORLD, rank)
+  call MPI_COMM_SIZE(MPI_COMM_WORLD, size)
+
+! Calculate the rank of the next process in the ring.  Use the modulus
+! operator so that the last process "wraps around" to rank zero.
+
+  tag = 201
+  next = mod((rank + 1), size)
+  from = mod((rank + size - 1), size)
+
+! If we are the "master" process (i.e., MPI_COMM_WORLD rank 0), put
+! the number of times to go around the ring in the message.
+
+  if (rank .eq. 0) then
+     message = 10
+
+     write(*, '("Process 0 sending ", i2, " to ", i2, " tag ", i3, " (", i2, " processes in ring)")') message, next, tag, size
+     call MPI_SEND(message, 1, MPI_INTEGER, next, tag, MPI_COMM_WORLD)
+     write(*, '("Process 0 sent to ", i2)') next
+  endif
+
+! Pass the message around the ring.  The exit mechanism works as
+! follows: the message (a positive integer) is passed around the ring.
+! Each time it passes rank 0, it is decremented.  When each process
+! receives a message containing a 0 value, it passes the message on to
+! the next process and then quits.  By passing the 0 message first,
+! every process gets the 0 message and can quit normally.
+
+! i is only used as the element count (1) of the receive at label 10,
+! which is the top of the receive/forward loop.
+   i = 1
+10 call MPI_Recv(message, i, MPI_INTEGER, from, tag, MPI_COMM_WORLD, &
+                 MPI_STATUS_IGNORE)
+
+  if (rank .eq. 0) then
+     message = message - 1
+     write(*, '("Process 0 decremented value: ", i2)') message
+  endif
+
+  call MPI_SEND(message, 1, MPI_INTEGER, next, tag, MPI_COMM_WORLD)
+
+! The 0 has already been forwarded above, so it is safe to leave.
+  if (message .eq. 0) then
+     write(*, '("Process ", i2, " exiting")') rank
+     goto 20
+  endif
+  goto 10
+
+! The last process does one extra send to process 0, which needs to be
+! received before the program can exit
+
+ 20 if (rank .eq. 0) then
+     call MPI_RECV(message, 1, MPI_INTEGER, from, tag, MPI_COMM_WORLD, &
+                   MPI_STATUS_IGNORE)
+  endif
+
+! All done
+
+  call MPI_FINALIZE()
+end program
+
--- a/docs.it4i/src/ompi/spc_example.c
+++ b/docs.it4i/src/ompi/spc_example.c
+/*
+ * Copyright (c) 2018      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ *
+ * Simple example usage of SPCs through MPI_T.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpi.h"
+
+/* Sends 'num_messages' messages of 'message_size' bytes from rank 0 to rank 1.
+ * All messages are sent with blocking MPI_Send calls and with the same tag
+ * in MPI_COMM_WORLD.
+ */
+/* Rank 0 sends and rank 1 receives 'num_messages' messages of
+ * 'message_size' bytes each; any other rank only allocates and frees the
+ * buffer.  A negative 'message_size' or a failed buffer allocation aborts
+ * the job through MPI_Abort instead of handing a bad buffer to MPI.
+ */
+void message_exchange(int num_messages, int message_size)
+{
+    int i, rank;
+    char *data;
+    MPI_Status status;
+
+    if(message_size < 0) {
+        fprintf(stderr, "ERROR: message_size must not be negative.\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    /* Use calloc to initialize data to 0's.  At least one byte is
+     * requested so that a NULL result always means a real failure. */
+    data = calloc(message_size > 0 ? (size_t)message_size : 1, sizeof *data);
+    if(NULL == data) {
+        fprintf(stderr, "ERROR: Couldn't allocate the message buffer.\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    if(rank == 0) {
+        for(i = 0; i < num_messages; i++) {
+            MPI_Send(data, message_size, MPI_BYTE, 1, 123, MPI_COMM_WORLD);
+        }
+    } else if(rank == 1) {
+        for(i = 0; i < num_messages; i++) {
+            MPI_Recv(data, message_size, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &status);
+        }
+    }
+
+    free(data);
+}
+
+/* Reports a failed MPI_T call on stderr and aborts the job; 'what' names
+ * the call for the message.  Does nothing when 'ret' is MPI_SUCCESS. */
+static void check_mpi_t(int ret, const char *what)
+{
+    if(MPI_SUCCESS != ret) {
+        fprintf(stderr, "ERROR: %s failed with error code %d.\n", what, ret);
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+}
+
+/* Looks up the bytes-sent (rank 0) or bytes-received (rank 1) counter among
+ * the MPI_T performance variables, runs the message exchange described by
+ * the two command line arguments, and prints the counter value per rank.
+ *
+ * Returns -1 without starting MPI when the arguments are missing or
+ * negative, 0 otherwise.  Must be run with exactly two processes.
+ */
+int main(int argc, char **argv)
+{
+    int num_messages, message_size;
+
+    if(argc < 3) {
+        printf("Usage: mpirun -np 2 --mca mpi_spc_attach all --mca mpi_spc_dump_enabled true ./spc_example [num_messages] [message_size]\n");
+        return -1;
+    } else {
+        num_messages = atoi(argv[1]);
+        message_size = atoi(argv[2]);
+    }
+
+    /* A negative size would otherwise end up as a buffer length. */
+    if(num_messages < 0 || message_size < 0) {
+        fprintf(stderr, "ERROR: num_messages and message_size must not be negative.\n");
+        return -1;
+    }
+
+    int i, rank, size, provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count, index;
+    int ret;
+    long long value = 0;
+    MPI_Datatype datatype;
+    MPI_T_enum enumtype;
+    char name[256], description[256];
+
+    /* Counter names to be read by ranks 0 and 1 */
+    char *counter_names[] = {"runtime_spc_OMPI_BYTES_SENT_USER",
+                             "runtime_spc_OMPI_BYTES_RECEIVED_USER" };
+
+    MPI_Init(NULL, NULL);
+    check_mpi_t(MPI_T_init_thread(MPI_THREAD_SINGLE, &provided), "MPI_T_init_thread");
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    /* counter_names has two entries, so ranks beyond 1 have no counter. */
+    if(size != 2) {
+        fprintf(stderr, "ERROR: This test should be run with two MPI processes.\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    /* Determine the MPI_T pvar indices for the OMPI_BYTES_SENT/RECEIVED_USER SPCs */
+    index = -1;
+    check_mpi_t(MPI_T_pvar_get_num(&num), "MPI_T_pvar_get_num");
+    for(i = 0; i < num; i++) {
+        name_len = desc_len = 256;
+        ret = MPI_T_pvar_get_info(i, name, &name_len, &verbosity,
+                                  &var_class, &datatype, &enumtype, description, &desc_len, &bind,
+                                  &readonly, &continuous, &atomic);
+        /* A failed query leaves 'name' holding the previous variable's
+         * name; skip the index instead of comparing stale data. */
+        if(MPI_SUCCESS != ret) {
+            continue;
+        }
+        if(strcmp(name, counter_names[rank]) == 0) {
+            index = i;
+            printf("[%d] %s -> %s\n", rank, name, description);
+        }
+    }
+
+    /* Make sure we found the counters */
+    if(index == -1) {
+        fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    MPI_T_pvar_session session;
+    MPI_T_pvar_handle handle;
+    /* Create the MPI_T sessions/handles for the counters and start the counters */
+    check_mpi_t(MPI_T_pvar_session_create(&session), "MPI_T_pvar_session_create");
+    check_mpi_t(MPI_T_pvar_handle_alloc(session, index, NULL, &handle, &count),
+                "MPI_T_pvar_handle_alloc");
+    /* A variable that cannot be started or stopped is reported with
+     * MPI_T_ERR_PVAR_NO_STARTSTOP; that is not treated as a failure. */
+    ret = MPI_T_pvar_start(session, handle);
+    if(MPI_T_ERR_PVAR_NO_STARTSTOP != ret) {
+        check_mpi_t(ret, "MPI_T_pvar_start");
+    }
+
+    message_exchange(num_messages, message_size);
+
+    check_mpi_t(MPI_T_pvar_read(session, handle, &value), "MPI_T_pvar_read");
+    /* Print the counter values in order by rank */
+    for(i = 0; i < 2; i++) {
+        if(i == rank) {
+            printf("[%d] Value Read: %lld\n", rank, value);
+            fflush(stdout);
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    /* Stop the MPI_T session, free the handle, and then free the session */
+    ret = MPI_T_pvar_stop(session, handle);
+    if(MPI_T_ERR_PVAR_NO_STARTSTOP != ret) {
+        check_mpi_t(ret, "MPI_T_pvar_stop");
+    }
+    check_mpi_t(MPI_T_pvar_handle_free(session, &handle), "MPI_T_pvar_handle_free");
+    check_mpi_t(MPI_T_pvar_session_free(&session), "MPI_T_pvar_session_free");
+
+    MPI_T_finalize();
+    MPI_Finalize();
+
+    return 0;
+}
--- a/docs.it4i/src/qnn_example.txt
+++ b/docs.it4i/src/qnn_example.txt
+#!/usr/bin/env python
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+import torch
+from torch.autograd import Function
+from torchvision import datasets, transforms
+import torch.optim as optim
+import torch.nn as nn
+import torch.nn.functional as F
+
+import cudaq
+from cudaq import spin
+
+
+
+
+# GPU utilities
+for tar in cudaq.get_targets():
+    print(f'{tar.description} {tar.name} {tar.platform} {tar.simulator} {tar.num_qpus}')
+cudaq.set_target("default")  # Set CUDAQ to run on GPU's
+torch.cuda.is_available(
+)  # If this is True then the NVIDIA drivers are correctly installed
+
+torch.cuda.device_count()  # Counts the number of GPU's available
+
+torch.cuda.current_device()
+
+torch.cuda.get_device_name(0)
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
+
+
+# Training set
+sample_count = 140
+
+X_train = datasets.FashionMNIST(
+    root="./data",
+    train=True,
+    download=True,
+    transform=transforms.Compose([transforms.ToTensor()]),
+)
+
+# Leaving only labels 0 and 1
+idx = np.append(
+    np.where(X_train.targets == 0)[0][:sample_count],
+    np.where(X_train.targets == 1)[0][:sample_count],
+)
+X_train.data = X_train.data[idx]
+X_train.targets = X_train.targets[idx]
+train_loader = torch.utils.data.DataLoader(X_train, batch_size=1, shuffle=True)
+
+# Test set
+sample_count = 70
+
+X_test = datasets.FashionMNIST(
+    root="./data",
+    train=False,
+    download=True,
+    transform=transforms.Compose([transforms.ToTensor()]),
+)
+idx = np.append(
+    np.where(X_test.targets == 0)[0][:sample_count],
+    np.where(X_test.targets == 1)[0][:sample_count],
+)
+
+X_test.data = X_test.data[idx]
+X_test.targets = X_test.targets[idx]
+
+test_loader = torch.utils.data.DataLoader(X_test, batch_size=1, shuffle=True)
+
+
+class QuantumCircuit:
+    """This class defines the quantum circuit structure and the run method which is used to calculate an expectation value"""
+
+    def __init__(self, qubit_count: int):
+        """Define the quantum circuit in CUDA Quantum"""
+
+        kernel, thetas = cudaq.make_kernel(list)
+
+        self.kernel = kernel
+
+        self.theta = thetas
+
+        qubits = kernel.qalloc(qubit_count)
+
+        self.kernel.h(qubits)
+
+        # Variational gate parameters which are optimised during training
+        kernel.ry(thetas[0], qubits[0])
+        kernel.rx(thetas[1], qubits[0])
+
+    def run(self, thetas: torch.tensor) -> torch.tensor:
+        """Excetute the quantum circuit to output an expectation value"""
+
+        expectation = torch.tensor(cudaq.observe(self.kernel, spin.z(0),
+                                                 thetas).expectation_z(),
+                                   device=device)
+
+        return expectation
+
+
+
+
+class QuantumFunction(Function):
+    """Allows the quantum circuit to pass data through it and compute the gradients"""
+
+    @staticmethod
+    def forward(ctx, thetas: torch.tensor, quantum_circuit,
+                shift) -> torch.tensor:
+        # Save shift and quantum_circuit in context to use in backward
+        ctx.shift = shift
+        ctx.quantum_circuit = quantum_circuit
+
+        # Calculate exp_val
+        expectation_z = ctx.quantum_circuit.run(thetas)
+
+        ctx.save_for_backward(thetas, expectation_z)
+
+        return expectation_z
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Backward pass computation via finite difference parameter shift"""
+
+        thetas, expectation_z = ctx.saved_tensors
+
+        gradients = torch.zeros(len(thetas), device=device)
+
+        for i in range(len(thetas)):
+            shift_right = torch.clone(thetas)
+
+            shift_right[i] += ctx.shift
+
+            shift_left = torch.clone(thetas)
+
+            shift_left[i] -= ctx.shift
+
+            expectation_right = ctx.quantum_circuit.run(shift_right)
+            expectation_left = ctx.quantum_circuit.run(shift_left)
+
+            gradients[i] = 0.5 * (expectation_right - expectation_left)
+
+        return gradients * grad_output.float(), None, None
+
+
+
+class QuantumLayer(nn.Module):
+    """Encapsulates a quantum circuit and a quantum function into a quantum layer"""
+
+    def __init__(self, shift: torch.tensor):
+        super(QuantumLayer, self).__init__()
+        self.quantum_circuit = QuantumCircuit(1)  # 1 qubit quantum circuit
+        self.shift = shift
+
+    def forward(self, input):
+        ans = QuantumFunction.apply(input, self.quantum_circuit, self.shift)
+
+        return ans
+
+
+
+class Net(nn.Module):
+
+    def __init__(self):
+        super(Net, self).__init__()
+
+        # Neural network structure
+        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
+        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
+        self.dropout = nn.Dropout2d()
+        self.fc1 = nn.Linear(256, 64)
+        self.fc2 = nn.Linear(
+            64, 2
+        )  # Output a 2D tensor since we have 2 variational parameters in our quantum circuit
+        self.hybrid = QuantumLayer(
+            torch.tensor(np.pi / 2)
+        )  # Input is the magnitude of the parameter shifts to calculate gradients
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2)
+        x = self.dropout(x)
+        x = x.view(1, -1)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x).reshape(
+            -1)  # Reshapes required to satisfy input dimensions to CUDAQ
+        x = self.hybrid(x).reshape(-1)
+
+        return torch.cat((x, 1 - x), -1).unsqueeze(0)
+
+
+
+
+# We move our model to the CUDA device to minimise data transfer between GPU and CPU
+
+model = Net().to(device)
+print(model)
+optimizer = optim.Adam(model.parameters(), lr=0.001)
+
+loss_func = nn.NLLLoss().to(device)
+
+epochs = 20
+
+epoch_loss = []
+
+model.train()
+for epoch in range(epochs):
+    batch_loss = 0.0
+    for batch_idx, (data, target) in enumerate(train_loader):  # batch training
+        optimizer.zero_grad()
+
+        data, target = data.to(device), target.to(device)
+
+        # Forward pass
+        output = model(data).to(device)
+        # Calculating loss
+        loss = loss_func(output, target).to(device)
+
+        # Backward pass
+        loss.backward()
+
+        # Optimize the weights
+        optimizer.step()
+
+        batch_loss += loss.item()
+
+    epoch_loss.append(batch_loss / batch_idx)
+
+    print("Training [{:.0f}%]\tLoss: {:.4f}".format(
+        100.0 * (epoch + 1) / epochs, epoch_loss[-1]))
+
+
+
+
+plt.plot(epoch_loss)
+plt.title("Hybrid NN Training Convergence")
+plt.xlabel("Training Iterations")
+
+plt.ylabel("Neg Log Likelihood Loss")
+
+
+
+
+# Testing on the test set
+
+model.eval()
+with torch.no_grad():
+    correct = 0
+    for batch_idx, (data, target) in enumerate(test_loader):
+        data, target = data.to(device), target.to(device)
+
+        output = model(data).to(device)
+
+        pred = output.argmax(dim=1, keepdim=True)
+        correct += pred.eq(target.view_as(pred)).sum().item()
+
+        loss = loss_func(output, target)
+        epoch_loss.append(loss.item())
+
+    print("Performance on test data:\n\tAccuracy: {:.1f}%".format(
+        correct / len(test_loader) * 100))
+
No results found