Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mandelbrot
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Branislav Jansik
mandelbrot
Commits
9f07b53c
Commit
9f07b53c
authored
Feb 26, 2024
by
jansik
Browse files
Options
Downloads
Patches
Plain Diff
NEON FMLA for the ARM AArch64 architecture
parent
fb8c9280
Branches
Branches containing commit
Tags
build-84afddea8f5e98b3b4e176597c946f3ce8fe5f84
Tags containing commit
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
jansik-real-fmla-neon-f64-omp.c
+254
-0
254 additions, 0 deletions
jansik-real-fmla-neon-f64-omp.c
with
254 additions
and
0 deletions
jansik-real-fmla-neon-f64-omp.c
0 → 100644
+
254
−
0
View file @
9f07b53c
//by Branislav Jansik, @IT4Innovations, 2024
//
#include
<stdio.h>
#include
<stdlib.h>
#include
<omp.h>
#include
<math.h>
#define REPEAT10(x) x x x x x x x x x x
/* Cycle measurement not implemented for ARM NEON yet
static __inline__ unsigned long long int get_cycles(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long int)lo)|( ((unsigned long long int)hi)<<32 );
return 1;
}
*/
int
main
(
int
argc
,
char
**
argv
)
{
double
*
ar
,
*
br
;
double
rsum
,
rmax
,
rmin
;
double
ainc
;
double
to
,
to0
;
int
i
,
size
,
nsve
=
16
,
niter
=
1
;
//inputs
if
(
argc
>
1
)
niter
=
atoi
(
argv
[
1
]);
//find out threads and SVE vector length
size
=
omp_get_max_threads
();
//print info
printf
(
"FLIPS: Double Precision NEON (%dbit) FMLA Instructions Per Second
\n
Run %d times on %d cores
\n
"
,
8
*
nsve
,
niter
,
size
);
// allocate
posix_memalign
((
void
*
)
&
ar
,
(
size_t
)
64
,
(
size_t
)
sizeof
(
double
)
*
2
*
nsve
);
posix_memalign
((
void
*
)
&
br
,
(
size_t
)
64
,
(
size_t
)
sizeof
(
double
)
*
2
*
nsve
);
// initialize c constants within the mandelbrot set
ainc
=
0
.
7
/
((
double
)
2
*
nsve
);
for
(
i
=
0
;
i
<
2
*
nsve
;
i
++
)
ar
[
i
]
=
0
.
0
+
ainc
*
i
;
// iterations
for
(
i
=
1
;
i
<=
niter
;
i
++
)
{
rsum
=
0
.
0
;
rmax
=-
INFINITY
;
rmin
=
INFINITY
;
//fqmax=-INFINITY;
//fqmin=INFINITY;
to0
=
omp_get_wtime
();
#pragma omp parallel default(shared) reduction(+:rsum) reduction(max:rmax) reduction (min:rmin)
{
double
t0
,
t
,
flips
;
//cycle measurement not implemented for ARM yet
//unsigned long long int c0, c;
#pragma omp barrier
t0
=
omp_get_wtime
();
//c0 = get_cycles();
__asm__
(
//init
"ldr q0, [%0, 0]
\n\t
"
"ldr q2, [%0, 16]
\n\t
"
"ldr q4, [%0, 32]
\n\t
"
"ldr q6, [%0, 48]
\n\t
"
"ldr q8, [%0, 64]
\n\t
"
"ldr q10, [%0, 80]
\n\t
"
"ldr q12, [%0, 96]
\n\t
"
"ldr q14, [%0, 112]
\n\t
"
"ldr q16, [%0, 128]
\n\t
"
"ldr q18, [%0, 144]
\n\t
"
"ldr q20, [%0, 160]
\n\t
"
"ldr q22, [%0, 176]
\n\t
"
"ldr q24, [%0, 192]
\n\t
"
"ldr q26, [%0, 208]
\n\t
"
"ldr q28, [%0, 224]
\n\t
"
"ldr q30, [%0, 240]
\n\t
"
//zero out registers
"eor v1.16b, v1.16b, v1.16b
\n\t
"
"eor v3.16b, v3.16b, v3.16b
\n\t
"
"eor v5.16b, v5.16b, v5.16b
\n\t
"
"eor v7.16b, v7.16b, v7.16b
\n\t
"
"eor v9.16b, v9.16b, v9.16b
\n\t
"
"eor v11.16b, v11.16b, v11.16b
\n\t
"
"eor v13.16b, v13.16b, v13.16b
\n\t
"
"eor v15.16b, v15.16b, v15.16b
\n\t
"
"eor v17.16b, v17.16b, v17.16b
\n\t
"
"eor v19.16b, v19.16b, v19.16b
\n\t
"
"eor v21.16b, v21.16b, v21.16b
\n\t
"
"eor v23.16b, v23.16b, v23.16b
\n\t
"
"eor v25.16b, v25.16b, v25.16b
\n\t
"
"eor v27.16b, v27.16b, v27.16b
\n\t
"
"eor v29.16b, v29.16b, v29.16b
\n\t
"
"eor v31.16b, v31.16b, v31.16b
\n\t
"
//zero out loop counter
"mov x0, #0
\n\t
"
//set loop limit to 8000000 = 2000*2000
"mov x1, #2000
\n\t
"
"mul x1, x1, x1
\n\t
"
//loop
"label:
\n\t
"
"add x0, x0, #1
\n\t
"
//jansik iterations
REPEAT10
(
REPEAT10
(
//C += Z*Z;
"fmla v0.2d, v1.2d, v1.2d
\n\t
"
"fmla v2.2d, v3.2d, v3.2d
\n\t
"
"fmla v4.2d, v5.2d, v5.2d
\n\t
"
"fmla v6.2d, v7.2d, v7.2d
\n\t
"
"fmla v8.2d, v9.2d, v9.2d
\n\t
"
"fmla v10.2d, v11.2d, v11.2d
\n\t
"
"fmla v12.2d, v13.2d, v13.2d
\n\t
"
"fmla v14.2d, v15.2d, v15.2d
\n\t
"
"fmla v16.2d, v17.2d, v17.2d
\n\t
"
"fmla v18.2d, v19.2d, v19.2d
\n\t
"
"fmla v20.2d, v21.2d, v21.2d
\n\t
"
"fmla v22.2d, v23.2d, v23.2d
\n\t
"
"fmla v24.2d, v25.2d, v25.2d
\n\t
"
"fmla v26.2d, v27.2d, v27.2d
\n\t
"
"fmla v28.2d, v29.2d, v29.2d
\n\t
"
"fmla v30.2d, v31.2d, v31.2d
\n\t
"
//Z += C*C;
"fmla v1.2d, v0.2d, v0.2d
\n\t
"
"fmla v3.2d, v2.2d, v2.2d
\n\t
"
"fmla v5.2d, v4.2d, v4.2d
\n\t
"
"fmla v7.2d, v6.2d, v6.2d
\n\t
"
"fmla v9.2d, v8.2d, v8.2d
\n\t
"
"fmla v11.2d, v10.2d, v10.2d
\n\t
"
"fmla v13.2d, v12.2d, v12.2d
\n\t
"
"fmla v15.2d, v14.2d, v14.2d
\n\t
"
"fmla v17.2d, v16.2d, v16.2d
\n\t
"
"fmla v19.2d, v18.2d, v18.2d
\n\t
"
"fmla v21.2d, v20.2d, v20.2d
\n\t
"
"fmla v23.2d, v21.2d, v21.2d
\n\t
"
"fmla v25.2d, v24.2d, v24.2d
\n\t
"
"fmla v27.2d, v26.2d, v26.2d
\n\t
"
"fmla v29.2d, v28.2d, v28.2d
\n\t
"
"fmla v31.2d, v30.2d, v30.2d
\n\t
"
//C-= Z*Z;
"fmls v0.2d, v1.2d, v1.2d
\n\t
"
"fmls v2.2d, v3.2d, v3.2d
\n\t
"
"fmls v4.2d, v5.2d, v5.2d
\n\t
"
"fmls v6.2d, v7.2d, v7.2d
\n\t
"
"fmls v8.2d, v9.2d, v9.2d
\n\t
"
"fmls v10.2d, v11.2d, v11.2d
\n\t
"
"fmls v12.2d, v13.2d, v13.2d
\n\t
"
"fmls v14.2d, v15.2d, v15.2d
\n\t
"
"fmls v16.2d, v17.2d, v17.2d
\n\t
"
"fmls v18.2d, v19.2d, v19.2d
\n\t
"
"fmls v20.2d, v21.2d, v21.2d
\n\t
"
"fmls v22.2d, v23.2d, v23.2d
\n\t
"
"fmls v24.2d, v25.2d, v25.2d
\n\t
"
"fmls v26.2d, v27.2d, v27.2d
\n\t
"
"fmls v28.2d, v29.2d, v29.2d
\n\t
"
"fmls v30.2d, v31.2d, v31.2d
\n\t
"
//Z-= C*C;
"fmls v1.2d, v0.2d, v0.2d
\n\t
"
"fmls v3.2d, v2.2d, v2.2d
\n\t
"
"fmls v5.2d, v4.2d, v4.2d
\n\t
"
"fmls v7.2d, v6.2d, v6.2d
\n\t
"
"fmls v9.2d, v8.2d, v8.2d
\n\t
"
"fmls v11.2d, v10.2d, v10.2d
\n\t
"
"fmls v13.2d, v12.2d, v12.2d
\n\t
"
"fmls v15.2d, v14.2d, v14.2d
\n\t
"
"fmls v17.2d, v16.2d, v16.2d
\n\t
"
"fmls v19.2d, v18.2d, v18.2d
\n\t
"
"fmls v21.2d, v20.2d, v20.2d
\n\t
"
"fmls v23.2d, v21.2d, v21.2d
\n\t
"
"fmls v25.2d, v24.2d, v24.2d
\n\t
"
"fmls v27.2d, v26.2d, v26.2d
\n\t
"
"fmls v29.2d, v28.2d, v28.2d
\n\t
"
"fmls v31.2d, v30.2d, v30.2d
\n\t
"
)
)
//end loop
"cmp x0, x1
\n\t
"
"b.lt label
\n\t
"
//offload
//offload to br array from multiple threads creates race condition
//we ignore it for now.
"str q1, [%1, 0]
\n\t
"
"str q3, [%1, 16]
\n\t
"
"str q5, [%1, 32]
\n\t
"
"str q7, [%1, 48]
\n\t
"
"str q9, [%1, 64]
\n\t
"
"str q11, [%1, 80]
\n\t
"
"str q13, [%1, 96]
\n\t
"
"str q15, [%1, 112]
\n\t
"
"str q17, [%1, 128]
\n\t
"
"str q19, [%1, 144]
\n\t
"
"str q21, [%1, 160]
\n\t
"
"str q23, [%1, 176]
\n\t
"
"str q25, [%1, 192]
\n\t
"
"str q27, [%1, 208]
\n\t
"
"str q29, [%1, 224]
\n\t
"
"str q31, [%1, 240]
\n\t
"
//inputs, outputs and clobbers
:
:
"r"
(
ar
)
,
"r"
(
br
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"q16"
,
"q17"
,
"q18"
,
"q19"
,
"q20"
,
"q21"
,
"q22"
,
"q23"
,
"q24"
,
"q25"
,
"q26"
,
"q27"
,
"q28"
,
"q29"
,
"q30"
,
"q31"
,
"x0"
,
"x1"
,
"memory"
);
//c = get_cycles() - c0;
t
=
omp_get_wtime
()
-
t0
;
//count flips: 4000000*100*64/1e9 = 25.6
flips
=
25
.
6
/
t
;
//freq = c/t/1e9;
rsum
=
flips
;
rmax
=
flips
;
rmin
=
flips
;
//fqmax=freq;
//fqmin=freq;
}
to
=
omp_get_wtime
()
-
to0
;
printf
(
"%d: Summary FLIPS: Min: %6.4fG, Avg: %6.4fG, Max: %6.4fG, Sum: %6.4fG GrandSum: %6.4fG
\n
"
,
i
,
rmin
,
rsum
/
size
,
rmax
,
rsum
,
25
.
6
*
size
/
to
);
}
printf
(
"Outputs:
\n
"
);
for
(
i
=
0
;
i
<
2
*
nsve
;
i
+=
4
)
printf
(
"zr: %d: %f %f %f %f
\n
"
,
i
,
br
[
i
+
0
],
br
[
i
+
1
],
br
[
i
+
2
],
br
[
i
+
3
]);
return
0
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
sign in
to comment