-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgemm.cpp
More file actions
114 lines (102 loc) · 1.94 KB
/
gemm.cpp
File metadata and controls
114 lines (102 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <immintrin.h>

#include <cstdlib>
#include <memory>
#include <random>

#include <benchmark/benchmark.h>
// Edge length of the square matrices; each one holds N * N floats.
static constexpr int N{1024};

// Operands (A, B) and result (C), stored flat and indexed as [row * N + col].
// 64-byte alignment lets the AVX kernel use aligned loads and stores.
alignas(64) float A[N * N];
alignas(64) float B[N * N];
alignas(64) float C[N * N];

// Maps a rand() result (0..RAND_MAX) onto the range 0..100.
static constexpr float scale{100.0f / static_cast<float>(RAND_MAX)};
// Benchmark setup hook: fills both input matrices with pseudo-random values.
//
// Every element of A and B is set to rand() * scale, i.e. a value in roughly
// [0, 100]. The benchmark::State argument is part of the callback signature
// but is not used.
void setup(const benchmark::State &)
{
    for (int j = 0; j < N; ++j)
    {
        for (int i = 0; i < N; ++i)
        {
            A[j * N + i] = rand() * scale;
            // This store must target B: writing A twice would leave B in its
            // zero-initialized state and make every product trivially zero.
            B[j * N + i] = rand() * scale;
        }
    }
}
// Baseline kernel: C = A * B using the textbook triple loop.
//
// For every output element the inner loop walks one row of A and one column
// of B, accumulating the dot product in ascending k order.
void mult_naive()
{
    for (int row = 0; row < N; ++row)
    {
        const int row_base = row * N;
        for (int col = 0; col < N; ++col)
        {
            float dot = 0;
            for (int k = 0; k < N; ++k)
            {
                dot += A[row_base + k] * B[k * N + col];
            }
            C[row_base + col] = dot;
        }
    }
}
// Benchmark body: runs mult_naive() once per iteration of the state loop.
void bm_naive(benchmark::State &s)
{
    for ([[maybe_unused]] auto tick : s)
        mult_naive();
}
// C = A * B, computed against a transposed copy of B.
//
// A scratch buffer holding B with rows and columns swapped is built on every
// call, so the inner dot-product loop reads both operands with unit stride.
void mult_transpose()
{
    const auto transposed = std::make_unique_for_overwrite<float[]>(N * N);

    // transposed[c][r] = B[r][c]
    for (int r = 0; r < N; ++r)
    {
        for (int c = 0; c < N; ++c)
        {
            transposed[c * N + r] = B[r * N + c];
        }
    }

    for (int row = 0; row < N; ++row)
    {
        const int a_base = row * N;
        for (int col = 0; col < N; ++col)
        {
            const int bt_base = col * N;
            float dot = 0;
            for (int k = 0; k < N; ++k)
            {
                dot += A[a_base + k] * transposed[bt_base + k];
            }
            C[a_base + col] = dot;
        }
    }
}
// Benchmark body: runs mult_transpose() once per iteration of the state loop.
void bm_transpose(benchmark::State &s)
{
    for ([[maybe_unused]] auto tick : s)
        mult_transpose();
}
// C = A * B using 256-bit AVX vectors: eight adjacent columns of one output
// row are produced per inner loop.
//
// For each k, a single element of A is broadcast to all eight lanes and
// multiplied with eight consecutive elements of row k of B; the products are
// summed into the accumulator in ascending k order.
void mult_avx()
{
    // The column loop advances eight floats at a time and uses aligned
    // loads/stores, so every row must start on a 32-byte boundary and contain
    // a whole number of 8-float vectors.
    static_assert(N % 8 == 0, "mult_avx requires N to be a multiple of 8");

    for (int j = 0; j < N; ++j)
    {
        for (int i = 0; i < N; i += 8)
        {
            __m256 acc = _mm256_setzero_ps();
            for (int k = 0; k < N; ++k)
            {
                const __m256 as = _mm256_broadcast_ss(&A[j * N + k]);
                const __m256 bs = _mm256_load_ps(&B[k * N + i]);
                const __m256 ms = _mm256_mul_ps(as, bs);
                acc = _mm256_add_ps(ms, acc);
            }
            _mm256_store_ps(&C[j * N + i], acc);
        }
    }
}
// Benchmark body: runs mult_avx() once per iteration of the state loop.
void bm_avx(benchmark::State &s)
{
    for ([[maybe_unused]] auto tick : s)
        mult_avx();
}
// Register the three kernels as benchmarks, attaching `setup` as the Setup
// callback of each one so the input matrices are populated by the same code.
BENCHMARK(bm_naive)->Setup(setup);
BENCHMARK(bm_transpose)->Setup(setup);
BENCHMARK(bm_avx)->Setup(setup);
// Program entry point is supplied by the benchmark library's macro.
BENCHMARK_MAIN();