Skip to content

Commit c79b069

Browse files
Add MPI+OpenMP implementation.
1 parent 0588525 commit c79b069

File tree

6 files changed

+239
-0
lines changed

6 files changed

+239
-0
lines changed

Diff for: build_all.sh

+5
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ if [[ $TASKBENCH_USE_MPI -eq 1 ]]; then
3131
make -C mpi all -j$THREADS
3232
fi
3333

34+
# Build the MPI+OpenMP implementation when USE_MPI_OPENMP is 1. The target
# directory is cleaned first, so the binary is always rebuilt from scratch.
if [[ $USE_MPI_OPENMP -eq 1 ]]; then
35+
make -C mpi_openmp clean
36+
make -C mpi_openmp all -j$THREADS
37+
fi
38+
3439
if [[ $USE_GASNET -eq 1 ]]; then
3540
make -C "$GASNET_DIR"
3641
fi

Diff for: get_deps.sh

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ fi
3838

3939
cat >>deps/env.sh <<EOF
4040
export TASKBENCH_USE_MPI=${TASKBENCH_USE_MPI:-$DEFAULT_FEATURES}
41+
export USE_MPI_OPENMP=${USE_MPI_OPENMP:-$DEFAULT_FEATURES}
4142
export USE_GASNET=${USE_GASNET:-0}
4243
export TASKBENCH_USE_HWLOC=${TASKBENCH_USE_HWLOC:-$DEFAULT_FEATURES}
4344
export USE_LEGION=${USE_LEGION:-$DEFAULT_FEATURES}

Diff for: mpi_openmp/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/forall

Diff for: mpi_openmp/Makefile

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Builds the MPI+OpenMP Task Bench binary listed in BIN from its .cc source.
#
# MPI C++ compiler wrapper; `?=` lets the environment or command line override it.
MPICXX ?= mpicxx
2+
3+
# DEBUG=0 (the default) selects the optimized flags below; any other value
# selects the unoptimized build with debug symbols.
DEBUG ?= 0
4+
5+
# User-supplied CXXFLAGS are kept; OpenMP, C++11 and the core include path are
# appended to them.
CXXFLAGS ?=
6+
CXXFLAGS += -fopenmp -std=c++11 -I../core
7+
8+
# Link against libcore_s from ../core (presumably the static core library built
# there -- confirm against ../core's Makefile).
LDFLAGS ?=
9+
LDFLAGS += -L../core -lcore_s
10+
11+
# $(strip ...) tolerates stray whitespace around the DEBUG value.
ifeq ($(strip $(DEBUG)),0)
12+
CXXFLAGS += -O3
13+
else
14+
CXXFLAGS += -O0 -ggdb
15+
endif
16+
17+
# NOTE(review): presumably adds BLAS-related compile/link flags; the file is not
# visible here -- confirm.
include ../core/make_blas.mk
18+
19+
# Binaries produced by this Makefile (also listed in mpi_openmp/.gitignore).
BIN := forall
20+
21+
.PHONY: all
22+
all: $(BIN)
23+
24+
# Static pattern rule: each binary is compiled and linked in one step from the
# source file of the same name. CXXFLAGS (including -fopenmp) are therefore
# also in effect at link time.
$(BIN): %:%.cc
25+
$(MPICXX) -o $@ $(CXXFLAGS) $< $(LDFLAGS)
26+
27+
.PHONY: clean
28+
clean:
29+
rm -f *.o $(BIN)

Diff for: mpi_openmp/forall.cc

+192
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
/* Copyright 2019 Stanford University
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
#include <cassert>
17+
#include <cstdio>
18+
#include <cstdlib>
19+
20+
#include "core.h"
21+
22+
#include "mpi.h"
23+
24+
int main(int argc, char *argv[])
25+
{
26+
MPI_Init(&argc, &argv);
27+
int n_ranks, rank;
28+
MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
29+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
30+
31+
App app(argc, argv);
32+
if (rank == 0) app.display();
33+
34+
double elapsed_time = 0.0;
35+
for (int iter = 0; iter < 2; ++iter) {
36+
MPI_Barrier(MPI_COMM_WORLD);
37+
38+
double start_time = MPI_Wtime();
39+
40+
std::vector<MPI_Request> requests;
41+
42+
for (auto graph : app.graphs) {
43+
size_t scratch_bytes = graph.scratch_bytes_per_task;
44+
char *scratch_ptr = (char *)malloc(scratch_bytes);
45+
46+
long first_point = rank * graph.max_width / n_ranks;
47+
long last_point = (rank + 1) * graph.max_width / n_ranks - 1;
48+
long n_points = last_point - first_point + 1;
49+
50+
std::vector<int> rank_by_point(graph.max_width);
51+
std::vector<int> tag_bits_by_point(graph.max_width);
52+
for (int r = 0; r < n_ranks; ++r) {
53+
long r_first_point = r * graph.max_width / n_ranks;
54+
long r_last_point = (r + 1) * graph.max_width / n_ranks - 1;
55+
for (long p = r_first_point; p <= r_last_point; ++p) {
56+
rank_by_point[p] = r;
57+
tag_bits_by_point[p] = p - r_first_point;
58+
// Has to fit in 7 bits because MPI only guarrantees that
59+
// tags can use 15 bits.
60+
assert((tag_bits_by_point[p] & ~0x7F) == 0);
61+
}
62+
}
63+
64+
long max_deps = 0;
65+
for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) {
66+
for (long point = first_point; point <= last_point; ++point) {
67+
long deps = 0;
68+
for (auto interval : graph.dependencies(dset, point)) {
69+
deps += interval.second - interval.first + 1;
70+
}
71+
max_deps = std::max(max_deps, deps);
72+
}
73+
}
74+
75+
std::vector<std::vector<std::vector<char> > > inputs(n_points);
76+
std::vector<std::vector<const char *> > input_ptr(n_points);
77+
std::vector<std::vector<size_t> > input_bytes(n_points);
78+
std::vector<long> n_inputs(n_points);
79+
std::vector<std::vector<char> > outputs(n_points);
80+
for (long point = first_point; point <= last_point; ++point) {
81+
long point_index = point - first_point;
82+
83+
auto &point_inputs = inputs[point_index];
84+
auto &point_input_ptr = input_ptr[point_index];
85+
auto &point_input_bytes = input_bytes[point_index];
86+
87+
point_inputs.resize(max_deps);
88+
point_input_ptr.resize(max_deps);
89+
point_input_bytes.resize(max_deps);
90+
91+
for (long dep = 0; dep < max_deps; ++dep) {
92+
point_inputs[dep].resize(graph.output_bytes_per_task);
93+
point_input_ptr[dep] = point_inputs[dep].data();
94+
point_input_bytes[dep] = point_inputs[dep].size();
95+
}
96+
97+
auto &point_outputs = outputs[point_index];
98+
point_outputs.resize(graph.output_bytes_per_task);
99+
}
100+
101+
for (long timestep = 0; timestep < graph.timesteps; ++timestep) {
102+
long offset = graph.offset_at_timestep(timestep);
103+
long width = graph.width_at_timestep(timestep);
104+
105+
long last_offset = graph.offset_at_timestep(timestep-1);
106+
long last_width = graph.width_at_timestep(timestep-1);
107+
108+
long dset = graph.dependence_set_at_timestep(timestep);
109+
110+
requests.clear();
111+
112+
for (long point = first_point; point <= last_point; ++point) {
113+
long point_index = point - first_point;
114+
115+
auto &point_inputs = inputs[point_index];
116+
auto &point_n_inputs = n_inputs[point_index];
117+
auto &point_output = outputs[point_index];
118+
119+
/* Receive */
120+
point_n_inputs = 0;
121+
if (point >= offset && point < offset + width) {
122+
for (auto interval : graph.dependencies(dset, point)) {
123+
for (long dep = interval.first; dep <= interval.second; ++dep) {
124+
if (dep < last_offset || dep >= last_offset + last_width) {
125+
continue;
126+
}
127+
128+
int from = tag_bits_by_point[dep];
129+
int to = tag_bits_by_point[point];
130+
int tag = (from << 8) | to;
131+
MPI_Request req;
132+
MPI_Irecv(point_inputs[point_n_inputs].data(),
133+
point_inputs[point_n_inputs].size(), MPI_BYTE,
134+
rank_by_point[dep], tag, MPI_COMM_WORLD, &req);
135+
requests.push_back(req);
136+
point_n_inputs++;
137+
}
138+
}
139+
}
140+
141+
/* Send */
142+
if (point >= last_offset && point < last_offset + last_width) {
143+
for (auto interval : graph.reverse_dependencies(dset, point)) {
144+
for (long dep = interval.first; dep <= interval.second; dep++) {
145+
if (dep < offset || dep >= offset + width) {
146+
continue;
147+
}
148+
149+
int from = tag_bits_by_point[point];
150+
int to = tag_bits_by_point[dep];
151+
int tag = (from << 8) | to;
152+
MPI_Request req;
153+
MPI_Isend(point_output.data(), point_output.size(), MPI_BYTE,
154+
rank_by_point[dep], tag, MPI_COMM_WORLD, &req);
155+
requests.push_back(req);
156+
}
157+
}
158+
}
159+
}
160+
161+
MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
162+
163+
#pragma omp parallel for schedule(dynamic, 1)
164+
for (long point = std::max(first_point, offset); point <= std::min(last_point, offset + width - 1); ++point) {
165+
long point_index = point - first_point;
166+
167+
auto &point_input_ptr = input_ptr[point_index];
168+
auto &point_input_bytes = input_bytes[point_index];
169+
auto &point_n_inputs = n_inputs[point_index];
170+
auto &point_output = outputs[point_index];
171+
172+
graph.execute_point(timestep, point,
173+
point_output.data(), point_output.size(),
174+
point_input_ptr.data(), point_input_bytes.data(), point_n_inputs,
175+
scratch_ptr, scratch_bytes);
176+
}
177+
}
178+
free(scratch_ptr);
179+
}
180+
181+
MPI_Barrier(MPI_COMM_WORLD);
182+
183+
double stop_time = MPI_Wtime();
184+
elapsed_time = stop_time - start_time;
185+
}
186+
187+
if (rank == 0) {
188+
app.report_timing(elapsed_time);
189+
}
190+
191+
MPI_Finalize();
192+
}

Diff for: test_all.sh

+11
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,17 @@ if [[ $TASKBENCH_USE_MPI -eq 1 ]]; then
6060
done
6161
fi
6262

63+
# Run the MPI+OpenMP binary for every graph type and kernel on 1, 2 and 4
# ranks. The last command repeats the graph arguments after `-and`
# (presumably to exercise two graphs in a single run -- confirm).
if [[ $USE_MPI_OPENMP -eq 1 ]]; then
64+
for t in "${extended_types[@]}"; do
65+
for k in "${kernels[@]}"; do
66+
mpirun -np 1 ./mpi_openmp/forall -steps $steps -type $t $k
67+
mpirun -np 2 ./mpi_openmp/forall -steps $steps -type $t $k
68+
mpirun -np 4 ./mpi_openmp/forall -steps $steps -type $t $k
69+
mpirun -np 4 ./mpi_openmp/forall -steps $steps -type $t $k -and -steps $steps -type $t $k
70+
done
71+
done
72+
fi
73+
6374
if [[ $USE_LEGION -eq 1 ]]; then
6475
for t in "${extended_types[@]}"; do
6576
for k in "${kernels[@]}"; do

0 commit comments

Comments
 (0)