Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ test/request_tracking/fenix_request_tracking_test
test/request_tracking/fenix_request_tracking_test_nofenix
build/
install/
spack-*

# Other
*~
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

cmake_minimum_required(VERSION 3.10.2)

project(Fenix C)
project(Fenix C CXX)
# The version number.
set(FENIX_VERSION_MAJOR 1)
set(FENIX_VERSION_MINOR 0)
Expand Down
1 change: 1 addition & 0 deletions examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

add_executable(fenix_hello_world fenix_hello_world.c)
target_link_libraries(fenix_hello_world fenix ${MPI_C_LIBRARIES})
set_target_properties(fenix_hello_world PROPERTIES LINKER_LANGUAGE C)

if(BUILD_TESTING)
add_test(NAME hello_world
Expand Down
1 change: 1 addition & 0 deletions examples/01_hello_world/fenix/fenix_hello_world.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include <mpi.h>
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

Expand Down
7 changes: 4 additions & 3 deletions examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
#

add_executable(fenix_ring fenix_ring.c)
target_link_libraries(fenix_ring fenix ${MPI_C_LIBRARIES} m )
target_link_libraries(fenix_ring fenix ${MPI_C_LIBRARIES})
set_target_properties(fenix_ring PROPERTIES LINKER_LANGUAGE C)

if(BUILD_TESTING)
add_test(NAME ring
add_test(NAME send_recv
COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_ring ${MPIEXEC_POSTFLAGS} 1 2)
set_tests_properties(ring PROPERTIES
set_tests_properties(send_recv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
36 changes: 22 additions & 14 deletions examples/05_subset_create/subset_create.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,12 @@
int max_iter = 2;
const int kCount = 100;
const int kKillID = 2;
const int my_group = 0;
const int my_member = 0;

int main(int argc, char **argv) {
fprintf(stderr, "Started\n");
int i;
int subset[500];
int subset[kCount];
MPI_Status status;

if (argc < 2) {
Expand All @@ -86,7 +87,6 @@ fprintf(stderr, "Started\n");
int num_ranks;
int rank;
int error;
int my_group = 0;
int my_timestamp = 0;
int my_depth = 1;
int recovered = 0;
Expand Down Expand Up @@ -120,24 +120,33 @@ fprintf(stderr, "Started\n");

if (fenix_role == FENIX_ROLE_INITIAL_RANK) {
// init my subset data
int index;
for (index = 0; index < kCount; index++) {
for (int index = 0; index < kCount; index++) {
subset[index] = -1;
}

Fenix_Data_member_create(my_group, 777, subset, kCount, MPI_INT);
Fenix_Data_member_create(my_group, my_member, subset, kCount, MPI_INT);

//Store the entire data set for the initial commit. This is not a requirement.
Fenix_Data_member_store(my_group, 777, FENIX_DATA_SUBSET_FULL);
Fenix_Data_member_store(my_group, my_member, FENIX_DATA_SUBSET_FULL);
Fenix_Data_commit_barrier(my_group, NULL);

} else {
//We've had a failure! Time to recover data.
fprintf(stderr, "Starting data recovery on node %d\n", rank);
Fenix_Data_member_restore(my_group, 777, subset, kCount, FENIX_TIME_STAMP_MAX, NULL);
fprintf(stderr, "Starting data recovery on rank %d\n", rank);

//Set all data to a value that was never stored
for (int index = 0; index < kCount; index++) {
subset[index] = -2;
}

int restore_ret = Fenix_Data_member_restore(my_group, my_member, subset, kCount, FENIX_TIME_STAMP_MAX, NULL);

if(restore_ret != FENIX_SUCCESS){
fprintf(stderr, "Rank %d restore failure w/ code %d\n", rank, restore_ret);
}

int out_flag;
Fenix_Data_member_attr_set(my_group, 777, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER,
Fenix_Data_member_attr_set(my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER,
subset, &out_flag);


Expand All @@ -159,20 +168,19 @@ fprintf(stderr, "Started\n");
//We'll store only the small subset that we specified, though.
//This means that as far as Fenix is concerned only data within that
//subset was ever changed from the initialized value of -1
Fenix_Data_member_store(my_group, 777, subset_specifier);
Fenix_Data_member_store(my_group, my_member, subset_specifier);
Fenix_Data_commit_barrier(my_group, NULL);

MPI_Barrier(new_comm); //Make sure everyone is done committing before we kill and restart everyone
//else we may end up with only some nodes having the commit, and it being unusable

}


//Kill a rank to test that we can recover from the commits we've made.
if (rank == kKillID && recovered == 0) {
fprintf(stderr, "Doing kill on node %d\n", rank);
pid_t pid = getpid();
kill(pid, SIGTERM);
kill(pid, SIGKILL);
}

//Make sure we've let rank 2 fail before proceeding, so we're definitely checking
Expand Down Expand Up @@ -214,6 +222,6 @@ fprintf(stderr, "Started\n");


Fenix_Finalize();
MPI_Finalize();
//MPI_Finalize();
return !successful; //return error status
}
21 changes: 21 additions & 0 deletions examples/07_resizeable_member/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#
# This file is part of Fenix
# Copyright (c) 2016 Rutgers University and Sandia Corporation.
# This software is distributed under the BSD License.
# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
# the U.S. Government retains certain rights in this software.
# For more information, see the LICENSE file in the top Fenix
# directory.
#

# Example program demonstrating a resizeable Fenix data member.
add_executable(resizeable resizeable.cpp)

# resizeable.cpp passes designated initializers to Fenix::init, so the
# target is built as C++20.
target_compile_features(resizeable PRIVATE cxx_std_20)
target_link_libraries(resizeable fenix ${MPI_C_LIBRARIES})

# Register the example as a CTest test: launched through the MPI launcher on
# 5 processes, with "1" passed as the program's only argument. Any output
# matching "FAILURE" marks the test as failed.
if(BUILD_TESTING)
  add_test(
    NAME resizeable
    COMMAND
      ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5
      ${MPIEXEC_PREFLAGS} resizeable ${MPIEXEC_POSTFLAGS} 1
  )
  set_tests_properties(
    resizeable
    PROPERTIES
      FAIL_REGULAR_EXPRESSION "FAILURE"
      LABELS "Example"
  )
endif()
203 changes: 203 additions & 0 deletions examples/07_resizeable_member/resizeable.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY RUTGERS UNIVERSITY and SANDIA CORPORATION
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RUTGERS
// UNIVERISY, SANDIA CORPORATION OR THE CONTRIBUTORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
// IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
// IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
// Marc Gamell (mgamell@cac.rutgers.edu)
//
// ************************************************************************
//@HEADER
*/

#include <fenix.hpp>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>
#include <vector>

//Rank (within the resilient communicator) that raises SIGTERM on itself
//after the second commit, to trigger a failure.
constexpr int kKillID = 2;
//Group and member IDs passed to every Fenix data call in this example.
constexpr int my_group = 0;
constexpr int my_member = 0;
//Timestamp and depth arguments handed to Fenix_Data_group_create.
constexpr int start_timestamp = 0;
constexpr int group_depth = 1;
//Output flag argument for Fenix_Data_group_create and
//Fenix_Data_member_attr_set; its value is never inspected in this file.
int errflag;

//NOTE(review): member_store/member_restore/member_lrestore used in main are
//presumably the ones brought in from Fenix::Data here -- confirm in fenix.hpp.
using Fenix::DataSubset;
using namespace Fenix::Data;

/*
 * Example: storing, committing and recovering a resizeable data member.
 *
 * Initial ranks create a data group and a FENIX_RESIZEABLE member backed by a
 * std::vector<int>, commit 100 elements of -1, then shrink the vector to 50
 * elements holding 1..50 and commit again. Rank kKillID then raises SIGTERM.
 * In the recovery path every rank queries the stored subset with a null
 * restore, resizes its vector to fit, and restores the data with
 * member_lrestore. Finally each rank verifies that it holds 50 elements with
 * data[i] == i+1.
 *
 * Returns 0 when verification succeeds and 1 otherwise.
 */
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm res_comm;
  Fenix::init({.out_comm = &res_comm, .spares = 1});

  int num_ranks, rank;
  MPI_Comm_size(res_comm, &num_ranks);
  MPI_Comm_rank(res_comm, &rank);

  std::vector<int> data;

  //A rank that starts out in the recovered role throws once so that it goes
  //through the same recovery handler below as the other ranks.
  //NOTE(review): presumably Fenix::throw_exception() raises a
  //Fenix::CommException -- confirm in fenix.hpp.
  bool should_throw = Fenix_get_role() == FENIX_ROLE_RECOVERED_RANK;
  while(true) try {
    if(should_throw){
      should_throw = false;
      Fenix::throw_exception();
    }

    //Initial work and commits
    if(Fenix_get_role() == FENIX_ROLE_INITIAL_RANK){
      Fenix_Data_group_create(
        my_group, res_comm, start_timestamp, group_depth, FENIX_DATA_POLICY_IMR,
        NULL, &errflag
      );
      Fenix_Data_member_create(
        my_group, my_member, data.data(), FENIX_RESIZEABLE, MPI_INT
      );

      data.resize(100);
      for(int& i : data) i = -1;


      //Store the whole array first. We need to keep our buffer pointer updated
      //since resizing an array can change it
      Fenix_Data_member_attr_set(
        my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, data.data(),
        &errflag
      );
      member_store(my_group, my_member, {{0, data.size()-1}});
      Fenix_Data_commit_barrier(my_group, NULL);


      //Now commit a smaller portion with different data.
      data.resize(50);
      int val = 1;
      for(int& i : data) i = val++;

      Fenix_Data_member_attr_set(
        my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, data.data(),
        &errflag
      );
      member_store(my_group, my_member, {{0, data.size()-1}});
      Fenix_Data_commit_barrier(my_group, NULL);


      if(rank == kKillID){
        fprintf(stderr, "Doing kill on node %d\n", rank);
        raise(SIGTERM);
      }
    }

    Fenix_Finalize();


    break;
  } catch (const Fenix::CommException& e) {
    //Keep a copy of the error code rather than a pointer to the exception.
    //An exception caught by the nested handler below is destroyed as soon as
    //that handler exits, so a pointer to it would dangle on the retry.
    auto fenix_err = e.fenix_err;
    while(true) try {
      //We've had a failure! Time to recover data.
      fprintf(stderr, "Starting data recovery on rank %d\n", rank);
      if(fenix_err != FENIX_SUCCESS){
        fprintf(stderr, "FAILURE on Fenix Init (%d). Exiting.\n", fenix_err);
        exit(1);
      }

      Fenix_Data_group_create(
        my_group, res_comm, start_timestamp, group_depth, FENIX_DATA_POLICY_IMR,
        NULL, &errflag
      );

      //Do a null restore to get information about the stored subset
      DataSubset stored_subset;
      int ret = member_restore(
        my_group, my_member, nullptr, 0, FENIX_TIME_STAMP_MAX, stored_subset
      );
      if(ret != FENIX_SUCCESS) {
        fprintf(stderr, "Rank %d restore failure w/ code %d\n", rank, ret);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }

      //Resize data to fit all stored data
      data.resize(stored_subset.max_count());

      //Set all data to a value that was never stored, just for testing
      for(int& i : data) i = -2;

      //Now do an lrestore to get the recovered data.
      ret = member_lrestore(
        my_group, my_member, data.data(), data.size(), FENIX_TIME_STAMP_MAX,
        stored_subset
      );
      //Report a failed lrestore instead of silently dropping the code; the
      //verification below still decides the final result.
      if(ret != FENIX_SUCCESS) {
        fprintf(stderr, "Rank %d lrestore failure w/ code %d\n", rank, ret);
      }

      break;
    } catch (const Fenix::CommException& nested){
      //Another failure interrupted recovery: remember its code and retry.
      fenix_err = nested.fenix_err;
    }
  }

  //Ensure data is correct after execution and recovery
  bool successful = data.size() == 50;
  //data.size() is a size_t, so it must be printed with %zu
  if(!successful) printf("Rank %d expected data size 50, but got %zu\n", rank, data.size());

  const int count = static_cast<int>(data.size());
  for(int i = 0; i < count && successful; i++){
    successful &= data[i] == i+1;
    if(!successful) printf("Rank %d data[%d]=%d, but should be %d!\n", rank, i, data[i], i+1);
  }

  if(successful){
    printf("Rank %d successfully recovered\n", rank);
  } else {
    printf("FAILURE on rank %d\n", rank);
  }

  MPI_Finalize();
  return !successful; //return error status
}
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ add_subdirectory(03_reduce/fenix)
add_subdirectory(04_Isend_Irecv/fenix)
add_subdirectory(05_subset_create)
add_subdirectory(06_subset_createv)
add_subdirectory(07_resizeable_member)
Loading
Loading