% EuroMPI 2018 conference paper on transparent checkpoint/restart in an MPI runtime.
% Cleaned from a research-portal export: author given names are no longer wrapped in
% escaped braces, all names use "Last, First" form, and acronyms are brace-protected
% so sentence-casing styles keep their capitals. The citation key is unchanged.
@inproceedings{703653c2f6764c38a573e1a47e137235,
  author    = {Adam, Julien and Shende, Sameer and Besnard, Jean Baptiste and P{\'e}rache, Marc and Jaeger, Julien and Malony, Allen D. and Carribault, Patrick},
  title     = {Transparent High-Speed Network Checkpoint/Restart in {MPI}},
  booktitle = {{EuroMPI} 2018 - Proceedings of the 25th European {MPI} Users' Group Meeting},
  series    = {{ACM} International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  year      = {2018},
  month     = sep,
  day       = {23},
  doi       = {10.1145/3236367.3236383},
  language  = {English},
  keywords  = {Checkpoint-Restart, DMTCP, Fault-Tolerance, Infiniband},
  abstract  = {Fault-tolerance has always been an important topic when it comes to running massively parallel programs at scale. Statistically, hardware and software failures are expected to occur more often on systems gathering millions of computing units. Moreover, the larger jobs are, the more computing hours would be wasted by a crash. In this paper, we describe the work done in our MPI runtime to enable transparent checkpointing mechanism. Unlike the MPI 4.0 User-Level Failure Mitigation (ULFM) interface, our work targets solely Checkpoint/Restart (C/R) and ignores wider features such as resiliency. We show how existing transparent checkpointing methods can be practically applied to MPI implementations given a sufficient collaboration from the MPI runtime. Our C/R technique is then measured on MPI benchmarks such as IMB and Lulesh relying on Infiniband high-speed network, demonstrating that the chosen approach is sufficiently general and that performance is mostly preserved. We argue that enabling fault-tolerance without any modification inside target MPI applications is possible, and show how it could be the first step for more integrated resiliency combined with failure mitigation like ULFM.},
  note      = {Publisher Copyright: {\textcopyright} 2018 ACM. 25th European MPI Users' Group Meeting, EuroMPI 2018. Conference date: 23--26 September 2018.},
}