% Conference paper in the EuroMPI 2012 proceedings (Lecture Notes in Computer Science).
% The DOI is stored bare (no LaTeX escapes); acronyms and proper nouns in the
% title are brace-protected so sentence-casing styles keep their capitalisation.
@inproceedings{64d88589c13e4b09ad95273cdaec49a5,
  author    = {Denis, Alexandre and Trahay, Francois and Ishikawa, Yutaka},
  title     = {High Performance Checksum Computation for Fault-Tolerant {MPI} over {InfiniBand}},
  booktitle = {Recent Advances in the Message Passing Interface - 19th European {MPI} Users' Group Meeting, {EuroMPI} 2012, Proceedings},
  series    = {Lecture Notes in Computer Science},
  pages     = {183--192},
  year      = {2012},
  month     = oct,
  day       = {24},
  doi       = {10.1007/978-3-642-33518-1_23},
  isbn      = {9783642335174},
  language  = {English},
  keywords  = {Checksum, Fault-Tolerance, High-performance networks, InfiniBand},
  abstract  = {With the increase of the number of nodes in clusters, the probability of failures and unusual events increases. In this paper, we present checksum mechanisms to detect data corruption. We study the impact of checksums on network communication performance and we propose a mechanism to amortize their cost on InfiniBand. We have implemented our mechanisms in the NewMadeleine communication library. Our evaluation shows that our mechanisms to ensure message integrity do not impact noticeably the application performance, which is an improvement over the state of the art MPI implementations.},
  note      = {19th European {MPI} Users' Group Meeting on Recent Advances in the Message Passing Interface, {EuroMPI} 2012 ; Conference date: 23-09-2012 Through 26-09-2012},
}