% Conference paper: "Maintaining Sanity" (DAC 2024), algorithm-based fault tolerance for CNNs.
% Text outside an entry is ignored by BibTeX, so these notes do not affect output.
% NOTE(review): `address` holds a country, not a publisher city -- confirm or drop.
% NOTE(review): DOI prefix 10.1145 looks like an ACM DOI while `publisher` names IEEE --
%               verify against the publisher's landing page before relying on it.
% `day`, `language`, `keywords` and `abstract` are non-standard for classic styles and are
% silently ignored by them; they are kept as metadata.
@inproceedings{d89f60c7cc8b471eafaa54df1ec7a5a8,
  author    = {Jung, Jinhyo and So, Hwisoo and Ko, Woobin and Joshi, Sumedh Shridhar and Kim, Yebon and Ko, Yohan and Shrivastava, Aviral and Lee, Kyoungwoo},
  title     = {Maintaining Sanity: Algorithm-based Comprehensive Fault Tolerance for {CNNs}},
  booktitle = {Proceedings of the 61st ACM/IEEE Design Automation Conference, DAC 2024},
  series    = {Proceedings - Design Automation Conference},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2024},
  month     = nov,
  day       = {7},
  doi       = {10.1145/3649329.3657355},
  language  = {English},
  keywords  = {ABFT, CNN, Fault Injection, Reliability, Reliable Machine Learning, Soft Errors, Transient Faults},
  abstract  = {As the deployment of neural networks in safety-critical applications proliferates, it becomes imperative that they exhibit consistent and dependable performance amidst hardware malfunctions. Several protection schemes have been proposed to protect neural networks, but they suffer from huge overheads or insufficient fault coverage. This paper presents Maintaining Sanity, a comprehensive and efficient protection technique for CNNs. Maintaining Sanity extends the state-of-the-art algorithm-based fault tolerance for CNN, utilizing hamming codes and checkpointing to correct over 99.6\% of critical faults with about 72\% runtime overhead and minimal memory overhead compared to traditional triple modular redundancy (TMR) techniques.},
  note      = {Publisher Copyright: {\textcopyright} 2024 Copyright held by the owner/author(s).; 61st ACM/IEEE Design Automation Conference, DAC 2024 ; Conference date: 23-06-2024 Through 27-06-2024},
}