% Conference paper (ICFHR 2020) by Monnier and Aubry.
% The system name in the title is brace-protected so that sentence-casing
% bibliography styles do not lowercase its inner capital; acronyms in the
% venue fields are protected for the same reason.
@inproceedings{4c14a997e93b44aa97d8a021c76f8776,
  author    = {Monnier, Tom and Aubry, Mathieu},
  title     = {{DocExtractor}: An off-the-shelf historical document element extraction},
  booktitle = {Proceedings - 2020 17th International Conference on Frontiers in Handwriting Recognition, {ICFHR} 2020},
  series    = {Proceedings of International Conference on Frontiers in Handwriting Recognition, {ICFHR}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {91--96},
  year      = {2020},
  month     = sep,
  day       = {1},
  doi       = {10.1109/ICFHR2020.2020.00027},
  language  = {English},
  keywords  = {deep learning, document layout analysis, historical document, page segmentation, synthetic data, text line detection},
  abstract  = {We present docExtractor, a generic approach for extracting visual elements such as text lines or illustrations from historical documents without requiring any real data annotation. We demonstrate it provides high-quality performances as an off-the-shelf system across a wide variety of datasets and leads to results on par with state-of-the-art when fine-tuned. We argue that the performance obtained without fine-tuning on a specific dataset is critical for applications, in particular in digital humanities, and that the line-level page segmentation we address is the most relevant for a general purpose element extraction engine. We rely on a fast generator of rich synthetic documents and design a fully convolutional network, which we show to generalize better than a detection-based approach. Furthermore, we introduce a new public dataset dubbed IlluHisDoc dedicated to the fine evaluation of illustration segmentation in historical documents.},
  note      = {Publisher Copyright: {\textcopyright} 2020 IEEE.; 17th International Conference on Frontiers in Handwriting Recognition, ICFHR 2020 ; Conference date: 07-09-2020 Through 10-09-2020},
}