Isharah is a large-scale dataset for Continuous Saudi Sign Language (SSL) recognition and translation. It features over 30,000 video samples signed by deaf and hearing-impaired individuals using smartphones in varied settings.
@misc{alyami2025isharah,
  author        = {Alyami, Sarah and Luqman, Hamzah and Al-Azani, Sadam and Alowaifeer, Maad and Alharbi, Yazeed and Alonaizan, Yaser},
  title         = {Isharah: A Large-Scale Multi-Scene Dataset for Continuous Sign Language Recognition},
  year          = {2025},
  eprint        = {2506.03615},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2506.03615}
}
KArSL is the largest video dataset for Word-Level Arabic Sign Language (ArSL). The database consists of 502 isolated sign words collected using Microsoft Kinect V2. Each sign of the database is performed by three professional signers. Each signer repeated each sign 50 times, which resulted in a total of 75,300 samples.
@article{sidig2021karsl,
  author    = {Sidig, Ala Addin I and Luqman, Hamzah and Mahmoud, Sabri and Mohandes, Mohamed},
  title     = {{KArSL}: {Arabic} sign language database},
  journal   = {ACM Transactions on Asian and Low-Resource Language Information Processing (TALLIP)},
  volume    = {20},
  number    = {1},
  pages     = {1--19},
  year      = {2021},
  publisher = {ACM},
  address   = {New York, NY, USA}
}
ArabSign is a continuous Arabic sign language (ArSL) dataset consisting of 9,335 samples representing 50 sentences of Arabic sign language. The dataset’s sentences were performed by 6 signers. Each sentence was repeated by each signer at least 30 times at different sessions. All signers are male with different skin colors. The ArabSign dataset was recorded using a Kinect V2 camera that provides three types of information (color, depth, and skeleton joint points) recorded simultaneously for each sentence.
@inproceedings{luqman2023arabsign,
  author    = {Luqman, Hamzah},
  title     = {{ArabSign}: A multi-modality dataset and benchmark for continuous {Arabic Sign Language} recognition},
  booktitle = {2023 IEEE 17th International Conference on Automatic Face and Gesture Recognition (FG)},
  pages     = {1--8},
  year      = {2023},
  publisher = {IEEE}
}
mArSL is an isolated Arabic sign language (ArSL) dataset consisting of 6,748 samples representing 50 signs of Arabic sign language. The dataset’s signs were performed by four signers. Each sign was repeated by each signer several times at different sessions. All signers are male with different skin colors. The main focus of this dataset was to record signs that require both manual and non-manual articulators.
@article{luqman2021towards,
  author    = {Luqman, Hamzah and El-Alfy, El-Sayed M},
  title     = {Towards hybrid multimodal manual and non-manual {Arabic} sign language recognition: {MArSL} database and pilot study},
  journal   = {Electronics},
  volume    = {10},
  number    = {14},
  pages     = {1739},
  year      = {2021},
  publisher = {MDPI}
}
Pearl is a large-scale Arabic multimodal dataset and benchmark explicitly designed for cultural understanding. Constructed through advanced agentic workflows and extensive human-in-the-loop annotations by 45 annotators from across the Arab world, Pearl comprises over 309K multimodal examples spanning ten culturally significant domains covering all Arab countries.
@inproceedings{alwajih-etal-2025-pearl,
  author    = {Alwajih, Fakhraddin
               and Magdy, Samar M.
               and El Mekki, Abdellah
               and Nacar, Omer
               and Nafea, Youssef
               and Abdelfadil, Safaa Taher
               and Yahya, Abdulfattah Mohammed
               and Luqman, Hamzah
               and Almarwani, Nada
               and Aloufi, Samah
               and Qawasmeh, Baraah
               and Atou, Houdaifa
               and Sibaee, Serry
               and Alsayadi, Hamzah A.
               and Al-Dhabyani, Walid
               and Al-shaibani, Maged S.
               and El aatar, Aya
               and Qandos, Nour
               and Alhamouri, Rahaf
               and Ahmad, Samar
               and AL-Ghrawi, Mohammed Anwar
               and Yacoub, Aminetou
               and AbuHweidi, Ruwa
               and Lemin, Vatimetou Mohamed
               and Abdel-Salam, Reem
               and Bashiti, Ahlam
               and Ammar, Adel
               and Alansari, Aisha
               and Ashraf, Ahmed
               and Alturayeif, Nora
               and Alcoba Inciarte, Alcides
               and Elmadany, AbdelRahim A.
               and Tourad, Mohamedou Cheikh
               and Berrada, Ismail
               and Jarrar, Mustafa
               and Shehata, Shady
               and Abdul-Mageed, Muhammad},
  title     = {Pearl: A Multimodal Culturally-Aware {A}rabic Instruction Dataset},
  editor    = {Christodoulopoulos, Christos
               and Chakraborty, Tanmoy
               and Rose, Carolyn
               and Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1254/},
  doi       = {10.18653/v1/2025.findings-emnlp.1254},
  pages     = {23048--23079},
  isbn      = {979-8-89176-335-7}
}
Mawqif is the first Arabic dataset for target-specific stance detection, composed of 4,121 tweets. Mawqif is a multi-label dataset where each data point is annotated for stance, sentiment, and sarcasm, which will provide a benchmark for the three tasks.
@inproceedings{alturayeif-etal-2022-mawqif,
  author    = {Alturayeif, Nora Saleh
               and Luqman, Hamzah Abdullah
               and Ahmed, Moataz Aly Kamaleldin},
  title     = {Mawqif: A Multi-label {A}rabic Dataset for Target-specific Stance Detection},
  booktitle = {Proceedings of the Seventh Arabic Natural Language Processing Workshop (WANLP)},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.wanlp-1.16},
  pages     = {174--184}
}
AraHalluEval is a manually annotated Arabic hallucination evaluation dataset designed to enable fine-grained analysis of hallucinations in LLMs. It covers two core Arabic natural language generation tasks: generative question answering (GQA) and abstractive summarization. The dataset includes model outputs annotated along factuality and faithfulness dimensions using 12 fine-grained hallucination indicators. AraHalluEval comprises 300 GQA instances and 100 summarization instances, with outputs generated by 12 LLMs, including Arabic, multilingual, and reasoning-based models.
@inproceedings{alansari-luqman-2025-arahallueval,
  author    = {Alansari, Aisha
               and Luqman, Hamzah},
  title     = {{A}ra{H}allu{E}val: A Fine-grained Hallucination Evaluation Framework for {A}rabic {LLM}s},
  booktitle = {Proceedings of The Third Arabic Natural Language Processing Conference},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.arabicnlp-main.12/},
  doi       = {10.18653/v1/2025.arabicnlp-main.12},
  pages     = {148--161},
  isbn      = {979-8-89176-352-4}
}
The WASM dataset is a newly introduced Arabic Twitter dataset designed for hashtag recommendation and classification tasks. It contains over 100,000 Arabic tweets that have been manually and automatically filtered and annotated with 87 distinct hashtags, making it suitable for developing and benchmarking models that suggest relevant hashtags based on tweet content and filter unrelated tweets.
@article{al2024wasm,
  author    = {Al-Shaibani, Maged S and Luqman, Hamzah and Al-Ghofaily, Abdulaziz S and Al-Najim, Abdullatif A},
  title     = {{WASM}: A dataset for hashtag recommendation for {Arabic} tweets},
  journal   = {Arabian Journal for Science and Engineering},
  volume    = {49},
  number    = {9},
  pages     = {12131--12145},
  year      = {2024},
  publisher = {Springer}
}
MA’AKS is a novel Arabic parallel dataset for sentiment style transfer. MA’AKS consists of 5k sentences in modern standard Arabic with positive/negative sentiments. Each sentence is meticulously annotated to ensure high-quality parallel sentiment pairs, supporting both supervised and unsupervised learning.
@article{mughaus2026ma,
  author    = {Mughaus, Raed and Abudalfa, Shadi and Luqman, Hamzah and Abdu, Fahad and AlAli, Mohammed and Al-Dowayan, Nawaf and Abdelali, Ahmed},
  title     = {Ma'aks: manually-curated parallel dataset for {Arabic} text sentiment swap},
  journal   = {Language Resources and Evaluation},
  volume    = {60},
  number    = {1},
  pages     = {1},
  year      = {2026},
  publisher = {Springer}
}
KAFD is a multi-font, multi-size, multi-style, and multi-resolution Arabic text images database. It consists of 40 Arabic fonts and is available in two forms (Page and Line). The dataset can be used for developing and evaluating Arabic OCR systems and text understanding on text images.
@article{luqman2014kafd,
  author    = {Luqman, Hamzah and Mahmoud, Sabri A and Awaida, Sameh},
  title     = {{KAFD} {Arabic} font database},
  journal   = {Pattern Recognition},
  volume    = {47},
  number    = {6},
  pages     = {2231--2240},
  year      = {2014},
  publisher = {Elsevier}
}
The database consists of 10,040 lines of Arabic handwritten text written by 623 writers using Android- and Windows-based devices. We have segmented part of the collected data into characters along with their ground truths.
@article{mahmoud2018online,
  author  = {Mahmoud, Sabri A and Luqman, Hamzah and Al-Helali, Baligh M and BinMakhashen, Galal and Parvez, Mohammad Tanvir},
  title   = {Online-khatt: an open-vocabulary database for {Arabic} online-text processing},
  journal = {The Open Cybernetics \& Systemics Journal},
  volume  = {12},
  number  = {1},
  pages   = {42--59},
  year    = {2018}
}