% Bibliography database: four publications on Darija / Arabic-dialect NLP and
% eye-fixation modelling. Text outside entries is ignored by BibTeX, so review
% notes are kept here rather than inside field values.

% Belbachir et al. 2025 -- semantic/lexical relation extraction for Darija.
% NOTE(review): the journal field only repeats the paper title (export artefact);
% the IEEE copyright line and the 10.1109/CiSt... DOI prefix suggest this is a
% conference paper -- confirm the venue and, if so, convert the entry to
% inproceedings with a proper booktitle.
% NOTE(review): the DOI looks truncated (no per-paper suffix) -- confirm.
@article{BELBACHIR_2025_ARTICLE_BKCEMN_563028,
  author   = {Belbachir, S. and Khlif, N. and Chahhou, M. and El Mohajir, M. and Mazroui, A. and Nahli, O.},
  title    = {A Proposed Approach for Extracting Semantic and Lexical Relations for Low-Resource Languages: A Case Study of {Darija}},
  journal  = {A Proposed Approach for Extracting Semantic and Lexical Relations for Low-Resource Languages: A Case Study of Darija},
  year     = {2025},
  pages    = {153--160},
  doi      = {10.1109/CiSt65886},
  keywords = {Darija, NLP, NLTK, Ontology, semantic relation, sumo, Wordnet},
  abstract = {Extracting semantic relations between words is crucial for the development and enrichment of lexical resources, especially for under-resourced languages like Moroccan Darija. This paper presents an automated methodology for identifying synonyms, antonyms, hypernyms, and hyponyms by leveraging bilingual Darija-English resources, Princeton WordNet (PWN), the Suggested Upper Merged Ontology (SUMO), and the NLTK toolkit. Experimental evaluation was conducted on a dataset of 361 Darija nouns, selected as a preliminary testbed to validate the methodology before scaling it to the full lexicon. The results show that 83.10\% were successfully aligned with PWN synsets, resulting in the extraction of 14,201 semantic relations, of which 5,475 (38.55\%) were validated through back-translation. These findings confirm the potential of transferring semantic knowledge from English into Darija, despite cultural and lexical mismatches. The proposed pipeline substantially enriches Darija's lexical coverage and offers a scalable and replicable approach for developing semantic resources in other low-resource dialects. © 2025 IEEE},
}

% Khlif, Mazroui, Nahli 2025 -- the DiMorph morphological analyzer for Darija.
% NOTE(review): the journal field only repeats the paper title (export artefact);
% the copyright line names Societa Editrice Il Mulino -- look up the real
% journal name and replace it.
% The original volume value "24 (2)" is split into volume 24, issue number 2.
@article{KHLIF_2025_ARTICLE_KMN_563230,
  author   = {Khlif, N. and Mazroui, A. and Nahli, O.},
  title    = {Enhancing {Arabic} Dialect Analysis: Introducing {DiMorph} for {Darija}},
  journal  = {ENHANCING ARABIC DIALECT ANALYSIS: INTRODUCING DIMORPH FOR DARIJA},
  year     = {2025},
  volume   = {24},
  number   = {2},
  pages    = {363--390},
  doi      = {10.1418/116951},
  keywords = {DiMorph, Moroccan dialect, morphological analyzer},
  abstract = {While Modern Standard Arabic (MSA) is well-studied, dialectal Arabic texts, such as Moroccan dialect (Darija), pose unique challenges due to their informal structure and lack of a standardized grammar. In this paper, we provide an in-depth study of Darija detailing its morphological and syntactic features, and we introduce DiMorph (Dialectal Morphological Analyzer), a specialized morphological engine, which is designed to address these complexities automatically. In detail, we focus on DiMorph’s multi-phase approach, involving both pre- and post-processing phases. Such approach effectively manages dialectal variability and achieves high accuracy in token recognition and analysis, particularly in social media contexts. Finally, we underscore the importance of developing tools that respect the linguistic diversity of Arabic dialects, laying a strong foundation for advanced computational research in Arabic dialectology. © 2025 Societa Editrice Il Mulino. All rights reserved},
}

% Lento et al. 2024 -- CLiC-it 2024 paper on predicting eye-fixation patterns.
% The original volume value "VOL-3878" is normalised to the bare number; the
% series name follows from the ceur-ws.org URL of the same entry.
% NOTE(review): pages = 10 looks like a page count rather than a page range --
% confirm and replace with the real range (or move to pagetotal under biblatex).
% NOTE(review): conference_place is the publisher city in the export -- verify
% the actual conference venue.
@inproceedings{LENTO_2024_INPROCEEDINGS_LNKPMF_519724,
  author           = {Lento, A. and Nadalini, A. and Khlif, N. and Pirrelli, V. and Marzi, C. and Ferro, M.},
  title            = {Comparative Evaluation of Computational Models Predicting Eye Fixation Patterns During Reading: Insights from Transformers and Simpler Architectures},
  booktitle        = {Proceedings of the Tenth {Italian} Conference on Computational Linguistics ({CLiC-it} 2024)},
  year             = {2024},
  series           = {CEUR Workshop Proceedings},
  volume           = {3878},
  pages            = {10},
  publisher        = {CEUR},
  address          = {Aachen, Germany},
  isbn             = {979-12-210-7060-6},
  url              = {https://ceur-ws.org/Vol-3878/},
  conference_name  = {Italian Conference on Computational Linguistics (CLiC-it)},
  conference_place = {Aachen},
  keywords         = {eye-tracking, eye fixation time prediction, neural network, contextual word embeddings, lexical features},
  abstract         = {Eye tracking records of natural text reading are known to provide significant insights into the cognitive processes underlying word processing and text comprehension, with gaze patterns, such as fixation duration and saccadic movements, being modulated by morphological, lexical, and higher-level structural properties of the text being read. Although some of these effects have been simulated with computational models, it is still not clear how accurately computational modelling can predict complex fixation patterns in connected text reading. State-of-the-art neural architectures have shown promising results, with pre-trained transformer-based classifiers having recently been claimed to outperform other competitors, achieving beyond 95\% accuracy. However, transformer-based models have neither been compared with alternative architectures nor adequately evaluated for their sensitivity to the linguistic factors affecting human reading. Here we address these issues by evaluating the performance of a pool of neural networks in classifying eye-fixation English data as a function of both lexical and contextual factors. We show that i) accuracy of transformer-based models has largely been overestimated, ii) other simpler models make comparable or even better predictions, iii) most models are sensitive to some of the major lexical factors accounting for at least 50\% of human fixation variance, iv) most models fail to capture some significant context-sensitive interactions, such as those accounting for spillover effects in reading. The work shows the benefits of combining accuracy-based evaluation metrics with non-linear regression modelling of fixed and random effects on both real and simulated eye-tracking data},
}

% Nahli et al. 2023 -- CiSt 2023 paper on Moroccan and Tunisian dialect corpora.
% The abstract originally contained a malformed emphasis macro written with
% parentheses; it is restored to a braced argument.
% The url value is kept exactly as exported.
@inproceedings{NAHLI_2023_INPROCEEDINGS_NGKB_481366,
  author           = {Nahli, O. and Gugliotta, E. and Khlif, N. and Benotto, G.},
  title            = {Challenges and Progress in Constructing {Arabic} Dialect Corpora and Linguistic Tools: A Focus on {Moroccan} and {Tunisian} Dialects},
  booktitle        = {2023 7th {IEEE} Congress on Information Science and Technology ({CiSt})},
  year             = {2023},
  pages            = {293--298},
  publisher        = {IEEE},
  address          = {USA},
  isbn             = {978-1-6654-6133-7},
  doi              = {10.1109/cist56084.2023.10410009},
  url              = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=10410009},
  conference_name  = {7th IEEE Congress on Information Science and Technology (CiSt)},
  conference_place = {USA},
  keywords         = {Arabic dialects, Moroccan dialect, Tunisian dialect, corpora, lexical resources, Aramorph},
  abstract         = {Given the lack of resources for Arabic dialects, the construction of corpora, lexical resources, and tools is a non-trivial challenge. The focus of the article is to describe our in-progress work to address these deficiencies. We start with Moroccan and Tunisian dialects to provide annotated corpora and corpus-based lexical resources. We also aim to extend an existing morphological engine with linguistic resources built \emph{ad hoc} for each dialect. In addition, we develop an integrated component in the morphological engine to better address linguistic and sociolinguistic characteristics while preserving the integrity of dialectal texts},
}