@INPROCEEDINGS{MORONI_2025_INPROCEEDINGS_MPHBBMDEN_552066, AUTHOR = {Moroni, L. and Puccetti, G. and Huguet Cabot, P. L. and Bejgu, A. S. and Barba, E. and Miaschi, A. and Dell'Orletta, F. and Esuli, A. and Navigli, R.}, TITLE = {Optimizing LLMs for Italian: reducing token fertility and enhancing efficiency through vocabulary adaptation}, YEAR = {2025}, ABSTRACT = {The number of pretrained Large Language Models (LLMs) is increasing steadily, though the majority are designed predominantly for the English language. While state-of-the-art LLMs can handle other languages, due to language contamination or some degree of multilingual pretraining data, they are not optimized for non-English languages, leading to inefficient encoding (high token ``fertility'') and slower inference speed. In this work, we thoroughly compare a variety of vocabulary adaptation techniques for optimizing English LLMs for the Italian language, and put forward Semantic Alignment Vocabulary Adaptation (SAVA), a novel method that leverages neural mapping for vocabulary substitution. SAVA achieves competitive performance across multiple downstream tasks, enhancing grounded alignment strategies. We adapt two LLMs: Mistral-7B-v0.1, reducing token fertility by 25\%, and Llama-3.1-8B, optimizing the vocabulary and reducing the number of parameters by 1 billion. We show that, following the adaptation of the vocabulary, these models can recover their performance with a relatively limited stage of continual training on the target language. 
Finally, we test the capabilities of the adapted models on various multi-choice and generative tasks}, KEYWORDS = {Large Language Models, Italian LLM, Vocabulary Adaptation}, PAGES = {6646--6660}, URL = {https://aclanthology.org/2025.findings-naacl.371/}, DOI = {10.18653/v1/2025.findings-naacl.371}, PUBLISHER = {Association for Computational Linguistics}, ISBN = {979-8-89176-195-7}, CONFERENCE_NAME = {NAACL 2025-Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics. Findings}, BOOKTITLE = {NAACL 2025 Findings proceedings}, } @INPROCEEDINGS{PEDROTTI_2025_INPROCEEDINGS_PPCMPDE_554367, AUTHOR = {Pedrotti, A. and Papucci, M. and Ciaccio, C. and Miaschi, A. and Puccetti, G. and Dell'Orletta, F. and Esuli, A.}, TITLE = {Stress-testing machine generated text detection: shifting language models writing style to fool detectors}, YEAR = {2025}, ABSTRACT = {Recent advancements in Generative AI and Large Language Models (LLMs) have enabled the creation of highly realistic synthetic content, raising concerns about the potential for malicious use, such as misinformation and manipulation. Moreover, detecting Machine-Generated Text (MGT) remains challenging due to the lack of robust benchmarks that assess generalization to real-world scenarios. In this work, we evaluate the resilience of state-of-the-art MGT detectors (e.g., Mage, Radar, LLM-DetectAIve) to linguistically informed adversarial attacks. We develop a pipeline that fine-tunes language models using Direct Preference Optimization (DPO) to shift the MGT style toward human-written text (HWT), obtaining generations more challenging to detect by current models. Additionally, we analyze the linguistic shifts induced by the alignment and how detectors rely on “linguistic shortcuts” to detect texts. Our results show that detectors can be easily fooled with relatively few examples, resulting in a significant drop in detecting performances. 
This highlights the importance of improving detection methods and making them robust to unseen in-domain texts. We release code, models, and data to support future research on more robust MGT detection benchmarks}, KEYWORDS = {machine-generated text detection, synthetic content detection}, PAGES = {3010--3031}, URL = {https://aclanthology.org/2025.findings-acl.156/}, DOI = {10.18653/v1/2025.findings-acl.156}, PUBLISHER = {Association for Computational Linguistics}, ISBN = {979-8-89176-256-5}, CONFERENCE_NAME = {ACL 2025-Annual Meeting of the Association for Computational Linguistics. Findings}, BOOKTITLE = {ACL 2025 Findings proceedings}, } @INPROCEEDINGS{MIASCHI_2024_INPROCEEDINGS_MDV_518427, AUTHOR = {Miaschi, A. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Evaluating Large Language Models via Linguistic Profiling}, YEAR = {2024}, ABSTRACT = {Large Language Models (LLMs) undergo extensive evaluation against various benchmarks collected in established leaderboards to assess their performance across multiple tasks. However, to the best of our knowledge, there is a lack of comprehensive studies evaluating these models’ linguistic abilities independent of specific tasks. In this paper, we introduce a novel evaluation methodology designed to test LLMs’ sentence generation abilities under specific linguistic constraints. Drawing on the ‘linguistic profiling’ approach, we rigorously investigate the extent to which five LLMs of varying sizes, tested in both zero- and few-shot scenarios, effectively adhere to (morpho)syntactic constraints. 
Our findings shed light on the linguistic proficiency of LLMs, revealing both their capabilities and limitations in generating linguistically-constrained sentences}, KEYWORDS = {Large Language Models, Controllable Text Generation, Linguistic Profiling}, PAGES = {2835--2848}, URL = {https://aclanthology.org/2024.emnlp-main.166}, DOI = {10.18653/v1/2024.emnlp-main.166}, PUBLISHER = {Association for Computational Linguistics (USA)}, ISBN = {979-8-89176-164-3}, CONFERENCE_NAME = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, CONFERENCE_PLACE = {USA}, BOOKTITLE = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing}, } @INPROCEEDINGS{MIASCHI_2024_INPROCEEDINGS_MDV_487005, AUTHOR = {Miaschi, A. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Linguistic Knowledge Can Enhance Encoder-Decoder Models (If You Let It)}, YEAR = {2024}, ABSTRACT = {In this paper, we explore the impact of augmenting pre-trained Encoder-Decoder models, specifically T5, with linguistic knowledge for the prediction of a target task. In particular, we investigate whether fine-tuning a T5 model on an intermediate task that predicts structural linguistic properties of sentences modifies its performance in the target task of predicting sentence-level complexity. Our study encompasses diverse experiments conducted on Italian and English datasets, employing both monolingual and multilingual T5 models at various sizes. 
Results obtained for both languages and in cross-lingual configurations show that linguistically motivated intermediate fine-tuning has generally a positive impact on target task performance, especially when applied to smaller models and in scenarios with limited data availability}, KEYWORDS = {encoder-decoder, intermediate fine-tuning, linguistic features, sentence complexity}, PAGES = {10539--10554}, URL = {https://aclanthology.org/2024.lrec-main.922/}, PUBLISHER = {ELRA and ICCL}, ISBN = {978-2-493814-10-4}, CONFERENCE_NAME = {Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)}, BOOKTITLE = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)}, } @INPROCEEDINGS{MOTGER_2024_INPROCEEDINGS_MMDFM_519997, AUTHOR = {Motger, Q. and Miaschi, A. and Dell'Orletta, F. and Franch, X. and Marco, J.}, TITLE = {T-FREX: A Transformer-based Feature Extraction Method from Mobile App Reviews}, YEAR = {2024}, ABSTRACT = {Mobile app reviews are a large-scale data source for software-related knowledge generation activities, including software maintenance, evolution and feedback analysis. Effective extraction of features (i.e., functionalities or characteristics) from these reviews is key to support analysis on the acceptance of these features, identification of relevant new feature requests and prioritization of feature development, among others. Traditional methods focus on syntactic pattern-based approaches, typically context-agnostic, evaluated on a closed set of apps, difficult to replicate and limited to a reduced set and domain of apps. Meanwhile, the pervasiveness of Large Language Models (LLMs) based on the Transformer architecture in software engineering tasks lays the groundwork for empirical evaluation of the performance of these models to support feature extraction. 
In this study, we present T-FREX, a Transformer-based, fully automatic approach for mobile app review feature extraction. First, we collect a set of ground truth features from users in a real crowdsourced software recommendation platform and transfer them automatically into a dataset of app reviews. Then, we use this newly created dataset to fine-tune multiple LLMs on a named entity recognition task under different data configurations. We assess the performance of T-FREX with respect to this ground truth, and we complement our analysis by comparing T-FREX with a baseline method from the field. Finally, we assess the quality of new features predicted by T-FREX through an external human evaluation. Results show that T-FREX outperforms on average the traditional syntactic-based method, especially when discovering new features from a domain for which the model has been fine-tuned}, KEYWORDS = {feature extraction, large language models, mobile apps, named entity recognition, reviews, token classification}, PAGES = {227--238}, URL = {https://iris.cnr.it/handle/20.500.14243/519997}, DOI = {10.1109/SANER60148.2024.00030}, PUBLISHER = {Institute of Electrical and Electronics Engineers Inc}, CONFERENCE_NAME = {31st IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024}, BOOKTITLE = {Proceedings - 2024 IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024}, } @ARTICLE{ALZETTA_2023_ARTICLE_ADMPV_439017, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Miaschi, A. and Prat, E. 
and Venturi, G.}, TITLE = {Tell me how you write and I'll tell you what you read: a study on the writing style of book reviews}, YEAR = {2023}, ABSTRACT = {The paper aims at investigating variations in the writing style of book reviews published on different social reading platforms and referring to books of different genres, which enables acquiring insights into communication strategies adopted by readers to share their reading experiences. To this end, we introduce a corpus-based study focused on the analysis of A Good Review, a novel corpus of online book reviews written in Italian, posted on Amazon and Goodreads, and covering six literary fiction genres. We rely on stylometric analysis to explore the linguistic properties and lexicon of reviews and the authors conducted automatic classification experiments using multiple approaches and feature configurations to predict either the review's platform or the literary genre. The analysis of user-generated reviews demonstrates that language is a quite variable dimension across reading platforms, but not as much across book genres. The classification experiments revealed that features modelling the syntactic structure of the sentence are reliable proxies for discerning Amazon and Goodreads reviews, whereas lexical information showed a higher predictive role for automatically discriminating the genre}, KEYWORDS = {Stylometric analysis, Textual Genre detection, Book reviews}, PAGES = {23}, URL = {https://www.emerald.com/insight/content/doi/10.1108/JD-04-2023-0073/full/html}, VOLUME = {79}, DOI = {10.1108/JD-04-2023-0073}, ISSN = {0022-0418}, JOURNAL = {Journal of Documentation}, } @ARTICLE{MIASCHI_2023_ARTICLE_MABDV_439018, AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {Testing the Effectiveness of the Diagnostic Probing Paradigm on Italian Treebanks}, YEAR = {2023}, ABSTRACT = {The outstanding performance recently reached by neural language models (NLMs) across many natural language processing (NLP) tasks has steered the debate towards understanding whether NLMs implicitly learn linguistic competence. Probes, i.e., supervised models trained using NLM representations to predict linguistic properties, are frequently adopted to investigate this issue. However, it is still questioned if probing classification tasks really enable such investigation or if they simply hint at surface patterns in the data. This work contributes to this debate by presenting an approach to assessing the effectiveness of a suite of probing tasks aimed at testing the linguistic knowledge implicitly encoded by one of the most prominent NLMs, BERT. To this aim, we compared the performance of probes when predicting gold and automatically altered values of a set of linguistic features. Our experiments were performed on Italian and were evaluated across BERT's layers and for sentences with different lengths. As a general result, we observed higher performance in the prediction of gold values, thus suggesting that the probing model is sensitive to the distortion of feature values. However, our experiments also showed that the length of a sentence is a highly influential factor that is able to confound the probing model's predictions}, KEYWORDS = {Neural language model, Probing tasks, Treebanks}, PAGES = {19}, URL = {https://www.mdpi.com/2078-2489/14/3/144}, VOLUME = {14}, NUMBER = {3}, DOI = {10.3390/info14030144}, ISSN = {2078-2489}, JOURNAL = {Information}, } @INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ABDMSSV_470901, AUTHOR = {Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Miaschi, A. and Sagae, K. and Sánchez Gutiérrez, C. H. 
and Venturi, G.}, TITLE = {LangLearn at EVALITA 2023: Overview of the Language Learning Development Task}, YEAR = {2023}, ABSTRACT = {Language Learning Development (LangLearn) is the EVALITA 2023 shared task on automatic language development assessment, which consists in predicting the evolution of the written language abilities of learners across time. LangLearn is conceived to be multilingual, relying on written productions of Italian and Spanish learners, and representative of L1 and L2 learning scenarios. A total of 9 systems were submitted by 5 teams. The results highlight the open challenges of automatic language development assessment}, URL = {https://iris.cnr.it/handle/20.500.14243/470901}, PUBLISHER = {Accademia University Press (Torino, ITA)}, ISBN = {9791255000693}, CONFERENCE_NAME = {8th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian}, CONFERENCE_PLACE = {Torino}, BOOKTITLE = {Proceedings of EVALITA 2023}, } @INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ADFMV_470921, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Fazzone, C. and Miaschi, A. and Venturi, G.}, TITLE = {Unmasking the Wordsmith: Revealing Author Identity through Reader Reviews}, YEAR = {2023}, ABSTRACT = {Traditional genre-based approaches for book recommendations face challenges due to the vague definition of genres. To overcome this, we propose a novel task called Book Author Prediction, where we predict the author of a book based on user-generated reviews’ writing style. To this aim, we first introduce the ‘Literary Voices Corpus’ (LVC), a dataset of Italian book reviews, and use it to train and test machine learning models. 
Our study contributes valuable insights for developing user-centric systems that recommend leisure readings based on individual readers’ interests and writing styles}, URL = {https://ceur-ws.org/Vol-3596/paper4.pdf}, CONFERENCE_NAME = {9th Italian Conference on Computational Linguistics}, BOOKTITLE = {Proceedings of the 9th Italian Conference on Computational Linguistics}, } @INPROCEEDINGS{MIASCHI_2023_INPROCEEDINGS_MPD_520527, AUTHOR = {Miaschi, A. and Papucci, M. and Dell'Orletta, F.}, TITLE = {Lost in Labels: An Ongoing Quest to Optimize Text-to-Text Label Selection for Classification}, YEAR = {2023}, ABSTRACT = {In this paper, we present an evaluation of the influence of label selection on the performance of a Sequence-to-Sequence Transformer model in a classification task. Our study investigates whether the choice of words used to represent classification categories affects the model’s performance, and if there exists a relationship between the model’s performance and the selected words. To achieve this, we fine-tuned an Italian T5 model on topic classification using various labels. Our results indicate that the different label choices can significantly impact the model’s performance. That being said, we did not find a clear answer on how these choices affect the model performances, highlighting the need for further research in optimizing label selection}, KEYWORDS = {encoder-decoder, label selection, topic classification}, URL = {https://iris.cnr.it/handle/20.500.14243/520527}, VOLUME = {516 (394)}, BOOKTITLE = {Proceedings of the 9th Italian Conference on Computational Linguistics CLiC-it 2023: Venice, Italy, November 30-December 2, 2023}, } @ARTICLE{MIASCHI_2022_ARTICLE_MBDV_417257, AUTHOR = {Miaschi, A. and Brunato, D. P. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {On Robustness and Sensitivity of a Neural Language Model: A Case Study on Italian L1 Learner Errors}, YEAR = {2022}, ABSTRACT = {In this paper, we propose a comprehensive linguistic study aimed at assessing the implicit behavior of one of the most prominent Neural Language Models (NLM) based on Transformer architectures, BERT (Devlin et al., 2019), when dealing with a particular source of noisy data, namely essays written by L1 Italian learners containing a variety of errors targeting grammar, orthography and lexicon. Differently from previous works, we focus on the pre-training stage and we devise two complementary evaluation tasks aimed at assessing the impact of errors on sentence-level inner representations in terms of semantic robustness and linguistic sensitivity. While the first evaluation perspective is meant to probe the model's ability to encode the semantic similarity between sentences also in the presence of errors, the second type of probing task evaluates the influence of errors on BERT's implicit knowledge of a set of raw and morpho-syntactic properties of a sentence. Our experiments show that BERT's ability to compute sentence similarity and to correctly encode multi-leveled linguistic information of a sentence are differently modulated by the category of errors and that the error hierarchies in terms of robustness and sensitivity change across layer-wise representations}, KEYWORDS = {Natural Language Processing, Neural Language Model, Interpretability}, PAGES = {426--438}, URL = {https://doi.org/10.1109/TASLP.2022.3226333}, VOLUME = {31}, DOI = {10.1109/TASLP.2022.3226333}, ISSN = {2329-9290}, JOURNAL = {IEEE/ACM Transactions on Audio, Speech, and Language Processing}, } @ARTICLE{MIASCHI_2022_ARTICLE_MSBDV_443057, AUTHOR = {Miaschi, A. and Sarti, G. and Brunato, D. P. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {Probing Linguistic Knowledge in Italian Neural Language Models across Language Varieties}, YEAR = {2022}, ABSTRACT = {In this paper, we present an in-depth investigation of the linguistic knowledge encoded by the transformer models currently available for the Italian language. In particular, we investigate how the complexity of two different architectures of probing models affects the performance of the Transformers in encoding a wide spectrum of linguistic features. Moreover, we explore how this implicit knowledge varies according to different textual genres and language varieties}, KEYWORDS = {Neural Language Models, Interpretability, Language Varieties}, PAGES = {25--44}, URL = {http://www.aaccademia.it/ita/scheda-libro?aaref=1518}, DOI = {10.4000/ijcol.965}, ISSN = {2499-4553}, JOURNAL = {Italian Journal of Computational Linguistics}, } @INPROCEEDINGS{MIASCHI_2022_INPROCEEDINGS_MRD_443056, AUTHOR = {Miaschi, A. and Ravelli, A. and Dell'Orletta, F.}, TITLE = {Punctuation Restoration in Spoken Italian Transcripts with Transformers}, YEAR = {2022}, ABSTRACT = {In this paper, we propose an evaluation of a Transformer-based punctuation restoration model for the Italian language. Experimenting with a BERT-base model, we perform several fine-tuning with different training data and sizes and tested them in an in- and cross-domain scenario. Moreover, we conducted an error analysis of the main weaknesses of the model related to specific punctuation marks. 
Finally, we test our system either quantitatively and qualitatively, by offering a typical task-oriented and a perception-based acceptability evaluation}, KEYWORDS = {nlp, transformer models, punctuation restoration}, PAGES = {245--260}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85135083576\&origin=inward}, VOLUME = {13196}, SERIES = {Lecture Notes in Artificial Intelligence}, DOI = {10.1007/978-3-031-08421-8_17}, CONFERENCE_NAME = {AIxIA 2021-Advances in Artificial Intelligence}, BOOKTITLE = {Proceedings of AIxIA 2021-Advances in Artificial Intelligence}, } @INPROCEEDINGS{PAPUCCI_2022_INPROCEEDINGS_PDMD_415084, AUTHOR = {Papucci, M. and De Nigris, C. and Miaschi, A. and Dell'Orletta, F.}, TITLE = {Evaluating Text-To-Text Framework for Topic and Style Classification of Italian texts}, YEAR = {2022}, ABSTRACT = {In this paper, we propose an extensive evaluation of the first text-to-text Italian Neural Language Model (NLM), IT5 [1], on a classification scenario. In particular, we test the performance of IT5 on several tasks involving both the classification of the topic and the style of a set of Italian posts. We assess the model in two different configurations, single- and multi-task classification, and we compare it with a more traditional NLM based on the Transformer architecture (i.e., BERT). Moreover, we test its performance in a few-shot learning scenario. We also perform a qualitative investigation on the impact of label representations in modeling the classification of the IT5 model. Results show that IT5 could achieve good results, although generally lower than the BERT model. Nevertheless, we observe a significant performance improvement of the Text-to-text model in a multi-task classification scenario. 
Finally, we found that altering the representation of the labels mainly impacts the classification of the topic}, KEYWORDS = {bert, style classification, t5, text-to-text, topic classification, transformers}, PAGES = {56--70}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85143252156\&origin=inward}, VOLUME = {3287}, CONFERENCE_NAME = {Sixth Workshop on Natural Language for Artificial Intelligence, NL4AI 2022}, } @ARTICLE{MIASCHI_2021_ARTICLE_MBD_402654, AUTHOR = {Miaschi, A. and Brunato, D. P. and Dell'Orletta, F.}, TITLE = {A NLP-based stylometric approach for tracking the evolution of L1 written language competence}, YEAR = {2021}, ABSTRACT = {In this study we present a Natural Language Processing (NLP)-based stylometric approach for tracking the evolution of written language competence in Italian L1 learners. The approach relies on a wide set of linguistically motivated features capturing stylistic aspects of a text, which were extracted from students' essays contained in CItA (Corpus Italiano di Apprendenti L1), the first longitudinal corpus of texts written by Italian L1 learners enrolled in the first and second year of lower secondary school. We address the problem of modeling written language development as a supervised classification task consisting in predicting the chronological order of essays written by the same student at different temporal spans. The promising results obtained in several classification scenarios allow us to conclude that it is possible to automatically model the highly relevant changes affecting written language evolution across time, as well as identifying which features are more predictive of this process. 
In the last part of the article, we focus the attention on the possible influence of background variables on language learning and we present preliminary results of a pilot study aiming at understanding how the observed developmental patterns are affected by information related to the school environment of the student}, KEYWORDS = {stylometry, computational linguistics, language competence}, PAGES = {71--105}, URL = {https://www.jowr.org/abstracts/vol13_1/Miaschi_et_al_2021_13_1_abstract.html}, VOLUME = {13}, DOI = {10.17239/jowr-2021.13.01.03}, ISSN = {2030-1006}, JOURNAL = {Journal of Writing Research}, } @INPROCEEDINGS{ALBERTIN_2021_INPROCEEDINGS_AMB_440996, AUTHOR = {Albertin, G. and Miaschi, A. and Brunato, D.}, TITLE = {On the role of textual connectives in sentence comprehension: A new dataset for Italian}, YEAR = {2021}, ABSTRACT = {In this paper we present a new evaluation resource for Italian aimed at assessing the role of textual connectives in the comprehension of the meaning of a sentence. The resource is arranged in two sections (acceptability assessment and cloze test), each one corresponding to a distinct challenge task conceived to test how subtle modifications involving connectives in real usage sentences influence the perceived acceptability of the sentence by native speakers and Neural Language Models (NLMs). Although the main focus is the presentation of the dataset, we also provide some preliminary data comparing human judgments and NLMs performance in the two tasks}, KEYWORDS = {neural language models, textual connectives, sentence acceptability}, URL = {http://ceur-ws.org/Vol-3033/paper16.pdf}, VOLUME = {3033}, CONFERENCE_NAME = {8th Italian Conference on Computational Linguistics (CLIC-it 2021)}, } @INPROCEEDINGS{MESSINA_2021_INPROCEEDINGS_MBCMPSN_400472, AUTHOR = {Messina, L. and Busso, L. and Combei, C. R. and Miaschi, A. and Pannitto, L. and Sarti, G. 
and Nissim, M.}, TITLE = {A dissemination workshop for introducing young Italian students to NLP}, YEAR = {2021}, ABSTRACT = {We describe and make available the game-based material developed for a laboratory run at several Italian science festivals to popularize NLP among young students}, KEYWORDS = {nlp, teaching}, PAGES = {52--54}, URL = {https://www.aclweb.org/anthology/2021.teachingnlp-1.7}, ISBN = {978-1-954085-36-7}, CONFERENCE_NAME = {5th Workshop on Teaching NLP}, BOOKTITLE = {Proceedings of the 5th Workshop on Teaching NLP}, } @INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MABDV_446048, AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Probing tasks under pressure}, YEAR = {2021}, ABSTRACT = {Probing tasks are frequently used to evaluate whether the representations of Neural Language Models (NLMs) encode linguistic information. However, it is still questioned if probing classification tasks really enable such investigation or they simply hint for surface patterns in the data. We present a method to investigate this question by comparing the accuracies of a set of probing tasks on gold and automatically generated control datasets. Our results suggest that probing tasks can be used as reliable diagnostic methods to investigate the linguistic information encoded in NLMs representations}, KEYWORDS = {Neural Language Models, Linguistic probing, Treebanks}, PAGES = {1--7}, URL = {http://ceur-ws.org/Vol-3033/paper29.pdf}, VOLUME = {3033}, CONFERENCE_NAME = {8th Italian Conference on Computational Linguistics (CLIC-it 2021)}, } @INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MBDV_400474, AUTHOR = {Miaschi, A. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.}, TITLE = {What Makes My Model Perplexed? 
A Linguistic Investigation on Neural Language Models Perplexity}, YEAR = {2021}, ABSTRACT = {This paper presents an investigation aimed at studying how the linguistic structure of a sentence affects the perplexity of two of the most popular Neural Language Models (NLMs), BERT and GPT-2. We first compare the sentence-level likelihood computed with BERT and the GPT-2's perplexity showing that the two metrics are correlated. In addition, we exploit linguistic features capturing a wide set of morpho-syntactic and syntactic phenomena showing how they contribute to predict the perplexity of the two NLMs}, KEYWORDS = {nlp, interpretability, deep learning}, PAGES = {40--47}, URL = {https://www.aclweb.org/anthology/2021.deelio-1.5}, ISBN = {978-1-954085-30-5}, CONFERENCE_NAME = {2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures}, BOOKTITLE = {Proceedings of the 2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures}, } @INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MRD_443055, AUTHOR = {Miaschi, A. and Ravelli, A. A. and Dell'Orletta, F.}, TITLE = {Evaluating Transformer Models for Punctuation Restoration in Italian}, YEAR = {2021}, ABSTRACT = {In this paper, we propose an evaluation of a Transformer-based punctuation restoration model for the Italian language. Experimenting with a BERT-base model, we perform several fine-tuning with different training data and sizes and tested them in an in- and cross-domain scenario. Moreover, we offer a comparison in a multilingual setting with the same model fine-tuned on English transcriptions. 
Finally, we conclude with an error analysis of the main weaknesses of the model related to specific punctuation marks}, KEYWORDS = {transformer models, nlp, punctuation restoration}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85121647978\&origin=inward}, VOLUME = {3015}, CONFERENCE_NAME = {5th Workshop on Natural Language for Artificial Intelligence (NL4AI 2021)}, } @INPROCEEDINGS{PANNITTO_2021_INPROCEEDINGS_PBCMMSN_400471, AUTHOR = {Pannitto, L. and Busso, L. and Combei, C. R. and Messina, L. and Miaschi, A. and Sarti, G. and Nissim, M.}, TITLE = {Teaching NLP with Bracelets and Restaurant Menus: An Interactive Workshop for Italian Students}, YEAR = {2021}, ABSTRACT = {Although Natural Language Processing is at the core of many tools young people use in their everyday life, high school curricula (in Italy) do not include any computational linguistics education. This lack of exposure makes the use of such tools less responsible than it could be, and makes choosing computational linguistics as a university degree unlikely. To raise awareness, curiosity, and longer-term interest in young people, we have developed an interactive workshop designed to illustrate the basic principles of NLP and computational linguistics to high school Italian students aged between 13 and 18 years. The workshop takes the form of a game in which participants play the role of machines needing to solve some of the most common problems a computer faces in understanding language: from voice recognition to Markov chains to syntactic parsing. Participants are guided through the workshop with the help of instructors, who present the activities and explain core concepts from computational linguistics. 
The workshop was presented at numerous outlets in Italy between 2019 and 2020, both face-to-face and online}, KEYWORDS = {nlp, teaching}, PAGES = {160--170}, URL = {https://www.aclweb.org/anthology/2021.teachingnlp-1.26}, ISBN = {978-1-954085-36-7}, CONFERENCE_NAME = {5th Workshop on Teaching NLP}, BOOKTITLE = {Proceedings of the 5th Workshop on Teaching NLP}, } @INPROCEEDINGS{PUCCETTI_2021_INPROCEEDINGS_PMD_400473, AUTHOR = {Puccetti, G. and Miaschi, A. and Dell'Orletta, F.}, TITLE = {How do BERT embeddings organize linguistic knowledge?}, YEAR = {2021}, ABSTRACT = {Several studies investigated the linguistic information implicitly encoded in Neural Language Models. Most of these works focused on quantifying the amount and type of information available within their internal representations and across their layers. In line with this scenario, we proposed a different study, based on Lasso regression, aimed at understanding how the information encoded by BERT sentence-level representations is arranged within its hidden units. Using a suite of several probing tasks, we showed the existence of a relationship between the implicit knowledge learned by the model and the number of individual units involved in the encodings of this competence. Moreover, we found that it is possible to identify groups of hidden units more relevant for specific linguistic properties}, KEYWORDS = {NLP, Interpretability, Deep Learning}, PAGES = {48--57}, URL = {https://www.aclweb.org/anthology/2021.deelio-1.6}, DOI = {10.18653/v1/2021.deelio-1.6}, ISBN = {978-1-954085-30-5}, CONFERENCE_NAME = {2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures}, BOOKTITLE = {Proceedings of the 2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures}, } @INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_AMDKFTI_421771, AUTHOR = {Alzetta, C. and Miaschi, A. and Dell'Orletta, F. 
and Koceva and Frosina and Torre and Ilaria}, TITLE = {PRELEARN @ EVALITA 2020: Overview of the Prerequisite Relation Learning Task for Italian}, YEAR = {2020}, ABSTRACT = {The Prerequisite Relation Learning (PRELEARN) task is the EVALITA 2020 shared task on concept prerequisite learning, which consists of classifying prerequisite relations between pairs of concepts distinguishing between prerequisite pairs and non-prerequisite pairs. Four sub-tasks were defined: two of them define different types of features that participants are allowed to use when training their model, while the other two define the classification scenarios where the proposed models would be tested. In total, 14 runs were submitted by 3 teams comprising 9 total individual participants}, KEYWORDS = {nlp, prerequisite learning, shared task}, URL = {http://ceur-ws.org/Vol-2765/paper164.pdf}, CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)}, BOOKTITLE = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)}, } @INPROCEEDINGS{DEMATTEI_2020_INPROCEEDINGS_DDIMPR_421769, AUTHOR = {De Mattei, L. and De Martino, G. and Iovine, A. and Miaschi, A. and Polignano, M. and Rambelli, G.}, TITLE = {ATE ABSITA@ EVALITA2020: Overview of the Aspect Term Extraction and Aspect-based Sentiment Analysis Task}, YEAR = {2020}, ABSTRACT = {Over the last years, the rise of novel sentiment analysis techniques to assess aspect-based opinions on product reviews has become a key component for providing valuable insights to both consumers and businesses. To this extent, we propose ATE\_ABSITA: the EVALITA 2020 shared task on Aspect Term Extraction and Aspect-Based Sentiment Analysis. In particular, we approach the task as a cascade of three subtasks: Aspect Term Extraction (ATE), Aspect-based Sentiment Analysis (ABSA) and Sentiment Analysis (SA). 
Therefore, we invited participants to submit systems designed to automatically identify the "aspect terms" in each review and to predict the sentiment expressed for each aspect, along with the sentiment of the entire review. The task received broad interest, with 27 teams registered and more than 45 participants. However, only three teams submitted their working systems. The results obtained underline the task's difficulty, but they also show how it is possible to deal with it using innovative approaches and models. Indeed, two of them are based on large pre-trained language models as typical in the current state of the art for the English language}, KEYWORDS = {nlp, sentiment analysis, shared task}, URL = {http://ceur-ws.org/Vol-2765/paper153.pdf}, CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)}, BOOKTITLE = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MABDV_421767, AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Is Neural Language Model Perplexity Related to Readability?}, YEAR = {2020}, ABSTRACT = {This paper explores the relationship between Neural Language Model (NLM) perplexity and sentence readability. Starting from the evidence that NLMs implicitly acquire sophisticated linguistic knowledge from a huge amount of training data, our goal is to investigate whether perplexity is affected by linguistic features used to automatically assess sentence readability and if there is a correlation between the two metrics. 
Our findings suggest that this correlation is actually quite weak and the two metrics are affected by different linguistic phenomena}, KEYWORDS = {nlp, neural language models, readability}, URL = {http://ceur-ws.org/Vol-2769/paper_57.pdf}, ISBN = {979-12-80136-28-2}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics}, BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MBDV_379646, AUTHOR = {Miaschi, A. and Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Linguistic Profiling of a Neural Language Model}, YEAR = {2020}, ABSTRACT = {In this paper we investigate the linguistic knowledge learned by a Neural Language Model (NLM) before and after a fine-tuning process and how this knowledge affects its predictions during several classification problems. We use a wide set of probing tasks, each of which corresponds to a distinct sentence-level feature extracted from different levels of linguistic annotation. We show that BERT is able to encode a wide range of linguistic characteristics, but it tends to lose this information when trained on specific downstream tasks. We also find that BERT's capacity to encode different kind of linguistic properties has a positive influence on its predictions: the more it stores readable linguistic information of a sentence, the higher will be its capacity of predicting the expected label assigned to that sentence}, KEYWORDS = {Linguistic Profiling, Neural Language Model, Interpretability}, PAGES = {745-756}, URL = {https://www.aclweb.org/anthology/2020.coling-main.65/}, DOI = {10.18653/v1/2020.coling-main.65}, ISBN = {978-1-952148-27-9}, CONFERENCE_NAME = {International Conference on Computational Linguistics (COLING)}, BOOKTITLE = {International Conference on Computational Linguistics (COLING)}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MDBDSSV_384933, AUTHOR = {Miaschi, A. and Davidson, S. and Brunato, D. P. 
and Dell'Orletta, F. and Sagae, K. and Sanchez Gutierrez, C. H. and Venturi, G.}, TITLE = {Tracking the Evolution of Written Language Competence in L2 Spanish Learners}, YEAR = {2020}, ABSTRACT = {In this paper we present an NLP-based approach for tracking the evolution of written language competence in L2 Spanish learners using a wide range of linguistic features automatically extracted from students' written productions. Beyond reporting classification results for different scenarios, we explore the connection between the most predictive features and the teaching curriculum, finding that our set of linguistic features often reflects the explicit instruction that students receive during each course}, KEYWORDS = {Evolution of Language Competence, Natural Language Processing, Linguistic Profiling}, PAGES = {92-101}, URL = {https://www.aclweb.org/anthology/2020.bea-1.9.pdf}, DOI = {10.18653/v1/W16-05}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-941643-83-9}, CONFERENCE_NAME = {15th Workshop on Innovative Use of NLP for Building Educational Applications}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Proceedings of 15th Workshop on Innovative Use of NLP for Building Educational Applications}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MD_421763, AUTHOR = {Miaschi, A. and Dell'Orletta, F.}, TITLE = {Contextual and Non-Contextual Word Embeddings: an in-depth Linguistic Investigation}, YEAR = {2020}, ABSTRACT = {In this paper we present a comparison between the linguistic knowledge encoded in the internal representations of a contextual Language Model (BERT) and a contextual-independent one (Word2vec). We use a wide set of probing tasks, each of which corresponds to a distinct sentence-level feature extracted from different levels of linguistic annotation. 
We show that, although BERT is capable of understanding the full context of each word in an input sequence, the implicit knowledge encoded in its aggregated sentence representations is still comparable to that of a contextual-independent model. We also find that BERT is able to encode sentence-level properties even within single-word embeddings, obtaining comparable or even superior results than those obtained with sentence representations}, KEYWORDS = {nlp, interpretability, representation learning}, PAGES = {110-119}, URL = {https://www.aclweb.org/anthology/2020.repl4nlp-1.15}, DOI = {10.18653/v1/2020.repl4nlp-1.15}, ISBN = {978-1-952148-15-6}, CONFERENCE_NAME = {5th Workshop on Representation Learning for NLP}, BOOKTITLE = {Proceedings of the 5th Workshop on Representation Learning for NLP}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MSBDV_421765, AUTHOR = {Miaschi, A. and Sarti, G. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Italian Transformers Under the Linguistic Lens}, YEAR = {2020}, ABSTRACT = {In this paper we present an in-depth investigation of the linguistic knowledge encoded by the transformer models currently available for the Italian language. In particular, we investigate whether and how using different architectures of probing models affects the performance of Italian transformers in encoding a wide spectrum of linguistic features. Moreover, we explore how this implicit knowledge varies according to different textual genres}, KEYWORDS = {nlp, neural language models, interpretability}, URL = {http://ceur-ws.org/Vol-2769/paper_56.pdf}, ISBN = {979-12-80136-28-2}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it)}, BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics (CLiC-it)}, } @INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_AMADKPT_390427, AUTHOR = {Alzetta, C. and Miaschi, A. and Adorni, G. and Dell'Orletta, F. and Koceva, F. and Passalacqua, S. 
and Torre, I.}, TITLE = {Prerequisite or not prerequisite? That's the problem! An NLP-based Approach for Concept Prerequisites Learning}, YEAR = {2019}, ABSTRACT = {This paper presents a method for prerequisite learning classification between educational concepts. The proposed system was developed by adapting a classification algorithm designed for sequencing Learning Objects to the task of ordering concepts from a computer science textbook. In order to apply the system to the new task, for each concept we automatically created a learning unit from the textbook using two criteria based on concept occurrences and burst intervals. Results are promising and suggest that further improvements could highly benefit the results}, URL = {https://iris.cnr.it/handle/20.500.14243/390427}, } @INPROCEEDINGS{MIASCHI_2019_INPROCEEDINGS_MACD_390439, AUTHOR = {Miaschi, A. and Alzetta, C. and Cardillo, F. A. and Dell'Orletta, F.}, TITLE = {Linguistically-Driven Strategy for Concept Prerequisites Learning on Italian}, YEAR = {2019}, ABSTRACT = {We present a new concept prerequisite learning method for Learning Object (LO) ordering that exploits only linguistic features extracted from textual educational resources. The method was tested in a cross-and in-domain scenario both for Italian and English. Additionally, we performed experiments based on a incremental training strategy to study the impact of the training set size on the classifier performances. 
The paper also introduces ITA-PREREQ, to the best of our knowledge the first Italian dataset annotated with prerequisite relations between pairs of educational concepts, and describe the automatic strategy devised to build it}, KEYWORDS = {Concept Prerequisites Learning}, PAGES = {285-295}, URL = {https://iris.cnr.it/handle/20.500.14243/390439}, CONFERENCE_NAME = {14th Workshop on Innovative Use of NLP for Building Educational Applications}, BOOKTITLE = {Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications}, } @ARTICLE{SALVATORI_2017_ARTICLE_SRADMM_493650, AUTHOR = {Salvatori, E. and Rosselli Del Turco, R. and Alzetta, C. and Di Pietro, C. and Mannari, C. and Miaschi, A.}, TITLE = {The Codice Pelavicino between digital edition and Public History}, YEAR = {2017}, ABSTRACT = {The Codice Pelavicino Digitale Project aims to publish an online digital edition of the relevant manuscript of the XIII century. In this paper features of the edition and related issues are addressed. Secondly we explain motivations for choosing a digital edition as a medium: we address the background, and common concerns in the context of Academy and clerical and historical archives. Finally we give insights on the international standard adopted to markup the text, i. e. XML-TEI, and EVT, a tool adopted to generate the final website and display texts and images}, KEYWORDS = {Diplomatica, Filologia digitale, Latino medievale, Storia pubblica, TEI XML}, PAGES = {105-117}, URL = {https://iris.cnr.it/handle/20.500.14243/493650}, VOLUME = {2017 (1)}, DOI = {10.6092/issn.2532-8816/7232}, ISSN = {2532-8816}, JOURNAL = {UMANISTICA DIGITALE}, }