@ARTICLE{ALZETTA_2025_ARTICLE_AMDVM_570443,
  AUTHOR = {Alzetta, C. and Miaschi, A. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.},
  TITLE = {{Parallel Trees}: a novel resource with aligned dependency and constituency syntactic representations},
  YEAR = {2025},
  ABSTRACT = {The paper introduces Parallel Trees, a novel multilingual treebank collection that includes 20 treebanks for 10 languages. The distinguishing property of this resource is that the sentences of each language are annotated using two syntactic representation paradigms (SRPs), respectively based on the notions of dependency and constituency. By aligning the annotations of existing resources, Parallel Trees represents an example of exploiting pre-existing treebanks to adapt them to novel applications. To illustrate its potential, we present a case study where the resource is employed as a benchmark to investigate whether and how BERT, one of the first prominent neural language models (NLMs), is sensitive to the dependency- and constituency-based approaches for representing the syntactic structure of a sentence. The case study results indicate that the model's sensitivity fluctuates across languages and experimental settings. The unique nature of the Parallel Trees resource creates the prerequisites for innovative studies comparing dependency and phrase-structure trees, allowing for more focused investigations without the interference of lexical variation},
  KEYWORDS = {Parallel treebanks, Syntactic representation, Diagnostic probing paradigm, Neural language model},
  PAGES = {3445--3485},
  URL = {https://iris.cnr.it/handle/20.500.14243/570443},
  VOLUME = {59},
  NUMBER = {4},
  DOI = {10.1007/s10579-025-09826-3},
  ISSN = {1574-020X},
  JOURNAL = {Language Resources and Evaluation},
}

@ARTICLE{ALZETTA_2025_ARTICLE_AT_545881,
  AUTHOR = {Alzetta, C.
and Torre, I.},
  TITLE = {{Prerequisite Relations Annotation Tool}: Annotation and analysis of educational relations in texts},
  YEAR = {2025},
  ABSTRACT = {Relations between terms in texts have long been studied in linguistics and specialized knowledge domains, especially when occurring in educational materials like textbooks, where they play a crucial role in guiding instructional design and learning. Prerequisite relations (PR), which determine the sequence of presentation of domain terms, are particularly crucial for effective learning. Therefore, the authors consider them carefully when writing instructional texts. The reverse process of identifying PR within texts aims to extract the inherent knowledge structure they are based on and is a key task in the field of corpora annotation for educational knowledge modeling. Although there are tools for manual annotation, there is a need for specialized tools tailored to the unique properties of PR, enabling easy creation, analysis, and sharing of annotated datasets. In this paper, we introduce Prerequisite Relations Annotation Tool (PRAT), a novel tool designed for annotating PR based on a validated protocol. PRAT simplifies the process of capturing, analyzing, and visualizing prerequisite structures in educational texts. We outline PRAT's architecture and functionalities, emphasizing its unique features compared to existing corpora annotation tools. Through a user study involving users with diverse backgrounds, we show PRAT's effectiveness in real-world scenarios},
  KEYWORDS = {Annotation, Annotation Tool, Prerequisite Relations, User Evaluation, Usability},
  PAGES = {1--22},
  URL = {https://asistdl.onlinelibrary.wiley.com/doi/10.1002/asi.24992?af=R},
  DOI = {10.1002/asi.24992},
  ISSN = {2330-1643},
  JOURNAL = {Journal of the Association for Information Science and Technology},
}

@ARTICLE{SOLASALES_2025_ARTICLE_SAMD_570482,
  AUTHOR = {Sola Sales, S. and Alzetta, C. and Moret Tatay, C.
and Dell'Orletta, F.},
  TITLE = {When Time Matters: Exploring the Impact of Recall Techniques and Educational Levels on Witness Testimony Quality},
  YEAR = {2025},
  ABSTRACT = {Mental reconstruction (MRC) and Free Recall (FR) have been recognized for enhancing the quality of witness statements. However, the mechanisms underlying this association remain insufficiently understood. This study explores how the time allocated to MRC and FR and variations in educational level influence the quality of eyewitness testimonies. Testimony quality is evaluated based on manually annotated content information provided by experts in testimony assessment, which measures adherence to the events. This is further complemented by fine-grained linguistic features, automatically extracted using linguistic analysis tools, to capture stylistic aspects. As a proof of concept, the analysis is performed on a corpus of 96 testimonies in Spanish describing two robbery cases. The results suggest that both mental reconstruction and narration times positively impact the accuracy of testimonies, as inaccuracies predominantly involve peripheral details. Furthermore, while the study confirms that educational level affects testimony quality, no significant differences were observed in the frequency of erroneous reports. This study contributes to the understanding of the relationship between cognitive strategies and the accuracy of witness statements, proposing an analysis approach applicable to forensic psychology for witness assessment},
  KEYWORDS = {automatic linguistic analysis, content analysis, eyewitness testimonies, free recall, mental reinstatement of context},
  URL = {https://iris.cnr.it/handle/20.500.14243/570482},
  VOLUME = {16},
  NUMBER = {2},
  DOI = {10.3390/info16020122},
  ISSN = {2078-2489},
  JOURNAL = {Information},
}

@INPROCEEDINGS{ALZETTA_2025_INPROCEEDINGS_AM_571223,
  AUTHOR = {Alzetta, C. and Montemagni, S.},
  TITLE = {Low- vs High-level Lemmatization for Historical Languages.
A Case study on {Italian}},
  YEAR = {2025},
  ABSTRACT = {Lemmatization remains a foundational yet challenging task in the processing of historical Italian texts, due to the complex interplay of orthographic, morphological, and diatopic variation. A crucial, yet often overlooked, aspect is the degree of normalization applied during lemmatization. A conservative approach preserves attested historical forms, ensuring greater linguistic fidelity but increasing data sparsity. Conversely, an abstract normalization strategy aligns historical variants with standardized contemporary lemmas, improving generalization but potentially introducing inaccurate mappings. In this paper, we present a comparative evaluation of conservative and normalized lemmatization strategies for historical Italian. To our knowledge, this is the first study to explicitly assess the impact of lemmatization strategies in the context of historical languages, particularly those that are morphologically rich. Our results indicate that high-level normalization offers a promising trade-off between precision and generalization},
  KEYWORDS = {Data-driven Lemmatization, Historical Italian, Universal Dependencies, Normalization},
  PAGES = {10},
  URL = {https://aclanthology.org/2025.clicit-1.4.pdf},
  PUBLISHER = {CEUR Workshop Proceedings},
  ISBN = {979-12-243-0587-3},
  CONFERENCE_NAME = {Eleventh Italian Conference on Computational Linguistics (CLiC-it 2025)},
  BOOKTITLE = {Proceedings of the Eleventh Italian Conference on Computational Linguistics (CLiC-it 2025)},
}

@ARTICLE{ALZETTA_2024_ARTICLE_A_493660,
  AUTHOR = {Alzetta, C.},
  TITLE = {Investigating the Interplay between Text Difficulty and Prerequisite Relation Identification in Educational Texts},
  YEAR = {2024},
  ABSTRACT = {Prerequisite relations (PR) are fundamental in knowledge acquisition and the applications of Artificial Intelligence to distance learning, particularly with regard to personalized learning plans.
The role of these relations is to specify the sequence of information acquisition necessary for understanding a target concept. Despite their significance, identifying PRs in educational texts is challenging, mainly due to the lack of systematic procedures for their identification on educational texts. This paper contributes to the ongoing research on PR identification by exploring the relationship between text difficulty, assessed across various linguistic properties and target audiences, and prerequisite relations. We conducted a crowd-based study on the novel task of prerequisite concept ordering. The study yielded preliminary yet valuable insights into the impact of text difficulty on the task. Such evidence sheds light on the need to account for the linguistic properties of texts when identifying PRs, thus advancing the field’s comprehension of PRs within the educational landscape. Ultimately, we hope that this work could foster novel linguistically-aware research on PR},
  KEYWORDS = {Prerequisite relations, Text difficulty, Educational texts},
  PAGES = {39--64},
  URL = {https://journals.openedition.org/ijcol/1362},
  VOLUME = {10},
  NUMBER = {1},
  DOI = {10.4000/125nn},
  ISSN = {2499-4553},
  JOURNAL = {IJCoL},
}

@ARTICLE{ALZETTA_2024_ARTICLE_AMSV_484441,
  AUTHOR = {Alzetta, C. and Montemagni, S. and Sartor, M. and Venturi, G.},
  TITLE = {{ParlaMint-It}: an 18-karat {UD} treebank of {Italian} parliamentary speeches},
  YEAR = {2024},
  ABSTRACT = {The paper presents ParlaMint-It, a new treebank of Italian parliamentary debates, linguistically annotated based on the Universal Dependencies (UD) framework. The resource comprises 20,460 tokens and represents a hybrid language variety that is underrepresented in the UD initiative. ParlaMint-It results from a manual revision process that relies on a semi-automatic methodology able to identify sentences that are most likely to contain inconsistencies and recurrent error patterns generated by the automatic annotation.
Such a method made the revision process faster and more efficient than revising the entire treebank. In addition, it allowed the identification and correction of annotation errors resulting from linguistic constructions inconsistently represented in UD treebanks and from characteristics specific to parliamentary speeches. Hence, the treebank is deemed as an 18-karat resource, since, although not fully manually revised, it is a valuable resource for researchers working on Italian language processing tasks},
  KEYWORDS = {Universal dependencies treebanks, Annotation revision, Italian parliamentary debates, Linguistic annotation},
  PAGES = {25},
  URL = {https://link.springer.com/content/pdf/10.1007/s10579-024-09748-6.pdf},
  DOI = {10.1007/s10579-024-09748-6},
  ISSN = {1574-020X},
  JOURNAL = {Language Resources and Evaluation},
}

@BOOK{ALZETTA_2024_BOOK_A_493657,
  AUTHOR = {Alzetta, C.},
  TITLE = {Unlocking Knowledge in the Digital Age. A Guide to Modelling Propaedeutic Relations in Educational Texts},
  YEAR = {2024},
  ABSTRACT = {This volume, intended for educational researchers and practitioners, discusses the pivotal role of prerequisite relations between educational contents in shaping learning paths and offers tools for exploring and analyzing teaching materials. It demonstrates how uncovering the structured representation of educational text content fosters its dynamic and adaptive navigation, all while tackling the complexities of identifying prerequisite relations within such texts. Through a multidisciplinary methodology integrating corpus annotation, knowledge modelling, and deep textual analysis, the volume illustrates the interplay between form and content in textual materials, underscoring the importance of employing level-appropriate language for fostering effective learning.
The efficacy of this approach is demonstrated through case studies on content modelling and textbook exploration that illustrate its potential to enhance teaching and learning across diverse domains},
  KEYWORDS = {Prerequisite relations, Annotation, Modelling Framework, Educational application},
  URL = {https://iris.cnr.it/handle/20.500.14243/493657},
  PUBLISHER = {Edizioni ETS},
  ADDRESS = {Pisa},
  ISBN = {9788846769237},
  CONFERENCE_PLACE = {Pisa},
}

@INPROCEEDINGS{PUCCETTI_2024_INPROCEEDINGS_PRADE_519993,
  AUTHOR = {Puccetti, G. and Rogers, A. and Alzetta, C. and Dell'Orletta, F. and Esuli, A.},
  TITLE = {{AI} 'News' Content Farms Are Easy to Make and Hard to Detect: A Case Study in {Italian}},
  YEAR = {2024},
  ABSTRACT = {Large Language Models (LLMs) are increasingly used as 'content farm' models (CFMs), to generate synthetic text that could pass for real news articles. This is already happening even for languages that do not have high-quality monolingual LLMs. We show that fine-tuning Llama (v1), mostly trained on English, on as little as 40K Italian news articles, is sufficient for producing news-like texts that native speakers of Italian struggle to identify as synthetic. We investigate three LLMs and three methods of detecting synthetic texts (log-likelihood, DetectGPT, and supervised classification), finding that they all perform better than human raters, but they are all impractical in the real world (requiring either access to token likelihood information or a large dataset of CFM texts). We also explore the possibility of creating a proxy CFM: an LLM fine-tuned on a similar dataset to one used by the real 'content farm'. We find that even a small amount of fine-tuning data suffices for creating a successful detector, but we need to know which base LLM is used, which is a major challenge. Our results suggest that there are currently no practical methods for detecting synthetic news-like texts 'in the wild', while generating them is too easy.
We highlight the urgency of more NLP research on this problem},
  KEYWORDS = {Large Language Models (LLMs), Detecting synthetic texts},
  PAGES = {15312--15338},
  URL = {https://aclanthology.org/2024.acl-long.817/},
  VOLUME = {1},
  DOI = {10.18653/v1/2024.acl-long.817},
  PUBLISHER = {Association for Computational Linguistics (ACL)},
  CONFERENCE_NAME = {ACL 2024-62nd Annual Meeting of the Association for Computational Linguistics},
  BOOKTITLE = {Proceedings of the Annual Meeting of the Association for Computational Linguistics},
}

@ARTICLE{ALZETTA_2023_ARTICLE_ADMPV_439017,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Miaschi, A. and Prat, E. and Venturi, G.},
  TITLE = {Tell me how you write and I'll tell you what you read: a study on the writing style of book reviews},
  YEAR = {2023},
  ABSTRACT = {The paper aims at investigating variations in the writing style of book reviews published on different social reading platforms and referring to books of different genres, which enables acquiring insights into communication strategies adopted by readers to share their reading experiences. To this end, we introduce a corpus-based study focused on the analysis of A Good Review, a novel corpus of online book reviews written in Italian, posted on Amazon and Goodreads, and covering six literary fiction genres. We rely on stylometric analysis to explore the linguistic properties and lexicon of reviews and the authors conducted automatic classification experiments using multiple approaches and feature configurations to predict either the review's platform or the literary genre. The analysis of user-generated reviews demonstrates that language is a quite variable dimension across reading platforms, but not as much across book genres.
The classification experiments revealed that features modelling the syntactic structure of the sentence are reliable proxies for discerning Amazon and Goodreads reviews, whereas lexical information showed a higher predictive role for automatically discriminating the genre},
  KEYWORDS = {Stylometric analysis, Textual Genre detection, Book reviews},
  PAGES = {23},
  URL = {https://www.emerald.com/insight/content/doi/10.1108/JD-04-2023-0073/full/html},
  VOLUME = {79},
  DOI = {10.1108/JD-04-2023-0073},
  ISSN = {0022-0418},
  JOURNAL = {Journal of Documentation},
}

@ARTICLE{ALZETTA_2023_ARTICLE_ATK_450158,
  AUTHOR = {Alzetta, C. and Torre, I. and Koceva, F.},
  TITLE = {Annotation Protocol for Textbook Enrichment with Prerequisite Knowledge Graph},
  YEAR = {2023},
  ABSTRACT = {Extracting and formally representing the knowledge embedded in textbooks, such as the concepts explained and the relations between them, can support the provision of advanced knowledge-based services for learning environments and digital libraries. In this paper, we consider a specific type of relation in textbooks referred to as prerequisite relations (PR). PRs represent precedence relations between concepts aimed to provide the reader with the knowledge needed to understand a further concept(s). Their annotation in educational texts produces datasets that can be represented as a graph of concepts connected by PRs. However, building good-quality and reliable datasets of PRs from a textbook is still an open issue, not just for automated annotation methods but even for manual annotation. In turn, the lack of good-quality datasets and well-defined criteria to identify PRs affect the development and validation of automated methods for prerequisite identification. As a contribution to this issue, in this paper, we propose PREAP, a protocol for the annotation of prerequisite relations in textbooks aimed at obtaining reliable annotated data that can be shared, compared, and reused in the research community.
PREAP defines a novel textbook-driven annotation method aimed to capture the structure of prerequisites underlying the text. The protocol has been evaluated against baseline methods for manual and automatic annotation. The findings show that PREAP enables the creation of prerequisite knowledge graphs that have higher inter-annotator agreement, accuracy, and alignment with text than the baseline methods. This suggests that the protocol is able to accurately capture the PRs expressed in the text. Furthermore, the findings show that the time required to complete the annotation using PREAP are significantly shorter than with the other manual baseline methods. The paper includes also guidelines for using PREAP in three annotation scenarios, experimentally tested. We also provide example datasets and a user interface that we developed to support prerequisite annotation},
  KEYWORDS = {Text annotation, annotation protocol, knowledge engineering, educational textbook},
  URL = {https://rdcu.be/dxjsm},
  DOI = {10.1007/s10758-023-09682-6},
  ISSN = {2211-1662},
  JOURNAL = {Technology, Knowledge and Learning},
}

@ARTICLE{MIASCHI_2023_ARTICLE_MABDV_439018,
  AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Venturi, G.},
  TITLE = {Testing the Effectiveness of the Diagnostic Probing Paradigm on {Italian} Treebanks},
  YEAR = {2023},
  ABSTRACT = {The outstanding performance recently reached by neural language models (NLMs) across many natural language processing (NLP) tasks has steered the debate towards understanding whether NLMs implicitly learn linguistic competence. Probes, i.e., supervised models trained using NLM representations to predict linguistic properties, are frequently adopted to investigate this issue. However, it is still questioned if probing classification tasks really enable such investigation or if they simply hint at surface patterns in the data.
This work contributes to this debate by presenting an approach to assessing the effectiveness of a suite of probing tasks aimed at testing the linguistic knowledge implicitly encoded by one of the most prominent NLMs, BERT. To this aim, we compared the performance of probes when predicting gold and automatically altered values of a set of linguistic features. Our experiments were performed on Italian and were evaluated across BERT's layers and for sentences with different lengths. As a general result, we observed higher performance in the prediction of gold values, thus suggesting that the probing model is sensitive to the distortion of feature values. However, our experiments also showed that the length of a sentence is a highly influential factor that is able to confound the probing model's predictions},
  KEYWORDS = {Neural language model, Probing tasks, Treebanks},
  PAGES = {19},
  URL = {https://www.mdpi.com/2078-2489/14/3/144},
  VOLUME = {14},
  NUMBER = {3},
  DOI = {10.3390/info14030144},
  ISSN = {2078-2489},
  JOURNAL = {Information},
}

@ARTICLE{SOLSALES_2023_ARTICLE_SAMD_439019,
  AUTHOR = {Solà Sales, S. and Alzetta, C. and Moret Tatay, C. and Dell'Orletta, F.},
  TITLE = {Analysing Deception in Witness Memory Through Linguistic Styles in Spontaneous Language},
  YEAR = {2023},
  ABSTRACT = {The act of lying and its detection have raised interest in many fields, from the legal system to our daily lives. Considering that testimonies are commonly based on linguistic parameters, natural language processing, a research field concerned with programming computers to process and analyse natural language texts or speech, is a topic of interest on this front. This study aimed to examine the linguistic styles of simulated deception and true testimonies collected with the aim of studying witness memory. Study participants were asked to act as a witness of a crime by retelling the story they had just read.
Cognitive interviewing techniques were used to collect testimony under two conditions: truth and simulated deception. A sample of 48 participants volunteered to participate in the study. Analyses of the linguistic indicators and content were carried out. Specifically, we performed a comparison of testimonies of the same participant by condition to analyse the variation between (i) lexical and (ii) linguistic features and (iii) content and speech characteristics (disfluencies) depending on the narrative condition. Concerning lexical properties, adjectives were the most-varying grammatical category between truthful and deceptive testimonies. Furthermore, in the linguistic analysis, we observed that truthful testimonies were generally longer than deceptive ones in terms of the number of words and sentences and also characterised by more articulated sentence structures, and these differences were also statistically significant. Regarding the analysis of the content, cognitive criteria (details) and admitting lack of memory were more present in truthful statements. By providing an objective measure, these results are of interest in developing NLP tools for assessing the credibility of testimonies in forensics},
  KEYWORDS = {Natural language processing, Simulated deception, Stylometric analysis},
  PAGES = {26},
  URL = {https://www.mdpi.com/2076-3425/13/2/317},
  VOLUME = {13},
  NUMBER = {2},
  DOI = {10.3390/brainsci13020317},
  ISSN = {2076-3425},
  JOURNAL = {Brain Sciences},
}

@INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ABDMSSV_470901,
  AUTHOR = {Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Miaschi, A. and Sagae, K. and Sánchez Gutiérrez, C. H.
and Venturi, G.},
  TITLE = {{LangLearn} at {EVALITA} 2023: Overview of the Language Learning Development Task},
  YEAR = {2023},
  ABSTRACT = {Language Learning Development (LangLearn) is the EVALITA 2023 shared task on automatic language development assessment, which consists in predicting the evolution of the written language abilities of learners across time. LangLearn is conceived to be multilingual, relying on written productions of Italian and Spanish learners, and representative of L1 and L2 learning scenarios. A total of 9 systems were submitted by 5 teams. The results highlight the open challenges of automatic language development assessment},
  URL = {https://iris.cnr.it/handle/20.500.14243/470901},
  PUBLISHER = {Accademia University Press},
  ADDRESS = {Torino},
  ISBN = {9791255000693},
  CONFERENCE_NAME = {8th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian},
  CONFERENCE_PLACE = {Torino},
  BOOKTITLE = {Proceedings of EVALITA 2023},
}

@INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ADFMV_470921,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Fazzone, C. and Miaschi, A. and Venturi, G.},
  TITLE = {Unmasking the Wordsmith: Revealing Author Identity through Reader Reviews},
  YEAR = {2023},
  ABSTRACT = {Traditional genre-based approaches for book recommendations face challenges due to the vague definition of genres. To overcome this, we propose a novel task called Book Author Prediction, where we predict the author of a book based on user-generated reviews’ writing style. To this aim, we first introduce the ‘Literary Voices Corpus’ (LVC), a dataset of Italian book reviews, and use it to train and test machine learning models.
Our study contributes valuable insights for developing user-centric systems that recommend leisure readings based on individual readers’ interests and writing styles},
  URL = {https://ceur-ws.org/Vol-3596/paper4.pdf},
  CONFERENCE_NAME = {9th Italian Conference on Computational Linguistics},
  BOOKTITLE = {Proceedings of the 9th Italian Conference on Computational Linguistics},
}

@INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MABDV_446048,
  AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Venturi, G.},
  TITLE = {Probing tasks under pressure},
  YEAR = {2021},
  ABSTRACT = {Probing tasks are frequently used to evaluate whether the representations of Neural Language Models (NLMs) encode linguistic information. However, it is still questioned if probing classification tasks really enable such investigation or they simply hint for surface patterns in the data. We present a method to investigate this question by comparing the accuracies of a set of probing tasks on gold and automatically generated control datasets. Our results suggest that probing tasks can be used as reliable diagnostic methods to investigate the linguistic information encoded in NLMs representations},
  KEYWORDS = {Neural Language Models, Linguistic probing, Treebanks},
  PAGES = {1--7},
  URL = {http://ceur-ws.org/Vol-3033/paper29.pdf},
  VOLUME = {3033},
  CONFERENCE_NAME = {8th Italian Conference on Computational Linguistics (CLIC-it 2021)},
  BOOKTITLE = {Proceedings of the 8th Italian Conference on Computational Linguistics (CLiC-it 2021)},
}

@ARTICLE{ALZETTA_2020_ARTICLE_ADMV_446043,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Linguistically-driven Selection of Difficult-to-Parse Dependency Structures},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a novel methodology meeting a twofold goal, namely quantifying the reliability of automatically generated dependency relations without using gold data on the one hand, and identifying which are the linguistic constructions negatively affecting the parser performance on the other hand.
These represent objectives typically investigated in different lines of research, with different methods and techniques. Our methodology, at the crossroads of these perspectives, allows not only to quantify the parsing reliability of individual dependency types but also to identify and weight the contextual properties making relation instances more or less difficult to parse. The proposed methodology was tested in two different and complementary experiments, aimed at assessing the degree of parsing difficulty across (a) different dependency relation types, and (b) different instances of the same relation. The results show that the proposed methodology is able to identify difficult-to-parse dependency relations without relying on gold data and by taking into account a variety of intertwined linguistic factors. These findings pave the way to novel applications of the methodology, both in the direction of defining new evaluation metrics based purely on automatically parsed data and towards the automatic creation of challenge sets},
  KEYWORDS = {Linguistic Complexity, Syntactic Parsing, Evaluation metrics},
  PAGES = {37--60},
  URL = {https://journals.openedition.org/ijcol/719},
  VOLUME = {6},
  NUMBER = {2},
  DOI = {10.4000/ijcol.719},
  ISSN = {2499-4553},
  JOURNAL = {IJCoL},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_ADMOSV_423610,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Osenova, P. and Simov, K. and Venturi, G.},
  TITLE = {Quantitative linguistic investigations across universal dependencies treebanks},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a case study aimed at identifying cross-lingual quantitative trends in the distribution of dependency relations in treebanks for typologically different languages.
Preliminary results show interesting differences rooted either in language-specific peculiarities or cross-lingual annotation inconsistencies, with a potential impact on different application scenarios},
  KEYWORDS = {Universal Dependencies Treebanks, Cross-linguistic analysis, Typology},
  PAGES = {1--7},
  URL = {http://ceur-ws.org/Vol-2769/paper_59.pdf},
  VOLUME = {2769},
  ISBN = {979-12-80136-28-2},
  CONFERENCE_NAME = {7th Italian Conference on Computational Linguistics (CLiC-it)},
  BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_AGKPT_493643,
  AUTHOR = {Alzetta, C. and Galluccio, I. and Koceva, F. and Passalacqua, S. and Torre, I.},
  TITLE = {Digging into prerequisite annotation},
  YEAR = {2020},
  ABSTRACT = {Intelligent textbooks are often engineered with an explicit representation of their concepts and prerequisite relations (PR). PR identification is hence crucial for intelligent textbooks but still presents some challenges, also when performed by human experts. This may cause PR-annotated datasets to be inconsistent and compromise the accuracy of automatic creation of enhanced learning materials. This paper investigates possible reasons for PR disagreement and the nature of PR itself, with the aim of contributing to the development of shared strategies for PR annotation, analysis and modelling in textbooks},
  KEYWORDS = {Agreement, Annotation, Prerequisite relation},
  PAGES = {29--34},
  URL = {https://iris.cnr.it/handle/20.500.14243/493643},
  VOLUME = {2674},
  PUBLISHER = {CEUR-WS},
  CONFERENCE_NAME = {2nd International Workshop on Intelligent Textbooks, iTextbooks 2020},
  BOOKTITLE = {CEUR Workshop Proceedings of the 2nd International Workshop on Intelligent Textbooks, iTextbooks 2020},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_AMDKFTI_421771,
  AUTHOR = {Alzetta, C. and Miaschi, A. and Dell'Orletta, F.
and Koceva, F. and Torre, I.},
  TITLE = {{PRELEARN} @ {EVALITA} 2020: Overview of the Prerequisite Relation Learning Task for {Italian}},
  YEAR = {2020},
  ABSTRACT = {The Prerequisite Relation Learning (PRELEARN) task is the EVALITA 2020 shared task on concept prerequisite learning, which consists of classifying prerequisite relations between pairs of concepts distinguishing between prerequisite pairs and non-prerequisite pairs. Four sub-tasks were defined: two of them define different types of features that participants are allowed to use when training their model, while the other two define the classification scenarios where the proposed models would be tested. In total, 14 runs were submitted by 3 teams comprising 9 total individual participants},
  KEYWORDS = {nlp, prerequisite learning, shared task},
  URL = {http://ceur-ws.org/Vol-2765/paper164.pdf},
  CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
  BOOKTITLE = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
}

@INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MABDV_421767,
  AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.},
  TITLE = {Is Neural Language Model Perplexity Related to Readability?},
  YEAR = {2020},
  ABSTRACT = {This paper explores the relationship between Neural Language Model (NLM) perplexity and sentence readability. Starting from the evidence that NLMs implicitly acquire sophisticated linguistic knowledge from a huge amount of training data, our goal is to investigate whether perplexity is affected by linguistic features used to automatically assess sentence readability and if there is a correlation between the two metrics.
Our findings suggest that this correlation is actually quite weak and the two metrics are affected by different linguistic phenomena},
  KEYWORDS = {nlp, neural language models, readability},
  URL = {http://ceur-ws.org/Vol-2769/paper_57.pdf},
  ISBN = {979-12-80136-28-2},
  CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics},
  BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics},
}

@ARTICLE{ALZETTA_2019_ARTICLE_ADMV_403586,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Inferring Quantitative Typological Trends from Multilingual Treebanks. A Case Study},
  YEAR = {2019},
  ABSTRACT = {In the past decades, linguistic typology went through a renewing phase that involved a significant change in the research questions and methods of the discipline, which is now interested in fine-grained features underlying language diversity. In this paper, we propose a novel approach to address the newly defined needs of linguistic typology by extracting qualitative and quantitative information about a wide range of features from multilingual annotated corpora based on Natural Language Processing methods and techniques. We tested our method in a case study focusing on word order variation in two widely investigated constructions, VERB-SUBJ(ect) and NOUN-ADJ(ective), with a specific view to structural and functional factors underlying the preference for one or the other order, both intra- and cross-linguistically, and their interaction. Preliminary experiments have been carried out aimed at acquiring typological evidence from a selection of linguistically annotated treebanks for three different languages, namely Italian, Spanish and English.
Our results show the effectiveness of the method in letting similarities and differences also emerge from typologically close languages}, KEYWORDS = {language typology, multilingual annotated corpora, linguistic knowledge extraction and modelling, word order variation}, PAGES = {209-242}, URL = {https://www.rivisteweb.it/doi/10.1418/95391}, VOLUME = {18 (2)}, DOI = {10.1418/95391}, ISSN = {1720-9331}, JOURNAL = {LINGUE E LINGUAGGIO}, } @ARTICLE{GERBAUDO_2019_ARTICLE_GMA_493651, AUTHOR = {Gerbaudo, P. and Marogna, F. and Alzetta, C.}, TITLE = {When “Positive Posting” Attracts Voters: User Engagement and Emotions in the 2017 UK Election Campaign on Facebook}, YEAR = {2019}, ABSTRACT = {Social media are widely held to have played an important role in the 2017 UK general elections. But it is not altogether clear how exactly they contributed to the communication battle between Labour and the Conservatives. This article analyses the posts and comments on the official Facebook pages of the Labour Party and the Conservative Party and their respective leaders, Jeremy Corbyn and Theresa May. We look at the relationship between topics, emotions, and user engagement. Labour clearly outperformed the Tories, with Corbyn’s personal page having 10 times the interactions of May’s. We retrieve part of the reason for this success in the “positive posting” strategy adopted by Labour and the way it helped to attract user engagement. While the Conservative Party focused on negative issues such as Brexit, terrorism, and national security, Labour focused on positive issues, such as the promise of higher social spending and appeals to the grassroots, generating far higher levels of engagement. Overall, positive topic tended to fare better than more negative and controversial issues, such as security and Brexit. 
Our findings thus suggest the need for a more balanced understanding of the relationship between content, emotions, and user engagement on social media, moving beyond simplistic views of social media politics as necessarily biased in favor of aggressive and negative campaigning}, KEYWORDS = {2017 UK national elections, Corbyn, Facebook, Labour, online campaigning, social media}, URL = {https://iris.cnr.it/handle/20.500.14243/493651}, VOLUME = {5 (4)}, DOI = {10.1177/2056305119881695}, ISSN = {2056-3051}, JOURNAL = {SOCIAL MEDIA SOCIETY}, } @INCOLLECTION{ADORNI_2019_INCOLLECTION_AAKPT_493649, AUTHOR = {Adorni, G. and Alzetta, C. and Koceva, F. and Passalacqua, S. and Torre, I.}, TITLE = {Towards the identification of propaedeutic relations in textbooks}, YEAR = {2019}, ABSTRACT = {As well-known, structuring knowledge and digital content has a tremendous potential to enhance meaningful learning. A straightforward approach is representing key concepts of the subject matter and organizing them in a knowledge structure by means of semantic relations. This results in hypergraphs with typed n-ary relationships, including the so-called prerequisite or propaedeutic relations among concepts. While extracting the whole concept graph from a textbook is our final goal, the focus of this paper is the identification of the propaedeutic relations among concepts. To this aim, we employ a method based on burst analysis and co-occurrence which recognizes, by means of temporal reasoning, prerequisite relations among concepts that share intense periods in the text. 
The experimental evaluation shows promising results for the extraction of propaedeutic relations without the support of external knowledge}, KEYWORDS = {Knowledge structure, Relation extraction, Temporal reasoning}, PAGES = {1-13}, URL = {https://iris.cnr.it/handle/20.500.14243/493649}, VOLUME = {11625}, DOI = {10.1007/978-3-030-23204-7_1}, PUBLISHER = {Springer Verlag}, ISBN = {9783030232030}, BOOKTITLE = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, } @INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_ADMV_403587, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Dissecting Treebanks to Uncover Typological Trends. A Multilingual Comparative Approach}, YEAR = {2019}, ABSTRACT = {Over the last years, linguistic typology started attracting the interest of the community working on cross-and multi-lingual NLP as a way to tackle the bottleneck deriving from the lack of annotated data for many languages. Typological information is mostly acquired from publicly accessible typological databases, manually constructed by linguists. As reported in Ponti et al. (2018), despite the abundant information contained in them for many languages, these resources suffer from two main shortcomings, i. e. their limited coverage and the discrete nature of features (only "the majority value rather than the full range of possible values and their corresponding frequencies" is reported). Corpus-based studies can help to automatically acquire quantitative typological evidence which might be exploited for polyglot NLP. Recently, the availability of corpora annotated following a cross-linguistically consistent annotation scheme such as the one developed in the Universal Dependencies project is prompting new comparative linguistic studies aimed to identify similarities as well as idiosyncrasies among typologically different languages (Nivre, 2015). 
The line of research described here is aimed at acquiring quantitative typological evidence from UD treebanks through a multilingual contrastive approach}, KEYWORDS = {Natural Language Processing, Linguistic Typology}, PAGES = {1-3}, URL = {https://typology-and-nlp.github.io/2019/assets/2019/papers/5.pdf}, ISBN = {978-1-950737-29-1}, CONFERENCE_NAME = {1st TyP-NLP: The Workshop on Typology for Polyglot NLP, ACL workshop}, } @INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_AMADKPT_390427, AUTHOR = {Alzetta, C. and Miaschi, A. and Adorni, G. and Dell'Orletta, F. and Koceva, F. and Passalacqua, S. and Torre, I.}, TITLE = {Prerequisite or not prerequisite? That's the problem! An NLP-based Approach for Concept Prerequisites Learning}, YEAR = {2019}, ABSTRACT = {This paper presents a method for prerequisite learning classification between educational concepts. The proposed system was developed by adapting a classification algorithm designed for sequencing Learning Objects to the task of ordering concepts from a computer science textbook. In order to apply the system to the new task, for each concept we automatically created a learning unit from the textbook using two criteria based on concept occurrences and burst intervals. Results are promising and suggest that further improvements could highly benefit the results}, URL = {https://iris.cnr.it/handle/20.500.14243/390427}, ISBN = {9791280136008}, } @INPROCEEDINGS{MIASCHI_2019_INPROCEEDINGS_MACD_390439, AUTHOR = {Miaschi, A. and Alzetta, C. and Cardillo, F. A. and Dell'Orletta, F.}, TITLE = {Linguistically-Driven Strategy for Concept Prerequisites Learning on Italian}, YEAR = {2019}, ABSTRACT = {We present a new concept prerequisite learning method for Learning Object (LO) ordering that exploits only linguistic features extracted from textual educational resources. The method was tested in a cross-and in-domain scenario both for Italian and English. 
Additionally, we performed experiments based on a incremental training strategy to study the impact of the training set size on the classifier performances. The paper also introduces ITA-PREREQ, to the best of our knowledge the first Italian dataset annotated with prerequisite relations between pairs of educational concepts, and describe the automatic strategy devised to build it}, KEYWORDS = {Concept Prerequisites Learning}, PAGES = {285-295}, URL = {https://iris.cnr.it/handle/20.500.14243/390439}, CONFERENCE_NAME = {14th Workshop on Innovative Use of NLP for Building Educational Applications}, BOOKTITLE = {Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications}, } @INPROCEEDINGS{PASSALACQUA_2019_INPROCEEDINGS_PKATA_493645, AUTHOR = {Passalacqua, S. and Koceva, F. and Alzetta, C. and Torre, I. and Adorni, G.}, TITLE = {Visualisation analysis for exploring prerequisite relations in textbooks}, YEAR = {2019}, ABSTRACT = {Building automatic strategies for organising knowledge contained in textbooks has a tremendous potential to enhance meaningful learning. Automatic identification of prerequisite relation (PR) between concepts in a textbook is a well-known way for knowledge structuring, yet it is still an open issue. 
Our research contributes for better understanding and exploring the phenomenon of PR in textbooks, by providing a collection of visualisation techniques for PR exploration and analysis, that we used for the design of and then the refinement of our algorithm for PR extraction}, KEYWORDS = {Information visualisation, Knowledge structuring, Prerequisite relation}, PAGES = {18-21}, URL = {https://iris.cnr.it/handle/20.500.14243/493645}, VOLUME = {2384}, PUBLISHER = {CEUR-WS}, CONFERENCE_NAME = {1st Workshop on Intelligent Textbooks, iText 2019}, BOOKTITLE = {CEUR Workshop Proceedings of the 1st International Workshop on Intelligent Textbooks}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_AACKT_493654, AUTHOR = {Alzetta, C. and Adorni, G. and Celik, I. and Koceva, F. and Torre, I.}, TITLE = {Toward a user-adapted question/answering educational approach}, YEAR = {2018}, ABSTRACT = {This paper addresses the design of a model for Question/Answering in an interactive and mobile learning environment. The learner's question can be made through vocal interaction or typed text and the answer is the generation of a personalized learning path. This takes into account the focus and type of the question and some personal features of the learner extracted both from the question and prosodic features, in case of vocal questions. The response is a learning path that preserves the precedence of the prerequisite relations and contains all the relevant concepts for answering the user's question. 
The main contribution of the paper is to investigate the possibility to exploit educational concept maps in a Q/A interactive learning system}, KEYWORDS = {Education, Educational concept map, MOOC, Personilized Learning Path, Q/A}, PAGES = {173-177}, URL = {https://iris.cnr.it/handle/20.500.14243/493654}, DOI = {10.1145/3213586.3226214}, PUBLISHER = {Association for Computing Machinery, Inc}, ISBN = {9781450357845}, CONFERENCE_NAME = {26th ACM International Conference on User Modeling, Adaptation and Personalization, UMAP 2018}, BOOKTITLE = {UMAP 2018-Adjunct Publication of the 26th Conference on User Modeling, Adaptation and Personalization}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMSV_371344, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Simi, M. and Venturi, G.}, TITLE = {Assessing the Impact of Iterative Error Detection and Correction. A Case Study on the Italian Universal Dependency Treebank}, YEAR = {2018}, ABSTRACT = {Detection and correction of errors and inconsistencies in "gold treebanks" are becoming more and more central topics of corpus annotation. The paper illustrates a new incremental method for enhancing treebanks, with particular emphasis on the extension of error patterns across different textual genres and registers. Impact and role of corrections have been assessed in a dependency parsing experiment carried out with four different parsers, whose results are promising. 
For both evaluation datasets, the performance of parsers increases, in terms of the standard LAS and UAS measures and of a more focused measure taking into account only relations involved in error patterns, and at the level of individual dependencies}, KEYWORDS = {Error Detection, Universal Dependency Treebanks, Syntactic parsing}, PAGES = {1-7}, URL = {http://universaldependencies.org/udw18/PDFs/39_Paper.pdf}, ISBN = {978-1-948087-84-1}, CONFERENCE_NAME = {Universal Dependencies Workshop 2018 (UDW 2018)}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMSV_493647, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Simi, M. and Venturi, G.}, TITLE = {Assessing the Impact of Incremental Error Detection and Correction. A Case Study on the Italian Universal Dependency Treebank}, YEAR = {2018}, ABSTRACT = {Detection and correction of errors and inconsistencies in “gold treebanks” are becoming more and more central topics of corpus annotation. The paper illustrates a new incremental method for enhancing treebanks, with particular emphasis on the extension of error patterns across different textual genres and registers. Impact and role of corrections have been assessed in a dependency parsing experiment carried out with four different parsers, whose results are promising. 
For both evaluation datasets, the performance of parsers increases, in terms of the standard LAS and UAS measures and of a more focused measure taking into account only relations involved in error patterns, and at the level of individual dependencies}, KEYWORDS = {Treebank, annotation, annotation error}, PAGES = {1-7}, URL = {https://iris.cnr.it/handle/20.500.14243/493647}, PUBLISHER = {Association for Computational Linguistics (ACL)}, ISBN = {9781948087780}, CONFERENCE_NAME = {2nd Workshop on Universal Dependencies, UDW 2018, held in conjunction with EMNLP 2018}, BOOKTITLE = {EMNLP 2018-2nd Workshop on Universal Dependencies, UDW 2018-Proceedings of the Workshop}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_374901, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Universal Dependencies and Quantitative Typological Trends. A Case Study on Word Order}, YEAR = {2018}, ABSTRACT = {The paper presents a new methodology aimed at acquiring typological evidence from "gold" treebanks for different languages. In particular, it investigates whether and to what extent algorithms developed for assessing the plausibility of automatically produced syntactic annotations could contribute to shed light on key issues of the linguistic typological literature. It reports the first and promising results of a case study focusing on word order patterns carried out on three different languages (English, Italian and Spanish)}, KEYWORDS = {Linguistic Knowledge Extraction, Dependency Treebanks, Linguistic Typology}, PAGES = {4540-4549}, URL = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/1109.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-00-9}, CONFERENCE_NAME = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC 2018)}, CONFERENCE_PLACE = {Paris}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_334766, AUTHOR = {Alzetta, C. and Dell'Orletta, F. 
and Montemagni, S. and Venturi, G.}, TITLE = {Dangerous Relations in Dependency Treebanks}, YEAR = {2018}, ABSTRACT = {The paper illustrates an effective and innovative method for detecting erroneously annotated arcs in gold dependency treebanks based on an algorithm originally developed to measure the reliability of automatically produced dependency relations. The method permits to significantly restrict the error search space and, more importantly, to reliably identify patterns of systematic recurrent errors which represent dangerous evidence to a parser which tendentially will replicate them. Achieved results demonstrate effectiveness and reliability of the method}, KEYWORDS = {Dependency treebanks, Error Detection, Linguistic Annotation}, PAGES = {201-210}, URL = {http://aclweb.org/anthology/W/W17/W17-7624.pdf}, ISBN = {978-80-88132-04-2}, CONFERENCE_NAME = {16th International Workshop on Treebanks and Linguistic Theories}, BOOKTITLE = {Proceedings of the 16th International Workshop on Treebanks and Linguistic Theories}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_AKPTA_493653, AUTHOR = {Alzetta, C. and Koceva, F. and Passalacqua, S. and Torre, I. and Adorni, G.}, TITLE = {PRET: Prerequisite-enriched terminology. A case study on educational texts}, YEAR = {2018}, ABSTRACT = {In this paper we present PRET, a gold dataset annotated for prerequisite relations between educational concepts extracted from a computer science textbook, and we describe the language and domain independent approach for the creation of the resource. 
Additionally, we have created an annotation tool to support, validate and analyze the annotation}, KEYWORDS = {prerequisite relations, resource}, PAGES = {14-20}, URL = {https://iris.cnr.it/handle/20.500.14243/493653}, VOLUME = {2253}, DOI = {10.4000/books.aaccademia.3028}, PUBLISHER = {CEUR-WS}, ISBN = {9788831978682}, CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics, CLiC-it 2018}, BOOKTITLE = {CEUR Workshop Proceedings of the 5th Italian Conference on Computational Linguistics, CLiC-it 2018}, } @ARTICLE{SALVATORI_2017_ARTICLE_SRADMM_493650, AUTHOR = {Salvatori, E. and Rosselli Del Turco, R. and Alzetta, C. and Di Pietro, C. and Mannari, C. and Miaschi, A.}, TITLE = {The Codice Pelavicino between digital edition and Public History}, YEAR = {2017}, ABSTRACT = {The Codice Pelavicino Digitale Project aims to publish an online digital edition of the relevant manuscript of the XIII century. In this paper features of the edition and related issues are addressed. Secondly we explain motivations for choosing a digital edition as a medium: we address the background, and common concerns in the context of Academy and clerical and historical archives. Finally we give insights on the international standard adopted to markup the text, i. e. XML-TEI, and EVT, a tool adopted to generate the final website and display texts and images}, KEYWORDS = {Diplomatica, Filologia digitale, Latino medievale, Storia pubblica, TEI XML}, PAGES = {105-117}, URL = {https://iris.cnr.it/handle/20.500.14243/493650}, VOLUME = {2017 (1)}, DOI = {10.6092/issn.2532-8816/7232}, ISSN = {2532-8816}, JOURNAL = {UMANISTICA DIGITALE}, } @INPROCEEDINGS{ATTARDI_2016_INPROCEEDINGS_ASAS_493652, AUTHOR = {Attardi, G. and Sartiano, D. and Alzetta, C. 
and Semplici, F.}, TITLE = {Convolutional neural networks for sentiment analysis on Italian tweets}, YEAR = {2016}, ABSTRACT = {The paper describes our submission to the task 2 of Sentiment Polarity Classification in Italian Tweets at Evalita 2016. Our approach is based on a convolutional neural network that exploits both word embeddings and Sentiment Specific word embeddings. We also experimented a model trained with a distant supervised corpus. Our submission with Sentiment Specific word embeddings achieved the first official score}, KEYWORDS = {convolutional networks, sentiment analysis}, PAGES = {156-160}, URL = {https://iris.cnr.it/handle/20.500.14243/493652}, VOLUME = {1749}, DOI = {10.4000/books.aaccademia.1995}, PUBLISHER = {CEUR-WS}, ISBN = {9788899982553}, CONFERENCE_NAME = {3rd Italian Conference on Computational Linguistics, CLiC-it 2016 and 5th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian, EVALITA 2016}, BOOKTITLE = {CEUR Workshop Proceedings of the 5th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian, EVALITA 2016}, }