@INPROCEEDINGS{MORONI_2025_INPROCEEDINGS_MPHBBMDEN_552066, AUTHOR = {Moroni, L. and Puccetti, G. and Huguet Cabot, P. L. and Bejgu, A. S. and Barba, E. and Miaschi, A. and Dell'Orletta, F. and Esuli, A. and Navigli, R.}, TITLE = {Optimizing LLMs for Italian: reducing token fertility and enhancing efficiency through vocabulary adaptation}, YEAR = {2025}, ABSTRACT = {The number of pretrained Large Language Models (LLMs) is increasing steadily, though the majority are designed predominantly for the English language. While state-of-the-art LLMs can handle other languages, due to language contamination or some degree of multilingual pretraining data, they are not optimized for non-English languages, leading to inefficient encoding (high token ``fertility'') and slower inference speed. In this work, we thoroughly compare a variety of vocabulary adaptation techniques for optimizing English LLMs for the Italian language, and put forward Semantic Alignment Vocabulary Adaptation (SAVA), a novel method that leverages neural mapping for vocabulary substitution. SAVA achieves competitive performance across multiple downstream tasks, enhancing grounded alignment strategies. We adapt two LLMs: Mistral-7B-v0. 1, reducing token fertility by 25(\%), and Llama-3. 1-8B, optimizing the vocabulary and reducing the number of parameters by 1 billion. We show that, following the adaptation of the vocabulary, these models can recover their performance with a relatively limited stage of continual training on the target language. 
Finally, we test the capabilities of the adapted models on various multi-choice and generative tasks}, KEYWORDS = {Large Languiage Models, Italia LLM, Vocabulary Adaptation}, PAGES = {6646-6660}, URL = {https://aclanthology.org/2025.findings-naacl.371/}, DOI = {10.18653/v1/2025.findings-naacl.371}, PUBLISHER = {Association for Computational Linguistics}, ISBN = {979-8-89176-195-7}, CONFERENCE_NAME = {NAACL 2025-Annual Conference of the Nations of the Americas Chapter. Findings of the Association for Computational Linguistics}, BOOKTITLE = {NAACL 2025 Findings proceedings}, } @INPROCEEDINGS{PEDROTTI_2025_INPROCEEDINGS_PPCMPDE_554367, AUTHOR = {Pedrotti, A. and Papucci, M. and Ciaccio, C. and Miaschi, A. and Puccetti, G. and Dell'Orletta, F. and Esuli, A.}, TITLE = {Stress-testing machine generated text detection: shifting language models writing style to fool detectors}, YEAR = {2025}, ABSTRACT = {Recent advancements in Generative AI and Large Language Models (LLMs) have enabled the creation of highly realistic synthetic content, raising concerns about the potential for malicious use, such as misinformation and manipulation. Moreover, detecting Machine-Generated Text (MGT) remains challenging due to the lack of robust benchmarks that assess generalization to real-world scenarios. In this work, we evaluate the resilience of state-of-the-art MGT detectors (e. g., Mage, Radar, LLM-DetectAIve) to linguistically informed adversarial attacks. We develop a pipeline that fine-tunes language models using Direct Preference Optimization (DPO) to shift the MGT style toward human-written text (HWT), obtaining generations more challenging to detect by current models. Additionally, we analyze the linguistic shifts induced by the alignment and how detectors rely on “linguistic shortcuts” to detect texts. Our results show that detectors can be easily fooled with relatively few examples, resulting in a significant drop in detecting performances. 
This highlights the importance of improving detection methods and making them robust to unseen in-domain texts. We release code, models, and data to support future research on more robust MGT detection benchmarks}, KEYWORDS = {machine-generated text detection, synthetic content detection}, PAGES = {3010-3031}, URL = {https://aclanthology.org/2025.findings-acl.156/}, DOI = {10.18653/v1/2025.findings-acl.156}, PUBLISHER = {Association for Computational Linguistics}, ISBN = {979-8-89176-256-5}, CONFERENCE_NAME = {NAACL 2025-Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics. Findings}, BOOKTITLE = {NAACL 2025 Findings proceedings}, } @ARTICLE{BONFIGLI_2024_ARTICLE_BBMD_518430, AUTHOR = {Bonfigli, A. and Bacco, L. and Merone, M. and Dell'Orletta, F.}, TITLE = {From pre-training to fine-tuning: An in-depth analysis of Large Language Models in the biomedical domain}, YEAR = {2024}, ABSTRACT = {In this study, we delve into the adaptation and effectiveness of Transformer-based, pre-trained Large Language Models (LLMs) within the biomedical domain, a field that poses unique challenges due to its complexity and the specialized nature of its data. Building on the foundation laid by the transformative architecture of Transformers, we investigate the nuanced dynamics of LLMs through a multifaceted lens, focusing on two domain-specific tasks, i. e., Natural Language Inference (NLI) and Named Entity Recognition (NER). Our objective is to bridge the knowledge gap regarding how these models’ downstream performances correlate with their capacity to encapsulate task-relevant information. To achieve this goal, we probed and analyzed the inner encoding and attention mechanisms in LLMs, both encoder-and decoder-based, tailored for either general or biomedical-specific applications. This examination occurs before and after the models are fine-tuned across various data volumes. 
Our findings reveal that the models’ downstream effectiveness is intricately linked to specific patterns within their internal mechanisms, shedding light on the nuanced ways in which LLMs process and apply knowledge in the biomedical context. The source code for this paper is available at https: //github. com/agnesebonfigli99/LLMs-in-the-Biomedical-Domain}, KEYWORDS = {Biomedical domain, Domain adaptation, Large Language Models}, URL = {https://iris.cnr.it/handle/20.500.14243/518430}, VOLUME = {157}, DOI = {10.1016/j.artmed.2024.103003}, ISSN = {0933-3657}, JOURNAL = {ARTIFICIAL INTELLIGENCE IN MEDICINE}, } @ARTICLE{IAVARONE_2024_ARTICLE_ISBGPVDG_501741, AUTHOR = {Iavarone, B. and Sole Morelli, M. and Brunato, D. and Ghiasi, S. and Pasquale Scilingo, E. and Vanello, N. and Dell'Orletta, F. and Greco, A.}, TITLE = {The linguistic structure of an emotional text influences the sympathetic activity and the speech prosody of the reader}, YEAR = {2024}, ABSTRACT = {In this study, we present an analysis of the relationship between the linguistic profile of a text and the physiological and acoustic characteristics of the reader to improve the emotion recognition systems. To this aim, we recorded the speech and electrodermal activity (EDA) signals from 33 healthy volunteers reading neutral and affective texts aloud. We used the BioVoice toolbox and cvxEDA algorithm to estimate some of the main speech and EDA features, respectively. The selected texts were analyzed to quantify their lexical, morpho-syntactic, and syntactic properties. Correlation and Support Vector Regression analyses between linguistic and speech and EDA features have shown a significant bidirectional association between the morpho-syntactic structure of the text and both sympathetic markers and voice acoustic properties. Specifically, significant relationships were observed between linguistic properties and certain EDA and speech features commonly used to evaluate human emotional state (e. 
g., edaSymp, mean tonic, F0). These findings suggest that lexical, morpho-syntactic, and syntactic properties may have a significant impact on an individual’s emotional dynamics}, KEYWORDS = {Speech analysis, Linguistic profile, Electrodermal activity, Support Vector Regressor}, PAGES = {8}, URL = {https://iris.cnr.it/handle/20.500.14243/501741}, VOLUME = {89}, DOI = {10.1016/j.bspc.2023.105776}, ISSN = {1746-8108}, JOURNAL = {BIOMEDICAL SIGNAL PROCESSING AND CONTROL (ONLINE)}, } @INPROCEEDINGS{MIASCHI_2024_INPROCEEDINGS_MDV_518427, AUTHOR = {Miaschi, A. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Evaluating Large Language Models via Linguistic Profiling}, YEAR = {2024}, ABSTRACT = {Large Language Models (LLMs) undergo extensive evaluation against various benchmarks collected in established leaderboards to assess their performance across multiple tasks. However, to the best of our knowledge, there is a lack of comprehensive studies evaluating these models’ linguistic abilities independent of specific tasks. In this paper, we introduce a novel evaluation methodology designed to test LLMs’ sentence generation abilities under specific linguistic constraints. Drawing on the ‘linguistic profiling’ approach, we rigorously investigate the extent to which five LLMs of varying sizes, tested in both zero-and few-shot scenarios, effectively adhere to (morpho)syntactic constraints. 
Our findings shed light on the linguistic proficiency of LLMs, revealing both their capabilities and limitations in generating linguistically-constrained sentences}, KEYWORDS = {Large Language Models, Controllable Text Generation, Linguistic Profiling}, PAGES = {2835-2848}, URL = {https://aclanthology.org/2024.emnlp-main.166}, DOI = {10.18653/v1/2024.emnlp-main.166}, PUBLISHER = {Association for Computational Linguistics (USA)}, ISBN = {979-8-89176-164-3}, CONFERENCE_NAME = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, CONFERENCE_PLACE = {USA}, BOOKTITLE = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing}, } @INPROCEEDINGS{MIASCHI_2024_INPROCEEDINGS_MDV_487005, AUTHOR = {Miaschi, A. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Linguistic Knowledge Can Enhance Encoder-Decoder Models (If You Let It)}, YEAR = {2024}, ABSTRACT = {In this paper, we explore the impact of augmenting pre-trained Encoder-Decoder models, specifically T5, with linguistic knowledge for the prediction of a target task. In particular, we investigate whether fine-tuning a T5 model on an intermediate task that predicts structural linguistic properties of sentences modifies its performance in the target task of predicting sentence-level complexity. Our study encompasses diverse experiments conducted on Italian and English datasets, employing both monolingual and multilingual T5 models at various sizes. 
Results obtained for both languages and in cross-lingual configurations show that linguistically motivated intermediate fine-tuning has generally a positive impact on target task performance, especially when applied to smaller models and in scenarios with limited data availability}, KEYWORDS = {encoder-decoder, intermediate fine-tuning, linguistic features, sentence complexity}, PAGES = {10539-10554}, URL = {https://aclanthology.org/2024.lrec-main.922/}, PUBLISHER = {ELRA and ICCL}, ISBN = {978-2-493814-10-4}, CONFERENCE_NAME = {Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)}, BOOKTITLE = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)}, } @INPROCEEDINGS{MOTGER_2024_INPROCEEDINGS_MMDFM_519997, AUTHOR = {Motger, Q. and Miaschi, A. and Dell'Orletta, F. and Franch, X. and Marco, J.}, TITLE = {T-FREX: A Transformer-based Feature Extraction Method from Mobile App Reviews}, YEAR = {2024}, ABSTRACT = {Mobile app reviews are a large-scale data source for software-related knowledge generation activities, including software maintenance, evolution and feedback analysis. Effective extraction of features (i. e., functionalities or characteristics) from these reviews is key to support analysis on the acceptance of these features, identification of relevant new feature requests and prioritization of feature development, among others. Traditional methods focus on syntactic pattern-based approaches, typically context-agnostic, evaluated on a closed set of apps, difficult to replicate and limited to a reduced set and domain of apps. Mean-while, the pervasiveness of Large Language Models (LLMs) based on the Transformer architecture in software engineering tasks lays the groundwork for empirical evaluation of the performance of these models to support feature extraction. 
In this study, we present T-FREX, a Transformer-based, fully automatic approach for mobile app review feature extraction. First, we collect a set of ground truth features from users in a real crowdsourced software recommendation platform and transfer them automatically into a dataset of app reviews. Then, we use this newly created dataset to fine-tune multiple LLMs on a named entity recognition task under different data configurations. We assess the performance of T-FREX with respect to this ground truth, and we complement our analysis by comparing T-FREX with a baseline method from the field. Finally, we assess the quality of new features predicted by T-FREX through an external human evaluation. Results show that T-FREX outperforms on average the traditional syntactic-based method, especially when discovering new features from a domain for which the model has been fine-tuned}, KEYWORDS = {feature extraction, large language models, mobile apps, named entity recognition, reviews, token classification}, PAGES = {227-238}, URL = {https://iris.cnr.it/handle/20.500.14243/519997}, DOI = {10.1109/SANER60148.2024.00030}, PUBLISHER = {Institute of Electrical and Electronics Engineers Inc}, CONFERENCE_NAME = {31st IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024}, BOOKTITLE = {Proceedings-2024 IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024}, } @INPROCEEDINGS{OCCHIPINTI_2024_INPROCEEDINGS_OMMLDNG_519999, AUTHOR = {Occhipinti, D. and Marchi, M. and Mondella, I. and Lai, H. and Dell'Orletta, F. and Nissim, M. and Guerini, M.}, TITLE = {Fine-tuning with HED-IT: The impact of human post-editing for dialogical language models}, YEAR = {2024}, ABSTRACT = {Automatic methods for generating and gathering linguistic data have proven effective for fine-tuning Language Models (LMs) in languages less resourced than English. 
Still, while there has been emphasis on data quantity, less attention has been given to its quality. In this work, we investigate the impact of human intervention on machine-generated data when fine-tuning dialogical models. In particular, we study (1) whether post-edited dialogues exhibit higher perceived quality compared to the originals that were automatically generated; (2) whether fine-tuning with post-edited dialogues results in noticeable differences in the generated outputs; and (3) whether post-edited dialogues influence the outcomes when considering the parameter size of the LMs. To this end we created HED-IT, a large-scale dataset where machine-generated dialogues are paired with the version post-edited by humans. Using both the edited and unedited portions of HED-IT, we fine-tuned three different sizes of an LM. Results from both human and automatic evaluation show that the different quality of training data is clearly perceived and it has an impact also on the models trained on such data. Additionally, our findings indicate that larger models are less sensitive to data quality, whereas this has a crucial impact on smaller models. These results enhance our comprehension of the impact of human intervention on training data in the development of high-quality LMs}, KEYWORDS = {Large Language Models (LLMs), Detecting Synthetic Texts}, PAGES = {11892-11907}, URL = {https://iris.cnr.it/handle/20.500.14243/519999}, PUBLISHER = {Association for Computational Linguistics (ACL)}, CONFERENCE_NAME = {Findings of the 62nd Annual Meeting of the Association for Computational Linguistics, ACL 2024}, BOOKTITLE = {Proceedings of the Annual Meeting of the Association for Computational Linguistics}, } @INPROCEEDINGS{PUCCETTI_2024_INPROCEEDINGS_PRADE_519993, AUTHOR = {Puccetti, G. and Rogers, A. and Alzetta, C. and Dell'Orletta, F. 
and Esuli, A.}, TITLE = {AI 'News' Content Farms Are Easy to Make and Hard to Detect: A Case Study in Italian}, YEAR = {2024}, ABSTRACT = {Large Language Models (LLMs) are increasingly used as 'content farm' models (CFMs), to generate synthetic text that could pass for real news articles. This is already happening even for languages that do not have high-quality monolingual LLMs. We show that fine-tuning Llama (v1), mostly trained on English, on as little as 40K Italian news articles, is sufficient for producing news-like texts that native speakers of Italian struggle to identify as synthetic. We investigate three LLMs and three methods of detecting synthetic texts (log-likelihood, DetectGPT, and supervised classification), finding that they all perform better than human raters, but they are all impractical in the real world (requiring either access to token likelihood information or a large dataset of CFM texts). We also explore the possibility of creating a proxy CFM: an LLM fine-tuned on a similar dataset to one used by the real 'content farm'. We find that even a small amount of fine-tuning data suffices for creating a successful detector, but we need to know which base LLM is used, which is a major challenge. Our results suggest that there are currently no practical methods for detecting synthetic news-like texts 'in the wild', while generating them is too easy. We highlight the urgency of more NLP research on this problem}, KEYWORDS = {Large Language Models (LLMs), Detecting synthetic texts}, PAGES = {15312-15338}, URL = {https://aclanthology.org/2024.acl-long.817/}, VOLUME = {1}, DOI = {10.18653/v1/2024.acl-long.817}, PUBLISHER = {Association for Computational Linguistics (ACL)}, CONFERENCE_NAME = {ACL 2024-62nd Annual Meeting of the Association for Computational Linguistics}, BOOKTITLE = {Proceedings of the Annual Meeting of the Association for Computational Linguistics}, } @ARTICLE{ALZETTA_2023_ARTICLE_ADMPV_439017, AUTHOR = {Alzetta, C. 
and Dell'Orletta, F. and Miaschi, A. and Prat, E. and Venturi, G.}, TITLE = {Tell me how you write and I'll tell you what you read: a study on the writing style of book reviews}, YEAR = {2023}, ABSTRACT = {The paper aims at investigating variations in the writing style of book reviews published on different social reading platforms and referring to books of different genres, which enables acquiring insights into communication strategies adopted by readers to share their reading experiences. To this end, we introduce a corpus-based study focused on the analysis of A Good Review, a novel corpus of online book reviews written in Italian, posted on Amazon and Goodreads, and covering six literary fiction genres. We rely on stylometric analysis to explore the linguistic properties and lexicon of reviews and the authors conducted automatic classification experiments using multiple approaches and feature configurations to predict either the review's platform or the literary genre. The analysis of user-generated reviews demonstrates that language is a quite variable dimension across reading platforms, but not as much across book genres. The classification experiments revealed that features modelling the syntactic structure of the sentence are reliable proxies for discerning Amazon and Goodreads reviews, whereas lexical information showed a higher predictive role for automatically discriminating the genre}, KEYWORDS = {Stylometric analysis, Textual Genre detection, Book reviews}, PAGES = {23}, URL = {https://www.emerald.com/insight/content/doi/10.1108/JD-04-2023-0073/full/html}, VOLUME = {79}, DOI = {10.1108/JD-04-2023-0073}, ISSN = {0022-0418}, JOURNAL = {JOURNAL OF DOCUMENTATION}, } @ARTICLE{BACCO_2023_ARTICLE_BDLMN_439016, AUTHOR = {Bacco, L. and Dell'Orletta, F. and Lai, H. and Merone, M. 
and Nissim, M.}, TITLE = {A text style transfer system for reducing the physician-patient expertise gap: An analysis with automatic and human evaluations}, YEAR = {2023}, ABSTRACT = {Physicians and patients often come from different backgrounds and have varying levels of education, which can result in communication difficulties in the healthcare process. To address this expertise gap, we present a "Text Style Transfer" system. Our system uses Semantic Textual Similarity techniques based on Sentence Transformers models to create pseudo-parallel datasets from a large, non-parallel corpus of lay and expert texts. This approach allowed us to train a denoising autoencoder model (BART), overcoming the limitations of previous systems. Our extensive analysis, which includes both automatic metrics and human evaluations from both lay (patients) and expert (physicians) individuals, shows that our system outperforms state-of-the-art models and is comparable to human-provided gold references in some cases}, KEYWORDS = {Natural language processing, Text style transfer, Text simplification}, PAGES = {1-18}, URL = {https://www.sciencedirect.com/science/article/pii/S0957417423013763}, VOLUME = {233}, DOI = {10.1016/j.eswa.2023.120874}, ISSN = {0957-4174}, JOURNAL = {EXPERT SYSTEMS WITH APPLICATIONS}, } @ARTICLE{CERULLI_2023_ARTICLE_CBD_455146, AUTHOR = {Cerulli, A. and Brunato, D. and Dell'Orletta, F.}, TITLE = {Linguistic Profile of a Text and Human Ratings of Writing Quality: a Case Study on Italian L1 Learner Essays}, YEAR = {2023}, ABSTRACT = {This paper presents a study based on the linguistic profiling methodology to explore the relationship between the linguistic structure of a text and how it is perceived in terms of writing quality byhumans. The approach is tested on a selection of Italian L1 learners essays, which were taken from a larger longitudinal corpus of essays written by Italian L1 students enrolled in the first and secondyear of lower secondary school. 
Human ratings of writing quality by Italian native speakers were collected through a crowdsourcing task, in which annotators were asked to read pairs of essays andrated which one they believed to be better written. By analyzing these ratings, the study identifies a variety of linguistic phenomena spanning across distinct levels of linguistic description thatdistinguish the essays considered as 'winners' and evaluates the impact of students' errors on the human perception of writing quality}, KEYWORDS = {text quality, human ratings, Natural Language Processing, learner corpus}, PAGES = {7-34}, URL = {https://www.ai-lc.it/wp-content/uploads/2023/09/IJCOL_9_1_1_cerulli_et_al.pdf}, VOLUME = {1 (9)}, DOI = {10.4000/ijcol.1104}, ISSN = {2499-4553}, JOURNAL = {IJCOL}, } @ARTICLE{FOLESANI_2023_ARTICLE_FBPTMTZNCBRDCG_460174, AUTHOR = {Folesani, F. and Belvederi, M. M. and Puggioni, C. and Tiberto, E. and Marella, M. and Toffanin, T. and Zerbinati, L. and Nanni, M. G. and Caruso, R. and Brunato, D. and Ravelli, A. A. and Dell'Orletta, F. and Chochinov, H. M. and Grassi, L.}, TITLE = {Linguistic markers of demoralization improvement in schizophrenia: A pilot study}, YEAR = {2023}, ABSTRACT = {Background and objectives: Individuals with schizophrenia display language impairments involving pragmatics, semantics and syntax. Language impairments may show diagnostic specificity and could relate to the ability of engaging in psychotherapy. This pilot study sought to: (1) identify linguistic features that might differentiate individuals with schizophrenia from distressed controls without psychotic symptoms; and (2) examine the association between linguistic abilities and clinical changes during psychotherapy. Methods: We recruited patients with schizophrenia and a comparison group of individuals with demoralization and distress due to cancer. Participants underwent Dignity Therapy (DT), an existentially-oriented brief psychotherapy focused on legacy and subjective dignity. 
Verbatim transcripts of the DT sessions were analysed using Natural Language Processing (NLP). In addition, we measured changes in levels of demoralization and dignity-related distress before and after DT, exploring the association with linguistic variables with network analysis. Results: Patients with schizophrenia could be differentiated from those with cancer-related distress using only three out of 141 linguistic variables: total number of words, number of prepositional chains and conversational elements. Across groups, better levels of discourse coherence and higher number of arguments controlled by a predicate (verb "arity") were associated with larger improvements in demoralization and, indirectly, dignity-related distress. Conclusions: Reproducible linguistic markers may be able to differentiate individuals with schizophrenia from those with less severe psychopathology, and to predict better uptake of psychotherapy independent from diagnosis. Future studies should explore whether linguistic features derived from NLP may be exploited as accessible diagnostic or prognostic markers to tailor psychotherapy and other interventions in schizophrenia}, KEYWORDS = {Schizophrenia, Dignity Therapy, Natural Language Processing, Linguistic Profiling, Psychotherapy}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85153800425\&origin=inward}, DOI = {10.1016/j.ejpsy.2023.03.001}, ISSN = {0213-6163}, JOURNAL = {EUROPEAN JOURNAL OF PSYCHIATRY}, } @ARTICLE{MAGNANI_2023_ARTICLE_MCDBCILMTMASAACSCPMETMBRCFGMMMPRS_460175, AUTHOR = {Magnani, L. and Carmisciano, L. and Dell'Orletta, F. and Bettinardi, O. and Chiesa, S. and Imbesi, M. and Limonta, G. and Montagna, E. and Turone, I. and Martinasso, D. and Aguglia, A. and Serafini, G. and Amore, M. and Amerio, A. and Costanza, A. and Sibilla, F. and Calcagno, P. and Patti, S. and Molino, G. and Escelsior, A. and Trabucco, A. and Marzano, L. and Brunato, D. and Ravelli, A. A. and Cappucciati, M. and Fiocchi, R. 
and Guerzoni, G. and Maravita, D. and Macchetti, F. and Mori, E. and Paglia, C. A. and Roscigno, F. and Saginario, A.}, TITLE = {Linguistic profile automated characterisation in pluripotential clinical high-risk mental state (CHARMS) conditions: methodology of a multicentre observational study}, YEAR = {2023}, ABSTRACT = {Introduction Language is usually considered the social vehicle of thought in intersubjective communications. However, the relationship between language and high-order cognition seems to evade this canonical and unidirectional description (ie, the notion of language as a simple means of thought communication). In recent years, clinical high at-risk mental state (CHARMS) criteria (evolved from the Ultra-High-Risk paradigm) and the introduction of the Clinical Staging system have been proposed to address the dynamicity of early psychopathology. At the same time, natural language processing (NLP) techniques have greatly evolved and have been successfully applied to investigate different neuropsychiatric conditions. The combination of at-risk mental state paradigm, clinical staging system and automated NLP methods, the latter applied on spoken language transcripts, could represent a useful and convenient approach to the problem of early psychopathological distress within a transdiagnostic risk paradigm. Methods and analysis Help-seeking young people presenting psychological distress (CHARMS /-and Clinical Stage 1a or 1b; target sample size for both groups n=90) will be assessed through several psychometric tools and multiple speech analyses during an observational period of 1-year, in the context of an Italian multicentric study. 
Subjects will be enrolled in different contexts: Department of Neuroscience, Rehabilitation, Ophthalmology, Genetics, Maternal and Child Health (DINOGMI), Section of Psychiatry, University of Genoa-IRCCS Ospedale Policlinico San Martino, Genoa, Italy; Mental Health Department-territorial mental services (ASL 3-Genoa), Genoa, Italy; and Mental Health Department-territorial mental services (AUSL-Piacenza), Piacenza, Italy. The conversion rate to full-blown psychopathology (CS 2) will be evaluated over 2 years of clinical observation, to further confirm the predictive and discriminative value of CHARMS criteria and to verify the possibility of enriching them with several linguistic features, derived from a fine-grained automated linguistic analysis of speech. Ethics and dissemination The methodology described in this study adheres to ethical principles as formulated in the Declaration of Helsinki and is compatible with International Conference on Harmonization (ICH)-good clinical practice. The research protocol was reviewed and approved by two different ethics committees (CER Liguria approval code: 591/2020-id. 10993; Comitato Etico dell'Area Vasta Emilia Nord approval code: 2022/0071963). Participants will provide their written informed consent prior to study enrolment and parental consent will be needed in the case of participants aged less than 18 years old. Experimental results will be carefully shared through publication in peer-reviewed journals, to ensure proper data reproducibility}, KEYWORDS = {Clinical High At Risk Mental State, Natural Language Processing, Linguistic profiling, Depression}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85150775470\&origin=inward}, VOLUME = {13}, DOI = {10.1136/bmjopen-2022-066642}, ISSN = {2044-6055}, JOURNAL = {BMJ OPEN}, } @ARTICLE{MIASCHI_2023_ARTICLE_MABDV_439018, AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {Testing the Effectiveness of the Diagnostic Probing Paradigm on Italian Treebanks}, YEAR = {2023}, ABSTRACT = {The outstanding performance recently reached by neural language models (NLMs) across many natural language processing (NLP) tasks has steered the debate towards understanding whether NLMs implicitly learn linguistic competence. Probes, i. e., supervised models trained using NLM representations to predict linguistic properties, are frequently adopted to investigate this issue. However, it is still questioned if probing classification tasks really enable such investigation or if they simply hint at surface patterns in the data. This work contributes to this debate by presenting an approach to assessing the effectiveness of a suite of probing tasks aimed at testing the linguistic knowledge implicitly encoded by one of the most prominent NLMs, BERT. To this aim, we compared the performance of probes when predicting gold and automatically altered values of a set of linguistic features. Our experiments were performed on Italian and were evaluated across BERT's layers and for sentences with different lengths. As a general result, we observed higher performance in the prediction of gold values, thus suggesting that the probing model is sensitive to the distortion of feature values. However, our experiments also showed that the length of a sentence is a highly influential factor that is able to confound the probing model's predictions}, KEYWORDS = {Neural language model, Probing tasks, Treebanks}, PAGES = {19}, URL = {https://www.mdpi.com/2078-2489/14/3/144}, VOLUME = {14 (3)}, DOI = {10.3390/info14030144}, ISSN = {2078-2489}, JOURNAL = {INFORMATION}, } @ARTICLE{SOLSALES_2023_ARTICLE_SAMD_439019, AUTHOR = {Solà Sales, S. and Alzetta, C. and Moret Tatay, C. 
and Dell'Orletta, F.}, TITLE = {Analysing Deception in Witness Memory Though Linguistic Styles in Spontaneous Language}, YEAR = {2023}, ABSTRACT = {The act of lying and its detection have raised interest in many fields, from the legal system to our daily lives. Considering that testimonies are commonly based on linguistic parameters, natural language processing, a research field concerned with programming computers to process and analyse natural language texts or speech, is a topic of interest on this front. This study aimed to examine the linguistic styles of simulated deception and true testimonies collected with the aim of studying witness memory. Study participants were asked to act as a witness of a crime by retelling the story they had just read. Cognitive interviewing techniques were used to collect testimony under two conditions: truth and simulated deception. A sample of 48 participants volunteered to participate in the study. Analyses of the linguistic indicators and content were carried out. Specifically, we performed a comparison of testimonies of the same participant by condition to analyse the variation between (i) lexical and (ii) linguistic features and (iii) content and speech characteristics (disfluencies) depending on the narrative condition. Concerning lexical properties, adjectives were the most-varying grammatical category between truthful and deceptive testimonies. Furthermore, in the linguistic analysis, we observed that truthful testimonies were generally longer than deceptive ones in terms of the number of words and sentences and also characterised by more articulated sentence structures, and these differences were also statistically significant. Regarding the analysis of the content, cognitive criteria (details) and admitting lack of memory were more present in truthful statements. 
By providing an objective measure, these results are of interest in developing NLP tools for assessing the credibility of testimonies in forensics}, KEYWORDS = {Natural language processing, Simulated deception, Stylometric analysis}, PAGES = {26}, URL = {https://www.mdpi.com/2076-3425/13/2/317}, VOLUME = {13}, NUMBER = {2}, DOI = {10.3390/brainsci13020317}, ISSN = {2076-3425}, JOURNAL = {BRAIN SCIENCES}, } @INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ABDMSSV_470901, AUTHOR = {Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Miaschi, A. and Sagae, K. and Sánchez Gutiérrez, C. H. and Venturi, G.}, TITLE = {LangLearn at EVALITA 2023: Overview of the Language Learning Development Task}, YEAR = {2023}, ABSTRACT = {Language Learning Development (LangLearn) is the EVALITA 2023 shared task on automatic language development assessment, which consists in predicting the evolution of the written language abilities of learners across time. LangLearn is conceived to be multilingual, relying on written productions of Italian and Spanish learners, and representative of L1 and L2 learning scenarios. A total of 9 systems were submitted by 5 teams. The results highlight the open challenges of automatic language development assessment}, URL = {https://iris.cnr.it/handle/20.500.14243/470901}, PUBLISHER = {Accademia University Press}, ADDRESS = {Torino, Italy}, ISBN = {9791255000693}, CONFERENCE_NAME = {8th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian}, CONFERENCE_PLACE = {Torino}, BOOKTITLE = {Proceedings of EVALITA 2023}, } @INPROCEEDINGS{ALZETTA_2023_INPROCEEDINGS_ADFMV_470921, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Fazzone, C. and Miaschi, A. and Venturi, G.}, TITLE = {Unmasking the Wordsmith: Revealing Author Identity through Reader Reviews}, YEAR = {2023}, ABSTRACT = {Traditional genre-based approaches for book recommendations face challenges due to the vague definition of genres. 
To overcome this, we propose a novel task called Book Author Prediction, where we predict the author of a book based on user-generated reviews’ writing style. To this aim, we first introduce the ‘Literary Voices Corpus’ (LVC), a dataset of Italian book reviews, and use it to train and test machine learning models. Our study contributes valuable insights for developing user-centric systems that recommend leisure readings based on individual readers’ interests and writing styles}, URL = {https://ceur-ws.org/Vol-3596/paper4.pdf}, CONFERENCE_NAME = {9th Italian Conference on Computational Linguistics}, BOOKTITLE = {Proceedings of the 9th Italian Conference on Computational Linguistics}, } @INPROCEEDINGS{BRUNATO_2023_INPROCEEDINGS_BDDR_455142, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Dini, I. and Ravelli, A. A.}, TITLE = {Coherent or Not? Stressing a Neural Language Model for Discourse Coherence in Multiple Languages}, YEAR = {2023}, ABSTRACT = {In this study, we investigate the capability of a Neural Language Model (NLM) to distinguish between coherent and incoherent text, where the latter has been artificially created to gradually undermine local coherence within text. While previous research on coherence assessment using NLMs has primarily focused on English, we extend our investigation to multiple languages. We employ a consistent evaluation framework to compare the performance of monolingual and multilingual models in both in-domain and out-domain settings. 
Additionally, we explore the model's performance in a cross-language scenario}, KEYWORDS = {text coherence, neural language models, multilingual corpora}, PAGES = {10690--10700}, URL = {https://aclanthology.org/2023.findings-acl.680}, DOI = {10.18653/v1/2023.findings-acl.680}, PUBLISHER = {Association for Computational Linguistics}, ADDRESS = {Stroudsburg, USA}, ISBN = {978-1-959429-62-3}, CONFERENCE_NAME = {61st Annual Meeting of the Association for Computational Linguistics (ACL 2023)}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Findings of the Association for Computational Linguistics: ACL 2023}, } @INPROCEEDINGS{MIASCHI_2023_INPROCEEDINGS_MPD_520527, AUTHOR = {Miaschi, A. and Papucci, M. and Dell'Orletta, F.}, TITLE = {Lost in Labels: An Ongoing Quest to Optimize Text-to-Text Label Selection for Classification}, YEAR = {2023}, ABSTRACT = {In this paper, we present an evaluation of the influence of label selection on the performance of a Sequence-to-Sequence Transformer model in a classification task. Our study investigates whether the choice of words used to represent classification categories affects the model’s performance, and if there exists a relationship between the model’s performance and the selected words. To achieve this, we fine-tuned an Italian T5 model on topic classification using various labels. Our results indicate that the different label choices can significantly impact the model’s performance. That being said, we did not find a clear answer on how these choices affect the model performances, highlighting the need for further research in optimizing label selection}, KEYWORDS = {encoder-decoder, label selection, topic classification}, URL = {https://iris.cnr.it/handle/20.500.14243/520527}, VOLUME = {516 (394)}, BOOKTITLE = {Proceedings of the 9th Italian Conference on Computational Linguistics CLiC-it 2023: Venice, Italy, November 30-December 2, 2023}, } @ARTICLE{BACCO_2022_ARTICLE_BRADVVDMPD_446362, AUTHOR = {Bacco, L. and Russo, F. and Ambrosio, L. 
and D'Antoni, F. and Vollero, L. and Vadala, G. and Dell'Orletta, F. and Merone, M. and Papalia, R. and Denaro, V.}, TITLE = {Natural language processing in low back pain and spine diseases: A systematic review}, YEAR = {2022}, ABSTRACT = {Natural Language Processing (NLP) is a discipline at the intersection between Computer Science (CS), Artificial Intelligence (AI), and Linguistics that leverages unstructured human-interpretable (natural) language text. In recent years, it gained momentum also in health-related applications and research. Although preliminary, studies concerning Low Back Pain (LBP) and other related spine disorders with relevant applications of NLP methodologies have been reported in the literature over the last few years. It motivated us to systematically review the literature comprised of two major public databases, PubMed and Scopus. To do so, we first formulated our research question following the PICO guidelines. Then, we followed a PRISMA-like protocol by performing a search query including terminologies of both technical (e.g., natural language and computational linguistics) and clinical (e.g., lumbar and spine surgery) domains. We collected 221 non-duplicated studies, 16 of which were eligible for our analysis. In this work, we present these studies divided into sub-categories, from both tasks and exploited models' points of view. Furthermore, we report a detailed description of techniques used to extract and process textual features and the several evaluation metrics used to assess the performance of the NLP models. 
However, what is clear from our analysis is that additional studies on larger datasets are needed to better define the role of NLP in the care of patients with spinal disorders}, KEYWORDS = {natural language processing, Low Back Pain, Survey}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85135163810\&origin=inward}, VOLUME = {9}, DOI = {10.3389/fsurg.2022.957085}, ISSN = {2296-875X}, JOURNAL = {FRONTIERS IN SURGERY}, } @ARTICLE{BRUNATO_2022_ARTICLE_BDV_440157, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Linguistically-Based Comparison of Different Approaches to Building Corpora for Text Simplification: A Case Study on Italian}, YEAR = {2022}, ABSTRACT = {In this paper, we present an overview of existing parallel corpora for Automatic Text Simplification (ATS) in different languages focusing on the approach adopted for their construction. We make the main distinction between manual and (semi)-automatic approaches in order to investigate in which respect complex and simple texts vary and whether and how the observed modifications may depend on the underlying approach. To this end, we perform a two-level comparison on Italian corpora, since this is the only language, with the exception of English, for which there are large parallel resources derived through the two approaches considered. The first level of comparison accounts for the main types of sentence transformations occurring in the simplification process, the second one examines the results of a linguistic profiling analysis based on Natural Language Processing techniques and carried out on the original and the simple version of the same texts. 
For both levels of analysis, we chose to focus our discussion mostly on sentence transformations and linguistic characteristics that pertain to the morpho-syntactic and syntactic structure of the sentence}, KEYWORDS = {linguistic complexity, corpus construction, text simplification}, PAGES = {1--19}, URL = {https://www.frontiersin.org/articles/10.3389/fpsyg.2022.707630/full}, VOLUME = {13}, DOI = {10.3389/fpsyg.2022.707630}, ISSN = {1664-1078}, JOURNAL = {FRONTIERS IN PSYCHOLOGY}, } @ARTICLE{BRUNATO_2022_ARTICLE_BMD_414977, AUTHOR = {Brunato, D. and Mattei, A. and Dell'Orletta, F.}, TITLE = {Analisi della scrittura giovanile da una prospettiva linguistico-computazionale: il caso di studio della Fanfiction}, YEAR = {2022}, ABSTRACT = {This paper presents a study aimed at characterizing the linguistic style of an emerging literary genre of the web, particularly appreciated by teens and young adults: fanfiction. By relying on Natural Language Processing approaches, and in particular on the methodology of linguistic profiling applied to a novel corpus of Italian fanfiction stories inspired by the fantasy saga ``Harry Potter'', we investigate the relationship between linguistic style and 'success', measured in terms of number of reviews obtained by the readers. We show that it is possible to detect a set of features, among a wide set of linguistic ones modeling lexical, morpho-syntactic and syntactic phenomena, which help more in discriminating between 'successful' and 'unsuccessful' fanfics}, KEYWORDS = {stilometria computazionale, linguistic profiling, fanfiction}, PAGES = {171--189}, URL = {https://iris.cnr.it/handle/20.500.14243/414977}, VOLUME = {2021/3}, ISSN = {0033-9725}, JOURNAL = {RASSEGNA ITALIANA DI LINGUISTICA APPLICATA}, } @ARTICLE{MIASCHI_2022_ARTICLE_MBDV_417257, AUTHOR = {Miaschi, A. and Brunato, D. P. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {On Robustness and Sensitivity of a Neural Language Model: A Case Study on Italian L1 Learner Errors}, YEAR = {2022}, ABSTRACT = {In this paper, we propose a comprehensive linguistic study aimed at assessing the implicit behavior of one of the most prominent Neural Language Models (NLM) based on Transformer architectures, BERT (Devlin et al., 2019), when dealing with a particular source of noisy data, namely essays written by L1 Italian learners containing a variety of errors targeting grammar, orthography and lexicon. Differently from previous works, we focus on the pre-training stage and we devise two complementary evaluation tasks aimed at assessing the impact of errors on sentence-level inner representations in terms of semantic robustness and linguistic sensitivity. While the first evaluation perspective is meant to probe the model's ability to encode the semantic similarity between sentences also in the presence of errors, the second type of probing task evaluates the influence of errors on BERT's implicit knowledge of a set of raw and morpho-syntactic properties of a sentence. Our experiments show that BERT's ability to compute sentence similarity and to correctly encode multi-leveled linguistic information of a sentence are differently modulated by the category of errors and that the error hierarchies in terms of robustness and sensitivity change across layer-wise representations}, KEYWORDS = {Natural Language Processing, Neural Language Model, Interpretability}, PAGES = {426--438}, URL = {https://doi.org/10.1109/TASLP.2022.3226333}, VOLUME = {31}, DOI = {10.1109/TASLP.2022.3226333}, ISSN = {2329-9290}, JOURNAL = {IEEE/ACM TRANSACTIONS ON AUDIO, SPEECH, AND LANGUAGE PROCESSING}, } @ARTICLE{MIASCHI_2022_ARTICLE_MSBDV_443057, AUTHOR = {Miaschi, A. and Sarti, G. and Brunato, D. P. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {Probing Linguistic Knowledge in Italian Neural Language Models across Language Varieties}, YEAR = {2022}, ABSTRACT = {In this paper, we present an in-depth investigation of the linguistic knowledge encoded by the transformer models currently available for the Italian language. In particular, we investigate how the complexity of two different architectures of probing models affects the performance of the Transformers in encoding a wide spectrum of linguistic features. Moreover, we explore how this implicit knowledge varies according to different textual genres and language varieties}, KEYWORDS = {Neural Language Models, Interpretability, Language Varieties}, PAGES = {25--44}, URL = {http://www.aaccademia.it/ita/scheda-libro?aaref=1518}, DOI = {10.4000/ijcol.965}, ISSN = {2499-4553}, JOURNAL = {IJCOL}, } @BOOK{VENTURI_2022_BOOK_VCD_440167, AUTHOR = {Venturi, G. and Cimino, A. and Dell'Orletta, F.}, TITLE = {La fede dichiarata. Un'analisi linguistico-computazionale}, YEAR = {2022}, ABSTRACT = {Il volume indaga l'apporto di tecnologie basate sul Natural Language Processing (NLP) all'analisi di un corpus di trascrizioni di 164 interviste orali raccolte durante la ricerca 2017 sulla ``Religiosità in Italia''. Gli autori illustrano metodologie e strumenti che permettono di trasformare l'informazione implicitamente contenuta nelle interviste in informazione esplicitamente strutturata. Il risultato finale di questo processo interpretativo spazia dall'acquisizione di conoscenze lessicali e terminologiche complesse alla loro organizzazione in strutture proto-concettuali, fino ad arrivare alla qualificazione dell'atteggiamento con il quale l'intervistato si esprime. 
Il lettore viene accompagnato a scoprire quale sia il valore aggiunto delle analisi basate su NLP e quali nuovi orizzonti di ricerca siano aperti da queste analisi}, KEYWORDS = {Knowledge Extraction, Knowledge Organization}, PAGES = {1--181}, URL = {https://iris.cnr.it/handle/20.500.14243/440167}, PUBLISHER = {Franco Angeli Editore}, ADDRESS = {Milano, Italy}, ISBN = {978-88-351-2146-6}, CONFERENCE_PLACE = {Milano}, } @INPROCEEDINGS{MIASCHI_2022_INPROCEEDINGS_MRD_443056, AUTHOR = {Miaschi, A. and Ravelli, A. and Dell'Orletta, F.}, TITLE = {Punctuation Restoration in Spoken Italian Transcripts with Transformers}, YEAR = {2022}, ABSTRACT = {In this paper, we propose an evaluation of a Transformer-based punctuation restoration model for the Italian language. Experimenting with a BERT-base model, we perform several fine-tuning with different training data and sizes and tested them in an in- and cross-domain scenario. Moreover, we conducted an error analysis of the main weaknesses of the model related to specific punctuation marks. Finally, we test our system either quantitatively and qualitatively, by offering a typical task-oriented and a perception-based acceptability evaluation}, KEYWORDS = {nlp, transformer models, punctuation restoration}, PAGES = {245--260}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85135083576\&origin=inward}, SERIES = {Lecture Notes in Artificial Intelligence}, VOLUME = {13196}, DOI = {10.1007/978-3-031-08421-8_17}, CONFERENCE_NAME = {AIxIA 2021-Advances in Artificial Intelligence}, BOOKTITLE = {Proceedings of AIxIA 2021-Advances in Artificial Intelligence}, } @INPROCEEDINGS{PAPUCCI_2022_INPROCEEDINGS_PDMD_415084, AUTHOR = {Papucci, M. and De Nigris, C. and Miaschi, A. and Dell'Orletta, F.}, TITLE = {Evaluating Text-To-Text Framework for Topic and Style Classification of Italian texts}, YEAR = {2022}, ABSTRACT = {In this paper, we propose an extensive evaluation of the first text-to-text Italian Neural Language Model (NLM), IT5 [1], on a classification scenario. 
In particular, we test the performance of IT5 on several tasks involving both the classification of the topic and the style of a set of Italian posts. We assess the model in two different configurations, single- and multi-task classification, and we compare it with a more traditional NLM based on the Transformer architecture (i.e. BERT). Moreover, we test its performance in a few-shot learning scenario. We also perform a qualitative investigation on the impact of label representations in modeling the classification of the IT5 model. Results show that IT5 could achieve good results, although generally lower than the BERT model. Nevertheless, we observe a significant performance improvement of the Text-to-text model in a multi-task classification scenario. Finally, we found that altering the representation of the labels mainly impacts the classification of the topic}, KEYWORDS = {bert, style classification, t5, text-to-text, topic classification, transformers}, PAGES = {56--70}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85143252156\&origin=inward}, VOLUME = {3287}, CONFERENCE_NAME = {Sixth Workshop on Natural Language for Artificial Intelligence, NL4AI 2022}, BOOKTITLE = {Proceedings of the Sixth Workshop on Natural Language for Artificial Intelligence (NL4AI 2022)}, } @INPROCEEDINGS{PUCCETTI_2022_INPROCEEDINGS_PRDD_521513, AUTHOR = {Puccetti, G. and Rogers, A. and Drozd, A. and Dell'Orletta, F.}, TITLE = {Outlier dimensions that disrupt transformers are driven by frequency}, YEAR = {2022}, ABSTRACT = {While Transformer-based language models are generally very robust to pruning, there is the recently discovered outlier phenomenon: disabling only 48 out of 110M parameters in BERT-base drops its performance by nearly 30\% on MNLI. We replicate the original evidence for the outlier phenomenon and we link it to the geometry of the embedding space. 
We find that in both BERT and RoBERTa the magnitude of hidden state coefficients corresponding to outlier dimensions correlates with the frequency of encoded tokens in pre-training data, and it also contributes to the “vertical” self-attention pattern enabling the model to focus on the special tokens. This explains the drop in performance from disabling the outliers, and it suggests that to decrease anisotropicity in future models we need pre-training schemas that would better take into account the skewed token distributions}, KEYWORDS = {Large Language Models, Mechanistic interpretability, Natural Language Processing}, PAGES = {1286--1304}, URL = {https://aclanthology.org/2022.findings-emnlp.93/}, DOI = {10.18653/v1/2022.findings-emnlp.93}, PUBLISHER = {Association for Computational Linguistics (ACL)}, ISBN = {978-1-959429-43-2}, CONFERENCE_NAME = {EMNLP 2022-Findings of the Association for Computational Linguistics}, BOOKTITLE = {Findings of the Association for Computational Linguistics: EMNLP 2022}, EDITOR = {Goldberg, Y. and Kozareva, Z. and Zhang, Y.}, } @INPROCEEDINGS{ZAMPARELLI_2022_INPROCEEDINGS_ZABCDHV_448002, AUTHOR = {Zamparelli, R. and Chowdhury, S. A. and Brunato, D. and Chesi, C. and Dell'Orletta, F. and Hasan, A. and Venturi, G.}, TITLE = {SemEval-2022 Task 3: PreTENS-Evaluating Neural Networks on Presuppositional Semantic Knowledge}, YEAR = {2022}, ABSTRACT = {We report the results of the SemEval 2022 Task 3, PreTENS, on evaluation the acceptability of simple sentences containing constructions whose two arguments are presupposed to be or not to be in an ordered taxonomic relation. The task featured two sub-tasks articulated as: (i) binary prediction task and (ii) regression task, predicting the acceptability in a continuous scale. The sentences were artificially generated in three languages (English, Italian and French). 
21 systems, with 8 system papers were submitted for the task, all based on various types of fine-tuned transformer systems, often with ensemble methods and various data augmentation techniques. The best systems reached an F1-macro score of 94.49 (sub-task1) and a Spearman correlation coefficient of 0.80 (sub-task2), with interesting variations in specific constructions and/or languages}, KEYWORDS = {Neural Networks, Presuppositional Knowledge, Evaluation}, PAGES = {228--238}, URL = {https://aclanthology.org/2022.semeval-1.29.pdf}, CONFERENCE_NAME = {16th International Workshop on Semantic Evaluation (SemEval-2022)}, BOOKTITLE = {16th International Workshop on Semantic Evaluation (SemEval-2022)}, } @TECHREPORT{ANTONINI_2022_TECHREPORT_ADFNPRSSSTT_445394, AUTHOR = {Antonini, G. and Dell'Orletta, F. and Filippetti, A. and Nuzzolese, A. G. and Palaia, R. and Reale, E. and Saccone, M. and Sfameni, C. and Spinello, A. O. and Trufelli, L. and Tuzi, F.}, TITLE = {Le attività dipartimentali di monitoraggio e programmazione della ricerca nel settore delle scienze umane, sociali e del patrimonio culturale: metodologie, processi, risultati e soluzioni proposte per lo sviluppo di un sistema CNR integrato e multi livello per il monitoraggio e la programmazione della ricerca}, YEAR = {2022}, ABSTRACT = {Il Dipartimento Scienze Umane e Sociali, Patrimonio Culturale (DSU) del CNR, con la fattiva collaborazione di tutti gli Istituti ad esso afferenti, ha condotto un'iniziativa finalizzata a migliorare l'efficacia dei processi di monitoraggio e programmazione delle attività progettuali, infrastrutturali e di ricerca. È stato costituito uno specifico gruppo di lavoro che, interagendo costruttivamente con gli Istituti, ha svolto attività di studio e analisi con l'obiettivo di elaborare una proposta di revisione/aggiornamento delle Aree strategiche e delle relative Aree progettuali dipartimentali. 
Il presente rapporto illustra il percorso che ha portato a tale risultato e fornisce una serie di elementi utili a verificare l'efficacia delle modalità di conduzione dei lavori: dalle metodologie adottate alle attività di rilevazione e analisi dei dati e delle informazioni raccolti con metodi diversi; dall'analisi dei settori di interesse nello scenario più ampio di livello nazionale, europeo e internazionale al ruolo delle scienze sociali, delle discipline umanistiche e del patrimonio culturale in ambito CNR; dalla disamina di tutti gli elementi informativi, una volta disponibili, alla proposta di aggiornamento dell'impianto programmatico delle attività progettuali e di ricerca dipartimentali. Infine, sono evidenziate alcune delle principali criticità riconducibili prevalentemente al contesto organizzativo-gestionale, con particolare riferimento alle misure organizzative, ai metodi e agli strumenti dedicati al monitoraggio e alla programmazione scientifica}, KEYWORDS = {Monitoraggio della ricerca, programmazione della ricerca, Scienze umane - sociali e del patrimonio culturale, PIANO TRIENNALE DI ATTIVITÀ DEL CONSIGLIO NAZIONALE DELLE RICERCHE}, PAGES = {103}, URL = {https://iris.cnr.it/handle/20.500.14243/445394}, INSTITUTION = {Consiglio Nazionale delle Ricerche}, PUBLISHER = {Consiglio Nazionale delle Ricerche}, ADDRESS = {Roma, Italy}, CONFERENCE_PLACE = {Roma}, } @ARTICLE{BACCO_2021_ARTICLE_BCDM_444101, AUTHOR = {Bacco, L. and Cimino, A. and Dell'Orletta, F. and Merone, M.}, TITLE = {Explainable sentiment analysis: A hierarchical transformer-based extractive summarization approach}, YEAR = {2021}, ABSTRACT = {In recent years, the explainable artificial intelligence (XAI) paradigm is gaining wide research interest. The natural language processing (NLP) community is also approaching the shift of paradigm: building a suite of models that provide an explanation of the decision on some main task, without affecting the performances. 
It is not an easy job for sure, especially when very poorly interpretable models are involved, like the almost ubiquitous (at least in the NLP literature of the last years) transformers. Here, we propose two different transformer-based methodologies exploiting the inner hierarchy of the documents to perform a sentiment analysis task while extracting the most important (with regards to the model decision) sentences to build a summary as the explanation of the output. For the first architecture, we placed two transformers in cascade and leveraged the attention weights of the second one to build the summary. For the other architecture, we employed a single transformer to classify the single sentences in the document and then combine the probability scores of each to perform the classification and then build the summary. We compared the two methodologies by using the IMDB dataset, both in terms of classification and explainability performances. To assess the explainability part, we propose two kinds of metrics, based on benchmarking the models' summaries with human annotations. We recruited four independent operators to annotate few documents retrieved from the original dataset. Furthermore, we conducted an ablation study to highlight how implementing some strategies leads to important improvements on the explainability performance of the cascade transformers model}, KEYWORDS = {Natural Language Processing, Sentiment Analysis, Explainable AI}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85114289346\&origin=inward}, VOLUME = {10}, DOI = {10.3390/electronics10182195}, ISSN = {2079-9292}, JOURNAL = {ELECTRONICS}, } @ARTICLE{CHINELLO_2021_ARTICLE_CRFDBE_401397, AUTHOR = {Chinello, A. and Richichi, V. and Fanelli, M. and Dell'Orletta, F. and Boschetti, F. 
and Zappa, L. E.}, TITLE = {La semantica del potere nella rappresentazione materna e paterna: uno studio-pilota linguistico e computazionale dei siti proana in Italia}, YEAR = {2021}, ABSTRACT = {Numerose ricerche hanno cercato di tratteggiare le caratteristiche relazionali dei genitori con figlie affette da Anoressia Nervosa (AN) utilizzando metodi strutturati (interviste, questionari). Considerando il modello delle polarità semantiche, il presente studio pilota vuole esplorare la presenza della semantica del potere, tipica delle famiglie con AN, nei commenti degli utenti di 10 blog pro-anoressia attraverso l'analisi testuale (T-LAB), focalizzando l'attenzione sulle rappresentazioni delle due figure genitoriali. Le analisi mostrano un'alta frequenza dei commenti riguardanti la figura materna e paterna, associate a specifici sentimenti (colpa, conflitto), all'interno di una vasta rete di attori sociali. La rappresentazione materna risulta essere associata a tematiche riguardanti la semantica del potere, il controllo e la conversazione su tematiche alimentari. Alternativamente, la figura paterna risulta essere maggiormente legata ai simboli del potere lavorativo-economico e al tema della mancanza. Questi risultati, sebbene preliminari, suggeriscono la presenza di alcuni termini legati al potere anche negli utenti di blog pro-anoressia in linea a quanto già mostrato in pazienti con AN}, KEYWORDS = {natural language processing, anoressia}, URL = {https://iris.cnr.it/handle/20.500.14243/401397}, VOLUME = {68}, ISSN = {0030-5391}, JOURNAL = {ORIENTAMENTI PEDAGOGICI}, } @ARTICLE{FANTONI_2021_ARTICLE_FCCADP_401396, AUTHOR = {Fantoni, G. and Coli, E. and Chiarello, F. and Apreda, R. and Dell'Orletta, F. 
and Pratelli, G.}, TITLE = {Text mining tool for translating terms of contract into technical specifications: Development and application in the railway sector}, YEAR = {2021}, ABSTRACT = {Tenders or technical terms contain a large quantity of both technical, legal, managerial information mixed in a nested and complex net of relationships. Extracting technical and design information from a document whose aim is both legal and technical, and that is written using several specific jargons, is not a trivial task: the purpose of the research is to try to detect, extract, split and assign information from the text of a tender in an automatic way. It means being able to understand technical and legal terms and organize them in multiple ways: according to product structure, internal organisational structure, etc. The focus is in providing a handy tool that could speed up and facilitate human analysis and allow tackling also the process of transforming customer's requirements into design specifications. The approach chosen to overcome the various issues is to support state-of-the-art Computational Linguistic tools with a wide Knowledge Base. The latter has been constructed both manually and automatically and comprises not only keywords but also concepts, relationships and regular expressions. The implementation of the methodology has been carried out during a project for AnsaldoBreda S.p.A. (now Hitachi Rail Europe). A case study about the tender for a high-speed train has been included to show the functioning and output of the entire software system. (C) 2020 Elsevier B.V. All rights reserved}, KEYWORDS = {Contract terms, Technical requirements, Natural language processing}, PAGES = {17}, URL = {https://iris.cnr.it/handle/20.500.14243/401396}, VOLUME = {124}, DOI = {10.1016/j.compind.2020.103357}, ISSN = {0166-3615}, JOURNAL = {COMPUTERS IN INDUSTRY}, } @ARTICLE{MIASCHI_2021_ARTICLE_MBD_402654, AUTHOR = {Miaschi, A. and Brunato, D. P. 
and Dell'Orletta, F.}, TITLE = {A NLP-based stylometric approach for tracking the evolution of L1 written language competence}, YEAR = {2021}, ABSTRACT = {In this study we present a Natural Language Processing (NLP)-based stylometric approach for tracking the evolution of written language competence in Italian L1 learners. The approach relies on a wide set of linguistically motivated features capturing stylistic aspects of a text, which were extracted from students' essays contained in CItA (Corpus Italiano di Apprendenti L1), the first longitudinal corpus of texts written by Italian L1 learners enrolled in the first and second year of lower secondary school. We address the problem of modeling written language development as a supervised classification task consisting in predicting the chronological order of essays written by the same student at different temporal spans. The promising results obtained in several classification scenarios allow us to conclude that it is possible to automatically model the highly relevant changes affecting written language evolution across time, as well as identifying which features are more predictive of this process. In the last part of the article, we focus the attention on the possible influence of background variables on language learning and we present preliminary results of a pilot study aiming at understanding how the observed developmental patterns are affected by information related to the school environment of the student}, KEYWORDS = {stylometry, computational linguistics, language competence}, PAGES = {71--105}, URL = {https://www.jowr.org/abstracts/vol13_1/Miaschi_et_al_2021_13_1_abstract.html}, VOLUME = {13}, NUMBER = {1}, DOI = {10.17239/jowr-2021.13.01.03}, ISSN = {2030-1006}, JOURNAL = {JOURNAL OF WRITING RESEARCH}, } @ARTICLE{VARGAS_2021_ARTICLE_VCDS_400917, AUTHOR = {Vargas, A. M. and Cominelli, L. and Dell'Orletta, F. and Scilingo, E. 
P.}, TITLE = {Verbal communication in robotics: a study on salient terms, research fields and trends in the last decades based on a computational linguistic analysis}, YEAR = {2021}, ABSTRACT = {Verbal communication is an expanding field in robotics showing a significant increase in both the industrial and research field. The application of verbal communication in robotics aims to reach a natural human-like interaction with robots. In this study, we investigated how salient terms related to verbal communication in robotics have evolved over the years, what are the topics that recur in the related literature, and what are their trends. The study is based on a computational linguistic analysis conducted on a database of 7,435 scientific publications over the last 2 decades. This comprehensive dataset was extracted from the Scopus database using specific key-words. Our results show how relevant terms of verbal communication evolved, which are the main coherent topics and how they have changed over the years. We highlighted positive and negative trends for the most coherent topics and the distribution over the years for the most significant ones. In particular, verbal communication resulted in being highly relevant for social robotics. Potentially, achieving natural verbal communication with a robot can have a great impact on the scientific, societal, and economic role of robotics in the future}, KEYWORDS = {social robotics, affective computing, speech synthesis, speech generation, computational linguistic analysis, data mining, topic modeling, verbal communication}, URL = {https://iris.cnr.it/handle/20.500.14243/400917}, DOI = {10.3389/fcomp.2020.591164}, ISSN = {2624-9898}, JOURNAL = {FRONTIERS IN COMPUTER SCIENCE}, } @INPROCEEDINGS{DEMATTEI_2021_INPROCEEDINGS_DLDN_445812, AUTHOR = {De Mattei, L. and Lai, H. and Dell'Orletta, F. 
and Nissim, M.},
  TITLE = {Human Perception in Natural Language Generation},
  YEAR = {2021},
  ABSTRACT = {We take a collection of short texts, some of which are human-written, while others are automatically generated, and ask subjects, who are unaware of the texts' source, whether they perceive them as human-produced. We use this data to fine-tune a GPT-2 model to push it to generate more human-like texts, and observe that the production of this fine-tuned model is indeed perceived as more human-like than that of the original model. Contextually, we show that our automatic evaluation strategy correlates well with human judgements. We also run a linguistic analysis to unveil the characteristics of human- vs machine-perceived language},
  KEYWORDS = {Natural Language Generation, Neural Language Models, Evaluation},
  PAGES = {15--23},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85123713456\&origin=inward},
  DOI = {10.18653/v1/2021.gem-1.2},
  ISBN = {978-1-954085-67-1},
  CONFERENCE_NAME = {First Workshop on Generation Evaluation and Metrics (GEM 2021)},
  BOOKTITLE = {Proceedings of the First Workshop on Generation Evaluation and Metrics (GEM 2021)},
}

@INPROCEEDINGS{IAVARONE_2021_INPROCEEDINGS_IBD_440176,
  AUTHOR = {Iavarone, B. and Brunato, D. and Dell'Orletta, F.},
  TITLE = {Sentence Complexity in Context},
  YEAR = {2021},
  ABSTRACT = {We study the influence of context on how humans evaluate the complexity of a sentence in English. We collect a new dataset of sentences, where each sentence is rated for perceived complexity within different contextual windows. We carry out an in-depth analysis to detect which linguistic features correlate more with complexity judgments and with the degree of agreement among annotators. We train several regression models, using either explicit linguistic features or contextualized word embeddings, to predict the mean complexity values assigned to sentences in the different contextual windows, as well as their standard deviation.
Results show that models leveraging explicit features capturing morphosyntactic and syntactic phenomena perform always better, especially when they have access to features extracted from all contextual sentences},
  URL = {https://iris.cnr.it/handle/20.500.14243/440176},
  DOI = {10.18653/v1/2021.cmcl-1.23},
}

@INPROCEEDINGS{IAVARONE_2021_INPROCEEDINGS_IMBGSVDG_445809,
  AUTHOR = {Iavarone, B. and Morelli, M. S. and Brunato, D. and Ghiasi, S. and Scilingo, E. P. and Vanello, N. and Dell'Orletta, F. and Greco, A.},
  TITLE = {Analyzing the Interaction between the Reader's Voice and the Linguistic Structure of the Text: a Preliminary Study},
  YEAR = {2021},
  ABSTRACT = {In this study, we present a preliminary analysis of the relationship between the linguistic profile of a text and the voice properties of the reader aiming to improve the speech-based emotion recognition systems. To this aim, we recorded the speech signals from a group of 32 healthy volunteers reading aloud neutral and affective texts and used the BioVoice toolbox to compute some of the main speech features. The selected texts were analyzed to quantify their lexical, morpho-syntactic, and syntactic content. Correlation and Support Vector Regressor analyses between linguistic and speech features have shown a significant modulation of some voice acoustic properties performed by the linguistic structure of the text. Particularly, a significant effect was shown on some specific speech features often used for the assessment of human emotional state (e.g., F0).
This suggests that the lexical, morpho-syntactic, and syntactic properties could play an important role in the emotional dynamics of a person},
  KEYWORDS = {Natural Language Processing, Speech analysis, linguistic profile},
  URL = {https://iris.cnr.it/handle/20.500.14243/445809},
  DOI = {10.36253/978-88-5518-449-6},
  ISBN = {978-88-5518-448-9},
  CONFERENCE_NAME = {12th INTERNATIONAL WORKSHOP ``MODELS AND ANALYSIS OF VOCAL EMISSIONS FOR BIOMEDICAL APPLICATIONS''},
  BOOKTITLE = {Proceedings of 12th INTERNATIONAL WORKSHOP ``MODELS AND ANALYSIS OF VOCAL EMISSIONS FOR BIOMEDICAL APPLICATIONS''},
}

@INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MABDV_446048,
  AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. and Dell'Orletta, F. and Venturi, G.},
  TITLE = {Probing tasks under pressure},
  YEAR = {2021},
  ABSTRACT = {Probing tasks are frequently used to evaluate whether the representations of Neural Language Models (NLMs) encode linguistic information. However, it is still questioned if probing classification tasks really enable such investigation or they simply hint for surface patterns in the data. We present a method to investigate this question by comparing the accuracies of a set of probing tasks on gold and automatically generated control datasets. Our results suggest that probing tasks can be used as reliable diagnostic methods to investigate the linguistic information encoded in NLMs representations},
  KEYWORDS = {Neural Language Models, Linguistic probing, Treebanks},
  PAGES = {1--7},
  URL = {http://ceur-ws.org/Vol-3033/paper29.pdf},
  VOLUME = {3033},
  CONFERENCE_NAME = {8th Italian Conference on Computational Linguistics (CLIC-it 2021)},
}

@INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MBDV_400474,
  AUTHOR = {Miaschi, A. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.},
  TITLE = {What Makes My Model Perplexed?
A Linguistic Investigation on Neural Language Models Perplexity},
  YEAR = {2021},
  ABSTRACT = {This paper presents an investigation aimed at studying how the linguistic structure of a sentence affects the perplexity of two of the most popular Neural Language Models (NLMs), BERT and GPT-2. We first compare the sentence-level likelihood computed with BERT and the GPT-2's perplexity showing that the two metrics are correlated. In addition, we exploit linguistic features capturing a wide set of morpho-syntactic and syntactic phenomena showing how they contribute to predict the perplexity of the two NLMs},
  KEYWORDS = {nlp, interpretability, deep learning},
  PAGES = {40--47},
  URL = {https://www.aclweb.org/anthology/2021.deelio-1.5},
  DOI = {10.18653/v1/2021.deelio-1.5},
  ISBN = {978-1-954085-30-5},
  CONFERENCE_NAME = {2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures},
  BOOKTITLE = {Proceedings of the 2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures},
}

@INPROCEEDINGS{MIASCHI_2021_INPROCEEDINGS_MRD_443055,
  AUTHOR = {Miaschi, A. and Ravelli, A. A. and Dell'Orletta, F.},
  TITLE = {Evaluating Transformer Models for Punctuation Restoration in {Italian}},
  YEAR = {2021},
  ABSTRACT = {In this paper, we propose an evaluation of a Transformer-based punctuation restoration model for the Italian language. Experimenting with a BERT-base model, we perform several fine-tuning with different training data and sizes and tested them in an in- and cross-domain scenario. Moreover, we offer a comparison in a multilingual setting with the same model fine-tuned on English transcriptions.
Finally, we conclude with an error analysis of the main weaknesses of the model related to specific punctuation marks},
  KEYWORDS = {transformer models, nlp, punctuation restoration},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85121647978\&origin=inward},
  VOLUME = {3015},
  CONFERENCE_NAME = {5th Workshop on Natural Language for Artificial Intelligence (NL4AI 2021)},
}

@INPROCEEDINGS{PUCCETTI_2021_INPROCEEDINGS_PMD_400473,
  AUTHOR = {Puccetti, G. and Miaschi, A. and Dell'Orletta, F.},
  TITLE = {How do {BERT} embeddings organize linguistic knowledge?},
  YEAR = {2021},
  ABSTRACT = {Several studies investigated the linguistic information implicitly encoded in Neural Language Models. Most of these works focused on quantifying the amount and type of information available within their internal representations and across their layers. In line with this scenario, we proposed a different study, based on Lasso regression, aimed at understanding how the information encoded by BERT sentence-level representations is arranged within its hidden units. Using a suite of several probing tasks, we showed the existence of a relationship between the implicit knowledge learned by the model and the number of individual units involved in the encodings of this competence. Moreover, we found that it is possible to identify groups of hidden units more relevant for specific linguistic properties},
  KEYWORDS = {NLP, Interpretability, Deep Learning},
  PAGES = {48--57},
  URL = {https://www.aclweb.org/anthology/2021.deelio-1.6},
  DOI = {10.18653/v1/2021.deelio-1.6},
  ISBN = {978-1-954085-30-5},
  CONFERENCE_NAME = {2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures},
  BOOKTITLE = {Proceedings of the 2nd Workshop on Knowledge Extraction and Integration for Deep Learning Architectures},
}

@INPROCEEDINGS{SARTI_2021_INPROCEEDINGS_SBD_440173,
  AUTHOR = {Sarti, G. and Brunato, D.
and Dell'Orletta, F.},
  TITLE = {That Looks Hard: Characterizing Linguistic Complexity in Humans and Language Models},
  YEAR = {2021},
  ABSTRACT = {This paper investigates the relationship between two complementary perspectives in the human assessment of sentence complexity and how they are modeled in a neural language model (NLM). The first perspective takes into account multiple online behavioral metrics obtained from eye-tracking recordings. The second one concerns the offline perception of complexity measured by explicit human judgments. Using a broad spectrum of linguistic features modeling lexical, morpho-syntactic, and syntactic properties of sentences, we perform a comprehensive analysis of linguistic phenomena associated with the two complexity viewpoints and report similarities and differences. We then show the effectiveness of linguistic features when explicitly leveraged by a regression model for predicting sentence complexity and compare its results with the ones obtained by a fine-tuned neural language model. We finally probe the NLM's linguistic competence before and after fine-tuning, highlighting how linguistic information encoded in representations changes when the model learns to predict complexity},
  KEYWORDS = {linguistic complexity, eyetracking, human evaluation},
  PAGES = {48--60},
  URL = {https://aclanthology.org/2021.cmcl-1.5},
  DOI = {10.18653/v1/2021.cmcl-1.5},
  ISBN = {978-1-954085-35-0},
  CONFERENCE_NAME = {Proceedings of Workshop on Cognitive Modeling and Computational Linguistics (CMCL 2021)},
  BOOKTITLE = {Proceedings of Workshop on Cognitive Modeling and Computational Linguistics (CMCL 2021)},
}

@ARTICLE{ALZETTA_2020_ARTICLE_ADMV_446043,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S.
and Venturi, G.},
  TITLE = {Linguistically-driven Selection of Difficult-to-Parse Dependency Structures},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a novel methodology meeting a twofold goal, namely quantifying the reliability of automatically generated dependency relations without using gold data on the one hand, and identifying which are the linguistic constructions negatively affecting the parser performance on the other hand. These represent objectives typically investigated in different lines of research, with different methods and techniques. Our methodology, at the crossroads of these perspectives, allows not only to quantify the parsing reliability of individual dependency types but also to identify and weight the contextual properties making relation instances more or less difficult to parse. The proposed methodology was tested in two different and complementary experiments, aimed at assessing the degree of parsing difficulty across (a) different dependency relation types, and (b) different instances of the same relation. The results show that the proposed methodology is able to identify difficult-to-parse dependency relations without relying on gold data and by taking into account a variety of intertwined linguistic factors. These findings pave the way to novel applications of the methodology, both in the direction of defining new evaluation metrics based purely on automatically parsed data and towards the automatic creation of challenge sets},
  KEYWORDS = {Linguistic Complexity, Syntactic Parsing, Evaluation metrics},
  PAGES = {37--60},
  URL = {https://journals.openedition.org/ijcol/719},
  VOLUME = {6},
  NUMBER = {2},
  DOI = {10.4000/ijcol.719},
  ISSN = {2499-4553},
  JOURNAL = {IJCOL},
}

@ARTICLE{BUONGIOVANN_2020_ARTICLE_BGBD_401391,
  AUTHOR = {Buongiovanni, C. and Gracci, F. and Brunato, D.
and Dell'Orletta, F.},
  TITLE = {Lost in Text: A Cross-Genre Analysis of Linguistic Phenomena within Text},
  YEAR = {2020},
  ABSTRACT = {Moving from the assumption that formal, rather than content features, can be used to detect differences and similarities among textual genres and registers, this paper presents a new approach to linguistic profiling -- a well-established methodological framework to study language variation -- which is applied to detect significant variations within the internal structure of a text. We test this approach on the Italian language using a wide spectrum of linguistic features automatically extracted from parsed corpora representative of four main genres and two levels of complexity for each, and we show that it is possible to model the degree of stylistic variance within texts according to genre and language complexity},
  KEYWORDS = {natural language processing, computational stylometry},
  URL = {https://www.ai-lc.it/wp-content/uploads/2021/03/IJCOL_6_1_3_buongiovanni_et_al.pdf},
  VOLUME = {6},
  NUMBER = {1},
  ISSN = {2499-4553},
  JOURNAL = {IJCOL},
}

@ARTICLE{CHINELLO_2020_ARTICLE_CPFGRZDB_401387,
  AUTHOR = {Chinello, A. and Parma, F. and Frigerio, F. and Galli, C. M. and Richichi, V. and Zappa, L. E. and Dell'Orletta, F. and Boschetti, F.},
  TITLE = {Food semantics on pro-anorexia websites in {Italy}},
  YEAR = {2020},
  ABSTRACT = {Introduction. The term pro-ana (pro-anorexia) means the spread of restrictive eating behaviors and anorectic advices in virtual spaces written by teenagers. The purpose of this pilot study consists in a qualitative and quantitative analysis of foods contained in a linguistic corpus made up of users' comments on pro-ana websites. Method. The corpus of pro-ana websites was analyzed through the T2K tool based on word-frequency processing. Results.
The results show conversations regarding beverages, products of vegetable origin (fruit, vegetables) and low-calorie foods, with a tendency to limit the fear linked to the choice of high-calorie foods through reassuring and reconcilable language labels ("light", "sugar free"). Conclusions. These findings specify the food semantics on pro-ana websites associated to an anorectic vocabulary with restrictive diets. The results could be used to characterize the most common food as risk factors within the eating disorders framework},
  KEYWORDS = {pro-ana, anorexia, language, food, natural language processing},
  PAGES = {297--300},
  URL = {https://iris.cnr.it/handle/20.500.14243/401387},
  VOLUME = {55},
  NUMBER = {5},
  ISSN = {0035-6484},
  JOURNAL = {RIVISTA DI PSICHIATRIA (TESTO STAMP.)},
}

@ARTICLE{VENTURI_2020_ARTICLE_VDMM_426118,
  AUTHOR = {Venturi, G. and Dell'Orletta, F. and Montemagni, S. and Morini E, S. M.},
  TITLE = {Metodi e Tecniche di Trattamento Automatico della Lingua per l'Estrazione di Conoscenza dalla Documentazione Scolastica},
  YEAR = {2020},
  ABSTRACT = {Il contributo riguarda la creazione di un sistema integrato di "knowledge management", per la gestione e condivisione della conoscenza prodotta e utilizzata dalla scuola},
  KEYWORDS = {Estrazione di informazione, Documenti scolastici, Indicizzazione, Terminology extraction},
  PAGES = {49--68},
  URL = {https://iris.cnr.it/handle/20.500.14243/426118},
  VOLUME = {2},
  DOI = {10.3280/CAD2020-002005},
  ISSN = {1122-5165},
  JOURNAL = {CADMO},
}

@ARTICLE{VERTECCHI_2020_ARTICLE_VADMV_426114,
  AUTHOR = {Vertecchi, B. and Agrusti, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Verba et Acta. Un esperimento per promuovere l'evoluzione delle competenze linguistiche degli studenti degli istituti professionali},
  YEAR = {2020},
  ABSTRACT = {Ricerche in corso. Verba et Acta.
Un esperimento per promuovere l'evoluzione delle competenze linguistiche degli studenti degli istituti professionali},
  KEYWORDS = {Evoluzione competenze linguistiche, Annotazione linguistica, Previsione dello sviluppo delle competenze di scrittura},
  PAGES = {109--117},
  URL = {https://iris.cnr.it/handle/20.500.14243/426114},
  NUMBER = {1},
  DOI = {10.3280/CAD2020-001008},
  ISSN = {1122-5165},
  JOURNAL = {CADMO},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_ADMOSV_423610,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Osenova, P. and Simov, K. and Venturi, G.},
  TITLE = {Quantitative linguistic investigations across universal dependencies treebanks},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a case study aimed at identifying cross-lingual quantitative trends in the distribution of dependency relations in treebanks for typologically different languages. Preliminary results show interesting differences rooted either in language-specific peculiarities or cross-lingual annotation inconsistencies, with a potential impact on different application scenarios},
  KEYWORDS = {Universal Dependencies Treebanks, Cross-linguistic analysis, Typology},
  PAGES = {1--7},
  URL = {http://ceur-ws.org/Vol-2769/paper_59.pdf},
  VOLUME = {2769},
  ISBN = {979-12-80136-28-2},
  CONFERENCE_NAME = {7th Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_AMDKFTI_421771,
  AUTHOR = {Alzetta, C. and Miaschi, A. and Dell'Orletta, F. and Koceva, F. and Torre, I.},
  TITLE = {{PRELEARN} @ {EVALITA} 2020: Overview of the Prerequisite Relation Learning Task for {Italian}},
  YEAR = {2020},
  ABSTRACT = {The Prerequisite Relation Learning (PRELEARN) task is the EVALITA 2020 shared task on concept prerequisite learning, which consists of classifying prerequisite relations between pairs of concepts distinguishing between prerequisite pairs and non-prerequisite pairs.
Four sub-tasks were defined: two of them define different types of features that participants are allowed to use when training their model, while the other two define the classification scenarios where the proposed models would be tested. In total, 14 runs were submitted by 3 teams comprising 9 total individual participants},
  KEYWORDS = {nlp, prerequisite learning, shared task},
  URL = {http://ceur-ws.org/Vol-2765/paper164.pdf},
  CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
  BOOKTITLE = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
}

@INPROCEEDINGS{BACCO_2020_INPROCEEDINGS_BBDF_370407,
  AUTHOR = {Bacco, F. and Brunori, G. and Dell'Orletta, F. and Ferrari, A.},
  TITLE = {Using {NLP} to support terminology extraction and domain scoping: report on the {H2020} {DESIRA} project},
  YEAR = {2020},
  ABSTRACT = {The ongoing phenomenon of digitisation is changing social and work life, with tangible effects on the socio-economic context. Understanding the impact, opportunities, and threats of digital transformation requires the identification of viewpoints from a large diversity of stakeholders, from policy makers to domain experts, and from engineers to common citizens. The DESIRA (Digitisation: Economic and Social Impacts in Rural Areas) EU H2020 project considers rural areas, with a strong focus on agricultural and forestry activities, and aims at assessing the impact of digital technologies in those domains by involving a large number of stakeholders, all across Europe, around 20 focal questions. Given the involvement of stakeholders with diverse background and skills, a primary goal of the project is to develop domain-specific and interactive reference taxonomies (i.e., structured classifications of terms) to facilitate common understanding of technologies in use in each domain at today.
The taxonomies, which aims at easing the learning of the meaning of technical and domain-specific terms, are going to be exploited by the stakeholders in 20 Living Labs built around the focal questions. This report paper focuses on the semi-automatic development of the taxonomies through natural language processing (NLP) techniques based on context-specific term extraction. Furthermore, we crawl Wikipedia to enrich the taxonomies with additional categories and definitions. We plan to validate the taxonomies through field studies within the Living Labs},
  KEYWORDS = {NLP, Wikipedia, Socio-economic impact, Taxonomy, Knowledge graph, Terminology extraction, Domain scoping},
  PAGES = {1--5},
  URL = {http://ceur-ws.org/Vol-2584/},
  PUBLISHER = {CEUR-WS.org (Aachen, DEU)},
  CONFERENCE_NAME = {Third Workshop on Natural Language Processing for Requirements Engineering},
  CONFERENCE_PLACE = {Aachen},
}

@INPROCEEDINGS{BACCO_2020_INPROCEEDINGS_BCPMD_401373,
  AUTHOR = {Bacco, L. and Cimino, A. and Paulon, L. and Merone, M. and Dell'Orletta, F.},
  TITLE = {A Machine Learning approach for Sentiment Analysis for {Italian} Reviews in Healthcare},
  YEAR = {2020},
  ABSTRACT = {In this paper, we present our approach to the task of binary sentiment classification for Italian reviews in healthcare domain. We first collected a new dataset for such domain. Then, we compared the results obtained by two different systems, one including a Support Vector Machine and one with BERT. For the first one, we linguistic pre-processed the dataset to extract hand-crafted features exploited by the classifier. For the second one, we oversampled the dataset to achieve better results. Our results show that the SVM-based system, without the worry of having to oversample, has better performance than the BERT-based one, achieving an F1-score of 91.21\%},
  KEYWORDS = {natural language processing, sentiment analysis},
  URL = {https://iris.cnr.it/handle/20.500.14243/401373},
  CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it 2020)},
}

@INPROCEEDINGS{BRUNATO_2020_INPROCEEDINGS_BCDMVZ_423611,
  AUTHOR = {Brunato, D. and Chesi, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G. and Zamparelli, R.},
  TITLE = {{AcCompl-it} @ {EVALITA2020}: Overview of the acceptability \& complexity evaluation task for {Italian}},
  YEAR = {2020},
  ABSTRACT = {The Acceptability and Complexity evaluation task for Italian (AcCompl-it) was aimed at developing and evaluating methods to classify Italian sentences according to Acceptability and Complexity. It consists of two independent tasks asking participants to predict either the acceptability or the complexity rate (or both) of a given set of sentences previously scored by native speakers on a 1-to-7 points Likert scale. In this paper, we introduce the datasets distributed to the participants, we describe the different approaches of the participating systems and provide a first analysis of the obtained results},
  KEYWORDS = {Shared Task, Linguistic Complexity, Acceptability},
  PAGES = {1--8},
  URL = {http://ceur-ws.org/Vol-2765/paper163.pdf},
  VOLUME = {2765},
  CONFERENCE_NAME = {EVALITA '20, Evaluation of NLP and Speech Tools for Italian},
}

@INPROCEEDINGS{BRUNATO_2020_INPROCEEDINGS_BCDMV_384930,
  AUTHOR = {Brunato, D. and Cimino, A. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {{Profiling-UD}: a Tool for Linguistic Profiling of Texts},
  YEAR = {2020},
  ABSTRACT = {In this paper, we introduce Profiling-UD, a new text analysis tool inspired to the principles of linguistic profiling that can support language variation research from different perspectives. It allows the extraction of more than 130 features, spanning across different levels of linguistic description.
Beyond the large number of features that can be monitored, a main novelty of Profiling-UD is that it has been specifically devised to be multilingual since it is based on the Universal Dependencies framework. In the second part of the paper, we demonstrate the effectiveness of these features in a number of theoretical and applicative studies in which they were successfully used for text and author profiling},
  KEYWORDS = {Computational Language Variation Analysis, Linguistic Profiling, Universal Dependencies},
  PAGES = {7145--7151},
  URL = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.883.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN = {979-10-95546-34-4},
  CONFERENCE_NAME = {Conference on Language Resources and Evaluation (LREC)},
  CONFERENCE_PLACE = {Paris},
  BOOKTITLE = {Proceedings of the 12th Language Resources and Evaluation Conference-LREC 2020},
}

@INPROCEEDINGS{CIMINO_2020_INPROCEEDINGS_CDN_400929,
  AUTHOR = {Cimino, A. and Dell'Orletta, F. and Nissim, M.},
  TITLE = {{TAG-it@EVALITA2020}: Overview of the Topic, Age, and Gender prediction task for {Italian}},
  YEAR = {2020},
  ABSTRACT = {The Topic, Age, and Gender (TAG-it) prediction task in Italian was organised in the context of EVALITA 2020, using forum posts as textual evidence for profiling their authors. The task was articulated in two separate subtasks: one where all three dimensions (topic, gender, age) were to be predicted at once; the other where training and test sets were drawn from different forum topics and gender or age had to be predicted separately. Teams tackled the problems both with classical machine learning methods as well as neural models. Using the training-data to fine-tuning a BERT-based monolingual model for Italian proved eventually as the most successful strategy in both subtasks. We observe that topic and gender are easier to predict than age.
The higher results for gender obtained in this shared task with respect to a comparable challenge at EVALITA 2018 might be due to the larger evidence per author provided at this edition, as well as to the availability of pre-trained large models for fine-tuning, which have shown improvement on very many NLP tasks},
  KEYWORDS = {natural language processing, linguistic profiling},
  URL = {https://iris.cnr.it/handle/20.500.14243/400929},
  CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
}

@INPROCEEDINGS{DEMATTEI_2020_INPROCEEDINGS_DCDN_401393,
  AUTHOR = {De Mattei, L. and Cafagna, M. and Dell'Orletta, F. and Nissim, M.},
  TITLE = {Invisible to People but not to Machines: Evaluation of Style-aware Headline Generation in Absence of Reliable Human Judgment},
  YEAR = {2020},
  ABSTRACT = {We automatically generate headlines that are expected to comply with the specific styles of two different Italian newspapers. Through a data alignment strategy and different training/testing settings, we aim at decoupling content from style and preserve the latter in generation. In order to evaluate the generated headlines' quality in terms of their specific newspaper-compliance, we devise a fine-grained evaluation strategy based on automatic classification. We observe that our models do indeed learn newspaper-specific style. Importantly, we also observe that humans aren't reliable judges for this task, since although familiar with the newspapers, they are not able to discern their specific styles even in the original human-written headlines.
The utility of automatic evaluation goes therefore beyond saving the costs and hurdles of manual annotation, and deserves particular care in its design},
  KEYWORDS = {Natural Language Generation, Stylistic variations, Evaluation},
  URL = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.828.pdf},
  CONFERENCE_NAME = {12th Edition of International Conference on Language Resources and Evaluation (LREC 2020)},
}

@INPROCEEDINGS{DEMATTEI_2020_INPROCEEDINGS_DCDNG_400923,
  AUTHOR = {De Mattei, L. and Cafagna, M. and Dell'Orletta, F. and Nissim, M. and Gatt, A.},
  TITLE = {{CHANGE-IT@EVALITA} 2020: {Change Headlines, Adapt News, GEnerate}},
  YEAR = {2020},
  ABSTRACT = {We propose a generation task for Italian -- more specifically, a style transfer task for headlines of Italian newspapers. This is the first shared task on generation included in the EVALITA evaluation framework. Indeed, one of the reasons to have this task is to stimulate more research on generation within the Italian community. With this aim in mind, we release to the participating teams not only training data, but also a baseline sequence to sequence model that performs the task in order to help everyone get started, even when not accustomed to Natural Language Generation (NLG) approaches. Contextually, we explore the complex issue of automatic evaluation of generated text, which is receiving particular attention in the NLG community},
  KEYWORDS = {Natural Language Generation, Style transfer},
  URL = {https://iris.cnr.it/handle/20.500.14243/400923},
  CONFERENCE_NAME = {Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian (EVALITA)},
}

@INPROCEEDINGS{DEMATTEI_2020_INPROCEEDINGS_DCDNG_401384,
  AUTHOR = {De Mattei, L. and Cafagna, M. and Dell'Orletta, F. and Nissim, M.
and Guerini, M.},
  TITLE = {{GePpeTto} Carves {Italian} into a Language Model},
  YEAR = {2020},
  ABSTRACT = {In the last few years, pre-trained neural architectures have provided impressive improvements across several NLP tasks. Still, generative language models are available mainly for English. We develop GePpeTto, the first generative language model for Italian, built using the GPT-2 architecture. We provide a thorough analysis of GePpeTto's quality by means of both an automatic and a human-based evaluation. The automatic assessment consists in (i) calculating perplexity across different genres and (ii) a profiling analysis over GePpeTto's writing characteristics. We find that GePpeTto's production is a sort of bonsai version of human production, with shorter but yet complex sentences. Human evaluation is performed over a sentence completion task, where GePpeTto's output is judged as natural more often than not, and much closer to the original human texts than to a simpler language model which we take as baseline},
  KEYWORDS = {natural language generation},
  URL = {https://iris.cnr.it/handle/20.500.14243/401384},
  CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it 2020)},
}

@INPROCEEDINGS{DEMATTEI_2020_INPROCEEDINGS_DCLDNG_400921,
  AUTHOR = {De Mattei, L. and Cafagna, M. and Lai, H. and Dell'Orletta, F. and Nissim, M. and Gatt, A.},
  TITLE = {On the interaction of automatic evaluation and task framing in headline style transfer},
  YEAR = {2020},
  ABSTRACT = {An ongoing debate in the NLG community concerns the best way to evaluate systems, with human evaluation often being considered the most reliable method, compared to corpus-based metrics. However, tasks involving subtle textual differences, such as style transfer, tend to be hard for humans to perform.
In this paper, we propose an evaluation method for this task based on purposely-trained classifiers, showing that it better reflects system differences than traditional metrics such as BLEU and ROUGE},
  URL = {https://iris.cnr.it/handle/20.500.14243/400921},
}

@INPROCEEDINGS{IAVARONE_2020_INPROCEEDINGS_ID_400968,
  AUTHOR = {Iavarone, B. and Dell'Orletta, F.},
  TITLE = {Predicting movie-elicited emotions from dialogue in screenplay text: A study on {``Forrest Gump''}},
  YEAR = {2020},
  ABSTRACT = {We present a new dataset of sentences extracted from the movie Forrest Gump, annotated with the emotions perceived by a group of subjects while watching the movie. We run experiments to predict these emotions using two classifiers, one based on a Support Vector Machine with linguistic and lexical features, the other based on BERT. The experiments showed that contextual embeddings are effective in predicting human-perceived emotions},
  KEYWORDS = {natural language processing, affective computing},
  URL = {https://iris.cnr.it/handle/20.500.14243/400968},
  CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it 2020)},
}

@INPROCEEDINGS{LENCI_2020_INPROCEEDINGS_LMBDDDDMCPVL_384922,
  AUTHOR = {Lenci, A. and Montemagni, S. and Boschetti, F. and De Felice, I. and Dei Rossi, S. and Dell'Orletta, F. and Di Giorgio, M. and Miliani, M. and Passaro, L. C. and Puddu, A. and Venturi, G. and Labanca, N.},
  TITLE = {Voices of the {Great War}: A Richly Annotated Corpus of {Italian} Texts on the {First World War}},
  YEAR = {2020},
  ABSTRACT = {Voci della Grande Guerra ("Voices of the Great War") is the first large corpus of Italian historical texts dating back to the period of First World War. This corpus differs from other existing resources in several respects. First, from the linguistic point of view it gives account of the wide range of varieties in which Italian was articulated in that period, namely from a diastratic (educated vs. uneducated writers), diaphasic (low/informal vs.
high/formal registers) and diatopic (regional varieties, dialects) points of view. From the historical perspective, through a collection of texts belonging to different genres it represents different views on the war and the various styles of narrating war events and experiences. The final corpus is balanced along various dimensions, corresponding to the textual genre, the language variety used, the author type and the typology of conveyed contents. The corpus is annotated with lemmas, part-of-speech, terminology, and named entities. Significant corpus samples representative of the different "voices" have also been enriched with meta-linguistic and syntactic information. The layer of syntactic annotation forms the first nucleus of an Italian historical treebank complying with the Universal Dependencies standard. The paper illustrates the final resource, the methodology and tools used to build it, and the Web Interface for navigating it}, KEYWORDS = {Historical Corpora, Linguistic and Meta-linguistic Annotation, Information Extraction}, PAGES = {911-918}, URL = {https://www.aclweb.org/anthology/2020.lrec-1.114.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-34-4}, CONFERENCE_NAME = {Conference on Language Resources and Evaluation (LREC)}, CONFERENCE_PLACE = {Paris}, } @INPROCEEDINGS{MATTEI_2020_INPROCEEDINGS_MBD_400967, AUTHOR = {Mattei, A. and Brunato, D. and Dell'Orletta, F.}, TITLE = {The Style of a Successful Story: a Computational Study on the Fanfiction Genre}, YEAR = {2020}, ABSTRACT = {This paper presents a new corpus for the Italian language representative of the fan-fiction genre. It comprises about 55k user-generated stories inspired to the original fantasy saga "Harry Potter" and published on a popular website. 
The corpus is large enough to support data-driven investigations in many directions, from more traditional studies on language variation aimed at characterizing this genre with respect to more traditional ones, to emerging topics in computational social science such as the identification of factors involved in the success of a story. The latter is the focus of the presented case-study, in which a wide set of multi-level linguistic features has been automatically extracted from a subset of the corpus and analysed in order to detect the ones which significantly discriminate successful from unsuccessful stories}, KEYWORDS = {natural language processing, Computational Sociolinguistics, stylistic analysis}, URL = {https://iris.cnr.it/handle/20.500.14243/400967}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it 2020)}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MABDV_421767, AUTHOR = {Miaschi, A. and Alzetta, C. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Is Neural Language Model Perplexity Related to Readability?}, YEAR = {2020}, ABSTRACT = {This paper explores the relationship between Neural Language Model (NLM) perplexity and sentence readability. Starting from the evidence that NLMs implicitly acquire sophisticated linguistic knowledge from a huge amount of training data, our goal is to investigate whether perplexity is affected by linguistic features used to automatically assess sentence readability and if there is a correlation between the two metrics. 
Our findings suggest that this correlation is actually quite weak and the two metrics are affected by different linguistic phenomena}, KEYWORDS = {nlp, neural language models, readability}, URL = {http://ceur-ws.org/Vol-2769/paper_57.pdf}, ISBN = {979-12-80136-28-2}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics}, BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MBDV_379646, AUTHOR = {Miaschi, A. and Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Linguistic Profiling of a Neural Language Model}, YEAR = {2020}, ABSTRACT = {In this paper we investigate the linguistic knowledge learned by a Neural Language Model (NLM) before and after a fine-tuning process and how this knowledge affects its predictions during several classification problems. We use a wide set of probing tasks, each of which corresponds to a distinct sentence-level feature extracted from different levels of linguistic annotation. We show that BERT is able to encode a wide range of linguistic characteristics, but it tends to lose this information when trained on specific downstream tasks. We also find that BERT's capacity to encode different kind of linguistic properties has a positive influence on its predictions: the more it stores readable linguistic information of a sentence, the higher will be its capacity of predicting the expected label assigned to that sentence}, KEYWORDS = {Linguistic Profiling, Neural Language Model, Interpretability}, PAGES = {745-756}, URL = {https://www.aclweb.org/anthology/2020.coling-main.65/}, DOI = {10.18653/v1/2020.coling-main.65}, ISBN = {978-1-952148-27-9}, CONFERENCE_NAME = {International Conference on Computational Linguistics (COLING)}, BOOKTITLE = {International Conference on Computational Linguistics (COLING)}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MDBDSSV_384933, AUTHOR = {Miaschi, A. and Davidson, S. and Brunato, D. P. 
and Dell'Orletta, F. and Sagae, K. and Sanchez Gutierrez, C. H. and Venturi, G.}, TITLE = {Tracking the Evolution of Written Language Competence in L2 Spanish Learners}, YEAR = {2020}, ABSTRACT = {In this paper we present an NLP-based approach for tracking the evolution of written language competence in L2 Spanish learners using a wide range of linguistic features automatically extracted from students' written productions. Beyond reporting classification results for different scenarios, we explore the connection between the most predictive features and the teaching curriculum, finding that our set of linguistic features often reflects the explicit instruction that students receive during each course}, KEYWORDS = {Evolution of Language Competence, Natural Language Processing, Linguistic Profiling}, PAGES = {92-101}, URL = {https://www.aclweb.org/anthology/2020.bea-1.9.pdf}, DOI = {10.18653/v1/2020.bea-1.9}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-941643-83-9}, CONFERENCE_NAME = {15th Workshop on Innovative Use of NLP for Building Educational Applications}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Proceedings of 15th Workshop on Innovative Use of NLP for Building Educational Applications}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MD_421763, AUTHOR = {Miaschi, A. and Dell'Orletta, F.}, TITLE = {Contextual and Non-Contextual Word Embeddings: an in-depth Linguistic Investigation}, YEAR = {2020}, ABSTRACT = {In this paper we present a comparison between the linguistic knowledge encoded in the internal representations of a contextual Language Model (BERT) and a contextual-independent one (Word2vec). We use a wide set of probing tasks, each of which corresponds to a distinct sentence-level feature extracted from different levels of linguistic annotation. 
We show that, although BERT is capable of understanding the full context of each word in an input sequence, the implicit knowledge encoded in its aggregated sentence representations is still comparable to that of a contextual-independent model. We also find that BERT is able to encode sentence-level properties even within single-word embeddings, obtaining comparable or even superior results than those obtained with sentence representations}, KEYWORDS = {nlp, interpretability, representation learning}, PAGES = {110-119}, URL = {https://www.aclweb.org/anthology/2020.repl4nlp-1.15}, DOI = {10.18653/v1/2020.repl4nlp-1.15}, ISBN = {978-1-952148-15-6}, CONFERENCE_NAME = {5th Workshop on Representation Learning for NLP}, BOOKTITLE = {Proceedings of the 5th Workshop on Representation Learning for NLP}, } @INPROCEEDINGS{MIASCHI_2020_INPROCEEDINGS_MSBDV_421765, AUTHOR = {Miaschi, A. and Sarti, G. and Brunato, D. P. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Italian Transformers Under the Linguistic Lens}, YEAR = {2020}, ABSTRACT = {In this paper we present an in-depth investigation of the linguistic knowledge encoded by the transformer models currently available for the Italian language. In particular, we investigate whether and how using different architectures of probing models affects the performance of Italian transformers in encoding a wide spectrum of linguistic features. Moreover, we explore how this implicit knowledge varies according to different textual genres}, KEYWORDS = {nlp, neural language models, interpretability}, URL = {http://ceur-ws.org/Vol-2769/paper_56.pdf}, ISBN = {979-12-80136-28-2}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it)}, BOOKTITLE = {Proceedings of the Seventh Italian Conference on Computational Linguistics (CLiC-it)}, } @INPROCEEDINGS{RAVELLI_2020_INPROCEEDINGS_ROD_400965, AUTHOR = {Ravelli, A. A. and Origlia, A. 
and Dell'Orletta, F.}, TITLE = {Exploring Attention in a Multimodal Corpus of Guided Tours}, YEAR = {2020}, ABSTRACT = {This paper explores the possibility to annotate engagement as an extra-linguistic information in a multimodal corpus of guided tours in cultural sites. Engagement has been annotated in terms of gain or loss of perceived attention from the audience, and this information has been aligned to the transcription of the speech from the guide. A preliminary analysis suggests that the level of engagement correlates with some specific linguistic features, opening up to possible future exploitation}, KEYWORDS = {Multimodal Corpus, natural language processing, engagement}, URL = {https://iris.cnr.it/handle/20.500.14243/400965}, CONFERENCE_NAME = {Seventh Italian Conference on Computational Linguistics (CLiC-it 2020)}, } @ARTICLE{ALZETTA_2019_ARTICLE_ADMV_403586, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {INFERRING QUANTITATIVE TYPOLOGICAL TRENDS FROM MULTILINGUAL TREEBANKS. A CASE STUDY}, YEAR = {2019}, ABSTRACT = {In the past decades, linguistic typology went through a renewing phase that involved a significant change in the research questions and methods of the discipline, which is now interested in fine-grained features underlying language diversity. In this paper, we propose a novel approach to address the newly defined needs of linguistic typology by extracting qualitative and quantitative information about a wide range of features from multilingual annotated corpora based on Natural Language Processing methods and techniques. 
We tested our method in a case study focusing on word order variation in two widely investigated constructions, VERB-SUBJ(ect) and NOUN-ADJ(ective), with a specific view to structural and functional factors underlying the preference for one or the other order, both intra-and cross-linguistically, and their interaction. 
Preliminary experiments have been carried out aimed at acquiring typological evidence from a selection of linguistically annotated treebanks for three different languages, namely Italian, Spanish and English. Our results show the effectiveness of the method in letting similarities and differences also emerge from typologically close languages}, KEYWORDS = {language typology, multilingual annotated corpora, linguistic knowledge extraction and modelling, word order variation}, PAGES = {209-242}, URL = {https://www.rivisteweb.it/doi/10.1418/95391}, VOLUME = {18 (2)}, DOI = {10.1418/95391}, ISSN = {1720-9331}, JOURNAL = {LINGUE E LINGUAGGIO}, } @ARTICLE{APREDA_2019_ARTICLE_ABDF_392529, AUTHOR = {Apreda, R. and Bonaccorsi, A. and Dell'Orletta, F. and Fantoni, G.}, TITLE = {Expert forecast and realized outcomes in technology foresight}, YEAR = {2019}, ABSTRACT = {Contrary to what happens in forecasting, in which the repetitive nature of events lends itself to the ex post validation of expert judgments, it is usually very difficult to compare directly the forecast of technology foresight studies with realized outcomes. When the comparison is feasible, therefore, there is large opportunity for learning and methodological refinement. The authors of this study had the opportunity to re-examine the findings of a technology foresight exercise on the medical device industry with realized technological performance, five years later. Among the findings of the comparison exercise, intriguing false positive as well as false negative cases have been identified. The paper suggests that these cases are due to specific cognitive and motivational biases of experts and examines the way in which they are at work in the foresight process. It argues that these biases are due to the inability of experts to reason systematically in abstract (or "functional") terms during the whole foresight process. 
It also suggests a methodology to mitigate the biases and to manage the emergence of false positives and false negatives}, KEYWORDS = {Expert forecast, Medical device industry, Cognitive biases, Abstract reasoning, Failure mode analysis, Functional analysis}, PAGES = {277-288}, URL = {https://iris.cnr.it/handle/20.500.14243/392529}, VOLUME = {141}, DOI = {10.1016/j.techfore.2018.12.006}, ISSN = {0040-1625}, JOURNAL = {TECHNOLOGICAL FORECASTING AND SOCIAL CHANGE}, } @ARTICLE{DELLORLETTA_2019_ARTICLE_DGMMRSV_403580, AUTHOR = {Dell'Orletta, F. and Greco, S. and Montemagni, S. and Morini, E. and Rossi, F. and Sagri, M. and Venturi, G.}, TITLE = {Le parole del miglioramento. Come le scuole descrivono il cambiamento}, YEAR = {2019}, ABSTRACT = {Il presente contributo intende illustrare i risultati di una ricerca condotta con l'uso di strumenti di trattamento automatico del linguaggio (Natural Language Processing: nlp) su quanto dichiarato dalle scuole in circa 2500 Piani di Miglioramento (modello indire) con l'obiettivo di comprendere le scelte strategiche in un'ottica di miglioramento continuo. Il disegno d'analisi permette di restituire sia una visione complessiva dei Piani di Miglioramento che approfondimenti qualitativi di confronto tra tipologie di scuola e aree geografiche e relativi a tematiche strategiche quali formazione e innovazione}, KEYWORDS = {Piano d, Natural Language Processing, Formazione, Innovazione}, PAGES = {47-68}, URL = {https://www.rivistainfanzia.it/pvw/app/default/pvw_sito.php?sede_codice=1PWPSE01\&page=2432193}, VOLUME = {1/2019}, ISSN = {1971-3711}, JOURNAL = {PSICOLOGIA DELL'EDUCAZIONE}, } @ARTICLE{DELLAGALA_2019_ARTICLE_DCDPV_403584, AUTHOR = {Della Gala, V. and Chiriatti, G. and Dell'Orletta, F. and Pettenati, M. C. and Venturi, G.}, TITLE = {Analytics dei testi riflessivi scritti dai docenti neoassunti nel portfolio digitale}, YEAR = {2019}, ABSTRACT = {Presentiamo i risultati preliminari e l'analisi svolta su circa 50. 
000 testi scritti dai docenti neo nominati in ruolo per riflettere su due attività didattiche svolte con gli studenti, nel contesto del percorso dell'anno di formazione e prova 2016/17. Il percorso prevede attività in presenza e attività a distanza completate sul portfolio digitale, ospitato nell'ambiente online gestito dall'Indire. Nell'ambito del monitoraggio della formazione, con il fine di ottimizzare gli strumenti e il supporto fornito, abbiamo interrogato i dati testuali prodotti dai docenti nell'interazione con l'ambiente per capire se i testi presentassero evidenze riconducibili alle scritture riflessive. Obiettivi dell'indagine sono stati la definizione di uno schema per la classificazione dei testi sulla base del livello di riflessività evidenziato e l'impiego di strumenti di Trattamento Automatico del Linguaggio (TAL) per l'analisi dell'intero corpus testuale prodotto dai docenti. Descriveremo il contesto scientifico e progettuale, le caratteristiche dei dati analizzati, come questo abbia determinato il disegno d'indagine; descriveremo inoltre la sua implementazione e dunque le procedure, gli strumenti e le metriche adottate o elaborate per rappresentare il contenuto dei dati; infine discuteremo i primi risultati e alcuni vantaggi e limiti dell'approccio adottato}, KEYWORDS = {Teacher professional development, Natural Language Processing, Reflective writing, Linguistic Profiling, Document Classification}, PAGES = {187-204}, URL = {https://ojs.pensamultimedia.it/index.php/sird/article/view/3454/3360}, VOLUME = {SPECIAL ISSUE}, DOI = {10.7346/SIRD-2S2019-P189}, ISSN = {2038-9744}, JOURNAL = {GIORNALE ITALIANO DELLA RICERCA EDUCATIVA (ONLINE)}, } @ARTICLE{SAROGNI_2019_ARTICLE_SPSADGCBPADSCTBFM_390441, AUTHOR = {Sarogni, P. and Palumbo, O. and Servadio, A. and Astigiano, S. and D'Alessio, B. and Gatti, V. and Cukrov, D. and Baldari, S. and Pallotta, M. M. and Aretini, P. and Dell'Orletta, F. and Soddu, S. and Carella, M. and Toietta, G. and Barbieri, O. 
and Fontanini, G. and Musio, A.}, TITLE = {Overexpression of the cohesin-core subunit SMC1A contributes to colorectal cancer development}, YEAR = {2019}, ABSTRACT = {Background: Cancer cells are characterized by chromosomal instability (CIN) and it is thought that errors in pathways involved in faithful chromosome segregation play a pivotal role in the genesis of CIN. Cohesin forms a large protein ring that binds DNA strands by encircling them. In addition to this central role in chromosome segregation, cohesin is also needed for DNA repair, gene transcription regulation and chromatin architecture. Though mutations in both cohesin and cohesin-regulator genes have been identified in many human cancers, the contribution of cohesin to cancer development is still under debate. Methods: Normal mucosa, early adenoma, and carcinoma samples deriving from 16 subjects affected by colorectal cancer (CRC) were analyzed by OncoScan for scoring both chromosome gains and losses (CNVs) and loss of heterozygosity (LOH). Then the expression of SMC1A was analyzed by immunochemistry in 66 subjects affected by CRC. The effects of SMC1A overexpression and mutated SMC1A were analyzed in vivo using immunocompromised mouse models. Finally, we measured global gene expression profiles in induced-tumors by RNA-seq. Results: Here we showed that SMC1A cohesin core gene was present as extra-copies, mutated, and overexpressed in human colorectal carcinomas. We then demonstrated that cohesin overexpression led to the development of aggressive cancers in immunocompromised mice through gene expression dysregulation. 
Conclusion: Collectively, these results support a role of defective cohesin in the development of human colorectal cancer}, KEYWORDS = {Cohesin, SMC1A, Chromosome instability, Gene expression dysregulation, Human colorectal cancer development}, PAGES = {16}, URL = {https://iris.cnr.it/handle/20.500.14243/390441}, VOLUME = {38}, DOI = {10.1186/s13046-019-1116-0}, ISSN = {1756-9966}, JOURNAL = {JOURNAL OF EXPERIMENTAL \& CLINICAL CANCER RESEARCH (ONLINE)}, } @INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_ADMV_403587, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Dissecting Treebanks to Uncover Typological Trends. A Multilingual Comparative Approach}, YEAR = {2019}, ABSTRACT = {Over the last years, linguistic typology started attracting the interest of the community working on cross-and multi-lingual NLP as a way to tackle the bottleneck deriving from the lack of annotated data for many languages. Typological information is mostly acquired from publicly accessible typological databases, manually constructed by linguists. As reported in Ponti et al. (2018), despite the abundant information contained in them for many languages, these resources suffer from two main shortcomings, i. e. their limited coverage and the discrete nature of features (only "the majority value rather than the full range of possible values and their corresponding frequencies" is reported). Corpus-based studies can help to automatically acquire quantitative typological evidence which might be exploited for polyglot NLP. Recently, the availability of corpora annotated following a cross-linguistically consistent annotation scheme such as the one developed in the Universal Dependencies project is prompting new comparative linguistic studies aimed to identify similarities as well as idiosyncrasies among typologically different languages (Nivre, 2015). 
The line of research described here is aimed at acquiring quantitative typological evidence from UD treebanks through a multilingual contrastive approach}, KEYWORDS = {Natural Language Processing, Linguistic Typology}, PAGES = {1-3}, URL = {https://typology-and-nlp.github.io/2019/assets/2019/papers/5.pdf}, ISBN = {978-1-950737-29-1}, CONFERENCE_NAME = {1st TyP-NLP: The Workshop on Typology for Polyglot NLP, ACL workshop}, } @INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_AMADKPT_390427, AUTHOR = {Alzetta, C. and Miaschi, A. and Adorni, G. and Dell'Orletta, F. and Koceva, F. and Passalacqua, S. and Torre, I.}, TITLE = {Prerequisite or not prerequisite? That's the problem! An NLP-based Approach for Concept Prerequisites Learning}, YEAR = {2019}, ABSTRACT = {This paper presents a method for prerequisite learning classification between educational concepts. The proposed system was developed by adapting a classification algorithm designed for sequencing Learning Objects to the task of ordering concepts from a computer science textbook. In order to apply the system to the new task, for each concept we automatically created a learning unit from the textbook using two criteria based on concept occurrences and burst intervals. Results are promising and suggest that further improvements could highly benefit the results}, URL = {https://iris.cnr.it/handle/20.500.14243/390427}, } @INPROCEEDINGS{BUONGIOVANNI_2019_INPROCEEDINGS_BGBD_390429, AUTHOR = {Buongiovanni, C. and Gracci, F. and Brunato, D. and Dell'Orletta, F.}, TITLE = {Lost in text. A cross-genre analysis of linguistic phenomena within text}, YEAR = {2019}, ABSTRACT = {Moving from the assumption that formal, rather than content features, can be used to detect differences and similarities among textual genres and registers, this paper presents a new approach to the linguistic profiling methodology, which focuses on the internal parts of a text. 
A case study is presented showing that it is possible to model the degree of variance within texts representative of four traditional genres and two levels of complexity for each}, URL = {https://iris.cnr.it/handle/20.500.14243/390429}, } @INPROCEEDINGS{CHIRIATTI_2019_INPROCEEDINGS_CBDV_380338, AUTHOR = {Chiriatti, G. and Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {What makes a review helpful? Predicting the helpfulness of Italian tripadvisor reviews}, YEAR = {2019}, ABSTRACT = {In this paper we introduce a classification system devoted to predict the helpfulness of Italian online reviews. It is based on a wide set of features reflecting the different factors involved and tested on different categories of TripAdvisor reviews. For this purpose, we collected the first Italian corpus of online reviews enriched with metadata related to their helpfulness and we carried out an in-depth analysis of the most predictive features}, KEYWORDS = {Natural Language Processing, Document Classification, Linguistic Profiling}, PAGES = {1-6}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85074834351\&origin=inward}, VOLUME = {2481}, CONFERENCE_NAME = {6th Italian Conference on Computational Linguistics (CLiC-it)}, } @INPROCEEDINGS{DOMINUTTI_2019_INPROCEEDINGS_DPDMQ_390432, AUTHOR = {Dominutti, E. and Pifferi, L. and Dell'Orletta, F. and Montemagni, S. 
and Quochi, V.}, TITLE = {Building an Italian written-spoken parallel corpus: A pilot study}, YEAR = {2019}, ABSTRACT = {This paper presents a pilot study towards the creation of a monolingual written-spoken parallel corpus in Italian, featuring two main novelties in the general landscape of spoken corpora: the alignment with the written counterpart of the same content and the spoken variety dealt with, represented by transcriptions of radio news broadcasting}, URL = {https://iris.cnr.it/handle/20.500.14243/390432}, } @INPROCEEDINGS{FIEROMONTE_2019_INPROCEEDINGS_FBDV_380336, AUTHOR = {Fieromonte, M. 
and Brunato, D. and Dell'Orletta, F. and Venturi, G.}, TITLE = {Italian and English sentence simplification: How many differences?}, YEAR = {2019}, ABSTRACT = {The paper proposes a cross-linguistic analysis of two parallel monolingual corpora conceived for automatic text simplification in two languages, Italian and English. The aim is to find similarities and differences in the process of simplification in two typologically different languages. To carry out the comparison, 1,000 sentences were extracted from the two corpora and annotated with a scheme previously used to annotate simplification phenomena}, KEYWORDS = {Natural Language Processing, Automatic Text Simplification}, PAGES = {1-6}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85074816689\&origin=inward}, VOLUME = {2481}, CONFERENCE_NAME = {6th Italian Conference on Computational Linguistics (CLiC-it)}, } @INPROCEEDINGS{MASLENNIKOVA_2019_INPROCEEDINGS_MLCD_390425, AUTHOR = {Maslennikova, A. and Labruna, P. and Cimino, A. and Dell'Orletta, F.}, TITLE = {Quanti anni hai? Age identification for Italian}, YEAR = {2019}, ABSTRACT = {We present the first work to our knowledge on automatic age identification for Italian texts. For this work we built a dataset consisting of more than 2.400.000 posts extracted from publicly available forums and containing authorship attribution metadata, such as age and gender. We developed an age classifier and performed a set of experiments with the aim of evaluating the possibility of assigning the correct age of an user and which information is useful to tackle this task: lexical or linguistic information spanning across different levels of linguistic descriptions. 
The performed experiments show the importance of lexical information in age classification, but also that exists writing style that relates to the age of an user}, URL = {https://iris.cnr.it/handle/20.500.14243/390425}, } @INPROCEEDINGS{MIASCHI_2019_INPROCEEDINGS_MACD_390439, AUTHOR = {Miaschi, A. 
and Alzetta, C. and Cardillo, F. A. and Dell'Orletta, F.}, TITLE = {Linguistically-Driven Strategy for Concept Prerequisites Learning on Italian}, YEAR = {2019}, ABSTRACT = {We present a new concept prerequisite learning method for Learning Object (LO) ordering that exploits only linguistic features extracted from textual educational resources. The method was tested in a cross-and in-domain scenario both for Italian and English. Additionally, we performed experiments based on a incremental training strategy to study the impact of the training set size on the classifier performances. The paper also introduces ITA-PREREQ, to the best of our knowledge the first Italian dataset annotated with prerequisite relations between pairs of educational concepts, and describe the automatic strategy devised to build it}, KEYWORDS = {Concept Prerequisites Learning}, PAGES = {285-295}, URL = {https://iris.cnr.it/handle/20.500.14243/390439}, CONFERENCE_NAME = {14th Workshop on Innovative Use of NLP for Building Educational Applications}, BOOKTITLE = {Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications}, } @ARTICLE{CHIARELLO_2018_ARTICLE_CCFD_392587, AUTHOR = {Chiarello, F. and Cimino, A. and Fantoni, G. and Dell'Orletta, F.}, TITLE = {Automatic users extraction from patents}, YEAR = {2018}, ABSTRACT = {The purpose of the present research is to design a method capable of automatically detecting and extracting one of the multiple entities hidden in patents: the users of the invention}, KEYWORDS = {Patent analysis, Deep learning, Text mining}, PAGES = {28-38}, URL = {https://iris.cnr.it/handle/20.500.14243/392587}, VOLUME = {54}, DOI = {10.1016/j.wpi.2018.07.006}, ISSN = {0172-2190}, JOURNAL = {WORLD PATENT INFORMATION}, } @INPROCEEDINGS{ADORNI_2018_INPROCEEDINGS_ADKTV_374898, AUTHOR = {Adorni, G. and Dell'Orletta, F. and Koceva, F. and Torre, I. 
and Venturi, G.}, TITLE = {Extracting dependency relations from digital learning content}, YEAR = {2018}, ABSTRACT = {Digital Libraries present tremendous potential for developing e-learning applications, such as text comprehension and question-answering tools. A way to build this kind of tools is structuring the digital content into relevant concepts and dependency relations among them. While the literature offers several approaches for the former, the identification of dependencies, and specifically of prerequisite relations, is still an open issue. We present an approach to manage this task}, KEYWORDS = {Prerequisite relationship, Concept extraction, Graph mining}, PAGES = {114-119}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85041860435\&origin=inward}, VOLUME = {806}, DOI = {10.1007/978-3-319-73165-0_11}, CONFERENCE_NAME = {14th Italian Research Conference on Digital Libraries (IRCDL 2018)}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMSV_493647, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Simi, M. and Venturi, G.}, TITLE = {Assessing the Impact of Incremental Error Detection and Correction. A Case Study on the Italian Universal Dependency Treebank}, YEAR = {2018}, ABSTRACT = {Detection and correction of errors and inconsistencies in “gold treebanks” are becoming more and more central topics of corpus annotation. The paper illustrates a new incremental method for enhancing treebanks, with particular emphasis on the extension of error patterns across different textual genres and registers. Impact and role of corrections have been assessed in a dependency parsing experiment carried out with four different parsers, whose results are promising. 
For both evaluation datasets, the performance of parsers increases, in terms of the standard LAS and UAS measures and of a more focused measure taking into account only relations involved in error patterns, and at the level of individual dependencies}, KEYWORDS = {Treebank, annotation, annotation error}, PAGES = {1-7}, URL = {https://iris.cnr.it/handle/20.500.14243/493647}, PUBLISHER = {Association for Computational Linguistics (ACL)}, ISBN = {9781948087780}, CONFERENCE_NAME = {2nd Workshop on Universal Dependencies, UDW 2018, held in conjunction with EMNLP 2018}, BOOKTITLE = {EMNLP 2018-2nd Workshop on Universal Dependencies, UDW 2018-Proceedings of the Workshop}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMSV_371344, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Simi, M. and Venturi, G.}, TITLE = {Assessing the Impact of Iterative Error Detection and Correction. A Case Study on the Italian Universal Dependency Treebank}, YEAR = {2018}, ABSTRACT = {Detection and correction of errors and inconsistencies in "gold treebanks" are becoming more and more central topics of corpus annotation. The paper illustrates a new incremental method for enhancing treebanks, with particular emphasis on the extension of error patterns across different textual genres and registers. Impact and role of corrections have been assessed in a dependency parsing experiment carried out with four different parsers, whose results are promising. 
For both evaluation datasets, the performance of parsers increases, in terms of the standard LAS and UAS measures and of a more focused measure taking into account only relations involved in error patterns, and at the level of individual dependencies}, KEYWORDS = {Error Detection, Universal Dependency Treebanks, Syntactic parsing}, PAGES = {1-7}, URL = {http://universaldependencies.org/udw18/PDFs/39_Paper.pdf}, ISBN = {978-1-948087-84-1}, CONFERENCE_NAME = {Universal Dependencies Workshop 2018 (UDW 2018)}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_334766, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Dangerous Relations in Dependency Treebanks}, YEAR = {2018}, ABSTRACT = {The paper illustrates an effective and innovative method for detecting erroneously annotated arcs in gold dependency treebanks based on an algorithm originally developed to measure the reliability of automatically produced dependency relations. The method permits to significantly restrict the error search space and, more importantly, to reliably identify patterns of systematic recurrent errors which represent dangerous evidence to a parser which tendentially will replicate them. Achieved results demonstrate effectiveness and reliability of the method}, KEYWORDS = {Dependency treebanks, Error Detection, Linguistic Annotation}, PAGES = {201-210}, URL = {http://aclweb.org/anthology/W/W17/W17-7624.pdf}, ISBN = {978-80-88132-04-2}, CONFERENCE_NAME = {16th International Workshop on Treebanks and Linguistic Theories}, BOOKTITLE = {Proceedings of the 16th International Workshop on Treebanks and Linguistic Theories}, } @INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_374901, AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Universal Dependencies and Quantitative Typological Trends. 
A Case Study on Word Order},
  YEAR = {2018},
  ABSTRACT = {The paper presents a new methodology aimed at acquiring typological evidence from "gold" treebanks for different languages. In particular, it investigates whether and to what extent algorithms developed for assessing the plausibility of automatically produced syntactic annotations could contribute to shed light on key issues of the linguistic typological literature. It reports the first and promising results of a case study focusing on word order patterns carried out on three different languages (English, Italian and Spanish)},
  KEYWORDS = {Linguistic Knowledge Extraction, Dependency Treebanks, Linguistic Typology},
  PAGES = {4540-4549},
  URL = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/1109.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN = {979-10-95546-00-9},
  CONFERENCE_NAME = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC 2018)},
  CONFERENCE_PLACE = {Paris},
}

@INPROCEEDINGS{BOSCO_2018_INPROCEEDINGS_BSDPT_351854,
  AUTHOR = {Bosco, C. and Sanguinetti, M. and Dell'Orletta, F. and Poletto, F. and Tesconi, M.},
  TITLE = {Overview of the EVALITA 2018 hate speech detection task},
  YEAR = {2018},
  ABSTRACT = {The Hate Speech Detection (HaSpeeDe) task is a shared task on Italian social media (Facebook and Twitter) for the detection of hateful content, and it has been proposed for the first time at EVALITA 2018. Providing two datasets from two different online social platforms differently featured from the linguistic and communicative point of view, we organized the task in three tasks where systems must be trained and tested on the same resource or using one in training and the other in testing: HaSpeeDe-FB, HaSpeeDe-TW and Cross-HaSpeeDe (further subdivided into Cross-HaSpeeDe FB and Cross-HaSpeeDe TW sub-tasks). Overall, 9 teams participated in the task, and the best system achieved a macro F1-score of 0.8288 for HaSpeeDe-FB, 0.7993 for HaSpeeDe-TW, 0.6541 for Cross-HaSpeeDe FB and 0.6985 for Cross-HaSpeeDe TW. In this report, we describe the datasets released and the evaluation measures, and we discuss results},
  KEYWORDS = {Hate Speech Detection, Social Media Analysis},
  PAGES = {9},
  URL = {http://www.scopus.com/inward/record.url?eid=2-s2.0-85058647605\&partnerID=q2rCbXpz},
  VOLUME = {2263},
  CONFERENCE_NAME = {EVALITA 2018-Sixth Evaluation Campaign of Natural Language Processing and Speech Tools for Italian},
}

@INPROCEEDINGS{BRUNATO_2018_INPROCEEDINGS_BDDIV_371346,
  AUTHOR = {Brunato, D. and De Mattei, L. and Dell'Orletta, F. and Iavarone, B. and Venturi, G.},
  TITLE = {Is this sentence difficult? Do you agree?},
  YEAR = {2018},
  ABSTRACT = {In this paper, we present a crowdsourcing-based approach to model the human perception of sentence complexity. We collect a large corpus of sentences rated with judgments of complexity for two typologically-different languages, Italian and English. We test our approach in two experimental scenarios aimed to investigate the contribution of a wide set of lexical, morpho-syntactic and syntactic phenomena in predicting i) the degree of agreement among annotators independently from the assigned judgment and ii) the perception of sentence complexity},
  KEYWORDS = {Linguistic complexity, Crowdsourcing, Human perception},
  PAGES = {1-10},
  URL = {https://www.aclweb.org/anthology/D18-1289/},
  DOI = {10.18653/v1/D18-1289},
  PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)},
  ISBN = {978-1-948087-84-1},
  CONFERENCE_NAME = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  CONFERENCE_PLACE = {Stroudsburg},
}

@INPROCEEDINGS{BRUNATO_2018_INPROCEEDINGS_BVD_392547,
  AUTHOR = {Brunato, D. and Valeriani, M.
and Dell'Orletta, F.},
  TITLE = {DARC-IT: A DAtaset for reading comprehension in Italian},
  YEAR = {2018},
  ABSTRACT = {In this paper, we present DARC-IT, a new reading comprehension dataset for the Italian language aimed at identifying 'question-worthy' sentences, i.e. sentences in a text which contain information that is worth asking a question about. The purpose of the corpus is twofold: to investigate the linguistic profile of question-worthy sentences and to support the development of automatic question generation systems},
  URL = {https://iris.cnr.it/handle/20.500.14243/392547},
  DOI = {10.4000/books.aaccademia.3099},
}

@INPROCEEDINGS{CHIARELLO_2018_INPROCEEDINGS_CBFOCD_392588,
  AUTHOR = {Chiarello, F. and Bonaccorsi, A. and Fantoni, G. and Ossola, G. and Cimino, A. and Dell'Orletta, F.},
  TITLE = {Technical Sentiment Analysis: Measuring Advantages and Drawbacks of New Products Using Social Media},
  YEAR = {2018},
  ABSTRACT = {This work proposes a new social media based model to measure how users perceive new products from a technical point of view. This model relies on the analysis of advantages and drawbacks of products, which are both important aspects evaluated by consumers during the buying decision process. This model is based on a lexicon developed in a related work (Chiarello et al., 2017) to analyse patents and detect advantages and drawbacks connected to a certain technology},
  URL = {https://iris.cnr.it/handle/20.500.14243/392588},
  DOI = {10.4995/CARMA2018.2018.8336},
}

@INPROCEEDINGS{CHIRIATTI_2018_INPROCEEDINGS_CDDMPSV_403577,
  AUTHOR = {Chiriatti, G. and Della Gala, V. and Dell'Orletta, F. and Montemagni, S. and Pettenati, M. C. and Sagri, M. T. and Venturi, G.},
  TITLE = {A NLP-based analysis of reflective writings by Italian teachers},
  YEAR = {2018},
  ABSTRACT = {This paper reports first results of a wider study devoted to exploit the potentialities of a NLP-based approach to the analysis of a corpus of reflective writings on teaching activities.
We investigate how a wide set of linguistic features allows reconstructing the linguistic profile of the texts written by the Italian teachers and predicting whether are reflective},
  KEYWORDS = {Natural Language Processing, Reflective Writings, Linguistic Profiling, Document Classification},
  PAGES = {1-7},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057733802\&origin=inward},
  VOLUME = {2253},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{CIMINO_2018_INPROCEEDINGS_CDD_392545,
  AUTHOR = {Cimino, A. and De Mattei, L. and Dell'Orletta, F.},
  TITLE = {Multi-task learning in deep neural networks at EVALITA 2018},
  YEAR = {2018},
  ABSTRACT = {In this paper we describe the system used for the participation to the ABSITA, GxG, HaSpeeDe and IronITA shared tasks of the EVALITA 2018 conference. We developed a classifier that can be configured to use Bidirectional Long Short Term Memories and linear Support Vector Machines as learning algorithms. When using Bi-LSTMs we tested a multitask learning approach which learns the optimized parameters of the network exploiting simultaneously all the annotated dataset labels and a multiclassifier voting approach based on a k-fold technique. In addition, we developed generic and specific word embedding lexicons to further improve classification performances. When evaluated on the official test sets, our system ranked 1st in almost all subtasks for each shared task, showing the effectiveness of our approach},
  URL = {https://iris.cnr.it/handle/20.500.14243/392545},
}

@INPROCEEDINGS{CIMINO_2018_INPROCEEDINGS_CDBV_403576,
  AUTHOR = {Cimino, A. and Dell'Orletta, F. and Brunato, D. and Venturi, G.},
  TITLE = {Sentences and documents in native language identification},
  YEAR = {2018},
  ABSTRACT = {Starting from a wide set of linguistic features, we present the first in depth feature analysis in two different Native Language Identification (NLI) scenarios.
We compare the results obtained in a traditional NLI document classification task and in a newly introduced sentence classification task, investigating the different role played by the considered features. Finally, we study the impact of a set of selected features extracted from the sentence classifier in document classification},
  KEYWORDS = {Natural Language Processing, Native Language Identification},
  PAGES = {1-6},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057749754\&origin=inward},
  VOLUME = {2253},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{COCCIU_2018_INPROCEEDINGS_CBVD_403579,
  AUTHOR = {Cocciu, E. and Brunato, D. and Venturi, G. and Dell'Orletta, F.},
  TITLE = {Gender and Genre Linguistic profiling: A case study on female and male journalistic and diary prose},
  YEAR = {2018},
  ABSTRACT = {This paper intends to investigate the linguistic profile of male- and female-authored texts belonging to two very different textual genres: newspaper articles and diary prose. By using a wide set of linguistic features automatically extracted from text and spanning across different levels of linguistic description, from lexicon to syntax, our analysis highlights the peculiarities of the two examined genres and how the genre dimension is influenced by variation depending on author's gender (and vice versa)},
  KEYWORDS = {Natural Language Processing, Genre Classification, Linguistic Profiling},
  PAGES = {1-6},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057759773\&origin=inward},
  VOLUME = {2253},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{CRESCI_2018_INPROCEEDINGS_CCATD_392594,
  AUTHOR = {Cresci, S. and Cimino, A. and Avvenuti, M. and Tesconi, M.
and Dell'Orletta, F.},
  TITLE = {Real-world witness detection in social media via hybrid crowdsensing},
  YEAR = {2018},
  ABSTRACT = {The task of witness detection in social media is crucial for many practical applications, including rumor debunking, emergency management, and public opinion mining. Yet to date, it has been approached in an approximated way. We propose a method for addressing witness detection in a strict and realistic fashion. By employing hybrid crowdsensing over Twitter, we contact real-life witnesses and use their reactions to build a strong ground-truth, thus avoiding a manual, subjective annotation of the dataset. Using this dataset, we develop a witness detection system based on a machine learning classifier using a wide set of linguistic features and metadata associated with the tweets},
  URL = {https://iris.cnr.it/handle/20.500.14243/392594},
}

@INPROCEEDINGS{CUTUGNO_2018_INPROCEEDINGS_CDPSS_358247,
  AUTHOR = {Cutugno, F. and Dell'Orletta, F. and Poggi, I. and Savy, R. and Sorgente, A.},
  TITLE = {The CHROME Manifesto: Integrating multimodal data into Cultural Heritage Resources},
  YEAR = {2018},
  ABSTRACT = {The CHROME Project aims at collecting a wide portfolio of digital resources oriented to technological application in Cultural Heritage (henceforth CH). The contributions for the realisation of such objective come from the efforts of computer scientists, psychologists, architects, and computational linguists, who constitute an interdisciplinary equipe. We are collecting and analyzing texts, spoken materials, architectural surveys, and human motion videos, attempting the integration of these data in a multidimensional platform based on multilevel annotation systems, game engines importing, and virtualization techniques. As case of study we choose to work on the magic travel along three Charterhouses located in Campania region: S. Martino in Naples, S. Lorenzo in Padula (Salerno) and S.
Giacomo, in Capri},
  KEYWORDS = {cultural heritage, multimodal interaction},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057736501\&origin=inward},
  VOLUME = {2253},
  CONFERENCE_NAME = {CLiC-it 2018 Italian Conference on Computational Linguistics},
}

@INPROCEEDINGS{DEFELICE_2018_INPROCEEDINGS_DDVLM_403578,
  AUTHOR = {De Felice, I. and Dell'Orletta, F. and Venturi, G. and Lenci, A. and Montemagni, S.},
  TITLE = {Italian in the Trenches: Linguistic annotation and analysis of texts of the great war},
  YEAR = {2018},
  ABSTRACT = {The paper illustrates the design and development of a textual corpus representative of the historical variants of Italian during the Great War, which was enriched with linguistic (lemmatization and pos-tagging) and meta-linguistic annotation. The corpus, after a manual revision of the linguistic annotation, was used for specializing existing NLP tools to process historical texts with promising results},
  KEYWORDS = {Natural Language Processing, Automatic Linguistic Annotation},
  PAGES = {1-5},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057734451\&origin=inward},
  VOLUME = {2253},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{DEMATTEI_2018_INPROCEEDINGS_DCD_392584,
  AUTHOR = {De Mattei, L. and Cimino, A. and Dell'Orletta, F.},
  TITLE = {Multi-task learning in deep neural network for sentiment polarity and irony classification},
  YEAR = {2018},
  ABSTRACT = {We study the impact of a new multi-task learning approach in deep neural network for polarity and irony detection in Italian Twitter posts. We compare this approach with traditional single-task learning models. The different behavior of the two approaches shows the effectiveness of the proposed method that is able to combine the information from the two tasks improving the accuracy in both tasks.
This is particularly true on edge cases in which knowledge about the two tasks is needed to classify a tweet, this is the case, for example, when the literal polarity of a tweet is inverted by irony},
  URL = {https://iris.cnr.it/handle/20.500.14243/392584},
}

@INPROCEEDINGS{DELLOGLIO_2018_INPROCEEDINGS_DBD_392583,
  AUTHOR = {Dell'Oglio, P. and Brunato, D. and Dell'Orletta, F.},
  TITLE = {Lexicon and Syntax: Complexity across genres and language varieties},
  YEAR = {2018},
  ABSTRACT = {This paper presents first results of an ongoing work to investigate the interplay between lexical complexity and syntactic complexity with respect to nominal lexicon and how it is affected by textual genre and level of linguistic complexity within genre. A cross-genre analysis is carried out for the Italian language using multi-leveled linguistic features automatically extracted from dependency parsed corpora},
  URL = {https://iris.cnr.it/handle/20.500.14243/392583},
  DOI = {10.4000/books.aaccademia.3282},
}

@INPROCEEDINGS{DELLORLETTA_2018_INPROCEEDINGS_DN_392544,
  AUTHOR = {Dell'Orletta, F. and Nissim, M.},
  TITLE = {Overview of the Evalita 2018 cross-genre gender prediction (GXG) task},
  YEAR = {2018},
  ABSTRACT = {The Gender Cross-Genre (GxG) task is a shared task on author profiling (in terms of gender) on Italian texts, with a specific focus on cross-genre performance. This task has been proposed for the first time at EVALITA 2018, providing different datasets from different textual genres: Twitter, YouTube, Children writing, Journalism, Personal diaries. Results from a total of 50 different runs show that the task is difficult to learn in itself: while almost all runs beat a 50\% baseline, no model reaches an accuracy above 70\%.
We also observe that cross-genre modelling yields a drop in performance, but not as substantial as one would expect},
  URL = {https://iris.cnr.it/handle/20.500.14243/392544},
  DOI = {10.4000/books.aaccademia.4478},
}

@INPROCEEDINGS{PETROLITO_2018_INPROCEEDINGS_PD_392548,
  AUTHOR = {Petrolito, R. and Dell'Orletta, F.},
  TITLE = {Word embeddings in sentiment analysis},
  YEAR = {2018},
  ABSTRACT = {In the late years sentiment analysis and its applications have reached growing popularity. Concerning this field of research, in the very late years machine learning and word representation learning derived from distributional semantics field (i.e. word embeddings) have proven to be very successful in performing sentiment analysis tasks. In this paper we describe a set of experiments, with the aim of evaluating the impact of word embedding-based features in sentiment analysis tasks},
  URL = {https://iris.cnr.it/handle/20.500.14243/392548},
  DOI = {10.4000/books.aaccademia.3589},
}

@ARTICLE{FERRARI_2017_ARTICLE_FDEGG_341661,
  AUTHOR = {Ferrari, A. and Dell'Orletta, F. and Esuli, A. and Gervasi, V. and Gnesi, S.},
  TITLE = {Natural language requirements processing: a 4D vision},
  YEAR = {2017},
  ABSTRACT = {Natural language processing (NLP) and requirements engineering (RE) have had a long relationship, yet their combined use isn't well established in industrial practice. This situation should soon change. The future evolution of the application of NLP technologies in RE can be viewed from four dimensions: discipline, dynamism, domain knowledge, and datasets},
  KEYWORDS = {Natural Language Processing, Requirement Processing},
  PAGES = {28-35},
  URL = {http://ieeexplore.ieee.org/abstract/document/8106888/},
  VOLUME = {34},
  NUMBER = {6},
  DOI = {10.1109/MS.2017.4121207},
  ISSN = {0740-7459},
  JOURNAL = {IEEE SOFTWARE},
}

@ARTICLE{VENTURI_2017_ARTICLE_VDMFB_342151,
  AUTHOR = {Venturi, G. and Dell'Orletta, F. and Montemagni, S. and Flore, E. and Bellandi, T.},
  TITLE = {La qualità dei consensi informati.
Un'analisi linguistico-computazionale della leggibilità dei testi},
  YEAR = {2017},
  ABSTRACT = {La leggibilità dei testi delle informative di consenso per le procedure diagnostico-terapeutiche è un requisito fondamentale, per offrire alle persone assistite l'accesso alle informazioni necessarie a una scelta consapevole delle opzioni disponibili per curare i diversi problemi di salute. La disponibilità di un testo leggibile è inoltre un aiuto per i medici responsabili della comunicazione e della raccolta del consenso, che possono impiegarlo come un ausilio alle informazioni presentate in forma verbale durante il colloquio, in modo tale da poter condividere una base di conoscenze minime da condividere con il paziente e i suoi familiari. Seppure le evidenze siano limitate in merito alla relazione tra la qualità del consenso e l'attitudine al contenzioso da parte dei pazienti in caso di trattamenti che esitano in un danno attribuibile alle cure (Durand et al., 2015), si tratta di un ambito di ricerca di crescente interesse nella letteratura sulla sicurezza (Wu et al., 2005; Manta et al., 2017).
Nella casistica regionale della Toscana sulle richieste di risarcimento, solo l'1\% dei sinistri include problemi di consenso informato (dati Centro GRC), probabilmente anche a causa di una sottovalutazione del diritto all'informazione da parte dei cittadini che si sottopongono a interventi programmati, connessa con una limitata consapevolezza del potere di scegliere le proprie cure che ogni persona dovrebbe poter esercitare posta di fronte alle opzioni terapeutiche disponibili per i propri problemi di salute},
  KEYWORDS = {Consenso informato, valutazione automatica della leggibilità, Trattamento Automatico del Linguaggio},
  PAGES = {35-39},
  URL = {http://www.formas.toscana.it/rivistadellasalute/fileadmin/files/fascicoli/2017/212/SeT_fascicolo_212.pdf},
  VOLUME = {212},
  ISSN = {0392-4505},
  JOURNAL = {SALUTE E TERRITORIO},
}

@INPROCEEDINGS{BRUNATO_2017_INPROCEEDINGS_BD_339434,
  AUTHOR = {Brunato, D. and Dell'Orletta, F.},
  TITLE = {On the order of words in Italian: a study on genre vs complexity},
  YEAR = {2017},
  ABSTRACT = {In this paper we present a cross-genre study on word order variation in Italian based on automatically dependency-parsed corpora. A comparative analysis focused on dependency direction and dependency distance for major constituents in the sentence is carried out in order to assess the influence of both textual genre and linguistic complexity on the distribution of phenomena of syntactic markedness},
  KEYWORDS = {word order, syntactic analysis, linguistic complexity, natural language processing},
  PAGES = {25-31},
  URL = {https://iris.cnr.it/handle/20.500.14243/339434},
  CONFERENCE_NAME = {International Conference on Dependency Linguistics (Depling 2017)},
}

@INPROCEEDINGS{CIMINO_2017_INPROCEEDINGS_CD_341662,
  AUTHOR = {Cimino, A.
and Dell'Orletta, F.},
  TITLE = {Stacked Sentence-Document Classifier Approach for Improving Native Language Identification},
  YEAR = {2017},
  ABSTRACT = {In this paper, we describe the approach of the ItaliaNLP Lab team to native language identification and discuss the results we submitted as participants to the essay track of NLI Shared Task 2017. We introduce for the first time a 2-stacked sentence-document architecture for native language identification that is able to exploit both local sentence information and a wide set of general-purpose features qualifying the lexical and grammatical structure of the whole document. When evaluated on the official test set, our sentence-document stacked architecture obtained the best result among all the participants of the essay track with an F1 score of 0.8818},
  KEYWORDS = {Native Language Identification},
  PAGES = {430-437},
  URL = {https://iris.cnr.it/handle/20.500.14243/341662},
  CONFERENCE_NAME = {Workshop on Innovative Use of NLP for Building Educational Applications},
}

@INPROCEEDINGS{CIMINO_2017_INPROCEEDINGS_CWDMV_342154,
  AUTHOR = {Cimino, A. and Wieling, M. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Identifying predictive features for textual genre classification: The key role of syntax},
  YEAR = {2017},
  ABSTRACT = {The paper investigates impact and role of different feature types for the specific task of Automatic Genre Classification with the final aim of identifying the most predictive ones. The goal was pursued by carrying out incremental feature selection through Grafting using different sets of linguistic features.
Achieved results for discriminating among four traditional textual genres show the key role played by syntactic features, whose impact turned out to vary across genres},
  KEYWORDS = {Textual Genre Classification, Feature Selection, Syntactic Features},
  PAGES = {1-6},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85037370866\&origin=inward},
  VOLUME = {2006},
  CONFERENCE_NAME = {Italian Conference on Computational Linguistics (CLiC-it)},
}

@INPROCEEDINGS{DELVIGNA_2017_INPROCEEDINGS_DCDPT_355348,
  AUTHOR = {Del Vigna, F. and Cimino, A. and Dell'Orletta, F. and Petrocchi, M. and Tesconi, M.},
  TITLE = {Hate me, hate me not: Hate speech detection on Facebook},
  YEAR = {2017},
  ABSTRACT = {While favouring communications and easing information sharing, Social Network Sites are also used to launch harmful campaigns against specific groups and individuals. Cyberbullism, incitement to self-harm practices, sexual predation are just some of the severe effects of massive online offensives. Moreover, attacks can be carried out against groups of victims and can degenerate in physical violence. In this work, we aim at containing and preventing the alarming diffusion of such hate campaigns. Using Facebook as a benchmark, we consider the textual content of comments appeared on a set of public Italian pages. We first propose a variety of hate categories to distinguish the kind of hate. Crawled comments are then annotated by up to five distinct human annotators, according to the defined taxonomy. Leveraging morpho-syntactical features, sentiment polarity and word embedding lexicons, we design and implement two classifiers for the Italian language, based on different learning algorithms: the first based on Support Vector Machines (SVM) and the second on a particular Recurrent Neural Network named Long Short Term Memory (LSTM). We test these two learning algorithms in order to verify their classification performances on the task of hate speech recognition.
The results show the effectiveness of the two classification approaches tested over the first manually annotated Italian Hate Speech Corpus of social media text},
  KEYWORDS = {Hate speech, NLP, Social Networks},
  PAGES = {86-95},
  URL = {http://www.scopus.com/inward/record.url?eid=2-s2.0-85017337270\&partnerID=q2rCbXpz},
  VOLUME = {1816},
  CONFERENCE_NAME = {ITA-SEC 17},
}

@INPROCEEDINGS{GIOVANNETTI_2017_INPROCEEDINGS_GABDD_340694,
  AUTHOR = {Giovannetti, E. and Albanesi, D. and Bellandi, A. and Dattilo, D. and Dell'Orletta, F.},
  TITLE = {Stylometry in Computer-Assisted Translation: Experiments on the Babylonian Talmud},
  YEAR = {2017},
  ABSTRACT = {The purpose of this research is to experiment the application of stylometric techniques in the area of Computer-Assisted Translation to reduce the revision effort in the context of a collaborative, large scale translation project. The obtained results show a correlation between the editing extent and the compliance to some specific linguistic features, proving that supporting translators in writing translations following a desired style can actually reduce the number of following necessary interventions (and, consequently, save time) by revisors, editors and curators},
  KEYWORDS = {traduco, babylonian talmud, computer-assisted translation, stylometry, readability},
  PAGES = {177-182},
  URL = {https://iris.cnr.it/handle/20.500.14243/340694},
  PUBLISHER = {Accademia University Press (Torino, ITA)},
  ISBN = {9788899982942},
  CONFERENCE_NAME = {Fourth Italian Conference on Computational Linguistics (CLiC-it)},
  CONFERENCE_PLACE = {Torino},
  BOOKTITLE = {Proceedings of 4th Italian Conference on Computational Linguistics (CLiC-it)},
  EDITOR = {Basili, R. and Nissim, M. and Satta, G.},
}

@INPROCEEDINGS{VADICAMO_2017_INPROCEEDINGS_VCFCDCT_340636,
  AUTHOR = {Vadicamo, L. and Carrara, F. and Falchi, F. and Cimino, A. and Dell'Orletta, F. and Cresci, S.
and Tesconi, M.},
  TITLE = {Cross-media learning for image sentiment analysis in the wild},
  YEAR = {2017},
  ABSTRACT = {Much progress has been made in the field of sentiment analysis in the past years. Researchers relied on textual data for this task, while only recently they have started investigating approaches to predict sentiments from multimedia content. With the increasing amount of data shared on social media, there is also a rapidly growing interest in approaches that work "in the wild", i.e. that are able to deal with uncontrolled conditions. In this work, we faced the challenge of training a visual sentiment classifier starting from a large set of user-generated and unlabeled contents. In particular, we collected more than 3 million tweets containing both text and images, and we leveraged on the sentiment polarity of the textual contents to train a visual sentiment classifier. To the best of our knowledge, this is the first time that a cross-media learning approach is proposed and tested in this context. We assessed the validity of our model by conducting comparative studies and evaluations on a benchmark for visual sentiment analysis. Our empirical study shows that although the text associated to each image is often noisy and weakly correlated with the image content, it can be profitably exploited to train a deep Convolutional Neural Network that effectively predicts the sentiment polarity of previously unseen images},
  KEYWORDS = {Big data, Data Mining, Sentiment Analysis, Social Media Analysis},
  PAGES = {10},
  URL = {https://ieeexplore.ieee.org/document/8265255},
  DOI = {10.1109/ICCVW.2017.45},
  ISBN = {978-1-5386-1034-3},
  CONFERENCE_NAME = {ICCV 2017 IEEE International Conference on Computer Vision Workshops},
}

@MISC{CARRARA_2017_MISC_CCCDFVT_411838,
  AUTHOR = {Carrara, F. and Cimino, A. and Cresci, S. and Dell'Orletta, F. and Falchi, F. and Vadicamo, L.
and Tesconi, M.},
  TITLE = {T4SA: Twitter for Sentiment Analysis},
  YEAR = {2017},
  ABSTRACT = {T4SA is intended for training and testing image sentiment analysis approaches. It contains little less than a million tweets, corresponding to about 1.5M images. We initially collected about 3.4M tweets corresponding to about 4M images. We classified the sentiment polarity of the texts (as described in Section 4) and we selected the tweets having the most confident textual sentiment predictions to build our Twitter for Sentiment Analysis (T4SA) dataset. The dataset is publicly available at: http://www.t4sa.it/},
  KEYWORDS = {social media, sentiment analysis, image analysis, image sentiment analysis, deep learning, multimedia sentiment analysis, dataset, tweets},
  URL = {http://www.t4sa.it/},
}

@ARTICLE{APREDA_2016_ARTICLE_ABDF_392597,
  AUTHOR = {Apreda, R. and Bonaccorsi, A. and Dell'Orletta, F. and Fantoni, G.},
  TITLE = {Functional technology foresight. A novel methodology to identify emerging technologies},
  YEAR = {2016},
  ABSTRACT = {The speed and complexity of the technology evolution faced by modern societies need new approaches to the analysis and understanding of the world. Indeed, an exclusive focus on technological goals can miss to recognize all the stakeholders of a technology and address real user needs; moreover, on the one hand low signals are becoming more and more important in fast evolving markets, on the other hand the excess of hype, fashions, or vested interests sometimes deeply alter indicators. However, the so called Big Data promise to be a huge low cost set of valuable information, available and affordable to all (SMEs included). But, analyzing them is not trivial especially if we deal with academic papers and patents. To tackle these issues, the present paper proposes to apply a powerful methodological tool called Functional Analysis to the Technology Foresight process.
Actually the rigorous study of the functions, that an artefact should perform to satisfy the user needs, provides a universal and thus unifying point of view, which is able to correlate the user perspective on the product with its technical features. Functional reasoning has been applied to (i) detect possible patterns of development, spotting missing elements and highlighting strengths as well as potential sources of failure; (ii) to enhance traditional bibliometric tools such as the analysis of S-curves and (iii), integrated with a natural language processing analysis toolchain, tailored for patent documents, to identify emerging technologies. The paper describes the functional approach to technology foresight activity, presents how to integrate it with text mining algorithms and experts' domain knowledge, and finally discusses its benefits in the context of Technology Foresight also from an economic point of view, showing that foresight is affordable also for Small and Medium Enterprises},
  URL = {https://iris.cnr.it/handle/20.500.14243/392597},
  DOI = {10.1007/s40309-016-0093-1},
  ISSN = {2195-2248},
  JOURNAL = {EUROPEAN JOURNAL OF FUTURES RESEARCH},
}

@ARTICLE{BRUNATO_2016_ARTICLE_BD_325818,
  AUTHOR = {Brunato, D. and Dell'Orletta, F.},
  TITLE = {ISACCO: a corpus for investigating spoken and written language development in Italian school-age children},
  YEAR = {2016},
  ABSTRACT = {In this paper we present ISACCO (Italian School-Age Children COrpus), a corpus of oral and written retellings of Italian-speaking children attending primary school. All texts were digitalized and automatically enriched with multi-level linguistic annotation. Preliminary explorations of both the form and the content of children's productions were carried out based on a set of features automatically extracted by NLP tools. Written retellings were manually annotated with a typology of errors belonging to three different linguistic levels.
The resource, which has been made publicly available, is conceived to support research and computational modeling of "later language acquisition", with an emphasis on comparative assessment of the evolution of oral and written language competencies in early school grades},
  KEYWORDS = {Child language acquisition, Oral and Written language, multi-level linguistic analysis},
  PAGES = {63-76},
  URL = {http://www.italianlp.it/wp-content/uploads/2016/09/04_brunato_dell-orletta.pdf},
  VOLUME = {2},
  NUMBER = {1},
  ISSN = {2499-4553},
  JOURNAL = {IJCOL},
}

@ARTICLE{BRUNATO_2016_ARTICLE_BDMV_372675,
  AUTHOR = {Brunato, D. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Monitoraggio linguistico di Scritture Brevi: aspetti metodologici e primi risultati},
  YEAR = {2016},
  ABSTRACT = {Se da un lato le tecnologie del linguaggio svolgono un ruolo ormai indiscusso per l'accesso al contenuto testuale, ciò non appare scontato quando si va a considerare il loro ruolo nella valutazione delle strutture linguistiche sottostanti al testo. Questo contributo si focalizza sulla definizione di una metodologia innovativa di monitoraggio linguistico della lingua italiana che a partire dall'output di strumenti di annotazione linguistica automatica permette di ricostruire un profilo linguistico di una collezione di testi rappresentativa di una specifica varietà d'uso della lingua.
Tale metodologia è stata applicata a un corpus di tweet allo scopo di far luce su interrogativi aperti quali la possibilità di rintracciare tendenze lessicali, morfo-sintattiche e sintattiche peculiari all'interno di questa tipologia testuale; di studiare come queste tendenze si rapportino ai tratti caratterizzanti della lingua scritta e parlata; di individuare possibili differenze nella forma linguistica in cui si twittano contenuti di natura diversa}, KEYWORDS = {Trattamento Automatico del Linguaggio, Monitoraggio Linguistico, Varietà d'Uso della Lingua, Lingua del Web}, PAGES = {149-176}, URL = {https://iris.cnr.it/handle/20.500.14243/372675}, VOLUME = {N. S. 5}, ISSN = {1825-2796}, JOURNAL = {QUADERNI DI AIŌN}, } @EDITORIAL{BRUNATO_2016_EDITORIAL_BDVFB_351623, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G. and François, T. and Blache, P.}, TITLE = {Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC 2016)}, YEAR = {2016}, ABSTRACT = {Introduzione agli atti della prima edizione del workshop "Computational Linguistics for Linguistic Complexity" che raccoglie lavori che studiano da prospettive diverse il tema della complessità linguistica workshop allo scopo di promuovere una riflessione comune su approcci diversi all'indagine, al trattamento e alla valutazione di aspetti che rendono complessa la lingua}, KEYWORDS = {Linguistic Complexity, Computational Linguistics}, PAGES = {1-245}, URL = {https://aclweb.org/anthology/W/W16/W16-41.pdf}, ISBN = {978-4-87974-709-9}, } @INPROCEEDINGS{BARBAGLI_2016_INPROCEEDINGS_BLDMV_325812, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. 
and Venturi, G.}, TITLE = {CItA: an L1 Italian Learners Corpus to Study the Development of Writing Competence}, YEAR = {2016}, ABSTRACT = {In this paper, we present the CItA corpus (Corpus Italiano di Apprendenti L1), a collection of essays written by Italian L1 learners collected during the first and second year of lower secondary school. The corpus was built in the framework of an interdisciplinary study jointly carried out by computational linguists and experimental pedagogists and aimed at tracking the development of written language competence over the years and students' background information}, KEYWORDS = {Italian Learner Corpus, Diachronic Evolution of Written Language Competence, Error Annotation}, PAGES = {88-95}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/536_Paper.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Paris}, } @INPROCEEDINGS{BRUNATO_2016_INPROCEEDINGS_BCDV_333951, AUTHOR = {Brunato, D. and Cimino, A. and Dell'Orletta, F. and Venturi, G.}, TITLE = {PaCCSS-IT: A Parallel Corpus of Complex-Simple Sentences for Automatic Text Simplification}, YEAR = {2016}, ABSTRACT = {In this paper we present PaCCSS-IT, a Parallel Corpus of Complex-Simple Sentences for ITalian. To build the resource we develop a new method for automatically acquiring a corpus of complex-simple paired sentences able to intercept structural transformations and particularly suitable for text simplification. The method requires a wide amount of texts that can be easily extracted from the web making it suitable also for less-resourced languages. 
We test it on the Italian language making available the biggest Italian corpus for automatic text simplification}, KEYWORDS = {Automatic Text Simplification, Sentence alignment, Italian corpus}, PAGES = {351-361}, URL = {https://www.aclweb.org/anthology/D/D16/D16-1034.pdf}, DOI = {10.18653/v1/d16-1034}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-945626-25-8}, CONFERENCE_NAME = {Conference on Empirical Methods in Natural Language Processing (EMNLP 2016)}, CONFERENCE_PLACE = {Stroudsburg}, } @INPROCEEDINGS{CIMINO_2016_INPROCEEDINGS_CD_333954, AUTHOR = {Cimino, A. and Dell'Orletta, F.}, TITLE = {Tandem LSTM-SVM approach for sentiment analysis}, YEAR = {2016}, ABSTRACT = {In this paper we describe our approach to EVALITA 2016 SENTIPOLC task. We participated in all the subtasks with constrained setting: Subjectivity Classification, Polarity Classification and Irony Detection. We developed a tandem architecture where Long Short Term Memory recurrent neural network is used to learn the feature space and to capture temporal dependencies, while the Support Vector Machines is used for classification. SVMs combine the document embedding produced by the LSTM with a wide set of general-purpose features qualifying the lexical and grammatical structure of the text. We achieved the second best accuracy in Subjectivity Classification, the third position in Polarity Classification, the sixth position in Irony Detection}, URL = {https://iris.cnr.it/handle/20.500.14243/333954}, } @INPROCEEDINGS{CIMINO_2016_INPROCEEDINGS_CD_333953, AUTHOR = {Cimino, A. and Dell'Orletta, F.}, TITLE = {Building the state-of-the-art in POS tagging of Italian Tweets}, YEAR = {2016}, ABSTRACT = {In this paper we describe our approach to EVALITA 2016 POS tagging for Italian Social Media Texts (PoSTWITA). 
We developed a two-branch bidirectional Long Short Term Memory recurrent neural network, where the first bi-LSTM uses a typical vector representation for the input words, while the second one uses a newly introduced word-vector representation able to encode information about the characters in the words avoiding the increasing of computational costs due to the hierarchical LSTM introduced by the character-based LSTM architectures. The vector representations calculated by the two LSTM are then merged by the sum operation. Even if participants were allowed to use other annotated resources in their systems, we used only the distributed data set to train our system. When evaluated on the official test set, our system outperformed all the other systems achieving the highest accuracy score in EVALITA 2016 PoSTWITA, with a tagging accuracy of 93.19\%. Further experiments carried out after the official evaluation period allowed us to develop a system able to achieve a higher accuracy. These experiments showed the central role played by the handcrafted features even when machine learning algorithms based on neural networks are used}, URL = {https://iris.cnr.it/handle/20.500.14243/333953}, } @INPROCEEDINGS{DELLORLETTA_2016_INPROCEEDINGS_DM_325820, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Esplorazioni computazionali nello spazio dell'interlingua: verso una nuova metodologia di indagine}, YEAR = {2016}, ABSTRACT = {Il presente contributo intende proporre un innovativo approccio all'identificazione delle caratteristiche linguistiche che aiutano a definire l'interlingua. 
Tale approccio consiste nella ricostruzione del profilo linguistico di corpora di produzioni scritte da apprendenti una lingua seconda basato su strumenti di trattamento automatico del linguaggio}, KEYWORDS = {interlingua, annotazione linguistica automatica, monitoraggio linguistico}, PAGES = {143-161}, URL = {https://www.bulzoni.it/it/catalogo/lingue-in-contatto-contact-linguistics.html}, PUBLISHER = {Bulzoni Editore (Roma, ITA)}, ISBN = {978-88-6897-029-1}, CONFERENCE_NAME = {XLVIII Congresso Internazionale di Studi della Società di Linguistica Italiana (SLI 2014)}, CONFERENCE_PLACE = {Roma}, } @INPROCEEDINGS{DELLORLETTA_2016_INPROCEEDINGS_DV_325815, AUTHOR = {Dell'Orletta, F. and Venturi, G.}, TITLE = {ULISSE: una strategia di adattamento al dominio per l'annotazione sintattica automatica}, YEAR = {2016}, ABSTRACT = {This paper deals with Domain Adaptation for automatic syntactic annotation. Until the half of the 1980s, automatic linguistic annotation was based on algorithms built on groups of hand-written rules, defined a priori on the basis of the knowledge of the system to formalise. Subsequently, thanks to the progress of research in the field of Artificial Intelligence and to the development of linguistic resources, algorithms based on machine learning techniques began to be employed. The major difficulties of those algorithms were due to certain aspects of natural language such as ambiguities, diachronic evolutions, or language variations from the original domain of knowledge. More specifically, the issue of Domain Adaptation can be put in the following terms: "can an annotated corpus [which is representative of a specific linguistic variety] be used for the syntactic analysis of a second corpus [which is representative of a different linguistic variety]?". 
The authors answer by presenting an algorithm called ULISSE (Unsupervised LInguistically-driven Selection of dEpendency parses), which selects in an optimal way the most representative sentences of a new target domain and feeds them to the parser in addition to the original training set}, KEYWORDS = {Domain Adaptation, annotazione sintattica automatica}, PAGES = {55-79}, URL = {http://www.italianlp.it/wp-content/uploads/2016/10/Compter_Parler_Soigner_ULISSE.pdf}, ISBN = {978-88-6952-038-9}, CONFERENCE_NAME = {Atti del convegno "Compter parler soigner: tra linguistica e intelligenza artificiale"}, } @INPROCEEDINGS{ORLETTI_2016_INPROCEEDINGS_ODI_333948, AUTHOR = {Orletti, F. and Dell'Orletta, F. and Iovino, R.}, TITLE = {La leggibilità dei testi di ambito medico rivolti al paziente: Il caso dei bugiardini di farmaci senza obbligo di prescrizione medica}, YEAR = {2016}, ABSTRACT = {In this paper we present the first results of an exploratory analysis of simplification of the package leaflets of medicines, considered representative texts of doctor-patient communication. It will be shown how natural language processing tools can be used to reconstruct the linguistic profile of these texts and to guide their simplification}, KEYWORDS = {leggibilità}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85009291162\&origin=inward}, VOLUME = {1749}, CONFERENCE_NAME = {Third Italian Conference on Computational Linguistics (CLiC-it)}, } @INPROCEEDINGS{PIERI_2016_INPROCEEDINGS_PBD_392599, AUTHOR = {Pieri, G. and Brunato, D. and Dell'Orletta, F.}, TITLE = {Studio sull'ordinamento dei costituenti nel confronto tra generi e complessità}, YEAR = {2016}, ABSTRACT = {In questo articolo presentiamo uno studio sull'ordine dei costituenti in italiano basato su corpora annotati in maniera automatica fino all'analisi sintattica a dipendenze. 
L'indagine comparativa ha permesso di valutare l'influenza sia del genere testuale sia della complessità linguistica nella distribuzione dei fenomeni di marcatezza sintattica}, URL = {https://iris.cnr.it/handle/20.500.14243/392599}, } @INPROCEEDINGS{PIERI_2016_INPROCEEDINGS_PBD_333949, AUTHOR = {Pieri, G. and Brunato, D. and Dell'Orletta, F.}, TITLE = {Studio sull'ordinamento dei costituenti nel confronto tra generi e complessità}, YEAR = {2016}, ABSTRACT = {In this paper we present a study on the order of constituents in Italian based on automatically dependency-parsed corpora. The comparative investigation has allowed to evaluate the influence of the textual genre and the linguistic complexity on the distribution of phenomena of syntactic markedness}, KEYWORDS = {Complessità linguistica, Corpora annotati, Generi testuali}, PAGES = {5}, URL = {http://ceur-ws.org/Vol-1749/paper44.pdf}, VOLUME = {1749}, CONFERENCE_NAME = {Third Italian Conference on Computational Linguistics (CLiC-it 2016)}, EDITOR = {Basile, P. and Corazza, A. and Cutugno, F. and Montemagni, S. and Nissim, M. and Patti, V. and Semeraro, G. and Sprugnoli, R.}, } @INPROCEEDINGS{TUSA_2016_INPROCEEDINGS_TDMV_325817, AUTHOR = {Tusa, E. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Dieci sfumature di marcatezza sintattica: Verso una nozione computazionale di complessità}, YEAR = {2016}, ABSTRACT = {In this work, we will investigate whether and to what extent algorithms typically used to assess the reliability of the output of syntactic parsers can be used to study the correlation between processing complexity and the linguistic notion of markedness. 
Although still preliminary, achieved results show the key role of features such as dependency direction and length in defining the markedness degrees of a given syntactic construction}, KEYWORDS = {marcatezza sintattica, complessità linguistica, annotazione linguistica automatica}, PAGES = {1-6}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85009279517\&origin=inward}, VOLUME = {1749}, CONFERENCE_NAME = {Italian Conference on Computational Linguistics (CLiC-it)}, } @ARTICLE{ATTARDI_2015_ARTICLE_ABBCDMPSS_333938, AUTHOR = {Attardi, G. and Basile, V. and Bosco, C. and Caselli, T. and Dell'Orletta, F. and Montemagni, S. and Patti, V. and Simi, M. and Sprugnoli, R.}, TITLE = {State of the Art Language Technologies for Italian: The EVALITA 2014 Perspective}, YEAR = {2015}, ABSTRACT = {Shared task evaluation campaigns represent a well established form of competitive evaluation, an important opportunity to propose and tackle new challenges for a specific research area and a way to foster the development of benchmarks, tools and resources. The advantages of this approach are evident in any experimental field, including the area of Natural Language Processing. An outlook on state-of-the-art language technologies for Italian can be obtained by reflecting on the results of the recently held workshop "Evaluation of NLP and Speech Tools for Italian", EVALITA 2014. 
The motivations underlying individual shared tasks, the level of knowledge and development achieved within each of them, the impact on applications, society and economy at large as well as directions for future research will be discussed from this perspective}, KEYWORDS = {Evaluation Campaign, Natural Language Processing, Dependency Parsing, Sentiment Analysis, Temporal Processing}, PAGES = {43-61}, URL = {https://iris.cnr.it/handle/20.500.14243/333938}, VOLUME = {9 (1)}, DOI = {10.3233/IA-150076}, ISSN = {1724-8035}, JOURNAL = {INTELLIGENZA ARTIFICIALE}, } @ARTICLE{BARBAGLI_2015_ARTICLE_BLDMV_322610, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Il ruolo delle tecnologie del linguaggio nel monitoraggio dell'evoluzione delle abilità di scrittura: primi risultati}, YEAR = {2015}, ABSTRACT = {L'ultimo decennio ha visto l'affermarsi a livello internazionale dell'uso di tecnologie del linguaggio per lo studio dei processi di apprendimento. Questo contributo riporta i primi e promettenti risultati di uno studio interdisciplinare che si è avvalso di metodi e tecniche di analisi propri della linguistica computazionale, della linguistica e della pedagogia sperimentale. 
Lo studio, finalizzato al monitoraggio dell'evoluzione del processo di apprendimento della lingua italiana, è stato condotto a partire dalle produzioni scritte di studenti della scuola secondaria di primo grado con strumenti di annotazione linguistica automatica e di estrazione di conoscenza e ha portato all'identificazione di un insieme di tratti qualificanti il processo di apprendimento linguistico}, KEYWORDS = {evoluzione delle competenze linguistiche, Didattica Sperimentale, Estrazione di conoscenza, Annotazione linguistica automatica}, PAGES = {99-117}, URL = {https://journals.openedition.org/ijcol/326}, DOI = {10.4000/ijcol.326}, ISSN = {2499-4553}, JOURNAL = {IJCOL}, } @INPROCEEDINGS{BARBAGLI_2015_INPROCEEDINGS_BLDMV_322147, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {CItA: un Corpus di Produzioni Scritte di Apprendenti l'Italiano L1 Annotato con Errori}, YEAR = {2015}, ABSTRACT = {In questo articolo presentiamo CItA il primo corpus di produzioni scritte di apprendenti l'italiano L1 del primo e del secondo anno della scuola secondaria di primo grado annotato con errori grammaticali, ortografici e lessicali. Le specificità del corpus e la sua natura diacronica lo rendono particolarmente utile sia per applicazioni linguistico-computazionali sia per studi socio-pedagogici}, KEYWORDS = {Apprendimento della lingua madre, evoluzione delle competenze linguistiche}, PAGES = {31-35}, URL = {http://www.italianlp.it/wp-content/uploads/2016/03/CItA_errori.pdf}, PUBLISHER = {Accademia University Press (Torino, ITA)}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {2nd Italian Conference on Computational Linguistics (CLiC-it)}, CONFERENCE_PLACE = {Torino}, } @INPROCEEDINGS{BRUNATO_2015_INPROCEEDINGS_BD_321093, AUTHOR = {Brunato, D. 
and Dell'Orletta, F.}, TITLE = {ISACCO: a corpus for investigating spoken and written language development in Italian school-age children}, YEAR = {2015}, ABSTRACT = {We present ISACCO (Italian school-age children corpus), a new corpus of oral and written retellings of Italian speaking children attending the primary school. All texts were digitalized and automatically enriched with linguistic information allowing preliminary explorations based on NLP features. Written retellings were also manually annotated with a typology of linguistic errors. The resource is conceived to support research and computational modeling of "later language acquisition", with an emphasis for comparative assessment of oral and written language skills across early school grades}, KEYWORDS = {Child language acquisition, Oral and written language, multi-level linguistic analysis}, PAGES = {62-66}, URL = {http://www.italianlp.it/wp-content/uploads/2016/03/IsaccoCorpus.pdf}, PUBLISHER = {Accademia University Press (Torino, ITA)}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {Second Italian Conference on Computational Linguistics (CLiC-it 2015)}, CONFERENCE_PLACE = {Torino}, BOOKTITLE = {Proceedings of the Second Italian Conference on Computational Linguistics (CLiC-it 2015)}, EDITOR = {Bosco, C. and Tonelli, S. and Zanzotto, F. M.}, } @INPROCEEDINGS{BRUNATO_2015_INPROCEEDINGS_BDVM_296574, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Design and Annotation of the First Italian Corpus for Text Simplification}, YEAR = {2015}, ABSTRACT = {In this paper, we present design and construction of the first Italian corpus for automatic and semi-automatic text simplification. In line with current approaches, we propose a new annotation scheme specifically conceived to identify the typology of changes an original sentence undergoes when it is manually simplified. 
Such a scheme has been applied to two aligned Italian corpora, containing original texts with corresponding simplified versions, selected as representative of two different manual simplification strategies and addressing different target reader populations. Each corpus was annotated with the operations foreseen in the annotation scheme, covering different levels of linguistic description. Annotation results were analysed with the final aim of capturing peculiarities and differences of the different simplification strategies pursued in the two corpora}, KEYWORDS = {Annotation Scheme, Automatic Text Simplification}, PAGES = {31-34}, URL = {https://aclweb.org/anthology/W/W15/W15-1604.pdf}, ISBN = {978-1-941643-47-1}, CONFERENCE_NAME = {Proceedings of LAW IX-The 9th Linguistic Annotation Workshop}, } @INPROCEEDINGS{CRESCI_2015_INPROCEEDINGS_CCDT_303899, AUTHOR = {Cresci, S. and Cimino, A. and Dell'Orletta, F. and Tesconi, M.}, TITLE = {Crisis Mapping during Natural Disasters via Text Analysis of Social Media Messages}, YEAR = {2015}, ABSTRACT = {Recent disasters demonstrated the central role of social media during emergencies thus motivating the exploitation of such data for crisis mapping. We propose a crisis mapping system that addresses limitations of current state-of-the-art approaches by analyzing the textual content of disaster reports from a twofold perspective. A damage detection component employs a SVM classifier to detect mentions of damage among emergency reports. A novel geoparsing technique is proposed and used to perform message geolocation. We report on a case study to show how the information extracted through damage detection and message geolocation can be combined to produce accurate crisis maps. 
Our crisis maps clearly detect both highly and lightly damaged areas, thus opening up the possibility to prioritize rescue efforts where they are most needed}, KEYWORDS = {crisis informatics, Emergency Management, geoparsing, social media mining, Twitter}, PAGES = {1-8}, URL = {https://iris.cnr.it/handle/20.500.14243/303899}, CONFERENCE_NAME = {Web Information Systems Engineering-WISE 2015}, } @INPROCEEDINGS{CRESCI_2015_INPROCEEDINGS_CTCD_271161, AUTHOR = {Cresci, S. and Tesconi, M. and Cimino, A. and Dell'Orletta, F.}, TITLE = {A Linguistically-driven Approach to Cross-Event Damage Assessment of Natural Disasters from Social Media Messages}, YEAR = {2015}, ABSTRACT = {This work focuses on the analysis of Italian social media messages for disaster management and aims at the detection of messages carrying critical information for the damage assessment task. A main novelty of this study consists in the focus on out-domain and cross-event damage detection, and on the investigation of the most relevant tweet-derived features for these tasks. We devised different experiments by resorting to a wide set of linguistic features qualifying the lexical and grammatical structure of a text as well as ad-hoc features specifically implemented for this task. We investigated the most effective features that allow to achieve the best results. A further result of this study is the construction of the first manually annotated Italian corpus of social media messages for damage assessment}, KEYWORDS = {crisis informatics, Damage assessment, Emergency Management, feature selection, social media mining, Social Sensing}, PAGES = {6}, URL = {https://iris.cnr.it/handle/20.500.14243/271161}, CONFERENCE_NAME = {Proceedings of the 24th international conference companion on World Wide Web. ACM, 2015}, } @INPROCEEDINGS{FERRARI_2015_INPROCEEDINGS_FSGD_310030, AUTHOR = {Ferrari, A. and Spagnolo, G. O. and Gnesi, S. 
and Dell'Orletta, F.}, TITLE = {CMT and FDE: tools to bridge the gap between natural language documents and feature diagrams}, YEAR = {2015}, ABSTRACT = {A business subject who wishes to enter an established technological market is required to accurately analyse the features of the products of the different competitors. Such features are normally accessible through natural language (NL) brochures, or NL Web pages, which describe the products to potential customers. Building a feature model that hierarchically summarises the different features available in competing products can bring relevant benefits in market analysis. A company can easily visualise existing features, and reason about aspects that are not covered by the available solutions. However, designing a feature model starting from publicly available documents of existing products is a time consuming and error-prone task. In this paper, we present two tools, namely Commonality Mining Tool (CMT) and Feature Diagram Editor (FDE), which can jointly support the feature model definition process. CMT allows mining common and variant features from NL descriptions of existing products, by leveraging a natural language processing (NLP) approach based on contrastive analysis, which allows identifying domain-relevant terms from NL documents. FDE takes the commonalities and variabilities extracted by CMT, and renders them in a visual form. Moreover, FDE allows the graphical design and refinement of the final feature model, by means of an intuitive GUI}, KEYWORDS = {Software Product Lines, Variability Mining, Tools}, PAGES = {402-410}, URL = {http://dl.acm.org/citation.cfm?doid=2791060.2791117}, DOI = {10.1145/2791060.2791117}, ISBN = {978-1-4503-3613-0}, CONFERENCE_NAME = {19th International Conference on Software Product Line}, } @INPROCEEDINGS{RICHTER_2015_INPROCEEDINGS_RCDV_322145, AUTHOR = {Richter, S. and Cimino, A. and Dell'Orletta, F. 
and Venturi, G.}, TITLE = {Tracking the Evolution of Written Language Competence: an NLP-based Approach}, YEAR = {2015}, ABSTRACT = {In this paper, we present an NLP-based innovative approach for tracking the evolution of written language competence relying on different sets of linguistic features that predict text quality. This approach was tested on a corpus of essays written by Italian L1 learners of the first and second year of the lower secondary school}, KEYWORDS = {Evolution of Written Language Competence, multi-level linguistic analysis}, PAGES = {236-240}, URL = {http://www.italianlp.it/wp-content/uploads/2016/03/tracking-language-competence.pdf}, PUBLISHER = {Accademia University Press (Torino, ITA)}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {2nd Italian Conference on Computational Linguistics (CLiC-it)}, CONFERENCE_PLACE = {Torino}, } @INPROCEEDINGS{SPRUGNOLI_2015_INPROCEEDINGS_SDCMB_333943, AUTHOR = {Sprugnoli, R. and Dell'Orletta, F. and Caselli, T. and Montemagni, S. and Bosco, C.}, TITLE = {Parsing Events: a New Perspective on Old Challenges}, YEAR = {2015}, ABSTRACT = {The paper proposes a new evaluation exercise, meant to shed light on the syntax-semantics interface for the analysis of written Italian and resulting from the combination of the EVALITA 2014 dependency parsing and event extraction tasks. It aims at investigating the cross-fertilization of tasks, generating a new resource combining dependency and event annotations, and devising metrics able to evaluate the applicative impact of the achieved results}, URL = {https://iris.cnr.it/handle/20.500.14243/333943}, } @INPROCEEDINGS{VENTURI_2015_INPROCEEDINGS_VBDM_304237, AUTHOR = {Venturi, G. and Bellandi, T. and Dell'Orletta, F. 
and Montemagni, S.}, TITLE = {NLP-Based Readability Assessment of Health-Related Texts: a Case Study on Italian Informed Consent Forms}, YEAR = {2015}, ABSTRACT = {The paper illustrates the results of a case study aimed at investigating and enhancing the accessibility of Italian health-related documents by relying on advanced NLP techniques, with particular attention to informed consent forms. Results achieved show that the features automatically extracted from the linguistically annotated text and ranging across different levels of linguistic description have a high discriminative power in order to guarantee a reliable readability assessment}, KEYWORDS = {Readability assessment, health-related information}, PAGES = {131-141}, URL = {http://www.aclweb.org/anthology/W15-2618}, ISBN = {978-1-941643-32-7}, CONFERENCE_NAME = {Sixth International Workshop on Health Text Mining and Information Analysis (Louhi)}, } @ARTICLE{DELLORLETTA_2014_ARTICLE_DMV_260898, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Assessing document and sentence readability in less resourced languages and across textual genres}, YEAR = {2014}, ABSTRACT = {In this paper, we tackle three underresearched issues of the automatic readability assessment literature, namely the evaluation of text readability in less resourced languages, with respect to sentences (as opposed to documents) as well as across textual genres. Different solutions to these issues have been tested by using and refining READ-IT, the first advanced readability assessment tool for Italian, which combines traditional raw text features with lexical, morpho-syntactic and syntactic information. In READ-IT readability assessment is carried out with respect to both documents and sentences, with the latter constituting an important novelty of the proposed approach: READ-IT shows a high accuracy in the document classification task and promising results in the sentence classification scenario. 
By comparing the results of two versions of READ-IT, adopting a classification-versus ranking-based approach, we also show that readability assessment is strongly influenced by textual genre; for this reason a genre-oriented notion of readability is needed. With classification-based approaches, reliable results can only be achieved with genre-specific models: Since this is far from being a workable solution, especially for less resourced languages, a new ranking method for readability assessment is proposed, based on the notion of distance}, KEYWORDS = {readability assessment, less resourced languages, multi-level linguistic annotation, textual genres}, PAGES = {163-193}, URL = {http://www.ingentaconnect.com/content/jbp/itl/2014/00000165/00000002/art00005}, VOLUME = {165 (2)}, DOI = {10.1075/itl.165.2.03del}, } @EDITORIAL{BOSCO_2014_EDITORIAL_BCDFMS_297502, AUTHOR = {Bosco, C. and Cosi, P. and Dell'Orletta, F. and Falcone, M. and Montemagni, S. and Simi, M.}, TITLE = {Proceedings of the Fourth International Workshop EVALITA 2014}, YEAR = {2014}, KEYWORDS = {Trattamento Automatico del Linguaggio, Speech Processing, Lingua Italiana}, PAGES = {167}, URL = {http://clic.humnet.unipi.it/proceedings/Proceedings-EVALITA-2014.pdf}, PUBLISHER = {Pisa University Press (Pisa, ITA)}, ISBN = {978-88-67414-72-7}, CONFERENCE_PLACE = {Pisa}, } @INPROCEEDINGS{BARBAGLI_2014_INPROCEEDINGS_BLDMV_266268, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Tecnologie del linguaggio e monitoraggio dell'evoluzione delle abilità di scrittura nella scuola secondaria di primo grado}, YEAR = {2014}, ABSTRACT = {Over the last ten years, the use of language technologies was successfully extended to the study of learning processes. 
The paper reports the first results of a study, which is part of a broader experimental pedagogy project, aimed at monitoring the evolution of the learning process of the Italian language based on a corpus of written productions by students and exploiting automatic linguistic annotation and knowledge extraction tools}, PAGES = {23-27}, URL = {http://www.italianlp.it/wp-content/uploads/2014/12/Tecnologie-del-linguaggio-per-la-scuola.pdf}, DOI = {10.12871/CLICIT201415}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-8-86741-472-7}, CONFERENCE_NAME = {First Italian Conference on Computational Linguistics (CLiC-it 2014)}, CONFERENCE_PLACE = {Pisa}, BOOKTITLE = {Proceedings of the First Italian Conference on Computational Linguistics (CLiC-it 2014)}, EDITOR = {Basili, R. and Lenci, A. and Magnini, B.}, } @INPROCEEDINGS{BOSCHETTI_2014_INPROCEEDINGS_BCDLPPVM_228541, AUTHOR = {Boschetti, F. and Cimino, A. and Dell'Orletta, F. and Lebani, G. E. and Passaro, L. and Picchi, P. and Venturi, G. and Montemagni, S. L. A.}, TITLE = {Computational Analysis of Historical Documents: An Application to Italian War Bulletins in World War I and II}, YEAR = {2014}, ABSTRACT = {World War (WW) I and II represent crucial landmarks in the history of mankind: They have affected the destiny of whole generations and their consequences are still alive throughout Europe. In this paper we present an ongoing project to carry out a computational analysis of Italian war bulletins in WWI and WWII, by applying state-of-the-art tools for NLP and Information Extraction. 
The annotated texts and extracted information will be explored with a dedicated Web interface, allowing for multidimensional access and exploration of historical events through space and time}, KEYWORDS = {World War I}, PAGES = {70-75}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/workshops/LREC2014Workshop-LRT4HDA%20Proceedings.pdf}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, CONFERENCE_NAME = {LREC 2014}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of workshop on Language resources and technologies for processing and linking historical documents and archives-Deploying Linked Open Data in Cultural Heritage-LREC 2014, 26 May, Reykjavik, Iceland}, } @INPROCEEDINGS{BRUNATO_2014_INPROCEEDINGS_BDVM_266263, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Defining an annotation scheme with a view to automatic text simplification}, YEAR = {2014}, ABSTRACT = {This paper presents the preliminary steps of ongoing research in the field of automatic text simplification. In line with current approaches, we propose here a new annotation scheme specifically conceived to identify the typologies of changes an original sentence undergoes when it is manually simplified. Such a scheme has been tested on a parallel corpus available for Italian, which we have first aligned at sentence level and then annotated with simplification rules}, PAGES = {87-92}, URL = {http://www.italianlp.it/wp-content/uploads/2014/12/Text-simplification.pdf}, DOI = {10.12871/CLICIT2014118}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-8-86741-472-7}, CONFERENCE_NAME = {First Italian Conference on Computational Linguistics (CLiC-it 2014)}, CONFERENCE_PLACE = {Pisa}, BOOKTITLE = {Proceedings of the First Italian Conference on Computational Linguistics (CLiC-it 2014)}, EDITOR = {Basili, R. and Lenci, A. and Magnini, B.}, } @INPROCEEDINGS{CIMINO_2014_INPROCEEDINGS_CCDT_255596, AUTHOR = {Cimino, A. 
and Cresci, S. and Dell'Orletta, F. and Tesconi, M.}, TITLE = {Linguistically-motivated and Lexicon Features for Sentiment Analysis of Italian Tweets}, YEAR = {2014}, ABSTRACT = {In this paper we describe our approach to EVALITA 2014 SENTIment POLarity Classification (SENTIPOLC) task. We participated only in the Polarity Classification sub-task. By resorting to a wide set of general-purpose features qualifying the lexical and grammatical structure of a text, automatically created ad-hoc lexicons and existing free available resources, we achieved the second best accuracy}, KEYWORDS = {Lexicons resources}, URL = {https://iris.cnr.it/handle/20.500.14243/255596}, CONFERENCE_NAME = {The 4th Conference for Evaluation of NLP and Speech Tools for Italian (EVALITA)}, } @INPROCEEDINGS{DELLORLETTA_2014_INPROCEEDINGS_DVCM_226944, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Cimino, A. and Montemagni, S.}, TITLE = {T2K: a System for Automatically Extracting and Organizing Knowledge from Texts}, YEAR = {2014}, ABSTRACT = {In this paper, we present T2K, a suite of tools for automatically extracting domain-specific knowledge from collections of Italian and English texts. T2K (Text-To-Knowledge v2) relies on a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine learning which are dynamically integrated to provide an accurate and incremental representation of the content of vast repositories of unstructured documents. Extracted knowledge ranges from domain-specific entities and named entities to the relations connecting them and can be used for indexing document collections with respect to different information types. T2K also includes "linguistic profiling" functionalities aimed at supporting the user in constructing the acquisition corpus, e. g. in selecting texts belonging to the same genre or characterized by the same degree of specialization or in monitoring the "added value" of newly inserted documents. 
T2K is a web application which can be accessed from any browser through a personal account which has been tested in a wide range of domains}, KEYWORDS = {Natural Language Processing, Information Extraction, Knowledge Management}, PAGES = {2062-2070}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/590_Paper.pdf}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {International Conference on Language Resources and Evaluation (LREC)}, } @INPROCEEDINGS{DELLORLETTA_2014_INPROCEEDINGS_DWCVM_266274, AUTHOR = {Dell'Orletta, F. and Wieling, M. and Cimino, A. and Venturi, G. and Montemagni, S.}, TITLE = {Assessing the readability of sentences: which corpora and features?}, YEAR = {2014}, ABSTRACT = {The paper investigates the problem of sentence readability assessment, which is modelled as a classification task, with a specific view to text simplification. In particular, it addresses two open issues connected with it, i. e. the corpora to be used for training, and the identification of the most effective features to determine sentence readability. An existing readability assessment tool developed for Italian was specialized at the level of training corpus and learning algorithm. A maximum entropy-based feature selection and ranking algorithm (grafting) was used to identify the most relevant features: it turned out that assessing the readability of sentences is a complex task, requiring a high number of features, mainly syntactic ones}, PAGES = {163-173}, URL = {http://acl2014.org/acl2014/W14-18/pdf/W14-1820.pdf}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-941643-03-7}, CONFERENCE_NAME = {9th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2014)}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Proceedings of 9th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2014)}, } @INPROCEEDINGS{FERRARI_2014_INPROCEEDINGS_FDSG_261380, AUTHOR = {Ferrari, A. 
and Dell'Orletta, F. and Spagnolo, G. and Gnesi, S.}, TITLE = {Measuring and improving the completeness of natural language requirements}, YEAR = {2014}, ABSTRACT = {[Context and motivation] System requirements specifications are normally written in natural language. These documents are required to be complete with respect to the input documents of the requirements definition phase, such as preliminary specifications, transcripts of meetings with the customers, etc. In other terms, they shall include all the relevant concepts and all the relevant interactions among concepts expressed in the input documents. [Question/Problem] Means are required to measure and improve the completeness of the requirements with respect to the input documents. [Principal idea/results] To measure this completeness, we propose two metrics that take into account the relevant terms of the input documents, and the relevant relationships among terms. Furthermore, to improve the completeness, we present a natural language processing tool named Completeness Assistant for Requirements (CAR), which supports the definition of the requirements: the tool helps the requirements engineer in discovering relevant concepts and interactions. [Contribution] We have performed a pilot test with CAR, which shows that the tool can help improving the completeness of the requirements with respect to the input documents. The study has also shown that CAR is actually useful in the identification of specific/alternative system behaviours that might be overseen without the tool. © 2014 Springer International Publishing Switzerland}, KEYWORDS = {natural language processing, relation extraction, Requirements analysis}, PAGES = {23-38}, URL = {https://link.springer.com/chapter/10.1007%2F978-3-319-05843-6_3#citeas}, DOI = {10.1007/978-3-319-05843-6_3}, ISBN = {978-3-319-05843-6}, CONFERENCE_NAME = {REFSQ 2014, Requirements Engineering: Foundation for Software Quality. 
20th International Working Conference}, BOOKTITLE = {Requirements Engineering: Foundation for Software Quality 20th International Working Conference, REFSQ 2014, Essen, Germany, April 7-10, 2014. Proceedings}, EDITOR = {Salinesi, C. and Van De Weerd, I.}, } @INPROCEEDINGS{LYDING_2014_INPROCEEDINGS_LSBBCDDLP_261825, AUTHOR = {Lyding, V. and Stemle, E. and Borghetti, C. and Brunello, M. and Castagnoli, S. and Dell'Orletta, F. and Dittmann, H. and Lenci, A. and Pirrelli, V.}, TITLE = {The PAISÀ Corpus of Italian Web Texts}, YEAR = {2014}, ABSTRACT = {PAISÀ is a Creative Commons licensed, large web corpus of contemporary Italian. We describe the design, harvesting, and processing steps involved in its creation}, KEYWORDS = {Corpus annotation, Tree-bank, Corpus design, Corpus harvesting}, PAGES = {36-43}, URL = {http://aclweb.org/anthology/W14-04}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, CONFERENCE_NAME = {9th Web as Corpus Workshop (WaC-9)}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Proceedings of the 9th Web as Corpus Workshop (WaC-9)}, EDITOR = {Bildhauer, F. and Schäfer, R.}, } @ARTICLE{DELLORLETTA_2013_ARTICLE_DVM_280032, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Linguistically-driven selection of correct arcs for dependency parsing}, YEAR = {2013}, ABSTRACT = {LISCA is an unsupervised algorithm aimed at assigning a quality score to each arc generated by a dependency parser in order to produce a decreasing ranking of arcs from correct to incorrect ones. LISCA exploits statistics about a set of linguistically-motivated and dependency-based features extracted from a large corpus of automatically parsed sentences and uses them to assign a quality score to each arc of a parsed sentence belonging to the same domain of the automatically parsed corpus. 
LISCA has been successfully tested on two datasets belonging to two different domains and in all experiments it turned out to outperform different baselines, thus showing to be able to reliably detect correct arcs also representing domain-specific peculiarities}, KEYWORDS = {Correct arcs, Dependency parsing}, PAGES = {125-136}, URL = {http://cys.cic.ipn.mx/ojs/index.php/CyS/article/view/1517}, VOLUME = {17 (2)}, ISSN = {1405-5546}, JOURNAL = {COMPUTACIÓN Y SISTEMAS}, } @ARTICLE{FANTONI_2013_ARTICLE_FADM_226945, AUTHOR = {Fantoni, G. and Apreda, R. and Dell'Orletta, F. and Monge, M.}, TITLE = {Automatic extraction of function-behaviour-state information from patents}, YEAR = {2013}, ABSTRACT = {Patents contain a large quantity of technical information not available elsewhere and therefore very interesting for both academia and industry. The purpose of the research is to try to detect and extract information about the functions, the physical behaviours and the states of the system directly from the text of a patent in an automatic way. The above three categories constitute a well-known set of relevant entities in the theory of engineering design, and their study allows powerful analysis of individual artefacts as well as that of groups of products or technologies. The focus is in providing a handy tool that could speed up and facilitate human analysis and allow tackling also large corpora of documents. A second goal is to develop a protocol based on free software and database resources, so that it could be replicable with limited effort by everyone without having to rely on commercial databases. Extracting technical and design information from a document whose aim is more legal than technical, and that is written using a specific jargon, is not a trivial task. The approach chosen to overcome the various issues is to support state-of-the-art Computational Linguistic tools with a large Knowledge Base. 
The latter has been constructed both manually and automatically and comprises not only keywords but also concepts, relationships and regular expressions. A case study about a very recent patent describing a mechanical device has been included to show the functioning and output of the entire system. © 2013 Elsevier Ltd. All rights reserved}, KEYWORDS = {Function-Behaviour-Structure, Pa, Product development, Semantic elaboration}, PAGES = {317-334}, URL = {http://www.sciencedirect.com/science/article/pii/S1474034613000487}, VOLUME = {27 (3)}, DOI = {10.1016/j.aei.2013.04.004}, ISSN = {1474-0346}, JOURNAL = {ADVANCED ENGINEERING INFORMATICS}, } @INCOLLECTION{DELLORLETTA_2013_INCOLLECTION_DMMVAF_220446, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Venturi, G. and Agnoloni, T. and Francesconi, E.}, TITLE = {Domain Adaptation for Dependency Parsing at EVALITA 2011}, YEAR = {2013}, ABSTRACT = {The domain adaptation task was aimed at investigating techniques for adapting state-of-the-art dependency parsing systems to new domains. Both the language dealt with, i. e. Italian, and the target domain, namely the legal domain, represent two main novelties of the task organised at Evalita 2011 with respect to previous domain adaptation initiatives. In this paper, we define the task and describe how the datasets were created from different resources. In addition, we characterize the different approaches of the participating systems, report the test results, and provide a first analysis of these results}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Self-training, Active Learning, Legal-NLP}, PAGES = {58-69}, URL = {https://iris.cnr.it/handle/20.500.14243/220446}, PUBLISHER = {Springer (Berlin Heidelberg, DEU)}, ISBN = {978-3-642-35827-2}, CONFERENCE_PLACE = {Berlin Heidelberg}, BOOKTITLE = {Evaluation of NLP and Speech Tools for Italian}, EDITOR = {Magnini, B. and Cutugno, F. and Falcone, M. 
and Pianta, E.}, } @INPROCEEDINGS{CIMINO_2013_INPROCEEDINGS_CDVM_227043, AUTHOR = {Cimino, A. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Linguistic Profiling based on General-purpose Features and Native Language Identification}, YEAR = {2013}, ABSTRACT = {In this paper, we describe our approach to native language identification and discuss the results we submitted as participants to the First NLI Shared Task. By resorting to a wide set of general-purpose features qualifying the lexical and grammatical structure of a text, rather than to ad hoc features specifically selected for the NLI task, we achieved encouraging results, which show that the proposed approach is general-purpose and portable across different tasks, domains and languages}, KEYWORDS = {Native Language Identification, Linguistic Profiling}, PAGES = {207-215}, URL = {http://www.aclweb.org/anthology/W13-1727}, ISBN = {978-1-937284-47-3}, CONFERENCE_NAME = {8th workshop on "Innovative Use of NLP for Building Educational Applications"}, } @INPROCEEDINGS{DELLORLETTA_2013_INPROCEEDINGS_DVM_227044, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Unsupervised Linguistically-Driven Reliable Dependency Parses Detection and Self-Training for Adaptation to the Biomedical Domain}, YEAR = {2013}, ABSTRACT = {In this paper, a new self-training method for domain adaptation is illustrated, where the selection of reliable parses is carried out by an unsupervised linguistically-driven algorithm, ULISSE. 
The method has been tested on biomedical texts with results showing a significant improvement with respect to considered baselines, which demonstrates its ability to capture both reliability of parses and domain-specificity of linguistic constructions}, KEYWORDS = {Self-training, Domain Adaptation, Biomedical Texts}, PAGES = {45-53}, URL = {http://www.aclweb.org/anthology/W13-1906}, ISBN = {978-1-937284-55-8}, CONFERENCE_NAME = {12th workshop on "Biomedical Natural Language Processing" (BioNLP)}, } @INPROCEEDINGS{FERRARI_2013_INPROCEEDINGS_FSD_253217, AUTHOR = {Ferrari, A. and Spagnolo, G. and Dell'Orletta, F.}, TITLE = {Mining commonalities and variabilities from natural language documents}, YEAR = {2013}, ABSTRACT = {A company who wishes to enter an established market with a new, competitive product is required to analyse the product solutions of the competitors. Identifying and comparing the features provided by the other vendors might greatly help during the market analysis. However, mining common and variant features from the publicly available documents of the competitors is a time consuming and error-prone task. In this paper, we suggest to employ a natural language processing approach based on contrastive analysis to identify commonalities and variabilities from the brochures of a group of vendors. We present a first step towards a practical application of the approach, in the context of the market of Communications-Based Train Control (CBTC) systems}, KEYWORDS = {Software Product Lines, Variability Mining, CBTC, D.2 SOFTWARE ENGINEERING, 68N30}, PAGES = {116-120}, URL = {http://dl.acm.org/citation.cfm?id=2491634}, ISBN = {978-1-4503-1968-3}, CONFERENCE_NAME = {SPLC 2013-17th International Software Product Line Conference}, EDITOR = {Kishi, T.}, } @INPROCEEDINGS{BONIN_2012_INPROCEEDINGS_BDMV_289376, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. 
and Venturi, G.}, TITLE = {Lessico settoriale e lessico comune nell'estrazione di terminologia specialistica da corpora di dominio}, YEAR = {2012}, PAGES = {207-220}, URL = {https://iris.cnr.it/handle/20.500.14243/289376}, PUBLISHER = {Bulzoni Editore (Roma, ITA)}, ISBN = {978-88-7870-655-2}, CONFERENCE_NAME = {XLIV congresso internazionale di studi della società di linguistica italiana}, CONFERENCE_PLACE = {Roma}, BOOKTITLE = {Lessico e Lessicologia. Atti del XLIV congresso internazionale di studi della società di linguistica italiana}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMMPV_5141, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Plank, B. and Venturi, G.}, TITLE = {The SPLeT-2012 Shared Task on Dependency Parsing of Legal Texts}, YEAR = {2012}, ABSTRACT = {The 4th Workshop on "Semantic Processing of Legal Texts" (SPLeT-2012) presents the first multilingual shared task on Dependency Parsing of Legal Texts. In this paper, we define the general task and its internal organization into sub-tasks, describe the datasets and the domain-specific linguistic peculiarities characterizing them. We finally report the results achieved by the participating systems, describe the underlying approaches and provide a first analysis of the final test results}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Legal Text Processing}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/workshops/27.LREC%202012%20Workshop%20Proceedings%20SPLeT.pdf}, CONFERENCE_NAME = {Fourth Workshop on Semantic Processing of Legal Texts (SPLeT 2012)-First Shared Task on Dependency Parsing of Legal Texts (SPLeT 2012)}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMMVAF_5136, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Venturi, G. and Agnoloni, T. 
and Francesconi, E.}, TITLE = {Domain Adaptation for Dependency Parsing at Evalita 2011}, YEAR = {2012}, ABSTRACT = {The domain adaptation task was aimed at investigating techniques for adapting state-of-the-art dependency parsing systems to new domains. Both the language dealt with, i. e. Italian, and the target domain, namely the legal domain, represent two main novelties of the task organised at Evalita 2011. In this paper, we define the task and describe how the datasets were created from different resources. In addition, we characterize the different approaches of the participating systems, report the test results, and provide a first analysis of these results}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Legal Text Processing}, PAGES = {1-7}, URL = {http://www.evalita.it/sites/evalita.fbk.eu/files/working_notes2011/Domain_Adaptation/}, CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian (EVALITA 2011): Domain Adaptation track}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DM_349404, AUTHOR = {Dell'Orletta, F. and Montemagni, S.}, TITLE = {Tecnologie linguistico-computazionali per la valutazione delle competenze linguistiche in ambito scolastico}, YEAR = {2012}, ABSTRACT = {Se da un lato le tecnologie linguistico-computazionali svolgono un ruolo ormai indiscusso per l'accesso al contenuto testuale, sia esso rappresentato dalla conoscenza specifica di un dominio oppure dalla conoscenza linguistica sottostante (es. collocazioni, strutture argomentali, relazioni semantico-lessicali tra parole, ecc.), ciò non appare scontato quando si vada a considerare il loro ruolo nella valutazione della competenza linguistica di apprendenti. 
La presente comunicazione intende indagare questo interrogativo, in particolare se e in che misura le tecnologie linguistico-computazionali possano costituire un valido ausilio nella valutazione della competenza linguistica italiana di studenti in ambito scolastico}, KEYWORDS = {tecnologie linguistico-computazionali, competenze linguistiche in ambito scolastico}, PAGES = {343-359}, URL = {https://iris.cnr.it/handle/20.500.14243/349404}, PUBLISHER = {Bulzoni Editore (Roma, ITA)}, CONFERENCE_NAME = {XLIV congresso internazionale di studi della società di linguistica italiana}, CONFERENCE_PLACE = {Roma}, BOOKTITLE = {Linguistica Educativa. Atti del XLIV congresso internazionale di studi della società di linguistica italiana}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMV_266008, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Genre-oriented Readability Assessment: a Case Study}, YEAR = {2012}, URL = {https://iris.cnr.it/handle/20.500.14243/266008}, } @INCOLLECTION{DELLORLETTA_2011_INCOLLECTION_DMVV_94003, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Vecchi, E. M. and Venturi, G.}, TITLE = {Tecnologie linguistico-computazionali per il monitoraggio della competenza linguistica italiana degli alunni stranieri nella scuola primaria e secondaria}, YEAR = {2011}, ABSTRACT = {La possibilità di disporre di tecnologie avanzate e innovative che permettano di monitorare la competenza linguistica degli alunni stranieri e, al contempo, valutare l'adeguatezza dei materiali didattici a loro offerti può essere di supporto all'insegnante nell'orientare la propria azione formativa, rendendo così il processo di integrazione linguistico-culturale meno faticoso e traumatico. 
In tale ottica, questo studio, realizzato col supporto di una piattaforma ormai consolidata di metodi e strumenti per il trattamento automatico dell'italiano, costituisce il primo tentativo condotto in relazione alla lingua italiana, per mettere a punto una metodologia di monitoraggio linguistico rivolta specificamente agli studenti apprendenti la lingua italiana come L2 ed alle loro produzioni scritte}, KEYWORDS = {Trattamento Automatico del Linguaggio, Stranieri, Lingua italiana}, PAGES = {319-336}, URL = {https://iris.cnr.it/handle/20.500.14243/94003}, PUBLISHER = {Mc Graw-Hill (Milano, ITA)}, ISBN = {978-88-386-7296-5}, CONFERENCE_PLACE = {Milano}, BOOKTITLE = {Percorsi Migranti}, EDITOR = {Bruno, G. C. and Caruso, I. and Sanna, M. and Vellecco, I.}, } @INPROCEEDINGS{DELLORLETTA_2011_INPROCEEDINGS_DMV_214930, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {READ-IT: assessing readability of Italian texts with a view to text simplification}, YEAR = {2011}, ABSTRACT = {In this paper, we propose a new approach to readability assessment with a specific view to the task of text simplification: the intended audience includes people with low literacy skills and/or with mild cognitive impairment. READ-IT represents the first advanced readability assessment tool for what concerns Italian, which combines traditional raw text features with lexical, morpho-syntactic and syntactic information. In READ-IT readability assessment is carried out with respect to both documents and sentences where the latter represents an important novelty of the proposed approach creating the prerequisites for aligning the readability assessment step with the text simplification process. 
READ-IT shows a high accuracy in the document classification task and promising results in the sentence classification scenario}, KEYWORDS = {Readability Assessment, Text Simplification}, PAGES = {73-83}, URL = {http://dl.acm.org/citation.cfm?id=2140511}, ISBN = {978-1-937284-14-5}, CONFERENCE_NAME = {SLPAT '11 Proceedings of the Second Workshop on Speech and Language Processing for Assistive Technologies}, } @INPROCEEDINGS{DELLORLETTA_2011_INPROCEEDINGS_DVM_214925, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {ULISSE: an unsupervised algorithm for detecting reliable dependency parses}, YEAR = {2011}, ABSTRACT = {In this paper we present ULISSE, an unsupervised linguistically-driven algorithm to select reliable parses from the output of a dependency parser. Different experiments were devised to show that the algorithm is robust enough to deal with the output of different parsers and with different languages, as well as to be used across different domains. In all cases, ULISSE appears to outperform the baseline algorithms}, KEYWORDS = {Dependency Parsing, Selection of Reliable Parses, Unsupervised Algorithm}, PAGES = {115-124}, URL = {http://dl.acm.org/citation.cfm?id=2018950}, ISBN = {978-1-932432-92-3}, CONFERENCE_NAME = {CoNLL '11 Proceedings of the Fifteenth Conference on Computational Natural Language Learning}, } @MISC{DELLORLETTA_2011_MISC_DM_217961, AUTHOR = {Dell'Orletta, F. and Montemagni, S.}, TITLE = {Towards an NLP-based approach for measuring syntactic complexity: preliminary experiments with Italian texts from different registers}, YEAR = {2011}, ABSTRACT = {In this paper, we explore how NLP can be used to automatically identify relevant syntactic complexity features in texts with the aim of assessing their correlation with specific linguistic registers. Our final goal is twofold. 
On the one hand, we demonstrate that automatic morpho-syntactic and syntactic annotation of texts provides sufficiently accurate output for use in the automatic extraction and measurement of syntactic complexity features. On the other hand, we identify the set of syntactic features strongly correlating with considered linguistic registers}, KEYWORDS = {Language Variation, Natural Language Processing, Syntactic Complexity}, URL = {http://www.benszm.net/BSBWWS/Dellorletta_Montemagni.pdf}, CONFERENCE_NAME = {Workshop on "Cross-linguistic and language-internal variation in text and speech: focus on the joint analysis of multiple characteristics"}, } @INPROCEEDINGS{BONIN_2010_INPROCEEDINGS_BDMV_65162, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {A Contrastive Approach to Multi-word Extraction from Domain-specific Corpora}, YEAR = {2010}, ABSTRACT = {In this paper we present a novel approach to multi-word terminology extraction combining a well-known automatic term recognition approach, the C-NC value method, with a contrastive ranking technique, aimed at refining obtained results either by filtering noise due to common words or by discerning between semantically different types of terms within heterogeneous terminologies. The proposed methodology has been tested in two case studies carried out in the History of Art and Legal domains with promising results}, KEYWORDS = {Terminology Extraction, Domain-specific Corpora, Multi-word Expression}, PAGES = {3222-3229}, URL = {https://iris.cnr.it/handle/20.500.14243/65162}, ISBN = {2-9517408-6-7}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, } @INPROCEEDINGS{BONIN_2010_INPROCEEDINGS_BDVM_65168, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Venturi, G. 
and Montemagni, S.}, TITLE = {Contrastive filtering of domain specific multi-word terms from different types of corpora}, YEAR = {2010}, ABSTRACT = {In this paper we tackle the challenging task of Multi-word term (MWT) extraction from different types of specialized corpora. Contrastive filtering of previously extracted MWTs results in a considerable increment of acquired domain-specific terms}, KEYWORDS = {multi-word terms extraction, co}, PAGES = {76-79}, URL = {https://iris.cnr.it/handle/20.500.14243/65168}, ISBN = {978-7-900268-00-6}, CONFERENCE_NAME = {The 23rd International Conference on Computational Linguistics (COLING 2010). Multiword Expressions: from Theory to Applications (MWE 2010)}, } @INPROCEEDINGS{BOSCO_2010_INPROCEEDINGS_BMMDL_65165, AUTHOR = {Bosco, C. and Montemagni, S. and Mazzei, A. and Dell'Orletta, F. and Lenci, A.}, TITLE = {Evalita'09 Parsing Task: comparing dependency parsers and treebanks}, YEAR = {2010}, KEYWORDS = {dependency parsing, dependency}, URL = {https://iris.cnr.it/handle/20.500.14243/65165}, CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian. EVALITA 2009}, } @INPROCEEDINGS{BOSCO_2010_INPROCEEDINGS_BMMLDLLASLHNN_65156, AUTHOR = {Bosco, C. and Montemagni, S. and Mazzei, A. and Lombardo, V. and Dell'Orletta, F. and Lenci, A. and Lesmo, L. and Attardi, G. and Simi, M. and Lavelli, A. and Hall, J. and Nilsson, J. and Nivre, J.}, TITLE = {Comparing the Influence of Different Treebank Annotations on Dependency Parsing}, YEAR = {2010}, KEYWORDS = {Parsing, Corpus (creation, annotation, etc.), Evaluation methodologies}, URL = {https://iris.cnr.it/handle/20.500.14243/65156}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, } @INPROCEEDINGS{PASSAROTTI_2010_INPROCEEDINGS_PD_65148, AUTHOR = {Passarotti, M. and Dell'Orletta, F.}, TITLE = {Improvements in Parsing the Index Thomisticus Treebank. 
Revision, Combination and a Feature Model for Medieval Latin}, YEAR = {2010}, KEYWORDS = {Parsing, Corpus (creation, annotation, etc.)}, URL = {https://iris.cnr.it/handle/20.500.14243/65148}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, } @TECHREPORT{PIRRELLI_2010_TECHREPORT_PLMDGM_353288, AUTHOR = {Pirrelli, V. and Lenci, A. and Montemagni, S. and Dell'Orletta, F. and Giovannetti, E. and Marchi, S.}, TITLE = {ConnectToLife (modulo semantico)-Rapporto tecnico finale}, YEAR = {2010}, ABSTRACT = {Il presente documento costituisce il rapporto tecnico finale del progetto Connect-To-Life (modulo semantico) relativo alle attività svolte dall'unità ILC-CNR}, KEYWORDS = {annotazione linguistica, estrazione di termini, clustering semantico, trattamento automatico della lingua, costruzione di ontologie}, PAGES = {16}, URL = {https://iris.cnr.it/handle/20.500.14243/353288}, } @MISC{BONIN_2010_MISC_BDMV_106763, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Lessico settoriale e lessico comune nell'estrazione di terminologia specialistica da corpora di dominio}, YEAR = {2010}, KEYWORDS = {Automatic Term Extraction}, URL = {https://iris.cnr.it/handle/20.500.14243/106763}, CONFERENCE_NAME = {XLIV Congresso Internazionale di Studi della Società di Linguistica Italiana}, } @MISC{DELLORLETTA_2010_MISC_DMVV_155081, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Vecchi, E. M. and Venturi, G.}, TITLE = {Tecnologie linguistico-computazionali per il monitoraggio delle competenze linguistiche di apprendenti l'italiano come L2}, YEAR = {2010}, KEYWORDS = {Natural Language Processing, Educational Linguistics, Language Learning}, URL = {https://iris.cnr.it/handle/20.500.14243/155081}, CONFERENCE_NAME = {Congresso "IT. L2: italiano lingua seconda nell'università, nella scuola e sul territorio. 
Esperienze didattiche e ricerche" Università del Piemonte Orientale "Amedeo Avogadro", Facoltà di Lettere e Filosofia}, }

% ---------------------------------------------------------------------------
% Entries 2005-2009, one field per line.
% NOTE(review): CONFERENCE_NAME and CONFERENCE_PLACE are not standard BibTeX
% fields and are ignored by standard styles. Several INPROCEEDINGS entries
% below have CONFERENCE_NAME but no BOOKTITLE; confirm the proceedings titles
% and add BOOKTITLE where missing.
% ---------------------------------------------------------------------------

% NOTE(review): this INCOLLECTION entry has no BOOKTITLE; add it once the
% title of the edited volume is confirmed.
@INCOLLECTION{DELLORLETTA_2009_INCOLLECTION_DLMMP_233257,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.},
  TITLE = {{Text-2-Knowledge}: una piattaforma linguistico-computazionale per l'estrazione di conoscenza da testi},
  YEAR = {2009},
  ABSTRACT = {The paper describes the automatic extraction of domain knowledge from Italian document collections and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge},
  KEYWORDS = {Term extraction, Ontology Learning},
  PAGES = {285--300},
  URL = {https://iris.cnr.it/handle/20.500.14243/233257},
  PUBLISHER = {Bulzoni},
  ADDRESS = {Roma, Italy},
  ISBN = {978-88-7870-469-5},
  CONFERENCE_PLACE = {Roma},
  EDITOR = {Ferrari, G. and Benatti, R. and Mosca, M.},
}

@INPROCEEDINGS{ATTARDI_2009_INPROCEEDINGS_AD_65126,
  AUTHOR = {Attardi, G. and Dell'Orletta, F.},
  TITLE = {Reverse Revision and Linear Tree Combination for Dependency Parsing},
  YEAR = {2009},
  KEYWORDS = {Dependency parsing, revision parsing, dependency parsing combination},
  URL = {https://iris.cnr.it/handle/20.500.14243/65126},
  CONFERENCE_NAME = {North American Chapter of the Association for Computational Linguistics-Human Language Technologies},
}

@INPROCEEDINGS{ATTARDI_2009_INPROCEEDINGS_ADSDV_65133,
  AUTHOR = {Attardi, G. and Dell'Orletta, F. and Simi, M. and Dei Rossi, S. and Vecchi, E. M.},
  TITLE = {The {Tanl} Named Entity Recognizer at {Evalita} 2009},
  YEAR = {2009},
  KEYWORDS = {Named Entity Recognizer},
  URL = {https://iris.cnr.it/handle/20.500.14243/65133},
  CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian 2009},
}

@INPROCEEDINGS{ATTARDI_2009_INPROCEEDINGS_ADST_65108,
  AUTHOR = {Attardi, G. and Dell'Orletta, F. and Simi, M. and Turian, J.},
  TITLE = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
  YEAR = {2009},
  KEYWORDS = {Dependency Parsing, Parsing, Multilayer Perceptron},
  URL = {https://iris.cnr.it/handle/20.500.14243/65108},
  CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian 2009},
}

@INPROCEEDINGS{CASELLI_2009_INPROCEEDINGS_CDP_65129,
  AUTHOR = {Caselli, T. and Dell'Orletta, F. and Prodanof, I.},
  TITLE = {Temporal Relations with Signals: the Case of {Italian} Temporal Prepositions},
  YEAR = {2009},
  KEYWORDS = {temporal relations, taggers, information extraction},
  URL = {https://iris.cnr.it/handle/20.500.14243/65129},
  CONFERENCE_NAME = {16th International Symposium on Temporal Representation and Reasoning},
}

@INPROCEEDINGS{CASELLI_2009_INPROCEEDINGS_CDP_65128,
  AUTHOR = {Caselli, T. and Dell'Orletta, F. and Prodanof, I.},
  TITLE = {{TETI}: a {TimeML} Compliant {TimEx} Tagger for {Italian}},
  YEAR = {2009},
  KEYWORDS = {temporal expression, information extraction},
  URL = {https://iris.cnr.it/handle/20.500.14243/65128},
  CONFERENCE_NAME = {International Multiconference on Computer Science and Information Technology},
}

@INPROCEEDINGS{DELLORLETTA_2009_INPROCEEDINGS_D_65107,
  AUTHOR = {Dell'Orletta, F.},
  TITLE = {Ensemble system for Part-of-Speech tagging},
  YEAR = {2009},
  KEYWORDS = {Part-of-Speech tagging, Ensemble system},
  URL = {https://iris.cnr.it/handle/20.500.14243/65107},
  CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian 2009},
}

@ARTICLE{DELLORLETTA_2008_ARTICLE_DLMMPV_37713,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V. and Venturi, G.},
  TITLE = {Dal testo alla conoscenza e ritorno: estrazione terminologica e annotazione semantica di basi documentali di dominio},
  YEAR = {2008},
  ABSTRACT = {The paper focuses on the automatic extraction of domain knowledge from Italian legal texts and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge},
  KEYWORDS = {Natural Language Processing, Machine Learning, Knowledge extraction from texts, Ontology learning, Legal ontologies},
  PAGES = {197--218},
  URL = {https://iris.cnr.it/handle/20.500.14243/37713},
  VOLUME = {26},
  NUMBER = {1--2},
  ISSN = {1594-2201},
  JOURNAL = {AIDA INFORMAZIONI (ONLINE)},
}

% NOTE(review): the next entry has the same authors, title, year, abstract and
% pages as DELLORLETTA_2008_ARTICLE_DLMMPV_37713 above; possibly two records of
% the same work. Confirm before citing both.
@INPROCEEDINGS{DELLORLETTA_2008_INPROCEEDINGS_DLMMPV_65083,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V. and Venturi, G.},
  TITLE = {Dal testo alla conoscenza e ritorno: estrazione terminologica e annotazione semantica di basi documentali di dominio},
  YEAR = {2008},
  ABSTRACT = {The paper focuses on the automatic extraction of domain knowledge from Italian legal texts and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge},
  KEYWORDS = {Natural Language Processing, Machine Learning, Knowledge extraction from texts, Ontology learning, Legal ontologies},
  PAGES = {197--218},
  URL = {http://www.assiterm91.it/wp-content/uploads/2010/11/Convegno-2008.pdf},
  VOLUME = {ANNO 26, NUMERO 1-2},
  ISSN = {1121-0095},
  CONFERENCE_NAME = {Atti del Convegno Nazionale Ass. I. Term},
  BOOKTITLE = {AIDA INFORMAZIONI},
}

@INPROCEEDINGS{DELLORLETTA_2008_INPROCEEDINGS_DLMMPV_65074,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Marchi, S. and Pirrelli, V. and Venturi, G.},
  TITLE = {Acquiring Legal Ontologies from Domain-specific Texts},
  YEAR = {2008},
  ABSTRACT = {The paper reports on methodology and preliminary results of a case study in automatically extracting ontological knowledge from Italian legislative texts in the environmental domain. We use a fully-implemented ontology learning system (T2K) that includes a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine language learning. Tools are dynamically integrated to provide an incremental representation of the content of vast repositories of unstructured documents. Evaluated results, however preliminary, are very encouraging, showing the great potential of NLP-powered incremental systems like T2K for accurate large-scale semi-automatic extraction of legal ontologies},
  KEYWORDS = {Ontology learning, Document management, knowledge extraction from texts, Natural Language Processing},
  PAGES = {98--101},
  URL = {https://iris.cnr.it/handle/20.500.14243/65074},
  CONFERENCE_NAME = {LangTech 2008},
}

% NOTE(review): this ARTICLE entry has no JOURNAL field; add it once the venue
% is confirmed.
@ARTICLE{DELLORLETTA_2007_ARTICLE_DFLMP_37710,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Lenci, A. and Montemagni, S. and Pirrelli, V.},
  TITLE = {Maximum Entropy for {Italian} {PoS} Tagging},
  YEAR = {2007},
  ABSTRACT = {L'articolo illustra le prestazioni del ILC-UniPi MaxEnt PoS Tagger in Evalita 2007. The report contains a description of the ILC-UniPi MaxEnt PoS Tagger performance in Evalita 2007},
  PAGES = {10--11},
  URL = {https://iris.cnr.it/handle/20.500.14243/37710},
  VOLUME = {IV},
  NUMBER = {2},
}

@INPROCEEDINGS{DELLORLETTA_2007_INPROCEEDINGS_DFLMP_65073,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Lenci, A. and Montemagni, S. and Pirrelli, V.},
  TITLE = {Maximum Entropy for {Italian} {PoS} Tagging},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/65073},
  CONFERENCE_NAME = {Evaluation of NLP Tools for Italian-EVALITA 2007},
}

@INPROCEEDINGS{DELLORLETTA_2007_INPROCEEDINGS_DLMMP_65065,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.},
  TITLE = {{Text-2-Knowledge}: una piattaforma linguistico-computazionale per l'estrazione di conoscenza da testi},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/65065},
  CONFERENCE_NAME = {XL Congresso Internazionale di Studi della Società di Linguistica Italiana (SLI 2006)},
}

% NOTE(review): the six TECHREPORT entries below carry no INSTITUTION field;
% confirm the issuing institution and add it.
@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_457837,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Language Recognition Tool, Specifiche di Implementazione},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/457837},
}

@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_195930,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Segmentazione di un Testo Italiano in Token},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/195930},
}

@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_457838,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Segmentazione di un Testo Inglese in Token},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/457838},
}

@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_195932,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Specifiche di Chunking per l'Italiano},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/195932},
}

@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_195931,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Analisi Morfosintattica per l'Italiano},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/195931},
}

@TECHREPORT{DELLORLETTA_2007_TECHREPORT_DFGLMTP_195933,
  AUTHOR = {Dell'Orletta, F. and Federico, M. and Giovannetti, E. and Lenci, A. and Marchi, S. and Trabucco, A. and Pirrelli, V.},
  TITLE = {Specifiche di Named Entity Recognition per l'Italiano},
  YEAR = {2007},
  URL = {https://iris.cnr.it/handle/20.500.14243/195933},
}

@INPROCEEDINGS{DELLORLETTA_2006_INPROCEEDINGS_DLMP_65043,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.},
  TITLE = {Probing the space of grammatical variation: induction of cross-lingual grammatical constraints from treebanks},
  YEAR = {2006},
  ABSTRACT = {The paper reports on a detailed quantitative analysis of distributional language data of both Italian and Czech, highlighting the relative contribution of a number of distributed grammatical factors to sentence-based identification of subjects and direct objects. The work uses a Maximum Entropy model of stochastic resolution of conflicting grammatical constraints and is demonstrably capable of putting explanatory theoretical accounts to the test of usage-based empirical verification},
  PAGES = {21--28},
  URL = {https://iris.cnr.it/handle/20.500.14243/65043},
  PUBLISHER = {Association for Computational Linguistics},
  ADDRESS = {Stroudsburg, USA},
  ISBN = {1-932432-78-7},
  CONFERENCE_NAME = {Coling/ACL 2006},
  CONFERENCE_PLACE = {Stroudsburg},
  BOOKTITLE = {Proceedings of the Workshop on Frontiers in Linguistically Annotated Corpora 2006 (LAC 06)},
}

@MISC{BARTOLINI_2006_MISC_BDLMMP_192748,
  AUTHOR = {Bartolini, R. and Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.},
  TITLE = {{Text-to-Knowledge} ({T2K}) Versione 2},
  YEAR = {2006},
  ABSTRACT = {Versione 2. Text-to-Knowledge (T2K) è una piattaforma software di supporto avanzato alla gestione documentale per la creazione dinamica di repertori terminologici e ontologie di dominio a partire da testi e per l'indicizzazione concettuale di documenti. Il sistema T2K si propone di offrire una batteria integrata di strumenti avanzati di analisi linguistica del testo, analisi statistica e apprendimento automatico del linguaggio, destinati a offrire una rappresentazione accurata del contenuto di una base documentale non strutturata, per scopi di indicizzazione avanzata e navigazione intelligente. I risultati di questo processo di acquisizione sono annotati in forma di metadati XML, offrendo in tal modo la prospettiva di una sempre crescente e diretta interoperabilità con sistemi automatici per la produzione di contenuti digitali selezionati e strutturati dinamicamente su misura, per diversi profili di utenza. Versioni prototipali di T2K sono già operative su alcuni portali della pubblica amministrazione e sono state applicate per l'indicizzazione di contenuti didattici multimediali. È in corso l'integrazione della tecnologia T2K nel sistema di gestione informatica di documentazione scientifica del CNR},
  KEYWORDS = {text to knowledge, nlp, estrazione terminologica, ontology learning, indicizzazione terminologica},
  URL = {https://iris.cnr.it/handle/20.500.14243/192748},
}

@INPROCEEDINGS{DELLORLETTA_2005_INPROCEEDINGS_DLMP_77226,
  AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.},
  TITLE = {Climbing the path to grammar: a maximum entropy model of subject/object learning},
  YEAR = {2005},
  URL = {https://iris.cnr.it/handle/20.500.14243/77226},
  CONFERENCE_NAME = {Psychocomputational Models of Human Language Acquisition (PsychoCompLA-2005)},
}

@MISC{BARTOLINI_2005_MISC_BDGMLMP_192739,
  AUTHOR = {Bartolini, R. and Dell'Orletta, F. and Giorgetti, D. and Marchi, S. and Lenci, A. and Montemagni, S. and Pirrelli, V.},
  TITLE = {{Text-to-Knowledge} ({T2K})},
  YEAR = {2005},
  ABSTRACT = {Piattaforma di estrazione e indicizzazione terminologica},
  KEYWORDS = {NLP, estrazione terminologica},
  URL = {https://iris.cnr.it/handle/20.500.14243/192739},
}