@ARTICLE{BIFFI_2023_ARTICLE_BGMS_490948,
  AUTHOR = {Biffi, M. and Guadagnini, E. and Montemagni, S. and Sassolini, E.},
  TITLE = {Il lemmario del «GDLI»: dati quantitativi e prime osservazioni},
  YEAR = {2023},
  ABSTRACT = {Dopo la realizzazione della versione elettronica del solo testo del "Grande dizionario della lingua italiana" (GDLI), si è avviato un progetto di graduale informatizzazione della sua struttura. Questo articolo ne presenta il primo risultato, vale a dire l'estrazione automatica del lemmario che è così per la prima volta quantificabile e individuabile. Una prima parte del testo è dedicata all'illustrazione della strutturazione dei contenuti del dizionario e la loro rappresentazione secondo standard internazionalmente riconosciuti (XML-TEI); la seconda presenta una prima elaborazione dei dati del lemmario estratto; la terza propone una prima analisi comparativa con i lemmari di altri dizionari della lingua italiana.},
  KEYWORDS = {Lessicografia, Lessicografia digitale, Lessicografia storica},
  PAGES = {331--351},
  URL = {https://accademiadellacrusca.it/it/riviste/articoli/slei-xl-2023/8679},
  VOLUME = {40},
  PUBLISHER = {Le Lettere (Firenze, Italia)},
  ISSN = {0392-5218},
  JOURNAL = {Studi di lessicografia italiana},
}

@ARTICLE{BIFFI_2022_ARTICLE_BDFGMS_477716,
  AUTHOR = {Biffi, M. and De Blasi, F. and Favaro, M. and Guadagnini, E. and Montemagni, S. and Sassolini, E.},
  TITLE = {Parole in rete / reti di parole. Possibili impieghi didattici dei grandi vocabolari storici digitalizzati},
  YEAR = {2022},
  ABSTRACT = {After a brief presentation of the great historical dictionaries of Italian, which are free to use online thanks to the digitalisation work carried out by the Accademia della Crusca, the contribution offers a number of examples of how these tools can be used for educational purposes.
Finally, further didactic uses are described, which will be made possible thanks to the advanced digital tools that the Accademia della Crusca and the Istituto di Linguistica Computazionale "Antonio Zampolli" del Consiglio Nazionale delle Ricerche (ILC) are currently working on.},
  KEYWORDS = {Lessicografia italiana, Didattica dell'italiano, Lessicografia digitale},
  PAGES = {143--188},
  URL = {https://italianoascuola.unibo.it/article/view/14866},
  VOLUME = {4},
  DOI = {10.6092/issn.2704-8128/14866},
  PUBLISHER = {ABIS-AlmaDL (Bologna, Italia)},
  ISSN = {2704-8128},
  JOURNAL = {Italiano a scuola},
}

@INPROCEEDINGS{AGNOLONI_2022_INPROCEEDINGS_ABFMMQRV_472294,
  AUTHOR = {Agnoloni, T. and Bartolini, R. and Frontini, F. and Montemagni, S. and Marchetti, C. and Quochi, V. and Ruisi, M. and Venturi, G.},
  TITLE = {Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus},
  YEAR = {2022},
  ABSTRACT = {This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 pandemic emergency period and a former period for reference and comparison according to the CLARIN ParlaMint prescriptions. The corpus contains 1199 sessions and 79,373 speeches for a total of about 31 million words, and was encoded according to the ParlaCLARIN TEI XML format. It includes extensive metadata about the speakers, sessions, political parties and parliamentary groups. As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity annotation and classification is also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision.
The Italian dataset is freely available as part of the larger ParlaMint 2.1 corpus deposited and archived in CLARIN repository together with all other national corpora. It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes.},
  KEYWORDS = {parliamentary debates, CLARIN ParlaMint, corpus creation, corpus annotation},
  PAGES = {117--124},
  URL = {https://aclanthology.org/2022.parlaclarin-1.17/},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  CONFERENCE_NAME = {Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference},
  CONFERENCE_PLACE = {Marseille, France},
  CONFERENCE_DATE = {20/06/2022},
}

@INPROCEEDINGS{SASSOLINI_2021_INPROCEEDINGS_SBDGM_455303,
  AUTHOR = {Sassolini, E. and Biffi, M. and De Blasi, F. and Guadagnini, E. and Montemagni, S.},
  TITLE = {La digitalizzazione del GDLI: un approccio linguistico per la corretta acquisizione del testo?},
  YEAR = {2021},
  ABSTRACT = {In questo articolo sono discussi metodi e strategie in via di elaborazione per la correzione (propedeutica alla successiva strutturazione) dei contenuti del Grande dizionario della lingua italiana (GDLI) fondato da Salvatore Battaglia, estratti da un formato digitale non standard. La presenza, in questo formato, di errori distribuiti di vario tipo ha condizionato la scelta dell'approccio all'estrazione e messo in luce tutte le difficoltà dell'operazione. Le sperimentazioni fatte sino a oggi portano a privilegiare una strategia di correzione multilivello, che procede scomponendo in sezioni distinte l'individuazione e la correzione degli errori, in modo da rendere gestibili interventi complessi di correzione semi-automatica, altrimenti improponibili, e consentire un loro raffinamento progressivo.
Parallelamente alla definizione di regole di riconoscimento di struttura e formato, stiamo analizzando metodi e procedure in grado di migliorare la qualità dell'input e specializzare i moduli di estrazione per i singoli campi della voce a partire dal "lemma". Le finalità del lavoro sono duplici: l'estrazione e strutturazione dei contenuti e la produzione di un formato standard di rappresentazione dei dati. Si tratta di un percorso difficile perché il formato dei dati rende l'uso di strumenti reperibili in letteratura non applicabile. Solamente al termine del lavoro potremo capire se esistono le condizioni per trasformare l'approccio adottato in un protocollo di intervento replicabile.},
  KEYWORDS = {dizionari digitali, risorse linguistiche, estrazione dell'informazione, correzione del testo post OCR},
  PAGES = {159--166},
  URL = {https://aiucd2021.labcd.unipi.it/wp-content/uploads/2021/05/AIUCD2021_BOA-versione3A.pdf},
  DOI = {10.6092/unibo/amsacta/6712},
  ISBN = {9788894253559},
  CONFERENCE_NAME = {AIUCD 2021-DH per la società: e-guaglianza, partecipazione, diritti e valori nell'era digitale},
  CONFERENCE_PLACE = {Pisa},
  CONFERENCE_DATE = {19-22/01/2021},
}

@MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMDLR_463861,
  AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubešić, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Pančur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and De Macedo, L. D. and Van Heusden, R. and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. and Sebők, M. and Ring, O. and Darģis, R. and Utka, A. and Petkevičius, M. and Briedienė, M. and Krilavičius, T. and Morkevičius, V. and Diwersy, S. and Luxardo, G.
and Rayson, P.},
  TITLE = {Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint.ana 2.1},
  YEAR = {2021},
  ABSTRACT = {ParlaMint 2.1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (from November 1st 2019), or being "reference" (before that date). The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e.g. chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e.g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https://clarin-eric.github.io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the linguistically marked-up version of the corpus, while the text version is available at http://hdl.handle.net/11356/1432. The ParlaMint.ana linguistic annotation includes tokenization, sentence segmentation, lemmatisation, Universal Dependencies part-of-speech, morphological features, and syntactic dependencies, and the 4-class CoNLL-2003 named entities.
Some corpora also have further linguistic annotations, such as PoS tagging or named entities according to language-specific schemes, with their corpus TEI headers giving further details on the annotation vocabularies and tools.},
  KEYWORDS = {dibattiti parlamentari, covid-19, ParlaCLARIN, parlamenti, discorso politico, CLARIN, linguistic annotation, pos-tagging, ner, linguistic dependency annotation, UD},
  URL = {http://hdl.handle.net/11356/1431},
}

@MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMDLR_463865,
  AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubešić, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Pančur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and De Macedo, L. D. and Van Heusden, R. and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. and Sebők, M. and Ring, O. and Darģis, R. and Utka, A. and Petkevičius, M. and Briedienė, M. and Krilavičius, T. and Morkevičius, V. and Diwersy, S. and Luxardo, G. and Rayson, P.},
  TITLE = {Multilingual comparable corpora of parliamentary debates ParlaMint 2.1},
  YEAR = {2021},
  ABSTRACT = {ParlaMint 2.1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (after November 1st 2019), or being "reference" (before that date). The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e.g.
chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e.g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https://clarin-eric.github.io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the ParlaMint TEI-encoded corpora with the derived plain text version of the corpus along with TSV metadata on the speeches. Also included is the 2.0 release of the data and scripts available at the GitHub repository of the ParlaMint project. Note that there also exists the linguistically marked-up version of the corpus, which is available at http://hdl.handle.net/11356/1431.},
  KEYWORDS = {dibattiti parlamentari, covid-19, discorso politico, CLARIN, parlamenti, ParlaCLARIN},
  URL = {http://hdl.handle.net/11356/1432},
}

@ARTICLE{ALZETTA_2020_ARTICLE_ADMV_463828,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Linguistically-driven Selection of Difficult-to-Parse Dependency Structures},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a novel methodology meeting a twofold goal, namely quantifying the reliability of automatically generated dependency relations without using gold data on the one hand, and identifying which are the linguistic constructions negatively affecting the parser performance on the other hand. These represent objectives typically investigated in different lines of research, with different methods and techniques. Our methodology, at the crossroads of these perspectives, allows not only to quantify the parsing reliability of individual dependency types but also to identify and weight the contextual properties making relation instances more or less difficult to parse.
The proposed methodology was tested in two different and complementary experiments, aimed at assessing the degree of parsing difficulty across (a) different dependency relation types, and (b) different instances of the same relation. The results show that the proposed methodology is able to identify difficult-to-parse dependency relations without relying on gold data and by taking into account a variety of intertwined linguistic factors. These findings pave the way to novel applications of the methodology, both in the direction of defining new evaluation metrics based purely on automatically parsed data and towards the automatic creation of challenge sets.},
  KEYWORDS = {Linguistic Complexity, Syntactic Parsing, Evaluation metrics},
  PAGES = {37--60},
  URL = {https://journals.openedition.org/ijcol/719},
  VOLUME = {6},
  DOI = {10.4000/ijcol.719},
  PUBLISHER = {aAccademia University Press, Torino (Italia)},
  ISSN = {2499-4553},
  JOURNAL = {Italian Journal of Computational Linguistics},
}

@ARTICLE{VENTURI_2020_ARTICLE_VDMMS_441971,
  AUTHOR = {Venturi, G. and Dell'Orletta, F. and Montemagni, S. and Morini, E. and Sagri, M. T.},
  TITLE = {Metodi e Tecniche di Trattamento Automatico della Lingua per l'Estrazione di Conoscenza dalla Documentazione Scolastica},
  YEAR = {2020},
  ABSTRACT = {Il contributo riguarda la creazione di un sistema integrato di "knowledge management", per la gestione e condivisione della conoscenza prodotta e utilizzata dalla scuola.},
  KEYWORDS = {Estrazione di informazione, Documenti scolastici, Indicizzazione, Terminology extraction},
  PAGES = {49--68},
  URL = {https://publications.cnr.it/doc/441971},
  VOLUME = {2},
  DOI = {10.3280/CAD2020-002005},
  PUBLISHER = {Franco Angeli (Napoli, Italia)},
  ISSN = {1122-5165},
  JOURNAL = {Cadmo (Testo stamp.)},
}

@ARTICLE{VERTECCHI_2020_ARTICLE_VADMV_441967,
  AUTHOR = {Vertecchi, B. and Agrusti, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Verba et Acta.
Un esperimento per promuovere l'evoluzione delle competenze linguistiche degli studenti degli istituti professionali},
  YEAR = {2020},
  ABSTRACT = {Ricerche in corso. Verba et Acta. Un esperimento per promuovere l'evoluzione delle competenze linguistiche degli studenti degli istituti professionali},
  KEYWORDS = {Evoluzione competenze linguistiche, Annotazione linguistica, Previsione dello sviluppo delle competenze di scrittura},
  PAGES = {109--117},
  URL = {https://publications.cnr.it/doc/441967},
  DOI = {10.3280/CAD2020-001008},
  PUBLISHER = {Franco Angeli (Napoli, Italia)},
  ISSN = {1122-5165},
  JOURNAL = {Cadmo (Testo stamp.)},
}

@INPROCEEDINGS{ALZETTA_2020_INPROCEEDINGS_ADMOSV_444113,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Osenova, P. and Simov, K. and Venturi, G.},
  TITLE = {Quantitative linguistic investigations across universal dependencies treebanks},
  YEAR = {2020},
  ABSTRACT = {The paper illustrates a case study aimed at identifying cross-lingual quantitative trends in the distribution of dependency relations in treebanks for typologically different languages. Preliminary results show interesting differences rooted either in language-specific peculiarities or cross-lingual annotation inconsistencies, with a potential impact on different application scenarios.},
  KEYWORDS = {Universal Dependencies Treebanks, Cross-linguistic analysis, Typology},
  PAGES = {1--7},
  URL = {http://ceur-ws.org/Vol-2769/paper_59.pdf},
  VOLUME = {2769},
  PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)},
  ISSN = {1613-0073},
  ISBN = {979-12-80136-28-2},
  CONFERENCE_NAME = {7th Italian Conference on Computational Linguistics (CLiC-it)},
  CONFERENCE_PLACE = {Online},
  CONFERENCE_DATE = {1-3/03/2021},
  BOOKTITLE = {CEUR workshop proceedings},
}

@INPROCEEDINGS{BRUNATO_2020_INPROCEEDINGS_BCDMVZ_444114,
  AUTHOR = {Brunato, D. and Chesi, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.
and Zamparelli, R.},
  TITLE = {AcCompl-it @ EVALITA2020: Overview of the acceptability \& complexity evaluation task for Italian},
  YEAR = {2020},
  ABSTRACT = {The Acceptability and Complexity evaluation task for Italian (AcCompl-it) was aimed at developing and evaluating methods to classify Italian sentences according to Acceptability and Complexity. It consists of two independent tasks asking participants to predict either the acceptability or the complexity rate (or both) of a given set of sentences previously scored by native speakers on a 1-to-7 points Likert scale. In this paper, we introduce the datasets distributed to the participants, we describe the different approaches of the participating systems and provide a first analysis of the obtained results.},
  KEYWORDS = {Shared Task, Linguistic Complexity, Acceptability},
  PAGES = {1--8},
  URL = {http://ceur-ws.org/Vol-2765/paper163.pdf},
  VOLUME = {2765},
  PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)},
  ISSN = {1613-0073},
  CONFERENCE_NAME = {EVALITA '20, Evaluation of NLP and Speech Tools for Italian},
  CONFERENCE_PLACE = {Online},
  CONFERENCE_DATE = {17/12/2020},
  BOOKTITLE = {CEUR workshop proceedings},
}

@INPROCEEDINGS{BRUNATO_2020_INPROCEEDINGS_BCDMV_435966,
  AUTHOR = {Brunato, D. and Cimino, A. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Profiling-UD: a Tool for Linguistic Profiling of Texts},
  YEAR = {2020},
  ABSTRACT = {In this paper, we introduce Profiling-UD, a new text analysis tool inspired to the principles of linguistic profiling that can support language variation research from different perspectives. It allows the extraction of more than 130 features, spanning across different levels of linguistic description. Beyond the large number of features that can be monitored, a main novelty of Profiling-UD is that it has been specifically devised to be multilingual since it is based on the Universal Dependencies framework.
In the second part of the paper, we demonstrate the effectiveness of these features in a number of theoretical and applicative studies in which they were successfully used for text and author profiling.},
  KEYWORDS = {Computational Language Variation Analysis, Linguistic Profiling, Universal Dependencies},
  PAGES = {7145--7151},
  URL = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.883.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN = {979-10-95546-34-4},
  CONFERENCE_NAME = {Conference on Language Resources and Evaluation (LREC)},
  CONFERENCE_DATE = {11-16/05/2020},
}

@INPROCEEDINGS{LENCI_2020_INPROCEEDINGS_LMBDDDDMPPVL_435958,
  AUTHOR = {Lenci, A. and Montemagni, S. and Boschetti, F. and De Felice, I. and Dei Rossi, S. and Dell'Orletta, F. and Di Giorgio, M. and Miliani, M. and Passaro, L. C. and Puddu, A. and Venturi, G. and Labanca, N.},
  TITLE = {Voices of the Great War: A Richly Annotated Corpus of Italian Texts on the First World War},
  YEAR = {2020},
  ABSTRACT = {Voci della Grande Guerra ("Voices of the Great War") is the first large corpus of Italian historical texts dating back to the period of First World War. This corpus differs from other existing resources in several respects. First, from the linguistic point of view it gives account of the wide range of varieties in which Italian was articulated in that period, namely from a diastratic (educated vs. uneducated writers), diaphasic (low/informal vs. high/formal registers) and diatopic (regional varieties, dialects) points of view. From the historical perspective, through a collection of texts belonging to different genres it represents different views on the war and the various styles of narrating war events and experiences. The final corpus is balanced along various dimensions, corresponding to the textual genre, the language variety used, the author type and the typology of conveyed contents.
The corpus is annotated with lemmas, part-of-speech, terminology, and named entities. Significant corpus samples representative of the different "voices" have also been enriched with meta-linguistic and syntactic information. The layer of syntactic annotation forms the first nucleus of an Italian historical treebank complying with the Universal Dependencies standard. The paper illustrates the final resource, the methodology and tools used to build it, and the Web Interface for navigating it.},
  KEYWORDS = {Historical Corpora, Linguistic and Meta-linguistic Annotation, Information Extraction},
  PAGES = {911--918},
  URL = {https://www.aclweb.org/anthology/2020.lrec-1.114.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN = {979-10-95546-34-4},
  CONFERENCE_NAME = {Conference on Language Resources and Evaluation (LREC)},
  CONFERENCE_DATE = {11-16/05/2020},
}

@ARTICLE{ALZETTA_2019_ARTICLE_ADMV_423880,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Inferring Quantitative Typological Trends from Multilingual Treebanks. A Case Study},
  YEAR = {2019},
  ABSTRACT = {In the past decades, linguistic typology went through a renewing phase that involved a significant change in the research questions and methods of the discipline, which is now interested in fine-grained features underlying language diversity. In this paper, we propose a novel approach to address the newly defined needs of linguistic typology by extracting qualitative and quantitative information about a wide range of features from multilingual annotated corpora based on Natural Language Processing methods and techniques. We tested our method in a case study focusing on word order variation in two widely investigated constructions, VERB-SUBJ(ect) and NOUN-ADJ(ective), with a specific view to structural and functional factors underlying the preference for one or the other order, both intra- and cross-linguistically, and their interaction.
Preliminary experiments have been carried out aimed at acquiring typological evidence from a selection of linguistically annotated treebanks for three different languages, namely Italian, Spanish and English. Our results show the effectiveness of the method in letting similarities and differences also emerge from typologically close languages.},
  KEYWORDS = {language typology, multilingual annotated corpora, linguistic knowledge extraction and modelling, word order variation},
  PAGES = {209--242},
  URL = {https://www.rivisteweb.it/doi/10.1418/95391},
  VOLUME = {18},
  DOI = {10.1418/95391},
  PUBLISHER = {Il Mulino, Bologna (Italia)},
  ISSN = {1720-9331},
  JOURNAL = {Lingue e linguaggio},
}

@ARTICLE{DELLORLETTA_2019_ARTICLE_DGMMRSV_423874,
  AUTHOR = {Dell'Orletta, F. and Greco, S. and Montemagni, S. and Morini, E. and Rossi, F. and Sagri, M. T. and Venturi, G.},
  TITLE = {Le parole del miglioramento. Come le scuole descrivono il cambiamento},
  YEAR = {2019},
  ABSTRACT = {Il presente contributo intende illustrare i risultati di una ricerca condotta con l'uso di strumenti di trattamento automatico del linguaggio (Natural Language Processing: nlp) su quanto dichiarato dalle scuole in circa 2500 Piani di Miglioramento (modello indire ) con l'obiettivo di comprendere le scelte strategiche in un'ottica di miglioramento continuo.
Il disegno d'analisi permette di restituire sia una visione complessiva dei Piani di Miglioramento che approfondimenti qualitativi di confronto tra tipologie di scuola e aree geografiche e relativi a tematiche strategiche quali formazione e innovazione.},
  KEYWORDS = {Piano di Miglioramento, Natural Language Processing, Formazione, Innovazione},
  PAGES = {47--68},
  URL = {https://www.rivistainfanzia.it/pvw/app/default/pvw_sito.php?sede_codice=1PWPSE01\&page=2432193},
  VOLUME = {1/2019},
  PUBLISHER = {Edizioni Centro Studi Erickson (Gardolo (TN), Italia)},
  ISSN = {1971-3711},
  JOURNAL = {Psicologia dell'educazione},
}

@INPROCEEDINGS{ALZETTA_2019_INPROCEEDINGS_ADMV_423881,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Dissecting Treebanks to Uncover Typological Trends. A Multilingual Comparative Approach},
  YEAR = {2019},
  ABSTRACT = {Over the last years, linguistic typology started attracting the interest of the community working on cross- and multi-lingual NLP as a way to tackle the bottleneck deriving from the lack of annotated data for many languages. Typological information is mostly acquired from publicly accessible typological databases, manually constructed by linguists. As reported in Ponti et al. (2018), despite the abundant information contained in them for many languages, these resources suffer from two main shortcomings, i.e. their limited coverage and the discrete nature of features (only "the majority value rather than the full range of possible values and their corresponding frequencies" is reported). Corpus-based studies can help to automatically acquire quantitative typological evidence which might be exploited for polyglot NLP.
Recently, the availability of corpora annotated following a cross-linguistically consistent annotation scheme such as the one developed in the Universal Dependencies project is prompting new comparative linguistic studies aimed to identify similarities as well as idiosyncrasies among typologically different languages (Nivre, 2015). The line of research described here is aimed at acquiring quantitative typological evidence from UD treebanks through a multilingual contrastive approach.},
  KEYWORDS = {Natural Language Processing, Linguistic Typology},
  PAGES = {1--3},
  URL = {https://typology-and-nlp.github.io/2019/assets/2019/papers/5.pdf},
  ISBN = {978-1-950737-29-1},
  CONFERENCE_NAME = {1st TyP-NLP: The Workshop on Typology for Polyglot NLP, ACL workshop},
  CONFERENCE_PLACE = {Firenze},
  CONFERENCE_DATE = {01/08/2019},
}

@INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMSV_391617,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Simi, M. and Venturi, G.},
  TITLE = {Assessing the Impact of Iterative Error Detection and Correction. A Case Study on the Italian Universal Dependency Treebank},
  YEAR = {2018},
  ABSTRACT = {Detection and correction of errors and inconsistencies in "gold treebanks" are becoming more and more central topics of corpus annotation. The paper illustrates a new incremental method for enhancing treebanks, with particular emphasis on the extension of error patterns across different textual genres and registers. Impact and role of corrections have been assessed in a dependency parsing experiment carried out with four different parsers, whose results are promising.
For both evaluation datasets, the performance of parsers increases, in terms of the standard LAS and UAS measures and of a more focused measure taking into account only relations involved in error patterns, and at the level of individual dependencies.},
  KEYWORDS = {Error Detection, Universal Dependency Treebanks, Syntactic parsing},
  PAGES = {1--7},
  URL = {http://universaldependencies.org/udw18/PDFs/39_Paper.pdf},
  ISBN = {978-1-948087-84-1},
  CONFERENCE_NAME = {Universal Dependencies Workshop 2018 (UDW 2018)},
  CONFERENCE_PLACE = {Brussels},
  CONFERENCE_DATE = {01/11/2018},
}

@INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_382333,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Dangerous Relations in Dependency Treebanks},
  YEAR = {2018},
  ABSTRACT = {The paper illustrates an effective and innovative method for detecting erroneously annotated arcs in gold dependency treebanks based on an algorithm originally developed to measure the reliability of automatically produced dependency relations. The method permits to significantly restrict the error search space and, more importantly, to reliably identify patterns of systematic recurrent errors which represent dangerous evidence to a parser which tendentially will replicate them. Achieved results demonstrate effectiveness and reliability of the method.},
  KEYWORDS = {Dependency treebanks, Error Detection, Linguistic Annotation},
  PAGES = {201--210},
  URL = {http://aclweb.org/anthology/W/W17/W17-7624.pdf},
  ISBN = {978-80-88132-04-2},
  CONFERENCE_NAME = {16th International Workshop on Treebanks and Linguistic Theories},
  CONFERENCE_PLACE = {Praga},
  CONFERENCE_DATE = {23-24/01/2018},
}

@INPROCEEDINGS{ALZETTA_2018_INPROCEEDINGS_ADMV_385342,
  AUTHOR = {Alzetta, C. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE = {Universal Dependencies and Quantitative Typological Trends.
A Case Study on Word Order},
  YEAR = {2018},
  ABSTRACT = {The paper presents a new methodology aimed at acquiring typological evidence from "gold" treebanks for different languages. In particular, it investigates whether and to what extent algorithms developed for assessing the plausibility of automatically produced syntactic annotations could contribute to shed light on key issues of the linguistic typological literature. It reports the first and promising results of a case study focusing on word order patterns carried out on three different languages (English, Italian and Spanish).},
  KEYWORDS = {Linguistic Knowledge Extraction, Dependency Treebanks, Linguistic Typology},
  PAGES = {4540--4549},
  URL = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/1109.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN = {979-10-95546-00-9},
  CONFERENCE_NAME = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC 2018)},
  CONFERENCE_PLACE = {Miyazaki (Japan)},
  CONFERENCE_DATE = {7-12/05/2018},
}

@INPROCEEDINGS{CHIRIATTI_2018_INPROCEEDINGS_CDDMPSV_423871,
  AUTHOR = {Chiriatti, G. and Della Gala, V. and Dell'Orletta, F. and Montemagni, S. and Pettenati, M. C. and Sagri, M. T. and Venturi, G.},
  TITLE = {A NLP-based analysis of reflective writings by Italian teachers},
  YEAR = {2018},
  ABSTRACT = {This paper reports first results of a wider study devoted to exploit the potentialities of a NLP-based approach to the analysis of a corpus of reflective writings on teaching activities. We investigate how a wide set of linguistic features allows reconstructing the linguistic profile of the texts written by the Italian teachers and predicting whether are reflective.},
  KEYWORDS = {Natural Language Processing, Reflective Writings, Linguistic Profiling, Document Classification},
  PAGES = {1--7},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057733802\&origin=inward},
  VOLUME = {2253},
  PUBLISHER = {M.
Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)},
  ISSN = {1613-0073},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
  CONFERENCE_PLACE = {Torino},
  CONFERENCE_DATE = {10-12/12/2018},
  BOOKTITLE = {CEUR workshop proceedings},
}

@INPROCEEDINGS{DEFELICE_2018_INPROCEEDINGS_DDVLM_423872,
  AUTHOR = {De Felice, I. and Dell'Orletta, F. and Venturi, G. and Lenci, A. and Montemagni, S.},
  TITLE = {Italian in the Trenches: Linguistic annotation and analysis of texts of the great war},
  YEAR = {2018},
  ABSTRACT = {The paper illustrates the design and development of a textual corpus representative of the historical variants of Italian during the Great War, which was enriched with linguistic (lemmatization and pos-tagging) and meta-linguistic annotation. The corpus, after a manual revision of the linguistic annotation, was used for specializing existing NLP tools to process historical texts with promising results.},
  KEYWORDS = {Natural Language Processing, Automatic Linguistic Annotation},
  PAGES = {1--5},
  URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85057734451\&origin=inward},
  VOLUME = {2253},
  PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)},
  ISSN = {1613-0073},
  CONFERENCE_NAME = {5th Italian Conference on Computational Linguistics (CLiC-it)},
  CONFERENCE_PLACE = {Torino},
  CONFERENCE_DATE = {10-12/12/2018},
  BOOKTITLE = {CEUR workshop proceedings},
}

@ARTICLE{VENTURI_2017_ARTICLE_VDMFB_382249,
  AUTHOR = {Venturi, G. and Dell'Orletta, F. and Montemagni, S. and Flore, E. and Bellandi, T.},
  TITLE = {La qualità dei consensi informati.
Un'analisi linguistico-computazionale della leggibilità dei testi}, YEAR = {2017}, ABSTRACT = {La leggibilità dei testi delle informative di consenso per le procedure diagnostico-terapeutiche è un requisito fondamentale, per offrire alle persone assistite l'accesso alle informazioni necessarie a una scelta consapevole delle opzioni disponibili per curare i diversi problemi di salute. La disponibilità di un testo leggibile è inoltre un aiuto per i medici responsabili della comunicazione e della raccolta del consenso, che possono impiegarlo come un ausilio alle informazioni presentate in forma verbale durante il colloquio, in modo tale da poter condividere una base di conoscenze minime da condividere con il paziente e i suoi familiari. Seppure le evidenze siano limitate in merito alla relazione tra la qualità del consenso e l'attitudine al contenzioso da parte dei pazienti in caso di trattamenti che esitano in un danno attribuibile alle cure (Durand et al., 2015), si tratta di un ambito di ricerca di crescente interesse nella letteratura sulla sicurezza (Wu et al., 2005; Manta et al., 2017). 
Nella casistica regionale della Toscana sulle richieste di risarcimento, solo l'1\% dei sinistri include problemi di consenso informato (dati Centro GRC), probabilmente anche a causa di una sottovalutazione del diritto all'informazione da parte dei cittadini che si sottopongono a interventi programmati, connessa con una limitata consapevolezza del potere di scegliere le proprie cure che ogni persona dovrebbe poter esercitare posta di fronte alle opzioni terapeutiche disponibili per i propri problemi di salute.}, KEYWORDS = {Consenso informato, valutazione automatica della leggibilità, Trattamento Automatico del Linguaggio}, PAGES = {35-39}, URL = {http://www.formas.toscana.it/rivistadellasalute/fileadmin/files/fascicoli/2017/212/SeT_fascicolo_212.pdf}, VOLUME = {212}, PUBLISHER = {ETS (Pisa, Italia)}, ISSN = {0392-4505}, JOURNAL = {Salute e territorio}, } @INCOLLECTION{MONTEMAGNI_2017_INCOLLECTION_MW_367892, AUTHOR = {Montemagni, S. and Wieling, M.}, TITLE = {Exploring the role of extra-linguistic factors in defining dialectal variation patterns through cluster comparison}, YEAR = {2017}, ABSTRACT = {This paper contributes to two open issues in the dialectometric literature, i.e. i) whether and how patterns of linguistic variation are influenced by extra-linguistic features such as the geomorphology of the area, or cultural, administrative and political boundaries, and ii) whether and how the influence of extra-linguistic factors remains stable across linguistically-grounded partitions of data. To investigate these issues, a case study focusing on lexical variation has been carried out on a regional lexical atlas of Tuscan dialects. 
A variety of extra-linguistic features was taken into account, whose impact and role has been evaluated with respect to both the whole dialectal dataset and across different semantic fields.}, KEYWORDS = {dialectometric literature, dialectology, linguistic variation, dialect, Tuscan, lexical atlas}, PAGES = {241-251}, URL = {http://www.let.rug.nl/festschriftnerbonne/25.%20Montemagni%20\&%20Wieling.pdf}, VOLUME = {Tributes 32}, BOOKTITLE = {From Semantics to Dialectometry. Festschrift in honor of John Nerbonne}, EDITOR = {Wieling, M. and Kroon, M. and Van Noord, G. and Bouma, G.}, } @INPROCEEDINGS{CIMINO_2017_INPROCEEDINGS_CWDMV_382252, AUTHOR = {Cimino, A. and Wieling, M. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Identifying predictive features for textual genre classification: The key role of syntax}, YEAR = {2017}, ABSTRACT = {The paper investigates impact and role of different feature types for the specific task of Automatic Genre Classification with the final aim of identifying the most predictive ones. The goal was pursued by carrying out incremental feature selection through Grafting using different sets of linguistic features. Achieved results for discriminating among four traditional textual genres show the key role played by syntactic features, whose impact turned out to vary across genres.}, KEYWORDS = {Textual Genre Classification, Feature Selection, Syntactic Features}, PAGES = {1-6}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85037370866\&origin=inward}, VOLUME = {2006}, PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)}, ISSN = {1613-0073}, CONFERENCE_NAME = {Italian Conference on Computational Linguistics (CLiC-it)}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {11-12 dicembre 2017}, BOOKTITLE = {CEUR workshop proceedings}, } @ARTICLE{BRUNATO_2016_ARTICLE_BDMV_385220, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Montemagni, S. 
and Venturi, G.}, TITLE = {Monitoraggio linguistico di Scritture Brevi: aspetti metodologici e primi risultati}, YEAR = {2016}, ABSTRACT = {Se da un lato le tecnologie del linguaggio svolgono un ruolo ormai indiscusso per l'accesso al contenuto testuale, ciò non appare scontato quando si va a considerare il loro ruolo nella valutazione delle strutture linguistiche sottostanti al testo. Questo contributo si focalizza sulla definizione di una metodologia innovativa di monitoraggio linguistico della lingua italiana che a partire dall'output di strumenti di annotazione linguistica automatica permette di ricostruire un profilo linguistico di una collezione di testi rappresentativa di una specifica varietà d'uso della lingua. Tale metodologia è stata applicata a un corpus di tweet allo scopo di far luce su interrogativi aperti quali la possibilità di rintracciare tendenze lessicali, morfo-sintattiche e sintattiche peculiari all'interno di questa tipologia testuale; di studiare come queste tendenze si rapportino ai tratti caratterizzanti della lingua scritta e parlata; di individuare possibili differenze nella forma linguistica in cui si twittano contenuti di natura diversa.}, KEYWORDS = {Trattamento Automatico del Linguaggio, Monitoraggio Linguistico, Varietà d'Uso della Lingua, Lingua del Web}, PAGES = {149-176}, URL = {https://publications.cnr.it/doc/385220}, VOLUME = {N. S. 5}, PUBLISHER = {Università degli Studi di Napoli "L'Orientale" (Napoli, Italia)}, ISSN = {1825-2796}, JOURNAL = {Quaderni Aion}, } @ARTICLE{LENCI_2016_ARTICLE_LLMM_367820, AUTHOR = {Lenci, A. and Labanca, N. and Marazzini, C. 
and Montemagni, S.}, TITLE = {Voci della Grande Guerra: An Annotated Corpus of Italian Texts on World War I}, YEAR = {2016}, ABSTRACT = {Voci della Grande Guerra (Voices of the Great War) is a scientific and cultural initiative with the aim of preserving and promoting the memory of Italy in World War I through the creation of a corpus of digital texts selected by historians and linguists in order to be representative of the different ways to experience and describe the Italian war by its protagonists. With the help of advanced techniques of computational linguistics, semantic web and information visualization, the digitized historical materials will be explored with an online interface to enable easy but effective and innovative search modalities. The project will allow experts as well as non-experts to become acquainted with "linguistic polyphony" of Italy during World War I.}, KEYWORDS = {Great War, World War, digital texts, corpus, Italian, Voci della Grande Guerra, Voices of the Great War}, PAGES = {101-108}, URL = {http://www.ai-lc.it/IJCoL/v2n2/6-lenci_et_al.pdf}, VOLUME = {2}, PUBLISHER = {aAccademia University Press, Torino (Italia)}, ISSN = {2499-4553}, JOURNAL = {Italian Journal of Computational Linguistics}, } @INCOLLECTION{MONTEMAGNI_2016_INCOLLECTION_MW_367809, AUTHOR = {Montemagni, S. and Wieling, M.}, TITLE = {Tracking linguistic features underlying lexical variation patterns: A case study on Tuscan dialects}, YEAR = {2016}, ABSTRACT = {In this paper, we illustrate the application of hierarchical spectral partitioning of bipartite graphs in the study of lexical variation in Tuscany based on the data from a regional linguistic atlas. This method makes it possible not only to identify existing patterns of lexical variation in Tuscany, but also to uncover the underlying lexical features in terms of the most characteristic concept-lexicalization pairs. 
The results are promising, demonstrating the potential of the method for tracking the linguistic features underlying identified patterns of lexical variation and change across generations.}, KEYWORDS = {tuscan, dialects, lexical variation, linguistic atlas}, PAGES = {117-135}, URL = {http://langsci-press.org/catalog/view/81/146/376-1}, VOLUME = {Language Variation 1}, DOI = {10.17169/langsci.b81.146}, PUBLISHER = {Language Science Press (Berlin, DEU)}, ISBN = {978-3-946234-18-0}, BOOKTITLE = {The Future of Dialects}, EDITOR = {Côté, M. and Knooihuizen, R. and Nerbonne, J.}, } @INCOLLECTION{WIELING_2016_INCOLLECTION_WM_367813, AUTHOR = {Wieling, M. and Montemagni, S.}, TITLE = {Infrequent forms: Noise or not?}, YEAR = {2016}, ABSTRACT = {In this study we ask the question whether simplifying the data in dialectometrical studies by removing infrequent forms is advantageous to uncovering the geographical structure in dialect data. By investigating lexical variation in a large corpus of Tuscan dialect data via hierarchical bipartite spectral graph partitioning, we are able to identify the main geographical areas together with their linguistic basis. In order to assess the influence of infrequent forms, we conduct two analyses: one which includes only lexical variants used by at least 0.5\% of the informants, and another which includes all lexical variants in the data. Using this approach we show that using all data enables us to find a geographical characterization with a more adequate linguistic basis than by using the trimmed data.}, KEYWORDS = {dialectometrical studies, dialectology, dialect data, lexical variation, Tuscan}, PAGES = {215-224}, URL = {http://langsci-press.org/catalog/view/81/78/367-1}, VOLUME = {Language Variation 1}, DOI = {10.17169/langsci.b81.78}, PUBLISHER = {Language Science Press (Berlin, DEU)}, ISBN = {978-3-946234-18-0}, BOOKTITLE = {The Future of Dialects}, EDITOR = {Côté, M. and Knooihuizen, R. 
and Nerbonne, J.}, } @EDITORIAL{MONTEMAGNI_2016_EDITORIAL_M_372004, AUTHOR = {Montemagni, S.}, TITLE = {Proceedings CLiC-it 2016 and EVALITA 2016}, YEAR = {2016}, ABSTRACT = {Proceedings of Third Italian Conference on Computational Linguistics (CLiC-it 2016) \& Fifth Evaluation Campaign of Natural Language Processing and Speech Tools for Italian. Final Workshop (EVALITA 2016)}, KEYWORDS = {Computational Linguistics, Natural Language Processing, Speech Tools for Italian, CLiC-it, EVALITA}, URL = {http://ceur-ws.org/Vol-1749/}, VOLUME = {1749}, PUBLISHER = {CEUR-WS.org (Aachen, DEU)}, } @EDITORIAL{BASILI_2016_EDITORIAL_BM_372022, AUTHOR = {Basili, R. and Montemagni, S.}, TITLE = {Nota Editoriale}, YEAR = {2016}, KEYWORDS = {Computational Linguistics, CLiC-it, natural language processing}, PAGES = {7-10}, URL = {http://www.ai-lc.it/IJCoL/v2n1/00_nota_editoriale.pdf}, VOLUME = {2}, PUBLISHER = {aAccademia University Press, Torino (Italia)}, ISSN = {2499-4553}, JOURNAL = {Italian Journal of Computational Linguistics}, } @INPROCEEDINGS{BARBAGLI_2016_INPROCEEDINGS_BLDMV_366749, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {CItA: an L1 Italian Learners Corpus to Study the Development of Writing Competence}, YEAR = {2016}, ABSTRACT = {In this paper, we present the CItA corpus (Corpus Italiano di Apprendenti L1), a collection of essays written by Italian L1 learners collected during the first and second year of lower secondary school. 
The corpus was built in the framework of an interdisciplinary study jointly carried out by computational linguistics and experimental pedagogists and aimed at tracking the development of written language competence over the years and students' background information.}, KEYWORDS = {Italian Learner Corpus, Diachronic Evolution of Written Language Competence, Error Annotation}, PAGES = {88-95}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/536_Paper.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Portoroz (Slovenia)}, CONFERENCE_DATE = {23-28 maggio 2016}, } @INPROCEEDINGS{DELLORLETTA_2016_INPROCEEDINGS_DMV_366757, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Esplorazioni computazionali nello spazio dell'interlingua: verso una nuova metodologia di indagine}, YEAR = {2016}, ABSTRACT = {Il presente contributo intende proporre un innovativo approccio all'identificazione delle caratteristiche linguistiche che aiutano a definire l'interlingua. 
Tale approccio consiste nella ricostruzione del profilo linguistico di corpora di produzioni scritte da apprendenti una lingua seconda basato su strumenti di trattamento automatico del linguaggio.}, KEYWORDS = {interlingua, annotazione linguistica automatica, monitoraggio linguistico}, PAGES = {143-161}, URL = {https://www.bulzoni.it/it/catalogo/lingue-in-contatto-contact-linguistics.html}, PUBLISHER = {Bulzoni Editore (Roma, ITA)}, ISBN = {978-88-6897-029-1}, CONFERENCE_NAME = {XLVIII Congresso Internazionale di Studi della Società di Linguistica Italiana (SLI 2014)}, CONFERENCE_PLACE = {Udine}, CONFERENCE_DATE = {25-27 settembre 2014}, } @INPROCEEDINGS{MONTEMAGNI_2016_INPROCEEDINGS_M_372010, AUTHOR = {Montemagni, S.}, TITLE = {Preface}, YEAR = {2016}, ABSTRACT = {Our very warm welcome to CLiC-it 2016 (http://clic-it2016.dieti.unina.it/), the 3rd edition of the Italian Conference on Computational Linguistics, held on December 5th and 6th, in Naples, Italy, co-located with Evalita 2016 (http://www.evalita.it/2016), hosted and locally organized by Università Federico II, one the oldest public and laic universities in the world. The organization of the conference is the result of a fruitful conjoint effort of different research groups (Istituto di Linguistica Computazionale "Antonio Zampolli" del CNR, Università degli Studi di Bari Aldo Moro and Università degli Studi di Napoli Federico II) showing the nationwide spreading of Computational Linguistics in Italy. The CLiC-it conference series is organized by the Italian Association for Computational Linguistics (AILC) and has clearly established itself as the premier national forum for research and development in the fields of Computational Linguistics (CL) and Natural Language Processing (NLP), where leading researchers and practitioners from academia and industry meet to share their challenges, solutions, research results, and experiences. 
CLiC-it covers all aspects of computational linguistics and natural language (both written and spoken) processing, and targets state-of-art theoretical results, experimental methodologies, technologies, as well as application perspectives, which may contribute to advance the field.}, KEYWORDS = {Computational Linguistics, Natural Language Processing, Speech Tools for Italian, CLiC-it, EVALITA}, URL = {https://publications.cnr.it/doc/372010}, VOLUME = {1749}, PUBLISHER = {CEUR-WS.org (Aachen, DEU)}, CONFERENCE_NAME = {CLiC-it \& EVALITA 2016}, CONFERENCE_PLACE = {Napoli}, CONFERENCE_DATE = {5-7/12/2016}, BOOKTITLE = {Proceedings CLiC-it 2016 and EVALITA 2016}, EDITOR = {Montemagni, S.}, } @INPROCEEDINGS{TUSA_2016_INPROCEEDINGS_TDMV_366754, AUTHOR = {Tusa, E. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Dieci sfumature di marcatezza sintattica: Verso una nozione computazionale di complessità}, YEAR = {2016}, ABSTRACT = {In this work, we will investigate whether and to what extent algorithms typically used to assess the reliability of the output of syntactic parsers can be used to study the correlation between processing complexity and the linguistic notion of markedness. Although still preliminary, achieved results show the key role of features such as dependency direction and length in defining the markedness degrees of a given syntactic construction.}, KEYWORDS = {marcatezza sintattica, complessità linguistica, annotazione linguistica automatica}, PAGES = {1-6}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85009279517\&origin=inward}, VOLUME = {1749}, PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)}, ISSN = {1613-0073}, CONFERENCE_NAME = {Italian Conference on Computational Linguistics (CLiC-it)}, CONFERENCE_PLACE = {Napoli}, CONFERENCE_DATE = {5-6 dicembre 2016}, BOOKTITLE = {CEUR workshop proceedings}, } @INPROCEEDINGS{WIELING_2016_INPROCEEDINGS_WSCM_359168, AUTHOR = {Wieling, M. 
and Sassolini, E. and Cucurullo, S. and Montemagni, S.}, TITLE = {ALT Explored: Integrating an Online Dialectometric Tool and an Online Dialect Atlas}, YEAR = {2016}, ABSTRACT = {In this paper, we illustrate the integration of an online dialectometric tool, Gabmap, together with an online dialect atlas, the Atlante Lessicale Toscano (ALT-Web). By using a newly created url-based interface to Gabmap, ALT-Web is able to take advantage of the sophisticated dialect visualization and exploration options incorporated in Gabmap. For example, distribution maps showing the distribution in the Tuscan dialect area of a specific dialectal form (selected via the ALT-Web website) are easily obtainable. Furthermore, the complete ALT-Web dataset as well as subsets of the data (selected via the ALT-Web website) can be automatically uploaded and explored in Gabmap. By combining these two online applications, macro- and micro-analyses of dialectal data (respectively offered by Gabmap and ALT-Web) are effectively and dynamically combined.}, KEYWORDS = {Lexicon, Lexical Database, Tools, Systems, Applications}, PAGES = {3265-3272}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {LREC 2016}, CONFERENCE_PLACE = {Portorož, Slovenia}, CONFERENCE_DATE = {23-28 maggio 2016}, } @ARTICLE{ATTARDI_2015_ARTICLE_ABBCDMPSS_366713, AUTHOR = {Attardi, G. and Basile, V. and Bosco, C. and Caselli, T. and Dell'Orletta, F. and Montemagni, S. and Patti, V. and Simi, M. and Sprugnoli, R.}, TITLE = {State of the Art Language Technologies for Italian: The EVALITA 2014 Perspective}, YEAR = {2015}, ABSTRACT = {Shared task evaluation campaigns represent a well established form of competitive evaluation, an important opportunity to propose and tackle new challenges for a specific research area and a way to foster the development of benchmarks, tools and resources. 
The advantages of this approach are evident in any experimental field, including the area of Natural Language Processing. An outlook on state-of-the-art language technologies for Italian can be obtained by reflecting on the results of the recently held workshop "Evaluation of NLP and Speech Tools for Italian", EVALITA 2014. The motivations underlying individual shared tasks, the level of knowledge and development achieved within each of them, the impact on applications, society and economy at large as well as directions for future research will be discussed from this perspective.}, KEYWORDS = {Evaluation Campaign, Natural Language Processing, Dependency Parsing, Sentiment Analysis, Temporal Processing}, PAGES = {43-61}, URL = {https://publications.cnr.it/doc/366713}, VOLUME = {9}, DOI = {10.3233/IA-150076}, PUBLISHER = {Associazione Italiana per l'Intelligenza Artificiale (Bari, Italia)}, ISSN = {1724-8035}, JOURNAL = {Intelligenza Artificiale}, } @ARTICLE{BARBAGLI_2015_ARTICLE_BLDMV_357152, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Il ruolo delle tecnologie del linguaggio nel monitoraggio dell'evoluzione delle abilità di scrittura: primi risultati}, YEAR = {2015}, ABSTRACT = {L'ultimo decennio ha visto l'affermarsi a livello internazionale dell'uso di tecnologie del linguaggio per lo studio dei processi di apprendimento. Questo contributo riporta i primi e promettenti risultati di uno studio interdisciplinare che si è avvalso di metodi e tecniche di analisi propri della linguistica computazionale, della linguistica e della pedagogia sperimentale. 
Lo studio, finalizzato al monitoraggio dell'evoluzione del processo di apprendimento della lingua italiana, è stato condotto a partire dalle produzioni scritte di studenti della scuola secondaria di primo grado con strumenti di annotazione linguistica automatica e di estrazione di conoscenza e ha portato all'identificazione di un insieme di tratti qualificanti il processo di apprendimento linguistico.}, KEYWORDS = {evoluzione delle competenze linguistiche, Didattica Sperimentale, Estrazione di conoscenza, Annotazione linguistica automatica}, PAGES = {99-117}, URL = {https://journals.openedition.org/ijcol/326}, DOI = {10.4000/ijcol.326}, PUBLISHER = {aAccademia University Press, Torino (Italia)}, ISSN = {2499-4553}, JOURNAL = {Italian Journal of Computational Linguistics}, } @INCOLLECTION{SIMI_2015_INCOLLECTION_SMB_330110, AUTHOR = {Simi, M. and Montemagni, S. and Bosco, C.}, TITLE = {Harmonizing and merging Italian treebanks: Towards a merged Italian dependency treebank and beyond}, YEAR = {2015}, ABSTRACT = {In this paper we address the challenge of combining existing CoNLL-compliant dependency-annotated corpora with the final aim of constructing a bigger treebank for the Italian language. To this end, we defined a methodology for mapping different annotation schemes, based on: (i) The analysis of similarities and differences of considered source and target dependency annotation schemes; (ii) The analysis of the performance of state of the art dependency parsers trained on the source and target treebanks; (iii) The mapping of the source annotation scheme(s) onto a set of target (possibly underspecified) data categories. This methodology was applied in two different case studies. The first one was aimed at constructing a "Merged Italian Dependency Treebank" (MIDT) starting from existing Italian dependency treebanks, namely TUT and ISST-TANL. 
The second case study, still ongoing, consists in the conversion of the MIDT resource into the Stanford Dependencies de facto standard with the final aim of developing an "Italian Stanford Dependency Treebank" (ISDT).}, KEYWORDS = {Harmonization and merging of resources, Italian, Dependency Treebank}, PAGES = {3-23}, URL = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84927143016\&partnerID=q2rCbXpz}, VOLUME = {589}, DOI = {10.1007/978-3-319-14206-7_1}, PUBLISHER = {Springer International Publishing (CH-6330 Cham (ZG), CHE)}, ISBN = {978-3-319-14205-0}, BOOKTITLE = {Harmonization and Development of Resources and Tools for Italian Natural Language Processing within the PARLI Project}, EDITOR = {Basili, R. and Bosco, C. and Delmonte, R. and Moschitti, A. and Simi, M.}, } @INPROCEEDINGS{BARBAGLI_2015_INPROCEEDINGS_BLDMV_357146, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {CItA: un Corpus di Produzioni Scritte di Apprendenti l'Italiano L1 Annotato con Errori}, YEAR = {2015}, ABSTRACT = {In questo articolo presentiamo CItA il primo corpus di produzioni scritte di apprendenti l'italiano L1 del primo e del secondo anno della scuola secondaria di primo grado annotato con errori grammaticali, ortografici e lessicali. Le specificità del corpus e la sua natura diacronica lo rendono particolarmente utile sia per applicazioni linguistico-computazionali sia per studi socio-pedagogici.}, KEYWORDS = {Apprendimento della lingua madre, evoluzione delle competenze linguistiche}, PAGES = {31-35}, URL = {http://www.italianlp.it/wp-content/uploads/2016/03/CItA_errori.pdf}, PUBLISHER = {Accademia University Press (Torino, ITA)}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {2nd Italian Conference on Computational Linguistics (CLiC-it)}, CONFERENCE_PLACE = {Trento}, CONFERENCE_DATE = {3-4 dicembre 2015}, } @INPROCEEDINGS{BRUNATO_2015_INPROCEEDINGS_BDVM_332693, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G. 
and Montemagni, S.}, TITLE = {Design and Annotation of the First Italian Corpus for Text Simplification}, YEAR = {2015}, ABSTRACT = {In this paper, we present design and construction of the first Italian corpus for automatic and semi--automatic text simplification. In line with current approaches, we propose a new annotation scheme specifically conceived to identify the typology of changes an original sentence undergoes when it is manually simplified. Such a scheme has been applied to two aligned Italian corpora, containing original texts with corresponding simplified versions, selected as representative of two different manual simplification strategies and addressing different target reader populations. Each corpus was annotated with the operations foreseen in the annotation scheme, covering different levels of linguistic description. Annotation results were analysed with the final aim of capturing peculiarities and differences of the different simplification strategies pursued in the two corpora.}, KEYWORDS = {Annotation Scheme, Automatic Text Simplification}, PAGES = {31-34}, URL = {https://aclweb.org/anthology/W/W15/W15-1604.pdf}, ISBN = {978-1-941643-47-1}, CONFERENCE_NAME = {Proceedings of LAW IX-The 9th Linguistic Annotation Workshop}, CONFERENCE_PLACE = {Denver, Colorado}, CONFERENCE_DATE = {5 giugno 2015}, } @INPROCEEDINGS{VENTURI_2015_INPROCEEDINGS_VBDM_340387, AUTHOR = {Venturi, G. and Bellandi, T. and Dell'Orletta, F. and Montemagni, S.}, TITLE = {NLP-Based Readability Assessment of Health-Related Texts: a Case Study on Italian Informed Consent Forms}, YEAR = {2015}, ABSTRACT = {The paper illustrates the results of a case study aimed at investigating and enhancing the accessibility of Italian health-related documents by relying on advanced NLP techniques, with particular attention to informed consent forms. 
Results achieved show that the features automatically extracted from the linguistically annotated text and ranging across different levels of linguistic description have a high discriminative power in order to guarantee a reliable readability assessment.}, KEYWORDS = {Readability assessment, health-related information}, PAGES = {131-141}, URL = {http://www.aclweb.org/anthology/W15-2618}, ISBN = {978-1-941643-32-7}, CONFERENCE_NAME = {Sixth International Workshop on Health Text Mining and Information Analysis (Louhi)}, CONFERENCE_PLACE = {Lisbona}, CONFERENCE_DATE = {17 settembre 2015}, } @INPROCEEDINGS{MONTEMAGNI_2015_INPROCEEDINGS_MWN_367807, AUTHOR = {Montemagni, S. and Wieling, M. and Nerbonne, J.}, TITLE = {The contribution of dialectometry to the study of the dialects of Italy. A case study on Tuscan}, YEAR = {2015}, ABSTRACT = {We will illustrate the extent to which the recent advances of dialectometry can help to gain insight into the nature of linguistic variation - both synchronically and diachronically - in the study of the dialects of Tuscany, which have a special status in the complex puzzle of Italian dialects. This will be done by discussing the results achieved in a case study carried out over the last five years based on the corpus of dialectal data of the Atlante Lessicale Toscano ('Lexical Atlas of Tuscany', henceforth ALT, Giacomelli et al., 2000), a regional linguistic atlas focusing on dialectal variation throughout Tuscany, a region where both Tuscan and non-Tuscan dialects are spoken.}, KEYWORDS = {dialectometry, lexical atlas, italian dialects, Tuscany, Atlante Lessicale Toscano}, URL = {http://media.leidenuniv.nl/legacy/montemagni-wieling-nerbonne.pdf}, CONFERENCE_NAME = {Italian Dialect Meeting 2015 \& CIDSM X}, CONFERENCE_PLACE = {Leiden University-Centre for Linguistics}, CONFERENCE_DATE = {23 June 2015}, } @INPROCEEDINGS{VENTURI_2015_INPROCEEDINGS_VRMSTFB_340388, AUTHOR = {Venturi, G. and Rinnone, S. and Montemagni, S. and Sassi, M. 
and Terranova, G. and Flore, E. and Bellandi, T.}, TITLE = {Language technologies for automatic readability assessment of health-related Information: a preliminary investigation into the informed consent forms used in a regional health service}, YEAR = {2015}, ABSTRACT = {Rationale: Within an information society, where everyone should be able to access all available information, improving access to written language is becoming more and more a central issue. This is the case for health-related information which should be accessible to all members of the society, including people who have reading difficulties as a result of a low education level or of language-based learning disabilities or because the language of the text is not their native language. Moreover, the breakdown of doctor-patient communication is one of the most frequent cause of adverse events. Research questions: We conducted a preliminary investigation to assess the readability of a corpus of informed consent forms used before a clinical procedure in the hospitals of a Regional Healthcare Service. Secondary goals include the comparison of readability across specialties and healthcare trusts. Methods: Providing complex scientific information in a way that is comprehensible to a lay person is a challenge that nowadays can be addressed by resorting to advanced Natural Language Processing (NLP) techniques, which make it possible to monitor the linguistic complexity of texts at the syntactic and lexical levels and to support their simplification, whenever needed. The study has been carried out by combining NLP-enabled feature extraction and state-of-the-art machine learning algorithms. To this end we used READ-IT, the first NLP-based readability assessment tool for Italian. Results: We analysed 584 documents, covering 29 specialties, for a total of 607.790 word tokens, currently used at the 36 public hospitals in Tuscany. 
Although the readability level of all documents in the corpus is low, both at the lexical and syntactic level, significant differences can be observed between specialties and healthcare trust releasing the forms. With the readability level ranging between 0 (easy-to-read) and 100 (difficult-to-read), it resulted that the pediatric informed consent documents are the most easy-to-read forms (with an average score of 75) while the most difficult-to read documents are documents of the surgical area (whose average score is 80) (standard deviation 2). Discussion: The state of the art resulting from this preliminary study shows that NLP-based readability assessment tools can help to measure the linguistic complexity of informed consent forms and guide the editor to identify linguistically complex passages that need to be simplified, either syntactically or lexically. The use of an assessment tool designed for the general language is the main limitation of the study and should be addressed through the customization of the tool to assess the readability of the healthcare jargon. A further step of the research consider also the design of a guidance to prepare readable informed consent forms.}, KEYWORDS = {Readability assessment, health-related information}, URL = {http://static1.squarespace.com/static/561c0d01e4b0b5ad2e65cc48/t/561d44dfe4b089431662d174/1444758751213/LibrettoProgramma.pdf}, CONFERENCE_NAME = {ISCOME 2015 Conference: "The Golden Bridge: Communication and Patient Safety"}, CONFERENCE_PLACE = {Montecatini Terme}, CONFERENCE_DATE = {15-16 giugno 2015}, } @ARTICLE{DELLORLETTA_2014_ARTICLE_DMV_285640, AUTHOR = {Dell'Orletta, F. and Montemagni, S. 
and Venturi, G.}, TITLE = {Assessing document and sentence readability in less resourced languages and across textual genres}, YEAR = {2014}, ABSTRACT = {In this paper, we tackle three underresearched issues of the automatic readability assessment literature, namely the evaluation of text readability in less resourced languages, with respect to sentences (as opposed to documents) as well as across textual genres. Different solutions to these issues have been tested by using and refining READ-IT, the first advanced readability assessment tool for Italian, which combines traditional raw text features with lexical, morpho-syntactic and syntactic information. In READ-IT readability assessment is carried out with respect to both documents and sentences, with the latter constituting an important novelty of the proposed approach: READ-IT shows a high accuracy in the document classification task and promising results in the sentence classification scenario. By comparing the results of two versions of READ-IT, adopting a classification- versus ranking-based approach, we also show that readability assessment is strongly influenced by textual genre; for this reason a genre-oriented notion of readability is needed. With classification-based approaches, reliable results can only be achieved with genre-specific models: Since this is far from being a workable solution, especially for less resourced languages, a new ranking method for readability assessment is proposed, based on the notion of distance.}, KEYWORDS = {readability assessment, less resourced languages, multi-level linguistic annotation, textual genres}, PAGES = {163-193}, URL = {http://www.ingentaconnect.com/content/jbp/itl/2014/00000165/00000002/art00005}, VOLUME = {165}, DOI = {10.1075/itl.165.2.03del}, PUBLISHER = {Peeters Publishers (Leuven, Belgio)}, ISSN = {1783-1490}, JOURNAL = {ITL - International Journal of Applied Linguistics}, } @ARTICLE{WIELING_2014_ARTICLE_WMNB_285543, AUTHOR = {Wieling, M. 
and Montemagni, S. and Nerbonne, J. and Baayen, R. H.}, TITLE = {Lexical differences between Tuscan dialects and standard Italian: Accounting for geographic and socio-demographic variation using generalized additive mixed modeling}, YEAR = {2014}, ABSTRACT = {This study uses a generalized additive mixed-effects regression model to predict lexical differences in Tuscan dialects with respect to standard Italian. We used lexical information for 170 concepts used by 2,060 speakers in 213 locations in Tuscany. In our model, geographical position was found to be an important predictor, with locations more distant from Florence having lexical forms more likely to differ from standard Italian. In addition, the geographical pattern varied significantly for low- versus high-frequency concepts and older versus younger speakers. Younger speakers generally used variants more likely to match the standard language. Several other factors emerged as significant. Male speakers as well as farmers were more likely to use lexical forms different from standard Italian. In contrast, higher-educated speakers used lexical forms more likely to match the standard. The model also indicates that lexical variants used in smaller communities are more likely to differ from standard Italian. The impact of community size, however, varied from concept to concept. For a majority of concepts, lexical variants used in smaller communities are more likely to differ from the standard Italian form. For a minority of concepts, however, lexical variants used in larger communities are more likely to differ from standard Italian. Similarly, the effect of the other community- and speaker-related predictors varied per concept. These results clearly show that the model succeeds in teasing apart different forces influencing the dialect landscape and helps us to shed light on the complex interaction between the standard Italian language and the Tuscan dialectal varieties. 
In addition, this study illustrates the potential of generalized additive mixed-effects regression modeling applied to dialect data.}, KEYWORDS = {Tuscan dialects, lexical variation, generalized additive modeling, mixed-effects regression modeling, geographical variation}, PAGES = {669-692}, URL = {http://www.linguisticsociety.org/files/wieling.pdf}, VOLUME = {90}, PUBLISHER = {Linguistic Society of America [etc.] (Washington, DC [etc.], Stati Uniti d'America)}, ISSN = {0097-8507}, JOURNAL = {Language (Baltimore)}, }

% Edited proceedings volume (EVALITA 2014). The people listed are the volume
% editors, so the standard PROCEEDINGS type with an EDITOR field is used; the
% citation key is unchanged. PAGES holds the total page count of the volume.
@PROCEEDINGS{BOSCO_2014_EDITORIAL_BCDFMS_330112,
  EDITOR    = {Bosco, C. and Cosi, P. and Dell'Orletta, F. and Falcone, M. and Montemagni, S. and Simi, M.},
  TITLE     = {Proceedings of the Fourth International Workshop EVALITA 2014},
  YEAR      = {2014},
  KEYWORDS  = {Trattamento Automatico del Linguaggio, Speech Processing, Lingua Italiana},
  PAGES     = {167},
  URL       = {http://clic.humnet.unipi.it/proceedings/Proceedings-EVALITA-2014.pdf},
  PUBLISHER = {Pisa University Press (Pisa, ITA)},
  ISBN      = {978-88-67414-72-7},
}

% Edited workshop proceedings (SPLeT at LREC 2014): same treatment as above.
% PUBLISHER is spelled the way the other LREC 2014 records in this file spell it.
@PROCEEDINGS{FRANCESCONI_2014_EDITORIAL_FMPVW_310637,
  EDITOR    = {Francesconi, E. and Montemagni, S. and Peters, W. and Venturi, G. and Wyner, A.},
  TITLE     = {Proceedings of the Fourth Workshop on Semantic Processing of Legal Texts},
  YEAR      = {2014},
  PAGES     = {33},
  URL       = {http://www.lrec-conf.org/proceedings/lrec2014/workshops/LREC2014Workshop-SPLeT%20Proceedings.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN      = {978-2-9517408-8-4},
}

@INPROCEEDINGS{BARBAGLI_2014_INPROCEEDINGS_BLDMV_294078, AUTHOR = {Barbagli, A. and Lucisano, P. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Tecnologie del linguaggio e monitoraggio dell'evoluzione delle abilità di scrittura nella scuola secondaria di primo grado}, YEAR = {2014}, ABSTRACT = {L'ultimo decennio ha visto l'affermarsi a livello internazionale dell'uso di tecnologie del linguaggio per lo studio dei processi di apprendimento.
Questo contributo, che si colloca all'interno di una ricerca più ampia di pedagogia sperimentale, riporta i primi e promettenti risultati di uno studio finalizzato al monitoraggio dell'evoluzione del processo di apprendimento della lingua italiana condotto a partire dalle produzioni scritte degli studenti con strumenti di annotazione linguistica automatica e di estrazione di conoscenza.}, PAGES = {23-27}, URL = {http://www.italianlp.it/wp-content/uploads/2014/12/Tecnologie-del-linguaggio-per-la-scuola.pdf}, DOI = {10.12871/CLICIT201415}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-8-86741-472-7}, CONFERENCE_NAME = {First Italian Conference on Computational Linguistics (CLiC-it 2014)}, CONFERENCE_PLACE = {Pisa}, CONFERENCE_DATE = {9-11 dicembre 2014}, BOOKTITLE = {Proceedings of the First Italian Conference on Computational Linguistics (CLiC-it 2014)}, EDITOR = {Basili, R. and Lenci, A. and Magnini, B.}, } @INPROCEEDINGS{BOSCHETTI_2014_INPROCEEDINGS_BCDLPPVML_288050, AUTHOR = {Boschetti, F. and Cimino, A. and Dell'Orletta, F. and Lebani, G. E. and Passaro, L. and Picchi, P. and Venturi, G. and Montemagni, S. and Lenci, A.}, TITLE = {Computational Analysis of Historical Documents: An Application to Italian War Bulletins in World War I and II}, YEAR = {2014}, ABSTRACT = {World War (WW) I and II represent crucial landmarks in the history of mankind: They have affected the destiny of whole generations and their consequences are still alive throughout Europe. In this paper we present an ongoing project to carry out a computational analysis of Italian war bulletins in WWI and WWII, by applying state-of-the-art tools for NLP and Information Extraction.
The annotated texts and extracted information will be explored with a dedicated Web interface, allowing for multidimensional access and exploration of historical events through space and time.}, KEYWORDS = {World War I}, PAGES = {70-75}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/workshops/LREC2014Workshop-LRT4HDA%20Proceedings.pdf}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, CONFERENCE_NAME = {LREC 2014}, CONFERENCE_PLACE = {Reykjavik}, CONFERENCE_DATE = {26 May}, BOOKTITLE = {Proceedings of workshop on Language resources and technologies for processing and linking historical documents and archives-Deploying Linked Open Data in Cultural Heritage-LREC 2014, 26 May, Reykjavik, Iceland}, } @INPROCEEDINGS{BRUNATO_2014_INPROCEEDINGS_BDVM_294073, AUTHOR = {Brunato, D. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Defining an annotation scheme with a view to automatic text simplification}, YEAR = {2014}, ABSTRACT = {This paper presents the preliminary steps of ongoing research in the field of automatic text simplification. In line with current approaches, we propose here a new annotation scheme specifically conceived to identify the typologies of changes an original sentence undergoes when it is manually simplified. Such a scheme has been tested on a parallel corpus available for Italian, which we have first aligned at sentence level and then annotated with simplification rules.}, PAGES = {87-92}, URL = {http://www.italianlp.it/wp-content/uploads/2014/12/Text-simplification.pdf}, DOI = {10.12871/CLICIT2014118}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-8-86741-472-7}, CONFERENCE_NAME = {First Italian Conference on Computational Linguistics (CLiC-it 2014)}, CONFERENCE_PLACE = {Pisa}, CONFERENCE_DATE = {9-11 dicembre 2014}, BOOKTITLE = {Proceedings of the First Italian Conference on Computational Linguistics (CLiC-it 2014)}, EDITOR = {Basili, R. and Lenci, A. 
and Magnini, B.}, } @INPROCEEDINGS{DELLORLETTA_2014_INPROCEEDINGS_DVCM_285670, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Cimino, A. and Montemagni, S.}, TITLE = {T2K: a System for Automatically Extracting and Organizing Knowledge from Texts}, YEAR = {2014}, ABSTRACT = {In this paper, we present T2K, a suite of tools for automatically extracting domain-specific knowledge from collections of Italian and English texts. T2K (Text-To-Knowledge v2) relies on a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine learning which are dynamically integrated to provide an accurate and incremental representation of the content of vast repositories of unstructured documents. Extracted knowledge ranges from domain-specific entities and named entities to the relations connecting them and can be used for indexing document collections with respect to different information types. T2K also includes "linguistic profiling" functionalities aimed at supporting the user in constructing the acquisition corpus, e.g. in selecting texts belonging to the same genre or characterized by the same degree of specialization or in monitoring the "added value" of newly inserted documents. T2K is a web application which can be accessed from any browser through a personal account which has been tested in a wide range of domains.}, KEYWORDS = {Natural Language Processing, Information Extraction, Knowledge Management}, PAGES = {2062-2070}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/590_Paper.pdf}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {International Conference on Language Resources and Evaluation (LREC)}, CONFERENCE_PLACE = {Reykjavik}, CONFERENCE_DATE = {26-31 maggio 2014}, } @INPROCEEDINGS{DELLORLETTA_2014_INPROCEEDINGS_DWCVM_294084, AUTHOR = {Dell'Orletta, F. and Wieling, M. and Cimino, A. and Venturi, G. 
and Montemagni, S.}, TITLE = {Assessing the readability of sentences: which corpora and features?}, YEAR = {2014}, ABSTRACT = {The paper investigates the problem of sentence readability assessment, which is modelled as a classification task, with a specific view to text simplification. In particular, it addresses two open issues connected with it, i.e. the corpora to be used for training, and the identification of the most effective features to determine sentence readability. An existing readability assessment tool developed for Italian was specialized at the level of training corpus and learning algorithm. A maximum entropy-based feature selection and ranking algorithm (grafting) was used to identify the most relevant features: it turned out that assessing the readability of sentences is a complex task, requiring a high number of features, mainly syntactic ones.}, PAGES = {163-173}, URL = {http://acl2014.org/acl2014/W14-18/pdf/W14-1820.pdf}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-941643-03-7}, CONFERENCE_NAME = {9th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2014)}, CONFERENCE_PLACE = {Baltimore, Maryland, USA}, CONFERENCE_DATE = {26 giugno 2014}, BOOKTITLE = {Proceedings of 9th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2014)}, } @INPROCEEDINGS{MONTEMAGNI_2014_INPROCEEDINGS_M_330111, AUTHOR = {Montemagni, S.}, TITLE = {DH@ILC: linee di attività e ricerca}, YEAR = {2014}, ABSTRACT = {Le principali linee di ricerca e sviluppo dell'ILC nel settore delle DH possono essere ricondotte ai seguenti filoni: acquisizione e conservazione di testi; progettazione e sviluppo di risorse e strumenti per il trattamento automatico di lingue classiche e varietà storiche della lingua; progettazione e sviluppo di strumenti per l'analisi del testo; costruzione di un'infrastruttura italiana per la ricerca nell'ambito delle scienze umane e sociali.}, KEYWORDS =
{Digital Humanities, Trattamento Automatico del Linguaggio, Risorse Linguistiche}, PAGES = {101-111}, URL = {https://publications.cnr.it/doc/330111}, PUBLISHER = {CLEUP (Padova, ITA)}, ISBN = {9788867872602}, CONFERENCE_NAME = {2nd AIUCD Annual Conference}, CONFERENCE_PLACE = {Padova, Italy}, CONFERENCE_DATE = {11-12 December 2013}, BOOKTITLE = {Collaborative Research Practices and Shared Infrastructures for Humanities Computing}, EDITOR = {Agosti, M. and Tomasi, F.}, } @INPROCEEDINGS{SIMI_2014_INPROCEEDINGS_SBM_329779, AUTHOR = {Simi, M. and Bosco, C. and Montemagni, S.}, TITLE = {Less is More? Towards a Reduced Inventory of Categories for Training a Parser for the Italian Stanford Dependencies}, YEAR = {2014}, ABSTRACT = {Stanford Dependencies (SD) represent nowadays a de facto standard as far as dependency annotation is concerned. The goal of this paper is to explore pros and cons of different strategies for generating SD annotated Italian texts to enrich the existing Italian Stanford Dependency Treebank (ISDT). This is done by comparing the performance of a statistical parser (DeSR) trained on a simpler resource (the augmented version of the Merged Italian Dependency Treebank or MIDT+) and whose output was automatically converted to SD, with the results of the parser directly trained on ISDT. Experiments carried out to test reliability and effectiveness of the two strategies show that the performance of a parser trained on the reduced dependencies repertoire, whose output can be easily converted to SD, is slightly higher than the performance of a parser directly trained on ISDT. A non-negligible advantage of the first strategy for generating SD annotated texts is that semi-automatic extensions of the training resource are more easily and consistently carried out with respect to a reduced dependency tagset. 
Preliminary experiments carried out for generating the collapsed and propagated SD representation are also reported.}, KEYWORDS = {Italian Treebank, Harmonization and Merging of Resources, Stanford Dependencies}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/818_Paper.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {Ninth International Conference on Language Resources and Evaluation (LREC'14)}, CONFERENCE_PLACE = {Reykjavik, Iceland}, CONFERENCE_DATE = {26-31 May 2014}, BOOKTITLE = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{SAGRI_2014_INPROCEEDINGS_STMV_310539, AUTHOR = {Sagri, M. T. and Tiscornia, D. and Montemagni, S. and Venturi, G.}, TITLE = {Investigating the relationship between neuroscience and law: a case study on a corpus of Italian case law texts}, YEAR = {2014}, KEYWORDS = {Neuroscience linguistic and lexico-semantic analysis}, URL = {https://publications.cnr.it/doc/310539}, CONFERENCE_NAME = {Language and Law in Social Practice 3rd International Conference}, CONFERENCE_PLACE = {Florence}, CONFERENCE_DATE = {14-15-16-17 May 2014}, } @ARTICLE{DELLORLETTA_2013_ARTICLE_DVM_310619, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Linguistically-driven selection of correct arcs for dependency parsing}, YEAR = {2013}, ABSTRACT = {LISCA is an unsupervised algorithm aimed at assigning a quality score to each arc generated by a dependency parser in order to produce a decreasing ranking of arcs from correct to incorrect ones.
LISCA exploits statistics about a set of linguistically-motivated and dependency-based features extracted from a large corpus of automatically parsed sentences and uses them to assign a quality score to each arc of a parsed sentence belonging to the same domain of the automatically parsed corpus. LISCA has been successfully tested on two datasets belonging to two different domains and in all experiments it turned out to outperform different baselines, thus showing to be able to reliably detect correct arcs also representing domain-specific peculiarities.}, KEYWORDS = {Correct arcs, Dependency parsing}, PAGES = {125-136}, URL = {http://cys.cic.ipn.mx/ojs/index.php/CyS/article/view/1517}, VOLUME = {17}, ISSN = {1405-5546}, JOURNAL = {Computación y Sistemas}, } @ARTICLE{MONTEMAGNI_2013_ARTICLE_M_329781, AUTHOR = {Montemagni, S.}, TITLE = {Tecnologie linguistico-computazionali e monitoraggio della lingua italiana}, YEAR = {2013}, ABSTRACT = {In una riflessione su dove stia andando l'italiano del terzo millennio, è legittimo chiedersi se e in che misura le tecnologie linguistico-computazionali possano essere di aiuto nel monitoraggio della lingua italiana nelle sue varietà diamesiche, diafasiche e diastratiche, nonché sull'asse diacronico. L'obiettivo del presente contributo consiste nel fornire una risposta, sebbene preliminare, a questo interrogativo, primariamente sul versante metodologico. In particolare, si vuole mostrare che mediante il ricorso a tecnologie linguistico-computazionali è oggi possibile monitorare un ampio spettro di tratti, che spaziano tra i diversi livelli di descrizione linguistica (primariamente, lessico, morfo-sintassi e sintassi), in relazione a corpora di sempre più vaste dimensioni. Questo rappresenta un cambio fondamentale nello studio della variazione linguistica, in particolare della lingua italiana, fino a oggi basato su corpora di dimensioni relativamente ridotte e tipicamente condotto mediante un'analisi (semi-)manuale del testo. 
Come vedremo, l'uso di vasti corpora testuali combinato con il ricorso a tecnologie linguistico-computazionali per l'analisi e il monitoraggio linguistico rendono oggi possibili analisi sempre più accurate e affidabili, che coprono aspetti della struttura linguistica rimasti fino a ora inesplorati in quanto difficilmente attingibili mediante un'analisi manuale del testo.}, KEYWORDS = {Monitoraggio Linguistico, Trattamento Automatico del Linguaggio, Varietà d'Uso della Lingua}, PAGES = {145-172}, URL = {http://www.italianlp.it/wp-content/uploads/2014/04/montemagni_silta_submission_rif.pdf}, VOLUME = {XLII}, PUBLISHER = {Pacini Editore (Ospedaletto (PI), Italia)}, ISSN = {0390-6809}, JOURNAL = {Studi italiani di linguistica teorica ed applicata}, } @ARTICLE{MONTEMAGNI_2013_ARTICLE_MWDN_288064, AUTHOR = {Montemagni, S. and Wieling, M. and De Jonge, B. and Nerbonne, J.}, TITLE = {Synchronic patterns of Tuscan phonetic variation and diachronic change: Evidence from a dialectometric study}, YEAR = {2013}, ABSTRACT = {A careful investigation of synchronic patterns of linguistic variation with underlying linguistic features can lead to important insights into the comprehension of diachronic phonetic processes. In this article, we showed that the method of spectral partitioning of bipartite graphs applied to synchronic dialectal data can effectively and reliably be used to investigate diachronic processes, thus contributing to a deeper understanding of the relationship between synchronic variation and diachronic change. This was illustrated through a case study carried out on Tuscan dialects, focusing on so-called Tuscan 'gorgia', a lenition process consisting of the spirantization of stop consonants. In particular, from a quantitative analysis of the sound correspondences involving voiceless and voiced stops, we tracked the evolution of the spirantization phenomenon in several respects.
First, we tracked spirantization geographically, across Tuscany from the influential center of Florence to the peripheral areas. Second, we tracked it phonologically, from voiceless to voiced stops, and within each voicing class from velars to dentals and then to bilabials. Finally, we tracked it demographically, with young speakers using the most innovative sound correspondences more than old speakers. The fact that these results are in line with the literature on the topic of Tuscan 'gorgia' demonstrates the potential of the method of spectral partitioning of bipartite graphs with respect to the reconstruction of diachronic processes starting from diatopically distributed synchronic dialectal data.}, KEYWORDS = {Tuscan dialectology, dialectometry, diachronic variation}, PAGES = {157-172}, URL = {https://publications.cnr.it/doc/288064}, VOLUME = {28}, DOI = {10.1093/llc/fqs057}, PUBLISHER = {Oxford University Press (Oxford, Regno Unito)}, ISSN = {0268-1145}, JOURNAL = {Literary and linguistic computing}, } @INCOLLECTION{DELLORLETTA_2013_INCOLLECTION_DMMVAF_266373, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Venturi, G. and Agnoloni, T. and Francesconi, E.}, TITLE = {Domain Adaptation for Dependency Parsing at EVALITA 2011}, YEAR = {2013}, ABSTRACT = {The domain adaptation task was aimed at investigating techniques for adapting state-of-the-art dependency parsing systems to new domains. Both the language dealt with, i.e. Italian, and the target domain, namely the legal domain, represent two main novelties of the task organised at Evalita 2011 with respect to previous domain adaptation initiatives. In this paper, we define the task and describe how the datasets were created from different resources.
In addition, we characterize the different approaches of the participating systems, report the test results, and provide a first analysis of these results.}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Self-training, Active Learning, Legal-NLP}, PAGES = {58-69}, URL = {https://publications.cnr.it/doc/266373}, VOLUME = {7689}, PUBLISHER = {Springer (Berlin Heidelberg, DEU)}, ISBN = {978-3-642-35827-2}, BOOKTITLE = {Evaluation of NLP and Speech Tools for Italian}, EDITOR = {Magnini, B. and Cutugno, F. and Falcone, M. and Pianta, E.}, } @INCOLLECTION{MONTEMAGNI_2013_INCOLLECTION_M_329778, AUTHOR = {Montemagni, S.}, TITLE = {Estrazione Terminologica Automatica e Indicizzazione: Scenari Applicativi, Problemi e Possibili Soluzioni}, YEAR = {2013}, ABSTRACT = {Il ricorso a metodi e tecniche di estrazione automatica di terminologia settoriale da corpora di dominio, ovvero da insiemi di documenti relativi a uno specifico settore della conoscenza, rappresenta una sempre più diffusa pratica di supporto al processo di indicizzazione di collezioni documentali, inteso come l'operazione volta all'individuazione delle voci indice che ne costituiscono il contenuto concettuale. L'obiettivo di questo contributo è una rivisitazione critica di esperienze condotte all'interno di diversi scenari applicativi in cui i risultati del processo di estrazione automatica di terminologia sono utilizzati per la costruzione di vocabolari controllati o di thesauri sulla base dei quali è condotto il processo di indicizzazione.}, KEYWORDS = {Trattamento Automatico del Linguaggio, Estrazione Terminologica, Indicizzazione}, PAGES = {241-284}, URL = {https://publications.cnr.it/doc/329778}, PUBLISHER = {Iter (Milano) (Milano, ITA)}, ISBN = {978-88-903419-3-9}, BOOKTITLE = {Documenti Digitali}, EDITOR = {Guarasci, R. and Folino, A.}, } @INPROCEEDINGS{BOSCO_2013_INPROCEEDINGS_BMS_329780, AUTHOR = {Bosco, C. and Montemagni, S. 
and Simi, M.}, TITLE = {Converting Italian Treebanks: Towards an Italian Stanford Dependency Treebank}, YEAR = {2013}, ABSTRACT = {The paper addresses the challenge of converting MIDT, an existing dependency-based Italian treebank resulting from the harmonization and merging of smaller resources, into the Stanford Dependencies annotation formalism, with the final aim of constructing a standard-compliant resource for the Italian language. Achieved results include a methodology for converting treebank annotations belonging to the same dependency-based family, the Italian Stanford Dependency Treebank (ISDT), and an Italian localization of the Stanford Dependency scheme.}, KEYWORDS = {Italian Treebank, Harmonization and Merging of Resources, Stanford Dependencies}, PAGES = {61-69}, URL = {http://aclweb.org/anthology/W13-2308}, ISBN = {978-1-937284-58-9}, CONFERENCE_NAME = {7th Linguistic Annotation Workshop and Interoperability with Discourse}, CONFERENCE_PLACE = {Sofia, Bulgaria}, CONFERENCE_DATE = {8-9 August 2013}, BOOKTITLE = {Proceedings of the 7th Linguistic Annotation Workshop and Interoperability with Discourse}, } @INPROCEEDINGS{CIMINO_2013_INPROCEEDINGS_CDVM_285772, AUTHOR = {Cimino, A. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Linguistic Profiling based on General-purpose Features and Native Language Identification}, YEAR = {2013}, ABSTRACT = {In this paper, we describe our approach to native language identification and discuss the results we submitted as participants to the First NLI Shared Task.
By resorting to a wide set of general-purpose features qualifying the lexical and grammatical structure of a text, rather than to ad hoc features specifically selected for the NLI task, we achieved encouraging results, which show that the proposed approach is general-purpose and portable across different tasks, domains and languages.}, KEYWORDS = {Native Language Identification, Linguistic Profiling}, PAGES = {207-215}, URL = {http://www.aclweb.org/anthology/W13-1727}, ISBN = {978-1-937284-47-3}, CONFERENCE_NAME = {8th workshop on "Innovative Use of NLP for Building Educational Applications"}, CONFERENCE_PLACE = {Atlanta (Georgia)}, CONFERENCE_DATE = {13 giugno 2013}, } @INPROCEEDINGS{DELLORLETTA_2013_INPROCEEDINGS_DMV_278421, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Linguistic Profiling of Texts Across Textual Genre and Readability Level. An exploratory Study on Italian Fictional Prose}, YEAR = {2013}, PAGES = {189-197}, URL = {https://publications.cnr.it/doc/278421}, CONFERENCE_NAME = {Recent Advances in Natural Language Processing (RANLP 2013)}, CONFERENCE_PLACE = {Hissar, Bulgaria}, CONFERENCE_DATE = {7-13 settembre}, BOOKTITLE = {Proceedings of Recent Advances in Natural Language Processing (RANLP 2013)}, } @INPROCEEDINGS{DELLORLETTA_2013_INPROCEEDINGS_DVM_285773, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Unsupervised Linguistically-Driven Reliable Dependency Parses Detection and Self-Training for Adaptation to the Biomedical Domain}, YEAR = {2013}, ABSTRACT = {In this paper, a new self-training method for domain adaptation is illustrated, where the selection of reliable parses is carried out by an unsupervised linguistically-driven algorithm, ULISSE. 
The method has been tested on biomedical texts with results showing a significant improvement with respect to considered baselines, which demonstrates its ability to capture both reliability of parses and domain-specificity of linguistic constructions.}, KEYWORDS = {Self-training, Domain Adaptation, Biomedical Texts}, PAGES = {45-53}, URL = {http://www.aclweb.org/anthology/W13-1906}, ISBN = {978-1-937284-55-8}, CONFERENCE_NAME = {12th workshop on "Biomedical Natural Language Processing" (BioNLP)}, CONFERENCE_PLACE = {Sofia (Bulgaria)}, CONFERENCE_DATE = {8-9 agosto 2013}, }

% Edited workshop proceedings (SPLeT 2012). The people listed are the volume
% editors, so the standard PROCEEDINGS type with an EDITOR field is used; the
% citation key is unchanged. PAGES holds the total page count of the volume.
@PROCEEDINGS{FRANCESCONI_2012_EDITORIAL_FMPW_330113,
  EDITOR    = {Francesconi, E. and Montemagni, S. and Peters, W. and Wyner, A.},
  TITLE     = {Proceedings of the Fourth Workshop on Semantic Processing of Legal Texts (SPLeT 2012)},
  YEAR      = {2012},
  KEYWORDS  = {Trattamento Automatico del Linguaggio, Linguaggio Giuridico, Estrazione di Conoscenza},
  PAGES     = {71},
  URL       = {http://www.lrec-conf.org/proceedings/lrec2012/workshops/27.LREC%202012%20Workshop%20Proceedings%20SPLeT.pdf},
  PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)},
  ISBN      = {978-2-9517408-7-7},
}

% Paper in the proceedings of the XLIV SLI congress (held 2010, published 2012).
% NOTE(review): title preposition corrected to "nell'estrazione" - confirm against the printed volume.
@INPROCEEDINGS{BONIN_2012_INPROCEEDINGS_BDMV_310580,
  AUTHOR           = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.},
  TITLE            = {Lessico settoriale e lessico comune nell'estrazione di terminologia specialistica da corpora di dominio},
  YEAR             = {2012},
  PAGES            = {207-220},
  URL              = {https://publications.cnr.it/doc/310580},
  PUBLISHER        = {Bulzoni Editore (Roma, ITA)},
  ISBN             = {978-88-7870-655-2},
  CONFERENCE_NAME  = {XLIV congresso internazionale di studi della società di linguistica italiana},
  CONFERENCE_PLACE = {Viterbo},
  CONFERENCE_DATE  = {27-29 settembre 2010},
  BOOKTITLE        = {Lessico e Lessicologia. Atti del XLIV congresso internazionale di studi della società di linguistica italiana},
}

@INPROCEEDINGS{BOSCO_2012_INPROCEEDINGS_BMS_330109, AUTHOR = {Bosco, C. and Montemagni, S.
and Simi, M.}, TITLE = {Harmonization and Merging of two Italian Dependency Treebanks}, YEAR = {2012}, ABSTRACT = {The paper describes the methodology which is currently being defined for the construction of a "Merged Italian Dependency Treebank" (MIDT) starting from already existing resources. In particular, it reports the results of a case study carried out on two available dependency treebanks, i.e. TUT and ISST-TANL. The issues raised during the comparison of the annotation schemes underlying the two treebanks are discussed and investigated with a particular emphasis on the definition of a set of linguistic categories to be used as a "bridge" between the specific schemes. As an encoding format, the CoNLL de facto standard is used.}, KEYWORDS = {Syntactic Annotation, Merging of Resources, Dependency Parsing}, PAGES = {23-30}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/workshops/06.LREC%202012%20Merging%20Proceedings.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {LREC 2012 Workshop on Language Resource Merging}, CONFERENCE_PLACE = {Istanbul}, CONFERENCE_DATE = {22 May 2012}, BOOKTITLE = {Proceedings of the LREC 2012 Workshop on Language Resource Merging}, EDITOR = {Bel, N.}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMMPV_219489, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Plank, B. and Venturi, G.}, TITLE = {The SPLeT-2012 Shared Task on Dependency Parsing of Legal Texts}, YEAR = {2012}, ABSTRACT = {The 4th Workshop on "Semantic Processing of Legal Texts" (SPLeT-2012) presents the first multilingual shared task on Dependency Parsing of Legal Texts. In this paper, we define the general task and its internal organization into sub-tasks, describe the datasets and the domain-specific linguistic peculiarities characterizing them.
We finally report the results achieved by the participating systems, describe the underlying approaches and provide a first analysis of the final test results.}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Legal Text Processing}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/workshops/27.LREC%202012%20Workshop%20Proceedings%20SPLeT.pdf}, CONFERENCE_NAME = {Fourth Workshop on Semantic Processing of Legal Texts (SPLeT 2012)-First Shared Task on Dependency Parsing of Legal Texts (SPLeT 2012)}, CONFERENCE_PLACE = {Istanbul}, CONFERENCE_DATE = {27 Maggio 2012}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMMVAF_219483, AUTHOR = {Dell'Orletta, F. and Marchi, S. and Montemagni, S. and Venturi, G. and Agnoloni, T. and Francesconi, E.}, TITLE = {Domain Adaptation for Dependency Parsing at Evalita 2011}, YEAR = {2012}, ABSTRACT = {The domain adaptation task was aimed at investigating techniques for adapting state-of-the-art dependency parsing systems to new domains. Both the language dealt with, i.e. Italian, and the target domain, namely the legal domain, represent two main novelties of the task organised at Evalita 2011. In this paper, we define the task and describe how the datasets were created from different resources. In addition, we characterize the different approaches of the participating systems, report the test results, and provide a first analysis of these results.}, KEYWORDS = {Dependency Parsing, Domain Adaptation, Legal Text Processing}, PAGES = {1-7}, URL = {http://www.evalita.it/sites/evalita.fbk.eu/files/working_notes2011/Domain_Adaptation/}, CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian (EVALITA 2011): Domain Adaptation track}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {24-25 Gennaio 2012}, } @INPROCEEDINGS{DELLORLETTA_2012_INPROCEEDINGS_DMV_278420, AUTHOR = {Dell'Orletta, F. and Montemagni, S. 
and Venturi, G.}, TITLE = {Genre-oriented Readability Assessment: a Case Study}, YEAR = {2012}, PAGES = {91-98}, URL = {https://publications.cnr.it/doc/278420}, ISBN = {978-1-62748-389-6}, CONFERENCE_NAME = {Workshop on "Speech and Language Processing Tools in Education" (SLP-TED)}, CONFERENCE_PLACE = {Mumbai, India}, CONFERENCE_DATE = {15 December, 2012}, BOOKTITLE = {Proceedings of Workshop on "Speech and Language Processing Tools in Education" (SLP-TED)}, } @INPROCEEDINGS{LENCI_2012_INPROCEEDINGS_LMVC_285544, AUTHOR = {Lenci, A. and Montemagni, S. and Venturi, G. and Cutrulla, M. R.}, TITLE = {Enriching the ISST-TANL Corpus with Semantic Frames}, YEAR = {2012}, ABSTRACT = {The paper describes the design and the results of a manual annotation methodology devoted to enrich the ISST-TANL Corpus with Semantic Frames information. The main issues encountered in applying the English FrameNet annotation criteria to a corpus of Italian language are discussed together with the choice of anchoring the semantic annotation layer to the underlying dependency syntactic structure. We also describe an experiment to measure inter-annotator agreement and a first case study to extend and specialise FrameNet annotation to a corpus of legislative texts.}, KEYWORDS = {Semantic annotation, FrameNet, Multi-layer annotated corpus}, PAGES = {3719-3726}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/986_Paper.pdf}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {Eighth International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {23-25 May 2012}, BOOKTITLE = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J.
and Piperidis, S.}, } @INPROCEEDINGS{MONTEMAGNI_2012_INPROCEEDINGS_MWDN_330114, AUTHOR = {Montemagni, S. and Wieling, M. and De Jonge, B. and Nerbonne, J.}, TITLE = {Patterns of Language Variation and Underlying Linguistic Features: A New Dialectometric Approach}, YEAR = {2012}, PAGES = {879-889}, URL = {https://publications.cnr.it/doc/330114}, VOLUME = {II}, PUBLISHER = {Franco Cesati Editore (Firenze, ITA)}, ISBN = {978-88-7667-433-4}, CONFERENCE_NAME = {XI Congresso SILFI (Società Internazionale di Linguistica e Filologia Italiana)}, CONFERENCE_PLACE = {Napoli}, CONFERENCE_DATE = {5-7 Ottobre 2010}, BOOKTITLE = {La variazione nell'italiano e nella sua storia. Varietà e varianti linguistiche e testuali. Atti dell'XI Congresso SILFI (Società Internazionale di Linguistica e Filologia Italiana)}, EDITOR = {Bianchi, P. and De Blasi, N. and De Caprio, C. and Montuori, F.}, } @ARTICLE{THOMPSON_2011_ARTICLE_TMMCDLMMPQRSVRA_205232, AUTHOR = {Thompson, P. and McNaught, J. and Montemagni, S. and Calzolari, N. and Del Gratta, R. and Lee, V. and Marchi, S. and Monachini, M. and Pezik, P. and Quochi, V. and Rupp, C. and Sasaki, Y. and Venturi, G. and Rebholz Schuhmann, D. and Ananiadou, S.}, TITLE = {The BioLexicon: a large-scale terminological resource for biomedical text mining}, YEAR = {2011}, ABSTRACT = {Background Due to the rapidly expanding body of biomedical literature, biologists require increasingly sophisticated and efficient systems to help them to search for relevant information. Such systems should account for the multiple written variants used to represent biomedical concepts, and allow the user to search for specific pieces of knowledge (or events) involving these concepts, e.g., protein-protein interactions. Such functionality requires access to detailed information about words used in the biomedical literature. Existing databases and ontologies often have a specific focus and are oriented towards human use. 
Consequently, biological knowledge is dispersed amongst many resources, which often do not attempt to account for the large and frequently changing set of variants that appear in the literature. Additionally, such resources typically do not provide information about how terms relate to each other in texts to describe events. Results This article provides an overview of the design, construction and evaluation of a large-scale lexical and conceptual resource for the biomedical domain, the BioLexicon. The resource can be exploited by text mining tools at several levels, e.g., part-of-speech tagging, recognition of biomedical entities, and the extraction of events in which they are involved. As such, the BioLexicon must account for real usage of words in biomedical texts. In particular, the BioLexicon gathers together different types of terms from several existing data resources into a single, unified repository, and augments them with new term variants automatically extracted from biomedical literature. Extraction of events is facilitated through the inclusion of biologically pertinent verbs (around which events are typically organized) together with information about typical patterns of grammatical and semantic behaviour, which are acquired from domain-specific texts. In order to foster interoperability, the BioLexicon is modelled using the Lexical Markup Framework, an ISO standard. Conclusions The BioLexicon contains over 2.2 M lexical entries and over 1.8 M terminological variants, as well as over 3.3 M semantic relations, including over 2 M synonymy relations. Its exploitation can benefit both application developers and users. 
We demonstrate some such benefits by describing integration of the resource into a number of different tools, and evaluating improvements in performance that this can bring.}, KEYWORDS = {Text Mining, Information Extraction, Computational Lexicon}, PAGES = {1-29}, URL = {http://www.biomedcentral.com/1471-2105/12/397}, VOLUME = {12}, DOI = {10.1186/1471-2105-12-397}, PUBLISHER = {BioMed Central ([London], Regno Unito)}, ISSN = {1471-2105}, JOURNAL = {BMC bioinformatics}, } @INCOLLECTION{DELLORLETTA_2011_INCOLLECTION_DMVV_138775, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Vecchi, E. M. and Venturi, G.}, TITLE = {Tecnologie linguistico-computazionali per il monitoraggio della competenza linguistica italiana degli alunni stranieri nella scuola primaria e secondaria}, YEAR = {2011}, ABSTRACT = {La possibilità di disporre di tecnologie avanzate e innovative che permettano di monitorare la competenza linguistica degli alunni stranieri e, al contempo, valutare l'adeguatezza dei materiali didattici a loro offerti può essere di supporto all'insegnante nell'orientare la propria azione formativa, rendendo così il processo di integrazione linguistico-culturale meno faticoso e traumatico. In tale ottica, questo studio, realizzato col supporto di una piattaforma ormai consolidata di metodi e strumenti per il trattamento automatico dell'italiano, costituisce il primo tentativo condotto in relazione alla lingua italiana, per mettere a punto una metodologia di monitoraggio linguistico rivolta specificamente agli studenti apprendenti la lingua italiana come L2 ed alle loro produzioni scritte.}, KEYWORDS = {Trattamento Automatico del Linguaggio, Stranieri, Lingua italiana}, PAGES = {319-336}, URL = {https://publications.cnr.it/doc/138775}, PUBLISHER = {McGraw-Hill (Milano, ITA)}, ISBN = {978-88-386-7296-5}, BOOKTITLE = {Percorsi Migranti}, EDITOR = {Bruno, G. C. and Caruso, I. and Sanna, M. 
and Vellecco, I.}, } @INPROCEEDINGS{DELLORLETTA_2011_INPROCEEDINGS_DMV_205510, AUTHOR = {Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {READ-IT: assessing readability of Italian texts with a view to text simplification}, YEAR = {2011}, ABSTRACT = {In this paper, we propose a new approach to readability assessment with a specific view to the task of text simplification: the intended audience includes people with low literacy skills and/or with mild cognitive impairment. READ-IT represents the first advanced readability assessment tool for what concerns Italian, which combines traditional raw text features with lexical, morpho-syntactic and syntactic information. In READ-IT readability assessment is carried out with respect to both documents and sentences where the latter represents an important novelty of the proposed approach creating the prerequisites for aligning the readability assessment step with the text simplification process. READ-IT shows a high accuracy in the document classification task and promising results in the sentence classification scenario.}, KEYWORDS = {Readability Assessment, Text Simplification}, PAGES = {73-83}, URL = {http://dl.acm.org/citation.cfm?id=2140511}, ISBN = {978-1-937284-14-5}, CONFERENCE_NAME = {SLPAT '11 Proceedings of the Second Workshop on Speech and Language Processing for Assistive Technologies}, CONFERENCE_PLACE = {Edimburgo, UK}, CONFERENCE_DATE = {30 Luglio 2011}, } @INPROCEEDINGS{DELLORLETTA_2011_INPROCEEDINGS_DVM_205505, AUTHOR = {Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {ULISSE: an unsupervised algorithm for detecting reliable dependency parses}, YEAR = {2011}, ABSTRACT = {In this paper we present ULISSE, an unsupervised linguistically-driven algorithm to select reliable parses from the output of a dependency parser. 
Different experiments were devised to show that the algorithm is robust enough to deal with the output of different parsers and with different languages, as well as to be used across different domains. In all cases, ULISSE appears to outperform the baseline algorithms.}, KEYWORDS = {Dependency Parsing, Selection of Reliable Parses, Unsupervised Algorithm}, PAGES = {115-124}, URL = {http://dl.acm.org/citation.cfm?id=2018950}, ISBN = {978-1-932432-92-3}, CONFERENCE_NAME = {CoNLL '11 Proceedings of the Fifteenth Conference on Computational Natural Language Learning}, CONFERENCE_PLACE = {Portland, Oregon, USA}, CONFERENCE_DATE = {23-24 Giugno 2011}, } @INPROCEEDINGS{DELLORLETTA_2011_INPROCEEDINGS_DM_205737, AUTHOR = {Dell'Orletta, F. and Montemagni, S.}, TITLE = {Towards an NLP-based approach for measuring syntactic complexity: preliminary experiments with Italian texts from different registers}, YEAR = {2011}, ABSTRACT = {In this paper, we explore how NLP can be used to automatically identify relevant syntactic complexity features in texts with the aim of assessing their correlation with specific linguistic registers. Our final goal is twofold. On the one hand, we demonstrate that automatic morpho-syntactic and syntactic annotation of texts provides sufficiently accurate output for use in the automatic extraction and measurement of syntactic complexity features. 
On the other hand, we identify the set of syntactic features strongly correlating with considered linguistic registers.}, KEYWORDS = {Language Variation, Natural Language Processing, Syntactic Complexity}, URL = {http://www.benszm.net/BSBWWS/Dellorletta_Montemagni.pdf}, CONFERENCE_NAME = {Workshop on "Cross-linguistic and language-internal variation in text and speech: focus on the joint analysis of multiple characteristics"}, CONFERENCE_PLACE = {Freiburg Institute for Advanced Studies (FRIAS), University of Freiburg}, CONFERENCE_DATE = {29/10/2010}, } @INPROCEEDINGS{MONTEMAGNI_2011_INPROCEEDINGS_M_205779, AUTHOR = {Montemagni, S.}, TITLE = {Ontology Learning. An introduction}, YEAR = {2011}, ABSTRACT = {The tutorial is organised into two parts: PART 1 is devoted to provide the basic notions underlying Ontology Learning, in particular why it is needed, how it can be carried out and how its results can be evaluated. PART 2 discusses the topic of Ontology Learning in the Legal domain, with particular attention to the specific challenges posed by it. It also provides an overview of different feasibility studies carried out in the legal domain.}, KEYWORDS = {Ontology Learning, Legal Information extraction, Natural Language Processing}, URL = {https://publications.cnr.it/doc/205779}, CONFERENCE_NAME = {Summer School LEX 2011, Ravenna, Italy "Managing Legal Resources in the Semantic Web"}, CONFERENCE_PLACE = {Ravenna, Italia}, CONFERENCE_DATE = {8 settembre 2011}, } @INPROCEEDINGS{MONTEMAGNI_2011_INPROCEEDINGS_MWDN_205911, AUTHOR = {Montemagni, S. and Wieling, M. and De Jonge, B. and Nerbonne, J.}, TITLE = {Synchronic patterns of Tuscan phonetic variation and diachronic change: evidence from a dialectometric study}, YEAR = {2011}, ABSTRACT = {A careful investigation of synchronic patterns of linguistic variation with underlying linguistic features can lead to important insights into the comprehension of diachronic phonetic processes. 
Starting from the analysis of synchronic patterns of phonetic variation in Tuscany we tackled one of the main and most debated features of Tuscan dialects, the phenomenon of spirantization with a specific view to the so-called Tuscan "gorgia" (i.e. voiceless spirantization). In particular, we showed that the newly proposed method of spectral partitioning of bipartite graphs applied to synchronic dialectal data can effectively be used to investigate diachronic phonetic processes. From a careful analysis of the sound correspondences involving voiceless and voiced stops, we tracked the evolution of the spirantization phenomenon in several respects. First, we tracked spirantization geographically, across Tuscany from the influential center of Florence to the peripheral areas. Second, we tracked it phonologically, from voiceless to voiced stops, and within each voicing class from velars to dentals and then to bilabials. Finally, we tracked it demographically, with young speakers using the most innovative sound correspondences more than old speakers. The fact that these results are in line with the literature on the topic of Tuscan "gorgia" demonstrates the potential of the method of spectral partitioning of bipartite graphs with respect to the reconstruction of diachronic processes starting from diatopically distributed synchronic dialectal data.}, KEYWORDS = {Dialectometry, Phonetic Variation, Tuscan Dialects}, PAGES = {120-121}, URL = {http://westernlinguistics.ca/methods14/files/all_abstracts_one_document.pdf}, CONFERENCE_NAME = {Fourteenth Methods in Dialectology Conference}, CONFERENCE_PLACE = {University of Western Ontario}, CONFERENCE_DATE = {2-6 August 2011}, } @TECHREPORT{MONTEMAGNI_2011_TECHREPORT_MW_206506, AUTHOR = {Montemagni, S. 
and Wieling, M.}, TITLE = {Definizione di un modello computazionale della variazione dialettale basato sull'integrazione di fattori socio-demografici e geografici}, YEAR = {2011}, ABSTRACT = {In this study, we used a mixed-effects logistic regression model in combination with generalized additive logistic modeling to predict lexical differences in Tuscan dialects with respect to standard Italian. We used lexical information for 170 concepts in 213 locations in Tuscany. Although geographical position is an important predictor with locations distant from Florence having lexical forms more likely to differ from standard Italian, several other factors emerged as significant. The model predicts that lexical variants used by older speakers and in smaller as well as poorer communities are more likely to differ from standard Italian. The impact of the demographic variables, however, varied from concept to concept. For a majority of concepts, smaller and poorer communities have lexical forms different from standard Italian. For a smaller minority of concepts, however, larger and richer communities have lexical forms different from standard Italian. Similarly, the effect of speaker age and the average community age also varied per concept. While not significant as a fixed effect, the concept frequency showed significant geographical variation. These results clearly identify important factors involved in dialect variation at the lexical level. In addition, this study illustrates the usefulness of mixed-effects regression techniques together with generalized additive modeling for analyzing lexical dialect data.}, KEYWORDS = {Dialettologia toscana, Dialettometria, variazione lessicale}, URL = {https://publications.cnr.it/doc/206506}, } @ARTICLE{BONIN_2010_ARTICLE_BDVM_278419, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Venturi, G. 
and Montemagni, S.}, TITLE = {Singling out Legal Knowledge from World Knowledge}, YEAR = {2010}, PAGES = {217-229}, URL = {https://publications.cnr.it/doc/278419}, PUBLISHER = {Edizioni Scientifiche Italiane (Firenze, Italia)}, ISSN = {0390-0975}, JOURNAL = {Informatica e diritto}, } @ARTICLE{FRANCESCONI_2010_ARTICLE_FMPT_30888, AUTHOR = {Francesconi, E. and Montemagni, S. and Peters, W. and Tiscornia, D.}, TITLE = {Integrating a Bottom-Up and Top-Down Methodology for Building Semantic Resources for the Multilingual Legal Domain}, YEAR = {2010}, ABSTRACT = {This article presents a methodology for multilingual legal knowledge acquisition and modelling. It encompasses two complementary strategies. On the one hand, there is the top-down definition of the conceptual structure of the legal domain under consideration on the basis of expert judgment. This structure is language-independent, modeled as an ontology, and can be aligned with other ontologies that capture similar or complementary knowledge, in order to provide a wider conceptual embedding. Another top-down approach is the exploitation of the explicit structure of legal texts, which enables the targeted identification of text spans that play an ontological role and their subsequent inclusion in the knowledge model. On the other hand, the linguistically motivated, text-based bottom-up population and incremental refinement of this conceptual structure using (semi-)automatic NLP techniques, maximizes the completeness and domain-specificity of the resulting knowledge. The proposed methodology is concerned with the relation between these two differently derived types of knowledge, and defines a framework for interfacing lexical and ontological knowledge, the result of which offers various perspectives on multilingual legal knowledge. 
Two case-studies combining bottom-up and top-down methodologies for knowledge modelling and learning are presented as illustrations of the methodology.}, KEYWORDS = {Knowledge Modelling, Knowledge Acquisition, Natural Language Processing, Ontology Learning}, PAGES = {95-121}, URL = {https://publications.cnr.it/doc/30888}, VOLUME = {6036}, PUBLISHER = {Springer (Berlin, Germania)}, ISSN = {0302-9743}, JOURNAL = {Lecture notes in computer science}, } @BOOK{FRANCESCONI_2010_BOOK_FMPT_170395, AUTHOR = {Francesconi, E. and Montemagni, S. and Peters, W. and Tiscornia, D.}, TITLE = {Semantic Processing of Legal Texts: Where the Language of Law Meets the Law of Language}, YEAR = {2010}, ABSTRACT = {The last few years have seen a growing body of research and practice addressing aspects such as automated legal reasoning and argumentation, semantic and cross-language legal information retrieval, document classification, legal drafting, legal knowledge discovery and extraction. This State-of-the-Art Survey contains invited contributions of leading researchers and groups eminently active in the field, which were complemented with selected papers from the Workshop on Semantic Processing of Legal Texts, held in Marrakech, Morocco, in 2008, within the framework of the Sixth International Conference on Language Resources and Evaluation (LREC 2008). These publications mirror the state-of-the-art in linguistic technologies, tools and resources focusing on the automatic extraction of relevant information from legal texts, and the structured organization of this extracted knowledge for legal knowledge representation and scholarly activity, with particular emphasis on the crucial role played by language resources and human language technologies. 
The contents are organized in three topical sections on information extraction; construction of knowledge resources; and semantic indexing, summarization and translation.}, KEYWORDS = {Legal Text Processing, Ontology Learning, Information Extraction}, URL = {https://publications.cnr.it/doc/170395}, } @EDITORIAL{FRANCESCONI_2010_EDITORIAL_FMPT_186091, AUTHOR = {Francesconi, E. and Montemagni, S. and Peters, W. and Tiscornia, D.}, TITLE = {Semantic Processing of Legal Texts: Where the Language of Law Meets the Law of Language}, YEAR = {2010}, ABSTRACT = {The last few years have seen a growing body of research and practice addressing aspects such as automated legal reasoning and argumentation, semantic and cross-language legal information retrieval, document classification, legal drafting, legal knowledge discovery and extraction. This State-of-the-Art Survey contains invited contributions of leading researchers and groups eminently active in the field, which were complemented with selected papers from the Workshop on Semantic Processing of Legal Texts, held in Marrakech, Morocco, in 2008, within the framework of the Sixth International Conference on Language Resources and Evaluation (LREC 2008). These publications mirror the state-of-the-art in linguistic technologies, tools and resources focusing on the automatic extraction of relevant information from legal texts, and the structured organization of this extracted knowledge for legal knowledge representation and scholarly activity, with particular emphasis on the crucial role played by language resources and human language technologies. The contents are organized in three topical sections on information extraction; construction of knowledge resources; and semantic indexing, summarization and translation.}, PAGES = {249}, URL = {https://publications.cnr.it/doc/186091}, VOLUME = {6036}, ISBN = {978-3-642-12836-3}, } @EDITORIAL{FRANCESCONI_2010_EDITORIAL_FMPW_136477, AUTHOR = {Francesconi, E. and Montemagni, S. 
and Peters, W. and Wyner, A.}, TITLE = {Proceedings of the LREC 2010 Workshop on SEMANTIC PROCESSING OF LEGAL TEXTS (SPLeT-2010)}, YEAR = {2010}, KEYWORDS = {Legal Knowledge Extraction, Natural Language Processing}, URL = {https://publications.cnr.it/doc/136477}, } @EDITORIAL{FRANCESCONI_2010_EDITORIAL_FMRT_136476, AUTHOR = {Francesconi, E. and Montemagni, S. and Rossi, P. and Tiscornia, D.}, TITLE = {Proceedings of the 4th Workshop on Legal Ontologies and Artificial Intelligence Techniques (LOAIT 2010)}, YEAR = {2010}, KEYWORDS = {Legal Ontologies, Ontology Learning, Legal Knowledge Extraction, Legal Knowledge Modelling}, URL = {https://publications.cnr.it/doc/136476}, } @INPROCEEDINGS{ATTARDI_2010_INPROCEEDINGS_ADDLMS_84775, AUTHOR = {Attardi, G. and Dei Rossi, S. and Di Pietro, G. and Lenci, A. and Montemagni, S. and Simi, M.}, TITLE = {A Resource and Tool for Super-sense Tagging of Italian Texts}, YEAR = {2010}, KEYWORDS = {Corpus (creation, annotation, etc.), Tools, Systems, Applications, Statistical and machine learning methods}, URL = {https://publications.cnr.it/doc/84775}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Valletta, Malta}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{BONIN_2010_INPROCEEDINGS_BDMV_84796, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {A Contrastive Approach to Multi-word Extraction from Domain-specific Corpora}, YEAR = {2010}, ABSTRACT = {In this paper we present a novel approach to multi-word terminology extraction combining a well-known automatic term recognition approach, the C-NC value method, with a contrastive ranking technique, aimed at refining obtained results either by filtering noise due to common words or by discerning between semantically different types of terms within heterogeneous terminologies. 
The proposed methodology has been tested in two case studies carried out in the History of Art and Legal domains with promising results.}, KEYWORDS = {Terminology Extraction, Domain-specific Corpora, Multi-word Expression}, PAGES = {3222-3229}, URL = {https://publications.cnr.it/doc/84796}, ISBN = {2-9517408-6-7}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Valletta, Malta}, CONFERENCE_DATE = {19-21 maggio 2010}, } @INPROCEEDINGS{BONIN_2010_INPROCEEDINGS_BDVM_84802, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Venturi, G. and Montemagni, S.}, TITLE = {Contrastive filtering of domain specific multi-word terms from different types of corpora}, YEAR = {2010}, ABSTRACT = {In this paper we tackle the challenging task of Multi-word term (MWT) extraction from different types of specialized corpora. Contrastive filtering of previously extracted MWTs results in a considerable increment of acquired domain-specific terms.}, KEYWORDS = {multi-word terms extraction, corpora}, PAGES = {76-79}, URL = {https://publications.cnr.it/doc/84802}, ISBN = {978-7-900268-00-6}, CONFERENCE_NAME = {The 23rd International Conference on Computational Linguistics (COLING 2010). Multiword Expressions: from Theory to Applications (MWE 2010)}, CONFERENCE_PLACE = {Beijing, China}, CONFERENCE_DATE = {28 agosto 2010}, } @INPROCEEDINGS{BOSCO_2010_INPROCEEDINGS_BMMDL_84799, AUTHOR = {Bosco, C. and Montemagni, S. and Mazzei, A. and Dell'Orletta, F. and Lenci, A.}, TITLE = {Evalita'09 Parsing Task: comparing dependency parsers and treebanks}, YEAR = {2010}, KEYWORDS = {dependency parsing, dependency treebank}, URL = {https://publications.cnr.it/doc/84799}, CONFERENCE_NAME = {Evaluation of NLP and Speech Tools for Italian. EVALITA 2009}, CONFERENCE_PLACE = {Reggio Emilia, Italy}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{BOSCO_2010_INPROCEEDINGS_BMMLDLLASLHNN_84789, AUTHOR = {Bosco, C. and Montemagni, S. and Mazzei, A. and Lombardo, V. 
and Dell'Orletta, F. and Lenci, A. and Lesmo, L. and Attardi, G. and Simi, M. and Lavelli, A. and Hall, J. and Nilsson, J. and Nivre, J.}, TITLE = {Comparing the Influence of Different Treebank Annotations on Dependency Parsing}, YEAR = {2010}, KEYWORDS = {Parsing, Corpus (creation, annotation, etc.), Evaluation methodologies}, URL = {https://publications.cnr.it/doc/84789}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Valletta, Malta}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_M_84772, AUTHOR = {Montemagni, S.}, TITLE = {Esplorazioni computazionali nello spazio della variazione lessicale in Toscana}, YEAR = {2010}, ABSTRACT = {Il passaggio dalla descrizione della distribuzione geografica di singole parole a un livello di descrizione più astratto volto a formulare generalizzazioni relative alla variazione diatopica è oggi reso possibile dal ricorso a tecnologie linguistico-computazionali affiancate da tecniche di analisi statistica multivariata. L'uso combinato di queste tecniche si è dimostrato particolarmente promettente nello studio della variazione linguistica (principalmente fonetica e lessicale) di diverse lingue e dialetti, tipologicamente anche molto distanti. Tali tecniche sono state anche proficuamente utilizzate per l'analisi del contatto tra varietà linguistiche e una norma di riferimento. L'articolo si colloca all'interno di questo filone di ricerca, riportando i risultati di esplorazioni computazionali nello spazio della variazione lessicale in Toscana. Tali esplorazioni intendono ripercorrere i passi di Gabriella Giacomelli, ideatrice e direttrice dell'impresa dell'Atlante Lessicale Toscano (ALT) e profonda conoscitrice della realtà dialettale toscana, nel suo studio sulle aree lessicali toscane (Giacomelli 1975). 
Questa rivisitazione dello studio sulle aree lessicali toscane di Giacomelli si è avvalsa, più di tre decenni dopo, di due importanti elementi di novità, ovvero: i) sul versante dei dati, si è basata sull'intero corpus dei materiali dialettali dell'ALT disponibili nel sito di ALT-Web (http://serverdbt.ilc.cnr.it/altweb); ii) sul versante degli strumenti di analisi, è stata condotta attraverso l'uso combinato di tecnologie linguistiche e tecniche di analisi statistica multivariata che rendono possibile un'analisi aggregata di corpora di materiali dialettali anche di vaste dimensioni. Lo studio si focalizza su due dei tre aspetti indicati come fondamentali da Giacomelli per l'analisi delle aree lessicali toscane, ovvero quello dei "rapporti interni, tra aree subregionali" e quello dei "rapporti con la lingua".}, KEYWORDS = {Computational Dialectology, Lexical Variation}, PAGES = {609-634}, URL = {https://publications.cnr.it/doc/84772}, PUBLISHER = {Centro Editoriale e Librario (Arcavacata di Rende, ITA)}, ISBN = {9788874581030}, CONFERENCE_NAME = {Convegno 'Parole. Il lessico come strumento per organizzare e trasmettere gli etnosaperi'}, CONFERENCE_PLACE = {Rende, Università della Calabria}, CONFERENCE_DATE = {2-4 luglio 2009}, BOOKTITLE = {Parole. Il lessico come strumento per organizzare e trasmettere gli etnosaperi}, EDITOR = {Prantera, N. and Mendicino, A. and Citraro, C.}, } @INPROCEEDINGS{BONIN_2010_INPROCEEDINGS_BDMV_112966, AUTHOR = {Bonin, F. and Dell'Orletta, F. and Montemagni, S. and Venturi, G.}, TITLE = {Lessico settoriale e lessico comune nell'estrazione di terminologia specialistica da corpora di dominio}, YEAR = {2010}, KEYWORDS = {Automatic Term Extraction}, URL = {https://publications.cnr.it/doc/112966}, CONFERENCE_NAME = {XLIV Congresso Internazionale di Studi della Società di Linguistica Italiana}, CONFERENCE_PLACE = {Viterbo, Università degli Studi della Tuscia}, } @INPROCEEDINGS{DELLORLETTA_2010_INPROCEEDINGS_DMVV_173723, AUTHOR = {Dell'Orletta, F. 
and Montemagni, S. and Vecchi, E. M. and Venturi, G.}, TITLE = {Tecnologie linguistico-computazionali per il monitoraggio delle competenze linguistiche di apprendenti l'italiano come L2}, YEAR = {2010}, KEYWORDS = {Natural Language Processing, Educational Linguistics, Language Learning}, URL = {https://publications.cnr.it/doc/173723}, CONFERENCE_NAME = {Congresso "IT. L2: italiano lingua seconda nell'università, nella scuola e sul territorio. Esperienze didattiche e ricerche" Università del Piemonte Orientale "Amedeo Avogadro", Facoltà di Lettere e Filosofia}, CONFERENCE_PLACE = {Vercelli}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_M_112955, AUTHOR = {Montemagni, S.}, TITLE = {Ontology Learning. An introduction}, YEAR = {2010}, KEYWORDS = {Legal Text Processing, Ontology Learning, NLP}, URL = {https://publications.cnr.it/doc/112955}, CONFERENCE_NAME = {Summer School LEX2010-Managing Legal Resources in the Semantic Web, Session "Ontology in the Legal Domain"}, CONFERENCE_PLACE = {Ravenna}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_M_112957, AUTHOR = {Montemagni, S.}, TITLE = {Tecnologie linguistico-computazionali per il monitoraggio della lingua italiana}, YEAR = {2010}, KEYWORDS = {Language Variation, Natural Language Processing}, URL = {https://publications.cnr.it/doc/112957}, CONFERENCE_NAME = {Giornata di Studio "Lo stato della lingua. 
Il CNR e l'italiano nel terzo millennio" organizzata dal Consiglio Nazionale delle Ricerche-Dipartimento Identità Culturale}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_M_112958, AUTHOR = {Montemagni, S.}, TITLE = {The BioLexicon: a Large-Scale Domain-Specific Lexical Resource for Biomedical Text Mining}, YEAR = {2010}, KEYWORDS = {Text Mining, Knowledge Extraction, Lexical Resources}, URL = {https://publications.cnr.it/doc/112958}, CONFERENCE_NAME = {LREC 2010 2nd Workshop on Building and evaluating resources for biomedical text mining}, CONFERENCE_PLACE = {Malta}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_M_112962, AUTHOR = {Montemagni, S.}, TITLE = {Design, Construction and Use of an Italian Dependency Treebank: Methodological Issues and Empirical Results}, YEAR = {2010}, KEYWORDS = {Syntactic Annotation, Treebanks}, URL = {https://publications.cnr.it/doc/112962}, CONFERENCE_NAME = {The Copenhagen Dependency Treebank Workshop on "Designing Treebanks"}, CONFERENCE_PLACE = {Copenhagen (DK)}, CONFERENCE_DATE = {2010}, } @INPROCEEDINGS{MONTEMAGNI_2010_INPROCEEDINGS_MWDN_112967, AUTHOR = {Montemagni, S. and Wieling, M. and De Jonge, B. 
and Nerbonne, J.}, TITLE = {Modelli di variazione dialettale e analisi dei tratti linguistici sottostanti: un nuovo approccio dialettometrico}, YEAR = {2010}, KEYWORDS = {Computational dialectology}, URL = {https://publications.cnr.it/doc/112967}, CONFERENCE_NAME = {XI Congresso Silfi-Congresso della Società Internazionale di Linguistica e Filologia Italiana}, CONFERENCE_PLACE = {Napoli}, CONFERENCE_DATE = {2010}, } @TECHREPORT{MONTEMAGNI_2010_TECHREPORT_M_157485, AUTHOR = {Montemagni, S.}, TITLE = {Computational Models of Dialectal Variation and Underlying Linguistic Features}, YEAR = {2010}, KEYWORDS = {Computational Dialectology, Language Variation}, URL = {https://publications.cnr.it/doc/157485}, } @TECHREPORT{PIRRELLI_2010_TECHREPORT_PLMDGM_367784, AUTHOR = {Pirrelli, V. and Lenci, A. and Montemagni, S. and Dell'Orletta, F. and Giovannetti, E. and Marchi, S.}, TITLE = {ConnectToLife (modulo semantico)-Rapporto tecnico finale}, YEAR = {2010}, ABSTRACT = {Il presente documento costituisce il rapporto tecnico finale del progetto Connect-To-Life (modulo semantico) relativo alle attività svolte dall'unità ILC-CNR.}, KEYWORDS = {annotazione linguistica, estrazione di termini, clustering semantico, trattamento automatico della lingua, costruzione di ontologie}, PAGES = {16}, URL = {https://publications.cnr.it/doc/367784}, } @INCOLLECTION{AGNOLONI_2009_INCOLLECTION_ABFPMV_173012, AUTHOR = {Agnoloni, T. and Bacci, L. and Francesconi, E. and Peters, W. and Montemagni, S. and Venturi, G.}, TITLE = {A two-level Knowledge approach to support multilingual legislative drafting}, YEAR = {2009}, KEYWORDS = {DALOS project, Ontological-linguistic}, URL = {https://publications.cnr.it/doc/173012}, } @INCOLLECTION{DELLORLETTA_2009_INCOLLECTION_DLMMP_184585, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. 
and Pirrelli, V.}, TITLE = {Text-2-Knowledge: una piattaforma linguistico-computazionale per l'estrazione di conoscenza da testi}, YEAR = {2009}, ABSTRACT = {The paper describes the automatic extraction of domain knowledge from Italian document collections and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge.}, KEYWORDS = {Term extraction, Ontology Learning}, PAGES = {285-300}, URL = {https://publications.cnr.it/doc/184585}, PUBLISHER = {Bulzoni (Roma, ITA)}, ISBN = {978-88-7870-469-5}, EDITOR = {Ferrari, G. and Benatti, R. and Mosca, M.}, } @INCOLLECTION{LENCI_2009_INCOLLECTION_LMP_186141, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Annotazione sintattica di corpora: aspetti metodologici}, YEAR = {2009}, ABSTRACT = {Un assunto sempre più condiviso nell'ambito degli studi sull'acquisizione sia di L1 che di L2 è che l'evidenza empirica privilegiata debba essere rappresentata da corpora di produzioni scritte o orali degli apprendenti, estensivamente annotate a molteplici livelli di rappresentazione linguistica. Più in generale, corpora lemmatizzati e annotati a livello morfosintattico fanno ormai parte dello strumentario comune del linguista. Accanto ad essi, si fa però strada l'esigenza di disporre di risorse testuali più sofisticate dal punto di vista delle modalità di esplorazione linguistica, come ad esempio corpora annotati a livello sintattico (le cosiddette treebank). 
Questi consentono infatti di osservare i processi di convergenza degli apprendenti verso la lingua "obiettivo" anche a livello di specifici tratti grammaticali astratti o di macro-strutture linguistiche. L'articolo propone uno schema di annotazione sintattica caratterizzato da un doppio livello di codifica. Si tratta di un approccio originale che differisce dalla maggior parte degli schemi di annotazione sintattica esistenti per due aspetti: 1. la separazione della dimensione relazionale da quella a costituenti, che sono trattati a livelli di annotazione indipendenti, ma al tempo stesso correlati, in modo tale che lo stesso testo è simultaneamente interrogabile ai due livelli; 2. la rappresentazione a costituenti fornisce una rappresentazione del testo come sequenza di proto-costituenti sintagmatici non ricorsivi. Questa strategia di annotazione permette una fattorizzazione di diversi aspetti e dimensioni della struttura sintattica che risulta promettente da un lato per l'annotazione di corpora di lingua "non-standard" come quelli contenenti produzioni di apprendenti di L1 o L2, sia come punto di partenza per successivi processi di estrazione di informazione linguistica dal testo. Dopo aver illustrato le motivazioni sottostanti allo schema proposto, ciascun livello di rappresentazione (chunking e dipendenze funzionali) viene illustrato in dettaglio, mostrandone anche la possibilità di combinazione sullo stesso testo. L'articolo si chiude con la discussione di prospettive di uso di corpora annotati secondo lo schema di annotazione proposto.}, KEYWORDS = {Corpora annotati, annotazione sintattica}, PAGES = {25-46}, URL = {https://publications.cnr.it/doc/186141}, PUBLISHER = {Guerra Edizioni (Perugia, ITA)}, ISBN = {978-88-557-0168-6}, BOOKTITLE = {CORPORA DI ITALIANO L2: TECNOLOGIE, METODI, SPUNTI TEORICI}, EDITOR = {Andorno, C. and Rastelli, S.}, } @INCOLLECTION{LENCI_2009_INCOLLECTION_LMPV_136465, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V.
and Venturi, G.}, TITLE = {Ontology learning from Italian legal texts}, YEAR = {2009}, ABSTRACT = {The paper reports on the methodology and preliminary results of a case study in automatically extracting ontological knowledge from Italian legislative texts. We use a fully-implemented ontology learning system (T2K) that includes a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine language learning. Tools are dynamically integrated to provide an incremental representation of the content of vast repositories of unstructured documents. Evaluated results, however preliminary, show the great potential of NLP-powered incremental systems like T2K for accurate large-scale semi-automatic extraction of legal ontologies.}, KEYWORDS = {Ontology Learning, document management, legal knowledge extraction}, PAGES = {75-94}, URL = {https://publications.cnr.it/doc/136465}, VOLUME = {188}, DOI = {10.3233/978-1-58603-942-4-75}, ISBN = {978-1-58603-942-4}, BOOKTITLE = {Law, Ontologies and the Semantic Web-Channelling the Legal Information Flood}, EDITOR = {Breuker, J. and Casanovas, P. and Klein, M. C. A. and Francesconi, E.}, } @EDITORIAL{CASELLAS_2009_EDITORIAL_CFHM_143540, AUTHOR = {Casellas, N. and Francesconi, E. and Hoekstra, R. and Montemagni, S.}, TITLE = {Proceedings of the 3rd Workshop on Legal Ontologies and Artificial Intelligence Techniques (LOAIT '09) joint with the 2nd Workshop on Semantic Processing of Legal Texts}, YEAR = {2009}, URL = {https://publications.cnr.it/doc/143540}, VOLUME = {2}, } @INPROCEEDINGS{VENTURI_2009_INPROCEEDINGS_VLMVSTA_173712, AUTHOR = {Venturi, G. and Lenci, A. and Montemagni, S. and Vecchi, E. M. and Sagri, M. T. and Tiscornia, D.
and Agnoloni, T.}, TITLE = {Towards a FrameNet Resource for the Legal Domain}, YEAR = {2009}, KEYWORDS = {Frame Semantics, Legal Ontologies, Knowledge Representation, Corpus Annotation}, URL = {https://publications.cnr.it/doc/173712}, CONFERENCE_NAME = {3rd Workshop on Legal Ontologies and Artificial Intelligence Techniques joint with 2nd Workshop on Semantic Processing of Legal Texts}, CONFERENCE_PLACE = {Barcelona, Spain}, CONFERENCE_DATE = {2009}, } @INPROCEEDINGS{VENTURI_2009_INPROCEEDINGS_VMMSTMA_84736, AUTHOR = {Venturi, G. and Montemagni, S. and Marchi, S. and Sasaki, Y. and Thompson, P. and McNaught, J. and Ananiadou, S.}, TITLE = {Bootstrapping a Verb Lexicon for Biomedical Information Extraction}, YEAR = {2009}, ABSTRACT = {The extraction of information from texts requires resources that contain both syntactic and semantic properties of lexical units. As the use of language in specialized domains, such as biology, can be very different to the general domain, there is a need for domain-specific resources to ensure that the information extracted is as accurate as possible. We are building a large-scale lexical resource for the biology domain, providing information about predicate-argument structure that has been bootstrapped from a biomedical corpus on the subject of E. Coli. The lexicon is currently focussed on verbs, and includes both automatically-extracted syntactic subcategorization frames, as well as semantic event frames that are based on annotation by domain experts. In addition, the lexicon contains manually-added explicit links between semantic and syntactic slots in corresponding frames.
To our knowledge, this lexicon currently represents a unique resource within the biomedical domain.}, KEYWORDS = {domain-specific lexical resources, Biological Language Processing, syntax-semantic linking}, PAGES = {137-148}, URL = {https://publications.cnr.it/doc/84736}, DOI = {10.1007/978-3-642-00382-0_11}, PUBLISHER = {Springer-Verlag (Berlin Heidelberg, DEU)}, ISBN = {9783642003813}, CONFERENCE_NAME = {10th International Conference on Intelligent Text Processing and Computational Linguistics}, CONFERENCE_PLACE = {Mexico City, Mexico}, CONFERENCE_DATE = {1-7/03/2009}, } @INPROCEEDINGS{SPINOSA_2009_INPROCEEDINGS_SGCMVM_130118, AUTHOR = {Spinosa, P. and Giardiello, G. and Cherubini, M. and Marchi, S. and Venturi, G. and Montemagni, S.}, TITLE = {NLP-based Metadata Extraction for Legal Text Consolidation}, YEAR = {2009}, KEYWORDS = {Natural Language Processing, textual amendments, XML representation, metadata extraction, consolidation of legal text}, URL = {https://publications.cnr.it/doc/130118}, CONFERENCE_NAME = {Twelfth International Conference on Artificial Intelligence and Law (ICAIL 2009)}, CONFERENCE_PLACE = {Barcelona}, CONFERENCE_DATE = {June 8-12, 2009}, } @INPROCEEDINGS{VENTURI_2009_INPROCEEDINGS_VMMSTMA_112956, AUTHOR = {Venturi, G. and Montemagni, S. and Marchi, S. and Sasaki, Y. and Thompson, P. and McNaught, J. and Ananiadou, S.}, TITLE = {Bootstrapping a Verb Lexicon for Biomedical Information Extraction}, YEAR = {2009}, ABSTRACT = {The extraction of information from texts requires resources that contain both syntactic and semantic properties of lexical units. As the use of language in specialized domains, such as biology, can be very different to the general domain, there is a need for domain-specific resources to ensure that the information extracted is as accurate as possible. We are building a large-scale lexical resource for the biology domain,
providing information about predicate-argument structure that has been bootstrapped from a biomedical corpus on the subject of E. Coli. The lexicon is currently focussed on verbs, and includes both automatically-extracted syntactic subcategorization frames, as well as semantic event frames that are based on annotation by domain experts. In addition, the lexicon contains manually-added explicit links between semantic and syntactic slots in corresponding frames. To our knowledge, this lexicon currently represents a unique resource within the biomedical domain.}, KEYWORDS = {domain-specific lexical resources, lexical acquisition, syntax-semantics linking, Information Extraction, Biological Language Processing}, PAGES = {137-148}, URL = {https://publications.cnr.it/doc/112956}, VOLUME = {5449}, PUBLISHER = {Springer (Berlin, Germania)}, ISSN = {0302-9743}, ISBN = {978-3-642-00381-3}, CONFERENCE_NAME = {International Conference on Intelligent Text Processing and Computational Linguistics (CICLing 2009)}, CONFERENCE_PLACE = {Mexico City, Mexico}, CONFERENCE_DATE = {March 1-7, 2009}, BOOKTITLE = {Proceedings of the 10th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing 2009)}, EDITOR = {Gelbukh, A.}, } @MISC{CASELLAS_2009_MISC_CFHM_157461, AUTHOR = {Casellas, N. and Francesconi, E. and Hoekstra, R. and Montemagni, S.}, TITLE = {3rd Workshop on Legal Ontologies and Artificial Intelligence Techniques joint with 2nd Workshop on Semantic Processing of Legal Texts}, YEAR = {2009}, KEYWORDS = {Legal Ontologies, Computational Semantics}, URL = {https://publications.cnr.it/doc/157461}, } @ARTICLE{DELLORLETTA_2008_ARTICLE_DLMMPV_64541, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.
and Venturi, G.}, TITLE = {Dal testo alla conoscenza e ritorno: estrazione terminologica e annotazione semantica di basi documentali di dominio}, YEAR = {2008}, ABSTRACT = {The paper focuses on the automatic extraction of domain knowledge from Italian legal texts and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge.}, KEYWORDS = {Natural Language Processing, Machine Learning, Knowledge extraction from texts, Ontology learning, Legal ontologies}, PAGES = {197-218}, URL = {https://publications.cnr.it/doc/64541}, VOLUME = {26}, PUBLISHER = {Aida (Roma, Italia)}, ISSN = {1594-2201}, JOURNAL = {Aida Informazioni (Online)}, } @ARTICLE{MONTEMAGNI_2008_ARTICLE_M_64543, AUTHOR = {Montemagni, S.}, TITLE = {The space of Tuscan dialectal variation. A correlation study}, YEAR = {2008}, ABSTRACT = {The paper illustrates the results of a correlation study focusing on linguistic variation in an Italian region, Tuscany. By exploiting a multi-level representation scheme of dialectal data, the study analyses attested patterns of phonetic and morpho-lexical variation with the aim of testing the degree of correlation between a) phonetic and morpho-lexical variation, and b) linguistic variation and geographic distance. The correlation analysis was performed by combining two complementary approaches proposed in dialectometric literature, namely by computing both global and place-specific correlation measures and by inspecting their spatial distribution. 
Achieved results demonstrate that phonetic and morpho-lexical variations in Tuscany seem to follow a different pattern than encountered in previous studies.}, KEYWORDS = {Computational dialectology, Dialectometry}, PAGES = {135-152}, URL = {http://www.euppublishing.com/doi/abs/10.3366/E1753854809000354}, VOLUME = {2}, DOI = {10.3366/E1753854809000354}, PUBLISHER = {Edinburgh University Press for the Association for History and Computing (Edinburgh, Regno Unito)}, ISSN = {1753-8548}, JOURNAL = {International journal of humanities and arts computing (Print)}, } @INCOLLECTION{MONTEMAGNI_2008_INCOLLECTION_M_136460, AUTHOR = {Montemagni, S.}, TITLE = {Analisi linguistico-computazionali del corpus dialettale dell'Atlante Lessicale Toscano. Primi risultati sul rapporto toscano-italiano}, YEAR = {2008}, KEYWORDS = {Corpus dialettale}, URL = {https://publications.cnr.it/doc/136460}, PUBLISHER = {Pacini (Pisa, ITA)}, } @INPROCEEDINGS{DELLORLETTA_2008_INPROCEEDINGS_DLMMPV_84707, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V. and Venturi, G.}, TITLE = {Dal testo alla conoscenza e ritorno: estrazione terminologica e annotazione semantica di basi documentali di dominio}, YEAR = {2008}, ABSTRACT = {The paper focuses on the automatic extraction of domain knowledge from Italian legal texts and presents a fully-implemented ontology learning system (T2K, Text-2-Knowledge) that includes a battery of tools for Natural Language Processing, statistical text analysis and machine learning. 
Evaluated results show the considerable potential of systems like T2K, exploiting an incremental interleaving of NLP and machine learning techniques for accurate large-scale semi-automatic extraction and structuring of domain-specific knowledge.}, KEYWORDS = {Natural Language Processing, Machine Learning, Knowledge extraction from texts, Ontology learning, Legal ontologies}, PAGES = {197-218}, URL = {http://www.assiterm91.it/wp-content/uploads/2010/11/Convegno-2008.pdf}, VOLUME = {Anno 26, numero 1-2}, PUBLISHER = {Aida (Roma, Italia)}, ISSN = {1594-2201}, CONFERENCE_NAME = {Atti del Convegno Nazionale Ass. I. Term}, CONFERENCE_PLACE = {Arcavacata di Rende (CS)}, CONFERENCE_DATE = {5-7/06/2008}, BOOKTITLE = {Terminologia analisi testuale e documentazione nella città digitale}, } @INPROCEEDINGS{DELLORLETTA_2008_INPROCEEDINGS_DLMMPV_84698, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Marchi, S. and Pirrelli, V. and Venturi, G.}, TITLE = {Acquiring Legal Ontologies from Domain-specific Texts}, YEAR = {2008}, ABSTRACT = {The paper reports on methodology and preliminary results of a case study in automatically extracting ontological knowledge from Italian legislative texts in the environmental domain. We use a fully-implemented ontology learning system (T2K) that includes a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine language learning. Tools are dynamically integrated to provide an incremental representation of the content of vast repositories of unstructured documents.
Evaluated results, however preliminary, are very encouraging, showing the great potential of NLP-powered incremental systems like T2K for accurate large-scale semi-automatic extraction of legal ontologies.}, KEYWORDS = {Ontology learning, Document management, knowledge extraction from texts, Natural Language Processing}, PAGES = {98-101}, URL = {https://publications.cnr.it/doc/84698}, CONFERENCE_NAME = {LangTech 2008}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {28-29/02/2008}, } @INPROCEEDINGS{GIOVANNETTI_2008_INPROCEEDINGS_GMM_84706, AUTHOR = {Giovannetti, E. and Marchi, S. and Montemagni, S.}, TITLE = {Combining statistical techniques and lexico-syntactic patterns for semantic relations extraction from text}, YEAR = {2008}, ABSTRACT = {We describe here a methodology to combine two different techniques for Semantic Relation Extraction from texts. On the one hand, generic lexicosyntactic patterns are applied to the linguistically analyzed corpus to detect a first set of pairs of co-occurring words, possibly involved in "syntagmatic" relations. On the other hand, a statistical unsupervised association system is used to obtain a second set of pairs of "distributionally similar" terms, that appear to occur in similar contexts, thus possibly involved in "paradigmatic" relations. The approach aims at learning ontological information by filtering the candidate relations obtained through generic lexico-syntactic patterns and by labelling the anonymous relations obtained through the statistical system.
The resulting set of relations can be used to enrich existing ontologies and for semantic annotation of documents or web pages.}, KEYWORDS = {Ontology Learning from Text, Semantic Relation Extraction, Lexico-syntactic Patterns, Distributional Similarity}, URL = {http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-426/swap2008_submission_54.pdf}, CONFERENCE_NAME = {SWAP 2008-Semantic Web Applications and Perspectives}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {15-17 December 2008}, EDITOR = {Gangemi, A. and Keizer, J. and Presutti, V. and Stoermer, H.}, } @INPROCEEDINGS{GIOVANNETTI_2008_INPROCEEDINGS_GMMB_84726, AUTHOR = {Giovannetti, E. and Marchi, S. and Montemagni, S. and Bartolini, R.}, TITLE = {Ontology Learning and Semantic Annotation: a Necessary Symbiosis}, YEAR = {2008}, ABSTRACT = {Semantic annotation of text requires the dynamic merging of linguistically structured information and a "world model", usually represented as a domain-specific ontology. On the other hand, the process of engineering a domain-ontology through semi-automatic ontology learning system requires the availability of a considerable amount of semantically annotated documents. Facing this bootstrapping paradox requires an incremental process of annotation-acquisition-annotation, whereby domain-specific knowledge is acquired from linguistically-annotated texts and then projected back onto texts for extra linguistic information to be annotated and further knowledge layers to be extracted. The presented methodology is a first step in the direction of a full "virtuous" circle where the semantic annotation platform and the evolving ontology interact in symbiosis. As a case study we have chosen the semantic annotation of product catalogues. 
We propose a hybrid approach, combining pattern matching techniques to exploit the regular structure of product descriptions in catalogues, and Natural Language Processing techniques which are resorted to analyze natural language descriptions. The semantic annotation involves the access to the ontology, semi-automatically bootstrapped with an ontology learning tool from annotated collections of catalogues.}, KEYWORDS = {Information Extraction, Information Retrieval, Ontologies, Tools, Systems}, PAGES = {2079-2085}, URL = {http://www.lrec-conf.org/proceedings/lrec2008/}, PUBLISHER = {European Language Resources Association (ELRA)-Evaluations and Language resources Distribution Agency (ELDA) (Paris, FRA)}, ISBN = {2-9517408-4-0}, CONFERENCE_NAME = {LREC 2008, Sixth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Marrakech, Marocco}, CONFERENCE_DATE = {2008}, BOOKTITLE = {LREC 2008, Sixth International Conference on Language Resources and Evaluation}, EDITOR = {Calzolari, N. and Choukri, K. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S. and Tapias, D.}, } @INPROCEEDINGS{LENCI_2008_INPROCEEDINGS_LMPM_84730, AUTHOR = {Lenci, A. and McGillivray, B. and Pirrelli, V. and Montemagni, S.}, TITLE = {Unsupervised Acquisition of Verb Subcategorization Frames from Shallow-Parsed Corpora}, YEAR = {2008}, KEYWORDS = {Acquisition, Machine Learning, Corpus (creation, annotation, etc.), Lexicon, Lexical database}, URL = {https://publications.cnr.it/doc/84730}, CONFERENCE_NAME = {LREC 2008, Sixth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Marrakech, Marocco}, CONFERENCE_DATE = {2008}, } @INPROCEEDINGS{SASAKI_2008_INPROCEEDINGS_SMPRMA_84703, AUTHOR = {Sasaki, Y. and Montemagni, S. and Pezik, P. and Rebholz Schuhmann, D. and McNaught, J.
and Ananiadou, S.}, TITLE = {BioLexicon: A Lexical Resource for the Biology Domain}, YEAR = {2008}, KEYWORDS = {BioLexicon, Terminological verbs}, URL = {https://publications.cnr.it/doc/84703}, CONFERENCE_NAME = {Third International Symposium on Semantic Mining in Biomedicine}, CONFERENCE_PLACE = {Turku, Finland}, CONFERENCE_DATE = {2008}, } @INPROCEEDINGS{THOMPSON_2008_INPROCEEDINGS_TCAMMTV_84704, AUTHOR = {Thompson, P. and Cotter, P. and Ananiadou, S. and McNaught, J. and Montemagni, S. and Trabucco, A. and Venturi, G.}, TITLE = {Building a Bio-Event Annotated Corpus for the Acquisition of Semantic Frames from Biomedical Corpora}, YEAR = {2008}, KEYWORDS = {Corpus (creation, annotation, etc.), Text mining, Semantics, Event Extraction}, PAGES = {2159-2166}, URL = {https://publications.cnr.it/doc/84704}, ISBN = {2-9517408-4-0}, CONFERENCE_NAME = {LREC 2008, Sixth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Marrakech, Morocco}, CONFERENCE_DATE = {28-30 maggio 2008}, } @INPROCEEDINGS{THOMPSON_2008_INPROCEEDINGS_TVMMA_84705, AUTHOR = {Thompson, P. and Venturi, G. and McNaught, J. and Montemagni, S. and Ananiadou, S.}, TITLE = {Categorising Modality in Biomedical Texts}, YEAR = {2008}, ABSTRACT = {The accurate recognition of modal information is vital for the correct interpretation of statements. In this paper, we report on the collection of a list of words and phrases that express modal information in biomedical texts, and propose a categorisation scheme according to the type of information conveyed. We have performed a small pilot study through the annotation of 202 MEDLINE abstracts according to our proposed scheme.
Our initial results suggest that modality in biomedical statements can be predicted fairly reliably through the presence of particular lexical items, together with a small amount of contextual information.}, KEYWORDS = {Biomedical texts, Modality}, PAGES = {27-34}, URL = {https://publications.cnr.it/doc/84705}, ISBN = {2-9517408-4-0}, CONFERENCE_NAME = {LREC 2008, Sixth International Conference on Language Resources and Evaluation: Workshop 'Building and Evaluating Resources for Biomedical Text Mining'}, CONFERENCE_PLACE = {Marrakech, Marocco}, CONFERENCE_DATE = {26 maggio 2008}, } @INPROCEEDINGS{MONTEMAGNI_2008_INPROCEEDINGS_M_112936, AUTHOR = {Montemagni, S.}, TITLE = {Exploring the correlation between phonetic and lexical variation in Tuscany}, YEAR = {2008}, KEYWORDS = {Dialectal variation, ALT-Web}, URL = {https://publications.cnr.it/doc/112936}, CONFERENCE_NAME = {Thirteenth International Conference on Methods in Dialectology}, CONFERENCE_PLACE = {Leeds}, CONFERENCE_DATE = {2008}, } @INPROCEEDINGS{REBHOLZSCHUHMANN_2008_INPROCEEDINGS_RPLDKSMMMCA_112935, AUTHOR = {Rebholz Schuhmann, D. and Pezik, P. and Lee, V. and Del Gratta, R. and Kim, J. and Sasaki, Y. and McNaught, J. and Montemagni, S. and Monachini, M. and Calzolari, N. and Ananiadou, S.}, TITLE = {BioLexicon: Towards a reference terminological resource in the biomedical domain}, YEAR = {2008}, ABSTRACT = {The BioLexicon is a publicly available large-scale terminological resource which brings together potential terms from several resources representing selected semantic types (genes, proteins, chemicals, species, enzymes, selected ontological terms).
The schema of the BioLexicon enables improved resolution of term ambiguity and follows lexical standards for terminological resources.}, KEYWORDS = {BioLexicon}, URL = {https://publications.cnr.it/doc/112935}, ISBN = {978-1-61567-371-1}, CONFERENCE_NAME = {16th Annual International Conference on Intelligent Systems for Molecular Biology}, CONFERENCE_PLACE = {Toronto, Canada}, CONFERENCE_DATE = {19-23 Luglio 2008}, } @TECHREPORT{MONTEMAGNI_2008_TECHREPORT_M_157448, AUTHOR = {Montemagni, S.}, TITLE = {Augmented version of the bio-lexicon extended with bio event information and term-to-term weighted links}, YEAR = {2008}, KEYWORDS = {Bio-lexicon}, URL = {https://publications.cnr.it/doc/157448}, } @MISC{PIRRELLI_2008_MISC_PM_151569, AUTHOR = {Pirrelli, V. and Montemagni, S.}, TITLE = {AnITA}, YEAR = {2008}, KEYWORDS = {NLP Tools}, URL = {https://publications.cnr.it/doc/151569}, } @ARTICLE{DELLORLETTA_2007_ARTICLE_DFLMP_64537, AUTHOR = {Dell'Orletta, F. and Federico, M. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Maximum Entropy for Italian PoS Tagging}, YEAR = {2007}, ABSTRACT = {L'articolo illustra le prestazioni del ILC-UniPi MaxEnt PoS Tagger in Evalita 2007. The report contains a description of the ILC-UniPi MaxEnt PoS Tagger performance in Evalita 2007.}, PAGES = {10-11}, URL = {https://publications.cnr.it/doc/64537}, VOLUME = {IV(2)}, } @INCOLLECTION{DELLORLETTA_2007_INCOLLECTION_DLMP_136459, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Corpus-based Modelling of Grammar Variation}, YEAR = {2007}, KEYWORDS = {Grammar variation, stochastic parsing, linguistic typology}, PAGES = {38-55}, URL = {https://publications.cnr.it/doc/136459}, PUBLISHER = {Angeli (Milano, ITA)}, ISBN = {9788846489449}, BOOKTITLE = {Language resources and linguistic theory}, EDITOR = {Sansò, A.}, } @INPROCEEDINGS{AGNOLONI_2007_INPROCEEDINGS_ABFSTMV_171352, AUTHOR = {Agnoloni, T. and Bacci, L. and Francesconi, E. and Spinosa, P. 
and Tiscornia, D. and Montemagni, S. and Venturi, G.}, TITLE = {Building an ontological support for multilingual legislative drafting}, YEAR = {2007}, PAGES = {9-18}, URL = {https://publications.cnr.it/doc/171352}, CONFERENCE_NAME = {International Conference on Legal Knowledge and Information Systems (JURIX 2007)}, CONFERENCE_PLACE = {Leiden}, CONFERENCE_DATE = {2007}, BOOKTITLE = {Legal Knowledge and Information Systems}, EDITOR = {Lodder, A. R. and Mommers, L.}, } @INPROCEEDINGS{DELLORLETTA_2007_INPROCEEDINGS_DFLMP_84696, AUTHOR = {Dell'Orletta, F. and Federico, M. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Maximum Entropy for Italian PoS Tagging}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84696}, CONFERENCE_NAME = {Evaluation of NLP Tools for Italian-EVALITA 2007}, CONFERENCE_PLACE = {Roma}, } @INPROCEEDINGS{DELLORLETTA_2007_INPROCEEDINGS_DLMMP_84687, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.}, TITLE = {Text-2-Knowledge: una piattaforma linguistico-computazionale per l'estrazione di conoscenza da testi}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84687}, CONFERENCE_NAME = {XL Congresso Internazionale di Studi della Società di Linguistica Italiana (SLI 2006)}, CONFERENCE_PLACE = {Roma}, } @INPROCEEDINGS{GIOVANNETTI_2007_INPROCEEDINGS_GMMB_84690, AUTHOR = {Giovannetti, E. and Marchi, S. and Montemagni, S. and Bartolini, R.}, TITLE = {Ontology-based Semantic Annotation of Product Catalogues}, YEAR = {2007}, ABSTRACT = {This paper describes a methodology for the semantic annotation of product catalogues. We propose a hybrid approach, combining pattern matching techniques to exploit the regular structure of product descriptions in catalogues, and Natural Language Processing techniques which are resorted to analyze natural language descriptions.
It also includes the access to an application ontology, semi-automatically bootstrapped from collections of catalogues with an ontology learning tool, which is used to drive the semantic annotation process.}, KEYWORDS = {Semantic Annotation of texts, Ontology Learning, Information Extraction for e-commerce}, PAGES = {235-239}, URL = {https://publications.cnr.it/doc/84690}, CONFERENCE_NAME = {Recent Advances in Natural Language Processing (RANLP-2007)}, CONFERENCE_PLACE = {Borovets}, CONFERENCE_DATE = {27-29 settembre 2007}, BOOKTITLE = {Proceedings of the International Conference "Recent Advances in Natural Language Processing"}, } @INPROCEEDINGS{LENCI_2007_INPROCEEDINGS_LMPV_84693, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V. and Venturi, G.}, TITLE = {NLP-based ontology learning from legal texts. A case study}, YEAR = {2007}, ABSTRACT = {The paper reports on the methodology and preliminary results of a case study in automatically extracting ontological knowledge from Italian legislative texts in the environmental domain. We use a fully-implemented ontology learning system (T2K) that includes a battery of tools for Natural Language Processing (NLP), statistical text analysis and machine language learning. Tools are dynamically integrated to provide an incremental representation of the content of vast repositories of unstructured documents. 
Evaluated results, however preliminary, are very encouraging, showing the great potential of NLP-powered incremental systems like T2K for accurate large-scale semi-automatic extraction of legal ontologies.}, PAGES = {113-129}, URL = {https://publications.cnr.it/doc/84693}, CONFERENCE_NAME = {II Workshop on Legal Ontologies and Artificial Intelligence Techniques (LOAIT'07)}, CONFERENCE_PLACE = {Stanford}, CONFERENCE_DATE = {4 giugno 2007}, } @INPROCEEDINGS{MONTEMAGNI_2007_INPROCEEDINGS_M_84692, AUTHOR = {Montemagni, S.}, TITLE = {Patterns of phonetic variation in Tuscany: using dialectometric techniques on multi-level representations of dialectal data}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84692}, CONFERENCE_NAME = {International Workshop on Computational Phonology}, CONFERENCE_PLACE = {Borovets}, CONFERENCE_DATE = {2007}, } @INPROCEEDINGS{MONTEMAGNI_2007_INPROCEEDINGS_M_84694, AUTHOR = {Montemagni, S.}, TITLE = {Aree fonetiche e lessicali toscane a confronto: prime elaborazioni computazionali dei dati dell’Atlante Lessicale Toscano}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84694}, CONFERENCE_NAME = {XL Congresso Internazionale di Studi della Società di Linguistica Italiana}, CONFERENCE_PLACE = {Vercelli}, CONFERENCE_DATE = {2007}, } @INPROCEEDINGS{MONTEMAGNI_2007_INPROCEEDINGS_M_84695, AUTHOR = {Montemagni, S.}, TITLE = {Acquisizione automatica di termini da testi: primi esperimenti di estrazione e strutturazione di terminologia metalinguistica}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84695}, CONFERENCE_NAME = {DLM su Lessicologia e metalinguaggio}, CONFERENCE_PLACE = {Macerata}, CONFERENCE_DATE = {2007}, } @INPROCEEDINGS{SORIA_2007_INPROCEEDINGS_SBLMP_84682, AUTHOR = {Soria, C. and Bartolini, R. and Lenci, A. and Montemagni, S.
and Pirrelli, V.}, TITLE = {Automatic Extraction of Semantics in Law Documents}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/84682}, CONFERENCE_NAME = {V Legislative XML Workshop}, CONFERENCE_PLACE = {Firenze}, CONFERENCE_DATE = {2007}, } @TECHREPORT{AITMOKHTAR_2007_TECHREPORT_ABBDGGMSS_157418, AUTHOR = {Ait Mokhtar, S. and Barker, E. and Brunelli, R. and Demetriou, G. and Gaizauskas, R. and Giovannetti, E. and Montemagni, S. and Sándor, A. and Sun, H.}, TITLE = {Semantic Annotation Services for Virtual Information and Knowledge Environments}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157418}, } @TECHREPORT{BOUQUET_2007_TECHREPORT_BSMGSNSBCJ_157419, AUTHOR = {Bouquet, P. and Stoermer, H. and Montemagni, S. and Giovannetti, E. and Semeraro, G. and Niederee, C. and Stecher, R. and Brunelli, R. and Chanod, J. P. and Jacquin, T.}, TITLE = {Semantic Representation and Management Report}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157419}, } @TECHREPORT{MONTEMAGNI_2007_TECHREPORT_MMVBBRPT_157440, AUTHOR = {Montemagni, S. and Marchi, S. and Venturi, G. and Bartolini, R. and Bertagna, F. and Ruffolo, P. and Peters, W. and Tiscornia, D.}, TITLE = {Report on Ontology learning tool and testing}, YEAR = {2007}, ABSTRACT = {This deliverable documents the work done within the DALOS EU project for what concerns the definition and implementation of methodologies and techniques to bootstrap terminological and ontological knowledge from domain corpora. 
Starting from a corpus of legacy legislative texts in different languages, linguistic technologies combined with statistical techniques have been used to extract significant terms as well as to structure them in conceptual structures for the different languages dealt with within the project, namely Italian, English, Spanish and Dutch.}, KEYWORDS = {Ontology Learning, Term Extraction, Natural Language Processing, Conceptual Indexing}, URL = {https://publications.cnr.it/doc/157440}, } @TECHREPORT{MONTEMAGNI_2007_TECHREPORT_MS_157420, AUTHOR = {Montemagni, S. and Simi, M.}, TITLE = {The Italian dependency annotated corpus developed for the CoNLL-2007 Shared Task}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157420}, } @TECHREPORT{MONTEMAGNI_2007_TECHREPORT_MTV_157421, AUTHOR = {Montemagni, S. and Trabucco, A. and Venturi, G.}, TITLE = {Bio-Event Linguistic Annotation Tool. User Manual}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157421}, } @TECHREPORT{MONTEMAGNI_2007_TECHREPORT_MTVTCAMKRP_157422, AUTHOR = {Montemagni, S. and Trabucco, A. and Venturi, G. and Thompson, P. and Cotter, P. and Ananiadou, S. and McNaught, J. and Kim, J. and Rebholz, D. and Pezik, P.}, TITLE = {Event annotation of domain corpora}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157422}, } @TECHREPORT{SASAKI_2007_TECHREPORT_SMAPMMP_157423, AUTHOR = {Sasaki, Y. and McNaught, J. and Ananiadou, S. and Pezik, P. and McGillivray, B. and Montemagni, S. and Pirrelli, V.}, TITLE = {Augmented Version of Bio-Lexicon}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157423}, } @MISC{PICCHI_2007_MISC_PMCSP_157436, AUTHOR = {Picchi, E. and Montemagni, S. and Cucurullo, S. and Sassolini, E. and Paoli, M.}, TITLE = {ALT-Web. Sito dell’Atlante Lessicale Toscano (ALT) in rete}, YEAR = {2007}, URL = {https://publications.cnr.it/doc/157436}, } @INPROCEEDINGS{BARTOLINI_2006_INPROCEEDINGS_BGMMABSB_84664, AUTHOR = {Bartolini, R. and Giovannetti, E. and Marchi, S. and Montemagni, S. 
and Andreatta, C. and Brunelli, R. and Stecher, R. and Bouquet, P.}, TITLE = {Multimedia Information Extraction in Ontology-based Semantic Annotation of Product Catalogues}, YEAR = {2006}, ABSTRACT = {The demand for efficient methods for extracting knowledge from multimedia content has led to a growing research community investigating the convergence of multimedia and knowledge technologies. In this paper we describe a methodology for extracting multimedia information from product catalogues empowered by the synergetic use and extension of a domain ontology. The methodology was implemented in the Trade Fair Advanced Semantic Annotation Pipeline of the VIKE-framework.}, KEYWORDS = {Semantic Web Technologies, ontology creation, ontology extraction, ontology evolution, semantic annotation of multimedia content}, URL = {https://publications.cnr.it/doc/84664}, CONFERENCE_NAME = {SWAP 2006}, CONFERENCE_PLACE = {Pisa}, CONFERENCE_DATE = {18-20 December 2006}, } @INPROCEEDINGS{CUCURULLO_2006_INPROCEEDINGS_CMPPS_84629, AUTHOR = {Cucurullo, S. and Montemagni, S. and Paoli, M. and Picchi, E. and Sassolini, E.}, TITLE = {Dialectal resources on-line: the ALT-Web experience}, YEAR = {2006}, ABSTRACT = {The paper presents an on-line dialectal resource, ALT-Web, which gives access to the linguistic data of the Atlante Lessicale Toscano, a specially designed linguistic atlas in which lexical data have both a diatopic and diastratic characterisation. 
The paper focuses on: the dialectal data representation model; the access modalities to the ALT dialectal corpus; ontology-based search.}, KEYWORDS = {Computational dialectology, Dialectal databases, Construction of lexical resources}, PAGES = {1846-1851}, URL = {http://www.lrec-conf.org/lrec2006/}, VOLUME = {Proceedings}, ISBN = {2-9517408-2-4}, CONFERENCE_NAME = {LREC 2006: 5th International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Genoa}, CONFERENCE_DATE = {24-25-26 Maggio 2006}, BOOKTITLE = {Dialectal resources on-line: the ALT-Web experience}, } @INPROCEEDINGS{CUCURULLO_2006_INPROCEEDINGS_CMPPS_84661, AUTHOR = {Cucurullo, S. and Montemagni, S. and Paoli, M. and Picchi, E. and Sassolini, E.}, TITLE = {Atlante Dialettale in rete: ALT-Web}, YEAR = {2006}, ABSTRACT = {The paper presents an on-line dialectal resource, ALT-Web, which gives access to the linguistic data of the Lexical Atlas of Tuscany or Atlante Lessicale Toscano, a specially designed linguistic atlas in which lexical data have both a diatopic and diastratic characterisation. The paper illustrates ALT-Web with particular emphasis on: 1) the dialectal data representation model; 2) the access modalities to the ALT dialectal corpus designed to produce an output tailored to the specific needs of the different classes of users (both professionals and common citizens); 3) ontology-based search. These represent three main features which differentiate ALT-Web both from the previous digitalised ALT version and, most interestingly, from other on-line dialectal resources. 
At the time of writing, this is the first resource of this type in Italy, and one of the few at the international level.}, KEYWORDS = {dialectal resources, information retrieval}, PAGES = {661-672}, URL = {http://www.euralex.org/publications/}, VOLUME = {2}, PUBLISHER = {Edizioni dell'ORSO (Alessandria, ITA)}, ISBN = {8876949186}, CONFERENCE_NAME = {12° EURALEX International Congress}, CONFERENCE_PLACE = {Torino}, CONFERENCE_DATE = {6-9 Settembre 2006}, BOOKTITLE = {Proceedings in 12° EURALEX International Congress, Congresso internazionale di lessicografia}, EDITOR = {Corino, E. and Marello, C. and Onesti, C.}, } @INPROCEEDINGS{DELLORLETTA_2006_INPROCEEDINGS_DLMP_84630, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Searching treebanks for functional constraints: cross-lingual experiments in grammatical relation assignment}, YEAR = {2006}, URL = {https://publications.cnr.it/doc/84630}, CONFERENCE_NAME = {LREC 2006: 5th International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Genoa}, } @INPROCEEDINGS{DELLORLETTA_2006_INPROCEEDINGS_DLMP_84660, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Probing the space of grammatical variation: induction of cross-lingual grammatical constraints from treebanks}, YEAR = {2006}, ABSTRACT = {The paper reports on a detailed quantitative analysis of distributional language data of both Italian and Czech, highlighting the relative contribution of a number of distributed grammatical factors to sentence-based identification of subjects and direct objects. 
The work uses a Maximum Entropy model of stochastic resolution of conflicting grammatical constraints and is demonstrably capable of putting explanatory theoretical accounts to the test of usage-based empirical verification.}, PAGES = {21-28}, URL = {https://publications.cnr.it/doc/84660}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {1-932432-78-7}, CONFERENCE_NAME = {Coling/ACL 2006}, CONFERENCE_PLACE = {Sydney (Australia)}, CONFERENCE_DATE = {22 July 2006}, BOOKTITLE = {Proceedings of the Workshop on Frontiers in Linguistically Annotated Corpora 2006 (LAC 06)}, } @INPROCEEDINGS{MONTEMAGNI_2006_INPROCEEDINGS_MPP_84659, AUTHOR = {Montemagni, S. and Paoli, M. and Picchi, E.}, TITLE = {ALT-WEB: l’Atlante Lessicale Toscano in rete}, YEAR = {2006}, ABSTRACT = {Scopo dell'articolo è la presentazione di ALT-Web, ovvero l'Atlante Lessicale Toscano in rete. ALT-Web è stato ideato per rendere il patrimonio linguistico-culturale testimoniato dall'Atlante Lessicale Toscano una risorsa educativa realmente disponibile in modo che possa fornire un contributo alla conservazione della memoria dell'identità culturale toscana e al contempo costituisca un prezioso punto di riferimento per lo studio di dinamiche linguistiche sia a livello areale sia a livello socio-culturale. La sua collocazione in rete porta inevitabilmente ALT-Web a rivolgersi a una vasta gamma di utenti non più circoscritta agli addetti ai lavori (ovvero dialettologi, linguisti, etno-linguisti), ma che include anche insegnanti, operatori culturali (ad esempio, personale di musei e di istituzioni culturali pubbliche e private) fino al cittadino navigatore di Internet che voglia capire di più della propria identità linguistica e culturale. 
Il vasto e variegato bacino di utenza a cui intende rivolgersi ALT-Web ha portato alla trasformazione della versione informatizzata dell'Atlante Lessicale Toscano (conosciuta come DBT-ALT) in una rete ipertestuale con modalità e funzionalità di accesso differenziate in relazione alle diverse classi di utenza; a questo aspetto, è legata l'altra interpretazione dell'acronimo ALT-Web, ovvero quella di "ALT come rete". L'articolo illustra aspetti del processo di progettazione e realizzazione dell'opera che rivestono un qualche interesse per il linguista e il dialettologo. In particolare, dopo un breve excursus che riepiloga le caratteristiche principali della risorsa di partenza, l'articolo illustra la progettazione e realizzazione di ALT-Web, partendo dall'analisi dei requisiti e la definizione delle caratteristiche generali per arrivare ad aspetti più specifici che riguardano le modalità di accesso ai materiali e la normalizzazione dei materiali dialettali in trascrizione fonetica.}, KEYWORDS = {Dialettologia Computazionale-Risorse dialettali in rete-Atlante lessicale}, PAGES = {209-241}, URL = {https://publications.cnr.it/doc/84659}, PUBLISHER = {Antenore (Roma, ITA)}, ISBN = {88-8455-606-6}, CONFERENCE_NAME = {Lessicografia Dialettale. Ricordando Paolo Zolli. Atti del Convegno di Studi}, CONFERENCE_PLACE = {Venezia}, CONFERENCE_DATE = {9-11 dicembre 2004}, BOOKTITLE = {Lessicografia dialettale: ricordando Paolo Zolli-Atti del convegno di studi, Venezia, 9-11 dicembre 2004}, EDITOR = {Bruni, F. and Marcato, C.}, } @INPROCEEDINGS{BARTOLINI_2006_INPROCEEDINGS_BGMMABSNBB_84663, AUTHOR = {Bartolini, R. and Giovannetti, E. and Marchi, S. and Montemagni, S. and Andreatta, C. and Brunelli, R. and Stecher, R. and Niederée, C. and Bouquet, P. 
and Bortoli, S.}, TITLE = {Ontology Learning in Multimedia Information Extraction from Product Catalogues}, YEAR = {2006}, ABSTRACT = {We propose a methodology for extracting multimedia information from product catalogues empowered by the synergetic use and extension of a domain ontology. The use of domain ontologies in this context additionally opens up innovative ways of catalogue use. The method is characterized by incrementally feeding and exploiting the ontology during an information extraction process, implemented by the semantic annotation of the analysed document, and by providing support for detecting existing similar ontologies to enable reuse of (parts of) them.}, KEYWORDS = {knowledge-driven multimedia analysis, ontology learning, semi-automatic content annotation tools}, URL = {https://publications.cnr.it/doc/84663}, CONFERENCE_NAME = {BOEMIE 2006}, CONFERENCE_PLACE = {Podebrady, Czech Republic}, CONFERENCE_DATE = {6 ottobre 2006}, } @INPROCEEDINGS{PIRRELLI_2006_INPROCEEDINGS_PLM_112916, AUTHOR = {Pirrelli, V. and Lenci, A. and Montemagni, S.}, TITLE = {Probing the space of grammatical variation: induction of cross-lingual grammatical constraints from treebanks}, YEAR = {2006}, URL = {https://publications.cnr.it/doc/112916}, CONFERENCE_NAME = {Language resources and language research: typology, second language acquisition, English Linguistics}, CONFERENCE_PLACE = {Pavia}, CONFERENCE_DATE = {2006}, } @MISC{BARTOLINI_2006_MISC_BDLMMP_151563, AUTHOR = {Bartolini, R. and Dell'Orletta, F. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.}, TITLE = {Text-to-Knowledge (T2K) Versione 2}, YEAR = {2006}, ABSTRACT = {Versione 2. Text-to-Knowledge (T2K) è una piattaforma software di supporto avanzato alla gestione documentale per la creazione dinamica di repertori terminologici e ontologie di dominio a partire da testi e per l'indicizzazione concettuale di documenti. 
Il sistema T2K si propone di offrire una batteria integrata di strumenti avanzati di analisi linguistica del testo, analisi statistica e apprendimento automatico del linguaggio, destinati a offrire una rappresentazione accurata del contenuto di una base documentale non strutturata, per scopi di indicizzazione avanzata e navigazione intelligente. I risultati di questo processo di acquisizione sono annotati in forma di metadati XML, offrendo in tal modo la prospettiva di una sempre crescente e diretta interoperabilità con sistemi automatici per la produzione di contenuti digitali selezionati e strutturati dinamicamente su misura, per diversi profili di utenza. Versioni prototipali di T2K sono già operative su alcuni portali della pubblica amministrazione e sono state applicate per l'indicizzazione di contenuti didattici multimediali. E' in corso l'integrazione della tecnologia T2K nel sistema di gestione informatica di documentazione scientifica del CNR.}, KEYWORDS = {text to knowledge, nlp, estrazione terminologica, ontology learning, indicizzazione terminologica}, URL = {https://publications.cnr.it/doc/151563}, } @MISC{MONTEMAGNI_2006_MISC_M_151556, AUTHOR = {Montemagni, S.}, TITLE = {La Treebank Sintattico Semantica dell'Italiano del progetto SI-TAL}, YEAR = {2006}, URL = {https://publications.cnr.it/doc/151556}, } @MISC{PICCHI_2006_MISC_PMSCP_151557, AUTHOR = {Picchi, E. and Montemagni, S. and Sassolini, E. and Cucurullo, S. and Paoli, M.}, TITLE = {ALTWEB}, YEAR = {2006}, URL = {https://publications.cnr.it/doc/151557}, } @ARTICLE{LENCI_2005_ARTICLE_LMP_64502, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Acquiring and Representing Meaning: Theoretical and Computational Perspectives}, YEAR = {2005}, PAGES = {19-66}, URL = {https://publications.cnr.it/doc/64502}, VOLUME = {22-23}, } @BOOK{LENCI_2005_BOOK_LMP_136436, AUTHOR = {Lenci, A. and Montemagni, S. 
and Pirrelli, V.}, TITLE = {Acquiring and Representing Word Meaning: Computational perspectives}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/136436}, PUBLISHER = {Istituti Editoriali e Poligrafici Internazionali (Pisa-Roma, ITA)}, ISBN = {88-8147-413-1}, } @BOOK{LENCI_2005_BOOK_LMP_136437, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Testo e computer-Elementi di linguistica computazionale}, YEAR = {2005}, ABSTRACT = {In che modo il computer può aiutarci a comprendere come funziona la nostra lingua? Cosa significa analizzare un testo con l'aiuto di un calcolatore? In che misura possiamo estendere le potenzialità del computer rendendolo capace di interagire con gli utenti umani nella loro lingua? Queste e altre domande sono l'oggetto di indagine della linguistica computazionale, una disciplina che ha al suo centro proprio il rapporto tra lingua e computer. Il libro fornisce gli elementi di base della linguistica computazionale partendo da un interesse primario per il testo, la sua struttura e il suo contenuto. Il volume propone una sintesi equilibrata e accessibile tra sapere e fare, nozioni di base e loro applicazione, ed è destinato in primo luogo agli studenti delle facoltà umanistiche e scientifiche interessati all'interazione tra scienze umane e informatica, ma anche agli studiosi che vogliano imparare a usare il computer come strumento di ricerca sul linguaggio.}, KEYWORDS = {Linguistica Computazionale}, PAGES = {255}, URL = {https://publications.cnr.it/doc/136437}, PUBLISHER = {Carocci (Roma, ITA)}, ISBN = {8843034251}, } @EDITORIAL{PIRRELLI_2005_EDITORIAL_PM_146069, AUTHOR = {Pirrelli, V. 
and Montemagni, S.}, TITLE = {Acquisition and Representation of Word Meaning: Theoretical and computational perspectives}, YEAR = {2005}, KEYWORDS = {Lexical semantics, Distributional semantics, Lexicon acquisition}, URL = {https://publications.cnr.it/doc/146069}, VOLUME = {XXII-XXIII}, PUBLISHER = {Istituti Editoriali e Poligrafici Internazionali (Pisa-Roma, ITA)}, ISBN = {88-8147-413-1}, } @INPROCEEDINGS{BARTOLINI_2005_INPROCEEDINGS_BGLMP_84576, AUTHOR = {Bartolini, R. and Giorgetti, D. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Automatic Incremental Term Acquisition from Domain Corpora}, YEAR = {2005}, ABSTRACT = {We describe a technique for the acquisition of terms from Italian domain text corpora, which relies both on sophisticated linguistic analysis and on statistical measures applied to linguistically processed text rather than to raw text as it is usually the case. The main advantage of this technique is that minimal a priori knowledge of term structure is required, thus allowing to explore and discover terms in a given domain without imposing a strict pattern matching structure on them, and also to easily extend it to different domains. The approach we present in this paper is incremental as it may be iterated to discover terms of increasing complexity built on top of terms discovered in the previous iteration. 
The reason why it is convenient to adopt such an incremental approach is that it allows to "clean" data from noise in the first step, elicitating the constituent terms, and then to refine term acquisition on "skimmed" term data.}, PAGES = {293-300}, URL = {https://publications.cnr.it/doc/84576}, CONFERENCE_NAME = {7th International conference on Terminology and Knowledge Engineering (TKE2005)}, CONFERENCE_PLACE = {Copenhagen}, CONFERENCE_DATE = {2005}, BOOKTITLE = {Proceedings of TKE 2005-7th International Conference on Terminology and Knowledge Engineering}, } @INPROCEEDINGS{BIAGIOLI_2005_INPROCEEDINGS_BFPMS_172458, AUTHOR = {Biagioli, C. and Francesconi, E. and Passerini, A. and Montemagni, S. and Soria, C.}, TITLE = {Automatic semantics extraction in law documents}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/172458}, CONFERENCE_NAME = {Tenth International Conference on Artificial Intelligence and Law (ICAIL 2005)}, CONFERENCE_PLACE = {Bologna}, CONFERENCE_DATE = {2005}, } @INPROCEEDINGS{DELLORLETTA_2005_INPROCEEDINGS_DLMP_84579, AUTHOR = {Dell'Orletta, F. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Climbing the path to grammar: a maximum entropy model of subject/object learning}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/84579}, CONFERENCE_NAME = {Psychocomputational Models of Human Language Acquisition (PsychoCompLA-2005)}, CONFERENCE_PLACE = {Ann Arbor (USA)}, } @TECHREPORT{BARTOLINI_2005_TECHREPORT_BGMM_157366, AUTHOR = {Bartolini, R. and Giorgetti, D. and Marchi, S. and Montemagni, S.}, TITLE = {ILC-CNR Contribution to Deliverable 4.1}, YEAR = {2005}, ABSTRACT = {The goal of the semantic annotation is the annotation of entities and relations starting from input documents conformant with the harmonisation output schema as defined within WP3. 
This harmonisation schema will focus on the structural and logical organisation of the documents, while WP4 will concentrate on the annotation of textual entities and image elements. The results of semantic annotation are intended to populate the domain ontology.}, KEYWORDS = {NLP}, URL = {https://publications.cnr.it/doc/157366}, } @TECHREPORT{BARTOLINI_2005_TECHREPORT_BLMMP_157367, AUTHOR = {Bartolini, R. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.}, TITLE = {Personalizzazione degli Italian NLP tools}, YEAR = {2005}, ABSTRACT = {Il presente documento intende offrire criteri e risultati della fase di personalizzazione dei moduli per l'analisi automatica del testo (Italian NLP tools o "AnITA") all'interno dell'architettura prevista nell'ambito del progetto FuLL.}, KEYWORDS = {NLP}, PAGES = {13}, URL = {https://publications.cnr.it/doc/157367}, } @TECHREPORT{BARTOLINI_2005_TECHREPORT_BLMP_157369, AUTHOR = {Bartolini, R. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Modellazione del motore sintattico e delle strutture dati di supporto}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/157369}, } @TECHREPORT{BARTOLINI_2005_TECHREPORT_BLMMP_157370, AUTHOR = {Bartolini, R. and Lenci, A. and Marchi, S. and Montemagni, S. and Pirrelli, V.}, TITLE = {Text-2-Knowledge: Acquisizione semi-automatica di ontologie per l'indicizzazione semantica di documenti}, YEAR = {2005}, ABSTRACT = {Text-2-Knowledge, Acquisizione semi-automatica di ontologie per l'indicizzazione semantica di documenti}, KEYWORDS = {nlp, terminology extraction}, URL = {https://publications.cnr.it/doc/157370}, } @TECHREPORT{CUCURULLO_2005_TECHREPORT_CMPPS_157373, AUTHOR = {Cucurullo, S. and Montemagni, S. and Paoli, M. and Picchi, E. and Sassolini, E.}, TITLE = {Atlante Lessicale Toscano in rete (ALT-Web). Relazione finale}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/157373}, } @TECHREPORT{GIORGETTI_2005_TECHREPORT_GMM_157380, AUTHOR = {Giorgetti, D. 
and Marchi, S. and Montemagni, S.}, TITLE = {ILC-CNR Contribution to Deliverable 5.1}, YEAR = {2005}, ABSTRACT = {This document describes the high level infrastructure designed as part of the project VIKEF for creating a Virtual Information and Knowledge Environment (VIKE), namely an environment made up of explicit representation of the information and knowledge implicitly contained in one or more collections of Information-Content-Knowledge (ICK) resources, and of a collection of services operating on this explicit representation of information and knowledge; it is a virtual environment, as the representation and the services for accessing information and knowledge is almost completely independent from the physical properties of the original data.}, KEYWORDS = {NLP}, URL = {https://publications.cnr.it/doc/157380}, } @TECHREPORT{MARCHI_2005_TECHREPORT_MM_157384, AUTHOR = {Marchi, S. and Montemagni, S.}, TITLE = {ILC-CNR Contribution to Deliverable 3.1}, YEAR = {2005}, ABSTRACT = {This document presents the first set of knowledge and content acquisition components. Starting from the Annotation Schema definition, it will then describe the Harmonization support and the Annotation components, as well as the various resources needed all along the current chain.}, KEYWORDS = {NLP}, URL = {https://publications.cnr.it/doc/157384}, } @MISC{BARTOLINI_2005_MISC_BDGMLMP_151548, AUTHOR = {Bartolini, R. and Dell'Orletta, F. and Giorgetti, D. and Marchi, S. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Text-to-Knowledge (T2K)}, YEAR = {2005}, ABSTRACT = {Piattaforma di estrazione e indicizzazione terminologica.}, KEYWORDS = {NLP, estrazione terminologica}, URL = {https://publications.cnr.it/doc/151548}, } @MISC{BARTOLINI_2005_MISC_BMLMP_151550, AUTHOR = {Bartolini, R. and Marchi, S. and Lenci, A. and Montemagni, S. 
and Pirrelli, V.}, TITLE = {NLPtools}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/151550}, } @MISC{PICCHI_2005_MISC_PMSCP_151532, AUTHOR = {Picchi, E. and Montemagni, S. and Sassolini, E. and Cucurullo, S. and Paoli, M.}, TITLE = {ALTWEB}, YEAR = {2005}, URL = {https://publications.cnr.it/doc/151532}, } @INCOLLECTION{AGOSTINIANI_2004_INCOLLECTION_AMPP_136438, AUTHOR = {Agostiniani, L. and Montemagni, S. and Paoli, M. and Picchi, E.}, TITLE = {Lessicografia dialettale e computer: questioni di rappresentazione e recupero dei dati}, YEAR = {2004}, KEYWORDS = {Lessicografia computazionale, Lessicografia Dialettale}, URL = {https://publications.cnr.it/doc/136438}, PUBLISHER = {Centro Interuniversitario di Studi Veneti (Venezia, ITA)}, } @INCOLLECTION{BARTOLINI_2004_INCOLLECTION_BLMPS_30867, AUTHOR = {Bartolini, R. and Lenci, A. and Montemagni, S. and Pirrelli, V. and Soria, C.}, TITLE = {Automatic Classification and Analysis of Provisions in Italian Legal Texts: A Case Study}, YEAR = {2004}, ABSTRACT = {In this paper we address the problem of automatically enriching legal texts with semantic annotation, an essential pre–requisite to effective indexing and retrieval of legal documents. This is done through illustration of SALEM (Semantic Annotation for LEgal Management), a computational system developed for automated semantic annotation of (Italian) law texts. SALEM is an incremental system using Natural Language Processing techniques to perform two tasks: i) classify law paragraphs according to their regulatory content, and ii) extract relevant text fragments corresponding to specific semantic roles that are relevant for the different types of regulatory content. 
The paper sketches the overall architecture of SALEM and reports results of a preliminary case study on a sample of Italian law texts.}, KEYWORDS = {Annotazione semantica, Classificazione automatica}, PAGES = {593-604}, URL = {https://rdcu.be/dftjm}, VOLUME = {3292}, DOI = {10.1007/978-3-540-30470-8_72}, PUBLISHER = {Springer (Berlin, DEU)}, ISBN = {978-3-540-23664-1}, BOOKTITLE = {On the Move to Meaningful Internet Systems 2004: OTM 2004 Workshops. OTM 2004}, EDITOR = {Meersman, R. and Tari, Z. and Corsaro, A.}, } @INCOLLECTION{PAOLI_2004_INCOLLECTION_PMP_136444, AUTHOR = {Paoli, M. and Montemagni, S. and Picchi, E.}, TITLE = {ALT Web: l'Atlante Lessicale Toscano in rete}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/136444}, PUBLISHER = {Centro Interuniversitario di Studi Veneti (Venezia, ITA)}, } @INPROCEEDINGS{BARTOLINI_2004_INPROCEEDINGS_BLMP_84570, AUTHOR = {Bartolini, R. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Hybrid Constraints for Robust Parsing: First Experiments and Evaluation}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/84570}, CONFERENCE_NAME = {LREC 2004: Fourth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Lisbon, Portugal}, CONFERENCE_DATE = {2004}, } @INPROCEEDINGS{BARTOLINI_2004_INPROCEEDINGS_BLMPS_84571, AUTHOR = {Bartolini, R. and Lenci, A. and Montemagni, S. and Pirrelli, V. and Soria, C.}, TITLE = {Semantic Mark-up of Italian Legal Texts Through NLP-based Techniques}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/84571}, ISBN = {2-9517408-1-6}, CONFERENCE_NAME = {LREC 2004: Fourth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Lisbon, Portugal}, CONFERENCE_DATE = {2004}, } @INPROCEEDINGS{HEPPLE_2004_INPROCEEDINGS_HIAMMG_84609, AUTHOR = {Hepple, M. and Ireson, N. and Allegrini, P. and Marchi, S. and Montemagni, S. and Gómez Hidalgo, J. 
M.}, TITLE = {NLP-enhanced Content filtering within the POESIA Project}, YEAR = {2004}, ABSTRACT = {This paper introduces the POESIA internet filtering system, which is open-source, and which combines standard filtering methods, such as positive/negative URL lists, with more advanced techniques, such as image processing and NLP-enhanced text filtering. The description here focusses on components providing textual content filtering for three European languages (English, Italian and Spanish), employing NLP methods to enhance performance. We address also the acquisition of language data needed to develop these filters, and the evaluation of the system and its components.}, KEYWORDS = {Image processing, Natural language processing systems, Open systems}, PAGES = {1967-1970}, URL = {https://www.aclweb.org/anthology/L04-1507/}, ISBN = {2-9517408-1-6}, CONFERENCE_NAME = {LREC 2004: Fourth International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Lisbona}, CONFERENCE_DATE = {26-28 May 2004}, BOOKTITLE = {Proceedings of the Fourth International Conference on Language Resources and Evaluation (LREC 2004)}, EDITOR = {Lino, M. T. and Xavier, M. F. and Ferreira, F. and Costa, R. and Silva, R.}, } @INPROCEEDINGS{PIRRELLI_2004_INPROCEEDINGS_PAM_112920, AUTHOR = {Pirrelli, V. and Allegrini, P. and Montemagni, S.}, TITLE = {Classifying text through time: a complexity science approach to dynamic web page filtering}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/112920}, CONFERENCE_NAME = {International Conference on Text Mining (CIFT)}, CONFERENCE_PLACE = {La Rochelle Francia}, CONFERENCE_DATE = {2004}, } @INPROCEEDINGS{PIRRELLI_2004_INPROCEEDINGS_PLM_112923, AUTHOR = {Pirrelli, V. and Lenci, A. 
and Montemagni, S.}, TITLE = {The lexicon in context: distributional evidence and representational issues}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/112923}, CONFERENCE_NAME = {International Colloquium: Word Structure and Lexical Systems: models and applications}, CONFERENCE_PLACE = {Pavia}, CONFERENCE_DATE = {2004}, } @TECHREPORT{BARTOLINI_2004_TECHREPORT_BGLMP_157375, AUTHOR = {Bartolini, R. and Giorgetti, D. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Text-2-Knowledge: Acquisizione automatica di ontologie per l'indicizzazione semantica di documenti}, YEAR = {2004}, URL = {https://publications.cnr.it/doc/157375}, } @ARTICLE{ALLEGRINI_2003_ARTICLE_AMP_64466, AUTHOR = {Allegrini, P. and Montemagni, S. and Pirrelli, V.}, TITLE = {Example-based automatic induction of semantic classes through entropic scores}, YEAR = {2003}, ABSTRACT = {Abstract - The paper deals in some detail with the application of example-based machine learning techniques to the task of automatically acquiring semantic information from functionally annotated texts. Special emphasis is placed on the use of “analogical proportions” as a means of structuring the knowledge embodied in attested examples, and weighing up their contribution to a variety of lexico-semantic classification tasks. Careful quantitative analysis of automatically acquired information proves to shed considerable light on the semantic inter-connectivity of input data, their structure and organising principles.}, PAGES = {1-45}, URL = {https://publications.cnr.it/doc/64466}, VOLUME = {16-17}, } @ARTICLE{LENCI_2003_ARTICLE_LMP_64476, AUTHOR = {Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Chunk-it. An Italian shallow parser for robust syntactic annotation}, YEAR = {2003}, PAGES = {353-386}, URL = {https://publications.cnr.it/doc/64476}, VOLUME = {16-17}, } @ARTICLE{MONTEMAGNI_2003_ARTICLE_MBBCCLPZFMRBPSZMPD_64477, AUTHOR = {Montemagni, S. and Barsotti, F. and Battista, M. and Calzolari, N. 
and Corazzari, O. and Lenci, A. and Pirrelli, V. and Zampolli, A. and Fanciulli, F. and Massetani, M. and Raffaelli, R. and Basili, R. and Pazienza, M. T. and Saracino, D. and Zanzotto, F. and Mana, N. and Pianesi, F. and Delmonte, R.}, TITLE = {The syntactic-semantic Treebank of Italian. An Overview}, YEAR = {2003}, PAGES = {461-492}, URL = {https://publications.cnr.it/doc/64477}, VOLUME = {16-17}, } @ARTICLE{MONTEMAGNI_2003_ARTICLE_MPB_64478, AUTHOR = {Montemagni, S. and Picchi, E. and Biagini, L.}, TITLE = {DBT-ALT: a system for storing and querying the data of the 'Atlante Linguistico Toscano'}, YEAR = {2003}, ABSTRACT = {Abstract - Computers can help dialectologists to make full use of the information they have so laboriously and painstakingly acquired: the basic dimensions of dialectal research can be enlarged and its possible outcomes can become more sophisticated. In this paper, we describe a lexical database for dialectal data, DBT-ALT, which has been designed and constructed to contain linguistic data collected for the Atlante Lessicale Toscano (ALT), a lexical atlas of Tuscany. DBT-ALT is illustrated in detail, with particular emphasis on its search functions which allow for complex queries taking into account a wide range of parameters interactively defined by the user on the basis of his/her research interests.}, PAGES = {493-517}, URL = {https://publications.cnr.it/doc/64478}, VOLUME = {18-19}, } @INCOLLECTION{ALLEGRINI_2003_INCOLLECTION_ALMP_136427, AUTHOR = {Allegrini, P. and Lenci, A. and Montemagni, S. and Pirrelli, V.}, TITLE = {Le forme del significato. Acquisizione e rappresentazione dell'informazione semantica}, YEAR = {2003}, KEYWORDS = {Acquisizione, Semantica Lessicale, Ontologia, Machine Learning}, URL = {https://publications.cnr.it/doc/136427}, } @INCOLLECTION{MONTEMAGNI_2003_INCOLLECTION_MBBCCLZRPMD_136422, AUTHOR = {Montemagni, S. and Barsotti, F. and Battista, M. and Calzolari, N. and Corazzari, O. and Lenci, A. and Zampolli, A. 
and Raffaelli, R. and Pazienza, M. T. and Mana, N. and Delmonte, R.}, TITLE = {Building the Italian Syntactic-Semantic Treebank}, YEAR = {2003}, KEYWORDS = {Corpora testuali, Annot. sintattica, Annot. semantica, Treebank}, URL = {https://publications.cnr.it/doc/136422}, } @TECHREPORT{ALLEGRINI_2003_TECHREPORT_ACMMHIGCDP_157348, AUTHOR = {Allegrini, P. and Calzolari, N. and Marchi, S. and Montemagni, S. and Hepple, M. and Ireson, N. and Gomez Hidalgo, J. M. and Carrero Garcia, F. and De Buenaga Rodriguez, M. and Puera Sanz, E.}, TITLE = {POESIA Lexical Resources and Tools for Each Language}, YEAR = {2003}, ABSTRACT = {The aim of this report is to review the various resources that the different language processing sites expect to use in the development of their language-specific text filtering components. Some of the required resources are ones that were developed before Poesia, possibly by one the Poesia partners, or possibly elsewhere but being now in the public domain. Such resources may require adaptation to the Poesia task. Other resources required for Poesia will be developed as part of the project. In some cases, this development has already been done or is in progress, whilst in others, it is yet to be undertaken. In what follows, the status of each of the resources described will be made clear in terms of these alternatives.}, KEYWORDS = {Lexical Resources, nlp}, PAGES = {30}, URL = {https://publications.cnr.it/doc/157348}, } @TECHREPORT{STARYNKEVITCH_2002_TECHREPORT_SDTZHIGACMMG_430635, AUTHOR = {Starynkevitch, B. and Daoudi, M. and Tombelle, C. and Zheng, H. and Hepple, M. and Ireson, N. and Gomez Hildago, J. and Allegrini, P. and Calzolari, N. and Marchi, S. and Montemagni, S. 
and Guerra, S.}, TITLE = {POESIA Software Architecture Definition Document}, YEAR = {2002}, ABSTRACT = {Software Architecture Definition Document}, KEYWORDS = {NLP, Software Engineering}, PAGES = {68-80}, URL = {https://publications.cnr.it/doc/430635}, } @ARTICLE{PICCHI_2001_ARTICLE_PMB_64487, AUTHOR = {Picchi, E. and Montemagni, S. and Biagini, L.}, TITLE = {DBT-ALT: a System for Storying and Querying the Data of the Atlante Lessicale Toscano (ALT)}, YEAR = {2001}, KEYWORDS = {Atlanti linguistici, Dialettologia comput, Lessicografia dialet, Geolinguistica, Sociolinguistica}, PAGES = {85-103}, URL = {https://publications.cnr.it/doc/64487}, VOLUME = {9}, } @INPROCEEDINGS{PETERS_1994_INPROCEEDINGS_PFMZ_409402, AUTHOR = {Peters, C. and Federici, S. and Montemagni, S. and Zamorani, C. N.}, TITLE = {From machine readable dictionaries to lexicons for NLP: the cobuild dictionaries-a different approach}, YEAR = {1994}, ABSTRACT = {We describe the results of a syntactic-semantic parser for Cobuild dictionary definitions. Unlike previous work on the automatic analysis of machine readable dictionaries, the particular structure of the Cobuild definition allows us to derive information that classifies the lexical item mainly in terms of the selectional restrictions or preferences encoded on its arguments. The resulting formalized lexical entries contain data that has generally been lacking in other lexical representations but which is expected to be very useful in a wide range of NLP purposes. We show how this information can be used in dictionary sense disambiguation by creating links throughout the lexicon both on the paradigmatic and the syntagmatic axes.}, KEYWORDS = {Lexical databases, Information storage and retrieval. 
Dictionaries}, PAGES = {147-157}, URL = {https://publications.cnr.it/doc/409402}, CONFERENCE_NAME = {6th International Congress on Lexicography}, CONFERENCE_PLACE = {Amsterdam, The Netherlands}, CONFERENCE_DATE = {1994}, BOOKTITLE = {Euralex 1994 Proceedings}, EDITOR = {Martin, W.}, } @TECHREPORT{BARNBROOK_1994_TECHREPORT_BCFHMPSS_446186, AUTHOR = {Barnbrook, G. and Calzolari, N. and Federici, S. and Hoelter, M. and Montemagni, S. and Peters, C. and Schnelle, H. and Sinclair, J.}, TITLE = {ET10/51-Deliverable 8: Evaluation Report}, YEAR = {1994}, ABSTRACT = {The objective of the work in Pisa has been to translate and produce instantiations of the syntactically parsed definitions of the Cobuild dictionary: provided by Birmingham in a Typed Feature Structure formalism. However, as described in Methodology above, our results have been produced at two different levels: intermediate results; final results in the form of TFS entries. In the following, we will discuss briefly the possible applications of these different results for the three user types recognized in the introduction to this section: i. Human user ii. Human user-assisted by the machine iii. The machine Obviously, the discussion here below refers entirely to the results that would be obtained once the parser has been applied to the whole dictionary.}, KEYWORDS = {Language, Computational linguistics, Formal Definitions and Theory}, PAGES = {38}, URL = {https://publications.cnr.it/doc/446186}, } @TECHREPORT{CALZOLARI_1994_TECHREPORT_CFMP_446200, AUTHOR = {Calzolari, N. and Federici, S. and Montemagni, S. and Peters, C.}, TITLE = {ET-10/51-Final Report: Par. 3-Extracting, representing and using syntactic-semantic information from cobuild definitions}, YEAR = {1994}, ABSTRACT = {In May 1992 a new research project brought together the authors of this report. 
With the help and support of several other people and institutions, they worked steadily for two years, trying to improve the design and building of machine-usable lexicons, for automatic translation and many other applications. The starting point was clear. Around 1989 Helmut Schnelle of the Ruhr-Universitat Bochum became interested in the way in which words were defined in a new kind of dictionary called Cobuild. He thought that since theywere couched in sentences of apparently ordinary English, and had distinctive and repetitive shapes according to their meanings, it should be possible to represent them in logical form by means of regular rules.}, KEYWORDS = {Language, Computational linguistics, Formal Definitions and Theory, Semantics}, PAGES = {162}, URL = {https://publications.cnr.it/doc/446200}, }