@BOOK{GARCIAMACHO_2018_BOOK_GS_389832, AUTHOR = {Garcia Macho, M. L. and Sassi, M.}, TITLE = {Léxico del Tratado del esphera y del arte de marear con el regimiento de las alturas, con algunas reglas nuevamente escritas muy necessarias de Francisco de Falero}, YEAR = {2018}, ABSTRACT = {El léxico del Tratado del esphera y del Arte del marear de Francisco Faleiro, forma parte del conjunto lexicográfico del Diccionario de la navegación del Siglo de Oro. Para la realización de este diccionario, se ha contado con dos proyectos de investigación: HUM2006, financiado por el Ministerio de Educación y Ciencia de España, y FFI2012-36768, del Ministerio de Economía y Competitividad y cuatro ayudas de movilidad: dos concedidas por el Consiglio Nazionale della Ricerca italiano, CNR [Istituto di Linguistica Computazionale de Italia (2006 y 2007)] y dos por el Ministerio de Ciencia e Innovación de España [Programa de Estancias de Profesores de Universidad e Investigadores del CSIC en Centros de Investigación Extranjeros (2005 y 2010)]. Este volumen contiene la concordancia lematizada, los índices de frecuencia de lemas y formas, los índices de los nombres propios y el diccionario inverso del Tratado.}, KEYWORDS = {Indici vari, Dizionario della Navigazione, Siglo de Oro, Concordanze per lemma}, PAGES = {1-488}, URL = {http://portal.uned.es/portal/page?_pageid=93,62295002\&_dad=portal\&_schema=PORTAL}, ISBN = {978-84-362-7383-0}, } @INCOLLECTION{SASSI_2017_INCOLLECTION_S_382198, AUTHOR = {Sassi, M.}, TITLE = {Cuestiones pertinentes e impertinentes de los Diccionarios Temáticos}, YEAR = {2017}, ABSTRACT = {The Dictionaries in general, and in particular the thematic ones, have represented the thread of my career in the Istituto di Linguistica Computazionale of Pisa from and before its origins (in the years 1965-1978 it was called Divisione Linguistica of the CNUCE and later ILC- CNR). 
In the '60 -'70 we worked on the DMI (Italian Machine Dictionary) under the auspices of the Italian Parliament. In the 1980s, the first studies of dictionaries organized in lexical fields began, starting from the intuitions of Julio Casares and the same principles were applied to Italian. From these distant experiences, over 50 years, different studies, applications, corpora and databases have been developed in relation to several disciplines: Literature, Philology, Law, Justice, Administration, Tourism, Medicine, etc. There have been compilations of several Authors in Spanish language: Bolivar, Carpentier, Cervantes, Encina, Machado, Marquez, Neruda, Salinas, Teresa de Avila, Unamuno, Vallejo, that will be described in this presentation, with examples of online search with the DBT-Web interface. It is also discussed to preserve this data for the future through its maintenance for consultation on the network.}, KEYWORDS = {Computational Linguistics, Thematic Dictionaries, Corpora, Preservation and Reuse of data, Historical overview of Text Processing}, PAGES = {37-49}, URL = {https://publications.cnr.it/doc/382198}, VOLUME = {VII}, ISBN = {978-84-617-4512-8}, BOOKTITLE = {El diccionario en la encrucijada: de la sintaxis y la cultura al desafío digital}, EDITOR = {López, S. and Cuadrado, I. G. and Escribano, J. G. and Cecilio}, } @INPROCEEDINGS{GOGGI_2015_INPROCEEDINGS_GPSGB_318501, AUTHOR = {Goggi, S. and Pardelli, G. and Sassi, M. and Giannini, S. and Biagioni, S.}, TITLE = {A terminological survey on the titles of the Seventh Framework Programme (FP7)}, YEAR = {2015}, ABSTRACT = {This paper focuses on the automatic extraction of domain-specific knowledge from the European Commission projects of the 7th Framework Programme, hereinafter referred as FP7. 
The study is divided in three parts: the first part introduces the work starting from the building up of a corpus containing the titles of European Projects of the whole FP7 in order to obtain a relevant terminological sample for the different domains; the second describes software and methods while the third part focuses on the evaluation of results. Finally, we conclude by suggesting possible directions for further development of a comparison between terminological extraction from FP7 and FP5/FP6.}, KEYWORDS = {7th Framework Programme (FP7), Natural Language Processing, Terminology, Knowledge extraction, Grey Literature, I. 2. 7 Natural Language Processing. Text analysis, I. 2. 1 Applications and Expert Systems. Natural language interfaces}, PAGES = {223-227}, URL = {https://publications.cnr.it/doc/318501}, ISBN = {978-959-7174-28-8}, CONFERENCE_NAME = {Fourteenth International Symposium on Comunicación Social: retos y perspectivas}, CONFERENCE_PLACE = {Santiago de Cuba}, CONFERENCE_DATE = {19-23 de enero 2015}, EDITOR = {Ruiz Miyares, L. and Álvarez Silva, M. R. and Muñoz Alvarado, A.}, } @INPROCEEDINGS{VENTURI_2015_INPROCEEDINGS_VRMSTFB_340388, AUTHOR = {Venturi, G. and Rinnone, S. and Montemagni, S. and Sassi, M. and Terranova, G. and Flore, E. and Bellandi, T.}, TITLE = {Language technologies for automatic readability assessment of health-related Information: a preliminary investigation into the informed consent forms used in a regional health service}, YEAR = {2015}, ABSTRACT = {Rationale: Within an information society, where everyone should be able to access all available information, improving access to written language is becoming more and more a central issue. This is the case for health-related information which should be accessible to all members of the society, including people who have reading difficulties as a result of a low education level or of language-based learning disabilities or because the language of the text is not their native language. 
Moreover, the breakdown of doctor-patient communication is one of the most frequent cause of adverse events. Research questions: We conducted a preliminary investigation to assess the readability of a corpus of informed consent forms used before a clinical procedure in the hospitals of a Regional Healthcare Service. Secondary goals include the comparison of readability across specialties and healthcare trusts. Methods: Providing complex scientific information in a way that is comprehensible to a lay person is a challenge that nowadays can be addressed by resorting to advanced Natural Language Processing (NLP) techniques, which make it possible to monitor the linguistic complexity of texts at the syntactic and lexical levels and to support their simplification, whenever needed. The study has been carried out by combining NLP-enabled feature extraction and state-of-the-art machine learning algorithms. To this end we used READ-IT, the first NLP-based readability assessment tool for Italian. Results: We analysed 584 documents, covering 29 specialties, for a total of 607.790 word tokens, currently used at the 36 public hospitals in Tuscany. Although the readability level of all documents in the corpus is low, both at the lexical and syntactic level, significant differences can be observed between specialties and healthcare trust releasing the forms. With the readability level ranging between 0 (easy-to-read) and 100 (difficult-to-read), it resulted that the pediatric informed consent documents are the most easy-to-read forms (with an average score of 75) while the most difficult-to read documents are documents of the surgical area (whose average score is 80) (standard deviation 2). 
Discussion: The state of the art resulting from this preliminary study shows that NLP-based readability assessment tools can help to measure the linguistic complexity of informed consent forms and guide the editor to identify linguistically complex passages that need to be simplified, either syntactically or lexically. The use of an assessment tool designed for the general language is the main limitation of the study and should be addressed through the customization of the tool to assess the readability of the healthcare jargon. A further step of the research consider also the design of a guidance to prepare readable informed consent forms.}, KEYWORDS = {Readability assessment, health-related information}, URL = {http://static1.squarespace.com/static/561c0d01e4b0b5ad2e65cc48/t/561d44dfe4b089431662d174/1444758751213/LibrettoProgramma.pdf}, CONFERENCE_NAME = {ISCOME 2015 Conference: "The Golden Bridge: Communication and Patient Safety"}, CONFERENCE_PLACE = {Montecatini Terme}, CONFERENCE_DATE = {15-16 giugno 2015}, } @ARTICLE{SASSI_2014_ARTICLE_SBP_280559, AUTHOR = {Sassi, M. and Biagioni, S. and Pardelli, G.}, TITLE = {A Linguistic and Gender Approach to 1841 Tuscany Population Census}, YEAR = {2014}, ABSTRACT = {The Census of 1841 in Tuscany was the first official data registry which tried to describe Tuscan population as a whole on the basis of the Granducato's territory. With the use of special ad-hoc created forms, all demographic and socioeconomic characteristics of families and single persons in "Granducato di Toscana" were described. 
Work is developed in five points: (1) informatics retrieval of linguistic information from Tuscany of 1800 focused by the arts and craftsmanship more in use in families of that time; (2) gender division of works and craftsmanship; (3) observation of lexical disparity in the four communities and terminological curiosities of that historical period; (4) actually no longer existing craftsmanship; and (5) diachronic analysis of communities, where possible. In this scenario, the authors will introduce the methodology they used for data analysis. Tables and figures will be used to better focus different moments and results of the work. A Glossary in Appendix will contain the English translation of the Italian terms extracted from the Corpus.}, KEYWORDS = {Tuscany Population Census, sociological analysis, gender analysis, the 19th work terminology, linguistic statistics}, PAGES = {318-329}, URL = {http://www.davidpublishing.com/show.html?16049}, VOLUME = {12}, PUBLISHER = {USA-China Business Review (Journal), Inc (New York, NY, Stati Uniti d'America)}, ISSN = {1539-8080}, JOURNAL = {US-China foreign language}, } @INPROCEEDINGS{BELEFFI_2014_INPROCEEDINGS_BS_319421, AUTHOR = {Beleffi, E. and Sassi, M.}, TITLE = {La sicurezza del paziente sui quotidiani in Italia: indagine preliminare sui termini e l'andamento degli eventi}, YEAR = {2014}, URL = {https://publications.cnr.it/doc/319421}, CONFERENCE_NAME = {FORUM RISK MANAGEMENT IN SANITÀ 2014}, CONFERENCE_PLACE = {Arezzo}, CONFERENCE_DATE = {25-28 novembre 2014}, } @INPROCEEDINGS{SASSOLINI_2014_INPROCEEDINGS_SSCCS_319040, AUTHOR = {Sassolini, E. and Sassi, M. and Cucurullo, S. and Cinini, A. and Sbrulli, S.}, TITLE = {Industrial Philology: Problems and techniques of data and archives preservation for future generations}, YEAR = {2014}, ABSTRACT = {The main objective of digital archiving of texts is their re-use and preservation. 
The concept that guides these initiatives is linked to structural and organizational needs which heavily influence the definition of the format specifications that describe the organisation of the archives at various levels and consists of a more or less complex document. A format specification provides the details needed to build a file from a text, establishes the admitted encodings and software applications that can decode the file and make its content accessible. These structural specifications can have an extremely variable size and they depend on the complexity of the format. Although some format specifications are, for the most part, independent of the specific software (for example, ASCII and Unicode codes), many of them are related to the historical period in which the texts were acquired and also by dated software technologies. The file format specification should evolve hand in hand with the related software, and the fate of one is in fact often linked to that of the other. It is therefore appropriate to face the issue of obsolescence of software together with the obsolescence of file formats and of storage medium.}, KEYWORDS = {text management, text analysis}, PAGES = {168-172}, URL = {https://publications.cnr.it/doc/319040}, PUBLISHER = {TransAtlantic (Amsterdam, Paesi Bassi)}, ISSN = {1386-2316}, ISBN = {978-90-77484-22-7}, CONFERENCE_NAME = {GL15: Fifteenth International Conference on Grey Literature}, CONFERENCE_PLACE = {Bratislava}, CONFERENCE_DATE = {2, 3 december 2013}, BOOKTITLE = {The GL-conference series. Conference proceedings}, } @INCOLLECTION{CUCURULLO_2013_INCOLLECTION_CS_353214, AUTHOR = {Cucurullo, S. 
and Sassi, M.}, TITLE = {Il Contributo Tecnologico dell'ILC al Progetto LinCi}, YEAR = {2013}, ABSTRACT = {Il progetto "la Lingua delle Città (LinCi)" ha l'obiettivo di mettere in luce alcuni fenomeni rilevanti - sia dal punto di vista lessicale che grammaticale - dell'italiano comune e informale, secondo l'opinione sull'uso dei parlanti intervistati. A tale scopo il gruppo dei linguisti che ha ideato il progetto ha elaborato un questionario di 200 domande riconducibili a vari campi semantici. La struttura del questionario consente inoltre al raccoglitore di ricavare informazioni di tipo grammaticale (per esempio sull'uso di certe forme pronominali o verbali), nonché giudizi di carattere metalinguistico sulla "dialettalità" o meno di certe forme, sulle differenze tra registro formale e informale, sulla frequenza d'uso.}, KEYWORDS = {lingua italiana, banche dati}, PAGES = {81-99}, URL = {https://publications.cnr.it/doc/353214}, PUBLISHER = {Accademia della Crusca (Firenze, ITA)}, ISBN = {978-88-89369-51-7}, BOOKTITLE = {La lingua delle città LinCi. La banca dati}, EDITOR = {Nesi, A. and Salani, T. P.}, } @INCOLLECTION{CUCURULLO_2013_INCOLLECTION_CS_353219, AUTHOR = {Cucurullo, S. and Sassi, M.}, TITLE = {ASPETTI TECNICO-METODOLOGICI DEL PROGETTO LinCi}, YEAR = {2013}, ABSTRACT = {l progetto "la Lingua delle Città (LinCi)" ha l'obiettivo di mettere in luce i cambiamenti e l'evoluzione dei diversi dialetti italiani regionali, sia dal punto di vista grammaticale che lessicale. A tale scopo, è stato elaborato un questionario di 200 domande su vari campi semantici: determinazioni temporali; forme di saluto; corpo umano; mestieri; oggetti domestici; cibi, frutta e verdura; rapporti sociali, ecc. 
La struttura del questionario consente inoltre al raccoglitore di ricavare informazioni di tipo grammaticale (per esempio sull'uso di certe forme pronominali e verbali), nonché giudizi di carattere metalinguistico, sulla "dialettalità" o meno di certe forme, sulle differenze tra registro formale e informale, ecc. La collaborazione dell'Istituto di Linguistica Computazionale (ILC) al Progetto LinCi ha come obiettivo quello di fornire il supporto informatico all'unità di coordinamento per la creazione e gestione della banca-dati e la sua consultazione tramite il sito dedicato.}, KEYWORDS = {Sociolinguistica, banca-dati DBT}, PAGES = {47-50}, URL = {https://publications.cnr.it/doc/353219}, VOLUME = {11}, PUBLISHER = {Franco Cesati Editore (Firenze, ITA)}, ISBN = {9788876674563}, BOOKTITLE = {La lingua delle città Raccolta di studi}, EDITOR = {Nesi, A.}, } @INCOLLECTION{SASSI_2013_INCOLLECTION_SG_319402, AUTHOR = {Sassi, M. and Grava, M.}, TITLE = {Una metamorfosi chiamata GIS: dai Database ai Geo-database}, YEAR = {2013}, ABSTRACT = {In questo contributo si descrivono le fasi di sviluppo (pionieristico) della ricerca Popolazione e agricoltura nel territorio toscano durante l'Ottocento, iniziata da Giuliana Biagioli con metodi tradizionali. Grazie al suo particolare intuito ha poi trovato nell'Università di Pisa il terreno fertile per poi lanciarsi nell'avventura informatica. Qui ricordiamo che la prima facoltà di Scienze dell'Informazione era stata fondata a Pisa pochi anni prima (si era nel 1969) e quindi negli anni successivi si poteva già contare con i primi specialisti del settore. La prima parte descriverà le prime tappe di lavoro con l'ausilio del calcolatore, che a quel tempo era denominato Mainframe e oltre ad occupare enormi spazi si "nutriva" con schede meccanografiche e nastri magnetici. 
Con lo sviluppo della tecnologia si è poi passati all'uso dei terminali periferici, che permettevano l'inserimento dati diretto, in comunicazione telematica con il cervellone, per poi arrivare al trasferimento dei dati e dei risultati delle elaborazioni su Personal Computer. In questa relazione suddivideremo le due parti: prima e dopo Internet.}, KEYWORDS = {Storia, Banca-dati, Catasto leopoldino, Storia dell'Informatica}, PAGES = {439-458}, URL = {https://publications.cnr.it/doc/319402}, PUBLISHER = {Edizioni ETS (Pisa, ITA)}, ISBN = {9788846736765}, BOOKTITLE = {Il mondo a metà-Studi storici sul territorio e l'ambiente-In onore di Giuliana Biagioli}, EDITOR = {Pazzagli, R.}, } @INPROCEEDINGS{MARCONI_2013_INPROCEEDINGS_MCCMS_282745, AUTHOR = {Marconi, L. and Cutugno, P. and Chiarella, D. and Morgavi, G. and Sassi, M.}, TITLE = {Análisis de Blogs y Temas en "Narrarsi in rete: linguaggi a confronto"}, YEAR = {2013}, URL = {https://publications.cnr.it/doc/282745}, ISBN = {9789597152194}, CONFERENCE_NAME = {VIII Conferencia Científica Internacional Lingüística}, CONFERENCE_PLACE = {Habana}, CONFERENCE_DATE = {27-29 Novembre 2013}, } @INPROCEEDINGS{PARDELLI_2013_INPROCEEDINGS_PGS_254484, AUTHOR = {Pardelli, G. and Goggi, S. and Sassi, M.}, TITLE = {Open Grey for Language Technology: a ride on the network}, YEAR = {2013}, ABSTRACT = {Sommario in IngleseThe aim of this paper is to introduce the Open Access movement for Natural Language Processing (NLP) by means of a wide range of open access Grey Literature documentation available on the web. In 2008 Robert Dale, in the last issue of volume 35 of Computational Linguistics said: "There are a number of definitions of the term 'open access' in circulation, but almost all share the key principle that scientific literature should be freely available for all to read, download, copy, distribute, and use (with appropriate attribution) without restriction". 
At first glance it might seem that the Open Access movement has gradually become more influential in the field of language technology by building repositories accessible through the network. Today's digital archives are niches of intellectual production spread by means of a wide range of documents (such as journal articles and proceedings) which, paradoxically, the search engines do not always reach. The use of inappropriate terms in the formulation of queries and the fragmentation of repositories in this area of investigation does not allow to retrieve information on a large scale. The full paper, after a first introductory section, will be organized in two sections: 1) the first dedicated to the methodology for searching and tracing open access resources and to the criteria for analyzing and selecting the online documentation; 2) the second devoted to a description of the state-of-the-art of Open Access Grey Literature material in a statistical and thematic scenario. As things stand, standardization of computational systems interconnected by links and tools of various nature allowing Internet users to easily retrieve the information that the web naturally makes available would then be essential. Topics: Sustainability, Public Accessible Resources, Product and Service enhancements, Open Access, Curation and Preservation}, KEYWORDS = {Open Access Movement. Natural Lanuage Processing}, PAGES = {161-165}, URL = {https://publications.cnr.it/doc/254484}, PUBLISHER = {TEXTRELEASE, GL PROGRAM \& CONFERENCE BUREAU (Amsterdam, NLD)}, ISBN = {978-90-77484-20-3}, CONFERENCE_NAME = {GL14 Fourteenth International Conference on Grey Literature. Tracking Innovation Through Grey Literature}, CONFERENCE_PLACE = {Roma, Italy (CNR)}, CONFERENCE_DATE = {29-30 November 2012}, EDITOR = {Farace, D. J. and Frantzen, J. and Greynet}, } @INPROCEEDINGS{SASSI_2013_INPROCEEDINGS_SBP_254230, AUTHOR = {Sassi, M. and Biagioni, S. 
and Pardelli, G.}, TITLE = {A linguistic and gender approach to 1841 Tuscany population Census}, YEAR = {2013}, ABSTRACT = {The Census of 1841 in Tuscany was first official data registry which tried to describe Tuscan population as a whole on granducal basis. With the use of special ad hoc created forms all demographic and socioeconomic characteristics of families and single persons in "Granducato di Toscana" were described. These data of Census, now kept by the State Archive of Florence , supply a precious source for studies of all different aspects of the population and include following information: name, surname, age, gender, marital status, employment, religion, schoolarity, "social status". In the registrers for each community and parish a full account is given of homes, resident families, and composition of families including family servants. Each of those entities had a proper incremental code number. [Registers were generated and updated by priests, who at that time were only surely scholarised officers widespread on territory, that is why they are divided by parish, which is an administrative unit typical of canonic right, instead of quarters or "rioni" or "contrade" which instead had been long practiced in civil right. ] During early 80:s the research group of prof. Biagioli of Department of Modern History of Pisa University, charged the computational linguistic Institute of CNR with digitalization and the electronic processing of these data as well as of data from "Catasto" [public registry of buildings and land ownership] to enable statistical, demographical, historical, sociological and economic analysis . 
In this work the authors have used the only partially usable subset of data left of that work, concerning four communities in the province of Pisa i.e actual Bièntina, Càscina, Pontedera and San Giuliano Terme (at that time named "Baths of San Giuliano") and is more concerned with terminological and lexical issues a gender related analysis of work and craftmanships. Each of the four communities has its own peculiar profile. Work is developed in 5 points: a) Informatics retrieval of linguistic information from Tuscany of 1800 focused by the arts and craftmanships more in use in families of that time, b) gender division of works and craftmanships, c) observation of lexical disparity in the four communities and terminological curiosities of that historical period, d) actually no longer existing craftmanships, e) diacronic analysis of communities, where possible. In this scenery the authors will introduce the methodology they employed for data analysis. Tables and graphs will be used to better focus different moments and results of work. The authors give the English translation of the terms extracted from the Corpus (see Appendix Glossary).}, KEYWORDS = {1841 Tuscany Population Census, Terminology}, PAGES = {200-205}, URL = {https://publications.cnr.it/doc/254230}, PUBLISHER = {Centro de Lingüística Aplicada, Ministero de Ciencia, Tecnología y Medio Ambiente (Santiago de Cuba, CUB)}, ISBN = {978-959-7174-22-6}, CONFERENCE_NAME = {XIII Simposio Internacional de Comunicación Social-Actulaizaciones en Comunicacion Social}, CONFERENCE_PLACE = {Santiago de Cuba}, CONFERENCE_DATE = {21-25 Jan 2013}, EDITOR = {Ruiz Miyares, L. and Álvarez Silva, M. R. and Muñoz Alvarado, A.}, } @EDITORIAL{DIRETTODASEGRE_2012_EDITORIAL_DAMS_221663, AUTHOR = {Diretto Da Segre, C. and A Cura Di Martignoni, C. and Morini, L. 
and Sassi, M.}, TITLE = {Rimario diacronico dell'Orlando Furioso}, YEAR = {2012}, ABSTRACT = {Il rimario dell'Orlando Furioso qui edito è diacronico perchè rappresenta sistematicamente i numerosi cambi di rimante fra le tre redazioni dell'Orlando Furioso, tutte e tre curate personalmente dall'autore (A, 1516, B, 1521; c, 1532). Se perciò un verso ha subito cambi di rimante esso è presente sotto i rimanti successivamente impiegati nelle tre redazioni. Il confronto è immediato in questo rimario diacronico, sia che si parta da un verso di A per arrivare alla forma assunta in B e poi in C, sia che si parta da C e si risalga a B e ad A. Gli sviluppi del contesto analizzato si possono riportare anche all'insieme del macrocontesto grazie all'acclusa Tavola comparativa delle tre edizioni originali ...}, KEYWORDS = {Orlando Furioso. Rimario Diacronico}, PAGES = {1-1702}, URL = {https://publications.cnr.it/doc/221663}, PUBLISHER = {Iuss Press (Pavia, ITA)}, ISBN = {9788861980686}, } @INPROCEEDINGS{PARDELLI_2012_INPROCEEDINGS_PGS_218940, AUTHOR = {Pardelli, G. and Goggi, S. and Sassi, M.}, TITLE = {Grey Literature Between Tradition and Innovation: Is There a Continuum?}, YEAR = {2012}, ABSTRACT = {This study wants to explore ways of social media communication for Grey Literature. In particular it describes the role of social media in relation with traditional channels and how social media applications can be used for Grey.}, KEYWORDS = {Grey Literature, Communication Networks, Knowledge Networking, Knowledge Exchange}, PAGES = {165-169}, URL = {https://publications.cnr.it/doc/218940}, VOLUME = {13}, CONFERENCE_NAME = {Thirteenth International Conference on Grey Literature: The Grey Circuit, From Social Networking to Wealth Creation (GL 13)}, CONFERENCE_PLACE = {Washington D. C. USA}, CONFERENCE_DATE = {5-6 December 2011}, } @INPROCEEDINGS{PARDELLI_2012_INPROCEEDINGS_PSGB_217173, AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S. 
and Biagioni, S.}, TITLE = {From medical language processing to BioNLP domain}, YEAR = {2012}, ABSTRACT = {This paper presents the results of a terminological work on a reference corpus in the domain of Biomedicine. In particular, the research tends to analyse the use of certain terms in Biomedicine in order to verify their change over the time with the aim of retrieving from the net the very essence of documentation. The terminological sample contains words used in BioNLP and biomedicine and identifies which terms are passing from scientific publications to the daily press and which are rather reserved to scientific production. The final scope of this work is to determine how scientific dissemination to an ever larger part of the society enables a public of common citizens to approach communication on biomedical research and development; and its main source is a reference corpus made up of three main repositories from which information related to BioNLP and Biomedicine is extracted. The paper is divided in three sections: 1) an introduction dedicated to data extracted from scientific documentation; 2) the second section devoted to methodology and data description; 3) the third part containing a statistical representation of terms extracted from the archive: indexes and concordances allow to reflect on the use of certain terms in this field and give possible keys for having access to the extraction of knowledge in the digital era.}, KEYWORDS = {Information Extraction, Information Retrieval, Text mining, Digital Libraries}, PAGES = {2049-2055}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/687_Paper.pdf}, VOLUME = {7}, PUBLISHER = {European Language Resources Association (ELRA)-Evaluations and Language resources Distribution Agency (ELDA) (Paris, FRA)}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {Eight International Conference on Language Resources and Evaluation. 
LREC'12}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {21-27 may 2012}, BOOKTITLE = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{PARDELLI_2012_INPROCEEDINGS_PSG_220806, AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S.}, TITLE = {Open Grey for Language Technology: a ride on the network}, YEAR = {2012}, ABSTRACT = {The aim of this paper is to introduce the Open Access movement for Natural Language Processing (NLP) by means of a wide range of open access Grey Literature documentation available on the web. In 2008 Robert Dale, in the last issue of volume 35 of Computational Linguistics said: "There are a number of definitions of the term 'open access' in circulation, but almost all share the key principle that scientific literature should be freely available for all to read, download, copy, distribute, and use (with appropriate attribution) without restriction". At first glance it might seem that the Open Access movement has gradually become more influential in the field of language technology by building repositories accessible through the network. Today's digital archives are niches of intellectual production spread by means of a wide range of documents (such as journal articles and proceedings) which, paradoxically, the search engines do not always reach. The use of inappropriate terms in the formulation of queries and the fragmentation of repositories in this area of investigation does not allow to retrieve information on a large scale. 
The full paper, after a first introductory section, will be organized in two sections: 1) the first dedicated to the methodology for searching and tracing open access resources and to the criteria for analyzing and selecting the online documentation; 2) the second devoted to a description of the state-of-the-art of Open Access Grey Literature material in a statistical and thematic scenario. As things stand, standardization of computational systems interconnected by links and tools of various nature allowing Internet users to easily retrieve the information that the web naturally makes available would then be essential. Topics: Sustainability, Public Accessible Resources, Product and Service enhancements, Open Access, Curation and Preservation}, KEYWORDS = {Open Access Movement Natural Language Processing}, PAGES = {89-94}, URL = {https://publications.cnr.it/doc/220806}, VOLUME = {14}, CONFERENCE_NAME = {GL14 Fourteenth International Conference on Grey Literature (GL14)}, CONFERENCE_PLACE = {National Research Council, Rome, Italy}, CONFERENCE_DATE = {29-30 November 2012}, } @ARTICLE{CARDUCCI_2011_ARTICLE_CASCC_30889, AUTHOR = {Carducci, A. and Alfani, S. and Sassi, M. and Cinini, A. and Calamusa, A.}, TITLE = {Mass media health information: Quantitative and qualitative analysis of daily press coverage and its relation with public perceptions}, YEAR = {2011}, ABSTRACT = {Objective: This paper describes the methods followed by the Pisa University OCS for collecting, storing and analyzing all health-related articles and database contents. Moreover, an example population survey on the topic of food safety based on such analysis is shown. Methods: Articles published each day since 1999 in Italy's three most popular newspapers are collected and stored in a Data Base Text; on these articles quantitative and qualitative analyses were conducted. 
On the basis of these results as well as of epidemiological data, a questionnaire survey was carried out about sources of information, knowledge and risk perception of citizens regarding food safety. Results: On a total of 24,434 articles on all health topics, 18% regarded food related hazards: their evolution over time showed peaks on BSE, avian flu and dioxin. A large proportion of the people surveyed declared having changed their food habits, at least temporarily, as a consequence of media information. Most get their information on food safety mainly from television. Most respondents remembered having previously heard news on BSE, avian flu and dioxin, but did not recall having heard of listeriosis, brucellosis or typhoid fever. Conclusions: Newspapers articles facing food related hazards tend to be alarming thus affecting the citizens risk perception. On the other hand people often ignore how to manage their own food safety in a practical way. Practice implications: Analysis of media messages can help to evaluate and correct the negative effects that may result in wrong information.}, KEYWORDS = {Risk perception, Food safety, Mass media Communication, Population survey}, PAGES = {475-478}, URL = {http://www.sciencedirect.com/science/article/pii/S0738399111000061}, VOLUME = {Volume 82, Issue 3}, DOI = {10.1016/j.pec.2010.12.025}, PUBLISHER = {Excerpta Medica (Princeton, N. J, Stati Uniti d'America)}, ISSN = {0738-3991}, JOURNAL = {Patient education and counseling}, } @ARTICLE{MARZI_2011_ARTICLE_MPS_186118, AUTHOR = {Marzi, C. and Pardelli, G. 
and Sassi, M.}, TITLE = {A terminology based re-definition of Grey Literature}, YEAR = {2011}, ABSTRACT = {The conventionally accepted definition of Grey Literature, as Information produced and distributed by non-commercial publishing, does not take into consideration either the increasing availability of forms of grey knowledge, or the growing importance of computer-based encoding and management as the standard mode of creating and developing grey literature. Semi-automated terminological analysis of almost twenty years of terminological creativity in the proceedings of eleven GL International Conferences offers the opportunity to pave the way to a bottom-up redefinition of Grey Literature stemming from attested terminological creativity and lexical innovation. In this paper, we focus on a set of automatically-acquired terms obtained by subjecting our reference Corpus to a number of pre-processing steps of automated text analysis, such as concordances, frequency lists and lexical association scores. Acquired terms allow us to throw in sharp relief developing trends and important shifts of emphasis in the current understanding of the notion of Grey Literature.}, KEYWORDS = {Grey Literature, Terminology extraction}, PAGES = {19-23}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-84869064979\&origin=inward}, VOLUME = {7}, PUBLISHER = {TextRelease (Amsterdam, Paesi Bassi)}, ISSN = {1574-1796}, JOURNAL = {The Grey journal (Print)}, } @INCOLLECTION{SASSI_2011_INCOLLECTION_SC_206089, AUTHOR = {Sassi, M. and Cinini, A.}, TITLE = {La banca dati dei provvedimenti della sezione disciplinare del Consiglio Superiore della Magistratura (1990-2007)}, YEAR = {2011}, KEYWORDS = {Linguistica Computazionale, Analisi Sentenze}, PAGES = {129-150}, URL = {https://publications.cnr.it/doc/206089}, PUBLISHER = {CLUEB (Bologna, ITA)}, ISBN = {978-88-491-3513-8}, EDITOR = {Fabri, M.}, } @INPROCEEDINGS{MARZI_2011_INPROCEEDINGS_MPS_176389, AUTHOR = {Marzi, C. and Pardelli, G. 
and Sassi, M.}, TITLE = {A terminology based re-definition of Grey Literature}, YEAR = {2011}, ABSTRACT = {The conventionally accepted definition of Grey Literature, as Information produced and distributed by non-commercial publishing, does not take into consideration either the increasing availability of forms of grey knowledge, or the growing importance of computer-based encoding and management as the standard mode of creating and developing grey literature. Semi-automated terminological analysis of almost twenty years of terminological creativity in the proceedings of eleven GL International Conferences offers the opportunity to pave the way to a bottom-up redefinition of Grey Literature stemming from attested terminological creativity and lexical innovation. In this paper, we focus on a set of automatically-acquired terms obtained by subjecting our reference Corpus to a number of pre-processing steps of automated text analysis, such as concordances, frequency lists and lexical association scores. Acquired terms allow us to throw in sharp relief developing trends and important shifts of emphasis in the current understanding of the notion of Grey Literature.}, KEYWORDS = {GL conference corpus, Grey literature definition, Terminology extraction}, PAGES = {27-31}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-84883303651\&origin=inward}, VOLUME = {12}, PUBLISHER = {TextRelease (Amsterdam, NLD)}, ISSN = {1386-2316}, ISBN = {9789077484166}, CONFERENCE_NAME = {Twelfth International Conference on Grey Literature: Transparency in Grey Literature, Grey Tech Approaches to High Tech Issues}, CONFERENCE_PLACE = {Praga}, CONFERENCE_DATE = {6-7 dicembre 2010}, BOOKTITLE = {Transparency in Grey Literature, Grey Tech Approaches to High Tech Issues}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @INPROCEEDINGS{PARDELLI_2011_INPROCEEDINGS_PSOBG_199282, AUTHOR = {Pardelli, G. and Sassi, M. and Orsolini, P. and Biagioni, S. 
and Giannini, S.}, TITLE = {An open archive of scientific communication}, YEAR = {2011}, ABSTRACT = {This paper presents the results of a terminological work conducted by the authors on a Digital Archives Net of the Italian National Research Council (CNR) in the field of Computer Science. In particular, the research tends to analyse the use of certain terms in Computer Science in order to verify their change over the time with the aim of retrieving from the net the very essence of documentation. Its main source is a reference corpus made up of 13,500 documents which collects the scientific productions of three CNR research Institutes. They are ISTI (Institute of Information Science and Technologies), IIT (Institute of Informatics and Telematics) and ILC (Institute of Computational Linguistics), all of them born from the "Centro Studi sulle Calcolatrici Elettroniche (CSCE)" and now belonging to the CNR Department of Information \& Communication Technologies and Cultural Identity. This study is divided in three sections: an introductory one dedicated to the data extracted from the scientific documentation: the data have in common the use of some terms proper of the Computer Science lexicon although these term belong to different branches (Linguistics, Informatics and Telematics); the second section is devoted to the description of the contents managed by the PUMA (Publication Management System) system; the third section contains a statistical representation of terms extracted from archive: some comparison tables between the occurrences of the most used terms in the scientific documentation produced by the three Institutes will be created and diagrams with percentages about the most frequently used terms will be displayed too. 
Lastly, indexes and concordances will allow to reflect on the use of certain terms in this field and give possible keys for having access to the extraction of knowledge in the digital era.}, KEYWORDS = {Digital Archives, Communication, Terminology, Open Access}, PAGES = {914-918}, URL = {http://www.santiago.cu/hosting/linguistica/simposios.php?s=XII}, VOLUME = {II}, PUBLISHER = {Centro de linguística aplicada, Ministerio de ciencia, tecnología y medio ambiente (Santiago de Cuba, CUB)}, ISBN = {978-959-7174-19-6}, CONFERENCE_NAME = {Comunicación Social en el Siglo XXI. XII Simposio Internacional de Comunicación Social}, CONFERENCE_PLACE = {Santiago de Cuba}, CONFERENCE_DATE = {17-21 gennaio 2011}, BOOKTITLE = {Comunicación social en el siglo XXI, vol. II}, EDITOR = {Miyares, L. R. and Silva, M. R. Á.}, } @INPROCEEDINGS{PARDELLI_2011_INPROCEEDINGS_PSG_205788, AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S.}, TITLE = {Grey Literature Between Tradition and Innovation: Is there a Continuum?}, YEAR = {2011}, ABSTRACT = {This study wants to explore ways of social media communication for Grey Literature. In particular it describes the role of social media in relation with traditional channels and how social media applications can be used for Grey.}, KEYWORDS = {Grey Literature, Communication networks, Knowledge networking, knowledge exchange}, PAGES = {64-65}, URL = {https://publications.cnr.it/doc/205788}, VOLUME = {13}, ISBN = {978-90-77484-00-5}, CONFERENCE_NAME = {Thirteenth International Conference on Grey Literature: The Grey Circuit, From Social Networking to Wealth Creation}, CONFERENCE_PLACE = {Washington D. C. USA-Library of Congress}, CONFERENCE_DATE = {5-6 December 2011}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @TECHREPORT{CUCURULLO_2011_TECHREPORT_CS_206469, AUTHOR = {Cucurullo, S. 
and Sassi, M.}, TITLE = {Archivio Elettronico delle Concordanze Diacroniche dell'Orlando Furioso}, YEAR = {2011}, ABSTRACT = {The project to build a digital electronic archiving of the concordances diachronic Orlando Furioso began in the '70s and had as its objective the study of a computational method for the treatment of variants. The basic text, provided by the "Accademia della Crusca", refers to the latest edition published in 1532 by the author, while the first 2 editions, published respectively in 1516 and in 1521, had not yet been the subject of electronic transcription. This has directed the staff of the project towards a reconstruction of the previous witnesses through accurate recording of the critical apparatus Debenedetti - Segre, 1960.}, KEYWORDS = {DBT, Orlando Furioso, banca-dati testuale}, PAGES = {11}, URL = {https://publications.cnr.it/doc/206469}, } @ARTICLE{MARZI_2010_ARTICLE_MPS_64555, AUTHOR = {Marzi, C. and Pardelli, G. and Sassi, M.}, TITLE = {Grey literature and computational linguistics: From paper to net}, YEAR = {2010}, ABSTRACT = {The advent and exponential development of the World Wide Web has led to an increasing availability of unstructured knowledge and distributed information sources, meeting general public requirements that are hardly addressed by other more traditional information channels. This trend has concurrently raised a considerable interest in the application of Computational Linguistics (CL) methodologies to document access and retrieval, as they offer the unprecedented opportunity to make the subjective, user-centred information demands of Net citizens meet the ever changing and heterogeneous information flow of the web. Over the last five years, more and more Italian Universities have introduced CL courses into their Humanities curricula, making available on-line teaching materials, tutorials and language engineering software that appear to supply the lack of offer from traditional Italian publishing houses. 
In this paper, we consider in some detail the role played by this type of Grey Literature in bringing up a wider and increasingly more aware community of web users in Italy.}, KEYWORDS = {Grey Literature}, PAGES = {145-148}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-78149461778\&origin=inward}, VOLUME = {6}, PUBLISHER = {TextRelease (Amsterdam, Paesi Bassi)}, ISSN = {1574-1796}, JOURNAL = {The Grey journal (Print)}, } @INPROCEEDINGS{MARZI_2010_INPROCEEDINGS_MPS_84790, AUTHOR = {Marzi, C. and Pardelli, G. and Sassi, M.}, TITLE = {Grey Literature and Computational Linguistics: From Paper to Net}, YEAR = {2010}, ABSTRACT = {The advent and exponential development of the World Wide Web has led to an increasing availability of unstructured knowledge and distributed information sources, meeting general public requirements that are hardly addressed by other more traditional information channels. This trend has concurrently raised a considerable interest in the application of Computational Linguistics (CL) methodologies to document access and retrieval, as they offer the unprecedented opportunity to make the subjective, user-centred information demands of Net citizens meet the ever changing and heterogeneous information flow of the web. Over the last five years, more and more Italian Universities have introduced CL courses into their Humanities curricula, making available on-line teaching materials, tutorials and language engineering software that appear to supply the lack of offer from traditional Italian publishing houses. 
In this paper, we consider in some detail the role played by this type of Grey Literature in bringing up a wider and increasingly more aware community of web users in Italy.}, KEYWORDS = {Computational Linguistics, Grey, Web-based information}, PAGES = {81-84}, URL = {https://publications.cnr.it/doc/84790}, VOLUME = {11}, PUBLISHER = {TextRelease (Amsterdam, NLD)}, ISSN = {1386-2316}, ISBN = {978-90-77484-13-5}, CONFERENCE_NAME = {Eleventh International Conference on Grey Literature. The Grey Mosaic, Piecing it All Together}, CONFERENCE_PLACE = {Washington, DC}, CONFERENCE_DATE = {14-15 dicembre 2009}, BOOKTITLE = {The Grey Mosaic, Piecing it All Together}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @INPROCEEDINGS{SASSI_2010_INPROCEEDINGS_SPBCG_171547, AUTHOR = {Sassi, M. and Pardelli, G. and Biagioni, S. and Carlesi, C. and Goggi, S.}, TITLE = {A Digital Archive of Research Papers in Computer Science}, YEAR = {2010}, ABSTRACT = {This paper presents the results of a terminological work conducted by the authors on a Digital Archives Net of the Italian National Research Council (CNR) in the field of Computer Science. In particular, the research tends to analyse the use of certain terms in Computer Science in order to verify their change over the time with the aim of retrieving from the net the very essence of documentation. Its main source is a reference corpus made up of 13,500 documents which collects the scientific productions of three CNR research Institutes. They are ISTI (Institute of Information Science and Technologies), IIT (Institute of Informatics and Telematics) and ILC (Institute of Computational Linguistics), all of them born from the "Centro Studi sulle Calcolatrici Elettroniche (CSCE)" and now belonging to the CNR Department of Information \& Communication Technologies and Cultural Identity. 
This study is divided in three sections: 1) an introductory one dedicated to the data extracted from the scientific documentation: the data have in common the use of some terms proper of the Computer Science lexicon although these term belong to different branches (Linguistics, Informatics and Telematics); 2) the second section is devoted to the description of the contents managed by the PUMA (Publication Management System) system; 3) the third part contains a statistical representation of terms extracted from archive: some comparison tables between the occurrences of the most used terms in the scientific documentation produced by the three Institutes will be created and diagrams with percentages about the most frequently used terms will be displayed too. Lastly, indexes and concordances will allow to reflect on the use of certain terms in this field and give possible keys for having access to the extraction of knowledge in the digital era.}, KEYWORDS = {Digital libraries, Document Classification, Text categorisation, Text mining, Natural Language Processing, Text analysis}, PAGES = {1245-1248}, URL = {http://www.lrec-conf.org/proceedings/lrec2010/summaries/945.html}, PUBLISHER = {European Language Resources Association (ELRA)-Evaluations and Language resources Distribution Agency (ELDA) (Paris, FRA)}, ISBN = {2-9517408-6-7}, CONFERENCE_NAME = {Seventh International Conference on Language Resources and Evaluation}, CONFERENCE_PLACE = {Valletta, Malta}, CONFERENCE_DATE = {17-23 Maggio 2010}, BOOKTITLE = {Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10)}, EDITOR = {Calzolari, N. and Choukri, K. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S. and Rosner, M. and Tapias, D.}, } @INPROCEEDINGS{MARZI_2010_INPROCEEDINGS_MPS_186131, AUTHOR = {Marzi, C. and Pardelli, G. 
and Sassi, M.}, TITLE = {A Terminology Based Re-Definition of Grey Literature}, YEAR = {2010}, ABSTRACT = {The Luxembourg Convention on Grey Literature held in 1997 offered the following definition of Grey Literature (expanded in New York, 2004): "Information produced and distributed on all levels of government, academics, business and industry in electronic and print formats not controlled by commercial publishing, i.e. where publishing is not the primary activity of the producing body". Is this definition still valuable? Is it so far completely satisfactory? Or does it rather need important modifications? We suggest that an interesting re-definition of GL can be based upon careful examination of the longitudinal trend of 10 years of terminological creativity in the proceedings of the GL international Conference. Our empirical basis is the Corpus of GreyText Inhouse Archive, available on http://www.greynet.org/opensiglerepository.html consisting of titles, themes, keywords and full abstracts, for a total amount of more than sixty thousand word tokens. In the full version of our paper, we intend to focus on a set of automatically-acquired terms (both single-word and multi-word terms) obtained by subjecting our reference Corpus to a number of pre-processing steps of automated text analysis, such as concordances, frequency lists and lexical association scores (e.g. Mutual Information on word pairs). To anticipate some of our results, the following three terms, that appear to be shared by various disciplinary sub-fields, mark, in our view, important stages in the evolution of our current understanding of GL: digital, access and web. The attribute digital, an increasingly popular synonym of the now obsolete electronic, emphasises the growing importance of computer-based encoding as the standard medium of GL. 
The noun access (defining the process of accessing text documents) is seen in the company of adjectives like easy, full, grey and open to shape up important conceptual innovations in the way GL material is distributed: e.g. open access focuses on the free accessibility of digital contents. Coupled with information, document and repository (note, however, that repository is generally understood as a technical synonym of open archive), access points to a conception of world-wide available, structured cultural contents. Finally, reference to the web lays emphasis on the huge importance of the World Wide Web as the standard means of disseminating GL. All these aspects are not fully taken into account in the standard definition of GL reported above. Our inquiry is intended to pave the way to a bottom-up re-definition of GL, stemming from the terminological creativity and lexical innovation monitored over ten years of technical work in the field.}, KEYWORDS = {Terminology extraction, Grey Literature definition, GL Conference corpus}, PAGES = {24-28}, URL = {https://publications.cnr.it/doc/186131}, VOLUME = {12}, ISSN = {1385-2308}, ISBN = {978-90-77484-15-9}, CONFERENCE_NAME = {Twelfth International Conference on Grey Literature: Transparency in Grey Literature, Grey Tech Approaches to High Tech Issues}, CONFERENCE_PLACE = {Prague}, CONFERENCE_DATE = {6-7/12/2010}, BOOKTITLE = {Transparency in Grey Literature, Grey Tech Approaches to High Tech Issues}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @INPROCEEDINGS{PICCHI_2010_INPROCEEDINGS_PSBG_120718, AUTHOR = {Picchi, E. and Sassi, M. and Biagioni, S. and Giannini, S.}, TITLE = {Extending the "Facets" concept by applying NLP tools to catalog records of scientific literature}, YEAR = {2010}, ABSTRACT = {The prototype of an "intelligent" navigation system, which has been implemented on the contents of PUMA (http://puma.isti.cnr.it), a digital library of scientific literature, is presented. 
The system has been implemented by integrating our core textual search engine (known as DBT) with the TextPower (TP) technology. TP is based on NLP techniques and linguistic resources and provides tools specialized for the evaluation, analysis, classification and browsing of scientific literature. TP extends the facet concept by extracting "field + content" pairs not only from structured fields but also from free text, eg. abstracts, using a linguistic-statistical approach to annotate relevant terminology, named entities, etc. The enriched text can be queried, analysed, and classified using a new version of the DBT System known as "DBT\&Facets". DBT\&Facets has been implemented on the full bibliographic records of the documents archived in the PUMA digital library of the Italian National Research Council (CNR). PUMA is a user-focused, service-oriented infrastructure which manages 30 CNR institutional repositories containing about 25,000 published or open access documents in a wide variety of disciplines. In an open domain like scientific documentation, our approach based on the criteria of "semantic similarity" is useful - and perhaps more objective than one based on hierarchical elements - as it makes it possible to link different types of information, also across domains if necessary. DBT\&Facets is an advanced search tool that permits the user to query and refine their results, and to identify particular relations between them. The aim of the project has been to structure a knowledge system of domain-specific information which assists the user by suggesting possible directions for their search.}, KEYWORDS = {NLP tools, Digital libraries}, PAGES = {82-87}, URL = {https://publications.cnr.it/doc/120718}, ISBN = {978-90-77484-15-9}, CONFERENCE_NAME = {Twelfth International Conference on Grey Literature}, CONFERENCE_PLACE = {Praga}, CONFERENCE_DATE = {6-7 December 2010}, EDITOR = {Farace, D. J. and Frantzen, J. 
and Greynet}, } @INPROCEEDINGS{CIGNONI_2009_INPROCEEDINGS_CPS_84740, AUTHOR = {Cignoni, L. and Pardelli, G. and Sassi, M.}, TITLE = {Grey Literature for Natural Language Processing: a Terminological and Statistical Approach}, YEAR = {2009}, ABSTRACT = {This paper presents the results of a study on grey literature (GL) in the field of Natural Language Processing (NLP). Our data has been collected in a corpus of ca 13,000 records corresponding to the titles of papers presented at International Conferences from 1950 to June 2008. A statistical representation of the most significant terms relative to GL in NLP and other interrelated disciplines associates old and new words, highlighting the terminological changes that have taken place in the course of time. Aim of our study is to contribute to the creation of language resources for the extraction of GL coming from the Web in order to help prevent the disappearance of documents containing NLP words that have undergone rapid development over the last decades. This paper is organised as follows: after a general introduction to our work, section 2 provides a historical overview of NLP; sections 3 and 4 offer an account of the most relevant terms used by specialists in different periods, and indicative of the changes that have taken place; section 5 describes the methodology we have used and also contains information on our GL database and a graphical representation of the data. 
Finally, the conclusions stress the need to integrate pre-existing or obsolete words and expressions, creating NLP synonym relations.}, KEYWORDS = {Computational Linguistics, Terminology, Grey Literature}, PAGES = {93-100}, URL = {https://publications.cnr.it/doc/84740}, VOLUME = {10}, PUBLISHER = {TextRelease (Amsterdam, NLD)}, ISBN = {978-90-77484-11-1}, CONFERENCE_NAME = {Tenth International Conference on Grey Literature: Designing the Grey Grid for Information Society}, CONFERENCE_PLACE = {Amsterdam}, CONFERENCE_DATE = {DEC 08-09, 2008}, BOOKTITLE = {Designing the Grey Grid for Information Society}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @INPROCEEDINGS{PARDELLI_2009_INPROCEEDINGS_PSGO_84738, AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S. and Orsolini, P.}, TITLE = {Computational Linguistics Terminology}, YEAR = {2009}, ABSTRACT = {The aim of this article is to provide a statistical representation of significant terms used in the field of Natural Language Processing from the 1960's till nowadays, in order to draft a survey on the most significant research trends in that period. By retrieving these keywords it should be possible to highlight the ebb and flow of some thematic topics. The NLP terminological sample derives from a database - created for this purpose using the DBT software (Textual Data Base, ILC patent). 
Scientific presentations at the above-mentioned conferences point out a frequent recurrence of expressions such as mécanisation des études lexicologique, les machines à cartes perforées et leurs application lexicologique which trace back to the origin of electronic processing of linguistic data and to some solutions of linguistic-literary problems, to lexicographic researches, to the scientific terminology, to automatic dictionaries, to homographs, synonyms and the possibility of producing indexes and concordances by means of an electronic processor: Terms such as meccanizzazione, mechanical translation, machine à traduire used by experts of the field in the 1950s and 1960s seem to well testify the change, the shift, the beginning and then the final consecration of a rapidly evolving field: Natural Language Processing.}, KEYWORDS = {Computational Linguistics, Terminology}, PAGES = {303-307}, URL = {https://publications.cnr.it/doc/84738}, PUBLISHER = {Centro de linguística aplicada, Ministerio de ciencia, tecnología y medio ambiente (Santiago de Cuba, CUB)}, ISBN = {978-959-7174-14-1}, CONFERENCE_NAME = {XI Simposio Internacional de Comunicación Social}, CONFERENCE_PLACE = {Santiago de Cuba}, CONFERENCE_DATE = {19-23 de enero de 2009}, EDITOR = {Silva, M. R. A. and Moreno, C. A. and Miyares, L. R.}, } @INPROCEEDINGS{SASSI_2009_INPROCEEDINGS_S_84746, AUTHOR = {Sassi, M.}, TITLE = {La obra de Alejo Carpentier en versión digital: historial, descripción y propuestas}, YEAR = {2009}, KEYWORDS = {Banca dati Testuale, Alejo Carpentier}, URL = {https://publications.cnr.it/doc/84746}, CONFERENCE_NAME = {XI Simposio Internacional de Comunicación social}, CONFERENCE_PLACE = {Santiago de Cuba}, CONFERENCE_DATE = {2009}, } @INPROCEEDINGS{SASSI_2009_INPROCEEDINGS_SPG_84757, AUTHOR = {Sassi, M. and Pardelli, G. 
and Goggi, S.}, TITLE = {Terminology Extraction from the web}, YEAR = {2009}, ABSTRACT = {This paper presents the results of a study on textual resources in the field of Human Language Technology (HLT). A statistical representation of the most significant terms in HLT and other interrelated disciplines associates old and new words, highlighting the terminological changes that have taken place in the course of time. Aim of our study is to contribute to the creation of language resources for the extraction of documentation coming from the Web in order to help preventing the disappearance of documents containing HLT words that have undergone rapid development over the last decades. This paper is organised as follows: after a general introduction to our work, section 2 provides a historical overview of HLT; sections 3 and 4 offer an account of the most relevant terms used by specialists in different periods, and those indicative of the changes that have taken place; section 5 describes the methodology we have used and also contains information on our database and a graphical representation of the data. Finally, the conclusions stress the need to integrate pre-existing or obsolete words and expressions, creating HLT synonym relations.}, KEYWORDS = {Terminology, Computational Linguistics, Web-based information}, PAGES = {417-420}, URL = {https://publications.cnr.it/doc/84757}, ISBN = {978-83-7177-746-2}, CONFERENCE_NAME = {4th Language Technology Conference: Human Language Technology as a challenge for Computer Science and Linguistics}, CONFERENCE_PLACE = {Poznan, PL}, CONFERENCE_DATE = {November 6-8, 2009}, EDITOR = {Vetulani, Z.}, } @INPROCEEDINGS{MARZI_2009_INPROCEEDINGS_MPS_112950, AUTHOR = {Marzi, C. and Pardelli, G. 
and Sassi, M.}, TITLE = {Grey Literature and Computational Linguistics: From Paper to Net}, YEAR = {2009}, ABSTRACT = {The advent and exponential development of the World Wide Web has led to an increasing availability of unstructured knowledge and distributed information sources, meeting general public requirements that are hardly addressed by other more traditional information channels. This trend has concurrently raised a considerable interest in the application of Computational Linguistics (CL) methodologies to document access and retrieval, as they offer the unprecedented opportunity to make the subjective, user-centred information demands of Net citizens meet the ever changing and heterogeneous information flow of the web. Over the last five years, more and more Italian Universities have introduced CL courses into their Humanities curricula, making available on-line teaching materials, tutorials and language engineering software that appear to supply the lack of offer from traditional Italian publishing houses. In this paper, we consider in some detail the role played by this type of Grey Literature in bringing up a wider and increasingly more aware community of web users in Italy. Theme: Impact of Grey Literature on Net Citizens}, KEYWORDS = {Computational Linguistics, Grey Literature, Web-based information}, PAGES = {81-84}, URL = {https://publications.cnr.it/doc/112950}, VOLUME = {11}, ISBN = {978-90-77484-14-2}, CONFERENCE_NAME = {Eleventh International Conference on Grey Literature "The Grey Mosaic, Piecing it All Together"}, CONFERENCE_PLACE = {Washington, DC}, CONFERENCE_DATE = {14-15 December 2009}, BOOKTITLE = {Eleventh International Conference on Grey Literature "The Grey Mosaic, Piecing it All Together"}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @TECHREPORT{SASSI_2009_TECHREPORT_SC_157456, AUTHOR = {Sassi, M. and Cinini, A.}, TITLE = {Dieci anni di informazione sanitaria. 
Analisi di tre quotidiani a tiratura nazionale}, YEAR = {2009}, ABSTRACT = {L'Istituto di Linguistica Computazionale (ILC-CNR) ha realizzato In collaborazione con L'Osservatorio della Comunicazione Sanitaria (OCS) una Banca Dati Testuale composta dagli articoli che trattano Informazione Sanitaria, pubblicati a partire dall'anno 1999 su tre quotidiani a diffusione nazionale: Corriere della Sera (C), Repubblica (R) e Stampa (S).}, KEYWORDS = {Analisi Informazione sanitaria, Banca dati Testuale}, PAGES = {1-31}, URL = {https://publications.cnr.it/doc/157456}, } @INPROCEEDINGS{CIGNONI_2008_INPROCEEDINGS_CPS_112937, AUTHOR = {Cignoni, L. and Pardelli, G. and Sassi, M.}, TITLE = {Grey Literature for Natural Language Processing: a Terminological and Statistical Approach}, YEAR = {2008}, ABSTRACT = {This paper presents the results of a study on grey literature (GL) in the field of Natural Language Processing (NLP). Our data has been collected in a corpus of ca 13,000 records corresponding to the titles of papers presented at International Conferences from 1950 to June 2008. A statistical representation of the most significant terms relative to GL in NLP and other interrelated disciplines associates old and new words, highlighting the terminological changes that have taken place in the course of time. Aim of our study is to contribute to the creation of language resources for the extraction of GL coming from the Web in order to help prevent the disappearance of documents containing NLP words that have undergone rapid development over the last decades. This paper is organised as follows: after a general introduction to our work, section 2 provides a historical overview of NLP; sections 3 and 4 offer an account of the most relevant terms used by specialists in different periods, and indicative of the changes that have taken place; section 5 describes the methodology we have used and also contains information on our GL database and a graphical representation of the data. 
Finally, the conclusions stress the need to integrate pre-existing or obsolete words and expressions, creating NLP synonym relations.}, KEYWORDS = {Computational Linguistics, Grey Literature}, PAGES = {116-120}, URL = {http://hdl.handle.net/10068/697993}, ISBN = {978-90-77484-12-8}, CONFERENCE_NAME = {Tenth International Conference on Grey Literature: Designing the Grey Grid for Information Society}, CONFERENCE_PLACE = {Amsterdam}, CONFERENCE_DATE = {December 8-9 2008}, EDITOR = {Farace, D. J. and Frantzen, J.}, } @TECHREPORT{CININI_2008_TECHREPORT_CS_390626, AUTHOR = {Cinini, A. and Sassi, M.}, TITLE = {Aggiornamento della Banca Dati del CSM}, YEAR = {2008}, ABSTRACT = {Aggiornamento della Banca dati delle sentenze e ordinanze della Sezione Disciplinare del Consiglio Superiore della Magistratura, realizzata in collaborazione con l'Istituto di Ricerca sui Sistemi Giudiziari (IRSIG-CNR), con i documenti relativi agli anni 2004-2007. Sperimentazione di nuove funzioni di estrazione dell'informazione tramite software di trattamento automatico del linguaggio (TAL), con particolare riferimento ad analisi diacroniche dei risultati di ricerche complesse su dati testuali.}, KEYWORDS = {Informatica giuridica documentale, Analisi Sentenze, DBT}, PAGES = {1-17}, URL = {https://publications.cnr.it/doc/390626}, } @TECHREPORT{PARDELLI_2008_TECHREPORT_PSOP_255578, AUTHOR = {Pardelli, G. and Sassi, M. and Orsolini, P. and Parrinelli, V.}, TITLE = {Verso la costruzione di una Biblioteca Digitale}, YEAR = {2008}, ABSTRACT = {A data base of the "Antonio Zampolli Fund" has been created and the respective catalogue has been published. The work of analysis and selection of texts for cataloguing helped in creating this bibliography, in large part built on references extracted by books and journals. 
Very old bibliographical references have also been retrieved by curricula prepared by Professor Zampolli for various projects and commissions.},
  KEYWORDS = {Biblioteca Digitale, Linguistica Computazionale},
  PAGES = {1--43},
  URL = {https://publications.cnr.it/doc/255578},
}

@MISC{CUCURULLO_2008_MISC_CPSSMM_151565,
  AUTHOR = {Cucurullo, S. and Picchi, E. and Sassi, M. and Segre, C. and Martignoni, C. and Morini, L.},
  TITLE = {Le concordanze diacroniche dell'Orlando Furioso},
  YEAR = {2008},
  KEYWORDS = {Furioso, Orlando, Concordanze diacroniche},
  URL = {https://publications.cnr.it/doc/151565},
}

@INPROCEEDINGS{PARDELLI_2007_INPROCEEDINGS_PSG_84678,
  AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S.},
  TITLE = {A survey on Human Language Technology Terminology},
  YEAR = {2007},
  ABSTRACT = {This article originates from the revision of a 1969 unpublished article by Professor Antonio Zampolli carried out by Gabriella Pardelli and Manuela Sassi, two of his collaborators at the Institute of Computational Linguistics in Pisa. It is a technical report titled "Due Conversazioni sul Panorama Attuale della Linguistica Computazionale", drawn up by Zampolli on the occasion of two lectures at the Istituto di Matematica Ulisse Dini of Florence in June 1969. A synthesis of the introductory part - mainly based on some classifications for the various areas of Computational Linguistics - is reported here because the most interesting from the point of view of the relationship between automatic processing of linguistic data and other sciences. The rich bibliographic part has been extracted as well from the report and used for a terminological statistical analysis. Some sections, for example those on the International Conference on Computational Linguistics of 1969 and on the "Sezione Linguistica" of CNUCE in Pisa, have not - or only partly - been taken into account because already published by Zampolli in other books and journals (and not because considered less important).
The whole revised technical report will soon be published in the "Quaderni di Linguistica Computazionale" edited by the Istituto di Linguistica Computazionale. The paper is divided in three parts: the first section is a terminological overview on the use of terms such like Computational Linguistics, Applied Linguistics and Mathematical Linguistics; the second has a statistical approach and shows the graphical representation of terms extracted from bibliographies and used in the 1960s; lastly, the conclusions. This contribution is a "historical" document which places itself at the beginning of a field which afterwards knew an exceptional development and it highlights both the continuity and the change which brought to the present Human Language Technology.},
  KEYWORDS = {Human Language Technology, Terminology},
  PAGES = {364--368},
  URL = {https://publications.cnr.it/doc/84678},
  PUBLISHER = {Wydawnictwo Poznanskie Sp. z o. o (Poznan, POL)},
  ISBN = {978-90-77484-17-3},
  CONFERENCE_NAME = {3rd Language \& Technology Conference},
  CONFERENCE_PLACE = {Poznan},
  CONFERENCE_DATE = {October 5-7, 2007},
  BOOKTITLE = {Human Language Technologies as a Challenge for Computer Science and Linguistics},
  EDITOR = {Vetulani, Z.},
}

@INPROCEEDINGS{SASSI_2007_INPROCEEDINGS_SC_112932,
  AUTHOR = {Sassi, M. and Cinini, A.},
  TITLE = {Il monitoraggio dell'amministrazione della giustizia},
  YEAR = {2007},
  URL = {https://publications.cnr.it/doc/112932},
  CONFERENCE_NAME = {Tecnologia dell'informazione e della comunicazione per la giustizia},
  CONFERENCE_PLACE = {Roma},
  CONFERENCE_DATE = {2007},
}

@TECHREPORT{CININI_2007_TECHREPORT_CS_157430,
  AUTHOR = {Cinini, A. and Sassi, M.},
  TITLE = {Archivio del Digesto Latino-Italiano},
  YEAR = {2007},
  URL = {https://publications.cnr.it/doc/157430},
}

@TECHREPORT{CININI_2007_TECHREPORT_CS_157431,
  AUTHOR = {Cinini, A. and Sassi, M.},
  TITLE = {L'Informazione sanitaria.
Analisi di tre quotidiani a tiratura nazionale},
  YEAR = {2007},
  URL = {https://publications.cnr.it/doc/157431},
}

@TECHREPORT{SASSI_2007_TECHREPORT_SC_157432,
  AUTHOR = {Sassi, M. and Cinini, A.},
  TITLE = {Content analysis dei provvedimenti della sezione disciplinare del C. S. M},
  YEAR = {2007},
  URL = {https://publications.cnr.it/doc/157432},
}

@BOOK{CININI_2006_BOOK_CS_143963,
  AUTHOR = {Cinini, A. and Sassi, M.},
  TITLE = {Content analysis dei provvedimenti della sezione disciplinare del C. S. M},
  YEAR = {2006},
  ABSTRACT = {Nell'ambito del progetto di ricerca denominato "Tecnologie dell'informazione e della comunicazione per la giustizia" coordinato dall'Istituto di Ricerca sui Sistemi Giudiziari (IRSIG-CNR di Bologna) e co-finanziato dai Fondi Integrativi Ricerca di Base (FIRB) del Ministero dell'Università e della Ricerca, l'Istituto di Linguistica Computazionale (ILC-CNR di Pisa) ha collaborato con l'IRSIG e con il Centro Studi e Ricerche sull'Ordinamento Giudiziario dell'Università di Bologna per la realizzazione di una base dati elettronica per l'analisi dei provvedimenti della sezione disciplinare del Consiglio Superiore della Magistratura.},
  KEYWORDS = {Informatica giuridica documentale, Knowledge extraction from texts, DBT},
  PAGES = {1--68},
  URL = {https://publications.cnr.it/doc/143963},
  PUBLISHER = {S. T. A. R. Servizio Tecnografico Area Ricerca CNR (Pisa, ITA)},
}

@INPROCEEDINGS{CALZOLARI_2006_INPROCEEDINGS_CSSCPBEMSC_84625,
  AUTHOR = {Calzolari, F. and Sassolini, E. and Sassi, M. and Cucurullo, S. and Picchi, E. and Bertagna, F. and Enea, A. and Monachini, M. and Soria, C. and Calzolari, N.},
  TITLE = {Next Generation Language Resources using Grid},
  YEAR = {2006},
  ABSTRACT = {This paper presents a case study concerning the challenges and requirements posed by next generation language resources, realized as an overall model of open, distributed and collaborative language infrastructure.
If a sort of "new paradigm" for language resource sharing is required, we think that the emerging and still evolving technology connected to Grid computing is a very interesting and suitable one for a concrete realization of this vision. Given the current limitations of Grid computing, it is very important to test the new environment on basic language analysis tools, in order to get the feeling of what are the potentialities and possible limitations connected to its use in NLP. For this reason, we have done some experiments on a module of the Linguistic Miner, i.e. the extraction of linguistic patterns from restricted domain corpora. The Grid environment has produced the expected results (reduction of the processing time, huge storage capacity, data redundancy) without any additional cost for the final user.},
  KEYWORDS = {grid, acquisition, topic classification},
  PAGES = {1858--1861},
  URL = {https://publications.cnr.it/doc/84625},
  ISBN = {2-9517408-2-4},
  CONFERENCE_NAME = {LREC 2006: 5th International Conference on Language Resources and Evaluation},
  CONFERENCE_PLACE = {Genova},
  CONFERENCE_DATE = {24-26 Maggio 2006},
}

@INPROCEEDINGS{PARDELLI_2006_INPROCEEDINGS_PSGO_84640,
  AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S. and Orsolini, P.},
  TITLE = {Natural Language Processing: A Terminological and Statistical Approach},
  YEAR = {2006},
  ABSTRACT = {The aim of this article is to provide a statistical representation of significant terms used in the field of Natural Language Processing from the 1960s till nowadays, in order to draft a survey on the most significant research trends in that period. By retrieving these keywords it should be possible to highlight the ebb and flow of some thematic topics.
The NLP terminological sample derives from a database created for this purpose using the DBT software (Textual Data Base, ILC patent).},
  KEYWORDS = {Natural Language Processing, Terminology},
  PAGES = {2395--2398},
  URL = {https://publications.cnr.it/doc/84640},
  PUBLISHER = {European Language Resources Association (ELRA)-Evaluations and Language resources Distribution Agency (ELDA) (Paris, FRA)},
  ISBN = {2-9517408-2-4},
  CONFERENCE_NAME = {LREC 2006: 5th International Conference on Language Resources and Evaluation},
  CONFERENCE_PLACE = {Genoa},
  CONFERENCE_DATE = {24-26 May 2006},
}

@INPROCEEDINGS{SASSI_2006_INPROCEEDINGS_S_84662,
  AUTHOR = {Sassi, M.},
  TITLE = {Martí y Carpentier, voces de la América},
  YEAR = {2006},
  URL = {https://publications.cnr.it/doc/84662},
  CONFERENCE_NAME = {XXVIII Convegno Internazionale di Americanistica},
  CONFERENCE_PLACE = {Perugia},
  CONFERENCE_DATE = {2006},
}

@TECHREPORT{SASSI_2006_TECHREPORT_SC_157406,
  AUTHOR = {Sassi, M. and Cinini, A.},
  TITLE = {Content-analysis dei provvedimenti della sezione disciplinare del C. S. M},
  YEAR = {2006},
  URL = {https://publications.cnr.it/doc/157406},
}

@MISC{CECCOTTI_2006_MISC_CS_151558,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Dall'Archivio Elettronico delle Opere di Carlo Emilio Gadda. Studi e ricerche con strumenti computazionali dell'opera dell'ingegnere-scrittore},
  YEAR = {2006},
  URL = {https://publications.cnr.it/doc/151558},
}

@MISC{PICCHI_2006_MISC_PSCSC_151561,
  AUTHOR = {Picchi, E. and Sassi, M. and Ceccotti, M. L. and Sassolini, E. and Cucurullo, S.},
  TITLE = {Linguistic Miner},
  YEAR = {2006},
  URL = {https://publications.cnr.it/doc/151561},
}

@TECHREPORT{SASSI_2005_TECHREPORT_SC_171621,
  AUTHOR = {Sassi, M. and Cinini, A.},
  TITLE = {L'archivio della Disciplinare come deposito per la content-analysis},
  YEAR = {2005},
  URL = {https://publications.cnr.it/doc/171621},
}

@MISC{CECCOTTI_2005_MISC_CS_151530,
  AUTHOR = {Ceccotti, M. L.
and Sassi, M.},
  TITLE = {Dall'Archivio Elettronico delle Opere di Carlo Emilio Gadda. Studi e ricerche con strumenti computazionali dell'opera dell'ingegnere-scrittore},
  YEAR = {2005},
  URL = {https://publications.cnr.it/doc/151530},
}

@MISC{PARDELLI_2005_MISC_PSGO_151553,
  AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S. and Orsolini, P.},
  TITLE = {NLPterminology},
  YEAR = {2005},
  URL = {https://publications.cnr.it/doc/151553},
}

@MISC{PICCHI_2005_MISC_PSCSC_151533,
  AUTHOR = {Picchi, E. and Sassi, M. and Ceccotti, M. L. and Sassolini, E. and Cucurullo, S.},
  TITLE = {Linguistic Miner},
  YEAR = {2005},
  URL = {https://publications.cnr.it/doc/151533},
}

@MISC{SABA_2005_MISC_SSCPCG_151537,
  AUTHOR = {Saba, A. and Sassi, M. and Carpi, E. and Periñán, B. and Calí, S. and Garcia Macho, M. L.},
  TITLE = {Corpus del LéNESO (Léxico Náutico del Español del Siglo de Oro)},
  YEAR = {2005},
  URL = {https://publications.cnr.it/doc/151537},
}

@INPROCEEDINGS{PARDELLI_2004_INPROCEEDINGS_PSG_84614,
  AUTHOR = {Pardelli, G. and Sassi, M. and Goggi, S.},
  TITLE = {From Weaver to the ALPAC Report},
  YEAR = {2004},
  ABSTRACT = {This paper presents a sample pertaining to the creation and the use of words in the field of Natural Language Processing (NLP) in the years 1949-1966. These words have been statistically sorted and the results could be taken as a proof that electronic processing of linguistic data leads to the diffusion of clear and concise words for describing a complex concept which would need a circumlocution to be described instead.
The aim of this article is to provide an evolutionary overview of these new lexical forms in the various languages for the period taken into account and, whereas possible, a data register and a tabular representation have been prepared as well.},
  KEYWORDS = {Terminology, Natural Language Processing},
  PAGES = {2005--2008},
  URL = {https://publications.cnr.it/doc/84614},
  PUBLISHER = {European Language Resources Association (ELRA)-Evaluations and Language resources Distribution Agency (ELDA) (Paris, FRA)},
  ISBN = {2-9517408-1-6},
  CONFERENCE_NAME = {LREC 2004: Fourth International Conference on Language Resources and Evaluation},
  CONFERENCE_PLACE = {Lisbona},
  CONFERENCE_DATE = {26th, 27th \& 28 May 2004},
  EDITOR = {Lino, T. and Xavier, M. F. and Ferreira, F. and Costa, R. and Silvia, R.},
}

@INPROCEEDINGS{PICCHI_2004_INPROCEEDINGS_PCCSS_84615,
  AUTHOR = {Picchi, E. and Ceccotti, M. L. and Cucurullo, S. and Sassi, M. and Sassolini, E.},
  TITLE = {Linguistic Miner. An Italian Linguistic Knowledge System},
  YEAR = {2004},
  ABSTRACT = {Linguistic Miner is a project carried out at ILC whose objective is the development of an integrated system to build, organise and manage a corpus of Italian texts (of various origins and formats), and to design and constantly add new tools for the automatic extraction of tiered linguistic knowledge to be made available for many teaching, publishing, and other cultural purposes. The project is based on a notion that is preliminary to all the systems for corpus-based linguistic analysis: a language represented by the largest possible collection of heterogeneous texts is the best source of linguistic information at any level of analysis considered. The first goals of such a system are the semi-automated construction of an Italian data mine for the extraction of linguistic information, the validation of linguistic patterns, the installation of useful tools and resources for a range of different categories of Italian language users.
The main feature of the project is its purpose of building large language reference corpora allowing for the creation and use of effective tools for the handling and processing, as well as the automatic linguistic synthesis, of such corpora.},
  KEYWORDS = {linguistic analysis, information extraction},
  PAGES = {1811--1814},
  URL = {http://www.lrec-conf.org/lrec2004/},
  VOLUME = {V},
  ISBN = {2-9517408-1-6},
  CONFERENCE_NAME = {LREC 2004: Fourth International Conference on Language Resources and Evaluation},
  CONFERENCE_PLACE = {Lisbona},
  CONFERENCE_DATE = {26-27-28 Maggio 2004},
  BOOKTITLE = {Proceedings of the 4th International Conference on Language Resources and Evaluation},
}

@TECHREPORT{CECCOTTI_2004_TECHREPORT_CS_157391,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Gadda in Abruzzo. Concordanze per lemma},
  YEAR = {2004},
  URL = {https://publications.cnr.it/doc/157391},
}

@ARTICLE{CECCOTTI_2003_ARTICLE_CS_64460,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {L'archivio elettronico delle opere di Carlo Emilio Gadda. Da redattori a fruitori di un data base testuale},
  YEAR = {2003},
  PAGES = {221--250},
  URL = {https://publications.cnr.it/doc/64460},
  VOLUME = {16-17},
}

@ARTICLE{CECCOTTI_2003_ARTICLE_CS_64495,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {L'Archivio elettronico delle Opere di Carlo Emilio Gadda in DBT 2000: risultati e prospettive (The Electronic Archive of Carlo Emilio Gadda's Works: results and prospects)},
  YEAR = {2003},
  KEYWORDS = {Strumenti lessicali, Database gaddiano, Letteratura italiana, Informatica umanistica, Carlo Emilio Gadda},
  URL = {https://publications.cnr.it/doc/64495},
  VOLUME = {SupII},
}

@ARTICLE{SASSI_2003_ARTICLE_SA_64464,
  AUTHOR = {Sassi, M. and Amoroso, Y.},
  TITLE = {Letteratura, diritto e linguistica computazionale.
Panorama delle collaborazioni Italia-Cuba},
  YEAR = {2003},
  ABSTRACT = {Abstract - These notes offer an outline of the collaborations started in 1995 by the Institute of Computational Linguistics (ILC) of Pisa, with some Cuban Scientific Institutions and expanded in several research sectors in the subsequent years. As regards Automatized Lexicography, we propose here a brief description of the results and prospects of work carried out with CEM (Centro de Estudios Martianos), FAC (Fundación Alejo Carpentier) and ILL (Instituto de Literatura y Lingüística). In 1996, as a result of the cooperation of ILC with CEM with regard to methodology and textual codification, we started the creation of an electronic archive of the complete works of José Martí (27 tomes). In 1997, in collaboration with FAC, ILC put to practical use the previous experiences for the creation of an electronic archive of the complete works of Alejo Carpentier. In 2002, the collaboration between ILL and SCDI (Sociedad Cubana de Derecho e Informática de la Unión de Juristas de Cuba), resulted in the creation of the “Diccionario de Jurismática”. As far as Legal Information Science is concerned, we propose a presentation of the objectives which have been reached as well as the projects for the future in the area, relating to the study of languages and legal documents as formulated by ILC and SCDI.},
  PAGES = {901--924},
  URL = {https://publications.cnr.it/doc/64464},
  VOLUME = {18-19},
}

@INCOLLECTION{CECCOTTI_2003_INCOLLECTION_CS_157355,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Sistema},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157355},
}

@INCOLLECTION{SASSI_2003_INCOLLECTION_S_136433,
  AUTHOR = {Sassi, M.},
  TITLE = {La consultazione dei corpora costituzionali con il DBT},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/136433},
  PUBLISHER = {CNR, ITTIG (Firenze, ITA)},
}

@INPROCEEDINGS{PICCHI_2003_INPROCEEDINGS_PCCCFSST_84548,
  AUTHOR = {Picchi, E. and Ceccotti, M. L.
and Cignoni, L. and Cucurullo, N. and Fiorentini, G. and Sassi, M. and Sassolini, E. and Turrini, G.},
  TITLE = {Linguistic Miner},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/84548},
  CONFERENCE_NAME = {Congresso annuale AICA 2003: I costi dell'ignoranza e il valore della conoscenza nella società dell'informazione},
  CONFERENCE_PLACE = {Trento},
  CONFERENCE_DATE = {2003},
}
% NOTE(review): the record above lists "Cucurullo, N."; other Linguistic Miner
% records in this file list "Cucurullo, S." -- confirm which initial is right.

@TECHREPORT{AMOROSO_2003_TECHREPORT_ACFFIMMPS_157312,
  AUTHOR = {Amoroso, Y. and Cammelli, A. and Fameli, E. and Fameli, M. and Inghirami, B. and Mariani, P. and Marinai, M. and Parenti, L. and Sassi, M.},
  TITLE = {Diritto alla vita e Diritto all'ambiente nel lessico costituzionale e nella dottrina giuridica. Strumenti e metodi per l'analisi linguistico-concettuale},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157312},
}

@TECHREPORT{CAMMELLI_2003_TECHREPORT_CS_172092,
  AUTHOR = {Cammelli, A. and Sassi, M.},
  TITLE = {Strumenti e metodi per uno studio lessicale della Costituzione della Repubblica Bolivariana del Venezuela},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/172092},
}

@TECHREPORT{CECCOTTI_2003_TECHREPORT_CS_157356,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Iterazioni gaddiane},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157356},
}

@TECHREPORT{CECCOTTI_2003_TECHREPORT_CS_157357,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Annotazioni su composti in-cola. Da Dante a Gadda},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157357},
}

@TECHREPORT{CECCOTTI_2003_TECHREPORT_CS_157358,
  AUTHOR = {Ceccotti, M. L. and Sassi, M.},
  TITLE = {Concordanze del Pasticciaccio},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157358},
}

@TECHREPORT{SASSI_2003_TECHREPORT_S_157333,
  AUTHOR = {Sassi, M.},
  TITLE = {La consultazione dei corpora costituzionali con DBT},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157333},
}

@TECHREPORT{SASSI_2003_TECHREPORT_SA_157334,
  AUTHOR = {Sassi, M.
and Amoroso, Y.},
  TITLE = {Il corpus elettronico delle costituzioni iberoamericane},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157334},
}
% NOTE(review): the next record has the same year and title as the one above,
% apart from the hyphen in "ibero-americane" and the record id -- possible
% duplicate; confirm before merging.

@TECHREPORT{SASSI_2003_TECHREPORT_SA_157352,
  AUTHOR = {Sassi, M. and Amoroso, Y.},
  TITLE = {Il corpus elettronico delle costituzioni ibero-americane},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157352},
}

@TECHREPORT{SASSI_2003_TECHREPORT_SC_157351,
  AUTHOR = {Sassi, M. and Ceccotti, M. L.},
  TITLE = {Documentazione dell'attività di consulenza svolta sul database Gadda per studiosi in Italia e all'estero (2002-2003)},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157351},
}

@TECHREPORT{SASSI_2003_TECHREPORT_SC_157359,
  AUTHOR = {Sassi, M. and Ceccotti, M. L.},
  TITLE = {Concordanze della Cognizione del dolore},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157359},
}

@MISC{SASSI_2003_MISC_S_157354,
  AUTHOR = {Sassi, M.},
  TITLE = {Costituzione della Repubblica Bolivariana del Venezuela},
  YEAR = {2003},
  URL = {https://publications.cnr.it/doc/157354},
}

@BOOK{PARDELLI_2002_BOOK_POSEG_231725,
  AUTHOR = {Pardelli, G. and Orsolini, P. and Sassi, M. and Enea, A. and Gazzetti, S.},
  TITLE = {TAL Bibliography (1951-2002). Parte I},
  YEAR = {2002},
  ABSTRACT = {Il presente catalogo contiene molte delle bibliografie del Trattamento Automatico della Lingua TAL a partire dal secondo dopoguerra ad oggi e diverse bibliografie di opere di linguistica generale variamente collegate alla linguistica Computazionale CL, in varie formulazioni: glossematica, grammatica trasformazionale-generativa, fonetica, stilistica linguistica, psicolinguistica, sociolinguistica, didattica delle lingue, filosofia del linguaggio, storia della lingua, funzionalismo praghese, prosodismo inglese, ecc.. Sono compresi, inoltre, alcuni riferimenti alla documentazione di alcuni linguaggi di programmazione evoluti adatti alla elaborazione di dati linguistici.
La tipologia delle opere presenti nel seguente catalogo sinteticamente può ricondursi a: - Le prime testimonianze del trattamento automatico del linguaggio: Busa (1951); - Atti di Congressi e Conferenze di varie Associazioni Internazionali (ACL, ALLC, COLING, TAL, ACLA, AILA, ecc.), tra i principali possiamo citare quelli dei congressi di CL tenuti a Yorktown Heights (IBM-64), a Grenoble (CITAL-67), a Praga (1968) e a Bergen (COLING-78); - Opere generali o introduttive delle applicazioni del calcolatore alle ricerche umanistiche e letterarie. Un esempio è dato dalla pubblicazione di F. De Tollenaere: i lavori presentati in questo survey del 1962 si riferiscono a ricerche nel settore Humanities di vari paesi: Stati Uniti, Inghilterra, Francia, Italia, Belgio, Olanda, Unione Sovietica, Cecoslovacchia, ecc. In questo gruppo, per la lessicografia, ricordiamo i due colloqui di Praga (1967) e di Pisa (1970); - Opere sull'uso di modelli matematici nella linguistica (Garvin, Maegaard). Gli argomenti trattati in queste opere nella maggior parte dei casi fanno riferimento a: 1. Valutazione statistica: compilazione di liste, dizionari, indici e di ricerche statistiche in genere (livello distribuzionale); 2. Elaborazione algoritmica dei sistemi sintattici e di vari modelli di acquisizione del linguaggio (livello sintattico); 3. Elaborazione automatica del contenuto del linguaggio (livello semantico); 4. Traduzione automatica (per le varie lingue); 5. Lessicografia (classica, romanza, slava, germanica, biblica, concordanze, indici e studi dialettologici, ecc.). E' stato possibile individuare le tematiche principali e gli argomenti più ricorrenti della soggettazione attraverso l'analisi di circa 5000 documenti nel settore del TAL. Mantenere una terminologia comune (normalizzazione) della soggettazione non è stato sempre possibile.
L'interdisciplinarietà, sempre più praticata dopo gli anni '50, dovuta all'incontro dei metodi della linguistica con altre discipline e la terminologia dei primi anni in questo settore di indagine, ci avrebbero condotto alla dispersione dei soggetti, che sono stati rivisti in funzione di una maggiore omogeneità. I testi che studiano il linguaggio e i sistemi di automazione nelle ricerche e nelle analisi linguistiche sono stati descritti nel database principalmente in inglese e occasionalmente in francese per casi particolari (v. nell'indice dei Soggetti: Traduction Mécanique /Traduction Automatique) per rispettarne le prime testimonianze. Abbiamo preferito mantenere descrittori simili per evidenziare l'evoluzione della terminologia usata nei testi nel corso degli anni (v.: Automatic ... /Automated ...). Abbiamo evitato, per quanto possibile, l'uso di termini di eccessivo tecnicismo: ciò per rendere più agevole la ricerca attraverso Internet da parte di un'utenza non sempre specialistica. Il lavoro di soggettazione, analisi e selezione dei documenti per la costruzione di questo archivio è stato svolto da Gabriella Pardelli. Il software CDS-ISIS dell'Unesco è stato utilizzato per gli standard catalografici e, allo scopo, è stato attivato un server per l'accesso simultaneo al database. Il supporto informatico, anche per ciò che concerne il rilevamento dei dati per le indagini statistiche come il recupero automatico dei tag di sottocampo dei record bibliografici (lingua, paese, argomento, ecc.) è stato fornito interamente da Alessandro Enea. Paola Orsolini e Silvia Gazzetti si sono occupate del lavoro di catalogazione. L'elaborazione del catalogo generato dal database ISIS è stata effettuata da Manuela Sassi.},
  KEYWORDS = {Bibliografia, Linguistica Computazionale},
  PAGES = {1--187},
  URL = {http://www.biblos.cnr.it/04_2_TALb.html},
  PUBLISHER = {S. T. A. R. Servizio Tecnografico Area Ricerca CNR (Pisa, ITA)},
}

@MISC{CECCOTTI_2002_MISC_CPS_242344,
  AUTHOR = {Ceccotti, M.
L. and Pardelli, G. and Sassi, M.},
  TITLE = {Per un'analisi del lessico linguistico-computazionale: da Weaver all'ALPAC Report},
  YEAR = {2002},
  ABSTRACT = {In questo report proponiamo una breve presentazione della 'preistoria' della linguistica computazionale, del periodo compreso tra la pubblicazione del memorandum di Warren Weaver del 1949 e del Report dell'Alpac del 1966. La preistoria della CL vuol dire essenzialmente traduzione automatica, dizionari di macchina, etc., attività di ricerca svolte da matematici, fisici. L'inglese è la lingua veicolo di tutto ciò e continuerà ad esserlo ancora per anni prima che anche in Europa - eccezione è l'Inghilterra - e nel resto del mondo siano tentate traduzioni, adattamenti, proposte, che guidano i primi passi della ricerca in questi ambiti.},
  KEYWORDS = {Lessico linguistico-computazionale},
  URL = {https://publications.cnr.it/doc/242344},
}

@INPROCEEDINGS{CECCOTTI_2000_INPROCEEDINGS_CSP_228142,
  AUTHOR = {Ceccotti, M. L. and Sassi, M. and Pardelli, G.},
  TITLE = {Un laboratorio multimediale dedicato a Carlo Emilio Gadda: il modello e i primi dati implementati in formato XML},
  YEAR = {2000},
  ABSTRACT = {In this paper we present a project of the Italian National Council of Research titled "Gadda's Electronic Archive: Lexicographical and bibliographical Tools in XML".
The text is made of two sections: in the first, we present Gadda's Electronic Archive, implemented at the ILC, and in the second, we show the project's objectives and the results achieved in the first months of work.},
  KEYWORDS = {Gadda's Electronic Archive},
  PAGES = {267--271},
  URL = {https://publications.cnr.it/doc/228142},
  PUBLISHER = {Associazione Italiana per l'Informatica ed il Calcolo Automatico (AICA) (Milano, ITA)},
  CONFERENCE_NAME = {XXXVIII Congresso Annuale AICA: Le tecnologie dell'Informazione e della Comunicazione come sviluppo del Paese},
  CONFERENCE_PLACE = {Taormina},
  CONFERENCE_DATE = {27-30 Settembre 2000},
}

@INPROCEEDINGS{CECCOTTI_2000_INPROCEEDINGS_CSP_231335,
  AUTHOR = {Ceccotti, M. L. and Sassi, M. and Pardelli, G.},
  TITLE = {Il soccorso informatico per lo studio di un autore difficile, C. E. Gadda},
  YEAR = {2000},
  ABSTRACT = {Nella prima parte di questo contributo si illustreranno le caratteristiche dell'Archivio Gadda in DBT, frutto di un lungo lavoro redazionale di transcodifica e di codifica. Nella seconda sarà brevemente descritta e motivata la realizzabilità di un sito web su Gadda, un modello di 'laboratorio culturale' che costituito inizialmente da alcuni brani gaddiani, da nostre recenti pubblicazioni, da dati bibliografici, potrebbe essere arricchito dall'apporto del lettore di Gadda, studioso, studente, curioso...},
  KEYWORDS = {Gadda C. E},
  PAGES = {149--154},
  URL = {https://publications.cnr.it/doc/231335},
  PUBLISHER = {Associazione Italiana per l'Informatica ed il Calcolo Automatico (AICA) (Milano, ITA)},
  CONFERENCE_NAME = {DIDAMATICA 2000, Informatica per la Didattica},
  CONFERENCE_PLACE = {Cesena},
  CONFERENCE_DATE = {4-5-6 maggio 2000},
  BOOKTITLE = {Atti 1. Lavori Scientifici},
  EDITOR = {Andronico, A. and Casadei, G. and Sacerdoti, G.},
}

@BOOK{PARDELLI_1998_BOOK_PS_255948,
  AUTHOR = {Pardelli, G. and Sassi, M.},
  TITLE = {I. L. C.
Library},
  YEAR = {1998},
  ABSTRACT = {La stampa del catalogo dell'archivio librario dell'Istituto di Linguistica Computazionale I.L.C. è tratta dall'archivio elettronico. La registrazione del materiale bibliografico viene effettuata con il software CDS/ISIS dell'UNESCO. Tale sistema ci permette di descrivere il documento secondo gli standard internazionali. I dati bibliografici dell'archivio sono stati curati da Gabriella Pardelli, responsabile della Biblioteca dell'Istituto, le procedure informatiche relative all'indicizzazione e alla stampa complessiva del catalogo sono state effettuate da Manuela Sassi, Tecnologo dell'Istituto.},
  KEYWORDS = {Catalogo, Linguistica Computazionale},
  PAGES = {i--253},
  URL = {https://publications.cnr.it/doc/255948},
  PUBLISHER = {S. T. A. R. Servizio Tecnografico Area Ricerca CNR (Pisa, ITA)},
}

@BOOK{PARDELLI_1992_BOOK_PSM_241726,
  AUTHOR = {Pardelli, G. and Sassi, M. and Marinai, E.},
  TITLE = {L'Archivio librario dell'I. L. C},
  YEAR = {1992},
  ABSTRACT = {Questo lavoro nasce dalla richiesta di colleghi e studenti di avere a disposizione un catalogo cartaceo per il reperimento di dati bibliografici del patrimonio librario dell'Istituto di Linguistica Computazionale. La Biblioteca dell'ILC fino a questo momento, infatti, si è avvalsa soltanto del catalogo on-line, attualmente gestito da un sistema informativo basato sulla procedura automatica di spoglio elettronico, DBT (Data Base Testuale), che permette il recupero delle informazioni contenute nell'archivio librario in tempo reale. Il catalogo segue questa strutturazione: a) Introduzione b) Premessa c) Criteri di catalogazione, d) I campi del database e) Esempi di interrogazione f) Indice per argomenti g) Indice per autori.},
  KEYWORDS = {Linguistica Computazionale, Cataloghi, Sistema DBT},
  PAGES = {1--131},
  URL = {https://publications.cnr.it/doc/241726},
  PUBLISHER = {S. T. A. R. Servizio Tecnografico Area Ricerca CNR (Pisa, ITA)},
}