@INPROCEEDINGS{FRONTINI_2023_INPROCEEDINGS_F_478212, AUTHOR = {Frontini, F.}, TITLE = {Words and the Company they Keep: Digital corpora and infrastructures for the foreign language classroom}, YEAR = {2023}, ABSTRACT = {We give an overview of corpora \& language technologies and their use in foreign language teaching.}, KEYWORDS = {corpora, didattica L2, tecnologie del linguaggio}, URL = {https://publications.cnr.it/doc/478212}, CONFERENCE_NAME = {Didattica della lingua, della cultura e cittadinanza attiva: sfide educative contemporanee-Seminari LEND Modena}, CONFERENCE_DATE = {07/02/2023}, } @INCOLLECTION{DEJONG_2022_INCOLLECTION_DVFVFW_472288, AUTHOR = {De Jong, F. and Van Uytvanck, D. and Frontini, F. and Van Den Bosch, A. and Fišer, D. and Witt, A.}, TITLE = {Language Matters. The European Research Infrastructure CLARIN, Today and Tomorrow}, YEAR = {2022}, ABSTRACT = {CLARIN stands for "Common Language Resources and Technology Infrastructure". In 2012 CLARIN ERIC was established as a legal entity with the mission to create and maintain a digital infrastructure to support the sharing, use, and sustainability of language data (in written, spoken, or multimodal form) available through repositories from all over Europe, in support of research in the humanities and social sciences and beyond. Since 2016 CLARIN has had the status of Landmark research infrastructure and currently it provides easy and sustainable access to digital language data and also offers advanced tools to discover, explore, exploit, annotate, analyse, or combine such datasets, wherever they are located. This is enabled through a networked federation of centres: language data repositories, service centres, and knowledge centres with single sign-on access for all members of the academic community in all participating countries. In addition, CLARIN offers open access facilities for other interested communities of use, both inside and outside of academia. 
Tools and data from different centres are interoperable, so that data collections can be combined and tools from different sources can be chained to perform operations at different levels of complexity. The strategic agenda adopted by CLARIN and the activities undertaken are rooted in a strong commitment to the Open Science paradigm and the FAIR data principles. This also enables CLARIN to express its added value for the European Research Area and to act as a key driver of innovation and contributor to the increasing number of industry programmes running on data-driven processes and the digitalization of society at large.}, KEYWORDS = {research infrastructure, language resources, language technology, open science, service interoperability, innovation, SSH}, PAGES = {31-58}, URL = {https://www.degruyter.com/document/doi/10.1515/9783110767377-002/html}, VOLUME = {1}, DOI = {10.1515/9783110767377-002}, PUBLISHER = {Walter De Gruyter Inc (Boston/Berlin/Munich, USA)}, ISBN = {978-3-11-076737-7}, BOOKTITLE = {CLARIN: The Infrastructure for Language Resources}, EDITOR = {Fišer, D. and Witt, A.}, } @INCOLLECTION{DELFANTE_2022_INCOLLECTION_DFMQ_469112, AUTHOR = {Del Fante, D. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {Italian Language Resources. From CLARIN-IT to the VLO and Back: Sketching a Methodology for Monitoring LRs Visibility}, YEAR = {2022}, ABSTRACT = {This paper sketches a user-oriented, qualitative methodology for both (i) monitoring the existence and availability of language resources relevant for a given CLARIN national community and language and (ii) assessing the offering potential of CLARIN, in terms of Language Resources provided to national consortia. From the user perspective, the methodology has been applied to investigate the visibility of language resources available for Italian within the CLARIN central services, in particular the Virtual Language Observatory. 
As a proof-of-concept, the methodology has been tested on the resources available through the CLARIN-IT data centres, but, ideally, it could be applied by any national data centre aiming to assess the existence of LRs in CLARIN for any given languages and check their accessibility for the interested users. It is thus argued that such an assessment might be a useful instrument in the hands of national coordinators and centre managers for (i) bringing to the fore both strengths and critical issues about their data providing community and (ii) for planning targeted actions to improve and increase both visibility and accessibility of their LRs.}, KEYWORDS = {Virtual Language Observatory, CLARIN-IT, CLARIN-ERIC, Qualitative Assessment Methodology, User Involvement}, PAGES = {10-22}, URL = {https://ecp.ep.liu.se/index.php/clarin/article/view/413/371}, DOI = {10.3384/9789179294441}, ISBN = {978-91-7929-444-1}, BOOKTITLE = {Selected Papers from the CLARIN Annual Conference 2021}, EDITOR = {Monachini, Monica and Eskevich, Maria}, } @INPROCEEDINGS{AGNOLONI_2022_INPROCEEDINGS_ABFMMQRV_472294, AUTHOR = {Agnoloni, T. and Bartolini, R. and Frontini, F. and Montemagni, S. and Marchetti, C. and Quochi, V. and Ruisi, M. and Venturi, G.}, TITLE = {Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus}, YEAR = {2022}, ABSTRACT = {This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 pandemic emergency period and a former period for reference and comparison according to the CLARIN ParlaMint prescriptions. The corpus contains 1199 sessions and 79,373 speeches for a total of about 31 million words, and was encoded according to the ParlaCLARIN TEI XML format. It includes extensive metadata about the speakers, sessions, political parties and parliamentary groups. 
As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity annotation and classification is also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision. The Italian dataset is freely available as part of the larger ParlaMint 2.1 corpus deposited and archived in CLARIN repository together with all other national corpora. It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes.}, KEYWORDS = {parliamentary debates, CLARIN ParlaMint, corpus creation, corpus annotation}, PAGES = {117-124}, URL = {https://aclanthology.org/2022.parlaclarin-1.17/}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, CONFERENCE_NAME = {Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference}, CONFERENCE_PLACE = {Marseille, France}, CONFERENCE_DATE = {20/06/2022}, } @INPROCEEDINGS{DELFANTE_2022_INPROCEEDINGS_DFMQ_468964, AUTHOR = {Del Fante, D. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {CLARIN-IT: An Overview on the Italian Clarin Consortium After Six Years of Activity}, YEAR = {2022}, ABSTRACT = {This paper offers an overview of the Italian CLARIN consortium after six years since its establishment. The members, the centres and the repositories and the most important collections are described. Lastly, in order to showcase the visibility and the accessibility of Language Resources provided by CLARIN-IT from a user-perspective, we show how Italian resources are findable within CLARIN ERIC}, KEYWORDS = {Language Resources, Data Repositories and Archives, Research Infrastructures, CLARIN}, PAGES = {8}, URL = {http://ceur-ws.org/Vol-3160/short21.pdf}, PUBLISHER = {CEUR-WS. 
org (Aachen, DEU)}, ISSN = {1613-0073}, CONFERENCE_NAME = {Italian Research Conference on Digital Libraries}, CONFERENCE_PLACE = {Università degli Studi di Padova}, CONFERENCE_DATE = {24/02/2022}, BOOKTITLE = {Proceedings of the 18th Italian Research Conference on Digital Libraries}, EDITOR = {Di Nunzio, G. M. and Portelli, B. and Redavid, D. and Silvello, G.}, } @INPROCEEDINGS{GAMBA_2022_INPROCEEDINGS_GFBM_472292, AUTHOR = {Gamba, F. and Frontini, F. and Broeder, D. and Monachini, M.}, TITLE = {Language Technologies for the Creation of Multilingual Terminologies. Lessons Learned from the SSHOC Project}, YEAR = {2022}, ABSTRACT = {This paper is framed in the context of the SSHOC project and aims at exploring how Language Technologies can help in promoting and facilitating multilingualism in the Social Sciences and Humanities (SSH). Although most SSH researchers produce culturally and societally relevant work in their local languages, metadata and vocabularies used in the SSH domain to describe and index research data are currently mostly in English. We thus investigate Natural Language Processing and Machine Translation approaches in view of providing resources and tools to foster multilingual access and discovery to SSH content across different languages. As case studies, we create and deliver as freely, openly available data a set of multilingual metadata concepts and an automatically extracted multilingual Data Stewardship terminology. The two case studies allow as well to evaluate performances of state-of-the-art tools and to derive a set of recommendations as to how best apply them. Although not adapted to the specific domain, the employed tools prove to be a valid asset to translation tasks. 
Nonetheless, validation of results by domain experts proficient in the language is an unavoidable phase of the whole workflow.}, KEYWORDS = {Multilingual terminologies, data curation, language resource infrastructures}, PAGES = {154-163}, URL = {https://aclanthology.org/2022.lrec-1.17}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, CONFERENCE_NAME = {13th Conference on Language Resources and Evaluation (LREC 2022)}, CONFERENCE_PLACE = {Marseille, France}, CONFERENCE_DATE = {22/06/2022-24/06/2022}, } @INPROCEEDINGS{HIRSCH_2022_INPROCEEDINGS_HFDD_469567, AUTHOR = {Hirsch, F. and Frontini, F. and Didirková, I. and Drengubiak, J.}, TITLE = {Esthétique de la voix dans les livres audio en langue française}, YEAR = {2022}, ABSTRACT = {Aesthetics of voice in French-language audio books. This research aims at studying listeners' preferences in audiobooks' voices. Samples of 8 male and 7 female voices were extracted from different audiobooks and analyzed. A survey has been carried out to obtain 69 listeners' points of view by answering questions on vocal features. Results show that the participants' choices depend on the literary genre. Indeed, male voices are preferred for science-fiction novels and female voices for juvenile literature and contemporary novels. Nevertheless, other literary genres that were tested do not match with a specific voice. On the other hand, essays are expected to be read with a slower speech rate, whereas listeners prefer faster speech rates in erotic novels.}, KEYWORDS = {audiobooks, voice esthetics, speech}, URL = {https://doi.org/10.1051/shsconf/202213808004}, DOI = {10.1051/shsconf/202213808004}, CONFERENCE_NAME = {8e Congrès Mondial de Linguistique Française}, CONFERENCE_PLACE = {Université d'Orléans, France}, CONFERENCE_DATE = {04-08/07/2022}, BOOKTITLE = {138}, } @TECHREPORT{MARTELLI_2022_TECHREPORT_MMCNVUFQKKLDTTCSKIDGM_472421, AUTHOR = {Martelli, F. and Maru, M. and Campagnano, C. and Navigli, R. and Velardi, P. 
and Ureña Ruiz, R. and Frontini, F. and Quochi, V. and Kallas, J. and Koppel, K. and Langemets, M. and De Does, J. and Tempelaars, R. and Tiberius, C. and Costa, R. and Salgado, A. and Krek, S. and Čibej, J. and Dobrovoljc, K. and Gantar, P. and Munda, T.}, TITLE = {D3.8 Lexical-semantic analytics for NLP}, YEAR = {2022}, ABSTRACT = {The present document illustrates the work carried out in task 3.3 (work package 3) focused on lexical-semantic analytics for Natural Language Processing (NLP). This task aims at computing analytics for lexical-semantic information such as words, senses and domains in the available resources, investigating their role in NLP applications. Specifically, this task concentrates on three research directions, namely i) sense clustering, in which grouping senses based on their semantic similarity improves the performance of NLP tasks such as Word Sense Disambiguation (WSD), ii) domain labeling of text, in which the lexicographic resources made available by the ELEXIS project for research purposes allow better performances to be achieved, and finally iii) analysing the diachronic distribution of senses, for which a software package is made available. In this deliverable, we illustrate the research activities aimed at achieving the aforementioned goals and put forward suggestions for future works. Importantly, we stress the crucial role played by high-quality lexical-semantic resources when investigating such linguistic aspects and their impact on NLP applications. 
To this end, as an additional contribution, we address the paucity of manually annotated data in the lexical-semantic research field and introduce the ELEXIS parallel sense-annotated dataset, a novel entirely manually annotated dataset available in 10 European languages and featuring 5 annotation layers.}, KEYWORDS = {research infrastructures, lexicography, lexical resources, word-sense disambiguation, WSD, sense-annotated language data, multilinguality}, PAGES = {67}, URL = {https://elex.is/wp-content/uploads/ELEXIS_D3_8_Lexical-Semantic_Analytics_for_NLP_final_report.pdf}, } @TECHREPORT{TASOVAC_2022_TECHREPORT_TTBBBCUFHHJKKKKMMMMMQRRSSVWWZ_463877, AUTHOR = {Tasovac, T. and Tiberius, C. and Bamberg, C. and Bellandi, A. and Burch, T. and Costa, R. and Ďurčo, M. and Frontini, F. and Hennemann, J. and Heylen, K. and Jakubíček, M. and Khan, F. and Klee, A. and Kosem, I. and Kovář, V. and Matuška, O. and McCrae, J. and Monachini, M. and Mörth, K. and Munda, T. and Quochi, V. and Repar, A. and Roche, C. and Salgado, A. and Sievers, H. and Váradi, T. and Weyand, S. and Woldrich, A. and Zhanial, S.}, TITLE = {D5.3 Overview of Online Tutorials and Instruction Manuals}, YEAR = {2022}, ABSTRACT = {The ELEXIS Curriculum is an integrated set of training materials which contextualizes ELEXIS tools and services inside a broader, systematic pedagogic narrative. This means that the goal of the ELEXIS Curriculum is not simply to inform users about the functionalities of particular tools and services developed within the project, but to show how such tools and services are a) embedded in both lexicographic theory and practice; and b) representative of and contributing to the development of digital skills among lexicographers. The scope and rationale of the curriculum are described in more detail in the Deliverable D5.2 Guidelines for Producing ELEXIS Tutorials and Instruction Manuals. 
The goal of this deliverable, as stated in the project DOW, is to provide "a clear, structured overview of tutorials and instruction manuals developed within the project."}, KEYWORDS = {ELEXIS, lexicography, training materials}, PAGES = {31}, URL = {https://elex.is/wp-content/uploads/ELEXIS_D5_3_Overview-of-Online-Tutorials-and-Instruction-Manuals.pdf}, } @MISC{FRONTINI_2022_MISC_FBQMMZUW_463506, AUTHOR = {Frontini, F. and Bellandi, A. and Quochi, V. and Monachini, M. and Mörth, K. and Zhanial, S. and Ďurčo, M. and Woldrich, A.}, TITLE = {CLARIN Tools and Resources for Lexicographic Work}, YEAR = {2022}, ABSTRACT = {This course introduces lexicographers to the CLARIN Research Infrastructure and highlights language resources and tools useful for lexicographic practices. The course consists of two parts. In Part 1, you will learn about CLARIN, its technical and knowledge infrastructure, and about how to deposit and find lexical resources in CLARIN. In Part 2, you will become acquainted with CLARIN tools that can be used to create lexical resources.}, KEYWORDS = {CLARIN, lexicography}, URL = {https://elexis.humanistika.org/id/UnwYPq70Dewbn7XDEjsMM}, } @MISC{MARTELLI_2022_MISC_MNKKGKNPOLKKDUSLVGLQMFTTCSIM_472295, AUTHOR = {Martelli, F. and Navigli, R. and Krek, S. and Kallas, J. and Gantar, P. and Koeva, S. and Nimb, S. and Pedersen, B. S. and Olsen, S. and Langemets, M. and Koppel, K. and Üksik, T. and Dobrovoljc, K. and Ureña Ruiz, R. and Sancho Sánchez, J. and Lipp, V. and Váradi, T. and Győrffy, A. and László, S. and Quochi, V. and Monachini, M. and Frontini, F. and Tiberius, C. and Tempelaars, R. and Costa, R. and Salgado, A. and Čibej, J. and Munda, T.}, TITLE = {Parallel sense-annotated corpus ELEXIS-WSD 1. 0}, YEAR = {2022}, ABSTRACT = {ELEXIS-WSD is a parallel sense-annotated corpus in which content words (nouns, adjectives, verbs, and adverbs) have been assigned senses. 
Version 1.0 contains sentences for 10 languages: Bulgarian, Danish, English, Spanish, Estonian, Hungarian, Italian, Dutch, Portuguese, and Slovene. The corpus was compiled by automatically extracting a set of sentences from WikiMatrix (Schwenk et al., 2019), a large open-access collection of parallel sentences derived from Wikipedia, using an automatic approach based on multilingual sentence embeddings. The sentences were manually validated according to specific formal, lexical and semantic criteria (e.g. by removing incorrect punctuation, morphological errors, notes in square brackets and etymological information typically provided in Wikipedia pages). To obtain a satisfying semantic coverage, sentences with less than 5 words and less than 2 polysemous words were filtered out. Subsequently, in order to obtain datasets in the other nine target languages, for each selected sentence in English, the corresponding WikiMatrix translation into each of the other languages was retrieved. If no translation was available, the English sentence was translated manually. The resulting corpus is comprised of 2,024 sentences for each language.}, KEYWORDS = {Word Sense Disambiguation, corpus parallelo, disambiguazione automatica del senso, annotazione semantica multilingue}, URL = {http://hdl.handle.net/11356/1674}, } @ARTICLE{PANCKHURST_2021_ARTICLE_PF_455049, AUTHOR = {Panckhurst, R. and Frontini, F.}, TITLE = {An Internationally Fair Mediated Digital Discourse Corpus: Improving Knowledge on Reuse}, YEAR = {2021}, ABSTRACT = {In this paper, the authors present a French Mediated Digital Discourse corpus, (88milSMS http://88milsms.huma-num.fr https://hdl.handle.net/11403/comere/ cmr-88milsms). Efforts were undertaken over the years to ensure its publication according to the best practices and standards of the community, thus guaranteeing compliance with FAIR principles and CLARIN recommendations with pertinent scientific and pedagogical reuse. 
Since knowledge on how resources are reused is sometimes difficult to obtain, ways of improving this are also envisaged.}, KEYWORDS = {Reuse, FAIR, SMS, corpus}, PAGES = {185-193}, URL = {https://ecp.ep.liu.se/index.php/clarin/article/view/20}, VOLUME = {180}, DOI = {10.3384/ecp18020}, PUBLISHER = {Linköping University Electronic Press (Linköping, Svezia)}, ISSN = {1650-3740}, JOURNAL = {Linköping electronic conference proceedings (Online)}, } @EDITORIAL{BRANDO_2021_EDITORIAL_BFMRM_453809, AUTHOR = {Brando, C. and Frontini, F. and Moreau, D. and Roche, M. and Masson, É.}, TITLE = {Humanités numériques spatialisées}, YEAR = {2021}, ABSTRACT = {This special issue provides an introduction to the contributions presented in this thematic issue dedicated to the spatial humanities. Three main themes are addressed: (1) the processing of spatial information in textual corpora resulting from work in the human and social sciences, mainly in literary studies; (2) problems of acquisition, spatialisation and dissemination of geographical data of the past and from cultural heritage, thus, here, more connected with research in history; (3) spatial information and its processing and uses in archaeology. For each of these topics, we present the founding initiatives with historiographical elements, a brief status quaestionis and a synthesis of the contributions.}, KEYWORDS = {spatial digital humanities, archaeology, history, history of the digital humanities, geographic information system, cartography, spatial analysis, textual analysis}, URL = {https://journals.openedition.org/revuehn/689}, VOLUME = {3}, } @EDITORIAL{BRANDO_2021_EDITORIAL_BFMRM_453821, AUTHOR = {Brando, C. and Frontini, F. and Moreau, D. and Roche, M. and Masson, É.}, TITLE = {Introduction. 
Humanités numériques et analyses spatiales: enjeux et perspectives}, YEAR = {2021}, KEYWORDS = {spatial digital humanities, archaeology, history, history of the digital humanities, geographic information system, cartography, spatial analysis, textual analysis}, URL = {https://journals.openedition.org/revuehn/2038}, VOLUME = {3}, PUBLISHER = {Humanistica (Bruxelles, Belgio)}, ISSN = {2736-2337}, BOOKTITLE = {Humanités numériques (Online)}, } @INPROCEEDINGS{MARTELLI_2021_INPROCEEDINGS_MNKTKGKNPOLKKDUSLVGLQMFTCSIM_461705, AUTHOR = {Martelli, F. and Navigli, R. and Krek, S. and Tiberius, C. and Kallas, J. and Gantar, P. and Koeva, S. and Nimb, S. and Pedersen, B. S. and Olsen, S. and Langements, M. and Koppel, K. and Üksik, T. and Dobrovolijc, K. and Ureña Ruiz, R. and Sanchosánchez, J. and Lipp, V. and Varadi, T. and Györffy, A. and László, S. and Quochi, V. and Monachini, M. and Frontini, F. and Tempelaars, R. and Costa, R. and Salgado, A. and Čibej, J. and Munda, T.}, TITLE = {Designing the ELEXIS Parallel Sense-Annotated Dataset in 10 European Languages}, YEAR = {2021}, ABSTRACT = {Over the course of the last few years, lexicography has witnessed the burgeoning of increasingly reliable automatic approaches supporting the creation of lexicographic resources such as dictionaries, lexical knowledge bases and annotated datasets. In fact, recent achievements in the field of Natural Language Processing and particularly in Word Sense Disambiguation have widely demonstrated their effectiveness not only for the creation of lexicographic resources, but also for enabling a deeper analysis of lexical-semantic data both within and across languages. Nevertheless, we argue that the potential derived from the connections between the two fields is far from exhausted. In this work, we address a serious limitation affecting both lexicography and Word Sense Disambiguation, i.e. 
the lack of high-quality sense-annotated data and describe our efforts aimed at constructing a novel entirely manually annotated parallel dataset in 10 European languages. For the purposes of the present paper, we concentrate on the annotation of morpho-syntactic features. Finally, unlike many of the currently available sense-annotated datasets, we will annotate semantically by using senses derived from high-quality lexicographic repositories.}, KEYWORDS = {Digital lexicography, Natural Language Processing, Computational Linguistics, Corpus Linguistics, Word Sense Disambiguation}, PAGES = {377-396}, URL = {https://static-curis.ku.dk/portal/files/279888836/eLex_2021_22_pp377_395.pdf}, CONFERENCE_NAME = {eLex 2021}, CONFERENCE_DATE = {05/07/2021-07/07/2021}, BOOKTITLE = {Proceedings of the eLex 2021 conference}, } @INPROCEEDINGS{ESKEVICH_2021_INPROCEEDINGS_EF_455136, AUTHOR = {Eskevich, M. and Frontini, F.}, TITLE = {SSHOC'ing drama in the cloud}, YEAR = {2021}, ABSTRACT = {At LIBER 2021 Online Conference, CLARIN and SSHOC presented a webinar showcasing how SSH researchers can benefit from the resources and services offered by SSH research infrastructures in order to produce and exploit highly encoded historical textual data. After the webinar, the participants were able to successfully guide and advise SSH researchers (with a particular focus on literature studies) in their choice amongst existing resources and tools, based on their research question.}, KEYWORDS = {CLARIN, infrastrutture, scienze umane e sociali}, URL = {https://zenodo.org/record/5082522#.YOgETBMzb0s}, CONFERENCE_NAME = {LIBER annual conference}, CONFERENCE_PLACE = {virtual event}, CONFERENCE_DATE = {08/07/2021}, } @INPROCEEDINGS{FRONTINI_2021_INPROCEEDINGS_FK_443609, AUTHOR = {Frontini, F. and Khan, A. 
F.}, TITLE = {Di cosa parliamo quando parliamo di FAIR?}, YEAR = {2021}, ABSTRACT = {Nel 2016 un consorzio di scienziati afferenti a diverse istituzioni e discipline enuncia i principi FAIR; in questi quattro anni l'importanza e la portata del programma FAIR è divenuta sempre più evidente. L'adesione a tali principi nelle discipline umanistiche sembra farsi largo, ma non senza difficoltà e interrogativi. Questo lavoro propone una riflessione sulle implicazioni della proposta FAIR per la gestione dei dati scientifici, confrontandola con la sua effettiva ricezione nella comunità delle DH in Italia e in Europa.}, KEYWORDS = {Principi FAIR, Open Data, dati della ricerca, politiche della ricerca, EOSC}, PAGES = {19-24}, URL = {https://aiucd2021.labcd.unipi.it/en/book-of-abstracts-conference/}, ISBN = {9788894253559}, CONFERENCE_NAME = {AIUCD 2021-DH per la società: e-guaglianza, partecipazione, diritti e valori nell'era digitale}, CONFERENCE_DATE = {19-22/01/2021}, BOOKTITLE = {AIUCD 2021-DH per la società: e-guaglianza, par-tecipazione, diritti e valori nell'era digitale. Raccolta degli abstract estesi della 10a conferenza nazionale, Pisa, 2021}, EDITOR = {Del Grosso, A. M. and Boschetti, F. and Salvatori, E.}, } @TECHREPORT{FRONTINI_2021_TECHREPORT_FGM_463461, AUTHOR = {Frontini, F. and Gamba, F. and Monachini, M.}, TITLE = {D3. 9 Report on Ontology and Vocabulary Collection and Publication}, YEAR = {2021}, ABSTRACT = {This deliverable pertains to SSHOC Task 3.1 which was responsible for investigating and providing resources and tools to support the multilingual aspects of the future pan-EU SSH infrastructure. Making data and services accessible and usable in SSH is very much also a matter of providing relevant translations, translation of metadata concepts, multilingual vocabularies, terminology extraction across languages, multilingual databases. 
The deliverable offers a detailed report on the gathering and translation of relevant SSH metadata, ontologies and vocabularies for the use-cases indicated in the task's topics: multilingual metadata concepts and vocabularies, the multilingual occupation ontology, with cross-country female occupational titles. In accordance with SSHOC and the EOSC FAIR recommendations and requirements, the metadata vocabularies and ontologies have been published via several different formats and facilities. Section 1. The introduction sets the landscape and describes the need of multilingual vocabularies both for classification and discovery in the context of a cloud-based infrastructure that will offer access to research data and related services adapted to the needs of the SSH community. Section 2. "Multilingual metadata" investigates the possibility to use and test Natural Language Processing (NLP) approaches and Machine Translation (MT) to make the metadata more accessible using national languages other than English. A selected case study was the recommended metadata set of the CLARIN Concept Registry (CCR): the whole set of metadata and definitions were translated into French, Greek, and Italian. The section describes the machine-translation and evaluation process, also comparing different technologies. Section 3. "Multilingual vocabularies and ontologies" introduces two other typical case-studies. The first one addresses one of the pressing needs in social sciences research. Many surveys, indeed, ask respondents to specify their occupation and the occupational ontology is used for the survey questions. For many languages the occupational titles for males and females are not identical. In section 3.1 the enrichment of the occupational ontology with lists for male and female titles, is described for many languages, namely for Dutch, German, Slovenian and French. 
The second case study focuses on the automatic extraction of terminology from texts: a list of domain- specific terms was automatically extracted from a corpus of Data Curation and Stewardship, validated by domain experts, automatically translated into multiple languages (Dutch, French, German, Greek, Italian, Slovenian) and linked to other existing terminologies. Section 4. describes the SKOS-ification and publication process of the results, together with the challenges posed by multilinguality. Section 5. offers an overview of the exploitation and sustainability of the results and how these are made available to the community. Finally the Conclusions provide some reflections on Machine Translation approaches adopted for translating the vocabularies into multiple languages, the advantages in terms of time saving and some first recommendations to the community.}, KEYWORDS = {Terminologies, Infrastructures, Social Sciences and Humanities, Data Curation, Data Stewardship, vocabularies, Translations, Metadata}, URL = {https://doi.org/10.5281/zenodo.5913485}, } @MISC{ALRAHABI_2021_MISC_ABFPJBKG_453820, AUTHOR = {Alrahabi, M. and Brando, C. and Frontini, F. and Provenier, A. and Jalabert, R. and Bordry, M. and Koskas, C. and Gawley, J.}, TITLE = {Guide d'annotation manuelle d'entités nommées dans des corpus littéraires}, YEAR = {2021}, ABSTRACT = {Guide d'annotation manuelle d'entités nommées dans des corpus littéraires Campagne d'annotation OBVIL 2019-2021}, KEYWORDS = {NER}, URL = {https://hal.archives-ouvertes.fr/hal-03156278}, } @MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMDLR_463861, AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubešić, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Pančur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and De Macedo, L. D. and Van Heusden, R. 
and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. and Sebők, M. and Ring, O. and Darģis, R. and Utka, A. and Petkevičius, M. and Briedienė, M. and Krilavičius, T. and Morkevičius, V. and Diwersy, S. and Luxardo, G. and Rayson, P.}, TITLE = {Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint. ana 2. 1}, YEAR = {2021}, ABSTRACT = {ParlaMint 2.1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (from November 1st 2019), or being "reference" (before that date). The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e.g. chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e.g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https://clarin-eric.github.io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the linguistically marked-up version of the corpus, while the text version is available at http://hdl.handle.net/11356/1432. 
The ParlaMint.ana linguistic annotation includes tokenization, sentence segmentation, lemmatisation, Universal Dependencies part-of-speech, morphological features, and syntactic dependencies, and the 4-class CoNLL-2003 named entities. Some corpora also have further linguistic annotations, such as PoS tagging or named entities according to language-specific schemes, with their corpus TEI headers giving further details on the annotation vocabularies and tools.}, KEYWORDS = {dibattiti parlamentari, covid-19, ParlaCLARIN, parlamenti, discorso politico, CLARIN, linguistic annotation, pos-tagging, ner, linguistic dependency annotation, UD}, URL = {http://hdl.handle.net/11356/1432}, } @MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMDLR_463865, AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubešić, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Pančur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and De Macedo, L. D. and Van Heusden, R. and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. and Sebők, M. and Ring, O. and Darģis, R. and Utka, A. and Petkevičius, M. and Briedienė, M. and Krilavičius, T. and Morkevičius, V. and Diwersy, S. and Luxardo, G. and Rayson, P.}, TITLE = {Multilingual comparable corpora of parliamentary debates ParlaMint 2. 1}, YEAR = {2021}, ABSTRACT = {ParlaMint 2.1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (after November 1st 2019), or being "reference" (before that date). 
The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e.g. chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e.g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https://clarin-eric.github.io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the ParlaMint TEI-encoded corpora with the derived plain text version of the corpus along with TSV metadata on the speeches. Also included is the 2.0 release of the data and scripts available at the GitHub repository of the ParlaMint project. Note that there also exists the linguistically marked-up version of the corpus, which is available at http://hdl.handle.net/11356/1431.}, KEYWORDS = {dibattiti parlamentari, covid-19, discorso politico, CLARIN, parlamenti, ParlaCLARIN}, URL = {http://hdl.handle.net/11356/1432}, } @MISC{FRONTINI_2021_MISC_FGMB_463503, AUTHOR = {Frontini, F. and Gamba, F. and Monachini, M. and Broeder, D.}, TITLE = {SSHOC Multilingual Data Stewardship Terminology}, YEAR = {2021}, ABSTRACT = {The SSHOC Multilingual Data Stewardship Terminology is a multilingual terminology that collects terms specific to the domain of Data Stewardship, as well as their definitions. 
A list of domain-specific terms was automatically extracted from a corpus pertaining to the domain of Data Stewardship and Curation, validated by domain experts, assigned a definition, and linked to other existing terminologies (Loterre Open Science Thesaurus, terms4FAIRskills, Linked Open Vocabularies, ISO terms and definitions). Each term-definition pair was then automatically translated into multiple languages (Dutch, French, German, Greek, Italian, Slovenian) by employing Deep-L. The Multilingual Data Stewardship Terminology thus consists of 210 concepts available in Dutch, French, German, Greek, Italian, Slovenian. This resource was created within the frame of the SSHOC (Social Sciences and Humanities Open Cloud) project (H2020-INFRAEOSC-2018-2-823782). It is the result of the work of Task 3.1.2 "extraction of terminology from technical documentation about standards and interoperability", as described in D3.9, carried out jointly by ILC-CNR and CLARIN ERIC.}, KEYWORDS = {terminology, data stewardship}, URL = {http://hdl.handle.net/20.500.11752/ILC-567}, } @MISC{FRONTINI_2021_MISC_FGMB_463504, AUTHOR = {Frontini, F. and Gamba, F. and Monachini, M. and Broeder, D.}, TITLE = {SSHOC Multilingual Metadata}, YEAR = {2021}, ABSTRACT = {SSHOC Multilingual Metadata is based on the metadata set of the CLARIN Concept Registry (CCR). The CCR 232 approved metadata concepts, as well as their definitions, were automatically translated into several languages (Dutch, French, Greek, Italian) thanks to the support of Machine Translation tools, and eventually validated by native speakers who were also expert of the domain. This resource was created within the frame of the SSHOC (Social Sciences and Humanities Open Cloud) project (H2020-INFRAEOSC-2018-2-823782). 
It is the result of the work of Task 3.1.3 "creating Multilingual metadata and taxonomies for discovery", as described in D3.9, carried out jointly by ILC-CNR and CLARIN ERIC.}, KEYWORDS = {metadata, terminology}, URL = {http://hdl.handle.net/20.500.11752/ILC-568}, } @INPROCEEDINGS{FRONTINI_2020_INPROCEEDINGS_F_437563, AUTHOR = {Frontini, F.}, TITLE = {Dans les coulisses des infrastructures européennes en SHS. Rôle et opportunités pour les acteurs de la recherche (ingénieurs et chercheurs)}, YEAR = {2020}, ABSTRACT = {La composante technologique prend une dimension de jour en jour plus importante en LLASHS. Les projets de recherche sont de plus en plus nombreux à mobiliser de gros volumes de données exigeant des services adaptés garants de formes de méthodologies augmentées (exploitation, interopérabilité, accessibilité, archivage). Afin de partager les savoirs et de garantir l'interopérabilité et la préservation à long terme de ces ressources et services, de grandes infrastructures informatiques se mettent en place aux niveaux national et international. Dans cette présentation, vous allez découvrir le panorama, en la matière, des e-infrastructures et des grands projets européens à caractère infrastructurel, avec un accent particulier sur les technologies utilisées, les principaux services offerts, et les aspects les plus intéressants en termes de synergie entre approches et disciplines différentes. La présentation portera sur des ERICs (European Research Infrastructure Consortium) établis, comme CLARIN et DARIAH, et sur des projets récents ou en cours de développement, comme PARTHENOS, SSHOC, ELEXIS et TRIPLE. Concernant les aspects techniques, on abordera les questions liées au dépôt, au stockage, à l'identification (sigle sign on), aux formats et choix des métadonnées et de modélisation formelle, à la recherche fédérée des sources. 
Nous soulignerons en particulier l'interaction de ces projets avec les infrastructures nationales, notamment Huma-Num, ainsi qu'avec la récemment constituée European Open Science Cloud (EOSC). La présentation aura une visée pratique, avec l'objectif de fournir des indications concrètes aux acteurs de la recherche (chercheurs, ingénieurs...) qui souhaitent participer à ces initiatives et aux groupes de travail qui les animent, ou plus largement favoriser l'accès des chercheurs français aux nombreux services et opportunités offerts.}, KEYWORDS = {Infrastrutture di ricerca, Scienze umane e sociali}, URL = {https://ja-mate2020.sciencesconf.org/data/pages/Resume_Frontini_Nov.pdf}, CONFERENCE_NAME = {Journées annuelles du réseau Mate-shs (JA2020)}, CONFERENCE_PLACE = {Montpellier}, CONFERENCE_DATE = {10/11/2020}, } @INPROCEEDINGS{KHAN_2018_INPROCEEDINGS_KBFM_387178, AUTHOR = {Khan, F. and Bellandi, A. and Frontini, F. and Monachini, M.}, TITLE = {One Language to rule them all: modelling Morphological Patterns in a Large Scale Italian Lexicon with SWRL}, YEAR = {2018}, ABSTRACT = {We present an application of Semantic Web Technologies to computational lexicography. More precisely we describe the publication of the morphological layer of the Italian Parole Simple Clips lexicon (PSC-M) as linked open data. The novelty of our work is in the use of the Semantic Web Rule Language (SWRL) to encode morphological patterns, thereby allowing the automatic derivation of the inflectional variants of the entries in the lexicon. 
By doing so we make these patterns available in a form that is human readable and that therefore gives a comprehensive morphological description of a large number of Italian words.}, KEYWORDS = {Morphology, Linked Open Data, Italian Lexicon, SWRL, SQWRL}, PAGES = {4385-4389}, URL = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/844.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-00-9}, CONFERENCE_NAME = {Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, CONFERENCE_PLACE = {Miyazaki, Japan}, CONFERENCE_DATE = {7-12/05/2018}, BOOKTITLE = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, EDITOR = {Calzolari, N.}, } @INCOLLECTION{MANZELLA_2017_INCOLLECTION_MBBDDFMMMNS_368363, AUTHOR = {Manzella, G. M. R. and Bartolini, R. and Bustaffa, F. and D'Angelo, P. and De Mattei, M. and Frontini, F. and Maltese, M. and Medone, D. and Monachini, M. and Novellino, A. and Spada, A.}, TITLE = {Semantic Search Engine for Data Management and Sustainable Development: Marine Planning Service Platform}, YEAR = {2017}, ABSTRACT = {This chapter presents a computer platform supporting a Marine Information and Knowledge System based on a repository that gathers, classifies and structures marine scientific literature and data, guaranteeing their accessibility by means of standard protocols. This requires the access to quality controlled data and to information that is provided in grey literature and/or in relevant scientific literature. There exist efforts to develop search engines to find author's contributions to scientific literature or publications. This implies the use of persistent identifiers. However very few efforts are dedicated to link publications to data that was used, or cited in them or that can be of importance for the published studies. 
Full-text technologies are often unsuccessful since they assume the presence of specific keywords in the text; to fix this problem, it is suggested to use different semantic technologies for retrieving the text and data and thus getting much more complying results.}, KEYWORDS = {Marine Information and Knowledge System}, PAGES = {127-154}, URL = {http://www.igi-global.com/chapter/semantic-search-engine-for-data-management-and-sustainable-development/166839#}, VOLUME = {7}, DOI = {10.4018/978-1-5225-0700-0.ch006}, PUBLISHER = {IGI Global (Hershey, USA)}, BOOKTITLE = {Oceanographic and Marine Cross-Domain Data Management for Sustainable Development}, EDITOR = {Diviacco, P. and Leadbetter, A. and Glaves, H.}, } @ARTICLE{FRONTINI_2016_ARTICLE_FBRJJ_357604, AUTHOR = {Frontini, F. and Brando, C. and Riguet, M. and Jacquot, C. and Jolivet, V.}, TITLE = {Annotation of Toponyms in TEI Digital Literary Editions and Linking to the Web of Data}, YEAR = {2016}, ABSTRACT = {This paper aims to discuss the challenges and benefits of the annotation of place names in literary texts and literary criticism. We shall first highlight the problems of encoding spatial information in digital editions using the TEI format by means of two manual annotation experiments and the discussion of various cases. This will lead to the question of how to use existing semantic web resources to complement and enrich toponym mark-up, in particular to provide mentions with precise geo-referencing. 
Finally the automatic annotation of a large corpus will show the potential of visualizing places from texts, by illustrating an analysis of the evolution of literary life from the spatial and geographical point of view.}, KEYWORDS = {digital literary studies toponyms semantic web geographic databases maps and visualizations}, PAGES = {49-75}, URL = {http://dx.doi.org/10.14195/2182-8830_4-2_3}, VOLUME = {4}, DOI = {10.14195/2182-8830_4-2_3}, ISSN = {2182-8830}, JOURNAL = {MATLIT: Materialidades da Literatura}, } @ARTICLE{FRONTINI_2016_ARTICLE_FCG_357602, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J. G.}, TITLE = {REDEN: Named Entity Linking in Digital Literary Editions Using Linked Data Sets}, YEAR = {2016}, ABSTRACT = {This paper proposes a graph-based Named Entity Linking (NEL) algorithm named REDEN for the disambiguation of authors' names in French literary criticism texts and scientific essays from the 19th and early 20th centuries. The algorithm is described and evaluated according to the two phases of NEL as reported in current state of the art, namely, candidate retrieval and candidate selection. REDEN leverages knowledge from different Linked Data sources in order to select candidates for each author mention, subsequently crawls data from other Linked Data sets using equivalence links (e.g., owl:sameAs), and, finally, fuses graphs of homologous individuals into a non-redundant graph well-suited for graph centrality calculation; the resulting graph is used for choosing the best referent. The REDEN algorithm is distributed in open-source and follows current standards in digital editions (TEI) and semantic Web (RDF). Its integration into an editorial workflow of digital editions in Digital humanities and cultural heritage projects is entirely plausible. 
Experiments are conducted along with the corresponding error analysis in order to test our approach and to help us to study the weaknesses and strengths of our algorithm, thereby to further improvements of REDEN.}, KEYWORDS = {Named Entity Linking, graph centrality, linked data, data fusion, digital humanities}, PAGES = {60-80}, URL = {https://csimq-journals.rtu.lv/article/view/csimq.2016-7.04}, VOLUME = {7}, DOI = {10.7250/csimq.2016-7.04}, ISSN = {2255-9922}, JOURNAL = {Complex Systems Informatics and Modeling Quarterly}, } @ARTICLE{GOGGI_2016_ARTICLE_GPBFMMDB_359144, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2016}, ABSTRACT = {Here we present the final results of the MAPS (Marine Planning and Service Platform) project, an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. The system takes as input non-textual data (measurements) and text - both published papers and documentation - and it provides an advanced search facility thanks to the rich set of metadata and, above all, to the possibility of a refined and domain targeted key-word indexing of texts using Natural Language Processing (NLP) techniques. The paper describes the system in its details providing also evidence of evaluation.}, KEYWORDS = {Information Extraction, Search Engine, Operative Oceanography}, PAGES = {155-161}, URL = {http://www.greynet.org/thegreyjournal/currentissue.html}, VOLUME = {12}, PUBLISHER = {TextRelease (Amsterdam, Paesi Bassi)}, ISSN = {1574-1796}, JOURNAL = {The Grey journal (Print)}, } @ARTICLE{MONACHINI_2016_ARTICLE_MF_373630, AUTHOR = {Monachini, M. 
and Frontini, F.}, TITLE = {CLARIN, l'infrastruttura europea delle risorse linguistiche per le scienze umane e sociali e il suo network italiano CLARIN-IT}, YEAR = {2016}, ABSTRACT = {ll 1°ottobre 2015 il MIUR firma l'adesione dell'Italia a CLARIN-ERIC, l'infrastruttura di ricerca che offre risorse e tecnologie linguistiche dedicate al settore delle scienze del linguaggio e delle scienze umane e sociali. Questo articolo intende fornire alla comunità italiana una ampia panoramica di CLARIN, la sua missione, i suoi pilastri, i servizi, la sua organizzazione tecnica ed amministrativa e la struttura di governance, sia a livello europeo che locale. Viene introdotto il network italiano, con il primo centro nazionale ILC4CLARIN, ospitato ed in via di sviluppo presso l'ILC-CNR, le funzionalità, le risorse ed i servizi offerti; viene presentato infine il primo nucleo del consorzio nazionale CLARIN-IT, illustrando i criteri di costituzione, le attività previste e le prospettive future.}, KEYWORDS = {Infrastrutture di ricerca, Tecnologie linguistiche, Network italiano CLARIN-IT}, PAGES = {1-30}, URL = {http://www.ai-lc.it/IJCoL/v2n2/1-monachini_and_frontini.pdf}, VOLUME = {Vol. 2}, PUBLISHER = {aAccademia University Press, Torino (Italia)}, ISSN = {2499-4553}, JOURNAL = {Italian Journal of Computational Linguistics}, } @INCOLLECTION{FRONTINI_2016_INCOLLECTION_FDM_357638, AUTHOR = {Frontini, F. and Del Gratta, R. and Monachini, M.}, TITLE = {GeoDomainWordNet: Linking the Geonames Ontology to WordNet}, YEAR = {2016}, ABSTRACT = {This paper illustrates the transformation of GeoNames' ontology concepts, with their English labels and glosses, into a GeoDomain WordNet-like resource in English, its translation into Italian, and its linking to the existing generic WordNets of both languages. 
The paper describes the criteria used for the linking of domain synsets to each other and to the generic ones and presents the published resource in RDF according to the w3c and lemon schema.}, KEYWORDS = {GeoNames, WordNet, Language resources, Lexicons, Linguistic linked data, lemon, RDF}, PAGES = {229-242}, URL = {http://link.springer.com/chapter/10.1007/978-3-319-43808-5_18}, VOLUME = {9561}, DOI = {10.1007/978-3-319-43808-5}, ISBN = {978-3-319-43808-5}, BOOKTITLE = {Human Language Technology. Challenges for Computer Science and Linguistics}, EDITOR = {Vetulani, Z. and Uszkoreit, H. and Kubis, M.}, } @EDITORIAL{KHAN_2016_EDITORIAL_KVLFFPGU_355434, AUTHOR = {Khan, F. and Vintar, Š. and León Araúz, P. and Faber, P. and Frontini, F. and Parvizi, A. and Grčić Simeunović, L. and Unger, C.}, TITLE = {Language and Ontology (LangOnto2) & Terminology and Knowledge Structures (TermiKS)}, YEAR = {2016}, ABSTRACT = {This joint workshop brings together two different but closely related strands of research. On the one hand it looks at the overlap between ontologies and computational linguistics and on the other it explores the relationship between knowledge modelling and terminologies. In particular the workshop aims to create a forum for discussion in which the different relationships and commonalities between these two areas can be explored in detail, as well as presenting cutting edge research in each of the two individual areas. A significant amount of human knowledge can be found in texts. It is not surprising that languages such as OWL, which allow us to formally represent this knowledge, have become more and more popular both in linguistics and in automated language processing. For instance ontologies are now of core interest to many NLP fields including Machine Translation, Question Answering, Text Summarization, Information Retrieval, and Word Sense Disambiguation. 
At a more abstract level, however, ontologies can also help us to model and reason about phenomena in natural language semantics. In addition, ontologies and taxonomies can also be used in the organisation and formalisation of linguistically relevant categories such as those used in tagsets for corpus annotation. Notably also, the fact that formal ontologies are being increasingly accessed by users with limited to no background in formal logic has led to a growing interest in developing accessible front ends that allow for easy querying and summarisation of ontologies. It has also led to work in developing natural language interfaces for authoring ontologies and evaluating their design. Additionally in recent years there has been a renewed interest in the linguistic aspects of accessing, extracting, representing, modelling and transferring knowledge. Numerous tools for the automatic extraction of terms, term variants, knowledge-rich contexts, definitions, semantic relations and taxonomies from specialized corpora have been developed for a number of languages, and new theoretical approaches have emerged as potential frameworks for the study of specialized communication. However, the building of adequate knowledge models for practitioners (e.g. experts, researchers, translators, teachers etc.), on the one hand, and NLP applications (including cross-language, cross-domain, cross-device, multi-modal, multi-platform applications), on the other hand, still remains a challenge. The papers included in the workshop range across a wide variety of different areas and reflect the strong inter-disciplinary approach, which characterises both areas of research. 
In addition we are very happy to include two invited talks in the program presented by authorities in their respective fields: Pamela Faber from the field of terminology, and John McCrae, an expert on linguistic linked data and the interface between NLP and ontologies.}, KEYWORDS = {lexicons, ontologies}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, } @INPROCEEDINGS{BRANDO_2016_INPROCEEDINGS_BAF_348461, AUTHOR = {Brando, C. and Abadie, N. and Frontini, F.}, TITLE = {Linked Data Quality for Domain-Specific Named-Entity Linking}, YEAR = {2016}, ABSTRACT = {We present outgoing research whose goal is to assess quality of Linked Data for its usage in domain-specific Named-entity Linking (NEL). NEL is the task of assigning appropriate referents, typically an Uniform Resource Identifier (URI), to mentions of entities (e.g. persons or places) identified in textual documents. Nowadays, many of these approaches strongly rely on Linked Data as knowledge base. However, the scope of the chosen data sets can have an important influence on the performances of NEL as texts often concern specific domains of knowledge. In this paper, we describe LD quality aspects which should be considered for improving NEL in domain-specific contexts, then propose quality metrics and compute them for both French DBpedia and the French National Library (BnF) data sets thereby to discuss the opportunity of using these data sets for the linking of authors in old French Literary digital editions. 
Our ultimate goal is to improve a Natural Language Processing (NLP) pipeline for the automatic annotation of these texts.}, KEYWORDS = {Linked Data, Quality, Named Entity Linking}, PAGES = {13-24}, URL = {https://publications.cnr.it/doc/348461}, CONFERENCE_NAME = {Atelier-Qualité des Données du Web (QLOD'16) Joint à la 16ème édition de la conférence internationale francophone EGC 2016}, CONFERENCE_PLACE = {Reims}, CONFERENCE_DATE = {19/01/2016}, } @INPROCEEDINGS{DELGRATTA_2016_INPROCEEDINGS_DFMPRBKSC_355425, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Pardelli, G. and Russo, I. and Bartolini, R. and Khan, F. and Soria, C. and Calzolari, N.}, TITLE = {LREC as a Graph: People and Resources in a Network}, YEAR = {2016}, ABSTRACT = {This proposal describes a new way to visualise resources in the LREMap, a community-built repository of language resource descriptions and uses. The LREMap is represented as a force-directed graph, where resources, papers and authors are nodes. The analysis of the visual representation of the underlying graph is used to study how the community gathers around LRs and how LRs are used in research.}, KEYWORDS = {Language Resources, Resources Documentation, Data Visualisation}, PAGES = {2529-2532}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Portoroz, Slovenia}, CONFERENCE_DATE = {23-28 may}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Goggi, S. and Grobelnik, M. and Maegaard, B. and Mariani, J. and Mazo, H. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{GOGGI_2016_INPROCEEDINGS_GPBFMMDB_350374, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. 
and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2016}, ABSTRACT = {Here we present the final results of the MAPS (Marine Planning and Service Platform) project, an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. The system takes as input non-textual data (measurements) and text - both published papers and documentation - and it provides an advanced search facility thanks to the rich set of metadata and, above all, to the possibility of a refined and domain targeted key-word indexing of texts using Natural Language Processing (NLP) techniques. The paper describes the system in its details providing also evidence of evaluation.}, KEYWORDS = {Information Extraction, Search Engine, Operative Oceanography}, PAGES = {104-111}, URL = {https://publications.cnr.it/doc/350374}, VOLUME = {17}, ISBN = {978-90-77484-27-2}, CONFERENCE_NAME = {Seventeenth International Conference on Grey Literature. A New Wave of Textual and Non-Textual Grey Literature}, CONFERENCE_PLACE = {Amsterdam}, CONFERENCE_DATE = {December 1st-2nd 2015}, EDITOR = {Farace, D. and Frantzen, J.}, } @INPROCEEDINGS{NAHLI_2016_INPROCEEDINGS_NFMKZK_355436, AUTHOR = {Nahli, O. and Frontini, F. and Monachini, M. and Khan, F. and Zarghili, A. and Khalfi, M.}, TITLE = {Al Qamus al Muhit, a Medieval Arabic Lexicon in LMF}, YEAR = {2016}, ABSTRACT = {This paper describes the conversion into LMF, a standard lexicographic digital format of 'al-qāmūs al-muḥīṭ, a Medieval Arabic lexicon. The lexicon is first described, then all the steps required for the conversion are illustrated. The work will produce a useful lexicographic resource for Arabic NLP, but is also interesting per se, to study the implications of adapting the LMF model to the Arabic language. 
Some reflections are offered as to the status of roots with respect to previously suggested representations. In particular, roots, in our opinion, are not to be treated as lexical entries, but modeled as lexical metadata for classifying and identifying lexical entries. In this manner, each root connects all entries that are derived from it.}, KEYWORDS = {Arabic Lexicon, LMF, Al Qamus al Muhit}, PAGES = {943-950}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Portoroz, Slovenia}, CONFERENCE_DATE = {23-28 may}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Goggi, S. and Grobelnik, M. and Maegaard, B. and Mariani, J. and Mazo, H. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{FRONTINI_2016_INPROCEEDINGS_FCG_357603, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J. G.}, TITLE = {REDEN ONLINE: Disambiguation, Linking and Visualisation of References in TEI Digital Editions}, YEAR = {2016}, KEYWORDS = {entity linking, visualization, literary criticism, TEI}, URL = {http://dh2016.adho.org/abstracts/362}, CONFERENCE_NAME = {Digital Humanities 2016}, CONFERENCE_PLACE = {Jagiellonian University \& Pedagogical University, Kraków}, CONFERENCE_DATE = {11-16/07/2016}, BOOKTITLE = {Digital Humanities 2016: Conference Abstracts}, } @INPROCEEDINGS{MANZELLA_2016_INPROCEEDINGS_MBBDDFMMMNS_355476, AUTHOR = {Manzella, G. M. R. and Bartolini, R. and Bustaffa, F. and D'Angelo, P. and De Mattei, M. and Frontini, F. and Maltese, M. and Medone, D. and Monachini, M. and Novellino, A. 
and Spada, A.}, TITLE = {Marine Planning and Service Platform: Specific Ontology Based semantic Search Engine Serving Data Management and Sustainable Development}, YEAR = {2016}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is aiming at building a computer platform supporting a Marine Information and Knowledge System. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. In oceanography the cost related to data collection is very high and the new paradigm is based on the concept to collect once and re-use many times (for re-analysis, marine environment assessment, studies on trends, etc). This concept requires the access to quality controlled data and to information that is provided in reports (grey literature) and/or in relevant scientific literature. Hence, creation of new technology is needed by integrating several disciplines such as data management, information systems, knowledge management...}, KEYWORDS = {Marine Information, Knowledge System}, PAGES = {2}, URL = {http://meetingorganizer.copernicus.org/EGU2016/orals/20144}, VOLUME = {18}, PUBLISHER = {Copernicus GmbH (Katlenburg-Lindau, Germania)}, ISSN = {1607-7962}, CONFERENCE_NAME = {European Geosciences Union General Assembly (EGU 2016)}, CONFERENCE_PLACE = {Vienna, Austria}, CONFERENCE_DATE = {17-22 aprile 2016}, BOOKTITLE = {Geophysical research abstracts (Online)}, } @INPROCEEDINGS{MONACHINI_2016_INPROCEEDINGS_MEF_368272, AUTHOR = {Monachini, M. and Enea, A. 
and Frontini, F.}, TITLE = {CLARIN-IT: servizi per la comunità italiana delle scienze umane e sociali}, YEAR = {2016}, ABSTRACT = {CLARIN-IT -The Italian Common Language Resources and Technology Infrastructure: Monica Monachini - CLARIN Italian National Coordinator Alessandro Enea - Responsible of ILCforCLARIN \& contact person for IDEM Francesca Frontini - Standing Committee for CLARIN Technical Centres (SCCTC) ILC-CNR National Representative}, KEYWORDS = {CLARIN-IT, The Italian Common Language Resources and Technology Infrastructure}, URL = {http://www.clarin-it.it/en/content/clarin-it-idem-day-2016}, CONFERENCE_NAME = {CLARIN-IT @ IDEM Day 2016}, CONFERENCE_PLACE = {Roma [Università degli Studi di Roma Tre]}, CONFERENCE_DATE = {6-8 giugno 2016}, } @ARTICLE{DELGRATTA_2015_ARTICLE_DFKM_287051, AUTHOR = {Del Gratta, R. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {Converting the PAROLE SIMPLE CLIPS Lexicon into RDF with lemon}, YEAR = {2015}, ABSTRACT = {This paper describes the publication and linking of (parts of) PAROLE SIMPLE CLIPS (PSC), a large scale Italian lexicon, to the Semantic Web and the Linked Data cloud using the lemon model. The main challenge of the conversion is discussed, namely the reconciliation between the PSC semantic structure which contains richly encoded semantic information, following the qualia structure of the Generative Lexicon theory and the lemon view of lexical sense as a reified pairing of a lexical item and a concept in an ontology. The result is two datasets: one consists of a list of lemon lexical entries with their lexical properties, relations and senses; the other consists of a list of OWL individuals representing the referents for the lexical senses. 
These OWL individuals are linked to each other by a set of semantic relations and mapped onto the SIMPLE OWL ontology of higher level semantic types.}, KEYWORDS = {lemon, linked data, generative lexicon, RDF, OWL, lexical resource}, PAGES = {387-392}, URL = {http://www.semantic-web-journal.net/content/converting-parole-simple-clips-lexicon-rdf-lemon-0}, VOLUME = {6}, DOI = {10.3233/SW-140168}, PUBLISHER = {IOS Press (Amsterdam, Paesi Bassi)}, ISSN = {1570-0844}, JOURNAL = {Semantic web (Print)}, } @ARTICLE{GOGGI_2015_ARTICLE_GMFBPDBM_334894, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS) An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2015}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting a Marine Information and Knowledge System, as part of the data management activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. We will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. 
The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers.}, KEYWORDS = {Marine Science Search Engine Source Data Oceanography}, PAGES = {171-178}, URL = {https://publications.cnr.it/doc/334894}, VOLUME = {11}, PUBLISHER = {TextRelease (Amsterdam, Paesi Bassi)}, ISSN = {1574-1796}, JOURNAL = {The Grey journal (Print)}, } @INCOLLECTION{BRANDO_2015_INCOLLECTION_BFG_334082, AUTHOR = {Brando, C. and Frontini, F. and Ganascia, J.}, TITLE = {Disambiguation of Named Entities in Cultural Heritage Texts Using Linked Data Sets}, YEAR = {2015}, ABSTRACT = {This paper proposes a graph-based algorithm baptized REDEN for the disambiguation of authors' names in French literary criticism texts and scientific essays from the 19th century. It leverages knowledge from different Linked Data sources in order to select candidates for each author mention, then performs fusion of DBpedia and BnF individuals into a single graph, and finally decides the best referent using the notion of graph centrality. Some experiments are conducted in order to identify the best size of disambiguation context and to assess the influence on centrality of specific relations represented as edges. This work will help scholars to trace the impact of authors' ideas across different works and time periods.}, KEYWORDS = {Named-entity disambiguation Centrality Linked data Data fusion Digital humanities}, PAGES = {505-514}, URL = {http://link.springer.com/chapter/10.1007%2F978-3-319-23201-0_51}, VOLUME = {539}, DOI = {10.1007/978-3-319-23201-0_51}, ISBN = {978-3-319-23200-3}, BOOKTITLE = {New Trends in Databases and Information Systems}, EDITOR = {Morzy, T. and Valduriez, P. 
and Bellatreche, L.}, } @INPROCEEDINGS{BRANDO_2015_INPROCEEDINGS_BFG_344351, AUTHOR = {Brando, C. and Frontini, F. and Ganascia, J.}, TITLE = {Linked data for toponym linking in French literary texts}, YEAR = {2015}, ABSTRACT = {The present article discusses first experiments in toponym linking of Modern French digital editions aiming to provide an external referent to Linked Data sources. We have so far focused on testing two knowledge bases - French DBpedia and Geonames - for recall. Results highlight quality issues in these data sets for usage in NLP-tasks in domain-specific heritage texts.}, KEYWORDS = {Named-Entity Linking Linked Data Digital Humanities}, URL = {https://publications.cnr.it/doc/344351}, DOI = {10.1145/2837689.2837699}, PUBLISHER = {Association for Computing Machinery (New York, N. Y, Stati Uniti d'America)}, ISSN = {1933-7825}, ISBN = {978-1-4503-3937-7}, CONFERENCE_NAME = {GIR'15 9th Workshop on Geographic Information Retrieval}, CONFERENCE_PLACE = {Paris}, CONFERENCE_DATE = {26-27th November, 2015}, BOOKTITLE = {GIR '15 Proceedings of the 9th Workshop on Geographic Information Retrieval}, EDITOR = {Purves, R. S. and Jones, C. B.}, } @INPROCEEDINGS{DELGRATTA_2015_INPROCEEDINGS_DFMPRBGKQSC_342213, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Pardelli, G. and Russo, I. and Bartolini, R. and Goggi, S. and Khan, F. and Quochi, V. and Soria, C. and Calzolari, N.}, TITLE = {Visualising Italian Language Resources: a Snapshot}, YEAR = {2015}, ABSTRACT = {This paper aims to provide a first snapshot of Italian Language Resources (LRs) and their uses by the community, as documented by the papers presented at two different conferences, LREC2014 and CLiC-it 2014. The data of the former were drawn from the LOD version of the LRE Map, while those of the latter come from manually analyzing the proceedings. 
The results are presented in the form of visual graphs and confirm the initial hypothesis that Italian LRs require concrete actions to enhance their visibility.}, KEYWORDS = {Italian Language Resources}, PAGES = {100-104}, URL = {https://books.openedition.org/aaccademia/1277?lang=it}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {Second Italian Conference on Computational Linguistics CLiC-it 2015}, CONFERENCE_PLACE = {Trento}, CONFERENCE_DATE = {3-4 December 2015}, BOOKTITLE = {Proceedings of the Second Italian Conference on Computational Linguistics CLiC-it 2015}, EDITOR = {Bosco, C. and Tonelli, S. and Zanzotto, F. M.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_307909, AUTHOR = {Frontini, F. and Boukhaled, M. A. and Ganascia, J.}, TITLE = {Linguistic Pattern Extraction and Analysis for Classic French Plays}, YEAR = {2015}, ABSTRACT = {Great authors of fiction and theatre have the capacity of creating memorable characters that take life and become almost as real as living persons to the readers/audience. The study of characterization, namely of how this is achieved, is a well-researched topic in corpus stylistics: for instance (Mahlberg, 2012) attempts to identify typical lexical patterns for memorable Dickens' characters by extracting those lexical bundles that stand out (namely are overrepresented) in comparison to a general corpus. In other works, authorship attribution methods are applied to the different characters of a play to identify whether the author has been able to provide each of them with a "distinct" voice. For instance (Vogel \& Lynch, 2008) compare individual Shakespeare characters against the whole play or even against all plays of the same author. The purpose of this paper is to propose a methodology for the study of characterization of several characters in French plays of the classical period. The tools developed are meant to support textual analysis by: 1) Verifying the degree of characterization of each character with respect to others. 
2) Automatically inducing a list of linguistic features that are significant, representative for that character. Preliminary investigations have been conducted on plays by Molière, cross-comparing four protagonists from four different plays. The proposed methodology relies on sequential data mining for the extraction of linguistic patterns and on correspondence analysis for comparison of patterns frequencies in each character and for the visual representation of such differences.}, KEYWORDS = {computational stylometry, theater, sequential pattern mining}, PAGES = {3}, URL = {http://lipn.univ-paris13.fr/~charnois/conscilaGenres/resumes/frontini.pdf}, CONFERENCE_NAME = {Journée ConSciLa (Confrontations en Sciences du Langage) Grammaire des genres et des styles: quelles approches privilégier ?}, CONFERENCE_PLACE = {Paris}, CONFERENCE_DATE = {16/01/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_330648, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J.}, TITLE = {Semantic Web based Named Entity Linking for Digital Humanities and Heritage Texts}, YEAR = {2015}, ABSTRACT = {This paper proposes a graph based methodology for automatically disambiguating authors' mentions in a corpus of French literary criticism. Candidate referents are identified and evaluated using a graph based named entity linking algorithm, which exploits a knowledge-base built out of two different resources (DBpedia and the BnF linked data). The algorithm expands previous ones applied for word sense disambiguation and entity linking, with good results. Its novelty resides in the fact that it successfully combines a generic knowledge base such as DBpedia with a domain specific one, thus enabling the efficient annotation of minor authors. 
This will help specialists to follow mentions of the same author in different works of literary criticism, and thus to investigate their literary appreciation over time.}, KEYWORDS = {named-entity linking, linked data, digital humanities}, PAGES = {77-88}, URL = {http://ceur-ws.org/Vol-1364/paper9.pdf}, VOLUME = {Vol-1364}, PUBLISHER = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)}, ISSN = {1613-0073}, CONFERENCE_NAME = {SW4SH 2015 Semantic Web for Scientific Heritage 2015}, CONFERENCE_PLACE = {Portoroz, Slovenia}, CONFERENCE_DATE = {June, 1st 2015}, BOOKTITLE = {SW4SH 2015 Semantic Web for Scientific Heritage 2015}, EDITOR = {Zucker, A. and Draelants, I. and Zucker, C. F. and Monnin, A.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_331797, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J.}, TITLE = {Domain-adapted named-entity linker using Linked Data}, YEAR = {2015}, ABSTRACT = {We present REDEN, a tool for graph-based Named Entity Linking that allows for the disambiguation of entities using domain-specific Linked Data sources and different configurations (e.g. context size). It takes TEI-annotated texts as input and outputs them enriched with external references (URIs). The possibility of customizing indexes built from various knowledge sources by defining temporal and spatial extents makes REDEN particularly suited to handle domain-specific corpora such as enriched digital editions in the Digital Humanities.}, KEYWORDS = {named-entity disambiguation, evaluation, linked data, digital humanities}, PAGES = {10}, URL = {http://ceur-ws.org/Vol-1386/named_entity.pdf}, VOLUME = {Vol-1386}, PUBLISHER = {M. 
Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen (Aachen, Germania)}, ISSN = {1613-0073}, CONFERENCE_NAME = {Workshop on NLP Applications: Completing the Puzzle co-located with the 20th International Conference on Applications of Natural Language to Information Systems (NLDB 2015)}, CONFERENCE_PLACE = {Passau, Germany}, CONFERENCE_DATE = {June 17-19, 2015}, BOOKTITLE = {Proceedings of the Workshop on NLP Applications: Completing the Puzzle}, EDITOR = {Izquierdo, R.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FQM_304304, AUTHOR = {Frontini, F. and Quochi, V. and Monachini, M.}, TITLE = {Generative Lexicon and polysemy: inducing logical alternations}, YEAR = {2015}, ABSTRACT = {The current paper brings together the results of a series of experiments for inducing regular sense alternations, or regular/ logical polysemy, from a computational lexicon based on the Generative Lexicon theory. The results are discussed in light of the potential benefits and uses of the amended algorithm.}, KEYWORDS = {Polysemy, Generative Lexicon, Logical Alternations}, PAGES = {7}, URL = {https://publications.cnr.it/doc/304304}, PUBLISHER = {MAPLEX2015 Multiple Approaches to Lexicon Conference (Yamagata, JPN)}, CONFERENCE_NAME = {MAPLEX2015 Multiple Approaches to Lexicon Conference}, CONFERENCE_PLACE = {Yamagata, Japan}, CONFERENCE_DATE = {February 9-10, 2015}, EDITOR = {Hsieh, S. and Kanzaki, K.}, } @INPROCEEDINGS{GOGGI_2015_INPROCEEDINGS_GMFBPDBM_329370, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. 
and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS): An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2015}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting a Marine Information and Knowledge System, as part of the data management activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. We will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers.}, KEYWORDS = {Marine Science Search Engine Source Data Oceanography}, PAGES = {108-114}, URL = {http://www.textrelease.com/gl16program.html}, VOLUME = {16}, PUBLISHER = {TextRelease (Amsterdam, NLD)}, ISBN = {978-90-77484-23-4}, CONFERENCE_NAME = {Sixteenth International Conference on Grey Literature Grey Literature Lobby: Engines and Requesters for Change}, CONFERENCE_PLACE = {Library of Congress Washington D. C., USA}, CONFERENCE_DATE = {December 8-9 2014}, BOOKTITLE = {Grey Literature Lobby: Engines and Requesters for Change}, EDITOR = {Farace, D. and Frantzen, J.}, } @INPROCEEDINGS{KHAN_2015_INPROCEEDINGS_KF_329646, AUTHOR = {Khan, F. 
and Frontini, F.}, TITLE = {Using Ontologies to Model Polysemy in Lexical Resources}, YEAR = {2015}, ABSTRACT = {In this article we look at how the use of ontologies can assist in analysing polysemy in natural languages. We develop a model, the Lexical-Sense-Ontology model (LSO), to represent the interaction between a lexicon and ontology, based on lemon. We use the LSO model to show how default rules can be used to represent semi-productivity in polysemy as well as discussing the kinds of ontological information that are useful for studying polysemy.}, KEYWORDS = {Polysemy, Ontology, Default Logic}, URL = {http://www.aclweb.org/anthology/W/W15/W15-0404.pdf}, CONFERENCE_NAME = {Workshop on Language and Ontologies}, CONFERENCE_PLACE = {London}, CONFERENCE_DATE = {14/04/2015}, BOOKTITLE = {Proceedings of the Workshop on Language and Ontologies}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_F_315607, AUTHOR = {Frontini, F.}, TITLE = {What makes them different: the extraction of distinctive linguistic patterns for the protagonists of Molière's plays}, YEAR = {2015}, ABSTRACT = {Quantitative approaches to the study of style in literature are far from a modern novelty. They have however recently gained more and more popularity, not only among computer scientists and corpus linguistics, but also among some influential literary critics. The present panorama of quantitative techniques is very rich, but often confusing, with a plethora of denominations and methodologies often difficult to reconcile; computer scientists classify their work as stylometry or computational stylistics, while linguists may use the label corpus stylistics, and finally critics like Franco Moretti will talk about macro-analysis and distant reading. 
This talk will try first to identify the differences between these trends, distinguishing between corpus based and corpus driven approaches on the methodological side (Quiniou et al 2012), and (following Ramsey 2011) between experimental and hermeneutical approaches. Finally we will present ongoing work conducted at Labex OBVIL on syntactic pattern extraction from theatrical characters. The proposed approach, using correspondence analysis to extract distinctive traits for each character, is imagined rather as an hermeneutical tool, in the sense that it does not seek to demonstrate that two different characters have been endowed with significantly different stylistic traits by the playwright, but it does enable the visualisation of their relative distances and the extraction of those elements that make them distinct.}, URL = {https://publications.cnr.it/doc/315607}, CONFERENCE_NAME = {Cycle des séminaires ILES LIMSI}, CONFERENCE_PLACE = {Paris}, CONFERENCE_DATE = {03/02/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_F_329647, AUTHOR = {Frontini, F.}, TITLE = {Analyse et extraction des motifs syntaxiques dans la prose de Robert Challe et de ses apocryphes}, YEAR = {2015}, ABSTRACT = {Cette contribution présente une extraction et une analyse des motifs syntaxiques dans la prose de Robert Challe et de ses apocryphes. 
En particulier nous analysons les différences dans la syntaxe des contes originaux des Illustres Françaises et celle des contes apocryphes.}, KEYWORDS = {Robert Challe, authorship attribution, stilistica computazionale}, URL = {http://obvil.paris-sorbonne.fr/sites/default/files/projets/analyse_motifs_syntaxiques_if_et_apocryphes.pdf}, CONFERENCE_NAME = {Robert Challe: approches numériques des questions d'auctorialité}, CONFERENCE_PLACE = {Paris}, CONFERENCE_DATE = {28/03/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_F_332668, AUTHOR = {Frontini, F.}, TITLE = {Mining for characterising patterns in literature using correspondence analysis: an experiment on French novels}, YEAR = {2015}, ABSTRACT = {The talk presents and describes a bottom up methodology for the detection of stylistic traits in the syntax of literary texts. The extraction of syntactic patterns is performed blindly by a sequential pattern mining algorithm, while the identification of significant and interesting features is performed later by using correspondence analysis and filtering for the most contributive patterns.}, KEYWORDS = {computational stylistics, French}, URL = {https://publications.cnr.it/doc/332668}, CONFERENCE_NAME = {Göttingen Dialog in Digital Humanities}, CONFERENCE_PLACE = {Göttingen}, CONFERENCE_DATE = {14/07/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_F_336421, AUTHOR = {Frontini, F.}, TITLE = {Trattamento automatico del linguaggio per le Digital Humanities. Riconoscimento e disambiguazione di menzioni di autori in testi di critica letteraria}, YEAR = {2015}, ABSTRACT = {L'intervento scaturisce da una collaborazione tra ILC-CNR e il Labex OBVIL di Parigi. Lo scopo del progetto è quello di adattare ed estendere algoritmi di riconoscimento, classificazione e disambiguazione di entità nominate (in particolare menzioni di autori) nel "Corpus Critique", un insieme di testi di critica letteraria francese che il Labex OBVIL sta pubblicando in edizione digitale (formato TEI). 
Tali algoritmi si basano su approcci TAL supervisionati e non supervisionati e sfruttano massicciamente le basi di conoscenza, sia generiche (DBpedia) che di dominio, disponibili online sotto forma di linked data; lo scopo di tali lavori è di produrre risorse testuali annotate per facilitare la ricerca nell'ambito della storia della critica letteraria e della storia delle idee in generale. Durante il seminario verranno introdotti i formati e le risorse utilizzate, i criteri e le problematiche di annotazione emersi, e gli algoritmi di riconoscimento e disambiguazione di entità nominate sviluppati. Più in generale si cercherà di mostrare con alcuni casi di utilizzo quali siano i vantaggi di arricchire risorse testuali con questo livello di annotazione, nel più ampio contesto delle convergenze tra digital humanities e trattamento automatico del linguaggio. Link http://obvil.paris-sorbonne.fr/ https://github.com/cvbrandoe/REDEN/blob/master/README.md}, KEYWORDS = {Named-entity disambiguation Centrality Linked data Data fusion Digital humanities}, URL = {https://publications.cnr.it/doc/336421}, CONFERENCE_NAME = {Seminario di Cultura Digitale}, CONFERENCE_PLACE = {Pisa}, CONFERENCE_DATE = {04/11/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FB_342185, AUTHOR = {Frontini, F. and Bénard, E.}, TITLE = {The Syntax of Stage. Studying Linguistic Patterns in Molière}, YEAR = {2015}, ABSTRACT = {Theatrical dialogue is a very peculiar type of communication, namely a written text that aims to mimic orality. Great playwrights use dialogue to create iconic human types, that actors then bring to life. Characterisation, comical effects and other plot devices are often achieved through the use of specific linguistic patterns. For this reason theatrical dialogue is an interesting test bed for computer-aided literary analysis and stylometric tools. 
In this talk we shall analyse the application of advanced pattern extraction techniques to the study of Molière's dialogue and characters, where by "pattern" we mean sequences of lexical elements and parts of speech. In particular we shall see how different types of extractions may provide experts with different views on the texts and target different aspects of stylistic choice.}, KEYWORDS = {Computational stylistics, syntactic patterns, Molière}, URL = {http://www.uni-goettingen.de/de/525494.html}, CONFERENCE_NAME = {Göttinger philologisches Forum}, CONFERENCE_PLACE = {Göttingen, Germany}, CONFERENCE_DATE = {03/12/2015}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_332819, AUTHOR = {Frontini, F. and Boukhaled, M. A. and Ganascia, J. G.}, TITLE = {Moliere's Raisonneurs: a quantitative study of distinctive linguistic patterns}, YEAR = {2015}, KEYWORDS = {Computational Stylistics, Correspondence analysis, Corpus linguistics, Molière}, PAGES = {114-117}, URL = {http://ucrel.lancs.ac.uk/cl2015/doc/CL2015-AbstractBook.pdf}, CONFERENCE_NAME = {Corpus Linguistics 2015}, CONFERENCE_PLACE = {Lancaster}, CONFERENCE_DATE = {21-24/07/2015}, BOOKTITLE = {Corpus Linguistics 2015-Abstract Book}, EDITOR = {Formato, F. and Hardie, A.}, } @INPROCEEDINGS{GOGGI_2015_INPROCEEDINGS_GPBFMMDB_342221, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2015}, ABSTRACT = {Here we present the final results of MAPS (Marine Planning and Service Platform), an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. 
In previous publications the general architecture of the system as well as the set of metadata (Common Data Index) used to describe the documents were presented [3]; it was shown how individual oceanographic data-sets could be indexed within the MAPS library by types of measure, measurement tools, geographic areas, and also linked to specific textual documentation. Documentation is described using the current international standards: Title, Authors, Publisher, Language, Date of publication, Body/Institution, Abstract, etc.; serial publications are described in terms of ISSN, while books are assigned ISBN; content of various types on electronic networks is described by means of doi and url. Each description is linked to the document. Thanks to this, the MAPS library already enables researchers to go from structured oceanographic data to documents describing it. But this was not enough: documents may contain important information that has not been encoded in the metadata. Thus an advanced Search Engine was put in place that uses semantic-conceptual technologies in order to extract key concepts from unstructured text such as technical documents (reports and grey literature) and scientific papers and to make them indexable and searchable by the end user in the same way as the structured data (such as oceanographic observations and metadata) is. More specifically once a document is uploaded in the MAPS library, key domain concepts in documents are extracted via a natural language processing pipeline and used as additional information for its indexing. The key term identification algorithm is based on marine concepts that were pre-defined in a domain ontology, but crucially it also allows for the discovery of new related concepts. So for instance starting from the domain term salinity, related terms such as sea salinity and average sea salinity will also be identified as key terms and used for indexing and searching documents. 
A hybrid search system is then put in place, where users can search the library by metadata or by free text queries. In the latter case, the NLP pipeline performs an analysis of the text of the query, and when key concepts are matched, the relevant documents are presented. The results may be later refined by using other structured information (e.g. date of publication, area, ...). Currently a running system has been put in place, with data from satellites, buoys and sea stations; such data is documented and searchable by its relevant metadata and documentation. Results of quantitative evaluation in terms of information retrieval measures will be presented in the poster; more specifically, given an evaluation set defined by domain experts and composed of pre-defined queries together with documents that answer such queries, it will be shown how the system is highly accurate in retrieving the correct documents from the library. Though this work focuses on oceanography, its results may be easily extended to other domains; more generally, the possibility of enhancing the visibility and accessibility of grey literature via its connection to the data it describes and to an advanced full text indexing are of great relevance for the topic of this conference.}, KEYWORDS = {Information Extraction, Search Engine, Oceanography}, PAGES = {76-77}, URL = {https://publications.cnr.it/doc/342221}, VOLUME = {17}, ISBN = {978-90-77484-26-5}, CONFERENCE_NAME = {Seventeenth International Conference on Grey Literature. A New Wave of Textual and Non-Textual Grey Literature}, CONFERENCE_PLACE = {Amsterdam}, CONFERENCE_DATE = {December 1-2}, BOOKTITLE = {GL17 Program Book}, EDITOR = {Farace, D. and Frantzen, J.}, } @INPROCEEDINGS{DELGRATTA_2014_INPROCEEDINGS_DFKMS_285395, AUTHOR = {Del Gratta, R. and Frontini, F. and Khan, F. and Mariani, J. 
and Soria, C.}, TITLE = {The LREMap for Under-Resourced Languages}, YEAR = {2014}, ABSTRACT = {A complete picture of currently available language resources and technologies for the under-resourced languages of Europe is still lacking. Yet this would help policy makers, researchers and developers enormously in planning a roadmap for providing all languages with the necessary instruments to act as fully equipped languages in the digital era. In this paper we introduce the LRE Map and show its utility for documenting available language resources and technologies for under-resourced languages. The importance of the serialization of the LREMap into (L)LOD along with the possibility of its connection to a wider world is also introduced.}, KEYWORDS = {language resources, less-resourced languages, linguistic linked open data}, PAGES = {78-83}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, CONFERENCE_NAME = {Workshop on Collaboration and Computing for Under-Resourced Languages in the Linked Open Data Era (CCURL 2014)}, CONFERENCE_PLACE = {Reykjavik}, CONFERENCE_DATE = {26/05/2014}, BOOKTITLE = {Proceedings of the Workshop on Collaboration and Computing for Under-Resourced Languages in the Linked Open Data Era (CCURL 2014)}, EDITOR = {Pretorius, L. and Soria, C. and Baroni, P.}, } @INPROCEEDINGS{FRONTINI_2014_INPROCEEDINGS_FQM_291452, AUTHOR = {Frontini, F. and Quochi, V. 
and Monachini, M.}, TITLE = {Polysemy alternations extraction using the PAROLE SIMPLE CLIPS Italian lexicon}, YEAR = {2014}, ABSTRACT = {This paper presents the results of an experiment of polysemy alternations induction from a lexicon (Utt and Padó, 2011; Frontini et al., 2014), discussing the results and proposing an amendment in the original algorithm.}, KEYWORDS = {Language Resources and Technologies}, PAGES = {175-179}, URL = {http://clic.humnet.unipi.it/proceedings/Proceedings-CLICit-2014.pdf}, DOI = {10.12871/CLICIT2014134}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-88-67-41472-7}, CONFERENCE_NAME = {Proceedings of the First Italian Conference on Computational Linguistics CLiC-it 2014 \& the Fourth International Workshop EVALITA 2014}, CONFERENCE_PLACE = {Pisa}, CONFERENCE_DATE = {9-11 December 2014, Pisa}, EDITOR = {Basili, R. and Lenci, A. and Magnini, B.}, } @INPROCEEDINGS{FRONTINI_2014_INPROCEEDINGS_FQPUM_286984, AUTHOR = {Frontini, F. and Quochi, V. and Padó, S. and Utt, J. and Monachini, M.}, TITLE = {Polysemy Index for Nouns: an Experiment on Italian using the PAROLE SIMPLE CLIPS Lexical Database}, YEAR = {2014}, ABSTRACT = {An experiment is presented to induce a set of polysemous basic type alternations (such as ANIMAL-FOOD, or BUILDING-INSTITUTION) by deriving them from the sense alternations found in an existing lexical resource. The paper builds on previous work and applies those results to the Italian lexicon PAROLE SIMPLE CLIPS. The new results show how the set of frequent type alternations that can be induced from the lexicon is partly different from the set of polysemy relations selected and explicitly applied by lexicographers when building it. 
The analysis of mismatches shows that frequent type alternations do not always correspond to prototypical polysemy relations, nevertheless the proposed methodology represents a useful tool offered to lexicographers to systematically check for possible gaps in their resource.}, KEYWORDS = {Polysemy, lexical resources, semantics}, PAGES = {2955-2963}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Reykjavik, Iceland}, CONFERENCE_DATE = {26-31 may}, BOOKTITLE = {LREC 2014 Ninth International Conference on Language Resources and Evaluation Proceedings}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{KHAN_2014_INPROCEEDINGS_KBF_286824, AUTHOR = {Khan, F. and Boschetti, F. and Frontini, F.}, TITLE = {Using lemon to Model Lexical Semantic  Shift in Diachronic Lexical Resources}, YEAR = {2014}, ABSTRACT = {In this paper we propose a model, called lemonDIA, for representing lexical semantic change using the lemon framework and based on the ontological notion of the perdurant. Namely we extend the notion of sense in lemon by adding a temporal dimension and then define a class of perdurant entities that represents a shift in meaning of a word and which contains different related senses. We start by discussing the general problem of semantic shift and the utility of being able to easily access and represent such information in diachronic lexical resources. 
We then describe our model and illustrate it with examples.}, KEYWORDS = {lemon, linked data, OWL, ontologies, perdurants, semantic shift}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/workshops/LREC2014Workshop-LDL2014%20Proceedings.pdf}, CONFERENCE_NAME = {3rd Workshop on Linked Data in Linguistics: Multilingual Knowledge Resources and Natural Language Processing (LDL2014)}, CONFERENCE_PLACE = {Reykjavik}, CONFERENCE_DATE = {May 27th, 2014}, BOOKTITLE = {Proceedings of the 3rd Workshop on Linked Data in Linguistics (LDL-2014)}, EDITOR = {Chiarcos, C. and McCrae, J. P. and Osenova, P. and Vertan, C.}, } @INPROCEEDINGS{MONEGLIA_2014_INPROCEEDINGS_MBFGKMP_286990, AUTHOR = {Moneglia, M. and Brown, S. and Frontini, F. and Gagliardi, G. and Khan, F. and Monachini, M. and Panunzi, A.}, TITLE = {The IMAGACT Visual Ontology. an Extendable Multilingual Infrastructure for the Representation of Lexical Encoding of Action}, YEAR = {2014}, ABSTRACT = {Action verbs have many meanings, covering actions in different ontological types. Moreover, each language categorizes action in its own way. One verb can refer to many different actions and one action can be identified by more than one verb. The range of variations within and across languages is largely unknown, causing trouble for natural language processing tasks. IMAGACT is a corpus-based ontology of action concepts, derived from English and Italian spontaneous speech corpora, which makes use of the universal language of images to identify the different action types extended by verbs referring to action in English, Italian, Chinese and Spanish. This paper presents the infrastructure and the various linguistic information the user can derive from it. IMAGACT makes explicit the variation of meaning of action verbs within one language and allows comparisons of verb variations within and across languages. 
Because the action concepts are represented with videos, extension into new languages beyond those presently implemented in IMAGACT is done using competence-based judgments by mother-tongue informants without intense lexicographic work involving underdetermined semantic description}, KEYWORDS = {Lexicon, Lexical Database, Ontologies}, PAGES = {3425-3432}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Reykjavik, Iceland}, CONFERENCE_DATE = {26-31 may}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{PALLOTTI_2014_INPROCEEDINGS_PFAMF_287029, AUTHOR = {Pallotti, G. and Frontini, F. and Affè, F. and Monachini, M. and Ferrari, S.}, TITLE = {Presenting a System of Human-Machine Interaction for Performing Map Tasks}, YEAR = {2014}, ABSTRACT = {A system for human machine interaction is presented, that offers second language learners of Italian the possibility of assessing their competence by performing a map task, namely by guiding a virtual follower through a map with written instructions in natural language. The underlying natural language processing algorithm is described, and the map authoring infrastructure is presented.}, KEYWORDS = {Language learning, human machine interaction, map tasks}, PAGES = {3963-3966}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Reykjavik, Iceland}, CONFERENCE_DATE = {2}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. 
and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{FRONTINI_2014_INPROCEEDINGS_F_315438, AUTHOR = {Frontini, F.}, TITLE = {La mappa delle opinioni e dei sentimenti estratte dai social media}, YEAR = {2014}, URL = {https://publications.cnr.it/doc/315438}, CONFERENCE_NAME = {Seminario rivolto agli alunni dell'Istituto Tecnico Economico "F. Carrara" di Lucca, organizzato dall'Istituto di Linguistica Computazionale "A. Zampolli" del CNR di Pisa}, CONFERENCE_PLACE = {Pisa, Area della Ricerca del CNR}, CONFERENCE_DATE = {31 marzo 2014}, } @INPROCEEDINGS{GOGGI_2014_INPROCEEDINGS_GMFBPDBM_291816, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS): An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2014}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting Operative Oceanography in its activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. Community and Requirements. Operative Oceanography is the branch of marine research which deals with the development of integrated systems for examining and modeling the ocean monitoring and forecast. Experts need access to real-time data on the state of the sea such as forecasts on temperatures, streams, tides and the relevant scientific literature. This finds application in many areas, ranging from civilian and military safety to protection of off-shore and coastal infrastructures. The metadata. 
The set of metadata associated with marine data is defined in the CDI (Common Data Index) documented standard. They encode: the types of sizes which have been measured; the measurement tools; the platform which has been employed; the geographic area where measures have been taken; the environmental matrix; the descriptive documentation. As concerns the scientific documentation, at the current stage of the CDI standard, a document is shaped around the following metadata: Title, Authors, Version, ISBN/DOI, Topic, Date of publication, Body/Institution, Abstract. The search engine. The query system (which is actually under development) has been designed for operating with structured data - the metadata - and raw data - the associated technical and scientific documentation. Full-text technologies are often unsuccessful when applied to this type of queries since they assume the presence of specific keywords in the text; in order to fix this problem, the MAPS project suggests to use different semantic technologies for retrieving the text and data and thus getting much more complying results. In the Poster we will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. 
The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers.}, KEYWORDS = {Marine Science Search Engine Source Data Oceanography}, PAGES = {93-94}, URL = {http://greyguide.isti.cnr.it/dfdownloadnew.php?ident=GLConference/GL16/2014-G01-015\&langver=en\&scelta=Metadata}, ISBN = {978-90-77484-24-1}, CONFERENCE_NAME = {Sixteenth International Conference on Grey Literature Grey Literature Lobby: Engines and Requesters for Change}, CONFERENCE_PLACE = {Library of Congress Washington D. C., USA}, CONFERENCE_DATE = {December 8-9, 2014}, EDITOR = {Farace, C. B. D. and Frantzen, J.}, } @INPROCEEDINGS{KHAN_2014_INPROCEEDINGS_KFM_291637, AUTHOR = {Khan, F. and Frontini, F. and Monachini, M.}, TITLE = {A Model for Representing Diachronic Semantic Information in Lexico-Semantic Resources on the Semantic Web}, YEAR = {2014}, ABSTRACT = {The Semantic Web offers a way of publishing structured data online that facilitates the interlinking of different datasets stored at different online locations; indeed one of the main aims of the Semantic Web movement is to actively encourage this enrichment of online datasets with information from other resources, in order to avoid the problem of so called 'data islands'. In contrast to conventional hyperlinks however the links between different resources on the Semantic Web can be given semantic types and classified hierarchically. Data published on the Semantic Web is referred to as Linked Data; 
if, in addition, this data is available with an open license then it can be referred to as Linked Open Data (Heath 2011).}, KEYWORDS = {Cultural resources, Heritage resources}, PAGES = {1-3}, URL = {http://www.dh.uni-leipzig.de/wo/wp-content/uploads/2014/11/Fahad-Khan-Francesca-Frontini-and-Monica-Monachini-A-Model-for-Representing.pdf}, CONFERENCE_NAME = {Greek and Latin in an age of Open Data. Open Philology Project}, CONFERENCE_PLACE = {University of Leipzig, GERMANY}, CONFERENCE_DATE = {December 1-4, 2014}, } @TECHREPORT{DEMATTEI_2014_TECHREPORT_DMDMBF_335399, AUTHOR = {De Mattei, M. and Medone, D. and D'Angelo, P. and Monachini, M. and Bartolini, R. and Frontini, F.}, TITLE = {MAPS: Architettura del Sistema}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1.2.2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il presente documento è il deliverable "D3.1 - Architettura del Sistema" del progetto MAPS (Marine Planning and Service Platform). Il progetto MAPS è un'evoluzione del progetto precedente Marine. Tale evoluzione si articola su tre aspetti diversi: - Un meccanismo di federazione dei dati, che consenta di rendere disponibili ai propri utenti non soltanto i dati prodotti internamente da sistema Marine ma anche quelli resi disponibili da altri sistemi similari, soddisfacendo così un più ampio ambito di esigenze informative. Il deliverable D2.2, Modello della Soluzione specifica in dettaglio queste nuove funzionalità. - Un Catalogo dei Documenti che, conservando la documentazione tecnica e scientifica dei prodotti offerti, possa documentare in modo accurato le modalità di misurazione, elaborazione e controllo dei prodotti forniti e quindi i relativi ambiti di applicabilità. 
- Un sistema di ricerca capace di selezionare i dati necessari ad uno scopo determinato non soltanto sulla base della loro tipologia, della loro dislocazione territoriale o di altre informazioni simili contenute nei metadati associati come avviene oggi nella maggior parte dei sistemi esistenti, ma anche sulla base delle informazioni contenute nella documentazione tecnica e scientifica. Tali funzionalità sono specificate nel deliverable D1.3 - Modello della Soluzione.}, KEYWORDS = {Marine Science Search Engine Source Data Oceanography}, PAGES = {1-35}, URL = {https://publications.cnr.it/doc/335399}, } @TECHREPORT{DEMATTEI_2014_TECHREPORT_DMMFBM_335403, AUTHOR = {De Mattei, M. and Medone, D. and Maltese, M. and Frontini, F. and Bartolini, R. and Monachini, M.}, TITLE = {META: Report di progettazione degli algoritmi individuati}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1.2.2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il deliverable definisce l'architettura del Sistema di Estrazione Eventi Meteo realizzato dagli autori nell'ambito del progetto META. Il sistema estrae da contenuti online informazione su eventi meteo critici verificatesi in Liguria e nel nord della Toscana.}, KEYWORDS = {Ontology, Information Extraction, Taxonomy}, PAGES = {1-19}, URL = {https://publications.cnr.it/doc/335403}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBM_335400, AUTHOR = {Frontini, F. and Bartolini, R. 
and Monachini, M.}, TITLE = {MAPS: Stato dell'Arte}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1.2.2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012 Il documento descrive lo stato dell'arte delle tecnologie linguistiche applicate ai sistemi di ricerca semantica.}, KEYWORDS = {Marine Science Search Engine Source Data Oceanography}, PAGES = {1-21}, URL = {https://publications.cnr.it/doc/335400}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBM_335402, AUTHOR = {Frontini, F. and Bartolini, R. and Monachini, M.}, TITLE = {META: Report sui modelli e tecniche linguistiche}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1.2.2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il deliverable riassume lo stato dell'arte delle tecnologie semantiche che possono essere impiegate nella realizzazione del progetto META. Il progetto META è un progetto di ricerca e sviluppo tecnologico finanziato dalla Regione Liguria con i fondi POR-FESR 2007-2013 della Comunità Europea che mira alla realizzazione di un sistema per l'allerta di eventi meteo critici in Liguria e nel nord della Toscana. Nell'ambito del progetto META le tecnologie semantiche sono utilizzate per estrarre eventi meteo di interesse da articoli pubblicati in rete o sui social network.}, KEYWORDS = {Ontology, Information Extraction, Semantic Web, Search Engine}, PAGES = {1-20}, URL = {https://publications.cnr.it/doc/335402}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBMPG_287039, AUTHOR = {Frontini, F. and Bartolini, R. and Monachini, M. and Pardelli, G. and Goggi, S.}, TITLE = {Stato dell'arte dei motori semantici. 
Progetto MAPS, programma operativo regionale POR-FESR (2007-2013)}, YEAR = {2014}, ABSTRACT = {Il presente documento è il deliverable "D1.1 - Stato dell'Arte dei motori semantici del progetto MAPS (Marine Planning and Service Platform). Il progetto MAPS è una evoluzione del progetto precedente Marine. Tramite il progetto Marine (Bando Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013 - pos n.1) è stata realizzata una piattaforma informatica di supporto all'Oceanografia Operativa capace di raccogliere dati marini per renderli poi disponibili ai ricercatori e alle organizzazioni interessate tramite protocolli standard. Lo scopo del progetto MAPS è quello di realizzare una Catalogo di Documenti contenente informazioni per la piattaforma Marine. Caratteristica di MAPS è di fornire accesso ai dati oceanografici sia attraverso la ricerca per metadati, sia attraverso la ricerca semantica contenuta nella manualistica tecnico scientifica di riferimento.}, PAGES = {1-22}, URL = {https://publications.cnr.it/doc/287039}, } @INPROCEEDINGS{FRONTINI_2013_INPROCEEDINGS_FDM_287280, AUTHOR = {Frontini, F. and Del Gratta, R. and Monachini, M.}, TITLE = {Linking the Geonames ontology to WordNet}, YEAR = {2013}, ABSTRACT = {This paper illustrates the transformation of the GeoNames ontology concepts, with their English labels and glosses, into a GeoDomain WordNet-like resource in English, its translation into Italian, and its linking to the existing generic WordNets of both languages.}, KEYWORDS = {GeoNames, WordNet, lemon}, PAGES = {263-267}, URL = {http://hnk.ffzg.hr/bibl/ltc2013/book/papers/OWN-2.pdf}, PUBLISHER = {Fundacja Uniwersytetu im A. 
Mickiewicza (Poznan, POL)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {6th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics}, CONFERENCE_PLACE = {Poznan, Poland}, CONFERENCE_DATE = {December 7-9, 2013}, BOOKTITLE = {Human Language Technologies as a Challenge for Computer Science and Linguistics. Proceedings, 6th Language \& Technology Conference, December 7-9, 2013, Poznań, Poland}, EDITOR = {Vetulani, Z. and Uszkoreit, H.}, } @INPROCEEDINGS{MARCHETTI_2013_INPROCEEDINGS_MTALDFM_287331, AUTHOR = {Marchetti, A. and Tesconi, M. and Abbate, S. and Lo Duca, A. and D'Errico, A. and Frontini, F. and Monachini, M.}, TITLE = {Tour-pedia: a web application for the analysis and visualization of opinions for tourism domain}, YEAR = {2013}, ABSTRACT = {We present Tour-pedia an interactive web application that extracts opinions from reviews of accommodations from different sources available on-line. Polarity markers display on a map the different opinions. This tool is intended to help business operators to manage reputation on-line.}, KEYWORDS = {Visualization tools, opinion mining, NLP on social media, tourism reviews}, PAGES = {594-595}, URL = {http://www.iit.cnr.it/sites/default/files/ltc2013_opener_demo.pdf}, PUBLISHER = {Fundacja Uniwersytetu im A. Mickiewicza (Poznan, POL)}, ISBN = {978-83-932640-4-9}, CONFERENCE_NAME = {6th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics}, CONFERENCE_PLACE = {Poznan, Poland}, CONFERENCE_DATE = {December 7-9, 2013}, EDITOR = {Vetulani, Z. and Uszkoreit, H.}, } @INPROCEEDINGS{MONEGLIA_2013_INPROCEEDINGS_MPGMRDKF_287346, AUTHOR = {Moneglia, M. and Panunzi, A. and Gagliardi, G. and Monachini, M. and Russo, I. and De Felice, I. and Khan, F. and Frontini, F.}, TITLE = {IMAGACT E-learning Platform for Basic Action Types. 
}, YEAR = {2013}, ABSTRACT = {Action verbs express important information in a sentence and they are the most frequent elements in speech, but they are also one of the most difficult parts of the lexicon to learn for L2 language learners, because languages segment these concepts in very different ways. The two sentences "Mary folds her shirt" and "Mary folds her arms" refer to two completely different types of action, as becomes evident when they are translated into another language (e.g., in Italian they would be translated as "Maria piega la camicia" and "Maria incrocia le braccia" respectively). IMAGACT e-learning platform aims to make these differences evident by creating a cross-linguistic ontology of action types, whose nodes consist of 3D scenes, each of which relates to one action type. In order to identify these types, contexts of use have been extracted from English and Italian spontaneous speech corpora for around 600 high frequency action verbs (for each language). All instances that refer to similar events (e.g., fold the shirt/ the blanket) are grouped under one single action type: each one of these types is then represented by a linguistic best example and a short video that represents simple actions (e.g. a man taking a glass from a table). The action types extracted for Italian and English are compared and merged into one cross-linguistic ontology of action. IMAGACT has provided an internet based annotation infrastructure to derive this information from corpora. The project is now completed for the Italian and English lexicon, data extraction for Chinese and Spanish is ongoing. Reference to prototypical imagery is crucial in order to bootstrap the learning process. By selecting the set of 3D scenes referred to by a verb in one language and viewing the type of activity represented therein learners can directly understand the range of applicability of each verb. 
Thanks to an easy interface, a user can access the English/Italian/Chinese lexicon by lemma or directly by 3D scenes. For example, searching for the verb "to turn", s/he will be presented with a number of scenes, showing the various action types associated to that verb. Clicking on a scene s/he will know how this type of action is referred to in the other languages.}, KEYWORDS = {Ontology}, PAGES = {85-89}, URL = {https://publications.cnr.it/doc/287346}, PUBLISHER = {libreriauniversitaria.it (Limena, ITA)}, ISBN = {978-88-6292-423-8}, CONFERENCE_NAME = {International Conference "ICT for Language Learning", 6th edition}, CONFERENCE_PLACE = {Florence, Italy}, CONFERENCE_DATE = {14-15 november 2013}, BOOKTITLE = {Conference Proceedings. ICT for Language Learning}, EDITOR = {Pixel}, } @INPROCEEDINGS{RUSSO_2013_INPROCEEDINGS_RDFKM_285373, AUTHOR = {Russo, I. and De Felice, I. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {(Fore)seeing actions in objects. Acquiring distinctive affordances from language}, YEAR = {2013}, ABSTRACT = {In this paper we investigate if conceptual information concerning objects' affordances as possibilities for actions anchored to an object can be at least partially acquired through language. Considering verb-noun pairs as the linguistic realizations of relations between actions performed by an agent and objects we collect this information from the ImagAct dataset, a linguistic resource obtained from manual annotation of basic action verbs, and from a web corpus (itTenTen). 
The notion of affordance verb as the most distinctive verb in ImagAct enables a comparison with distributional data that reveal how lemma rankings based on a semantic association measure mirror that of affordances as the most distinctive actions an object can be involved in.}, PAGES = {151-161}, URL = {https://docs.google.com/viewer?a=v\&pid=sites\&srcid=ZGVmYXVsdGRvbWFpbnxubHBjczIwMTN8Z3g6MTI0ZGMzYWYwYmMxNjY1Mg}, CONFERENCE_NAME = {NLPCS 2013-10th International Workshop on Natural Language Processing and Cognitive Science}, CONFERENCE_PLACE = {Marseille}, CONFERENCE_DATE = {15-17/10/2013}, BOOKTITLE = {Proceedings of NLPCS 2013-10th International Workshop on Natural Language Processing and Cognitive Science}, EDITOR = {Sharp, B. and Zock, M.}, } @INPROCEEDINGS{RUSSO_2013_INPROCEEDINGS_RFDKM_287456, AUTHOR = {Russo, I. and Frontini, F. and De Felice, I. and Khan, F. and Monachini, M.}, TITLE = {Disambiguation of Basic Action Types through Nouns' Telic Qualia}, YEAR = {2013}, ABSTRACT = {Knowledge about semantic associations between words is effective to disambiguate word senses. The aim of this paper is to investigate the role and the relevance of telic information from SIMPLE in the disambiguation of basic action types of Italian HOLD verbs (prendere, 'to take', raccogliere, 'to pick up', pigliare 'to grab' etc.). 
We propose an experiment to compare the results obtained with telic information from SIMPLE with basic co-occurrence information extracted from corpora (most salient verbs modifying nouns) classified in terms of general semantic classes to avoid data sparseness.}, PAGES = {70-75}, URL = {http://www.aclweb.org/anthology/W13-5410}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-937284-98-5}, CONFERENCE_NAME = {6th International Conference on Generative Approaches to the Lexicon Generative Lexicon and Distributional Semantics}, CONFERENCE_PLACE = {Pisa, Italy}, CONFERENCE_DATE = {24-25/09/2013}, BOOKTITLE = {Proceedings of the 6th International Conference on Generative Approaches to the Lexicon. Generative Lexicon and Distributional Semantics}, EDITOR = {Saurí, R. and Calzolari, N. and Huang, C. and Lenci, A. and Monachini, M. and Pustejovsky, J.}, } @INPROCEEDINGS{CASELLI_2012_INPROCEEDINGS_CFQRR_287038, AUTHOR = {Caselli, T. and Frontini, F. and Quochi, V. and Rubino, F. and Russo, I.}, TITLE = {Flexible Acquisition of Subcategorization Frames in Italian}, YEAR = {2012}, ABSTRACT = {Lexica of predicate-argument structures constitute a useful tool for several tasks in NLP. This paper describes a web-service system for automatic acquisition of verb subcategorization frames (SCFs) from parsed data in Italian. The system acquires SCFs in an unsupervised manner. We created two gold standards for the evaluation of the system, the first by mixing together information from two lexica (one manually created and the second automatically acquired) and manual exploration of corpus data and the other annotating data extracted from a specialized corpus (environmental domain). Data filtering is accomplished by means of the maximum likelihood estimate (MLE). The evaluation phase has allowed us to identify the best empirical MLE threshold for the creation of a lexicon (P=0.653, R=0.557, F1=0.601). 
In addition to this, we assigned to the extracted entries of the lexicon a confidence score based on the relative frequency and evaluated the extractor on domain specific data. The confidence score will allow the final user to easily select the entries of the lexicon in terms of their reliability: one of the most interesting feature of this work is the possibility the final users have to customize the results of the SCF extractor, obtaining different SCF lexica in terms of size and accuracy.}, KEYWORDS = {lexicon, automatic acquisition, subcategorisation frames}, PAGES = {2842-2848}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/390.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {9782951740877}, CONFERENCE_NAME = {Eight International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {23-25 Maggio 2012}, BOOKTITLE = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{DELGRATTA_2012_INPROCEEDINGS_DFMQRAL_223098, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Quochi, V. and Rubino, F. and Abrate, M. and Lo Duca, A.}, TITLE = {L-LEME: an Automatic Lexical Merger based on the LMF Standard}, YEAR = {2012}, ABSTRACT = {The present paper describes LMF LExical MErger (L-LEME), an architecture to combine two lexicons in order to obtain new resource(s). L-LEME relies on standards, thus exploiting the benefits of the ISO Lexical Markup Framework (LMF) to ensure interoperability. L-LEME is meant to be dynamic and heavily adaptable: it allows the users to configure it to meet their specific needs. 
The L-LEME architecture is composed of two main modules: the Mapper, which takes in input two lexicons A and B and a set of user-defined rules and instructions to guide the mapping process (Directives D) and gives in output all matching entries. The algorithm also calculates a cosine similarity score. The Builder takes in input the previous results, a set of Directives D1 and produces a new LMF lexicon C. The Directives allow the user to define its own building rules and different merging scenarios. L-LEME is applied to a specific concrete task within the PANACEA project, namely the merging of two Italian SubCategorization Frame (SCF) lexicons. The experiment is interesting in that A and B have different philosophies behind, being A built by human introspection and B automatically extracted. Ultimately, L-LEME has interesting repercussions in many language technology applications}, KEYWORDS = {LMF, Lexicon mapping, similarity score}, PAGES = {31-40}, URL = {https://publications.cnr.it/doc/223098}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC) 2012}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {2012}, BOOKTITLE = {Proceedings of the LREC 2012 Workshop on Language Resource Merging}, EDITOR = {Bel, N. and Gavrilidou, M. and Monachini, M. and Quochi, V. and Rimell, L.}, } @INPROCEEDINGS{DELGRATTA_2012_INPROCEEDINGS_DFRRC_220182, AUTHOR = {Del Gratta, R. and Frontini, F. and Rubino, F. and Russo, I. and Calzolari, N.}, TITLE = {The Language Library: supporting community effort for collective resource production}, YEAR = {2012}, ABSTRACT = {Relations among phenomena at different linguistic levels are at the essence of language properties but today we focus mostly on one specific linguistic layer at a time, without (having the possibility of) paying attention to the relations among the different layers. 
At the same time our efforts are too much scattered without much possibility of exploiting other people's achievements. To address the complexities hidden in multilayer interrelations even small amounts of processed data can be useful, improving the performance of complex systems. Exploiting the current trend towards sharing we want to initiate a collective movement that works towards creating synergies and harmonisation among different annotation efforts that are now dispersed. In this paper we present the general architecture of the Language Library, an initiative which is conceived as a facility for gathering and making available through simple functionalities the linguistic knowledge the field is able to produce, putting in place new ways of collaboration within the LRT community. In order to reach this goal, a first population round of the Language Library has started around a core of parallel/comparable texts that have been annotated by several contributors submitting a paper for LREC2012. The Language Library has also an ancillary aim related to language documentation and archiving and it is conceived as a theory-neutral space which allows for several language processing philosophies to coexist.}, KEYWORDS = {annotation, metadata, scientific crowdsourcing}, PAGES = {43-49}, URL = {https://publications.cnr.it/doc/220182}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {23-25 may 2012}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FABBMPPS_278677, AUTHOR = {Frontini, F. and Aliprandi, C. and Bacciu, C. and Bartolini, R. and Marchetti, A. and Parenti, E. and Piccinonno, F. 
and Soru, T.}, TITLE = {GLOSS, an infrastructure for the semantic annotation and mining of documents in the public security domain}, YEAR = {2012}, ABSTRACT = {Efficient access to information is crucial in the work of organizations that require decision taking in emergency situations. This paper gives an outline of GLOSS, an integrated system for the analysis and retrieval of data in the environmental and public security domain. We shall briefly present the GLOSS infrastructure and its use, and how semantic information of various kinds is integrated, annotated and made available to the final users.}, KEYWORDS = {semantic annotation, text mining, geographic data}, PAGES = {21-25}, URL = {https://publications.cnr.it/doc/278677}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {Eight International Conference on Language Resources and Evaluation. LREC'12. European Language Resources Association: France}, CONFERENCE_PLACE = {Istanbul}, CONFERENCE_DATE = {21-27/05/2012}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FQR_220785, AUTHOR = {Frontini, F. and Quochi, V. and Rubino, F.}, TITLE = {Automatic Creation of Quality Multi-Word Lexica from Noisy Text Data}, YEAR = {2012}, ABSTRACT = {This paper describes the design of a tool for the automatic creation of multi-word lexica that is deployed as a web service and runs on automatically web-crawled data within the framework of the PANACEA platform. The main purpose of our task is to provide a (computationally "light") tool that creates a full high quality lexical resource of multi-word items. Within the platform, this tool is typically inserted in a work flow whose first step is automatic web-crawling. Therefore, the input data of our lexical extractor is intrinsically noisy. The paper evaluates the capacity of the tool to deal with noisy data, and in particular with texts containing a significant amount of duplicated paragraphs. 
The accuracy of the extraction of multi-word expressions from the original crawled corpus is compared to the accuracy of the extraction from a later "de-duplicated" version of the corpus. The paper shows how our method can extract with sufficiently good precision also from the original, noisy crawled data. The output of our tool is a multi-word lexicon formatted and encoded in XML according to the Lexical Mark-up Framework.}, KEYWORDS = {Lexical induction, multi-word extraction, web-based distributed platform, noisy data}, URL = {http://www.kde.cs.tut.ac.jp/~aono/pdf/COLING2012/AND/pdf/AND04.pdf}, PUBLISHER = {ACM, Association for computing machinery (New York, USA)}, ISBN = {978-1-4503-1919-5}, CONFERENCE_NAME = {AND 2012}, CONFERENCE_PLACE = {Mumbai, India}, CONFERENCE_DATE = {December 9, 2012}, BOOKTITLE = {Proceedings of the Sixth Workshop on Analytics for Noisy Unstructured Text Data}, } @INPROCEEDINGS{GAVRILIDOU_2012_INPROCEEDINGS_GLDPPMFDFAM_219704, AUTHOR = {Gavrilidou, M. and Labropoulou, P. and Desipri, E. and Piperidis, S. and Papageorgiou, H. and Monachini, M. and Frontini, F. and Declerck, T. and Francopoulo, G. and Arranz, V. and Mapelli, V.}, TITLE = {The META-SHARE Metadata Schema for the Description of Language Resources}, YEAR = {2012}, ABSTRACT = {This paper presents a metadata model for the description of language resources proposed in the framework of the META-SHARE infrastructure, aiming to cover both datasets and tools/technologies used for their processing. 
It places the model in the overall framework of metadata models, describes the basic principles and features of the model, elaborates on the distinction between minimal and maximal versions thereof, briefly presents the integrated environment supporting the LRs description and search and retrieval processes and concludes with work to be done in the future for the improvement of the model.}, KEYWORDS = {metadata, META-SHARE, LRs description}, PAGES = {1090-1097}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/index.html}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {23-25 may 2012}, } @INPROCEEDINGS{MONACHINI_2012_INPROCEEDINGS_MFDRKGP_220211, AUTHOR = {Monachini, M. and Frontini, F. and De Felice, I. and Russo, I. and Khan, F. and Gagliardi, G. and Panunzi, A.}, TITLE = {Verb interpretation for basic action types: annotation, ontology induction and creation of prototypical scenes}, YEAR = {2012}, ABSTRACT = {In the last 20 years dictionaries and lexicographic resources such as WordNet have started to be enriched with multimodal content. Short videos depicting basic actions support the user's need (especially in second language acquisition) to fully understand the range of applicability of verbs. The IMAGACT project has among its results a repository of action verbs ontologically organised around prototypical action scenes in the form of both video recordings and 3D animations. The creation of the IMAGACT ontology, which consists in deriving action types from corpus instances of action verbs, intra and cross linguistically validating them and producing the prototypical scenes thereof, is the preliminary step for the creation of a resource that users can browse by verb, learning how to match different action prototypes with the correct verbs in the target language. 
The mapping of IMAGACT types onto WordNet synsets allows for a mutual enrichment of both resources.}, KEYWORDS = {ontology of actions, lexical resource, 3D animations}, PAGES = {69-80}, URL = {https://publications.cnr.it/doc/220211}, CONFERENCE_NAME = {COLING 2012-3rd Workshop on Cognitive Aspects of the Lexicon (CogALex-III)}, CONFERENCE_PLACE = {Mumbai, India}, CONFERENCE_DATE = {15 Dicembre 2012}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MGPFRM_220262, AUTHOR = {Moneglia, M. and Gagliardi, G. and Panunzi, A. and Frontini, F. and Russo, I. and Monachini, M.}, TITLE = {IMAGACT: Deriving an Action Ontology from Spoken Corpora}, YEAR = {2012}, ABSTRACT = {This paper presents the IMAGACT annotation infrastructure which uses both corpus - based and competence - based methods for the simultaneous extraction of a language independent Action ontology from English and Italian spontaneous speech corpora. The infrastructure relies on an innovative methodology based on images of prototypical scenes and will identify high frequency action concepts in everyday life, suitable for the implementation of an open set of languages.}, KEYWORDS = {Action verbs Ontology imagery}, PAGES = {42-47}, URL = {https://publications.cnr.it/doc/220262}, ISBN = {978-90-74029-00-1}, CONFERENCE_NAME = {Eighth Joint ISO-ACL SIGSEM Workshop on Interoperable Semantic Annotation (ISA-8)}, CONFERENCE_PLACE = {Pisa, Italy}, CONFERENCE_DATE = {3-5 October 2012}, BOOKTITLE = {Proceedings of the Eight Joint ISO-ACL SIGSEM Workshop on Interoperable Semantic Annotation ISA-8}, EDITOR = {Bunt, H.}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MMCPFGR_219656, AUTHOR = {Moneglia, M. and Monachini, M. and Calabrese, O. and Panunzi, A. and Frontini, F. and Gagliardi, G. and Russo, I.}, TITLE = {The IMAGACT Cross-linguistic Ontology of Action. 
A new infrastructure for natural language disambiguation}, YEAR = {2012}, ABSTRACT = {Action verbs, which are highly frequent in speech, cause disambiguation problems that are relevant to Language Technologies. This is a consequence of the peculiar way each natural language categorizes Action i.e. it is a consequence of semantic factors. Action verbs are frequently "general", since they extend productively to actions belonging to different ontological types. Moreover, each language categorizes action in its own way and therefore the cross-linguistic reference to everyday activities is puzzling. This paper briefly sketches the IMAGACT project, which aims at setting up a cross-linguistic Ontology of Action for grounding disambiguation tasks in this crucial area of the lexicon. The project derives information on the actual variation of action verbs in English and Italian from spontaneous speech corpora, where references to action are high in frequency. Crucially it makes use of the universal language of images to identify action types, avoiding the underdeterminacy of semantic definitions. Action concept entries are implemented as prototypic scenes; this will make it easier to extend the Ontology to other languages.}, KEYWORDS = {Action verbs, Ontology, Imagery}, PAGES = {2606-2613}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/428_Paper.pdf}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Istanbul, Turkey}, CONFERENCE_DATE = {23-25 may 2012}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MMPFGR_220270, AUTHOR = {Moneglia, M. and Monachini, M. and Panunzi, A. and Frontini, F. and Gagliardi, G. and Russo, I.}, TITLE = {Mapping a corpus-induced ontology of action verbs on ItalWordNet}, YEAR = {2012}, ABSTRACT = {Action verbs are the least predictable linguistic type for bilingual dictionaries and they cause major problems for NLP technologies. 
This is not only because of language specific phraseology, but it is rather a consequence of the peculiar way each language categorizes events. In ordinary languages the most frequent action verbs are "general", since they extend productively to actions belonging to different ontological types. Moreover, each language categorizes actions in its own way and therefore the cross-linguistic reference to everyday activities is puzzling. A cross-linguistic stable ontology of actions is difficult to achieve because our knowledge on the actual variation of verbs across types of actions is largely unknown. This paper briefly presents the problems and the building strategies of the IMAGACT Ontology, which aims at filling this gap, and compares some early results on a set of Italian verbs with the information contained in ItalWordNet.}, KEYWORDS = {action verbs ontology image}, PAGES = {219-226}, URL = {https://publications.cnr.it/doc/220270}, ISBN = {978-80-263-0244-5}, CONFERENCE_NAME = {Global Wordnet Conference (GWC2012)}, CONFERENCE_PLACE = {Matsue, Japan}, CONFERENCE_DATE = {9-13 January 2012}, BOOKTITLE = {Proceedings of the 6th Global WordNet Conference (GWC2012)}, EDITOR = {Fellbaum, C. and Vossen, P.}, } @INPROCEEDINGS{QUOCHI_2012_INPROCEEDINGS_QFR_220778, AUTHOR = {Quochi, V. and Frontini, F. and Rubino, F.}, TITLE = {A MWE Acquisition and Lexicon Builder Web Service}, YEAR = {2012}, ABSTRACT = {This paper describes the development of a web-service tool for the automatic extraction of Multi-word expressions lexicons, which has been integrated in a distributed platform for the automatic creation of linguistic resources. The main purpose of the work described is thus to provide a (computationally "light") tool that produces a full lexical resource: multi-word terms/items with relevant and useful attached information that can be used for more complex processing tasks and applications (e.g. parsing, MT, IE, query expansion, etc.). 
The output of our tool is a MW lexicon formatted and encoded in XML according to the Lexical Mark-up Framework. The tool is already functional and available as a service. Evaluation experiments show that the tool precision is of about 80\%.}, KEYWORDS = {Multiword extraction, lexical resources, LMF, web services}, PAGES = {2291-2306}, URL = {http://aclweb.org/anthology/C/C12/C12-1140.pdf}, PUBLISHER = {Curran Associates (Red Hook, NY 12571, USA)}, ISBN = {9781627483896}, CONFERENCE_NAME = {International Conference on Computational Linguistics (COLING)}, CONFERENCE_PLACE = {Mumbai, India}, CONFERENCE_DATE = {December 2012}, BOOKTITLE = {Proceedings of COLING 2012: Technical Papers}, EDITOR = {Kay, M. and Boitet, C.}, } @INPROCEEDINGS{RUBINO_2012_INPROCEEDINGS_RFQ_220773, AUTHOR = {Rubino, F. and Frontini, F. and Quochi, V.}, TITLE = {Integrating NLP Tools in a Distributed Environment: A Case Study Chaining a Tagger with a Dependency Parser}, YEAR = {2012}, ABSTRACT = {The present paper tackles the issue of PoS tag conversion within the framework of a distributed web service platform for the automatic creation of language resources. PoS tagging is now considered a "solved problem"; yet, because of the differences in the tagsets, interchange of the various PoS taggers available is still hampered. In this paper we describe the implementation of a PoS-tagged-corpus converter, which is needed for chaining together in a workflow the FreeLing PoS tagger for Italian and the DESR dependency parser, given that these two tools have been developed independently. The conversion problems experienced during the implementation, related to the properties of the different tagsets and of tagset conversion in general, are discussed together with the solutions adopted. Finally, the converter is evaluated by assessing the impact of conversion on the performance of the dependency parser by comparing with the outcome of the native pipeline. 
From this we learn that in most cases parsing errors are due to actual tagging errors, and not to conversion itself. Besides, information on accuracy loss is an important feature in a distributed environment of (NLP) services, where users need to decide which services best suit their needs}, KEYWORDS = {PoS tag conversion, interoperability, NLP pipelines}, PAGES = {2125-2131}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/726.html}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, ISBN = {9782951740877}, CONFERENCE_NAME = {Language Resources and Evaluation Conference 2012}, CONFERENCE_PLACE = {Istanbul, Turchia}, CONFERENCE_DATE = {23-25 Maggio 2012}, BOOKTITLE = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{ABRATE_2012_INPROCEEDINGS_ABFLMM_220733, AUTHOR = {Abrate, M. and Bacciu, C. and Frontini, F. and Lapolla, M. N. and Marchetti, A. and Monachini, M.}, TITLE = {Web Language Identification Testing Tool}, YEAR = {2012}, ABSTRACT = {Nowadays a variety of tools for automatic language identification are available. Regardless of the approach used, at least two features can be identified as crucial to evaluate the performances of such tools: the precision of the presented results and the range of languages that can be detected. In this work we shall focus on a subtask of written language identification that is important to preserve and enhance multilinguality in the Web, i.e. detecting the language of a Web page given its URL. Most specifically, the final aim is to verify to which extent under-represented languages are recognized by available tools. 
The main specificity of Web Language Identification (WLI) lies in the fact that often an HTML page can provide interesting extralinguistic clues (URL domain name, metadata, encoding, etc) that can enhance accuracy. We shall first provide some data and statistics on the presence of languages on the web, secondly discuss existing practices and tools for language identification according to different metrics - for instance the approaches used and the number of supported languages - and finally make some proposals on how to improve current Web Language Identifiers. We shall also present a preliminary WLI service that builds on the Google Chromium Compact Language Detector; the WLI tool allows us to test the Google n-gram based algorithm against an adhoc gold standard of pages in various languages. The gold standard, based on a selection of Wikipedia projects, contains samples in languages for which no automatic recognition has been attempted; it can thus be used by specialists to develop and evaluate WLI systems.}, KEYWORDS = {Multilingual Web}, URL = {https://publications.cnr.it/doc/220733}, CONFERENCE_NAME = {The Multilingual Web-the Way Ahead}, CONFERENCE_PLACE = {Luxembourg}, CONFERENCE_DATE = {15-16 March 2012}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FMLMAB_348940, AUTHOR = {Frontini, F. and Monachini, M. and Lapolla, M. N. and Marchetti, A. and Abrate, M. and Bacciu, C.}, TITLE = {Web Language Identification Testing Tool}, YEAR = {2012}, ABSTRACT = {Nowadays a variety of tools for automatic language identification are available. Regardless of the approach used, at least two features can be identified as crucial to evaluate the performances of such tools: the precision of the presented results and the range of languages that can be detected. In this work we shall focus on a subtask of written language identification that is important to preserve and enhance multilinguality in the Web, i.e. detecting the language of a Web page given its URL. 
Most specifically, the final aim is to verify to which extent under-represented languages are recognized by available tools. The main specificity of Web Language Identification (WLI) lies in the fact that often an HTML page can provide interesting extralinguistic clues (URL domain name, metadata, encoding, etc) that can enhance accuracy. We shall first provide some data and statistics on the presence of languages on the web, secondly discuss existing practices and tools for language identification according to different metrics - for instance the approaches used and the number of supported languages - and finally make some proposals on how to improve current Web Language Identifiers. We shall also present a preliminary WLI service that builds on the Google Chromium Compact Language Detector; the WLI tool allows us to test the Google n-gram based algorithm against an ad-hoc gold standard of pages in various languages. The gold standard, based on a selection of Wikipedia projects, contains samples in languages for which no automatic recognition has been attempted; it can thus be used by specialists to develop and evaluate WLI systems.}, KEYWORDS = {Language Identification Tools, Multilingual Web}, PAGES = {1-1}, URL = {https://publications.cnr.it/doc/348940}, CONFERENCE_NAME = {W3C Workshop, Call for Participation: The Multilingual Web-The Way Ahead}, CONFERENCE_PLACE = {Luxembourg}, CONFERENCE_DATE = {15-16/03/2012}, } @TECHREPORT{ALIPRANDI_2012_TECHREPORT_ABBFLMPS_221743, AUTHOR = {Aliprandi, C. and Bacciu, C. and Bartolini, R. and Frontini, F. and Lapolla, N. and Marchetti, A. and Piccinonno, F. and Soru, T.}, TITLE = {Specifiche architetturali e funzionali}, YEAR = {2012}, ABSTRACT = {Questo documento contiene le specifiche funzionali ed architetturali del sistema GLOSS elaborate come risultato dell'obiettivo operativo 1. 
Tali specifiche debbono essere di riferimento per tutte le fasi di sviluppo dei vari componenti del sistema stesso e della loro integrazione in un prototipo dimostrativo. Ad una breve introduzione che richiama gli obiettivi generali del progetto, seguono: 1. La descrizione delle funzionalità suddivisa nelle varie fasi che compongono il flusso operativo di GLOSS. 2. La descrizione dell'architettura del sistema da realizzare nella quale si fornisce lo schema dell'integrazione dei vari componenti, il protocollo di comunicazione e memorizzazione dei dati che viene trattato più nel dettaglio nel documento D1.2 GAF - Gloss Annotation Format, e la descrizione di ciascun componente del sistema. Per sua natura, questo documento sarà soggetto a revisione durante tutto il periodo di sviluppo del sistema. Questa prima versione deve intendersi come guida per l'implementazione ed ha lo scopo di fornire a chi partecipa a questo progetto una visione generale delle funzionalità di GLOSS e come queste dovranno essere integrate nel prototipo dimostratore.}, KEYWORDS = {GLOSS specifiche funzionali}, URL = {https://publications.cnr.it/doc/221743}, } @TECHREPORT{PROKOPIDIS_2012_TECHREPORT_PPTPFRT_221582, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Poch Riera, M. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {D4. 5 Final Report on the Corpus Acquisition & Annotation subsystem and its components}, YEAR = {2012}, ABSTRACT = {PANACEA WP4 targets the creation of a Corpus Acquisition and Annotation (CAA) subsystem for the acquisition and processing of monolingual and bilingual language resources (LRs). The CAA subsystem consists of tools that have been integrated as web services in the PANACEA platform of LR production. 
D4.2 Initial functional prototype and documentation in T13 and D4.4 Report on the revised Corpus Acquisition \& Annotation subsystem and its components in T23 provided initial and updated documentation on this subsystem, while this deliverable presents the final documentation of the subsystem as it evolved after the third development cycle of the project. The deliverable is structured as follows. The Corpus Acquisition Component (i.e. the Focused Monolingual and Bilingual Crawlers (FMC/FBC)) is described in section 2. The final list of tools for corpus normalization (cleaning and de-duplication) is detailed in section 3. Section 4 provides documentation on all NLP tools included in the subsystem. Due to its nature, this deliverable aggregates considerable parts of all previous WP4 deliverables. The main new additions include a) new functionalities for, among others, crawling strategy, de-duplication, and detection of parallel document pairs; and b) new NLP tools for syntactic analysis, named entity recognition, tweet processing and anonymization.}, KEYWORDS = {Corpus Acquisition}, URL = {http://www.jotform.com/uploads/fabioaffeilc/30222975566357/225350067351490116/PANACEA}, } @TECHREPORT{QUOCHI_2012_TECHREPORT_QFBHPPBTTK_221616, AUTHOR = {Quochi, V. and Frontini, F. and Bartolini, R. and Hamon, O. and Poch Riera, M. and Padro, M. and Bel, N. and Thurmair, G. and Toral, A. and Kamran, A.}, TITLE = {D7. 4 Third evaluation report. Evaluation of PANACEA v3 and produced resources}, YEAR = {2012}, ABSTRACT = {D7.4 reports on the evaluation of the different components integrated in the PANACEA third cycle of development as well as the final validation of the platform itself. All validation and evaluation experiments follow the evaluation criteria already described in D7.1. 
The main goal of WP7 tasks was to test the (technical) functionalities and capabilities of the middleware that allows the integration of the various resource-creation components into an interoperable distributed environment (WP3) and to evaluate the quality of the components developed in WP5 and WP6. The content of this deliverable is thus complementary to D8.2 and D8.3 that tackle advantages and usability in industrial scenarios. It has to be noted that the PANACEA third cycle of development addressed many components that are still under research. The main goal for this evaluation cycle thus is to assess the methods experimented with and their potentials for becoming actual production tools to be exploited outside research labs. For most of the technologies, an attempt was made to re-interpret standard evaluation measures, usually in terms of accuracy, precision and recall, as measures related to a reduction of costs (time and human resources) in the current practices based on the manual production of resources. In order to do so, the different tools had to be tuned and adapted to maximize precision and for some tools the possibility to offer confidence measures that could allow a separation of the resources that still needed manual revision has been attempted. Furthermore, the extension to other languages in addition to English, also a PANACEA objective, has been evaluated. The main facts about the evaluation results are now summarized.}, KEYWORDS = {PANACEA, evaluation, machine translation}, URL = {http://hdl.handle.net/10230/22533}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQ_221631, AUTHOR = {Rimell, L. and Bel, N. and Padró, M. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {D6. 
2 Integrated Final Version of the Components for Lexical Acquisition}, YEAR = {2012}, ABSTRACT = {The PANACEA project has addressed one of the most critical bottlenecks that threaten the development of technologies to support multilingualism in Europe, and to process the huge quantity of multilingual data produced annually. Any attempt at automated language processing, particularly Machine Translation (MT), depends on the availability of language-specific resources. Such Language Resources (LR) contain information about the language's lexicon, i.e. the words of the language and the characteristics of their use. In Natural Language Processing (NLP), LRs contribute information about the syntactic and semantic behaviour of words - i.e. their grammar and their meaning - which inform downstream applications such as MT. To date, many LRs have been generated by hand, requiring significant manual labour from linguistic experts. However, proceeding manually, it is impossible to supply LRs for every possible pair of European languages, textual domain, and genre, which are needed by MT developers. Moreover, an LR for a given language can never be considered complete nor final because of the characteristics of natural language, which continually undergoes changes, especially spurred on by the emergence of new knowledge domains and new technologies. PANACEA has addressed this challenge by building a factory of LRs that progressively automates the stages involved in the acquisition, production, updating and maintenance of LRs required by MT systems. The existence of such a factory will significantly cut down the cost, time and human effort required to build LRs. WP6 has addressed the lexical acquisition component of the LR factory, that is, the techniques for automated extraction of key lexical information from texts, and the automatic collation of lexical information into LRs in a standardized format. 
The goal of WP6 has been to take existing techniques capable of acquiring syntactic and semantic information from corpus data, improving upon them, adapting and applying them to multiple languages, and turning them into powerful and flexible techniques capable of supporting massive applications. One focus for improving the scalability and portability of lexical acquisition techniques has been to extend existing techniques with more powerful, less "supervised" methods. In NLP, the amount of supervision refers to the amount of manual annotation which must be applied to a text corpus before machine learning or other techniques are applied to the data to compile a lexicon. More manual annotation means more accurate training data, and thus a more accurate LR. However, given that it is impractical from a cost and time perspective to manually annotate the vast amounts of data required for multilingual MT across domains, it is important to develop techniques which can learn from corpora with less supervision. Less supervised methods are capable of supporting both large-scale acquisition and efficient domain adaptation, even in the domains where data is scarce. Another focus of lexical acquisition in PANACEA has been the need of LR users to tune the accuracy level of LRs. Some applications may require increased precision, or accuracy, where the application requires a high degree of confidence in the lexical information used. At other times a greater level of coverage may be required, with information about more words at the expense of some degree of accuracy. Lexical acquisition in PANACEA has investigated confidence thresholds for lexical acquisition to ensure that the ultimate users of LRs can generate lexical data from the PANACEA factory at the desired level of accuracy.}, KEYWORDS = {Lexical Acquisition}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D6.2.pdf}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQD_221650, AUTHOR = {Rimell, L. and Bel, N. 
and Padró, M. and Frontini, F. and Monachini, M. and Quochi, V. and Del Gratta, R.}, TITLE = {D6.5 Merged dictionaries}, YEAR = {2012}, ABSTRACT = {This document presents the merged dictionaries delivered in PANACEA. Those dictionaries result from merging already existing lexica, generally for general domain, with domain specific lexica acquired using PANACEA platform. The domain specific lexica are presented and delivered in D6.3 and the merging repository that allowed the multilevel merging in D6.4.}, KEYWORDS = {merged dictionaries, computational lexicon}, URL = {http://www.panacea-lr.eu//en/deliverables/list}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQD_221755, AUTHOR = {Rimell, L. and Bel, N. and Padró, M. and Frontini, F. and Monachini, M. and Quochi, V. and Del Gratta, R.}, TITLE = {D6.3 Monolingual lexica for English, Spanish and Italian tuned for a particular domain (LAB and ENV)}, YEAR = {2012}, ABSTRACT = {This document presents the lexica acquired using PANACEA platform for Labour and Environment domains. The languages of the lexica are English, Spanish and Italian. The lexical information acquired depends on the language, according to the available tools in the platform.}, KEYWORDS = {Lexicon Acquisition}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D6.3.pdf}, } @INPROCEEDINGS{CALZOLARI_2011_INPROCEEDINGS_CDFR_205564, AUTHOR = {Calzolari, N. and Del Gratta, R. and Frontini, F. and Russo, I.}, TITLE = {The Language Library: Many Layers, More Knowledge}, YEAR = {2011}, ABSTRACT = {In this paper we outline the general concept of the Language Library, a new initiative that has the purpose of building a huge archive of structured collection of linguistic information. The Language Library is conceived as a community built repository and as an environment that allows language specialists to share multidimensional and multi-level annotated/processed resources. 
The first steps towards its implementation are briefly sketched.}, KEYWORDS = {Language Resources, Language Library}, PAGES = {93-97}, URL = {https://publications.cnr.it/doc/205564}, ISBN = {978-974-466-564-5}, CONFERENCE_NAME = {Workshop on Language Resources, Technology and Services in the Sharing Paradigm}, CONFERENCE_PLACE = {Chiang Mai}, CONFERENCE_DATE = {12 Novembre 2011}, } @INPROCEEDINGS{FRONTINI_2011_INPROCEEDINGS_FMGLPFAM_205601, AUTHOR = {Frontini, F. and Monachini, M. and Gavrilidou, M. and Labropoulou, P. and Piperidis, S. and Francopoulo, G. and Arranz, V. and Mapelli, V.}, TITLE = {A Metadata Schema for the Description of Language Resources (LRs)}, YEAR = {2011}, ABSTRACT = {This paper presents the metadata schema for describing language resources (LRs) currently under development for the needs of META-SHARE, an open distributed facility for the exchange and sharing of LRs. An essential ingredient in its setup is the existence of formal and standardized LR descriptions, cornerstone of the interoperability layer of any such initiative. The description of LRs is granular and abstractive, combining the taxonomy of LRs with an inventory of a structured set of descriptive elements, of which only a minimal subset is obligatory; the schema additionally proposes recommended and optional elements. Moreover, the schema includes a set of relations catering for the appropriate inter-linking of resources. The current paper presents the main principles and features of the metadata schema, focusing on the description of text corpora and lexical / conceptual resources.}, KEYWORDS = {metadata, language resources}, PAGES = {84-92}, URL = {https://publications.cnr.it/doc/205601}, ISBN = {978-974-466-564-5}, CONFERENCE_NAME = {Workshop on Language Resources, Technology and Services in the Sharing Paradigm}, CONFERENCE_PLACE = {Chiang Mai}, CONFERENCE_DATE = {12 Novembre 2011}, } @INPROCEEDINGS{FRONTINI_2011_INPROCEEDINGS_FM_205738, AUTHOR = {Frontini, F. 
and Monachini, M.}, TITLE = {Towards interfacing lexical and ontological resources}, YEAR = {2011}, ABSTRACT = {During the last two decades, the Computational Linguistics community has dedicated considerable effort to the research and development Lexical Resources (LRs), especially Computational Lexicons. These LRs, even though belonging to different linguistic approaches and theories, share a common element; all of them contain, explicitly or implicitly, an ontology as the means of organizing their structure.}, KEYWORDS = {language resources, ontologies}, PAGES = {26}, URL = {https://publications.cnr.it/doc/205738}, CONFERENCE_NAME = {ONTOLOGIES AND LEXICAL SEMANTICS}, CONFERENCE_PLACE = {Roma}, CONFERENCE_DATE = {01 Ottobre 2011}, } @TECHREPORT{ARRANZ_2011_TECHREPORT_ABBCCDFGMQRR_290606, AUTHOR = {Arranz, V. and Bel, N. and Budin, G. and Caselli, T. and Choukri, K. and Del Gratta, R. and Frontini, F. and Goggi, S. and Monachini, M. and Quochi, V. and Rubino, F. and Russo, I.}, TITLE = {The FLaReNet Databook}, YEAR = {2011}, ABSTRACT = {The FLaReNet Databook is not only the collection of all the factual material collected during the activities of the project, but also a set on innovative initiatives and instruments that will remain in place for the continuous collection of such "facts". The purpose of the Databook is in fact, on one side, to consolidate the analyses carried out in the project and, at the same time, to set up the proper mechanisms that will enable the provision of a continuous stream of relevant factual material, also after the end of the project.}, KEYWORDS = {Language Resources (LRs)}, PAGES = {1-8}, URL = {http://www.flarenet.eu/?q=FLaReNet_Databook}, } @TECHREPORT{DESIPRI_2011_TECHREPORT_DGLPFMAMFD_206406, AUTHOR = {Desipri, E. and Gavrilidou, M. and Labropoulou, P. and Piperidis, S. and Frontini, F. and Monachini, M. and Arranz, V. and Mapelli, V. and Francopoulo, G. 
and Declerck, T.}, TITLE = {Documentation and User Manual of the META-SHARE Metadata Model}, YEAR = {2011}, ABSTRACT = {The current deliverable presents the META-SHARE metadata schema v1.0, as implemented in the META-SHARE XSD's v1.0 released to (META-NET and PSP partners) in July 2011 for text corpora and lexical/conceptual resources and its supplement for audio corpora, tools and language descriptions (simplified/refactored version) as implemented in November. It is meant to act as a user manual, providing explanations on the model contents for LRs providers and LRs curators that wish to describe their resources in accordance to it. Work on the schema is ongoing and changes/updates to the model are constantly being made; where appropriate, some changes that are already under way are documented in this deliverable.}, KEYWORDS = {Language resources, metadata, standards}, PAGES = {150}, URL = {https://publications.cnr.it/doc/206406}, } @TECHREPORT{MONACHINI_2011_TECHREPORT_MFS_206457, AUTHOR = {Monachini, M. and Frontini, F. and Soria, C.}, TITLE = {KYOTO-LMF WordNet Representation Format}, YEAR = {2011}, ABSTRACT = {The format described in the following pages is the final revised proposal for representing wordnets inside the Kyoto project (henceforth "Kyoto-LMF wordnet format"). The reference model is Lexical Markup Framework (LMF), version 16, probably one of the most widely recognized standards for the representation of NLP lexicons. The goals of LMF are to provide a common model for the creation and use of such lexical resources, to manage the exchange of data between and among them, and to enable the merging of a large number of individual resources to form extensive global electronic resources. LMF was specifically designed to accommodate as many models of lexical representations as possible. 
Purposefully, it is designed as a meta-model, i.e. a high-level specification for lexical resources defining the structural constraints of a lexicon.}, KEYWORDS = {Wordnets, LMF, ISO, Representation formats, standards}, PAGES = {32}, URL = {https://publications.cnr.it/doc/206457}, } @TECHREPORT{PROKOPIDIS_2011_TECHREPORT_PPTRFRT_327309, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Riera, M. P. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {WP-4.4: Report on the revised Corpus Acquisition \& Annotation subsystem and its components}, YEAR = {2011}, KEYWORDS = {corpus acquisition, corpus annotation}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D4.4.pdf}, } @TECHREPORT{PROKOPIDIS_2011_TECHREPORT_PPTRFRT_327310, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Riera, M. P. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {WP-4.5: Final Report on the Corpus Acquisition \& Annotation subsystem and its components}, YEAR = {2011}, KEYWORDS = {corpus acquisition, corpus annotation}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D4.5.pdf}, } @TECHREPORT{VOSSEN_2011_TECHREPORT_VBRASADHMBF_206329, AUTHOR = {Vossen, P. and Bosma, W. and Rigau, G. and Agirre, E. and Soroa, A. and Aliprandi, C. and De Jonge, J. and Hielkema, F. and Monachini, M. and Bartolini, R. and Frontini, F.}, TITLE = {KyotoCore: integrated system for knowledge mining from text}, YEAR = {2011}, ABSTRACT = {In this deliverable, we describe KyotoCore, an integrated system for applying text mining. We describe the software architecture of KyotoCore, the single modules and the process flows. Finally, we describe a use case where we apply the complete process to an English database on estuaries.}, KEYWORDS = {Knowledge and text mining software}, PAGES = {56}, URL = {https://publications.cnr.it/doc/206329}, } @INCOLLECTION{JEZEK_2010_INCOLLECTION_JF_136473, AUTHOR = {Jezek, E. 
and Frontini, F.}, TITLE = {From Pattern Dictionary to Patternbank}, YEAR = {2010}, KEYWORDS = {Ontology, Computational Semantics}, PAGES = {215-237}, URL = {https://publications.cnr.it/doc/136473}, BOOKTITLE = {A Way with Words: Recent Advances in Lexical Theory and Analysis}, EDITOR = {De Schryver, G.}, } @INPROCEEDINGS{FRONTINI_2010_INPROCEEDINGS_F_112965, AUTHOR = {Frontini, F.}, TITLE = {Statistical profiling of Italian L2 texts: competence and native language}, YEAR = {2010}, KEYWORDS = {Text categorization}, URL = {https://publications.cnr.it/doc/112965}, CONFERENCE_NAME = {20th Annual Conference of the European Second Language Association}, CONFERENCE_PLACE = {Reggio Emilia}, CONFERENCE_DATE = {2010}, }