@ARTICLE{PEDONESE_2025_ARTICLE_PFOBSFSRCSB_552963, AUTHOR = {Pedonese, G. and Frontini, F. and Ottaviani, R. and Boschetti, F. and Spadi, A. and Francalanci, L. and Scognamiglio, A. and Restaneo, P. and Chaban, A. and Striova, J. and Benassi, L.}, TITLE = {Materiali didattici come oggetti digitali FAIR: una metodologia condivisa per la formazione in H2IOSC}, YEAR = {2025}, ABSTRACT = {Il presente lavoro dettaglia la strategia per lo sviluppo di iniziative di formazione nell’ambito del progetto PNRR sviluppato dal CNR Humanities and cultural Heritage Italian Open Science Cloud (H2IOSC) e mira ad aprire alla comunità italiana di riferimento il processo di applicazione delle linee guida di design e di fruizione di moduli didattici che integrino l’uso delle Infrastrutture di Ricerca. In particolare, il contributo si sofferma sugli standard condivisi per la descrizione dei materiali didattici come oggetti digitali FAIR al fine di massimizzarne il riutilizzo in un’ottica train the trainers e sulla descrizione dei requisiti per l’implementazione dell’infrastruttura di training. Dopo aver descritto la strategia didattica (Sezione 2) e l’applicazione della metodologia FAIR-by-Design di Skills4EOSC ai materiali didattici preesistenti (Sezione 3), il lavoro descrive il processo di ideazione di due piattaforme con funzionalità coerenti ai requisiti degli oggetti didattici prendendo ad esempio il corso CLARIN Introduction to Language Data: Standards and Repositories tradotto e adattato in H2IOSC (Sezione 4)}, KEYWORDS = {formazione, gestione dei dati, infrastrutture di ricerca, H2IOSC, principi FAIR}, PAGES = {361--380}, URL = {https://iris.cnr.it/handle/20.500.14243/552963}, VOLUME = {2025 (20)}, DOI = {10.6092/issn.2532-8816/21190}, ISSN = {2532-8816}, JOURNAL = {UMANISTICA DIGITALE}, } @INCOLLECTION{BUSCEMI_2025_INCOLLECTION_BF_549328, AUTHOR = {Buscemi, F.
and Frontini, F.}, TITLE = {Scienza aperta, dati e infrastrutture}, YEAR = {2025}, ABSTRACT = {Scienza Aperta, Dati e Infrastrutture. Impatto, impegno e prospettive degli Istituti del DSU rispetto a questi grandi temi, nel contesto nazionale ed europeo}, KEYWORDS = {Scienza aperta, infrastrutture di ricerca, open publishing}, PAGES = {34--44}, URL = {https://iris.cnr.it/handle/20.500.14243/549328}, PUBLISHER = {CNR Edizioni (Roma, ITA)}, ISBN = {978}, CONFERENCE_PLACE = {Roma}, BOOKTITLE = {LE SCIENZE UMANE E SOCIALI NEL XXI SECOLO: COMPRENDERE E TRASFORMARE LA SOCIETÀ}, EDITOR = {Filippetti, A. and Sfameni, C. and Antonini, G.}, } @INCOLLECTION{PEDONESE_2025_INCOLLECTION_PFDF_564161, AUTHOR = {Pedonese, G. and Frontini, F. and Del Fante, D. and Federici, E.}, TITLE = {Adapting UPSKILLS Learning Modules to the University Curricula. Best Practices and Lessons Learnt from the H2IOSC Training Experience at the University of Ferrara}, YEAR = {2025}, ABSTRACT = {This paper details the steps taken to adapt and integrate the training materials developed by CLARIN ERIC in two bachelor’s degree courses and one master’s degree course at the University of Ferrara. The workflow applies the shared methodology developed within the Humanities and Heritage Italian Open Science Cloud project. It modifies the training materials of the UPSKILLS course “Introduction to Language Data: Standards and Repositories” according to the needs of three target courses focusing on English to Italian translation: English Language Course for Tourism, English Language for Translation and English Language and Linguistics for Humanities, Arts and Archaeology.
The result of this pilot is a documented example of how CLARIN services can be integrated into university teaching, including initial teacher training, and providing an opportunity to discuss the topic and a use case for trainers who intend to include CLARIN in their courses}, KEYWORDS = {Training, Learning Resources, Language Data, FAIR principles, Research Infrastructures}, PAGES = {37--47}, URL = {https://ecp.ep.liu.se/index.php/clarin/article/view/1236}, DOI = {10.3384/ecp216}, PUBLISHER = {Linköping University Electronic press' conference series (SWE)}, ISBN = {978-91-8118-188-3}, CONFERENCE_PLACE = {SWE}, BOOKTITLE = {Selected papers from the CLARIN Annual Conference 2024}, EDITOR = {Vandeghinste, V. and Kontino, T.}, } @INPROCEEDINGS{CARDILLO_2025_INPROCEEDINGS_CDFACC_562981, AUTHOR = {Cardillo, F. A. and Debole, F. and Frontini, F. and Aelami, M. and Chahinian, N. and Conrad, S.}, TITLE = {Novel benchmark for NER in the wastewater and stormwater domain}, YEAR = {2025}, ABSTRACT = {Efficient wastewater and stormwater management is mandatory for sustainable cities. Extracting structured knowledge from reports and regulations is challenging due to domain-specific terminology and multilingual contexts. This work focuses on domain-specific Named Entity Recognition (NER) as a first step towards effective relation and information extraction to support decision making. A multilingual benchmark is crucial for evaluating these methods. This study develops a French-Italian domain-specific text corpus for wastewater management.
It evaluates state-of-the-art NER methods, including LLM-based approaches, to provide a reliable baseline for future strategies and explores automated annotation projection in view of an extension of the corpus to new languages}, KEYWORDS = {Annotation projection, Domain-specific corpus, LLMs for NER, Multilingual NLP, Named Entity Recognition}, PAGES = {226--231}, URL = {https://ieeexplore.ieee.org/document/11224095}, DOI = {10.1109/cist65886}, PUBLISHER = {Institute of Electrical and Electronics Engineers (USA)}, ISBN = {979-8-3315-4384-6}, CONFERENCE_NAME = {Cist 2025-8th IEEE International Congress on Information Science and Technology}, CONFERENCE_PLACE = {USA}, BOOKTITLE = {Cist 2025 proceedings}, } @INPROCEEDINGS{PEDONESE_2025_INPROCEEDINGS_PFOBSFSRCSB_552965, AUTHOR = {Pedonese, G. and Frontini, F. and Ottaviani, R. and Boschetti, F. and Spadi, A. and Francalanci, L. and Scognamiglio, A. and Restaneo, P. and Chaban, A. and Striova, J. and Benassi, L.}, TITLE = {Dai Materiali Didattici alle Piattaforme FAIR: Costruire un’Infrastruttura di Training in H2IOSC}, YEAR = {2025}, ABSTRACT = {Questo contributo si propone di illustrare la progettazione e lo sviluppo di un’infrastruttura di formazione innovativa per le Scienze Umane e Sociali, basata sui principi FAIR e sulla promozione della Scienza Aperta, nell’ambito del progetto Humanities and cultural Heritage Italian Open Science Cloud (H2IOSC). L’obiettivo principale è la creazione di un ecosistema integrato che renda i materiali didattici facilmente reperibili, accessibili, interoperabili e riutilizzabili. A tal fine, sono state implementate due piattaforme: H2IOSC Virtual Environment, dedicata all’erogazione di corsi e risorse per studenti, e H2IOSC Training Library, un deposito per la conservazione e la condivisione di materiali didattici modulari.
Entrambe le piattaforme si basano sulla metodologia "FAIR-by-Design" raccomandata dal progetto Skills4EOSC, che struttura il processo educativo in sei fasi, garantendo standard elevati di metadatazione e l’uso di formati aperti. Con l’implementazione di queste piattaforme, i cui servizi saranno resi disponibili a un livello di aggregazione più alto nel Marketplace di H2IOSC, il progetto intende favorire un approccio scalabile e sostenibile alla formazione, promuovendo al contempo la collaborazione tra docenti e studenti}, KEYWORDS = {formazione, gestione dei dati, infrastrutture di ricerca, principi FAIR, Scienza Aperta.}, PAGES = {473--477}, URL = {https://iris.cnr.it/handle/20.500.14243/552965}, DOI = {10.6092/unibo/amsacta/8380}, ISBN = {978-88-942535-9-7}, CONFERENCE_NAME = {XIV Convegno Annuale AIUCD 2025}, BOOKTITLE = {Diversità, Equità e Inclusione: Sfide e Opportunità per l’Informatica Umanistica nell’Era dell’Intelligenza Artificiale, Proceedings del XIV Convegno Annuale AIUCD2025}, } @MISC{VANDENHEUVEL_2025_MISC_VDFPV_561742, AUTHOR = {Van Den Heuvel, H. and Draxler, C. and Frontini, F. and Pedonese, G. and Van Der Lek, I.}, TITLE = {Introduzione alla Gestione dei Dati Orali}, YEAR = {2025}, ABSTRACT = {Il corso affronta le tematiche legate alla gestione dei dati linguistici orali. Dopo un'introduzione generale alle possibilità offerte dall'infrastruttura CLARIN ERIC in fase di scoperta, raccolta e deposito di dati orali, si approfondiranno le questioni etico-legali connesse alla raccolta, gestione e conservazione dei dati e il procedimento di trascrizione automatica, con ulteriori possibilità di annotazione attraverso strumenti di trattamento automatico del linguaggio.
Il corso è stato sviluppato con la collaborazione dei docenti della CLARIN Trainers' Network nell'ambito della partecipazione di CLARIN-IT al Progetto H2IOSC-Humanities and cultural Heritage Italian Open Science Cloud finanziato dall’Unione Europea NextGenerationEU – PNRR M4C2 – Codice progetto IR0000029 – CUP B63C22000730005. Il materiale si compone di tre unità: Unità 1-I Dati Linguistici Orali in CLARIN Questa unità fornisce una panoramica delle risorse e dei servizi offerti dall'Infrastruttura di Ricerca CLARIN ERIC a supporto della scoperta, dell'annotazione e del deposito dei dati linguistici orali in accordo con i principi FAIR e le buone pratiche della Scienza Aperta. Unità 2-Raccolta e Gestione dei Dati Orali L'unità propone un'introduzione alle problematiche legate alla gestione dei dati orali dal punto di vista etico e legale. Gli aspetti legati al GDPR e alla normativa italiana di riferimento sono approfonditi in un gioco di ruolo interattivo. Unità 3-Laboratorio di Trascrizione Automatica In questa unità interattiva, saranno affrontate le questioni relative ad alcuni strumenti e i software utili per la trascrizione dei dati. Si ringraziano le ricercatrici e i ricercatori impegnate/i nel progetto PRIN Corpus SIM (Senecta Ipsa Morbus)-Spontaneous speech in healthy ageing per aver attivamente partecipato alle sessioni di didattica del 16 e 17 settembre 2024 presso l'Università di Firenze, da cui è stato tratto il materiale del corso}, KEYWORDS = {Dati orali, Archivi orali, Trascrizione automatica}, DOI = {10.5281/zenodo}, } @ARTICLE{KHAN_2024_ARTICLE_KF_475881, AUTHOR = {Khan, A. F. and Frontini, F.}, TITLE = {Toward a Representation of Semantic Change in Linked Data}, YEAR = {2024}, ABSTRACT = {In this article, we introduce a new framework, the Intensional–Ontological Model (IOM), for representing meaning, and especially for representing semantic change, in linguistic linked data resources.
This framework, which makes use of previous work in the literature on lexical semantics and ontologies, is intended to help clarify what we mean when we model semantic change and to assist in elaborating different ontology patterns for doing so. In this work, we assume a simple architecture, one which is at the basis of the well-known OntoLex-Lemon vocabulary and which consists of one or more lexicons linked to an ontology. Our model, which is based on this architecture and informed by previous work on word senses and ontologies, is intended to provide a clear interpretation for the modelling of both onomasiological and semasiological changes, in both static and dynamic versions. This article describes how the IOM framework represents word meaning as the relationship between a word and an ontological concept in the ’static’ case, demonstrating that the IOM is compatible with OntoLex-Lemon (while at the same time providing a greater level of detail as to the meaning of the ’sense’ and ’reference’ relationships). It then goes on to detail how the IOM can help us understand how to model semantic shifts in linked data lexical resources with a focus on conceptual change and the addition of temporal information to semantic shift data}, KEYWORDS = {linked data, semantic shift, ontologies, lexical semantics}, URL = {https://iris.cnr.it/handle/20.500.14243/475881}, VOLUME = {9 (6)}, DOI = {10.3390/languages9060215}, ISSN = {2226-471X}, JOURNAL = {LANGUAGES}, } @INCOLLECTION{FRONTINI_2024_INCOLLECTION_FRS_475984, AUTHOR = {Frontini, F. and Roth Boll, A. and Seguin, M. S.}, TITLE = {Cartographie d’une aventure Approche numérique du Journal d’un voyage fait aux Indes orientales de Robert Challe}, YEAR = {2024}, ABSTRACT = {L’article propose d’étudier le Journal d’un voyage de Robert Challe grâce aux outils des humanités numériques. Nous avons reconstitué la cartographie de l’aventure challienne et comparé les trajets ainsi restitués avec leur exploitation viatique.
La rencontre des espaces réellement visités avec leur représentation textuelle fait ainsi émerger l’existence d’une forme de géographie mémorielle, affective et poétique, indispensable au travail littéraire de Robert Challe}, KEYWORDS = {Contextualisation, humanités numériques, identification, localisation, marquage, méthodologie, observation, référencement, spatialisation, technologie}, URL = {https://iris.cnr.it/handle/20.500.14243/475984}, DOI = {10.48611/isbn.978-2-406-16757-0.p.0247}, PUBLISHER = {Classiques Garnier}, ISBN = {978-2-406-16757-0}, BOOKTITLE = {Robert Challe et l’aventure}, } @INPROCEEDINGS{GROMANN_2024_INPROCEEDINGS_GGPABBCCFGGGKKLLPORRSSSSSSSTVZZ_475921, AUTHOR = {Gromann, D. and Goncalo Oliveira, H. and Pitarch, L. and Apostol, E. S. and Bernad, J. and Bytyçi, E. and Cantone, C. and Carvalho, S. and Frontini, F. and Garabik, R. and Gracia, J. and Granata, L. and Khan, F. and Knez, T. and Labropoulou, P. and Liebeskind, C. and Pia Di Buono, M. and Ostroški Anić, A. and Rackevičienė, S. and Rodrigues, R. and Sérasset, G. and Selmistraitis, L. and Sidibé, M. and Silvano, P. and Spahiu, B. and Sogutlu, E. and Stanković, R. and Truică, C. O. and Valunaite Oleskeviciene, G. and Zitnik, S. and Zdravkova, K.}, TITLE = {MultiLexBATS: Multilingual Dataset of Lexical Semantic Relations}, YEAR = {2024}, ABSTRACT = {Understanding the relation between the meanings of words is an important part of comprehending natural language. Prior work has either focused on analysing lexical semantic relations in word embeddings or probing pretrained language models (PLMs), with some exceptions. Given the rarity of highly multilingual benchmarks, it is unclear to what extent PLMs capture relational knowledge and are able to transfer it across languages. 
To start addressing this question, we propose MultiLexBATS, a multilingual parallel dataset of lexical semantic relations adapted from BATS in 15 languages including low-resource languages, such as Bambara, Lithuanian, and Albanian. As experiment on cross-lingual transfer of relational knowledge, we test the PLMs' ability to (1) capture analogies across languages, and (2) predict translation targets. We find considerable differences across relation types and languages with a clear preference for hypernymy and antonymy as well as romance languages}, KEYWORDS = {Lexical Semantic Relations, Multilingual Benchmark, BATS}, PAGES = {11783--11793}, URL = {https://aclanthology.org/2024.lrec-main.1029}, PUBLISHER = {ELRA and ICCL}, BOOKTITLE = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)}, } @INPROCEEDINGS{KHAN_2024_INPROCEEDINGS_KSACLMORF_475941, AUTHOR = {Khan, F. and Salgado, A. and Anuradha, I. and Costa, R. and Liyanage, C. and McCrae, J. P. and Ojha, A. K. and Rani, P. and Frontini, F.}, TITLE = {CHAMUÇA: Towards a Linked Data Language Resource of Portuguese Borrowings in Asian Languages}, YEAR = {2024}, ABSTRACT = {This paper presents the development of CHAMUÇA, a novel lexical resource designed to document the influence of the Portuguese language on various Asian languages, with an initial focus on the languages of South Asia. Through the utilization of linked open data and the OntoLex vocabulary, CHAMUÇA offers structured insights into the linguistic characteristics, and cultural ramifications of Portuguese borrowings across multiple languages. The article outlines CHAMUÇA’s potential contributions to the linguistic linked data community, emphasising its role in addressing the scarcity of resources for lesser-resourced languages and serving as a test case for organising etymological data in a queryable format.
CHAMUÇA emerges as an initiative towards the comprehensive catalogization and analysis of Portuguese borrowings, offering valuable insights into language contact dynamics, historical evolution, and cultural exchange in Asia, one that is based on linked data technology}, KEYWORDS = {portuguese, ontolex, language contact, lexicon}, URL = {https://aclanthology.org/2024.ldl-1.6}, PUBLISHER = {ELRA and ICCL (Torino, Italia)}, CONFERENCE_PLACE = {Torino, Italia}, BOOKTITLE = {Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024}, } @INPROCEEDINGS{PEDONESE_2024_INPROCEEDINGS_PFOBSFSRCSB_506821, AUTHOR = {Pedonese, G. and Frontini, F. and Ottaviani, R. and Boschetti, F. and Spadi, A. and Francalanci, L. and Scognamiglio, A. and Restaneo, P. and Chaban, A. and Striova, J. and Benassi, L.}, TITLE = {Materiali didattici come oggetti digitali FAIR: una metodologia condivisa per la formazione in H2IOSC}, YEAR = {2024}, ABSTRACT = {Il presente lavoro dettaglia la strategia per lo sviluppo di iniziative di formazione nell’ambito del progetto H2IOSC e mira a coinvolgere la comunità italiana di riferimento sulle modalità di design e di fruizione di moduli didattici che integrino l’uso delle Infrastrutture di Ricerca. In particolare, il contributo si sofferma sulla descrizione dei requisiti per l’implementazione dell’infrastruttura di training e sugli standard condivisi per la descrizione dei materiali didattici come oggetti digitali FAIR al fine di massimizzarne il riutilizzo in un’ottica train the trainers}, KEYWORDS = {Formazione, training, infrastrutture di ricerca, H2IOSC, principi FAIR.}, PAGES = {577-581}, URL = {https://amsacta.unibo.it/id/eprint/7927/}, DOI = {10.6092/unibo/amsacta/7927}, ISBN = {978-88-942535-8-0}, CONFERENCE_NAME = {XIII Convegno Annuale AIUCD2024}, BOOKTITLE = {Me. Te. Digitali. 
Mediterraneo in rete tra testi e contesti, Proceedings del XIII Convegno Annuale AIUCD2024}, } @TECHREPORT{KALOV_2024_TECHREPORT_KFBLMH_475982, AUTHOR = {Kalová, T. and Frontini, F. and Bracco, L. and Laetitia, D. and Meeus, J. and Hasani Mavriqi, I.}, TITLE = {Data Stewardship Career Paths: Recommendations of the EOSC Task Force Data Stewardship Curricula and Career Paths}, YEAR = {2024}, ABSTRACT = {This document provides an overview of the topic of Data Stewardship Career Paths. Our review and summary of relevant reports and papers, ongoing initiatives, projects and surveys highlight the importance of ensuring more sustainable career paths for Data Stewards. The report further argues the need for further in-depth study and documentation of this topic. In particular, the analysis identifies relevant aspects that should be considered, ranging from employment conditions and salary to scientific recognition and roles. The report provides a list of recommendations and identifies activities that can be taken by the EOSC in the areas of Partnership, Association and Projects, summarised as follows: The EOSC Association and related projects should ensure that the overview of the current situation is kept up-to-date as a reference point. A section on the EOSC Association public website should be dedicated to Data Stewardship initiatives including a dedicated bibliography. To ensure long-term sustainability, a governing body should own and maintain this "inventory" as a point of reference; future projects should be encouraged to use it as a reference and provide input. The EOSC Association should ensure collaboration with international initiatives, in particular, the RDA IG Professionalizing Data Stewardship – TF Career Tracks and ensure coordination among current activities and studies carried out within the various EOSC Horizon Europe projects, and promote and support the organisation of dedicated events. 
The current and future EOSC projects and initiatives in the field should build on the work of this task force, as well as on the results of the studies above, and extend them by: Applying qualitative methods such as guided interviews or focus groups to investigate the aspects covered in section 6 (“Data Stewardship Careers-What Counts”) in more depth; Developing Data Steward Personas based on the proposed methodology detailed in Annex 1. The EOSC Partnership (EOSC Association, European Commission and Steering Board) should establish a permanent Data Stewardship expert group (including representatives from the various existing initiatives and this task force) with the following responsibilities: Develop and implement a monitoring framework that will allow the EOSC and other national and international institutions to support the career paths and development of personnel hired (at least in part) in Data Stewardship roles; Advise the Association and the relevant (ongoing and future) projects and initiatives; Issue recommendations on further activities of the Association regarding Data Stewardship. In consideration of the importance of the key role of Data Stewards in the development and implementation of the EOSC and to facilitate the exchange with and among Data Stewards, a further objective of the EOSC Association should be to assess the need for a professional network for Data Stewards, at least on the European level. The use of an innovative methodology, the Persona workshops, is recommended alongside further surveys and in-depth interviews to explore the relevant aspects of career paths and professional development of Data Stewards, including the roles and responsibilities of employers and Data Stewardship training programmes.
\  Context This work was carried out within the Data Stewardship Curricula and Career Paths EOSC Task Force framework, particularly its Career Paths work stream}, KEYWORDS = {career paths, data stewards, EOSC}, URL = {https://zenodo.org/records/11077722}, DOI = {10.5281/zenodo.11077722}, } @MISC{ERJAVEC_2024_MISC_EKOOAAAAAAAABBBBBBBCCLCDDDDDDDFFFGGGGGGGHIJJJKKKLLLMMMMMMMMMNNNNOPPPPPPPPPQRRRRRRSSSSTTTVVVVVVVVWYZF_483001, AUTHOR = {Erjavec, T. and Kopp, M. and Ogrodniczuk, M. and Osenova, P. and Agerri, R. and Agirrezabal, M. and Agnoloni, T. and Aires, J. and Albini, M. and Alkorta, J. and Antiba Cartazo, I. and Arrieta, E. and Barcala, M. and Bardanca, D. and Barkarson, S. and Bartolini, R. and Battistoni, R. and Bel, N. and Bonet Ramos, M. D. M. and Calzada Pérez, M. and Cardoso, A. and Çöltekin, Ç. and Coole, M. and Darģis, R. and De Does, J. and De Libano, R. and Depoorter, G. and Depuydt, K. and Diwersy, S. and Dodé, R. and Fernandez, K. and Fernández Rei, E. and Frontini, F. and Garcia, M. and García Díaz, N. and García Louzao, P. and Gavriilidou, M. and Gkoumas, D. and Grigorov, I. and Grigorova, V. and Haltrup Hansen, D. and Iruskieta, M. and Jarlbrink, J. and Jelencsik Mátyus, K. and Jongejan, B. and Kahusk, N. and Kirnbauer, M. and Kryvenko, A. and Ligeti Nagy, N. and Ljubešić, N. and Luxardo, G. and Magariños, C. and Magnusson, M. and Marchetti, C. and Marx, M. and Meden, K. and Mendes, A. and Mochtak, M. and Mölder, M. and Montemagni, S. and Navarretta, C. and Nitoń, B. and Norén, F. M. and Nwadukwe, A. and Ojsteršek, M. and Pančur, A. and Papavassiliou, V. and Pereira, R. and Pérez Lago, M. and Piperidis, S. and Pirker, H. and Pisani, M. and Pol, H. V. D. and Prokopidis, P. and Quochi, V. and Rayson, P. and Regueira, X. L. and Rii, A. and Rudolf, M. and Ruisi, M. and Rupnik, P. and Schopper, D. and Simov, K. and Sinikallio, L. and Skubic, J. and Tamper, M. and Tungland, L. M. and Tuominen, J. and Van Heusden, R. and Varga, Z. and Vázquez Abuín, M. 
and Venturi, G. and Vidal Miguéns, A. and Vider, K. and Vivel Couso, A. and Vladu, A. I. and Wissik, T. and Yrjänäinen, V. and Zevallos, R. and Fišer, D.}, TITLE = {Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint. ana 4. 1}, YEAR = {2024}, ABSTRACT = {ParlaMint 4. 1 is a set of comparable corpora containing transcriptions of parliamentary debates of 29 European countries and autonomous regions, mostly starting in 2015 and extending to mid-2022. The individual corpora comprise between 9 and 126 million words and the complete set contains over 1. 2 billion words. The transcriptions are divided by days with information on the term, session and meeting, and contain speeches marked by the speaker and their role (e. g. chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. The corpora have extensive metadata, most importantly on speakers (name, gender, MP and minister status, party affiliation), on their political parties and parliamentary groups (name, coalition/opposition status, Wikipedia-sourced left-to-right political orientation, and CHES variables, https: //www. chesdata. eu/). Note that some corpora have further metadata, e. g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The transcriptions are also marked with the subcorpora they belong to ("reference", until 2020-01-30, "covid", from 2020-01-31, and "war", from 2022-02-24). An overview of the statistics of the corpora is avaialable on GitHub in the folder Build/Metadata, in particular for the release 4. 1 at https: //github. com/clarin-eric/ParlaMint/tree/v4. 1/Build/Metadata. The corpora are encoded according to the ParlaMint encoding guidelines (https: //clarin-eric. github. io/ParlaMint/) and schemas (included in the distribution). The ParlaMint. 
ana linguistic annotation includes tokenization; sentence segmentation; lemmatisation; Universal Dependencies part-of-speech, morphological features, and syntactic dependencies; and the 4-class CoNLL-2003 named entities. Some corpora also have further linguistic annotations, in particular PoS tagging according a language-specific scheme, with their corpus TEI headers giving further details on the annotation vocabularies and tools used. This entry contains the ParlaMint. ana TEI-encoded linguistically annotated corpora; the derived CoNLL-U files along with TSV metadata of the speeches; and the derived vertical files (with their registry file), suitable for use with CQP-based concordancers, such as CWB, noSketch Engine or KonText. Also included is the 4. 1 release of the sample data and scripts available at the GitHub repository of the ParlaMint project at https: //github. com/clarin-eric/ParlaMint and the log files produced in the process of building the corpora for this release. The log files show e. g. known errors in the corpora, while more information about known problems is available in the open issues at the GitHub repository of the project. This entry contains the linguistically marked-up version of the corpus, while the text version, i. e. without the linguistic annotation is also available at http: //hdl. handle. net/11356/1912. Another related resource, namely the ParlaMint corpora machine translated to English ParlaMint-en. ana 4. 1 can be found at http: //hdl. handle. net/11356/1910. As opposed to the previous version 4. 0, this version fixes a number of bugs and restructures the ParlaMint GitHub repository. The DK corpus has been linguistically re-annotated to remove bugs, while its speeches are now also marked with topics. The PT corpus has been extended to 2024-03 and the UA corpus to 2023-11, which also has improved language marking (uk vs. 
ru) on segments}, KEYWORDS = {ParlaCLARIN, linguistic annotation, pos-tagging, Named Entity Recognition, linguistic dependency annotation, UD}, URL = {https://iris.cnr.it/handle/20.500.14243/483001}, } @MISC{PEDONESE_2024_MISC_PKMFQS_561741, AUTHOR = {Pedonese, G. and Khan, A. F. and Mallia, M. and Frontini, F. and Quochi, V. and Squadrito, E.}, TITLE = {Linguistic Linked Open Data for Humanists}, YEAR = {2024}, ABSTRACT = {Having achieved popularity as a way of publishing and accessing data in different fields of the sciences and for sharing large encyclopaedic datasets such as DBpedia (derived from Wikipedia), linked data is becoming more and more popular in different areas of the humanities. In this course we will present a comprehensive introduction to the creation, publication, and use of linked open data for anyone who wants to work with linguistic datasets – such as lexicons and corpora – and especially for those who come from a linguistic or humanist background. We will look at the basics of linked data and the Semantic Web and introduce the various different standards technologies that make up the Semantic Web stack before focusing on the particular case of linked data language resources. During the course we will study the most important tools, vocabularies, and resources available in the Semantic Web and provide hands-on training for the creation and querying of linguistic linked data. We will look at how Semantic Web technologies can contribute to the creation of FAIR language resources as well as how to publish your resource on the linked open data cloud. We will also show how the Semantic Web query language SPARQL can be a powerful tool for data exploration}, KEYWORDS = {Linked Open Data, Linguistics}, DOI = {10.5281/zenodo}, } @MISC{VANDERLEK_2024_MISC_VFFP_561743, AUTHOR = {Van Der Lek, I. and Fišer, D. and Frontini, F. 
and Pedonese, G.}, TITLE = {Introduzione ai Dati Linguistici: Standard e Archivi Digitali}, YEAR = {2024}, ABSTRACT = {Il corso "Introduzione ai Dati Linguistici: Standard e Archivi Digitali" introduce gli insegnanti e gli studenti all'uso degli archivi digitali di dati della ricerca e al loro ruolo nel ciclo di vita dei dati linguistici nel contesto dei principi FAIR e delle buone pratiche della Scienza Aperta. I materiali del corso sono suddivisi in unità e sono intesi come contenuti didattici per i docenti che insegnano a livello di laurea triennale o laurea magistrale, che sono invitati a sfogliare i materiali, esportarli per l'uso nel Learning Management System della loro istituzione e adattarli ai propri scopi come ritengono opportuno. Questo corso traduce in italiano e aggiorna i materiali di: van der Lek, Iulianna; Fišer, Darja. (2023). Introduction to Language Data: Standards and Repositories. In UPSKILLS Learning Content. https://upskillsproject.eu/project/standards_repositories/. CC BY 4.0. https://creativecommons.org/licenses/by/4.0/ L'adattamento si è svolto nell'ambito del progetto Humanities and cultural Heritage Italian Open Science Cloud (https://www.h2iosc.cnr.it/), Work Package 8 "Training, Capacity Building, Engagement", a cura del personale CNR-ILC dedicato all'Attività 8.2 "Teach CLARIN, Teach with CLARIN". Progetto H2IOSC-Humanities and cultural Heritage Italian Open Science Cloud finanziato dall’Unione Europea NextGenerationEU – PNRR M4C2 – Codice progetto IR0000029 – CUP B63C22000730005}, KEYWORDS = {Dati Linguistici, Gestione dati}, DOI = {10.5281/zenodo}, } @ARTICLE{BERENIKEHERRMANN_2023_ARTICLE_BBFJPRRS_476001, AUTHOR = {Berenike Herrmann, J. and Bories, A. S. and Frontini, F. and Jacquot, C. and Pielström, S. and Rebora, S. and Rockwell, G. and Sinclair, S.}, TITLE = {Tool criticism in practice.
On methods, tools and aims of computational literary studies}, YEAR = {2023}, ABSTRACT = {This paper is a case-driven contribution to the discussion on the method-theory relationship in practices within the field of Computational Literary Studies (CLS). Progress in this field dedicated to the computational analysis of literary texts has long revolved around the new, digital tools: tools, as computational devices for analysis, have had here a comparatively strong status as research entities of their own, while their ontological status has remained unclear to the day. As a rule, they have widely been imported from the fields of data science and NLP, while less often being hand-tailored to specific tasks within interdisciplinary settings. Although studies within CLS are evolving to both a higher degree of specialization in method (going beyond the limitations of out-of-the-box tools) and a stronger theoretical modeling, the technological dimension remains a defining factor. An unreflective adoption of technology in the shape of tools can compromise the plausibility and the reproducibility of the results produced using these tools. Our paper presents a multi-faceted intervention to the discussion around tools, methods, and the research questions that are answered with them. It presents research perspectives first conceived at the ADHO SIG-DLS workshop Anatomy of tools: A closer look at textual DH methodologies that took place in Utrecht in July 2019. At that event, the authors discussed selected case studies to address tool criticism from several angles. Our goal was to leverage a tool-critical perspective, in order to “take stock, reflect upon and critically comment upon our own practices” within CLS. We identified Textométrie, Stylometry, and Semantic Text Mining as three central types of hands-on CLS. For each of these sub-fields, we asked: What are our tools and methods-in-use? 
What are the implications of using a tool-oriented perspective as opposed to a methodology-oriented one? How do either relate to research questions and theory? These questions were explored by case-studies on an exemplary basis. The unifying perspective of this paper is an applied tool criticism – a critical inquiry leveraged towards crucial dimensions of CLS practices. Here we re-compose the original oral papers and add entirely new sections to it, to create a useful overview of the issue through a combination of perspectives. While we elaborated the thematic connections between the individual case studies, we hope the interactive spirit of an exemplary exchange remains palpable: individual research perspectives shape the case studies reported for Textométrie, Stylometry and Semantic Text Mining, are complemented by further studies showcasing CLS-specific perspectives on replicability and domain-specific research, and a short section discussing a tool inventory as a practical, community-based incarnation of tool criticism. The article reflects thus a rich array of perspectives on tool criticism, including the complementary perspective of tool defense – arguing that we need tools and methods as a basic common ground on how to carry out fundamental operations of analysis and interpretation within a community}, KEYWORDS = {tool criticism, digital literary studies, digital humanities}, URL = {https://www.digitalhumanities.org/dhq/vol/17/2/000687/000687.html}, VOLUME = {017 (2)}, ISSN = {1938-4122}, JOURNAL = {DIGITAL HUMANITIES QUARTERLY}, } @ARTICLE{BRANCO_2023_ARTICLE_BEFHHJKKLNPPPSSTWZ_475961, AUTHOR = {Branco, A. and Eskevich, M. and Frontini, F. and Hajic, J. and Hinrichs, E. and Jong, F. and Kamocki, P. and Konig, A. and Linden, K. and Navarretta, C. and Piasecki, M. and Piperidis, S. and Pitkanen, O. and Simov, K. and Skadina, I. and Trippel, T. and Witt, A. 
and Zinn, C.}, TITLE = {The CLARIN infrastructure as an interoperable language technology platform for SSH and beyond}, YEAR = {2023}, ABSTRACT = {CLARIN is a European Research Infrastructure Consortium developing and providing a federated and interoperable platform to support scientists in the field of the Social Sciences and Humanities in carrying-out language-related research. This contribution provides an overview of the entire infrastructure with a particular focus on tool interoperability, ease of access to research data, tools and services, the importance of sharing knowledge within and across (national) communities, and community building. By taking into account FAIR principles from the very beginning, CLARIN succeeded in becoming a successful example of a research infrastructure that is actively used by its members. The benefits CLARIN members reap from their infrastructure secure a future for their common good that is both sustainable and attractive to partners beyond the original target groups}, KEYWORDS = {Interoperability, Language resources, Language technology, Research infrastructure, Social sciences and humanities}, URL = {https://iris.cnr.it/handle/20.500.14243/475961}, DOI = {10.1007/s10579-023-09658-z}, ISSN = {1574-020X}, JOURNAL = {LANGUAGE RESOURCES AND EVALUATION}, } @INCOLLECTION{CAITIRUSSO_2023_INCOLLECTION_CF_469901, AUTHOR = {Caiti Russo, G. and Frontini, F.}, TITLE = {Migrazione di testi e di codici manoscritti: risorse digitali per la ricostruzione dell'Occitania medievale}, YEAR = {2023}, ABSTRACT = {This paper offers an overview of the current panorama of digital language resources and approaches to the study of Medieval Occitan. Starting from a scattered tradition of witnesses, Occitan manuscripts are now being digitised in various projects, but not enough has been done to adopt interoperable and shared practices. 
By drawing from other philological traditions, and in particular the Digital Classics, we trace an inventory of best practices, tools, standards and formats that Digital Occitan Studies needs to develop in order to offer scholars access to Virtual Research environments of linked and interconnected language resources}, KEYWORDS = {Occitan, Digital Philology, Digital Editions, Virtual Research Environments, Occitano, filologia digitale, edizioni digitali, ambienti virtuali di ricerca}, PAGES = {537-568}, URL = {https://iris.cnr.it/handle/20.500.14243/469901}, DOI = {10.7410/1678}, PUBLISHER = {Casalini-ISEM-Istituto di Storia dell'Europa Mediterranea (Cagliari)}, ISBN = {978-88-97317-81-4}, CONFERENCE_PLACE = {Cagliari}, BOOKTITLE = {Storie di idee nell'Europa mediterranea: trasmissione di parole e saperi nel Medioevo e nella prima età moderna}, } @INCOLLECTION{DRENGUBIAK_2023_INCOLLECTION_DHDF_476003, AUTHOR = {Drengubiak, J. and Hirsch, F. and Didirková, I. and Frontini, F.}, TITLE = {The perception of voice qualities in audiobooks in the context of teaching French as a second language}, YEAR = {2023}, ABSTRACT = {Audiobooks are a common part of everyday life and their possibilities are also used when learning a foreign language. The article comprehensively addresses the issue of audiobooks as a variation of the Listening comprehension activity on a sample of secondary and university students. In addition to their experience with audiobooks, their attitude to listening comprehension, it examines the preferences regarding specific characteristics of the voices of the narrators. The students first determined the perceived characteristic on a scale of 1-4 in the following categories: pitch (high-low), speed (slow-fast), melodicity (monotonous-melodious), articulation (comprehensible-incomprehensible). Later, subjective ratings were assigned to these categories on a scale from strong like to strong dislike. 
The preference research was conducted on an excerpt of Grand Meaulnes by A. Fournier, which was recorded by two professional and two amateur narrators}, URL = {https://www.pulib.sk/web/pdf/web/viewer.html?file=/web/kniznica/elpub/dokument/Drengubiak4/subor/9788055531786.pdf}, ISBN = {978-80-555-3178-6}, BOOKTITLE = {Literatúra vo výučbe – vyučovať literatúru; La littérature dans l'enseignement-Enseigner la littérature; Literature in teaching-Teaching literature}, } @INCOLLECTION{MONACHINI_2023_INCOLLECTION_MF_475983, AUTHOR = {Monachini, M. and Frontini, F.}, TITLE = {Infrastrutture digitali per le scienze umane e sociali}, YEAR = {2023}, ABSTRACT = {Questo capitolo esplora il ruolo delle infrastrutture di ricerca (IR) nel promuovere la collaborazione interdisciplinare e l’innovazione tecnologica nell’ambito delle Digital Humanities. Le IR, come CLARIN e DARIAH, rappresentano nodi cruciali per l’accesso e la condivisione di risorse linguistiche, tecnologie e dati. Attraverso una panoramica dei servizi offerti, tra cui l’archiviazione, l’accesso federato e le applicazioni web, viene evidenziata la loro importanza per garantire principi FAIR (Findable, Accessible, Interoperable, Reusable) e supportare la ricerca umanistica e sociale in Europa. Il lavoro sottolinea l’impatto strategico delle IR nel favorire la condivisione delle conoscenze e l’integrazione delle risorse a livello transnazionale, contribuendo alla costruzione di ecosistemi di ricerca avanzati}, KEYWORDS = {Infrastrutture di ricerca, Digital Humanities, Principi FAIR}, PAGES = {197-213}, URL = {https://iris.cnr.it/handle/20.500.14243/475983}, VOLUME = {DIGITAL HUMANITIES. METODI, STRUMENTI, SAPERI}, PUBLISHER = {Carocci Editore}, ISBN = {978-88-290-1843-7}, BOOKTITLE = {Digital (Humanities). (Metodi), strumenti, saperi}, } @INPROCEEDINGS{BOSCHETTI_2023_INPROCEEDINGS_BDDFM_504563, AUTHOR = {Boschetti, F. and Del Grosso, A. M. and Del Gratta, R. and Frontini, F. 
and Monachini, M.}, TITLE = {CLARIN-IT: texts, documents and new contexts}, YEAR = {2023}, ABSTRACT = {In recent years, CLARIN has increasingly broadened its interest from linguistic resources to textual resources relevant to digital humanists. This new and attractive scenario requires new technologies for texts, variants, and digital representations of primary sources, their contexts, and complex relationships. VeDPH in Venice, CNR-ILC-CoPhiLab, and ILC4CLARIN in Pisa collaborate on DH projects. Together, they are working on extracting text from manuscript page images, annotating historical graffiti on georeferenced images, and identifying text in digital images of paintings and sculptures}, KEYWORDS = {Research Infrastructure}, PAGES = {53-56}, URL = {https://iris.cnr.it/handle/20.500.14243/504563}, CONFERENCE_NAME = {CLARIN Annual Conference Proceedings 2023}, BOOKTITLE = {CLARIN Annual Conference Proceedings 2023}, } @INPROCEEDINGS{FRONTINI_2023_INPROCEEDINGS_FRK_476002, AUTHOR = {Frontini, F. and Romary, L. and Khan, A. F. A.}, TITLE = {ISO LMF 24613-6: A Revised Syntax Semantics Module for the Lexical Markup Framework}, YEAR = {2023}, ABSTRACT = {The Lexical Markup Framework (LMF) is a meta-model for representing data in monolingual and multilingual lexical databases with a view to its use in computer applications. The "new LMF" replaces the old LMF standard, ISO 24613: 2008, and is being published as a multi-part standard. This short paper introduces one of these new parts, ISO 24613-6, namely the Syntax and Semantics (SynSem) module. The SynSem module allows for the description of syntactic and semantic properties of lexemes, as well as the complex interactions between them. 
While the new standard remains faithful to (and backwards compatible with) the syntax and semantics coverage of the previous model, the new standard clarifies and simplifies it in a few places, which will be illustrated}, KEYWORDS = {ISO, LMF, TEI, Semantics, Syntax}, URL = {https://inria.hal.science/hal-04117132}, BOOKTITLE = {Proceedings of LDK 2023 – 4th Conference on Language, Data and Knowledge}, } @INPROCEEDINGS{KHAN_2023_INPROCEEDINGS_KCCDFJ_475981, AUTHOR = {Khan, A. F. A. and Cavallaro, M. and Cruz González, R. and Díaz Vera, J. and Frontini, F. and Minaya Gómez, F. J.}, TITLE = {Constructing an Old English WordNet: The Case of Guilt}, YEAR = {2023}, PAGES = {122--124}, URL = {https://iris.unive.it/retrieve/0f226d38-e332-418b-9b14-d5558d1a0d9d/AIUCD2023.pdf}, BOOKTITLE = {La Memoria Digitale. Forme Del Testo e Organizzazione Della Conoscenza. Atti Del XII Convegno Annuale AIUCD}, } @MISC{FRONTINI_2023_MISC_F_456225, AUTHOR = {Frontini, F.}, TITLE = {Words and the Company they Keep: Digital corpora and infrastructures for the foreign language classroom}, YEAR = {2023}, ABSTRACT = {We give an overview of corpora \& language technologies and their use in foreign language teaching}, KEYWORDS = {corpora, didattica L2, tecnologie del linguaggio}, URL = {https://iris.cnr.it/handle/20.500.14243/456225}, CONFERENCE_NAME = {Didattica della lingua, della cultura e cittadinanza attiva: sfide educative contemporanee-Seminari LEND Modena}, } @INCOLLECTION{DEJONG_2022_INCOLLECTION_DVFVFW_446352, AUTHOR = {De Jong, F. and Van Uytvanck, D. and Frontini, F. and Van Den Bosch, A. and Fišer, D. and Witt, A.}, TITLE = {Language Matters. The European Research Infrastructure CLARIN, Today and Tomorrow}, YEAR = {2022}, ABSTRACT = {CLARIN stands for "Common Language Resources and Technology Infrastructure".
In 2012 CLARIN ERIC was established as a legal entity with the mission to create and maintain a digital infrastructure to support the sharing, use, and sustainability of language data (in written, spoken, or multimodal form) available through repositories from all over Europe, in support of research in the humanities and social sciences and beyond. Since 2016 CLARIN has had the status of Landmark research infrastructure and currently it provides easy and sustainable access to digital language data and also offers advanced tools to discover, explore, exploit, annotate, analyse, or combine such datasets, wherever they are located. This is enabled through a networked federation of centres: language data repositories, service centres, and knowledge centres with single sign-on access for all members of the academic community in all participating countries. In addition, CLARIN offers open access facilities for other interested communities of use, both inside and outside of academia. Tools and data from different centres are interoperable, so that data collections can be combined and tools from different sources can be chained to perform operations at different levels of complexity. The strategic agenda adopted by CLARIN and the activities undertaken are rooted in a strong commitment to the Open Science paradigm and the FAIR data principles. 
This also enables CLARIN to express its added value for the European Research Area and to act as a key driver of innovation and contributor to the increasing number of industry programmes running on data-driven processes and the digitalization of society at large}, KEYWORDS = {research infrastructure, language resources, service interoperability, innovation, SSH, language technology, open science}, PAGES = {31-58}, URL = {https://www.degruyter.com/document/doi/10.1515/9783110767377-002/html}, DOI = {10.1515/9783110767377-002}, PUBLISHER = {Walter De Gruyter Inc (Boston/Berlin/Munich, USA)}, ISBN = {978-3-11-076737-7}, CONFERENCE_PLACE = {Boston/Berlin/Munich}, BOOKTITLE = {CLARIN: The Infrastructure for Language Resources}, } @INCOLLECTION{DELFANTE_2022_INCOLLECTION_DFMQ_419162, AUTHOR = {Del Fante, D. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {Italian Language Resources. From CLARIN-IT to the VLO and Back: Sketching a Methodology for Monitoring LRs Visibility}, YEAR = {2022}, ABSTRACT = {This paper sketches a user-oriented, qualitative methodology for both (i) monitoring the existence and availability of language resources relevant for a given CLARIN national community and language and (ii) assessing the offering potential of CLARIN, in terms of Language Resources provided to national consortia. From the user perspective, the methodology has been applied to investigate the visibility of language resources available for Italian within the CLARIN central services, in particular the Virtual Language Observatory. As a proof-of-concept, the methodology has been tested on the resources available through the CLARIN-IT data centres, but, ideally, it could be applied by any national data centre aiming to assess the existence of LRs in CLARIN for any given languages and check their accessibility for the interested users. 
It is thus argued that such an assessment might be a useful instrument in the hands of national coordinators and centre managers for (i) bringing to the fore both strengths and critical issues about their data providing community and (ii) for planning targeted actions to improve and increase both visibility and accessibility of their LRs}, KEYWORDS = {Virtual Language Observatory, CLARIN-IT, CLARIN-ERIC, Qualitative Assessment Methodology, User Involvement}, PAGES = {10--22}, URL = {https://ecp.ep.liu.se/index.php/clarin/article/view/413/371}, DOI = {10.3384/9789179294441}, ISBN = {978-91-7929-444-1}, BOOKTITLE = {Selected Papers from the CLARIN Annual Conference 2021}, } @INCOLLECTION{MENANT_2022_INCOLLECTION_MGFFMMC_412939, AUTHOR = {Menant, Geneviève and Frontini, F. and Fujiwara, Mami and Martin, Christophe}, TITLE = {Approches numériques des questions d'auctorialité. Le corpus Challe}, YEAR = {2022}, ABSTRACT = {La contribution se concentre sur l'application d'approches textométriques et d'identification d'auteur à l'oeuvre de Robert Challe}, KEYWORDS = {Robert Challe, attribution d'auteur, textométrie}, PAGES = {167--192}, URL = {https://iris.cnr.it/handle/20.500.14243/412939}, DOI = {10.48611/isbn.978-2-406-13347-6.p.0167}, PUBLISHER = {Editions Classiques Garnier (Paris, FRA)}, ISBN = {978-2-406-13347-6}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Observer la vie littéraire. Études littéraires et numériques}, EDITOR = {Alexandre, D. and Roe, G.}, } @INPROCEEDINGS{AGNOLONI_2022_INPROCEEDINGS_ABFMMQRV_446358, AUTHOR = {Agnoloni, T. and Bartolini, R. and Frontini, F. and Montemagni, S. and Marchetti, C. and Quochi, V. and Ruisi, M.
and Venturi, G.}, TITLE = {Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus}, YEAR = {2022}, ABSTRACT = {This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 pandemic emergency period and a former period for reference and comparison according to the CLARIN ParlaMint prescriptions. The corpus contains 1199 sessions and 79, 373 speeches for a total of about 31 million words, and was encoded according to the ParlaCLARIN TEI XML format. It includes extensive metadata about the speakers, sessions, political parties and parliamentary groups. As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity annotation and classification is also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision. The Italian dataset is freely available as part of the larger ParlaMint 2. 1 corpus deposited and archived in CLARIN repository together with all other national corpora. 
It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes}, KEYWORDS = {parliamentary debates, CLARIN ParlaMint, corpus creation, corpus annotation}, PAGES = {117-124}, URL = {https://aclanthology.org/2022.parlaclarin-1.17/}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-85-6}, CONFERENCE_NAME = {Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of The Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference}, } @INPROCEEDINGS{DELFANTE_2022_INPROCEEDINGS_DFMQ_416549, AUTHOR = {Del Fante, D. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {CLARIN-IT: An Overview on the Italian Clarin Consortium After Six Years of Activity}, YEAR = {2022}, ABSTRACT = {This paper offers an overview of the Italian CLARIN consortium after six years since its establishment. The members, the centres and the repositories and the most important collections are described. Lastly, in order to showcase the visibility and the accessiblity of Language Resources provided by CLARIN-IT from a user-perspective, we show how Italian resources are findable within CLARIN ERI}, KEYWORDS = {Language Resources, Data Repositories and Archives, Research Infrastructures, CLARIN}, PAGES = {8}, URL = {http://ceur-ws.org/Vol-3160/short21.pdf}, PUBLISHER = {CEUR-WS. org (Aachen, DEU)}, CONFERENCE_NAME = {Italian Research Conference on Digital Libraries}, CONFERENCE_PLACE = {Aachen}, BOOKTITLE = {Proceedings of the 18th Italian Research Conference on Digital Libraries}, EDITOR = {Di Nunzio, G. M. and Portelli, B. and Redavid, D. and Silvello, G.}, } @INPROCEEDINGS{GAMBA_2022_INPROCEEDINGS_GFBM_446356, AUTHOR = {Gamba, F. and Frontini, F. and Broeder, D. 
and Monachini, M.}, TITLE = {Language Technologies for the Creation of Multilingual Terminologies. Lessons Learned from the SSHOC Project}, YEAR = {2022}, ABSTRACT = {This paper is framed in the context of the SSHOC project and aims at exploring how Language Technologies can help in promoting and facilitating multilingualism in the Social Sciences and Humanities (SSH). Although most SSH researchers produce culturally and societally relevant work in their local languages, metadata and vocabularies used in the SSH domain to describe and index research data are currently mostly in English. We thus investigate Natural Language Processing and Machine Translation approaches in view of providing resources and tools to foster multilingual access and discovery to SSH content across different languages. As case studies, we create and deliver as freely, openly available data a set of multilingual metadata concepts and an automatically extracted multilingual Data Stewardship terminology. The two case studies allow as well to evaluate performances of state-of-the-art tools and to derive a set of recommendations as to how best apply them. Although not adapted to the specific domain, the employed tools prove to be a valid asset to translation tasks. Nonetheless, validation of results by domain experts proficient in the language is an unavoidable phase of the whole workflow}, KEYWORDS = {language resource infrastructures, Multilingual terminologies, data curation}, PAGES = {154-163}, URL = {https://aclanthology.org/2022.lrec-1.17}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-72-6}, CONFERENCE_NAME = {13th Conference on Language Resources and Evaluation (LREC 2022)}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of the 13th Language Resources and Evaluation Conference}, } @INPROCEEDINGS{HIRSCH_2022_INPROCEEDINGS_HFDD_419303, AUTHOR = {Hirsch, F. and Frontini, F. and Didirková, I. 
and Drengubiak, J.}, TITLE = {Esthétique de la voix dans les livres audio en langue française}, YEAR = {2022}, ABSTRACT = {Cette recherche vise à étudier les préférences des auditeurs concernant les voix des livres audio. Des échantillons de 8 voix masculines et 7 voix féminines ont été extraits de différents livres audio et analysés. Une enquête a été réalisée pour obtenir le point de vue de 69 auditeurs en répondant à des questions sur les caractéristiques vocales. Les résultats montrent que les choix des participants dépendent du genre littéraire. En effet, les voix masculines sont préférées pour les romans de science-fiction et les voix féminines pour la littérature pour enfants et les romans contemporains. Néanmoins, les autres genres littéraires testés ne correspondent pas à une voix spécifique. Concernant le débit, une préférence a été notée pour des essais lus avec un débit de parole plus lent, alors que les auditeurs préfèrent un débit de parole plus rapide pour les romans érotiques}, KEYWORDS = {audiobooks, voice esthetics, speech}, URL = {https://doi.org/10.1051/shsconf/202213808004}, DOI = {10.1051/shsconf/202213808004}, CONFERENCE_NAME = {8e Congrès Mondial de Linguistique Française}, BOOKTITLE = {138}, } @TECHREPORT{MARTELLI_2022_TECHREPORT_MMCNVUFQKKLDTTCSKIDGM_412365, AUTHOR = {Martelli, F. and Maru, M. and Campagnano, C. and Navigli, R. and Velardi, P. and Ureñaruiz, R. and Frontini, F. and Quochi, V. and Kallas, J. and Koppel, K. and Langemets, M. and De Does, J. and Tempelaars, R. and Tiberius, C. and Costa, R. and Salgado, A. and Krek, S. and Ibej, J. and Dobrovoljc, K. and Gantar, P. and Munda, T.}, TITLE = {D3. 8 Lexical-semantic analytics for NLP}, YEAR = {2022}, ABSTRACT = {The present document illustrates the work carried out in task 3. 3 (work package 3) focused on lexicalsemantic analytics for Natural Language Processing (NLP). 
This task aims at computing analytics for lexical-semantic information such as words, senses and domains in the available resources, investigating their role in NLP applications. Specifically, this task concentrates on three research directions, namely i) sense clustering, in which grouping senses based on their semantic similarity improves the performance of NLP tasks such as Word Sense Disambiguation (WSD), ii) domain labeling of text, in which the lexicographic resources made available by the ELEXIS project for research purposes allow better performances to be achieved, and finally iii) analysing the diachronic distribution of senses, for which a software package is made available. In this deliverable, we illustrate the research activities aimed at achieving the aforementioned goals and put forward suggestions for future works. Importantly, we stress the crucial role played by high-quality lexical-semantic resources when investigating such linguistic aspects and their impact on NLP applications. To this end, as an additional contribution, we address the paucity of manually annotated data in the lexical-semantic research field and introduce the ELEXIS parallel sense-annotated dataset, a novel entirely manually annotated dataset available in 10 European languages and featuring 5 annotation layers}, KEYWORDS = {research infrastructures, lexicography, lexical resources, word-sense disambiguation, WSD, sense-annotated language data, multilinguality}, PAGES = {67}, URL = {https://elex.is/wp-content/uploads/ELEXIS_D3_8_Lexical-Semantic_Analytics_for_NLP_final_report.pdf}, } @TECHREPORT{TASOVAC_2022_TECHREPORT_TTBBBCUFHHMKKKKMMMMMQARSSVWWZ_446092, AUTHOR = {Tasovac, T. and Tiberius, C. and Bamberg, C. and Bellandi, A. and Burch, T. and Costa, R. and Ďurčo, M. and Frontini, F. and Hennemann, J. and Heylen, K. and Jakubíček, M. and Khan, F. and Klee, A. and Kosem, I. and Kovář, V. and Matuška, O. and McCrae, J. and Monachini, M. and Mörth, K. and Munda, T. and Quochi, V.
and Andrarepar and Roche, C. and Salgado, A. and Sievers, H. and Váradi, T. and Weyand, S. and Woldrich, A. and Zhanial, S.}, TITLE = {D5. 3 Overview of Online Tutorials and Instruction Manuals}, YEAR = {2022}, ABSTRACT = {The ELEXIS Curriculum is an integrated set of training materials which contextualizes ELEXIS tools and services inside a broader, systematic pedagogic narrative. This means that the goal of the ELEXIS Curriculum is not simply to inform users about the functionalities of particular tools and services developed within the project, but to show how such tools and services are a) embedded in both lexicographic theory and practice; and b) representative of and contributing to the development of digital skills among lexicographers. The scope and rationale of the curriculum are described in more detail in the Deliverable D5. 2 Guidelines for Producing ELEXIS Tutorials and Instruction Manuals. The goal of this deliverable, as stated in the project DOW, is to provide "a clear, structured overview of tutorials and instruction manuals developed within the project. "}, KEYWORDS = {ELEXIS, lexicography, training materials}, PAGES = {31}, URL = {https://elex.is/wp-content/uploads/ELEXIS_D5_3_Overview-of-Online-Tutorials-and-Instruction-Manuals.pdf}, } @MISC{FRONTINI_2022_MISC_FBQMMZUW_441101, AUTHOR = {Frontini, F. and Bellandi, A. and Quochi, V. and Monachini, M. and Mörth, K. and Zhanial, S. and Ďurčo, M. and Woldrich, A.}, TITLE = {CLARIN Tools and Resources for Lexicographic Work}, YEAR = {2022}, ABSTRACT = {This course introduces lexicographers to the CLARIN Research Infrastructure and highlights language resources and tools useful for lexicographic practices. The course consists of two parts. In Part 1, you will learn about CLARIN, its technical and knowledge infrastructure, and about how to deposit and find lexical resources in CLARIN. 
In Part 2, you will become acquainted with CLARIN tools that can be used to create lexical resources}, KEYWORDS = {CLARIN, lexicography}, URL = {https://elexis.humanistika.org/id/UnwYPq70Dewbn7XDEjsMM}, } @MISC{MARTELLI_2022_MISC_MNKKGKNSOLKKDUSLVGLQMFTTCSIM_446359, AUTHOR = {Martelli, F. and Navigli, R. and Krek, S. and Kallas, J. and Gantar, P. and Koeva, S. and Nimb, S. and Sandford Pedersen, B. and Olsen, S. and Langemets, M. and Koppel, K. and Üksik, T. and Dobrovoljc, K. and Ureñaruiz, R. and Sanchosánchez, J. and Lipp, V. and Váradi, T. and Gyrffy, A. and László, S. and Quochi, V. and Monachini, M. and Frontini, F. and Tiberius, C. and Tempelaars, R. and Costa, R. and Salgado, A. and Ibej, J. and Munda, T.}, TITLE = {Parallel sense-annotated corpus ELEXIS-WSD 1. 0}, YEAR = {2022}, ABSTRACT = {ELEXIS-WSD is a parallel sense-annotated corpus in which content words (nouns, adjectives, verbs, and adverbs) have been assigned senses. Version 1. 0 contains sentences for 10 languages: Bulgarian, Danish, English, Spanish, Estonian, Hungarian, Italian, Dutch, Portuguese, and Slovene. The corpus was compiled by automatically extracting a set of sentences from WikiMatrix (Schwenk et al., 2019), a large open-access collection of parallel sentences derived from Wikipedia, using an automatic approach based on multilingual sentence embeddings. The sentences were manually validated according to specific formal, lexical and semantic criteria (e. g. by removing incorrect punctuation, morphological errors, notes in square brackets and etymological information typically provided in Wikipedia pages). To obtain a satisfying semantic coverage, we filtered out sentences with less than 5 words and less than 2 polysemous words were filtered out. Subsequently, in order to obtain datasets in the other nine target languages, for each selected sentence in English, the corresponding WikiMatrix translation into each of the other languages was retrieved. 
If no translation was available, the English sentence was translated manually. The resulting corpus is comprised of 2, 024 sentences for each language}, KEYWORDS = {Word Sense Disambiguation, corpus parallelo, disambiguazione automatica del senso, annotazione semantica multilingue}, URL = {https://iris.cnr.it/handle/20.500.14243/446359}, } @INCOLLECTION{CHAHINIAN_2021_INCOLLECTION_CBFDJPRSDT_394922, AUTHOR = {Chahinian, N. and Bonnabaud La Bruyère, T. and Frontini, F. and Delenne, C. and Julien, M. and Panckhurst, R. and Roche, M. and Sautot, L. and Deruelle, L. and Teisseire, M.}, TITLE = {WEIR-P: An Information Extraction Pipeline for the Wastewater Domain}, YEAR = {2021}, ABSTRACT = {We present the MeDO project, aimed at developing resources for text mining and information extraction in the wastewater domain. We developed a specific Natural Language Processing (NLP) pipeline named WEIR-P (WastewatEr InfoRmation extraction Platform) which identifies the entities and relations to be extracted from texts, pertaining to information, wastewater treatment, accidents and works, organizations, spatio-temporal information, measures and water quality. We presentand evaluate the first version of the NLP system which was developed to automate the extraction of the aforementioned annotation from texts and its integration with existing domain knowledge. The preliminary results obtained on the Montpellier corpus are encouraging and show how a mix of supervised and rule-based techniques can be used to extract useful information and reconstruct the various phases of the extension of a given wastewater network. While the NLP and Information Extraction (IE) methods used are state of the art, the novelty of our work lies in their adaptation to the domain, and in particular in the wastewater management conceptual model, which defines the relations between entities. French resources are less developed in the NLP community than English ones. 
The datasets obtained in this project are another original aspect of this work}, KEYWORDS = {Wastewater, text mining, Information extraction, NLP, NER, Domain adapted systems}, PAGES = {171-188}, URL = {https://www.springer.com/gp/book/9783030750176}, PUBLISHER = {Springer Nature Switzerland (Basel, CHE)}, ISBN = {978-3-030-75017-6}, CONFERENCE_PLACE = {Basel}, BOOKTITLE = {Research Challenges in Information Science-15th International Conference, RCIS 2021, Limassol, Cyprus, May 11-14, 2021, Proceedings}, } @INCOLLECTION{PANCKHURST_2021_INCOLLECTION_PF_397005, AUTHOR = {Panckhurst, R. and Frontini, F.}, TITLE = {An Internationally Fair Mediated Digital Discourse Corpus: Improving Knowledge on Reuse}, YEAR = {2021}, ABSTRACT = {In this paper, the authors present a French Mediated Digital Discourse corpus, (88milSMShttp: //88milsms. huma-num. fr https: //hdl. handle. net/11403/comere/cmr-88milsms). Efforts were undertaken over the years to ensure its publication accordingto the best practices and standards of the community, thus guaranteeing compliance with FAIRprinciples and CLARIN recommendations with pertinent scientific and pedagogical reuse. Sinceknowledge on how resources are reused is sometimes difficult to obtain, ways of improving thisare also envisaged}, KEYWORDS = {Reuse, FAIR, SMS, corpus}, PAGES = {185-193}, URL = {https://ecp.ep.liu.se/index.php/clarin/article/view/20}, VOLUME = {180}, DOI = {10.3384/ecp18020}, BOOKTITLE = {Selected Papers from the CLARIN Annual Conference 2020}, } @INPROCEEDINGS{DELFANTE_2021_INPROCEEDINGS_DFMQ_447069, AUTHOR = {Del Fante, D. and Frontini, F. and Monachini, M. 
and Quochi, V.}, TITLE = {CLARIN-IT Resources in CLARIN ERIC-a Bird's-Eye View}, YEAR = {2021}, ABSTRACT = {The paper investigates the visibility of CLARIN-IT language resources within the services of the CLARIN ERIC central infrastructure, notably the Virtual Language Observatory, the Switchboard and the Federated Content Search, from a user perspective in order to identify possible issues. While the experiment focused on one national consortium, the ultimate goal is to develop an assessment methodology that can be used by any national consortia aiming to review the accessibility of their resources and tools within the CLARIN central services}, KEYWORDS = {FAIR, research infrastructure for SSH, language resources, findability, CLARIN}, PAGES = {129-133}, URL = {https://office.clarin.eu/v/CE-2021-1923-CLARIN2021_ConferenceProceedings.pdf}, NOTE = {CE-2021-1923}, CONFERENCE_NAME = {CLARIN Annual Conference 2021}, } @INPROCEEDINGS{MARTELLI_2021_INPROCEEDINGS_MNKTKGKNPOLKKDUSLVGLQMFTCSIM_443238, AUTHOR = {Martelli, F. and Navigli, R. and Krek, S. and Tiberius, C. and Kallas, J. and Gantar, P. and Koeva, S. and Nimb, S. and Pedersen, B. S. and Olsen, S. and Langements, M. and Koppel, K. and Üksik, T. and Dobrovolijc, K. and Ureña Ruiz, R. J. and Sancho Sánchez, J. L. and Lipp, V. and Váradi, T. and Győrffy, A. and László, S. and Quochi, V. and Monachini, M. and Frontini, F. and Tempelaars, R. and Costa, R. and Salgado, A. and Čibej, J. and Munda, T.}, TITLE = {Designing the ELEXIS Parallel Sense-Annotated Dataset in 10 European Languages}, YEAR = {2021}, ABSTRACT = {Over the course of the last few years, lexicography has witnessed the burgeoning of increasingly reliable automatic approaches supporting the creation of lexicographic resources such as dictionaries, lexical knowledge bases and annotated datasets. 
In fact, recent achievements in the field of Natural Language Processing and particularly in Word Sense Disambiguation have widely demonstrated their effectiveness not only for the creation of lexicographic resources, but also for enabling a deeper analysis of lexical-semantic data both within and across languages. Nevertheless, we argue that the potential derived from the connections between the two fields is far from exhausted. In this work, we address a serious limitation affecting both lexicography and Word Sense Disambiguation, i.e. the lack of high-quality sense-annotated data and describe our efforts aimed at constructing a novel entirely manually annotated parallel dataset in 10 European languages. For the purposes of the present paper, we concentrate on the annotation of morpho-syntactic features. Finally, unlike many of the currently available sense-annotated datasets, we will annotate semantically by using senses derived from high-quality lexicographic repositories}, KEYWORDS = {Digital lexicography, Word Sense Disambiguation, Computational Linguistics, Corpus Linguistics, Natural Language Processing}, PAGES = {377-395}, URL = {https://static-curis.ku.dk/portal/files/279888836/eLex_2021_22_pp377_395.pdf}, VOLUME = {2021}, PUBLISHER = {Lexical Computing (Brno, CZE)}, CONFERENCE_NAME = {eLex 2021}, CONFERENCE_PLACE = {Brno}, BOOKTITLE = {Electronic lexicography in the 21st century (eLex 2021): Post-editing lexicography}, } @TECHREPORT{ALRAHABI_2021_TECHREPORT_ABFPJBKG_394923, AUTHOR = {Alrahabi, M. and Brando, C. and Frontini, F. and Provenier, A. and Jalabert, R. and Bordry, M. and Koskas, C. 
and Gawley, J.}, TITLE = {Guide d'annotation manuelle d'entités nommées dans des corpus littéraires}, YEAR = {2021}, ABSTRACT = {Guide d'annotation manuelle d'entités nommées dans des corpus littéraires Campagne d'annotation OBVIL 2019-2021}, KEYWORDS = {NER}, URL = {https://hal.archives-ouvertes.fr/hal-03156278}, } @MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMBCDLR_446076, AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubei, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Panur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and D De Macedo, L. and Van Heusden, R. and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. and Sebk, M. and Ring, O. and Daris, R. and Utka, A. and Petkeviius, M. and Briediené, M. and Krilaviius, T. and Morkeviius, V. and Bartolini, R. and Cimino, A. and Diwersy, S. and Luxardo, G. and Rayson, P.}, TITLE = {Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint. ana 2. 1}, YEAR = {2021}, ABSTRACT = {ParlaMint 2. 1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (from November 1st 2019), or being "reference" (before that date). The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e. g. chair, regular speaker). 
The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e. g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https: //clarin-eric. github. io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the linguistically marked-up version of the corpus, while the text version is available at http: //hdl. handle. net/11356/1432. The ParlaMint. ana linguistic annotation includes tokenization, sentence segmentation, lemmatisation, Universal Dependencies part-of-speech, morphological features, and syntactic dependencies, and the 4-class CoNLL-2003 named entities. Some corpora also have further linguistic annotations, such as PoS tagging or named entities according to language-specific schemes, with their corpus TEI headers giving further details on the annotation vocabularies and tools}, KEYWORDS = {covid-19, ParlaCLARIN, CLARIN, linguistic annotation, pos-tagging, Named Entity Recognition, linguistic dependency annotation, UD, dibattiti parlamentari, parlamenti, discorso politico}, URL = {https://iris.cnr.it/handle/20.500.14243/446076}, } @MISC{ERJAVEC_2021_MISC_EOOLSGRPKBSVDDJHNCDVMLCAFMQVRMBSRDUPBKMBCDLR_446080, AUTHOR = {Erjavec, T. and Ogrodniczuk, M. and Osenova, P. and Ljubei, N. and Simov, K. and Grigorova, V. and Rudolf, M. and Panur, A. and Kopp, M. and Barkarson, S. and Steingrímsson, S. and Van Der Pol, H. and Depoorter, G. and De Does, J. and Jongejan, B. and Haltrup Hansen, D. and Navarretta, C. and Calzada Pérez, M. and D De Macedo, L. and Van Heusden, R. and Marx, M. and Çöltekin, Ç. and Coole, M. and Agnoloni, T. and Frontini, F. and Montemagni, S. and Quochi, V. and Venturi, G. and Ruisi, M. and Marchetti, C. and Battistoni, R. 
and Sebk, M. and Ring, O. and Daris, R. and Utka, A. and Petkeviius, M. and Briediené, M. and Krilaviius, T. and Morkeviius, V. and Bartolini, R. and Cimino, A. and Diwersy, S. and Luxardo, G. and Rayson, P.}, TITLE = {Multilingual comparable corpora of parliamentary debates ParlaMint 2. 1}, YEAR = {2021}, ABSTRACT = {ParlaMint 2. 1 is a multilingual set of 17 comparable corpora containing parliamentary debates mostly starting in 2015 and extending to mid-2020, with each corpus being about 20 million words in size. The sessions in the corpora are marked as belonging to the COVID-19 period (after November 1st 2019), or being "reference" (before that date). The corpora have extensive metadata, including aspects of the parliament; the speakers (name, gender, MP status, party affiliation, party coalition/opposition); are structured into time-stamped terms, sessions and meetings; with speeches being marked by the speaker and their role (e. g. chair, regular speaker). The speeches also contain marked-up transcriber comments, such as gaps in the transcription, interruptions, applause, etc. Note that some corpora have further information, e. g. the year of birth of the speakers, links to their Wikipedia articles, their membership in various committees, etc. The corpora are encoded according to the Parla-CLARIN TEI recommendation (https: //clarin-eric. github. io/parla-clarin/), but have been validated against the compatible, but much stricter ParlaMint schemas. This entry contains the ParlaMint TEI-encoded corpora with the derived plain text version of the corpus along with TSV metadata on the speeches. Also included is the 2. 0 release of the data and scripts available at the GitHub repository of the ParlaMint project. Note that there also exists the linguistically marked-up version of the corpus, which is available at http: //hdl. handle. 
net/11356/1431}, KEYWORDS = {ParlaMint, ParlaCLARIN, dibattiti parlamentari, covid-19, discorso politico, CLARIN}, URL = {https://iris.cnr.it/handle/20.500.14243/446080}, } @ARTICLE{AMIEL_2020_ARTICLE_AFLR_529601, AUTHOR = {Amiel, P. and Frontini, F. and Lacour, P. Y. and Robin, A.}, TITLE = {Pratiques de gestion des données de la recherche : une nécessaire acculturation des chercheurs aux enjeux de la science ouverte ?}, YEAR = {2020}, ABSTRACT = {The article presents the results of an exploratory survey, conducted in June 2018 in the Montpellier’s basin by the CommonData Research Program, on the researchers’ management practices of research data. The principles objectives were to see if research data management is the result of an elaborated and strategic plan, to verify the ability or inability of researchers to qualify legally their explored, collected or produced datasets in order to determine their management in regard of the current Open Science politics and, at last, to observe the property feeling improved by researchers toward the datas they contribute to produce, which comes up with the broader question of the personal and/or institutional dimension of research’s work and its consequences in the awarding of property}, KEYWORDS = {research data, management, property, sharing, dissemination, valorization, public domain, open science, données de la recherche, gestion, propriété, partage, diffusion, valorisation, domaine public, science ouverte}, PAGES = {147-168}, URL = {https://iris.cnr.it/handle/20.500.14243/529601}, VOLUME = {(10)}, DOI = {10.4000/cdst.2061}, ISSN = {1967-0311}, JOURNAL = {CAHIERS DROIT, SCIENCES \& TECHNOLOGIES}, } @INCOLLECTION{PANCKHURST_2020_INCOLLECTION_PF_529606, AUTHOR = {Panckhurst, R. and Frontini, F.}, TITLE = {Evolving interactional practices of emoji in text messages}, YEAR = {2020}, ABSTRACT = {In this article, we examine the usage of emoji in the 88milSMS corpus. 
After differentiating between emoji and emoticons, we situate the context, indicate general statistics and mention press interest. Next, we address linguistic issues: are emoji used more often in addition (either redundantly or necessarily, sometimes as “softeners” (adoucisseurs, Détrie \& Verine 2015) or for lexical replacement, denoting a reference/referential function (Referenzfunktion, Dürscheid \& Siever 2017)? Concerning emoji insertion positioning, which is the most popular and what does this mean? Other researchers refer to “the emoji code” (Danesi 2016; Evans 2017), and emoji classifications have been proposed, including references to syntactic, semantic (Barbieri, Ronzano \& Saggion 2016), semiotic, phatic and emotive/sentiment (Novak et al. 2015) levels. Are these satisfactory or do we need to redefine levels, contexts and potential ambiguity? Part-ofspeech tagging (POS) and NLP software are then used to annotate SMS containing emoji within 88milSMS in order to investigate the immediate grammatical environment. This allows us to conduct contextual analysis relating to syntactic linguistic functions of emoji. Finally, results from two questionnaires are explored: 1. sociolinguistic factors (age, gender) of the SMS donors having used emoji in 88milSMS; 2. Comparison of SMS emoji usage with other instant messaging applications and social networks via a user-orientated questionnaire (Rascol 20171)}, KEYWORDS = {emoji, computer mediated communication, corpus}, PAGES = {81-104}, URL = {https://doi.org/10.1515/9781501510113-005}, DOI = {10.1515/9781501510113-005}, PUBLISHER = {De Gruyter Mouton}, ISBN = {978-1-5015-1011-3}, BOOKTITLE = {Visualizing Digital Discourse: Interactional, Institutional and Ideological Perspectives}, } @INPROCEEDINGS{PANCKHURST_2020_INPROCEEDINGS_PF_422985, AUTHOR = {Panckhurst, R. 
and Frontini, F.}, TITLE = {An internationally FAIR Mediated Digital Discourse Corpus: towards scientific and pedagogical reuse}, YEAR = {2020}, ABSTRACT = {In this paper, the authors present a French Mediated Digital Discourse corpus, (88milSMS http: //88milsms. huma-num. fr https: //hdl. handle. net/11403/comere/ cmr-88milsms). Efforts were undertaken over the years to ensure its publication according to the best practices and standards of the community, thus guaranteeing compliance with FAIR principles and CLARIN recommendations with pertinent scientific and pedagogical reuse}, KEYWORDS = {FAIR data, SMS corpus}, URL = {https://www.clarin.eu/clarin-annual-conference-2020-abstracts}, CONFERENCE_NAME = {CLARIN Annual Conference 2020 (5-7 October). Virtual Edition}, BOOKTITLE = {Proceedings of CLARIN Annual Conference 2020 (5-7 October). Virtual Edition}, EDITOR = {Navarretta, C. and Eskevich, M.}, } @MISC{FRONTINI_2020_MISC_F_384006, AUTHOR = {Frontini, F.}, TITLE = {Dans les coulisses des infrastructures européennes en SHS. Rôle et opportunités pour les acteurs de la recherche (ingénieurs et chercheurs)}, YEAR = {2020}, ABSTRACT = {La composante technologique prend une dimension de jour en jour plus importante en LLASHS. Les projets de recherche sont de plus en plus nombreux à mobiliser de gros volumes de données exigeant des services adaptés garants de formes de méthodologies augmentées (exploitation, interopérabilité, accessibilité, archivage). Afin de partager les savoirs et de garantir l'interopérabilité et la préservation à long terme de ces ressources et services, de grandes infrastructures informatiques se mettent en place aux niveaux national et international. 
Dans cette présentation, vous allez découvrir le panorama, en la matière, des e-infrastructures et des grands projets européens à caractère infrastructurel, avec un accent particulier sur les technologies utilisées, les principaux services offerts, et les aspects les plus intéressants en termes de synergie entre approches et disciplines différentes. La présentation portera sur des ERICs (European Research Infrastructure Consortium) établis, comme CLARIN et DARIAH, et sur des projets récents ou en cours de développement, comme PARTHENOS, SSHOC, ELEXIS et TRIPLE. Concernant les aspects techniques, on abordera les questions liées au dépôt, au stockage, à l'identification (sigle sign on), aux formats et choix des métadonnées et de modélisation formelle, à la recherche fédérée des sources. Nous soulignerons en particulier l'interaction de ces projets avec les infrastructures nationales, notamment Huma-Num, ainsi qu'avec la récemment constituée European Open Science Cloud (EOSC). La présentation aura une visée pratique, avec l'objectif de fournir des indications concrètes aux acteurs de la recherche (chercheurs, ingénieurs.) qui souhaitent participer à ces initiatives et aux groupes de travail qui les animent, ou plus largement favoriser l'accès des chercheurs français aux nombreux services et opportunités offerts}, KEYWORDS = {Infrastrutture di ricerca, Scienze umane e sociali}, URL = {https://ja-mate2020.sciencesconf.org/data/pages/Resume_Frontini_Nov.pdf}, CONFERENCE_NAME = {Journées annuelles du réseau Mate-shs (JA2020)}, } @INPROCEEDINGS{BOHBOT_2019_INPROCEEDINGS_BFKKR_389213, AUTHOR = {Bohbot, H. and Frontini, F. and Khan, F. and Khemakhem, M. 
and Romary, L.}, TITLE = {Nénufar: Modelling a Diachronic Collection of Dictionary Editions as a Computational Lexical Resource}, YEAR = {2019}, ABSTRACT = {The Petit Larousse Illustré (PLI) is a monolingual French dictionary which has been published every year since the 1906 edition, and which is therefore a fundamental record of the evolution of the French language. As a consequence of the pre-1948 editions of the PLI entering the public domain in 2018 the Nénufar (Nouvelle édition numérique de fac-similés de référence) project was launched at the Praxiling laboratory in Montpellier with the aim of digitizing and making these editions available electronically. The project is still ongoing; various selected editions from each decade are going to be fully digitized (so far the 1906, 1924 and 1925 editions have been completed), and changes backtracked and dated to the specific year. Nénufar's primary aim is to make the editions available and searchable via an advanced search interface which will not only enable the selective querying of text by lemma and type of content (definitions, examples,.), but crucially also detect and study changes by comparing different editions. In order to do so, a specific web interface has been put in place. Alongside the digitized text, the Nénufar website contains high quality scans for each page. In compliance with current open data best practices (Wilkinson et al., 2016), the project also aims to make the source data available separately from the querying interface both for research and for A similar project which presents data and scans from subsequent editions of the same legacy dictionary has been carried out by the team behind the Swedish Academy's Wordlist (see Holmer, Malmgren, and Martens (2016) and http: //spraakdata. gu. se/saolhist/). eLex 2019: Book of Abstracts 36 long-term preservation. 
The primary encoding format is TEI-XML; however in our case the TEI encoding is closely inspired by the latest version of the TEI-Lex0 (Ba?ski et al., 2017, Romary \& Tasovac, 2018) guidelines for encoding lexicographic resources, which are based upon TEI. The choice of a TEI based approach allows the Nénufar project to align itself to other pre-existing initiatives and tools. By aligning ourselves to TEI-Lex0 we will be able to make use of digitisation tools such as Grobid (Khemakhem et al., 2017) which have TEI-Lex0 as their native format and which have already been tested and used within the Nénufar project to speed up the digitization of new editions. In addition we will be able to make use of ongoing initiatives to convert TEI-Lex0 datasets to RDF using the W3C recommendation for publishing lexicons as Linked Data, namely OntoLex-Lemon (McCrae et al., 2017; Bosque-Gil et al., 2016) which will allow for the publication of the Nénufar dataset as an LOD graph. The LOD version of the Nénufar dataset, now currently being developed, will be queryable from the available SPARQL endpoint and contain all available editions as one single graph, allowing for expert users to perform complex queries that could detect systematic changes in the dataset. The LOD version is particularly adapted to be linked to other datasets; more recent editions, once added, could also be of interest for NLP applications}, URL = {https://iris.cnr.it/handle/20.500.14243/389213}, } @INPROCEEDINGS{KHAN_2018_INPROCEEDINGS_KBFM_376218, AUTHOR = {Khan, F. and Bellandi, A. and Frontini, F. and Monachini, M.}, TITLE = {One Language to rule them all: modelling Morphological Patterns in a Large Scale Italian Lexicon with SWRL}, YEAR = {2018}, ABSTRACT = {We present an application of Semantic Web Technologies to computational lexicography. More precisely we describe the publication of the morphological layer of the Italian Parole Simple Clips lexicon (PSC-M) as linked open data. 
The novelty of our work is in the use of the Semantic Web Rule Language (SWRL) to encode morphological patterns, thereby allowing the automatic derivation of the inflectional variants of the entries in the lexicon. By doing so we make these patterns available in a form that is human readable and that therefore gives a comprehensive morphological description of a large number of Italian words}, KEYWORDS = {Morphology, Linked Open Data, Italian Lexicon, SWRL, SQWRL}, PAGES = {4385-4389}, URL = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/844.pdf}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {979-10-95546-00-9}, CONFERENCE_NAME = {Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, EDITOR = {Chair, N. C. C. and Choukri, K. and Cieri, C. and Declerck, T. and Goggi, S. and Hasida, K. and Isahara, H. and Maegaard, B. and Mariani, J. and Mazo, H. and Moreno, A. and Odijk, J. and Piperidis, S. and Tokunaga, T.}, } @INPROCEEDINGS{BELLANDI_2018_INPROCEEDINGS_BFKM_345621, AUTHOR = {Bellandi, A. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {SWRL your lexicon: adding inflectional rules to a LOD dataset}, YEAR = {2018}, ABSTRACT = {Over the past few years the publication of lexical resources as Linked Data (LD) has taken on ever greater significance within the field of computational lexicography. So far the efforts of the community have been largely directed towards the definition of standards1 and the conversion of single resources (see McCrae et al 2012, Khan et al 2016), but with less of a focus on the technical possibilities afforded by this new mode of publishing lexical data. However, the fact is that the Semantic Web gives us access to a whole ecosystem of standards, languages, and technologies. 
In this paper we will look at one of these languages, the Semantic Web Rule Language2 (SWRL) and explore whether it might potentially play a useful role in the publication of lexical resources}, URL = {https://iris.cnr.it/handle/20.500.14243/345621}, } @INPROCEEDINGS{KHAN_2018_INPROCEEDINGS_KMBFB_345619, AUTHOR = {Khan, A. F. A. and Mugelli, G. and Boschetti, F. and Frontini, F. and Bellandi, A.}, TITLE = {Using Formal Ontologies for the Annotation and Study of Literary Texts}, YEAR = {2018}, URL = {https://iris.cnr.it/handle/20.500.14243/345619}, } @MISC{BELLANDI_2018_MISC_BFKM_350511, AUTHOR = {Bellandi, A. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {Parole-Simple-Clip/Morphological Layer in RDF}, YEAR = {2018}, ABSTRACT = {A version in RDF of the morphological layer of the wide coverage multi-level Italian lexicon Parole-Simple-Clips, containing the parts of speech Noun, Verb, Adjective. The dataset is encoded using the ontolex-lemon vocabulary. Information pertaining to inflectional morphological contained in the original resource is converted into Semantic Web Rule Language (SWRL) rules}, URL = {https://iris.cnr.it/handle/20.500.14243/350511}, } @INCOLLECTION{KHAN_2017_INCOLLECTION_KBFM_339934, AUTHOR = {Khan, F. and Bellandi, A. and Frontini, F. and Monachini, M.}, TITLE = {Using SWRL rules to model noun behaviour in Italian}, YEAR = {2017}, ABSTRACT = {In this article we describe our ongoing attempts to use the Semantic Web Rule Language (SWRL) to model the morphological layer of a wide-coverage Italian lexical resource, Parole-Simple-Clips (PSC); in this case that subset of PSC dealing with Italian noun morphology. After giving a brief introduction to SWRL and to Italian noun morphology we go onto describe the actual transformation itself. 
Finally we describe an experiment on our dataset using SWRL rules and queries written in the Semantic Query-Enhanced Rule Web Language (SQWRL)}, KEYWORDS = {Linked Open Data, Logic Programming, Italian Morphology}, PAGES = {134-142}, URL = {http://www.scopus.com/record/display.url?eid=2-s2.0-85021186095\&origin=inward}, DOI = {10.1007/978-3-319-59888-8_11}, PUBLISHER = {Springer (Berlin, DEU)}, CONFERENCE_PLACE = {Berlin}, BOOKTITLE = {LANGUAGE, DATA, AND KNOWLEDGE, LDK}, EDITOR = {Gracia, J. and Bond, F. and McCrae, J. and Buitelaar, P. and Chiarcos, C. and Hellmann, S.}, } @INCOLLECTION{RMANZELLA_2017_INCOLLECTION_RBBDDFMMMNS_333614, AUTHOR = {R Manzella, G. M. and Bartolini, R. and Bustaffa, F. and D'Angelo, P. and De Mattei, M. and Frontini, F. and Maltese, M. and Medone, D. and Monachini, M. and Novellino, A. and Spada, A.}, TITLE = {Semantic Search Engine for Data Management and Sustainable Development: Marine Planning Service Platform}, YEAR = {2017}, ABSTRACT = {This chapter presents a computer platform supporting a Marine Information and Knowledge System based on a repository that gathers, classify and structures marine scientific literature and data, guaranteeing their accessibility by means of standard protocols. This requires the access to quality controlled data and to information that is provided in grey literature and/or in relevant scientific literature. There exist efforts to develop search engines to find author's contributions to scientific literature or publications. This implies the use of persistent identifiers. However very few efforts are dedicated to link publications to data that was used, or cited in them or that can be of importance for the published studies. 
Full-text technologies are often unsuccessful since they assume the presence of specific keywords in the text; to fix this problem, it is suggested to use different semantic technologies for retrieving the text and data and thus getting much more complying results}, KEYWORDS = {Marine Information and Knowledge System}, PAGES = {127-154}, URL = {http://www.igi-global.com/chapter/semantic-search-engine-for-data-management-and-sustainable-development/166839#}, DOI = {10.4018/978-1-5225-0700-0.ch006}, PUBLISHER = {IGI Global (Hershey, USA)}, ISBN = {978-1-5225-0700-0}, CONFERENCE_PLACE = {Hershey}, BOOKTITLE = {Oceanographic and Marine Cross-Domain Data Management for Sustainable Development}, EDITOR = {Paolo Diviacco, A. L. and Glaves, H.}, } @INPROCEEDINGS{KHAN_2017_INPROCEEDINGS_KBF_342012, AUTHOR = {Khan, F. and Bowers, J. and Frontini, F.}, TITLE = {Situating Word Senses in their Historical Context with Linked Data}, YEAR = {2017}, ABSTRACT = {In this article we present a Semantic Web-based model for creating lexical resources in which the diachronic and, more broadly, contextual dimensions of word meaning can be explicitly represented as part of a graph-based data structure. We start by discussing why Linked Data is the right publishing approach for such diachronic datasets. We then describe our model, lemonEty, which utilizes the ontology engineering technique of perdurants in order to model lexical entries as dynamic processes. Next we go on to explain how to represent etymologies using our model, and in particular how to associate temporal information with word senses, taking examples from two different lexicographic resources. In addition, we will show how our model deals with cognates and attestations}, URL = {https://iris.cnr.it/handle/20.500.14243/342012}, } @ARTICLE{GOGGI_2016_ARTICLE_GPBFMMDB_320996, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. 
and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2016}, ABSTRACT = {Here we present the final results of the MAPS (Marine Planning and Service Platform) project, an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. The system takes as input non-textual data (measurements) and text-both published papers and documentation-and it provides an advanced search facility thanks to the rich set of metadata and, above all, to the possibility of a refined and domain targeted key-word indexing of texts using Natural Language Processing (NLP) techniques. The paper describes the system in its details providing also evidence of evaluation}, KEYWORDS = {Information Extraction, Search Engine, Operative Oceanography}, PAGES = {155-161}, URL = {http://www.greynet.org/thegreyjournal/currentissue.html}, VOLUME = {12 (3)}, ISSN = {1574-1796}, JOURNAL = {THE GREY JOURNAL}, } @ARTICLE{KHAN_2016_ARTICLE_KABF_322086, AUTHOR = {Khan, F. and Arrigoni, S. and Boschetti, F. and Frontini, F.}, TITLE = {Restructuring a Taxonomy of Literary Themes and Motifs for More Efficient Querying}, YEAR = {2016}, ABSTRACT = {In this paper we describe ongoing work in the restructuring of a tagset originally organised as a taxonomy and used to annotate literary themes and motifs in a corpus of classical works of poetry from a number of different traditions. We show how such a tagset can be rendered more efficient and useful through the appropriation of ideas and techniques from lexical semantics and ontology design. 
The newly redesigned tagset is described with examples showing how the new design is much more expressive than the old taxonomy; furthermore, an example query is described in order to demonstrate how more refined semantic searches can be carried using the new version of the taxonomy. The final result is, we hope, a resource that will be useful not only for the specific project for which it was developed but one that is well-designed and well-documented enough to be of use for other similar semantic annotation tasks}, URL = {https://iris.cnr.it/handle/20.500.14243/322086}, DOI = {10.14195/2182-8830}, ISSN = {2182-8830}, JOURNAL = {MATLIT}, } @ARTICLE{MONACHINI_2016_ARTICLE_MF_327645, AUTHOR = {Monachini, M. and Frontini, F.}, TITLE = {CLARIN, l'infrastruttura europea delle risorse linguistiche per le scienze umane e sociali e il suo network italiano CLARIN-IT}, YEAR = {2016}, ABSTRACT = {ll 1°ottobre 2015 il MIUR firma l'adesione dell'Italia a CLARIN-ERIC, l'infrastruttura di ricerca che offre risorse e tecnologie linguistiche dedicate al settore delle scienze del linguaggio e delle scienze umane e sociali. Questo articolo intende fornire alla comunità italiana una ampia panoramica di CLARIN, la sua missione, i suoi pilastri, i servizi, la sua organizzazione tecnica ed amministrativa e la struttura di governance, sia a livello europeo che locale. Viene introdotto il network italiano, con il primo centro nazionale ILC4CLARIN, ospitato ed in via di sviluppo presso l'ILC-CNR, le funzionalità, le risorse ed i servizi offerti; viene presentato infine il primo nucleo del consorzio nazionale CLARIN-IT, illustrando i criteri di costituzione, le attività previste e le prospettive future}, KEYWORDS = {Infrastrutture di ricerca, Tecnologie linguistiche, Network italiano CLARIN-IT}, PAGES = {1-30}, URL = {http://www.ai-lc.it/IJCoL/v2n2/1-monachini_and_frontini.pdf}, VOLUME = {VOL. 
2 (2)}, ISSN = {2499-4553}, JOURNAL = {IJCOL}, } @INCOLLECTION{FRONTINI_2016_INCOLLECTION_FDM_320992, AUTHOR = {Frontini, F. and Del Gratta, R. and Monachini, M.}, TITLE = {GeoDomainWordNet: Linking the Geonames Ontology to WordNet}, YEAR = {2016}, ABSTRACT = {This paper illustrates the transformation of GeoNames' ontology concepts, with their English labels and glosses, into a GeoDomain WordNet-like resource in English, its translation into Italian, and its linking to the existing generic WordNets of both languages. The paper describes the criteria used for the linking of domain synsets to each other and to the generic ones and presents the published resource in RDF according to the w3c and lemon schema}, KEYWORDS = {GeoNames, WordNet, Language resources, Lexi, Linguistic linked data, lemon, RDF}, PAGES = {229-242}, URL = {http://link.springer.com/chapter/10.1007/978-3-319-43808-5_18}, DOI = {10.1007/978-3-319-43808-5}, ISBN = {978-3-319-43808-5}, BOOKTITLE = {Human Language Technology. Challenges for Computer Science and Linguistics}, } @EDITORIAL{KHAN_2016_EDITORIAL_KVAFFPGU_324185, AUTHOR = {Khan, F. and Vintar, P. and Araúz, P. L. and Faber, P. and Frontini, F. and Parvizi, A. and Grisimeunovi, L. and Unger, C.}, TITLE = {Language and Ontology (LangOnto2) & Terminology and Knowledge Structures (TermiKS)}, YEAR = {2016}, ABSTRACT = {This joint workshop brings together two different but closely related strands of research. On the one hand it looks at the overlap between ontologies and computational linguistics and on the other it explores the relationship between knowledge modelling and terminologies. In particular the workshop aims to create a forum for discussion in which the different relationships and commonalities between these two areas can be explored in detail, as well as presenting cutting edge research in each of the two individual areas. A significant amount of human knowledge can be found in texts. 
It is not surprising that languages such as OWL, which allow us to formally represent this knowledge, have become more and more popular both in linguistics and in automated language processing. For instance ontologies are now of core interest to many NLP fields including Machine Translation, Question Answering, Text Summarization, Information Retrieval, and Word Sense Disambiguation. At a more abstract level, however, ontologies can also help us to model and reason about phenomena in natural language semantics. In addition, ontologies and taxonomies can also be used in the organisation and formalisation of linguistically relevant categories such as those used in tagsets for corpus annotation. Notably also, the fact that formal ontologies are being increasingly accessed by users with limited to no background in formal logic has led to a growing interest in developing accessible front ends that allow for easy querying and summarisation of ontologies. It has also led to work in developing natural language interfaces for authoring ontologies and evaluating their design. Additionally in recent years there has been a renewed interest in the linguistic aspects of accessing, extracting, representing, modelling and transferring knowledge. Numerous tools for the automatic extraction of terms, term variants, knowledge-rich contexts, definitions, semantic relations and taxonomies from specialized corpora have been developed for a number of languages, and new theoretical approaches have emerged as potential frameworks for the study of specialized communication. However, the building of adequate knowledge models for practitioners (e. g. experts, researchers, translators, teachers etc.), on the one hand, and NLP applications (including cross-language, cross-domain, cross-device, multi-modal, multi-platform applications), on the other hand, still remains a challenge. 
The papers included in the workshop range across a wide variety of different areas and reflect the strong inter-disciplinary approach, which characterises both areas of research. In addition we are very happy to include two invited talks in the program presented by authorities in their respective fields: Pamela Faber from the field of terminology, and John McCrae, an expert on linguistic linked data and the interface between NLP and ontologies}, KEYWORDS = {lexicons, ontologies}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, } @INPROCEEDINGS{DELGRATTA_2016_INPROCEEDINGS_DFMPRBKSC_324176, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Pardelli, G. and Russo, I. and Bartolini, R. and Khan, F. and Soria, C. and Calzolari, N.}, TITLE = {LREC as a Graph: People and Resources in a Network}, YEAR = {2016}, ABSTRACT = {This proposal describes a new way to visualise resources in the LREMap, a community-built repository of language resource descriptions and uses. The LREMap is represented as a force-directed graph, where resources, papers and authors are nodes. The analysis of the visual representation of the underlying graph is used to study how the community gathers around LRs and how LRs are used in research}, KEYWORDS = {Language Resources, Resources Documentation, Data Visualisation}, PAGES = {2529-2532}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Goggi, S. and Grobelnik, M. and Maegaard, B. and Mariani, J. and Mazo, H. and Moreno, A. and Odijk, J. 
and Piperidis, S.}, } @INPROCEEDINGS{GOGGI_2016_INPROCEEDINGS_GPBFMMDB_315259, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2016}, ABSTRACT = {Here we present the final results of the MAPS (Marine Planning and Service Platform) project, an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. The system takes as input non-textual data (measurements) and text-both published papers and documentation-and it provides an advanced search facility thanks to the rich set of metadata and, above all, to the possibility of a refined and domain targeted key-word indexing of texts using Natural Language Processing (NLP) techniques. The paper describes the system in its details providing also evidence of evaluation}, KEYWORDS = {Information Extraction, Search Engine, Operative Oceanography}, PAGES = {104-111}, URL = {https://iris.cnr.it/handle/20.500.14243/315259}, ISBN = {978-90-77484-27-2}, CONFERENCE_NAME = {Seventeenth International Conference on Grey Literature. A New Wave of Textual and Non-Textual Grey Literature}, EDITOR = {Farace, D. and Frantzen, J.}, } @INPROCEEDINGS{KHAN_2016_INPROCEEDINGS_KBBFGR_322088, AUTHOR = {Khan, A. F. A. and Bellandi, A. and Benotto, G. and Frontini, F. and Giovannetti, E. and Reboul, M.}, TITLE = {Leveraging a narrative ontology to query a literary text}, YEAR = {2016}, ABSTRACT = {In this work we propose a model for the representation of the narrative of a literary text. The model is structured in an ontology and a lexicon constituting a knowledge base that can be queried by a system. 
This narrative ontology, as well as describing the actors, locations, situations found in the text, provides an explicit formal representation of the timeline of the story. We will focus on a specific case study, that of the representation of a selected portion of Homer's Odyssey, in particular of the knowledge required to answer a selection of salient queries, formulated by a literary scholar. This work is being carried out within the framework of the Semantic Web by adopting models and standards such as RDF, OWL, SPARQL, and lemon among others}, URL = {https://iris.cnr.it/handle/20.500.14243/322088}, DOI = {10.4230/OASIcs.CMN.2016.10}, } @INPROCEEDINGS{KHAN_2016_INPROCEEDINGS_KFBMM_322106, AUTHOR = {Khan, F. and Frontini, F. and Boschetti, F. and Monachini, M.}, TITLE = {Converting the Liddell Scott Greek-English Lexicon into Linked Open Data using lemon}, YEAR = {2016}, ABSTRACT = {The emergence and growing popularity of Linked Open Data (LOD) offers researchers a new range of possibilities when it comes to publishing datasets online (Hyvönen 2012, Oomen et al 2012); indeed not only does the success of LOD greatly facilitate the process of making scholarly data accessible and to a wider community but it also permits the enrichment of individual datasets by linking them to the other datasets available on the so called Linked Open Data Cloud. The advantages of Linked Open Data for teachers, academics and students in the humanities are obvious and are indeed manifold. However there is currently a paucity of linked open datasets in fields such as philology and literary studies, and in particular of datasets that deal with classical languages such as ancient Greek, Sanskrit, and Latin. This seems strange given the rich abundance of surviving works, of both a religious and secular character, that exist in those languages. 
A salient consideration here relates to the fact that even when such works have been digitised and made available in a format such as TEI-XML, a format which renders the structure and content of such texts more amenable to computer processing, the conversion of these resources into the Resource Data Framework (RDF), the standardised data model that underpins the Semantic Web, is not always straightforward. In this article we describe ongoing work in the conversion of an important 19th century Ancient Greek resource the Liddell-Scott-Jones Lexicon, into RDF, part of a wider program of work that has been recently initiated at CNR-ILC in converting historical lexicons in languages such as Greek, Latin and Arabic into Linked Open Data}, URL = {https://iris.cnr.it/handle/20.500.14243/322106}, } @INPROCEEDINGS{NAHLI_2016_INPROCEEDINGS_NFMKZK_324187, AUTHOR = {Nahli, O. and Frontini, F. and Monachini, M. and Khan, F. and Zarghili, A. and Khalfi, M.}, TITLE = {Al Qamus al Muhit, a Medieval Arabic Lexicon in LMF}, YEAR = {2016}, ABSTRACT = {This paper describes the conversion into LMF, a standard lexicographic digital format of 'al-q{\=a}m{\=u}s al-mu{\d{h}}{\={\i}}{\d{t}}, a Medieval Arabic lexicon. The lexicon is first described, then all the steps required for the conversion are illustrated. The work will produce a useful lexicographic resource for Arabic NLP, but is also interesting per se, to study the implications of adapting the LMF model to the Arabic language. Some reflections are offered as to the status of roots with respect to previously suggested representations. In particular, roots are, in our opinion, not to be treated as lexical entries, but modeled as lexical metadata for classifying and identifying lexical entries. 
In this manner, each root connects all entries that are derived from it}, KEYWORDS = {Arabic Lexicon, LMF, Al Qamus al Muhi}, PAGES = {943-950}, URL = {http://www.lrec-conf.org/proceedings/lrec2016/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-9-1}, CONFERENCE_NAME = {Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, CONFERENCE_PLACE = {Paris}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Goggi, S. and Grobelnik, M. and Maegaard, B. and Mariani, J. and Mazo, H. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{MANZELLA_2016_INPROCEEDINGS_MBBDDFMMMNS_324227, AUTHOR = {Manzella, G. and Bartolini, R. and Bustaffa, F. and D'Angelo, P. and De Mattei, M. and Frontini, F. and Maltese, M. and Medone, D. and Monachini, M. and Novellino, A. and Spada, A.}, TITLE = {Marine Planning and Service Platform: Specific Ontology Based semantic Search Engine Serving Data Management and Sustainable Development}, YEAR = {2016}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is aiming at building a computer platform supporting a Marine Information and Knowledge System. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. In oceanography the cost related to data collection is very high and the new paradigm is based on the concept to collect once and re-use many times (for re-analysis, marine environment assessment, studies on trends, etc). This concept requires the access to quality controlled data and to information that is provided in reports (grey literature) and/or in relevant scientific literature. 
Hence, creation of new technology is needed by integrating several disciplines such as data management, information systems, knowledge management}, KEYWORDS = {Marine Information, Knowledge System}, PAGES = {2}, URL = {http://meetingorganizer.copernicus.org/EGU2016/orals/20144}, VOLUME = {18}, ISSN = {1607-7962}, CONFERENCE_NAME = {European Geosciences Union General Assembly (EGU 2016)}, } @MISC{MONACHINI_2016_MISC_MEF_333124, AUTHOR = {Monachini, M. and Enea, A. and Frontini, F.}, TITLE = {CLARIN-IT: servizi per la comunità italiana delle scienze umane e sociali}, YEAR = {2016}, ABSTRACT = {CLARIN-IT-The Italian Common Language Resources and Technology Infrastructure: Monica Monachini-CLARIN Italian National Coordinator Alessandro Enea-Responsible of ILCforCLARIN \& contact person for IDEM Francesca Frontini-Standing Committee for CLARIN Technical Centres (SCCTC) ILC-CNR National Representative}, KEYWORDS = {CLARIN-IT, The Italian Common Language Resources and Technology Infrastructure}, URL = {http://www.clarin-it.it/en/content/clarin-it-idem-day-2016}, CONFERENCE_NAME = {CLARIN-IT @ IDEM Day 2016}, } @ARTICLE{DELGRATTA_2015_ARTICLE_DFKM_222847, AUTHOR = {Del Gratta, R. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {Converting the PAROLE SIMPLE CLIPS Lexicon into RDF with lemon}, YEAR = {2015}, ABSTRACT = {This paper describes the publication and linking of (parts of) PAROLE SIMPLE CLIPS (PSC), a large scale Italian lexicon, to the Semantic Web and the Linked Data cloud using the lemon model. The main challenge of the conversion is discussed, namely the reconciliation between the PSC semantic structure which contains richly encoded semantic information, following the qualia structure of the Generative Lexicon theory and the lemon view of lexical sense as a reified pairing of a lexical item and a concept in an ontology. 
The result is two datasets: one consists of a list of lemon lexical entries with their lexical properties, relations and senses; the other consists of a list of OWL individuals representing the referents for the lexical senses. These OWL individuals are linked to each other by a set of semantic relations and mapped onto the SIMPLE OWL ontology of higher level semantic types}, KEYWORDS = {lemon, linked data, generative lexicon, RDF, OWL, lexical resource}, PAGES = {387-392}, URL = {http://www.semantic-web-journal.net/content/converting-parole-simple-clips-lexicon-rdf-lemon-0}, VOLUME = {6}, DOI = {10.3233/SW-140168}, ISSN = {1570-0844}, JOURNAL = {SEMANTIC WEB (PRINT)}, } @ARTICLE{GOGGI_2015_ARTICLE_GMFBPDBM_296111, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS) An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2015}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting a Marine Information and Knowledge System, as part of the data management activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. We will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. 
The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers}, KEYWORDS = {Marine Science, Search Engine, Source Data, Oceanography}, PAGES = {171-178}, URL = {https://iris.cnr.it/handle/20.500.14243/296111}, VOLUME = {11 (3)}, ISSN = {1574-1796}, JOURNAL = {THE GREY JOURNAL}, } @INCOLLECTION{BRANDO_2015_INCOLLECTION_BFG_292095, AUTHOR = {Brando, C. and Frontini, F. and Ganascia, J.}, TITLE = {Disambiguation of Named Entities in Cultural Heritage Texts Using Linked Data Sets}, YEAR = {2015}, ABSTRACT = {This paper proposes a graph-based algorithm baptized REDEN for the disambiguation of authors' names in French literary criticism texts and scientific essays from the 19th century. It leverages knowledge from different Linked Data sources in order to select candidates for each author mention, then performs fusion of DBpedia and BnF individuals into a single graph, and finally decides the best referent using the notion of graph centrality. Some experiments are conducted in order to identify the best size of disambiguation context and to assess the influence on centrality of specific relations represented as edges. This work will help scholars to trace the impact of authors' ideas across different works and time periods}, KEYWORDS = {Named-entity disambiguation Centrality Linked data Data fusion Digital humanities}, PAGES = {505-514}, URL = {http://link.springer.com/chapter/10.1007%2F978-3-319-23201-0_51}, DOI = {10.1007/978-3-319-23201-0_51}, ISBN = {978-3-319-23200-3}, BOOKTITLE = {New Trends in Databases and Information Systems}, EDITOR = {Morzy, T. and Valduriez, P. and Bellatreche, L.}, } @INPROCEEDINGS{BOSCHETTI_2015_INPROCEEDINGS_BDFKM_305311, AUTHOR = {Boschetti, F. and Del Gratta, R. and Frontini, F. and Khan, F. 
and Monachini, M.}, TITLE = {(Re)thinking the BLARK for Ancient Greek}, YEAR = {2015}, ABSTRACT = {The paper discusses the Basic LAnguage Resource Kit (BLARK) for Ancient Greek, measuring the BLARK matrix against what is actually available for this language, and assessing its applicability to ancient languages in general. In addition, the BLARK and the FLaReNet recommendations are used to define priorities in the sector in close collaboration between philologists and the broader LRT community}, URL = {https://iris.cnr.it/handle/20.500.14243/305311}, } @INPROCEEDINGS{BOUKHALED_2015_INPROCEEDINGS_BFG_294757, AUTHOR = {Boukhaled, M. and Frontini, F. and Ganascia, J.}, TITLE = {Une mesure d'intérêt à base de surreprésentation pour l'extraction des motifs syntaxiques stylistiques}, YEAR = {2015}, ABSTRACT = {Dans cette contribution, nous présentons une étude sur la stylistique computationnelle des textes de la littérature classiques française fondée sur une approche conduite par données, où la découverte des motifs linguistiques intéressants se fait sans aucune connaissance préalable. Nous proposons une mesure objective capable de capturer et d'extraire des motifs syntaxiques stylistiques significatifs à partir d'un oeuvre d'un auteur donné. Notre hypothèse de travail est fondée sur le fait que les motifs syntaxiques les plus pertinents devraient refléter de manière significative le choix stylistique de l'auteur, et donc ils doivent présenter une sorte de comportement de surreprésentation contrôlé par les objectifs de l'auteur. 
Les résultats analysés montrent l'efficacité dans l'extraction de motifs syntaxiques intéressants dans le texte littéraire français classique, et semblent particulièrement prometteurs pour les analyses de ce type particulier de texte}, KEYWORDS = {Computational stylistic, text mining, syntactic patterns, interestingness measure}, PAGES = {391-396}, URL = {http://www.atala.org/taln_archives/TALN/TALN-2015/taln-2015-court-012.html}, CONFERENCE_NAME = {22e Conférence Sur Le Traitement Automatique Des Langues Naturelles (TALN 2015)}, BOOKTITLE = {Actes de La 22e Conférence Sur Le Traitement Automatique Des Langues Naturelles}, } @INPROCEEDINGS{BOUKHALED_2015_INPROCEEDINGS_BFG_297255, AUTHOR = {Boukhaled, M. and Frontini, F. and Ganascia, J.}, TITLE = {A Peculiarity-based Exploration of Syntactical Patterns: a Computational Study of Stylistics}, YEAR = {2015}, ABSTRACT = {In this contribution, we present a computational stylistic study and comparison of classic French literary texts based on a datadriven approach where discovering interesting linguistic patterns is done without any prior knowledge. We propose an objective measure capable of capturing and extracting meaningful stylistic syntactic patterns from a given author's work. Our hypothesis is based on the fact that the most relevant syntactic patterns should significantly reflect the author's stylistic choice and thus they should exhibit some kind of peculiar overrepresentation behavior controlled by the author's purpose with respect to a linguistic norm. 
The analyzed results show the effectiveness in extracting interesting syntactic patterns from novels, and seem particularly promising for the analysis of such particular texts}, KEYWORDS = {Computational Stylistics, Interestingness Measure, Sequential Pattern Mining, Syntactic Style}, PAGES = {31-39}, URL = {http://ceur-ws.org/Vol-1410/paper5.pdf}, VOLUME = {1410}, CONFERENCE_NAME = {Workshop on Interactions between Data Mining and Natural Language Processing 2015 co-located with European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2015)}, } @INPROCEEDINGS{BRANDO_2015_INPROCEEDINGS_BFG_340774, AUTHOR = {Brando, C. and Frontini, F. and Ganascia, J.}, TITLE = {Linked data for toponym linking in French literary texts}, YEAR = {2015}, ABSTRACT = {The present article discusses first experiments in toponym linking of Modern French digital editions aiming to provide an external referent to Linked Data sources. We have so far focused on testing two knowledge bases-French DBpedia and Geonames-for recall. Results highlight quality issues in these data sets for usage in NLP-tasks in domain-specific heritage texts}, KEYWORDS = {Named-Entity Linking, Linked Data, Digital Humanities}, URL = {https://iris.cnr.it/handle/20.500.14243/340774}, DOI = {10.1145/2837689.2837699}, ISBN = {978-1-4503-3937-7}, CONFERENCE_NAME = {GIR'15 9th Workshop on Geographic Information Retrieval}, BOOKTITLE = {GIR '15 Proceedings of the 9th Workshop on Geographic Information Retrieval}, EDITOR = {Purves, R. S. and Jones, C. B.}, } @INPROCEEDINGS{DELGRATTA_2015_INPROCEEDINGS_DFMPRBGKQSC_307390, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Pardelli, G. and Russo, I. and Bartolini, R. and Goggi, S. and Khan, F. and Quochi, V. and Soria, C. 
and Calzolari, N.}, TITLE = {Visualising Italian Language Resources: a Snapshot}, YEAR = {2015}, ABSTRACT = {This paper aims to provide a first snapshot of Italian Language Resources (LRs) and their uses by the community, as documented by the papers presented at two different conferences, LREC2014 and CLiC-it 2014. The data of the former were drawn from the LOD version of the LRE Map, while those of the latter come from manually analyzing the proceedings. The results are presented in the form of visual graphs and confirm the initial hypothesis that Italian LRs require concrete actions to enhance their visibility}, KEYWORDS = {Italian Language Resources}, PAGES = {100-104}, URL = {https://books.openedition.org/aaccademia/1277?lang=it}, ISBN = {978-88-99200-62-6}, CONFERENCE_NAME = {Second Italian Conference on Computational Linguistics CLiC-it 2015}, BOOKTITLE = {Proceedings of the Second Italian Conference on Computational Linguistics CLiC-it 2015}, EDITOR = {Bosco, C. and Tonelli, S. and Zanzotto, F. M.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FAG_276113, AUTHOR = {Frontini, F. and Amine Boukhaled, M. and Ganascia, J.}, TITLE = {Linguistic Pattern Extraction and Analysis for Classic French Plays}, YEAR = {2015}, ABSTRACT = {Great authors of fiction and theatre have the capacity of creating memorable characters that take life and become almost as real as living persons to the readers/audience. The study of characterization, namely of how this is achieved, is a well-researched topic in corpus stylistics: for instance (Mahlberg, 2012) attempts to identify typical lexical patterns for memorable Dickens' characters by extracting those lexical bundles that stand out (namely are overrepresented) in comparison to a general corpus. In other works, authorship attribution methods are applied to the different characters of a play to identify whether the author has been able to provide each of them with a "distinct" voice. 
For instance (Vogel \& Lynch, 2008) compare individual Shakespeare characters against the whole play or even against all plays of the same author. The purpose of this paper is to propose a methodology for the study of characterization of several characters in French plays of the classical period. The tools developed are meant to support textual analysis by: 1) Verifying the degree of characterization of each character with respect to others. 2) Automatically inducing a list of linguistic features that are significant, representative for that character. Preliminary investigations have been conducted on plays by Moliere, cross-comparing four protagonists from four different plays. The proposed methodology relies on sequential data mining for the extraction of linguistic patterns and on correspondence analysis for comparison of patterns frequencies in each character and for the visual representation of such differences}, KEYWORDS = {computational stylometry, theater, sequential pattern mining}, PAGES = {3}, URL = {http://lipn.univ-paris13.fr/~charnois/conscilaGenres/resumes/frontini.pdf}, CONFERENCE_NAME = {Journée ConSciLa (Confrontations en Sciences du Langage) Grammaire des genres et des styles: quelles approches privilégier ?}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_290872, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J.}, TITLE = {Semantic Web based Named Entity Linking for Digital Humanities and Heritage Texts}, YEAR = {2015}, ABSTRACT = {This paper proposes a graph based methodology for automatically disambiguating authors' mentions in a corpus of French literary criticism. Candidate referents are identified and evaluated using a graph based named entity linking algorithm, which exploits a knowledge-base built out of two different resources (DBpedia and the BnF linked data). The algorithm expands previous ones applied for word sense disambiguation and entity linking, with good results. 
Its novelty resides in the fact that it successfully combines a generic knowledge base such as DBpedia with a domain specific one, thus enabling the efficient annotation of minor authors. This will help specialists to follow mentions of the same author in different works of literary criticism, and thus to investigate their literary appreciation over time}, KEYWORDS = {named-entity linking, linked data, digital humanities}, PAGES = {77-88}, URL = {http://ceur-ws.org/Vol-1364/paper9.pdf}, VOLUME = {VOL-1364}, CONFERENCE_NAME = {SW4SH 2015 Semantic Web for Scientific Heritage 2015}, BOOKTITLE = {SW4SH 2015 Semantic Web for Scientific Heritage 2015}, EDITOR = {Zucker, A. and Draelants, I. and Zucker, C. F. and Monnin, A.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_295464, AUTHOR = {Frontini, F. and Brando, C. and Ganascia, J.}, TITLE = {Domain-adapted named-entity linker using Linked Data}, YEAR = {2015}, ABSTRACT = {We present REDEN, a tool for graph-based Named Entity Linking that allows for the disambiguation of entities using domain-specific Linked Data sources and different configurations (e. g. context size). It takes TEI-annotated texts as input and outputs them enriched with external references (URIs). 
The possibility of customizing indexes built from various knowledge sources by defining temporal and spatial extents makes REDEN particularly suited to handle domain-specific corpora such as enriched digital editions in the Digital Humanities}, KEYWORDS = {named-entity disambiguation, evaluation, linked data, digital humanities}, PAGES = {10}, URL = {http://ceur-ws.org/Vol-1386/named_entity.pdf}, VOLUME = {VOL-1386}, CONFERENCE_NAME = {Workshop on NLP Applications: Completing the Puzzle co-located with the 20th International Conference on Applications of Natural Language to Information Systems (NLDB 2015)}, BOOKTITLE = {Proceedings of the Workshop on NLP Applications: Completing the Puzzle}, EDITOR = {Izquierdo, R.}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FQM_267184, AUTHOR = {Frontini, F. and Quochi, V. and Monachini, M.}, TITLE = {Generative Lexicon and polysemy: inducing logical alternations}, YEAR = {2015}, ABSTRACT = {The current paper brings together the results of a series of experiments for inducing regular sense alternations, or regular/ logical polysemy, from a computational lexicon based on the Generative Lexicon theory. The results are discussed in light of the potential benefits and uses of the amended algorithm}, KEYWORDS = {Polysemy, Generative Lexicon, Logical Alternations}, PAGES = {7}, URL = {https://iris.cnr.it/handle/20.500.14243/267184}, PUBLISHER = {MAPLEX2015 Multiple Approaches to Lexicon Conference (Yamagata, JPN)}, CONFERENCE_NAME = {MAPLEX2015 Multiple Approaches to Lexicon Conference}, CONFERENCE_PLACE = {Yamagata}, EDITOR = {Hsieh, S. K. and Kanzaki, K.}, } @INPROCEEDINGS{GOGGI_2015_INPROCEEDINGS_GMFBPDBM_290971, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. 
and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS): An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2015}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting a Marine Information and Knowledge System, as part of the data management activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. We will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers}, KEYWORDS = {Marine Science, Search Engine, Source Data, Oceanography}, PAGES = {108-114}, URL = {http://www.textrelease.com/gl16program.html}, PUBLISHER = {TextRelease (Amsterdam, NLD)}, ISSN = {1386-2316}, ISBN = {978-90-77484-23-4}, CONFERENCE_NAME = {Sixteenth International Conference on Grey Literature Grey Literature Lobby: Engines and Requesters for Change}, CONFERENCE_PLACE = {Amsterdam}, BOOKTITLE = {THE GL-CONFERENCE SERIES. CONFERENCE PROCEEDINGS}, EDITOR = {Farace, D. and Frantzen, J.}, } @INPROCEEDINGS{KHAN_2015_INPROCEEDINGS_KF_295959, AUTHOR = {Khan, F. 
and Frontini, F.}, TITLE = {Using Ontologies to Model Polysemy in Lexical Resources}, YEAR = {2015}, ABSTRACT = {In this article we look at how the use of ontologies can assist in analysing polysemy in natural languages. We develop a model, the Lexical-Sense-Ontology model (LSO), to represent the interaction between a lexicon and ontology, based on lemon. We use the LSO model to show how default rules can be used to represent semi-productivity in polysemy as well as discussing the kinds of ontological information that are useful for studying polysemy}, KEYWORDS = {Polysemy, Ontology, Default Logic}, URL = {http://www.aclweb.org/anthology/W/W15/W15-0404.pdf}, CONFERENCE_NAME = {Workshop on Language and Ontologies}, BOOKTITLE = {Proceedings of the Workshop on Language and Ontologies}, } @INPROCEEDINGS{BOSCHETTI_2015_INPROCEEDINGS_BDFKM_305309, AUTHOR = {Boschetti, F. and Del Gratta, R. and Frontini, F. and Khan, A. F. and Monachini, M.}, TITLE = {Strumenti, Risorse e Linguistic Linked Open Data per le lingue antiche}, YEAR = {2015}, ABSTRACT = {Strumenti e metodi dell'Informatica Umanistica hanno portato e portano ad una ridefinizione di processi teorici, metodologici e tecnici, fino a una vera e propria ri-concettualizzazione dei saperi nell'ambito dei beni culturali. L'Istituto di Linguistica Computazionale è attivo con varie iniziative sul fronte delle Digital Humanities per la creazione di strumenti e risorse linguistiche per il mondo classico. La direzione intrapresa si inserisce nel paradigma che si va consolidando nel settore delle tecnologie del linguaggio e che prevede la fruizione di servizi linguistici attraverso infrastrutture di ricerca, secondo un modello già operativo per le lingue moderne. 
Tale paradigma è in connessione con l'emergere degli standard e dei formati del web semantico per le tecnologie del linguaggio e per la pubblicazione di dati linguistici}, URL = {https://iris.cnr.it/handle/20.500.14243/305309}, } @INPROCEEDINGS{FRONTINI_2015_INPROCEEDINGS_FBG_289592, AUTHOR = {Frontini, F. and Boukhaled, M. A. and Ganascia, J. G.}, TITLE = {Moliere's Raisonneurs: a quantitative study of distinctive linguistic patterns}, YEAR = {2015}, KEYWORDS = {Computational Stylistics, Correspondence analysis, Corpus linguistics, Molière}, PAGES = {114-117}, URL = {http://ucrel.lancs.ac.uk/cl2015/doc/CL2015-AbstractBook.pdf}, CONFERENCE_NAME = {Corpus Linguistics 2015}, BOOKTITLE = {Corpus Linguistics 2015-Abstract Book}, EDITOR = {Formato, F. and Hardie, A.}, } @INPROCEEDINGS{GOGGI_2015_INPROCEEDINGS_GPBFMMDB_307398, AUTHOR = {Goggi, S. and Pardelli, G. and Bartolini, R. and Frontini, F. and Monachini, M. and Manzella, G. and De Mattei, M. and Bustaffa, F.}, TITLE = {A semantic engine for grey literature retrieval in the oceanography domain}, YEAR = {2015}, ABSTRACT = {Here we present the final results of MAPS (Marine Planning and Service Platform), an environment designed for gathering, classifying, managing and accessing marine scientific literature and data, making it available for search to Operative Oceanography researchers of various institutions by means of standard protocols. In previous publications the general architecture of the system as well as the set of metadata (Common Data Index) used to describe the documents were presented [3]; it was shown how individual oceanographic data-sets could be indexed within the MAPS library by types of measure, measurement tools, geographic areas, and also linked to specific textual documentation. 
Documentation is described using the current international standards: Title, Authors, Publisher, Language, Date of publication, Body/Institution, Abstract, etc.; serial publications are described in terms of ISSN, while books are assigned ISBN; content of various types on electronic networks is described by means of doi and url. Each description is linked to the document. Thanks to this, the MAPS library already enables researchers to go from structured oceanographic data to documents describing it. But this was not enough: documents may contain important information that has not been encoded in the metadata. Thus an advanced Search Engine was put in place that uses semantic-conceptual technologies in order to extract key concepts from unstructured text such as technical documents (reports and grey literature) and scientific papers and to make them indexable and searchable by the end user in the same way as the structured data (such as oceanographic observations and metadata) is. More specifically once a document is uploaded in the MAPS library, key domain concepts in documents are extracted via a natural language processing pipeline and used as additional information for its indexing. The key term identification algorithm is based on marine concepts that were pre-defined in a domain ontology, but crucially it also allows for the discovery of new related concepts. So for instance starting from the domain term salinity, related terms such as sea salinity and average sea salinity will also be identified as key terms and used for indexing and searching documents. A hybrid search system is then put in place, where users can search the library by metadata or by free text queries. In the latter case, the NLP pipeline performs an analysis of the text of the query, and when key concepts are matched, the relevant documents are presented. The results may be later refined by using other structured information (e. g. date of publication, area,.). 
Currently a running system has been put in place, with data from satellites, buoys and sea stations; such data is documented and searchable by its relevant metadata and documentation. Results of quantitative evaluation in terms of information retrieval measures will be presented in the poster; more specifically, given an evaluation set defined by domain experts and composed of pre-defined queries together with documents that answer such queries, it will be shown how the system is highly accurate in retrieving the correct documents from the library. Though this work focuses on oceanography, its results may be easily extended to other domains; more generally, the possibility of enhancing the visibility and accessibility of grey literature via its connection to the data it describes and to an advanced full text indexing are of great relevance for the topic of this conference}, KEYWORDS = {Information Extraction, Search Engine, Oceanography}, PAGES = {76-77}, URL = {https://iris.cnr.it/handle/20.500.14243/307398}, ISBN = {978-90-77484-26-5}, CONFERENCE_NAME = {Seventeenth International Conference on Grey Literature. A New Wave of Textual and Non-Textual Grey Literature}, BOOKTITLE = {GL17 Program Book}, EDITOR = {Farace, D. and Frantzen, J.}, } @MISC{BRANDO_2015_MISC_BFAG_300554, AUTHOR = {Brando, C. and Frontini, F. and Abi Haidar, A. and Ganascia, J.}, TITLE = {Reconnaissance d'entités nommées: adaptation au domaine de la littérature française du XIXe siècle}, YEAR = {2015}, ABSTRACT = {La reconnaissance d'entités nommées (REN) est un enjeu fondamental pour la recherche en humanités numériques (HN). En littérature française, il est particulièrement important de repérer des entités telles que les auteurs, les personnages fictifs, les lieux géographiques et imaginaires, les titres d'ouvrages, les marqueurs temporels, entre autres. Actuellement, il existe peu de corpus de littérature française du passé annotés et disponibles en ligne. 
Le coût élevé de l'annotation manuelle motive donc l'utilisation de méthodes automatiques. Les approches REN de l'état de l'art fonctionnent efficacement sur des corpus journalistique et de littérature scientifique en biologie [1]. Néanmoins, l'adaptation à un nouveau domaine semble affecter négativement la performance de ces approches [5]. La diversité des textes en littérature (fiction, critique, théâtre.) et la spécificité des époques prises en compte représentent un travail considérable d'adaptation des ressources linguistiques et des algorithmes à un domaine particulier. En général, les thèmes traités sont hétérogènes et les textes possèdent un style fréquemment caractérisé par un bas degré de standardisation et de prédictibilité. Il est par exemple difficile d'identifier des mentions candidates car les conventions typographiques et le registre linguistique varient selon le domaine (textes journalistiques vs. littérature française)}, KEYWORDS = {entités nommées, littérature française}, URL = {https://iris.cnr.it/handle/20.500.14243/300554}, CONFERENCE_NAME = {8es Journées Internationales de Linguistique de Corpus (JLC2015)}, } @MISC{FRONTINI_2015_MISC_F_296549, AUTHOR = {Frontini, F.}, TITLE = {Mining for characterising patterns in literature using correspondence analysis: an experiment on French novels}, YEAR = {2015}, ABSTRACT = {The talk presents and describes a bottom up methodology for the detection of stylistic traits in the syntax of literary texts. 
The extraction of syntactic patterns is performed blindly by a sequential pattern mining algorithm, while the identification of significant and interesting features is performed later by using correspondence analysis and filtering for the most contributive patterns}, KEYWORDS = {computational stylistics, French}, URL = {https://iris.cnr.it/handle/20.500.14243/296549}, CONFERENCE_NAME = {Göttingen Dialog in Digital Humanities}, } @MISC{FRONTINI_2015_MISC_F_300594, AUTHOR = {Frontini, F.}, TITLE = {Trattamento automatico del linguaggio per le Digital Humanities. Riconoscimento e disambiguazione di menzioni di autori in testi di critica letteraria}, YEAR = {2015}, ABSTRACT = {L'intervento scaturisce da una collaborazione tra ILC-CNR e il Labex OBVIL di Parigi. Lo scopo del progetto è quello di adattare ed estendere algoritmi di riconoscimento, classificazione e disambiguazione di entità nominate (in particolare menzioni di autori) nel "Corpus Critique", un insieme di testi di critica letteraria francese che il Labex OBVIL sta pubblicando in edizione digitale (formato TEI). Tali algoritmi si basano su approcci TAL supervisionati e non supervisionati e sfruttano massicciamente le basi di conoscenza, sia generiche (DBpedia) che di dominio, disponibili online sotto forma di linked data; lo scopo di tali lavori è di produrre risorse testuali annotate per facilitare la ricerca nell'ambito della storia della critica letteraria e della storia delle idee in generale. Durante il seminario verranno introdotti i formati e le risorse utilizzate, i criteri e le problematiche di annotazione emersi, e gli algoritmi riconoscimento e disambiguazione di entità nominate sviluppati. Più in generale si cercherà di mostrare con alcuni casi di utilizzo quali siano i vantaggi di arricchire risorse testuali con questo livello di annotazione, nel più ampio contesto delle convergenze tra digital humanities e trattamento automatico del linguaggio. Link http: //obvil. paris-sorbonne. 
fr/ https: //github. com/cvbrandoe/REDEN/blob/master/README. md}, KEYWORDS = {Named-entity disambiguation Centrality Linked data Data fusion Digital humanities}, URL = {https://iris.cnr.it/handle/20.500.14243/300594}, CONFERENCE_NAME = {Seminario di Cultura Digitale}, } @MISC{FRONTINI_2015_MISC_F_295960, AUTHOR = {Frontini, F.}, TITLE = {Analyse et extraction des motifs syntaxiques dans la prose de Robert Challe et de ses apocryphes}, YEAR = {2015}, ABSTRACT = {Cette contribution presente une extraction et une analyse des motifs syntaxiques dans la prose de Robert Challe et de ses apocryphes. En particulier nous analysons les différence dans la syntaxe des contes originaux des Illustres Françaises et celle des contes apocryphes}, KEYWORDS = {Robert Challe, authorship attribution, stilistica computazionale}, URL = {http://obvil.paris-sorbonne.fr/sites/default/files/projets/analyse_motifs_syntaxiques_if_et_apocryphes.pdf}, CONFERENCE_NAME = {Robert Challe: approches numériques des questions d'auctorialité}, } @MISC{FRONTINI_2015_MISC_F_289092, AUTHOR = {Frontini, F.}, TITLE = {What makes them different: the extraction of distinctive linguistic patterns for the protagonists of Molière's plays}, YEAR = {2015}, ABSTRACT = {Quantitative approaches to the study of style in literature are far from a modern novelty. They have however recently gained more and more popularity, not only among computer scientists and corpus linguistics, but also among some influential literary critics. The present panorama of quantitative techniques is very rich, but often confusing, with a plethora of denominations and methodologies often difficult to reconcile; computer scientists classify their work as stylometry or computational stylistics, while linguists may use the label corpus stylistics, and finally critics like Franco Moretti will talk about macro-analysis and distant reading. 
This talk will try first to identify the differences between these trends, distinguishing between corpus based and corpus driven approaches on the methodological side (Quiniou et al 2012), and (following Ramsey 2011) between experimental and hermeneutical approaches. Finally we will present ongoing work conducted at Labex OBVIL on syntactic pattern extraction from theatrical characters. The proposed approach, using correspondence analysis to extract distinctive traits for each character, is imagined rather as an hermeneutical tool, in the sense that it does not seek to demonstrate that two different characters have been endowed with significantly different stylistic traits by the playwright, but it does enable the visualisation of their relative distances and the extraction of those elements that make them distinct}, URL = {https://iris.cnr.it/handle/20.500.14243/289092}, CONFERENCE_NAME = {Cycle des séminaires ILES LIMSI}, } @MISC{FRONTINI_2015_MISC_F_295465, AUTHOR = {Frontini, F.}, TITLE = {Indexing names in digital editions}, YEAR = {2015}, ABSTRACT = {This presentation outlines the work done on Named Entity Recognition and Linking in texts of French Literary criticism, underlying the points of interest for what concerns the creation of enriched digital editions}, URL = {https://iris.cnr.it/handle/20.500.14243/295465}, } @INPROCEEDINGS{DELGRATTA_2014_INPROCEEDINGS_DFKMS_257904, AUTHOR = {Del Gratta, R. and Frontini, F. and Khan, F. and Mariani, J. and Soria, C.}, TITLE = {The LREMap for Under-Resourced Languages}, YEAR = {2014}, ABSTRACT = {A complete picture of currently available language resources and technologies for the under-resourced languages of Europe is still lacking. Yet this would help policy makers, researchers and developers enormously in planning a roadmap for providing all languages with the necessary instruments to act as fully equipped languages in the digital era. 
In this paper we introduce the LRE Map and show its utility for documenting available language resources and technologies for under-resourced languages. The importance of the serialization of the LREMap into (L)LOD along with the possibility of its connection to a wider world is also introduced}, KEYWORDS = {language resources, less-resourced languages, linguistic linked open data}, PAGES = {78-83}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, CONFERENCE_NAME = {Workshop on Collaboration and Computing for Under-Resourced Languages in the Linked Open Data Era (CCURL 2014)}, BOOKTITLE = {Proceedings of the Workshop on Collaboration and Computing for Under-Resourced Languages in the Linked Open Data Era (CCURL 2014)}, EDITOR = {Pretorius, L. and Soria, C. and Baroni, P.}, } @INPROCEEDINGS{FRONTINI_2014_INPROCEEDINGS_FQM_259129, AUTHOR = {Frontini, F. and Quochi, V. and Monachini, M.}, TITLE = {Polysemy alternations extraction using the PAROLE SIMPLE CLIPS Italian lexicon}, YEAR = {2014}, ABSTRACT = {This paper presents the results of an experiment of polysemy alternations induction from a lexicon (Utt and Pad´o, 2011; Frontini et al., 2014), discussing the results and proposing an amendment in the original algorithm}, KEYWORDS = {Language Resources and Technologies}, PAGES = {175-179}, URL = {http://clic.humnet.unipi.it/proceedings/Proceedings-CLICit-2014.pdf}, DOI = {10.12871/CLICIT2014134}, PUBLISHER = {Pisa University Press srl (Pisa, ITA)}, ISBN = {978-88-67-41472-7}, CONFERENCE_NAME = {Proceedings of the First Italian Conference on Computational Linguistics CLiC-it 2014 \& the Fourth International Workshop EVALITA 2014}, CONFERENCE_PLACE = {Pisa}, EDITOR = {Basili, R. and Lenci, A. and Magnini, B.}, } @INPROCEEDINGS{FRONTINI_2014_INPROCEEDINGS_FQPUM_222781, AUTHOR = {Frontini, F. and Quochi, V. and Padó, S. and Utt, J. 
and Monachini, M.}, TITLE = {Polysemy Index for Nouns: an Experiment on Italian using the PAROLE SIMPLE CLIPS Lexical Database}, YEAR = {2014}, ABSTRACT = {An experiment is presented to induce a set of polysemous basic type alternations (such as ANIMAL-FOOD, or BUILDING-INSTITUTION) by deriving them from the sense alternations found in an existing lexical resource. The paper builds on previous work and applies those results to the Italian lexicon PAROLE SIMPLE CLIPS. The new results show how the set of frequent type alternations that can be induced from the lexicon is partly different from the set of polysemy relations selected and explicitly applied by lexicographers when building it. The analysis of mismatches shows that frequent type alternations do not always correspond to prototypical polysemy relations, nevertheless the proposed methodology represents a useful tool offered to lexicographers to systematically check for possible gaps in their resource}, KEYWORDS = {Polysemy, lexical resources, semantics}, PAGES = {2955-2963}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {LREC 2014 Ninth International Conference on Language Resources and Evaluation Proceedings}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{KHAN_2014_INPROCEEDINGS_KBF_259370, AUTHOR = {Khan, F. and Boschetti, F. and Frontini, F.}, TITLE = {Using lemon to Model Lexical Semantic  Shift in Diachronic Lexical Resources}, YEAR = {2014}, ABSTRACT = {In this paper we propose a model, called lemonDIA, for representing lexical semantic change using the lemon framework and based on the ontological notion of the perdurant. 
Namely we extend the notion of sense in lemon by adding a temporal dimension and then define a class of perdurant entities that represents a shift in meaning of a word and which contains different related senses. We start by discussing the general problem of semantic shift and the utility of being able to easily access and represent such information in diachronic lexical resources. We then describe our model and illustrate it with examples}, KEYWORDS = {lemon, linked data, OWL, ontologies, perdurants, semantic shift}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/workshops/LREC2014Workshop-LDL2014%20Proceedings.pdf}, CONFERENCE_NAME = {3rd Workshop on Linked Data in Linguistics: Multilingual Knowledge Resources and Natural Language Processing (LDL2014)}, BOOKTITLE = {Proceedings of the 3rd Workshop on Linked Data in Linguistics (LDL-2014)}, EDITOR = {Chiarcos, C. and McCrae, J. P. and Osenova, P. and Vertan, C.}, } @INPROCEEDINGS{MONEGLIA_2014_INPROCEEDINGS_MBFGKMP_222787, AUTHOR = {Moneglia, M. and Brown, S. and Frontini, F. and Gagliardi, G. and Khan, F. and Monachini, M. and Panunzi, A.}, TITLE = {The IMAGACT Visual Ontology. an Extendable Multilingual Infrastructure for the Representation of Lexical Encoding of Action}, YEAR = {2014}, ABSTRACT = {Action verbs have many meanings, covering actions in different ontological types. Moreover, each language categorizes action in its own way. One verb can refer to many different actions and one action can be identified by more than one verb. The range of variations within and across languages is largely unknown, causing trouble for natural language processing tasks. IMAGACT is a corpus-based ontology of action concepts, derived from English and Italian spontaneous speech corpora, which makes use of the universal language of images to identify the different action types extended by verbs referring to action in English, Italian, Chinese and Spanish. 
This paper presents the infrastructure and the various linguistic information the user can derive from it. IMAGACT makes explicit the variation of meaning of action verbs within one language and allows comparisons of verb variations within and across languages. Because the action concepts are represented with videos, extension into new languages beyond those presently implemented in IMAGACT is done using competence-based judgments by mother-tongue informants without intense lexicographic work involving underdetermined semantic description}, KEYWORDS = {Lexicon, Lexical Database, Ontologies}, PAGES = {3425-3432}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Paris}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{PALLOTTI_2014_INPROCEEDINGS_PFAMF_222825, AUTHOR = {Pallotti, G. and Frontini, F. and Affè, F. and Monachini, M. and Ferrari, S.}, TITLE = {Presenting a System of Human-Machine Interaction for Performing Map Tasks}, YEAR = {2014}, ABSTRACT = {A system for human machine interaction is presented, that offers second language learners of Italian the possibility of assessing their competence by performing a map task, namely by guiding the a virtual follower through a map with written instructions in natural language. 
The underlying natural language processing algorithm is described, and the map authoring infrastructure is presented}, KEYWORDS = {Language learning, human machine interaction, map tasks}, PAGES = {3963-3966}, URL = {http://www.lrec-conf.org/proceedings/lrec2014/index.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {9th International Conference on Language Resources and Evaluation, LREC 2014}, CONFERENCE_PLACE = {Paris}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Loftsson, H. and Maegaard, B. and Mariani, J. and Moreno, A. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{GOGGI_2014_INPROCEEDINGS_GMFBPDBM_265502, AUTHOR = {Goggi, S. and Monachini, M. and Frontini, F. and Bartolini, R. and Pardelli, G. and De Mattei, M. and Bustaffa, F. and Manzella, G.}, TITLE = {Marine Planning and Service Platform (MAPS): An Advanced Research Engine for Grey Literature in Marine Science}, YEAR = {2014}, ABSTRACT = {The MAPS (Marine Planning and Service Platform) project is a development of the Marine project (Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013) aiming at building a computer platform for supporting Operative Oceanography in its activities. One of the main objective of the project is to develop a repository that should gather, classify and structure marine scientific literature and data thus guaranteeing their accessibility to researchers and institutions by means of standard protocols. Community and Requirements. Operative Oceanography is the branch of marine research which deals with the development of integrated systems for examining and modeling the ocean monitoring and forecast. Experts need access to real-time data on the state of the sea such as forecasts on temperatures, streams, tides and the relevant scientific literature. 
This finds application in many areas, ranging from civilian and military safety to protection of off-shore and coastal infrastructures. The metadata. The set of metadata associated with marine data is defined in the CDI (Common Data Index) documented standard. They encode: the types of sizes which have been measured; the measurement tools the platform which has been employed; the geographic area where measures have been taken; the environmental matrix; the descriptive documentation. As concerns the scientific documentation, at the current stage of the CDI standard, a document is shaped around the following metadata: Title, Authors, Version, ISBN/DOI, Topic, Date of publication, Body/Institution, Abstract. The search engine. The query system (which is actually under development) has been designed for operating with structured data-the metadata-and raw data-the associated technical and scientific documentation. Full-text technologies are often unsuccessful when applied to this type of queries since they assume the presence of specific keywords in the text; in order to fix this problem, the MAPS project suggests to use different semantic technologies for retrieving the text and data and thus getting much more complying results. In the Poster we will present the scenario of the Operative Oceanography together with the technologies used to develop an advanced search engine which aims at providing rapid and efficient access to a Digital Library of oceanographic data. 
The case-study is also highlighting how the retrieval of grey literature from this specific marine community could be reproduced for similar communities as well, thus revealing the great impact that the processing, re-use as well as application of grey data have on societal needs/problems and their answers}, KEYWORDS = {Marine Science, Search Engine, Source Data, Oceanography}, PAGES = {93-94}, URL = {http://greyguide.isti.cnr.it/dfdownloadnew.php?ident=GLConference/GL16/2014-G01-015\&langver=en\&scelta=Metadata}, ISBN = {978-90-77484-24-1}, CONFERENCE_NAME = {Sixteenth International Conference on Grey Literature Grey Literature Lobby: Engines and Requesters for Change}, EDITOR = {Farace, C. B. D. and Frantzen, J.}, } @TECHREPORT{DEMATTEI_2014_TECHREPORT_DMDMBF_276258, AUTHOR = {De Mattei, M. and Medone, D. and D'Angelo, P. and Monachini, M. and Bartolini, R. and Frontini, F.}, TITLE = {MAPS: Architettura del Sistema}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1. 2. 2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il presente documento è il deliverable "D3. 1-Architettura del Sistema" del progetto MAPS (Marine Planning and Service Platform). Il progetto MAPS è un'evoluzione del progetto precedente Marine. Tale evoluzione si articola su tre aspetti diversi:-Un meccanismo di federazione dei dati, che consenta di rendere disponibili ai propri utenti non soltanto i dati prodotti internamente da sistema Marine ma anche quelli resi disponibili da altri sistemi similari, soddisfacendo così un più ampio ambito di esigenze informative. Il deliverable D2. 
2, Modello della Soluzione specifica in dettaglio queste nuove funzionalità.-Un Catalogo dei Documenti che, conservando la documentazione tecnica e scientifica dei prodotti offerti, possa documentare in modo accurato le modalità di misurazione, elaborazione e controllo dei prodotti forniti e quindi i relativi ambiti di applicabilità.-Un sistema di ricerca capace di selezionare i dati necessari ad uno scopo determinato non soltanto sulla base della loro tipologia, della loro dislocazione territoriale o di altre informazioni simili contenute nei metadati associati come avviene oggi nella maggior parte dei sistemi esistenti, ma anche sulla base delle informazioni contenute nella documentazione tecnica e scientifica. Tali funzionalità sono specificate nel deliverable D1. 3-Modello della Soluzione}, KEYWORDS = {Marine Science, Search Engine, Source Data, Oceanography}, PAGES = {1-35}, URL = {https://iris.cnr.it/handle/20.500.14243/276258}, } @TECHREPORT{DEMATTEI_2014_TECHREPORT_DMMFBM_276262, AUTHOR = {De Mattei, M. and Medone, D. and Maltese, M. and Frontini, F. and Bartolini, R. and Monachini, M.}, TITLE = {META: Report di progettazione degli algoritmi individuati}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1. 2. 2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il deliverable definisce l'architettura del Sistema di Estrazione Eventi Meteo realizzato dagli autori nell'ambito del progetto META. Il sistema estrae da contenuti online informazione su eventi meteo critici verificatesi in Liguria e nel nord della Toscana}, KEYWORDS = {Ontology, Information Extraction, Taxonomy}, PAGES = {1-19}, URL = {https://iris.cnr.it/handle/20.500.14243/276262}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBM_276259, AUTHOR = {Frontini, F. and Bartolini, R. 
and Monachini, M.}, TITLE = {MAPS: Stato dell'Arte}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1. 2. 2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012 Il documento descrive lo stato dell'arte delle tecnologie linguistiche applicate ai sistemi di ricerca semantica}, KEYWORDS = {Marine Science, Search Engine, Source Data, Oceanography}, PAGES = {1-21}, URL = {https://iris.cnr.it/handle/20.500.14243/276259}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBM_276261, AUTHOR = {Frontini, F. and Bartolini, R. and Monachini, M.}, TITLE = {META:-Report sui modelli e tecniche linguistiche}, YEAR = {2014}, ABSTRACT = {PROGRAMMA OPERATIVO REGIONALE POR-FESR (2007-2013) Asse 1 Innovazione e Competitività Bando DLTM Azione 1. 2. 2 "Ricerca industriale e sviluppo sperimentale a favore delle imprese del Distretto Ligure per le Tecnologie Marine (DLTM) anno 2012. Il deliverable riassume lo stato dell'arte delle tecnologie semantiche che possono essere impiegate nella realizzazione del progetto META. Il progetto META è un progetto di ricerca e sviluppo tecnologico finanziato dalla Regione Liguria con i fondi POR-FESR 2007-2013 della Comunità Europea che mira alla realizzazione di un sistema per l'allerta di eventi meteo critici in Liguria e nel nord della Toscana. Nell'ambito del progetto META le tecnologie semantiche sono utilizzate per estrarre eventi meteo di interesse da articoli pubblicati in rete o sui social network}, KEYWORDS = {Ontology, Information Extraction, Semantic Web, Search Engine}, PAGES = {1-20}, URL = {https://iris.cnr.it/handle/20.500.14243/276261}, } @TECHREPORT{FRONTINI_2014_TECHREPORT_FBMPG_222835, AUTHOR = {Frontini, F. and Bartolini, R. and Monachini, M. and Pardelli, G. and Goggi, S.}, TITLE = {Stato dell'arte dei motori semantici. 
Progetto MAPS, programma operativo regionale POR-FESR (2007-2013)}, YEAR = {2014}, ABSTRACT = {Il presente documento è il deliverable "D1. 1-Stato dell'Arte dei motori semantici del progetto MAPS (Marine Planning and Service Platform). Il progetto MAPS è una evoluzione del progetto precedente Marine. Tramite il progetto Marine (Bando Ricerca Industriale e Sviluppo Sperimentale Regione Liguria 2007-2013-pos n. 1) è stata realizzata una piattaforma informatica di supporto all'Oceanografia Operativa capace di raccogliere dati marini per renderli poi disponibili ai ricercatori e alle organizzazioni interessate tramite protocolli standard. Lo scopo del progetto MAPS è quello di realizzare una Catalogo di Documenti contenente informazioni per la piattaforma Marine. Caratteristica di MAPS è di fornire accesso ai dati oceanografici sia attraverso la ricerca per metadati, sia attraverso la ricerca semantica contenuta nella manualistica tecnico scientifica di riferimento}, PAGES = {1-22}, URL = {https://iris.cnr.it/handle/20.500.14243/222835}, } @MISC{FRONTINI_2014_MISC_F_286128, AUTHOR = {Frontini, F.}, TITLE = {La mappa delle opinioni e dei sentimenti estratte dai social media}, YEAR = {2014}, URL = {https://iris.cnr.it/handle/20.500.14243/286128}, CONFERENCE_NAME = {Seminario rivolto agli alunni dell'Istituto Tecnico Economico "F. Carrara" di Lucca, organizzato dall'Istituto di Linguistica Computazionale "A. Zampolli" del CNR di Pisa}, } @MISC{KHAN_2014_MISC_KFM_262584, AUTHOR = {Khan, F. and Frontini, F. and Monachini, M.}, TITLE = {A Model for Representing Diachronic Semantic Information in Lexico-Semantic Resources on the Semantic Web}, YEAR = {2014}, ABSTRACT = {The Semantic Web offers a way of publishing structured data online that facilitates the interlinking of different datasets stored at different online locations? 
indeed one of the main aims of the Semantic Web movement is to actively encourage this enrichment of online datasets with information from other resources, in order to avoid the problem of so called 'data islands'. In contrast to conventional hyperlinks however the links between different resources on the Semantic Web can be given semantic types and classified hierarchically. Data published on the Semantic Web is referred to as Linked Data? if, in addition, this data is available with an open license then it can be referred to as Linked Open Data (Heath 2011)}, KEYWORDS = {Cultural resources, Heritage resources}, PAGES = {1-3}, URL = {http://www.dh.uni-leipzig.de/wo/wp-content/uploads/2014/11/Fahad-Khan-Francesca-Frontini-and-Monica-Monachini-A-Model-for-Representing.pdf}, CONFERENCE_NAME = {Greek and Latin in an age of Open Data. Open Philology Project}, } @INPROCEEDINGS{FRONTINI_2013_INPROCEEDINGS_FDM_226376, AUTHOR = {Frontini, F. and Del Gratta, R. and Monachini, M.}, TITLE = {Linking the Geonames ontology to WordNet}, YEAR = {2013}, ABSTRACT = {This paper illustrates the transformation of the GeoNames ontology concepts, with their English labels and glosses, into a GeoDomain WordNet-like resource in English, its translation into Italian, and its linking to the existing generic WordNets of both languages}, KEYWORDS = {GeoNames, WordNet, lemon}, PAGES = {263-267}, URL = {http://hnk.ffzg.hr/bibl/ltc2013/book/papers/OWN-2.pdf}, PUBLISHER = {Fundacja Uniwersytetu im A. Mickiewicza (Poznan, POL)}, ISBN = {978-2-9517408-8-4}, CONFERENCE_NAME = {6th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics}, CONFERENCE_PLACE = {Poznan}, BOOKTITLE = {Human Language Technologies as a Challenge for Computer Science and Linguistics. Proceedings, 6th Language \& Technology Conference, December 7-9, 2013, Poznañ, Poland}, EDITOR = {Vetulani, Z. 
and Uszkoreit, H.}, } @INPROCEEDINGS{KHAN_2013_INPROCEEDINGS_KFDMQ_259365, AUTHOR = {Khan, F. and Frontini, F. and Del Gratta, R. and Monachini, M. and Quochi, V.}, TITLE = {Generative Lexicon Theory and Linguistic Linked Open Data}, YEAR = {2013}, ABSTRACT = {In this paper we look at how Generative Lexicon theory can assist in providing a more thorough definition of word senses as links between items in a RDF-based lexicon and concepts in an ontology. We focus on the definition of lexical sense in lemon and show its limitations before defining a new model based on lemon and which we term lemonGL. This new model is an initial attempt at providing a way of structuring lexico-ontological resources as linked data in such a way as to allow a rich representation of word meaning (following the GL theory) while at the same time (attempting to) re-main faithful to the separation between the lexicon and the ontology as recommended by the lemon model}, URL = {https://iris.cnr.it/handle/20.500.14243/259365}, } @INPROCEEDINGS{MARCHETTI_2013_INPROCEEDINGS_MTALDFM_226423, AUTHOR = {Marchetti, A. and Tesconi, M. and Abbate, S. and Lo Duca, A. and D'Errico, A. and Frontini, F. and Monachini, M.}, TITLE = {Tour-pedia: a web application for the analysis and visualization of opinions for tourism domain}, YEAR = {2013}, ABSTRACT = {We present Tour-pedia an interactive web application that extracts opinions from reviews of accommodations from different sources available on-line. Polarity markers display on a map the different opinions. This tool is intended to help business operators to manage reputation on-line}, KEYWORDS = {Visualization tools, opinion mining, NLP on social media, tourism reviews}, PAGES = {594-595}, URL = {http://www.iit.cnr.it/sites/default/files/ltc2013_opener_demo.pdf}, PUBLISHER = {Fundacja Uniwersytetu im A. 
Mickiewicza (Poznan, POL)}, ISBN = {978-83-932640-4-9}, CONFERENCE_NAME = {6th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics}, CONFERENCE_PLACE = {Poznan}, EDITOR = {Vetulani, Z. and Uszkoreit, H.}, } @INPROCEEDINGS{MONEGLIA_2013_INPROCEEDINGS_MPGMRDKF_226438, AUTHOR = {Moneglia, M. and Panunzi, A. and Gagliardi, G. and Monachini, M. and Russo, I. and De Felice, I. and Khan, F. and Frontini, F.}, TITLE = {IMAGACT E-learning Platform for Basic Action Types. In: Pixel (ed.), Proceedings of the 6th International Conference ICT for Language Learning}, YEAR = {2013}, ABSTRACT = {Action verbs express important information in a sentence and they are the most frequent elements in speech, but they are also one of the most difficult part of the lexicon to learn for L2 language learners, because languages segment these concepts in very different ways. The two sentences "Mary folds her shirt" and "Mary folds her arms" refer to two completely different types of action, as becomes evident when they are translated into another language (e. g., in Italian they would be translated as "Maria piega la camicia" and "Maria incrocia le braccia" respectively). IMAGACT e-learning platform aims to make these differences evident by creating a cross-linguistic ontology of action types, whose nodes consist of 3D scenes, each of which relates to one action type. In order to identify these types, contexts of use have been extracted from English and Italian spontaneous speech corpora for around 600 high frequency action verbs (for each language). All instances that refer to similar events (e. g., fold the shirt/ the blanket) are grouped under one single action type: each one of these types is then represented by a linguistic best example and a short video that represents simple actions (e. g. a man taking a glass from a table). 
The action types extracted for Italian and English are compared and merged into one cross-linguistic ontology of action. IMAGACT has provided an internet based annotation infrastructure to derive this information from corpora. The project is now completed for the Italian and English lexicon, data extraction for Chinese and Spanish is ongoing. Reference to prototypical imagery is crucial in order to bootstrap the learning process. By selecting the set of 3D scenes referred to by a verb in one language and viewing the type of activity represented therein learners can directly understand the range of applicability of each verb. Thanks to an easy interface, a user can access the English/Italian/Chinese lexicon by lemma or directly by 3D scenes. For example, searching for the verb "to turn", s/he will be presented with a number of scenes, showing the various action types associated to that verb. Clicking on a scene s/he or she will know how this type of action is referred to in other the languages}, KEYWORDS = {Ontology}, PAGES = {85-89}, URL = {https://iris.cnr.it/handle/20.500.14243/226438}, PUBLISHER = {libreriauniversitaria. it (Limena, ITA)}, ISBN = {978-88-6292-423-8}, CONFERENCE_NAME = {International Conference "ICT for Language Learning", 6th edition}, CONFERENCE_PLACE = {Limena}, BOOKTITLE = {Conference Proceedings. ICT for Language Learning}, EDITOR = {Pixel}, } @INPROCEEDINGS{RUSSO_2013_INPROCEEDINGS_RDFKM_257360, AUTHOR = {Russo, I. and De Felice, I. and Frontini, F. and Khan, F. and Monachini, M.}, TITLE = {(Fore)seeing actions in objects. Acquiring distinctive affordances from language}, YEAR = {2013}, ABSTRACT = {In this paper we investigate if conceptual information concerning objects' affordances as possibilities for actions anchored to an object can be at least partially acquired through language. 
Considering verb-noun pairs as the linguistic realizations of relations between actions performed by an agent and objects we collect this information from the ImagAct dataset, a linguistic resource obtained from manual annotation of basic action verbs, and from a web corpus(itTenTen). The notion of affordance verb as the most distinctive verb in ImagAct enables a comparison with distributional data that reveal how lemmas ranking based on a semantic association measure that mirror that of affordances as the most distinctive actions an object can be involved in}, PAGES = {151-161}, URL = {https://docs.google.com/viewer?a=v\&pid=sites\&srcid=ZGVmYXVsdGRvbWFpbnxubHBjczIwMTN8Z3g6MTI0ZGMzYWYwYmMxNjY1Mg}, CONFERENCE_NAME = {NLPCS 2013-10th International Workshop on Natural Language Processing and Cognitive Science}, BOOKTITLE = {Proceedings of NLPCS 2013-10th International Workshop on Natural Language Processing and Cognitive Science}, EDITOR = {Sharp, B. and Zock, M.}, } @INPROCEEDINGS{RUSSO_2013_INPROCEEDINGS_RFDKM_227078, AUTHOR = {Russo, I. and Frontini, F. and De Felice, I. and Khan, F. and Monachini, M.}, TITLE = {Disambiguation of Basic Action Types through Nouns' Telic Qualia}, YEAR = {2013}, ABSTRACT = {Knowledge about semantic associations between words is effective to disambiguate word senses. The aim of this paper is to investigate the role and the relevance of telic information from SIMPLE in the disambiguation of basic action types of Italian HOLD verbs (prendere, 'to take', raccogliere, 'to pick up', pigliare 'to grab' etc.). 
We propose an experiment to compare the results obtained with telic information from SIMPLE with basic co-occurrence information extracted from corpora (most salient verbs modifying nouns) classified in terms of general semantic classes to avoid data sparseness}, PAGES = {70-75}, URL = {http://www.aclweb.org/anthology/W13-5410}, PUBLISHER = {Association for Computational Linguistics (Stroudsburg, USA)}, ISBN = {978-1-937284-98-5}, CONFERENCE_NAME = {6th International Conference on Generative Approaches to the Lexicon Generative Lexicon and Distributional Semantics}, CONFERENCE_PLACE = {Stroudsburg}, BOOKTITLE = {Proceedings of the 6th International Conference on Generative Approaches to the Lexicon. Generative Lexicon and Distributional Semantics}, EDITOR = {Saurí, R. and Calzolari, N. and Huang, C. R. and Lenci, A. and Monachini, M. and Pustejovsky, J.}, } @INPROCEEDINGS{CASELLI_2012_INPROCEEDINGS_CFQRR_222834, AUTHOR = {Caselli, T. and Frontini, F. and Quochi, V. and Rubino, F. and Russo, I.}, TITLE = {Flexible Acquisition of Subcategorization Frames in Italian}, YEAR = {2012}, ABSTRACT = {Lexica of predicate-argument structures constitute a useful tool for several tasks in NLP. This paper describes a web-service system for automatic acquisition of verb subcategorization frames (SCFs) from parsed data in Italian. The system acquires SCFs in an unsupervised manner. We created two gold standards for the evaluation of the system, the first by mixing together information from two lexica (one manually created and the second automatically acquired) and manual exploration of corpus data and the other annotating data extracted from a specialized corpus (environmental domain). Data filtering is accomplished by means of the maximum likelihood estimate (MLE). The evaluation phase has allowed us to identify the best empirical MLE threshold for the creation of a lexicon (P=0. 653, R=0. 557, F1=0. 601). 
In addition to this, we assigned to the extracted entries of the lexicon a confidence score based on the relative frequency and evaluated the extractor on domain specific data. The confidence score will allow the final user to easily select the entries of the lexicon in terms of their reliability: one of the most interesting feature of this work is the possibility the final users have to customize the results of the SCF extractor, obtaining different SCF lexica in terms of size and accuracy}, KEYWORDS = {lexicon, automatic acquisition, subcategorisation frames}, PAGES = {2842-2848}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/390.html}, PUBLISHER = {European Language Resources Association ELRA (Paris, FRA)}, ISBN = {9782951740877}, CONFERENCE_NAME = {Eight International Conference on Language Resources and Evaluation (LREC'12)}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{DELGRATTA_2012_INPROCEEDINGS_DFMQRAL_117790, AUTHOR = {Del Gratta, R. and Frontini, F. and Monachini, M. and Quochi, V. and Rubino, F. and Abrate, M. and Lo Duca, A.}, TITLE = {L-LEME: an Automatic Lexical Merger based on the LMF Standard}, YEAR = {2012}, ABSTRACT = {The present paper describes LMF LExical MErger (L-LEME), an architecture to combine two lexicons in order to obtain new resource(s). L-LEME relies on standards, thus exploiting the benefits of the ISO Lexical Markup Framework (LMF) to ensure interoperability. L-LEME is meant to be dynamic and heavily adaptable: it allows the users to configure it to meet their specific needs. 
The L-LEME architecture is composed of two main modules: the Mapper, which takes in input two lexicons A and B and a set of user-defined rules and instructions to guide the mapping process (Directives D) and gives in output all matching entries. The algorithm also calculates a cosine similarity score. The Builder takes in input the previous results, a set of Directives D1 and produces a new LMF lexicon C. The Directives allow the user to define its own building rules and different merging scenarios. L-LEME is applied to a specific concrete task within the PANACEA project, namely the merging of two Italian SubCategorization Frame (SCF) lexicons. The experiment is interesting in that A and B have different philosophies behind, being A built by human introspection and B automatically extracted. Ultimately, L-LEME has interesting repercussions in many language technology applications}, KEYWORDS = {LMF, Lexicon mapping, similarity score}, PAGES = {31-40}, URL = {https://iris.cnr.it/handle/20.500.14243/117790}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC) 2012}, BOOKTITLE = {Proceedings of the LREC 2012 Workshop on Language Resource Merging}, EDITOR = {Bel, N. and Gavrilidou, M. and Monachini, M. and Quochi, V. and Rimell, L.}, } @INPROCEEDINGS{DELGRATTA_2012_INPROCEEDINGS_DFRRC_119634, AUTHOR = {Del Gratta, R. and Frontini, F. and Rubino, F. and Russo, I. and Calzolari, N.}, TITLE = {The Language Library: supporting community effort for collective resource production}, YEAR = {2012}, ABSTRACT = {Relations among phenomena at different linguistic levels are at the essence of language properties but today we focus mostly on one specific linguistic layer at a time, without (having the possibility of) paying attention to the relations among the different layers. At the same time our efforts are too much scattered without much possibility of exploiting other people's achievements. 
To address the complexities hidden in multilayer interrelations even small amounts of processed data can be useful, improving the performance of complex systems. Exploiting the current trend towards sharing we want to initiate a collective movement that works towards creating synergies and harmonisation among different annotation efforts that are now dispersed. In this paper we present the general architecture of the Language Library, an initiative which is conceived as a facility for gathering and making available through simple functionalities the linguistic knowledge the field is able to produce, putting in place new ways of collaboration within the LRT community. In order to reach this goal, a first population round of the Language Library has started around a core of parallel/comparable texts that have been annotated by several contributors submitting a paper for LREC2012. The Language Library has also an ancillary aim related to language documentation and archiving and it is conceived as a theory-neutral space which allows for several language processing philosophies to coexist}, KEYWORDS = {annotation, metadata, scientific crowdsourcing}, PAGES = {43-49}, URL = {https://iris.cnr.it/handle/20.500.14243/119634}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, BOOKTITLE = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FABBMPPS_251924, AUTHOR = {Frontini, F. and Aliprandi, C. and Bacciu, C. and Bartolini, R. and Marchetti, A. and Parenti, E. and Piccinonno, F. and Soru, T.}, TITLE = {GLOSS, an infrastructure for the semantic annotation and mining of documents in the public security domain}, YEAR = {2012}, ABSTRACT = {Efficient access to information is crucial in the work of organizations that require decision taking in emergency situations. 
This paper gives an outline of GLOSS, an integrated system for the analysis and retrieval of data in the environmental and public security domain. We shall briefly present the GLOSS infrastructure and its use, and how semantic information of various kinds is integrated, annotated and made available to the final users}, KEYWORDS = {semantic annotation, text mining, geographic data}, PAGES = {21-25}, URL = {https://iris.cnr.it/handle/20.500.14243/251924}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {Eight International Conference on Language Resources and Evaluation. LREC'12. European Language Resources Association: France}, CONFERENCE_PLACE = {Paris}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FQR_128272, AUTHOR = {Frontini, F. and Quochi, V. and Rubino, F.}, TITLE = {Automatic Creation of Quality Multi-Word Lexica from Noisy Text Data}, YEAR = {2012}, ABSTRACT = {This paper describes the design of a tool for the automatic creation of multi-word lexica that is deployed as a web service and runs on automatically web-crawled data within the framework of the PANACEA platform. The main purpose of our task is to provide a (computationally "light") tool that creates a full high quality lexical resource of multi-word items. Within the platform, this tool is typically inserted in a work flow whose first step is automatic web-crawling. Therefore, the input data of our lexical extractor is intrinsically noisy. The paper evaluates the capacity of the tool to deal with noisy data, and in particular with texts containing a significant amount of duplicated paragraphs. The accuracy of the extraction of multi-word expressions from the original crawled corpus is compared to the accuracy of the extraction from a later "de-duplicated" version of the corpus. The paper shows how our method can extract with sufficiently good precision also from the original, noisy crawled data. 
The output of our tool is a multi-word lexicon formatted and encoded in XML according to the Lexical Mark-up Framework}, KEYWORDS = {Lexical induction, multi-word extraction, web-based distributed platform, noisy data}, URL = {http://www.kde.cs.tut.ac.jp/~aono/pdf/COLING2012/AND/pdf/AND04.pdf}, PUBLISHER = {ACM, Association for computing machinery (New York, USA)}, ISBN = {978-1-4503-1919-5}, CONFERENCE_NAME = {AND 2012}, CONFERENCE_PLACE = {New York}, BOOKTITLE = {Proceedings of the Sixth Workshop on Analytics for Noisy Unstructured Text Data}, } @INPROCEEDINGS{GAVRILIDOU_2012_INPROCEEDINGS_GLDPPMFDFAM_5349, AUTHOR = {Gavrilidou, M. and Labropoulou, P. and Desipri, E. and Piperidis, S. and Papageorgiou, H. and Monachini, M. and Frontini, F. and Declerck, T. and Francopoulo, G. and Arranz, V. and Mapelli, V.}, TITLE = {The META-SHARE Metadata Schema for the Description of Language Resources}, YEAR = {2012}, ABSTRACT = {This paper presents a metadata model for the description of language resources proposed in the framework of the META-SHARE infrastructure, aiming to cover both datasets and tools/technologies used for their processing. It places the model in the overall framework of metadata models, describes the basic principles and features of the model, elaborates on the distinction between minimal and maximal versions thereof, briefly presents the integrated environment supporting the LRs description and search and retrieval processes and concludes with work to be done in the future for the improvement of the model}, KEYWORDS = {metadata, META-SHARE, LRs description}, PAGES = {1090-1097}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/index.html}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, } @INPROCEEDINGS{MONACHINI_2012_INPROCEEDINGS_MFDRKGP_119663, AUTHOR = {Monachini, M. and Frontini, F. and De Felice, I. and Russo, I. and Khan, F. and Gagliardi, G. 
and Panunzi, A.}, TITLE = {Verb interpretation for basic action types: annotation, ontology induction and creation of prototypical scenes}, YEAR = {2012}, ABSTRACT = {In the last 20 years dictionaries and lexicographic resources such as WordNet have started to be enriched with multimodal content. Short videos depicting basic actions support the user's need (especially in second language acquisition) to fully understand the range of applicability of verbs. The IMAGACT project has among its results a repository of action verbs ontologically organised around prototypical action scenes in the form of both video recordings and 3D animations. The creation of the IMAGACT ontology, which consists in deriving action types from corpus instances of action verbs, intra and cross linguistically validating them and producing the prototypical scenes thereof, is the preliminary step for the creation of a resouce that users can browse by verb, learning how to match different action prototypes with the correct verbs in the target language. The mapping of IMAGACT types onto WordNet synsets allows for a mutual enrichment of both resources}, KEYWORDS = {ontology of actions, lexical resource, 3D animations}, PAGES = {69-80}, URL = {https://iris.cnr.it/handle/20.500.14243/119663}, CONFERENCE_NAME = {COLING 2012-3rd Workshop on Cognitive Aspects of the Lexicon (CogALex-III)}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MGPFRM_122911, AUTHOR = {Moneglia, M. and Gagliardi, G. and Panunzi, A. and Frontini, F. and Russo, I. and Monachini, M.}, TITLE = {IMAGACT: Deriving an Action Ontology from Spoken Corpora}, YEAR = {2012}, ABSTRACT = {This paper presents the IMAGACT annotation infrastructure which uses both corpus-based and competence-based methods for the simultaneous extraction of a language independent Action ontology from English and Italian spontaneous speech corpora. 
The infrastructure relies on an innovative methodology based on images of prototypical scenes and will identify high frequency action concepts in everyday life, suitable for the implementation of an open set of languages}, KEYWORDS = {Action verb, Ontology, imagery}, PAGES = {42-47}, URL = {https://iris.cnr.it/handle/20.500.14243/122911}, ISBN = {978-90-74029-00-1}, CONFERENCE_NAME = {Eighth Joint ISO-ACL SIGSEM Workshop on Interoperable Semantic Annotation (ISA-8)}, BOOKTITLE = {Proceedings of the Eight Joint ISO-ACL SIGSEM Workshop on Interoperable Semantic Annotation ISA-8}, EDITOR = {Bunt, H.}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MMCPFGR_5301, AUTHOR = {Moneglia, M. and Monachini, M. and Calabrese, O. and Panunzi, A. and Frontini, F. and Gagliardi, G. and Russo, I.}, TITLE = {The IMAGACT Cross-linguistic Ontology of Action. A new infrastructure for natural language disambiguation}, YEAR = {2012}, ABSTRACT = {Action verbs, which are highly frequent in speech, cause disambiguation problems that are relevant to Language Technologies. This is a consequence of the peculiar way each natural language categorizes Action i. e. it is a consequence of semantic factors. Action verbs are frequently "general", since they extend productively to actions belonging to different ontological types. Moreover, each language categorizes action in its own way and therefore the cross-linguistic reference to everyday activities is puzzling. This paper briefly sketches the IMAGACT project, which aims at setting up a cross-linguistic Ontology of Action for grounding disambiguation tasks in this crucial area of the lexicon. The project derives information on the actual variation of action verbs in English and Italian from spontaneous speech corpora, where references to action are high in frequency. Crucially it makes use of the universal language of images to identify action types, avoiding the underdeterminacy of semantic definitions. 
Action concept entries are implemented as prototypic scenes; this will make it easier to extend the Ontology to other languages}, KEYWORDS = {Action verbs, Ontology, Imagery}, PAGES = {2606-2613}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/428_Paper.pdf}, ISBN = {978-2-9517408-7-7}, CONFERENCE_NAME = {The Eight International Conference on Language Resources and Evaluation (LREC'12)}, } @INPROCEEDINGS{MONEGLIA_2012_INPROCEEDINGS_MMPFGR_122919, AUTHOR = {Moneglia, M. and Monachini, M. and Panunzi, A. and Frontini, F. and Gagliardi, G. and Russo, I.}, TITLE = {Mapping a corpusinduced ontology of action verbs on ItalWordNet}, YEAR = {2012}, ABSTRACT = {Action verbs are the least predictable linguistic type for bilingual dictionaries and they cause major problems for NLP technologies. This is not only because of language specific phraseology, but it is rather a consequence of the peculiar way each language categorizes events. In ordinary languages the most frequent action verbs are "general", since they extend productively to actions belonging to different ontological types. Moreover, each language categorizes actions in its own way and therefore the cross-linguistic reference to everyday activities is puzzling. A cross-linguistic stable ontology of actions is difficult to achieve because our knowledge on the actual variation of verbs across types of actions is largely unknown. This paper briefly presents the problems and the building strategies of the IMAGACT Ontology, which aims at filling this gap, and compares some early results on a set of Italian verbs with the information contained in ItalWordNet}, KEYWORDS = {action verb, ontology, image}, PAGES = {219-226}, URL = {https://iris.cnr.it/handle/20.500.14243/122919}, ISBN = {978-80-263-0244-5}, CONFERENCE_NAME = {Global Wordnet Conference (GWC2012)}, BOOKTITLE = {Proceedings of the 6th Global WordNet Conference (GWC2012)}, EDITOR = {Fellbaum, C. 
and Vossen, P.}, } @INPROCEEDINGS{QUOCHI_2012_INPROCEEDINGS_QFR_128266, AUTHOR = {Quochi, V. and Frontini, F. and Rubino, F.}, TITLE = {A MWE Acquisition and Lexicon Builder Web Service}, YEAR = {2012}, ABSTRACT = {This paper describes the development of a web-service tool for the automatic extraction of Multi-word expressions lexicons, which has been integrated in a distributed platform for the automatic creation of linguistic resources. The main purpose of the work described is thus to provide a (computationally "light") tool that produces a full lexical resource: multi-word terms/items with relevant and useful attached information that can be used for more complex processing tasks and applications (e. g. parsing, MT, IE, query expansion, etc.). The output of our tool is a MW lexicon formatted and encoded in XML according to the Lexical Mark-up Framework. The tool is already functional and available as a service. Evaluation experiments show that the tool precision is of about 80%}, KEYWORDS = {Multiword extraction, lexical resources, LMF, web services.}, PAGES = {2291-2306}, URL = {http://aclweb.org/anthology/C/C12/C12-1140.pdf}, PUBLISHER = {Curran Associates (Red Hook, NY 12571, USA)}, ISBN = {9781627483896}, CONFERENCE_NAME = {International Conference on Computational Linguistics (COLING)}, CONFERENCE_PLACE = {Red Hook, NY 12571}, BOOKTITLE = {Proceedings of COLING 2012: Technical Papers}, EDITOR = {Kay, M. and Boitet, C.}, } @INPROCEEDINGS{RUBINO_2012_INPROCEEDINGS_RFQ_128261, AUTHOR = {Rubino, F. and Frontini, F. and Quochi, V.}, TITLE = {Integrating NLP Tools in a Distributed Environment: A Case Study Chaining a Tagger with a Dependency Parser}, YEAR = {2012}, ABSTRACT = {The present paper tackles the issue of PoS tag conversion within the framework of a distributed web service platform for the automatic creation of language resources. 
PoS tagging is now considered a "solved problem"; yet, because of the differences in the tagsets, interchange of the various PoS taggers vailable is still hampered. In this paper we describe the implementation of a PoS-tagged-corpus converter, which is needed for chaining together in a workflow the FreeLing PoS tagger for Italian and the DESR dependency parser, given that these two tools have been developed independently. The conversion problems experienced during the implementation, related to the properties of the different tagsets and of tagset conversion in general, are discussed together with the solutions adopted. Finally, the converter is evaluated by assessing the impact of conversion on the performance of the dependency parser by comparing with the outcome of the native pipeline. From this we learn that in most cases parsing errors are due to actual tagging errors, and not to conversion itself. Besides, information on accuracy loss is an important feature in a distributed environment of (NLP) services, where users need to decide which services best suit their needs}, KEYWORDS = {PoS tag conversion, interoperability, NLP pipelines}, PAGES = {2125-2131}, URL = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/726.html}, PUBLISHER = {European language resources association (ELRA) (Paris, FRA)}, ISBN = {9782951740877}, CONFERENCE_NAME = {Language Resources and Evaluation Conference 2012}, CONFERENCE_PLACE = {Paris}, BOOKTITLE = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, EDITOR = {Calzolari, N. and Choukri, K. and Declerck, T. and Doğan, M. U. and Maegaard, B. and Mariani, J. and Odijk, J. and Piperidis, S.}, } @INPROCEEDINGS{FRONTINI_2012_INPROCEEDINGS_FMNMAB_314751, AUTHOR = {Frontini, F. and Monachini, M. and N Lapolla, M. and Marchetti, A. and Abrate, M. 
and Bacciu, C.}, TITLE = {Web Language Identification Testing Tool}, YEAR = {2012}, ABSTRACT = {Nowadays a variety of tools for automatic language identification are available. Regardless of the approach used, at least two features can be identified as crucial to evaluate the performances of such tools: the precision of the presented results and the range of languages that can be detected. In this work we shall focus on a subtask of written language identification that is important to preserve and enhance multilinguality in the Web, i. e. detecting the language of a Web page given its URL. Most specifically, the final aim is to verify to which extent under-represented languages are recognized by available tools. The main specificity of Web Language Identification (WLI) lies in the fact that often an HTML page can provide interesting extralinguistic clues (URL domain name, metadata, encoding, etc) that can enhance accuracy. We shall first provide some data and statistics on the presence of languages on the web, secondly discuss existing practices and tools for language identification according to different metrics-for instance the approaches used and the number of supported languages-and finally make some proposals on how to improve current Web Language Identifiers. We shall also present a preliminary WLI service that builds on the Google Chromium Compact Language Detector; the WLI tool allows us to test the Google n-gram based algorithm against an ad-hoc gold standard of pages in various languages. 
The gold standard, based on a selection of Wikipedia projects, contains samples in languages for which no automatic recognition has been attempted; it can thus be used by specialists to develop and evaluate WLI systems}, KEYWORDS = {Language Identification Tools, Multilingual Web}, PAGES = {1-1}, URL = {https://iris.cnr.it/handle/20.500.14243/314751}, CONFERENCE_NAME = {W3C Workshop, Call for Participation: The Multilingual Web-The Way Ahead}, } @TECHREPORT{ALIPRANDI_2012_TECHREPORT_ABBFLMPS_130245, AUTHOR = {Aliprandi, C. and Bacciu, C. and Bartolini, R. and Frontini, F. and Lapolla, N. and Marchetti, A. and Piccinonno, F. and Soru, T.}, TITLE = {Specifiche architetturali e funzionali}, YEAR = {2012}, ABSTRACT = {Questo documento contiene le specifiche funzionali ed architetturali del sistema GLOSS elaborate come risultato dell'obiettivo operativo 1. Tali specifiche debbono essere di riferimento per tutte le fasi di sviluppo dei vari componenti del sistema stesso e della loro integrazione in un prototipo dimostrativo. Ad una breve introduzione che richiama gli obiettivi generali del progetto, seguono: 1. La descrizione delle funzionalità suddivisa nelle varie fasi che compongono il flusso operativo di GLOSS. 2. La descrizione dell'architettura del sistema da realizzare nella quale si fornisce lo schema dell'integrazione dei vari componenti, il protocollo di comunicazione e memorizzazione dei dati che viene trattato più nel dettaglio nel documento D1. 2 GAF-Gloss Annotation Format, e la descrizione di ciascun componente del sistema. Per sua natura, questo documento sarà soggetto a revisione durante tutto il periodo di sviluppo del sistema. 
Questa prima versione deve intendersi come guida per l'implementazione ed ha lo scopo di fornire a chi partecipa a questo progetto una visione generale delle funzionalità di GLOSS e come queste dovranno essere integrate nel prototipo dimostratore}, KEYWORDS = {GLOSS, specifiche funzionali}, URL = {https://iris.cnr.it/handle/20.500.14243/130245}, } @TECHREPORT{PROKOPIDIS_2012_TECHREPORT_PPTPFRT_129408, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Poch Riera, M. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {D4. 5 Final Report on the Corpus Acquisition & Annotation subsystem and its components}, YEAR = {2012}, ABSTRACT = {PANACEA WP4 targets the creation of a Corpus Acquisition and Annotation (CAA) subsystem for the acquisition and processing of monolingual and bilingual language resources (LRs). The CAA subsystem consists of tools that have been integrated as web services in the PANACEA platform of LR production. D4. 2 Initial functional prototype and documentation in T13 and D4. 4 Report on the revised Corpus Acquisition \& Annotation subsystem and its components in T23 provided initial and updated documentation on this subsystem, while this deliverable presents the final documentation of the subsystem as it evolved after the third development cycle of the project. The deliverable is structured as follows. The Corpus Acquisition Component (i. e. the Focused Monolingual and Bilingual Crawlers (FMC/FBC)) is described in section 2. The final list of tools for corpus normalization (cleaning and de-duplication) is detailed in section 3. Section 4 provides documentation on all NLP tools included in the subsystem. Due to its nature, this deliverable aggregates considerable parts of all previous WP4 deliverables. 
The main new additions include a) new functionalities for, among others, crawling strategy, de-duplication, and detection of parallel document pairs; and b) new NLP tools for syntactic analysis, named entity recognition, tweet processing and anonymization}, KEYWORDS = {Corpus Acquisition}, URL = {http://www.jotform.com/uploads/fabioaffeilc/30222975566357/225350067351490116/PANACEA}, } @TECHREPORT{QUOCHI_2012_TECHREPORT_QFBHPPBTTK_130130, AUTHOR = {Quochi, V. and Frontini, F. and Bartolini, R. and Hamon, O. and Poch Riera, M. and Padro, M. and Bel, N. and Thurmair, G. and Toral, A. and Kamran, A.}, TITLE = {D7. 4 Third evaluation report. Evaluation of PANACEA v3 and produced resources}, YEAR = {2012}, ABSTRACT = {D7. 4 reports on the evaluation of the different components integrated in the PANACEA third cycle of development as well as the final validation of the platform itself. All validation and evaluation experiments follow the evaluation criteria already described in D7. 1. The main goal of WP7 tasks was to test the (technical) functionalities and capabilities of the middleware that allows the integration of the various resource-creation components into an interoperable distributed environment (WP3) and to evaluate the quality of the components developed in WP5 and WP6. The content of this deliverable is thus complementary to D8. 2 and D8. 3 that tackle advantages and usability in industrial scenarios. It has to be noted that the PANACEA third cycle of development addressed many components that are still under research. The main goal for this evaluation cycle thus is to assess the methods experimented with and their potentials for becoming actual production tools to be exploited outside research labs. 
For most of the technologies, an attempt was made to re-interpret standard evaluation measures, usually in terms of accuracy, precision and recall, as measures related to a reduction of costs (time and human resources) in the current practices based on the manual production of resources. In order to do so, the different tools had to be tuned and adapted to maximize precision and for some tools the possibility to offer confidence measures that could allow a separation of the resources that still needed manual revision has been attempted. Furthermore, the extension to other languages in addition to English, also a PANACEA objective, has been evaluated. The main facts about the evaluation results are now summarized}, KEYWORDS = {PANACEA, evaluation, machine translation}, URL = {https://iris.cnr.it/handle/20.500.14243/130130}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQ_130143, AUTHOR = {Rimell, L. and Bel, N. and Padró, M. and Frontini, F. and Monachini, M. and Quochi, V.}, TITLE = {D6. 2 Integrated Final Version of the Components for Lexical Acquisition}, YEAR = {2012}, ABSTRACT = {The PANACEA project has addressed one of the most critical bottlenecks that threaten the development of technologies to support multilingualism in Europe, and to process the huge quantity of multilingual data produced annually. Any attempt at automated language processing, particularly Machine Translation (MT), depends on the availability of language-specific resources. Such Language Resources (LR) contain information about the language's lexicon, i. e. the words of the language and the characteristics of their use. In Natural Language Processing (NLP), LRs contribute information about the syntactic and semantic behaviour of words-i. e. their grammar and their meaning-which inform downstream applications such as MT. To date, many LRs have been generated by hand, requiring significant manual labour from linguistic experts. 
However, proceeding manually, it is impossible to supply LRs for every possible pair of European languages, textual domain, and genre, which are needed by MT developers. Moreover, an LR for a given language can never be considered complete nor final because of the characteristics of natural language, which continually undergoes changes, especially spurred on by the emergence of new knowledge domains and new technologies. PANACEA has addressed this challenge by building a factory of LRs that progressively automates the stages involved in the acquisition, production, updating and maintenance of LRs required by MT systems. The existence of such a factory will significantly cut down the cost, time and human effort required to build LRs. WP6 has addressed the lexical acquisition component of the LR factory, that is, the techniques for automated extraction of key lexical information from texts, and the automatic collation of lexical information into LRs in a standardized format. The goal of WP6 has been to take existing techniques capable of acquiring syntactic and semantic information from corpus data, improving upon them, adapting and applying them to multiple languages, and turning them into powerful and flexible techniques capable of supporting massive applications. One focus for improving the scalability and portability of lexical acquisition techniques has been to extend exiting techniques with more powerful, less "supervised" methods. In NLP, the amount of supervision refers to the amount of manual annotation which must be applied to a text corpus before machine learning or other techniques are applied to the data to compile a lexicon. More manual annotation means more accurate training data, and thus a more accurate LR. 
However, given that it is impractical from a cost and time perspective to manually annotate the vast amounts of data required for multilingual MT across domains, it is important to develop techniques which can learn from corpora with less supervision. Less supervised methods are capable of supporting both large-scale acquisition and efficient domain adaptation, even in the domains where data is scarce. Another focus of lexical acquisition in PANACEA has been the need of LR users to tune the accuracy level of LRs. Some applications may require increased precision, or accuracy, where the application requires a high degree of confidence in the lexical information used. At other times a greater level of coverage may be required, with information about more words at the expense of some degree of accuracy. Lexical acquisition in PANACEA has investigated confidence thresholds for lexical acquisition to ensure that the ultimate users of LRs can generate lexical data from the PANACEA factory at the desired level of accuracy}, KEYWORDS = {Lexical Acquisition}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D6.2.pdf}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQD_130256, AUTHOR = {Rimell, L. and Bel, N. and Padrò, M. and Frontini, F. and Monachini, M. and Quochi, V. and Del Gratta, R.}, TITLE = {D6. 3 Monolingual lexica for English, Spanish and Italian tuned for a particular domain (LAB and ENV)}, YEAR = {2012}, ABSTRACT = {This document presents the lexica acquired using PANACEA platform for Labour and Environment domains. The languages of the lexica are English, Spanish and Italian. The lexical information acquired depends on the language, according to the available tools in the platform}, KEYWORDS = {Lexicon Acqusition}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D6.3.pdf}, } @TECHREPORT{RIMELL_2012_TECHREPORT_RBPFMQD_130161, AUTHOR = {Rimell, L. and Bel, N. and Padró, M. and Frontini, F. and Monachini, M. and Quochi, V. 
and Del Gratta, R.}, TITLE = {D6. 5 Merged dictionaries}, YEAR = {2012}, ABSTRACT = {This document presents the merged dictionaries delivered in PANACEA. Those dictionaries result from merging already existing lexica, generally for general domain, with domain specific lexica acquired using PANACEA platform. The domain specific lexica are presented and delivered in D6. 3 and the merging repository that allowed the multilevel merging in D6. 4}, KEYWORDS = {merged dictionaries, computational lexicon}, URL = {http://www.panacea-lr.eu//en/deliverables/list}, } @MISC{ABRATE_2012_MISC_ABFLMM_128221, AUTHOR = {Abrate, M. and Bacciu, C. and Frontini, F. and Lapolla Mariantonietta, N. and Marchetti, A. and Monachini, M.}, TITLE = {Web Language Identification Testing Tool}, YEAR = {2012}, ABSTRACT = {Nowadays a variety of tools for automatic language identification are available. Regardless of the approach used, at least two features can be identified as crucial to evaluate the performances of such tools: the precision of the presented results and the range of languages that can be detected. In this work we shall focus on a subtask of written language identification that is important to preserve and enhance multilinguality in the Web, i. e. detecting the language of a Web page given its URL. Most specifically, the final aim is to verify to which extent under-represented languages are recognized by available tools. The main specificity of Web Language Identification (WLI) lies in the fact that often an HTML page can provide interesting extralinguistic clues (URL domain name, metadata, encoding, etc) that can enhance accuracy. We shall first provide some data and statistics on the presence of languages on the web, secondly discuss existing practices and tools for language identification according to different metrics-for instance the approaches used and the number of supported languages-and finally make some proposals on how to improve current Web Language Identifiers. 
We shall also present a preliminary WLI service that builds on the Google Chromium Compact Language Detector; the WLI tool allows us to test the Google n-gram based algorithm against an adhoc gold standard of pages in various languages. The gold standard, based on a selection of Wikipedia projects, contains samples in languages for which no automatic recognition has been attempted; it can thus be used by specialists to develop and evaluate WLI systems}, KEYWORDS = {Multilingual Web}, URL = {https://iris.cnr.it/handle/20.500.14243/128221}, CONFERENCE_NAME = {The Multilingual Web-the Way Ahead}, } @INPROCEEDINGS{CALZOLARI_2011_INPROCEEDINGS_CDFR_214980, AUTHOR = {Calzolari, N. and Del Gratta, R. and Frontini, F. and Russo, I.}, TITLE = {The Language Library: Many Layers, More Knowledge}, YEAR = {2011}, ABSTRACT = {In this paper we outline the general concept of the Language Library, a new initiative that has the purpose of building a huge archive of structured colletion of linguistic information. The Language Library is conceived as a community built repository and as an environment that allows language specialists to share multidimensional and multi-level annotated/processed resources. The first steps towards its implementation are briefly sketched}, KEYWORDS = {Language Resources, Language Library}, PAGES = {93-97}, URL = {https://iris.cnr.it/handle/20.500.14243/214980}, ISBN = {978-974-466-564-5}, CONFERENCE_NAME = {Workshop on Language Resources, Technology and Services in the Sharing Paradigm}, BOOKTITLE = {Workshop on Language Resources, Technology and Services in the Sharing Paradigm}, } @INPROCEEDINGS{FRONTINI_2011_INPROCEEDINGS_FMGLPFAM_215017, AUTHOR = {Frontini, F. and Monachini, M. and Gavrilidou, M. and Labropoulou, P. and Piperidis, S. and Francopoulo, G. and Arranz, V. 
and Mapelli, V.}, TITLE = {A Metadata Schema for the Description ofLanguage Resources (LRs)}, YEAR = {2011}, ABSTRACT = {This paper presents the metadata schema for describing language resources (LRs) currently under development for the needs of META-SHARE, an open distributed facility for the exchange and sharing of LRs. An essential ingredient in its setup is the existence of formal and standardized LR descriptions, cornerstone of the interoperability layer of any such initiative. The description of LRs is granular and abstractive, combining the taxonomy of LRs with an inventory of a structured set of descriptive elements, of which only a minimal subset is obligatory; the schema additionally proposes recommended and optional elements. Moreover, the schema includes a set of relations catering for the appropriate inter-linking of resources. The current paper presents the main principles and features of the metadata schema, focusing on the description of text corpora and lexical / conceptual resources}, KEYWORDS = {metadata, language resources}, PAGES = {84-92}, URL = {https://iris.cnr.it/handle/20.500.14243/215017}, ISBN = {978-974-466-564-5}, CONFERENCE_NAME = {Workshop on Language Resources, Technology and Services in the Sharing Paradigm}, } @TECHREPORT{ARRANZ_2011_TECHREPORT_ABBCCDFGMQRR_231385, AUTHOR = {Arranz, V. and Bel, N. and Budin, G. and Caselli, T. and Choukri, K. and Del Gratta, R. and Frontini, F. and Goggi, S. and Monachini, M. and Quochi, V. and Rubino, F. and Russo, I.}, TITLE = {The FLaReNet Databook}, YEAR = {2011}, ABSTRACT = {The FLaReNet Databook is not only the collection of all the factual material collected during the activities of the project, but also a set on innovative initiatives and instruments that will remain in place for the continuous collection of such "facts". 
The purpose of the Databook is in fact, on one side, to consolidate the analyses carried out in the project and, at the same time, to set up the proper mechanisms that will enable the provision of a continuous stream of relevant factual material, also after the end of the project}, KEYWORDS = {Language Resources (LRs)}, PAGES = {1-8}, URL = {http://www.flarenet.eu/?q=FLaReNet_Databook}, } @TECHREPORT{DESIPRI_2011_TECHREPORT_DGLPFMVMFD_174745, AUTHOR = {Desipri, E. and Gavrilidou, M. and Labropoulou, P. and Piperidis, S. and Frontini, F. and Monachini, M. and Victoriaarranz and Mapelli, V. and Francopoulo, G. and Declerck, T.}, TITLE = {Documentation and User Manual of the META-SHARE Metadata Model}, YEAR = {2011}, ABSTRACT = {The current deliverable presents the META-SHARE metadata schema v1. 0, as implemented in the META-SHARE XSD's v1. 0 released to (META-NET and PSP partners) in July 2011 for text corpora and lexical/conceptual resources and its supplement for audio corpora, tools and language descriptions (simplified/refactored version) as implemented in November. It is meant to act as a user manual, providing explanations on the model contents for LRs providers and LRs curators that wish to describe their resources in accordance to it. Work on the schema is ongoing and changes/updates to the model are constantly being made; where appropriate, some changes that are already under way are documented in this deliverable}, KEYWORDS = {Language resources, metadata, standards}, PAGES = {150}, URL = {https://iris.cnr.it/handle/20.500.14243/174745}, } @TECHREPORT{MONACHINI_2011_TECHREPORT_MFS_174795, AUTHOR = {Monachini, M. and Frontini, F. and Soria, C.}, TITLE = {KYOTO-LMF WordNet Representation Format}, YEAR = {2011}, ABSTRACT = {The format described in the following pages is the final revised proposal for representing wordnets inside the Kyoto project (henceforth "Kyoto-LMF wordnet format"). 
The reference model is Lexical Markup Framework (LMF), version 16, probably one of the most widely recognized standards for the representation of NLP lexicons. The goals of LMF are to provide a common model for the creation and use of such lexical resources, to manage the exchange of data between and among them, and to enable the merging of a large number of individual resources to form extensive global electronic respurces. LMF was specifically designed to accomodate as many models of lexical representations as possible. Purposefully, it is designed as a mea-model, i. e a high-level specification for lexical resources defining the structural constraints of a lexicon}, KEYWORDS = {Wordnets, LMF, ISO, Representation formats, standards}, PAGES = {32}, URL = {https://iris.cnr.it/handle/20.500.14243/174795}, } @TECHREPORT{PROKOPIDIS_2011_TECHREPORT_PPTPFRT_290521, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Poch Riera, M. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {WP-4. 4: Report on the revised Corpus Acquisition & Annotation subsystem and its components}, YEAR = {2011}, KEYWORDS = {corpus acquisition, corpus annotation}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D4.4.pdf}, } @TECHREPORT{PROKOPIDIS_2011_TECHREPORT_PPTRFRT_290522, AUTHOR = {Prokopidis, P. and Papavassiliou, V. and Toral, A. and Riera, M. P. and Frontini, F. and Rubino, F. and Thurmair, G.}, TITLE = {WP-4. 5: Final Report on the Corpus Acquisition & Annotation subsystem and its components}, YEAR = {2011}, KEYWORDS = {corpus acquisition, corpus annotation}, URL = {http://www.panacea-lr.eu/system/deliverables/PANACEA_D4.5.pdf}, } @TECHREPORT{VOSSEN_2011_TECHREPORT_VBRASADHMBF_174121, AUTHOR = {Vossen, P. and Bosma, W. and Rigau, G. and Agirre, E. and Soroa, A. and Aliprandi, C. and De Jonge, J. and Hielkema, F. and Monachini, M. and Bartolini, R. 
and Frontini, F.}, TITLE = {KyotoCore: integrated system for knowledge mining from text}, YEAR = {2011}, ABSTRACT = {In this deliverable, we describe KyotoCore, an integrated system for applying text mining. We describe the software architecture of KyotoCore, the single modules and the process flows. Finally, we describe a use case where we apply the complete process toan English database on estuaries}, KEYWORDS = {Knowledge and text mining software}, PAGES = {56}, URL = {https://iris.cnr.it/handle/20.500.14243/174121}, } @MISC{FRONTINI_2011_MISC_FM_217962, AUTHOR = {Frontini, F. and Monachini, M.}, TITLE = {Towards interfacing lexical and ontological resources}, YEAR = {2011}, ABSTRACT = {During the last two decades, the Computational Linguistics community has dedicated considerable effort to the research and development Lexical Resources (LRs), especially Computational Lexicons. These LRs, even though belonging to different linguistic approaches and theories, share a common element; all of them contain, explicitly or implicitly, an ontology as the means of organizing their structure}, KEYWORDS = {language resources, ontologies}, PAGES = {26}, URL = {https://iris.cnr.it/handle/20.500.14243/217962}, CONFERENCE_NAME = {ONTOLOGIES AND LEXICAL SEMANTICS}, } @INCOLLECTION{JEZEK_2010_INCOLLECTION_JF_134822, AUTHOR = {Jezek, E. and Frontini, F.}, TITLE = {From Pattern Dictionary to Patternbank}, YEAR = {2010}, KEYWORDS = {Ontology. Computational Semantics}, PAGES = {215-237}, URL = {https://iris.cnr.it/handle/20.500.14243/134822}, BOOKTITLE = {A Way with Words: Recent Advances in Lexical Theory and Analysis}, EDITOR = {De Schryver, G. 
M.}, } @MISC{FRONTINI_2010_MISC_F_106762, AUTHOR = {Frontini, F.}, TITLE = {Statistical profiling of Italian L2 texts: competence and native language}, YEAR = {2010}, KEYWORDS = {Text categorization}, URL = {https://iris.cnr.it/handle/20.500.14243/106762}, CONFERENCE_NAME = {20th Annual Conference of the European Second Language Association}, }