@article {1802, title = {Candoia: A Platform and an Ecosystem for Building and Deploying Versatile Mining Software Repositories Tools}, year = {2015}, note = {"In terms of its focus, the Candoia platform is closer to Bevan et al.{\textquoteright}s Kenyon [9], Bajracharya et al.{\textquoteright}s Sourcerer [6], Gousios and Spinellis{\textquoteright}s Alitheia Core [32, 31], Howison et al.{\textquoteright}s FLOSSMole [39] and different from Boetticher et al.{\textquoteright}s PROMISE Repository [69], Gonz{\'a}lez-Barahona and Robles{\textquoteright}s open-access data repositories [29], Black Duck OpenHub (aka Ohloh) [13], GHTorrent [30, 33], Ossher et al.{\textquoteright}s SourcererDB [64], the SourceForge Research Data Archive (SRDA) [28], and Boa [25]."}, month = {11/2015}, institution = {Iowa State University}, abstract = {Research on mining software repositories (MSR) has shown great promise during the last decade in solving many challenging software engineering problems. There exists, however, a {\textquoteleft}valley of death{\textquoteright} between these significant innovations in MSR research and their deployment in practice. The significant cost of converting a prototype to software; the need to provide support for a wide variety of tools and technologies, e.g., CVS, SVN, Git, Bugzilla, Jira, Issues, etc., to improve applicability; and the high cost of customizing tools to practitioner-specific settings are some key hurdles in the transition to practice. We describe Candoia, a platform and an ecosystem that is aimed at bridging this valley of death between innovations in MSR research and their deployment in practice. We have implemented Candoia and provide facilities to build and publish MSR ideas as Candoia apps. Our evaluation demonstrates that Candoia drastically reduces the cost of converting an idea to an app, thus reducing the barrier to transitioning research findings into practice. We also see versatility in a Candoia app{\textquoteright}s ability to work with a variety of tools and technologies that the platform supports. Finally, we find that customizing a Candoia app to fit project-specific needs is often well within the grasp of developers.}, keywords = {Analysis of software and its evolution, Application specific development environments, flossmole cited, msr, research to practice, software evolution, software repositories}, url = {http://lib.dr.iastate.edu/cgi/viewcontent.cgi?article=1378\&context=cs_techreports}, attachments = {https://flosshub.org/sites/flosshub.org/files/Candoia-\%20A\%20Platform\%20and\%20an\%20Ecosystem\%20for\%20Building\%20and\%20Deploying\%20V.pdf}, author = {Nitin M. Tiwari and Dalton D. Mills and Ganesha Upadhyaya and Eric Lin and Hridesh Rajan} } @article {1717, title = {An insight into license tools for open source software systems}, journal = {Journal of Systems and Software}, volume = {102}, year = {2015}, note = {"An additional tool can be found in FLOSSmole (Howison et al., 2006), which is a central repository containing data and analyses about FLOSS projects collected and prepared in a decentralized manner. Content from FLOSSmole was intended to be used for the construction of an intelligent information system for FLOSS, namely FLOSSWALD (Hanft and Reichle, 2007)."}, month = {04/2015}, pages = {72 - 87}, abstract = {Free/Libre/Open Source Software (FLOSS) has gained a lot of attention lately, allowing organizations to incorporate third-party source code into their implementations. 
When open source software libraries are used, software resources may be linked directly or indirectly with multiple open source licenses giving rise to potential license incompatibilities. Adequate support in license use is vital in order to avoid such violations and address how diverse licenses should be handled. In the current work we investigate software licensing giving a critical and comparative overview of existing assistive approaches and tools. These approaches are centered on three main categories: license information identification from source code and binaries, software metadata stored in code repositories, and license modeling and associated reasoning actions. We also give a formalization of the license compatibility problem and demonstrate the role of existing approaches in license use decisions.}, keywords = {flossmole cited}, issn = {01641212}, doi = {10.1016/j.jss.2014.12.050}, url = {http://www.sciencedirect.com/science/article/pii/S0164121214002945}, author = {Kapitsaki, Georgia M. and Tselikas, Nikolaos D. and Foukarakis, Ioannis E.} } @conference {Williams:2014:MOP:2597073.2597132, title = {Models of OSS Project Meta-information: A Dataset of Three Forges}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, note = {"FLOSSMole [4] is a similar initiative to OSSMETER; it aims to collect and freely redistribute in different formats the data of open source software. Differently from OSSMETER, however, the FLOSSMole project does not provide the instruments to analyse data, that are simply collected and made publicly available."}, pages = {408{\textendash}411}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {The process of selecting open-source software (OSS) for adoption is not straightforward as it involves exploring various sources of information to determine the quality, maturity, activity, and user support of each project. In the context of the OSSMETER project, we have developed a forge-agnostic metamodel that captures the meta-information common to all OSS projects. We specialise this metamodel for popular OSS forges in order to capture forge-specific meta-information. In this paper we present a dataset conforming to these metamodels for over 500,000 OSS projects hosted on three popular OSS forges: Eclipse, SourceForge, and GitHub. The dataset enables different kinds of automatic analysis and supports objective comparisons of cross-forge OSS alternatives with respect to a user{\textquoteright}s needs and quality requirements. }, keywords = {data mining, flossmole cited}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597132}, url = {http://doi.acm.org/10.1145/2597073.2597132}, attachments = {https://flosshub.org/sites/flosshub.org/files/Models_of_OSS_Project_Meta-Information_A_Dataset_of_Three_Forges_draft.pdf}, author = {Williams, James R. and Di Ruscio, Davide and Matragkas, Nicholas and Di Rocco, Juri and Kolovos, Dimitris S.} } @article {Ampatzoglou2013131, title = {Building and mining a repository of design pattern instances: Practical and research benefits}, journal = {Entertainment Computing}, volume = {4}, number = {2}, year = {2013}, note = {flossmole cited}, pages = {131 - 142}, abstract = {Design patterns are well-known design solutions that are reported to produce substantial benefits with respect to software quality. 
However, to our knowledge there are no scientific efforts on gathering information on software projects that use design patterns. This paper introduces a web repository of design pattern instances that have been used in open source projects. The usefulness of such a repository lies in the provision of a base of knowledge, where developers can identify reusable components and researchers can find a mined data set. Currently, 141 open source projects have been considered and more than 4,500 pattern instances have been found and recorded in the database of the repository. The evaluation of the repository has been performed from an academic and a practical point of view. The results suggest that the repository can be useful for both experienced and inexperienced users. However, the benefits of using the repository are more significant for inexperienced users. }, keywords = {flossmole cited, repository}, issn = {1875-9521}, doi = {10.1016/j.entcom.2012.10.002}, url = {http://www.sciencedirect.com/science/article/pii/S1875952112000195}, author = {Apostolos Ampatzoglou and Olia Michou and Ioannis Stamelos} } @book {1579, title = {Finding Source Code on the Web for Remix and Reuse}, series = {Infrastructure for Building Code Search Applications for Developers}, year = {2013}, note = {In "further reading": "Although not a code search infrastructure, FLOSSmole [13] is another major undertaking in building large collection of metadata about open source projects on the Web. Currently, FLOSSmole reports a massive data collection of more than 500,000 open source projects in its web site [32]. For code search infrastructure builders, now it is possible to leverage FLOSSmole{\textquoteright}s project metadata to build code repositories instead of spending an effort in implementing custom spiders and crawlers for code."}, pages = {135 - 164}, publisher = {Springer New York}, organization = {Springer New York}, address = {New York, NY}, abstract = {The large availability of open source code on the Web provides great opportunities to build useful code search applications for developers. Building such applications requires addressing several challenges inherent in collecting and analyzing code from open source repositories to make them available for search. An infrastructure that supports collection, analysis, and search services for open source code available on the Web can greatly facilitate building effective code search applications. This chapter presents such an infrastructure called Sourcerer that facilitates collection, analysis, and search of source code available in code repositories on the Web. This chapter provides useful information to researchers and implementors of code search applications interested in harnessing the large availability of source code in the repositories on the Web. 
In particular, this chapter highlights key aspects of Sourcerer that support combining Software Engineering and Information Retrieval techniques to build effective code search applications.}, keywords = {code search, flossmole cited}, isbn = {978-1-4614-6596-6}, doi = {10.1007/978-1-4614-6596-6_8}, url = {http://www.drsusansim.org/papers/FindingCodeontheWeb-20120822.pdf}, author = {Bajracharya, Sushil Krishna}, editor = {Sim, Susan Elliott and Gallardo-Valencia, Rosalva E.} } @conference {1721, title = {Orion: A Software Project Search Engine with Integrated Diverse Software Artifacts}, booktitle = {2013 18th International Conference on Engineering of Complex Computer Systems (ICECCS)}, year = {2013}, note = {The FLOSSMole, Flossmetrics [3] and Sourcerer [1] projects collect data and/or provide statistics on their collected data, but are not suitable for selecting a subset or identifying a unique project based on desired properties.}, pages = {242 - 245}, publisher = {IEEE}, organization = {IEEE}, address = {Singapore, Singapore}, abstract = {What projects contain more than 10,000 lines of code developed by less than 10 people and are still actively maintained with a high bug-fixing rate? To address the challenges for answering such enquiries, we develop an integrated search engine architecture that combines information from different types of software repositories from multiple sources. Our search engine facilitates the construction and execution of complex search queries using a uniform interface that transparently correlates different artifacts of project development and maintenance, such as source code information, version control systems metadata, bug tracking systems elements, and metadata on developer activities and interactions extracted from hosting platforms. We have built an extensible system with an initial capability of over 100,000 projects collected from the web, featuring various software development artifacts. Using scenarios, we illustrate the benefits of such a search engine for different kinds of project seekers. }, keywords = {flossmole cited}, doi = {10.1109/ICECCS.2013.42}, url = {http://www.mysmu.edu/faculty/davidlo/papers/iceccs13-projectsearch.pdf}, author = {Bissyande, Tegawende F. and Thung, Ferdian and Lo, David and Jiang, Lingxiao and Reveillere, Laurent} } @conference {1576, title = {A scalable crawler framework for FLOSS data}, booktitle = {Proceedings of the 5th Asia-Pacific Symposium on Internetware - Internetware {\textquoteright}13}, year = {2013}, note = {FLOSSmole [8] and FLOSSmetrics [9] retrieved FLOSS data of various types from famous software forges like SourceForge and Google Code; an interface to data sharing and analyzing is also provided. "Typically, Howison et al. [8] proposed a system called FLOSSmole. FLOSSmole is a large collection of datasets extracted from famous software forges such as SourceForge, GitHub, and Google Code. Datasets in FLOSSmole are mainly metadata describing various facts about the development of FLOSS projects. FLOSSmole manages its datasets in an open and collaborative manner. Most of the data is collected by the FLOSSmole research team, yet they also accept data donation from other research groups or similar projects. The scripts and programs that collects the datasets from the Internet is also open for download and donation. 
" "Using FLOSSmole [8] and FLOSSmetrics [9] as case studies, similar systems as such are called {\textquotedblleft}repository of repositories (RoR){\textquotedblright} and basic requirements of these systems are proposed. "}, pages = {1 - 7}, publisher = {ACM Press}, organization = {ACM Press}, address = {Changsha, China}, abstract = {Free / Libre / Open Source Software (FLOSS) data, such as bug reports, mailing lists and related webpages, contains valuable information for reusing open source software projects. Before conducting further experiment on FLOSS data, researchers often need to download these data into a local storage system. We refer to this pre-process as FLOSS data retrieval, which in many cases can be a challenging task. In this paper, we proposed a crawler framework to ease the process of FLOSS data retrieval. To cope with various types of FLOSS data scattered on the Internet, we designed the framework in a scalable manner where a crawler program can be easily plugged into the system to extend its functionality. Researchers can perform the retrieval process on datasets of various types and sources simply by adding new configurations to the system. We have implemented the framework and provided basic functions via web-based interfaces. We presented the usage of the system by a detailed case study where we retrieved various types of datasets related to Apache Lucene project using our framework.}, keywords = {flossmole cited}, isbn = {9781450323697}, doi = {10.1145/2532443.2532454}, author = {Yanzhen Zou and Bing Zie and Zhang, Lingxiao} } @article {1681, title = {Using Pig as a data preparation language for large-scale mining software repositories studies: An experience report}, journal = {Journal of Systems and Software}, volume = {85}, year = {2012}, note = {"For example, FLOSSMole (Howison et al., 2006) is a public relational database that contains data extracted from a large number of software repositories. Many researchers use FLOSSMole as a platform. For example, Herraiz et al. (2008) used data in FLOSSMole (Howison et al., 2006) to perform analysis to illustrate that most of the software projects are governed by short term goals rather than long term goals."}, month = {10/2012}, pages = {2195 - 2204}, abstract = {The Mining Software Repositories (MSR) field analyzes software repository data to uncover knowledge and assist development of ever growing, complex systems. However, existing approaches and platforms for MSR analysis face many challenges when performing large-scale MSR studies. Such approaches and platforms rarely scale easily out of the box. Instead, they often require custom scaling tricks and designs that are costly to maintain and that are not reusable for other types of analysis. We believe that the web community has faced many of these software engineering scaling challenges before, as web analyses have to cope with the enormous growth of web data. In this paper, we report on our experience in using a web-scale platform (i.e., Pig) as a data preparation language to aid large-scale MSR studies. Through three case studies, we carefully validate the use of this web platform to prepare (i.e., Extract, Transform, and Load, ETL) data for further analysis. Despite several limitations, we still encourage MSR researchers to leverage Pig in their large-scale studies because of Pig{\textquoteright}s scalability and flexibility. 
Our experience report will help other researchers who want to scale their analyses.}, keywords = {flossmole cited}, issn = {01641212}, doi = {10.1016/j.jss.2011.07.034}, url = {http://www.sciencedirect.com/science/article/pii/S0164121211002007}, author = {Weiyi Shang and Bram Adams and Ahmed E. Hassan} } @proceedings {1278, title = {Building Knowledge in Open Source Software Research in Six Years of Conferences}, year = {2011}, note = {RQ1. Is there any social network underlying the research production at the OSS conference series? RQ2. What are the major streams of research proposed at the OSS conference series?}, month = {10/2011}, pages = {123-141}, publisher = {Springer}, abstract = {Since its origins, the diffusion of the OSS phenomenon and the information about it has been entrusted to the Internet and its virtual communities of developers. This public mass of data has attracted the interest of researchers and practitioners aiming at formalizing it into a body of knowledge. To this aim, in 2005, a new series of conferences on OSS started to collect and convey OSS knowledge to the research and industrial community. Our work mines articles of the OSS conference series to understand the process of knowledge grounding and the community surrounding it. As such, we propose a semi-automated approach for a systematic mapping study on these articles. We automatically build a map of cross-citations among all the papers of the conferences and then we manually inspect the resulting clusters to identify knowledge building blocks and their mutual relationships. We found that industry-related, quality assurance, and empirical studies often originate or maintain new streams of research.}, keywords = {Cross-citations, flossmole cited, graph, literature review, network, research, Systematic Mapping Study}, author = {Mulazzini, Fabio and Rossi, Bruno and Russo, Barbara and Steff, Maximilian} } @article {1806, title = {An empirical study of software architectures{\textquoteright} effect on product quality}, journal = {Journal of Systems and Software}, volume = {84}, year = {2011}, note = {"We first collected meta-data on the 21,094 most highly ranked Java projects on 2009-03-17 from SourceForge for which it was possible to get such data. Here {\textquotedblleft}Java projects{\textquotedblright} were defined as projects belonging to {\textquotedblleft}trove{\textquotedblright} 198 at SourceForge and {\textquotedblleft}rank{\textquotedblright} was the SourceForge ranking of projects. The data consisted of characteristics such as number of bugs, time of latest file upload, number of developers, number of open bugs, and SourceForge {\textquotedblleft}rank{\textquotedblright}." They did not use FLOSSmole: "A more complete analysis of the project status, could be performed by mining the FLOSSMole database (Howison et al., 2006)."}, month = {7/2011}, pages = {1233 - 1243}, abstract = {Software architecture is concerned with the structure of software systems and is generally agreed to influence software quality. Even so, little empirical research has been performed on the relationship between software architecture and software quality. Based on 1,141 open source Java projects, we calculate three software architecture metrics (measuring classes per package, normalized distance, and a new metric introduced by us concerning the excess of coupling degree) and analyze to what extent these metrics are related to product metrics (defect ratio, download rate, methods per class, and method complexity). 
We conclude that there are a number of significant relationships between product metrics and architecture metrics. In particular, the number of open defects depends significantly on all our architecture measures.}, keywords = {flossmole cited, java}, issn = {01641212}, doi = {10.1016/j.jss.2011.02.037}, url = {http://www.sciencedirect.com/science/article/pii/S0164121211000574}, author = {Hansen, Klaus Marius and Jonasson, Kristjan and Neukirchen, Helmut} } @conference {1216, title = {Experiences Mining Open Source Release Histories}, booktitle = {International Conference on Software and Systems Process (ICSSP 2011)}, year = {2011}, note = {"First, we selected the projects to initially target, using several criteria to get a broad picture of the open source landscape. Second, we collected the actual data, using a framework of parsers and some manual inspection. Third, we standardized and inserted the data into a database for later use." "but we plan to eventually cross reference our list of projects with existing open source project information (such as FLOSSmole) to take advantage of the work already done by other researchers." "For each release, we collected the following data: the project it belonged to, the date the release was published, the type of release, the release label (version number) and the source of the data" Discussion of their difficulties: "We conclude that programmatically creating a release history database from existing open source data is not trivial," "We have currently collected 1579 distinct releases from 22 different open source projects"}, month = {05/2011}, abstract = {Software releases form a critical part of the life cycle of a software project. Typically, each project produces releases in its own way, using various methods of versioning, archiving, announcing and publishing the release. Understanding the release history of a software project can shed light on the project history, as well as the release process used by that project, and how those processes change. However, many factors make automating the retrieval of release history information difficult, such as the many sources of data, a lack of relevant standards and a disparity of tools used to create releases. In spite of the large amount of raw data available, no attempt has been made to create a release history database of a large number of projects in the open source ecosystem. 
This paper presents our experiences, including the tools, techniques and pitfalls, in our early work to create a software release history database which will be of use to future researchers who want to study and model the release engineering process in greater depth.}, keywords = {doap, flossmole cited, life cycle, release engineering, release history, release management, releases}, attachments = {https://flosshub.org/sites/flosshub.org/files/icssp11short-p034-tsay.pdf}, author = {Jason Tsay and Hyrum Wright and Dewayne Perry} } @conference {1829, title = {Validity concerns in software engineering research}, booktitle = {Proceedings of the FSE/SDP workshop on Future of software engineering research - FoSER {\textquoteright}10}, year = {2010}, note = {"Obtaining a balanced set of data from open source repositories has been an issue in the open source research community for some time, and several collections of data have grown to attempt to solve this problem [11, 13, 20]."}, pages = {411}, publisher = {ACM Press}, organization = {ACM Press}, address = {Santa Fe, New Mexico, USA}, abstract = {Empirical studies that use software repository artifacts have become popular in the last decade due to the ready availability of open source project archives. In this paper, we survey empirical studies in the last three years of ICSE and FSE proceedings, and categorize these studies in terms of open source projects vs. proprietary source projects and the diversity of subject programs used in these studies. Our survey has shown that almost half (49\%) of recent empirical studies used solely open source projects. Existing studies either draw general conclusions from these results or explicitly disclaim any conclusions that can extend beyond specific subject software. We conclude that researchers in empirical software engineering must consider the external validity concerns that arise from using only several well-known open source software projects, and that discussion of data source selection is an important discussion topic in software engineering research. Furthermore, we propose a community research infrastructure for software repository benchmarks and sharing the empirical analysis results, in order to address external validity concerns and to raise the bar for empirical software engineering research that analyzes software artifacts. }, keywords = {flossmole cited}, isbn = {9781450304276}, doi = {10.1145/1882362.1882446}, author = {Wright, Hyrum K. and Kim, Miryung} } @conference {herraiz2009research, title = {Research friendly software repositories}, booktitle = {Proceedings of the joint international and annual ERCIM workshops on Principles of software evolution (IWPSE) and software evolution (Evol) workshops}, year = {2009}, note = {"In spite of these rich availability of software repositories, the heterogeneity of the data makes it difficult to apply studies at a large scale, although some research projects, like FLOSSMole [10] or FLOSSMetrics [8] are addressing these issues and aim to provide datasets about thousands of libre software projects for research purposes." (more)}, pages = {19{\textendash}24}, publisher = {ACM}, organization = {ACM}, abstract = {What is the future of software evolution? In 1974, Meir M. Lehman had a vision of software evolution being driven by empirical studies of software repositories, and of a theory based on those empirical results. However, that scenario is yet to come. 
Software evolution studies are often based on a few cases, because the needed information is scarce, dispersed and incomplete. Their conclusions are not generalizable, slowing down the progress of this research discipline. Libre (free / open source) software offers an opportunity to alleviate this situation. In this paper we describe the existing approaches to provide research datasets that are mining libre software repositories, and propose an agenda based on the concept of research friendly software repositories, which provides finer granularity and integrated data.}, keywords = {flossmetrics, flossmole cited}, attachments = {https://flosshub.org/sites/flosshub.org/files/herraiz-Research-Friendly-sw-repos.pdf}, author = {Herraiz, I. and Robles, G. and Gonzalez-Barahona, J.M.} } @article {Ripoche2006, title = {Experiences in Automating the Analysis of Linguistic Interactions for the Study of Distributed Collectives}, journal = {Computer Supported Cooperative Work (CSCW)}, volume = {15}, number = {2}, year = {2006}, note = {flossmole is mentioned in passing}, pages = {149{\textendash}183}, abstract = {An important issue faced by research on distributed collective practices is the amount and nature of the data available for study. While persistent mediated interaction offers unprecedented opportunities for research, the wealth and richness of available data pose issues on their own, calling for new methods of investigation. In such a context, automated tools can offer coverage, both within and across collectives. In this paper, we investigate the potential contributions of semantic analyses of linguistic interactions for the study of collective processes and practices. In other words, we are interested in discovering how linguistic interaction is related to collective action, as well as in exploring how computational tools can make use of these relationships for the study of distributed collectives.}, keywords = {flossmole cited}, issn = {1573-7551}, doi = {10.1007/s10606-006-9017-0}, url = {http://dx.doi.org/10.1007/s10606-006-9017-0}, author = {Gabriel Ripoche and Jean-Paul Sansonnet} }