% Bibliography of Mining Software Repositories (MSR) related publications.
% Conventions used in this file:
%   - one field per line; authors are written as "Last, First";
%   - month uses the predefined three-letter macros, unquoted;
%   - page ranges use a double hyphen; doi holds the bare identifier;
%   - words in titles that must keep their capitalisation are brace-protected;
%   - "attachments" is a non-standard field that standard styles ignore.

@inproceedings{VGZ15,
  author       = {van der Veen, Erik and Gousios, Georgios and Zaidman, Andy},
  title        = {Automatically Prioritizing Pull Requests},
  booktitle    = {Proceedings of the 12th International Conference on Mining Software Repositories},
  year         = {2015},
  month        = may,
  publisher    = {IEEE},
  organization = {IEEE},
  abstract     = {In previous work, we observed that in the pull-based development model integrators face challenges with regard to prioritizing work in the face of multiple concurrent pull requests. We present the design and initial implementation of a prototype pull request prioritisation tool called PRioritizer. PRioritizer works like a priority inbox for pull requests, recommending the top pull requests the project owner should focus on. A preliminary user study showed that PRioritizer provides functionality that GitHub is currently lacking, even though users need more insight into how the priority ranking is established to make PRioritizer really useful.},
  url          = {http://www.gousios.gr/pub/prioritizer.pdf},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/prioritizer.pdf},
}

% This record has authors, so it is a paper inside a proceedings volume rather
% than a whole proceedings volume.
% NOTE(review): booktitle is inferred from the poster URL (poster_msr2015.pdf)
% and the May 2015 date; confirm against the published proceedings.
@inproceedings{KMGSL15,
  author       = {Karakoidas, Vassilios and Mitropoulos, Dimitris and Louridas, Panos and Gousios, Georgios and Spinellis, Diomidis},
  title        = {Generating the Blueprints of the {Java} Ecosystem},
  booktitle    = {Proceedings of the 12th International Conference on Mining Software Repositories},
  year         = {2015},
  month        = may,
  publisher    = {IEEE Computer Society},
  note         = {To appear},
  url          = {http://gaijin.dmst.aueb.gr/~bkarak/poster_msr2015.pdf},
}

@article{1796,
  author       = {Kalliamvakou, Eirini and Gousios, Georgios and Blincoe, Kelly and Singer, Leif and German, Daniel M. and Damian, Daniela},
  title        = {An in-depth study of the promises and perils of mining {GitHub}},
  journal      = {Empirical Software Engineering},
  year         = {2015},
  publisher    = {Springer},
  abstract     = {With over 10 million git repositories, GitHub is becoming one of the most important sources of software artifacts on the Internet. Researchers mine the information stored in GitHub{\textquoteright}s event logs to understand how its users employ the site to collaborate on software, but so far there have been no studies describing the quality and properties of the available GitHub data. We document the results of an empirical study aimed at understanding the characteristics of the repositories and users in GitHub; we see how users take advantage of GitHub{\textquoteright}s main features and how their activity is tracked on GitHub and related datasets to point out misalignment between the real and mined data. Our results indicate that while GitHub is a rich source of data on software development, mining GitHub for research purposes should take various potential perils into consideration. For example, we show that the majority of the projects are personal and inactive, and that almost 40\% of all pull requests do not appear as merged even though they were. Also, approximately half of GitHub{\textquoteright}s registered users do not have public activity, while the activity of GitHub users in repositories is not always easy to pinpoint. We use our identified perils to see if they can pose validity threats; we review selected papers from the MSR 2014 Mining Challenge and see if there are potential impacts to consider. We provide a set of recommendations for software engineering researchers on how to approach the data in GitHub.},
  keywords     = {github},
  issn         = {1573-7616},
  doi          = {10.1007/s10664-015-9393-5},
  url          = {http://www.gousios.gr/pub/promises-perils-github-extended.pdf},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/promises-perils-github-extended.pdf},
}

@inproceedings{HG15,
  author       = {Hauff, Claudia and Gousios, Georgios},
  title        = {Matching {GitHub} developer profiles to job advertisements},
  booktitle    = {Proceedings of the 12th International Conference on Mining Software Repositories},
  year         = {2015},
  month        = may,
  publisher    = {IEEE},
  organization = {IEEE},
  abstract     = {GitHub is a social coding platform that enables developers to efficiently work on projects, connect with other developers, collaborate and generally {\textquotedblleft}be seen{\textquotedblright} by the community. This visibility also extends to prospective employers and HR personnel who may use GitHub to learn more about a developer{\textquoteright}s skills and interests. We propose a pipeline that automatizes this process and automatically suggests matching job advertisements to developers, based on signals extracting from their activities on GitHub.},
  url          = {http://www.gousios.gr/pub/dev-profiles.pdf},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/dev-profiles.pdf},
}

% This record has authors, so it is a paper inside a proceedings volume rather
% than a whole proceedings volume.
% NOTE(review): booktitle is assumed to match the other May 2015 records in
% this file; confirm against the published proceedings.
@inproceedings{1754,
  author       = {Coelho, Roberta and Almeida, Lucas and Gousios, Georgios and van Deursen, Arie},
  title        = {Unveiling Exception Handling Bug Hazards in {Android} based on {GitHub} and {Google} {Code} Issues},
  booktitle    = {Proceedings of the 12th International Conference on Mining Software Repositories},
  year         = {2015},
  month        = may,
  abstract     = {This paper reports on a study mining the exception stack traces included in 159,048 issues reported on Android projects hosted in GitHub (482 projects) and Google Code (157 projects). The goal of this study is to investigate whether stack trace information can reveal bug hazards related to exception handling code that may lead to a decrease in application robustness. Overall 6,005 exception stack traces were extracted, and subjected to source code and bytecode analysis. The outcomes of this study include the identification of the following bug hazards: (i) unexpected cross-type exception wrappings (for instance, trying to handle an instance of OutOfMemoryError {\textquotedblleft}hidden{\textquotedblright} in a checked exception) which can make the exception-related code more complex and negatively impact the application robustness; (ii) undocumented runtime exceptions thrown by both the Android platform and third party libraries; and (iii) undocumented checked exceptions thrown by the Android Platform. Such undocumented exceptions make difficult, and most of the times infeasible for the client code to protect against {\textquotedblleft}unforeseen{\textquotedblright} situations that may happen while calling third-party code. This study provides further insights on such bug hazards and the robustness threats they impose to Android apps as well as to other systems based on the Java exception model.},
  keywords     = {github, google code},
  url          = {http://www.gousios.gr/pub/android-stacks.pdf},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/android-stacks.pdf},
}

@inproceedings{Mitropoulos:2014:BCM:2597073.2597123,
  author       = {Mitropoulos, Dimitris and Karakoidas, Vassilios and Louridas, Panos and Gousios, Georgios and Spinellis, Diomidis},
  title        = {The Bug Catalog of the {Maven} Ecosystem},
  booktitle    = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series       = {MSR 2014},
  year         = {2014},
  pages        = {372--375},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  abstract     = {Examining software ecosystems can provide the research community with data regarding artifacts, processes, and communities. We present a dataset obtained from the Maven central repository ecosystem (approximately 265GB of data) by statically analyzing the repository to detect potential software bugs. For our analysis we used FindBugs, a tool that examines Java bytecode to detect numerous types of bugs. The dataset contains the metrics results that FindBugs reports for every project version (a JAR) included in the ecosystem. For every version we also stored specific metadata such as the JAR{\textquoteright}s size, its dependencies and others. Our dataset can be used to produce interesting research results, as we show in specific examples.},
  keywords     = {findbugs, Maven Repository, msr data showcase, Software Bugs},
  isbn         = {978-1-4503-2863-0},
  doi          = {10.1145/2597073.2597123},
  url          = {http://doi.acm.org/10.1145/2597073.2597123},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/mitro.pdf},
}

@inproceedings{Gousios:2014:DPD:2597073.2597122,
  author       = {Gousios, Georgios and Zaidman, Andy},
  title        = {A Dataset for Pull-based Development Research},
  booktitle    = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series       = {MSR 2014},
  year         = {2014},
  pages        = {368--371},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  abstract     = {Pull requests form a new method for collaborating in distributed software development. To study the pull request distributed development model, we constructed a dataset of almost 900 projects and 350,000 pull requests, including some of the largest users of pull requests on Github. In this paper, we describe how the project selection was done, we analyze the selected features and present a machine learning tool set for the R statistics environment.},
  keywords     = {Distributed software development, Empirical software engineering, msr data showcase, pull request, pull-based development},
  isbn         = {978-1-4503-2863-0},
  doi          = {10.1145/2597073.2597122},
  url          = {http://doi.acm.org/10.1145/2597073.2597122},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/pullreqs-dataset.pdf},
}

@inproceedings{Gousios:2014:LGG:2597073.2597126,
  author       = {Gousios, Georgios and Vasilescu, Bogdan and Serebrenik, Alexander and Zaidman, Andy},
  title        = {Lean {GHTorrent}: {GitHub} Data on Demand},
  booktitle    = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series       = {MSR 2014},
  year         = {2014},
  pages        = {384--387},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  abstract     = {In recent years, GitHub has become the largest code host in the world, with more than 5M developers collaborating across 10M repositories. Numerous popular open source projects (such as Ruby on Rails, Homebrew, Bootstrap, Django or jQuery) have chosen GitHub as their host and have migrated their code base to it. GitHub offers a tremendous research potential. For instance, it is a flagship for current open source development, a place for developers to showcase their expertise to peers or potential recruiters, and the platform where social coding features or pull requests emerged. However, GitHub data is, to date, largely underexplored. To facilitate studies of GitHub, we have created GHTorrent, a scalable, queriable, offline mirror of the data offered through the GitHub REST API. In this paper we present a novel feature of GHTorrent designed to offer customisable data dumps on demand. The new GHTorrent data-on-demand service offers users the possibility to request via a web form up-to-date GHTorrent data dumps for any collection of GitHub repositories. We hope that by offering customisable GHTorrent data dumps we will not only lower the ``barrier for entry'' even further for researchers interested in mining GitHub data (thus encourage researchers to intensify their mining efforts), but also enhance the replicability of GitHub studies (since a snapshot of the data on which the results were obtained can now easily accompany each study).},
  keywords     = {data on demand, dataset, github, msr data showcase},
  isbn         = {978-1-4503-2863-0},
  doi          = {10.1145/2597073.2597126},
  url          = {http://doi.acm.org/10.1145/2597073.2597126},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/lean-ghtorrent_0.pdf},
}

@inproceedings{Kalliamvakou:2014:PPM:2597073.2597074,
  author       = {Kalliamvakou, Eirini and Gousios, Georgios and Blincoe, Kelly and Singer, Leif and German, Daniel M. and Damian, Daniela},
  title        = {The Promises and Perils of Mining {GitHub}},
  booktitle    = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series       = {MSR 2014},
  year         = {2014},
  pages        = {92--101},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  abstract     = {With over 10 million git repositories, GitHub is becoming one of the most important source of software artifacts on the Internet. Researchers are starting to mine the information stored in GitHub{\textquoteright}s event logs, trying to understand how its users employ the site to collaborate on software. However, so far there have been no studies describing the quality and properties of the data available from GitHub. We document the results of an empirical study aimed at understanding the characteristics of the repositories in GitHub and how users take advantage of GitHub{\textquoteright}s main features---namely commits, pull requests, and issues. Our results indicate that, while GitHub is a rich source of data on software development, mining GitHub for research purposes should take various potential perils into consideration. We show, for example, that the majority of the projects are personal and inactive; that GitHub is also being used for free storage and as a Web hosting service; and that almost 40\% of all pull requests do not appear as merged, even though they were. We provide a set of recommendations for software engineering researchers on how to approach the data in GitHub.},
  keywords     = {bias, code reviews, git, github, mining software repositories},
  isbn         = {978-1-4503-2863-0},
  doi          = {10.1145/2597073.2597074},
  url          = {http://doi.acm.org/10.1145/2597073.2597074},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/perils.pdf},
}

% The ordinal in booktitle follows the numbering used by the other records in
% this file (6th in 2009, 11th in 2014, 12th in 2015).
@inproceedings{G13,
  author       = {Gousios, Georgios},
  title        = {The {GHTorrent} dataset and tool suite},
  booktitle    = {MSR {\textquoteright}13: Proceedings of the 10th Working Conference on Mining Software Repositories},
  year         = {2013},
  month        = may,
  note         = {Example of how to use the data/tool: http://www.gousios.gr/blog/on-github-pull-requests/},
  abstract     = {A common requirement of many empirical software engineering studies is the acquisition and curation of data from software repositories. During the last few years, GitHub has emerged as a popular project hosting, mirroring and collaboration platform. GitHub provides an extensive REST API, which enables researchers to retrieve both the commits to the projects{\textquoteright} repositories and events generated through user actions on project resources. GHTorrent aims to create a scalable off line mirror of GitHub{\textquoteright}s event streams and persistent data, and offer it to the research community as a service. In this paper, we present the project{\textquoteright}s design and initial implementation and demonstrate how the provided datasets can be queried and processed.},
  url          = {http://www.gousios.gr/bibliography/G13.html},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/ghtorrent-dataset-toolsuite.pdf},
}

@inproceedings{924,
  author       = {Gousios, Georgios and Spinellis, Diomidis},
  title        = {A platform for software engineering research},
  booktitle    = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)},
  year         = {2009},
  pages        = {31--40},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Vancouver, BC, Canada},
  abstract     = {Research in the fields of software quality, maintainability and evolution requires the analysis of large quantities of data, which often originate from open source software projects. Collecting and preprocessing data, calculating metrics, and synthesizing composite results from a large corpus of project artifacts is a tedious and error prone task lacking direct scientific value. The Alitheia Core tool is an extensible platform for software quality analysis that is designed specifically to facilitate software engineering research on large and diverse data sources, by integrating data collection and preprocessing phases with an array of analysis services, and presenting the researcher with an easy to use extension mechanism. Alitheia Core aims to be the basis of an ecosystem of shared tools and research data that will enable researchers to focus on their research questions at hand, rather than spend time on re-implementing analysis tools. In this paper, we present the Alitheia Core platform in detail and demonstrate its usefulness in mining software repositories by guiding the reader through the steps required to execute a simple experiment.},
  keywords     = {alitheia core},
  isbn         = {978-1-4244-3493-0},
  doi          = {10.1109/MSR.2009.5069478},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/31gousios.pdf},
}

% The venue is a journal, so this is recorded as an article with volume/pages.
% The year follows the month (03/2009), the DOI and the citation key.
@article{SGKL09,
  author       = {Spinellis, Diomidis and Gousios, Georgios and Karakoidas, Vassilios and Louridas, Panagiotis and Adams, Paul J. and Samoladas, Ioannis and Stamelos, Ioannis},
  title        = {Evaluating the Quality of Open Source Software},
  journal      = {Electronic Notes in Theoretical Computer Science},
  volume       = {233},
  year         = {2009},
  month        = mar,
  pages        = {5--28},
  publisher    = {The Reengineering Forum},
  organization = {The Reengineering Forum},
  note         = {``the software source code and the associated data stored in the version control system, the bug tracking databases, the mailing lists, and the wikis allow us to evaluate quality in a transparent way'' ``The data collection system collects the raw data from open source projects'' Mailing lists are measured in: Number of unique subscribers, Number of messages in user/support list per month, Number of messages in developers list per month, Average thread depth},
  abstract     = {Traditionally, research on quality attributes was either kept under wraps within the organization that performed it, or carried out by outsiders using narrow, black-box techniques. The emergence of open source software has changed this picture allowing us to evaluate both software products and the processes that yield them. Thus, the software source code and the associated data stored in the version control system, the bug tracking databases, the mailing lists, and the wikis allow us to evaluate quality in a transparent way. Even better, the large number of (often competing) open source projects makes it possible to contrast the quality of comparable systems serving the same domain. Furthermore, by combining historical source code snapshots with significant events, such as bug discoveries and fixes, we can further dig into the causes and effects of problems. Here we present motivating examples, tools, and techniques that can be used to evaluate the quality of open source (and by extension also proprietary) software.},
  keywords     = {bug tracking system, email, email archives, mailing list, metrics, open source, process quality attributes, product quality attributes, source code, SQO-OSS, wiki},
  doi          = {10.1016/j.entcs.2009.02.058},
  url          = {http://www.dmst.aueb.gr/dds/pubs/conf/2008-SQM-SQOOSS/html/SGKL09.html},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/entcs-sqooss.pdf},
}

% The 978-prefixed identifier is an ISBN, so it is stored in isbn.
% No month is recorded: the exported value carried only the year.
@inproceedings{546,
  author       = {Samoladas, Ioannis and Gousios, Georgios and Spinellis, Diomidis and Stamelos, Ioannis},
  title        = {The {SQO-OSS} Quality Model: Measurement Based Open Source Software Evaluation},
  booktitle    = {OSS2008: Open Source Development, Communities and Quality (IFIP 2.13)},
  series       = {IFIP International Federation for Information Processing},
  volume       = {275},
  year         = {2008},
  pages        = {237--248},
  publisher    = {Springer},
  organization = {Springer},
  chapter      = {19},
  abstract     = {Software quality evaluation has always been an important part of software business. The quality evaluation process is usually based on hierarchical quality models that measure various aspects of software quality and deduce a characterization of the product quality being evaluated. The particular nature of open source software has rendered existing models inappropriate for detailed quality evaluations. In this paper, we present a hierarchical quality model that evaluates source code and community processes, based on automatic calculation of metric values and their correlation to a set of predefined quality profiles.},
  isbn         = {978-0-387-09683-4},
  doi          = {10.1007/978-0-387-09684-1_19},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/SQO-OSS\%20Quality\%20Model.pdf},
}