% Bibliography entries for papers on open source software development; every
% entry in this section lists Alexander Serebrenik among its authors.
% Presumably exported from flosshub.org (the attachments fields point there) --
% TODO confirm the export source.
%
% Conventions used below:
%   * Citation keys are kept exactly as exported, because documents cite them.
%   * abstract, keywords, attachments and annote are not printed by the
%     standard styles; they are kept as database metadata.
%   * Conference papers are typed inproceedings and the book chapter is typed
%     incollection, so that the author field is actually used by the style.
%
% NOTE(review): the booktitle values of entries 1912, 1762, 1713 and 1685 were
% not part of the original export and were added during cleanup -- TODO confirm
% against the publisher records. The page range of 1685 was derived from its
% URL (file name 6146a401.pdf) and the exported value "5 pages" -- TODO confirm.

@inproceedings{1912,
  author    = {Lin, Bin and Robles, Gregorio and Serebrenik, Alexander},
  title     = {Developer Turnover in Global, Industrial Open Source Projects: Insights from Applying Survival Analysis},
  booktitle = {2017 {IEEE} 12th International Conference on Global Software Engineering ({ICGSE})},
  year      = {2017},
  month     = may,
  pages     = {66--75},
  abstract  = {Large open source software projects often have a globally distributed development team. Studies have shown developer turnover has a significant impact on the project success. Frequent developer turnover may lead to loss of productivity due to lacking relevant knowledge and spending extra time learning how projects work. Thus, lots of attention has been paid to which factors are related to developer retention; however, few of them focus on the impact of activities of individual developers. In this paper, we study five open source projects from different organizations and examine whether developer turnover is affected by when they start contributing and what types of contributions they are making. Our study reveals that developers have higher chances to survive in software projects when they 1) start contributing to the project earlier; 2) mainly modify instead of creating files; 3) mainly code instead of dealing with documentations. Our results also shed lights on the potential approaches to improving developer retention.},
  keywords  = {survival analysis},
  annote    = {we study five industrial OSS projects of different sizes (in terms of software size, contributors and number of companies involved) and examine whether the duration of developers staying in a project is related to following four factors: (i) the time of first contribution, (ii) the rate of maintaining own files, (iii) the main action type, and (iv) the main job type projects: WikiMedia, OpenStack, GlusterFS from Red Hat, Xen Project from the Linux Foundation and Apache CloudStack},
}

@inproceedings{1762,
  author      = {Vasilescu, Bogdan and Serebrenik, Alexander and Filkov, Vladimir},
  title       = {A Data Set for Social Diversity Studies of {GitHub} Teams},
  booktitle   = {Proceedings of the 12th Working Conference on Mining Software Repositories},
  year        = {2015},
  month       = may,
  publisher   = {IEEE},
  abstract    = {Like any other team oriented activity, the software development process is effected by social diversity in the programmer teams. The effect of team diversity can be significant, but also complex, especially in decentralized teams. Discerning the precise contribution of diversity on teams' effectiveness requires quantitative studies of large data sets. Here we present for the first time a large data set of social diversity attributes of programmers in GITHUB teams. Using alias resolution, location data, and gender inference techniques, we collected a team social diversity data set of 23,493 GITHUB projects. We illustrate how the data set can be used in practice with a series of case studies, and we hope its availability will foster more interest in studying diversity issues in software teams.},
  keywords    = {ghtorrent, github},
  url         = {http://www.win.tue.nl/~aserebre/cr-msr-data-15.pdf},
  attachments = {https://flosshub.org/sites/flosshub.org/files/cr-msr-data-15.pdf},
}

@inproceedings{1713,
  author      = {Vasilescu, Bogdan and Posnett, Daryl and Ray, Baishakhi and van den Brand, Mark G. J. and Serebrenik, Alexander and Devanbu, Premkumar and Filkov, Vladimir},
  title       = {Gender and Tenure Diversity in {GitHub} Teams},
  booktitle   = {Proceedings of the 33rd Annual {ACM} Conference on Human Factors in Computing Systems},
  year        = {2015},
  abstract    = {Software development is usually a collaborative venture. Open Source Software (OSS) projects are no exception; indeed, by design, the OSS approach can accommodate teams that are more open, geographically distributed, and dynamic than commercial teams. This, we find, leads to OSS teams that are quite diverse. Team diversity, predominantly in offline groups, is known to correlate with team output, mostly with positive effects. How about in OSS? Using GITHUB, the largest publicly available collection of OSS projects, we studied how gender and tenure diversity relate to team productivity and turnover. Using regression modeling of GITHUB data and the results of a survey, we show that both gender and tenure diversity are positive and significant predictors of productivity, together explaining a sizable fraction of the data variability. These results can inform decision making on all levels, leading to better outcomes in recruiting and performance.},
  keywords    = {gender, github, team},
  url         = {http://bvasiles.github.io/papers/chi15.pdf},
  attachments = {https://flosshub.org/sites/flosshub.org/files/chi15.pdf},
}

@inproceedings{1685,
  author      = {Vasilescu, Bogdan and Serebrenik, Alexander and Schuylenberg, Stef and Wulms, Jules and van den Brand, Mark G. J.},
  title       = {Continuous Integration in a Social-Coding World: Empirical Evidence from {GitHub}},
  booktitle   = {2014 {IEEE} International Conference on Software Maintenance and Evolution},
  year        = {2014},
  pages       = {401--405},
  abstract    = {Continuous integration is a software engineering practice of frequently merging all developer working copies with a shared main branch, e.g., several times a day. With the advent of GITHUB, a platform well known for its ``social coding'' features that aid collaboration and sharing, and currently the largest code host in the open source world, collaborative software development has never been more prominent. In GITHUB development one can distinguish between two types of developer contributions to a project: direct ones, coming from a typically small group of developers with write access to the main project repository, and indirect ones, coming from developers who fork the main repository, update their copies locally, and submit pull requests for review and merger. In this paper we explore how GITHUB developers use continuous integration as well as whether the contribution type (direct versus indirect) and different project characteristics (e.g., main programming language, or project age) are associated with the success of the automatic builds.},
  keywords    = {github},
  url         = {http://conferences.computer.org/icsme/2014/papers/6146a401.pdf},
  attachments = {https://flosshub.org/sites/flosshub.org/files/ICSME2014ERA.pdf},
}

@inproceedings{Robles:2014:FSD:2597073.2597129,
  author      = {Robles, Gregorio and Reina, Laura Arjona and Serebrenik, Alexander and Vasilescu, Bogdan and Gonz{\'a}lez-Barahona, Jes{\'u}s M.},
  title       = {{FLOSS} 2013: A Survey Dataset About Free Software Contributors: Challenges for Curating, Sharing, and Combining},
  booktitle   = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series      = {MSR 2014},
  year        = {2014},
  pages       = {396--399},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  abstract    = {In this data paper we describe a data set obtained by means of performing an on-line survey to over 2,000 Free Libre Open Source Software (FLOSS) contributors. The survey includes questions related to personal characteristics (gender, age, civil status, nationality, etc.), education and level of English, professional status, dedication to FLOSS projects, reasons and motivations, involvement and goals. We describe as well the possibilities and challenges of using private information from the survey when linked with other, publicly available data sources. In this regard, an example of data sharing will be presented and legal, ethical and technical issues will be discussed.},
  keywords    = {anonymization, data combining, data sharing, ethics, free software, microdata, msr data showcase, open data, open source, privacy, Survey},
  isbn        = {978-1-4503-2863-0},
  doi         = {10.1145/2597073.2597129},
  url         = {http://doi.acm.org/10.1145/2597073.2597129},
  attachments = {https://flosshub.org/sites/flosshub.org/files/msr14gregorio.pdf},
}

@inproceedings{Vasilescu:2014:SQS:2531602.2531659,
  author      = {Vasilescu, Bogdan and Serebrenik, Alexander and Devanbu, Prem and Filkov, Vladimir},
  title       = {How Social {Q\&A} Sites Are Changing Knowledge Sharing in Open Source Software Communities},
  booktitle   = {Proceedings of the 17th {ACM} Conference on Computer Supported Cooperative Work \& Social Computing},
  series      = {CSCW '14},
  year        = {2014},
  pages       = {342--354},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  abstract    = {Historically, mailing lists have been the preferred means for coordinating development and user support activities. With the emergence and popularity growth of social Q\&A sites such as the StackExchange network (e.g., StackOverflow), this is beginning to change. Such sites offer different socio-technical incentives to their participants than mailing lists do, e.g., rich web environments to store and manage content collaboratively, or a place to showcase their knowledge and expertise more vividly to peers or potential recruiters. A key difference between StackExchange and mailing lists is gamification, i.e., StackExchange participants compete to obtain reputation points and badges. In this paper, we use a case study of R (a widely-used tool for data analysis) to investigate how mailing list participation has evolved since the launch of StackExchange. Our main contribution is the assembly of a joint data set from the two sources, in which participants in both the \texttt{r-help} mailing list and StackExchange are identifiable. This permits their activities to be linked across the two resources and also over time. With this data set we found that user support activities show a strong shift away from \texttt{r-help}. In particular, mailing list experts are migrating to StackExchange, where their behaviour is different. First, participants active both on \texttt{r-help} and on StackExchange are more active than those who focus exclusively on only one of the two. Second, they provide faster answers on StackExchange than on \texttt{r-help}, suggesting they are motivated by the \emph{gamified} environment. To our knowledge, our study is the first to directly chart the changes in behaviour of specific contributors as they migrate into gamified environments, and has important implications for knowledge management in software engineering.},
  keywords    = {crowdsourced knowledge, gamification, mailing lists, open source, social Q\&A},
  isbn        = {978-1-4503-2540-0},
  doi         = {10.1145/2531602.2531659},
  url         = {http://doi.acm.org/10.1145/2531602.2531659},
  attachments = {https://flosshub.org/sites/flosshub.org/files/cscw14.pdf},
}

@inproceedings{Gousios:2014:LGG:2597073.2597126,
  author      = {Gousios, Georgios and Vasilescu, Bogdan and Serebrenik, Alexander and Zaidman, Andy},
  title       = {Lean {GHTorrent}: {GitHub} Data on Demand},
  booktitle   = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series      = {MSR 2014},
  year        = {2014},
  pages       = {384--387},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  abstract    = {In recent years, GitHub has become the largest code host in the world, with more than 5M developers collaborating across 10M repositories. Numerous popular open source projects (such as Ruby on Rails, Homebrew, Bootstrap, Django or jQuery) have chosen GitHub as their host and have migrated their code base to it. GitHub offers a tremendous research potential. For instance, it is a flagship for current open source development, a place for developers to showcase their expertise to peers or potential recruiters, and the platform where social coding features or pull requests emerged. However, GitHub data is, to date, largely underexplored. To facilitate studies of GitHub, we have created GHTorrent, a scalable, queriable, offline mirror of the data offered through the GitHub REST API. In this paper we present a novel feature of GHTorrent designed to offer customisable data dumps on demand. The new GHTorrent data-on-demand service offers users the possibility to request via a web form up-to-date GHTorrent data dumps for any collection of GitHub repositories. We hope that by offering customisable GHTorrent data dumps we will not only lower the ``barrier for entry'' even further for researchers interested in mining GitHub data (thus encourage researchers to intensify their mining efforts), but also enhance the replicability of GitHub studies (since a snapshot of the data on which the results were obtained can now easily accompany each study).},
  keywords    = {data on demand, dataset, github, msr data showcase},
  isbn        = {978-1-4503-2863-0},
  doi         = {10.1145/2597073.2597126},
  url         = {http://doi.acm.org/10.1145/2597073.2597126},
  attachments = {https://flosshub.org/sites/flosshub.org/files/lean-ghtorrent_0.pdf},
}

@inproceedings{Pletea:2014:SES:2597073.2597117,
  author      = {Pletea, Daniel and Vasilescu, Bogdan and Serebrenik, Alexander},
  title       = {Security and Emotion: Sentiment Analysis of Security Discussions on {GitHub}},
  booktitle   = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series      = {MSR 2014},
  year        = {2014},
  pages       = {348--351},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  abstract    = {Application security is becoming increasingly prevalent during software and especially web application development. Consequently, countermeasures are continuously being discussed and built into applications, with the goal of reducing the risk that unauthorized code will be able to access, steal, modify, or delete sensitive data. In this paper we gauged the presence and atmosphere surrounding security-related discussions on GitHub, as mined from discussions around commits and pull requests. First, we found that security related discussions account for approximately 10\% of all discussions on GitHub. Second, we found that more negative emotions are expressed in security-related discussions than in other discussions. These findings confirm the importance of properly training developers to address security concerns in their applications as well as the need to test applications thoroughly for security vulnerabilities in order to reduce frustration and improve overall project atmosphere.},
  keywords    = {github, mining challenge, msr challenge, security, sentiment analysis},
  isbn        = {978-1-4503-2863-0},
  doi         = {10.1145/2597073.2597117},
  url         = {http://doi.acm.org/10.1145/2597073.2597117},
  attachments = {https://flosshub.org/sites/flosshub.org/files/pletea.pdf},
}

@incollection{1630,
  author      = {Vasilescu, Bogdan and Serebrenik, Alexander and van den Brand, Mark G. J.},
  editor      = {Jatowt, Adam and Lim, Ee-Peng and Ding, Ying and Miura, Asako and Tezuka, Taro and Dias, Ga{\"e}l and Tanaka, Katsumi and Flanagin, Andrew and Dai, BingTian},
  title       = {The {Babel} of Software Development: Linguistic Diversity in Open Source},
  booktitle   = {Social Informatics},
  series      = {Lecture Notes in Computer Science},
  volume      = {8238},
  year        = {2013},
  pages       = {391--404},
  publisher   = {Springer International Publishing},
  isbn        = {978-3-319-03259-7},
  doi         = {10.1007/978-3-319-03260-3_34},
  url         = {http://dx.doi.org/10.1007/978-3-319-03260-3_34},
  attachments = {https://flosshub.org/sites/flosshub.org/files/socinfo13.pdf},
}

@inproceedings{1406,
  author      = {Poncin, Wouter and Serebrenik, Alexander and van den Brand, Mark},
  title       = {Process Mining Software Repositories},
  booktitle   = {15th European Conference on Software Maintenance and Reengineering ({CSMR} 2011)},
  year        = {2011},
  pages       = {5--14},
  publisher   = {IEEE},
  address     = {Oldenburg, Germany},
  abstract    = {Software developers' activities are in general recorded in software repositories such as version control systems, bug trackers and mail archives. While abundant information is usually present in such repositories, successful information extraction is often challenged by the necessity to simultaneously analyze different repositories and to combine the information obtained. We propose to apply process mining techniques, originally developed for business process analysis, to address this challenge. However, in order for process mining to become applicable, different software repositories should be combined, and ``related'' software development events should be matched: e.g., mails sent about a file, modifications of the file and bug reports that can be traced back to it. The combination and matching of events has been implemented in FRASR (FRamework for Analyzing Software Repositories), augmenting the process mining framework ProM. FRASR has been successfully applied in a series of case studies addressing such aspects of the development process as roles of different developers and the way bug reports are handled.},
  keywords    = {amsn, email, email archives, gcc, mailing list, Process mining, software repositories},
  isbn        = {978-1-61284-259-2},
  doi         = {10.1109/CSMR.2011.5},
  attachments = {https://flosshub.org/sites/flosshub.org/files/2011-03_CSMR.pdf},
}