% Six conference-paper entries, all listing Christian Bird as an author,
% ordered newest (2010) to oldest (2006). Citation keys are kept as exported.
% The keywords, note, abstract and attachments fields are carried as
% annotations; attachments is not a standard field and is ignored by the
% standard BibTeX styles.

% LINKSTER tool demonstration paper (2010).
@inproceedings{bird2010lee,
  author       = {Bird, Christian and Bachman, Adrian and Rahman, Foyzur and Bernstein, Abraham},
  title        = {{Linkster}: Enabling Efficient Manual Mining},
  booktitle    = {Demonstration Track, Proceedings of the 17th SIGSOFT Symposium on Foundations of Software Engineering},
  year         = {2010},
  publisher    = {ACM},
  organization = {ACM},
  keywords     = {artifacts, bug, bug tracking, data mining, email, mailing lists, open source, source code},
  note         = {``LINKSTER efficiently displays, integrates, and allows inspection and annotation of information from three main sources of data: source code repositories, developer mailing lists archives, and bug tracking databases. LINKSTER requires access to a source code repository for file content and a database which contains the raw mined repository, mailing list, and bug tracking information. All notes and annotations made by the user are also recorded in the database.''},
  abstract     = {While many uses of mined software engineering data are automatic in nature, some techniques and studies either require, or can be improved, by manual methods. Unfortunately, manually inspecting, analyzing, and annotating mined data can be difficult and tedious, especially when information from multiple sources must be integrated. Oddly, while there are numerous tools and frameworks for automatically mining and analyzing data, there is a dearth of tools which facilitate manual methods. To fill this void, we have developed LINKSTER, a tool which integrates data from bug databases, source code repositories, and mailing list archives to allow manual inspection and annotation. LINKSTER has already been used successfully by an OSS project lead to obtain data for one empirical study.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/bird2010lee.pdf},
}

% Stability of social-network metrics under missing or inadequate data (MSR 2010).
@inproceedings{969,
  author       = {Nia, Roozbeh and Bird, Christian and Devanbu, Premkumar and Filkov, Vladimir},
  title        = {Validity of network analyses in Open Source Projects},
  booktitle    = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)},
  year         = {2010},
  pages        = {201--209},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Cape Town, South Africa},
  isbn         = {978-1-4244-6802-7},
  doi          = {10.1109/MSR.2010.5463342},
  keywords     = {apache, email archives, mailing lists, missing data, mysql, perl, social networks},
  note         = {``We have mined archival records of developer mailing lists to generate reply-to social networks for the three OSS projects: Apache, MySQL, and Perl.'' ``For each of these projects, we construct an information flow network based on messages that are sent as replies to previous messages....We use this methodology on all mined data to create a network of mailing list participants.''},
  abstract     = {Social network methods are frequently used to analyze networks derived from Open Source Project communication and collaboration data. Such studies typically discover patterns in the information flow between contributors or contributions in these projects. Social network metrics have also been used to predict defect occurrence. However, such studies often ignore or side-step the issue of whether (and in what way) the metrics and networks of study are influenced by inadequate or missing data. In previous studies email archives of OSS projects have provided a useful trace of the communication and co-ordination activities of the participants. These traces have been used to construct social networks that are then subject to various types of analysis. However, during the construction of these networks, some assumptions are made, that may not always hold; this leads to incomplete, and sometimes incorrect networks. The question then becomes, do these errors affect the validity of the ensuing analysis? In this paper we specifically examine the stability of network metrics in the presence of inadequate and missing data. The issues that we study are: 1) the effect of paths with broken information flow (i.e. consecutive edges which are out of temporal order) on measures of centrality of nodes in the network, and 2) the effect of missing links on such measures. We demonstrate on three different OSS projects that while these issues do change network topology, the metrics used in the analysis are stable with respect to such changes.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/201NetworkAnalysis.pdf},
}

% Pitfalls and opportunities when mining git repositories (MSR 2009).
@inproceedings{DBLP:conf/msr/BirdRBHGD09,
  author       = {Bird, Christian and Rigby, Peter C. and Barr, Earl T. and Hamilton, David J. and Germ{\'a}n, Daniel M. and Devanbu, Premkumar T.},
  title        = {The promises and perils of mining git},
  booktitle    = {Proceedings of the 6th International Working Conference on Mining Software Repositories, MSR 2009},
  year         = {2009},
  pages        = {1--10},
  keywords     = {dscm, git, mining, scm, source code},
  abstract     = {We are now witnessing the rapid growth of decentralized source code management (DSCM) systems, in which every developer has her own repository. DSCMs facilitate a style of collaboration in which work output can flow sideways (and privately) between collaborators, rather than always up and down (and publicly) via a central repository. Decentralization comes with both the promise of new data and the peril of its misinterpretation. We focus on git, a very popular DSCM used in high-profile projects. Decentralization, and other features of git, such as automatically recorded contributor attribution, lead to richer content histories, giving rise to new questions such as ``How do contributions flow between developers to the official project repository?'' However, there are pitfalls. Commits may be reordered, deleted, or edited as they move between repositories. The semantics of terms common to SCMs and DSCMs sometimes differ markedly, potentially creating confusion. For example, a commit is immediately visible to all developers in centralized SCMs, but not in DSCMs. Our goal is to help researchers interested in DSCMs avoid these and other perils when mining and analyzing git data.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/1promisePeril.pdf},
}

% Failure prediction from combined socio-technical networks (2009).
@inproceedings{bird2009pat,
  author       = {Bird, Christian and Nagappan, Nachiappan and Devanbu, Premkumar and Gall, Harald and Murphy, Brendan},
  title        = {Putting it All Together: Using Socio-Technical Networks to Predict Failures},
  booktitle    = {Proceedings of the 17th International Symposium on Software Reliability Engineering},
  year         = {2009},
  keywords     = {eclipse, microsoft, social network, vista, windows},
  note         = {First, we build each type of network separately and use network analysis on both to gather metrics for use in a predictive model. Second, we build a socio-technical network which combines the nodes and edges from both the dependency network and the contribution network and use metrics gathered from this network in a predictive model. We evaluate our approach by collecting data from Microsoft Windows Vista and ECLIPSE development and using logistic regression analysis.},
  abstract     = {Studies have shown that social factors in development organizations have a dramatic effect on software quality. Separately, program dependency information has also been used successfully to predict which software components are more fault prone. Interestingly, the influence of these two phenomena have only been studied separately. Intuition and practical experience suggests, however, that task assignment (i.e. who worked on which components and how much) and dependency structure (which components have dependencies on others) together interact to influence the quality of the resulting software. We study the influence of combined socio-technical software networks on the fault-proneness of individual software components within a system. The network properties of a software component in this combined network are able to predict if an entity is failure prone with greater accuracy than prior methods which use dependency or contribution information in isolation. We evaluate our approach in different settings by using it on Windows Vista and across six releases of the Eclipse development environment including using models built from one release to predict failure prone components in the next release. We compare this to previous work. In every case, our method performs as well or better and is able to more accurately identify those software components that have more post-release failures, with precision and recall rates as high as 85\%.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/bird2009pat.pdf},
}

% Identifying patch submission and acceptance in OSS projects (MSR 2007).
@inproceedings{1011,
  author       = {Bird, Christian and Gourley, Alex and Devanbu, Prem},
  title        = {Detecting Patch Submission and Acceptance in {OSS} Projects},
  booktitle    = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)},
  year         = {2007},
  pages        = {26},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Minneapolis, MN, USA},
  isbn         = {0-7695-2950-X},
  doi          = {10.1109/MSR.2007.6},
  keywords     = {apache, contributions, mysql, patches, postgresql, python, scm, source code},
  abstract     = {The success of open source software (OSS) is completely dependent on the work of volunteers who contribute their time and talents. The submission of patches is the major way that participants outside of the core group of developers make contributions. We argue that the process of patch submission and acceptance into the codebase is an important piece of the open source puzzle and that the use of patch-related data can be helpful in understanding how OSS projects work. We present our methods in identifying the submission and acceptance of patches and give results and evaluation in applying these methods to the Apache webserver, Python interpreter, Postgres SQL database, and (with limitations) MySQL database projects. In addition, we present valuable ways in which this data has been and can be used.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/28300026.pdf},
}

% Building social networks from OSS email archives (MSR 2006).
@inproceedings{Bird:2006:MES:1137983.1138016,
  author       = {Bird, Christian and Gourley, Alex and Devanbu, Prem and Gertz, Michael and Swaminathan, Anand},
  title        = {Mining email social networks},
  booktitle    = {Proceedings of the 2006 international workshop on Mining software repositories},
  series       = {MSR {\textquoteright}06},
  year         = {2006},
  pages        = {137--143},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  isbn         = {1-59593-397-2},
  doi          = {10.1145/1137983.1138016},
  url          = {http://doi.acm.org/10.1145/1137983.1138016},
  keywords     = {communication, contributions, developers, email, email archives, mailing lists, open source, social networks},
  abstract     = {Communication \& Co-ordination activities are central to large software projects, but are difficult to observe and study in traditional (closed-source, commercial) settings because of the prevalence of informal, direct communication modes. OSS projects, on the other hand, use the internet as the communication medium, and typically conduct discussions in an open, public manner. As a result, the email archives of OSS projects provide a useful trace of the communication and co-ordination activities of the participants. However, there are various challenges that must be addressed before this data can be effectively mined. Once this is done, we can construct social networks of email correspondents, and begin to address some interesting questions. These include questions relating to participation in the email; the social status of different types of OSS participants; the relationship of email activity and commit activity (in the CVS repositories) and the relationship of social status with commit activity. In this paper, we begin with a discussion of our infrastructure (including a novel use of Scientific Workflow software) and then discuss our approach to mining the email archives; and finally we present some preliminary results from our data analysis.},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/137MiningEmail.pdf},
}