@proceedings {1498, title = {Who Does What during a Code Review? Datasets of OSS Peer Review Repositories }, year = {2013}, month = {05/2013}, abstract = {We present four datasets that are focused on the general roles of OSS peer review members. With data mined from both an integrated peer review system and code source repositories, our rich datasets comprise of peer review data that was automatically recorded. Using the Android project as a case study, we describe our extraction methodology, the datasets and their application used for three separate studies. Our datasets are available online at http://sdlab.naist.jp/reviewmining/}, keywords = {android, case study, code review, data set, peer review, roles, source code}, author = {Kazuki Hamasaki and Raula Gaikovina Kula and Norihiro Yoshida and A. E. Camargo Cruz and Kenji Fujiwara and Hajimu Iida} } @conference {1318, title = {Network-Based Analysis of the Structure and Evolution of an Open Source Software Product}, booktitle = {45th Hawai{\textquoteright}i International Conference on System Sciences}, year = {2012}, note = {"raw data about the product structure is extracted from the source code"}, month = {01/2012}, pages = {3436-3445}, abstract = {In this paper, an analysis of product structures in open source software (OSS) at both product level and module level is presented. At the product level, the product structures are modeled as complex networks, and the evolutionary characteristics of product structures are analyzed by using network analysis metrics. At the module level, linking mechanisms, which describe how a module is attached with other modules, are proposed. The linking mechanisms are modeled as probability functions dependent on the degrees of linking modules. A case study from an open source software project, Drupal, is presented. The evolutionary trends of Drupal product structures are analyzed and discussed. 
Finally, a model is presented to illustrate the effects of linking mechanisms at the module level on the product structures at the system level. The results indicate that the model built using the proposed linking mechanisms generates networks whose evolutionary characteristics are close to that of the original network.}, keywords = {drupal, source code}, author = {Le, Qize and Panchal, Jitesh H.} } @conference {1314, title = {Apples vs. oranges?}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, note = {"In this MSR challenge report, we compare the source code of two in- dustrial grade Integrated Development Environments (IDE): Netbeans, developed by Oracle Corporation, and Eclipse, developed by the Eclipse Foundation. In the same spirit as [1], our goal is not to identify how similar or different they are, but to identify differences that, if not taken into consideration, might result in biased, and potentially erroneous conclusions."}, month = {05/2011}, pages = {246-249}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {We attempt to compare the source code of two Java IDE systems: Netbeans and Eclipse. The result of this experiment shows that many factors, if ignored, could risk a bias in the results, and we posit various observations that should be taken into consideration to minimize such risk. }, keywords = {eclipse, netbeans, source code}, isbn = {9781450305747}, doi = {10.1145/1985441.1985483}, author = {Davies, Julius and Daniel M. German} } @conference {1306, title = {Java generics adoption}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, note = {"we automatically analyzed 20 open source software projects. 
We analyzed the top {\textquotedblleft}most used{\textquotedblright} projects according to ohloh.net, including only projects with significant amounts of Java code" "The 20 selected projects were Ant, Azureus, CheckStyle, Commons Collections, Free- Mind, FindBugs, Jetty, JEdit, JDT, JUnit, Eclipse-cs, Hibernate, Log4j, Lucene, Maven, the Spring Frame- work, Squirrel-SQL, Subclipse, Weka, and Xerces." "In mining the full version histories of these 20 projects, we analyzed the full content of each version of each Java source file, a total of 548,982,841 lines."}, month = {05/2011}, pages = {3-12}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {Support for generic programming was added to the Java language in 2004, representing perhaps the most significant change to one of the most widely used programming languages today. Researchers and language designers anticipated this addition would relieve many long-standing problems plaguing developers, but surprisingly, no one has yet measured whether generics actually provide such relief. In this paper, we report on the first empirical investigation into how Java generics have been integrated into open source software by automatically mining the history of 20 popular open source Java programs, traversing more than 500 million lines of code in the process. We evaluate five hypotheses, each based on assertions made by prior researchers, about how Java developers use generics. 
For example, our results suggest that generics do not significantly reduce the number of type casts and that generics are usually adopted by a single champion in a project, rather than all committers.}, keywords = {commits, generics, java, source code, version history}, isbn = {9781450305747}, doi = {10.1145/1985441.1985446}, author = {Christian Bird and Murphy-Hill, Emerson and Parnin, Chris} } @conference {963, title = {Automated dependency resolution for open source software}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {130 - 140}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Opportunities for software reuse are plentiful, thanks in large part to the widespread adoption of open source processes and the availability of search engines for locating relevant artifacts. One challenge presented by open source software reuse is simply getting a newly downloaded artifact to build/run in the first place. The artifact itself likely reuses other artifacts, and so depends on their being located to function properly. While merely tedious in the individual case, this can cause serious difficulties for those seeking to study open source software. It is simply not feasible to manually resolve dependencies for thousands of projects, and many forms of analysis require declarative completeness. In this paper we present a method for automatically resolving dependencies for open source software. It works by cross-referencing a project{\textquoteright}s missing type information with a repository of candidate artifacts. We have implemented this method on top of the Sourcerer, an infrastructure for the large-scale indexing and analysis of open source code. The performance of our resolution algorithm was evaluated in two parts. 
First, for a small number of popular open source projects, we manually examined the artifacts suggested by our system to determine if they were appropriate. Second, we applied the algorithm to the 13,241 projects in the Sourcerer managed repository to evaluate the rate of resolution success. The results demonstrate the feasibility of this approach, as the algorithm located all of the required artifacts needed by 3,904 additional projects, increasing the percentage of declaratively complete projects in Sourcerer from 39\% to 69\%.}, keywords = {dependencies, java, source code, sourcerer}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463346}, author = {Ossher, Joel and Bajracharya, Sushil and Lopes, Cristina} } @conference {954, title = {Clones: What is that smell?}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {72 - 81}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Clones are generally considered bad programming practice in software engineering folklore. They are identified as a bad smell and a major contributor to project maintenance difficulties. Clones inherently cause code bloat, thus increasing project size and maintenance costs. In this work, we try to validate the conventional wisdom empirically to see whether cloning makes code more defect prone. This paper analyses relationship between cloning and defect proneness. We find that, first, the great majority of bugs are not significantly associated with clones. Second, we find that clones may be less defect prone than non-cloned code. Finally, we find little evidence that clones with more copies are actually more error prone. Our findings do not support the claim that clones are really a "bad smell". 
Perhaps we can clone, and breathe easy, at the same time.}, keywords = {apache, bug fix revisions, bugs, clone, evolution, gimp, nautilus, scm, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463343}, attachments = {https://flosshub.org/sites/flosshub.org/files/72rahman2010cws.pdf}, author = {Rahman, Foyzur and Christian Bird and Devanbu, Premkumar} } @conference {958, title = {Cloning and copying between GNOME projects}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {98 - 101}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {This paper presents an approach to automatically distinguish the copied clone from the original in a pair of clones. It matches the line-by-line version information of a clone to the pair{\textquoteright}s other clone. A case study on the GNOME Desktop Suite revealed a complex flow of reused code between the different subprojects. 
In particular, it showed that the majority of larger clones (with a minimal size of 28 lines or higher) exist between the subprojects and more than 60\% of the clone pairs can be automatically separated into original and copy.}, keywords = {clone, gnome, msr challenge, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463290}, attachments = {https://flosshub.org/sites/flosshub.org/files/98Coning.pdf}, author = {Krinke, Jens and Gold, Nicolas and Jia, Yue and Binkley, David} } @conference {951, title = {The evolution of ANT build systems}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {42 - 51}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Build systems are responsible for transforming static source code artifacts into executable software. While build systems play such a crucial role in software development and maintenance, they have been largely ignored by software evolution researchers. With a firm understanding of build system aging processes, project managers could allocate personnel and resources to build system maintenance tasks more effectively, reducing the build maintenance overhead on regular development activities. In this paper, we study the evolution of ANT build systems from two perspectives: (1) a static perspective, where we examine the build system specifications using software metrics adopted from the source code domain; and (2) a dynamic perspective where representative sample build runs are conducted and their output logs are analyzed. 
Case studies of four open source ANT build systems with a combined history of 152 releases show that not only do ANT build systems evolve, but also that they need to react in an agile manner to changes in the source code.}, keywords = {ant, argouml, build, eclipse, jboss, maintenance, metrics, source code, tomcat}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463341}, attachments = {https://flosshub.org/sites/flosshub.org/files/42msr2010_mcintosh.pdf}, author = {McIntosh, Shane and Adams, Bram and Hassan, Ahmed E.} } @conference {1358, title = {Extracting source code from e-mails}, booktitle = {Proceedings of ICPC 2010 (18th IEEE International Conference on Program Comprehension)}, year = {2010}, note = {"We want to extract source code fragments from e-mail messages. To do this, we first need to select e-mails that contain source code fragments, and then we extract such fragments from the content in which they are enclosed." "we manually build a statistically significant benchmark taking sample e- mails from five unrelated open source Java software systems." }, pages = {24-33}, abstract = {E-mails, used by developers and system users to communicate over a broad range of topics, offer a valuable source of information. If archived, e-mails can be mined to support program comprehension activities and to provide views of a software system that are alternative and complementary to those offered by the source code. However, e-mails are written in natural language, and therefore contain noise that makes it difficult to retrieve the important data. Thus, before conducting an effective system analysis and extracting data for program comprehension, it is necessary to select the relevant messages, and to expose only the meaningful information. In this work we focus both on classifying e-mails that hold fragments of the source code of a system, and on extracting the source code pieces inside the e-mail. 
We devised and analyzed a number of lightweight techniques to accomplish these tasks. To assess the validity of our techniques, we manually inspected and annotated a statistically significant number of e-mails from five unrelated open source software systems written in Java. With such a benchmark in place, we measured the effectiveness of each technique in terms of precision and recall.}, keywords = {argouml, email, freenet, jmeter, mailing lists, mina, natural language, openjpa, source code}, url = {http://www.inf.usi.ch/phd/bacchelli/publications.php}, attachments = {https://flosshub.org/sites/flosshub.org/files/icpc2010.pdf}, author = {Bacchelli, Alberto and D{\textquoteright}Ambros, Marco and Lanza, Michele} } @conference {959, title = {Finding file clones in FreeBSD Ports Collection}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {102 - 105}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {In Open Source System (OSS) development, software components are often imported and reused; for this reason we might expect that files are copied in multiple projects (file clones). In this paper, we propose a file clone detection tool called FCFinder and show the analysis performed with it on the FreeBSD Ports Collection, a large OSS project collection. 
We found many file clones among similar or related projects, which are systematically introduced from base projects.}, keywords = {clone, freebsd, msr challenge, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463293}, attachments = {https://flosshub.org/sites/flosshub.org/files/102FreeBSDClones.pdf}, author = {Sasaki, Yusuke and Yamamoto, Tetsuo and Hayase, Yasuhiro and Inoue, Katsuro} } @conference {965, title = {Identifying licensing of jar archives using a code-search approach}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {151 - 160}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Free and open source software strongly promotes the reuse of source code. Some open source Java components/libraries are distributed as jar archives only containing the bytecode and some additional information. For whoever wanting to integrate this jar in her own project, it is important to determine the license(s) of the code from which the jar archive was produced, as this affects the way that such component can be used. This paper proposes an automatic approach to determine the license of jar archives, combining the use of a code-search engine with the automatic classification of licenses contained in textual flies enclosed in the jar. 
Results of an empirical study performed on 37 jars - from 17 different systems - indicate that this approach is able to successfully infer the jar licenses in over 95\% of the cases, but that in many cases the license in textual flies may differ from the one of the classes contained in the jar.}, keywords = {apache, bytecode, classification, eclipse, google code, jar, java, licenses, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463282}, attachments = {https://flosshub.org/sites/flosshub.org/files/151msr2010.pdf}, author = {Di Penta, Massimiliano and Daniel M. German and Antoniol, Giuliano} } @conference {bird2010lee, title = {{Linkster: Enabling Efficient Manual Mining}}, booktitle = {Demonstration Track, Proceedings of the 17th SIGSOFT Symposium on Foundations of Software Engineering}, year = {2010}, note = {"LINKSTER efficiently displays, integrates, and allows inspection and annotation of information from three main sources of data: source code repositories, developer mailing lists archives, and bug tracking databases. LINKSTER requires access to a source code repository for file content and a database which contains the raw mined repository, mailing list, and bug tracking information. All notes and annotations made by the user are also recorded in the database."}, publisher = {ACM}, organization = {ACM}, abstract = {While many uses of mined software engineering data are automatic in nature, some techniques and studies either require, or can be improved, by manual methods. Unfortunately, manually inspecting, analyzing, and annotating mined data can be difficult and tedious, especially when information from multiple sources must be integrated. Oddly, while there are numerous tools and frameworks for automatically mining and analyzing data, there is a dearth of tools which facilitate manual methods. 
To fill this void, we have developed LINKSTER, a tool which integrates data from bug databases, source code repositories, and mailing list archives to allow manual inspection and annotation. LINKSTER has already been used successfully by an OSS project lead to obtain data for one empirical study.}, keywords = {artifacts, bug, bug tracking, data mining, email, mailing lists, open source, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/bird2010lee.pdf}, author = {Christian Bird and Adrian Bachman and Rahman, Foyzur and Bernstein, Abraham} } @conference {964, title = {Mining subclassing directives to improve framework reuse}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {141 - 150}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {To help developers in using frameworks, good documentation is crucial. However, it is a challenge to create high quality documentation especially of hotspots in white-box frameworks. This paper presents an approach to documentation of object-oriented white-box frameworks which mines from client code four different kinds of documentation items, which we call subclassing directives. A case study on the Eclipse JFace user-interface framework shows that the approach can improve the state of API documentation w.r.t. 
subclassing directives.}, keywords = {api, documentation, eclipse, frameworks, jface, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463347}, attachments = {https://flosshub.org/sites/flosshub.org/files/141Mining-Subclassing-Directives-to-Improve-Framework-Reuse.pdf}, author = {Bruch, Marcel and Mezini, Mira and Monperrus, Martin} } @conference {Meirelles:2010:SRS:1915078.1916012, title = {A Study of the Relationships between Source Code Metrics and Attractiveness in Free Software Projects}, booktitle = {Proceedings of the 2010 Brazilian Symposium on Software Engineering}, series = {SBES {\textquoteright}10}, year = {2010}, pages = {11{\textendash}20}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {A significant number of Free Software projects has been widely used and considered successful. However, there is an even larger number of them that cannot overcome the initial steps towards building an active community of users and developers. In this study, we investigated whether there are relationships between source code metrics and attractiveness, i.e., the ability of a project to attract users and developers. To verify these relationships, we analyzed 6,773 Free Software projects from the SourceForge.net repository. The results indicated that attractiveness is indeed correlated to some source code metrics. This suggests that measurable attributes of the project source code somehow affect the decision to contribute to and adopt a Free Software. 
The findings described in this paper show that it is relevant for project leaders to monitor source code quality, particularly a few objective metrics, since these can have a positive influence in projects chances of forming a community of contributors and users around their software, enabling further enhancement in quality.}, keywords = {source code, source code analysis, sourceforge, user satisfaction, users}, isbn = {978-0-7695-4273-7}, doi = {10.1109/SBES.2010.27}, url = {http://dx.doi.org/10.1109/SBES.2010.27}, attachments = {https://flosshub.org/sites/flosshub.org/files/sourcecode_attractiveness.pdf}, author = {Meirelles, Paulo and Santos Jr., Carlos and Miranda, Joao and Kon, Fabio and Terceiro, Antonio and Chavez, Christina} } @article {1385, title = {Survival analysis on the duration of open source projects}, journal = {Information and Software Technology}, volume = {52}, year = {2010}, note = {"For the purposes of our study we used data coming only from source code repositories."}, month = {9/2010}, pages = {902 - 922}, abstract = {Context Open source (FLOSS) project survivability is an important piece of information for many open source stakeholders. Coordinators of open source projects would like to know the chances for the survival of the projects they coordinate. Companies are also interested in knowing how viable a project is in order to either participate or invest in it, and volunteers want to contribute to vivid projects. Objective The purpose of this article is the application of survival analysis techniques for estimating the future development of a FLOSS project. Method In order to apply such approach, duration data regarding FLOSS projects from the FLOSSMETRICS (This work was partially supported by the European Community{\textquoteright}s Sixth Framework Program under the Contract FP6-033982) database were collected. Such database contains metadata for thousands of FLOSS projects, derived from various forges. 
Subsequently, survival analysis methods were employed to predict the survivability of the projects, i.e. their probability of continuation in the future, by examining their duration, combined with other project characteristics such as their application domain and number of committers. Results It was shown how probability of termination or continuation may be calculated and how a prediction model may be built to upraise project future. In addition, the benefit of adding more committers to FLOSS projects was quantified. Conclusion Analysis results demonstrate the usefulness of the proposed framework for assessing the survival probability of a FLOSS project. }, keywords = {flossmetrics, prediction, source code, survival analysis}, issn = {09505849}, doi = {10.1016/j.infsof.2010.05.001}, author = {Samoladas, Ioannis and Lefteris Angelis and Ioannis Stamelos} } @conference {1217, title = {Trends That Affect Temporal Analysis Using SourceForge Data}, booktitle = {5th Workshop on Public Data about Software Development (WoPDaSD 2010)}, year = {2010}, note = {"In this paper we examine some of the limitations of artifact data by specifically addressing the applicability of SourceForge data to the study of project evolution." "For our analysis we examine 9,997 Production/Stable or Maintenance phase projects stored in CVS on SourceForge and extracted in October of 2006 [5]"}, abstract = {SourceForge is a valuable source of software artifact data for researchers who study project evolution and developer behavior. However, the data exhibit patterns that may bias temporal analyses. Most notable are cliff walls in project source code repository timelines, which indicate large commits that are out of character for the given project. These cliff walls often hide significant periods of development and developer collaboration{\textemdash}a threat to studies that rely on SourceForge repository data. 
We demonstrate how to identify these cliff walls, discuss reasons for their appearance, and propose preliminary measures for mitigating their effects in evolution-oriented studies.}, keywords = {cliff walls, committers, cvs, evolution, growth, source code, sourceforge, time, time series}, attachments = {https://flosshub.org/sites/flosshub.org/files/wopdasd001.pdf}, author = {MacLean, Alexander C. and Pratt, Landon J. and Krein, Jonathan L. and Knutson, Charles D.} } @conference {839, title = {Amassing and indexing a large sample of version control systems: towards the census of public source code history}, booktitle = {6th IEEE Working Conference on Mining Software Repositories}, year = {2009}, note = {Related work: "For example, FLOSSMole [8] collects the list of projects and their metadata from SourceForge, FreshMeat, Free Software Foundation (FSF), RubyForge, and ObjectWeb. " "The list of home pages from open source directories such as FSF or FLOSSMole could provide a focused set of web pages for the search" "To get the list of projects for SourceForge we used FLOSSMole project [8] that collects the list of SourceForge projects and their metadata. We use project ID, because it was used to specify the VCS URL within the SourceForge repository."}, month = {May 16{\textendash}17}, abstract = {The source code and its history represent the output and process of software development activities and are an in- valuable resource for study and improvement of software development practice. While individual projects and groups of projects have been extensively analyzed, some fundamental questions, such as the spread of innovation or genealogy of the source code, can be answered only by considering the entire universe of publicly available source code and its history. 
We describe methods we developed over the last six years to gather, index, and update an approximation of such a universal repository for publicly accessible version control systems and for the source code inside a large corporation. While challenging, the task is achievable with limited resources. The bottlenecks in network bandwidth, processing, and disk access can be dealt with using inherent parallelism of the tasks and suitable tradeoffs between the amount of storage and computations, but a completely automated discovery of public version control systems may require enticing participation of the sampled projects. Such universal repository would allow studies of global properties and origins of the source code that are not possible through other means.}, keywords = {bazaar, cvs, flossmole, git, mercurial, source code, sourceforge, subversion, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/11amassing.pdf}, author = {Audris Mockus} } @conference {943, title = {Automatic labeling of software components and their evolution using log-likelihood ratio of word frequencies in source code}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)2009 6th IEEE International Working Conference on Mining Software Repositories}, year = {2009}, pages = {175 - 178}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {As more and more open-source software components become available on the Internet we need automatic ways to label and compare them. For example, a developer who searches for reusable software must be able to quickly gain an understanding of retrieved components. This understanding cannot be gained at the level of source code due to the semantic gap between source code and the domain model. In this paper we present a lexical approach that uses the log-likelihood ratios of word frequencies to automatically provide labels for software components. 
We present a prototype implementation of our labeling/comparison algorithm and provide examples of its application. In particular, we apply the approach to detect trends in the evolution of a software system.}, keywords = {frequency, hapax, information retrieval, java, junit, keywords, labeling, source code}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069499}, attachments = {https://flosshub.org/sites/flosshub.org/files/175AutomaticLabeling.pdf}, author = {Kuhn, Adrian} } @conference {929, title = {Code siblings: Technical and legal implications of copying code between applications}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)2009 6th IEEE International Working Conference on Mining Software Repositories}, year = {2009}, pages = {81 - 90}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Source code cloning does not happen within a single system only. It can also occur between one system and another. We use the term code sibling to refer to a code clone that evolves in a different system than the code from which it originates. Code siblings can only occur when the source code copyright owner allows it and when the conditions imposed by such license are not incompatible with the license of the destination system. In some situations copying of source code fragments are allowed - legally - in one direction, but not in the other. In this paper, we use clone detection, license mining and classification, and change history techniques to understand how code siblings - under different licenses - flow in one direction or the other between Linux and two BSD Unixes, FreeBSD and OpenBSD. Our results show that, in most cases, this migration appears to happen according to the terms of the license of the original code being copied, favoring always copying from less restrictive licenses towards more restrictive ones. 
We also discovered that sometimes code is inserted to the kernels from an outside source.}, keywords = {bsd, fossology, freebsd, linux, openbsd, source code}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069483}, attachments = {https://flosshub.org/sites/flosshub.org/files/81CodeSiblings.pdf}, author = {Daniel M. German and Di Penta, Massimiliano and Gueheneuc, Yann-Gael and Antoniol, Giuliano} } @conference {1264, title = {The Commit Size Distribution of Open Source Software}, booktitle = {2009 42nd Hawaii International Conference on System Sciences (HICSS 2009)}, year = {2009}, note = {"We use the database of the open source analytics firm Ohloh Inc." "This article is based on a March 2008 database snapshot, which contains 9,363 completely crawled and analyzed projects covering a time frame from January 1990 to February 2008." "The Ohloh database provides the complete configuration management history of each crawled project (to the extent available on the web). Thus, every single commit action of all the projects over their entire history is available." "We measure the size of commits in this paper in source lines of code (SLoC) using Ohloh{\textquoteright}s own open source diff too"}, pages = {1 - 8}, publisher = {IEEE}, organization = {IEEE}, address = {Waikoloa, Hawaii, USA}, abstract = {With the growing economic importance of open source, we need to improve our understanding of how open source software development processes work. The analysis of code contributions to open source projects is an important part of such research. In this paper we analyze the size of code contributions to more than 9,000 open source projects. We review the total distribution and distinguish three categories of code contributions using a size-based heuristic: single focused commits, aggregate team contributions, and repository refactorings. We find that both the overall distribution and the individual categories follow a power law. 
We also suggest that distinguishing these commit categories by size will benefit future analyses.}, keywords = {commits, configuration management, history, lines of code, sloc, source code}, isbn = {978-0-7695-3450-3}, doi = {10.1109/HICSS.2009.421}, attachments = {https://flosshub.org/sites/flosshub.org/files/07-07-07.pdf}, author = {Arafat, O. and Dirk Riehle} } @conference {10.1109/HICSS.2009.687, title = {Easier Said than Done: An Empirical Investigation of Software Design and Quality in Open Source Software Development}, booktitle = {2009 42nd Hawaii International Conference on System Sciences (HICSS 2009)}, year = {2009}, note = {"projects that do not exhibit obvious corporate or organizational sponsorship or involvement, projects with at least one software release in a relatively mature development stage (Beta or Production/Stable), projects exhibiting a minimum level of project activity, and products written only using Java." "The frame includes approximately 180 projects from which a random sample of 46 was drawn" "For each project, we sample each major software release (e.g. 1.x, 2.x) as our unit of analysis for a total of 203 releases." 
"We calculate the degree of modularity for each major software release sampled using the source code contained in each release" "To measure intrinsic software quality, we calculate the number of static bugs and software complexity based on objective evaluations of the source code included in the software release using two static source code analysis tools" "To assess customer satisfaction, we use three measures of software quality previously identified in OSS research as a proxy for software quality and OSS project success [3]: number of bugs reported, percentage of bugs closed, and time to close bugs."}, pages = {1--10}, publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA}, abstract = {We empirically examine the relationship between software design modularity and software quality in open source software (OSS) development projects. Conventional wisdom suggests that degree of software modularity affects software quality. An analysis of 203 software releases in 46 OSS projects hosted on SourceForge.net lends support for a more complex relationship between software modularity and software quality than conventional wisdom suggests. We find that software modularity is associated with reduced software complexity, an increased number of static software bugs, and a mixed relationship with the percentage of bugs closed. We do not find empirical evidence supporting any relationship between modularity and other measures of customer satisfaction. In addition to empirically testing the relationship between modularity and quality, we introduce new measures of software modularity and software quality. Implications are developed for the theory of modularity and the practice of software development.}, keywords = {modularity, quality, source code, sourceforge}, isbn = {978-0-7695-3450-3}, doi = {10.1109/HICSS.2009.687}, attachments = {https://flosshub.org/sites/flosshub.org/files/09-14-05.pdf}, author = {Conley, Caryn A. 
and Lee Sproull} } @conference {944, title = {Learning from defect removals}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {179--182}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Recent research has tried to identify changes in source code repositories that fix bugs by linking these changes to reports in issue tracking systems. These changes have been traced back to the point in time when they were previously modified as a way of identifying bug introducing changes. But we observe that not all changes linked to bug tracking systems are fixing bugs; some are enhancing the code. Furthermore, not all fixes are applied at the point in the code where the bug was originally introduced. We flesh out these observations with a manual review of several software projects, and use this opportunity to see how many defects are in the scope of static analysis tools.}, keywords = {bug fixing, bugzilla, change management, cherry, cvs, eclipse, groovy, launching, source code, svn, text editor}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069500}, attachments = {https://flosshub.org/sites/flosshub.org/files/179LearnFromDefects-MSR09.pdf}, author = {Ayewah, Nathaniel and Pugh, William} } @conference {935, title = {Mining search topics from a code search engine usage log}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {111--120}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {We present a topic modeling analysis of a year long usage log of Koders, one of the major commercial code search engines. 
This analysis contributes to the understanding of what users of code search engines are looking for. Observations on the prevalence of these topics among the users, and on how search and download activities vary across topics, leads to the conclusion that users who find code search engines usable are those who already know to a high level of specificity what to look for. This paper presents a general categorization of these topics that provides insights on the different ways code search engine users express their queries. The findings support the conclusion that existing code search engines provide only a subset of the various information needs of the users when compared to the categories of queries they look at.}, keywords = {analysis, black duck, koders, log, logfile, search, source code}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069489}, author = {Bajracharya, Sushil and Lopes, Cristina} } @conference {DBLP:conf/msr/BirdRBHGD09, title = {The promises and perils of mining git}, booktitle = {Proceedings of the 6th International Working Conference on Mining Software Repositories, MSR 2009}, year = {2009}, pages = {1--10}, abstract = {We are now witnessing the rapid growth of decentralized source code management (DSCM) systems, in which every developer has her own repository. DSCMs facilitate a style of collaboration in which work output can flow sideways (and privately) between collaborators, rather than always up and down (and publicly) via a central repository. Decentralization comes with both the promise of new data and the peril of its misinterpretation. We focus on git, a very popular DSCM used in high-profile projects. Decentralization, and other features of git, such as automatically recorded contributor attribution, lead to richer content histories, giving rise to new questions such as "How do contributions flow between developers to the official project repository?" However, there are pitfalls. 
Commits may be reordered, deleted, or edited as they move between repositories. The semantics of terms common to SCMs and DSCMs sometimes differ markedly, potentially creating confusion. For example, a commit is immediately visible to all developers in centralized SCMs, but not in DSCMs. Our goal is to help researchers interested in DSCMs avoid these and other perils when mining and analyzing git data.}, keywords = {dscm, git, mining, scm, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/1promisePeril.pdf}, author = {Christian Bird and Peter C. Rigby and Earl T. Barr and David J. Hamilton and Daniel M. Germ{\'a}n and Premkumar T. Devanbu} } @article {Capiluppi200989, title = {Quality Factors and Coding Standards - a Comparison Between Open Source Forges}, journal = {Electronic Notes in Theoretical Computer Science}, volume = {233}, year = {2009}, note = {Proceedings of the International Workshop on Software Quality and Maintainability (SQM 2008)}, pages = {89--103}, abstract = {Enforcing adherence to standards in software development in order to produce high quality software artefacts has long been recognised as best practice in traditional software engineering. In a distributed heterogeneous development environment such those found within the Open Source paradigm, coding standards are informally shared and adhered to by communities of loosely coupled developers. Following these standards could potentially lead to higher quality software. This paper reports on the empirical analysis of two major forges where OSS projects are hosted. The first one, the KDE forge, provides a set of guidelines and coding standards in the form of a coding style that developers may conform to when producing the code source artefacts. The second studied forge, SourceForge, imposes no formal coding standards on developers. 
A sample of projects from these two forges has been analysed to detect whether the SourceForge sample, where no coding standards are reinforced, has a lower quality than the sample from KDE. Results from this analysis form a complex picture; visually, all the selected metrics show a clear divide between the two forges, but from the statistical standpoint, clear distinctions cannot be drawn amongst these quality related measures in the two forge samples.}, keywords = {artefacts, artifacts, coding standards, coding style, complexity, forge, forges, kde, metrics, quality, source code, sourceforge}, issn = {1571-0661}, doi = {10.1016/j.entcs.2009.02.063}, url = {http://www.sciencedirect.com/science/article/B75H1-4VXDKRV-7/2/abcc2be2c4c3998e4bc9b53473ca2d81}, author = {Capiluppi, Andrea and Boldyreff, Cornelia and Karl Beecher and Paul J. Adams} } @conference {1066, title = {SourcererDB: An aggregated repository of statically analyzed and cross-linked open source Java projects}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {183--186}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {The open source movement has made vast quantities of source code available online for free, providing an extremely large dataset for empirical study and potential re-use. A major difficulty in exploiting this potential fully is that the data are currently scattered between competing source code repositories, none of which are structured for empirical analysis and cross-project comparison. As a result, software researchers and developers are left to compile their own datasets, resulting in duplicated effort and limited results. To address this challenge, we built SourcererDB, an aggregated repository of statically analyzed and cross-linked open source Java projects. SourcererDB contains local snapshots of 2,852 Java projects taken from Sourceforge, Apache and Java.net. 
These projects are statically analyzed to extract rich structural information, which is then stored in a relational database. References to entities in the 16,058 external jars are resolved and grouped, allowing for cross-project usage information to be accessed easily. This paper describes: (a) the mechanism for resolving and grouping these cross-project references, (b) the structure of and the metamodel for the SourcererDB repository, and (d) end-user dataset access mechanisms. Our goal in building SourcererDB is to provide a rich dataset of source code to facilitate the sharing of extracted data and to encourage reuse and repeatability of experiments.}, keywords = {apache, java, java.net, source code, sourceforge, sourcerer}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069501}, author = {Ossher, Joel and Bajracharya, Sushil and Linstead, Erik and Baldi, Pierre and Lopes, Cristina} } @conference {938, title = {Using association rules to study the co-evolution of production \& test code}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {151--154}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Unit tests are generally acknowledged as an important aid to produce high quality code, as they provide quick feedback to developers on the correctness of their code. In order to achieve high quality, well-maintained tests are needed. Ideally, tests co-evolve with the production code to test changes as soon as possible. In this paper, we explore an approach based on association rule mining to determine whether production and test code co-evolve synchronously. 
Through two case studies, one with an open source and another one with an industrial software system, we show that our association rule mining approach allows one to assess the co-evolution of product and test code in a software project and, moreover, to uncover the distribution of programmer effort over pure coding, pure testing, or a more test-driven-like practice.}, keywords = {association rules, checkstyle, source code, unit test}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069493}, attachments = {https://flosshub.org/sites/flosshub.org/files/151UsingAssociation.pdf}, author = {Lubsen, Zeeger and Zaidman, Andy and Pinzger, Martin} } @article {denBesten2008316, title = {The allocation of collaborative efforts in open-source software}, journal = {Information Economics and Policy}, volume = {20}, number = {4}, year = {2008}, note = {"we have selected a set of 10 large open-source projects" apache, cvs, gaim, gcc, ghostscript, mozilla, netbsd, openssh, postgresql, python "Our data were extracted from logs of development activity generated by software version control systems. For each project in the selection, we extracted CVS development logs" "We notably computed for each file in the sample, and for each month in its history, the number of distinct maintainers that had committed a change during that month, and the number of commits, the blocks of code addition, each file had received during that month." "other variables used in the regressions are proxies for the size, age, and granularity of files; the size of a file is represented as its number of lines of code (LOCs), its age by its creation date (Youth), and its granularity by the number of functions it contains."}, pages = {316--322}, abstract = {The article investigates the allocation of collaborative efforts among core developers (maintainers) of open-source software by analyzing on-line development traces (logs) for a set of 10 large projects. 
Specifically, we investigate whether the division of labor within open-source projects is influenced by characteristics of software code. We suggest that the collaboration among maintainers tends to be influenced by different measures of code complexity. We interpret these findings by providing preliminary evidence that the organization of open-source software development would self-adapt to characteristics of the code base, in a {\textquoteright}stigmergic{\textquoteright} manner.}, keywords = {age, apache, complexity, cvs, division of labor, functions, gaim, gcc, ghostscript, lines of code, loc, log files, mozilla, netbsd, openssh, postgresql, python, revision control, scm, size, source code, Stigmergy, version control}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.06.003}, url = {http://www.sciencedirect.com/science/article/B6V8J-4SSG4PN-1/2/88b3824c30a31c18929d8a5ca6d64f62}, author = {den Besten, Matthijs and Jean-Michel Dalle and Galia, Fabrice} } @conference {Hill:2008:AAM:1370750.1370771, title = {AMAP: automatically mining abbreviation expansions in programs to enhance software maintenance tools}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {79--88}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {When writing software, developers often employ abbreviations in identifier names. In fact, some abbreviations may never occur with the expanded word, or occur more often in the code. However, most existing program comprehension and search tools do little to address the problem of abbreviations, and therefore may miss meaningful pieces of code or relationships between software artifacts. In this paper, we present an automated approach to mining abbreviation expansions from source code to enhance software maintenance tools that utilize natural language information. 
Our scoped approach uses contextual information at the method, program, and general software level to automatically select the most appropriate expansion for a given abbreviation. We evaluated our approach on a set of 250 potential abbreviations and found that our scoped approach provides a 57\% improvement in accuracy over the current state of the art.}, keywords = {automatic abbreviation expansion, azureus, itext.net, liferay, maintenance, natural language, openoffice.org, program comprehension, source code, tiger envelopes, tools}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370771}, url = {http://doi.acm.org/10.1145/1370750.1370771}, attachments = {https://flosshub.org/sites/flosshub.org/files/p79-hill.pdf}, author = {Hill, Emily and Fry, Zachary P. and Boyd, Haley and Sridhara, Giriprasad and Novikova, Yana and Pollock, Lori and Vijay-Shanker, K.} } @conference {Wermelinger:2008:AEE:1370750.1370783, title = {Analyzing the evolution of eclipse plugins}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, pages = {133--136}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Eclipse is a good example of a modern component-based complex system that is designed for long-term evolution, due to its architecture of reusable and extensible components. This paper presents our preliminary results about the evolution of Eclipse{\textquoteright}s architecture, based on a lightweight and scalable analysis of the metadata in Eclipse{\textquoteright}s sources. We find that the development of Eclipse follows a systematic process: most architectural changes take place in milestones, and maintenance releases only make exceptional changes to component dependencies. 
We also found a stable architectural core that remains since the first release.}, keywords = {architectural evolution, cvs, eclipse, metadata, msr challenge, releases, source code}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370783}, url = {http://doi.acm.org/10.1145/1370750.1370783}, author = {Wermelinger, Michel and Yu, Yijun} } @conference {1211, title = {Are FLOSS developers committing to CVS/SVN as much as they are talking in mailing lists? Challenges for Integrating data from Multiple Repositories}, booktitle = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)}, year = {2008}, month = {09/2008}, pages = {49--54}, abstract = {This paper puts forward a framework for investigating Free and Open Source Software (F/OSS) developers activities in both source code and mailing lists repositories. We used data dumps of fourteen projects from the FLOSSMetrics (FM) retrieval system. Our intentions are (i) to present a possible methodology, its advantages and disadvantages which can benefit future researchers using some aspects of the FM retrieval system{\textquoteright}s data dumps, and (ii) discuss our initial research results on the contributions developers make to both coding and lists activities.}, keywords = {cvs, cvsanaly, developers, email, email archives, flossmetrics, mailing list, mlstats, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/49-542008.pdf}, author = {Sowe, Sulayman K. 
and Samoladas, Ioannis and Ioannis Stamelos and Lefteris Angelis} } @conference {971, title = {Branching and merging in the repository}, booktitle = {Proceedings of the 2008 international workshop on Mining software repositories - MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {19--22}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {Two of the most complex operations version control software allows a user to perform are branching and merging. Branching provides the user the ability to create a copy of the source code to allow changes to be stored in version control but outside of the trunk. Merging provides the user the ability to copy changes from a branch to the trunk. Performing a merge can be a tedious operation and one that may be error prone. In this paper, we compare file revisions found on branches with those found on the trunk to determine when a change that is applied to a branch is moved to the trunk. This will allow us to study how developers use merges and to determine if merges are in fact more error prone than other commits.}, keywords = {argouml, changes, cvs2svn, diffj, revision, scm, source code, version control}, isbn = {9781605580241}, doi = {10.1145/1370750.1370754}, attachments = {https://flosshub.org/sites/flosshub.org/files/p19-williams.pdf}, author = {Spacco, Jamie and Williams, Chadd C.} } @conference {970, title = {Determinism and evolution}, booktitle = {Proceedings of the 2008 international workshop on Mining software repositories - MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {1--9}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {It has been proposed that software evolution follows a Self-Organized Criticality (SOC) dynamics. 
This fact is supported by the presence of long range correlations in the time series of the number of changes made to the source code over time. Those long range correlations imply that the current state of the project was determined time ago. In other words, the evolution of the software project is governed by a sort of determinism. But this idea seems to contradict intuition. To explore this apparent contradiction, we have performed an empirical study on a sample of 3,821 libre (free, open source) software projects, finding that their evolution projects is short range correlated. This suggests that the dynamics of software evolution may not be SOC, and therefore that the past of a project does not determine its future except for relatively short periods of time, at least for libre software.}, keywords = {changes, evolution, source code, sourceforge}, isbn = {9781605580241}, doi = {10.1145/1370750.1370752}, attachments = {https://flosshub.org/sites/flosshub.org/files/p1-herraiz.pdf}, author = {Gonz{\'a}lez-Barahona, Jes{\'u}s M. and Gregorio Robles and Herraiz, Israel} } @article {Koch2008345, title = {Effort modeling and programmer participation in open source software projects}, journal = {Information Economics and Policy (Empirical Issues in Open Source Software)}, volume = {20}, number = {4}, year = {2008}, note = {"Using a two-step approach, first a detailed case study on one project, GNOME, will be undertaken, then a large data set retrieved from a project hosting site, SourceForge.net, will be used to validate the results." 
CVS was the main source of data "e-mails sent to the different project discussion lists were identified as an additional source of information especially on communication and coordination besides the CVS-repository" basic counts were calculated for developer discussion levels}, month = {12/2008}, pages = {345--355}, abstract = {This paper develops models for programmer participation and effort estimation in open source software projects and employs the results to assess the efficiency of open source software creation. Successful development of such models will be important for decision makers of various kinds. We propose hypotheses based on a prior case study on manpower function and effort modeling. A large data set retrieved from a project repository is used to test these hypotheses. The main results are that if Norden-Rayleigh-based approaches are used, they need to be complemented in order to account for the addition of new features during a product life cycle, and that programmer-participation based effort models result in distinctly lower estimations of effort than those based on output metrics, such as lines of code.}, keywords = {cvs, developers, email, email archives, gnome, lines of code, scm, Software repository mining, source code, sourceforge}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.06.004}, url = {http://www.sciencedirect.com/science/article/B6V8J-4SSND1J-1/2/c857fa1493e19aa7fe4297dedb077b3a}, attachments = {https://flosshub.org/sites/flosshub.org/files/KochEffortModeling.pdf}, author = {Koch, Stefan} } @article {1100, title = {An Empirical Study on the Relationship Between Software Design Quality, Development Effort and Governance in Open Source Projects}, journal = {IEEE Transactions on Software Engineering}, volume = {34}, year = {2008}, note = {"empirical data from a sample of 75 major OS projects" no PDF to confirm [ms]}, month = {11/2008}, pages = {765--782}, abstract = {The relationship among software design quality, 
development effort, and governance practices is a traditional research problem. However, the extent to which consolidated results on this relationship remain valid for open source (OS) projects is an open research problem. An emerging body of literature contrasts the view of open source as an alternative to proprietary software and explains that there exists a continuum between closed and open source projects. This paper hypothesizes that as projects approach the OS end of the continuum, governance becomes less formal. In turn a less formal governance is hypothesized to require a higher-quality code as a means to facilitate coordination among developers by making the structure of code explicit and facilitate quality by removing the pressure of deadlines from contributors. However, a less formal governance is also hypothesized to increase development effort due to a more cumbersome coordination overhead. The verification of research hypotheses is based on empirical data from a sample of 75 major OS projects. Empirical evidence supports our hypotheses and suggests that software quality, mainly measured as coupling and inheritance, does not increase development effort, but represents an important managerial variable to implement the more open governance approach that characterizes OS projects which, in turn, increases development effort.}, keywords = {effort estimation, governance, quality, source code}, issn = {0098-5589}, doi = {10.1109/TSE.2008.68}, author = {Capra, E. and Francalanci, C. 
and Merlo, F.} } @conference {SGKL09, title = {Evaluating the Quality of Open Source Software}, booktitle = {Electronic Notes in Theoretical Computer Science}, volume = {233}, year = {2008}, note = {"the software source code and the associated data stored in the version control system, the bug tracking databases, the mailing lists, and the wikis allow us to evaluate quality in a transparent way" "The data collection system collects the raw data from open source projects" Mailing lists are measured in: Number of unique subscribers, Number of messages in user/support list per month, Number of messages in developers list per month, Average thread depth}, month = {03/2009}, pages = {5--28}, publisher = {The Reengineering Forum}, organization = {The Reengineering Forum}, abstract = {Traditionally, research on quality attributes was either kept under wraps within the organization that performed it, or carried out by outsiders using narrow, black-box techniques. The emergence of open source software has changed this picture allowing us to evaluate both software products and the processes that yield them. Thus, the software source code and the associated data stored in the version control system, the bug tracking databases, the mailing lists, and the wikis allow us to evaluate quality in a transparent way. Even better, the large number of (often competing) open source projects makes it possible to contrast the quality of comparable systems serving the same domain. Furthermore, by combining historical source code snapshots with significant events, such as bug discoveries and fixes, we can further dig into the causes and effects of problems. Here we present motivating examples, tools, and techniques that can be used to evaluate the quality of open source (and by extension also proprietary) software. 
}, keywords = {bug tracking system, email, email archives, mailing list, metrics, open source, process quality attributes, product quality attributes, source code, SQO-OSS, wiki}, doi = {10.1016/j.entcs.2009.02.058}, url = {http://www.dmst.aueb.gr/dds/pubs/conf/2008-SQM-SQOOSS/html/SGKL09.html}, attachments = {https://flosshub.org/sites/flosshub.org/files/entcs-sqooss.pdf}, author = {Diomidis Spinellis and Gousios, Georgios and Vassilios Karakoidas and Panagiotis Louridas and Paul J. Adams and Samoladas, Ioannis and Ioannis Stamelos} } @conference {Alonso:2008:EIV:1370750.1370780, title = {Expertise identification and visualization from CVS}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {125--128}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {As software evolves over time, the identification of expertise becomes an important problem. Component ownership and team awareness of such ownership are signals of solid project. Ownership and ownership awareness are also issues in open-source software (OSS) projects. Indeed, the membership in OSS projects is dynamic with team members arriving and leaving. In large open source projects, specialists who know the system very well are considered experts. How can one identify the experts in a project by mining a particular repository like the source code? Have they gotten help from other people? We provide an approach using classification of the source code tree as a path to derive the expertise of the committers. Because committers may get help from other people, we also retrieve their contributors. We also provide a visualization that helps to further explore the repository via committers and categories. 
We present a prototype implementation that describes our research using the Apache HTTP Web server project as a case study.}, keywords = {apache, classification, committers, components, contributors, expertise, expertise identification, repository, scm, source code}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370780}, url = {http://doi.acm.org/10.1145/1370750.1370780}, attachments = {https://flosshub.org/sites/flosshub.org/files/p125-alonso.pdf}, author = {Alonso, Omar and Premkumar T. Devanbu and Gertz, Michael} } @conference {972, title = {Extracting structural information from bug reports}, booktitle = {Proceedings of the 2008 international workshop on Mining software repositories - MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {27--30}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {In software engineering experiments, the description of bug reports is typically treated as natural language text, although it often contains stack traces, source code, and patches. Neglecting such structural elements is a loss of valuable information; structure usually leads to a better performance of machine learning approaches. In this paper, we present a tool called infoZilla that detects structural elements from bug reports with near perfect accuracy and allows us to extract them. 
We anticipate that infoZilla can be used to leverage data from bug reports at a different granularity level that can facilitate interesting research in the future.}, keywords = {bug reports, eclipse, enumerations, infozilla, natural language, patches, source code, stack trace}, isbn = {9781605580241}, doi = {10.1145/1370750.1370757}, attachments = {https://flosshub.org/sites/flosshub.org/files/p27-bettenburg.pdf}, author = {Premraj, Rahul and Zimmermann, Thomas and Kim, Sunghun and Bettenburg, Nicolas} } @conference {Schuler:2008:MUE:1370750.1370779, title = {Mining usage expertise from version archives}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {121--124}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In software development, there is an increasing need to find and connect developers with relevant expertise. Existing expertise recommendation systems are mostly based on variations of the Line 10 Rule: developers who changed a file most often have the most implementation expertise. In this paper, we introduce the concept of usage expertise, which manifests itself whenever developers are using functionality, e.g., by calling API methods. 
We present preliminary results for the ECLIPSE project that demonstrate that our technique allows to recommend experts for files with no or little history, identify developers with similar expertise, and measure the usage of API methods.}, keywords = {api, computer-supported cooperative work, eclipse, expertise, recommendation, scm, software repository, source code}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370779}, url = {http://doi.acm.org/10.1145/1370750.1370779}, attachments = {https://flosshub.org/sites/flosshub.org/files/p121-schuler.pdf}, author = {Schuler, David and Zimmermann, Thomas} } @conference {Pattison:2008:TWP:1370750.1370776, title = {Talk and work: a preliminary report}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = may, pages = {113--116}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Developers in Open Source Software (OSS) projects communicate using mailing lists. By convention, the mailing lists used only for task-related discussions, so they are primarily concerned with the software under development, and software process issues (releases, etc.). We focus on the discussions concerning the software, and study the frequency with which software entities (functions, methods, classes, etc) are mentioned in the mail. We find a strong, striking, cumulative relationship between this mention count in the email, and the number of times these entities are included in changes to the software. When we study the same phenomena over a series of time-intervals, the relationship is much less strong. 
This suggests some interesting avenues for future research.}, keywords = {ant, apache, email, mailing lists, postgresql, python, scm, source code}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370776}, url = {http://doi.acm.org/10.1145/1370750.1370776}, attachments = {https://flosshub.org/sites/flosshub.org/files/p113-pattison.pdf}, author = {Pattison, David S. and Bird, Christian A. and Premkumar T. Devanbu} } @conference {1007, title = {Analysis of the Linux Kernel Evolution Using Code Clone Coverage}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {22}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Most studies of the evolution of software systems are based on the comparison of simple software metrics. In this paper, we present our preliminary investigation of the evolution of the Linux kernel using code-clone analysis and the code-clone coverage metrics. We examined 136 versions of the stable Linux kernel using a distributed extension of the code clone detection tool CCFinder. The result is shown as a heat map.}, keywords = {ccfinder, clone, cloning, kernel, linux, metrics, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.1}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300022.pdf}, author = {Livieri, Simone and Higo, Yoshiki and Matsushita, Makoto and Inoue, Katsuro} } @conference {996, title = {Correlating Social Interactions to Release History during Software Evolution}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {7}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {In this paper, we propose a method to reason about the nature of software changes by mining and correlating discussion archives. 
We employ an information retrieval approach to find correlation between source code change history and history of social interactions surrounding these changes. We apply our correlation method on two software systems, LSEdit and Apache Ant. The results of these exploratory case studies demonstrate the evidence of similarity between the content of free-form text emails among developers and the actual modifications in the code. We identify a set of correlation patterns between discussion and changed code vocabularies and discover that some releases referred to as minor should instead fall under the major category. These patterns can be used to give estimations about the type of a change and time needed to implement it.}, keywords = {ant, apache, change management, developers, discussion, effort estimation, lsedit, mailing lists, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.4}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300007.pdf}, author = {Baysal, Olga and Malton, Andrew J.} } @conference {1011, title = {Detecting Patch Submission and Acceptance in OSS Projects}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {26}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {The success of open source software (OSS) is completely dependent on the work of volunteers who contribute their time and talents. The submission of patches is the major way that participants outside of the core group of developers make contributions. We argue that the process of patch submission and acceptance into the codebase is an important piece of the open source puzzle and that the use of patch-related data can be helpful in understanding how OSS projects work. 
We present our methods in identifying the submission and acceptance of patches and give results and evaluation in applying these methods to the Apache webserver, Python interpreter, Postgres SQL database, and (with limitations) MySQL database projects. In addition, we present valuable ways in which this data has been and can be used.}, keywords = {apache, contributions, mysql, patches, postgresql, python, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.6}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300026.pdf}, author = {Christian Bird and Gourley, Alex and Devanbu, Prem} } @conference {992, title = {Determining Implementation Expertise from Bug Reports}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {2}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {As developers work on a software product they accumulate expertise, including expertise about the code base of the software product. We call this type of expertise "implementation expertise". Knowing the set of developers who have implementation expertise for a software product has many important uses. This paper presents an empirical evaluation of two approaches to determining implementation expertise from the data in source and bug repositories. The expertise sets created by the approaches are compared to those provided by experts and evaluated using the measures of precision and recall. 
We found that both approaches are good at finding all of the appropriate developers, although they vary in how many false positives are returned.}, keywords = {bug reports, developers, eclipse, expertise, repository, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.7}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300002.pdf}, author = {Anvik, John and Murphy, Gail C.} } @conference {1003, title = {Evaluating the Harmfulness of Cloning: A Change Based Experiment}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {18}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Cloning is considered a harmful practice for software maintenance because it requires consistent changes of the entities that share a cloned fragment. However this claim has not been refuted or confirmed empirically. Therefore, we have developed a prototype tool, CloneTracker, in order to study the rate of change of applications containing clones. 
This paper describes CloneTracker and illustrates its preliminary application on a case study.}, keywords = {ccfinder, clone, clones, clonetracker, cloning, ctags, cvs, dnsjava, maintenance, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.8}, author = {Lozano, Angela and Wermelinger, Michel and Nuseibeh, Bashar} } @conference {1001, title = {Identifying Changed Source Code Lines from Version Repositories}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {14}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Observing the evolution of software systems at different levels of granularity has been a key issue for a number of studies, aiming at predicting defects or at studying certain phenomena, such as the presence of clones or of crosscutting concerns. Versioning systems such as CVS and SVN, however, only provide information about lines added or deleted by a contributor: any change is shown as a sequence of additions and deletions. This provides an erroneous estimate of the amount of code changed. This paper shows how the evolution of changes at source code line level can be inferred from CVS repositories, by combining information retrieval techniques and the Levenshtein edit distance. 
The application of the proposed approach to the ArgoUML case study indicates a high precision and recall.}, keywords = {argouml, cvs, levenshtein, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.14}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300014.pdf}, author = {Canfora, Gerardo and Cerulo, Luigi and Di Penta, Massimiliano} } @conference {997, title = {Mining CVS Repositories to Understand Open-Source Project Developer Roles}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {8}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {This paper presents a model to represent the interactions of distributed open-source software developers and utilizes data mining techniques to derive developer roles. The model is then applied on case studies of two open-source projects, ORAC-DR and Mediawiki with encouraging results.}, keywords = {cvs, developer interaction, developers, mediawiki, orac-dr, roles, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.19}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300008.pdf}, author = {Yu, Liguo and Ramaswamy, Srini} } @conference {1015, title = {Mining Eclipse Developer Contributions via Author-Topic Models}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {30}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {We present the results of applying statistical author-topic models to a subset of the Eclipse 3.0 source code consisting of 2,119 source files and 700,000 lines of code from 59 developers. 
This technique provides an intuitive and automated framework with which to mine developer contributions and competencies from a given code base while simultaneously extracting software function in the form of topics. In addition to serving as a convenient summary for program function and developer activities, our study shows that topic models provide a meaningful, effective, and statistical basis for developer similarity analysis.}, keywords = {contributions, developers, eclipse, expertise, mining challenge, msr challenge, source code, topics}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.20}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300030.pdf}, author = {Linstead, Erik and Rigor, Paul and Bajracharya, Sushil and Lopes, Cristina and Baldi, Pierre} } @conference {1006, title = {Towards a Theoretical Model for Software Growth}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {21}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Software growth (and more broadly, software evolution) is usually considered in terms of size or complexity of source code. However in different studies, usually different metrics are used, which make it difficult to compare approaches and results. In addition, not all metrics are equally easy to calculate for a given source code, which leads to the question of which one is the easiest to calculate without losing too much information. To address both issues, in this paper we present a comprehensive study, based on the analysis of about 700,000 C source code files, calculating several size and complexity metrics for all of them. For this sample, we have found double Pareto statistical distributions for all metrics considered, and a high correlation between any two of them. 
This would imply that any model addressing software growth should produce this Pareto distributions, and that analysis based on any of the considered metrics should show a similar pattern, provided the sample of files considered is large enough.}, keywords = {C, complexity, evolution, freebsd, growth, halstead, lines of code, loc, mccabe, metrics, scm, size, sloc, sloccount, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.31}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300021.pdf}, author = {Herraiz, Israel and Jesus M. Gonzalez-Barahona and Gregorio Robles} } @conference {1010, title = {Using Software Repositories to Investigate Socio-technical Congruence in Development Projects}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {25}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {We propose a quantitative measure of socio-technical congruence as an indicator of the performance of an organization in carrying out a software development project. 
We show how the information necessary to implement that measure can be mined from commonly used software repositories, and we describe how socio-technical congruence can be computed based on that information.}, keywords = {developers, graph, scm, social networks, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.33}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300025.pdf}, author = {Valetto, Giuseppe and Helander, Mary and Ehrlich, Kate and Chulani, Sunita and Wegman, Mark and Williams, Clay} } @conference {998, title = {Visual Data Mining in Software Archives to Detect How Developers Work Together}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {9}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Analyzing the check-in information of open source software projects which use a version control system such as CVS or SUBVERSION can yield interesting and important insights into the programming behavior of developers. As in every major project tasks are assigned to many developers, the development must be coordinated between these programmers. This paper describes three visualization techniques that help to examine how programmers work together, e.g. if they work as a team or if they develop their part of the software separate from each other. Furthermore, phases of stagnation in the lifetime of a project can be uncovered and thus, possible problems are revealed. To demonstrate the usefulness of these visualization techniques we performed case studies on two open source projects. In these studies interesting patterns of developers{\textquoteright} behavior, e.g. the specialization on a certain module can be observed. 
Moreover, modules that have been changed by many developers can be identified as well as such ones that have been altered by only one programmer.}, keywords = {change, coordination, cvs, developers, junit, modules, scm, source code, svn, teams, tomcat, visualization}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.34}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300009.pdf}, author = {Weissgerber, Peter and Pohl, Mathias and Burch, Michael} } @article {120, title = {Applying Social Network Analysis Techniques to Community-Driven Libre Software Projects}, journal = {International Journal of Information Technology and Web Engineering}, volume = {1}, number = {3}, year = {2006}, abstract = {Source code management repositories of large, long-lived libre (free, open source) software projects can be a source of valuable data about the organizational structure, evolution, and knowledge exchange in the corresponding development communities. Unfortunately, the sheer volume of the available information renders it almost unusable without applying methodologies which highlight the relevant information for a given aspect of the project. Such methodology is proposed in this article, based on well known concepts from the social networks analysis field, which can be used to study the relationships among developers and how they collaborate in different parts of a project. It is also applied to data mined from some well known projects (Apache, GNOME, and KDE), focusing on the characterization of their collaboration network architecture. 
These cases help to understand the potentials of the methodology and how it is applied, but also shows some relevant results which open new paths in the understanding of the informal organization of libre software development communities.}, keywords = {apache, conway{\textquoteright}s law, cvs, gnome, kde, scm, social network analysis, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/06_Lopez_ijitwe_sna.pdf}, author = {L{\'o}pez-Fern{\'a}ndez, L. and Gregorio Robles and Jesus M. Gonzalez-Barahona and Herraiz, I.} } @conference {DAmbros:2006:AER:1137983.1138029, title = {Applying the evolution radar to PostgreSQL}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {177--178}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, keywords = {cvs, documentation, evolution, evolution radar, logical coupling, makefile, mining challenge, msr challenge, postgresql, re-engineering, refactoring, release history, rhdb, source code, version control, visualization}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138029}, url = {http://doi.acm.org/10.1145/1137983.1138029}, attachments = {https://flosshub.org/sites/flosshub.org/files/177ApplyingEvolution.pdf}, author = {D{\textquoteright}Ambros, Marco and Lanza, Michele} } @conference {Sager:2006:DSJ:1137983.1138000, title = {Detecting similar Java classes using tree algorithms}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {65--71}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Similarity analysis of source code is helpful during development to provide, for instance, better support for code reuse. 
Consider a development environment that analyzes code while typing and that suggests similar code examples or existing implementations from a source code repository. Mining software repositories by means of similarity measures enables and enforces reusing existing code and reduces the developing effort needed by creating a shared knowledge base of code fragments. In information retrieval similarity measures are often used to find documents similar to a given query document. This paper extends this idea to source code repositories. It introduces our approach to detect similar Java classes in software projects using tree similarity algorithms. We show how our approach allows to find similar Java classes based on an evaluation of three tree-based similarity measures in the context of five user-defined test cases as well as a preliminary software evolution analysis of a medium-sized Java project. Initial results of our technique indicate that it (1) is indeed useful to identify similar Java classes, (2) successfully identifies the ex ante and ex post versions of refactored classes, and (3) provides some interesting insights into within-version and between-version dependencies of classes within a Java project.}, keywords = {change analysis, clones, coogle, eclipse, famix, java, similarity, software evolution, software repositories, source code, tree similarity measures}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138000}, url = {http://doi.acm.org/10.1145/1137983.1138000}, attachments = {https://flosshub.org/sites/flosshub.org/files/65Detecting.pdf}, author = {Sager, Tobias and Bernstein, Abraham and Pinzger, Martin and Kiefer, Christoph} } @conference {DAmbros:2006:ERV:1137983.1137992, title = {The evolution radar: visualizing integrated logical coupling information}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = 
{26--32}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In software evolution research logical coupling has extensively been used to recover the hidden dependencies between source code artifacts. They would otherwise go lost because of the file-based nature of current versioning systems. Previous research has dealt with low-level couplings between files, leading to an explosion of data to be analyzed, or has abstracted the logical couplings to module level, leading to a loss of detailed information. In this paper we propose a visualization-based approach which integrates both file-level and module-level logical coupling information. This not only facilitates an in-depth analysis of the logical couplings at all granularity levels, it also leads to a precise characterization of the system modules in terms of their logical coupling dependencies.}, keywords = {change management, cvs, evolution, logical coupling, mozilla, scm, source code, thunderbird, tinderbox, visualization}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1137992}, url = {http://doi.acm.org/10.1145/1137983.1137992}, attachments = {https://flosshub.org/sites/flosshub.org/files/26TheEvolutionRadar.pdf}, author = {D{\textquoteright}Ambros, Marco and Lanza, Michele and Lungu, Mircea} } @conference {Jiang:2006:EEC:1137983.1138030, title = {Examining the evolution of code comments in PostgreSQL}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {179--180}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {It is common, especially in large software systems, for developers to change code without updating its associated comments due to their unfamiliarity with the code or due to time constraints. 
This is a potential problem since outdated comments may confuse or mislead developers who perform future development. Using data recovered from CVS, we study the evolution of code comments in the PostgreSQL project. Our study reveals that over time the percentage of commented functions remains constant except for early fluctuation due to the commenting style of a particular active developer.}, keywords = {code comments, comments, cvs, evolution, functions, maintenance, mining challenge, msr challenge, postgresql, software evolution, software maintenance, source code}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138030}, url = {http://doi.acm.org/10.1145/1137983.1138030}, attachments = {https://flosshub.org/sites/flosshub.org/files/179ExaminingTheEvolution.pdf}, author = {Zhen Ming Jiang and Hassan, Ahmed E.} } @conference {Canfora:2006:FGI:1137983.1138009, title = {Fine grained indexing of software repositories to support impact analysis}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {105--111}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Versioned and bug-tracked software systems provide a huge amount of historical data regarding source code changes and issues management. In this paper we deal with impact analysis of a change request and show that data stored in software repositories are a good descriptor on how past change requests have been resolved. A fine grained analysis method of software repositories is used to index code at different levels of granularity, such as lines of code and source files, with free text contained in software repositories. The method exploits information retrieval algorithms to link the change request description and code entities impacted by similar past change requests. 
We evaluate such approach on a set of three open-source projects.}, keywords = {argouml, change analysis, Firefox, gedit, impact analysis, mining software repositories, scm, source code, version control}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138009}, url = {http://doi.acm.org/10.1145/1137983.1138009}, attachments = {https://flosshub.org/sites/flosshub.org/files/105FineGrained.pdf}, author = {Canfora, Gerardo and Cerulo, Luigi} } @conference {Askari:2006:ITE:1137983.1138013, title = {Information theoretic evaluation of change prediction models for large-scale software}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {126--132}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In this paper, we analyze the data extracted from several open source software repositories. We observe that the change data follows a Zipf distribution. Based on the extracted data, we then develop three probabilistic models to predict which files will have changes or bugs. The first model is Maximum Likelihood Estimation (MLE), which simply counts the number of events, i.e., changes or bugs, that happen to each file and normalizes the counts to compute a probability distribution. The second model is Reflexive Exponential Decay (RED) in which we postulate that the predictive rate of modification in a file is incremented by any modification to that file and decays exponentially. The third model is called RED-Co-Change. With each modification to a given file, the RED-Co-Change model not only increments its predictive rate, but also increments the rate for other files that are related to the given file through previous co-changes. We then present an information-theoretic approach to evaluate the performance of different prediction models. 
In this approach, the closeness of model distribution to the actual unknown probability distribution of the system is measured using cross entropy. We evaluate our prediction models empirically using the proposed information-theoretic approach for six large open source systems. Based on this evaluation, we observe that of our three prediction models, the RED-Co-Change model predicts the distribution that is closest to the actual distribution for all the studied systems.}, keywords = {bugs, change analysis, cvs, evaluation approach, file, freebsd, information theory, kde, koffice, log files, netbsd, openbsd, postgresql, prediction, prediction models, scm, source code}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138013}, url = {http://doi.acm.org/10.1145/1137983.1138013}, attachments = {https://flosshub.org/sites/flosshub.org/files/126InformationTheoretic.pdf}, author = {Askari, Mina and Holt, Ric} } @conference {1094, title = {Knowledge Reuse in Open Source Software: An Exploratory Study of 15 Open Source Projects}, booktitle = {Proceedings of the 38th Annual Hawaii International Conference on System Sciences}, year = {2006}, note = {"In a first step, we asked developers of different open source projects to respond to a very short web-based survey."... "In a second step, we started with gathering data from 15 projects, including interviews. In parallel, the source code, CVS comments and to a certain extent email communication was analyzed to receive a dynamic, and within the limits of the method, complete picture of knowledge reuse practices." 
"Projects included games (Adonthell, FlightGear, Xboard), text processing (Abiword), a GNU/Linux desktop (Xfce4), an instant messenger client (Miranda), fax software (HylaFAX), a content management system (Tiki/CMS Groupware), encryption software (OpenSSL), a collaborative music system (iRATE Radio), file sharing networks (GNUnet, Mnet, Freenet), a mailing list manager (Mailman), and an mp3 encoder (Lame)." "Our data sources included interviews with key developers, source code, CVS comments, mailing lists and various Internet resources"}, pages = {1--10}, publisher = {IEEE}, organization = {IEEE}, address = {Big Island, HI, USA}, abstract = {To date, there is no investigation of knowledge reuse in open source software projects. This paper focuses on the forms of knowledge reuse and the factors impacting on them. It develops a theory drawn from data of 15 open source software projects and finds that the effort to search, integrate and maintain external knowledge influences the form of knowledge to be reused. Implications for firms and innovation research are discussed.}, keywords = {cvs, email, knowledge reuse, lines of code, loc, source code, Survey}, doi = {10.1109/HICSS.2005.378}, url = {http://www.computer.org/csdl/proceedings/hicss/2005/2268/07/22680198b-abs.html}, author = {von Krogh, G. and Spaeth, S. and Haefliger, S.} } @article {Yu:2006:MKO:1150566.1150571, title = {Maintainability of the kernels of open-source operating systems: A comparison of Linux with FreeBSD, NetBSD, and OpenBSD}, journal = {J. Syst. 
Softw.}, volume = {79}, year = {2006}, note = {"Data regarding the number and total number of lines of code of kernel and nonkernel modules in the four operating systems are provided in Table 1" loc, kloc, number of kernel modules, number of nonkernel modules size c files .h files}, month = jun, pages = {807--815}, publisher = {Elsevier Science Inc.}, address = {New York, NY, USA}, abstract = {We compared and contrasted the maintainability of four open-source operating systems: Linux, FreeBSD, NetBSD, and OpenBSD. We used our categorization of common coupling in kernel-based software to highlight future maintenance problems. An unsafe definition is a definition of a global variable that can affect a kernel module if that definition is changed. For each operating system we determined a number of measures, including the number of global variables, the number of instances of global variables in the kernel and overall, as well as the number of unsafe definitions in the kernel and overall. We also computed the value of each our measures per kernel KLOC and per KLOC overall. For every measure and every ratio, Linux compared unfavorably with FreeBSD, NetBSD, and OpenBSD. Accordingly, we are concerned about the future maintainability of Linux. }, keywords = {abiword, Common coupling, coupling, Definition-use analysis, freebsd, kernel, lines of code, linux, linux kernel, loc, Maintainability, modules, netbsd, Open-source software, openbsd, source code}, issn = {0164-1212}, doi = {10.1016/j.jss.2005.08.014}, url = {http://dx.doi.org/10.1016/j.jss.2005.08.014}, attachments = {https://flosshub.org/sites/flosshub.org/files/YuSchachChen.pdf}, author = {Yu, Liguo and Schach, Stephen R. and Chen, Kai and Heller, Gillian Z. 
and Offutt, Jeff} } @conference {Xie:2006:MMA:1137983.1137997, title = {MAPO: mining API usages from open source repositories}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {54{\textendash}57}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {To improve software productivity, when constructing new software systems, developers often reuse existing class libraries or frameworks by invoking their APIs. Those APIs, however, are often complex and not well documented, posing barriers for developers to use them in new client code. To get familiar with how those APIs are used, developers may search the Web using a general search engine to find relevant documents or code examples. Developers can also use a source code search engine to search open source repositories for source files that use the same APIs. Nevertheless, the number of returned source files is often large. It is difficult for developers to learn API usages from a large number of returned results. In order to help developers understand API usages and write API client code more effectively, we have developed an API usage mining framework and its supporting tool called MAPO (for Mining API usages from Open source repositories). Given a query that describes a method, class, or package for an API, MAPO leverages the existing source code search engines to gather relevant source files and conducts data mining. The mining leads to a short list of frequent API usages for developers to inspect. MAPO currently consists of five components: a code search engine, a source code analyzer, a sequence preprocessor, a frequent sequence miner, and a frequent sequence post processor. We have examined the effectiveness of MAPO using a set of various queries. 
The preliminary results show that the framework is practical for providing informative and succinct API usage patterns.},
  keywords = {api, application programming interfaces, documentation, mining software repositories, pmd, program comprehension, search engine, sequences, source code, source code search engine},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1137997},
  url = {http://doi.acm.org/10.1145/1137983.1137997},
  attachments = {https://flosshub.org/sites/flosshub.org/files/54MAPO.pdf},
  author = {Xie, Tao and Pei, Jian}
}

% Kim, Pan and Whitehead, MSR '06: micro pattern evolution in ArgoUML, Columba and jEdit.
@conference{Kim:2006:MPE:1137983.1137995,
  title = {Micro pattern evolution},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {40--46},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {When analyzing the evolution history of a software project, we wish to develop results that generalize across projects. One approach is to analyze design patterns, permitting characteristics of the evolution to be associated with patterns, instead of source code. Traditional design patterns are generally not amenable to reliable automatic extraction from source code, yet automation is crucial for scalable evolution analysis. Instead, we analyze {\textquotedblleft}micro pattern{\textquotedblright} evolution; patterns whose abstraction level is closer to source code, and designed to be automatically extractable from Java source code or bytecode. We perform micro-pattern evolution analysis on three open source projects, ArgoUML, Columba, and jEdit to identify micro pattern frequencies, common kinds of pattern evolution, and bug-prone patterns. In all analyzed projects, we found that the micro patterns of Java classes do not change often. Common bug-prone pattern evolution kinds are {\textquoteleft}Pool {\textrightarrow} Pool{\textquoteright}, {\textquoteleft}Implementor {\textrightarrow} NONE{\textquoteright}, and {\textquoteleft}Sampler {\textrightarrow} Sampler{\textquoteright}. Among all pattern evolution kinds, {\textquoteleft}Box{\textquoteright}, {\textquoteleft}CompoundBox{\textquoteright}, {\textquoteleft}Pool{\textquoteright}, {\textquoteleft}CommonState{\textquoteright}, and {\textquoteleft}Outline{\textquoteright} micro patterns have high bug rates, but they have low frequencies and a small number of changes. The pattern evolution kinds that are bug-prone are somewhat similar across projects. The bug-prone pattern evolution kinds of two different periods of the same project are almost identical.},
  keywords = {argouml, bugs, columba, design patterns, evolution, extraction, java, jedit, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1137995},
  url = {http://doi.acm.org/10.1145/1137983.1137995},
  attachments = {https://flosshub.org/sites/flosshub.org/files/40MicroPattern.pdf},
  author = {Kim, Sunghun and Pan, Kai and Whitehead, Jr., E. James}
}

% Zimmermann, Breu, Lindig and Livshits, MSR '06: additions of method calls in ArgoUML.
@conference{Zimmermann:2006:MAM:1137983.1138025,
  title = {Mining additions of method calls in {ArgoUML}},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {169--170},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {In this paper we refine the classical co-change to the addition of method calls.
We use this concept to find usage patterns and to identify cross-cutting concerns for ArgoUML.},
  keywords = {argouml, change analysis, eclipse, function calls, mining challenge, msr challenge, pattern, source code, xelopes},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138025},
  url = {http://doi.acm.org/10.1145/1137983.1138025},
  attachments = {https://flosshub.org/sites/flosshub.org/files/169MiningAdditions.pdf},
  author = {Zimmermann, Thomas and Breu, Silvia and Lindig, Christian and Livshits, Benjamin}
}

% Breu, Zimmermann and Lindig, MSR '06: history-based mining of cross-cutting concerns in Eclipse.
@conference{Breu:2006:MEC:1137983.1138006,
  title = {Mining {Eclipse} for cross-cutting concerns},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {94--97},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {Software may contain functionality that does not align with its architecture. Such cross-cutting concerns do not exist from the beginning but emerge over time. By analysing where developers add code to a program, our history-based mining identifies cross-cutting concerns in a two-step process. First, we mine CVS archives for sets of methods where a call to a specific single method was added. In a second step, such simple cross-cutting concerns are combined to complex cross-cutting concerns. To compute these efficiently, we apply formal concept analysis{\textemdash}an algebraic theory. History-based mining scales well: we are the first to report aspects mined from an industrial-sized project like Eclipse. For example, we identified a locking concern that crosscuts 1284 methods.},
  keywords = {aspects, concept analysis, cvs, eclipse, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138006},
  url = {http://doi.acm.org/10.1145/1137983.1138006},
  attachments = {https://flosshub.org/sites/flosshub.org/files/94MiningEclipse.pdf},
  author = {Breu, Silvia and Zimmermann, Thomas and Lindig, Christian}
}

% Bird et al., MSR '06: email social networks mined from the Postgres lists and repository.
@conference{Bird:2006:MES:1137983.1138033,
  title = {Mining email social networks in {Postgres}},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {185--186},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {Open Source Software (OSS) projects provide a unique opportunity to gather and analyze publicly available historical data. The Postgres SQL server, for example, has over seven years of recorded development and communication activity. We mined data from both the source code repository and the mailing list archives to examine the relationship between communication and development in Postgres. Along the way, we had to deal with the difficult challenge of resolving email aliases. We used a number of social network analysis measures and statistical techniques to analyze this data.
We present our findings in this paper.},
  keywords = {developers, email, email archives, open source, postgresql, scm, social network analysis, social networks, source code, status},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138033},
  url = {http://doi.acm.org/10.1145/1137983.1138033},
  attachments = {https://flosshub.org/sites/flosshub.org/files/185MiningEmail.pdf},
  author = {Bird, Christian and Gourley, Alex and Devanbu, Prem and Gertz, Michael and Swaminathan, Anand}
}

% Kagdi, Yusuf and Maletic, MSR '06: sequences of changed files mined from version histories (KDE).
@conference{Kagdi:2006:MSC:1137983.1137996,
  title = {Mining sequences of changed-files from version histories},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {47--53},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {Modern source-control systems, such as Subversion, preserve change-sets of files as atomic commits. However, the specific ordering information in which files were changed is typically not found in these source-code repositories. In this paper, a set of heuristics for grouping change-sets (i.e., log-entries) found in source-code repositories is presented. Given such groups of change-sets, sequences of files that frequently change together are uncovered. This approach not only gives the (unordered) sets of files but supplements them with (partial temporal) ordering information. The technique is demonstrated on a subset of KDE source-code repository. The results show that the approach is able to find sequences of changed-files.},
  keywords = {change, change history, change management, change sequences, heuristics, kde, mining software repositories, scm, sequences, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1137996},
  url = {http://doi.acm.org/10.1145/1137983.1137996},
  attachments = {https://flosshub.org/sites/flosshub.org/files/47MiningSequences.pdf},
  author = {Kagdi, Huzefa and Yusuf, Shehnaaz and Maletic, Jonathan I.}
}

% Voinea and Telea, MSR '06: mining software repositories with CVSgrab (no abstract recorded).
@conference{Voinea:2006:MSR:1137983.1138024,
  title = {Mining software repositories with {CVSgrab}},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {167--168},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  keywords = {argouml, cvs, cvsgrab, evolution, mining challenge, msr challenge, postgresql, software visualization, source code, team, visualization},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138024},
  url = {http://doi.acm.org/10.1145/1137983.1138024},
  attachments = {https://flosshub.org/sites/flosshub.org/files/167MiningSoftware.pdf},
  author = {Voinea, Lucian and Telea, Alexandru}
}

% Zimmermann, Kim, Zeller and Whitehead, MSR '06: co-change at the level of lines.
@conference{Zimmermann:2006:MVA:1137983.1138001,
  title = {Mining version archives for co-changed lines},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {72--75},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {Files, classes, or methods have frequently been investigated in recent research on co-change. In this paper, we present a first study at the level of lines. To identify line changes across several versions, we define the annotation graph which captures how lines evolve over time.
The annotation graph provides more fine-grained software evolution information such as life cycles of each line and related changes: "Whenever a developer changed line 1 of version.txt she also changed line 25 of Library.java."},
  keywords = {change, change analysis, change management, graph, lines of code, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138001},
  url = {http://doi.acm.org/10.1145/1137983.1138001},
  attachments = {https://flosshub.org/sites/flosshub.org/files/72MiningVersionArchives.pdf},
  author = {Zimmermann, Thomas and Kim, Sunghun and Zeller, Andreas and Whitehead, Jr., E. James}
}

% Knab, Pinzger and Bernstein, MSR '06: defect density prediction on Mozilla with a decision tree learner.
@conference{Knab:2006:PDD:1137983.1138012,
  title = {Predicting defect densities in source code files with decision tree learners},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {119--125},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {With the advent of open source software repositories the data available for defect prediction in source files increased tremendously. Although traditional statistics turned out to derive reasonable results the sheer amount of data and the problem context of defect prediction demand sophisticated analysis such as provided by current data mining and machine learning techniques. In this work we focus on defect density prediction and present an approach that applies a decision tree learner on evolution data extracted from the Mozilla open source web browser project. The evolution data includes different source code, modification, and defect measures computed from seven recent Mozilla releases. Among the modification measures we also take into account the change coupling, a measure for the number of change-dependencies between source files. The main reason for choosing decision tree learners, instead of for example neural nets, was the goal of finding underlying rules which can be easily interpreted by humans. To find these rules, we set up a number of experiments to test common hypotheses regarding defects in software entities. Our experiments showed, that a simple tree learner can produce good results with various sets of input data.},
  keywords = {change analysis, data mining, decision tree learner, defect density, defect prediction, mozilla, prediction, release history, scm, source code, version control},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138012},
  url = {http://doi.acm.org/10.1145/1137983.1138012},
  attachments = {https://flosshub.org/sites/flosshub.org/files/119Predicting.pdf},
  author = {Knab, Patrick and Pinzger, Martin and Bernstein, Abraham}
}

% German, MSR '06: characteristics of the PostgreSQL development team from its CVS history.
@conference{German:2006:SCP:1137983.1138022,
  title = {A study of the contributors of {PostgreSQL}},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {163--164},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {This report describes some characteristics of the development team of PostgreSQL that were uncovered by analyzing the history of its software artifacts as recorded by the project{\textquoteright}s CVS repository.},
  keywords = {contributions, contributors, cvs, developers, mining challenge, mining software repositories, msr challenge, patches, postgresql, revision history, roles, software evolution, source code, team},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138022},
  url = {http://doi.acm.org/10.1145/1137983.1138022},
  attachments = {https://flosshub.org/sites/flosshub.org/files/163AStudyOf.pdf},
  author = {German, Daniel M.}
}

% Roberts, Hann and Slaughter, Management Science 52 (2006): motivations of Apache developers.
@article{1122,
  title = {Understanding the Motivations, Participation, and Performance of Open Source Software Developers: A Longitudinal Study of the {Apache} Projects},
  journal = {Management Science},
  volume = {52},
  year = {2006},
  note = {"analyzing archival data collected from OSS project records over a period of four years" "Apart from the source and binary codes of the actual software programs, Apache products include developer websites, change logs, documentation, and developer communications in the form of e-mail archives. From these products, we extracted two types of information: each contributor{\textquoteright}s progression along the Apache career path, and each contributor{\textquoteright}s source code contributions to the project."},
  month = jul,
  pages = {984--999},
  abstract = {Understanding what motivates participation is a central theme in the research on open source software (OSS) development. Our study contributes by revealing how the different motivations of OSS developers are interrelated, how these motivations influence participation leading to performance, and how past performance influences subsequent motivations. Drawing on theories of intrinsic and extrinsic motivation, we develop a theoretical model relating the motivations, participation, and performance of OSS developers. We evaluate our model using survey and archival data collected from a longitudinal field study of software developers in the Apache projects. Our results reveal several important findings. First, we find that developers{\textquoteright} motivations are not independent but rather are related in complex ways. Being paid to contribute to Apache projects is positively related to developers{\textquoteright} status motivations but negatively related to their use-value motivations. Perhaps surprisingly, we find no evidence of diminished intrinsic motivation in the presence of extrinsic motivations; rather, status motivations enhance intrinsic motivations.
Second, we find that different motivations have an impact on participation in different ways. Developers{\textquoteright} paid participation and status motivations lead to above-average contribution levels, but use-value motivations lead to below-average contribution levels, and intrinsic motivations do not significantly impact average contribution levels. Third, we find that developers{\textquoteright} contribution levels positively impact their performance rankings. Finally, our results suggest that past-performance rankings enhance developers{\textquoteright} subsequent status motivations.},
  keywords = {apache, change logs, contributions, email, email archives, extrinsic motivation, intrinsic motivation, mailing lists, MOTIVATION, open source software, participation, software development performance, source code, status, Survey},
  issn = {1526-5501},
  doi = {10.1287/mnsc.1060.0554},
  author = {Roberts, Jeffrey A. and Hann, Il-Horn and Slaughter, Sandra A.}
}

% Kakimoto et al., MSR '06: software birthmarks for finding similar classes (no abstract recorded).
@conference{Kakimoto:2006:USB:1137983.1138026,
  title = {Using software birthmarks to identify similar classes and major functionalities},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {171--172},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  keywords = {argouml, class, file, mining challenge, msr challenge, multi-dimensional scaling, similarity, software birthmark, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138026},
  url = {http://doi.acm.org/10.1145/1137983.1138026},
  attachments = {https://flosshub.org/sites/flosshub.org/files/171UsingSoftware.pdf},
  author = {Kakimoto, Takeshi and Monden, Akito and Kamei, Yasutaka and Tamada, Haruaki and Tsunoda, Masateru and Matsumoto, Ken-ichi}
}

% Canfora and Cerulo, MSR '06: which Bugzilla fields predict the code impacted by an ArgoUML bug.
@conference{Canfora:2006:BRK:1137983.1138032,
  title = {Where is bug resolution knowledge stored?},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}06},
  year = {2006},
  pages = {183--184},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {ArgoUML uses both CVS and Bugzilla to keep track of bug-fixing activities since 1998. A common practice is to reference source code changes resolving a bug stored in Bugzilla by inserting the id number of the bug in the CVS commit notes. This relationship reveals useful to predict code entities impacted by a new bug report. In this paper we analyze ArgoUML software repositories with a tool, we have implemented, showing what are Bugzilla fields that better predict such impact relationship, that is where knowledge about bug resolution is stored.},
  keywords = {argouml, bugs, bugzilla, cvs, impact analysis, mining challenge, mining software repositories, msr challenge, source code},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1138032},
  url = {http://doi.acm.org/10.1145/1137983.1138032},
  attachments = {https://flosshub.org/sites/flosshub.org/files/183WhereIsBug.pdf},
  author = {Canfora, Gerardo and Cerulo, Luigi}
}

% Robles and Gonzalez-Barahona, MSR '05: matching developer identities across repositories (GNOME).
@conference{Robles:2005:DIM:1083142.1083162,
  title = {Developer identification methods for integrated data from various sources},
  booktitle = {Proceedings of the 2005 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}05},
  year = {2005},
  pages = {106--110},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {Studying a software project by mining data from a single repository has been a very active research field in software engineering during the last years. However, few efforts have been devoted to perform studies by integrating data from various repositories, with different kinds of information, which would, for instance, track the different activities of developers.
One of the main problems of these multi-repository studies is the different identities that developers use when they interact with different tools in different contexts. This makes them appear as different entities when data is mined from different repositories (and in some cases, even from a single one). In this paper we propose an approach, based on the application of heuristics, to identify the many identities of developers in such cases, and a data structure for allowing both the anonymized distribution of information, and the tracking of identities for verification purposes. The methodology will be presented in general, and applied to the GNOME project as a case example. Privacy issues and partial merging with new data sources will also be considered and discussed.},
  keywords = {anonymization, bug tracker, developers, email, email address, gnome, identity, mailing list, privacy, source code, version control},
  isbn = {1-59593-123-6},
  doi = {10.1145/1082983.1083162},
  url = {http://doi.acm.org/10.1145/1082983.1083162},
  attachments = {https://flosshub.org/sites/flosshub.org/files/106DeveloperIdentification.pdf},
  author = {Robles, Gregorio and Gonzalez-Barahona, Jesus M.}
}

% Gyimothy, Ferenc and Siket, IEEE TSE 31(10), 2005: Chidamber-Kemerer metrics vs. Bugzilla faults in Mozilla.
@article{102,
  title = {Empirical validation of object-oriented metrics on open source software for fault prediction},
  journal = {IEEE Transactions on Software Engineering},
  volume = {31},
  number = {10},
  year = {2005},
  note = {"This paper describes how we calculated the object-oriented metrics given by Chidamber and Kemerer to illustrate how fault-proneness detection of the source code of the open source Web and e-mail suite called Mozilla can be carried out. We checked the values obtained against the number of bugs found in its bug database - called Bugzilla - using regression and machine learning methods to validate the usefulness of these metrics for fault-proneness prediction. We also compared the metrics of several versions of Mozilla to see how the predicted fault-proneness of the software system changed during its development cycle." metrics, wmc weighted methods per class, dit depth of inheritance, rfc response for a class, noc number of children, cbo coupling between object classes, cohesion, lines of code, loc, sloc chidamber and kemerer metrics},
  pages = {897--910},
  abstract = {Open source software systems are becoming increasingly important these days. Many companies are investing in open source projects and lots of them are also using such software in their own work. But, because open source software is often developed with a different management style than the industrial ones, the quality and reliability of the code needs to be studied. Hence, the characteristics of the source code of these projects need to be measured to obtain more information about it. This paper describes how we calculated the object-oriented metrics given by Chidamber and Kemerer to illustrate how fault-proneness detection of the source code of the open source Web and e-mail suite called Mozilla can be carried out. We checked the values obtained against the number of bugs found in its bug database - called Bugzilla - using regression and machine learning methods to validate the usefulness of these metrics for fault-proneness prediction. We also compared the metrics of several versions of Mozilla to see how the predicted fault-proneness of the software system changed during its development cycle.},
  keywords = {bugs, bugzilla, cbo, defects, dit, fault-prone modules, faults, lcom, lcomn, loc, metrics, mozilla, noc, object-oriented, rfc, source code, wmc},
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.115.8372\&rep=rep1\&type=pdf},
  attachments = {https://flosshub.org/sites/flosshub.org/files/Gyimothy.pdf},
  author = {Gyimothy, T. and Ferenc, R. and Siket, I.}
}

% MacCormack, Rusnak and Baldwin, 2005: design structure of Linux vs. Mozilla.
% No journal is recorded for this item, so it is typed as misc rather than article.
@misc{flosswp184,
  title = {Exploring the Structure of Complex Software Designs: An Empirical Study of Open Source and Proprietary Code (updated)},
  year = {2005},
  note = {"For each design, we report data on the number of source files, the number of dependencies, the density of the DSM (i.e., the number of dependencies per source file pair) the propagation cost and the clustered cost. We also provide data on the average complexity of source files, in terms of the number of functions and lines of code."},
  month = jun,
  abstract = {This paper reports data from a study that seeks to characterize the differences in design structure between complex software products. In particular, we use Design Structure Matrices (DSMs) to map the dependencies between the elements of a design and define metrics that allow us to compare the structures of different designs. We first use these metrics to compare the architectures of two software products - the Linux operating system and the Mozilla web browser - that were developed via contrasting modes of organization: specifically, open source versus proprietary development. We then track the evolution of Mozilla, paying particular attention to a purposeful "re-design" effort that was undertaken with the intention of making the product more "modular." We find significant differences in structure between Linux and the first version of Mozilla, suggesting that Linux had a more modular architecture. We also find that the redesign of Mozilla resulted in an architecture that was significantly more modular than that of its predecessor, and indeed, than that of Linux. Our results, while exploratory, are consistent with a view that different modes of organization are associated with designs that possess different structures. However, we also illustrate that purposeful managerial actions can have a large impact on structure.
This latter result is important given recent moves to release proprietary software into the public domain. These moves are likely to fail unless the product possesses an architecture that facilitates participation. Our paper provides evidence that a tightly-coupled design can be adapted to meet this objective.},
  keywords = {complexity, cost, dependencies, functions, lines of code, linux, loc, mozilla, source code},
  attachments = {https://flosshub.org/sites/flosshub.org/files/maccormackrusnakbaldwin2.pdf},
  author = {MacCormack, Alan and Rusnak, John and Baldwin, Carliss}
}

% Dinh-Trong and Bieman, IEEE TSE 31(6), 2005: FreeBSD replication of the Apache case study.
@article{DBLP:journals/tse/Dinh-TrongB05,
  title = {The {FreeBSD} Project: A Replication Case Study of Open Source Development},
  journal = {IEEE Transactions on Software Engineering},
  volume = {31},
  number = {6},
  year = {2005},
  note = {" we obtained the necessary data from the [FreeBsd] CVS repository, the bug report database, and the e-mail archive. The CVS repository contains all of the code and related documentation that is committed to the project from 1993 until the present. The bug report database contains information describing all reported problems, as well as the status (such as fixed, under test, or open) of each problem. Each bug report is called a PR and assigned a reference number. The e-mail archive contains every e-mail message exchanged between the developers since 1994." d/l: research.cs.queensu.ca/{\textasciitilde}ahmed/home/teaching/.../F06/.../free-bsd.pdf},
  pages = {481--494},
  abstract = {Case studies can help to validate claims that open source software development produces higher quality software at lower cost than traditional commercial development. One problem inherent in case studies is external validity{\textemdash}we do not know whether or not results from one case study apply to another development project. We gain or lose confidence in case study results when similar case studies are conducted on other projects. This case study of the FreeBSD project, a long-lived open source project, provides further understanding of open source development. The paper details a method for mining repositories and querying project participants to retrieve key process information. The FreeBSD development process is fairly well-defined with proscribed methods for determining developer responsibilities, dealing with enhancements and defects, and managing releases. Compared to the Apache project, FreeBSD uses 1) a smaller set of core developers{\textemdash}developers who control the code base{\textemdash}that implement a smaller percentage of the system, 2) a larger set of top developers to implement 80 percent of the system, and 3) a more well-defined testing process. FreeBSD and Apache have a similar ratio of core developers to people involved in adapting and debugging the system and people who report problems. Both systems have similar defect densities and the developers are also users in both systems.},
  keywords = {apache, bug reports, contributors, core, cvs, defect density, developers, email, email archive, freebsd, mailing list, scm, source code, users},
  doi = {10.1109/TSE.2005.73},
  attachments = {https://flosshub.org/sites/flosshub.org/files/DinhTrungBieman.pdf},
  author = {Dinh-Trong, Trung T. and Bieman, James M.}
}

% Antoniol, Rollo and Venturi, MSR '05: LPC and cepstrum coefficients on Linux kernel file-size histories.
@conference{Antoniol:2005:LPC:1083142.1083156,
  title = {Linear predictive coding and cepstrum coefficients for mining time variant information from software repositories},
  booktitle = {Proceedings of the 2005 international workshop on Mining software repositories},
  series = {MSR {\textquoteright}05},
  year = {2005},
  pages = {74--78},
  publisher = {ACM},
  organization = {ACM},
  address = {New York, NY, USA},
  abstract = {This paper presents an approach to recover time variant information from software repositories. It is widely accepted that software evolves due to factors such as defect removal, market opportunity or adding new features.
Software evolution details are stored in software repositories which often contain the changes history. On the other hand there is a lack of approaches, technologies and methods to efficiently extract and represent time dependent information. Disciplines such as signal and image processing or speech recognition adopt frequency domain representations to mitigate differences of signals evolving in time. Inspired by time-frequency duality, this paper proposes the use of Linear Predictive Coding (LPC) and Cepstrum coefficients to model time varying software artifact histories. LPC or Cepstrum allow obtaining very compact representations with linear complexity. These representations can be used to highlight components and artifacts evolved in the same way or with very similar evolution patterns. To assess the proposed approach we applied LPC and Cepstral analysis to 211 Linux kernel releases (i.e., from 1.0 to 1.3.100), to identify files with very similar size histories. The approach, the preliminary results and the lesson learned are presented in this paper.}, keywords = {change history, data mining, evolution, files, kernel, linear predictive coding, linux, lpc, size, software evolution, source code}, isbn = {1-59593-123-6}, doi = {http://doi.acm.org/10.1145/1082983.1083156}, url = {http://doi.acm.org/10.1145/1082983.1083156}, attachments = {https://flosshub.org/sites/flosshub.org/files/74LinearPredictive.pdf}, author = {Antoniol, Giuliano and Rollo, Vincenzo Fabio and Venturi, Gabriele} } @conference {Fischer:2005:MED:1083142.1083145, title = {Mining evolution data of a product family}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {12-16}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Diversification of software assets through changing requirements impose a constant challenge on the developers and maintainers of large 
software systems. Recent research has addressed the mining for data in software repositories of single products ranging from fine- to coarse grained analyses. But so far, little attention has been payed to mining data about the evolution of product families. In this work, we study the evolution and commonalities of three variants of the BSD (Berkeley Software Distribution), a large open source operating system. The research questions we tackle are concerned with how to generate high level views of the system discovering and indicating evolutionary highlights. To process the large amount of data, we extended our previously developed approach for storing release history information to support the analysis of product families. In a case study we apply our approach on data from three different code repositories representing about 8.5GB of data and 10 years of active development.}, keywords = {bsd, change analysis, change history, cvs, evolution, freebsd, netbsd, openbsd, release history, source code, text mining}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083145}, url = {http://doi.acm.org/10.1145/1082983.1083145}, attachments = {https://flosshub.org/sites/flosshub.org/files/12MiningEvolution.pdf}, author = {Fischer, Michael and Oberleitner, Johann and Ratzinger, Jacek and Gall, Harald} } @conference {Williams:2005:RSS:1083142.1083144, title = {Recovering system specific rules from software repositories}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {7-11}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {One of the most successful applications of static analysis based bug finding tools is to search the source code for violations of system-specific rules. These rules may describe how functions interact in the code, how data is to be validated or how an API is to be used. 
To apply these tools, the developer must encode a rule that must be followed in the source code. The difficulty is that many of these system-specific rules are undocumented and "grow" over time as the source code changes. Most research in this area relies on expert programmers to document these little-known rules. In this paper we discuss a method to automatically recover a subset of these rules, function usage patterns, by mining the software repository. We present a preliminary study that applies our work to a large open source software project.}, keywords = {function usage patterns, functions, source code, wine}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083144}, url = {http://doi.acm.org/10.1145/1082983.1083144}, attachments = {https://flosshub.org/sites/flosshub.org/files/7Recovering.pdf}, author = {Williams, Chadd C. and Hollingsworth, Jeffrey K.} } @conference {Hindle:2005:SFM:1083142.1083161, title = {SCQL: a formal model and a query language for source control repositories}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {100-104}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Source Control Repositories are used in most software projects to store revisions to source code files. These repositories operate at the file level and support multiple users. A generalized formal model of source control repositories is described herein. The model is a graph in which the different entities stored in the repository become vertices and their relationships become edges. We then define SCQL, a first order, and temporal logic based query language for source control repositories. 
We demonstrate how SCQL can be used to specify some questions and then evaluate them using the source control repositories of five different large software projects.}, keywords = {evolution, file, gnumeric, modperl, openssl, revision, samba, scm, source code}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083161}, url = {http://doi.acm.org/10.1145/1082983.1083161}, attachments = {https://flosshub.org/sites/flosshub.org/files/100scql.pdf}, author = {Hindle, Abram and Daniel M. German} } @article {springerlink:10.1007/s10606-005-9000-1, title = {Socialization in an Open Source Software Community: A Socio-Technical Analysis}, journal = {Computer Supported Cooperative Work (CSCW)}, volume = {14}, year = {2005}, note = {"I present the results of my analyses of participation patterns in a particular OSS project, used as a case study: Python. More precisely, I use the Open Source Project Browser to qualitatively track and analyze the trajectories of several project members who evolved (or not) into full-fledged participants. This allows me to later discuss how socialization proceeds in an OSS community such as Python" "Over the course of 2002, I progressively retrieved the entire email archive of python-dev (the developers{\textquoteright} mailing list) and the CVS source tree for the project. " "I read the entirety of the messages written by these participants and also looked at the software code they produced. There were striking similarities between their progressions over time, which I will describe shortly. Overall the trajectory of these participants reflects successful socialization in Python: an evolution from newcomer to developer." http://www2.parc.com/csl/members/nicolas/documents/JCSCW-OSS.pdf}, pages = {323-368}, publisher = {Springer Netherlands}, abstract = {Open Source Software (OSS) development is often characterized as a fundamentally new way to develop software. 
Past analyses and discussions, however, have treated OSS projects and their organization mostly as a static phenomenon. Consequently, we do not know how these communities of software developers are sustained and reproduced over time through the progressive integration of new members. To shed light on this issue I report on my analyses of socialization in a particular OSS community. In particular, I document the relationships OSS newcomers develop over time with both the social and material aspects of a project. To do so, I combine two mutually informing activities: ethnography and the use of software specially designed to visualize and explore the interacting networks of human and material resources incorporated in the email and code databases of OSS. Socialization in this community is analyzed from two perspectives: as an individual learning process and as a political process. From these analyses it appears that successful participants progressively construct identities as software craftsmen, and that this process is punctuated by specific rites of passage. Successful participants also understand the political nature of software development and progressively enroll a network of human and material allies to support their efforts. 
I conclude by discussing how these results could inform the design of software to support socialization in OSS projects, as well as practical implications for the future of these projects.}, keywords = {cvs, developers, email, email archive, mailing list, open source project browser, participation, python, scm, source code, team, tools}, issn = {0925-9724}, url = {http://dx.doi.org/10.1007/s10606-005-9000-1}, author = {Ducheneaut, Nicolas} } @conference {Neamtiu:2005:USC:1083142.1083143, title = {Understanding source code evolution using abstract syntax tree matching}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {2-6}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Mining software repositories at the source code level can provide a greater understanding of how software evolves. We present a tool for quickly comparing the source code of different versions of a C program. The approach is based on partial abstract syntax tree matching, and can track simple changes to global variables, types and functions. These changes can characterize aspects of software evolution useful for answering higher level questions. In particular, we consider how they could be used to inform the design of a dynamic software updating system. We report results based on measurements of various versions of popular open source programs, including BIND, OpenSSH, Apache, Vsftpd and the Linux kernel.}, keywords = {abstract syntax trees, apache, bind, evolution, linux, openssh, software evolution, source code, source code analysis, vsftpd}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083143}, url = {http://doi.acm.org/10.1145/1082983.1083143}, attachments = {https://flosshub.org/sites/flosshub.org/files/2Understanding.pdf}, author = {Neamtiu, Iulian and Foster, Jeffrey S. 
and Hicks, Michael} } @conference {Kim:2005:UCG:1083142.1083146, title = {Using a clone genealogy extractor for understanding and supporting evolution of code clones}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {17-23}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Programmers often create similar code snippets or reuse existing code snippets by copying and pasting. Code clones {\textemdash}syntactically and semantically similar code snippets{\textemdash}can cause problems during software maintenance because programmers may need to locate code clones and change them consistently. In this work, we investigate (1) how code clones evolve, (2) how many code clones impose maintenance challenges, and (3) what kind of tool or engineering process would be useful for maintaining code clones. Based on a formal definition of clone evolution, we built a clone genealogy tool that automatically extracts the history of code clones from a source code repository (CVS). Our clone genealogy tool enables several analyses that reveal evolutionary characteristics of code clones. 
Our initial results suggest that aggressive refactoring may not be the best solution for all code clones; thus, we propose alternative tool solutions that assist in maintaining code clones using clone genealogy information.}, keywords = {clone, clone detection, cvs, developers, evolution, maintenance, refactoring, source code}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083146}, url = {http://doi.acm.org/10.1145/1082983.1083146}, attachments = {https://flosshub.org/sites/flosshub.org/files/17Using.pdf}, author = {Kim, Miryung and Notkin, David} } @booklet {Lopez-Fernandez_applyingsocial, title = {Applying Social Network Analysis to the Information in CVS Repositories}, howpublished = {International Workshop on Mining Software Repositories (MSR 2004)}, year = {2004}, pages = {101-105}, abstract = {The huge quantities of data available in the CVS repositories of large, long-lived libre (free, open source) software projects, and the many interrelationships among those data offer opportunities for extracting large amounts of valuable information about their structure, evolution and internal processes. Unfortunately, the sheer volume of that information renders it almost unusable without applying methodologies which highlight the relevant information for a given aspect of the project. In this paper, we propose the use of a well known set of methodologies (social network analysis) for characterizing libre software projects, their evolution over time and their internal structure. In addition, we show how we have applied such methodologies to real cases, and extract some preliminary conclusions from that experience.}, keywords = {apache, complex networks, cvs, gnome, kde, libre software engineering, source code, source code repositories, visualization techniques, vizualization}, attachments = {https://flosshub.org/sites/flosshub.org/files/101ApplyingSocial.pdf}, author = {L{\'o}pez-Fern{\'a}ndez, L. and Gregorio Robles and Jesus M. 
Gonzalez-Barahona} } @proceedings {1190, title = {Community structure of modules in the Apache project}, year = {2004}, pages = {44-48}, abstract = {The relationships among modules in a software project of a certain size can give us much information about its internal organization and a way to control and monitor development activities and evolution of large libre software projects. In this paper, we show how information available in CVS repositories can be used to study the structure of the modules in a project when they are related by the people working in them, and how techniques taken from the social networks fields can be used to highlight the characteristics of that structure. As a case example, we also show some results of applying this methodology to the Apache project in several points in time. Among other facts, it is shown how the project evolves and is self-structuring, with developer communities of modules corresponding to semantically related families of modules.}, keywords = {apache, cvs, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/gonzalezBarahona44-48.pdf}, author = {Jesus M. Gonzalez-Barahona and Luis Lopez and Gregorio Robles} } @conference {Xinyi04fourinteresting, title = {Four Interesting Ways in Which History Can Teach Us About Software}, booktitle = {Proceedings of the 2004 international workshop on Mining software repositories - MSR {\textquoteright}04}, year = {2004}, month = {05/2004}, pages = {58-62}, abstract = {In this position paper, we outline four kinds of studies that we have undertaken in trying to understand various aspects of a software system{\textquoteright}s evolutionary history. In each instance, the studies have involved detailed examination of real software systems based on {\textquotedblleft}facts{\textquotedblright} extracted from various kinds of source artifact repositories, as well as the development of accompanying tools to aid in the extraction, abstraction, and comprehension processes. 
We briefly discuss the goals, results, and methodology of each approach.}, keywords = {ant, apache, change analysis, clone, clone detection, cvs, evolution, gcc, growth, kepler, linux, midworld, mycore, postgresql, source code, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/58FourInterestingWays.pdf}, author = {Michael Godfrey and Xinyi Dong and Cory Kapser and Lijie Zou} } @conference {1105, title = {Free \& Open Source Software Creation and {\textquoteleft}the Economy of Regard{\textquoteright}}, booktitle = {Third EPIP Workshop}, year = {2004}, month = {04/2004}, keywords = {linux, linux kernel, scm, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/DalleDavidGhosh\%20Wolak.pdf}, author = {Jean-Michel Dalle and Paul A. David and Rishab Ayer Ghosh and Frank A. Wolak} } @article {capiluppi:23, title = {Improving comprehension and cooperation through code structure}, journal = {IEE Seminar Digests}, volume = {2004}, number = {908}, year = {2004}, note = {"In this study we measured source code size in three different forms (LOCs, SLOCs, and Kbs of code)." "Pilot Project: The ARLA System" }, pages = {23-28}, publisher = {IEE}, abstract = {Defining a relationship between a software system{\textquoteright}s architecture and the process{\textquoteright} efforts is one of the most fascinating questions of software engineering. Apparently, when a system{\textquoteright}s architecture is complex, the process to improve and evolve it will be more difficult. We try to tackle this question from a different point of view: given an open source system, in all the phases of its evolution, we focus on both the aspects of software developers, and the obtained software product. More we observe one of the possible architectures of this system, based on the tree structure derived from source components. 
First conclusions show that some patterns of tree evolution are recognizable: some branches may appear more promising than other, and are extensively evolved, while other remains in the same status for all the life cycle. More, when the tree structure reaches some status, the process of joining as a core developer seems to forestall. }, keywords = {arla, code structure, contributors, developers, open source system, scm, software development, software engineering, software process, software product, software system architecture, source code, source components, tree evolution, tree structure}, doi = {10.1049/ic:20040260}, url = {http://link.aip.org/link/abstract/IEESEM/v2004/i908/p23/s1}, attachments = {https://flosshub.org/sites/flosshub.org/files/capiluppi2004.pdf}, author = {A. Capiluppi} } @conference {1054, title = {LASER: a lexical approach to analogy in software reuse}, booktitle = {International Workshop on Mining Software Repositories (MSR 2004)}, volume = {2004}, year = {2004}, pages = {112--116}, publisher = {IEE}, organization = {IEE}, address = {Edinburgh, Scotland, UK}, abstract = {Software reuse is the process of creating a software system from existing software components, rather than creating it from scratch. With the increase in size and complexity of existing software repositories, the need to provide intelligent support to the programmer becomes more pressing. An analogy is a comparison of certain similarities between things which are otherwise unlike. This concept has shown to be valuable in developing UML-level reuse techniques. In the LASER project we apply lexically-driven Analogy at the code level, rather than at the UML-level, in order to retrieve matching components from a repository of existing components. Using the lexical ontology Word-Net, we have conducted a case study to assess if class and method names in open source applications are used in a semantically meaningful way. 
Our results demonstrate that both hierarchical reuse and parallel reuse can be enhanced through the use of lexically-driven Analogy.}, keywords = {class, developers, functions, jrefactory, method, naming, natural language, reuse, source code, wordnet}, doi = {10.1049/ic:20040487}, attachments = {https://flosshub.org/sites/flosshub.org/files/112LASER.pdf}, author = {Amin, R. and {\'O} Cinn{\'e}ide, Mel and Veale, Tony} } @article {Chen:2004:OCL:990374.990391, title = {Open-Source Change Logs}, journal = {Empirical Software Engineering}, volume = {9}, year = {2004}, note = {"We decided to compare actual differences in the source code with entries in the ChangeLog file. We used lxr, the Linux cross-referencing tool..., to determine the precise differences between two successive software versions. We then compared these differences with the records in the ChangeLog file to check the completeness of the ChangeLog file." }, month = {September}, pages = {197{\textendash}210}, publisher = {Kluwer Academic Publishers}, address = {Hingham, MA, USA}, abstract = {A recent editorial in Empirical Software Engineering suggested that open-source software projects offer a great deal of data that can be used for experimentation. These data not only include source code, but also artifacts such as defect reports and update logs. A common type of update log that experimenters may wish to investigate is the ChangeLog, which lists changes and the reasons for which they were made. ChangeLog files are created to support the development of software rather than for the needs of researchers, so questions need to be asked about the limitations of using them to support research. This paper presents evidence that the ChangeLog files provided at three open-source web sites were incomplete. We examined at least three ChangeLog files for each of three different open-source software products, namely, GNUJSP, GCC-g++, and Jikes. 
We developed a method for counting changes that ensures that, as far as possible, each individual ChangeLog entry is treated as a single change. For each ChangeLog file, we compared the actual changes in the source code to the entries in the ChangeLog file and discovered significant omissions. For example, using our change-counting method, only 35 of the 93 changes in version 1.11 of Jikes appear in the ChangeLog file{\textemdash}that is, over 62\% of the changes were not recorded there. The percentage of omissions we found ranged from 3.7 to 78.6\%. These are significant omissions that should be taken into account when using ChangeLog files for research. Before using ChangeLog files as a basis for research into the development and maintenance of open-source software, experimenters should carefully check for omissions and inaccuracies.}, keywords = {change log, gcc, GCC-g, GNUJSP, Jikes, log files, Open-source software, source code}, issn = {1382-3256}, doi = {10.1023/B:EMSE.0000027779.70556.d0}, url = {http://portal.acm.org/citation.cfm?id=990374.990391}, attachments = {https://flosshub.org/sites/flosshub.org/files/chen.pdf}, author = {Chen, Kai and Schach, Stephen R. and Yu, Liguo and Offutt, Jeff and Heller, Gillian Z.} } @conference {German03automatingthe, title = {Automating the measurement of open source projects}, booktitle = {Proceedings of the 3rd Workshop on Open Source Software Engineering}, year = {2003}, note = {"We have chosen to use Evolution to illustrate some of the capabilities of SoftChange. We focus on the data provided by CVS logs and the CVS commit mailing list. Our data includes changes to the CVS repository from April 1998 to January 2003."}, pages = {63{\textendash}67}, abstract = {The proliferation of open source projects raises a number of vital economic, social, and software engineering questions that are subject of intense research. 
Based on experience analyzing numerous open source and commercial projects we propose a set of tools to support extraction and validation of software project data. Such tools would streamline empirical investigation of open source projects and make it possible to test existing and new theories about the nature of open source projects. Our software includes tools to extract and summarize information from mailing lists, CVS logs, ChangeLog files, and defect tracking databases. More importantly, it cross-links records from various data sources and identifies all contributors for a software change. We illustrate some of the capabilities by analyzing data from Ximian Evolution project.}, keywords = {bug reports, bug tracking, changelog, cvs, defects, evolution, log files, logs, mailing list, scm, softchange, source code, ximian, ximian evolution}, attachments = {https://flosshub.org/sites/flosshub.org/files/germanMockus2003.pdf}, author = {German, Daniel and Audris Mockus} } @article {flosswp84, title = {Clustering and Dependencies in Free/Open Source Software Development: Methodology and Tools}, year = {2003}, note = {source code is the artifact used}, month = {April}, abstract = {This paper addresses the problem of measurement of non-monetary economic activity, specifically in the area of free/open source software communities. It describes the problems associated with research on these communities in the absence of measurable monetary transactions, and suggests possible alternatives. A class of techniques using software source code as factual documentation of economic activity is described and a methodology for the extraction, interpretation and analysis of empirical data from software source code is detailed, with the outline of algorithms for identifying collaborative authorship and determining the identity of coherent economic actors in developer communities. 
Finally, conclusions are drawn from the application of these techniques to a base of software.}, keywords = {scm, source code, source code analysis}, url = {http://dxm.org/papers/toulouse2/cluster-final.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/cluster-final.pdf}, author = {Rishab Ayer Ghosh} } @article {146, title = {Community, joining, and specialization in open source software innovation: a case study}, journal = {Research Policy}, volume = {32}, number = {7}, year = {2003}, note = {first, telephone interviews "Secondly, we collected the project{\textquoteright}s public email conversations stored in the projects{\textquoteright} mailing lists which is archived on Freenet{\textquoteright}s website" "The third source of data included the history of changes to the software code available via the project{\textquoteright}s software repository within the CVS ({\textquoteleft}Concurrent Versioning System{\textquoteright}) source code management tool" "Fourthly, in order obtain contextual understanding of the project we collected publicly available documents related to open source in general and to the project in particular. Among the most important sources were the Freenet project web pages (e.g. the Frequently Asked Questions (FAQ)7), Ian Clarke{\textquoteright}s master thesis (1999), newspaper interviews with the core developers, and a technical paper (Clarke, Sandberg, Wiley, \& Hong, 2000) describing the Freenet project written by some of the developers."}, pages = {1217-1241}, abstract = {This paper develops an inductive theory of the open source software innovation process by focussing on the creation of Freenet, a project aimed at developing a decentralized and anonymous peer-to-peer electronic file sharing network. We are particularly interested in the strategies and processes by which new people join the existing community of software developers, and how they initially contribute code. 
Analyzing data from multiple sources on the Freenet software development process, we generate the constructs of "joining script", We are grateful to helpful comments from two anonymous reviewers. We also thank Chris Argyris, John Seely Brown, Eric von Hippel, Stefan Haefliger, Petra Kugler, Heike Bruch, Simon G{\"a}chter, Simon Peck, and Hari Tsoukas for helpful comments and suggestions. Ben Ho and Craig Lebowitz provided technical assistance with data importation and parsing. We would like to thank Ian Clarke and the Freenet developers for their willingness to participate in our study and providing key insights into the open source development process. Karim R. Lakhani would like to acknowledge the generous support of The Boston Consulting Group and Canada{\textquoteright}s Social Science and Humanities Research Council doctoral fellowship. Georg von Krogh and Sebastian Spaeth acknowledge the generous support from the Research Foundation at the University of St. Gallen.}, keywords = {cvs, email, email archives, freenet, INNOVATION, mailing lists, roles, source code}, doi = {10.1016/S0048-7333(03)00050-7}, attachments = {https://flosshub.org/sites/flosshub.org/files/krogh03.pdf}, author = {Georg von Krogh and Spaeth, S. and Karim R Lakhani} } @article {1099, title = {Evidences in the evolution of OS projects through Changelog Analyses}, journal = {Proceedings of the 3rd ICSE Workshop on Open Source}, year = {2003}, note = {"In this study we concentrate on a very large sample (406 projects) selected randomly from an OS portal[20]" (freshmeat) "We define three clusters of projects: {\textquoteright}large{\textquoteright} projects as long as they are based on more than 1000KB(40KLOC)..."}, pages = {19-24}, abstract = {Most empirical studies about Open Source (OS) projects or products are vertical and usually deal with the flagship, successful projects. 
There is a substantial lack of horizontal studies to shed light on the whole population of projects, including failures. This paper presents a horizontal study aimed at characterizing OS projects. We analyze a sample of around 400 projects from a popular OS project repository. Each project is characterized by a number of attributes. We analyze these attributes statically and over time. The main results show that few projects are capable of attracting a meaningful community of developers. The majority of projects is made by few (in many cases one) person with a very slow pace of evolution. We then try to observe how many projects count on a substantial number of developers, and analyze those projects more deeply. The goal is to achieve a better insight in the dynamics of open source development. The initial results of this analysis, especially growth in code size and tendency to stability in modularity, seem to be in line with traditional close source development.}, keywords = {classification, freshmeat, loc, modularity, repository, size, sloc, source code}, url = {http://hdl.handle.net/10552/1037}, attachments = {https://flosshub.org/sites/flosshub.org/files/capiluppi2003.pdf}, author = {Capiluppi, Andrea} } @article {2003, title = {From a Firm-Based to a Community-Based Model of Knowledge Creation: The Case of the Linux Kernel Development}, journal = {Organization Science}, volume = {14}, number = {6}, year = {2003}, note = {"we study the Linux development community mainly by analyzing the artifacts that the Linux developers have produced. A key output of knowledge creation activities is the artifacts. The most important artifact, of course, is the Linux operating system source code." "Along with the source code, a "Credits" text file and a "MAINTAINERS" text file are distributed to the users." 
"An equally important artifact is the development activities archived in the Linux-kernel mailing list"..."Using the weekly Linux-kernel email archive for years 1995 to 2000 as a key source of data, we focus on people who have sent at least one email to the Linux-kernel mailing list. " "In addition, we examine the developers{\textquoteright} demographic distributions, working patterns, and motivations by analyzing the raw data from an on-line survey"}, pages = {633--649}, publisher = {INFORMS}, abstract = {We propose a new model of knowledge creation in purposeful, loosely coordinated, distributed systems, as an alternative to a firm-based one. Specifically, using the case of the Linux kernel development project, we build a model of community-based, evolutionary knowledge creation to study how thousands of talented volunteers, dispersed across organizational and geographical boundaries, collaborate via the Internet to produce a knowledge-intensive, innovative product of high quality. By comparing and contrasting the Linux model with the traditional/commercial model of software development and firm-based knowledge creation efforts, we show how the proposed model of knowledge creation expands beyond the boundary of the firm. Our model suggests that the product development process can be effectively organized as an evolutionary process of learning driven by criticism and error correction. We conclude by offering some theoretical implications of our community-based model of knowledge creation for the literature of organizational learning, community life, and the uses of knowledge in society.}, keywords = {credits, developers, email, email archives, knowledge creation, linux kernel, mailing list, maintainers, scm, source code, Survey, Volunteers}, issn = {10477039}, url = {http://www.jstor.org/stable/4135125}, author = {Lee, Gwendolyn K. 
and Cole, Robert E.} } @article {flosswp124, title = {Maintainability of the Linux Kernel}, journal = {Proceedings of the 2nd Workshop on Open Source Software Engineering ICSE2002}, year = {2003}, note = {"We have examined 365 versions of Linux. For every version, we counted the number of instances of common (global) coupling between each of the 17 kernel modules and all the other modules in that version of Linux."}, month = {October}, abstract = {We have examined 365 versions of Linux. For every version, we counted the number of instances of common (global) coupling between each of the 17 kernel modules and all the other modules in that version of Linux. We found that the number of instances of common coupling grows exponentially with version number. This result is significant at the 99.99\% level, and no additional variables are needed to explain this increase. We conclude that, unless Linux is restructured with a bare minimum of common coupling, the dependencies induced by common coupling will, at some future date, make Linux exceedingly hard to maintain without inducing regression faults.}, keywords = {coupling, kernel, linux, linux kernel, modules, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/linux-maint_0.pdf}, author = {Schach, Stephen R. and Jin, B. and Wright, D. R.} } @conference {Ye:2003:TUM:776816.776867, title = {Toward an understanding of the motivation of Open Source Software developers}, booktitle = {Proceedings of the 25th International Conference on Software Engineering}, series = {ICSE {\textquoteright}03}, year = {2003}, note = {"Analyzing the emails sent to the mailing list is one way of understanding the structure of the community." "Table 2 displays the number of code contributions made by members to the GIMP system and the defined roles of those contributing members. 
We counted the number of contributions made by each person by analyzing the change log of the system."}, pages = {419{\textendash}429}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {An Open Source Software (OSS) project is unlikely to be successful unless there is an accompanied community that provides the platform for developers and users to collaborate. Members of such communities are volunteers whose motivation to participate and contribute is of essential importance to the success of OSS projects. In this paper, we aim to create an understanding of what motivates people to participate in OSS communities. We theorize that learning is one of the motivational forces. Our theory is grounded in the learning theory of Legitimate Peripheral Participation, and is supported by analyzing the social structure of OSS communities and the co-evolution between OSS systems and communities. We also discuss practical implications of our theory for creating and maintaining sustainable OSS communities as well as for software engineering research and education.}, keywords = {change log, COMMUNITY, contributions, contributors, developers, email, email archives, evolution, gimp, log files, mailing list, roles, source code}, isbn = {0-7695-1877-X}, url = {http://portal.acm.org/citation.cfm?id=776816.776867}, attachments = {https://flosshub.org/sites/flosshub.org/files/YeKishida.pdf}, author = {Ye, Yunwen and Kishida, Kouichi} } @article {65, title = {Analyzing cloning evolution in the Linux kernel}, journal = {Information and Software Technology}, volume = {44}, number = {13}, year = {2002}, pages = {755-765}, abstract = {Identifying code duplication in large multi-platform software systems is a challenging problem. 
This is due to a variety of reasons including the presence of high-level programming languages and structures interleaved with hardware-dependent low-level resources and assembler code, the use of GUI-based configuration scripts generating commands to compile the system, and the extremely high number of possible different configurations. This paper studies the extent and the evolution of code duplications in the Linux kernel. Linux is a large, multi-platform software system; it is based on the Open Source concept, and so there are no obstacles in discussing its implementation. In addition, it is decidedly too large to be examined manually: the current Linux kernel release (2.4.18) is about three million LOCs. Nineteen releases, from 2.4.0 to 2.4.18, were processed and analyzed, identifying code duplication among Linux subsystems by means of a metric-based approach. The obtained results support the hypothesis that the Linux system does not contain a relevant fraction of code duplication. Furthermore, code duplication tends to remain stable across releases, thus suggesting a fairly stable structure, evolving smoothly without any evidence of degradation. (C) 2002 Elsevier Science B.V. All rights reserved.}, keywords = {cvs, kernel, lines of code, linux, loc, project success, source code}, url = {web.soccerlab.polymtl.ca/~antoniol/publications/.../infsoft2002.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/infsoft2002.pdf}, author = {Antoniol, G. and Villano, U. and Merlo, E. and Di Penta, M.} } @proceedings {104, title = {Economic incentives for participating in open source software projects}, year = {2002}, note = {"The data for this research come from two primary sources: Apache project archives and a targeted survey of Apache participants. 
Archival data are open source project artifacts such as e-mail and source code archives, source code version control meta-data, and developer Web sites" independent variables: patch, committer, work experience, programming experience, education, job switch, firm size, firm public, industry}, pages = {365{\textendash}372}, abstract = {Using the Internet as a basis for communication, collaboration, and storage of artifacts, the open source community is producing software of a quality that was previously thought to be achievable only by professional engineers following strict software development paradigms. This accomplishment is even more astounding as developers contribute to the source code without any remuneration. Open source leaders as well as academics have proposed theories about the motivation of open source developers that are rooted in diverse fields such as social psychology and anthropology. However, Lerner and Tirole (2000) argue that developer participation in open source projects may, in part, be explained by existing economic theory regarding career concerns. This research seeks to confirm or disconfirm the existence of economic returns to participation in open source development. Our findings suggest that greater open source participation per se, as measured in contributions made, is not associated with wage increases. However, a higher status in a merit-based ranking within the Apache Project is associated with significantly higher wages. This suggests that employers do not reward the gain in experience through open source participation as an increase in human capital. 
The results are also consistent with the notion that a high rank within the Apache Software Foundation is a credible signal of the productive capacity of a programmer.}, keywords = {apache, contributions, email, email archives, mailing list, organizational sponsorship, participation, patch, scm, source code, Survey, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/42.pdf}, author = {Il-Horn Hann and Jeff Roberts and Sandra Slaughter and Roy Fielding} } @article {1095, title = {High Quality and Open Source Software Practices}, journal = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, note = {"We examined the publicly visible portions of these projects from November 2001 through March 2002, ...The SLOC counts for the predominate languages are shown}, month = {2002}, abstract = {Surveys suggest that, according to various metrics, the quality and dependability of today{\textquoteright}s open source software is roughly on par with commercial and government developed software. What are the prospects for advancing to much higher levels of quality in open source software? More specifically, what attributes must be possessed by quality-related interventions for them to be feasibly adoptable in open source practice? In order to identify some of these attributes, we conducted a preliminary survey of the quality practices of a number of successful open source projects. We focus, in particular, on attributes related to adoptability by the open source practitioner community.}, keywords = {apache, bug report, bug tracker, bug tracking system, feature requests, gcc, gnome, kde, lines of code, linux, loc, mozilla, netbeans, perl, position paper, python, sloc, source code, Survey, tomcat, xfree86}, attachments = {https://flosshub.org/sites/flosshub.org/files/HalloranScherlis.pdf}, author = {T. Halloran and W. 
Scherlis} } @article {121, title = {Two case studies of open source software development: Apache and Mozilla}, journal = {ACM Transactions on Software Engineering and Methodology}, volume = {11}, number = {3}, year = {2002}, note = {apache data sources: email, cvs, bug database regarding email: "We wrote Perl scripts to extract date, sender identity, message subject, and the message body that was further processed to obtain details on code changes and problem reports (see below). Manual inspection was used to resolve such things as multiple email addresses in cases where all automated techniques failed." (but the rest of the paper does not address this data source at all) mozilla data sources bugzilla, cvs }, pages = {309-346}, abstract = {According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. In order to begin investigating such claims, we examine data from two major open source projects, the Apache web server and the Mozilla browser. By using email archives of source code change history and problem reports we quantify aspects of developer participation, core team size, code ownership, productivity, defect density, and problem resolution intervals for these OSS projects. We develop several hypotheses by comparing the Apache project with several commercial projects. We then test and refine several of these hypotheses, based on an analysis of Mozilla data. We conclude with thoughts about the prospects for high- performance commercial/ open source process hybrids.}, keywords = {apache, bug fixing, bug reports, bugzilla, change history, core, defect density, email, email archives, mailing list, mozilla, ownership, participation, productivity, scm, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/mockusFieldingHerbsleb2002.pdf}, author = {Audris Mockus and Roy Fielding and Herbsleb, J. 
D.} } @conference {1157, title = {Why Do Developers Contribute to Open Source Projects? First Evidence of Economic Incentives}, booktitle = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, note = {"The data for this research come from two primary sources: Apache project archives and a targeted survey of Apache participants. Archival data are open source project artifacts such as email and source code archives, source code version control meta-data and developer web sites."}, abstract = {The availability of commercial quality, free software products such as the Apache HTTP (web) server or the Linux operating system has focused significant attention on the open source development process by which these products were created. One of the more perplexing aspects of open source software projects is why developers freely devote their time and energy to these projects. While many open source participants cite idealistic motives for participation, Lerner and Tirole (2000) argue that developer participation in open source projects may, in part, be explained by existing economic theory regarding career concerns. This research seeks to confirm or disconfirm the existence of economic returns to participation in open source development. Preliminary results of our empirical investigation suggest that greater open source participation per se, as measured in contributions made, does not lead to wage increases. However, a higher status in a merit-based ranking within the Apache Project does lead to significantly higher wages. This suggests that employers do not reward the gain in experience through open source participation as an increase in human capital. 
The results are also consistent with the notion that a high rank within the Apache Software Foundation is a credible signal of the productive capacity of a programmer.}, keywords = {apache, contributions, cvs, developers, ECONOMICS, email, email archives, financial, Human capital, mailing list, MOTIVATION, participation, source code, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/HannRobertsSlaughterFielding.pdf}, author = {Il-Horn Hann and Jeff Roberts and Sandra Slaughter and Roy Fielding} } @conference {Godfrey:2001:GES:602461.602482, title = {Growth, evolution, and structural change in open source software}, booktitle = {Proceedings of the 4th International Workshop on Principles of Software Evolution (IWPSE 2001)}, series = {IWPSE {\textquoteright}01}, year = {2001}, note = {"We measured [linux] system size in uncommented LOC" "We also examined the growth of several other open source systems, including the VIM text editor, Eric Raymond{\textquoteright}s fetchmail utility, and the GCC compiler suite. "}, pages = {103{\textendash}106}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Our recent work has addressed how and why software systems evolve over time, with a particular emphasis on software architecture and open source software systems [2, 3, 6]. In this position paper, we present a short summary of two recent projects. First, we have performed a case study on the evolution of the Linux kernel [3], as well as some other open source software (OSS) systems. We have found that several OSS systems appear not to obey some of "Lehman{\textquoteright}s laws" of software evolution [5, 7], and that Linux in particular is continuing to grow at a geometric rate. Currently, we are working on a detailed study of the evolution of one of the subsystems of the Linux kernel: the SCSI drivers subsystem. 
We have found that cloning, which is usually considered to be an indicator of lazy development and poor process, is quite common and is even considered to be a useful practice. Second, we are developing a tool called Beagle to aid software maintainers in understanding how large systems have changed over time. Beagle integrates data from various static analysis and metrics tools and provides a query engine as well as navigable visualizations. Of particular note, Beagle aims to provide help in modelling long term evolution of systems that have undergone architectural and structural change.}, keywords = {agile methods, beagle, cloning, evolution, fetchmail, gcc, growth, kernel, lehman{\textquoteright}s laws, lines of code, linux, linux kernel, loc, open source software, software architecture, software evolution, source code, structural change, supporting environments, vim}, isbn = {1-58113-508-4}, doi = {http://doi.acm.org/10.1145/602461.602482}, url = {http://doi.acm.org/10.1145/602461.602482}, attachments = {https://flosshub.org/sites/flosshub.org/files/tu2001.pdf}, author = {Michael Godfrey and Tu, Qiang} } @conference {1142, title = {Software Engineering Research in the Bazaar}, booktitle = {1st Workshop on Open Source Software Engineering at ICSE 2001}, year = {2001}, note = {"To gain a better understanding of a software system, we recover its software architecture from the system{\textquoteright}s source code. We recovered the architecture of many open source systems such as the Linux kernel [3], the Mozilla browser [5], the Apache web server [7], and the VIM editor [14]. 
The recovered architecture is browse-able to permit developers to interact with it, [10] shows an example for the Linux kernel."}, abstract = {During the last five years, our research group has studied the architecture and evolution of several large open source systems {\textemdash} including Linux, GCC, VIM, Mozilla, and Apache {\textemdash} and we have found that open source software systems often exhibit interesting differences when compared to similar commercially-developed systems. Our investigations of these systems have involved the creation of software architecture models, software architecture repair, the creation of a reference architecture for web servers, the study of evolution and growth of open source systems, and the modelling of architectural properties of systems that are apparent only at build time.}, keywords = {apache, architecture, gcc, kernel, linux, linux kernel, mozilla, open source software, software architecture, Software Engineering Research, source code, vim}, attachments = {https://flosshub.org/sites/flosshub.org/files/hassangodfreyholt.pdf}, author = {Hassan, Ahmed E. and Godfrey, Michael W. and Holt, Richard C.} } @article {flosswp26, title = {A Case Study of Open Source Software Development: The Apache Server}, journal = {Proceedings of the International Conference on Software Engineering (ICSE 2000)}, year = {2000}, note = {We used the following archival sources of data: Developer email list (EMAIL). Concurrent Version Control archive (CVS). Problem reporting database (BUGDB).}, month = {June}, abstract = {According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. We examine the development process of a major open source application, the Apache web server. 
By using email archives of source code change history and problem reports we quantify aspects of developer participation, core team size, code ownership, productivity, defect density, and problem resolution interval for this OSS project. This analysis reveals a unique process, which performs well on important measures.}, keywords = {apache, bug fix revisions, bugs, core, cvs, defect density, developers, email archives, participation, productivity, revision control, revision history, roles, scm, source code, team size}, attachments = {https://flosshub.org/sites/flosshub.org/files/mockusapache.pdf}, author = {Audris Mockus and Roy Fielding and Herbsleb, James} } @conference {Godfrey:2000:EOS:850948.853411, title = {Evolution in Open Source Software: A Case Study}, booktitle = {Proceedings of the International Conference on Software Maintenance (ICSM{\textquoteright}00)}, series = {ICSM {\textquoteright}00}, year = {2000}, note = {"We examined 96 kernel versions..." .c files, .h files only loc, lines of code number of functions number of modules}, pages = {131{\textendash}}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {Most studies of software evolution have been performed on systems developed within a single company using traditional management techniques. With the widespread availability of several large software systems that have been developed using an {\textquoteright}open source{\textquoteright} development approach, we now have a chance to examine these systems in detail, and see if their evolutionary narratives are significantly different from commercially developed systems. This paper summarizes our preliminary investigations into the evolution of the best known open source system: the Linux operating system kernel. 
Because Linux is large (over two million lines of code in the most recent version) and because its development model is not as tightly planned and managed as most industrial software processes, we had expected to find that Linux was growing more slowly as it got bigger and more complex. Instead, we have found that Linux has been growing at a super-linear rate for several years. In this paper, we explore the evolution of the Linux kernel both at the system level and within the major subsystems, and we discuss why we think Linux continues to exhibit such strong growth.}, keywords = {evolution, functions, growth, lines of code, linux, linux kernel, loc, source code}, isbn = {0-7695-0753-0}, url = {http://portal.acm.org/citation.cfm?id=850948.853411}, attachments = {https://flosshub.org/sites/flosshub.org/files/godfrey00.pdf}, author = {Godfrey, Michael W. and Tu, Qiang} }