% Bibliography of software-repository mining publications.
% Entries use numeric citation keys; the keys are left untouched so that
% existing citations keep resolving. Text outside of entries is ignored
% by BibTeX and serves as commentary only.
% The "attachments" field is not a standard field; styles ignore it.

% Dataset paper: graph of Apache Software Foundation commit behavior, 2010-2011.
@inproceedings{1503,
  author   = {MacLean, Alexander C. and Knutson, Charles D.},
  title    = {{Apache} Commits: Social Network Dataset},
  year     = {2013},
  month    = may,
  url      = {http://sequoia.cs.byu.edu/lab/?page=artifacts/apacheGraphs},
  abstract = {Building non-trivial software is a social endeavor. Therefore, understanding the social network of developers is key to the study of software development organizations. We present a graph representation of the commit behavior of developers within the Apache Software Foundation for 2010 and 2011. Relationships between developers in the network represent collaborative commit behavior. Several similarity and summary metrics have been pre-calculated. The data, along with the tools that were used to create it and some further discussion, can be found at: http://sequoia.cs.byu.edu/lab/?page=artifacts/apacheGraphs},
}

% Author entropy across Java source files of Eclipse projects.
@inproceedings{1288,
  author    = {Taylor, Quinn C. and Krein, Jonathan L. and MacLean, Alexander C. and Knutson, Charles D.},
  title     = {An Analysis of Author Contribution Patterns in {Eclipse Foundation} Project Source Code},
  year      = {2011},
  month     = oct,
  pages     = {269--281},
  publisher = {Springer},
  keywords  = {COLLABORATION, contribution, eclipse, entropy, java},
  abstract  = {Collaborative development is a key tenet of open source software, but if not properly understood and managed, it can become a liability. We examine author contribution data for the newest revision of 251,633 Java source files in 592 Eclipse projects. We use this observational data to analyze collaboration patterns within files, and to explore relationships between file size, author count, and code authorship. We calculate author entropy to characterize the contributions of multiple authors to a given file, with an eye toward understanding the degree of collaboration and the most common interaction patterns.},
  note      = {``We conducted an observational study on existing Eclipse projects by extracting author attribution data for Java source code files from git repositories.'' ``Specifically, we consider entropy of source code by counting the number of lines attributed to each author.''},
}

% Topic modeling of commit log messages to explain very large commits.
@inproceedings{1289,
  author    = {Pratt, Landon J. and MacLean, Alexander C. and Knutson, Charles D. and Ringger, Eric K.},
  title     = {Cliff Walls: An Analysis of Monolithic Commits Using {Latent Dirichlet Allocation}},
  year      = {2011},
  month     = oct,
  pages     = {282--298},
  publisher = {Springer},
  keywords  = {artifacts, commit, cvs, LDA, lines of code, log files, scm, sloc, sourceforge, version control},
  abstract  = {Artifact-based research provides a mechanism whereby researchers may study the creation of software yet avoid many of the difficulties of direct observation and experimentation. However, there are still many challenges that can affect the quality of artifact-based studies, especially those studies examining software evolution. Large commits, which we refer to as {\textquotedblleft}Cliff Walls,{\textquotedblright} are one significant threat to studies of software evolution because they do not appear to represent incremental development. We used Latent Dirichlet Allocation to extract topics from over 2 million commit log messages, taken from 10,000 SourceForge projects. The topics generated through this method were then analyzed to determine the causes of over 9,000 of the largest commits. We found that branch merges, code imports, and auto-generated documentation were significant causes of large commits. We also found that corrective maintenance tasks, such as bug fixes, did not play a significant role in the creation of large commits.},
  note      = {``Our data set consists of the version control logs of almost 10,000 projects from SourceForge, acquired in late 2006''},
}

% Developer communication and specialization in the Apache HTTP Server project.
@inproceedings{1277,
  author      = {MacLean, Alexander C. and Pratt, Landon J. and Knutson, Charles D. and Ringger, Eric K.},
  title       = {Knowledge Homogeneity and Specialization in the {Apache HTTP Server} Project},
  year        = {2011},
  month       = oct,
  pages       = {106--122},
  publisher   = {Springer},
  url         = {http://sequoia.cs.byu.edu/lab/files/pubs/MacLean2011a.pdf},
  keywords    = {apache, commits, developer, email, email archive, LDA, mailing list, revision control, revision history, scm, social network analysis, specialization, subversion, svn},
  abstract    = {We present an analysis of developer communication in the Apache HTTP Server project. Using topic modeling techniques we expose latent conceptual sub-communities arising from developer specialization within the greater developer population. However, we found that among the major contributors to the project, very little specialization exists. We present theories to explain this phenomenon, and suggest further research.},
  note        = {``Our data set consists of the commit history and email archives for the Apache HTTP Server Project, spanning sixteen years (2/27/1995 - 1/31/2011)'' ``we 1) mapped the committers to email records, 2) cleaned the email records to remove extraneous information, 3) identified topics of discussion in the resulting messages, and 4) constructed a social network model from committers and topics.'' ``If specialization exists within the httpd community, we should see distinct communities develop around topics. In addition, unique groups of developers should congregate around specialized subtopics. We examined the data from both angles: topical affinity and topic communities.''},
  attachments = {https://flosshub.org/sites/flosshub.org/files/MacLean2011a.pdf},
}

% Journal article on language entropy and developer productivity.
% NOTE(review): the original month value was the impossible "32/2010";
% "apr" is inferred from the DOI segment "2010040104" - confirm against the publisher record.
@article{1240,
  author   = {Krein, Jonathan L. and MacLean, Alexander C. and Knutson, Charles D. and Delorey, Daniel P. and Eggett, Dennis L.},
  title    = {Impact of Programming Language Fragmentation on Developer Productivity},
  journal  = {International Journal of Open Source Software and Processes},
  volume   = {2},
  year     = {2010},
  month    = apr,
  pages    = {41--61},
  issn     = {1942-3934},
  doi      = {10.4018/jossp.2010040104},
  keywords = {commits, entropy, language entropy, programming languages, sourceforge, srda},
  abstract = {Programmers often develop software in multiple languages. In an effort to study the effects of programming language fragmentation on productivity{\textemdash}and ultimately on a developer{\textquoteright}s problem-solving abilities{\textemdash}the authors present a metric, language entropy, for characterizing the distribution of a developer{\textquoteright}s programming efforts across multiple programming languages. This paper presents an observational study examining the project contributions of a random sample of 500 SourceForge developers. Using a random coefficients model, the authors find a statistically (alpha level of 0.001) and practically significant correlation between language entropy and the size of monthly project contributions. Results indicate that programming language fragmentation is negatively related to the total amount of code contributed by developers within SourceForge, an open source software (OSS) community.},
}

% Cliff walls in SourceForge repository timelines and their effect on temporal analysis.
@inproceedings{1217,
  author      = {MacLean, Alexander C. and Pratt, Landon J. and Krein, Jonathan L. and Knutson, Charles D.},
  title       = {Trends That Affect Temporal Analysis Using {SourceForge} Data},
  booktitle   = {5th Workshop on Public Data about Software Development ({WoPDaSD} 2010)},
  year        = {2010},
  keywords    = {cliff walls, committers, cvs, evolution, growth, source code, sourceforge, time, time series},
  abstract    = {SourceForge is a valuable source of software artifact data for researchers who study project evolution and developer behavior. However, the data exhibit patterns that may bias temporal analyses. Most notable are cliff walls in project source code repository timelines, which indicate large commits that are out of character for the given project. These cliff walls often hide significant periods of development and developer collaboration{\textemdash}a threat to studies that rely on SourceForge repository data. We demonstrate how to identify these cliff walls, discuss reasons for their appearance, and propose preliminary measures for mitigating their effects in evolution-oriented studies.},
  note        = {``In this paper we examine some of the limitations of artifact data by specifically addressing the applicability of SourceForge data to the study of project evolution.'' ``For our analysis we examine 9,997 Production/Stable or Maintenance phase projects stored in CVS on SourceForge and extracted in October of 2006 [5]''},
  attachments = {https://flosshub.org/sites/flosshub.org/files/wopdasd001.pdf},
}

% Author entropy per file in GNOME; the booktitle was previously duplicated back-to-back.
@inproceedings{930,
  author       = {Casebolt, Jason R. and Krein, Jonathan L. and MacLean, Alexander C. and Knutson, Charles D. and Delorey, Daniel P.},
  title        = {Author Entropy vs. File Size in the {GNOME} Suite of Applications},
  booktitle    = {2009 6th {IEEE} International Working Conference on Mining Software Repositories ({MSR})},
  year         = {2009},
  pages        = {91--94},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Vancouver, BC, Canada},
  isbn         = {978-1-4244-3493-0},
  doi          = {10.1109/MSR.2009.5069484},
  keywords     = {author entropy, contributions, gnome, msr challenge},
  abstract     = {We present the results of a study in which author entropy was used to characterize author contributions per file. Our analysis reveals three patterns: banding in the data, uneven distribution of data across bands, and file size dependent distributions within bands. Our results suggest that when two authors contribute to a file, large files are more likely to have a dominant author than smaller files.},
}

% Workshop paper proposing the language entropy metric.
@inproceedings{1204,
  author      = {Krein, Jonathan L. and MacLean, Alexander C. and Delorey, Daniel P. and Knutson, Charles D. and Eggett, Dennis L.},
  title       = {Language Entropy: A Metric for Characterization of Author Programming Language Distribution},
  booktitle   = {4th Workshop on Public Data about Software Development ({WoPDaSD} 2009)},
  year        = {2009},
  keywords    = {contributions, developers, language entropy, lines of code, loc, multiple languages, programming languages, sourceforge},
  abstract    = {Programmers are often required to develop in multiple languages. In an effort to study the effects of programming language fragmentation on productivity{\textemdash}and ultimately on a programmer{\textquoteright}s problem solving abilities{\textemdash}we propose a metric, language entropy, for characterizing the distribution of an individual{\textquoteright}s development efforts across multiple programming languages. To evaluate this metric, we present an observational study examining all project contributions (through August 2006) of a random sample of 500 SourceForge developers. Using a random coefficients model, we found a statistically significant correlation (alpha level of 0.05) between language entropy and the size of monthly project contributions (measured in lines of code added). Our results indicate that language entropy is a good candidate for characterizing author programming language distribution.},
  note        = {``The data set used in this study was previously collected for a separate, but related work. It was originally extracted from the SourceForge Research Archive (SFRA), August 2006. For a detailed discussion of the data source, collection tools and processes, and summary statistics, see [6].'' ``From the initial data set we extracted a random sample of 500 developers along with descriptive details of all revisions that those developers made since the inception of the projects on which they worked. We then condensed this sample by totaling the lines of code added by each developer for each month in which that developer made at least one code submission.'' [6] Daniel P. Delorey, Charles D. Knutson, and Alex MacLean. Studying production phase sourceforge projects: A case study using cvs2mysql and sfra+. In Second International Workshop on Public Data about Software Development (WoPDaSD {\textquoteright}07), June 2007.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/LanguageEntropy-JonathanKrein.pdf},
}

% Corpus-linguistic treatment of Java source code.
% The original "pages" value was a page count ("12 pp"), not a range, so it lives in "note".
@inproceedings{1645,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and Davies, Mark},
  title       = {Mining Programming Language Vocabularies from Source Code},
  year        = {2009},
  abstract    = {We can learn much from the artifacts produced as the by-products of software development and stored in software repositories. Of all such potential data sources, one of the most important from the perspective of program comprehension is the source code itself. While other data sources give insight into what developers intend a program to do, the source code is the most accurate human-accessible description of what it will do. However, the ability of an individual developer to comprehend a particular source file depends directly on his or her familiarity with the specific features of the programming language being used in the file. This is not unlike the difficulties second-language learners may encounter when attempting to read a text written in a new language. We propose that by applying the techniques used by corpus linguists in the study of natural language texts to a corpus of programming language texts (i.e., source code repositories), we can gain new insights into the communication medium that is programming language. In this paper we lay the foundation for applying corpus linguistic methods to programming language by 1) defining the term {\textquotedblleft}word{\textquotedblright} for programming language, 2) developing data collection tools and a data storage schema for the Java programming language, and 3) presenting an initial analysis of an example linguistic corpus based on version 1.5 of the Java Developers Kit.},
  note        = {12 pages},
  attachments = {https://flosshub.org/sites/flosshub.org/files/21st-delorey.pdf},
}

% Workshop paper introducing author entropy at file level.
@inproceedings{1210,
  author      = {Taylor, Quinn C. and Stevenson, James E. and Delorey, Daniel P. and Knutson, Charles D.},
  title       = {Author Entropy: A Metric for Characterization of Software Authorship Patterns},
  booktitle   = {3rd Workshop on Public Data about Software Development ({WoPDaSD} 2008)},
  year        = {2008},
  pages       = {42--47},
  keywords    = {developers, entropy, flossmole, sourceforge},
  abstract    = {We propose the concept of author entropy and describe how file-level entropy measures may be used to understand and characterize authorship patterns within individual files, as well as across an entire project. As a proof of concept, we compute author entropy for 28,955 files from 33 open-source projects. We explore patterns of author entropy, identify techniques for visualizing author entropy, and propose avenues for further study.},
  note        = {used flossmole to get sample of SF developers},
  attachments = {https://flosshub.org/sites/flosshub.org/files/entropy2008.pdf},
}

% Programming language usage trends across SourceForge CVS repositories, 2000-2005.
@inproceedings{1225,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and Giraud-Carrier, C.},
  title       = {Programming Language Trends in Open Source Development: An Evaluation Using Data from All Production Phase {SourceForge} Projects},
  booktitle   = {2nd Workshop on Public Data about Software Development ({WoPDaSD} 2007)},
  year        = {2007},
  keywords    = {cvs, cvs2mysql, programming languages, sfra, sourceforge, srda},
  abstract    = {In this work, we analyze data collected from the CVS repositories of 9,997 Open Source projects hosted on SourceForge in an effort to understand trends in programming language usage in the Open Source community between 2000 and 2005. The trends we consider include: 1) the relative popularity of the ten most popular programming languages over time, 2) the use of multiple programming languages by individual programmers and by individual projects, and 3) the programming languages most often used in combination.},
  note        = {``Our data were gathered from the SourceForge Research Archive (SFRA) [4] and the CVS repositories Open Source projects hosted on SourceForge. We used cvs2mysql and SFRA+ to collect the data. cvs2mysql gathers data from CVS repositories and writes them to SQL scripts for import into a MySQL 5.0 database. The data collected by cvs2mysql are the name of the file, the location of the file in the repository, the type and state of the file, as well as the author, date, number of lines added and removed, and the author{\textquoteright}s message for each revision to the file.''},
  attachments = {https://flosshub.org/sites/flosshub.org/files/Delorey2007b.pdf},
}

% Tools and experience collecting data from nearly 10,000 SourceForge projects.
@inproceedings{1212,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and MacLean, Alexander C.},
  title       = {Studying Production Phase {SourceForge} Projects: An Exploratory Analysis Using {cvs2mysql} and {SFRA}},
  booktitle   = {2nd Workshop on Public Data about Software Development ({WoPDaSD} 2007)},
  year        = {2007},
  keywords    = {Data Collection, forge, repositories, sourceforge},
  abstract    = {A wealth of data can be extracted from the natural by-products of software development processes and used in empirical studies of software engineering. However, the size and accuracy of such studies depend in large part on the availability of tools that facilitate the collection of data from individual projects and the combination of data from multiple projects. To demonstrate this point, we present our experience gathering and analyzing data from nearly 10,000 open source projects hosted on SourceForge. We describe the tools we developed to collect the data and the ways in which these tools and data may be used by other researchers. We also provide examples of statistics that we have calculated from these data to describe interesting author- and project-level behaviors of the SourceForge community.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/Delorey2007c.pdf},
}