% Two conference papers by the same BYU group, both dated October 2011 and
% published by Springer. They are typed as inproceedings (not proceedings)
% because they are individual papers with authors; standard styles ignore the
% author field on a whole-proceedings entry. Citation keys are unchanged.
% NOTE(review): booktitle was filled in from the published proceedings record
% (OSS 2011) and is not present in the original export -- please confirm.

% Key 1289: LDA topic analysis of very large ("Cliff Wall") commits across
% SourceForge projects. The note field holds a quotation describing the data set.
@inproceedings{1289,
  author    = {Pratt, Landon J. and MacLean, Alexander C. and Knutson, Charles D. and Ringger, Eric K.},
  title     = {Cliff Walls: An Analysis of Monolithic Commits Using Latent {Dirichlet} Allocation},
  booktitle = {Open Source Systems: Grounding Research},
  year      = {2011},
  month     = oct,
  pages     = {282--298},
  publisher = {Springer},
  note      = {``Our data set consists of the version control logs of almost 10,000 projects from SourceForge, acquired in late 2006''},
  abstract  = {Artifact-based research provides a mechanism whereby researchers may study the creation of software yet avoid many of the difficulties of direct observation and experimentation. However, there are still many challenges that can affect the quality of artifact-based studies, especially those studies examining software evolution. Large commits, which we refer to as {\textquotedblleft}Cliff Walls,{\textquotedblright} are one significant threat to studies of software evolution because they do not appear to represent incremental development. We used Latent Dirichlet Allocation to extract topics from over 2 million commit log messages, taken from 10,000 SourceForge projects. The topics generated through this method were then analyzed to determine the causes of over 9,000 of the largest commits. We found that branch merges, code imports, and auto-generated documentation were significant causes of large commits. We also found that corrective maintenance tasks, such as bug fixes, did not play a significant role in the creation of large commits.},
  keywords  = {artifacts, commit, cvs, LDA, lines of code, log files, scm, sloc, sourceforge, version control},
}

% Key 1277: topic-model and social-network analysis of developer email and
% commit history in the Apache HTTP Server project. The note field holds three
% quotations describing the data set and method. The attachments field is a
% non-standard field kept from the export; bibliography styles ignore it.
@inproceedings{1277,
  author      = {MacLean, Alexander C. and Pratt, Landon J. and Knutson, Charles D. and Ringger, Eric K.},
  title       = {Knowledge Homogeneity and Specialization in the {Apache} {HTTP} {Server} Project},
  booktitle   = {Open Source Systems: Grounding Research},
  year        = {2011},
  month       = oct,
  pages       = {106--122},
  publisher   = {Springer},
  note        = {``Our data set consists of the commit history and email archives for the Apache HTTP Server Project, spanning sixteen years (2/27/1995 - 1/31/2011)'' ``we 1) mapped the committers to email records, 2) cleaned the email records to remove extraneous information, 3) identified topics of discussion in the resulting messages, and 4) constructed a social network model from committers and topics.'' ``If specialization exists within the httpd community, we should see distinct communities develop around topics. In addition, unique groups of developers should congregate around specialized subtopics. We examined the data from both angles: topical affinity and topic communities.''},
  abstract    = {We present an analysis of developer communication in the Apache HTTP Server project. Using topic modeling techniques we expose latent conceptual sub-communities arising from developer specialization within the greater developer population. However, we found that among the major contributors to the project, very little specialization exists. We present theories to explain this phenomenon, and suggest further research.},
  keywords    = {apache, commits, developer, email, email archive, LDA, mailing list, revision control, revision history, scm, social network analysis, specialization, subversion, svn},
  url         = {http://sequoia.cs.byu.edu/lab/files/pubs/MacLean2011a.pdf},
  attachments = {https://flosshub.org/sites/flosshub.org/files/MacLean2011a.pdf},
}