% Publications on language/author entropy and SourceForge data mining.
% Citation keys are the numeric record ids of the original export and are
% kept unchanged so existing citations keep resolving.
% Conventions used below: one field per line, brace-delimited values,
% en-dash page ranges (--), and brace-protected proper nouns in titles.

% Journal article. The exported month value "32/2010" was not a valid month
% and has been dropped. NOTE(review): the DOI suffix (2010 04 ...) suggests
% April 2010 -- confirm against the publisher record before adding a month.
@article{1240,
  author   = {Krein, Jonathan L. and MacLean, Alexander C. and Knutson, Charles D. and Delorey, Daniel P. and Eggett, Dennis L.},
  title    = {Impact of Programming Language Fragmentation on Developer Productivity},
  journal  = {International Journal of Open Source Software and Processes},
  volume   = {2},
  year     = {2010},
  pages    = {41--61},
  issn     = {1942-3934},
  doi      = {10.4018/jossp.2010040104},
  keywords = {commits, entropy, language entropy, programming languages, sourceforge, srda},
  abstract = {Programmers often develop software in multiple languages. In an effort to study the effects of programming language fragmentation on productivity{\textemdash}and ultimately on a developer{\textquoteright}s problem-solving abilities{\textemdash}the authors present a metric, language entropy, for characterizing the distribution of a developer{\textquoteright}s programming efforts across multiple programming languages. This paper presents an observational study examining the project contributions of a random sample of 500 SourceForge developers. Using a random coefficients model, the authors find a statistically (alpha level of 0.001) and practically significant correlation between language entropy and the size of monthly project contributions. Results indicate that programming language fragmentation is negatively related to the total amount of code contributed by developers within SourceForge, an open source software (OSS) community.},
}

% Conference paper. The exported booktitle contained the proceedings name
% twice, back to back; it is given once here. The organization field repeated
% the publisher and was removed so the name is not printed twice.
@inproceedings{930,
  author    = {Casebolt, Jason R. and Krein, Jonathan L. and MacLean, Alexander C. and Knutson, Charles D. and Delorey, Daniel P.},
  title     = {Author entropy vs. file size in the {GNOME} suite of applications},
  booktitle = {2009 6th {IEEE} International Working Conference on Mining Software Repositories ({MSR})},
  year      = {2009},
  pages     = {91--94},
  publisher = {IEEE},
  address   = {Vancouver, BC, Canada},
  isbn      = {978-1-4244-3493-0},
  doi       = {10.1109/MSR.2009.5069484},
  keywords  = {author entropy, contributions, gnome, msr challenge},
  abstract  = {We present the results of a study in which author entropy was used to characterize author contributions per file. Our analysis reveals three patterns: banding in the data, uneven distribution of data across bands, and file size dependent distributions within bands. Our results suggest that when two authors contribute to a file, large files are more likely to have a dominant author than smaller files.},
}

% Workshop paper. The exported month field held the year ("2009") and was
% dropped. The note quotes two passages from the paper followed by the
% reference they cite as [6].
@inproceedings{1204,
  author      = {Krein, Jonathan L. and MacLean, Alexander C. and Delorey, Daniel P. and Knutson, Charles D. and Eggett, Dennis L.},
  title       = {Language entropy: A metric for characterization of author programming language distribution},
  booktitle   = {4th Workshop on Public Data about Software Development (WoPDaSD 2009)},
  year        = {2009},
  keywords    = {contributions, developers, language entropy, lines of code, loc, multiple languages, programming languages, sourceforge},
  abstract    = {Programmers are often required to develop in multiple languages. In an effort to study the effects of programming language fragmentation on productivity{\textemdash}and ultimately on a programmer{\textquoteright}s problem solving abilities{\textemdash}we propose a metric, language entropy, for characterizing the distribution of an individual{\textquoteright}s development efforts across multiple programming languages. To evaluate this metric, we present an observational study examining all project contributions (through August 2006) of a random sample of 500 SourceForge developers. Using a random coefficients model, we found a statistically significant correlation (alpha level of 0.05) between language entropy and the size of monthly project contributions (measured in lines of code added). Our results indicate that language entropy is a good candidate for characterizing author programming language distribution.},
  note        = {``The data set used in this study was previously collected for a separate, but related work. It was originally extracted from the SourceForge Research Archive (SFRA), August 2006. For a detailed discussion of the data source, collection tools and processes, and summary statistics, see [6].'' ``From the initial data set we extracted a random sample of 500 developers along with descriptive details of all revisions that those developers made since the inception of the projects on which they worked. We then condensed this sample by totaling the lines of code added by each developer for each month in which that developer made at least one code submission.'' [6] Daniel P. Delorey, Charles D. Knutson, and Alex MacLean. Studying production phase sourceforge projects: A case study using cvs2mysql and sfra+. In Second International Workshop on Public Data about Software Development (WoPDaSD {\textquoteright}07), June 2007.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/LanguageEntropy-JonathanKrein.pdf},
}

% This record was exported as a whole-proceedings entry, but it lists authors
% and has no editor or booktitle, so it describes a single paper. The venue is
% not recorded in this file, hence the generic misc type for now.
% TODO: switch to inproceedings once the booktitle is confirmed.
% The exported pages value "12 pp" was a page count, stored here as pagetotal.
@misc{1645,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and Davies, Mark},
  title       = {Mining Programming Language Vocabularies from Source Code},
  year        = {2009},
  pagetotal   = {12},
  abstract    = {We can learn much from the artifacts produced as the by-products of software development and stored in software repositories. Of all such potential data sources, one of the most important from the perspective of program comprehension is the source code itself. While other data sources give insight into what developers intend a program to do, the source code is the most accurate human-accessible description of what it will do. However, the ability of an individual developer to comprehend a particular source file depends directly on his or her familiarity with the specific features of the programming language being used in the file. This is not unlike the difficulties second-language learners may encounter when attempting to read a text written in a new language. We propose that by applying the techniques used by corpus linguists in the study of natural language texts to a corpus of programming language texts (i.e., source code repositories), we can gain new insights into the communication medium that is programming language. In this paper we lay the foundation for applying corpus linguistic methods to programming language by 1) defining the term {\textquotedblleft}word{\textquotedblright} for programming language, 2) developing data collection tools and a data storage schema for the Java programming language, and 3) presenting an initial analysis of an example linguistic corpus based on version 1.5 of the Java Developers Kit.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/21st-delorey.pdf},
}

% Workshop paper. The exported month field held the year ("2008") and was
% dropped.
@inproceedings{1210,
  author      = {Taylor, Quinn C. and Stevenson, James E. and Delorey, Daniel P. and Knutson, Charles D.},
  title       = {Author Entropy: A Metric for Characterization of Software Authorship Patterns},
  booktitle   = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)},
  year        = {2008},
  pages       = {42--47},
  keywords    = {developers, entropy, flossmole, sourceforge},
  abstract    = {We propose the concept of author entropy and describe how file-level entropy measures may be used to understand and characterize authorship patterns within individual files, as well as across an entire project. As a proof of concept, we compute author entropy for 28,955 files from 33 open-source projects. We explore patterns of author entropy, identify techniques for visualizing author entropy, and propose avenues for further study.},
  note        = {used flossmole to get sample of SF developers},
  attachments = {https://flosshub.org/sites/flosshub.org/files/entropy2008.pdf},
}

% Workshop paper. The note is a passage quoted from the paper; line-break
% hyphenation left over from text extraction has been rejoined.
@inproceedings{1225,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and Giraud-Carrier, C.},
  title       = {Programming Language Trends in Open Source Development: An Evaluation Using Data from All Production Phase {SourceForge} Projects},
  booktitle   = {2nd Workshop on Public Data about Software Development (WoPDaSD 2007)},
  year        = {2007},
  keywords    = {cvs, cvs2mysql, programming languages, sfra, sourceforge, srda},
  abstract    = {In this work, we analyze data collected from the CVS repositories of 9,997 Open Source projects hosted on SourceForge in an effort to understand trends in programming language usage in the Open Source community between 2000 and 2005. The trends we consider include: 1) the relative popularity of the ten most popular programming languages over time, 2) the use of multiple programming languages by individual programmers and by individual projects, and 3) the programming languages most often used in combination.},
  note        = {``Our data were gathered from the SourceForge Research Archive (SFRA) [4] and the CVS repositories Open Source projects hosted on SourceForge. We used cvs2mysql and SFRA+ to collect the data. cvs2mysql gathers data from CVS repositories and writes them to SQL scripts for import into a MySQL 5.0 database. The data collected by cvs2mysql are the name of the file, the location of the file in the repository, the type and state of the file, as well as the author, date, number of lines added and removed, and the author{\textquoteright}s message for each revision to the file.''},
  attachments = {https://flosshub.org/sites/flosshub.org/files/Delorey2007b.pdf},
}

% Workshop paper. The exported month field held the year ("2007") and was
% dropped.
@inproceedings{1212,
  author      = {Delorey, Daniel P. and Knutson, Charles D. and MacLean, Alexander C.},
  title       = {Studying Production Phase {SourceForge} Projects: An Exploratory Analysis Using {cvs2mysql} and {SFRA}},
  booktitle   = {2nd Workshop on Public Data about Software Development (WoPDaSD 2007)},
  year        = {2007},
  keywords    = {Data Collection, forge, repositories, sourceforge},
  abstract    = {A wealth of data can be extracted from the natural by-products of software development processes and used in empirical studies of software engineering. However, the size and accuracy of such studies depend in large part on the availability of tools that facilitate the collection of data from individual projects and the combination of data from multiple projects. To demonstrate this point, we present our experience gathering and analyzing data from nearly 10,000 open source projects hosted on SourceForge. We describe the tools we developed to collect the data and the ways in which these tools and data may be used by other researchers. We also provide examples of statistics that we have calculated from these data to describe interesting author- and project-level behaviors of the SourceForge community.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/Delorey2007c.pdf},
}