@proceedings {1901, title = {Adoption of Academic Tools in Open Source Communities: The Debian Case Study}, volume = {496}, year = {2017}, month = {05/2017}, pages = {139-150}, publisher = {Springer}, abstract = {Component repositories play a key role in the open software ecosystem. Managing the evolution of these repositories is a challenging task, and maintainers are confronted with a number of complex issues that need automatic tools to be addressed properly. In this paper, we present an overview of 10 years of research in this field and the process leading to the adoption of our tools in a FOSS community. We focus on the Debian distribution and in particular we look at the issues arising during the distribution lifecycle: ensuring buildability of source packages, detecting packages that cannot be installed and bootstrapping the distribution on a new architecture. We present three tools, distcheck, buildcheck and botch, that we believe of general interest for other open source component repositories. The lesson we have learned during this journey may provide useful guidance for researchers willing to see their tools broadly adopted by the community.}, keywords = {components, debian, distribution}, doi = {10.1007/978-3-319-57735-7_14}, url = {https://link.springer.com/chapter/10.1007/978-3-319-57735-7_14}, author = {Abate, Pietro and Di Cosmo, Roberto} } @proceedings {1902, title = {Assessing Code Authorship: The Case of the Linux Kernel}, volume = {496}, year = {2017}, month = {05/2017}, pages = {151-163}, publisher = {Springer}, abstract = {Code authorship is a key information in large-scale open-source systems. Among others, it allows maintainers to assess division of work and identify key collaborators. Interestingly, open-source communities lack guidelines on how to manage authorship. This could be mitigated by setting to build an empirical body of knowledge on how authorship-related measures evolve in successful open-source communities. 
Towards that direction, we perform a case study on the Linux kernel. Our results show that: (a) only a small portion of developers (26\%) makes significant contributions to the code base; (b) the distribution of the number of files per author is highly skewed{\textemdash}a small group of top-authors (3\%) is responsible for hundreds of files, while most authors (75\%) are responsible for at most 11 files; (c) most authors (62\%) have a specialist profile; (d) authors with a high number of co-authorship connections tend to collaborate with others with less connections.}, keywords = {code authorship, developer network, linux kernel}, doi = {10.1007/978-3-319-57735-7_15}, url = {https://link.springer.com/chapter/10.1007/978-3-319-57735-7_15}, author = {Guilherme Avelino and Passos, Leonardo and Andre Hora and Marco Tulio Valente} } @proceedings {1895, title = {How are Developers Treating License Inconsistency Issues? A Case Study on License Inconsistency Evolution in FOSS Projects}, volume = {496}, year = {2017}, month = {05/2017}, pages = {69-79}, publisher = {Springer}, abstract = {A license inconsistency is the presence of two or more source files that evolved from the same original file containing different licenses. In our previous study, we have shown that license inconsistencies do exist in open source projects and may lead to potential license violation problems. In this study, we try to find out whether the issues of license inconsistencies are properly solved by analyzing two versions of a FOSS distribution{\textemdash}Debian{\textemdash}and investigate the evolution patterns of license inconsistencies. 
Findings are: license inconsistencies occur mostly because the original copyright owner updated the license while the reusers were still using the old version of the source files with the old license; most license inconsistencies would disappear when the reusers synchronize their project from the upstream, while some would exist permanently if reusers decide not to synchronize anymore. Legally suspicious cases have not been found yet in those Debian distributions.}, keywords = {Code clone, debian, License inconsistency, licenses, Software license}, doi = {10.1007/978-3-319-57735-7_8}, url = {https://link.springer.com/chapter/10.1007/978-3-319-57735-7_8}, author = {Y. Wu and Manabe, Yuki and Daniel M. Germ{\'a}n and Inoue, K.} } @conference {Sharma:2017:IDE:3084226.3084271, title = {Investigating Developers{\textquoteright} Email Discussions During Decision-making in Python Language Evolution}, booktitle = {Proceedings of the 21st International Conference on Evaluation and Assessment in Software Engineering}, series = {EASE{\textquoteright}17}, year = {2017}, pages = {286{\textendash}291}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Context: Open Source Software (OSS) developers use mailing lists as their main forum for discussing the evolution of a project. However, the use of mailing lists by developers for decision-making has not received much research attention. Objective: We have explored this issue by studying developers{\textquoteright} email discussions around Python Enhancement Proposals (PEPs). Method: Our dataset comprised 42,672 emails from six different mailing lists pertaining to PEP development. We performed multiple forms of analysis on these emails, involving both quantitative measures (e.g., frequency) and deeper analysis of specific PEP discussions (i.e., outlier analysis). 
Results: Out of three PEP types (Informational, Process and Standard Track), Standard Track PEPs attract a large amount of discussion (both in volume and average number of messages per proposal). Our study also identified specific PEP states and topics that generated a disproportionate amount of discussion. Conclusion: Our outcomes point to several opportunities for improving the management of an OSS team based on the knowledge generated from discussions. We have also identified several interesting avenues for future work such as identifying individuals or groups that present persuasive arguments during decision-making.}, keywords = {Decision-making, Email discussions, Python development}, isbn = {978-1-4503-4804-1}, doi = {10.1145/3084226.3084271}, url = {http://doi.acm.org/10.1145/3084226.3084271}, author = {Sharma, Pankajeshwara and Savarimuthu, Bastin Tony Roy and Stanger, Nigel and Licorish, Sherlock A. and Rainer, Austen} } @article {1868, title = {Managing knowledge sharing in distributed innovation from the perspective of developers: empirical study of open source software projects in China}, journal = {Technology Analysis \& Strategic Management}, volume = {29}, year = {2017}, month = {01/2017}, pages = {1--22}, abstract = {Knowledge sharing is the key factor that influences the performance of open source software (OSS) projects, which are the representative cases of distributed innovation. This paper aims to explore the mechanism of knowledge sharing in OSS projects from the perspective of developers in China. A quantitative method with the analysis of 403 valid questionnaires is adopted. A series of hypotheses about how distributed innovation (independent variables) influences knowledge sharing (mediating variable) and then affects the performance of OSS projects (dependent variable) are tested and approved. 
On the one side, we argue that developers will actively affect knowledge sharing in terms of participative motivation, social network and organisational culture. On the other hand, users may also affect the knowledge sharing when considering innovation willingness and capacity. It is interesting to find that social network is the most important factor in Chinese cases. It is strongly recommended to strengthen the collaboration between software companies and OSS communities.}, keywords = {Distributed innovation, knowledge sharing, performance of OSS projects, user innovation}, issn = {1465-3990}, doi = {10.1080/09537325.2016.1194387}, author = {Chen, Xiaohong and Zhou, Yuan and Probert, David and Su, Jun} } @proceedings {1891, title = {Principled Evaluation of Strengths and Weaknesses in FLOSS Communities: A Systematic Mixed Methods Maturity Model Approach}, volume = {496}, year = {2017}, month = {05/2017}, pages = {34-46}, publisher = {Springer}, abstract = {Context: Free and Open Source Software usually results from intricate socio-technical dynamics operating in a diverse and geographically dispersed community. Understanding the fundamental underpinnings of healthy and thriving communities is of paramount importance to evaluate existing efforts and identify improvement opportunities. Objective: This paper presents a novel reference model for evaluating the maturity of FLOSS communities by mixing quantitative and qualitative methods. Method: We build upon established guidelines for Design Science research in order to devise a well-informed and expressive maturity model, describing how those methods and procedures were used in the design and development of such a model. Results: We present the model structure and functions, as well as instructions on how to instantiate it as evaluations of FLOSS communities. The use of the proposed maturity model is demonstrated in four FLOSS communities. 
Conclusion: Whilst instantiating the model may be burdensome if aiming at sketchy evaluations, results indicate our model effectively captures the maturity regardless aspects such as community size and lifetime.}, keywords = {Design science research, Discourse communities, evaluation, FLOSS communities, maturity models, Mixed methods research}, doi = {10.1007/978-3-319-57735-7_4}, url = {https://link.springer.com/chapter/10.1007/978-3-319-57735-7_4}, author = {Andrade, S and Saraiva, F.} } @proceedings {1899, title = {Progression and Forecast of a Curated Web-of-Trust: A Study on the Debian Project{\textquoteright}s Cryptographic Keyring}, volume = {496}, year = {2017}, month = {05/2017}, pages = {117-127}, publisher = {Springer}, abstract = {The Debian project is one of the largest free software undertakings worldwide. It is geographically distributed, and participation in the project is done on a voluntary basis, without a single formal employee or directly funded person. As we will explain, due to the nature of the project, its authentication needs are very strict{\textemdash}User/password schemes are way surpassed, and centralized trust management schemes such as PKI are not compatible with its distributed and flat organization; fully decentralized schemes such as the PGP Web of Trust are insufficient by themselves. The Debian project has solved this need by using what we termed a {\textquotedblleft}curated Web of Trust{\textquotedblright}. We will explain some lessons learned from a massive key migration process that was triggered in 2014. 
We will present the social insight we have found from examining the relationships expressed as signatures in this curated Web of Trust, some recommendations on personal key-signing policies, and a statistical study and forecast on aging, refreshment and survival of project participants stemming from an analysis on their key-handling.}, keywords = {cryptography, curated Web of Trust, debian, Keyring, trust management}, doi = {10.1007/978-3-319-57735-7_12}, url = {https://link.springer.com/chapter/10.1007/978-3-319-57735-7_12}, author = {Gunnar Wolf and V{\'\i}ctor Gonz{\'a}lez Quiroga} } @conference {Izquierdo-Cortazar:2017:UMT:3084226.3084247, title = {Using Metrics to Track Code Review Performance}, booktitle = {Proceedings of the 21st International Conference on Evaluation and Assessment in Software Engineering}, series = {EASE{\textquoteright}17}, year = {2017}, pages = {214{\textendash}223}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {During 2015, some members of the Xen Project Advisory Board became worried about the performance of their code review process. The Xen Project is a free, open source software project developing one of the most popular virtualization platforms in the industry. They use a pre-commit peer review process similar to that in the Linux kernel, based on email messages. They had observed a large increase over time in the number of messages related to code review, and were worried about how this could be a signal of problems with their code review process. To address these concerns, we designed and conducted, with their continuous feedback, a detailed analysis focused on finding these problems, if any. During the study, we dealt with the methodological problems of Linux-like code review, and with the deeper issue of finding metrics that could uncover the problems they were worried about. 
For having a benchmark, we run the same analysis on a similar project, which uses very similar code review practices: the Linux Netdev (Netdev) project. As a result, we learned how in fact the Xen Project had some problems, but at the moment of the analysis those were already under control. We found as well how different the Xen and Netdev projects were behaving with respect to code review performance, despite being so similar from many points of view. In this paper we show the results of both analyses, and propose a comprehensive methodology, fully automated, to study Linux-style code review. We discuss also the problems of getting significant metrics to track improvements or detect problems in this kind of code review.}, keywords = {code review, data mining, Software development analytics}, isbn = {978-1-4503-4804-1}, doi = {10.1145/3084226.3084247}, url = {http://doi.acm.org/10.1145/3084226.3084247}, author = {Izquierdo-Cortazar, Daniel and Sekitoleko, Nelson and Jesus M. Gonzalez-Barahona and Kurth, Lars} } @article {1861, title = {The Debsources Dataset: two decades of free and open source software}, journal = {Empirical Software Engineering}, year = {2016}, publisher = {Springer}, abstract = {We present the Debsources Dataset: distribution metadata and source code metrics spanning two decades of Free and Open Source Software (FOSS) history, seen through the lens of the Debian distribution. Debsources is a software platform used to gather, search, and publish on the Web the full source code of the Debian operating system, as well as measures about it. A notable public instance of Debsources is available at http://sources.debian.net; it includes both current and historical releases of Debian. Plugins to compute popular source code metrics (lines of code, defined symbols, disk usage) and other derived data (e.g., checksums) have been written, integrated, and run on all the source code available on sources.debian.net. 
The Debsources Dataset is a PostgreSQL database dump of sources.debian.net metadata, as of February 10th, 2015. The dataset contains both Debian-specific metadata -- e.g., which software packages are available in which release, which source code file belong to which package, release dates, etc. -- and source code information gathered by running Debsources plugins. The Debsources Dataset offer a very long-term historical view of the macro-level evolution and constitution of FOSS through the lens of popular, representative FOSS projects of their times.}, keywords = {debian, metadata, postgresql}, issn = {1573-7616}, doi = {10.1007/s10664-016-9461-5}, url = {https://matthieu.io/dl/papers/debsources-ese-2016.pdf}, author = {Caneill, Matthieu and Daniel M. Germ{\'a}n and Zacchiroli, Stefano} } @proceedings {1808, title = {How Do Free/Open Source Developers Pick Their Tools? A Delphi Study of the Debian Project}, year = {2016}, abstract = {Free and Open Source Software (FOSS) has come to play a critical role in the global software industry. Organizations are widely adopting FOSS and interacting with open source communities, and hence organizations have a considerable interest in seeing these communities flourishing. Very little research has focused on the tools used to develop that software. Given the absence of organizational policies and mandate that would occur in a traditional environment, an open question is how FOSS developers decide what tools to use. In this paper we report on a policy delphi study conducted in the Debian Project, one of the largest FOSS projects. Drawing from data collected in three phases from a panel of 21 experts, we identified 15 factors that affect their decision to adopt tools. 
This in turn can help FOSS communities to define a suitable policy of actions, in order to improve their processes.}, keywords = {Delphi, Free/Open Source Software, Qualitative Study, study, tools}, url = {https://www.researchgate.net/publication/291312269_How_Do_FreeOpen_Source_Developers_Pick_Their_Tools_A_Delphi_Study_of_the_Debian_Project}, author = {Martin Krafft and Stol, Klaas-Jan and Fitzgerald, Brian} } @proceedings {1763, title = {A Dataset of High Impact Bugs: Manually-Classified Issue Reports}, year = {2015}, month = {05/2015}, publisher = {IEEE}, abstract = {The importance of supporting test and maintenance activities in software development has been increasing, since recent software systems have become large and complex. Although in the field of Mining Software Repositories (MSR) there are many promising approaches to predicting, localizing, and triaging bugs, most of them do not consider impacts of each bug on users and developers but rather treat all bugs with equal weighting, excepting a few studies on high impact bugs including security, performance, blocking, and so forth. To make MSR techniques more actionable and effective in practice, we need deeper understandings of high impact bugs. 
In this paper we introduced our dataset of high impact bugs which was created by manually reviewing four thousand issue reports in four open source projects (Ambari, Camel, Derby and Wicket).}, keywords = {ambari, camel, derby, wicket}, url = {http://oss.sys.wakayama-u.ac.jp/publications/pman3.cgi?DOWNLOAD=141}, attachments = {https://flosshub.org/sites/flosshub.org/files/5594a518.pdf}, author = {Ohira, Masao and Yutaro Kashiwa and Yosuke Yamatani and Hayato Yoshiyuki and Yoshiya Maeda and Nachai Limsettho and Keisuke Fujino and Hata, Hideaki and Ihara, Akinori and Kenichi Matsumoto} } @proceedings {1757, title = {The Debsources Dataset: Two Decades of Debian Source Code Metadata}, year = {2015}, month = {05/2015}, publisher = {IEEE}, abstract = {We present the Debsources Dataset: distribution metadata and source code metrics spanning two decades of Free and Open Source Software (FOSS) history, seen through the lens of the Debian distribution. Debsources is a software platform used to gather, search, and publish on the Web the full source code of the Debian operating system, as well as measures about it. A notable public instance of Debsources is available at http://sources.debian.net; it includes both current and historical releases of Debian. Plugins to compute popular source code metrics (lines of code, defined symbols, disk usage) and other derived data (e.g., checksums) have been written, integrated, and run on all the source code available on sources.debian.net. The Debsources Dataset is a PostgreSQL database dump of sources.debian.net metadata, as of February 10th, 2015. The dataset contains both Debian-specific metadata{\textemdash}e.g., which software packages are available in which release, which source code file belong to which package, release dates, etc.{\textemdash}and source code information gathered by running Debsources plugins. 
The Debsources Dataset offer a very long-term historical view of the macro-level evolution and constitution of FOSS through the lens of popular, representative FOSS projects of their times.}, keywords = {debian}, url = {https://upsilon.cc/~zack/research/publications/debsources-msr-2015.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/debsources-msr-2015.pdf}, author = {Zacchiroli, Stefano} } @proceedings {1748, title = {Mining Component Repositories for Installability Issues}, year = {2015}, month = {05/2015}, abstract = {Component repositories play an increasingly relevant role in software life-cycle management, from software distribution to end-user, to deployment and upgrade management. Software components shipped via such repositories are equipped with rich metadata that describe their relationship (e.g., dependencies and conflicts) with other components. In this practice paper we show how to use a tool, distcheck, that uses component metadata to identify all the components in a repository that cannot be installed (e.g., due to unsatisfiable dependencies), provides detailed information to help developers understanding the cause of the problem, and fix it in the repository. We report about detailed analyses of several repositories: the Debian distribution, the OPAM package collection, and Drupal modules. In each case, distcheck is able to efficiently identify not installable components and provide valuable explanations of the issues. 
Our experience provides solid ground for generalizing the use of distcheck to other component repositories.}, keywords = {drupal, opam}, url = {http://www.dicosmo.org/preprints/msr-2015-distcheck.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/msr-2015-distcheck.pdf}, author = {Abate, Pietro and Di Cosmo, Roberto and Gesbert, Louis and Fabrice Le Fessant and Ralf Treinen and Zacchiroli, Stefano} } @inbook {1738, title = {A Qualitative Study on the Adoption of Open Source Software in Information Technology Outsourcing Organizations}, booktitle = {Open Source Systems: Adoption and Impact}, series = {IFIP Advances in Information and Communication Technology}, volume = {451}, year = {2015}, pages = {103-113}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {The purpose of this paper is to identify the influence of Outsourcing on Open source software (OSS) and further investigate the factors that impact the adoption of OSS in global Information Technology (IT) outsourcing organizations serviced by Indian IT services providers. This exploratory research adopted positivism research philosophy and qualitative approach. An in-depth interview was conducted with ten participants across IT outsourcing organizations, IT service providers, and OSS service providers. The results show that IT outsourcing was not found to have an impact on OSS adoption. However, eight factors including management support and OSS support availability was identified to influence OSS adoption. IT services providers can utilize this research model to increase their understanding of why some IT outsourcing organizations choose to adopt OSS, while seemingly similar ones facing similar market conditions do not. 
}, keywords = {Diffusion of innovation, Indian IT, IT outsourcing, open source software, OSS adoption, TOE}, isbn = {978-3-319-17836-3}, doi = {10.1007/978-3-319-17837-0_10}, url = {http://dx.doi.org/10.1007/978-3-319-17837-0_10}, author = {Ramanathan, Lakshmanan and Iyer, Sundaresan Krishnan}, editor = {Damiani, Ernesto and Frati, Fulvio and Dirk Riehle and Wasserman, Anthony I.} } @inbook {1731, title = {Scaling and Internationalizing an Agile FOSS Project: Lessons Learned}, booktitle = {Open Source Systems: Adoption and Impact}, series = {IFIP Advances in Information and Communication Technology}, volume = {451}, year = {2015}, pages = {13-22}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {This paper describes problems that arose with the scaling and internationalization of the open source project Catrobat. The problems we faced were the lack of a centralized user management, insufficient scaling of our communication channels, and the necessity to adapt agile development techniques to remote collaboration. To solve the problems we decided to use a mix of open source tools (Git, IRC, LDAP) and commercial solutions (Jira, Confluence, GitHub) because we believe that this mix best fits our needs. Other projects can benefit from the lessons we learned during the reorganization of our knowledge base and communication tools, as infrastructure changes can be very labor-intensive and time-consuming.}, keywords = {Agile development, communication, Distributed software development, Documentation management, Internationalization, kanban, Scaling}, isbn = {978-3-319-17836-3}, doi = {10.1007/978-3-319-17837-0_2}, url = {http://dx.doi.org/10.1007/978-3-319-17837-0_2}, author = {Fellhofer, Stephan and Harzl, Annemarie and Slany, Wolfgang}, editor = {Damiani, Ernesto and Frati, Fulvio and Dirk Riehle and Wasserman, Anthony I.} } @proceedings {1715, title = {"Should we move to Stack Overflow?" 
Measuring the utility of social media for developer support}, year = {2015}, month = {05/2015}, pages = {10pp}, publisher = {IEEE}, abstract = {Stack Overflow is an enormously popular question-and-answer web site intended for software developers to help each other with programming issues. Some software projects aimed at developers (for example, application programming interfaces, application engines, cloud services, development frameworks, and the like) are closing their self-supported developer discussion forums and mailing lists and instead directing developers to use special-purpose tags on Stack Overflow. The goals of this paper are to document the main reasons given for moving developer support to Stack Overflow, and then to collect and analyze data from a group of software projects that have done this, in order to show whether the expected quality of support was actually achieved. The analysis shows that for all four software projects in this study, two of the desired quality indicators, developer participation and response time, did show improvements on Stack Overflow as compared to mailing lists and forums. However, we also found several projects that moved back from Stack Overflow, despite achieving these desired improvements. The results of this study are applicable to a wide variety of software projects that provide developer support using social media.}, keywords = {developer support, forums, mailing list, metrics, quality, social media, Stack Overflow, technical support}, attachments = {https://flosshub.org/sites/flosshub.org/files/SEIP2015stackv2.pdf}, author = {Squire, Megan} } @article {1805, title = {Stigmergic coordination in FLOSS development teams: Integrating explicit and implicit mechanisms}, journal = {Cognitive Systems Research}, year = {2015}, month = {12/2015}, abstract = {The vast majority of literature on coordination in team-based projects has drawn on a conceptual separation between explicit (e.g. 
plans, feedbacks) and implicit coordination mechanisms (e.g. mental maps, shared knowledge). This analytical distinction presents some limitations in explaining how coordination is reached in organizations characterized by distributed teams, scarce face to face meetings and fuzzy and changing lines of authority, as in free/libre open source software (FLOSS) development. Analyzing empirical illustrations from two FLOSS projects, we highlight the existence of a peculiar model, stigmergic coordination, which includes aspects of both implicit and explicit mechanisms. The work product itself (implicit) and the characteristics under which it is shared (explicit) play an under-appreciated role in helping software developers manage dependencies as they arise. We develop this argument beyond the existing literature by working with an existing coordination framework, considering the role that the codebase itself might play at each step. We also discuss the features and the practices to support stigmergic coordination in distributed teams, as well as recommendations for future research. {\textquotedblleft}Not everything that implicitly exists needs to be rendered explicit{\textquotedblright} (Sloterdijk, 2009, p. 
3).}, keywords = {Coordination mechanisms, distributed teams, FLOSS teams, Stigmergic coordination}, issn = {13890417}, doi = {10.1016/j.cogsys.2015.12.003}, url = {http://www.sciencedirect.com/science/article/pii/S1389041715000339}, attachments = {https://flosshub.org/sites/flosshub.org/files/COGSYS-RS-\%28HHS\%29-\%282015\%29-\%283\%29.pdf}, author = {Bolici, Francesco and Howison, James and Kevin Crowston} } @conference {Matragkas:2014:ABO:2597073.2597119, title = {Analysing the {\textquoteright}Biodiversity{\textquoteright} of Open Source Ecosystems: The GitHub Case}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {356{\textendash}359}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In nature the diversity of species and genes in ecological communities affects the functioning of these communities. Biologists have found out that more diverse communities appear to be more productive than less diverse communities. Moreover such communities appear to be more stable in the face of perturbations. In this paper, we draw the analogy between ecological communities and Open Source Software (OSS) ecosystems, and we investigate the diversity and structure of OSS communities. To address this question we use the MSR 2014 challenge dataset, which includes data from the top-10 software projects for the top programming languages on GitHub. Our findings show that OSS communities on GitHub consist of 3 types of users (core developers, active users, passive users). Moreover, we show that the percentage of core developers and active users does not change as the project grows and that the majority of members of large projects are passive users. 
}, keywords = {Data and knowledge visualization, data mining, mining challenge, msr challenge}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597119}, url = {http://doi.acm.org/10.1145/2597073.2597119}, author = {Matragkas, Nicholas and Williams, James R. and Kolovos, Dimitris S. and Paige, Richard F.} } @proceedings {1573, title = {"A bit of code": How the Stack Overflow Community Creates Quality Postings}, year = {2014}, month = {01/2014}, pages = {1425-1434}, publisher = {IEEE Computer Society}, abstract = {The Stack Overflow web site is an online community where programmers can ask and answer one another{\textquoteright}s questions, earning points and badges. The site offers guidance in the form of a Frequently Asked Questions (FAQ), beginning with "What kind of questions can I ask here?" The answer explains that "the best Stack Overflow questions have a bit of source code in them". This paper explores the role of source code and non-source code text on Stack Overflow in both questions and answers. The primary contribution of this paper is to provide a more detailed understanding of whether the presence of source code (and how much) actually will produce the "best" Stack Overflow questions or answers. A second contribution of this paper is to determine how the non-code portions of the text might also contribute the "best" Stack Overflow postings. 
}, keywords = {COLLABORATION, collaborative development, data mining, developer network, knowledge collaboration, open content, text mining}, doi = {10.1109/HICSS.2014.185}, attachments = {https://flosshub.org/sites/flosshub.org/files/hicssSMFinalWatermark.pdf}, author = {Squire, Megan and Funkhouser, Christian} } @conference {Aggarwal:2014:CPD:2597073.2597120, title = {Co-evolution of Project Documentation and Popularity Within Github}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {360{\textendash}363}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Github is a very popular collaborative software-development platform that provides typical source-code management and issue tracking features augmented by strong social-networking features such as following developers and watching projects. These features help "spread the word" about individuals and projects, building the reputation of the former and increasing the popularity of the latter. In this paper, we investigate the relation between project popularity and regular, consistent documentation updates. We found strong indicators that consistently popular projects exhibited consistent documentation effort and that this effort tended to attract more documentation collaborators. We also found that frameworks required more documentation effort than libraries to achieve similar adoption success, especially in the initial phase. 
}, keywords = {Cross Correlation, Documentation Change, mining challenge, msr challenge, popularity}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597120}, url = {http://doi.acm.org/10.1145/2597073.2597120}, author = {Aggarwal, Karan and Hindle, Abram and Stroulia, Eleni} } @conference {Gousios:2014:DPD:2597073.2597122, title = {A Dataset for Pull-based Development Research}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {368{\textendash}371}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Pull requests form a new method for collaborating in distributed software development. To study the pull request distributed development model, we constructed a dataset of almost 900 projects and 350,000 pull requests, including some of the largest users of pull requests on Github. In this paper, we describe how the project selection was done, we analyze the selected features and present a machine learning tool set for the R statistics environment. }, keywords = {Distributed software development, Empirical software engineering, msr data showcase, pull request, pull-based development}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597122}, url = {http://doi.acm.org/10.1145/2597073.2597122}, attachments = {https://flosshub.org/sites/flosshub.org/files/pullreqs-dataset.pdf}, author = {Gousios, Georgios and Zaidman, Andy} } @conference {Brunet:2014:DDD:2597073.2597115, title = {Do Developers Discuss Design?}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {340{\textendash}343}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Design is often raised in the literature as important to attaining various properties and characteristics in a software system. 
At least for open-source projects, it can be hard to find evidence of ongoing design work in the technical artifacts produced as part of the development. Although developers usually do not produce specific design documents, they do communicate about design in different ways. In this paper, we provide quantitative evidence that developers address design through discussions in commits, issues, and pull requests. To achieve this, we built a discussions{\textquoteright} classifier and automatically labeled 102,122 discussions from 77 projects. Based on this data, we make four observations about the projects: i) on average, 25\% of the discussions in a project are about design; ii) on average, 26\% of developers contribute to at least one design discussion; iii) only 1\% of the developers contribute to more than 15\% of the discussions in a project; and iv) these few developers who contribute to a broad range of design discussions are also the top committers in a project. }, keywords = {Design Discussions, empirical study, machine learning, mining challenge, msr challenge}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597115}, url = {http://doi.acm.org/10.1145/2597073.2597115}, attachments = {https://flosshub.org/sites/flosshub.org/files/brunet.pdf}, author = {Brunet, Jo{\~a}o and Murphy, Gail C. 
and Terra, Ricardo and Figueiredo, Jorge and Serey, Dalton} } @conference {Rozas:2014:DCP:2641580.2641624, title = {Drupal As a Commons-Based Peer Production Community: A Sociological Perspective}, booktitle = {Proceedings of The International Symposium on Open Collaboration}, series = {OpenSym {\textquoteright}14}, year = {2014}, pages = {36:1{\textendash}36:2}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {The aim of this research consists of extracting a set of insights related to the dynamics, group decision making procedures, motivations to contribute and mechanisms employed in the coordination of Commons-Based Peer Production communities, using as a case study the community responsible for the development of the Free/Libre Open Source Software Drupal. A sociological perspective is taken for this purpose, and a set of social research qualitative and quantitative methods employed for the study of online communities (virtual ethnography) are being used. }, keywords = {Activity Theory, Commons-Based Peer Production, drupal, Free/Libre Open Source Software, Virtual Ethnography}, isbn = {978-1-4503-3016-9}, doi = {10.1145/2641580.2641624}, url = {http://doi.acm.org/10.1145/2641580.2641624}, author = {Rozas, David} } @conference {Robles:2014:FSD:2597073.2597129, title = {FLOSS 2013: A Survey Dataset About Free Software Contributors: Challenges for Curating, Sharing, and Combining}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {396{\textendash}399}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In this data paper we describe a data set obtained by means of performing an on-line survey to over 2,000 Free Libre Open Source Software (FLOSS) contributors. 
The survey includes questions related to personal characteristics (gender, age, civil status, nationality, etc.), education and level of English, professional status, dedication to FLOSS projects, reasons and motivations, involvement and goals. We describe as well the possibilities and challenges of using private information from the survey when linked with other, publicly available data sources. In this regard, an example of data sharing will be presented and legal, ethical and technical issues will be discussed. }, keywords = {anonymization, data combining, data sharing, ethics, free software, microdata, msr data showcase, open data, open source, privacy, Survey}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597129}, url = {http://doi.acm.org/10.1145/2597073.2597129}, attachments = {https://flosshub.org/sites/flosshub.org/files/msr14gregorio.pdf}, author = {Gregorio Robles and Reina, Laura Arjona and Serebrenik, Alexander and Vasilescu, Bogdan and Gonz{\'a}lez-Barahona, Jes{\'u}s M.} } @conference {Bloemen:2014:GPD:2597073.2597131, title = {Gentoo Package Dependencies over Time}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {404{\textendash}407}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Open source distributions such as Gentoo need to accurately track dependency relations between software packages in order to install working systems. To do this, Gentoo has a carefully authored database containing those relations. In this paper, we extract the Gentoo package dependency graph and its changes over time. The final dependency graph spans 15 thousand open source projects and 80 thousand dependency relations. Furthermore, the development of this graph is tracked over time from the beginning of the Gentoo project in 2000 to the first quarter of 2012, with monthly resolution. 
The resulting dataset provides many opportunities for research. In this paper we explore cluster analysis to reveals meaningful relations between packages and in a separate paper we analyze changes in the dependencies over time to get insights in the innovation dynamics of open source software. }, keywords = {dependencies, gentoo, graph, INNOVATION}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597131}, url = {http://doi.acm.org/10.1145/2597073.2597131}, author = {Bloemen, Remco and Amrit, Chintan and Kuhlmann, Stefan and Ord{\'o}{\~n}ez{\textendash}Matamoros, Gonzalo} } @conference {Ahmed:2014:IAC:2641580.2641585, title = {The Impact of Automatic Crash Reports on Bug Triaging and Development in Mozilla}, booktitle = {Proceedings of The International Symposium on Open Collaboration}, series = {OpenSym {\textquoteright}14}, year = {2014}, pages = {1:1{\textendash}1:8}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Free/Open Source Software projects often rely on users submitting bug reports. However, reports submitted by novice users may lack information critical to developers, and the process may be intimidating and difficult. To gather more and better data, projects deploy automatic crash reporting tools, which capture stack traces and memory dumps when a crash occurs. These systems potentially generate large volumes of data, which may overwhelm developers, and their presence may discourage users from submitting traditional bug reports. In this paper, we examine Mozilla{\textquoteright}s automatic crash reporting system and how it affects their bug triaging process. We find that fewer than 0.00009\% of crash reports end up in a bug report, but as many as 2.33\% of bug reports have data from crash reports added. Feedback from developers shows that despite some problems, these systems are valuable. We conclude with a discussion of the pros and cons of automatic crash reporting systems. 
}, keywords = {Automatic Crash reporting, debugging, FOSS, Free/Open Source Software, Open Bug Reporting, testing}, isbn = {978-1-4503-3016-9}, doi = {10.1145/2641580.2641585}, url = {http://doi.acm.org/10.1145/2641580.2641585}, author = {Ahmed, Iftekhar and Mohan, Nitin and Jensen, Carlos} } @conference {Bloemen:2014:IDO:2597073.2597079, title = {Innovation Diffusion in Open Source Software: Preliminary Analysis of Dependency Changes in the Gentoo Portage Package Database}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {316{\textendash}319}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In this paper we make the case that software dependencies are a form of innovation adoption. We then test this on the time-evolution of the Gentoo package dependency graph. We find that the Bass model of innovation diffusion fits the growth of the number of packages depending on a given library. Interestingly, we also find that low-level packages have a primarily imitation driven adoption and multimedia libraries have primarily innovation driven growth. 
}, keywords = {dependencies, gentoo, graph, INNOVATION}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597079}, url = {http://doi.acm.org/10.1145/2597073.2597079}, attachments = {https://flosshub.org/sites/flosshub.org/files/bloeman.pdf}, author = {Bloemen, Remco and Amrit, Chintan and Kuhlmann, Stefan and Ord{\'o}{\~n}ez{\textendash}Matamoros, Gonzalo} } @conference {Gousios:2014:LGG:2597073.2597126, title = {Lean GHTorrent: GitHub Data on Demand}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {384{\textendash}387}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In recent years, GitHub has become the largest code host in the world, with more than 5M developers collaborating across 10M repositories. Numerous popular open source projects (such as Ruby on Rails, Homebrew, Bootstrap, Django or jQuery) have chosen GitHub as their host and have migrated their code base to it. GitHub offers a tremendous research potential. For instance, it is a flagship for current open source development, a place for developers to showcase their expertise to peers or potential recruiters, and the platform where social coding features or pull requests emerged. However, GitHub data is, to date, largely underexplored. To facilitate studies of GitHub, we have created GHTorrent, a scalable, queriable, offline mirror of the data offered through the GitHub REST API. In this paper we present a novel feature of GHTorrent designed to offer customisable data dumps on demand. The new GHTorrent data-on-demand service offers users the possibility to request via a web form up-to-date GHTorrent data dumps for any collection of GitHub repositories. 
We hope that by offering customisable GHTorrent data dumps we will not only lower the "barrier for entry" even further for researchers interested in mining GitHub data (thus encourage researchers to intensify their mining efforts), but also enhance the replicability of GitHub studies (since a snapshot of the data on which the results were obtained can now easily accompany each study). }, keywords = {data on demand, dataset, github, msr data showcase}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597126}, url = {http://doi.acm.org/10.1145/2597073.2597126}, attachments = {https://flosshub.org/sites/flosshub.org/files/lean-ghtorrent_0.pdf}, author = {Gousios, Georgios and Vasilescu, Bogdan and Serebrenik, Alexander and Zaidman, Andy} } @conference {Yamashita:2014:MSO:2597073.2597116, title = {Magnet or Sticky? An OSS Project-by-project Typology}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {344{\textendash}347}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {For Open Source Software (OSS) projects, retaining existing contributors and attracting new ones is a major concern. In this paper, we expand and adapt a pair of population migration metrics to analyze migration trends in a collection of open source projects. Namely, we study: (1) project stickiness, i.e., its tendency to retain existing contributors and (2) project magnetism, i.e., its tendency to attract new contributors. Using quadrant plots, we classify projects as attractive (highly magnetic and sticky), stagnant (highly sticky, weakly magnetic), fluctuating (highly magnetic, weakly sticky), or terminal (weakly magnetic and sticky). 
Through analysis of the MSR challenge dataset, we find that: (1) quadrant plots can effectively identify at-risk projects, (2) stickiness is often motivated by professional activity and (3) transitions among quadrants as a project ages often coincides with interesting events in the evolution history of a project. }, keywords = {Developer migration, Magnet, mining challenge, msr challenge, open source, Sticky}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597116}, url = {http://doi.acm.org/10.1145/2597073.2597116}, attachments = {https://flosshub.org/sites/flosshub.org/files/yamashita.pdf}, author = {Yamashita, Kazuhiro and McIntosh, Shane and Kamei, Yasutaka and Ubayashi, Naoyasu} } @conference {Williams:2014:MOP:2597073.2597132, title = {Models of OSS Project Meta-information: A Dataset of Three Forges}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, note = {"FLOSSMole [4] is a similar initiative to OSSMETER; it aims to collect and freely redistribute in different formats the data of open source software. Differently from OSSMETER, however, the FLOSSMole project does not provide the instruments to analyse data, that are simply collected and made publicly available."}, pages = {408{\textendash}411}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {The process of selecting open-source software (OSS) for adoption is not straightforward as it involves exploring various sources of information to determine the quality, maturity, activity, and user support of each project. In the context of the OSSMETER project, we have developed a forge-agnostic metamodel that captures the meta-information common to all OSS projects. We specialise this metamodel for popular OSS forges in order to capture forge-specific meta-information. 
In this paper we present a dataset conforming to these metamodels for over 500,000 OSS projects hosted on three popular OSS forges: Eclipse, SourceForge, and GitHub. The dataset enables different kinds of automatic analysis and supports objective comparisons of cross-forge OSS alternatives with respect to a user{\textquoteright}s needs and quality requirements. }, keywords = {data mining, flossmole cited}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597132}, url = {http://doi.acm.org/10.1145/2597073.2597132}, attachments = {https://flosshub.org/sites/flosshub.org/files/Models_of_OSS_Project_Meta-Information_A_Dataset_of_Three_Forges_draft.pdf}, author = {Williams, James R. and Di Ruscio, Davide and Matragkas, Nicholas and Di Rocco, Juri and Kolovos, Dimitris S.} } @conference {Beller:2014:MCR:2597073.2597082, title = {Modern Code Reviews in Open-source Projects: Which Problems Do They Fix?}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {202{\textendash}211}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Code review is the manual assessment of source code by humans, mainly intended to identify defects and quality problems. Modern Code Review (MCR), a lightweight variant of the code inspections investigated since the 1970s, prevails today both in industry and open-source software (OSS) systems. The objective of this paper is to increase our understanding of the practical benefits that the MCR process produces on reviewed source code. To that end, we empirically explore the problems fixed through MCR in OSS systems. We manually classified over 1,400 changes taking place in reviewed code from two OSS projects into a validated categorization scheme. 
Surprisingly, results show that the types of changes due to the MCR process in OSS are strikingly similar to those in the industry and academic systems from literature, featuring the similar 75:25 ratio of maintainability-related to functional problems. We also reveal that 7{\textendash}35\% of review comments are discarded and that 10{\textendash}22\% of the changes are not triggered by an explicit review comment. Patterns emerged in the review data; we investigated them revealing the technical factors that influence the number of changes due to the MCR process. We found that bug-fixing tasks lead to fewer changes and tasks with more altered files and a higher code churn have more changes. Contrary to intuition, the person of the reviewer had no impact on the number of changes. }, keywords = {code review, defects, open source software}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597082}, url = {http://doi.acm.org/10.1145/2597073.2597082}, attachments = {https://flosshub.org/sites/flosshub.org/files/beller.pdf}, author = {Beller, Moritz and Bacchelli, Alberto and Zaidman, Andy and Juergens, Elmar} } @conference {Guo:2014:ODC:2597073.2597094, title = {Oops! Where Did That Code Snippet Come from?}, booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories}, series = {MSR 2014}, year = {2014}, pages = {52{\textendash}61}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {A kernel oops is an error report that logs the status of the Linux kernel at the time of a crash. Such a report can provide valuable first-hand information for a Linux kernel maintainer to conduct postmortem debugging. Recently, a repository has been created that systematically collects kernel oopses from Linux users. However, debugging based on only the information in a kernel oops is difficult. We consider the initial problem of finding the offending line, i.e., the line of source code that incurs the crash. 
For this, we propose a novel algorithm based on approximate sequence matching, as used in bioinformatics, to automatically pinpoint the offending line based on information about nearby machine-code instructions, as found in a kernel oops. Our algorithm achieves 92\% accuracy compared to 26\% for the traditional approach of using only the oops instruction pointer.}, keywords = {debugging, linux kernel, oops, sequence alignment}, isbn = {978-1-4503-2863-0}, doi = {10.1145/2597073.2597094}, url = {http://doi.acm.org/10.1145/2597073.2597094}, attachments = {https://flosshub.org/sites/flosshub.org/files/guo.pdf}, author = {Guo, Lisong and Lawall, Julia and Muller, Gilles} } @article {1745, title = {SENTIMENT ANALYSIS OF FREE/OPEN SOURCE DEVELOPERS: PRELIMINARY FINDINGS FROM A CASE STUDY}, journal = {Revista Eletr{\^o}nica de Sistemas de Informa{\c c}{\~a}o}, volume = {13}, year = {2014}, month = {08/2014}, abstract = {Software development is a human intensive activity. And as such, how developers face their tasks is of major importance. In an environment such as the one that is common in FOSS (free/open source software) projects where professionals (i.e., paid developers) share the development effort with volunteers, the morale of the development and user community is of major importance. In this paper, we present a preliminary analysis using sentiment analysis techniques to a FOSS project. We therefore mine the mailing list of a project and apply these techniques to the most relevant participants. Although the application is at this time limited, we hope that this experience can be of benefit in the future to determine situations that may affect the developers or the project, such as low productivity, developer abandonment, project forking, etc. 
}, keywords = {developer productivity, FLOSS, mailing lists, natural language processing, openSUSE, sentiment analysis, software development, software repository mining}, doi = {10.5329/RESI.2014.1302006}, url = {http://189.16.45.2/ojs/index.php/reinfo/article/view/1677}, attachments = {https://flosshub.org/sites/flosshub.org/files/1677-6732-1-PB.pdf}, author = {Rousinopoulos, Athanasios-Ilias and Gregorio Robles and Gonz{\'a}lez-Barahona, Jes{\'u}s M.} } @inbook {1604, title = {Use of Open Software Tools for Data Offloading Techniques Analysis on Mobile Networks}, booktitle = {Open Source Software: Mobile Open Source Technologies}, series = {IFIP Advances in Information and Communication Technology}, volume = {427}, year = {2014}, pages = {111-112}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, abstract = { This research aims to highlight the benefits of using free software based tools for studying a LTE mobile network with realistic parameters. We will overload this LTE network and offload it through data offloading techniques such as small cells and WiFi offload. For this research, discrete-event open software network simulator ns3 will be implemented. Ns3 is a network simulator based on the programming language C++, and has all the necessary libraries to simulate an LTE and WiFi network. }, keywords = {Data Offloading, LTE, ns3, OSS for research and education, small cells, WiFi}, isbn = {978-3-642-55127-7}, doi = {10.1007/978-3-642-55128-4_15}, url = {http://dx.doi.org/10.1007/978-3-642-55128-4_15}, author = {Koo, Jos{\'e} M. and Espino, Juan P. 
and Armuelles, Iv{\'a}n and Villarreal, Rub{\'e}n}, editor = {Corral, Luis and Sillitti, Alberto and Succi, Giancarlo and Vlasenko, Jelena and Wasserman, Anthony I.} } @proceedings {1552, title = {Analyzing Social Behavior of Software Developers Across Different Communication Channels}, year = {2013}, abstract = {Software developers use different project repositories (i.e., mailing list, bug tracking repositories, discussion forums etc.) to interact with each other or to solve software related problems. The growing interest in the usage of social media channels (i.e., Twitter, Facebook, LinkedIn) have also attracted the open source software community and software developers to adopt an identity in order to disseminate project-related information to a wider audience. Much research has been carried out to analyze the social behavior of software developers in different project repositories but so far no one has tried to study the social communication patterns of developers in other social media channels. We in this paper presents a new dimension to the social aspects of software developers and study if the social communication patterns of software developers is different on project repositories and social media channels (i.e., Twitter).}, keywords = {communication, developer, social media}, url = {http://index.ksi.edu/conf/seke/2013/cr/296.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/iqbal_a_et_al_june_2013.pdf}, author = {Iqbal, Aftab and M Karnstedt and M Hausenblas} } @proceedings {1488, title = {Apache-Affiliated Twitter Screen Names: A Dataset}, year = {2013}, month = {05/2013}, abstract = {This paper describes a new dataset containing Twitter screen names for members of the projects affiliated with the Apache Software Foundation (ASF). The dataset includes the confirmed Twitter screen names, as well as the real name as listed on Twitter, and the user identification as used within the Apache organization. 
The paper also describes the process used to collect and clean this data, and shows some sample queries for learning how to use the data. The dataset has been donated to the FLOSSmole project and is available for download (https://code.google.com/p/flossmole/downloads/detail?name=apacheTwitter2013-Jan.zip) or direct querying via a database client. }, keywords = {apache, dataset, twitter}, attachments = {https://flosshub.org/sites/flosshub.org/files/apacheTwitterPREPRINT.pdf , https://flosshub.org/sites/flosshub.org/files/MSR\%20presentation.pdf}, author = {Squire, Megan} } @conference {Paschalidou:2013:ADE:2490257.2490264, title = {An application of data envelopment analysis to software quality assessment}, booktitle = {Proceedings of the 6th Balkan Conference in Informatics}, series = {BCI {\textquoteright}13}, year = {2013}, pages = {228{\textendash}235}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Data Envelopment Analysis (DEA) is a non-parametric technique which involves the use of linear programming methods to measure the efficiency of a homogenous set of units. These units are known as Decision Making Units (DMUs) and defined by multiple input and output data. Efficiencies are measured relative to a piece-wise surface (efficient frontier) which envelops the data, thus justifying the name of the technique. Although DEA has been mostly used in production economics, its application in the context of software quality evaluation seems to be a promising approach. This study provides an application of DEA to assess the evolution of two open-source software projects in terms of selected metric values for successive versions of each project. What is really interesting in DEA is that a single efficiency score is calculated for each version despite the often convoluted overall picture of the metric values. 
According to a simplified view of DEA, there are two categories of units, the efficient (onto the efficient frontier) and the inefficient ones. Each inefficient unit is characterized by a reference set of peers which involves all the efficient units "operating" closer to that unit. Through the consideration of the reference set of the inefficient versions of each project, the metrics that require improvement, as well as the extent of improvement, could be estimated. These results could assist software developers in identifying design issues that require further improvement. Notwithstanding the fact that there are a number of issues to be further investigated, the applicability of DEA and other operations research tools in the context of software quality might yield interesting results. }, keywords = {dea, design metrics, software evolution, software quality}, isbn = {978-1-4503-1851-8}, doi = {10.1145/2490257.2490264}, url = {http://doi.acm.org/10.1145/2490257.2490264}, author = {Paschalidou, Georgia and Stiakakis, Emmanouil and Chatzigeorgiou, Alexander} } @book {1542, title = {Authoritative Linked Data Descriptions of Debian Source Packages Using ADMS.SW}, booktitle = {Open Source Software: Quality Verification}, series = {IFIP Advances in Information and Communication Technology}, volume = {404}, year = {2013}, pages = {168-181}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, address = {Berlin, Heidelberg}, abstract = {The Debian Package Tracking System is a Web dashboard for Debian contributors and advanced users. This central tool publishes the status of subsequent releases of source packages in the Debian distribution. It has been improved to generate RDF meta-data documenting the source packages, their releases and links to other packaging artifacts, using the ADMS.SW 1.0 model. 
This constitutes an authoritative source of machine-readable Debian {\textquotedblleft}facts{\textquotedblright} and proposes a reference URI naming scheme for Linked Data resources about Debian packages. This should enable the interlinking of these Debian package descriptions with other ADMS.SW or DOAP descriptions of FLOSS projects available on the Semantic Web also using Linked Data principles. This will be particularly interesting for traceability with upstream projects whose releases are packaged in Debian, derivative distributions reusing Debian source packages, or with other FLOSS distributions.}, keywords = {debian}, isbn = {978-3-642-38928-3}, issn = {1868-422X}, doi = {10.1007/978-3-642-38928-3_12}, author = {Olivier Berger and Christian Bac}, editor = {Petrinja, Etiel and Succi, Giancarlo and Ioini, Nabil and Sillitti, Alberto} } @proceedings {1501, title = {The Impact of Tangled Code Changes}, year = {2013}, month = {05/2013}, abstract = {When interacting with version control systems, developers often commit unrelated or loosely related code changes in a single transaction. When analyzing the version history, such tangled changes will make all changes to all modules appear related, possibly compromising the resulting analyses through noise and bias. In an investigation of five open-source JAVA projects, we found up to 15\% of all bug fixes to consist of multiple tangled changes. Using a multi-predictor approach to untangle changes, we show that on average at least 16.6\% of all source files are incorrectly associated with bug reports. 
We recommend better change organization to limit the impact of tangled changes.}, keywords = {bias, data quality, history, java, mining software repositories, noise, tangled code changes, version control}, url = {http://www.kim-herzig.de/wp-content/uploads/2013/03/msr2013-untangling.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/msr2013-untangling.pdf}, author = {Kim Herzig and Zeller, Andreas} } @article {1553, title = {Interlinking Developer Identities within and across Open Source Projects: The Linked Data Approach}, journal = {ISRN Software Engineering}, volume = {2013}, year = {2013}, month = {2013}, pages = {1-12}, abstract = {Software developers use various software repositories in order to interact with each other or to solve related problems. These repositories provide a rich source of information for a wide range of tasks. However, one issue to overcome in order to make this information useful is the identification and interlinking of multiple identities of developers. In this paper, we propose a Linked Data-based methodology to interlink and integrate multiple identities of a developer found in different software repositories of a project as well as across repositories of multiple projects. Providing such interlinking will enable us to keep track of a developer{\textquoteright}s activity not only within a single project but also across multiple projects. The methodology will be presented in general and applied to 5 Apache projects as a case study. 
Further, we show that the few methods suggested so far are not always appropriate to overcome the developer identification problem.}, keywords = {developer, identity, linked data}, doi = {10.1155/2013/584731}, attachments = {https://flosshub.org/sites/flosshub.org/files/584731.pdf}, author = {Iqbal, Aftab and Hausenblas, Michael} } @proceedings {1521, title = {It{\textquoteright}s Not a Bug, It{\textquoteright}s a Feature: How Misclassification Impacts Bug Prediction}, year = {2013}, month = {05/2013}, pages = {392-401}, abstract = {In a manual examination of more than 7,000 issue reports from the bug databases of five open-source projects, we found 33.8\% of all bug reports to be misclassified{\textemdash}that is, rather than referring to a code fix, they resulted in a new feature, an update to documentation, or an internal refactoring. This misclassification introduces bias in bug prediction models, confusing bugs and features: On average, 39\% of files marked as defective actually never had a bug. We discuss the impact of this misclassification on earlier studies and recommend manual data validation for future studies.}, keywords = {bias, bug reports, data quality, mining software repositories, noise}, author = {Kim Herzig and Sascha Just and Zeller, Andreas} } @article {IJC397, title = {Preliminary steps toward a general theory of internet-based collective-action in digital information commons: Findings from a study of open source software projects}, journal = {International Journal of the Commons}, volume = {7}, number = {2}, year = {2013}, abstract = {This paper presents some of the findings from a 5-year empirical study of FOSS (free/libre and open source software) commons, completed in 2011. FOSS projects are Internet-based common property regimes where the project source code is developed over the Internet. 
The resulting software is generally distributed with a license that provides users with the freedoms to access, use, read, modify and redistribute the software. In this study we used three different and very large datasets (approximately 107,000; 174,000 and 1400 cases respectively) with information on FOSS projects residing in Sourceforge.net, one of the largest, if not the largest, FOSS repository in the world. We employ various quantitative methods to uncover factors that lead some FOSS projects to ongoing collaborative success, while others become abandoned. After presenting some of our study{\textquoteright}s results, we articulate the collaborative {\textquotedblleft}story{\textquotedblright} of FOSS that emerged. We close the paper by discussing some key findings that can contribute to a general theory of Internet-based collective-action and FOSS-like forms of digital online commons.}, keywords = {collaborative success and abandonment, common property regime, digital information commons, flossmole, Free/libre software, open source software, sourceforge, srda}, issn = {1875-0281}, url = {http://www.thecommonsjournal.org/index.php/ijc/article/view/URN\%3ANBN\%3ANL\%3AUI\%3A10-1-114926}, author = {Charles Schweik and Robert English} } @proceedings {1489, title = {Project Roles in the Apache Software Foundation: A Dataset}, year = {2013}, month = {05/2013}, abstract = {This paper outlines the steps in the creation and maintenance of a new dataset listing leaders of the various projects of the Apache Software Foundation (ASF). Included in this dataset are different levels of committers to the various ASF project code bases, as well as regular and emeritus members of the ASF, and directors and officers of the ASF. 
The dataset has been donated to the FLOSSmole project under an open source license, and is available for download (https://code.google.com/p/flossmole/downloads/detail?name=apachePeople2013-Jan.zip), or for direct querying via a database client.}, keywords = {apache, dataset, roles}, attachments = {https://flosshub.org/sites/flosshub.org/files/apacheRolesPREPRINT.pdf , https://flosshub.org/sites/flosshub.org/files/MSR\%20presentation_0.pdf}, author = {Squire, Megan} } @proceedings {1561, title = {A Replicable Infrastructure for Empirical Studies of Email Archives}, year = {2013}, month = {10/2013}, pages = {43-50}, publisher = {IEEE}, address = {Baltimore, MD, USA}, abstract = {This paper describes a replicable infrastructure solution for conducting empirical software engineering studies based on email mailing list archives. Mailing list emails, such as those affiliated with free, libre, and open source software (FLOSS) projects, are currently archived in several places online, but each research team that wishes to study these email artifacts closely must design their own solution for collection, storage and cleaning of the data. Consequently, research results will be difficult to replicate, especially as the email archive for any living project will still be continually growing. This paper describes a simple, replicable infrastructure for the collection, storage, and cleaning of project email data and analyses.}, keywords = {apache, cleaning, collection, couchdb, database, document-oriented database, email, lucene, mailing lists, nosql, replication, storage}, isbn = {978-0-7695-5121-0}, attachments = {https://flosshub.org/sites/flosshub.org/files/RESERv2.pdf}, author = {Squire, Megan} } @proceedings {1498, title = {Who Does What during a Code Review? Datasets of OSS Peer Review Repositories}, year = {2013}, month = {05/2013}, abstract = {We present four datasets that are focused on the general roles of OSS peer review members. 
With data mined from both an integrated peer review system and code source repositories, our rich datasets comprise of peer review data that was automatically recorded. Using the Android project as a case study, we describe our extraction methodology, the datasets and their application used for three separate studies. Our datasets are available online at http://sdlab.naist.jp/reviewmining/}, keywords = {android, case study, code review, data set, peer review, roles, source code}, author = {Kazuki Hamasaki and Raula Gaikovina Kula and Norihiro Yoshida and A. E. Camargo Cruz and Kenji Fujiwara and Hajimu Iida} } @conference {1375, title = {The Effects of Diversity in Global, Distributed Collectives: A Study of Open Source Project Success}, booktitle = {Information Systems Research}, year = {2012}, abstract = {Diversity is a defining characteristic of global collectives facilitated by the Internet. Though substantial evidence suggests that diversity has profound implications for a variety of outcomes including performance, member engagement, and withdrawal behavior, the effects of diversity have been predominantly investigated in the context of organizational workgroups or virtual teams. We use a diversity lens to study the success of non-traditional virtual work groups exemplified by open source software (OSS) projects. Building on the diversity literature, we propose that three types of diversity (separation, variety and disparity) influence two critical outcomes for OSS projects: community engagement and market success. We draw on the OSS literature to further suggest that the effects of diversity on market success are moderated by the application development stage. We instantiate the operational definitions of three forms of diversity to the unique context of open source projects. 
Using archival data from 357 projects hosted on SourceForge, we find that disparity diversity, reflecting variation in participants{\textquoteright} contribution-based reputation, is positively associated with success. The impact of separation diversity, conceptualized as culture and measured as diversity in the spoken language and country of participants, has a negative impact on community engagement but an unexpected positive effect on market success. Variety diversity, reflected in dispersion in project participant roles, positively influences community engagement and market success. The impact of diversity on market success is conditional on the development stage of the project. We discuss how the study{\textquoteright}s findings advance the literature on antecedents of OSS success, expand our theoretical understanding of diversity, and present the practical implications of the results for managers of distributed collectives.}, keywords = {diversity, global collectives, open source software}, attachments = {https://flosshub.org/sites/flosshub.org/files/isr_2012.pdf}, author = {Daniel, Sherae L. and Ritu Agarwal and Stewart, Katherine J.} } @conference {1316, title = {An Empirical Study of Volunteer Members{\textquoteright} Perceived Turnover in Open Source Software Projects}, booktitle = {45th Hawai{\textquoteright}i International Conference on System Sciences}, year = {2012}, note = {"After designing the questionnaire, we conducted a web-based survey by inviting developers working in sourceforge.net and launchpad.net."}, month = {01/2012}, pages = {3396-3405}, abstract = {Turnover of volunteer members and the ensuing instability bring about severe problems to open source software (OSS) projects. To better understand it, we based our study on Herzberg{\textquoteright}s two-factor theory to investigate the influence of hygiene factors on volunteer members{\textquoteright} dissatisfaction and perceived turnover. 
After empirically testing the research model, we found shortcomings in project regulation and administration are the key reason for volunteer members{\textquoteright} dissatisfaction, followed by future rewards and personal needs for software functionalities. By contrast, a possible lack of supportive working relationship among OSS developers was not found to be a trigger for developer dissatisfaction. Dissatisfaction was confirmed to be a significant predictor of perceived turnover. The results demonstrates generalized hygiene factors cannot unreflectively be transferred into the OSS context because volunteer members{\textquoteright} personal expectation has a weaker influence on perceived turnover than objective attributes of OSS project. Our study further makes suggestions for project administrators.}, keywords = {developers, launchpad, sourceforge, Survey}, author = {Yu, Yiqing and Benlian, Alexander and Hess, Thomas} } @proceedings {1465, title = {Forking the Commons: Developmental Tensions and Evolutionary Patterns in Open Source Software}, volume = {378}, year = {2012}, month = {09/2012}, pages = {310-315}, publisher = {IFIP AICT, Springer}, abstract = {Open source software (OSS) presents opportunities and challenges for developers to exploit its commons based licensing regime by creating specializations of a software technology to address plurality of goals and priorities. By {\textquoteleft}forking{\textquoteright} a new branch of development separate from the main project, development diverges into a path in order to relieve tensions related to specialization, which later encounters new tensions. In this study, we first classify forces and patterns within this divergence process. Such tensions may stem from a variety of sources including internal power conflicts, emergence of new environmental niches such as demand for specialized uses of same software, or differences along stability vs. development speed trade-off. 
We then present an evolutionary model which combines divergence options available to resolve tensions, and how further tensions emerge. In developing this model we attempt to define open software evolution at the level of systems of software, rather than at individual software project level.}, keywords = {divergence, forking, software evolution, specialization}, author = {Gen{\c c}er, Mehmet and {\"O}zel, B{\"u}lent} } @proceedings {1444, title = {Gender Differences in Early Free and Open Source Software Joining Process}, volume = {378}, year = {2012}, note = {"We examined subscriber logs and data for six FOSS projects..." "We examined the differences between posters and non-posters to determine the attrition rate..." "Using data from the US Census, we matched names to lists of the most common female and male names. We identified 666 users using this process."}, month = {09/2012}, pages = {78-93}, publisher = {IFIP AICT, Springer}, address = {Eighth International Conference on Open Source Systems (OSS 2012)}, abstract = {With the growth of free and open source software (FOSS) and the adoption of FOSS solutions in business and everyday life, it is important that projects serve their growingly diverse user base. The sustainability of FOSS projects relies on a constant influx of new contributors. Several large demographic surveys found that FOSS communities are very homogenous, dominated by young men, similar to the bias existing in the rest of the IT workforce. Building on previous research, we examine mailing list subscriptions and posting statistics of female FOSS participants. New participants often experience their first interaction on a FOSS project{\textquoteright}s mailing list. We explored six FOSS projects {\textendash} Buildroot, Busybox, Jaws, Parrot, uClibc, and Yum. We found a declining rate of female participation from the 8.27\% of subscribers, to 6.63\% of posters, and finally the often reported code contributor rate of 1.5\%. 
We found a disproportionate attrition rate among women along every step of the FOSS joining process.}, keywords = {buildroot, busybox, diversity, email, jaws, joining process, mailing lists, parrot, uClibc, Yum}, author = {Kuechler, Victor and Gilbertson, Claire and Jensen, Carlos} } @techreport {1879, title = {Linux Kernel Development: How Fast it is Going, Who is Doing It, What They are Doing, and Who is Sponsoring It}, year = {2012}, month = {03/2012}, institution = {The Linux Foundation}, abstract = {The kernel which forms the core of the Linux system is the result of one of the largest cooperative software projects ever attempted. Regular 2-3 month releases deliver stable updates to Linux users, each with significant new features, added device support, and improved performance. The rate of change in the kernel is high and increasing, with between 8,000 and 12,000 patches going into each recent kernel release. These releases each contain the work of over 1,000 developers representing nearly 200 corporations. Since 2005, over 7,800 individual developers from almost 800 different companies have contributed to the kernel. The Linux kernel, thus, has become a common resource developed on a massive scale by companies which are fierce competitors in other areas. This is the fourth update of this document, which has been published roughly annually since 2008. It covers development through the 3.2 release, with an emphasis on the releases (2.6.36 to 3.2) made since the last update. It has been a busy period, with seven kernel releases created, many significant changes made, and continual growth of the kernel developer and user community. 
}, keywords = {corporate, corporations, developers, linux kernel, metrics}, attachments = {https://flosshub.org/sites/flosshub.org/files/lf_kernel_development_2012.pdf}, author = {Corbet, Jonathan and Greg Kroah-Hartman and Amanda McPherson} } @conference {1318, title = {Network-Based Analysis of the Structure and Evolution of an Open Source Software Product}, booktitle = {45th Hawai{\textquoteright}i International Conference on System Sciences}, year = {2012}, note = {"raw data about the product structure is extracted from the source code"}, month = {01/2012}, pages = {3436-3445}, abstract = {In this paper, an analysis of product structures in open source software (OSS) at both product level and module level is presented. At the product level, the product structures are modeled as complex networks, and the evolutionary characteristics of product structures are analyzed by using network analysis metrics. At the module level, linking mechanisms, which describe how a module is attached with other modules, are proposed. The linking mechanisms are modeled as probability functions dependent on the degrees of linking modules. A case study from an open source software project, Drupal, is presented. The evolutionary trends of Drupal product structures are analyzed and discussed. Finally, a model is presented to illustrate the effects of linking mechanisms at the module level on the product structures at the system level. 
The results indicate that the model built using the proposed linking mechanisms generates networks whose evolutionary characteristics are close to that of the original network.}, keywords = {drupal, source code}, author = {Le, Qize and Panchal, Jitesh H.} } @article {1324, title = {Are Developers Fixing Their Own Bugs?}, journal = {International Journal of Open Source Software and Processes}, volume = {3}, year = {2011}, note = {"The analysis is focused at the level of lines of code and it uses the information stored in the source code management system"}, pages = {23 - 42}, abstract = {The process of fixing software bugs plays a key role in the maintenance activities of a software project. Ideally, code ownership and responsibility should be enforced among developers working on the same artifacts, so that those introducing buggy code could also contribute to its fix. However, especially in FLOSS projects, this mechanism is not clearly understood: in particular, it is not known whether those contributors fixing a bug are the same introducing and seeding it in the first place. This paper analyzes the comm-central FLOSS project, which hosts part of the Thunderbird, SeaMonkey, Lightning extensions and Sunbird projects from the Mozilla community. The analysis is focused at the level of lines of code and it uses the information stored in the source code management system. The results of this study show that in 80\% of the cases, the bug-fixing activity involves source code modified by at most two developers. It also emerges that the developers fixing the bug are only responsible for 3.5\% of the previous modifications to the lines affected; this implies that the other developers making changes to those lines could have made that fix. 
In most of the cases the bug fixing process in comm-central is not carried out by the same developers than those who seeded the buggy code.}, keywords = {bug fixing, developers, loc, scm}, issn = {1942-3934}, doi = {10.4018/jossp.2011040102}, author = {Izquierdo-Cortazar, Daniel and Capiluppi, Andrea and Jesus M. Gonzalez-Barahona} } @conference {1304, title = {Entering the circle of trust}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, note = {"we started by analyzing 219 projects" }, month = {05/2011}, pages = {133-142}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {The success of an open-source project depends to a large degree on the proactive and constructive participation by the developer community. An important role that developers play in a project is that of a code committer. However, code-commit privilege is typically restricted to the core group of a project. In this paper, we study the phenomenon of the induction of external developers as code committers. The trustworthiness of an external developer is one of the key factors that determines the granting of commit privileges. Therefore, we formulate different hypotheses to explain how the trust is established in practice. To investigate our hypotheses, we developed an automated approach based on mining code repositories and bug-tracking systems. We implemented the approach and performed an empirical study, using the Eclipse projects, to test the hypotheses. Our results indicate that, most frequently, developers establish trust and credibility in a project by contributing to the project in a non-committer role. 
Moreover, the employing organization of a developer is another factor--although a less significant one--that influences trust.}, keywords = {bug tracking, bug tracking system, commits, committers, core, developers, eclipse, trust}, isbn = {9781450305747}, doi = {10.1145/1985441.1985462}, author = {Mani, Senthil and Sinha, Saurabh and Sinha, Vibha Singhal} } @conference {1216, title = {Experiences Mining Open Source Release Histories}, booktitle = {International Conference on Software and Systems Process (ICSSP 2011) }, year = {2011}, note = {"First, we selected the projects to initially target, using several criteria to get a broad picture of the open source landscape. Second, we collected the actual data, using a framework of parsers and some manual inspection. Third, we standardized and inserted the data into a database for later use." "but we plan to eventually cross reference our list of projects with existing open source project information (such as FLOSSmole) to take advantage of the work already done by other researchers." "For each release, we collected the following data: the project it belonged to, the date the release was published, the type of release, the release label (version number) and the source of the data" discussion of their difficulties "We conclude that programmatically creating a release history database from existing open source data is not trivial," "We have currently collected 1579 distinct releases from 22 different open source projects"}, month = {05/2011}, abstract = {Software releases form a critical part of the life cycle of a software project. Typically, each project produces releases in its own way, using various methods of versioning, archiving, announcing and publishing the release. Understanding the release history of a software project can shed light on the project history, as well as the release process used by that project, and how those processes change. 
However, many factors make automating the retrieval of release history information difficult, such as the many sources of data, a lack of relevant standards and a disparity of tools used to create releases. In spite of the large amount of raw data available, no attempt has been made to create a release history database of a large number of projects in the open source ecosystem. This paper presents our experiences, including the tools, techniques and pitfalls, in our early work to create a software release history database which will be of use to future researchers who want to study and model the release engineering process in greater depth.}, keywords = {doap, flossmole cited, life cycle, release engineering, release history, release management, releases}, attachments = {https://flosshub.org/sites/flosshub.org/files/icssp11short-p034-tsay.pdf}, author = {Jason Tsay and Wright, Hyrum and Perry, Dewayne} } @conference {1307, title = {How do developers blog?}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, note = {publishing frequency, post structure, word usage, publication patterns, content}, month = {05/2011}, pages = {123-132}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {We report on an exploratory study, which aims at understanding how software developers use social media compared to conventional development infrastructures. We analyzed the blogging and the committing behavior of 1,100 developers in four large open source communities. We observed that these communities intensively use blogs with one new entry about every 8 hours. A blog entry includes 14 times more words than a commit message. When analyzing the content of the blogs, we found that most popular topics represent high-level concepts such as functional requirements and domain concepts. Source code related topics are covered in less than 15\% of the posts. 
Our results also show that developers are more likely to blog after corrective engineering and management activities than after forward engineering and re-engineering activities. Our findings call for a hypothesis-driven research to further understand the role of social media in software engineering and integrate it into development processes and tools.}, keywords = {blog, communication, developer, eclipse, gnome, LDA, postgres, python}, isbn = {9781450305747}, doi = {10.1145/1985441.1985461}, author = {Maalej, Walid and Pagano, Dennis} } @proceedings {1270, title = {Impact of Stakeholder Type and Collaboration on Issue Resolution Time in OSS Projects}, year = {2011}, note = {"First, we characterize the difference in the average amount of resolved issues and issue resolution time between a volunteer assignee and a firm-paid assignee....Second, we investigate collaboration among stakeholders in OSS projects by using Social network metrics and analysis. Last, we explore the impact of the collaboration measures on issue resolution time." "Three OSS projects were selected for our study, namely Qt, Qpid and Geronimo" "All software issues were collected from JIRA repositories...Issue resolution time was computed by using the created time field and the issue resolved time field."}, month = {10/2011}, pages = {1-16}, publisher = {Springer}, abstract = {Initialized by a collective contribution of volunteer developers, Open source software (OSS) attracts an increasing involvement of commercial firms. Many OSS projects are composed of a mix group of firm-paid and volunteer developers, with different motivations, collaboration practices and working styles. As OSS development consists of collaborative works in nature, it is important to know whether these differences have an impact on collaboration between difference types of stakeholders, which lead to an influence in the project outcomes. 
In this paper, we empirically investigate the firm-paid participation in resolving OSS evolution issues, the stakeholder collaboration and its impact on OSS issue resolution time. The results suggest that though a firm-paid assigned developer resolves much more issues than a volunteer developer does, there is no difference in issue resolution time between them. Besides, the more important factor that influences the issue resolution time comes from the collaboration among stakeholders rather than from individual characteristics.}, keywords = {COLLABORATION, companies, coordination, defects, feature requests, geronimo, jira, qpid, qt, social network analysis, volunteer}, author = {Duc, Ach Nguyen and Cruzes, Daniela S. and Ayala, Claudia and Conradi, Reidar} } @proceedings {1277, title = {Knowledge Homogeneity and Specialization in the Apache HTTP Server Project}, year = {2011}, note = {"Our data set consists of the commit history and email archives for the Apache HTTP Server Project, spanning sixteen years (2/27/1995 - 1/31/2011)" "we 1) mapped the committers to email records, 2) cleaned the email records to remove extraneous information, 3) identified topics of discussion in the resulting messages, and 4) constructed a social network model from committers and topics." "If specialization exists within the httpd community, we should see distinct communities develop around topics. In addition, unique groups of developers should congregate around specialized subtopics. We examined the data from both angles: topical affinity and topic communities." }, month = {10/2011}, pages = {106-122}, publisher = {Springer}, abstract = {We present an analysis of developer communication in the Apache HTTP Server project. Using topic modeling techniques we expose latent conceptual sub-communities arising from developer specialization within the greater developer population. However, we found that among the major contributors to the project, very little specialization exists. 
We present theories to explain this phenomenon, and suggest further research.}, keywords = {apache, commits, developer, email, email archive, LDA, mailing list, revision control, revision history, scm, social network analysis, specialization, subversion, svn}, url = {http://sequoia.cs.byu.edu/lab/files/pubs/MacLean2011a.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/MacLean2011a.pdf}, author = {MacLean, Alexander C. and Pratt, Landon J. and Knutson, Charles D. and Ringger, Eric K.} } @article {1384, title = {On qualitative methodologies and dispersed communities: Reflections on the process of investigating an open source community}, journal = {Information and Software Technology}, volume = {53}, year = {2011}, month = {9/2011}, pages = {981 - 993}, abstract = {Context Qualitative methodologies hold much potential for building an understanding of the principles and practices of free and open source software (FOSS) communities. Yet there is a scarcity in the literature of discussions focused on the practical and methodological challenges of this particular research context. Objective This paper formulates and addresses a number of questions regarding the applicability of qualitative methodologies for the study of FOSS communities. It reflects on the challenges of such approaches as seen in previous research efforts and discusses how they manifest in research practice through a thorough description of a case study of a community called PyPy. Method The paper primarily discusses interpretive research approaches which are based on ethnographic data collection methods. The study under discussion was an exploratory case study utilizing multiple methods, including participant observation, virtual ethnography, and open-ended questionnaires. Grounded Theory was used for data analysis. 
Results Two broad sets of challenges are highlighted in relation to the multidimensionality of the FOSS phenomenon and the difficulty of qualitative analysis of activities in long-term context. Additional issues identified relate to potential problems with focus and the need for reflexivity, but also to the extent of the study and the importance of maintaining an active relationship with the core community group. Conclusion This paper provides an overview {\textendash} grounded in practical research experience and linked to insights from the literature {\textendash} of methodological issues in the specific research area of qualitative studies of FOSS communities, which up until now has been lacking.}, keywords = {case study, Collaborative practice, Distributed software development, Free open source communities, Qualitative methodologies}, issn = {0950-5849}, doi = {10.1016/j.infsof.2011.01.012}, url = {http://www.sciencedirect.com/science/article/pii/S0950584911000413}, author = {Sigfridsson, Anders and Sheehan, Anne} } @conference {Meneely:2011:SDN:1985793.1985832, title = {Socio-technical developer networks: should we trust our measurements?}, booktitle = {Proceedings of the 33rd International Conference on Software Engineering}, series = {ICSE {\textquoteright}11}, year = {2011}, pages = {281{\textendash}290}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Software development teams must be properly structured to provide effective collaboration to produce quality software. Over the last several years, social network analysis (SNA) has emerged as a popular method for studying the collaboration and organization of people working in large software development teams. Researchers have been modeling networks of developers based on socio-technical connections found in software development artifacts. 
Using these developer networks, researchers have proposed several SNA metrics that can predict software quality factors and describe the team structure. But do SNA metrics measure what they purport to measure? The objective of this research is to investigate if SNA metrics represent socio-technical relationships by examining if developer networks can be corroborated with developer perceptions. To measure developer perceptions, we developed an online survey that is personalized to each developer of a development team based on that developer{\textquoteright}s SNA metrics. Developers answered questions about other members of the team, such as identifying their collaborators and the project experts. A total of 124 developers responded to our survey from three popular open source projects: the Linux kernel, the PHP programming language, and the Wireshark network protocol analyzer. Our results indicate that connections in the developer network are statistically associated with the collaborators whom the developers named. 
Our results substantiate that SNA metrics represent socio-technical relationships in open source development projects, while also clarifying how the developer network can be interpreted by researchers and practitioners.}, keywords = {developer network, developers, linux, linux kernel, PHP, social network analysis, Survey, wireshark}, isbn = {978-1-4503-0445-0}, doi = {10.1145/1985793.1985832}, url = {http://doi.acm.org/10.1145/1985793.1985832}, author = {Meneely, Andrew and Williams, Laurie} } @conference {1312, title = {A tale of two browsers}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, month = {05/2011}, pages = {238-241}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {We explore the space of open source systems and their user communities by examining the development artifact histories of two popular web browsers -- Firefox and Chrome -- as well as usage data. By examining the data and addressing a number of research questions, two very different profiles emerge: Firefox, as the older and established system, with long product version cycles but short bug fix cycles, and a user base that is slow to adopt newer versions; and Chrome, as the new and fast evolving system, with short version cycles, longer bug fix cycles, and a user base that very quickly adopts new versions as they become available (due largely to Chrome{\textquoteright}s mandatory automatic updates). }, keywords = {chrome, development history, Firefox, msr challenge}, isbn = {9781450305747}, doi = {10.1145/1985441.1985481}, author = {Davis, Ian and Godfrey, Michael W. 
and Baysal, Olga} } @conference {Stroulia:2011:TDS:1984665.1984670, title = {Teaching distributed software engineering with UCOSP: the undergraduate capstone open-source project}, booktitle = {Proceedings of the 2011 Community Building Workshop on Collaborative Teaching of Globally Distributed Software Development}, series = {CTGDSD {\textquoteright}11}, year = {2011}, pages = {20{\textendash}25}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Software engineering courses in computer-science departments are meant to prepare students for the practice of designing, developing, understanding and maintaining software in the real world. The effectiveness of these courses have potentially a tremendous impact on the software industry, since it is through these courses that students must learn the state-of-the-art process and the tools of their eventual "trade", so that they can bring this knowledge to their job and thus advance the actual state of practice. The value of "learning software engineering" through project-based courses has long been recognized by educators and practitioners alike. 
In this paper, we discuss our experience with a distributed project-based course, which infuses the students{\textquoteright} learning experience with an increased degree of realism, which, we believe, further improves the quality of their learning and advances their readiness to join the profession.}, keywords = {distributed, education, pedagogical, project-based courses, software engineering education}, isbn = {978-1-4503-0590-7}, doi = {10.1145/1984665.1984670}, url = {http://doi.acm.org/10.1145/1984665.1984670}, author = {Stroulia, Eleni and Bauer, Ken and Craig, Michelle and Reid, Karen and Wilson, Greg} } @conference {1305, title = {Visualizing collaboration and influence in the open-source software community}, booktitle = {Proceedings of the 8th working conference on Mining software repositories - MSR {\textquoteright}11}, year = {2011}, note = {"This data set includes the complete social graph of 500,000 follow links as well as over 1,000,000 commits and 50,000 users." "...a large fraction of [GitHub] users provide a location in their profile, which we can turn into geographic coordinates using a geocoding API like PlaceFinder..." "For each repository, we extract the owner, collaborator, and contributor usernames, plus branch names. New usernames help to find new repositories, while branch names are used to fetch commit metadata. Using this method, the crawler uncovered 40,860 code repositories, representing 33,388 unique project names and 1,219,872 individual commits." "In addition to crawled data, we use the complete GitHub user follower graph from Jan 19, 2011. This graph includes 452,248 links connecting 106,247 unique users, 47\% (49,500) of which could be geocoded with the PlaceFinder API"}, month = {05/2011}, pages = {223-226}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {We apply visualization techniques to user profiles and repository metadata from the GitHub source code hosting service. 
Our motivation is to identify patterns within this development community that might otherwise remain obscured. Such patterns include the effect of geographic distance on developer relationships, social connectivity and influence among cities, and variation in project-specific contribution styles (e.g., centralized vs. distributed). Our analysis examines directed graphs in which nodes represent users{\textquoteright} geographic locations and edges represent (a) follower relationships, (b) successive commits, or (c) contributions to the same project. We inspect this data using a set of visualization techniques: geo-scatter maps, small multiple displays, and matrix diagrams. Using these representations, and tools based on them, we develop hypotheses about the larger GitHub community that would be difficult to discern using traditional lists, tables, or descriptive statistics. These methods are not intended to provide conclusive answers; instead, they provide a way for researchers to explore the question space and communicate initial insights.}, keywords = {COLLABORATION, data exploration, geography, geoscatter, github, graph, mapping, metadata, open source, social graph, user profiles, visualization}, isbn = {9781450305747}, doi = {10.1145/1985441.1985476}, url = {http://vis.stanford.edu/files/2011-GotHub-MSR.pdf}, author = {Marschner, Eli and Rosenfeld, Evan and Heer, Jeffrey and Heller, Brandon} } @conference {1256, title = {Analyzing Leadership Dynamics in Distributed Group Communication}, booktitle = {2010 43rd Hawaii International Conference on System Sciences (HICSS 2010)}, year = {2010}, note = {"Our analysis examines the communication patterns in two FLOSS development projects, Fire and Gaim" "These data were imported into a database to allow automated analysis. The Fire data set includes about 1,800 events in the user email list, 7,800 messages in the developer venues, and 1,300 events in the combined trackers, spanning a period of 54 months. 
The significantly larger Gaim data set included over 41,000 events in the user forum, over 30,000 events in the developer venues, and about 20,000 events in the trackers, generated over 78 months." "The dynamic network analysis was performed using a scientific workflow tool, Taverna Workbench"}, pages = {1 - 10}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii, USA}, abstract = {We apply social network analysis (SNA) to examine the dynamics of leadership in distributed groups, specifically Free/Libre Open Source Software development projects, and its relation to group performance. Based on prior work on leadership in distributed groups, we identify leaders with those who make the highest level of contribution to the group and assess the degree of leadership by measuring centralization of communications. We compare the dynamics of leadership in two FLOSS projects, one more and one less effective. We find that in both projects, centralization was higher in developer-oriented communications venues than in user-oriented venues, suggesting higher degrees of leadership in developer venues. However, we do not find a consistent relation between centralization and effectiveness. 
We suggest that SNA can instead be useful for identifying interesting periods in the history of the project, e.g., periods where the leadership of the project is in transition.}, keywords = {core, DYNAMICS, email, email archives, fire, flossmole, gaim, leadership, mailing list, project success, social network analysis, srda}, isbn = {978-1-4244-5509-6}, doi = {10.1109/HICSS.2010.62}, attachments = {https://flosshub.org/sites/flosshub.org/files/07-06-02.pdf}, author = {Kevin Crowston and Andrea Wiggins and Howison, James} } @conference {963, title = {Automated dependency resolution for open source software}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {130 - 140}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Opportunities for software reuse are plentiful, thanks in large part to the widespread adoption of open source processes and the availability of search engines for locating relevant artifacts. One challenge presented by open source software reuse is simply getting a newly downloaded artifact to build/run in the first place. The artifact itself likely reuses other artifacts, and so depends on their being located to function properly. While merely tedious in the individual case, this can cause serious difficulties for those seeking to study open source software. It is simply not feasible to manually resolve dependencies for thousands of projects, and many forms of analysis require declarative completeness. In this paper we present a method for automatically resolving dependencies for open source software. It works by cross-referencing a project{\textquoteright}s missing type information with a repository of candidate artifacts. We have implemented this method on top of the Sourcerer, an infrastructure for the large-scale indexing and analysis of open source code. 
The performance of our resolution algorithm was evaluated in two parts. First, for a small number of popular open source projects, we manually examined the artifacts suggested by our system to determine if they were appropriate. Second, we applied the algorithm to the 13,241 projects in the Sourcerer managed repository to evaluate the rate of resolution success. The results demonstrate the feasibility of this approach, as the algorithm located all of the required artifacts needed by 3,904 additional projects, increasing the percentage of declaratively complete projects in Sourcerer from 39\% to 69\%.}, keywords = {dependencies, java, source code, sourcerer}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463346}, author = {Ossher, Joel and Bajracharya, Sushil and Lopes, Cristina} } @conference {968, title = {Can development work describe itself?}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {191 - 200}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Work descriptions are informal notes taken by developers to summarize work achieved in a particular session. Existing studies indicate that maintaining them is a distracting task, which costs a developer more than 30 min. a day. The goal of this research is to analyze the purposes of work descriptions, and find out if automated tools can assist developers in efficiently creating them. For this, we mine a large dataset of heterogeneous work descriptions from open source and commercial projects. We analyze the semantics of these documents and identify common information entities and granularity levels. Information on performed actions, concerned artifacts, references and new work, shows the work management purpose of work descriptions. Information on problems, rationale and experience shows their knowledge sharing purpose. 
We discuss how work description information, in particular information used for work management, can be generated by observing developers{\textquoteright} interactions. Our findings have many implications for next generation software engineering tools.}, keywords = {developer interactions, work descriptions}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463344}, author = {Maalej, Walid and Happel, Hans-Jorg} } @article {1330, title = {Data Mining User Activity in Free and Open Source Software (FOSS)/ Open Learning Management Systems}, journal = {International Journal of Open Source Software and Processes}, volume = {2}, year = {2010}, pages = {65 - 75}, abstract = {Free and Open Source Software (FOSS)/Open Educational Systems development projects abound in higher education today. Many universities worldwide have adopted open source software like ATutor and Moodle as an alternative to commercial or homegrown systems. The move to open source learning management systems entails many special considerations, including usage analysis facilities. The tracking of users and their activities poses major technical and analytical challenges within web-based systems. This paper examines how user activity tracking challenges are met with data mining techniques, particularly web usage mining methods, in four different open learning management systems: ATutor, LON-CAPA, Moodle, and Sakai. As examples of data mining technologies adapted within widely used systems, they represent important first steps for moving educational data mining outside the research laboratory. Moreover, as examples of different open source development contexts, they exemplify the potential for programmatic integration of data mining technology processes in the future. 
As open systems mature in the use of educational data mining, they move closer to the long-sought goal of achieving more interactive, personalized, adaptive learning environments online on a broad scale.}, keywords = {data mining, education, student}, issn = {1942-3934}, doi = {10.4018/jossp.2010010105}, author = {McGrath, Owen} } @conference {961, title = {Do stack traces help developers fix bugs?}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {118 - 121}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {A widely shared belief in the software engineering community is that stack traces are much sought after by developers to support them in debugging. But limited empirical evidence is available to confirm the value of stack traces to developers. In this paper, we seek to provide such evidence by conducting an empirical study on the usage of stack traces by developers from the ECLIPSE project. Our results provide strong evidence to this effect and also throws light on some of the patterns in bug fixing using stack traces. 
We expect the findings of our study to further emphasize the importance of adding stack traces to bug reports and that in the future, software vendors will provide more support in their products to help general users make such information available when filing bug reports.}, keywords = {bug fixing, bug report, debugging, eclipse, stack trace}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463280}, attachments = {https://flosshub.org/sites/flosshub.org/files/118-10-msr.pdf}, author = {Schr{\"o}ter, Adrian and Bettenburg, Nicolas and Premraj, Rahul} } @article {1134, title = {A Fistful of Dollars: Financial Rewards, Payment Norms, and Motivation Crowding in Open Source Software Development}, year = {2010}, month = {04/2010}, institution = {ssrn}, address = {ssrn}, abstract = {Existing literature on open source software (OSS) maintains that intrinsic motivation and extrinsic financial rewards have a unidimensionally positive effect on the motivation of individual developers. Based on self-determination theory, which underlies most of these studies, we challenge this assumption. We argue that the effect of payment on both intrinsic motivation and total motivation of OSS developers is far more complex. To illustrate our point, we introduce the concept of individuals{\textquoteright} norms about payment to the field of OSS. In doing so, we are able to show that payment norms moderate the effect of payment on intrinsic motivation and total motivation. Conducting a scenario experiment, we find that intrinsic motivation decreases for individuals with norms against payment. This effect becomes even stronger when analyzing for mediation effects. Total motivation is impacted positively by payment, but the effect turns insignificant for individuals with norms for payment. Our findings help explain the results of previous studies in which OSS developers did not seem to be affected by motivation crowding. 
They further contribute to the more general debate on how to manage individuals in the absence of formal contracts. From a practical perspective, we show that financial rewards may create a management dilemma for OSS project leaders. }, keywords = {developers, experiment, financial, MOTIVATION}, author = {Oliver Alexy and Martin Leitner} } @conference {1257, title = {The Importance of Social Network Structure in the Open Source Software Developer Community}, booktitle = {2010 43rd Hawaii International Conference on System Sciences}, year = {2010}, note = {"Using data from the SourceForge Research Data Archive [2, 9] and the new dataset of concurrent versions system (CVS) metadata described in [8]..." (M. Van Antwerp. Studying open source versioning metadata. Master{\textquoteright}s thesis, University of Notre Dame, Notre Dame, IN, January 2009) "To measure long-term popularity, we used the SourceForge activity percentile."}, pages = {1 - 10}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii, USA}, abstract = {This paper outlines the motivations and methods for analyzing the developer network of open source software (OSS) projects. Previous work done by Hinds [5] suggested social network structure was instrumental towards the success of an OSS project, as measured by activity and output. The follow-up paper by Hinds [4] discovered that his hypotheses, based on social network theory and previous research on the importance of subgroup connectedness, were vastly different than the results of his study of over 100 successful OSS projects. He concluded that the social network structure had no significant effect on project success. We outline how his approach disregarded potentially important factors and through a new study evaluate the role of the OSS developer network as it pertains to long-term project popularity. We also present an initial investigation into the adequacy of using the SourceForge activity percentile as a long-term success metric. 
In contrast with Hinds, we show that previously existing developer-developer ties are an indicator of past and future project popularity.}, keywords = {developers, popularity, project success, social network analysis, sourceforge, srda}, isbn = {978-1-4244-5509-6}, doi = {10.1109/HICSS.2010.385}, attachments = {https://flosshub.org/sites/flosshub.org/files/07-06-07.pdf}, author = {Matthew Van Antwerp and Madey, Greg} } @conference {bird2010lee, title = {{Linkster: Enabling Efficient Manual Mining}}, booktitle = {Demonstration Track, Proceedings of the 17th SIGSOFT Symposium on Foundations of Software Engineering}, year = {2010}, note = {"LINKSTER efficiently displays, integrates, and allows inspection and annotation of information from three main sources of data: source code repositories, developer mailing lists archives, and bug tracking databases. LINKSTER requires access to a source code repository for file content and a database which contains the raw mined repository, mailing list, and bug tracking information. All notes and annotations made by the user are also recorded in the database."}, publisher = {ACM}, organization = {ACM}, abstract = {While many uses of mined software engineering data are automatic in nature, some techniques and studies either require, or can be improved, by manual methods. Unfortunately, manually inspecting, analyzing, and annotating mined data can be difficult and tedious, especially when information from multiple sources must be integrated. Oddly, while there are numerous tools and frameworks for automatically mining and analyzing data, there is a dearth of tools which facilitate manual methods. To fill this void, we have developed LINKSTER, a tool which integrates data from bug databases, source code repositories, and mailing list archives to allow manual inspection and annotation. 
LINKSTER has already been used successfully by an OSS project lead to obtain data for one empirical study.}, keywords = {artifacts, bug, bug tracking, data mining, email, mailing lists, open source, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/bird2010lee.pdf}, author = {Christian Bird and Adrian Bachmann and Rahman, Foyzur and Bernstein, Abraham} } @conference {1218, title = {A Longitudinal Study on Collaboration Networks and Decision to Participate in a FLOSS Community}, booktitle = {5th Workshop on Public Data about Software Development (WoPDaSD 2010)}, year = {2010}, note = {"we chose as a case of study Epiphany, which is the default web browser of the GNOME graphical desktop environment" "We collected all relevant data by parsing all the bug reports in GNOME Bugzilla repository relative to Epiphany. The data collection and storing was done using Bicho (v. 0.4 rev. 7198), a software part of the FLOSSMetric project [14]."}, abstract = {In this paper we conjecture that individual decisions of FLOSS (Free/Libre Open Source Software) developers to take on a task are influenced by network relations generated by collaboration among project members. In order to explore our conjecture we collected data on a FLOSS project team consisting of 227 developers committed since 2002 to the development of a web browser. We reconstructed 2-mode co-collaboration networks (software developer by bug) in which a tie represents an action taken by a developer in order to solve a specific bug. Co-collaboration networks were collected at five points in time during a six-month development cycle of the software. We report and discuss results of longitudinal actor-based modeling that we specify to test for the influence of local network structures on developer{\textquoteright}s decision to take action on a specific bug. 
The study controls for bug-specific and developer-specific characteristics that may also affect developers{\textquoteright} decisions exogenously. We also control for priority and severity levels assigned by the team to bugs in an attempt to manage voluntary contribution.}, keywords = {bicho, bug fixing, bug reports, bugzilla, COLLABORATION, developers, epiphany, flossmetrics, gnome, social network analysis}, attachments = {https://flosshub.org/sites/flosshub.org/files/wopdasd002.pdf}, author = {Guido Conaldi and Tonellato, Marco} } @conference {964, title = {Mining subclassing directives to improve framework reuse}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {141 - 150}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {To help developers in using frameworks, good documentation is crucial. However, it is a challenge to create high quality documentation especially of hotspots in white-box frameworks. This paper presents an approach to documentation of object-oriented white-box frameworks which mines from client code four different kinds of documentation items, which we call subclassing directives. A case study on the Eclipse JFace user-interface framework shows that the approach can improve the state of API documentation w.r.t. 
subclassing directives.}, keywords = {api, documentation, eclipse, frameworks, jface, source code}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463347}, attachments = {https://flosshub.org/sites/flosshub.org/files/141Mining-Subclassing-Directives-to-Improve-Framework-Reuse.pdf}, author = {Bruch, Marcel and Mezini, Mira and Monperrus, Martin} } @conference {955, title = {Perspectives on bugs in the Debian bug tracking system}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {86 - 89}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Bugs in Debian differ from regular software bugs. They are usually associated with packages, instead of software modules. They are caused and fixed by source package uploads instead of code commits. The majority are reported by individuals who appear in the bug database once, and only once. There also exists a small group of bug reporters with over 1,000 bug reports each to their name. We also explore our idea that a high bug-frequency for an individual package might be an indicator of popularity instead of poor quality.}, keywords = {bug reports, debian, msr challenge, popularity}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463288}, attachments = {https://flosshub.org/sites/flosshub.org/files/86bugs-debian.pdf}, author = {Davies, Julius and Hanyu Zhang and Nussbaum, Lucas and Daniel M. 
German} } @conference {966, title = {Replaying IDE interactions to evaluate and improve change prediction approaches}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {161 - 170}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {Change prediction helps developers by recommending program entities that will have to be changed alongside the entities currently being changed. To evaluate their accuracy, current change prediction approaches use data from versioning systems such as CVS or SVN. These data sources provide a coarse-grained view of the development history that flattens the sequence of changes in a single commit. They are thus not a valid basis for evaluation in the case of development-style prediction, where the order of the predictions has to match the order of the changes a developer makes. We propose a benchmark for the evaluation of change prediction approaches based on fine-grained change data recorded from IDE usage. Moreover, the change prediction approaches themselves can use the more accurate data to fine-tune their prediction. 
We present an evaluation procedure and use it on several change prediction approaches, both novel and from the literature, and report on the results.}, keywords = {cbse, change based software evolution, change prediction, changes, commit, cvs, development history, eclipseeye, ide, mylyn, spyware, svn}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463278}, attachments = {https://flosshub.org/sites/flosshub.org/files/161Robbes2010changePrediction.pdf}, author = {Robbes, Romain and Pollet, Damien and Lanza, Michele} } @conference {947, title = {Replicating MSR: A study of the potential replicability of papers published in the Mining Software Repositories proceedings}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {171 - 180}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {This paper is the result of reviewing all papers published in the proceedings of the former International Workshop on Mining Software Repositories (MSR) (2004-2006) and now Working Conference on MSR (2007-2009). We have analyzed the papers that contained any experimental analysis of software projects for their potentiality of being replicated. In this regard, three main issues have been addressed: i) the public availability of the data used as case study, ii) the public availability of the processed dataset used by researchers and iii) the public availability of the tools and scripts. A total number of 171 papers have been analyzed from the six workshops/working conferences up to date. Results show that MSR authors use in general publicly available data sources, mainly from free software repositories, but that the amount of publicly available processed datasets is very low. 
Regarding tools and scripts, for a majority of papers we have not been able to find any tool, even for papers where the authors explicitly state that they have built one. Lessons learned from the experience of reviewing the whole MSR literature and some potential solutions to lower the barriers of replicability are finally presented and discussed.}, keywords = {data, literature review, msr, replication}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463348}, url = {http://gsyc.urjc.es/~grex/msr2010}, attachments = {https://flosshub.org/sites/flosshub.org/files/171MSR_2010_69.final_.pdf}, author = {Gregorio Robles} } @conference {967, title = {Should I contribute to this discussion?}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {181 - 190}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town}, abstract = {Development mailing lists play a central role in facilitating communication in open source projects. Since these lists frequently host design and project discussions, knowledgeable contribution to these discussion threads is essential to avoid mis-communication that might slow-down the progress of a project. However, given the sheer volume of emails on these lists, it is easy to miss important discussions. To find out how developers are able to deal with mailing list discussions, we study the main factors that encourage developers to contribute to the development mailing lists. We develop personalized models to automatically identify discussion threads that a developer would contribute to based on his previous contribution behavior. 
Case studies on development mailing lists of three open source projects (Apache, PostgreSQL and Python) show that the average accuracy of our models is 89-85\% and that the models vary significantly between different developers.}, keywords = {apache, contributions, developers, email, email archives, mailing lists, postgresql, python}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463345}, attachments = {https://flosshub.org/sites/flosshub.org/files/181ibrahim-msr2010.pdf}, author = {Ibrahim, Walid M and Bettenburg, Nicolas and Shihab, Emad and Adams, Bram and Hassan, Ahmed E.} } @article {1376, title = {Sustainability of Open-Source Projects: A Longitudinal Study}, journal = {Journal of the Association for Information Systems}, volume = {11}, number = {11}, year = {2010}, abstract = {This paper examines the factors that influence the long-term sustainability of FLOSS projects. A model of project sustainability based on organizational ecology is developed and tested empirically. Data about activity and contribution patterns over the course of five years for 2,772 projects registered with SourceForge is analyzed. Our results suggest that the size of the project{\textquoteright}s development base, project age and the size of niche occupied by the project are positively related to the project{\textquoteright}s ability to attract user and/or developer resources. The ability to attract resources is an indicator of the perceived project legitimacy, which in turn is a strong predictor of the project{\textquoteright}s future sustainability. Thus a project{\textquoteright}s ability to attract developer and user resources is shown to play a mediating role between the demographic (size and age) and ecological (niche) characteristics of the project and its future sustainability. 
Our results support the applicability of tenets of organizational ecology related to the liability of smallness, the liability of newness, and population characteristics (niche size) to the FLOSS development environment. The implications of the results for future research and practice are discussed.}, keywords = {contribution, developers, sourceforge, sustainability}, url = {http://aisel.aisnet.org/jais/vol11/iss11/5/}, author = {Chengalur-Smith, I. and Sidorova, Anna and Daniel, Sherae L.} } @conference {1259, title = {Towards an Openness Rating System for Open Source Software}, booktitle = {2010 43rd Hawaii International Conference on System Sciences (HICSS 2010)}, year = {2010}, pages = {1 - 8}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii, USA}, abstract = {Many open source software projects are not very open to third party developers. The point of open source is to enable anyone to fix bugs or add desired capabilities without holding them hostage to the original developers. This principle is important because an open source project{\textquoteright}s developers may be unresponsive or unable to meet third party needs, even if funding support for requested improvements is offered. This paper presents a simple rating system for evaluating the openness of software distributions. The rating system considers factors such as platform portability, documentation, licensing, and contribution policy. 
Several popular open source products are rated in order to illustrate the efficacy of the rating system.}, keywords = {alice, case study, contribution, documentation, freespire, galib, latex, license, linux, linux kernel, mediaportal, openness, openoffice, opensolaris, rating, unicon}, isbn = {978-1-4244-5509-6}, doi = {10.1109/HICSS.2010.405}, attachments = {https://flosshub.org/sites/flosshub.org/files/10-07-04.pdf}, author = {Bein, Wolfgang and Jeffery, Clinton} } @conference {952, title = {The Ultimate Debian Database: Consolidating bazaar metadata for Quality Assurance and data mining}, booktitle = {2010 7th IEEE Working Conference on Mining Software Repositories (MSR 2010)}, year = {2010}, pages = {52 - 61}, publisher = {IEEE}, organization = {IEEE}, address = {Cape Town, South Africa}, abstract = {FLOSS distributions like RedHat and Ubuntu require a lot more complex infrastructures than most other FLOSS projects. In the case of community-driven distributions like Debian, the development of such an infrastructure is often not very organized, leading to new data sources being added in an impromptu manner while hackers set up new services that gain acceptance in the community. Mixing and matching data is then harder than should be, albeit being badly needed for Quality Assurance and data mining. Massive refactoring and integration is not a viable solution either, due to the constraints imposed by the bazaar development model. This paper presents the Ultimate Debian Database (UDD), which is the countermeasure adopted by the Debian project to the above "data hell". UDD gathers data from various data sources into a single, central SQL database, turning Quality Assurance needs that could not be easily implemented before into simple SQL queries. 
The paper also discusses the customs that have contributed to the data hell, the lessons learnt while designing UDD, and its applications and potentialities for data mining on FLOSS distributions.}, keywords = {debian, metadata, udd}, isbn = {978-1-4244-6802-7}, doi = {10.1109/MSR.2010.5463277}, attachments = {https://flosshub.org/sites/flosshub.org/files/52msr2010-udd.pdf}, author = {Nussbaum, Lucas and Zacchiroli, Stefano} } @article {1387, title = {Using the DEMO methodology for modeling open source software development processes}, journal = {Information and Software Technology}, volume = {52}, year = {2010}, month = {6/2010}, pages = {656 - 671}, abstract = {Context Open source software development (OSSD) process modeling has received increasing interest in recent years. These efforts aim to identify common elements in the development process between multiple open source software (OSS) projects. However, the complexity inherent to OSSD process modeling puts significant demands on the modeling language. Objective In this paper, we propose that the Design and Engineering Methodology for Organizations (DEMO) may provide an interesting alternative to develop OSSD process models. DEMO exhibits two unique features within the context of OSSD process modeling. First, DEMO analyzes processes at the ontological level and provides high-level process descriptions, instead of focusing on the implementation level. Second, DEMO studies the communication patterns between human actors, instead of the sequences in which activities are performed. Method We investigate the feasibility of using DEMO to construct OSSD process models by means of a case study. DEMO models were constructed to describe the NetBeans Requirements and Release process. In addition, the quality of these DEMO models was evaluated using a quality framework for conceptual modeling. 
Results Our results showed that our DEMO models exhibited a high level of abstraction, thereby reducing the complexity of the OSSD process models. In addition, the evaluation of the models developed in this paper by using the quality framework for conceptual modeling showed that the models were of high quality. Conclusions We have shown that the DEMO methodology can be successfully used to model OSSD processes and to obtain abstract and high-quality OSSD process models. However, given some potential drawbacks with respect to understandability and implementability, we primarily propose the use of DEMO within OSSD process modeling as an analysis tool that should be complemented with other techniques and models for communication and reenactment purposes.}, keywords = {DEMO, Enterprise ontology, open source software, Software process modeling}, issn = {09505849}, doi = {10.1016/j.infsof.2010.02.002}, url = {http://www.sciencedirect.com/science/article/pii/S0950584910000157}, author = {Huysmans, Philip and Ven, Kris and Verelst, Jan} } @article {1239, title = {Weaving a Semantic Web Across OSS Repositories}, journal = {International Journal of Open Source Software and Processes}, volume = {2}, year = {2010}, month = {04/2010}, pages = {29 - 40}, abstract = {Several public repositories and archives of {\textquotedblleft}facts{\textquotedblright} about libre software projects, maintained either by open source communities or by research communities, have been flourishing over the Web in recent years. These have enabled new analysis and support for new quality assurance tasks. This paper presents some complementary existing tools, projects and models proposed both by OSS actors or research initiatives that are likely to lead to useful future developments in terms of study of the FLOSS phenomenon, and also to the very practitioners in the FLOSS development projects. A goal of the research conducted within the HELIOS project is to address bugs traceability issues. 
In this regard, the authors investigate the potential of using Semantic Web technologies in navigating between many different bugtracker systems scattered all over the open source ecosystem. By using Semantic Web techniques, it is possible to interconnect the databases containing data about open-source software projects development, which enables OSS partakers to identify resources, annotate them, and further interlink those using dedicated properties and collectively designing a distributed semantic graph.}, keywords = {archive, bug, bugtracker, database, debian, forge, interoperability, ontology, OSLC-CM, RDF, repository of repositories, semantic, semantic Web}, issn = {1942-3934}, doi = {10.4018/jossp.2010040103}, attachments = {https://flosshub.org/sites/flosshub.org/files/wopdasd2009-olivier-berger.pdf}, author = {Olivier Berger and Valentin Vlasceanu and Christian Bac and Quang Vu Dang and Lauriere, St{\'e}phane} } @conference {937, title = {Assigning bug reports using a vocabulary-based expertise model of developers}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {131 - 140}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {For popular software systems, the number of daily submitted bug reports is high. Triaging these incoming reports is a time consuming task. Part of the bug triage is the assignment of a report to a developer with the appropriate expertise. In this paper, we present an approach to automatically suggest developers who have the appropriate expertise for handling a bug report. We model developer expertise using the vocabulary found in their source code contributions and compare this vocabulary to the vocabulary of bug reports. We evaluate our approach by comparing the suggested experts to the persons who eventually worked on the bug. 
Using eight years of Eclipse development as a case study, we achieve 33.6\% top-1 precision and 71.0\% top-10 recall.}, keywords = {bug reports, bugzilla, develect, developers, eclipse, expertise, scm}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069491}, attachments = {https://flosshub.org/sites/flosshub.org/files/131AssigningBugReports.pdf}, author = {Matter, Dominique and Kuhn, Adrian and Nierstrasz, Oscar} } @article {1414, title = {Competition and production of digital public goods}, journal = {International Journal of Intelligent Control and Systems}, volume = {14}, number = {1}, year = {2009}, pages = {77-86}, chapter = {77}, abstract = {With the Internet has come the phenomenon of people volunteering to work on digital public goods such as open source software and online encyclopedia articles. Presumably, the success of individual public goods has an effect on attracting volunteers. However, the definition of success is ill-defined. This paper explores the impact of different success metrics on a simple public goods model. The findings show that the different success metrics considered do have an impact on the behavior of the model, with the largest differences being between consumer-oriented and producer-oriented metrics. This indicates that many proposed success metrics may be mapped into one of these two categories and within a category, all success metrics measure the same phenomenon. We argue that the characteristics of producer-oriented metrics more closely match real world phenomena, indicating that public goods are driven by producer, and not consumer, interests.}, keywords = {digital public goods, FLOSS, open source software, sourceforge, success metrics, wikipedia}, url = {http://www.public.asu.edu/~majansse/pubs/ijics2009.pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/ijics2009.pdf}, author = {Radtke, Nicholas P. 
and Janssen, Marco A.} } @conference {1244, title = {Design Information Sharing Across Multiple Knowledge Systems in a FLOSS Community}, booktitle = {iConference {\textquoteright}09}, year = {2009}, month = {02/2009}, abstract = {This paper explores support for design information sharing between the distinct knowledge systems and skill sets of interactive system designers and developers. The paper focuses on the challenges of sharing information among groups of designers, developers, and users with multiple knowledge systems in the context of free/libre/open source software (FLOSS) communities. Bringing design to FLOSS communities introduces new knowledge into a solitary community of practice, and discussion ensues about how exploiting the {\textquoteleft}symmetry of ignorance{\textquoteright} can enhance information sharing through design in CodePlex, an open source project hosting community website. Finally, design mockups illustrate how CodePlex serves as a boundary object supporting design information sharing across distinct knowledge systems.}, keywords = {codeplex, developers, information sharing}, attachments = {https://flosshub.org/sites/flosshub.org/files/finalDraft41.pdf}, author = {Bach, Paula} } @article {Subramaniam:2009:DOS:1480545.1480824, title = {Determinants of open source software project success: A longitudinal study}, journal = {Decision Support Systems}, volume = {46}, year = {2009}, month = {January}, pages = {576{\textendash}585}, publisher = {Elsevier Science Publishers B. V.}, address = {Amsterdam, The Netherlands}, abstract = {In this paper, we investigate open source software (OSS) success using longitudinal data on OSS projects. We find that restrictive OSS licenses have an adverse impact on OSS success. On further analysis, restrictive OSS license is found to be negatively associated with developer interest, but is positively associated with the interest of non-developer users and project administrators. 
We also show that developer and non-developer interest in the OSS project and the project activity levels in any time period significantly affect the project success measures in subsequent time period. The implications of our findings for OSS research and practice are discussed.}, keywords = {contributors, developers, licenses, longitudinal study, Open source project, OSS, project success, restrictive, Software project success}, issn = {0167-9236}, doi = {10.1016/j.dss.2008.10.005}, url = {http://portal.acm.org/citation.cfm?id=1480545.1480824}, author = {Subramaniam, Chandrasekar and Sen, Ravi and Nelson, Matthew L.} } @conference {941, title = {Evolution of the core team of developers in libre software projects}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {167 - 170}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {In many libre (free, open source) software projects, most of the development is performed by a relatively small number of persons, the "core team". The stability and permanence of this group of most active developers is of great importance for the evolution and sustainability of the project. In this position paper we propose a quantitative methodology to study the evolution of core teams by analyzing information from source code management repositories. The most active developers in different periods are identified, and their activity is calculated over time, looking for core team evolution patterns.}, keywords = {core, cvs, cvsanaly, developers, evolution, gimp, scm}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069497}, attachments = {https://flosshub.org/sites/flosshub.org/files/167core-evolution.pdf}, author = {Gregorio Robles and Jesus M. 
Gonzalez-Barahona and Herraiz, Israel} } @conference {1262, title = {An Exploratory Study on the Two New Trends in Open Source Software: End-Users and Service}, booktitle = {2009 42nd Hawaii International Conference on System Sciences (HICSS 2009)}, year = {2009}, note = {used sourceforge data "We created our dataset by restricting our attention to projects that have production/stable and mature development status" "we limited our sample to two categories: developers and end-users/desktop" "We manually compiled a total of 200 projects, 100 each of the most downloaded projects from developers and end-users/desktop categories during the period of March 4 to March 23, 2008. "}, pages = {1 - 10}, publisher = {IEEE}, organization = {IEEE}, address = {Waikoloa, Hawaii, USA}, abstract = {Many have been envisaging the emergence of Open Source Software (OSS) for general end-users and the enhancements in providing services and support, as the most critical factors for OSS success, and at the same time, the most critical issues which are holding back the OSS movement. While these two distinct waves in OSS evolution have become more observable, researchers have not yet explored the characteristics of these two distinct new waves. The current study found evidence for these two waves and further explored the two waves by empirically examining two hundred projects hosted in Sourceforge.net. 
We compared the characteristics of OSS projects that are intended for two disparate audiences: developers and end-users and found that projects for end-users supported more languages but also had more restrictive licenses as compared to projects for developers.}, keywords = {developers, intended audiences, sourceforge}, isbn = {978-0-7695-3450-3}, doi = {10.1109/HICSS.2009.63}, attachments = {https://flosshub.org/sites/flosshub.org/files/07-07-05.pdf}, author = {Namjoo Choi and Chengalur-Smith, I.} } @conference {936, title = {From work to word: How do software developers describe their work?}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {121 - 130}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Developers take notes about their work sessions, either to remember the work status and share it with collaborators, or because employers explicitly require this for project management matters. We report on an exploratory study which aims at understanding how software developers describe their work. We analyzed more than 750,000 work descriptions of about 2,000 professionals taken over 8 years in three settings. We observed several similarities in the content and time meta-data of work descriptions. Most frequent terms, such as top-30 performed activities, are used consistently. Particular templates such as {\textquotedblleft}ACTION concerning ARTIFACT because of CAUSE{\textquotedblright} occur frequently. Developers described sessions that last 30-120 min. 4-16 times a day. Maintaining diaries seems to consume between 3-6\% of the total work time, and in 10\% of the sessions, developers did not describe their work in sufficient detail. 
We argue that our results make the first step towards automatically generating work diaries for software developers.}, keywords = {apache, developers, diaries, eureka, mycomp, scm, work management system}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069490}, author = {Maalej, Walid and Happel, Hans-Jorg} } @article {Beecher2009739, title = {Identifying exogenous drivers and evolutionary stages in FLOSS projects}, journal = {Journal of Systems and Software}, volume = {82}, number = {5}, year = {2009}, pages = {739 - 750}, abstract = {The success of a Free/Libre/Open Source Software (FLOSS) project has been evaluated in the past through the number of commits made to its configuration management system, number of developers and number of users. Most studies, based on a popular FLOSS repository (SourceForge), have concluded that the vast majority of projects are failures. This study{\textquoteright}s empirical results confirm and expand conclusions from an earlier and more limited work. Not only do projects from different repositories display different process and product characteristics, but a more general pattern can be observed. Projects may be considered as early inceptors in highly visible repositories, or as established projects within desktop-wide projects, or finally as structured parts of FLOSS distributions. These three possibilities are formalized into a framework of transitions between repositories. The framework developed here provides a wider context in which results from FLOSS repository mining can be more effectively presented. Researchers can draw different conclusions based on the overall characteristics studied about an Open Source software project{\textquoteright}s potential for success, depending on the repository that they mine. 
These results also provide guidance to OSS developers when choosing where to host their project and how to distribute it to maximize its evolutionary success.}, keywords = {developers, forge, forges, repositories, repository, scm, software repositories, sourceforge, success, users}, issn = {0164-1212}, doi = {10.1016/j.jss.2008.10.026}, url = {http://www.sciencedirect.com/science/article/B6V0N-4TVTJFS-1/2/e32ecee1bcb54bd4a5dff6d5e3daca8d}, author = {Karl Beecher and Capiluppi, Andrea and Boldyreff, Cornelia} } @article {1231, title = {Integrating Projects from Multiple Open Source Code Forges}, journal = {International Journal of Open Source Software and Processes}, volume = {1}, year = {2009}, month = {01/2009}, pages = {46 - 57}, abstract = {Much of the data about free, libre, and open source (FLOSS) software development comes from studies of code forges or code repositories used for managing projects. This paper presents a method for integrating data about open source projects by way of matching projects (entities) across multiple code forges. After a review of the relevant literature, a few of the methods are chosen and applied to the FLOSS domain, including a comparison of some simple scoring systems for pairwise project matches. Finally, the paper describes limitations of this approach and recommendations for future work.}, keywords = {data integration, forges}, issn = {1942-3934}, doi = {10.4018/jossp.2009010103}, author = {Squire, Megan} } @conference {1204, title = {Language entropy: A metric for characterization of author programming language distribution}, booktitle = {4th Workshop on Public Data about Software Development (WoPDaSD 2009)}, year = {2009}, note = {"The data set used in this study was previously collected for a separate, but related work. It was originally extracted from the SourceForge Research Archive (SFRA), August 2006. For a detailed discussion of the data source, collection tools and processes, and summary statistics, see [6]." 
"From the initial data set we extracted a random sample of 500 developers along with descriptive details of all revisions that those developers made since the inception of the projects on which they worked. We then condensed this sample by totaling the lines of code added by each developer for each month in which that developer made at least one code submission." [6] Daniel P. Delorey, Charles D. Knutson, and Alex MacLean. Studying production phase sourceforge projects: A case study using cvs2mysql and sfra+. In Second International Workshop on Public Data about Software Development (WoPDaSD {\textquoteright}07), June 2007.}, month = {2009}, abstract = {Programmers are often required to develop in multiple languages. In an effort to study the effects of programming language fragmentation on productivity{\textemdash}and ultimately on a programmer{\textquoteright}s problem solving abilities{\textemdash}we propose a metric, language entropy, for characterizing the distribution of an individual{\textquoteright}s development efforts across multiple programming languages. To evaluate this metric, we present an observational study examining all project contributions (through August 2006) of a random sample of 500 SourceForge developers. Using a random coefficients model, we found a statistically significant correlation (alpha level of 0.05) between language entropy and the size of monthly project contributions (measured in lines of code added). Our results indicate that language entropy is a good candidate for characterizing author programming language distribution.}, keywords = {contributions, developers, language entropy, lines of code, loc, multiple languages, programming languages, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/LanguageEntropy-JonathanKrein.pdf}, author = {Krein, Jonathan L. and MacLean, Alexander C. and Delorey, Daniel P. and Knutson, Charles D. 
and Eggett, Dennis L.} } @article {1118, title = {Monetary donations to an open source software platform}, journal = {Research Policy}, volume = {38}, year = {2009}, month = {03/2009}, pages = {404 - 414}, abstract = {Online open source software platforms, such as Sourceforge.net, play a vital role in creating an ecosystem that enables the creation and growth of open source projects. However, there is little research exploring the interactions between open source stakeholders and the platform. We believe that the sustainability of the platform crucially depends on financial incentives. While platforms can obtain these incentives through multiple means, in this paper we focus on one form of financial incentives{\textemdash}voluntary monetary donations by open source community members. We report findings from two empirical studies that examine factors that impact donations. Study 1 investigates the factors that cause some community members to donate and not others. We find that the decision to donate is impacted by relational commitment with open source software platform, donation to projects and accepting donations from others. Study 2 examines what drives the level of donation. 
We find that the length of association with the platform and relational commitment affects donation levels.}, keywords = {Collective action, Donation, Identification, incentives, metadata, MOTIVATION, Open source software platform, projects, Reciprocity, Relational commitment, sourceforge}, issn = {00487333}, doi = {10.1016/j.respol.2008.11.004}, author = {Sandeep Krishnamurthy and Tripathi, Arvind K.} } @article {Barcellini2009533, title = {Participation in online interaction spaces: Design-use mediation in an Open Source Software community}, journal = {International Journal of Industrial Ergonomics}, volume = {39}, number = {3}, year = {2009}, note = {Selected papers from ECCE 2007, the 25th Anniversary Conference of the European Conference on Cognitive Ergonomics}, pages = {533 - 540}, abstract = {This research aims at characterizing emerging roles fostering design-use mediation during the Open Source Software (OSS) design process through the analysis of participation. Studying OSS is of particular interest: (1) to investigate socio-technical settings supporting user participation to the design process, which is considered to be the major strength of OSS design; (2) to gain insights into supporting the changing nature of the software industry, which is becoming more and more distributed and global, and which is thus increasingly making use of OSS design tools and methods. In this research, we characterized effective roles of participants, i.e. participation, on the basis of activities analysis in three online interaction spaces (discussion, documentation and implementation) during a continuous {\textquotedblleft}pushed-by-users{\textquotedblright} design process of the Python project. 
Participation is targeted through a methodology articulating: (1) structural analyses (organization of the discussions, regularity and involvement of participants, quotes-based social network) in usage-oriented and development-oriented mailing lists of the projects{\textquoteright} discussion space; (2) actions to the code and documentation made by participants in the implementation and documentation spaces. Besides the importance of the users{\textquoteright} contribution to the process, OSS design is fostered by some key-participants, the cross-participants, who act as boundary spanners between the developers and the users, helping them to go beyond some barriers to participation. These findings can be reinforced developing software to automate the structural analysis of discussions and actions to the code and documentation.}, keywords = {Distributed participatory design}, issn = {0169-8141}, doi = {10.1016/j.ergon.2008.10.013}, url = {http://www.sciencedirect.com/science/article/pii/S0169814108001637}, author = {Barcellini, Flore and D{\'e}tienne, Fran{\c c}oise and Burkhardt, Jean-Marie} } @conference {DBLP:conf/msr/BirdRBHGD09, title = {The promises and perils of mining git}, booktitle = {Proceedings of the 6th International Working Conference on Mining Software Repositories, MSR 2009}, year = {2009}, pages = {1-10}, abstract = {We are now witnessing the rapid growth of decentralized source code management (DSCM) systems, in which every developer has her own repository. DSCMs facilitate a style of collaboration in which work output can flow sideways (and privately) between collaborators, rather than always up and down (and publicly) via a central repository. Decentralization comes with both the promise of new data and the peril of its misinterpretation. We focus on git, a very popular DSCM used in high-profile projects. 
Decentralization, and other features of git, such as automatically recorded contributor attribution, lead to richer content histories, giving rise to new questions such as "How do contributions flow between developers to the official project repository?" However, there are pitfalls. Commits may be reordered, deleted, or edited as they move between repositories. The semantics of terms common to SCMs and DSCMs sometimes differ markedly, potentially creating confusion. For example, a commit is immediately visible to all developers in centralized SCMs, but not in DSCMs. Our goal is to help researchers interested in DSCMs avoid these and other perils when mining and analyzing git data.}, keywords = {dscm, git, mining, scm, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/1promisePeril.pdf}, author = {Christian Bird and Peter C. Rigby and Earl T. Barr and David J. Hamilton and Daniel M. Germ{\'a}n and Premkumar T. Devanbu} } @article {1082, title = {Returns from social capital in open source software networks}, journal = {Journal of Evolutionary Economics}, volume = {19}, year = {2009}, note = {"The data we use in this analysis come from the SourceForge.net Research Data (Department of Computer Science and Engineering, University of Notre Dame)." "Particularly, our dataset is a sub-sample of 2,962 valid observations over twelve months of projects aimed at developing games{\textquoteright} software. To get this sample, we look at the monthly dumps of data and select all projects that belong to the category of GNU Public License (GPL) [6]. 
" dependent variable: performance (number of downloads); independent variable: social network ("Here we measure ties among projects through the individuals{\textquoteright} member and contributor roles at projects on the network over time.)}, month = {4/2009}, pages = {277 - 295}, abstract = {Open Source Software projects base their operation on a collaborative structure for knowledge exchange in the form of provision or reception of information, expertise, and feedback on the creation of source code. Here, we address the direction of these knowledge flows among projects throughout social networks and their impact on project success. We identify the roles of membership or contribution that individuals play within projects. We found that connections through contributors who bring their knowledge to the project, improve project success, and that connection through members, who transfer their knowledge towards other projects, enhance project success. Finally, we found that ties through shared membership and contributions hamper project success. 
The analysis of knowledge flows and their impact on project success implies a translation of returns from investment in social capital, where investment takes the shape of knowledge flows and the returns mean the projects{\textquoteright} diffusion over the network.}, keywords = {contributors, developers, games, gpl, project success, roles, social capital, social network analysis, social networks, sourceforge, srda, teams}, issn = {1432-1386}, doi = {10.1007/s00191-008-0125-5}, attachments = {https://flosshub.org/sites/flosshub.org/files/Mendez-DuronGarcia.pdf}, author = {M{\'e}ndez-Dur{\'o}n, Rebeca and Garc{\'\i}a, Clara E.} } @article {1236, title = {Tools for the Study of the Usual Data Sources found in Libre Software Projects}, journal = {International Journal of Open Source Software and Processes}, volume = {1}, year = {2009}, month = {01/2009}, pages = {24 - 45}, abstract = {Due to the open nature of Free/Libre/Open Source software projects, researchers have gained access to a rich set of development-related information. Although this information is publicly available on the Internet, obtaining and analyzing it in a convenient way is not an easy task and many considerations have to be taken into account. In this paper we present the most important data sources that can be found in libre software projects and that are studied by the research community: source code, source code management systems, mailing lists and bug tracking systems. We will give advice for the problems that can be found when retrieving and preparing the data sources for a posterior analysis, as well as provide information about the tools that support these tasks.}, keywords = {bug tracking systems, data sources, mailing lists, scm, tools}, issn = {1942-3934}, doi = {10.4018/jossp.2009010102}, attachments = {https://flosshub.org/sites/flosshub.org/files/robles.pdf}, author = {Gregorio Robles and Gonz{\'a}lez-Barahona, Jes{\'u}s M. 
and Izquierdo-Cortazar, Daniel and Herraiz, Israel} } @conference {926, title = {Tracking concept drift of software projects using defect prediction quality}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)}, year = {2009}, pages = {51 - 60}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Defect prediction is an important task in the mining of software repositories, but the quality of predictions varies strongly within and across software projects. In this paper we investigate the reasons why the prediction quality is so fluctuating due to the altering nature of the bug (or defect) fixing process. Therefore, we adopt the notion of a concept drift, which denotes that the defect prediction model has become unsuitable as set of influencing features has changed - usually due to a change in the underlying bug generation process (i.e., the concept). We explore four open source projects (Eclipse, OpenOffice, Netbeans and Mozilla) and construct file-level and project-level features for each of them from their respective CVS and Bugzilla repositories. We then use this data to build defect prediction models and visualize the prediction quality along the time axis. These visualizations allow us to identify concept drifts and - as a consequence - phases of stability and instability expressed in the level of defect prediction quality. Further, we identify those project features, which are influencing the defect prediction quality using both a tree induction-algorithm and a linear regression model. Our experiments uncover that software systems are subject to considerable concept drifts in their evolution history. 
Specifically, we observe that the change in number of authors editing a file and the number of defects fixed by them contribute to a project{\textquoteright}s concept drift and therefore influence the defect prediction quality. Our findings suggest that project managers using defect prediction models for decision making should be aware of the actual phase of stability or instability due to a potential concept drift.}, keywords = {bugzilla, cvs, defect prediction, eclipse, mozilla, netbeans, openoffice}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069480}, attachments = {https://flosshub.org/sites/flosshub.org/files/51MSR2009_0111_Ekanayake_Jayalath.pdf}, author = {Ekanayake, Jayalath and Tappolet, Jonas and Gall, Harald C. and Bernstein, Abraham} } @conference {Yatani:2009:UWO:1518701.1518853, title = {Understanding how and why open source contributors use diagrams in the development of Ubuntu}, booktitle = {Proceedings of the 27th international conference on Human factors in computing systems}, series = {CHI {\textquoteright}09}, year = {2009}, note = {"To examine how and why diagrams are used in any and all aspects of the software development process of an OSS project, we performed a series of semi-structured interviews with contributors to one particular effort{\textemdash}Ubuntu." "Our study was divided into two phases. First, we asked participants to complete a questionnaire and provide us with information and materials for discussion. The questionnaire featured questions about participants{\textquoteright} OSS experience, project participation, their roles in each project, and basic demographics. We also asked participants to share diagrams they had created, modified or used as part of their work on Ubuntu. 
In the second phase of the study, we conducted semi-structured interviews with participants."}, pages = {995{\textendash}1004}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Some of the most interesting differences between Open Source Software (OSS) development and commercial co-located software development lie in the communication and collaboration practices of these two groups of developers. One interesting practice is that of diagramming. Though well studied and important in many aspects of co-located software development (including communication and collaboration among developers), its role in OSS development has not been thoroughly studied. In this paper, we report our investigation on how and why Ubuntu contributors use diagrams in their work. Our study shows that diagrams are not actively used in many scenarios where they commonly would in co-located software development efforts. We describe differences in the use and practices of diagramming, their possible reasons, and present design considerations for potential systems aimed at better supporting diagram use in OSS development.}, keywords = {developers, diagramming, interviews, open source software (oss), software development, Ubuntu, visual representation}, isbn = {978-1-60558-246-7}, doi = {10.1145/1518701.1518853}, url = {http://doi.acm.org/10.1145/1518701.1518853}, author = {Yatani, Koji and Chung, Eunyoung and Jensen, Carlos and Truong, Khai N.} } @conference {597, title = {Using FLOSS Project Metadata in the Undergraduate Classroom}, booktitle = {OSS2009: Open Source Ecosystems: Diverse Communities Interacting (IFIP 2.13)}, series = {IFIP Advances in Information and Communication Technology}, volume = {299}, year = {2009}, pages = {330--339}, publisher = {Springer}, organization = {Springer}, chapter = {29}, abstract = {This paper describes our efforts to use the large amounts of data available from public repositories 
of free, libre, and open source software (FLOSS) in our undergraduate classrooms to teach concepts that would have previously been taught using other types of data from other sources. }, keywords = {artificial intelligence, database, education, teaching, undergraduate, undergraduate research}, isbn = {978-3-642-02031-5}, doi = {10.1007/978-3-642-02032-2_29}, attachments = {https://flosshub.org/sites/flosshub.org/files/Using\%20FLOSS\%20Project\%20Metadata.pdf}, author = {Squire, Megan and Duvall, Shannon} } @article {10.1109/HICSS.2009.1014, title = {Using Software Archaeology to Measure Knowledge Loss in Software Projects Due to Developer Turnover}, journal = {2009 42nd Hawaii International Conference on System Sciences (HICSS 2009)}, year = {2009}, pages = {1--10}, publisher = {IEEE Computer Society}, address = {Los Alamitos, CA, USA}, abstract = {Developer turnover can result in a major problem when developing software. When senior developers abandon a software project, they leave a knowledge gap that has to be managed. In addition, new (junior) developers require some time in order to achieve the desired level of productivity. In this paper, we present a methodology to measure the effect of knowledge loss due to developer turnover in software projects. For a given software project, we measure the quantity of code that has been authored by developers that do not belong to the current development team, which we define as orphaned code. Besides, we study how orphaned code is managed by the project. Our methodology is based on the concept of software archaeology, a derivation of software evolution. As case studies we have selected four FLOSS (free, libre, open source software) projects, from purely driven by volunteers to company-supported. 
The application of our methodology to these case studies will give insight into the turnover that these projects suffer and how they have managed it and shows that this methodology is worth being augmented in future research.}, keywords = {attrition, case study, developers, evince, evolution, gimp, growth, knowledge collaboration, lines of code, nautilus, quality, sloc, turnover}, isbn = {978-0-7695-3450-3}, doi = {10.1109/HICSS.2009.1014}, attachments = {https://flosshub.org/sites/flosshub.org/files/07-07-08.pdf}, author = {Izquierdo-Cortazar, Daniel and Gregorio Robles and Ortega, Felipe and Jesus M. Gonzalez-Barahona} } @article {Xu2009151, title = {Volunteers{\textquoteright} involvement in online community based software development}, journal = {Information \& Management}, volume = {46}, number = {3}, year = {2009}, note = {"Data were collected through an online survey and by searching project archives. On Sourceforge.net, each developer was uniquely identified with a user account, and the developer{\textquoteright}s performance was assessed through the number of function points accepted by the project in a certain time period, obtained through conversion and calculation from the project{\textquoteright}s code repository. Data for other constructs were obtained from the developers{\textquoteright} response to the online survey. " "a developer{\textquoteright}s performance was measured by the number of function points made and accepted into the project during the observed time period." project age, development status, license type, number of developers}, pages = {151--158}, abstract = {We sought to gain understanding of voluntary developers{\textquoteright} involvement in open source software (OSS) projects. Data were collected from voluntary developers working on open source projects. 
Our findings indicated that a voluntary developer{\textquoteright}s involvement was very important to his or her performance and that involvement was dependent on individual motivations (personal software needs, reputation and skills gaining expectation, enjoyment in open source coding) and project community factors (leadership effectiveness, interpersonal relationship, community ideology). Our work contributes theoretically and empirically to the body of OSS research and has practical implications for OSS project management.}, keywords = {age, developers, effectiveness, function points, ideology, leadership, MOTIVATION, scm, sourceforge, status, Survey, team size, Volunteers}, issn = {0378-7206}, doi = {10.1016/j.im.2008.12.005}, url = {http://www.sciencedirect.com/science/article/B6VD0-4VP1CN0-1/2/8e1c7be4fcedd1419209c5c843ffa923}, author = {Bo Xu and Donald R. Jones and Bingjia Shao} } @conference {1205, title = {Weaving a Semantic Web across OSS repositories: a spotlight on bts-link, UDD, SWIM}, booktitle = {4th Workshop on Public Data about Software Development (WoPDaSD 2009)}, year = {2009}, note = {position paper; non-experimental}, abstract = {Several public repositories and archives of facts about libre software projects, developed either by open source communities or by research communities, have been flourishing over the Web in the recent years. These enable new analysis and support new quality assurance tasks. By using Semantic Web techniques, the databases containing data about open-source software projects development can be interconnected, hence letting OSS partakers identify resources, annotate them and further interlink them using dedicated properties, collectively designing a distributed semantic graph. Such links expressed with standard Semantic techniques are paving the way to new applications (including ones meant for {\textquotedblleft}end-users{\textquotedblright}). 
For instance this may have an impact on the way research efforts are conducted (less fragmented), and could also be used by development communities to improve Quality Assurance tasks. A goal of the research conducted within the HELIOS project, is to address bugtracker synchronization issues. For that, the potential of using Semantic Web technologies in navigating between many different bugtracker systems scattered all over the open source ecosystem is being investigated. This position paper presents some existing tools, projects and models proposed by OSS actors that are complementary to research initiatives, and that are likely to lead to useful future developments: UDD (Ultimate Debian Database) and bts-link, developed by the Debian community, and SWIM (Semantic Web enabled Issue Manager) developed by Mandriva. The HELIOS team welcomes comments on the future paths that can be considered in using the Semantic Web approach for improving these projects. }, keywords = {bts-link, bug tracker, bugzilla, debian, ecosystem, helios, mandriva, semantic Web, swim, udd}, attachments = {https://flosshub.org/sites/flosshub.org/files/HELIOS-WOPDASD-improved-Olivier.pdf}, author = {Olivier Berger and Valentin Vlasceanu and Christian Bac and Lauri{\`e}re, St{\'e}phane} } @article {denBesten2008316, title = {The allocation of collaborative efforts in open-source software}, journal = {Information Economics and Policy}, volume = {20}, number = {4}, year = {2008}, note = {"we have selected a set of 10 large open-source projects" apache, cvs, gaim, gcc, ghostscript, mozilla, netbsd, openssh, postgresql, python "Our data were extracted from logs of development activity generated by software version control systems. 
For each project in the selection, we extracted CVS development logs" "We notably computed for each file in the sample, and for each month in its history, the number of distinct maintainers that had committed a change during that month, and the number of commits, the blocks of code addition, each file had received during that month." "other variables used in the regressions are proxies for the size, age, and granularity of files; the size of a file is represented as its number of lines of code (LOCs), its age by its creation date (Youth), and its granularity by the number of functions it contains."}, pages = {316 - 322}, abstract = {The article investigates the allocation of collaborative efforts among core developers (maintainers) of open-source software by analyzing on-line development traces (logs) for a set of 10 large projects. Specifically, we investigate whether the division of labor within open-source projects is influenced by characteristics of software code. We suggest that the collaboration among maintainers tends to be influenced by different measures of code complexity. 
We interpret these findings by providing preliminary evidence that the organization of open-source software development would self-adapt to characteristics of the code base, in a {\textquoteright}stigmergic{\textquoteright} manner.}, keywords = {age, apache, complexity, cvs, division of labor, functions, gaim, gcc, ghostscript, lines of code, loc, log files, mozilla, netbsd, openssh, postgresql, python, revision control, scm, size, source code, Stigmergy, version control}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.06.003}, url = {http://www.sciencedirect.com/science/article/B6V8J-4SSG4PN-1/2/88b3824c30a31c18929d8a5ca6d64f62}, author = {den Besten, Matthijs and Jean-Michel Dalle and Galia, Fabrice} } @conference {Wang:2008:ADD:1368088.1368151, title = {An approach to detecting duplicate bug reports using natural language and execution information}, booktitle = {Proceedings of the 30th international conference on Software engineering}, series = {ICSE {\textquoteright}08}, year = {2008}, pages = {461{\textendash}470}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {An open source project typically maintains an open bug repository so that bug reports from all over the world can be gathered. When a new bug report is submitted to the repository, a person, called a triager, examines whether it is a duplicate of an existing bug report. If it is, the triager marks it as DUPLICATE and the bug report is removed from consideration for further work. In the literature, there are approaches exploiting only natural language information to detect duplicate bug reports. In this paper we present a new approach that further involves execution information. In our approach, when a new bug report arrives, its natural language information and execution information are compared with those of the existing bug reports. 
Then, a small number of existing bug reports are suggested to the triager as the most similar bug reports to the new bug report. Finally, the triager examines the suggested bug reports to determine whether the new bug report duplicates an existing bug report. We calibrated our approach on a subset of the Eclipse bug repository and evaluated our approach on a subset of the Firefox bug repository. The experimental results show that our approach can detect 67\%-93\% of duplicate bug reports in the Firefox bug repository, compared to 43\%-72\% using natural language information alone.}, keywords = {bug report, duplicate bug report, execution information, information retrieval, natural language}, isbn = {978-1-60558-079-1}, doi = {10.1145/1368088.1368151}, url = {http://doi.acm.org/10.1145/1368088.1368151}, author = {Wang, Xiaoyin and Zhang, Lu and Xie, Tao and Anvik, John and Sun, Jiasu} } @conference {1211, title = {Are FLOSS developers committing to CVS/SVN as much as they are talking in mailing lists? Challenges for Integrating data from Multiple Repositories}, booktitle = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)}, year = {2008}, month = {09/2008}, pages = {49--54}, abstract = {This paper puts forward a framework for investigating Free and Open Source Software (F/OSS) developers activities in both source code and mailing lists repositories. We used data dumps of fourteen projects from the FLOSSMetrics (FM) retrieval system. 
Our intentions are (i) to present a possible methodology, its advantages and disadvantages which can benefit future researchers using some aspects of the FM retrieval system{\textquoteright}s data dumps, and (ii) discuss our initial research results on the contributions developers make to both coding and lists activities.}, keywords = {cvs, cvsanaly, developers, email, email archives, flossmetrics, mailing list, mlstats, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/49-542008.pdf}, author = {Sowe, Sulayman K. and Samoladas, Ioannis and Ioannis Stamelos and Lefteris Angelis} } @conference {1210, title = {Author Entropy: A Metric for Characterization of Software Authorship Patterns}, booktitle = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)}, year = {2008}, note = {used flossmole to get sample of SF developers}, month = {2008}, pages = {42-47}, abstract = {We propose the concept of author entropy and describe how file-level entropy measures may be used to understand and characterize authorship patterns within individual files, as well as across an entire project. As a proof of concept, we compute author entropy for 28,955 files from 33 open-source projects. We explore patterns of author entropy, identify techniques for visualizing author entropy, and propose avenues for further study. }, keywords = {developers, entropy, flossmole, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/entropy2008.pdf}, author = {Taylor, Quinn C. and Stevenson, James E. and Delorey, Daniel P. 
and Knutson, Charles D.} } @conference {971, title = {Branching and merging in the repository}, booktitle = {Proceedings of the 2008 international workshop on Mining software repositories - MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {19--22}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {Two of the most complex operations version control software allows a user to perform are branching and merging. Branching provides the user the ability to create a copy of the source code to allow changes to be stored in version control but outside of the trunk. Merging provides the user the ability to copy changes from a branch to the trunk. Performing a merge can be a tedious operation and one that may be error prone. In this paper, we compare file revisions found on branches with those found on the trunk to determine when a change that is applied to a branch is moved to the trunk. This will allow us to study how developers use merges and to determine if merges are in fact more error prone than other commits.}, keywords = {argouml, changes, cvs2svn, diffj, revision, scm, source code, version control}, isbn = {9781605580241}, doi = {10.1145/1370750.1370754}, attachments = {https://flosshub.org/sites/flosshub.org/files/p19-williams.pdf}, author = {Spacco, Jamie and Williams, Chadd C.} } @article {Crowston:2008, title = {Bug Fixing Practices within Free/Libre Open Source Software Development Teams}, journal = {Journal of Database Management}, volume = {19}, number = {2}, year = {2008}, note = {"Projects to be studied were selected from those hosted on SourceForge, (http://sourceforge.net/)" "we chose projects for which data we need for our analysis are publicly available, meaning a large number of bug reports" "we chose teams with more than 8 developers" "Only 140 projects of SourceForge met the first two requirements in 2002 when we drew our sample" kicq, gaim, phpmyadmin, 
dynapi "First, we obtained data indicative of the effectiveness of each project, such as its level of activity, number of downloads and development status" "we elected to use objective data about the bug-fixing process. Hence, the main source of data about the bug-fixing process was obtained from the archives of the bug tracking system, which is the tool used to support the bug-fixing process"}, pages = {1{\textendash}30}, abstract = {Free/libre open source software (FLOSS, e.g., Linux or Apache) is primarily developed by distributed teams. Developers contribute from around the world and coordinate their activity almost exclusively by means of email and bulletin boards, yet somehow profit from the advantages and evade the challenges of distributed software development. In this article we investigate the structure and the coordination practices adopted by development teams during the bug-fixing process, which is considered one of main areas of FLOSS project success. In particular, based on a codification of the messages recorded in the bug tracking system of four projects, we identify the accomplished tasks, the adopted coordination mechanisms, and the role undertaken by both the FLOSS development team and the FLOSS community. We conclude with suggestions for further research.}, keywords = {activity, bug tracker, bug tracking system, coordination, downloads, dynapi, effectiveness, FLOSS, gaim, kicq, phpmyadmin, project success, size, status}, issn = {1063-8016}, attachments = {https://flosshub.org/sites/flosshub.org/files/CrowstonScozziJDBM2008.pdf}, author = {Kevin Crowston and Barbara Scozzi} } @conference {1206, title = {Collecting data from distributed FOSS projects}, booktitle = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)}, year = {2008}, note = {"We selected three projects from the initial set of projects: Linux 2.6, an operating system kernel, gimp, a graphics program, and Blender, a 3d content creation suite." 
"To acquire data from each data source, we wrote special programs based on the earlier prototypes....The first program extracts information from mailing list archives....The second program obtains bug reports from bug tracking systems....The third program obtains source code from network-accessible version control systems and runs metric calculations on it."}, month = {2009}, pages = {8-13}, abstract = {A key trait of Free and Open Source Software (foss) development is its distributed nature. Nevertheless, two project-level operations, the fork and the merge of program code, are among the least well understood events in the lifespan of a foss project. Some projects have explicitly adopted these operations as the primary means of concurrent development. In this study, we examine the effect of highly distributed software development, as found in the Linux kernel project, on collection and modelling of software development data. We find that distributed development calls for sophisticated temporal modelling techniques where several versions of the source code tree can exist at once. Attention must be turned towards the methods of quality assurance and peer review that projects employ to manage these parallel source trees. Our analysis indicates that two new metrics, fork rate and merge rate, could be useful for determining the role of distributed version control systems in foss projects. The study presents a preliminary data set consisting of version control and mailing list data. 
}, keywords = {bitkeeper, bug tracking system, cvs, distributed, email archive, fork rate, git, life cycle, linux, linux kernel, mailing list, merge rate, subversion, svn, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/fagerholm.pdf}, author = {Fagerholm, Fabian and Taina, Juha} } @conference {1207, title = {Cross-repository data linking with RDF and OWL}, booktitle = {3rd Workshop on Public Data about Software Development (WoPDaSD 2008)}, year = {2008}, note = {non-experimental}, month = {2009}, pages = {15-22}, abstract = {This paper provides an approach to the problem of integrating data from multiple research repositories for FLOSS data. It introduces semantic web technologies (RDF, OWL, OWL-DL reasoners and SPARQL) to argue that these are useful for building shared research infrastructure. The paper illustrates its point by describing parts of an ontology developed for the integration and analysis of project communications drawn from FLOSSmole, the Notre Dame archive and direct collection of data. RDF vocabularies provide a way to agree on things we agree about as well as a way to be clearer about ways in which we disagree.}, keywords = {data integration, flossmole, forges, integration, owl, RDF, repositories, semantic, semantic Web, sparql, srda}, attachments = {https://flosshub.org/sites/flosshub.org/files/howison2008.pdf}, author = {Howison, James} } @article {1084, title = {Dynamics of innovation in an "open source" collaboration environment: lurking, laboring, and launching FLOSS projects on SourceForge}, journal = {Industrial and Corporate Change}, volume = {17}, year = {2008}, note = {"For the purposes of this study, we are able to draw upon micro-level data pertaining to the activities undertaken during the period between the beginning of September 2000 and December 2002 by the entire cohort of 222,835 individuals who had registered on SF.net during the 14 months from September 1, 2000 through October 26, 2001." 
"The statistical analysis in this article is based upon an edited dataset (referred to here as the SFnetDataset) covering the SourceForge cohort of 222,835 individuals who registered on SF.net during the 14 months from September 1, 2000 through October 26, 2001." They appear to be using their own dataset which is from 2000-2001 even though this paper is from 2008}, month = {07/2008}, pages = {647 - 710}, abstract = {A systems analysis perspective is adopted to examine the critical properties of the Free/Libre/Open Source Software (FLOSS) mode of innovation, as reflected on the SourceForge platform (SF.net). This approach re-scales March{\textquoteright}s (1991) framework and applies it to characterize the {\textquotedblleft}innovation system{\textquotedblright} of a {\textquotedblleft}distributed organization{\textquotedblright} of interacting agents in a virtual collaboration environment, rather than to innovation within a firm. March (1991) views the process of innovation at the organizational level as the coupling of sub-processes of exploration and exploitation. Correspondingly, the innovation system of the virtual collaboration environment represented by SF.net is an emergent property of two {\textquotedblleft}coupled{\textquotedblright} processes: one involves the interactions among agents searching the locale for information and knowledge resources to use in designing novel software products (i.e., exploration), and the other involves the mobilization of individuals{\textquoteright} capabilities for application in the software development projects that become established on the platform (i.e., exploitation). 
The micro-dynamics of this system are studied empirically by constructing transition probability matrices representing the movements of 222,835 SF.net users among seven different activity states, which range from {\textquotedblleft}lurking{\textquotedblright} (not contributing or contributing to projects without becoming a member) to {\textquotedblleft}laboring{\textquotedblright} (joining one or more projects as members), and to {\textquotedblleft}launching{\textquotedblright} (founding one or more projects) within each successive 6-month interval. The estimated probabilities are found to form first-order Markov chains describing ergodic processes. This makes it possible the computation of the equilibrium distribution of agents among the states, thereby suppressing transient effects and revealing persisting patterns of project joining and project launching. The latter show the FLOSS innovation process on SF.net to be highly dissipative: a very large proportion of the registered {\textquotedblleft}developers{\textquotedblright} fail to become even minimally active on the platform. There is nevertheless an active core of mobile project joiners, and a (still smaller) core of project founders who persist in creating new projects. The structure of these groups{\textquoteright} interactions (as displayed within the 3-year period examined) is investigated in detail, and it is shown that it would be sufficient to sustain both the exploration and exploitation phases of the platform{\textquoteright}s global dynamics.}, keywords = {contributors, core, developers, roles, SFnetDataset, sourceforge, users, virtual communities, virtual organization, virtual organizations}, issn = {1464-3650}, doi = {10.1093/icc/dtn026}, author = {David, P. A. 
and Rullani, F.} } @article {Koch2008345, title = {Effort modeling and programmer participation in open source software projects}, journal = {Information Economics and Policy (Empirical Issues in Open Source Software)}, volume = {20}, number = {4}, year = {2008}, note = {"Using a two-step approach, first a detailed case study on one project, GNOME, will be undertaken, then a large data set retrieved from a project hosting site, SourceForge.net, will be used to validate the results." CVS was the main source of data "e-mails sent to the different project discussion lists were identified as an additional source of information especially on communication and coordination besides the CVS-repository" basic counts were calculated for developer discussion levels}, month = {12/2008}, pages = {345 - 355}, abstract = {This paper develops models for programmer participation and effort estimation in open source software projects and employs the results to assess the efficiency of open source software creation. Successful development of such models will be important for decision makers of various kinds. We propose hypotheses based on a prior case study on manpower function and effort modeling. A large data set retrieved from a project repository is used to test these hypotheses. 
The main results are that if Norden-Rayleigh-based approaches are used, they need to be complemented in order to account for the addition of new features during a product life cycle, and that programmer-participation based effort models result in distinctly lower estimations of effort than those based on output metrics, such as lines of code.}, keywords = {cvs, developers, email, email archives, gnome, lines of code, scm, Software repository mining, source code, sourceforge}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.06.004}, url = {http://www.sciencedirect.com/science/article/B6V8J-4SSND1J-1/2/c857fa1493e19aa7fe4297dedb077b3a}, attachments = {https://flosshub.org/sites/flosshub.org/files/KochEffortModeling.pdf}, author = {Koch, Stefan} } @article {1091, title = {Evaluating the performance of open source software projects using data envelopment analysis}, journal = {Information Management \& Computer Security}, volume = {16}, year = {2008}, note = {"The data set used for this research consists of 34 OSS software projects listed on sourceforge.net." "only highly ranked projects in the security domain were considered." "Ultimately data were collected on the 34 highest ranked security-based OSS software projects on Sourcefore.net."[sic] fields: bug (?), developers, rank, downloads, Kperdownload (?)}, month = {2008}, pages = {449--462}, abstract = { Purpose {\textendash} The purpose of this paper is to develop and test a model of the relative performance of open source software (OSS) projects. Design/methodology/approach {\textendash} This paper evaluates the relative performance of OSS projects by evaluating multiple project inputs and multiple project outputs by using a data envelopment analysis (DEA) model. The DEA model produces an efficiency score for each project based on project inputs and outputs. The method of producing an efficiency score is based on the convex envelopment technology structure. 
The efficiency measure quantifies a {\textquotedblleft}distance{\textquotedblright} to an efficient frontier. Findings {\textendash} The DEA model produced an index of corresponding intensities linking an inefficient project to its benchmark efficient project(s). The inefficiency measures produced an ordering of inefficient projects. Eight projects were found to be {\textquotedblleft}efficient{\textquotedblright} and used as benchmarking projects. Research limitations/implications {\textendash} This research is limited to only security-based OSS projects. Future research on other areas of OSS projects is warranted. Practical implications {\textendash} The result of this research is a practical model that can be used by OSS project developers to evaluate the relative performance of their projects and make resource decisions. Originality/value {\textendash} This research extends the work of previous studies that have examined the relative performance of software development projects in a traditional development environment. As a result of this research, OSS projects can now be adequately benchmarked and evaluated according to project performance. An OSS project manager can effectively use these results to critically evaluate resources for their project and judge the relative efficiency of the resources.}, keywords = {dea, efficiency, Project performance, sourceforge}, issn = {0968-5227}, doi = {10.1108/09685220810920530}, author = {Wray, Barry and Mathieu, Richard and Teets, J.} } @article {GonzalezBarahona2008356, title = {Geographic origin of libre software developers}, journal = {Information Economics and Policy}, volume = {20}, number = {4}, year = {2008}, note = {Empirical Issues in Open Source Software}, pages = {356--363}, abstract = {This paper examines the claim that libre (free, open source) software involves global development. 
The anecdotal evidence is that developers usually work in teams including individuals residing in many different geographical areas, time zones and even continents and that, as a whole, the libre software community is also diverse in terms of national origin. However, its exact composition is difficult to capture, since there are few records of the geographical location of developers. Past studies have been based on surveying a limited (and sometimes biased) sample and extrapolating that sample to the global distribution of developers. In this paper we present an alternate approach in which databases are analyzed to create traces of information from which the geographical origin of developers can be inferred. Applying this technique to the SourceForge users database and the mailing lists archives from several large projects, we have estimated the geographical origin of more than one million individuals who are closely related to the libre software development process. The paper concludes that the result is a good proxy for the actual distribution of libre software developers working on global projects.}, keywords = {developers, email, email address, email archives, geography, mailing list, open source software, sourceforge, timezone, users}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.07.001}, url = {http://www.sciencedirect.com/science/article/B6V8J-4T3DCPK-1/2/3981dfbc523eae1d1ce65fb1f0c0edb7}, author = {Jesus M. 
Gonzalez-Barahona and Gregorio Robles and Roberto Andradas-Izquierdo and Rishab Aiyer Ghosh} } @conference {flosswp1737, title = {Improving community awareness in software forges by semantical aggregation of tools feeds}, booktitle = {3rd International Workshop on Public Data about Software Development (WoPDaSD 2008), Milano, Italy, September 2008}, year = {2008}, abstract = {It is rather difficult to monitor or visualize what can be the contribution of a member in a project, especially when the project uses multiple tools to produce its results. This is the case for collaborative development of FLOSS software, that use Wiki, bug tracker, mailing lists and source code management tools. This paper presents an approach to data collection by using aggregation of feeds published by the different tools of a software forge. To allow this aggregation, collected data is semantically reformatted into Semantic Web standards: RDF, DC, DOAP, and FOAF. Resulting data can then be processed, republished or displayed to project members. We implemented this approach in a supervision module that has been integrated into the PicoForge platform.
This module is able to draw a live graph of the social community out of the different sources of data, and in turn export semantic feeds for other uses.}, keywords = {community of practice, DOAF., FOAF, free and open source software development, public data, RDF, semantic Web, social filtering, social network analysis}, attachments = {https://flosshub.org/sites/flosshub.org/files/Paper4.pdf}, author = {Quang Vu Dang and Christian Bac and Olivier Berger and Xuan Sang Dao} } @conference {Zhang:2008:ISG:1370750.1370785, title = {An initial study of the growth of eclipse defects}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, pages = {141{\textendash}144}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {We analyze the Eclipse defect data from June 2004 to November 2007, and find that the growth of the number of defects can be well modeled by polynomial functions.
Furthermore, we can predict the number of future Eclipse defects based on the nature of defect growth.}, keywords = {bug reports, defect growth model, defect prediction, eclipse, msr challenge, polynomial regression}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370785}, url = {http://doi.acm.org/10.1145/1370750.1370785}, author = {Zhang, Hongyu} } @article {1120, title = {The institutions of open source software: Examining the Debian community*}, journal = {Information Economics and Policy}, volume = {20}, year = {2008}, note = {"using primary data from its mailing lists archives, handbooks written to inform potential and actual community members, and previous analyses of institutional evolution and political conflict" no discussion of which lists, how many, or when they were gathered...?[ms]}, month = {12/2008}, pages = {333 - 344}, abstract = {Free and open source software activities involve and, perhaps, evolve institutions (rules, norms and standards) that influence the formation, growth, and demise of communities. Community institutions are attractors for some individuals while discouraging other individuals from entering or continuing to participate. Their suitability may change as a community grows. This paper examines the institutions of the Debian community where issues of community identity, distribution of authority, and decentralisation have facilitated growth and development. These same institutions have also resulted in conflicts regarding community purposes and the quality and delivery of the community{\textquoteright}s output.
We examine the institutional redesign undertaken to address these problems and derive implications for F/LOS communities and companies.}, keywords = {authority, COMMUNITY, conflict, debian, decentralization, growth, institutions, leadership}, issn = {01676245}, doi = {10.1016/j.infoecopol.2008.06.001}, attachments = {https://flosshub.org/sites/flosshub.org/files/The_institutions_of_open_source_software-_IR.pdf}, author = {Mateos Garcia, J. and Steinmueller, W.E.} } @article {499, title = {The Material and Social Dynamics of Motivation: Contributions to Open Source Language Technology Development}, journal = {Science Studies}, number = {2}, year = {2008}, note = {"I conducted a two-phased qualitative analysis on multiple data (thematic phone-inter- views, notes from OpenOffice.org conference 2004 meetings, historical documents, mailing list discussions, homepages, Google, blogs, real-time video and audio presentations from OpenOffice.org conference 2005)."}, abstract = {Volunteer motivation has been a central theme in Free/Libre/Open Source Software (FLOSS) literature. This research has been largely dominated by economists who rely in their surveys on the distinction between intrinsic and extrinsic motivations and the "hacker ethic" for profit juxtaposition. The paper argues that survey-based analytical frameworks and research designs have led to a focus on some motivational attributions at the expense of others. It then presents a case study that explores dynamic, non individualistic and content-sensitive aspects of motivations. The approach is based on socio-cultural psychology and the author{\textquoteright}s observations of a hybrid firm-community FLOSS project, OpenOffice.org. Instead of separating intrinsic motivations from extrinsic ones, it is argued that complex and changing patterns of motivations are tied to changing objects and personal histories prior to and during participation. 
The boundary between work and hobby in an individual{\textquoteright}s participation path is blurred and shifting.}, keywords = {contributions, developers, email, email archives, mailing list, MOTIVATION, openoffice, openoffice.org, secondary data, Volunteers}, attachments = {https://flosshub.org/sites/flosshub.org/files/Freeman.pdf}, author = {Stephanie Freeman} } @conference {Holmes:2008:NGE:1370750.1370787, title = {A newbie{\textquoteright}s guide to eclipse APIs}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, pages = {149{\textendash}152}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Eclipse has evolved from a fledgling Java IDE into a mature software ecosystem. One of the greatest benefits Eclipse provides developers is flexibility; however, this is not without cost. New Eclipse developers often find the framework to be large and confusing. Determining which parts of the framework they should be using can be a difficult task as Eclipse documentation tends to be either very high-level, focusing on the design of the framework, or low-level, focusing on specific APIs. We have developed a tool called PopCon that provides a bridge between high-level design documentation and low-level API documentation by statically analyzing a framework and several of its clients and providing a ranked list of the relative popularity of its APIs. 
We have applied PopCon to the Eclipse framework for this challenge to help newbie Eclipse developers identify some of the most relevant APIs for their tasks.}, keywords = {API popularity, documentation, eclipse, mining software repositories, module, msr challenge, PopCon, popularity}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370787}, url = {http://doi.acm.org/10.1145/1370750.1370787}, author = {Holmes, Reid and Walker, Robert J.} } @conference {973, title = {On the relation of refactorings and software defect prediction}, booktitle = {Proceedings of the 2008 international workshop on Mining software repositories - MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {35-38}, publisher = {ACM Press}, organization = {ACM Press}, address = {New York, New York, USA}, abstract = {This paper analyzes the influence of evolution activities such as refactoring on software defects. In a case study of five open source projects we used attributes of software evolution to predict defects in time periods of six months. We use versioning and issue tracking systems to extract 110 data mining features, which are separated into refactoring and non-refactoring related features. These features are used as input into classification algorithms that create prediction models for software defects. We found out that refactoring related features as well as non-refactoring related features lead to high quality prediction models. Additionally, we discovered that refactorings and defects have an inverse correlation: The number of software defects decreases, if the number of refactorings increased in the preceding time period.
As a result, refactoring should be a significant part of both bug fixes and other evolutionary changes to reduce software defects.}, keywords = {argouml, bug fixing, bug reports, defects, evolution, jboss, liferay, prediction, refactoring, spring, weka, xdoclet}, isbn = {9781605580241}, doi = {10.1145/1370750.1370759}, attachments = {https://flosshub.org/sites/flosshub.org/files/p35-ratzinger.pdf}, author = {Sigmund, Thomas and Gall, Harald C. and Ratzinger, Jacek} } @conference {Siy:2008:SDW:1370750.1370784, title = {Summarizing developer work history using time series segmentation: challenge report}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {137{\textendash}140}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Temporal segmentation partitions time series data with the intent of producing more homogeneous segments. It is a technique used to preprocess data so that subsequent time series analysis on individual segments can detect trends that may not be evident when performing time series analysis on the entire dataset. This technique allows data miners to partition a large dataset without making any assumption of periodicity or any other a priori knowledge of the dataset{\textquoteright}s features. We investigate the insights that can be gained from the application of time series segmentation to software version repositories. Software version repositories from large projects contain on the order of hundreds of thousands of timestamped entries or more. It is a continuing challenge to aggregate such data so that noise is reduced and important characteristics are brought out. In this paper, we present a way to summarize developer work history in terms of the files they have modified over time by segmenting the CVS change data of individual Eclipse developers. 
We show that the files they modify tends to change significantly over time though most of them tend to work within the same directories.}, keywords = {contributions, cvs, developers, eclipse, msr challenge, temporal segmentation, time series, work history}, isbn = {978-1-60558-024-1}, doi = {10.1145/1370750.1370784}, url = {http://doi.acm.org/10.1145/1370750.1370784}, author = {Siy, Harvey and Chundi, Parvathi and Subramaniam, Mahadevan} } @article {Sadowski2008323, title = {Transition of governance in a mature open software source community: Evidence from the Debian case}, journal = {Information Economics and Policy}, volume = {20}, number = {4}, year = {2008}, note = {"We primarily used internal documents related to the contents and context of different Debian projects." "Furthermore, we attended several Debian conferences and were {\textquoteleft}{\textquoteleft}lurking around{\textquotedblright} on the Debian mailing lists, websites, IRC channels, etc."}, pages = {323 - 332}, abstract = {As open source software (OSS) communities mature, they have to introduce a variety of governance mechanisms to manage the participation of their members and to coordinate the launch of new releases. The Debian community introduced new mechanisms of informal administrative control based on a constitution, elected leaders, and used interactive communication channels. We show that these control mechanisms were introduced as a response to emerging innovative opportunities due to the usage of source packages and to the need to build a responsive organization within the Debian OSS community.}, keywords = {debian, governance}, issn = {0167-6245}, doi = {10.1016/j.infoecopol.2008.05.001}, url = {http://www.sciencedirect.com/science/article/B6V8J-4SHF4BS-1/2/579ba679ee43d7c77302f3595334dd24}, author = {Bert M.
Sadowski and Gaby Sadowski-Rasters and Geert Duysters} } @article {138, title = {Understanding knowledge sharing activities in free/open source software projects: An empirical study}, journal = {Journal of Systems and Software}, volume = {81}, number = {3}, year = {2008}, note = {"Our study utilized data from the Debian project lists archives.... The Debian project hosts over 100 lists on all aspects related to the project. From the Debian lists archives (Debian Mailing Lists) we selected two high volume mailing lists. The following lists are analyzed in our study: {\textbullet} Debian-user. This list is specifically dedicated to help and discussion among users of Debian who speak English. {\textbullet} Debian-devel. This list is specifically dedicated to discus- sion about technical development topics. Our data collection period for both lists was from January 2000 to December 2005. We obtained archived mbox files of the two lists. Each file is a single text file containing one month of archived email messages. Every email message has a unique message-id, together with other identification fields defined by the Internet Message Format (RFC) 2822 (Internet Message Format, 2001)."}, pages = {431-446}, abstract = {Free/Open Source Software (F/OSS) projects are people-oriented and knowledge intensive software development environments. Many researchers focused on mailing lists to study coding activities of software developers. How expert software developers interact with each other and with non-developers in the use of community products have received little attention. This paper discusses the altruistic sharing of knowledge between knowledge providers and knowledge seekers in the Developer and User mailing lists of the Debian project. We analyze the posting and replying activities of the participants by counting the number of email messages they posted to the lists and the number of replies they made to questions others posted. 
We found out that participants interact and share their knowledge a lot, their posting activity is fairly highly correlated with their replying activity, the characteristics of posting and replying activities are different for different kinds of lists, and the knowledge sharing activity of self-organizing Free/Open Source communities could best be explained in terms of what we called "Fractal Cubic Distribution" rather than the power-law distribution mostly reported in the literature. The paper also proposes what could be researched in knowledge sharing activities in F/OSS projects mailing list and for what purpose. The research findings add to our understanding of knowledge sharing activities in F/OSS projects. (C) 2007 Elsevier Inc. All rights reserved.}, keywords = {debian, email, email archives, mailing list}, attachments = {https://flosshub.org/sites/flosshub.org/files/JSS_0.pdf}, author = {Sowe, Sulayman K. and Ioannis Stamelos and Lefteris Angelis} } @article {Barcellini:2008:UDM:1379919.1380288, title = {User and developer mediation in an Open Source Software community: Boundary spanning through cross participation in online discussions}, journal = {Int. J. Hum.-Comput. Stud.}, volume = {66}, number = {7}, year = {2008}, pages = {558{\textendash}570}, publisher = {Academic Press, Inc.}, address = {Duluth, MN, USA}, abstract = {The aim of this research is to analyse how design and use are mediated in Open Source Software (OSS) design. Focusing on the Python community, our study examines a {\textquoteright}{\textquoteright}pushed-by-users{\textquoteright}{\textquoteright} design proposal through the discussions occurring in two mailing-lists: one, user-oriented and the other, developer-oriented. To characterize the links between users and developers, we investigate the activities and references (knowledge sharing) performed by the contributors to these two mailing-lists.
We found that the participation of users remains local to their community. However, several key participants act as boundary spanners between the user and the developer communities. This emerging role is characterized by cross-participation in parallel same-topic discussions in both mailing-lists, cohesion between cross-participants, the occupation of a central position in the social network linking users and developers, as well as active, distinctive and adapted contributions. The user championing the proposal acts as a key boundary spanner coordinating the process and using explicit linking strategies. We argue that OSS design may be considered as a form of {\textquoteright}{\textquoteright}role emerging design{\textquoteright}{\textquoteright}, i.e. design organized and pushed through emerging roles and through a balance between these roles. The OSS communities seem to provide a suitable socio-technical environment to enable such role emergence. }, keywords = {Boundary spanners, Cross-participants, Distributed design, Open Source Software Community, Role emerging design}, issn = {1071-5819}, doi = {10.1016/j.ijhcs.2007.10.008}, url = {http://dx.doi.org/10.1016/j.ijhcs.2007.10.008}, author = {Barcellini, Flore and D{\'e}tienne, Fran{\c c}oise and Burkhardt, Jean-Marie} } @conference {996, title = {Correlating Social Interactions to Release History during Software Evolution}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {7 - 7}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {In this paper, we propose a method to reason about the nature of software changes by mining and correlating discussion archives. We employ an information retrieval approach to find correlation between source code change history and history of social interactions surrounding these changes. 
We apply our correlation method on two software systems, LSEdit and Apache Ant. The results of these exploratory case studies demonstrate the evidence of similarity between the content of free-form text emails among developers and the actual modifications in the code. We identify a set of correlation patterns between discussion and changed code vocabularies and discover that some releases referred to as minor should instead fall under the major category. These patterns can be used to give estimations about the type of a change and time needed to implement it.}, keywords = {ant, apache, change management, developers, discussion, effort estimation, lsedit, mailing lists, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.4}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300007.pdf}, author = {Baysal, Olga and Malton, Andrew J.} } @article {springerlink:10.1007/s10588-006-9006-3, title = {Correlating temporal communication patterns of the Eclipse open source community with performance and creativity}, journal = {Computational \& Mathematical Organization Theory}, volume = {13}, year = {2007}, note = {"Social network data was collected from the Eclipse component development groups{\textquoteright} online mailing lists by using the online process tool (Gloor and Zhao, 2004). Data on bugs and enhancements for each group was collected from the Eclipse bugzilla database (Eclipse bugzilla, 2004). The social network data was analyzed with the TeCFlow tool (Gloor and Zhao, 2004)." "The study is based on data from the three main projects of the Eclipse open source development community, namely {\textquotedblleft}eclipse{\textquotedblright}, {\textquotedblleft}tools{\textquotedblright} and {\textquotedblleft}technology{\textquotedblright}. We have chosen thirty-three different component development groups for analysis." 
"The online process tool (online process tool, 2004) was utilized to collect communication data from their mailing list archives. The online process tool runs a robot that searches for URLs in the projects{\textquoteright} mailing list archives to compile a list of the possible URL links. It then extracts communication data as tuples in the form of {\textquotedblleft}sender, receiver, communication type, timestamp, communication contents{\textquotedblright} and stores it in the database. Further, bugs and enhancement data were collected from the Eclipse bugzilla database."}, pages = {17-27}, publisher = {Springer Netherlands}, abstract = {This paper studies the temporal communication patterns of online communities of developers and users of the open source Eclipse Java development environment. It measures the productivity of each community and seeks to identify correlations that exist between group communication characteristics and productivity attributes. The study uses the TeCFlow (Temporal Communication Flow) visualizer to create movie maps of the knowledge flow by analyzing the publicly accessible Eclipse developer mailing lists as an approximation of the social networks of developers and users. Thirty-three different Eclipse communities discussing development and use of components of Eclipse such as the Java Development Tools, the different platform components, the C/C++ Development Tools and the AspectJ extension have been analyzed over a period of six months. The temporal evolution of social network variables such as betweenness centrality, density, contribution index, and degree have been computed and plotted. Productivity of each development group is measured in terms of two indices, namely performance and creativity. Performance of a group is defined as the ratio of new bugs submitted compared with bugs fixed within the same period of time. Creativity is calculated as a function of new features proposed and implemented. 
Preliminary results indicate that there is a correlation between attributes of social networks such as density and betweenness centrality and group productivity measures in an open source development community. We also find a positive correlation between changes over time in betweenness centrality and creativity, and a negative correlation between changes in betweenness centrality and performance.}, keywords = {bug fixing, bugs, bugzilla, communication, creativity, developers, eclipse, email, email archives, feature requests, mailing lists, performance, productivity}, issn = {1381-298X}, url = {http://dx.doi.org/10.1007/s10588-006-9006-3}, attachments = {https://flosshub.org/sites/flosshub.org/files/16.pdf}, author = {Kidane, Yared and Gloor, Peter} } @conference {992, title = {Determining Implementation Expertise from Bug Reports}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {2 - 2}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {As developers work on a software product they accumulate expertise, including expertise about the code base of the software product. We call this type of expertise "implementation expertise". Knowing the set of developers who have implementation expertise for a software product has many important uses. This paper presents an empirical evaluation of two approaches to determining implementation expertise from the data in source and bug repositories. The expertise sets created by the approaches are compared to those provided by experts and evaluated using the measures of precision and recall. 
We found that both approaches are good at finding all of the appropriate developers, although they vary in how many false positives are returned.}, keywords = {bug reports, developers, eclipse, expertise, repository, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.7}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300002.pdf}, author = {Anvik, John and Murphy, Gail C.} } @conference {1003, title = {Evaluating the Harmfulness of Cloning: A Change Based Experiment}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {18 - 18}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Cloning is considered a harmful practice for software maintenance because it requires consistent changes of the entities that share a cloned fragment. However this claim has not been refuted or confirmed empirically. Therefore, we have developed a prototype tool, CloneTracker, in order to study the rate of change of applications containing clones. This paper describes CloneTracker and illustrates its preliminary application on a case study.}, keywords = {ccfinder, clone, clones, clonetracker, cloning, ctags, cvs, dnsjava, maintenance, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.8}, author = {Lozano, Angela and Wermelinger, Michel and Nuseibeh, Bashar} } @conference {1013, title = {Impact of the Creation of the Mozilla Foundation in the Activity of Developers}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {28 - 28}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {During 2003, the Mozilla project transitioned from company-promoted (sponsored by AOL) to community-promoted (sponsored by the Mozilla Foundation). 
What happened to the group of developers during this transition? There was any significant impact on its activity or composition? To answer these questions, we have performed an analysis of the CVS repository of Mozilla, using the CVSAnalY tool, finding little on activity, but dramatic changes in the composition of the development team.}, keywords = {cvs, cvsanaly, developers, mining challenge, mozilla, msr challenge, revision history}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.15}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300028.pdf}, author = {Jesus M. Gonzalez-Barahona and Gregorio Robles and Herraiz, Israel} } @article {125, title = {Membership herding and network stability in the open source community: The Ising perspective}, journal = {Management Science}, volume = {53}, number = {7}, year = {2007}, month = {Jul}, pages = {1086-1101}, abstract = {The aim of this paper is twofold: (1) to conceptually understand membership dynamics in the open source software (OSS) community, and (2) to explore how different network characteristics (i.e., network size and connectivity) influence the stability of an OSS network. Through the lens of Ising theory, which is widely accepted in physics, we investigate basic patterns of interaction and present fresh conceptual insight into dynamic and reciprocal relations among OSS community members. We also perform computer simulations based on empirical data collected from two actual OSS communities.
Key findings include: (1) membership herding is highly present when external influences (e.g., the availability of other OSS projects) are weak, but decreases significantly when external influences increase, (2) propensity for membership herding is most likely to be seen in a large network with random connectivity, and (3) for large networks, when external influences are weak, random connectivity will result in higher network strength than scale-free connectivity (as external influences increase, however, the reverse phenomenon is observed). In addition, scale-free connectivity appears to be less volatile than random connectivity in response to an increase in the strength of external influences. We conclude with several implications that may be of significance to OSS stakeholders in particular, and to a broader range of online communities in general.}, keywords = {BEHAVIOR, DYNAMICS, ECONOMICS, INNOVATION, INVESTMENT, Ising theory, membership herding, MODEL, MOTIVATION, network connectivity, network stability, open source, PHASE-TRANSITION, UNCERTAINTY}, issn = {0025-1909}, author = {Oh, Wonseok and Jeon, Sangyong} } @conference {997, title = {Mining CVS Repositories to Understand Open-Source Project Developer Roles}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {8 - 8}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {This paper presents a model to represent the interactions of distributed open-source software developers and utilizes data mining techniques to derive developer roles.
The model is then applied on case studies of two open-source projects, ORAC-DR and Mediawiki with encouraging results.}, keywords = {cvs, developer interaction, developers, mediawiki, orac-dr, roles, scm, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.19}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300008.pdf}, author = {Yu, Liguo and Ramaswamy, Srini} } @conference {1015, title = {Mining Eclipse Developer Contributions via Author-Topic Models}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {30 - 30}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {We present the results of applying statistical author-topic models to a subset of the Eclipse 3.0 source code consisting of 2,119 source files and 700,000 lines of code from 59 developers. This technique provides an intuitive and automated framework with which to mine developer contributions and competencies from a given code base while simultaneously extracting software function in the form of topics. 
In addition to serving as a convenient summary for program function and developer activities, our study shows that topic models provide a meaningful, effective, and statistical basis for developer similarity analysis.}, keywords = {contributions, developers, eclipse, expertise, mining challenge, msr challenge, source code, topics}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.20}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300030.pdf}, author = {Linstead, Erik and Rigor, Paul and Bajracharya, Sushil and Lopes, Cristina and Baldi, Pierre} } @conference {999, title = {Mining Software Repositories with iSPARQL and a Software Evolution Ontology}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {10 - 10}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {One of the most important decisions researchers face when analyzing the evolution of software systems is the choice of a proper data analysis/exchange format. Most existing formats have to be processed with special programs written specifically for that purpose and are not easily extendible. Most scientists, therefore, use their own database(s) requiring each of them to repeat the work of writing the import/export programs to their format. We present EvoOnt, a software repository data exchange format based on the Web Ontology Language (OWL). EvoOnt includes software, release, and bug-related information. Since OWL describes the semantics of the data, EvoOnt is (1) easily extendible, (2) comes with many existing tools, and (3) allows to derive assertions through its inherent Description Logic reasoning capabilities. The paper also shows iSPARQL -- our SPARQL-based Semantic Web query engine containing similarity joins.
Together with EvoOnt, iSPARQL can accomplish a sizable number of tasks sought in software repository mining projects, such as an assessment of the amount of change between versions or the detection of bad code smells. To illustrate the usefulness of EvoOnt (and iSPARQL), we perform a series of experiments with a real-world Java project. These show that a number of software analyses can be reduced to simple iSPARQL queries on an EvoOnt dataset.}, keywords = {database, eclipse, evoont, java, owl, semantic, sparql}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.21}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300010.pdf}, author = {Kiefer, Christoph and Bernstein, Abraham and Tappolet, Jonas} } @conference {1000, title = {Mining Workspace Updates in CVS}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {11 - 11}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {The version control archive CVS records not only all changes in a project but also activity data such as when developers create or update their workspaces. Furthermore, CVS records when it has to integrate changes because of parallel development. In this paper, we analyze the CVS activity data of four large open-source projects GCC, JBOSS, JEDIT, and PYTHON to investigate parallel development: What is the degree of parallel development? How frequently do conflicts occur during updates and how are they resolved? 
How do we identify changes that contain integrations?}, keywords = {change management, cvs, developers, gcc, jboss, jedit, python, workspaces}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.22}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300011.pdf}, author = {Zimmermann, Thomas} } @article {springerlink:10.1007/s10368-007-0086-4, title = {Open source software: Motivation and restrictive licensing}, journal = {International Economics and Economic Policy}, volume = {4}, year = {2007}, note = {"We employ a unique data set consisting of 71 open source projects hosted at the SourceForge web site. The 71 projects in the sample were chosen (in January 2000)" "This sample was observed over an 18-month period from January 2002 through the middle of 2003, with data collected at 2-month intervals." "We are grateful to NERA for providing us with the data." "Although we only have data on a relatively small sample of the projects hosted SourceForge, the sample is unique because of data on lines of code as well as data on different versions of the program. The latter is a potentially important control variable, since a change in version may necessitate additional lines of code. Our data set contains information on the size of the open source projects in the form of source lines of code (SLOC). Using SLOC as a performance measure is not always ideal; nevertheless, this performance measure is employed in the profession and the literature.15 For our purposes, SLOC is in fact an ideal measure, because we want to measure the effort that is put into the project, rather than whether a project succeeds." }, pages = {209-225}, publisher = {Springer Berlin / Heidelberg}, abstract = {Open source software (OSS) is an economic paradox. Development of open source software is often done by unpaid volunteers and the source code is typically freely available. 
Surveys suggest that status, signaling, and intrinsic motivations play an important role in inducing developers to invest effort. Contribution to an OSS project is rewarded by adding one{\textquoteright}s name to the list of contributors which is publicly observable. Such incentives imply that programmers may have little incentive to contribute beyond the threshold level required for being listed as a contributor. Using a unique data set we empirically examine this hypothesis. We find that the output per contributor in open source projects is much higher when licenses are less restrictive and more commercially oriented. These results indeed suggest a status, signaling, or intrinsic motivation for participation in OSS projects with restrictive licenses.}, keywords = {contributions, contributors, developers, incentives, license analysis, licenses, lines of code, loc, MOTIVATION, restrictive, scm, size, status, version history}, issn = {1612-4804}, url = {http://dx.doi.org/10.1007/s10368-007-0086-4}, author = {Fershtman, Chaim and Gandal, Neil} } @conference {1016, title = {Predicting Defects and Changes with Import Relations}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {31 - 31}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Lowering the number of defects and estimating the development time of a software project are two important goals of software engineering. To predict the number of defects and changes we train models with import relations. 
This enables us to decrease the number of defects by more efficient testing and to assess the effort needed in respect to the number of changes.}, keywords = {defects, eclipse, effort estimation, mining challenge, msr challenge, prediction}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.24}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300031.pdf}, author = {Schroter, Adrian} } @conference {1214, title = {A Preliminary Analysis of Publicly Available FLOSS Measurements: Towards Discovering Maintainability Trends}, booktitle = {2nd Workshop on Public Data about Software Development (WoPDaSD 2007)}, year = {2007}, note = {used SourceKibitzer data downloaded from FLOSSmole}, month = {2007}, abstract = {The spread of free/libre/open source software (FLOSS) and the openness of its development model offer researchers a valuable source of information regarding software data. The creation of large portals, which host a vast amount of FLOSS projects make it easy to create large datasets with valuable information regarding the FLOSS development process. In addition initiatives such as FLOSSMole provide researchers with a single point and continuing access to those data. Up to now the majority of datasets from FLOSSMole offered data regarding the development process and not the code itself. From February 2007 FLOSSMole offers data donated by SourceKibitzer, which contain source code metrics for FLOSS projects written in Java. In this paper we provide a preliminary analysis on those data using machine learning techniques, such as classification rules and decision trees. Using the first available data from February 2007, we tried to build rules that can be used in order to estimate the future values of metrics offered for March. Here we present some preliminary results that are encouraging and deserve to be further analyzed in future releases of SourceKibitzer datasets. 
}, keywords = {decision tree, flossmole, java, machine learning, metrics, sourcekibitzer}, attachments = {https://flosshub.org/sites/flosshub.org/files/Samolades2007.pdf}, author = {Samoladas, Ioannis and Bibi, Stamatia and Ioannis Stamelos and Sowe, Sulayman K. and Deligiannis, Ignatios} } @conference {994, title = {Recommending Emergent Teams}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {5 - 5}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {To build successful complex software systems, developers must collaborate with each other to solve issues. To facilitate this collaboration, specialized tools, such as chat and screen sharing, are being integrated into development environments. Currently, these tools require a developer to maintain a list of other developers with whom they may wish to communicate and to determine who within this list has expertise for a specific situation. For large, dynamic projects, like several successful open-source projects, these requirements place an unreasonable burden on the developer. In this paper, we show how the structure of a team emerges from how developers change software artifacts. We introduce the Emergent Expertise Locator (EEL) that uses emergent team information to propose experts to a developer within their development environment as the developer works. 
We found that EEL produces, on average, results with higher precision and higher recall than an existing heuristic for expertise recommendation.}, keywords = {bugzilla, developers, eclipse, evolution, expertise, Firefox, teams}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.27}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300005.pdf}, author = {Minto, Shawn and Murphy, Gail C.} } @article {flosswp405, title = {Self-organization of teams for free/libre open source software development}, journal = {Information and Software Technology Journal}, volume = {49}, number = {564-575}, year = {2007}, note = {"First, the data from these projects that we needed for analysis had to be publicly available (ruling out projects that limit access to their email lists or trackers). Second, we chose the projects that had more than 7 members"..."projects that have attracted numerous developers beyond the initial project founders, are continuing to release software, have numerous downloads and have an active user community that provides feedback" "3 FLOSS projects were selected for analysis, namely Gaim, eGroupWare and Compiere ERP." all had sourceforge hosting Data: "The primary data used for our study were interactions on the main developer communication forum, either a developer mailing list or web-based discussion forum." Analysis: "For this project, we inductively content-analyzed developer email interactions to identify the task assignment mechanisms used in the process. We coded each instance of task assignment identified on three dimensions: who assigned the task, to whom, and how" }, abstract = {This paper provides empirical evidence about how free/libre open source software development teams self-organize their work. 
Following a case study methodology, we examined developer interaction data from three active and successful FLOSS projects using qualitative research methods, specifically inductive content analysis, to identify the task-assignment mechanisms used by the participants. We found that "self-assignment" was the most common mechanism across three FLOSS projects. This mechanism is consistent with expectations for distributed and largely volunteer teams. We conclude by discussing whether these emergent practices can be usefully transferred to mainstream practice and indicating directions for future research.}, keywords = {case study, compiere, coordination, DESIGN, distributed teams, egroupware, email, email archives, forum, free/libre open source software development, gaim, INTERNET, mailing list, metadata, qualitative research methods, self-organizing teams, sourceforge, SYSTEMS, task assignment, WORK}, attachments = {https://flosshub.org/sites/flosshub.org/files/task_assignment_final.pdf}, author = {Kevin Crowston and Li, Qing and Kangning Wei and Eseryel, U. Yeliz and Howison, James} } @conference {1212, title = {Studying Production Phase SourceForge Projects: An Exploratory Analysis Using cvs2mysql and SFRA}, booktitle = {2nd Workshop on Public Data about Software Development (WoPDaSD 2007)}, year = {2007}, month = {2007}, abstract = {A wealth of data can be extracted from the natural by-products of software development processes and used in empirical studies of software engineering. However, the size and accuracy of such studies depend in large part on the availability of tools that facilitate the collection of data from individual projects and the combination of data from multiple projects. To demonstrate this point, we present our experience gathering and analyzing data from nearly 10,000 open source projects hosted on SourceForge. We describe the tools we developed to collect the data and the ways in which these tools and data may be used by other researchers. 
We also provide examples of statistics that we have calculated from these data to describe interesting author- and project-level behaviors of the SourceForge community.}, keywords = {Data Collection, forge, repositories, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/Delorey2007c.pdf}, author = {Delorey, Daniel P. and Knutson, Charles D. and MacLean, Alexander C.} } @conference {1009, title = {Using Software Distributions to Understand the Relationship among Free and Open Source Software Projects}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {24 - 24}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Success in the open source software world has been measured in terms of metrics such as number of downloads, number of commits, number of lines of code, number of participants, etc. These metrics tend to discriminate towards applications that are small and tend to evolve slowly. A problem is, however, how to identify applications in these latter categories that are important. Software distributions specify the dependencies needed to build and to run a given software application. We use this information to create a dependency graph of the applications contained in such a distribution. We explore the characteristics of this graph, and use it to define some metrics to quantify the dependencies (and dependents) of a given software application. We demonstrate that some applications that are invisible to the final user (such as libraries) are widely used by end-user applications. This graph can be used as a proxy to measure success of small, slowly evolving free and open source software.}, keywords = {dependencies, evolution, fink, metrics}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.32}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300024.pdf}, author = {Daniel M. 
German} } @conference {1010, title = {Using Software Repositories to Investigate Socio-technical Congruence in Development Projects}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {25 - 25}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {We propose a quantitative measure of socio-technical congruence as an indicator of the performance of an organization in carrying out a software development project. We show how the information necessary to implement that measure can be mined from commonly used software repositories, and we describe how socio-technical congruence can be computed based on that information.}, keywords = {developers, graph, scm, social networks, source code}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.33}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300025.pdf}, author = {Valetto, Giuseppe and Helander, Mary and Ehrlich, Kate and Chulani, Sunita and Wegman, Mark and Williams, Clay} } @conference {998, title = {Visual Data Mining in Software Archives to Detect How Developers Work Together}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {9 - 9}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Analyzing the check-in information of open source software projects which use a version control system such as CVS or SUBVERSION can yield interesting and important insights into the programming behavior of developers. As in every major project tasks are assigned to many developers, the development must be coordinated between these programmers. This paper describes three visualization techniques that help to examine how programmers work together, e.g. 
if they work as a team or if they develop their part of the software separate from each other. Furthermore, phases of stagnation in the lifetime of a project can be uncovered and thus, possible problems are revealed. To demonstrate the usefulness of these visualization techniques we performed case studies on two open source projects. In these studies interesting patterns of developers{\textquoteright} behavior, e.g. the specialization on a certain module can be observed. Moreover, modules that have been changed by many developers can be identified as well as such ones that have been altered by only one programmer.}, keywords = {change, coordination, cvs, developers, junit, modules, scm, source code, svn, teams, tomcat, visualization}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.34}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300009.pdf}, author = {Weissgerber, Peter and Pohl, Mathias and Burch, Michael} } @conference {1008, title = {What Can OSS Mailing Lists Tell Us? A Preliminary Psychometric Text Analysis of the Apache Developer Mailing List}, booktitle = {Fourth International Workshop on Mining Software Repositories (MSR{\textquoteright}07:ICSE Workshops 2007)}, year = {2007}, pages = {23 - 23}, publisher = {IEEE}, organization = {IEEE}, address = {Minneapolis, MN, USA}, abstract = {Developer mailing lists are a rich source of information about Open Source Software (OSS) development. The unstructured nature of email makes extracting information difficult. We use a psychometrically-based linguistic analysis tool, the LIWC, to examine the Apache httpd server developer mailing list. We conduct three preliminary experiments to assess the appropriateness of this tool for information extraction from mailing lists. First, using LIWC dimensions that are correlated with the big five personality traits, we assess the personality of four top developers against a baseline for the entire mailing list. 
The two developers that were responsible for the major Apache releases had similar personalities. Their personalities were different from the baseline and the other developers. Second, the first and last 50 emails for two top developers who have left the project are examined. The analysis shows promise in understanding why developers join and leave a project. Third, we examine word usage on the mailing list for two major Apache releases. The differences may reflect the relative success of each release.}, keywords = {apache, developers, email, joining, liwc, mailing lists, personality}, isbn = {0-7695-2950-X}, doi = {10.1109/MSR.2007.35}, attachments = {https://flosshub.org/sites/flosshub.org/files/28300023.pdf}, author = {Peter C. Rigby and Hassan, Ahmed E.} } @conference {1213, title = {Working with Open Source Development Data: Considerations triggered by a study of bug scenarios}, booktitle = {2nd Workshop on Public Data about Software Development (WoPDaSD 2007)}, year = {2007}, month = {2007}, abstract = {The retrieval and preparation of public data on software development calls for more than just technical skills. In addition, care and judgement are needed to avoid disproportionate costs to the providers of data or unnecessary embarrassment to the participants tracked in the data. Taking the extraction of bug scenarios as a use case, we illustrate these concerns and discuss how they could be translated into social requirements that would help to make retrieval and preparation a sustainable exercise. In particular, we call for more efforts to establish institutional repositories of public data on software development and, besides, we suggest that reviewers could play a role in making sure that empirical research is performed in a way that does not bring the long-term relationship between software developers and researchers in jeopardy. 
}, keywords = {bug reports, bug scenarios, Data Collection}, attachments = {https://flosshub.org/sites/flosshub.org/files/denBesten-wopdasd.pdf}, author = {den Besten, Matthijs and Masmoudi, H{\'e}la and Jean-Michel Dalle} } @conference {Tsunoda:2006:AOD:1137983.1138031, title = {Analyzing OSS developers{\textquoteright} working time using mailing lists archives}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {181{\textendash}182}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Our mining question is {\textquotedblleft}when OSS developers work?{\textquotedblright} OSS developers{\textquoteright} working time may be a good indicator to understand the development style of a project. (For example, if many developers work in office hour, these might be daily works in a company.)}, keywords = {developers, email, email archives, mailing lists, mining challenge, msr challenge, overtime work, postgresql, workload}, isbn = {1-59593-397-2}, doi = {http://doi.acm.org/10.1145/1137983.1138031}, url = {http://doi.acm.org/10.1145/1137983.1138031}, attachments = {https://flosshub.org/sites/flosshub.org/files/181AnalyzingOSS.pdf}, author = {Tsunoda, Masateru and Monden, Akito and Kakimoto, Takeshi and Kamei, Yasutaka and Matsumoto, Ken-ichi} } @conference {D{\textquoteright}Ambros:2006:AER:1137983.1138029, title = {Applying the evolution radar to PostgreSQL}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {177{\textendash}178}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, keywords = {cvs, documentation, evolution, evolution radar, logical coupling, makefile, mining challenge, msr challenge, postgresql, re-engineering, refactoring, release history, rhdb, source code, version control, visualization}, isbn = 
{1-59593-397-2}, doi = {http://doi.acm.org/10.1145/1137983.1138029}, url = {http://doi.acm.org/10.1145/1137983.1138029}, attachments = {https://flosshub.org/sites/flosshub.org/files/177ApplyingEvolution.pdf}, author = {D{\textquoteright}Ambros, Marco and Lanza, Michele} } @conference {683, title = {Communication Networks in an Open Source Software Project}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing }, volume = {203/2006}, year = {2006}, month = {2006///}, pages = {297 - 306}, publisher = {Springer}, organization = {Springer}, abstract = {This study explores the nature of the social network and the patterns of communication that exist in an open source software development project, the Apache HTTP (WEB) server project. Our analysis of archival data on email communications between developers in the Apache HTTP server project suggests an interesting pattern of communication. We find that the core developers self-organize into three sub-groups that communicate intensely in completing the project. Our analysis also reveals that a few prominent developers who are centrally located in the network are driving communications within the project. We identify the implications of our findings and suggest areas for further research. 
}, keywords = {apache, core, developers, email, email archive, mailing list, participation, social network analysis}, issn = {978-0-387-34225-2}, doi = {http://dx.doi.org/10.1007/0-387-34226-5_30}, attachments = {https://flosshub.org/sites/flosshub.org/files/Communication\%20Networks\%20in\%20an\%20Open\%20Source.pdf}, author = {Roberts, Jeffrey and Il-Horn Hann and Sandra Slaughter} } @conference {686, title = {Contributor Turnover in Libre Software Projects}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing}, year = {2006}, pages = {273 - 286}, publisher = {Springer}, organization = {Springer}, abstract = {A common problem that management faces in software companies is the high instability of their staff. In libre (free, open source) software projects, the permanence of developers is also an open issue, with the potential of causing problems amplified by the self-organizing nature that most of them exhibit. Hence, human resources in libre software projects are even more difficult to manage: developers are in most cases not bound by a contract and, in addition, there is not a real management structure concerned about this problem. This raises some interesting questions with respect to the composition of development teams in libre software projects, and how they evolve over time. There are projects lead by their original founders (some sort of {\textquotedblleft}code gods{\textquotedblright}), while others are driven by several different developer groups over time (i.e. the project {\textquotedblleft}regenerates{\textquotedblright} itself). In this paper, we propose a quantitative methodology, based on the analysis of the activity in the source code management repositories, to study how these processes (developers leaving, developers joining) affect libre software projects. 
The basis of it is the analysis of the composition of the core group, the group of developers most active in a project, for several time lapses. We will apply this methodology to several large, well-known libre software projects, and show how it can be used to characterize them. In addition, we will discuss the lessons that can be learned, and the validity of our proposal. }, keywords = {apache, committers, core, cvs, cvsanaly, developers, evolution, freebsd, gimp, gnome, kde, mono, mozilla}, doi = {http://dx.doi.org/10.1007/0-387-34226-5_28}, attachments = {https://flosshub.org/sites/flosshub.org/files/Contributor\%20Turnover\%20in\%20Libre\%20Software\%20Projects.pdf}, author = {Gregorio Robles and Gonzalez-Barahona, Jesus} } @article {90, title = {Core and periphery in Free/Libre and Open Source software team communications}, journal = {Proceedings of the 39th Annual Hawaii International Conference on System Sciences-Volume 06}, year = {2006}, abstract = {The concept of the core group of developers is important and often discussed in empirical studies of FLOSS projects. This paper examines the question, "how does one empirically distinguish the core?" Being able to identify the core members of a FLOSS development project is important because many of the processes necessary for successful projects likely involve core members differently than peripheral members, so analyses that mix the two groups will likely yield invalid results. We compare 3 analysis approaches to identify the core: the named list of developers, a Bradford{\textquoteright}s law analysis that takes as the core the most frequent contributors and a social network analysis of the interaction pattern that identifies the core in a core-and-periphery structure. We apply these measures to the interactions around bug fixing for 116 SourceForge projects. 
The 3 techniques identify different individuals as core members; examination of which individuals are identified leads to suggestions for refining the measures. All 3 measures though suggest that the core of FLOSS projects is a small fraction of the total number of contributors.}, keywords = {bug fixing, contributions, contributors, core, developers, social network analysis, sourceforge, team}, author = {Kevin Crowston and Kangning Wei and Li, Qing and Howison, James} } @conference {Amor:2006:EEC:1139113.1139116, title = {Effort Estimation by Characterizing Developer Activity}, booktitle = {Proceedings of the 2006 International Workshop on Economics Driven Software Engineering Research}, series = {EDSER {\textquoteright}06}, year = {2006}, pages = {3{\textendash}6}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {During the latest years libre (free, open source) software has gained a lot of attention from the industry. Following this interest, the research community is also studying it. For instance, many teams are performing quantitative analysis on the large quantity of data which is publicly available from the development repositories maintained by libre software projects. However, not much of this research is focused on cost or effort estimations, despite its importance (for instance, for companies developing libre software or collaborating with libre software projects), and the availability of some data which could be useful for this purpose. Our position is that classical effort estimation models can be improved from the study of these data, at least when applied to libre software. In this paper, we focus on the characterization of developer activity, which we argue can improve effort estimation. This activity can be traced with a lot of detail, and the resulting data can also be used for validation of any effort estimation model. 
}, keywords = {developer characterization, effort estimation, mining software repositories, open source software, software economics}, isbn = {1-59593-396-4}, doi = {10.1145/1139113.1139116}, url = {http://doi.acm.org/10.1145/1139113.1139116}, author = {Amor, Juan Jose and Gregorio Robles and Jesus M. Gonzalez-Barahona} } @conference {689, title = {Evolution of Open Source Communities}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing}, year = {2006}, note = {"we took snapshots of its membership at regular intervals" "we chose a one year period" "we retrieve the list of core developers ordered by their number of inbound messages, as noted above."}, pages = {21 - 32}, publisher = {Springer}, organization = {Springer}, abstract = {The goal of this paper is to document the evolution of a portfolio of related open source communities over time. As a case study, we explore the subprojects of the Apache project, one of the largest and most visible open source projects. We extract the community structure from the mailing list data, and study how the subcommunities evolve, and are interrelated over time. Our analysis leads us to propose the following hypotheses about the growth of open source communities: (1) communities add new developers by a process of preferential attachment; (2) links between existing communities are also subject to preferential attachment; (3) developers will migrate between communities together with other collaborators; and (4) information flow follows project dependencies. In particular, we are concerned with the underlying factors that motivate the migration between communities, such as information flow, co-worker ties, and project dependencies. 
}, keywords = {apache, COMMUNITY, core, developers, email, email archives, mailing list, membership}, doi = {http://dx.doi.org/10.1007/0-387-34226-5_3}, attachments = {https://flosshub.org/sites/flosshub.org/files/Evolution\%20of\%20Open\%20Source\%20Communities.pdf}, author = {Weiss, Michael and Moroiu, Gabriella and Zhao, Ping} } @conference {41, title = {The FLOSSWALD information system on free and open source software}, booktitle = {9th International Workshop on Learning Software Organizations}, year = {2006}, note = {"Using these [FLOSSmole] data we intend to map the respective projects to their developed software and thus extend the already existing cases with new attributes or create new we cases where necessary." "Our first step will be to evaluate the data provided by the Debian project and the FLOSSmole project and design a knowledge base and case structure to flexibly work with them."}, month = {10/2006}, abstract = {We propose the implementation of an intelligent information system on free and open source software. This system will consist of a case-based reasoning (CBR) system and several machine learning modules to maintain the knowledge base and train the CBR system thus enhancing its performance. Our knowledge base will include data on free and open source software provided by the Debian project, the FLOSSmole project, and other public free and open source software directories. We plan to enrich these data by learning additional information such as concepts and different similarities. With this knowledge base, we hope to be able to create an information system that will be capable of answering queries based on precise as well as vague criteria and give intelligent recommendations on software based on the preferences of the user.}, keywords = {debian, flossmole}, attachments = {https://flosshub.org/sites/flosshub.org/files/flosswald.pdf}, author = {Reichle, M. 
and Hanft, A.} } @conference {Robles:2006:GLD:1137983.1138017, title = {Geographic location of developers at SourceForge}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {144--150}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {The development of libre (free/open source) software is usually performed by geographically distributed teams. Participation in most cases is voluntary, sometimes sporadic, and often not framed by a pre-defined management structure. This means that anybody can contribute, and in principle no national origin has advantages over others, except for the differences in availability and quality of Internet connections and language. However, differences in participation across regions do exist, although there are little studies about them. In this paper we present some data which can be the basis for some of those studies. We have taken the database of users registered at SourceForge, the largest libre software development web-based platform, and have inferred their geographical locations. For this, we have applied several techniques and heuristics on the available data (mainly e-mail addresses and time zones), which are presented and discussed in detail. The results show a snapshot of the regional distribution of SourceForge users, which may be a good proxy of the actual distribution of libre software developers. 
In addition, the methodology may be of interest for similar studies in other domains, when the available data is similar (as is the case of mailing lists related to software projects).}, keywords = {distributed, email, email address, free software, geographical location, geography, libre software, mining software repositories, open source software, sourceforge, timezone}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138017}, url = {http://doi.acm.org/10.1145/1137983.1138017}, attachments = {https://flosshub.org/sites/flosshub.org/files/144GeographicLocation.pdf}, author = {Gregorio Robles and Jesus M. Gonzalez-Barahona} } @article {flosswp325, title = {Identifying Knowledge Brokers that Yield Software Engineering Knowledge in OSS Projects}, journal = {Information and Software Technology}, volume = {48}, number = {11}, year = {2006}, note = {Uses the Debian mailing lists "kde", "mentor", and "user". the collection period was from January 2001 to September 2004}, month = {11/2006}, pages = {1025--1033}, abstract = {Much research on open source software development concentrates on developer lists and other software repositories to investigate what motivates professional software developers to participate in open source software projects. Little attention has been paid to individuals who spend valuable time in lists helping participants on some mundane yet vital project activities. Using three Debian lists as a case study we investigate the impact of knowledge brokers and their associated activities in open source projects. Social network analysis was used to visualize how participants are affiliated with the lists. The network topology reveals substantial community participation. The consequence of collaborating in mundane activities for the success of open source software projects is discussed. 
The direct beneficiaries of this research are in the identification of knowledge experts in open source software projects.}, keywords = {debian, email, email archives, expertise, knowledge sharing, mailing list, project success, social network analysis}, doi = {10.1016/j.infsof.2005.12.019}, attachments = {https://flosshub.org/sites/flosshub.org/files/IST-Vol-48-11-2006.pdf}, author = {Sowe, Sulayman K. and Ioannis Stamelos and Lefteris Angelis} } @conference {693, title = {Impact of Social Ties on Open Source Project Team Formation}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing}, volume = {203}, year = {2006}, note = {"we randomly selected 1030 new projects that were registered between X and X in 2005. A web crawler downloaded the HTML files containing project summary data and developer information on the date of registration."}, pages = {307--317}, publisher = {Springer}, organization = {Springer}, abstract = {In this paper, we empirically examined the role of social ties in OSSD team formation and developer joining behavior. We find that the existence and the amount of prior social relations in the network do increase the probability of an OSS project to attract more developers. Interestingly, for projects without preexisting social ties, developers tend to join the project initiated by people with less OSSD experience. This research fills a gap in the open source literature by conducting an empirical investigation of the role of social relations on project team formation behavior. Furthermore, the adoption of social network analysis, which has received little attention in the OSS literature, can yield some interesting results on the interactions among OSS developers. 
}, keywords = {developers, metadata, social network analysis, sourceforge}, isbn = {978-0-387-34225-2}, doi = {10.1007/0-387-34226-5_31}, attachments = {https://flosshub.org/sites/flosshub.org/files/Impact\%20of\%20Social\%20Ties\%20on\%20Open\%20Source\%20Project.pdf}, author = {Hahn, Jungpil and Moon, Jae and Zhang, Chen} } @article {Crowston:2006, title = {Information systems success in Free and Open Source Software development: Theory and measures}, journal = {Software Process: Improvement and Practice}, volume = {11}, number = {2}, year = {2006}, note = {"we continue our examination of success measures using data from Sourceforge..." "e chose the number of developers (assessed from the records of the project and from bug fixing logs), bug-fixing time, and popularity (assessed from the number of downloads and viewings of project Web pages, and inclusion in distributions). These measures were chosen because they span the reconsidered FLOSS development process discussed above, including inputs (number of developers), process (speed of bug fixing) and output (popularity)."}, pages = {123--148}, abstract = {Information systems success is one of the most widely used dependent variables in information systems (IS) research, but research on Free/Libre and Open Source software (FLOSS) often fails to appropriately conceptualize this important concept. In this paper, we reconsider what success means within a FLOSS context. We first review existing models of IS success and success variables used in FLOSS research and assess them for their usefulness, practicality and fit to the FLOSS context. Then, drawing on a theoretical model of group effectiveness in the FLOSS development process, as well as an online discussion group with developers, we present additional concepts that are central to an appropriate understanding of success for FLOSS. 
In order to examine the practicality and validity of this conceptual scheme, the second half of our paper presents an empirical study that demonstrates its operationalization of the chosen measures and assesses their internal validity. We use data from SourceForge to measure the project{\textquoteright}s effectiveness in team building, the speed of the project at responding to bug reports and the project{\textquoteright}s popularity. We conclude by discussing the implications of this study for our proposed extension of IS success in the context of FLOSS development and highlight future directions for research.}, keywords = {bug fixing, developers, downloads, FLOSS, flossmole, page views, popularity, project success, size, sourceforge, success, team size}, doi = {10.1002/spip.259}, attachments = {https://flosshub.org/sites/flosshub.org/files/CrowstonHowisonAnnabi2006.pdf}, author = {Kevin Crowston and Howison, James and Hala Annabi} } @conference {697, title = {Life cycle of Defects in Open Source Software Projects}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing}, year = {2006}, note = {"we collected and analyzed defects of the apache http server and mozilla firefox" "all the defects reported between X and X were analyzed"}, pages = {195--200}, publisher = {Springer}, organization = {Springer}, abstract = {We studied the maintenance process from the viewpoint of defect management and the defect life cycle. First, we outline a model for the defect life cycle based on ISO/IEC standards, the Framework for Open Source maintenance process, and the Bugzilla defect management system. Thereafter, we analyze defects from two Open Source software projects. The aim of the study was support the maintenance reliability. However, we found that most of the defects did not follow the life-cycle model. Defects were usually directly resolved from initial state without being assigned. 
}, keywords = {apache, defects, Firefox, mozilla}, doi = {10.1007/0-387-34226-5_19}, attachments = {https://flosshub.org/sites/flosshub.org/files/Life\%20cycle\%20of\%20Defects\%20in\%20OSS\%20Projects.pdf}, author = {Koponen, Timo} } @article {Grewal:2006:LLL:1246148.1246155, title = {Location, Location, Location: How Network Embeddedness Affects Project Success in Open Source Systems}, journal = {Management Science}, volume = {52}, number = {7}, year = {2006}, month = {July}, pages = {1043--1056}, publisher = {INFORMS}, address = {Institute for Operations Research and the Management Sciences (INFORMS), Linthicum, Maryland, USA}, abstract = {The community-based model for software development in open source environments is becoming a viable alternative to traditional firm-based models. To better understand the workings of open source environments, we examine the effects of network embeddedness---or the nature of the relationship among projects and developers---on the success of open source projects. We find that considerable heterogeneity exists in the network embeddedness of open source projects and project managers. We use a visual representation of the affiliation network of projects and developers as well as a formal statistical analysis to demonstrate this heterogeneity and to investigate how these structures differ across projects and project managers. Our main results surround the effect of this differential network embeddedness on project success. We find that network embeddedness has strong and significant effects on both technical and commercial success, but that those effects are quite complex. We use latent class regression analysis to show that multiple regimes exist and that some of the effects of network embeddedness are positive under some regimes and negative under others. We use project age and number of page views to provide insights into the direction of the effect of network embeddedness on project success. 
Our findings show that different aspects of network embeddedness have powerful but subtle effects on project success and suggest that this is a rich environment for further study.}, keywords = {affiliation network, age, developers, latent class analysis, network embeddedness, open source software, page views, perl, project success, registration, sourceforge}, issn = {0025-1909}, doi = {10.1287/mnsc.1060.0550}, url = {http://portal.acm.org/citation.cfm?id=1246148.1246155}, author = {Grewal, Rajdeep and Lilien, Gary L. and Mallapragada, Girish} } @article {Yu:2006:MKO:1150566.1150571, title = {Maintainability of the kernels of open-source operating systems: A comparison of Linux with FreeBSD, NetBSD, and OpenBSD}, journal = {Journal of Systems and Software}, volume = {79}, year = {2006}, note = {"Data regarding the number and total number of lines of code of kernel and nonkernel modules in the four operating systems are provided in Table 1" loc, kloc, number of kernel modules, number of nonkernel modules size c files .h files}, month = {June}, pages = {807--815}, publisher = {Elsevier Science Inc.}, address = {New York, NY, USA}, abstract = {We compared and contrasted the maintainability of four open-source operating systems: Linux, FreeBSD, NetBSD, and OpenBSD. We used our categorization of common coupling in kernel-based software to highlight future maintenance problems. An unsafe definition is a definition of a global variable that can affect a kernel module if that definition is changed. For each operating system we determined a number of measures, including the number of global variables, the number of instances of global variables in the kernel and overall, as well as the number of unsafe definitions in the kernel and overall. We also computed the value of each our measures per kernel KLOC and per KLOC overall. For every measure and every ratio, Linux compared unfavorably with FreeBSD, NetBSD, and OpenBSD. 
Accordingly, we are concerned about the future maintainability of Linux. }, keywords = {abiword, Common coupling, coupling, Definition-use analysis, freebsd, kernel, lines of code, linux, linux kernel, loc, Maintainability, modules, netbsd, Open-source software, openbsd, source code}, issn = {0164-1212}, doi = {10.1016/j.jss.2005.08.014}, url = {http://dx.doi.org/10.1016/j.jss.2005.08.014}, attachments = {https://flosshub.org/sites/flosshub.org/files/YuSchachChen.pdf}, author = {Yu, Liguo and Schach, Stephen R. and Chen, Kai and Heller, Gillian Z. and Offutt, Jeff} } @conference {Xie:2006:MMA:1137983.1137997, title = {MAPO: mining API usages from open source repositories}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {54--57}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {To improve software productivity, when constructing new software systems, developers often reuse existing class libraries or frameworks by invoking their APIs. Those APIs, however, are often complex and not well documented, posing barriers for developers to use them in new client code. To get familiar with how those APIs are used, developers may search the Web using a general search engine to find relevant documents or code examples. Developers can also use a source code search engine to search open source repositories for source files that use the same APIs. Nevertheless, the number of returned source files is often large. It is difficult for developers to learn API usages from a large number of returned results. In order to help developers understand API usages and write API client code more effectively, we have developed an API usage mining framework and its supporting tool called MAPO (for Mining API usages from Open source repositories). 
Given a query that describes a method, class, or package for an API, MAPO leverages the existing source code search engines to gather relevant source files and conducts data mining. The mining leads to a short list of frequent API usages for developers to inspect. MAPO currently consists of five components: a code search engine, a source code analyzer, a sequence preprocessor, a frequent sequence miner, and a frequent sequence post processor. We have examined the effectiveness of MAPO using a set of various queries. The preliminary results show that the framework is practical for providing informative and succinct API usage patterns.}, keywords = {api, application programming interfaces, documentation, mining software repositories, pmd, program comprehension, search engine, sequences, source code, source code search engine}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1137997}, url = {http://doi.acm.org/10.1145/1137983.1137997}, attachments = {https://flosshub.org/sites/flosshub.org/files/54MAPO.pdf}, author = {Xie, Tao and Pei, Jian} } @conference {Kim:2006:MPE:1137983.1137995, title = {Micro pattern evolution}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {40--46}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {When analyzing the evolution history of a software project, we wish to develop results that generalize across projects. One approach is to analyze design patterns, permitting characteristics of the evolution to be associated with patterns, instead of source code. Traditional design patterns are generally not amenable to reliable automatic extraction from source code, yet automation is crucial for scalable evolution analysis. 
Instead, we analyze {\textquotedblleft}micro pattern{\textquotedblright} evolution; patterns whose abstraction level is closer to source code, and designed to be automatically extractable from Java source code or bytecode. We perform micro-pattern evolution analysis on three open source projects, ArgoUML, Columba, and jEdit to identify micro pattern frequencies, common kinds of pattern evolution, and bug-prone patterns. In all analyzed projects, we found that the micro patterns of Java classes do not change often. Common bug-prone pattern evolution kinds are {\textquoteleft}Pool {\textrightarrow} Pool{\textquoteright}, {\textquoteleft}Implementor {\textrightarrow} NONE{\textquoteright}, and {\textquoteleft}Sampler {\textrightarrow} Sampler{\textquoteright}. Among all pattern evolution kinds, {\textquoteleft}Box{\textquoteright}, {\textquoteleft}CompoundBox{\textquoteright}, {\textquoteleft}Pool{\textquoteright}, {\textquoteleft}CommonState{\textquoteright}, and {\textquoteleft}Outline{\textquoteright} micro patterns have high bug rates, but they have low frequencies and a small number of changes. The pattern evolution kinds that are bug-prone are somewhat similar across projects. The bug-prone pattern evolution kinds of two different periods of the same project are almost identical.}, keywords = {argouml, bugs, columba, design patterns, evolution, extraction, java, jedit, source code}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1137995}, url = {http://doi.acm.org/10.1145/1137983.1137995}, attachments = {https://flosshub.org/sites/flosshub.org/files/40MicroPattern.pdf}, author = {Kim, Sunghun and Pan, Kai and Whitehead, Jr., E. 
James} } @conference {Bird:2006:MES:1137983.1138016, title = {Mining email social networks}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {137--143}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Communication \& Co-ordination activities are central to large software projects, but are difficult to observe and study in traditional (closed-source, commercial) settings because of the prevalence of informal, direct communication modes. OSS projects, on the other hand, use the internet as the communication medium, and typically conduct discussions in an open, public manner. As a result, the email archives of OSS projects provide a useful trace of the communication and co-ordination activities of the participants. However, there are various challenges that must be addressed before this data can be effectively mined. Once this is done, we can construct social networks of email correspondents, and begin to address some interesting questions. These include questions relating to participation in the email; the social status of different types of OSS participants; the relationship of email activity and commit activity (in the CVS repositories) and the relationship of social status with commit activity. 
In this paper, we begin with a discussion of our infrastructure (including a novel use of Scientific Workflow software) and then discuss our approach to mining the email archives; and finally we present some preliminary results from our data analysis.}, keywords = {communication, contributions, developers, email, email archives, mailing lists, open source, social networks}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138016}, url = {http://doi.acm.org/10.1145/1137983.1138016}, attachments = {https://flosshub.org/sites/flosshub.org/files/137MiningEmail.pdf}, author = {Christian Bird and Gourley, Alex and Devanbu, Prem and Gertz, Michael and Swaminathan, Anand} } @conference {Bird:2006:MES:1137983.1138033, title = {Mining email social networks in Postgres}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {185--186}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Open Source Software (OSS) projects provide a unique opportunity to gather and analyze publicly available historical data. The Postgres SQL server, for example, has over seven years of recorded development and communication activity. We mined data from both the source code repository and the mailing list archives to examine the relationship between communication and development in Postgres. Along the way, we had to deal with the difficult challenge of resolving email aliases. We used a number of social network analysis measures and statistical techniques to analyze this data. 
We present our findings in this paper.}, keywords = {developers, email, email archives, open source, postgresql, scm, social network analysis, social networks, source code, status}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138033}, url = {http://doi.acm.org/10.1145/1137983.1138033}, attachments = {https://flosshub.org/sites/flosshub.org/files/185MiningEmail.pdf}, author = {Christian Bird and Gourley, Alex and Devanbu, Prem and Gertz, Michael and Swaminathan, Anand} } @conference {Robles:2006:MLS:1137983.1137986, title = {Mining large software compilations over time: another perspective of software evolution}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {3--9}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {With the success of libre (free, open source) software, a new type of software compilation has become increasingly common. Such compilations, often referred to as {\textquoteright}distributions{\textquoteright}, group hundreds, if not thousands, of software applications and libraries written by independent parties into an integrated system. Software compilations raise a number of questions that have not been targeted so far by software evolution, which usually focuses on the evolution of single applications. Undoubtedly, the challenges that software compilations face differ from those found in single software applications. Nevertheless, it can be assumed that both, the evolution of applications and that of software compilations, have similarities and dependencies. In this sense, we identify a dichotomy, common to that in economics, of software evolution in the small (micro-evolution) and in the large (macro-evolution). The goal of this paper is to study the evolution of a large software compilation, mining the publicly available repository of a well-known Linux distribution, Debian. 
We will therefore investigate changes related to hundreds of millions of lines of code over seven years. The aspects that will be covered in this paper are size (in terms of number of packages and of number of lines of code), use of programming languages, maintenance of packages and file sizes.}, keywords = {debian, distributions, evolution, large software collections, lines of code, loc, metrics, mining software repositories, size, sloc, sloccount, software evolution, software integrators}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1137986}, url = {http://doi.acm.org/10.1145/1137983.1137986}, attachments = {https://flosshub.org/sites/flosshub.org/files/3miningLarge.pdf}, author = {Gregorio Robles and Jesus M. Gonzalez-Barahona and Martin Michlmayr and Amor, Juan Jose} } @conference {Knab:2006:PDD:1137983.1138012, title = {Predicting defect densities in source code files with decision tree learners}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {119--125}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {With the advent of open source software repositories the data available for defect prediction in source files increased tremendously. Although traditional statistics turned out to derive reasonable results the sheer amount of data and the problem context of defect prediction demand sophisticated analysis such as provided by current data mining and machine learning techniques. In this work we focus on defect density prediction and present an approach that applies a decision tree learner on evolution data extracted from the Mozilla open source web browser project. The evolution data includes different source code, modification, and defect measures computed from seven recent Mozilla releases. 
Among the modification measures we also take into account the change coupling, a measure for the number of change-dependencies between source files. The main reason for choosing decision tree learners, instead of for example neural nets, was the goal of finding underlying rules which can be easily interpreted by humans. To find these rules, we set up a number of experiments to test common hypotheses regarding defects in software entities. Our experiments showed, that a simple tree learner can produce good results with various sets of input data.}, keywords = {change analysis, data mining, decision tree learner, defect density, defect prediction, mozilla, prediction, release history, scm, source code, version control}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138012}, url = {http://doi.acm.org/10.1145/1137983.1138012}, attachments = {https://flosshub.org/sites/flosshub.org/files/119Predicting.pdf}, author = {Knab, Patrick and Pinzger, Martin and Bernstein, Abraham} } @article {1088, title = {Self-Organization Patterns in Wasp and Open Source Communities}, journal = {IEEE Intelligent Systems}, volume = {21}, year = {2006}, note = {"To investigate such claims, we studied an OSS community{\textquoteright}s social network from a dataset describing the email activity of 120 different software teams" "Our test data originated from Sourceforge (http://sourceforge.net), a large open source project repository, and included communities ranging from very small networks with one or two members to large networks with thousands of members." "we limited our consideration to email traffic associated with bug fixes and bug reporting. As other researchers have shown[5] this email subset allows an effective reconstruction of the software community{\textquoteright}s social network." 
"We thank Kevin Crowston and James Howison for making their software data publicly available."}, month = {03/2006}, pages = {36--40}, abstract = {In this paper, we conducted a comparative study of how social organization takes place in a wasp colony and OSS developer communities. Both these systems display similar global organization patterns, such as hierarchies and clear labor divisions. As our analysis shows, both systems also define interacting agent networks with similar common features that reflect limited information sharing among agents. As far as we know, this is the first research study analyzing the patterns and functional significance of these systems{\textquoteright} weighted-interaction networks. By illuminating the extent to which self-organization is responsible for patterns such as hierarchical structure, we can gain insight into the origins of organization in OSS communities.}, keywords = {agents, decentralization, developers, email, email archives, flossmole, hierarchy, labor division, organization, self-organizing teams, social network analysis, social networks, sourceforge, teams, wasps}, issn = {1541-1672}, doi = {10.1109/MIS.2006.34}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.95.5574\&rep=rep1\&type=pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/valverde.pdf}, author = {Valverde, S. and Theraulaz, G. and Gautrais, J. and Fourcassie, V. and Sole, R. V.} } @conference {708, title = {Social dynamics of free and open source team communications}, booktitle = {OSS2006: Open Source Systems (IFIP 2.13)}, series = {IFIP International Federation for Information Processing}, volume = {203}, year = {2006}, month = {06/2006}, pages = {319--330}, publisher = {Springer}, organization = {Springer}, abstract = {This paper furthers inquiry into the social structure of free and open source software (FLOSS) teams by undertaking social network analysis across time. 
Contrary to expectations, we confirmed earlier findings of a wide distribution of centralizations even when examining the networks over time. The paper also provides empirical evidence that while change at the center of FLOSS projects is relatively uncommon, participation across the project communities is highly skewed, with many participants appearing for only one period. Surprisingly, large project teams are not more likely to undergo change at their centers. }, keywords = {bug fixing, bug reports, bug tracker, bug tracking, bugs, communications, Dynamic social networks, FLOSS teams, Human Factors, social network analysis, software development, sourceforge}, isbn = {978-0-387-34225-2}, doi = {10.1007/0-387-34226-5_32}, attachments = {https://flosshub.org/sites/flosshub.org/files/Social\%20dynamics\%20of\%20free\%20and\%20open\%20source\%20team.pdf}, author = {Howison, James and Inoue, Keisuke and Kevin Crowston} } @article {1119, title = {Strategic Interaction and Knowledge Sharing in the KDE Developer Mailing List}, journal = {Management Science}, volume = {52}, number = {7}, year = {2006}, note = {"the threaded discussion from the K Desktop Environment (KDE) developer mailing list was used." March 15 to March 31, 2000. 128 threads selected. measures: (1) level of participation measured by number of postings made by indiv. devs in a thread (2) conversational interactivity (how long is a conversation) (3) cross-thread connectivity}, month = {07/2006}, pages = {1031--1042}, abstract = {In stark contrast with the fully participative "bazaar" imagery of open source software (OSS) development, some recent empirical research has pointed out that much of the OSS development is carried out by a small percentage of developers. This raises serious concerns that concentration of development effort on a few will limit knowledge sharing and underutilize the available resources. 
Using the notion of strategic interaction, this paper argues that individual developers often interact strategically with other highly resourceful developers by forming a smaller but better organized structure to intensify the types of epistemic interactions that matter most to the OSS development. A general framework of strategic interaction including participation inequality, conversational interactivity, and cross-thread connectivity is proposed to examine its impact on knowledge sharing, and validated using 128 discussion threads from the K Desktop Environment (KDE) developer mailing list. The findings indicate that strategic interaction has expanded knowledge sharing but with the caveat that extreme concentration of development could have an opposite effect. For researchers, this study dovetails the incentive logic by proposing and validating the strategic aspects of OSS participation to better understand the collective dynamics underpinning OSS development. Practitioners can use this approach to evaluate and better support existing knowledge-sharing initiatives.}, keywords = {developers, email, email archives, kde, knowledge collaboration, knowledge sharing, mailing list}, issn = {1526-5501}, doi = {10.1287/mnsc.1060.0551}, author = {Kuk, George} } @conference {German:2006:SCP:1137983.1138022, title = {A study of the contributors of PostgreSQL}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, series = {MSR {\textquoteright}06}, year = {2006}, pages = {163--164}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {This report describes some characteristics of the development team of PostgreSQL that were uncovered by analyzing the history of its software artifacts as recorded by the project{\textquoteright}s CVS repository.}, keywords = {contributions, contributors, cvs, developers, mining challenge, mining software repositories, msr challenge, patches, postgresql, revision 
history, roles, software evolution, source code, team}, isbn = {1-59593-397-2}, doi = {10.1145/1137983.1138022}, url = {http://doi.acm.org/10.1145/1137983.1138022}, attachments = {https://flosshub.org/sites/flosshub.org/files/163AStudyOf.pdf}, author = {Daniel M. German} } @conference {1090, title = {A Topological Analysis of the Open Source Software Development Community}, booktitle = {Proceedings of the 38th Annual Hawaii International Conference on System Sciences}, year = {2005}, note = {"We extracted data from a 2003 data dump obtained from SourceForge. " roles}, pages = {1-10}, publisher = {IEEE}, organization = {IEEE}, address = {Big Island, HI, USA}, abstract = {The fast growth of OSS has increased the interest in studying the composition of the OSS community and its collaboration mechanisms. Moreover, the success of a project may be related to the underlying social structure of the OSS development community. In this paper, we perform a quantitative analysis of Open Source Software developers by studying the entire development community at SourceForge [26]. Statistics and social network properties are explored to find collaborations and the effects of different members in the OSS development community. Small world phenomenon and scale free behaviors are found in the SourceForge development network. These topological properties may potentially explain the success and efficiency of OSS development practices. We also infer from our analysis that weakly associated but contributing co-developers and active users may be an important factor in OSS development.}, keywords = {contributors, developers, roles, social network analysis, social networks, sourceforge, srda, users}, doi = {10.1109/HICSS.2005.57}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.132.6830\&rep=rep1\&type=pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/xuGao.pdf}, author = {Jin Xu and Gao, Yongqin and Christley, S. 
and Madey, G.} } @conference {Ohira:2005:ACK:1083142.1083163, title = {Accelerating cross-project knowledge collaboration using collaborative filtering and social networks}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {111-115}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Vast numbers of free/open source software (F/OSS) development projects use hosting sites such as Java.net and SourceForge.net. These sites provide each project with a variety of software repositories (e.g. repositories for source code sharing, bug tracking, discussions, etc.) as a media for communication and collaboration. They tend to focus on supporting rich collaboration among members in each project. However, a majority of hosted projects are relatively small projects consisting of few developers and often need more resources for solving problems. In order to support cross-project knowledge collaboration in F/OSS development, we have been developing tools to collect data of projects and developers at SourceForge, and to visualize the relationship among them using the techniques of collaborative filtering and social networks. The tools help a developer identify {\textquotedblleft}who should I ask?{\textquotedblright} and {\textquotedblleft}what can I ask?{\textquotedblright} and so on. 
In this paper, we report a case study of applying the tools to F/OSS projects data collected from SourceForge and how effective the tools can be used for helping cross-project knowledge collaboration.}, keywords = {collaborative filtering, developers, knowledge collaboration, projects, social networks, sourceforge, visualization tool}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083163}, url = {http://doi.acm.org/10.1145/1082983.1083163}, attachments = {https://flosshub.org/sites/flosshub.org/files/111Accelerating.pdf}, author = {Ohira, Masao and Ohsugi, Naoki and Ohoka, Tetsuya and Matsumoto, Ken-ichi} } @article {1079, title = {Are All Open Source Projects Created Equal? Understanding the Sustainability of Open Source Software Development Model}, journal = {AMCIS 2005 Proceedings}, number = {435}, year = {2005}, note = {uses first 300 most active projects on Sourceforge: "Our data sample consists of 300 open source software development projects hosted in the Sourceforge.Net. They are the first 300 active projects ranked by Sourceforge.Net....Our dependent variable in the model to measure the success of the projects is the number of downloads. It is an essential variable to show how successful the project is. Generally, more number of downloads means a more successful project. Independent variables include: development status, project lifespan, number of developers, number of messages in the forums, number of mailing list, number of bug report, number of patch report, number of CVS report, number of file releases and also number of news release."}, abstract = {A very intriguing question in Open Source software (OSS) development is: why there are only a few open source projects succeed, while the majority of projects never do. In this research, we examine the factors that may influence the performance of OSS projects. 
We particularly focus on the OSS{\textquoteright}s core developers{\textquoteright} role in the project{\textquoteright}s success. Extant research has yet to distinguish core developers and non-core developers from the community at large. The different roles of the core developers and non-core developers in OSS projects{\textquoteright} success still remain unclear. Our research contributes to the literature by separating the core developers from the development forces in general and empirically examining the core developers{\textquoteright} importance. Drawing the evidences from our extensive dataset of 300 open source projects, we demonstrated that core developers{\textquoteright} leadership and project advocation are crucial in determining the fate of the OSS projects. Our research could provide better understanding of OSS sustainability. It could also give practical advice to the OSS community on how to make the project successful.}, keywords = {contributors, core, developers, downloads, metadata, project success, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/LongYuan.pdf}, author = {Long, J. and Yuan, M.J.} } @conference {Robles:2005:DIM:1083142.1083162, title = {Developer identification methods for integrated data from various sources}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {106-110}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Studying a software project by mining data from a single repository has been a very active research field in software engineering during the last years. However, few efforts have been devoted to perform studies by integrating data from various repositories, with different kinds of information, which would, for instance, track the different activities of developers. 
One of the main problems of these multi-repository studies is the different identities that developers use when they interact with different tools in different contexts. This makes them appear as different entities when data is mined from different repositories (and in some cases, even from a single one). In this paper we propose an approach, based on the application of heuristics, to identify the many identities of developers in such cases, and a data structure for allowing both the anonymized distribution of information, and the tracking of identities for verification purposes. The methodology will be presented in general, and applied to the GNOME project as a case example. Privacy issues and partial merging with new data sources will also be considered and discussed.}, keywords = {anonymization, bug tracker, developers, email, email address, gnome, identity, mailing list, privacy, source code, version control}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083162}, url = {http://doi.acm.org/10.1145/1082983.1083162}, attachments = {https://flosshub.org/sites/flosshub.org/files/106DeveloperIdentification.pdf}, author = {Gregorio Robles and Jesus M. Gonzalez-Barahona} } @proceedings {citeulike:3052508, title = {Development Success in Open Source Software Projects: Exploring the Impact of Copylefted Licenses}, year = {2005}, abstract = {Copyleft prevents the source code of open source software (OSS) from being privately appropriated. The ethos of the OSS movement suggests that volunteer developers may particularly value and contribute to copylefted projects. Based on social movement theory, we hypothesized that copylefted OSS projects are more likely than non-copylefted OSS projects to succeed in the development process, in terms of two key indicators: developer membership and developer productivity. We performed an exploratory study using data from 62 relevant OSS projects spanning an average of three years of development time. 
We found that copylefted projects were associated with higher developer membership and productivity. This is the first study to empirically test the relationship between copylefted licenses and OSS project success. Implications for OSS project initiators as well as future research directions are discussed.}, keywords = {contributions, copyleft, developer, developers, membership, productivity, project success, success}, url = {http://aisel.isworld.org/password.asp?Vpath=AMCIS/2005\&PDFpath=OSSDAU01-1167.pdf}, author = {Colazo, Jorge A. and Fang, Yulin and Neufeld, Derrick J.} } @article {102, title = {Empirical validation of object-oriented metrics on open source software for fault prediction}, journal = {IEEE Transactions on Software Engineering}, volume = {31}, number = {10}, year = {2005}, note = {"This paper describes how we calculated the object-oriented metrics given by Chidamber and Kemerer to illustrate how fault-proneness detection of the source code of the open source Web and e-mail suite called Mozilla can be carried out. We checked the values obtained against the number of bugs found in its bug database - called Bugzilla - using regression and machine learning methods to validate the usefulness of these metrics for fault-proneness prediction. We also compared the metrics of several versions of Mozilla to see how the predicted fault-proneness of the software system changed during its development cycle." metrics, wmc weighted methods per class, dit depth of inheritance, rfc response for a class, noc number of children, cbo coupling between object classes, cohesion, lines of code, loc, sloc chidamber and kemerer metrics}, pages = {897-910}, abstract = {Open source software systems are becoming increasingly important these days. Many companies are investing in open source projects and lots of them are also using such software in their own work. 
But, because open source software is often developed with a different management style than the industrial ones, the quality and reliability of the code needs to be studied. Hence, the characteristics of the source code of these projects need to be measured to obtain more information about it. This paper describes how we calculated the object-oriented metrics given by Chidamber and Kemerer to illustrate how fault-proneness detection of the source code of the open source Web and e-mail suite called Mozilla can be carried out. We checked the values obtained against the number of bugs found in its bug database - called Bugzilla - using regression and machine learning methods to validate the usefulness of these metrics for fault-proneness prediction. We also compared the metrics of several versions of Mozilla to see how the predicted fault-proneness of the software system changed during its development cycle.}, keywords = {bugs, bugzilla, cbo, defects, dit, fault-prone modules, faults, lcom, lcomn, loc, metrics, mozilla, noc, object-oriented, rfc, source code, wmc}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.115.8372\&rep=rep1\&type=pdf}, attachments = {https://flosshub.org/sites/flosshub.org/files/Gyimothy.pdf}, author = {Gyimothy, T. and Ferenc, R. and Siket, I.} } @conference {731, title = {Evolution of Volunteer Participation in Libre Software Projects: Evidence from Debian}, booktitle = {OSS2005: Open Source Systems }, year = {2005}, note = {"For each release we have retrieved the corresponding sources.gz file (see below) from the Debian archive. From it we have extracted information about the packages and their maintainers...." }, pages = {100-107}, abstract = {Most libre software projects rely on the work of volunteers. Therefore, attracting people who contribute their time and technical skills is of paramount importance, both in technical and economic terms. 
This reliance on volunteers leads to some fundamental management challenges: volunteer contributions are inherently difficult to predict, plan and manage, especially in the case of large projects. In this paper we analyze the evolution in time of the human resources of one of the largest and most complex libre software projects composed primarily of volunteers, the Debian project. Debian currently has around 1300 volunteers working on several tasks: much activity is focused on packaging software applications and libraries, but there is also major work related to the maintenance of the infrastructure needed to sustain the development. We have performed a quantitative investigation of data from almost seven years, studying how volunteer involvement has affected the software...}, keywords = {contributors, debian, maintainers, PopCon, popularity, Volunteers}, url = {http://pascal.case.unibz.it/handle/2038/857}, attachments = {https://flosshub.org/sites/flosshub.org/files/robles_barahona_michlmayr-evolution_participation.pdf}, author = {Gregorio Robles and Jesus M. Gonzalez-Barahona and Martin Michlmayr} } @article {flosswp184, title = {Exploring the Structure of Complex Software Designs: An Empirical Study of Open Source and Proprietary Code (updated)}, year = {2005}, note = {"For each design, we report data on the number of source files, the number of dependencies, the density of the DSM (i.e., the number of dependencies per source file pair) the propagation cost and the clustered cost. We also provide data on the average complexity of source files, in terms of the number of functions and lines of code."}, month = {June}, abstract = {This paper reports data from a study that seeks to characterize the differences in design structure between complex software products. In particular, we use Design Structure Matrices (DSMs) to map the dependencies between the elements of a design and define metrics that allow us to compare the structures of different designs. 
We first use these metrics to compare the architectures of two software products - the Linux operating system and the Mozilla web browser - that were developed via contrasting modes of organization: specifically, open source versus proprietary development. We then track the evolution of Mozilla, paying particular attention to a purposeful "re-design" effort that was undertaken with the intention of making the product more "modular." We find significant differences in structure between Linux and the first version of Mozilla, suggesting that Linux had a more modular architecture. We also find that the redesign of Mozilla resulted in an architecture that was significantly more modular than that of its predecessor, and indeed, than that of Linux. Our results, while exploratory, are consistent with a view that different modes of organization are associated with designs that possess different structures. However, we also illustrate that purposeful managerial actions can have a large impact on structure. This latter result is important given recent moves to release proprietary software into the public domain. These moves are likely to fail unless the product possesses an architecture that facilitates participation. Our paper provides evidence that a tightly-coupled design can be adapted to meet this objective.}, keywords = {complexity, cost, dependencies, functions, lines of code, linux, loc, mozilla, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/maccormackrusnakbaldwin2.pdf}, author = {Alan MacCormack and John Rusnak and Carliss Baldwin} } @conference {773, title = {Fare Open Source all{\textquoteright}Istat: il generatore di data-entry per indagini statistiche}, booktitle = {OSS2005: Open Source Systems }, year = {2005}, pages = {283-286}, abstract = {Viene presentata una prima esperienza di sviluppo di software Open Source all{\textquoteright}Istat, che riguarda un sistema generalizzato di data-entry per indagini statistiche. 
L{\textquoteright}applicazione, in via di rilascio, copre la fase di acquisizione dati propria delle indagini statistiche e consentir{\`a} di generare maschere che potranno essere utilizzate sia in Intranet che sul Web. L{\textquoteright}applicazione descritta, che utilizza PHP e XML come tecnologie fondanti, rappresenta un{\textquoteright}esperienza concreta di sviluppo Open Source nella quale l{\textquoteright}Istat vuole riversare le proprie esperienze specifiche, mettendole poi a disposizione delle altre strutture interessate.}, keywords = {data-entry, indagini statistiche, open source, PHP, web, XML}, url = {http://pascal.case.unibz.it/handle/2038/903}, author = {Altarocca, Francesco and Vaccari, Carlo} } @article {DBLP:journals/tse/Dinh-TrongB05, title = {The FreeBSD Project: A Replication Case Study of Open Source Development}, journal = {IEEE Transactions on Software Engineering}, volume = {31}, number = {6}, year = {2005}, note = {" we obtained the necessary data from the [FreeBsd] CVS repository, the bug report database, and the e-mail archive. The CVS repository contains all of the code and related documentation that is committed to the project from 1993 until the present. The bug report database contains information describing all reported problems, as well as the status (such as fixed, under test, or open) of each problem. Each bug report is called a PR and assigned a reference number. The e-mail archive contains every e-mail message exchanged between the developers since 1994." d/l: research.cs.queensu.ca/~ahmed/home/teaching/.../F06/.../free-bsd.pdf}, pages = {481-494}, abstract = {Case studies can help to validate claims that open source software development produces higher quality software at lower cost than traditional commercial development. One problem inherent in case studies is external validity{\textemdash}we do not know whether or not results from one case study apply to another development project. 
We gain or lose confidence in case study results when similar case studies are conducted on other projects. This case study of the FreeBSD project, a long-lived open source project, provides further understanding of open source development. The paper details a method for mining repositories and querying project participants to retrieve key process information. The FreeBSD development process is fairly well-defined with proscribed methods for determining developer responsibilities, dealing with enhancements and defects, and managing releases. Compared to the Apache project, FreeBSD uses 1) a smaller set of core developers{\textemdash}developers who control the code base{\textemdash}that implement a smaller percentage of the system, 2) a larger set of top developers to implement 80 percent of the system, and 3) a more well-defined testing process. FreeBSD and Apache have a similar ratio of core developers to people involved in adapting and debugging the system and people who report problems. Both systems have similar defect densities and the developers are also users in both systems.}, keywords = {apache, bug reports, contributors, core, cvs, defect density, developers, email, email archive, freebsd, mailing list, scm, source code, users}, doi = {10.1109/TSE.2005.73}, attachments = {https://flosshub.org/sites/flosshub.org/files/DinhTrungBieman.pdf}, author = {Trung T. Dinh-Trong and James M. Bieman} } @article {vanWendeldeJoode2005109, title = {Handling variety: the tension between adaptability and interoperability of open source software}, journal = {Computer Standards \& Interfaces}, volume = {28}, number = {1}, year = {2005}, note = {unable to find concise description of the data used [ms]}, pages = {109 - 121}, abstract = {Open source software (OSS) offers unprecedented opportunities to create variety. This could lead to incompatibility and fragmentation. To prevent this some form of coordination is needed. 
This paper explores which mechanisms of coordination are responsible for limiting divergence in OSS. Two cases are examined: Java and Linux. A systematic difference seems to exist between the mechanisms identified in the two communities. With respect to Java, divergence is where possible avoided ex ante, whereas for Linux divergence is foremost reduced ex post. The conclusion discusses this difference and the implications of both types of coordination in respect to interoperability.}, keywords = {coordination, divergence, java, linux}, issn = {0920-5489}, doi = {10.1016/j.csi.2004.12.004}, url = {http://www.sciencedirect.com/science/article/B6TYV-4F6K72H-1/2/c74c64ce51e6f46abf9f39ae945c9e15}, author = {Ruben van Wendel de Joode and Tineke M. Egyedi} } @conference {781, title = {Idealism and Commercialism {\textendash} Developing Free/Libre and Open Source Software in Private Businesses}, booktitle = {OSS2005: Open Source Systems }, year = {2005}, pages = {301-302}, abstract = {This paper presents a PhD research project undertaken as part of a larger project aimed at paying sociological attention to different forms of distribution of knowledge, including program code. We want to investigate empirically how the commons knows as free/open source software is actually made. In my PhD project I study the use and development of FLOSS in private businesses, focusing on professional developers working in private businesses and at the same time participating in the FLOSS community. 
The theoretical starting point is theories of power, dominance and legitimacy by Max Weber and Pierre Bourdieu.}, keywords = {dominance, FLOSS, FLOSS community, free/libre, legitimacy, linux, open source, Private Businesses, social organisation, theories of power}, url = {http://pascal.case.unibz.it/handle/2038/970}, author = {Lundestad, Christian V.} } @conference {784, title = {An Investigation of Developer and User Activity In FLOSS Projects}, booktitle = {OSS2005: Open Source Systems }, year = {2005}, pages = {307-308}, abstract = {This document outlines research plans to investigate the evolution of a set of FLOSS (Free, Libre, and Open Source Software) projects and their communities by looking at patterns of developer and user activity through a combination of analysis of archived material and interaction with the project communities. The goal of the proposed research is to better understand the dynamics within FLOSS projects and how some projects become successful.}, keywords = {developer, development, FLOSS, FLOSS project, open source, pattern, portal, project}, url = {http://pascal.case.unibz.it/handle/2038/712}, author = {Gale, Stephen} } @conference {Antoniol:2005:LPC:1083142.1083156, title = {Linear predictive coding and cepstrum coefficients for mining time variant information from software repositories}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {74-78}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {This paper presents an approach to recover time variant information from software repositories. It is widely accepted that software evolves due to factors such as defect removal, market opportunity or adding new features. Software evolution details are stored in software repositories which often contain the changes history. 
On the other hand there is a lack of approaches, technologies and methods to efficiently extract and represent time dependent information. Disciplines such as signal and image processing or speech recognition adopt frequency domain representations to mitigate differences of signals evolving in time. Inspired by time-frequency duality, this paper proposes the use of Linear Predictive Coding (LPC) and Cepstrum coefficients to model time varying software artifact histories. LPC or Cepstrum allow obtaining very compact representations with linear complexity. These representations can be used to highlight components and artifacts evolved in the same way or with very similar evolution patterns. To assess the proposed approach we applied LPC and Cepstral analysis to 211 Linux kernel releases (i.e., from 1.0 to 1.3.100), to identify files with very similar size histories. The approach, the preliminary results and the lesson learned are presented in this paper.}, keywords = {change history, data mining, evolution, files, kernel, linear predictive coding, linux, lpc, size, software evolution, source code}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083156}, url = {http://doi.acm.org/10.1145/1082983.1083156}, attachments = {https://flosshub.org/sites/flosshub.org/files/74LinearPredictive.pdf}, author = {Antoniol, Giuliano and Rollo, Vincenzo Fabio and Venturi, Gabriele} } @conference {Phadke:2005:PRM:1145319.1145337, title = {Predicting risky modules in open-source software for high-performance computing}, booktitle = {Proceedings of the second international workshop on Software engineering for high performance computing system applications}, series = {SE-HPCS {\textquoteright}05}, year = {2005}, pages = {60{\textendash}64}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {This paper presents the position that software-quality modeling of open-source software for high-performance computing can identify modules that 
have a high risk of bugs. Given the source code for a recent release, a model can predict which modules are likely to have bugs, based on data from past releases. If a user knows which software modules correspond to functionality of interest, then risks to operations become apparent. If the risks are too great, the user may prefer not to upgrade to the most recent release. Of course, such predictions are never perfect. After release, bugs are discovered. Some bugs are missed by the model, and some predicted errors do not occur. A successful model will be accurate enough for informed management action at the time of the predictions. As evidence for this position, this paper summarizes a case study of the Portable Extensible Toolkit for Scientific Computation (PETSC), which is a mathematical library for high-performance computing. Data was drawn from source-code and configuration management logs. The accuracy of logistic-regression and decision-tree models indicated that the methodology is promising. The case study also illustrated several modeling issues.}, keywords = {C4.5, decision trees, empirical case study, high performance computing, logistic regression, Open-source software, PETSc, software metrics, software quality model, software reliability}, isbn = {1-59593-117-1}, doi = {10.1145/1145319.1145337}, url = {http://doi.acm.org/10.1145/1145319.1145337}, author = {Phadke, Amit A. and Allen, Edward B.} } @conference {Stewart:2005:PAI:1042438.1043100, title = {A Preliminary Analysis of the Influences of Licensing and Organizational Sponsorship on Success in Open Source Projects}, booktitle = {Proceedings of the 38th Annual Hawaii International Conference on System Sciences - Volume 07}, series = {HICSS {\textquoteright}05}, year = {2005}, note = {"Publicly available data on open source projects registered on the Freshmeat website (www.freshmeat.net) was used to test the hypotheses. 
Data was collected from each project{\textquoteright}s Freshmeat website at the start and end of an eight month period (March - December 2002)." "We first selected three project categories from which to draw our sample. These were utilities, software development, and games." "Within these categories we further differentiated between new projects, which had been registered on the site within the two weeks prior to our first data collection point and older projects that had been registered more than two weeks prior to our initial data collection." }, pages = {1-10}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {This paper develops and tests a model of the impact of licensing restrictiveness and organizational sponsorship on the popularity and vitality of open source software (OSS) development projects. Using data gathered from Freshmeat.net and OSS project home pages the main conclusions derived from the analysis are that organizational sponsorship has a positive effect on project popularity by easing user concerns about cost and quality and that license restrictiveness may have a negative effect on popularity by reducing the perceived utility of open source software. Theoretical and practical implications are discussed, and the paper outlines several avenues for future research.}, keywords = {contributors, developers, freshmeat, license analysis, licensing, metadata, popularity, restrictive, users}, isbn = {0-7695-2268-8}, doi = {10.1109/HICSS.2005.38}, url = {http://dx.doi.org/10.1109/HICSS.2005.38}, author = {Stewart, Katherine J. and Ammeter, Anthony P. 
and Maruping, Likoebe M.} } @article {springerlink:10.1007/s10606-005-9000-1, title = {Socialization in an Open Source Software Community: A Socio-Technical Analysis}, journal = {Computer Supported Cooperative Work (CSCW)}, volume = {14}, year = {2005}, note = {"I present the results of my analyses of participation patterns in a particular OSS project, used as a case study: Python. More precisely, I use the Open Source Project Browser to qualitatively track and analyze the trajectories of several project members who evolved (or not) into full-fledged participants. This allows me to later discuss how socialization proceeds in an OSS community such as Python" "Over the course of 2002, I progressively retrieved the entire email archive of python-dev (the developers{\textquoteright} mailing list) and the CVS source tree for the project. " "I read the entirety of the messages written by these participants and also looked at the software code they produced. There were striking similarities between their progressions over time, which I will describe shortly. Overall the trajectory of these participants reflects successful socialization in Python: an evolution from newcomer to developer." http://www2.parc.com/csl/members/nicolas/documents/JCSCW-OSS.pdf}, pages = {323-368}, publisher = {Springer Netherlands}, abstract = {Open Source Software (OSS) development is often characterized as a fundamentally new way to develop software. Past analyses and discussions, however, have treated OSS projects and their organization mostly as a static phenomenon. Consequently, we do not know how these communities of software developers are sustained and reproduced over time through the progressive integration of new members. To shed light on this issue I report on my analyses of socialization in a particular OSS community. In particular, I document the relationships OSS newcomers develop over time with both the social and material aspects of a project. 
To do so, I combine two mutually informing activities: ethnography and the use of software specially designed to visualize and explore the interacting networks of human and material resources incorporated in the email and code databases of OSS. Socialization in this community is analyzed from two perspectives: as an individual learning process and as a political process. From these analyses it appears that successful participants progressively construct identities as software craftsmen, and that this process is punctuated by specific rites of passage. Successful participants also understand the political nature of software development and progressively enroll a network of human and material allies to support their efforts. I conclude by discussing how these results could inform the design of software to support socialization in OSS projects, as well as practical implications for the future of these projects.}, keywords = {cvs, developers, email, email archive, mailing list, open source project browser, participation, python, scm, source code, team, tools}, issn = {0925-9724}, doi = {10.1007/s10606-005-9000-1}, url = {http://dx.doi.org/10.1007/s10606-005-9000-1}, author = {Ducheneaut, Nicolas} } @conference {790, title = {Socialization practices in FLOSS development teams}, booktitle = {OSS2005: Open Source Systems }, year = {2005}, pages = {322-323}, abstract = {Socialization of new members into Free/Libre Open Source Software (FLOSS) development teams is an important but little studied process in producing effective teams of this type. 
This is a dissertation proposal for a virtual ethnographic study that looks at the mechanisms and processes used to socialize new members into the team in order to help maintain a common group identity and focus.}, keywords = {development team, FLOSS, member, open source, socialization}, url = {http://pascal.case.unibz.it/handle/2038/1438}, author = {Chengetai Masango} } @conference {899, title = {Stopping spyware at the gate: a user study of privacy, notice and spyware}, booktitle = {2005 Symposium on Usable Privacy and Security}, year = {2005}, month = {07/2005}, pages = {43-52}, publisher = {Association for Computing Machinery}, organization = {Association for Computing Machinery}, address = {Pittsburgh, PA}, keywords = {Design, end user license agreement, EULA, Experimentation, Human Factors, Legal Aspects, notice, privacy, security and usability, spyware, terms of service, ToS}, isbn = {1-59593-178-3}, author = {N. Good and Dhamija, R. and J. Grossklags and D. Thaw and Aronowitz, S. and D. Mulligan and J. Konstan} } @conference {Kim:2005:UCG:1083142.1083146, title = {Using a clone genealogy extractor for understanding and supporting evolution of code clones}, booktitle = {Proceedings of the 2005 international workshop on Mining software repositories}, series = {MSR {\textquoteright}05}, year = {2005}, pages = {17-23}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Programmers often create similar code snippets or reuse existing code snippets by copying and pasting. Code clones {\textemdash}syntactically and semantically similar code snippets{\textemdash}can cause problems during software maintenance because programmers may need to locate code clones and change them consistently. In this work, we investigate (1) how code clones evolve, (2) how many code clones impose maintenance challenges, and (3) what kind of tool or engineering process would be useful for maintaining code clones. 
Based on a formal definition of clone evolution, we built a clone genealogy tool that automatically extracts the history of code clones from a source code repository (CVS). Our clone genealogy tool enables several analyses that reveal evolutionary characteristics of code clones. Our initial results suggest that aggressive refactoring may not be the best solution for all code clones; thus, we propose alternative tool solutions that assist in maintaining code clones using clone genealogy information.}, keywords = {clone, clone detection, cvs, developers, evolution, maintenance, refactoring, source code}, isbn = {1-59593-123-6}, doi = {10.1145/1082983.1083146}, url = {http://doi.acm.org/10.1145/1082983.1083146}, attachments = {https://flosshub.org/sites/flosshub.org/files/17Using.pdf}, author = {Kim, Miryung and Notkin, David} } @conference {Crowston:2004, title = {Coordination practices for bug fixing within FLOSS development teams}, booktitle = {1st International Workshop on Computer Supported Activity Coordination, 6th International Conference on Enterprise Information Systems}, year = {2004}, note = {"First, we chose projects for which data we need for our analysis are publicly available (not all projects allow public access to the bug tracking system). Second, we chose teams with more than 8 members...we tried to select more and less successful development teams. To this aim we used the definitions of success proposed by [9], who suggest that a project is successful if it is active, the resulting software is downloaded and used and the code matures" "Kicq, Gaim and PhPmyAdmin were chosen" - effective DynAPI was chosen as an example of a less effective project" "We collected data indicative of the success of each project, such as its level of activity, number of downloads and development status. 
We then collected data from the archives of the bug tracking system, the tool used to support the bug fixing process"}, address = {Porto, Portugal}, abstract = {Free/Libre Open Source Software (FLOSS) is primarily developed by distributed teams. Developers contribute from around the world and coordinate their activity almost exclusively by means of email and bulletin boards. FLOSS development teams some how profit from the advantages and evade the challenges of distributed software development. Despite the relevance of the FLOSS both for research and practice, few studies have investigated the work practices adopted by these development teams. In this paper we investigate the structure and the coordination practices adopted by development teams during the bug-fixing process, which is considered one of main areas of FLOSS project success. In particular, based on a codification of the messages recorded in the bug tracking system of four projects, we identify the accomplished tasks, the adopted coordination mechanisms, and the role undertaken by both the FLOSS development team and the FLOSS community. We conclude with suggestions for further research.}, keywords = {activity, bug fixing, bug reports, bug tracker, coordination, downloads, dynapi, FLOSS, gaim, kicq, phpmyadmin, status}, attachments = {https://flosshub.org/sites/flosshub.org/files/CrowstonScozzi04coordination.pdf}, author = {Kevin Crowston and Barbara Scozzi} } @proceedings {1195, title = {From Bazaar to Kibbutz: How Freedom Deals with Coherence in the Debian Project}, year = {2004}, note = {"I am a member of the Debian project since 2000."}, pages = {71-75}, abstract = {The goal of obtaining a coherent distribution of software packages where all programs interact smoothly increases its complexity with the number of applications, the number of architectures involved, and the number of system configurations supported. 
The Debian project aims at producing a software system with thousands of components running on eleven different hardware architectures, with three different operating system kernels. This paper describes the project and how the work of hundreds of people that never meet one with another can be coordinated to produce reasonably robust and integrated systems.}, keywords = {debian}, attachments = {https://flosshub.org/sites/flosshub.org/files/monga72-76.pdf}, author = {Monga, M.} } @article {capiluppi:23, title = {Improving comprehension and cooperation through code structure}, journal = {IEE Seminar Digests}, volume = {2004}, number = {908}, year = {2004}, note = {"In this study we measured source code size in three different forms (LOCs, SLOCs, and Kbs of code)." "Pilot Project: The ARLA System" }, pages = {23-28}, publisher = {IEE}, abstract = {Defining a relationship between a software system{\textquoteright}s architecture and the process{\textquoteright} efforts is one of the most fascinating questions of software engineering. Apparently, when a system{\textquoteright}s architecture is complex, the process to improve and evolve it will be more difficult. We try to tackle this question from a different point of view: given an open source system, in all the phases of its evolution, we focus on both the aspects of software developers, and the obtained software product. More we observe one of the possible architectures of this system, based on the tree structure derived from source components. First conclusions show that some patterns of tree evolution are recognizable: some branches may appear more promising than other, and are extensively evolved, while other remains in the same status for all the life cycle. More, when the tree structure reaches some status, the process of joining as a core developer seems to forestall. 
}, keywords = {arla, code structure, contributors, developers, open source system, scm, software development, software engineering, software process, software product, software system architecture, source code, source components, tree evolution, tree structure}, doi = {10.1049/ic:20040260}, url = {http://link.aip.org/link/abstract/IEESEM/v2004/i908/p23/s1}, attachments = {https://flosshub.org/sites/flosshub.org/files/capiluppi2004.pdf}, author = {A. Capiluppi} } @conference {1054, title = {LASER: a lexical approach to analogy in software reuse}, booktitle = {International Workshop on Mining Software Repositories (MSR 2004)}, volume = {2004}, year = {2004}, pages = {112 - 116}, publisher = {IEE}, organization = {IEE}, address = {Edinburgh, Scotland, UK}, abstract = {Software reuse is the process of creating a software system from existing software components, rather than creating it from scratch. With the increase in size and complexity of existing software repositories, the need to provide intelligent support to the programmer becomes more pressing. An analogy is a comparison of certain similarities between things which are otherwise unlike. This concept has shown to be valuable in developing UML-level reuse techniques. In the LASER project we apply lexically-driven Analogy at the code level, rather than at the UML-level, in order to retrieve matching components from a repository of existing components. Using the lexical ontology Word-Net, we have conducted a case study to assess if class and method names in open source applications are used in a semantically meaningful way. Our results demonstrate that both hierarchical reuse and parallel reuse can be enhanced through the use of lexically-driven Analogy.}, keywords = {class, developers, functions, jrefactory, method, naming, natural language, reuse, source code, wordnet}, doi = {10.1049/ic:20040487}, attachments = {https://flosshub.org/sites/flosshub.org/files/112LASER.pdf}, author = {Amin, R. 
and Mel O Cinneide and Veale, Tony} } @article {flosswp201, title = {Managing Volunteer Activity in Free Software Projects}, year = {2004}, month = {July}, abstract = {During the last few years, thousands of volunteers have created a large body of free software. Even though this accomplishment shows that the free software development model works, there are some drawbacks associated with this model. Due to the volunteer nature of most free software projects, it is impossible to fully rely on participants. Volunteers may become busy and neglect their duties. This may lead to a steady decrease of quality as work is not being carried out. The problem of inactive volunteers is intensified by the fact that most free software projects are distributed, which makes it hard to quickly identify volunteers who neglect their duties. This paper shows Debian{\textquoteright}s approach to inactive volunteers. Insights presented here can be applied to other free software projects in order to implement effective quality assurance strategies.}, keywords = {debian, volunteer, volunteer teams}, attachments = {https://flosshub.org/sites/flosshub.org/files/michlmayr-mia.pdf}, author = {Martin Michlmayr} } @proceedings {flosswp181, title = {The perils and pitfalls of mining SourceForge}, year = {2004}, month = {05/2004}, pages = {7-11}, abstract = {SourceForge provides abundant accessible data from Open Source Software development projects, making it an attractive data source for software engineering research. However it is not without theoretical peril and practical pitfalls. In this paper, we outline practical lessons gained from our spidering, parsing and analysis of SourceForge data. SourceForge can be practically difficult: projects are defunct, data from earlier systems has been dumped in and crucial data is hosted outside SourceForge, dirtying the retrieved data. 
These practical issues play directly into analysis: decisions made in screening projects can reduce the range of variables, skewing data and biasing correlations. SourceForge is theoretically perilous: because it provides easily accessible data items for each project, tempting researchers to fit their theories to these limited data. Worse, few are plausible dependent variables. Studies are thus likely to test the same hypotheses even if they start from different theoretical bases. To avoid these problems, analyses of SourceForge projects should go beyond project level variables and carefully consider which variables are used for screening projects and which for testing hypotheses.}, keywords = {Data Collection, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/howison04msr.pdf}, author = {Howison, James and Kevin Crowston} } @conference {Gasser04researchinfrastructure, title = {Research Infrastructure for Empirical Science of F/OSS}, booktitle = {Proc. Intern. Workshop on Mining Software Repositories}, year = {2004}, pages = {12-16}, abstract = {F/OSS research faces a new and unusual situation: the traditional difficulties of gathering enough empirical data have been replaced by issues of dealing with enormous amounts of freely available data from many disparate sources (forums, code, bug reports, etc.) At present no means exist for assembling these data under common access points and frameworks for comparative, longitudinal, and collaborative research. Gathering and maintaining large F/OSS data collections reliably and making them usable present several research challenges. For example, current projects usually rely on {\textquotedblleft}web scraping{\textquotedblright} or on direct access to raw data from groups that generate it, and both of these methods require unique effort for each new corpus, or even for updating existing corpora. 
In this paper we identify several common needs and critical factors in F/OSS empirical research, and suggest orientations and recommendations for the design of a shared research infrastructure.}, keywords = {data, Data Collection, empirical, infrastructure}, attachments = {https://flosshub.org/sites/flosshub.org/files/12ResearchInfrastructure.pdf}, author = {Gasser, Les and Gabriel Ripoche and Sandusky, Robert J.} } @conference {flosswp180, title = {Towards a Portfolio of FLOSS project Success Measures}, booktitle = {Workshop on Open Source Software Engineering, International Conference on Software Engineering}, year = {2004}, month = {May}, abstract = {Project success is one of the most widely used dependent variables in information systems research. However, conventional measures of project success are difficult to apply to Free/Libre Open Source Software projects. In this paper, we present an analysis of four measures of success applied to SourceForge projects: number of members of the extended development community, project activity, bug fixing time and number of downloads. We argue that these four measures provide different insights into the collaboration and control mechanisms of the projects.}, keywords = {bug fixing, developers, downloads, project success, sourceforge, team, team size}, attachments = {https://flosshub.org/sites/flosshub.org/files/crowston04towards.pdf}, author = {Kevin Crowston and Hala Annabi and Howison, James and Chengetai Masango} } @conference {German03automatingthe, title = {Automating the measurement of open source projects}, booktitle = {Proceedings of the 3rd Workshop on Open Source Software Engineering}, year = {2003}, note = {"We have chosen to use Evolution to illustrate some of the capabilities of SoftChange. We focus on the data provided by CVS logs and the CVS commit mailing list. 
Our data includes changes to the CVS repository from April 1998 to January 2003."}, pages = {63{\textendash}67}, abstract = {The proliferation of open source projects raises a number of vital economic, social, and software engineering questions that are subject of intense research. Based on experience analyzing numerous open source and commercial projects we propose a set of tools to support extraction and validation of software project data. Such tools would streamline empirical investigation of open source projects and make it possible to test existing and new theories about the nature of open source projects. Our soft- ware includes tools to extract and summarize information from mailing lists, CVS logs, ChangeLog files, and defect tracking databases. More importantly, it cross-links records from various data sources and identifies all contributors for a software change. We illustrate some of the capabilities by analyzing data from Ximian Evolution project.}, keywords = {bug reports, bug tracking, changelog, cvs, defects, evolution, log files, logs, mailing list, scm, softchange, source code, ximian, ximian evolution}, attachments = {https://flosshub.org/sites/flosshub.org/files/germanMockus2003.pdf}, author = {German, Daniel and Audris Mockus} } @conference {DBLP:conf/ecis/TsiavosH03, title = {Beyond good and evil: why open source development for peer-to-peer networks does not necessarily equal to an open society is as imbalanced as copyright law and definitely is not going to make you a better person}, booktitle = {European Conference on Information Systems (ECIS 2003)}, year = {2003}, note = {"In a first stage we collected data related to the development of the Gnutella protocol. 
The sources included: web sites that were used for hosting forums and file repositories related to the development of the protocol that could be either archived or still operational; messages posted on discussion groups, forums and newsgroups; the design documents of the Gnutella protocol. In a second stage we gathered material related to the Limewire application. The sources included: operational and archived web sites having been used for the development of the application; applications such as Concurrent Version Systems (CVS) or Bug reporting tools (such as Issuezila), design and implementation documentation and relevant press reports. The data gathered covered a time span from early 2000 to late November 2002." "Other sources informed our research and also acted, at times of uncertainty, as forms of triangulation and verification (Lee 1991). These sources include websites such as Slashdot.org and WiredNews; IRC-mediated communications and private messages exchanged between the various developers."}, abstract = {This paper interrogates the claims that open source development is an ideal form of regulatory development. We begin by presenting the literature that offers a framework of modalities of regulation where code, along with laws, markets, and norms shape and influence individual action. Within this framework, it is argued that for an Open Society we need Open Code. We present the processes through which the Gnutella protocol and the Limewire application are developed by deconstructing the mechanisms of participation and contribution of the individual developers. The techniques of monitoring, modularization and filtering that we identify appear to be inconsistent with open society promises. Instead we suggest a different framing, that of creating nests of interests, whose creators can find refuge from inhabitants of other nests. 
From that perspective, we suggest that we should stop referring to the war between Copyright and peer-to-peer networks as the battle between good and evil.}, keywords = {bug reports, cvs, design documents, discussion, discussion forums, documentation, forums, gnutella, issuezilla, limewire, web site}, attachments = {https://flosshub.org/sites/flosshub.org/files/tsiavosHosein.pdf}, author = {Prodromos Tsiavos and Ian Hosein} } @article {flosswp103, title = {Contributing to the common pool resources in Open Source software. A comparison between individuals and firms}, year = {2003}, month = {August}, abstract = {This paper studies the contributions to Open Source projects of software firms. Our goal is to analyse whether they follow the same regularities that characterize the behaviour of individual programmers. An exhaustive empirical analysis is carried out using data on project membership, project coordination and contribution efforts of 146 Italian firms that do business with Open Source software. We follow a meta-analytic approach comparing our findings with the results of the surveys conducted on Free Software programmers. Moreover, the availability of the data gathered by Hertel et al. (2003) on 141 developers of the Linux kernel will allow direct comparisons between the two sets.}, keywords = {developers, linux, linux kernel, Survey}, attachments = {https://flosshub.org/sites/flosshub.org/files/bnaccorsirossidevelopers.pdf}, author = {Andrea Bonaccorsi} } @article {2003, title = {From a Firm-Based to a Community-Based Model of Knowledge Creation: The Case of the Linux Kernel Development}, journal = {Organization Science}, volume = {14}, number = {6}, year = {2003}, note = {"we study the Linux development community mainly by analyzing the artifacts that the Linux developers have produced. A key output of knowledge creation activities is the artifacts. The most important artifact, of course, is the Linux operating system source code." 
"Along with the source code, a "Credits" text file and a "MAINTAINERS" text file are distributed to the users." "An equally important artifact is the development activities archived in the Linux-kernel mailing list"..."Using the weekly Linux-kernel email archive for years 1995 to 2000 as a key source of data, we focus on people who have sent at least one email to the Linux-kernel mailing list. " "In addition, we examine the developers{\textquoteright} demographic distributions, working patterns, and motivations by analyzing the raw data from an on-line survey"}, pages = {633-649}, publisher = {INFORMS}, abstract = {We propose a new model of knowledge creation in purposeful, loosely coordinated, distributed systems, as an alternative to a firm-based one. Specifically, using the case of the Linux kernel development project, we build a model of community-based, evolutionary knowledge creation to study how thousands of talented volunteers, dispersed across organizational and geographical boundaries, collaborate via the Internet to produce a knowledge-intensive, innovative product of high quality. By comparing and contrasting the Linux model with the traditional/commercial model of software development and firm-based knowledge creation efforts, we show how the proposed model of knowledge creation expands beyond the boundary of the firm. Our model suggests that the product development process can be effectively organized as an evolutionary process of learning driven by criticism and error correction. We conclude by offering some theoretical implications of our community-based model of knowledge creation for the literature of organizational learning, community life, and the uses of knowledge in society.}, keywords = {credits, developers, email, email archives, knowledge creation, linux kernel, mailing list, maintainers, scm, source code, Survey, Volunteers}, issn = {1047-7039}, url = {http://www.jstor.org/stable/4135125}, author = {Lee, Gwendolyn K. 
and Cole, Robert E.} } @article {flosswp122, title = {Managing the Boundary of an {\textquoteright}Open{\textquoteright} Project}, year = {2003}, month = {October}, abstract = {In the past ten years, the boundaries between public and open science and commercial research efforts have become more porous. Scholars have thus more critically examined ways in which these two institutional regimes intersect. Large open source software projects have also attracted commercial collaborators and now struggle to develop code in an open public environment that still protects their communal boundaries. This research applies a dynamic social network approach to understand how one community managed software project, Debian, develops a membership process. We examine the project{\textquoteright}s face-to-face social network during a five-year period (1997-2001) to see how changes in the social structure affect the evolution of membership mechanisms and the determination of gatekeepers. While the amount and importance of a contributor{\textquoteright}s work increases the probability that a contributor will become a gatekeeper, those more central in the social network are more likely to become gatekeepers and influence the membership process. 
A greater understanding of the mechanisms open projects use to manage their boundaries has critical implications for research and knowledge producing communities operating in pluralistic, open and distributed environments.}, keywords = {debian, membership, social network analysis}, attachments = {https://flosshub.org/sites/flosshub.org/files/omahonyferraro.pdf}, author = {Siobhan O{\textquoteright}Mahony} } @article {123, title = {Open source software development and Lotka{\textquoteright}s Law: Bibliometric patterns in programming}, journal = {Journal of the American Society for Information Science and Technology}, volume = {54}, number = {2}, year = {2003}, note = {"Two lead- ing metadata repositories are the Linux Software Map (LSM) and Sourceforge, both of which were used for this research." "For this article, we examined data listing the number of registered developers for each software project hosted by Sourceforge." "The data we obtained from the LSM collection were taken mainly from the Author: field of LSM records. The Author: field in LSM records gives us the ability to track the author of record for a software package. LSM metadata also include a list of maintainers, primary software distribution sites, date of update and other items." "The data we obtained from Sourceforge consist of a list of developer ID numbers, followed by the number of projects on which the individual is listed as a developer, then the number of projects on which the individual is listed as an administrator. These data were provided for all 33,892 individuals registered to work on projects hosted by Sourceforge in July 2001."}, pages = {169-178}, abstract = {This research applies Lotka{\textquoteright}s Law to metadata on open source software development. Lotka{\textquoteright}s Law predicts the proportion of authors at different levels of productivity. 
Open source software development harnesses the creativity of thousands of programmers worldwide, is important to the progress of the Internet and many other computing environments, and yet has not been widely researched. We examine metadata from the Linux Software Map (LSM), which documents many open source projects, and Sourceforge, one of the largest resources for open source developers. Authoring patterns found are comparable to prior studies of Lotka{\textquoteright}s Law for scientific and scholarly publishing. Lotka{\textquoteright}s Law was found to be effective in understanding software development productivity patterns, and offer promise in predicting aggregate behavior of open source developers.}, keywords = {developers, linux, linux software map, lsm, sourceforge, team size}, doi = {10.1002/asi.10177}, author = {Newby, G. B. and Greenberg, J. and Jones, P.} } @conference {Wynn03organizationalstructure, title = {Organizational Structure of Open Source Projects: A Life Cycle Approach}, booktitle = {Proceedings of 7th Annual Conference of the Southern Association for Information Systems}, year = {2003}, note = {"The three graphs in Figure 2 below were taken from smoothed download counts for existing open source projects on Sourceforge.net" "A random sample of 150 open source projects will be taken from data provided by Sourceforge.net. Each project will be evaluated to determine their current life cycle stage (where possible) using download counts. Next, the project admins, developers, and several identifiable users for each evaluated project will be contacted via email to request completing a brief questionnaire to measure the current focus of the project, formal structure, division of labor, leader role, coordination, level of commitment, user success, and developer success. "}, abstract = {The structure of open source project communities is discussed in relation to the organizational life cycle. 
In lieu of sales figures, the download counts for each project are used to identify the life cycle stage of a random sample of open source projects. A research model is proposed that attempts to measure the fit between the life cycle stage and the specific organizational characteristics of these projects (focus, division of labor, role of the leader, level of commitment, and coordination/control) as an indicator of the success of a project as measured by the satisfaction and involvement of both developers and users.}, keywords = {division of labor, downloads, growth, interview, leadership, life cycle, lifecycle, project success, roles, sourceforge, Survey}, attachments = {https://flosshub.org/sites/flosshub.org/files/wynn2004.pdf}, author = {Donald E. Wynn} } @conference {1248, title = {Supporting Distributed and Decentralized Projects: Drawing Lessons from the Open Source Community}, booktitle = {1st Workshop on Open Source in an Industrial Context}, year = {2003}, note = {"We begin the remainder of the paper with discussion of a survey of open source projects, showing similarities that have arisen in tool usage"}, month = {10/2003}, abstract = {Open source projects are typically organized in a distributed and decentralized manner. These factors strongly determine the processes followed and constrain the types of tools that can be utilized. This paper explores how distribution and decentralization have affected processes and tools in existing open source projects with the goals of summarizing the lessons learned and identifying opportunities for improving both. Issues considered include decision-making, accountability, communication, awareness, rationale, managing source code, testing, and release management.}, keywords = {abiword, apache, debian, freebsd, kde, linux, mozilla, mysql, perl, PHP, postgresql, python, subversion, tomcat, tools}, attachments = {https://flosshub.org/sites/flosshub.org/files/erenkrantz2003.pdf}, author = {Erenkrantz, J. 
and Taylor, R.N.} } @conference {Ye:2003:TUM:776816.776867, title = {Toward an understanding of the motivation of Open Source Software developers}, booktitle = {Proceedings of the 25th International Conference on Software Engineering}, series = {ICSE {\textquoteright}03}, year = {2003}, note = {"Analyzing the emails sent to the mailing list is one way of understanding the structure of the community." "Table 2 displays the number of code contributions made by members to the GIMP system and the defined roles of those contributing members. We counted the number of contributions made by each person by analyzing the change log of the system."}, pages = {419{\textendash}429}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Washington, DC, USA}, abstract = {An Open Source Software (OSS) project is unlikely to be successful unless there is an accompanied community that provides the platform for developers and users to collaborate. Members of such communities are volunteers whose motivation to participate and contribute is of essential importance to the success of OSS projects. In this paper, we aim to create an understanding of what motivates people to participate in OSS communities. We theorize that learning is one of the motivational forces. Our theory is grounded in the learning theory of Legitimate Peripheral Participation, and is supported by analyzing the social structure of OSS communities and the co-evolution between OSS systems and communities. 
We also discuss practical implications of our theory for creating and maintaining sustainable OSS communities as well as for software engineering research and education.}, keywords = {change log, COMMUNITY, contributions, contributors, developers, email, email archives, evolution, gimp, log files, mailing list, roles, source code}, isbn = {0-7695-1877-X}, url = {http://portal.acm.org/citation.cfm?id=776816.776867}, attachments = {https://flosshub.org/sites/flosshub.org/files/YeKishida.pdf}, author = {Ye, Yunwen and Kishida, Kouichi} } @conference {1160, title = {Adopting OSS Methods by Adopting OSS Tools}, booktitle = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, abstract = {The open source movement has created and used a set of software engineering tools with features that fit the characteristics of open source development processes. To a large extent, the open source culture and methodology are conveyed to new developers via the toolset itself, and through the demonstrated usage of these tools on existing projects. The rapid and wide adoption of open source tools stands in stark contrast to the difficulties encountered in adopting traditional CASE tools. This paper explores the characteristics that make these tools adoptable and how adopting them may influence software development processes.}, keywords = {ant, argouml, bugzilla, cactus, cvs, developers, eclipse, emacs, email, faq, junit, mailing lists, make, netbeans, package management, rpm, scarab, subversion, teams, tools, torque, WORK}, attachments = {https://flosshub.org/sites/flosshub.org/files/Robbins.pdf}, author = {Robbins, Jason E.} } @article {flosswp40, title = {Cave or Community? An Empirical Examination of 100 Mature Open Source Projects}, journal = {First Monday}, volume = {7}, number = {6}, year = {2002}, note = {The author conducts an empirical study of the top 100 mature projects on SourceForge.net to develop an understanding of the F/OSS community. 
The author sought empirical evidence that would help us understand which is more common - the cave (i.e., lone producer) or the community in F/OSS development. Some key findings include: first, most F/OSS programs are developed by individuals, rather than communities. Second, most OSS programs do not generate a lot of discussion. Third, products with more developers tend to be viewed and downloaded more often. Fourth, the number of developers associated with a project is unrelated to the age of the project.}, month = {06/2002}, abstract = {Starting with Eric Raymond{\textquoteright}s groundbreaking work, "The Cathedral and the Bazaar", open-source software (OSS) has commonly been regarded as work produced by a community of developers. Yet, given the nature of software programs, one also hears of developers with no lives that work very hard to achieve great product results. In this paper, I sought empirical evidence that would help us understand which is more common - the cave (i.e., lone producer) or the community. Based on a study of the top 100 mature products on Sourceforge, I find a few surprising things. First, most OSS programs are developed by individuals, rather than communities. The median number of developers in the 100 projects I looked at was 4 and the mode was 1 - numbers much lower than previous numbers reported for highly successful projects! Second, most OSS programs do not generate a lot of discussion. Third, products with more developers tend to be viewed and downloaded more often. Fourth, the number of developers associated with a project was unrelated to the age of the project. 
Fifth, the larger the project, the smaller the percent of project administrators.}, keywords = {age, contributors, developers, project success, registration, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/krishnamurthy.pdf}, author = {Sandeep Krishnamurthy} } @conference {1153, title = {Characterizing the OSS process}, booktitle = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, note = {"We have considered two well-known Open Source portals (FreshMeat [1] and SourceForge [2] )." "Using pseudo-random sampling we have selected a sample of 400 projects (mostly from FreshMeat). Each project is described by several variables (programming language, type of license, size of source code, type of documentation available and others). By indirect means (analysis of the Changelog file, or CVS) it is also possible to compute the number of people working on the project, and the number of external contributors. From FreshMeat we get both a vitality index, that considers the number of releases per time period, and a popularity index, which is a first measure of the interest of users to the project (project URL hits, mixed with subscriptions to it)."}, abstract = {The Open Source model of software development has gained the attention of both the business, the practitioners{\textquoteright} and the research communities. The Open Source process has been described by the seminal paper by Eric Raymond [4] and [5]. However, sound empirical studies are still very limited [3], [6]. Our goal is to investigate the OS process by empirical means, to analyze, characterize it, and possibly model it with quantitative models. It should be noted that the Open Source process provides open process and product data, and therefore is a rare opportunity for empirical research. Our initial research focus is on the characterization of the process, starting from the evolution of OS projects. 
In traditional projects, a significant number of releases in a short time is usually considered an instability factor [7] and [8], while in the OSS community, it is an evidence of vitality, shows the commitment of the authors and the power of attraction of other programmers [9]. Is it possible to characterize the vitality of projects? And, can vitality be traced to some other characteristics of a project?}, keywords = {bugs, change log, classification, cvs, downloads, freshmeat, metadata, patches, popularity, project success, release history, sourceforge, vitality}, attachments = {https://flosshub.org/sites/flosshub.org/files/CapiluppiLagoMorisio.pdf}, author = {Capiluppi, Andrea and Patricia Lago and Maurizio Morisio} } @article {1117, title = {Effort, co-operation and co-ordination in an open source software project: GNOME}, journal = {Information Systems Journal}, volume = {12}, year = {2002}, note = {"Also retrieved by a Perl script were the postings to the relevant discussion lists including the sender, the subject, time and complete text. For analysis of the posting behaviour of the programmers, the short name that each programmer uses for checkins had to be matched to the full name or email address used for postings. For 175 persons, this has been possible using several regular expressions with human check-up."}, month = {01/2002}, pages = {27--42}, abstract = {This paper presents results from research into open source projects from a software engineering perspective. The research methodology employed relies on public data retrieved from the CVS repository of the GNOME project and relevant discussion groups. This methodology is described, and results concerning the special characteristics of open source software development are given. 
These data are used for a first approach to estimating the total effort to be expended.}, keywords = {cvs, discussion, effort estimation, gnome}, issn = {1365-2575}, doi = {10.1046/j.1365-2575.2002.00110.x}, author = {Koch, Stefan and Schneider, Georg} } @conference {stewart2002an-explorat, title = {An Exploratory Study of Factors Influencing the Level of Vitality and Popularity of Open Source Projects}, booktitle = {ICIS 2002. Proceedings of International Conference on Information Systems 2002}, year = {2002}, note = {"We are currently tracking publicly available data on 240 open source projects registered on the freshmeat Website." "First, we randomly selected a total of 120 projects from the utilities, software development, and games and entertainment areas. We then selected 120 projects from these forums that had been registered on the site during the two weeks prior to the start of our data collection effort."}, pages = {1--5}, abstract = {In this research, we ask the question: What differentiates successful from unsuccessful open source software projects? Using a sample of 240 open source projects, we examine how organizational sponsorship, target audience (developer versus end user), license choice, and development status interact over time to influence the extent to which open source software projects attract user attention and developer activity.}, keywords = {activity, audience, developers, freshmeat, license analysis, licenses, organizational sponsorship, project success, roles, status, target audience, users}, author = {Stewart, Katherine J. 
and Ammeter, Tony} } @conference {Madey_Freeh_Tynan_2002, title = {The open source software development phenomenon: An analysis based on social network theory}, booktitle = {Proceedings of the Eighth Americas Conference on Information Systems}, year = {2002}, pages = {1806{\textendash}1813}, abstract = {The OSS movement is a phenomenon that challenges many traditional theories in economics, software engineering, business strategy, and IT management. Thousands of software programmers are spending tremendous amounts of time and effort writing and debugging software, most often with no direct monetary compensation. The programs, some of which are extremely large and complex, are written without the benefit of traditional project management, change tracking, or error checking techniques. Since the programmers are working outside of a traditional organizational reward structure, accountability is an issue as well. A significant portion of internet e-commerce runs on OSS, and thus many firms have little choice but to trust mission-critical e-commerce systems to run on such software, requiring IT management to deal with new types of socio-technical problems. A better understanding of how the OSS community functions may help IT planners make more informed decisions and develop more effective strategies for using OSS software. We hypothesize that open source software development can be modeled as self-organizing, collaboration, social networks. We analyze structural data on over 39,000 open source projects hosted at SourceForge.net involving over 33,000 developers. We define two software developers to be connected {\textemdash} part of a collaboration social network {\textemdash} if they are members of the same project, or are connected by a chain of connected developers. Project sizes, developer project participation, and clusters of connected developers are analyzed. 
We find evidence to support our hypothesis, primarily in the presence of power-law relationships on project sizes (number of developers per project), project membership (number of projects joined by a developer), and cluster sizes. Potential implications for IT researchers, IT managers, and governmental policy makers are discussed.}, keywords = {developers, social network analysis, social networks, sourceforge}, url = {http://ais.bepress.com/cgi/viewcontent.cgi?article=1606\&context=amcis2002}, attachments = {https://flosshub.org/sites/flosshub.org/files/MadeyFreehAmcis2002.pdf}, author = {Madey, G. and Freeh, V and Tynan, R} } @article {flosswp63, title = {The Scope of Open Source Licensing}, journal = {Journal of Law, Economics and Organization}, volume = {21}, number = {1}, year = {2005}, pages = {20--56}, abstract = {This paper is an initial exploration of the determinants of open source license choice. It first enumerates the various considerations that should figure into the licensor{\textquoteright}s choice of contractual terms, in particular highlighting how the decision is shaped not just by the preferences of the licensor itself, but also by that of the community of developers. The paper then presents an empirical analysis of the determinants of license choice using the SourceForge database, a compilation of nearly 40,000 open source projects. Projects geared toward end-users tend to have restrictive licenses, while those oriented toward developers are less likely to do so. Projects that are designed to run on commercial operating systems and those geared towards the Internet are less likely to have restrictive licenses. 
Finally, projects that are likely to be attractive to consumers such as games are more likely to have restrictive licenses.}, keywords = {developers, license, licenses, permissive, restrictive, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/lernertirole2.pdf}, author = {Josh Lerner and Jean Tirole} } @article {121, title = {Two case studies of open source software development: Apache and Mozilla}, journal = {ACM Transactions on Software Engineering and Methodology}, volume = {11}, number = {3}, year = {2002}, note = {apache data sources: email, cvs, bug database regarding email: "We wrote Perl scripts to extract date, sender identity, message subject, and the message body that was further processed to obtain details on code changes and problem reports (see below). Manual inspection was used to resolve such things as multiple email addresses in cases where all automated techniques failed." (but the rest of the paper does not address this data source at all) mozilla data sources bugzilla, cvs }, pages = {309--346}, abstract = {According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. In order to begin investigating such claims, we examine data from two major open source projects, the Apache web server and the Mozilla browser. By using email archives of source code change history and problem reports we quantify aspects of developer participation, core team size, code ownership, productivity, defect density, and problem resolution intervals for these OSS projects. We develop several hypotheses by comparing the Apache project with several commercial projects. We then test and refine several of these hypotheses, based on an analysis of Mozilla data. 
We conclude with thoughts about the prospects for high-performance commercial/open source process hybrids.}, keywords = {apache, bug fixing, bug reports, bugzilla, change history, core, defect density, email, email archives, mailing list, mozilla, ownership, participation, productivity, scm, source code}, attachments = {https://flosshub.org/sites/flosshub.org/files/mockusFieldingHerbsleb2002.pdf}, author = {Audris Mockus and Roy Fielding and Herbsleb, J. D.} } @conference {Madey02understandingoss, title = {Understanding {OSS} as a self-organizing process}, booktitle = {2nd Workshop on Open Source Software Engineering at the 24th International Conference on Software Engineering (ICSE 2002)}, year = {2002}, abstract = {We hypothesize that open source software development can be modeled as self-organizing, collaboration, social networks. We analyze structural data on over 39,000 open source projects hosted at SourceForge.net. We define two software developers to be connected {\textemdash} part of a collaboration social network {\textemdash} if they are members of the same project, or are connected by a chain of connected developers. Project sizes, developer project participation, and clusters of connected developers are analyzed. We find evidence to support our hypothesis, primarily in the presence of power-law relationships on project sizes (number of developers per project), project membership (number of projects joined by a developer), and cluster sizes.}, keywords = {developers, size, social network analysis, social networks, sourceforge}, attachments = {https://flosshub.org/sites/flosshub.org/files/MadeyFreehTynan.pdf}, author = {Madey, G. and Freeh, V and Tynan, R} } @conference {1157, title = {Why Do Developers Contribute to Open Source Projects? 
First Evidence of Economic Incentives}, booktitle = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, note = {"The data for this research come from two primary sources: Apache project archives and a targeted survey of Apache participants. Archival data are open source project artifacts such as email and source code archives, source code version control meta-data and developer web sites."}, abstract = {The availability of commercial quality, free software products such as the Apache HTTP (web) server or the Linux operating system has focused significant attention on the open source development process by which these products were created. One of the more perplexing aspects of open source software projects is why developers freely devote their time and energy to these projects. While many open source participants cite idealistic motives for participation, Lerner and Tirole (2000) argue that developer participation in open source projects may, in part, be explained by existing economic theory regarding career concerns. This research seeks to confirm or disconfirm the existence of economic returns to participation in open source development. Preliminary results of our empirical investigation suggest that greater open source participation per se, as measured in contributions made, does not lead to wage increases. However, a higher status in a merit-based ranking within the Apache Project does lead to significantly higher wages. This suggests that employers do not reward the gain in experience through open source participation as an increase in human capital. 
The results are also consistent with the notion that a high rank within the Apache Software Foundation is a credible signal of the productive capacity of a programmer.}, keywords = {apache, contributions, cvs, developers, ECONOMICS, email, email archives, financial, Human capital, mailing list, MOTIVATION, participation, source code, version control}, attachments = {https://flosshub.org/sites/flosshub.org/files/HannRobertsSlaughterFielding.pdf}, author = {Il-Horn Hann and Jeff Roberts and Sandra Slaughter and Roy Fielding} } @conference {1159, title = {Why Not Improve Coordination in Distributed Software Development by Stealing Good Ideas from Open Source?}, booktitle = {Proceedings of the 2nd ICSE Workshop on Open Source}, year = {2002}, keywords = {apache, communication, coordination, distributed}, attachments = {https://flosshub.org/sites/flosshub.org/files/MockusHerbsleb.pdf}, author = {Audris Mockus and Herbsleb, James} } @conference {1138, title = {Creating a Free, Dependable Software Engineering Environment for Building Java Applications}, booktitle = {1st Workshop on Open Source Software Engineering at ICSE 2001}, year = {2001}, abstract = {As open source software engineering becomes more prevalent, employing sound software engineering practices and the tools used to implement these practices becomes more important. This paper examines the current status of free software engineering tools. For each set of tools, we determined the important attributes that would best assist a developer in each stage of the waterfall model. We rated each tool based on predetermined attributes. We used the creation of a graphical user interface based email client in Java to assist in evaluating each tool. Our findings show that there is still a need for free tools to extract UML diagrams, test graphical user interfaces, make configuring Emacs easier, and profile Java applications. 
In other areas there are free tools that provide satisfactory functionality such as Concurrent Versions System (CVS), GVim, JUnit, JRefactory, GNU Make, Jakarta Ant, Javadoc, and Doc++.}, keywords = {applications, cvs, Doc++, GNU Make, GVim, Jakarta Ant, java, Javadoc, jrefactory, junit, tools}, attachments = {https://flosshub.org/sites/flosshub.org/files/bittman.pdf}, author = {Bittman, M. and Roos, R. and Kapfhammer, G.M.} } @conference {1151, title = {Open Source Development: A suitable Method to introduce a standardized communication protocol?}, booktitle = {1st Workshop on Open Source Software Engineering at ICSE 2001}, year = {2001}, abstract = {Open Source developments like telnet and Apache are very important for an extensive and compatible use of the TCP/IP and the HTTP protocol. Both contain features which rely on a compatible implementation of complex interactions between computers of different platform types. This can be successfully achieved with the help of Open Source. Reconciliation of different interpretations of the standard, clarifications and extensions can be realised by discussion systems and some generally accepted conciliators of Open Source projects. As long as companies have a commercial interest in selling solutions based on the protocol, it makes economic sense for them to jointly support its common implementation. 
This paper examines the possibility to use an Open Source reference implementation of the agricultural communication protocols DIN 9684 [1] [2] and ISO 11783 to establish a capable and compatible implementation in agricultural mobile process control and data collection.}, keywords = {communication protocol, DIN 9684, ISO 11783, open source}, attachments = {https://flosshub.org/sites/flosshub.org/files/spangler.pdf}, author = {Spangler, Achim} } @conference {1139, title = {Open Source Development: An Arthurian Legend}, booktitle = {1st Workshop on Open Source Software Engineering at ICSE 2001}, year = {2001}, abstract = {OSSD (Open Source Software Development) achieves remarkable success in delivering complex software systems {\textendash} systems which are incredibly reliable and robust {\textendash} in a short amount of time and without even paying anyone! Naturally, in the face of this success, organizations are interested in seeing if the mechanisms behind OSSD success can be migrated into their own practices, hopefully improving their systems and their productivity. 
In this paper, we look (lighthearted at first) at the motivations behind those involved in OSSD and describe the problems that need to be overcome if OSSD-type practices can be migrated into traditional organizations.}, keywords = {commercial software, developers, MOTIVATION}, attachments = {https://flosshub.org/sites/flosshub.org/files/cook.pdf}, author = {Cook, J.} } @conference {1145, title = {Reputation Layers for Open-Source Development}, booktitle = {1st Workshop on Open Source Software Engineering at ICSE 2001}, year = {2001}, keywords = {currency, developers, MOTIVATION, reputation}, attachments = {https://flosshub.org/sites/flosshub.org/files/masum.pdf}, author = {Hasan Masum} } @conference {flosswp26, title = {A Case Study of Open Source Software Development: The Apache Server}, booktitle = {Proceedings of the International Conference on Software Engineering (ICSE 2000)}, year = {2000}, note = {We used the following archival sources of data: Developer email list (EMAIL). Concurrent Version Control archive (CVS). Problem reporting database (BUGDB).}, month = jun, abstract = {According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. We examine the development process of a major open source application, the Apache web server. By using email archives of source code change history and problem reports we quantify aspects of developer participation, core team size, code ownership, productivity, defect density, and problem resolution interval for this OSS project. 
This analysis reveals a unique process, which performs well on important measures.}, keywords = {apache, bug fix revisions, bugs, core, cvs, defect density, developers, email archives, participation, productivity, revision control, revision history, roles, scm, source code, team size}, attachments = {https://flosshub.org/sites/flosshub.org/files/mockusapache.pdf}, author = {Audris Mockus and Roy Fielding and Herbsleb, James} } @conference {Yamauchi:2000:CLM:358916.359004, title = {Collaboration with Lean Media: how open-source software succeeds}, booktitle = {Proceedings of the 2000 ACM conference on Computer supported cooperative work (CSCW)}, series = {CSCW {\textquoteright}00}, year = {2000}, pages = {329{\textendash}338}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Open-source software, usually created by volunteer programmers dispersed worldwide, now competes with that developed by software firms. This achievement is particularly impressive as open-source programmers rarely meet. They rely heavily on electronic media, which preclude the benefits of face-to-face contact that programmers enjoy within firms. In this paper, we describe findings that address this paradox based on observation, interviews and quantitative analyses of two open-source projects. The findings suggest that spontaneous work coordinated afterward is effective, rational organizational culture helps achieve agreement among members and communications media moderately support spontaneous work. These findings can imply a new model of dispersed collaboration.}, keywords = {cooperative work, cvs, distributed work, electronic media, INNOVATION, open-source, software engineering}, isbn = {1-58113-222-0}, doi = {10.1145/358916.359004}, url = {http://doi.acm.org/10.1145/358916.359004}, author = {Yamauchi, Yutaka and Yokozawa, Makoto and Shinohara, Takeshi and Ishida, Toru} }