@comment{Publications on mining developer communication (e-mail, code review, code comments, API usage). Layout: one field per line; months use the standard three-letter macros; page ranges use a double hyphen.}

@inproceedings{1915,
  author        = {Pascarella, Luca and Bacchelli, Alberto},
  title         = {Classifying code comments in {Java} open-source software systems},
  booktitle     = {Proceedings of the 14th International Conference on Mining Software Repositories},
  year          = {2017},
  month         = may,
  pages         = {227--237},
  keywords      = {java, Survey},
  note          = {``we conducted an in-depth analysis of the comments in the source code files of six major OSS systems in Java''},
  internal-note = {booktitle was absent from the exported record and was added during review; confirm against the publisher record},
  abstract      = {Code comments are a key software component containing information about the underlying implementation. Several studies have shown that code comments enhance the readability of the code. Nevertheless, not all the comments have the same goal and target audience. In this paper, we investigate how six diverse Java OSS projects use code comments, with the aim of understanding their purpose. Through our analysis, we produce a taxonomy of source code comments; subsequently, we investigate how often each category occur by manually classifying more than 2,000 code comments from the aforementioned projects. In addition, we conduct an initial evaluation on how to automatically classify code comments at line level into our taxonomy using machine learning; initial results are promising and suggest that an accurate classification is within reach.},
}

@inproceedings{1760,
  author        = {Sawant, Anand Ashok and Bacchelli, Alberto},
  title         = {A Dataset For {API} Usage},
  booktitle     = {Proceedings of the 12th Working Conference on Mining Software Repositories},
  year          = {2015},
  month         = may,
  publisher     = {IEEE},
  url           = {http://sback.it/publications/msr2015data.pdf},
  attachments   = {https://flosshub.org/sites/flosshub.org/files/msr2015data.pdf},
  internal-note = {booktitle was absent from the exported record and was added during review; confirm against the publisher record},
  abstract      = {An Application Programming Interface (API) provides a specific set of functionalities to a developer. The main aim of an API is to encourage the reuse of already existing functionality. There has been some work done into API popularity trends, API evolution and API usage. For all the aforementioned research avenues there has been a need to mine the usage of an API in order to perform any kind of analysis. Each one of the approaches that has been employed in the past involved a certain degree of inaccuracy as there was no type check that takes place.
    We introduce an approach that takes type information into account while mining API method invocations and annotation usages. This approach accurately makes a connection between a method invocation and the class of the API to which the method belongs to. We try collecting as many usages of an API as possible, this is achieved by targeting projects hosted on GitHub. Additionally, we look at the history of every project to collect the usage of an API from earliest version onwards. By making such a large and rich dataset public, we hope to stimulate some more research in the field of APIs with the aid of accurate API usage samples.},
}

@inproceedings{1768,
  author        = {Hellendoorn, Vincent J. and Devanbu, Premkumar T. and Bacchelli, Alberto},
  title         = {Will they like this? Evaluating Code Contributions With Language Models},
  booktitle     = {Proceedings of the 12th Working Conference on Mining Software Repositories},
  year          = {2015},
  month         = may,
  publisher     = {IEEE},
  url           = {http://sback.it/publications/msr2015.pdf},
  attachments   = {https://flosshub.org/sites/flosshub.org/files/msr2015_0.pdf},
  internal-note = {booktitle was absent from the exported record and was added during review; confirm against the publisher record},
  abstract      = {Popular open-source software projects receive and review contributions from a diverse array of developers, many of whom have little to no prior involvement with the project. A recent survey reported that reviewers consider conformance to the project's code style to be one of the top priorities when evaluating code contributions on Github. We propose to quantitatively evaluate the existence and effects of this phenomenon. To this aim we use language models, which were shown to accurately capture stylistic aspects of code. We find that rejected changesets do contain code significantly less similar to the project than accepted ones; furthermore, the less similar changesets are more likely to be subject to thorough review.
    Armed with these results we further investigate whether new contributors learn to conform to the project style and find that experience is positively correlated with conformance to the project's code style.},
}

@inproceedings{Beller:2014:MCR:2597073.2597082,
  author       = {Beller, Moritz and Bacchelli, Alberto and Zaidman, Andy and Juergens, Elmar},
  title        = {Modern Code Reviews in Open-source Projects: Which Problems Do They Fix?},
  booktitle    = {Proceedings of the 11th Working Conference on Mining Software Repositories},
  series       = {MSR 2014},
  year         = {2014},
  pages        = {202--211},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  isbn         = {978-1-4503-2863-0},
  doi          = {10.1145/2597073.2597082},
  url          = {http://doi.acm.org/10.1145/2597073.2597082},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/beller.pdf},
  keywords     = {code review, defects, open source software},
  abstract     = {Code review is the manual assessment of source code by humans, mainly intended to identify defects and quality problems. Modern Code Review (MCR), a lightweight variant of the code inspections investigated since the 1970s, prevails today both in industry and open-source software (OSS) systems. The objective of this paper is to increase our understanding of the practical benefits that the MCR process produces on reviewed source code. To that end, we empirically explore the problems fixed through MCR in OSS systems. We manually classified over 1,400 changes taking place in reviewed code from two OSS projects into a validated categorization scheme. Surprisingly, results show that the types of changes due to the MCR process in OSS are strikingly similar to those in the industry and academic systems from literature, featuring the similar 75:25 ratio of maintainability-related to functional problems. We also reveal that 7--35\% of review comments are discarded and that 10--22\% of the changes are not triggered by an explicit review comment.
    Patterns emerged in the review data; we investigated them revealing the technical factors that influence the number of changes due to the MCR process. We found that bug-fixing tasks lead to fewer changes and tasks with more altered files and a higher code churn have more changes. Contrary to intuition, the person of the reviewer had no impact on the number of changes.},
}

@inproceedings{1491,
  author        = {Guzzi, Anja and Bacchelli, Alberto and Lanza, Michele and Pinzger, Martin and van Deursen, Arie},
  title         = {Communication in Open Source Software Development Mailing Lists},
  booktitle     = {Proceedings of the 10th Working Conference on Mining Software Repositories},
  year          = {2013},
  month         = may,
  pages         = {277--286},
  url           = {http://www.st.ewi.tudelft.nl/~guzzi/downloads/Guzzi2013msr.pdf},
  attachments   = {https://flosshub.org/sites/flosshub.org/files/Guzzi2013msr.pdf},
  keywords      = {email, lucene, mailing list},
  note          = {``The entire dataset used in the experiment, including the cards, the resolved aliases, and detailed statistical results, can be downloaded from ...'' http://www.st.ewi.tudelft.nl/\textasciitilde{}guzzi/oss-communication/},
  internal-note = {booktitle was absent from the exported record and was added during review; confirm against the publisher record},
  abstract      = {Open source software (OSS) development teams use electronic means, such as emails, instant messaging, or forums, to conduct open and public discussions. Researchers investigated mailing lists considering them as a hub for project communication. Prior work focused on specific aspects of emails, for example the handling of patches, traceability concerns, or social networks. This led to insights pertaining to the investigated aspects, but not to a comprehensive view of what developers communicate about. Our objective is to increase the understanding of development mailing lists communication. We quantitatively and qualitatively analyzed a sample of 506 email threads from the development mailing list of a major OSS project, Lucene. Our investigation reveals that implementation details are discussed only in about 35\% of the threads, and that a range of other topics is discussed.
    Moreover, core developers participate in less than 75\% of the threads. We observed that the development mailing list is not the main player in OSS project communication, as it also includes other channels such as the issue repository.},
}

@inproceedings{1357,
  author      = {Bacchelli, Alberto and Dal Sasso, Tommaso and D'Ambros, Marco and Lanza, Michele},
  title       = {Content classification of developer emails},
  booktitle   = {Proceedings of the 34th IEEE/ACM International Conference On Software Engineering (ICSE 2012)},
  year        = {2012},
  month       = jun,
  url         = {http://www.inf.usi.ch/phd/bacchelli/publications.php},
  attachments = {https://flosshub.org/sites/flosshub.org/files/icse2012.pdf},
  keywords    = {email, Emails, Empirical software engineering, mailing list, natural language, Unstructured Data Mining},
  note        = {We created a web application to manually classify email content in the chosen categories. We classified a statistically significant set of emails from four java open source software (OSS) systems, used to evaluate the accuracy of our approach. The contributions of this paper are: 1) a novel approach that fuses parsing and ML techniques for classification of email lines; 2) a web application to manually classify email content; 3) the manual classification of a statistically significant sample set of emails (for a total of 67,792 lines) from mailing lists of four different software systems--in the form of a freely available benchmark; and 4) the empirical evaluation of our approach against the benchmark},
  abstract    = {Emails related to the development of a software system contain information about design choices and issues encountered during the development process. Exploiting the knowledge embedded in emails with automatic tools is challenging, due to the unstructured, noisy and mixed language nature of this communication medium. Natural language text is often not well-formed and is interleaved with languages with other syntaxes, such as code or stack traces.
    We present an approach to classify email content at line level. Our technique classifies email lines in five categories (i.e., text, junk, code, patch, and stack trace) to allow one to subsequently apply ad hoc analysis techniques for each category. We evaluated our approach on a statistically significant set of emails gathered from mailing lists of four unrelated open source systems.},
}

@inproceedings{Bacchelli:2011:EEE:1985793.1985999,
  author       = {Bacchelli, Alberto},
  title        = {Exploring, exposing, and exploiting emails to include human factors in software engineering},
  booktitle    = {Companion to the Proceedings of the 33rd International Conference on Software Engineering},
  series       = {ICSE '11},
  year         = {2011},
  pages        = {1074--1077},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  isbn         = {978-1-4503-0445-0},
  doi          = {10.1145/1985793.1985999},
  url          = {http://doi.acm.org/10.1145/1985793.1985999},
  keywords     = {email communication, toolset, unstructured data},
  note         = {This paper is a summary of work in the field, for the doctoral consortium.},
  abstract     = {Researchers mine software repositories to support software maintenance and evolution. The analysis of the structured data, mainly source code and changes, has several benefits and offers precise results. This data, however, leaves communication in the background, and does not permit a deep investigation of the human factor, which is crucial in software engineering. Software repositories also archive documents, such as emails or comments, that are used to exchange knowledge among people - we call it ``people-centric information.'' By covering this data, we include the human factor in our analysis, yet its unstructured nature makes it currently sub-exploited.
    Our work, by focusing on email communication and by implementing the necessary tools, investigates methods for exploring, exposing, and exploiting unstructured data. We believe it is possible to close the gap between development and communication, extract opinions, habits, and views of developers, and link implementation to its rationale; we see in a future where software analysis and development is routinely augmented with people-centric information.},
}

@inproceedings{1358,
  author      = {Bacchelli, Alberto and D'Ambros, Marco and Lanza, Michele},
  title       = {Extracting source code from e-mails},
  booktitle   = {Proceedings of ICPC 2010 (18th IEEE International Conference on Program Comprehension)},
  year        = {2010},
  pages       = {24--33},
  url         = {http://www.inf.usi.ch/phd/bacchelli/publications.php},
  attachments = {https://flosshub.org/sites/flosshub.org/files/icpc2010.pdf},
  keywords    = {argouml, email, freenet, jmeter, mailing lists, mina, natural language, openjpa, source code},
  note        = {``We want to extract source code fragments from e-mail messages. To do this, we first need to select e-mails that contain source code fragments, and then we extract such fragments from the content in which they are enclosed.'' ``we manually build a statistically significant benchmark taking sample e-mails from five unrelated open source Java software systems.''},
  abstract    = {E-mails, used by developers and system users to communicate over a broad range of topics, offer a valuable source of information. If archived, e-mails can be mined to support program comprehension activities and to provide views of a software system that are alternative and complementary to those offered by the source code. However, e-mails are written in natural language, and therefore contain noise that makes it difficult to retrieve the important data. Thus, before conducting an effective system analysis and extracting data for program comprehension, it is necessary to select the relevant messages, and to expose only the meaningful information.
    In this work we focus both on classifying e-mails that hold fragments of the source code of a system, and on extracting the source code pieces inside the e-mail. We devised and analyzed a number of lightweight techniques to accomplish these tasks. To assess the validity of our techniques, we manually inspected and annotated a statistically significant number of e-mails from five unrelated open source software systems written in Java. With such a benchmark in place, we measured the effectiveness of each technique in terms of precision and recall.},
}

@inproceedings{1359,
  author       = {Bacchelli, Alberto and Lanza, Michele and Robbes, Romain},
  title        = {Linking e-mails and source code artifacts},
  booktitle    = {Proceedings of the 32nd ACM/IEEE International Conference on Software Engineering - ICSE '10},
  volume       = {1},
  year         = {2010},
  month        = may,
  pages        = {375--384},
  publisher    = {ACM Press},
  organization = {ACM Press},
  address      = {Cape Town, South Africa},
  isbn         = {9781605587196},
  doi          = {10.1145/1806799.1806855},
  url          = {http://www.inf.usi.ch/phd/bacchelli/publications.php},
  note         = {``we devised a set of lightweight methods, based on regular expressions, to establish the link between e-mails and software artifacts. We evaluated them in terms of precision and recall considering one single Java system. In this paper we overcome a number of limitations of our previous work, resulting in the following contributions: {\textbullet} An extensive and publicly available benchmark and toolset for recovering traceability links between e-mails and source code artifacts. We created our benchmark by analyzing the mailing lists of six different software systems written in four different programming languages. For each system we manually annotated a statistically significant number of e-mails. {\textbullet} A comprehensive evaluation of linking techniques.
    We evaluated and compared, in terms of precision and recall, different linking methods, ranging from lightweight grep-style approaches to more complex approaches from the information retrieval (IR) field.''},
  abstract     = {E-mails concerning the development issues of a system constitute an important source of information about high-level design decisions, low-level implementation concerns, and the social structure of developers. Establishing links between e-mails and the software artifacts they discuss is a non-trivial problem, due to the inherently informal nature of human communication. Different approaches can be brought into play to tackle this traceability issue, but the question of how they can be evaluated remains unaddressed, as there is no recognized benchmark against which they can be compared. In this article we present such a benchmark, which we created through the manual inspection of a statistically significant number of e-mails pertaining to six unrelated software systems.
    We then use our benchmark to measure the effectiveness of a number of approaches, ranging from lightweight approaches based on regular expressions to full-fledged information retrieval approaches.},
}

@inproceedings{1360,
  author       = {Bacchelli, Alberto and D'Ambros, Marco and Lanza, Michele and Robbes, Romain},
  title        = {Benchmarking Lightweight Techniques to Link E-Mails and Source Code},
  booktitle    = {2009 16th Working Conference on Reverse Engineering},
  year         = {2009},
  pages        = {205--214},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Lille, France},
  isbn         = {978-0-7695-3867-9},
  doi          = {10.1109/WCRE.2009.44},
  attachments  = {https://flosshub.org/sites/flosshub.org/files/wcre2009.pdf},
  keywords     = {argouml, email, mailing lists},
  note         = {``We present different lightweight approaches that, exploiting the specific characteristics of e-mails and the ones of the source code, are capable of establishing a bi-directional link between source code entities and e-mails'' ``We analyzed ArgoUML, a UML modelling tool written in Java, developed over the course of approximately 9 years, and made available under the BSD Open Source License. We consider the release 0.28 (March 2009) that comprehends 2,197 classes. We employed the lightweight approaches to map such classes to the related e-mails in ArgoUML mailing lists. ArgoUML e-mails are stored in six mailing lists (see Table I), for a total amount of 79,175 messages'' Figure 4 is helpful to understand what this does.},
  abstract     = {During the evolution of a software system, a large amount of information, which is not always directly related to the source code, is produced. Several researchers have provided evidence that the contents of mailing lists represent a valuable source of information: Through e-mails, developers discuss design decisions, ideas, known problems and bugs, etc. which are otherwise not to be found in the system. A technical challenge in this context is how to establish the missing link between free-form e-mails and the system artifacts they refer to.
    Although the range of approaches is vast, establishing their accuracy remains a problem, as there is no benchmark against which to compare their performance. To overcome this issue, we manually inspected a statistically significant number of e-mails pertaining to the ArgoUML system. Based on this benchmark, we present a variety of lightweight techniques to assign e-mails to software artifacts and measure their effectiveness in terms of precision and recall.},
}