@conference {928, title = {Mining source code to automatically split identifiers for software analysis}, booktitle = {2009 6th IEEE International Working Conference on Mining Software Repositories (MSR)2009 6th IEEE International Working Conference on Mining Software Repositories}, year = {2009}, pages = {71 - 80}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {Automated software engineering tools (e.g., program search, concern location, code reuse, quality assessment, etc.) increasingly rely on natural language information from comments and identifiers in code. The first step in analyzing words from identifiers requires splitting identifiers into their constituent words. Unlike natural languages, where space and punctuation are used to delineate words, identifiers cannot contain spaces. One common way to split identifiers is to follow programming language naming conventions. For example, Java programmers often use camel case, where words are delineated by uppercase letters or non-alphabetic characters. However, programmers also create identifiers by concatenating sequences of words together with no discernible delineation, which poses challenges to automatic identifier splitting. In this paper, we present an algorithm to automatically split identifiers into sequences of words by mining word frequencies in source code. With these word frequencies, our identifier splitter uses a scoring technique to automatically select the most appropriate partitioning for an identifier. In an evaluation of over 8000 identifiers from open source Java programs, our Samurai approach outperforms the existing state of the art techniques.}, keywords = {java, samurai, sourceforge}, isbn = {978-1-4244-3493-0}, doi = {10.1109/MSR.2009.5069482}, attachments = {https://flosshub.org/sites/flosshub.org/files/71EnslenandHillandPollockandVijayShanker.pdf}, author = {Enslen, Eric and Hill, Emily and Pollock, Lori and Vijay-Shanker, K.} } @conference {Hill:2008:AAM:1370750.1370771, title = {AMAP: automatically mining abbreviation expansions in programs to enhance software maintenance tools}, booktitle = {Proceedings of the 2008 international working conference on Mining software repositories}, series = {MSR {\textquoteright}08}, year = {2008}, month = {05/2008}, pages = {79{\textendash}88}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {When writing software, developers often employ abbreviations in identifier names. In fact, some abbreviations may never occur with the expanded word, or occur more often in the code. However, most existing program comprehension and search tools do little to address the problem of abbreviations, and therefore may miss meaningful pieces of code or relationships between software artifacts. In this paper, we present an automated approach to mining abbreviation expansions from source code to enhance software maintenance tools that utilize natural language information. Our scoped approach uses contextual information at the method, program, and general software level to automatically select the most appropriate expansion for a given abbreviation. We evaluated our approach on a set of 250 potential abbreviations and found that our scoped approach provides a 57\% improvement in accuracy over the current state of the art.}, keywords = {automatic abbreviation expansion, azureus, itext.net, liferay, maintenance, natural language, openoffice.org, program comprehension, source code, tiger envelopes, tools}, isbn = {978-1-60558-024-1}, doi = {http://doi.acm.org/10.1145/1370750.1370771}, url = {http://doi.acm.org/10.1145/1370750.1370771}, attachments = {https://flosshub.org/sites/flosshub.org/files/p79-hill.pdf}, author = {Hill, Emily and Fry, Zachary P. and Boyd, Haley and Sridhara, Giriprasad and Novikova, Yana and Pollock, Lori and Vijay-Shanker, K.} }