% Journal article: Thomas, Adams, Hassan, and Blostein, "Studying software evolution using topic models",
% Science of Computer Programming, vol. 80 (2014). Page and percentage ranges use the plain "--" en-dash
% ligature so that range-aware styles can parse them; authors are stored in "Last, First" form.
@article{thomas2014studying,
  author    = {Thomas, Stephen W. and Adams, Bram and Hassan, Ahmed E. and Blostein, Dorothea},
  title     = {Studying software evolution using topic models},
  journal   = {Science of Computer Programming},
  volume    = {80},
  pages     = {457--479},
  year      = {2014},
  publisher = {Elsevier},
  keywords  = {Latent Dirichlet allocation, mining software repositories, software evolution, topic model},
  url       = {http://sail.cs.queensu.ca/publications/pubs/Thomas-2012-SCP.pdf},
  abstract  = {Topic models are generative probabilistic models which have been applied to information retrieval to automatically organize and provide structure to a text corpus. Topic models discover topics in the corpus, which represent real world concepts by frequently cooccurring words. Recently, researchers found topics to be effective tools for structuring various software artifacts, such as source code, requirements documents, and bug reports. This research also hypothesized that using topics to describe the evolution of software repositories could be useful for maintenance and understanding tasks. However, research has yet to determine whether these automatically discovered topic evolutions describe the evolution of source code in a way that is relevant or meaningful to project stakeholders, and thus it is not clear whether topic models are a suitable tool for this task. In this paper, we take a first step towards evaluating topic models in the analysis of software evolution by performing a detailed manual analysis on the source code histories of two well-known and well-documented systems, JHotDraw and jEdit. We define and compute various metrics on the discovered topic evolutions and manually investigate how and why the metrics evolve over time. We find that the large majority (87\%--89\%) of topic evolutions correspond well with actual code change activities by developers. We are thus encouraged to use topic models as tools for studying the evolution of a software system.},
}