@article{1395,
  author   = {Hindle, Abram and Ernst, Neil A. and Godfrey, Michael W. and Mylopoulos, John},
  title    = {Automated topic naming: supporting cross-project analysis of software maintenance activities},
  journal  = {Empirical Software Engineering},
  year     = {2012},
  issn     = {1573-7616},
  doi      = {10.1007/s10664-012-9209-9},
  keywords = {LDA, maxdb, mysql, postgresql, topics},
  abstract = {Software repositories provide a deluge of software artifacts to analyze. Researchers have attempted to summarize, categorize, and relate these artifacts by using semi-unsupervised machine-learning algorithms, such as Latent Dirichlet Allocation (LDA). LDA is used for concept and topic analysis to suggest candidate word-lists or topics that describe and relate software artifacts. However, these word-lists and topics are difficult to interpret in the absence of meaningful summary labels. Current attempts to interpret topics assume manual labelling and do not use domain-specific knowledge to improve, contextualize, or describe results for the developers. We propose a solution: automated labelled topic extraction. Topics are extracted using LDA from commit-log comments recovered from source control systems. These topics are given labels from a generalizable cross-project taxonomy, consisting of non-functional requirements. Our approach was evaluated with experiments and case studies on three large-scale Relational Database Management System (RDBMS) projects: MySQL, PostgreSQL and MaxDB. The case studies show that labelled topic extraction can produce appropriate, context-sensitive labels that are relevant to these projects, and provide fresh insight into their evolving software development activities. },
}