% Entry 1755: Chowdhury and Hindle (2015), "Mining StackOverflow to Filter out
% Off-topic IRC Discussion".
% - StackOverflow and IRC are brace-protected so sentence-casing styles keep their capitals.
% - month uses the standard unquoted macro; the year lives only in the year field.
% - attachments is a non-standard field (ignored by standard styles) holding the PDF link.
% NOTE(review): this record describes a single authored paper, which normally
% calls for an inproceedings entry with a booktitle. The venue is not recorded
% here, so the entry type is left unchanged -- confirm and convert once known.
@proceedings{1755,
  author      = {Chowdhury, Shaiful Alam and Hindle, Abram},
  title       = {Mining {StackOverflow} to Filter out Off-topic {IRC} Discussion},
  year        = {2015},
  month       = may,
  keywords    = {irc, Stack Overflow, youtube},
  abstract    = {Internet Relay Chat (IRC) is a commonly used tool by OpenSource developers. Developers use IRC channels to discuss programming related problems, but much of the discussion is irrelevant and off-topic. Essentially if we treat IRC discussions like email messages, and apply spam filtering, we can try to filter out the spam (the off-topic discussions) from the ham (the programming discussions). Yet we need labelled data that unfortunately takes time to curate. To avoid costly curration in order to filter out off-topic discussions, we need positive and negative data-sources. Online discussion forums, such as StackOverflow, are very effective for solving programming problems. By engaging in open-data, StackOverflow data becomes a powerful source of labelled text regarding programming. This work shows that we can train classifiers using StackOverflow posts as positive examples of on-topic programming discussion. YouTube video comments, notorious for their lack of quality, serve as training set of offtopic discussion. By exploiting these datasets, accurate classifiers can be built, tested and evaluated that require very little effort for end-users to deploy and exploit.},
  attachments = {https://flosshub.org/sites/flosshub.org/files/shaiful-mining_so_0.pdf},
}