
library(tm)

# Initialize some short documents:
doc1 <- "I love deep dish pizza."
doc2 <- "Chicago deep dish pizza."
doc3 <- "New York deep dish pizza."
doc4 <- "Good toppings and crust."
doc5 <- "Deep dish with Parmigiano cheese."

# Collect the documents in a list named doc1, doc2, ...:
doc.list <- list(doc1, doc2, doc3, doc4, doc5)
N.docs <- length(doc.list)
# seq_len() stays correct even if the list is ever empty (1:0 would not).
names(doc.list) <- paste0("doc", seq_len(N.docs))
query <- "Good pizza"

# Create a corpus from the documents and the query.
# The texts are flattened into one *named* character vector so that every
# element, including the query, carries its own label ("doc1", ..., "query")
# instead of the query ending up with an empty name.
my.docs <- VectorSource(c(unlist(doc.list), query = query))
# NOTE(review): this field looks like a holdover from an older tm API --
# confirm whether the installed tm version still reads it. It is kept so the
# script behaves as before on installations that do.
my.docs$Names <- c(names(doc.list), "query")
my.corpus <- Corpus(my.docs)

#####################################
# => transform the corpus as follows:
# 1) convert to lower case
# 2) remove stopwords
# 3) remove punctuation
# 4) remove numbers
# 5) collapse multiple whitespaces
# 6) stem words (reduces plurals and
#    other inflections to a common stem)
#####################################

# tolower() is a plain character function, not a tm transformation; wrapping
# it in content_transformer() lets tm_map() apply it to each document's
# content while keeping the document objects intact for the later
# DocumentTermMatrix() call.
my.corpus2 <- tm_map(my.corpus,  content_transformer(tolower))
# Stopwords are removed after lower-casing so that capitalised forms such as
# "I" match the lower-case stopword list.
my.corpus3 <- tm_map(my.corpus2, removeWords, stopwords("english"))
my.corpus4 <- tm_map(my.corpus3, removePunctuation)
my.corpus5 <- tm_map(my.corpus4, removeNumbers)
my.corpus6 <- tm_map(my.corpus5, stripWhitespace)
# Explicit print() so the corpus summary is also shown when the script is
# run through source(), where bare top-level expressions are not printed.
print(my.corpus6)

# SnowballC is loaded here, right before the stemming step that follows.
library(SnowballC)
my.corpus7 <- tm_map(my.corpus6, stemDocument)

# Create a document/term matrix from the cleaned, stemmed corpus:
docTermMatrix <- DocumentTermMatrix(my.corpus7)
cat("\n")
# print() is called explicitly (instead of relying on top-level auto-printing
# of a no-op paste()) so the banner and summaries also appear under source().
print("*** Document Term Matrix ***")
print(docTermMatrix)
inspect(docTermMatrix)

# Re-weight the raw term counts with tf-idf:
docTermMatrix_tfxidf <- weightTfIdf(docTermMatrix)
cat("\n")
print("*** TF/IDF Matrix ***")
print(docTermMatrix_tfxidf)
inspect(docTermMatrix_tfxidf)

