#######replace this section with your own data loading section...

setwd("~/Documents/Sysabee_and_Business/Partnerships/dataactionlab/introtomoderndataanalyis/NSERC/NSERC_data")
app_scoring <- read.csv("app_scoring.csv")#data from app scoring tab
dg_apps <- read.csv("dg_apps.csv") #data from dg apps tab
financial <- read.csv("financial.csv") #data from financial tab
apps_data <- merge(dg_apps, app_scoring, by.x="Application.Id", by.y="Application.Id")
apps_data <- merge(apps_data, financial, by.x="Application.Id", by.y="Application.Id")
write.csv(apps_data, "merged_apps_data.csv")
write.csv(colnames(apps_data,"apps_data_field_name_index.csv"))

#my_texts <- apps_data[,15]

apps_data <- read.csv("merged_apps_data.csv")
my_texts <- apps_data[,16]

#below is based on https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a

library(wordcloud)
library(wordcloud2)
library(tm)#Create a vector containing only the text
library(SentimentAnalysis)
library(udpipe)
library(lattice)

docs <- Corpus(VectorSource(my_texts))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

dtm <- TermDocumentMatrix(docs) 
nDocs(dtm)
nTerms(dtm)

matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df <- data.frame(word = names(words),freq=words)


set.seed(1234)
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Dark2"))

wordcloud(words = df$word, freq = df$freq, min.freq = 4, max.words=30, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Dark2"))

#https://stackoverflow.com/questions/47524602/wordcloud-is-cropping-text

wordcloud(words = df$word, freq = df$freq, min.freq = 4, max.words=30, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Dark2"),scale=c(3.5,0.01))

wordcloud(words = df$word, freq = df$freq, min.freq = 4, max.words=100, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Dark2"),scale=c(3.5,0.01))

#####BONUS SENTIMENT ANALYSIS CODE
#sentiment <- analyzeSentiment(docs)
#table(convertToBinaryResponse(sentiment$SentimentLM))
#table(convertToDirection(sentiment$SentimentLM))
######################

#below is based on https://datascienceplus.com/introducing-udpipe-for-easy-natural-language-processing-in-r/

test_text_vector <- c("what a great time this is", "life is quite the journey. It's an amazing time", "yesterday I ate a peanut butter sandwich. It was delicious")

library(udpipe)
 
model <- udpipe_download_model(language = "english")

udmodel_english <- udpipe_load_model(file = 'english-ewt-ud-2.5-191206.udpipe')

s <- udpipe_annotate(udmodel_english, test_text_vector)

x <- data.frame(s)

library(lattice)

stats <- txt_freq(x$upos)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = stats, col = "yellow", 
         main = "UPOS (Universal Parts of Speech)\n frequency of occurrence", 
         xlab = "Freq")

## NOUNS
stats <- subset(x, upos %in% c("NOUN")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring nouns", xlab = "Freq")

## ADJECTIVES
stats <- subset(x, upos %in% c("ADJ")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "purple", 
         main = "Most occurring adjectives", xlab = "Freq")

## VERBS
stats <- subset(x, upos %in% c("VERB")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "gold", 
         main = "Most occurring Verbs", xlab = "Freq")

## Using RAKE
stats <- keywords_rake(x = x, term = "lemma", group = "doc_id", 
                       relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red", 
         main = "Keywords identified by RAKE", 
         xlab = "Rake")

## Using a sequence of POS tags (noun phrases / verb phrases)
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(x = x$phrase_tag, term = tolower(x$token), 
                          pattern = "(A|N)*N(P+D*(A|N)*N)*", 
                          is_regex = TRUE, detailed = FALSE)
stats <- subset(stats, ngram > 1 & freq > 1)
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ freq, data = head(stats, 20), col = "magenta", 
         main = "Keywords - simple noun phrases", xlab = "Frequency")

cs_bits <- c("Cognitive science studies the basic processes of cognition often mirroring the simple informational environments for which our brain is adapted. Though our brain is adapted for simple environments we live in an era in which we have access to more information and are surrounded by multiple distractions vying for our attention. This attention economy has redefined critical questions in cognitive science. The work of cognitive science must be translated to the current environment. This chapter examines how cognition works or fails to work in an attention economy across a range of phenomena from safety and willpower to the appreciation of art and the ability to think creatively. The costs of a divided cognition and how cognitive science may help us understand what has gone wrong are offered with a look to the future to examine how the technology underlying an attention economy may help improve cognitive function. The chapter concludes with a discussion of the increasing relevance of cognitive science and a call for cognitive science to study and understand the problems of our distraction-rich world.", "Cognitive science is the interdisciplinary scientific study of the mind. Many questions therefore fall within its scope. For instance how do people perceive the world through their senses? How do they manage to act in a timely fashion in a changing world? How do they solve novel problems? How do they manage to learn new skills? And how do they understand one another? In addressing these questions most researchers assume that the human mind is some kind of computational device containing representations. Modeling human language capacities has been a central goal within cognitive science relevant research draws on a wide range of empirical and computational methods. This brief overview first characterizes the subject and then sketches a brief history of it. In indicating the current state of play key issues in mental representation modularity and computational architecture are noted and some current directions in cognitive research are indicated.","Cognitive science was long dominated by computational approaches in which relevant processes were symbol manipulation processes Franklin 1995 and certainly such approaches are still very prevalent. There are obviously many powers and advantages afforded by such design approaches over for example simple associationistic approaches but regarding representation per se they are hopeless. Basic representations in such models are taken to represent something in virtue of being in a correspondence with that something—a correspondence that somehow encodes its distal end—with the crucial correspondence variously taken to be one of a “stand-in” or perhaps a structural isomorphism Newell 1980 Vera and Simon 1993. But such models cannot account for the bare possibility of representational error and have no way to address the possibility of system detectable representational error Bickhard 2004a in press in preparation. If the crucial representational encoding relationship exists then it is correct and if it does not exist then the representation does not exist and there is no third possibility for modeling the representation existing but being incorrect.")

s <- udpipe_annotate(udmodel_english, cs_bits)

x <- data.frame(s)

stats <- keywords_rake(x = x, term = "lemma", group = "doc_id", 
                       relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red", 
         main = "Keywords identified by RAKE", 
         xlab = "Rake")