Social Media Analytics: Classifying Airline Complaint Tweets

rm(list=ls())
library(tm)      # corpus construction and document-term matrices
## Loading required package: NLP
library(DBI)     # database interface (loaded but not used below)
library(RMySQL)  # MySQL driver (loaded but not used below)
library(e1071)   # svm() and naiveBayes()
library(pROC)    # ROC curves and AUC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## load data: 1,700 labeled complaint tweets, 1,700 labeled non-complaint tweets,
## and the unlabeled airline tweets to be classified
csvdata <- read.csv("C:/users/DELL/Desktop/Rochester/social media analytics/final/complaint1700.csv", header=TRUE, sep=',', quote='"')
csvdata1 <- read.csv("C:/users/DELL/Desktop/Rochester/social media analytics/final/noncomplaint1700.csv", header=TRUE, sep=',', quote='"')
testdata <- read.csv("C:/users/DELL/Desktop/Rochester/social media analytics/final/airline.csv", header=TRUE, sep=',', quote='"')

testdata <- testdata[,c(2,4,6)]               # keep the three columns that match the labeled files
airdata <- rbind(csvdata, csvdata1, testdata) # stack labeled and unlabeled tweets
air <- airdata[,c(1,3)]                       # doc_id and tweet text for the corpus
names(air) = c("doc_id", "text")
docs <- Corpus(DataframeSource(air))

## airline handles and other domain words to exclude along with standard stopwords
mystops = c("alaskaair","fly", "flying","flight", "united", "americanair", "jetblue", "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline")
dtm <- DocumentTermMatrix(docs, control=list(tolower=T, removePunctuation=T, removeNumbers=T, stripWhitespace=T, stopwords=c(mystops, stopwords("english"), stopwords("spanish"))))
dtm <- removeSparseTerms(dtm, 0.99)   # drop terms absent from more than 99% of documents
f1 <- as.matrix(dtm)
df <- data.frame(f1)
df_air <- df[1:3400, ]        # the 3,400 labeled tweets
df_air[1:1700, 'y'] <- 1      # complaints
df_air[1701:3400, 'y'] <- 0   # non-complaints
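Before splitting the data, it is worth a quick look at which terms survive the sparsity cut. A minimal sanity check using tm's findFreqTerms (an optional step, not part of the original pipeline):

# list every term that occurs at least 50 times across all tweets
findFreqTerms(dtm, lowfreq = 50)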
## train data & validation data (random 80/20 split)
set.seed(1)
isTraining <- runif(nrow(df_air)) < .8
trainingData <- subset(df_air,isTraining)
validationData <- subset(df_air,!isTraining)

## evaluation: F1 score for a given class from predicted and true labels
Evaluation <- function(pred, true, class)
{
    tp <- sum( pred==class & true==class)   # true positives
    fp <- sum( pred==class & true!=class)   # false positives
    tn <- sum( pred!=class & true!=class)   # true negatives (not needed for F1)
    fn <- sum( pred!=class & true==class)   # false negatives
    precision <- tp/(tp+fp)
    recall <- tp/(tp+fn)
    F1 <- 2/(1/precision + 1/recall)        # harmonic mean of precision and recall
    F1
}
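As a quick illustration of the helper on made-up labels: predictions c(1,1,0,1) against truths c(1,0,0,1) give tp = 2, fp = 1, fn = 0, so precision = 2/3, recall = 1, and F1 = 2/(3/2 + 1) = 0.8.

# toy check of the F1 helper (hypothetical labels, not project data); returns 0.8
Evaluation(c(1, 1, 0, 1), c(1, 0, 0, 1), 1)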
## train model

##SVM (y is numeric, so svm() fits a regression; continuous predictions are thresholded)
svm.model <- svm(trainingData[,'y'] ~ ., data = trainingData[,1:157], kernel='linear')
pred <- predict(svm.model, validationData[,1:157])
pred.class <- as.numeric(pred > 0.1)
table(pred.class, validationData[,'y'])
##           
## pred.class   0   1
##          0 139  39
##          1 197 313
Evaluation(pred.class, validationData[,'y'], 1)
## [1] 0.7262181
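The 0.1 cutoff is ad hoc. Since pROC is already loaded, one alternative is to read a cutoff off the validation ROC curve; this is a sketch, not part of the original analysis:

roc.obj <- roc(validationData[,'y'], as.numeric(pred))   # validation ROC for the linear SVM
auc(roc.obj)               # area under the curve
coords(roc.obj, "best")    # threshold maximizing sensitivity + specificity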
svm.model <- svm(trainingData[,'y'] ~ ., data = trainingData[,1:157], kernel='polynomial')
pred <- predict(svm.model, validationData[,1:157])
pred.class <- as.numeric(pred>0.1)
table(pred.class, validationData[,'y'])
##           
## pred.class   0   1
##          0  78  19
##          1 258 333
Evaluation(pred.class, validationData[,'y'], 1)
## [1] 0.7062566
##Naive Bayes

nb.model <- naiveBayes(trainingData[,1:157], factor(trainingData[,'y'])) # encode the response as a factor variable
pred.class <- predict(nb.model, validationData[,1:157])
table(pred.class, validationData[,'y'] )
##           
## pred.class   0   1
##          0 270 141
##          1  66 211
Evaluation(pred.class, validationData[,'y'], 1)
## [1] 0.6709062
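One caveat: with numeric term counts, e1071's naiveBayes fits Gaussian conditionals, which suit sparse counts poorly. A Bernoulli-style variant on presence/absence indicators, where Laplace smoothing actually applies, is a common alternative; a sketch under that assumption:

# convert counts to presence/absence factors with fixed levels
bin <- function(x) factor(x > 0, levels = c(FALSE, TRUE))
train.bin <- data.frame(lapply(trainingData[,1:157], bin))
valid.bin <- data.frame(lapply(validationData[,1:157], bin))
nb.bern <- naiveBayes(train.bin, factor(trainingData[,'y']), laplace = 1)
Evaluation(predict(nb.bern, valid.bin), validationData[,'y'], 1)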
##Maximum Entropy
source('C:/users/DELL/Desktop/Rochester/social media analytics/final/Maximum Entropy.R')
maxent.model <- maximumentropy(trainingData[,1:157], trainingData[,'y'])
## iter    1 value 1715.717041
## iter    2 value 1537.677006
## iter    3 value 1472.132010
## ... (optimizer trace for iterations 4-83 omitted) ...
## final  value 1418.369745 
## converged
pred <- predict(maxent.model, validationData[,1:157])
as.numeric(pred[,2]) + as.numeric(pred[,3])   # sanity check: class probabilities sum to one
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## ... (688 entries in total, all equal to 1) ...
table(pred[,1], validationData[,'y'] )
##    
##       0   1
##   0 234 109
##   1 102 243
Evaluation(pred[,1], validationData[,'y'], 1)
## [1] 0.697274
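The sourced script itself is not shown here; maximum entropy classification over bag-of-words features is equivalent to multinomial logistic regression, so nnet::multinom (bundled with standard R installations) gives a comparable baseline. A sketch, not the author's implementation:

library(nnet)
maxent.glm <- multinom(factor(y) ~ ., data = trainingData, trace = FALSE)
pred.glm <- predict(maxent.glm, validationData[,1:157])
Evaluation(pred.glm, validationData[,'y'], 1)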
##Random Forest
if (!requireNamespace('randomForest', quietly=TRUE))   # install only if missing
    install.packages('randomForest', repos='http://cran.us.r-project.org')
library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
rf <- randomForest(factor(y) ~ . , data=trainingData)
pred <- predict(rf, validationData[,1:157])
table(pred, validationData[,'y'] )
##     
## pred   0   1
##    0 229 109
##    1 107 243
Evaluation(pred, validationData[,'y'], 1)
## [1] 0.6923077
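randomForest also reports which terms carry the most signal; an optional look at variable importance:

imp <- importance(rf)   # MeanDecreaseGini per term
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)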
##KNN
library(kknn)
## weighted k-nearest neighbours; distance = 1 is the Manhattan metric
pre3 <- kknn(factor(trainingData[,'y'])~., trainingData, validationData, distance = 1, kernel = "triangular")
fit <- fitted(pre3)
table(fit,validationData[,'y'])
##    
## fit   0   1
##   0 304  63
##   1  32 289
Evaluation(fit, validationData[,'y'], 1)
## [1] 0.858841
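Here k and the kernel were fixed by hand; kknn ships train.kknn for leave-one-out selection over a grid. A sketch, not the tuning actually run:

knn.cv <- train.kknn(factor(y) ~ ., data = trainingData, kmax = 15,
                     distance = 1, kernel = c("triangular", "rectangular", "optimal"))
knn.cv$best.parameters   # chosen kernel and k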
##test data
test <- df[3401:7955,]                   # document-term rows for the 4,555 unlabeled tweets
pred_test <- predict(svm.model, test)    # note: svm.model is the polynomial-kernel fit from above
pred.class_test <- as.numeric(pred_test > 0.1)
testdata$y <- pred.class_test
df1 <- testdata[testdata$y == 0, ]       # keep tweets predicted as non-complaints

## rule classification to test data
## word frequency of complaint
com_air <- airdata[1:1700,c(1,3)]
names(com_air) = c("doc_id", "text")
docs_com <- Corpus(DataframeSource(com_air))

mystops = c("alaskaair","fly", "flying","flight", "united", "americanair", "jetblue", "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline")
dtm_com <- DocumentTermMatrix(docs_com, control=list(tolower=T, removePunctuation=T, removeNumbers=T, stripWhitespace=T, stopwords=c(mystops, stopwords("english"), stopwords("spanish"))))
dtm_com <- removeSparseTerms(dtm_com,0.995)
freq_com <- colSums( as.matrix(dtm_com) )
freq.sorted_com <- sort( freq_com, decreasing=TRUE )
freq.sorted_com[1:100]
##      delayed          get      service        hours        never          can 
##          178          149          124          118          118          116 
##          amp        delay        plane        worst         help      waiting 
##          111          107          105          100           98           95 
##         hour         time      flights         gate     customer         ever 
##           92           91           89           88           88           84 
##         just    cancelled         hold          one         will         back 
##           83           81           77           75           75           68 
##         lost        still          bad          ive          bag       please 
##           68           67           66           64           63           58 
##        today         cant         like         wait        stuck   experience 
##           58           56           56           55           55           55 
##       really         late         need         wifi      another         last 
##           54           53           53           52           52           50 
##      minutes         dont        first         bags          min disappointed 
##           49           49           48           46           46           45 
##        phone       people          hey         miss         home     terrible 
##           45           45           45           45           44           44 
##      airport        going       trying         make          day      sitting 
##           44           43           43           41           41           40 
##          hrs         call         seat         even          new   connection 
##           40           40           40           39           39           38 
##          due      luggage       missed          got          jfk       delays 
##           38           38           37           37           37           37 
##          way         wont       better        issue       change     response 
##           37           36           36           36           36           36 
##          two      missing     airlines    usairways         guys        leave 
##           35           35           34           34           33           33 
##        sucks      getting         told         mins      problem        thats 
##           32           32           32           32           32           31 
##       issues         work          lax         fail        check       travel 
##           30           30           30           30           29           29 
##        class         days         hope         want 
##           29           28           28           28
## word frequency of non-complaint
non_air <- airdata[1701:3400,c(1,3)]
names(non_air) = c("doc_id", "text")
docs_non <- Corpus(DataframeSource(non_air))

mystops = c("alaskaair","fly", "flying","flight", "united", "americanair", "jetblue", "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline")
dtm_non <- DocumentTermMatrix(docs_non, control=list(tolower=T, removePunctuation=T, removeNumbers=T, stripWhitespace=T, stopwords=c(mystops, stopwords("english"), stopwords("spanish"))))
dtm_non <- removeSparseTerms(dtm_non,0.995)
freq_non <- colSums( as.matrix(dtm_non) )
freq.sorted_non <- sort(freq_non, decreasing=TRUE )
freq.sorted_non[1:100]
##       wait      never        can        amp        get       cant       just 
##        154        150        121        113        102        101         95 
##       miss       time       will        bad    flights       like       dont 
##         89         87         86         86         75         70         64 
##     thanks      great      issue     travel    waiting  cancelled      plane 
##         63         61         61         61         60         59         55 
##       back      today        new      leave        got    service        day 
##         54         54         54         52         51         51         51 
##      first        see        way       late    delayed   airlines      thank 
##         51         51         51         49         49         48         48 
##       lost      delay       best       good        one       need     missed 
##         48         47         46         46         45         45         42 
##        ive       know     cancel       home    problem       guys       last 
##         41         41         41         41         41         40         39 
##       love       help      going        sad     longer   customer      stuck 
##         37         37         37         37         36         36         35 
##       trip      sorry       much     issues   tomorrow     people     switch 
##         34         34         34         34         33         33         32 
##     please      youre      still       even       make       seat     always 
##         32         32         31         30         30         29         29 
##     really       gate        ill     better    tonight      didnt        jfk 
##         29         29         28         27         27         27         27 
##    getting       next    missing       hold        air experience       hope 
##         27         27         27         27         26         26         26 
##    another    weather    awesome    morning       call    chicago  usairways 
##         26         26         25         25         25         24         24 
##       crew      miles      early       free        via       made       give 
##         24         24         24         24         23         23         23 
##       want      thats 
##         22         22
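The two lists share many neutral words (get, time, today), so raw frequency alone does not separate the classes. One crude way to surface complaint-leaning vocabulary is to compare smoothed relative frequencies on the shared terms; an illustrative sketch only, since the script relies on the hand-picked keyword list below:

shared <- intersect(names(freq_com), names(freq_non))
ratio <- (freq_com[shared] + 1) / (freq_non[shared] + 1)   # add-one smoothing
head(sort(ratio, decreasing = TRUE), 20)                   # terms skewed toward complaints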
## drop predicted non-complaints whose text still contains complaint keywords
a <- grep("fuck|shit|worst|bad|hate|awful|wtf|worse|disappoint|frustrat|rude|never|poor|sad|disgust|bother|no longer|changing|hell|suck|stuck|unacceptable|upset|ruin|shame|unprofessional|complaint|nothappy|kill|waste|screw",
          df1$tweet, ignore.case=TRUE,  perl=TRUE)

df3 <- if (length(a) > 0) df1[-a, ] else df1   # guard: df1[-integer(0), ] would drop every row
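Finally, the surviving tweets can be written out for review (hypothetical file name; adjust the path as needed):

write.csv(df3, "noncomplaint_predictions.csv", row.names = FALSE)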