
# Social Media Analytics

## load packages
# The script relies on these packages but never attached them:
#   tm           - Corpus, DataframeSource, DocumentTermMatrix, stopwords
#   e1071        - svm, naiveBayes
#   randomForest - randomForest
#   kknn         - kknn
library(tm)
library(e1071)
library(randomForest)
library(kknn)

## load data
# All three input files live in the same folder; keep the folder in one place
# so the script can be pointed at another location with a single edit.
data_dir <- "C:/users/DELL/Desktop/Rochester/social media analytics/final"
csvdata <- read.csv(file.path(data_dir, "complaint1700.csv"),
                    header = TRUE, sep = ",", quote = "\"")
csvdata1 <- read.csv(file.path(data_dir, "noncomplaint1700.csv"),
                     header = TRUE, sep = ",", quote = "\"")
testdata <- read.csv(file.path(data_dir, "airline.csv"),
                     header = TRUE, sep = ",", quote = "\"")

# Keep columns 2, 4 and 6 of the unlabelled file so it can be stacked under the
# two labelled files (rbind needs matching columns).
# NOTE(review): assumes these three columns line up with the columns of
# complaint1700.csv / noncomplaint1700.csv - confirm against the files.
testdata <- testdata[, c(2, 4, 6)]

# Row order matters downstream: complaint rows first, then non-complaint rows,
# then the unlabelled rows.
airdata <- rbind(csvdata, csvdata1, testdata)

# Columns 1 and 3 become the document id and the tweet text, using the column
# names DataframeSource() expects.
air <- airdata[, c(1, 3)]
names(air) <- c("doc_id", "text")
docs <- Corpus(DataframeSource(air))

# Airline handles and generic flight words that are dropped together with the
# English and Spanish stop-word lists.
mystops <- c(
  "alaskaair", "fly", "flying", "flight", "united", "americanair", "jetblue",
  "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline"
)

# Pre-processing options handed to DocumentTermMatrix().
dtm_control <- list(
  tolower = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stripWhitespace = TRUE,
  stopwords = c(mystops, stopwords("english"), stopwords("spanish"))
)

# Term counts for every document, then drop terms above the 0.99 sparsity cut.
dtm <- DocumentTermMatrix(docs, control = dtm_control)
dtm <- removeSparseTerms(dtm, 0.99)

# Dense copy of the matrix as a data frame (one row per document).
f1 <- as.matrix(dtm)
df <- data.frame(f1)

# The first 3400 rows are the labelled tweets: rows 1-1700 get y = 1 and
# rows 1701-3400 get y = 0.
df_air <- df[1:3400, ]
df_air[, "y"] <- rep(c(1, 0), each = 1700)
## train data & validation data
# One uniform draw per labelled row; draws below 0.8 go to the training set and
# the remaining rows form the validation set. No seed is set, so the split
# differs between runs.
isTraining <- runif(nrow(df_air)) < 0.8
trainingData <- df_air[isTraining, , drop = FALSE]
validationData <- df_air[!isTraining, , drop = FALSE]

## evaluation
#' F1 score of a set of predictions for one class.
#'
#' @param pred Predicted labels (numeric, character or factor). A one-column
#'   data frame is also accepted.
#' @param true True labels, same length as `pred`. A one-column data frame is
#'   also accepted.
#' @param class The label treated as the positive class.
#' @return The F1 score, i.e. the harmonic mean of precision and recall for
#'   `class`. The result is `NaN` when precision or recall is 0/0 (no predicted
#'   positives, or no true positives in `true`).
Evaluation <- function(pred, true, class) {
  # Callers sometimes pass `df["y"]` (a one-column data frame) instead of a
  # vector; unwrap it so the comparisons below are plain vector operations.
  if (is.data.frame(pred)) pred <- pred[[1]]
  if (is.data.frame(true)) true <- true[[1]]

  pred_pos <- pred == class
  true_pos <- true == class

  tp <- sum(pred_pos & true_pos)
  fp <- sum(pred_pos & !true_pos)
  fn <- sum(!pred_pos & true_pos)

  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  2 / (1 / precision + 1 / recall)
}
## train model

# --- SVM, linear kernel ---
# Fit on the 157 term columns with the numeric 0/1 label as response, score the
# validation rows, and turn the scores into labels with a 0.1 cut-off.
svm.model <- svm(
  trainingData[, "y"] ~ .,
  data = trainingData[, 1:157],
  kernel = "linear"
)
pred <- predict(svm.model, validationData[, 1:157])
pred.class <- as.numeric(pred > 0.1)
table(pred.class, validationData[, "y"])
## pred.class   0   1
##          0 139  39
##          1 197 313
Evaluation(pred.class, validationData["y"], 1)
## [1] 0.7262181

# --- SVM, polynomial kernel ---
# Same procedure; this overwrites svm.model, pred and pred.class.
svm.model <- svm(
  trainingData[, "y"] ~ .,
  data = trainingData[, 1:157],
  kernel = "polynomial"
)
pred <- predict(svm.model, validationData[, 1:157])
pred.class <- as.numeric(pred > 0.1)
table(pred.class, validationData[, "y"])
## pred.class   0   1
##          0  78  19
##          1 258 333
Evaluation(pred.class, validationData["y"], 1)
## [1] 0.7062566
## Naive Bayes

# The label is passed as a factor so the model is fitted as a classifier;
# predict() then returns class labels directly.
nb.model <- naiveBayes(
  trainingData[, 1:157],
  factor(trainingData[, "y"])
)
pred.class <- predict(nb.model, validationData[, 1:157])
table(pred.class, validationData[, "y"])
## pred.class   0   1
##          0 270 141
##          1  66 211
Evaluation(pred.class, validationData[, "y"], 1)
## [1] 0.6709062
## Maximum Entropy
# maximumentropy() is not defined in this script; presumably it comes from the
# file sourced here - verify.
source("C:/users/DELL/Desktop/Rochester/social media analytics/final/Maximum Entropy.R")
maxent.model <- maximumentropy(
  trainingData[, 1:157],
  trainingData[, "y"]
)
## converged
pred <- predict(maxent.model, validationData[, 1:157])

# Column 1 of pred is used as the predicted label below. Columns 2 and 3 are
# summed and printed here - presumably per-class probabilities; verify.
as.numeric(pred[, 2]) + as.numeric(pred[, 3])
table(pred[, 1], validationData[, "y"])
##       0   1
##   0 234 109
##   1 102 243
Evaluation(pred[, 1], validationData[, "y"], 1)
## [1] 0.697274
## Random Forest
# Classification forest: the response is y converted to a factor, the
# predictors are the remaining columns of trainingData.
rf <- randomForest(
  factor(y) ~ .,
  data = trainingData
)
pred <- predict(rf, validationData[, 1:157])
table(pred, validationData[, "y"])
## pred   0   1
##    0 229 109
##    1 107 243
Evaluation(pred, validationData[, "y"], 1)
## [1] 0.6923077
## k-nearest neighbours (kknn)
## Warning: package 'kknn' was built under R version 3.6.3
validationData1 <- validationData

# The response must be written as the bare column name `y`. With the earlier
# form `factor(trainingData[,'y']) ~ .` the `.` still expanded to every column
# of trainingData, including `y`, so the true label was used as a predictor for
# both the training and the validation rows (label leakage).
# With `factor(y) ~ .` the column `y` is excluded from the predictors.
pre3 <- kknn(factor(y) ~ ., trainingData, validationData,
             distance = 1, kernel = "triangular")
fit <- fitted(pre3)
table(fit, validationData[, "y"])
## NOTE: the output recorded below was produced with the leaking formula and
## overstates performance; re-run to refresh these numbers.
## fit   0   1
##   0 304  63
##   1  32 289
Evaluation(fit, validationData[, "y"], 1)
## [1] 0.858841
## test data
# df has one row per row of airdata, in rbind order: 3400 labelled tweets
# followed by every row of testdata. Dropping the first 3400 rows therefore
# always yields exactly the testdata rows, without hard-coding the size of the
# unlabelled file.
test <- df[-seq_len(3400), ]

# svm.model is whichever SVM was fitted last in this script (the
# polynomial-kernel model); scores are cut at the same 0.1 threshold used on
# the validation set.
pred_test <- predict(svm.model, test)
pred.class_test <- as.numeric(pred_test > 0.1)
testdata$y <- pred.class_test

# Keep only the tweets predicted as class 0 (the label given to the
# non-complaint training rows).
df1 <- testdata[testdata$y == 0, ]

## rule classification to test data
## word frequency of complaint
# Rows 1-1700 of airdata are the complaint tweets; build a corpus from their
# id (column 1) and text (column 3).
com_air <- airdata[1:1700, c(1, 3)]
names(com_air) <- c("doc_id", "text")
docs_com <- Corpus(DataframeSource(com_air))

mystops <- c(
  "alaskaair", "fly", "flying", "flight", "united", "americanair", "jetblue",
  "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline"
)
com_control <- list(
  tolower = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stripWhitespace = TRUE,
  stopwords = c(mystops, stopwords("english"), stopwords("spanish"))
)
dtm_com <- DocumentTermMatrix(docs_com, control = com_control)
dtm_com <- removeSparseTerms(dtm_com, 0.995)

# Total count of each remaining term across the complaint tweets, most
# frequent first.
freq_com <- colSums(as.matrix(dtm_com))
freq.sorted_com <- sort(freq_com, decreasing = TRUE)
##      delayed          get      service        hours        never          can 
##          178          149          124          118          118          116 
##          amp        delay        plane        worst         help      waiting 
##          111          107          105          100           98           95 
##         hour         time      flights         gate     customer         ever 
##           92           91           89           88           88           84 
##         just    cancelled         hold          one         will         back 
##           83           81           77           75           75           68 
##         lost        still          bad          ive          bag       please 
##           68           67           66           64           63           58 
##        today         cant         like         wait        stuck   experience 
##           58           56           56           55           55           55 
##       really         late         need         wifi      another         last 
##           54           53           53           52           52           50 
##      minutes         dont        first         bags          min disappointed 
##           49           49           48           46           46           45 
##        phone       people          hey         miss         home     terrible 
##           45           45           45           45           44           44 
##      airport        going       trying         make          day      sitting 
##           44           43           43           41           41           40 
##          hrs         call         seat         even          new   connection 
##           40           40           40           39           39           38 
##          due      luggage       missed          got          jfk       delays 
##           38           38           37           37           37           37 
##          way         wont       better        issue       change     response 
##           37           36           36           36           36           36 
##          two      missing     airlines    usairways         guys        leave 
##           35           35           34           34           33           33 
##        sucks      getting         told         mins      problem        thats 
##           32           32           32           32           32           31 
##       issues         work          lax         fail        check       travel 
##           30           30           30           30           29           29 
##        class         days         hope         want 
##           29           28           28           28
## word frequency of noncomplaint
# Rows 1701-3400 of airdata are the non-complaint tweets; build a corpus from
# their id (column 1) and text (column 3).
non_air <- airdata[1701:3400, c(1, 3)]
names(non_air) <- c("doc_id", "text")
docs_non <- Corpus(DataframeSource(non_air))

mystops <- c(
  "alaskaair", "fly", "flying", "flight", "united", "americanair", "jetblue",
  "deltaassist", "now", "southwestair", "delta", "virginamerica", "airline"
)
non_control <- list(
  tolower = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stripWhitespace = TRUE,
  stopwords = c(mystops, stopwords("english"), stopwords("spanish"))
)
dtm_non <- DocumentTermMatrix(docs_non, control = non_control)
dtm_non <- removeSparseTerms(dtm_non, 0.995)

# Total count of each remaining term across the non-complaint tweets, most
# frequent first.
freq_non <- colSums(as.matrix(dtm_non))
freq.sorted_non <- sort(freq_non, decreasing = TRUE)
##       wait      never        can        amp        get       cant       just 
##        154        150        121        113        102        101         95 
##       miss       time       will        bad    flights       like       dont 
##         89         87         86         86         75         70         64 
##     thanks      great      issue     travel    waiting  cancelled      plane 
##         63         61         61         61         60         59         55 
##       back      today        new      leave        got    service        day 
##         54         54         54         52         51         51         51 
##      first        see        way       late    delayed   airlines      thank 
##         51         51         51         49         49         48         48 
##       lost      delay       best       good        one       need     missed 
##         48         47         46         46         45         45         42 
##        ive       know     cancel       home    problem       guys       last 
##         41         41         41         41         41         40         39 
##       love       help      going        sad     longer   customer      stuck 
##         37         37         37         37         36         36         35 
##       trip      sorry       much     issues   tomorrow     people     switch 
##         34         34         34         34         33         33         32 
##     please      youre      still       even       make       seat     always 
##         32         32         31         30         30         29         29 
##     really       gate        ill     better    tonight      didnt        jfk 
##         29         29         28         27         27         27         27 
##    getting       next    missing       hold        air experience       hope 
##         27         27         27         27         26         26         26 
##    another    weather    awesome    morning       call    chicago  usairways 
##         26         26         25         25         25         24         24 
##       crew      miles      early       free        via       made       give 
##         24         24         24         24         23         23         23 
##       want      thats 
##         22         22
# Row positions in df1 whose tweet contains any of the negative keywords
# (case-insensitive substring match, so e.g. "hell" also matches "hello").
a <- grep("fuck|shit|worst|bad|hate|awful|wtf|worse|disappoint|frustrat|rude|never|poor|sad|disgust|bother|no longer|changing|hell|suck|stuck|unacceptable|upset|ruin|shame|unprofessional|complaint|nothappy|kill|waste|screw",
          df1$tweet, ignore.case = TRUE, perl = TRUE)

# Keep the rows that did NOT match. `df1[-a, ]` is unsafe here: when nothing
# matches, `a` is integer(0) and `df1[-integer(0), ]` returns zero rows instead
# of all of them. Selecting the complement of `a` is correct in both cases.
df3 <- df1[setdiff(seq_len(nrow(df1)), a), ]