require("cluster")
## Loading required package: cluster
require("fpc")
## Loading required package: fpc
require("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
require("gridExtra")
## Loading required package: gridExtra
library(cluster)
library(fpc)
library(factoextra)
library(gridExtra)
library("conjoint")

##load data
load("~/Desktop/Toy house/GBA424 - Toy Horse Case Data.Rdata")

##delete NA rows
df1 = conjointData[complete.cases(conjointData$ratings), ]
##create a new dataframe, which has 5 columns
util = as.data.frame(array(dim=c(1,5))) 
colnames(util) = c("(intercept)", "pricelow", "26inches", "Rocking", "Glamour")

##complete individual part-utilities
for (i in 1:200){
   lm1 = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df1[df1$ID ==i, ])
   util[i,] = lm1$coefficients
}
##cluster 
clustTest = function(toClust,print=TRUE,scale=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
    if(scale){ toClust = scale(toClust);}
    set.seed(seed);   # set random number seed before doing cluster analysis
    wss <- (nrow(toClust)-1)*sum(apply(toClust,2,var))
    for (i in 2:maxClusts) wss[i] <- sum(kmeans(toClust,centers=i,nstart=nstart,iter.max=iter.max)$withinss)
    ##gpw essentially does the following plot using wss above. 
    #plot(1:maxClusts, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
    gpw = fviz_nbclust(toClust,kmeans,method="wss",iter.max=iter.max,nstart=nstart,k.max=maxClusts) #alternative way to get wss elbow chart.
    pm1 = pamk(toClust,scaling=TRUE)
    ## pm1$nc indicates the optimal number of clusters based on 
    ## lowest average silhoutte score (a measure of quality of clustering)
    #alternative way that presents it visually as well.
    gps = fviz_nbclust(toClust,kmeans,method="silhouette",iter.max=iter.max,nstart=nstart,k.max=maxClusts) 
    if(print){
        grid.arrange(gpw,gps, nrow = 1)
    }
    list(wss=wss,pm1=pm1$nc,gpw=gpw,gps=gps)
}

runClusts = function(toClust,nClusts,print=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
    if(length(nClusts)>4){
        warning("Using only first 4 elements of nClusts.")
    }
    kms=list(); ps=list();
    for(i in 1:length(nClusts)){
        kms[[i]] = kmeans(toClust,nClusts[i],iter.max = iter.max, nstart=nstart)
        ps[[i]] = fviz_cluster(kms[[i]], geom = "point", data = toClust) + ggtitle(paste("k =",nClusts[i]))
        
    }
    library(gridExtra)
    if(print){
        tmp = marrangeGrob(ps, nrow = 2,ncol=2)
        print(tmp)
    }
    list(kms=kms,ps=ps)
}

##Plots a kmeans cluster as three plot report
##  pie chart with membership percentages
##  ellipse plot that indicates cluster definitions against principle components
##  barplot of the cluster means
plotClust = function(km,toClust,discPlot=FALSE){
    nc = length(km$size)
    if(discPlot){par(mfrow=c(2,2))}
    else {par(mfrow=c(3,1))}
    percsize = paste(1:nc," = ",format(km$size/sum(km$size)*100,digits=2),"%",sep="")
    pie(km$size,labels=percsize,col=1:nc)
    
    clusplot(toClust, km$cluster, color=TRUE, shade=TRUE,
             labels=2, lines=0,col.clus=1:nc); #plot clusters against principal components
    
    if(discPlot){
        plotcluster(toClust, km$cluster,col=km$cluster); #plot against discriminant functions ()
    }
    rng = range(km$centers)
    dist = rng[2]-rng[1]
    locs = km$centers+.05*dist*ifelse(km$centers>0,1,-1)
    bm = barplot(km$centers,beside=TRUE,col=1:nc,main="Cluster Means",ylim=rng+dist*c(-.1,.1))
    text(bm,locs,formatC(km$centers,format="f",digits=1))
}

checks = clustTest(util,print=TRUE,scale=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100)

clusts = runClusts(util,3,print=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100)

plotClust(clusts[[1]][[1]],util)

##predicted ratings for missing profiles (complete ratings)
fini = conjointData
for (i in (1:3200)){
    if (is.na(conjointData[i,3]) == TRUE){
        fini[i,3] = util[conjointData[i,1],1]+util[conjointData[i,1],2]*conjointData[i,4]+util[conjointData[i,1],3]*conjointData[i,5]+util[conjointData[i,1],4]*conjointData[i,6]+util[conjointData[i,1],5]*conjointData[i,7]
    }
}


## part utilities by gender and age
df2 = merge(df1,respondentData,by = 'ID')
lmfemale = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$gender == 1,])
summary(lmfemale)
## 
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) + 
##     factor(style), data = df2[df2$gender == 1, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.675 -13.646   2.716  12.948  36.175 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      40.8701     1.0580  38.630  < 2e-16 ***
## factor(price)1   13.5086     0.9914  13.626  < 2e-16 ***
## factor(size)1     7.7555     0.9492   8.171 7.24e-16 ***
## factor(motion)1   2.9068     0.9492   3.062  0.00224 ** 
## factor(style)1    3.7270     0.9492   3.927 9.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.82 on 1291 degrees of freedom
## Multiple R-squared:  0.1902, Adjusted R-squared:  0.1877 
## F-statistic: 75.82 on 4 and 1291 DF,  p-value: < 2.2e-16
lmmale = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$gender ==0,])
summary(lmmale)
## 
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) + 
##     factor(style), data = df2[df2$gender == 0, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.917 -10.607  -2.351   7.762  47.017 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      36.5668     0.9821  37.233  < 2e-16 ***
## factor(price)1   16.8573     0.9203  18.318  < 2e-16 ***
## factor(size)1     3.8509     0.8811   4.371 1.36e-05 ***
## factor(motion)1  -0.7601     0.8811  -0.863   0.3885    
## factor(style)1   -1.8895     0.8811  -2.145   0.0322 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.41 on 1099 degrees of freedom
## Multiple R-squared:  0.2862, Adjusted R-squared:  0.2836 
## F-statistic: 110.2 on 4 and 1099 DF,  p-value: < 2.2e-16
lmage2= lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$age ==0, ])
summary(lmage2)
## 
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) + 
##     factor(style), data = df2[df2$age == 0, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.528 -11.312  -0.598   9.857  43.040 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      39.5462     1.0164  38.909  < 2e-16 ***
## factor(price)1   14.4133     0.9524  15.134  < 2e-16 ***
## factor(size)1     3.8532     0.9118   4.226 2.57e-05 ***
## factor(motion)1   2.7950     0.9118   3.065  0.00222 ** 
## factor(style)1    1.1867     0.9118   1.301  0.19338    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 1183 degrees of freedom
## Multiple R-squared:  0.1912, Adjusted R-squared:  0.1884 
## F-statistic:  69.9 on 4 and 1183 DF,  p-value: < 2.2e-16
lmage3 = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$age ==1, ])
summary(lmage3)
## 
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) + 
##     factor(style), data = df2[df2$age == 1, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.465 -14.368  -2.484  15.008  44.386 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      38.2480     1.1400  33.549  < 2e-16 ***
## factor(price)1   15.6721     1.0683  14.671  < 2e-16 ***
## factor(size)1     8.0239     1.0228   7.845 9.47e-15 ***
## factor(motion)1  -0.3238     1.0228  -0.317    0.752    
## factor(style)1    1.1010     1.0228   1.076    0.282    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.53 on 1207 degrees of freedom
## Multiple R-squared:  0.2185, Adjusted R-squared:  0.2159 
## F-statistic: 84.37 on 4 and 1207 DF,  p-value: < 2.2e-16
##Test
summary(lm(ratings~(factor(price)+factor(size)+factor(motion)+factor(style))*factor(age), data = df2))
## 
## Call:
## lm(formula = ratings ~ (factor(price) + factor(size) + factor(motion) + 
##     factor(style)) * factor(age), data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.528 -12.886  -1.307  12.388  44.386 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   39.5462     1.0867  36.390  < 2e-16 ***
## factor(price)1                14.4133     1.0183  14.154  < 2e-16 ***
## factor(size)1                  3.8532     0.9749   3.952 7.97e-05 ***
## factor(motion)1                2.7950     0.9749   2.867  0.00418 ** 
## factor(style)1                 1.1867     0.9749   1.217  0.22367    
## factor(age)1                  -1.2982     1.5292  -0.849  0.39601    
## factor(price)1:factor(age)1    1.2588     1.4330   0.878  0.37977    
## factor(size)1:factor(age)1     4.1708     1.3720   3.040  0.00239 ** 
## factor(motion)1:factor(age)1  -3.1188     1.3720  -2.273  0.02310 *  
## factor(style)1:factor(age)1   -0.0857     1.3720  -0.062  0.95020    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.55 on 2390 degrees of freedom
## Multiple R-squared:  0.2069, Adjusted R-squared:  0.204 
## F-statistic:  69.3 on 9 and 2390 DF,  p-value: < 2.2e-16
summary(lm(ratings~(factor(price)+factor(size)+factor(motion)+factor(style))*factor(gender), data = df2))
## 
## Call:
## lm(formula = ratings ~ (factor(price) + factor(size) + factor(motion) + 
##     factor(style)) * factor(gender), data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.675 -11.838  -0.398  11.456  47.017 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      36.5668     1.0739  34.050  < 2e-16 ***
## factor(price)1                   16.8573     1.0063  16.752  < 2e-16 ***
## factor(size)1                     3.8509     0.9635   3.997 6.61e-05 ***
## factor(motion)1                  -0.7601     0.9635  -0.789  0.43022    
## factor(style)1                   -1.8895     0.9635  -1.961  0.04997 *  
## factor(gender)1                   4.3032     1.4614   2.945  0.00327 ** 
## factor(price)1:factor(gender)1   -3.3488     1.3694  -2.445  0.01454 *  
## factor(size)1:factor(gender)1     3.9046     1.3111   2.978  0.00293 ** 
## factor(motion)1:factor(gender)1   3.6669     1.3111   2.797  0.00520 ** 
## factor(style)1:factor(gender)1    5.6165     1.3111   4.284 1.91e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.76 on 2390 degrees of freedom
## Multiple R-squared:  0.2803, Adjusted R-squared:  0.2776 
## F-statistic: 103.4 on 9 and 2390 DF,  p-value: < 2.2e-16
##customer segmentation
group = clusts$kms[[1]]$cluster
gro = c()
for (i in group) {
    gro = c(gro,rep(i, times=16))
}
fini$group = gro
##market simulation
market = as.data.frame(array(dim=c(1,16)))
colnames(market) = c(1:16)
for (i in (1:200)){
    customer = fini[fini$ID == i, ]
    market[i, ] = rank(customer$ratings)
}


simFCDecisions = function(scen,data,ascend=FALSE){ 
    inmkt = data[,scen] #construct the subsetted matrix of options
    if(ascend){ #if ranks 1 is best 
        bestOpts = apply(inmkt,1,which.min)  #identify which option is best = min
    } else { #else the best rank is the largest number
        bestOpts = apply(inmkt,1,which.max) #identify which option is best = max
    }
    ret = as.data.frame(model.matrix(~0+as.factor(bestOpts))) #fill to set of options marked 0 or 1
    names(ret) = names(inmkt)
    ret
}

calcUnitShares = function(decisions){
    colSums(decisions)/sum(decisions) #assumes that total decisions is market size
}

simFCShares=function(scen,data,ascend=FALSE){
    decs = simFCDecisions(scen,data,ascend) #determine decisions
    calcUnitShares(decs) #calculate shares and return
}

simFCScenarios = function(scenarios,data,...){
    res = matrix(nrow=length(scenarios),ncol=length(data)) #sets everything to NA by default
    for(i in 1:length(scenarios)){ ##loop over scenarios
        res[i, scenarios[[i]] ] = simFCShares(scenarios[[i]],data,...)##  calculate market shares and save to right columns in res for the scenario
    }
    res = as.data.frame(res); names(res) = names(data)
    res ##return result table
}


scens = list()
scens[[1]] = c(4,8,16)       
scens[[2]] = c(4,14,8)
scens[[3]] = c(14,16,8)
scens[[4]] = c(4,8,14,16)
scens[[5]] = c(3,7,15)
scens[[6]] = c(3,7,13)
scens[[7]] = c(7,13,15)
scens[[8]] = c(3,7,13,15)
scens[[9]] = c(5,7,13)

df100 = simFCScenarios(scens,market[,1:16])
df101 = df100*200