require("cluster")
## Loading required package: cluster
require("fpc")
## Loading required package: fpc
require("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
require("gridExtra")
## Loading required package: gridExtra
library(cluster)
library(fpc)
library(factoextra)
library(gridExtra)
library("conjoint")
##load data
load("~/Desktop/Toy house/GBA424 - Toy Horse Case Data.Rdata")
##delete NA rows
df1 = conjointData[complete.cases(conjointData$ratings), ]
##create a new dataframe, which has 5 columns
util = as.data.frame(array(dim=c(1,5)))
colnames(util) = c("(intercept)", "pricelow", "26inches", "Rocking", "Glamour")
##complete individual part-utilities
for (i in 1:200){
lm1 = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df1[df1$ID ==i, ])
util[i,] = lm1$coefficients
}
##cluster
clustTest = function(toClust,print=TRUE,scale=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
if(scale){ toClust = scale(toClust);}
set.seed(seed); # set random number seed before doing cluster analysis
wss <- (nrow(toClust)-1)*sum(apply(toClust,2,var))
for (i in 2:maxClusts) wss[i] <- sum(kmeans(toClust,centers=i,nstart=nstart,iter.max=iter.max)$withinss)
##gpw essentially does the following plot using wss above.
#plot(1:maxClusts, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
gpw = fviz_nbclust(toClust,kmeans,method="wss",iter.max=iter.max,nstart=nstart,k.max=maxClusts) #alternative way to get wss elbow chart.
pm1 = pamk(toClust,scaling=TRUE)
## pm1$nc indicates the optimal number of clusters based on
## lowest average silhoutte score (a measure of quality of clustering)
#alternative way that presents it visually as well.
gps = fviz_nbclust(toClust,kmeans,method="silhouette",iter.max=iter.max,nstart=nstart,k.max=maxClusts)
if(print){
grid.arrange(gpw,gps, nrow = 1)
}
list(wss=wss,pm1=pm1$nc,gpw=gpw,gps=gps)
}
runClusts = function(toClust,nClusts,print=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
if(length(nClusts)>4){
warning("Using only first 4 elements of nClusts.")
}
kms=list(); ps=list();
for(i in 1:length(nClusts)){
kms[[i]] = kmeans(toClust,nClusts[i],iter.max = iter.max, nstart=nstart)
ps[[i]] = fviz_cluster(kms[[i]], geom = "point", data = toClust) + ggtitle(paste("k =",nClusts[i]))
}
library(gridExtra)
if(print){
tmp = marrangeGrob(ps, nrow = 2,ncol=2)
print(tmp)
}
list(kms=kms,ps=ps)
}
##Plots a kmeans cluster as three plot report
## pie chart with membership percentages
## ellipse plot that indicates cluster definitions against principle components
## barplot of the cluster means
plotClust = function(km,toClust,discPlot=FALSE){
nc = length(km$size)
if(discPlot){par(mfrow=c(2,2))}
else {par(mfrow=c(3,1))}
percsize = paste(1:nc," = ",format(km$size/sum(km$size)*100,digits=2),"%",sep="")
pie(km$size,labels=percsize,col=1:nc)
clusplot(toClust, km$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0,col.clus=1:nc); #plot clusters against principal components
if(discPlot){
plotcluster(toClust, km$cluster,col=km$cluster); #plot against discriminant functions ()
}
rng = range(km$centers)
dist = rng[2]-rng[1]
locs = km$centers+.05*dist*ifelse(km$centers>0,1,-1)
bm = barplot(km$centers,beside=TRUE,col=1:nc,main="Cluster Means",ylim=rng+dist*c(-.1,.1))
text(bm,locs,formatC(km$centers,format="f",digits=1))
}
checks = clustTest(util,print=TRUE,scale=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100)

clusts = runClusts(util,3,print=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100)

plotClust(clusts[[1]][[1]],util)

##predicted ratings for missing profiles (complete ratings)
fini = conjointData
for (i in (1:3200)){
if (is.na(conjointData[i,3]) == TRUE){
fini[i,3] = util[conjointData[i,1],1]+util[conjointData[i,1],2]*conjointData[i,4]+util[conjointData[i,1],3]*conjointData[i,5]+util[conjointData[i,1],4]*conjointData[i,6]+util[conjointData[i,1],5]*conjointData[i,7]
}
}
## part utilities by gender and age
df2 = merge(df1,respondentData,by = 'ID')
lmfemale = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$gender == 1,])
summary(lmfemale)
##
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) +
## factor(style), data = df2[df2$gender == 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.675 -13.646 2.716 12.948 36.175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.8701 1.0580 38.630 < 2e-16 ***
## factor(price)1 13.5086 0.9914 13.626 < 2e-16 ***
## factor(size)1 7.7555 0.9492 8.171 7.24e-16 ***
## factor(motion)1 2.9068 0.9492 3.062 0.00224 **
## factor(style)1 3.7270 0.9492 3.927 9.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.82 on 1291 degrees of freedom
## Multiple R-squared: 0.1902, Adjusted R-squared: 0.1877
## F-statistic: 75.82 on 4 and 1291 DF, p-value: < 2.2e-16
lmmale = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$gender ==0,])
summary(lmmale)
##
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) +
## factor(style), data = df2[df2$gender == 0, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.917 -10.607 -2.351 7.762 47.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5668 0.9821 37.233 < 2e-16 ***
## factor(price)1 16.8573 0.9203 18.318 < 2e-16 ***
## factor(size)1 3.8509 0.8811 4.371 1.36e-05 ***
## factor(motion)1 -0.7601 0.8811 -0.863 0.3885
## factor(style)1 -1.8895 0.8811 -2.145 0.0322 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.41 on 1099 degrees of freedom
## Multiple R-squared: 0.2862, Adjusted R-squared: 0.2836
## F-statistic: 110.2 on 4 and 1099 DF, p-value: < 2.2e-16
lmage2= lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$age ==0, ])
summary(lmage2)
##
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) +
## factor(style), data = df2[df2$age == 0, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.528 -11.312 -0.598 9.857 43.040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.5462 1.0164 38.909 < 2e-16 ***
## factor(price)1 14.4133 0.9524 15.134 < 2e-16 ***
## factor(size)1 3.8532 0.9118 4.226 2.57e-05 ***
## factor(motion)1 2.7950 0.9118 3.065 0.00222 **
## factor(style)1 1.1867 0.9118 1.301 0.19338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 1183 degrees of freedom
## Multiple R-squared: 0.1912, Adjusted R-squared: 0.1884
## F-statistic: 69.9 on 4 and 1183 DF, p-value: < 2.2e-16
lmage3 = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style), data = df2[df2$age ==1, ])
summary(lmage3)
##
## Call:
## lm(formula = ratings ~ factor(price) + factor(size) + factor(motion) +
## factor(style), data = df2[df2$age == 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.465 -14.368 -2.484 15.008 44.386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.2480 1.1400 33.549 < 2e-16 ***
## factor(price)1 15.6721 1.0683 14.671 < 2e-16 ***
## factor(size)1 8.0239 1.0228 7.845 9.47e-15 ***
## factor(motion)1 -0.3238 1.0228 -0.317 0.752
## factor(style)1 1.1010 1.0228 1.076 0.282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.53 on 1207 degrees of freedom
## Multiple R-squared: 0.2185, Adjusted R-squared: 0.2159
## F-statistic: 84.37 on 4 and 1207 DF, p-value: < 2.2e-16
##Test
summary(lm(ratings~(factor(price)+factor(size)+factor(motion)+factor(style))*factor(age), data = df2))
##
## Call:
## lm(formula = ratings ~ (factor(price) + factor(size) + factor(motion) +
## factor(style)) * factor(age), data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.528 -12.886 -1.307 12.388 44.386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.5462 1.0867 36.390 < 2e-16 ***
## factor(price)1 14.4133 1.0183 14.154 < 2e-16 ***
## factor(size)1 3.8532 0.9749 3.952 7.97e-05 ***
## factor(motion)1 2.7950 0.9749 2.867 0.00418 **
## factor(style)1 1.1867 0.9749 1.217 0.22367
## factor(age)1 -1.2982 1.5292 -0.849 0.39601
## factor(price)1:factor(age)1 1.2588 1.4330 0.878 0.37977
## factor(size)1:factor(age)1 4.1708 1.3720 3.040 0.00239 **
## factor(motion)1:factor(age)1 -3.1188 1.3720 -2.273 0.02310 *
## factor(style)1:factor(age)1 -0.0857 1.3720 -0.062 0.95020
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.55 on 2390 degrees of freedom
## Multiple R-squared: 0.2069, Adjusted R-squared: 0.204
## F-statistic: 69.3 on 9 and 2390 DF, p-value: < 2.2e-16
summary(lm(ratings~(factor(price)+factor(size)+factor(motion)+factor(style))*factor(gender), data = df2))
##
## Call:
## lm(formula = ratings ~ (factor(price) + factor(size) + factor(motion) +
## factor(style)) * factor(gender), data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.675 -11.838 -0.398 11.456 47.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5668 1.0739 34.050 < 2e-16 ***
## factor(price)1 16.8573 1.0063 16.752 < 2e-16 ***
## factor(size)1 3.8509 0.9635 3.997 6.61e-05 ***
## factor(motion)1 -0.7601 0.9635 -0.789 0.43022
## factor(style)1 -1.8895 0.9635 -1.961 0.04997 *
## factor(gender)1 4.3032 1.4614 2.945 0.00327 **
## factor(price)1:factor(gender)1 -3.3488 1.3694 -2.445 0.01454 *
## factor(size)1:factor(gender)1 3.9046 1.3111 2.978 0.00293 **
## factor(motion)1:factor(gender)1 3.6669 1.3111 2.797 0.00520 **
## factor(style)1:factor(gender)1 5.6165 1.3111 4.284 1.91e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.76 on 2390 degrees of freedom
## Multiple R-squared: 0.2803, Adjusted R-squared: 0.2776
## F-statistic: 103.4 on 9 and 2390 DF, p-value: < 2.2e-16
##customer segmentation
group = clusts$kms[[1]]$cluster
gro = c()
for (i in group) {
gro = c(gro,rep(i, times=16))
}
fini$group = gro
##market simulation
market = as.data.frame(array(dim=c(1,16)))
colnames(market) = c(1:16)
for (i in (1:200)){
customer = fini[fini$ID == i, ]
market[i, ] = rank(customer$ratings)
}
simFCDecisions = function(scen,data,ascend=FALSE){
inmkt = data[,scen] #construct the subsetted matrix of options
if(ascend){ #if ranks 1 is best
bestOpts = apply(inmkt,1,which.min) #identify which option is best = min
} else { #else the best rank is the largest number
bestOpts = apply(inmkt,1,which.max) #identify which option is best = max
}
ret = as.data.frame(model.matrix(~0+as.factor(bestOpts))) #fill to set of options marked 0 or 1
names(ret) = names(inmkt)
ret
}
calcUnitShares = function(decisions){
colSums(decisions)/sum(decisions) #assumes that total decisions is market size
}
simFCShares=function(scen,data,ascend=FALSE){
decs = simFCDecisions(scen,data,ascend) #determine decisions
calcUnitShares(decs) #calculate shares and return
}
simFCScenarios = function(scenarios,data,...){
res = matrix(nrow=length(scenarios),ncol=length(data)) #sets everything to NA by default
for(i in 1:length(scenarios)){ ##loop over scenarios
res[i, scenarios[[i]] ] = simFCShares(scenarios[[i]],data,...)## calculate market shares and save to right columns in res for the scenario
}
res = as.data.frame(res); names(res) = names(data)
res ##return result table
}
scens = list()
scens[[1]] = c(4,8,16)
scens[[2]] = c(4,14,8)
scens[[3]] = c(14,16,8)
scens[[4]] = c(4,8,14,16)
scens[[5]] = c(3,7,15)
scens[[6]] = c(3,7,13)
scens[[7]] = c(7,13,15)
scens[[8]] = c(3,7,13,15)
scens[[9]] = c(5,7,13)
df100 = simFCScenarios(scens,market[,1:16])
df101 = df100*200