rm(list=ls())
dir = "~/Desktop/Toy house"
setwd(dir)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(grf)
d = read.csv("test_data_1904.csv")
dt = data.table(d)
dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),seOpen = sd(open)/sqrt(.N), seClick=sd(click)/sqrt(.N), sePurch = sd(purch)/sqrt(.N),.N),by = .(group)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(x=group,y=purch,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
geom_bar(position=dodge,stat="identity",col=2:3,fill=2:3) +
geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
##
## $y
## [1] "Purchases"
##
## attr(,"class")
## [1] "labels"
summary(lm(purch~group,data=d))
##
## Call:
## lm(formula = purch ~ group, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.12 -14.12 -12.77 -12.77 1798.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.7727 0.2260 56.528 < 2e-16 ***
## groupemail 1.3465 0.3195 4.214 2.52e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.71 on 78310 degrees of freedom
## Multiple R-squared: 0.0002267, Adjusted R-squared: 0.0002139
## F-statistic: 17.76 on 1 and 78310 DF, p-value: 2.515e-05
d$email = (d$group != "ctrl")
##recent purchase
hist(d$last_purch,
xlab="Days Since Last Purchase", ylab="Customers",
main="Histogram of Days Since Last Purchase")

d$recentPurch = (d$last_purch < 60)
dt = data.table(d)
dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,recentPurch)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(fill=group,y=purch,x=recentPurch,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
geom_bar(position=dodge,stat="identity") +
geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
##
## $y
## [1] "Purchases"
##
## attr(,"class")
## [1] "labels"
##past-purchase
hist(d$past_purch,
xlab="Past Purchase", ylab="Customers",
main="Histogram of how much the customer avg spend in past visits")

d$buysomething = (d$past_purch > 0)
dt = data.table(d)
dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,buysomething)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(fill=group,y=purch,x=buysomething,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
geom_bar(position=dodge,stat="identity") +
geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
##
## $y
## [1] "Purchases"
##
## attr(,"class")
## [1] "labels"
##frequent visitors
hist(d$visits,
xlab="Past Purchase", ylab="Customers",
main="Histogram of how much the customer avg spend in past visits")

d$frequentvisitor = (d$visits > 5)
dt = data.table(d)
dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,frequentvisitor)]
dagg
dodge = position_dodge(width=1); ##to form constant dimensions
ggplot(aes(fill=group,y=purch,x=frequentvisitor,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
geom_bar(position=dodge,stat="identity") +
geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
##
## $y
## [1] "Purchases"
##
## attr(,"class")
## [1] "labels"
##interaction test
summary(lm(purch~recentPurch+group:recentPurch, data = d))
##
## Call:
## lm(formula = purch ~ recentPurch + group:recentPurch, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.03 -19.29 -7.66 -6.74 1791.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7399 0.3102 21.727 < 2e-16 ***
## recentPurchTRUE 12.5536 0.4475 28.054 < 2e-16 ***
## recentPurchFALSE:groupemail 0.9158 0.4393 2.085 0.037081 *
## recentPurchTRUE:groupemail 1.7388 0.4555 3.817 0.000135 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.24 on 78308 degrees of freedom
## Multiple R-squared: 0.02124, Adjusted R-squared: 0.0212
## F-statistic: 566.4 on 3 and 78308 DF, p-value: < 2.2e-16
summary(lm(purch~buysomething+group:buysomething, data = d))
##
## Call:
## lm(formula = purch ~ buysomething + group:buysomething, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.92 -15.34 -15.34 -7.27 1795.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2668 0.3991 18.208 < 2e-16 ***
## buysomethingTRUE 8.0736 0.4833 16.706 < 2e-16 ***
## buysomethingFALSE:groupemail 0.6899 0.5668 1.217 0.223
## buysomethingTRUE:groupemail 1.5835 0.3847 4.116 3.86e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.54 on 78308 degrees of freedom
## Multiple R-squared: 0.008084, Adjusted R-squared: 0.008046
## F-statistic: 212.7 on 3 and 78308 DF, p-value: < 2.2e-16
summary(lm(purch~frequentvisitor+group:frequentvisitor, data = d))
##
## Call:
## lm(formula = purch ~ frequentvisitor + group:frequentvisitor,
## data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.46 -15.29 -11.96 -10.47 1796.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.4665 0.3123 33.514 < 2e-16 ***
## frequentvisitorTRUE 4.8253 0.4517 10.682 < 2e-16 ***
## frequentvisitorFALSE:groupemail 1.4960 0.4419 3.385 0.000712 ***
## frequentvisitorTRUE:groupemail 1.1707 0.4613 2.538 0.011154 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.65 on 78308 degrees of freedom
## Multiple R-squared: 0.002943, Adjusted R-squared: 0.002905
## F-statistic: 77.05 on 3 and 78308 DF, p-value: < 2.2e-16
d$profit = d$purch*0.3-0.1
##cluster
seg1 = d[d$recentPurch == TRUE & d$visits >5 & d$buysomething == TRUE, ]
seg1$segmentation = 1
seg2 = d[d$recentPurch ==FALSE & d$visits >5 & d$buysomething == TRUE, ]
seg2$segmentation = 2
seg3 = d[d$recentPurch == TRUE & d$visits <=5 & d$buysomething == TRUE, ]
seg3$segmentation = 3
seg4 = d[d$recentPurch == FALSE & d$visits <=5 & d$buysomething == TRUE,]
seg4$segmentation = 4
seg5 = d[d$recentPurch == TRUE & d$visits >5 & d$buysomething ==FALSE, ]
seg5$segmentation = 5
seg6 = d[d$recentPurch ==FALSE & d$visits >5 & d$buysomething == FALSE, ]
seg6$segmentation = 6
seg7 = d[d$recentPurch == TRUE & d$visits <=5 & d$buysomething == FALSE, ]
seg7$segmentation = 7
seg8 = d[d$recentPurch == FALSE & d$visits <=5 & d$buysomething == FALSE,]
seg8$segmentation = 8
df = rbind(seg1,seg2,seg3,seg4,seg5,seg6,seg7,seg8)
tapply(seg1$profit, seg1$group, sum)
## ctrl email
## 50548.45 54696.16
tapply(seg2$profit, seg2$group, sum)
## ctrl email
## 23025.44 25501.77
tapply(seg3$profit, seg3$group, sum)
## ctrl email
## 35059.02 40127.34
tapply(seg4$profit, seg4$group, sum)
## ctrl email
## 11586.86 13604.75
tapply(seg5$profit, seg5$group, sum)
## ctrl email
## 8353.520 8273.553
tapply(seg6$profit, seg6$group, sum)
## ctrl email
## 2052.446 2332.428
tapply(seg7$profit, seg7$group, sum)
## ctrl email
## 13070.63 14389.58
tapply(seg8$profit, seg8$group, sum)
## ctrl email
## 2425.956 3013.439
cf_size <- 10000
cf_set = sample(nrow(d),cf_size)
treat <- d$email[cf_set]
response <- d$purch[cf_set]
colnames(d)
## [1] "user_id" "cpgn_id" "group" "open"
## [5] "click" "purch" "chard" "sav_blanc"
## [9] "syrah" "cab" "past_purch" "last_purch"
## [13] "visits" "email" "recentPurch" "buysomething"
## [17] "frequentvisitor" "profit"
baseline <- d[cf_set, c("last_purch", "past_purch","visits", "chard", "sav_blanc", "syrah", "cab")]
cf <- causal_forest(baseline, response, treat)
print(cf)
## GRF forest object of type causal_forest
## Number of trees: 2000
## Number of training samples: 10000
## Variable importance:
## 1 2 3 4 5 6 7
## 0.215 0.239 0.088 0.176 0.147 0.050 0.086
average_treatment_effect(cf, method="AIPW")
## estimate std.err
## 2.5424379 0.8450148
new_cust = d[,c("last_purch", "past_purch","visits", "chard", "sav_blanc", "syrah", "cab")]
pre = predict(cf, new_cust, estimate.variance = TRUE)
hist(predict(cf)$predictions,
main="Histogram of Purchase Lift",
xlab="Purchase Lift for Email", ylab="Customers")

new_cust$score = pre$predictions*0.3-0.1
c1 = new_cust$score > 0
new_cust$target = c1
summary(new_cust[new_cust$target == TRUE,])
## last_purch past_purch visits chard
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 24.00 1st Qu.: 11.02 1st Qu.: 4.000 1st Qu.: 0.00
## Median : 60.00 Median : 74.67 Median : 6.000 Median : 0.00
## Mean : 77.44 Mean : 142.47 Mean : 5.755 Mean : 74.71
## 3rd Qu.: 107.00 3rd Qu.: 197.60 3rd Qu.: 7.000 3rd Qu.: 73.01
## Max. :1054.00 Max. :13379.44 Max. :64.000 Max. :13379.44
## sav_blanc syrah cab score
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000024
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.388226
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.955587
## Mean : 31.56 Mean : 2.989 Mean : 33.21 Mean : 1.210086
## 3rd Qu.: 35.18 3rd Qu.: 0.000 3rd Qu.: 26.86 3rd Qu.: 1.672907
## Max. :1804.92 Max. :360.320 Max. :2649.78 Max. :16.623212
## target
## Mode:logical
## TRUE:53840
##
##
##
##
summary(new_cust[new_cust$target == FALSE,])
## last_purch past_purch visits chard
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 35.0 1st Qu.: 0.00 1st Qu.: 4.00 1st Qu.: 0.00
## Median : 77.0 Median : 14.12 Median : 5.00 Median : 0.00
## Mean : 117.8 Mean : 104.48 Mean : 5.41 Mean : 72.49
## 3rd Qu.: 183.0 3rd Qu.: 97.19 3rd Qu.: 7.00 3rd Qu.: 0.00
## Max. :1225.0 Max. :6321.21 Max. :43.00 Max. :6321.21
## sav_blanc syrah cab score
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. :-5.677884
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:-0.851761
## Median : 0.00 Median : 0.000 Median : 0.00 Median :-0.342670
## Mean : 16.05 Mean : 2.514 Mean : 13.42 Mean :-0.595749
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.:-0.136757
## Max. :3843.24 Max. :182.380 Max. :329.40 Max. :-0.000051
## target
## Mode :logical
## FALSE:24472
##
##
##
##
d$score = pre$predictions*0.3-0.1
d$target = d$score>0