rm(list=ls())
dir =  "~/Desktop/Toy house"
setwd(dir)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(grf)

d = read.csv("test_data_1904.csv")
dt = data.table(d)
dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),seOpen = sd(open)/sqrt(.N), seClick=sd(click)/sqrt(.N), sePurch = sd(purch)/sqrt(.N),.N),by = .(group)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(x=group,y=purch,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
    geom_bar(position=dodge,stat="identity",col=2:3,fill=2:3) + 
    geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
## 
## $y
## [1] "Purchases"
## 
## attr(,"class")
## [1] "labels"
summary(lm(purch~group,data=d))
## 
## Call:
## lm(formula = purch ~ group, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -14.12  -14.12  -12.77  -12.77 1798.38 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.7727     0.2260  56.528  < 2e-16 ***
## groupemail    1.3465     0.3195   4.214 2.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.71 on 78310 degrees of freedom
## Multiple R-squared:  0.0002267,  Adjusted R-squared:  0.0002139 
## F-statistic: 17.76 on 1 and 78310 DF,  p-value: 2.515e-05
d$email = (d$group != "ctrl")
##recent purchase
hist(d$last_purch, 
     xlab="Days Since Last Purchase", ylab="Customers", 
     main="Histogram of Days Since Last Purchase")

d$recentPurch = (d$last_purch < 60)
dt = data.table(d)

dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,recentPurch)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(fill=group,y=purch,x=recentPurch,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
    geom_bar(position=dodge,stat="identity") + 
    geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
## 
## $y
## [1] "Purchases"
## 
## attr(,"class")
## [1] "labels"
##past-purchase
hist(d$past_purch, 
     xlab="Past Purchase", ylab="Customers", 
     main="Histogram of how much the customer avg spend in past visits")

d$buysomething = (d$past_purch > 0)
dt = data.table(d)

dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,buysomething)]
dagg
dodge = position_dodge(width=1)
ggplot(aes(fill=group,y=purch,x=buysomething,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
    geom_bar(position=dodge,stat="identity") + 
    geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
## 
## $y
## [1] "Purchases"
## 
## attr(,"class")
## [1] "labels"
##frequent visitors
hist(d$visits, 
     xlab="Past Purchase", ylab="Customers", 
     main="Histogram of how much the customer avg spend in past visits")

d$frequentvisitor = (d$visits > 5)
dt = data.table(d)

dagg = dt[,.(open = mean(open), click=mean(click), purch = mean(purch),sePurch = sd(purch)/sqrt(.N),.N),by = .(group,frequentvisitor)]
dagg
dodge = position_dodge(width=1); ##to form constant dimensions
ggplot(aes(fill=group,y=purch,x=frequentvisitor,ymax=purch+sePurch,ymin=purch-sePurch),data=dagg)+
    geom_bar(position=dodge,stat="identity") + 
    geom_errorbar(position=dodge)

labs(x="Group",y="Purchases")
## $x
## [1] "Group"
## 
## $y
## [1] "Purchases"
## 
## attr(,"class")
## [1] "labels"
##interaction test
summary(lm(purch~recentPurch+group:recentPurch, data = d))
## 
## Call:
## lm(formula = purch ~ recentPurch + group:recentPurch, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -21.03  -19.29   -7.66   -6.74 1791.47 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   6.7399     0.3102  21.727  < 2e-16 ***
## recentPurchTRUE              12.5536     0.4475  28.054  < 2e-16 ***
## recentPurchFALSE:groupemail   0.9158     0.4393   2.085 0.037081 *  
## recentPurchTRUE:groupemail    1.7388     0.4555   3.817 0.000135 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.24 on 78308 degrees of freedom
## Multiple R-squared:  0.02124,    Adjusted R-squared:  0.0212 
## F-statistic: 566.4 on 3 and 78308 DF,  p-value: < 2.2e-16
summary(lm(purch~buysomething+group:buysomething, data = d))
## 
## Call:
## lm(formula = purch ~ buysomething + group:buysomething, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -16.92  -15.34  -15.34   -7.27 1795.58 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    7.2668     0.3991  18.208  < 2e-16 ***
## buysomethingTRUE               8.0736     0.4833  16.706  < 2e-16 ***
## buysomethingFALSE:groupemail   0.6899     0.5668   1.217    0.223    
## buysomethingTRUE:groupemail    1.5835     0.3847   4.116 3.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.54 on 78308 degrees of freedom
## Multiple R-squared:  0.008084,   Adjusted R-squared:  0.008046 
## F-statistic: 212.7 on 3 and 78308 DF,  p-value: < 2.2e-16
summary(lm(purch~frequentvisitor+group:frequentvisitor, data = d))
## 
## Call:
## lm(formula = purch ~ frequentvisitor + group:frequentvisitor, 
##     data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -16.46  -15.29  -11.96  -10.47 1796.04 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      10.4665     0.3123  33.514  < 2e-16 ***
## frequentvisitorTRUE               4.8253     0.4517  10.682  < 2e-16 ***
## frequentvisitorFALSE:groupemail   1.4960     0.4419   3.385 0.000712 ***
## frequentvisitorTRUE:groupemail    1.1707     0.4613   2.538 0.011154 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.65 on 78308 degrees of freedom
## Multiple R-squared:  0.002943,   Adjusted R-squared:  0.002905 
## F-statistic: 77.05 on 3 and 78308 DF,  p-value: < 2.2e-16
d$profit = d$purch*0.3-0.1

##cluster
seg1 = d[d$recentPurch == TRUE & d$visits >5 & d$buysomething == TRUE, ]
seg1$segmentation = 1
seg2 = d[d$recentPurch ==FALSE & d$visits >5 & d$buysomething == TRUE, ]
seg2$segmentation = 2
seg3 = d[d$recentPurch == TRUE & d$visits <=5 & d$buysomething == TRUE, ]
seg3$segmentation = 3
seg4 = d[d$recentPurch == FALSE & d$visits <=5 & d$buysomething == TRUE,]
seg4$segmentation = 4
seg5 = d[d$recentPurch == TRUE & d$visits >5 & d$buysomething ==FALSE, ]
seg5$segmentation = 5
seg6 = d[d$recentPurch ==FALSE & d$visits >5 & d$buysomething == FALSE, ]
seg6$segmentation = 6
seg7 = d[d$recentPurch == TRUE & d$visits <=5 & d$buysomething == FALSE, ]
seg7$segmentation = 7
seg8 = d[d$recentPurch == FALSE & d$visits <=5 & d$buysomething == FALSE,]
seg8$segmentation = 8
df = rbind(seg1,seg2,seg3,seg4,seg5,seg6,seg7,seg8)

tapply(seg1$profit, seg1$group, sum)
##     ctrl    email 
## 50548.45 54696.16
tapply(seg2$profit, seg2$group, sum)
##     ctrl    email 
## 23025.44 25501.77
tapply(seg3$profit, seg3$group, sum)
##     ctrl    email 
## 35059.02 40127.34
tapply(seg4$profit, seg4$group, sum)
##     ctrl    email 
## 11586.86 13604.75
tapply(seg5$profit, seg5$group, sum)
##     ctrl    email 
## 8353.520 8273.553
tapply(seg6$profit, seg6$group, sum)
##     ctrl    email 
## 2052.446 2332.428
tapply(seg7$profit, seg7$group, sum)
##     ctrl    email 
## 13070.63 14389.58
tapply(seg8$profit, seg8$group, sum)
##     ctrl    email 
## 2425.956 3013.439
cf_size <- 10000 
cf_set = sample(nrow(d),cf_size)
treat <- d$email[cf_set]
response <- d$purch[cf_set]
colnames(d)
##  [1] "user_id"         "cpgn_id"         "group"           "open"           
##  [5] "click"           "purch"           "chard"           "sav_blanc"      
##  [9] "syrah"           "cab"             "past_purch"      "last_purch"     
## [13] "visits"          "email"           "recentPurch"     "buysomething"   
## [17] "frequentvisitor" "profit"
baseline <- d[cf_set, c("last_purch", "past_purch","visits", "chard", "sav_blanc", "syrah", "cab")]
cf <- causal_forest(baseline, response, treat)
print(cf)
## GRF forest object of type causal_forest 
## Number of trees: 2000 
## Number of training samples: 10000 
## Variable importance: 
##     1     2     3     4     5     6     7 
## 0.215 0.239 0.088 0.176 0.147 0.050 0.086
average_treatment_effect(cf, method="AIPW")
##  estimate   std.err 
## 2.5424379 0.8450148
new_cust = d[,c("last_purch", "past_purch","visits", "chard", "sav_blanc", "syrah", "cab")]
pre = predict(cf, new_cust, estimate.variance = TRUE)

hist(predict(cf)$predictions, 
     main="Histogram of Purchase Lift", 
     xlab="Purchase Lift for Email", ylab="Customers")

new_cust$score = pre$predictions*0.3-0.1
c1 = new_cust$score > 0
new_cust$target = c1
summary(new_cust[new_cust$target == TRUE,])
##    last_purch        past_purch           visits           chard         
##  Min.   :   0.00   Min.   :    0.00   Min.   : 0.000   Min.   :    0.00  
##  1st Qu.:  24.00   1st Qu.:   11.02   1st Qu.: 4.000   1st Qu.:    0.00  
##  Median :  60.00   Median :   74.67   Median : 6.000   Median :    0.00  
##  Mean   :  77.44   Mean   :  142.47   Mean   : 5.755   Mean   :   74.71  
##  3rd Qu.: 107.00   3rd Qu.:  197.60   3rd Qu.: 7.000   3rd Qu.:   73.01  
##  Max.   :1054.00   Max.   :13379.44   Max.   :64.000   Max.   :13379.44  
##    sav_blanc           syrah              cab              score          
##  Min.   :   0.00   Min.   :  0.000   Min.   :   0.00   Min.   : 0.000024  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:   0.00   1st Qu.: 0.388226  
##  Median :   0.00   Median :  0.000   Median :   0.00   Median : 0.955587  
##  Mean   :  31.56   Mean   :  2.989   Mean   :  33.21   Mean   : 1.210086  
##  3rd Qu.:  35.18   3rd Qu.:  0.000   3rd Qu.:  26.86   3rd Qu.: 1.672907  
##  Max.   :1804.92   Max.   :360.320   Max.   :2649.78   Max.   :16.623212  
##   target       
##  Mode:logical  
##  TRUE:53840    
##                
##                
##                
## 
summary(new_cust[new_cust$target == FALSE,])
##    last_purch       past_purch          visits          chard        
##  Min.   :   0.0   Min.   :   0.00   Min.   : 0.00   Min.   :   0.00  
##  1st Qu.:  35.0   1st Qu.:   0.00   1st Qu.: 4.00   1st Qu.:   0.00  
##  Median :  77.0   Median :  14.12   Median : 5.00   Median :   0.00  
##  Mean   : 117.8   Mean   : 104.48   Mean   : 5.41   Mean   :  72.49  
##  3rd Qu.: 183.0   3rd Qu.:  97.19   3rd Qu.: 7.00   3rd Qu.:   0.00  
##  Max.   :1225.0   Max.   :6321.21   Max.   :43.00   Max.   :6321.21  
##    sav_blanc           syrah              cab             score          
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :-5.677884  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:-0.851761  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :-0.342670  
##  Mean   :  16.05   Mean   :  2.514   Mean   : 13.42   Mean   :-0.595749  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:-0.136757  
##  Max.   :3843.24   Max.   :182.380   Max.   :329.40   Max.   :-0.000051  
##    target       
##  Mode :logical  
##  FALSE:24472    
##                 
##                 
##                 
## 
d$score =  pre$predictions*0.3-0.1
d$target = d$score>0