Instacart is a grocery ordering and delivery app that aims to make ordering groceries more convenient for both customers and stores.
Customers place orders with partnering stores (e.g., Whole Foods, Trader Joe's) and are matched with in-person shoppers who drive to the store, purchase the goods, and deliver them.
Customers can use the delivery service for a one-time fee of \$3.99 per order, or for free on orders above \$35 if they have a membership (\$99/year).
The data set was released by Instacart for a machine learning competition hosted on Kaggle.
It consists of over 3 million anonymized grocery orders, organized by the products that belong to each order and the orders that belong to each user.
It also includes the times and days orders were placed, the relative amount of time between each user's orders, grocery department and aisle information, and the sequence in which products were added to the basket.
The goal of the Kaggle competition was to predict which items were likely to be reordered by the same customers.
The files provide a combination of categorical data, both ordinal and nominal, describing the goods being ordered, such as "dairy," "produce," "organic," etc.
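For reference, the tables used throughout this report can be loaded roughly as follows. This is only a sketch: the file names are those of the Kaggle release, the paths are assumptions, and strings are read as factors to match the <fct> columns shown in the output below.
# Sketch only: assumed file names/paths; adjust for your environment.
orders               <- read.csv("orders.csv", stringsAsFactors = TRUE)
products             <- read.csv("products.csv", stringsAsFactors = TRUE)
order_products_train <- read.csv("order_products__train.csv", stringsAsFactors = TRUE)
order_products_prior <- read.csv("order_products__prior.csv", stringsAsFactors = TRUE)
aisles               <- read.csv("aisles.csv", stringsAsFactors = TRUE)
departments          <- read.csv("departments.csv", stringsAsFactors = TRUE)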
library(dplyr)
library(ggplot2)
head(products, n=10)
length(unique(products$product_id)) # 49688 total products
product_id | product_name | aisle_id | department_id |
---|---|---|---|
<int> | <fct> | <int> | <int> |
1 | Chocolate Sandwich Cookies | 61 | 19 |
2 | All-Seasons Salt | 104 | 13 |
3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
4 | Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce | 38 | 1 |
5 | Green Chile Anytime Sauce | 5 | 13 |
6 | Dry Nose Oil | 11 | 11 |
7 | Pure Coconut Water With Orange | 98 | 7 |
8 | Cut Russet Potatoes Steam N' Mash | 116 | 1 |
9 | Light Strawberry Blueberry Yogurt | 120 | 16 |
10 | Sparkling Orange Juice & Prickly Pear Beverage | 115 | 7 |
head(order_products_train, n=15)
order_id | product_id | add_to_cart_order | reordered |
---|---|---|---|
1 | 49302 | 1 | 1 |
1 | 11109 | 2 | 1 |
1 | 10246 | 3 | 0 |
1 | 49683 | 4 | 0 |
1 | 43633 | 5 | 1 |
1 | 13176 | 6 | 0 |
1 | 47209 | 7 | 0 |
1 | 22035 | 8 | 1 |
36 | 39612 | 1 | 0 |
36 | 19660 | 2 | 1 |
36 | 49235 | 3 | 0 |
36 | 43086 | 4 | 1 |
36 | 46620 | 5 | 1 |
36 | 34497 | 6 | 1 |
36 | 48679 | 7 | 1 |
head(orders, n=5)
order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
---|---|---|---|---|---|---|
<int> | <int> | <fct> | <int> | <int> | <int> | <int> |
2539329 | 1 | prior | 1 | 2 | 8 | NA |
2398795 | 1 | prior | 2 | 3 | 7 | 15 |
473747 | 1 | prior | 3 | 3 | 12 | 21 |
2254736 | 1 | prior | 4 | 4 | 7 | 29 |
431534 | 1 | prior | 5 | 4 | 15 | 28 |
# Number of orders by day of week
orders %>%
  ggplot(aes(x=order_dow)) +
  geom_bar(fill="slateblue")
# Days since a user's prior order
orders %>%
  ggplot(aes(x=days_since_prior_order)) +
  geom_bar(fill="slateblue")
Warning message: “Removed 63100 rows containing non-finite values (stat_count).”
mostcommon <- order_products_train %>%
group_by(product_id) %>%
summarize(count = n()) %>%
top_n(10, wt = count) %>%
left_join(select(products,product_id,product_name),by="product_id") %>%
arrange(desc(count))
head(mostcommon, n=10)
product_id | count | product_name |
---|---|---|
<int> | <int> | <fct> |
24852 | 18726 | Banana |
13176 | 15480 | Bag of Organic Bananas |
21137 | 10894 | Organic Strawberries |
21903 | 9784 | Organic Baby Spinach |
47626 | 8135 | Large Lemon |
47766 | 7409 | Organic Avocado |
47209 | 7293 | Organic Hass Avocado |
16797 | 6494 | Strawberries |
26209 | 6033 | Limes |
27966 | 5546 | Organic Raspberries |
mostcommon %>%
ggplot(aes(x=reorder(product_name,-count), y=count))+
geom_bar(stat="identity",fill="slateblue")+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())
item_reorders <- order_products_train %>%
group_by(product_id) %>%
summarize(proportion_reordered = mean(reordered), n=n()) %>%
filter(n>40) %>%
top_n(10,wt=proportion_reordered) %>%
arrange(desc(proportion_reordered)) %>%
left_join(products,by="product_id")
head(data.frame(item_reorders), n=15)
product_id | proportion_reordered | n | product_name | aisle_id | department_id |
---|---|---|---|---|---|
<int> | <dbl> | <int> | <fct> | <int> | <int> |
1729 | 0.9347826 | 92 | 2% Lactose Free Milk | 84 | 16 |
20940 | 0.9130435 | 368 | Organic Low Fat Milk | 84 | 16 |
12193 | 0.8983051 | 59 | 100% Florida Orange Juice | 98 | 7 |
21038 | 0.8888889 | 81 | Organic Spelt Tortillas | 128 | 3 |
31764 | 0.8888889 | 45 | Original Sparkling Seltzer Water Cans | 115 | 7 |
24852 | 0.8841717 | 18726 | Banana | 24 | 4 |
117 | 0.8833333 | 120 | Petit Suisse Fruit | 2 | 16 |
39180 | 0.8819876 | 483 | Organic Lowfat 1% Milk | 84 | 16 |
12384 | 0.8810409 | 269 | Organic Lactose Free 1% Lowfat Milk | 91 | 16 |
24024 | 0.8785249 | 461 | 1% Lowfat Milk | 84 | 16 |
item_reorders %>%
ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
geom_bar(stat="identity",fill="slateblue")+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))
reorder.table <- order_products_train %>%
count(reordered) %>%
mutate(prop = prop.table(n))
as.data.frame(reorder.table)
reordered | n | prop |
---|---|---|
<int> | <int> | <dbl> |
0 | 555793 | 0.4014056 |
1 | 828824 | 0.5985944 |
reorder <- order_products_train %>%
group_by(reordered) %>%
summarize(count = n()) %>%
mutate(proportion = count/sum(count))
reorder %>%
ggplot(aes(x=reordered,y=count, fill=reordered))+
geom_bar(stat="identity")
install.packages("pROC") library(pROC) plot(roc(Test.Orders$reordered, PredM1 , direction=">"), #col="yellow", lwd=3, main="Reordering ROC")
We then had to think about what new variables could be created to help answer our question: predicting the probability that a user will reorder a specific product.
A few of the new columns we created summarize reorder behavior at the user level (user_prop_reordered, total_orders) and at the product level (product_orders, proportion_reordered); a sketch of how such features could be derived appears after the raw tables below.
head(orders)
head(order_products_prior)
order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
---|---|---|---|---|---|---|
<int> | <int> | <fct> | <int> | <int> | <int> | <int> |
2539329 | 1 | prior | 1 | 2 | 8 | NA |
2398795 | 1 | prior | 2 | 3 | 7 | 15 |
473747 | 1 | prior | 3 | 3 | 12 | 21 |
2254736 | 1 | prior | 4 | 4 | 7 | 29 |
431534 | 1 | prior | 5 | 4 | 15 | 28 |
3367565 | 1 | prior | 6 | 2 | 7 | 19 |
order_id | product_id | add_to_cart_order | reordered |
---|---|---|---|
<int> | <int> | <int> | <int> |
2 | 33120 | 1 | 1 |
2 | 28985 | 2 | 1 |
2 | 9327 | 3 | 0 |
2 | 45918 | 4 | 1 |
2 | 30035 | 5 | 0 |
2 | 17794 | 6 | 1 |
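The exact feature-engineering code is not reproduced here, so the following is only a sketch, under assumed definitions, of how user-level and product-level reorder features could be computed from the prior orders and joined into a single modeling table. The values of user_prop_reordered and proportion_reordered in the actual Combined.Orders output exceed 1, which suggests the real features were computed on the 1-2 scale of reordered.Num, so treat these formulas as illustrative only.
# Sketch only: assumed feature definitions, not the exact code used to build Combined.Orders.
user_features <- orders %>%
  left_join(order_products_prior, by = "order_id") %>%
  group_by(user_id) %>%
  summarize(user_prop_reordered = mean(reordered, na.rm = TRUE),  # report's values suggest a +1 shift
            total_orders = max(order_number))

product_features <- order_products_prior %>%
  group_by(product_id) %>%
  summarize(product_orders = n(),
            proportion_reordered = mean(reordered))  # report's values suggest a +1 shift

Combined.Orders.sketch <- bind_rows(order_products_prior, order_products_train) %>%
  left_join(orders, by = "order_id") %>%
  left_join(user_features, by = "user_id") %>%
  left_join(product_features, by = "product_id") %>%
  left_join(products, by = "product_id")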
head(Combined.Orders, n=10)
user_id | user_prop_reordered | total_orders | product_id | product_orders | proportion_reordered | order_id | add_to_cart_order | reordered | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | product_name | aisle_id | department_id | reordered.Num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <int> | <int> | <int> | <dbl> | <int> | <int> | <fct> | <fct> | <int> | <fct> | <dbl> | <int> | <fct> | <int> | <int> | <dbl> |
1 | 1.728571 | 70 | 46149 | 2699 | 1.810671 | 1187899 | 11 | 1 | train | 11 | 4 | 8 | 14 | Zero Calorie Cola | 77 | 7 | 2 |
1 | 1.728571 | 70 | 26405 | 405 | 1.429630 | 1187899 | 4 | 1 | train | 11 | 4 | 8 | 14 | XL Pick-A-Size Paper Towel Rolls | 54 | 17 | 2 |
1 | 1.728571 | 70 | 10258 | 569 | 1.690685 | 431534 | 3 | 1 | prior | 5 | 4 | 15 | 28 | Pistachios | 117 | 19 | 2 |
1 | 1.728571 | 70 | 196 | 11936 | 1.785690 | 3367565 | 1 | 1 | prior | 6 | 2 | 7 | 19 | Soda | 77 | 7 | 2 |
1 | 1.728571 | 70 | 30450 | 7286 | 1.586467 | 473747 | 5 | 0 | prior | 3 | 3 | 12 | 21 | Creamy Almond Butter | 88 | 13 | 1 |
1 | 1.728571 | 70 | 12427 | 2059 | 1.739679 | 550135 | 3 | 1 | prior | 7 | 1 | 9 | 20 | Original Beef Jerky | 23 | 19 | 2 |
1 | 1.728571 | 70 | 12427 | 2059 | 1.739679 | 2398795 | 3 | 1 | prior | 2 | 3 | 7 | 15 | Original Beef Jerky | 23 | 19 | 2 |
1 | 1.728571 | 70 | 26405 | 405 | 1.429630 | 2254736 | 5 | 1 | prior | 4 | 4 | 7 | 29 | XL Pick-A-Size Paper Towel Rolls | 54 | 17 | 2 |
1 | 1.728571 | 70 | 26088 | 851 | 1.537015 | 2398795 | 5 | 1 | prior | 2 | 3 | 7 | 15 | Aged White Cheddar Popcorn | 23 | 19 | 2 |
1 | 1.728571 | 70 | 13032 | 1131 | 1.639257 | 2398795 | 6 | 0 | prior | 2 | 3 | 7 | 15 | Cinnamon Toast Crunch | 121 | 14 | 1 |
set.seed(1)
Train<- sample(1:nrow(Combined.Orders), nrow(Combined.Orders)*.8)
Training.Orders <- Combined.Orders[Train,]
Test.Orders <- Combined.Orders[-Train,]
nrow(Training.Orders)
nrow(Test.Orders)
Logistic.TestModel2 <- glm(reordered ~ user_prop_reordered + proportion_reordered + product_orders + total_orders + add_to_cart_order + order_dow + days_since_prior_order + department_id, family = binomial, data = Training.Orders)
summary(Logistic.TestModel2)
departments
department_id | department |
---|---|
<int> | <fct> |
1 | frozen |
2 | other |
3 | bakery |
4 | produce |
5 | alcohol |
6 | international |
7 | beverages |
8 | pets |
9 | dry goods pasta |
10 | bulk |
11 | personal care |
12 | meat seafood |
13 | pantry |
14 | breakfast |
15 | canned goods |
16 | dairy eggs |
17 | household |
18 | babies |
19 | snacks |
20 | deli |
21 | missing |
attach(Combined.Orders)
set.seed(1)
# Overall accuracy from a 2x2 confusion table
Accuracy <- function(table)
{
n11 <- table[1,1]
n22 <- table[2,2]
Total <- table[1,1]+table[2,2]+table[2,1]+table[1,2]
return((n11+n22)/Total)
}
ProbM1 <- predict.glm(Logistic.TestModel2, newdata = Test.Orders, type = "response")
PredM1 <- ifelse(ProbM1 > .5, "1" , "0")
TableM1 <- table(PredM1, Test.Orders$reordered)
TableM1
Accuracy(TableM1)
PredM1       0       1
     0  370828  175666
     1  349293 1047199
install.packages("pROC")
library(pROC)
# ROC curve for the logistic model, using the predicted probabilities on the test set
plot(roc(Test.Orders$reordered, ProbM1),
     # col="yellow",
     lwd=3, main="Reordering ROC")
Null.TestModel2 <- glm(reordered ~ 1, family = binomial, data = Training.Orders)
summary(Null.TestModel2)
Call:
glm(formula = reordered ~ 1, family = binomial, data = Training.Orders)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-1.4092  -1.4092   0.9621   0.9621   0.9621

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.5300206  0.0007428   713.6   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 10245747  on 7771022  degrees of freedom
Residual deviance: 10245747  on 7771022  degrees of freedom
AIC: 10245749

Number of Fisher Scoring iterations: 4
anova(Null.TestModel2, Logistic.TestModel2, test='Chisq')
Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
7771022 | 10245747 | NA | NA | NA |
7770990 | 8321585 | 32 | 1924161 | 0 |
library(rpart)
Tree.TestModel <- rpart(reordered ~ user_prop_reordered + proportion_reordered + product_orders + total_orders + add_to_cart_order + order_dow + days_since_prior_order + department_id,method = "class", data = Training.Orders)
plot(Tree.TestModel)
text(Tree.TestModel)
Pred.Tree <- predict(Tree.TestModel, newdata = Test.Orders, type = "class")
Tree.Table <- table(Pred.Tree, Test.Orders$reordered)
Tree.Table
Accuracy(Tree.Table)
Pred.Tree       0       1
        0  459415  219611
        1  387676 1003254
set.seed(1)
Trying <- sample(1:nrow(Combined.Orders), nrow(Combined.Orders)*.001)
DF <- Combined.Orders[Trying,]
Train.Trying <- sample(1:nrow(DF), nrow(DF)*.8)
Trying.DF.Train <- DF[Train.Trying, ]
Trying.DF.Test <- DF[-Train.Trying, ]
library(e1071)
SVM.TestModel <- svm(reordered ~ user_prop_reordered + proportion_reordered + product_orders + total_orders + add_to_cart_order + order_dow + days_since_prior_order + department_id, data = Trying.DF.Train)
summary(SVM.TestModel)
Call:
svm(formula = reordered ~ user_prop_reordered + proportion_reordered +
    product_orders + total_orders + add_to_cart_order + order_dow +
    days_since_prior_order + department_id, data = Trying.DF.Train)

Parameters:
   SVM-Type:  C-classification
 SVM-Kernel:  radial
       cost:  1
      gamma:  0.03030303

Number of Support Vectors:  4727
 ( 2351 2376 )

Number of Classes:  2
Levels:
 0 1
SVM.Pred <- predict(SVM.TestModel, Trying.DF.Test)
SVM.Table <- table(SVM.Pred, Trying.DF.Test$reordered)
Accuracy(SVM.Table)
Model 1: Logistic Regression - 0.7298
Model 2: Classification Tree - 0.7066
Model 3: SVM - 0.7391 (trained on 0.1% of the data)