\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - x_i\beta \right)^2 + \lambda ||\beta||_{1} \right] \]
ames <- AmesHousing::make_ames()
set.seed(4595)
data_split <- rsample::initial_split(ames, strata='Sale_Price')
ames_train <- rsample::training(data_split)
ames_test <- rsample::testing(data_split)
theFormula <- Sale_Price ~ Longitude + Latitude + Lot_Frontage + Lot_Area + Street + Utilities + Bldg_Type + Overall_Qual + Year_Built + Foundation + Exter_Cond + Heating + Central_Air + First_Flr_SF + Second_Flr_SF + Full_Bath + Half_Bath + Kitchen_Qual + TotRms_AbvGrd + Garage_Type + Garage_Cars + Open_Porch_SF + Pool_Area + Fence
basic_rec <- recipe(theFormula, data = ames_train) %>%
# collapse some levels into "other"
step_other(all_nominal(), threshold = 0.01) %>%
# remove variables with zero variance
step_zv(all_predictors()) %>%
# center and scale numeric variables
step_center(all_numeric()) %>% step_scale(all_numeric()) %>%
# turn categoricals into dummies, don't drop baseline
step_dummy(all_nominal(), one_hot=TRUE) %>%
# ensure there is an intercept
step_intercept()
prepped <- basic_rec %>% prep(training=ames_train, retain=TRUE)
prepped
Data Recipe
Inputs:
role #variables
outcome 1
predictor 24
Training data contained 2199 data points and no missing data.
Operations:
Collapsing factor levels for Street, Utilities, ... [trained]
Zero variance filter removed no terms [trained]
Centering for Longitude, Latitude, ... [trained]
Scaling for Longitude, Latitude, ... [trained]
Dummy variables from Street, Utilities, Bldg_Type, ... [trained]
Adding intercept
Many Ways to Compute
# train matrices
x_train <- prepped %>% juice(all_predictors(), composition='dgCMatrix')
y_train <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_glmnet <- glmnet(x=x_train, y=y_train, family='gaussian',
alpha=1, intercept=FALSE, standardize=FALSE)
plot(mod_glmnet, xvar='lambda')
coefplot::coefpath(mod_glmnet)
coefplot::coefplot(mod_glmnet, sort='magnitude', lambda=0.0004)
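The penalty of 0.0004 used with coefplot() above, and again later for prediction, is picked by inspection. A common alternative is to choose it by cross-validation; a minimal sketch, not part of the original code, using cv.glmnet on the same training matrices:
# cross-validate along the lambda path to pick a penalty
cv_glmnet <- glmnet::cv.glmnet(x=x_train, y=y_train, family='gaussian',
                               alpha=1, intercept=FALSE, standardize=FALSE,
                               nfolds=10)
cv_glmnet$lambda.min   # lambda with the lowest cross-validated error
cv_glmnet$lambda.1se   # largest lambda within one standard error of the minimum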
# train matrices
x_train <- prepped %>% juice(all_predictors(), composition='dgCMatrix')
y_train <- prepped %>% juice(all_outcomes(), composition='matrix')
xg_train <- xgb.DMatrix(data=x_train, label=y_train)
mod_xgboost <- xgb.train(
data=xg_train,
booster='gblinear',
objective='reg:linear',
alpha=0.0004,
nrounds=20
)
coefplot(mod_xgboost, sort='magnitude')
coefplot(mod_xgboost, sort='magnitude', zero_threshold=0.01)
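Besides coefplot, the fitted weights of a gblinear booster can be pulled straight from the model; a minimal sketch, assuming the mod_xgboost object above, using xgb.importance(), which for linear boosters reports a weight per feature:
# for linear boosters this returns a Feature/Weight table
xgboost::xgb.importance(feature_names=colnames(x_train), model=mod_xgboost)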
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_lars <- lars(x=x_train_dense, y=y_train_dense,
type='lasso',
normalize=FALSE, intercept=FALSE)
coef(mod_lars, s=50) %>% as.data.frame() %>% setNames('Value') %>%
tibble::rownames_to_column('Coefficient') %>% filter(Value != 0) %>%
mutate(HighInner=NA_real_, LowInner=NA_real_,
HighOuter=NA_real_, LowOuter=NA_real_, Model='model') %>%
mutate(Coefficient=reorder(Coefficient, Value)) %>% coefplot()
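Note that in coef() and predict() for a lars fit, s=50 is read in the default mode="step", i.e. the 50th step along the path, not a penalty value. A minimal sketch (not in the original code) of indexing the path by the L1 norm instead:
# mode='fraction' indexes the path by ||beta||_1 relative to the full least-squares fit
coef(mod_lars, s=0.5, mode='fraction')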
stan_recipe <- recipe(theFormula, data = ames_train) %>%
# collapse some levels into "other"
step_other(all_nominal(), threshold = 0.01) %>%
# remove variables with zero variance
step_zv(all_predictors()) %>%
# center and scale numeric variables
step_center(all_numeric()) %>% step_scale(all_numeric()) %>%
prep(training=ames_train, retain=TRUE)
train_stan <- stan_recipe %>% juice(composition='tibble')
mod_stan <- stan_glm(theFormula, data=train_stan,
family=gaussian, prior=lasso(), iter=1000)
mcmc_intervals(as.array(mod_stan))
gridExtra::grid.arrange(
mcmc_areas(as.array(mod_stan), pars=c('Longitude', 'Latitude')),
mcmc_areas(as.array(mod_stan),
pars=stringr::str_subset(names(mod_stan$coefficients), 'Overall_Qual')),
nrow=1)
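Beyond the interval and density plots, numeric posterior summaries come straight from rstanarm; a minimal sketch, assuming the mod_stan fit above:
# posterior medians of the coefficients
coef(mod_stan)
# 90% credible intervals for each coefficient
posterior_interval(mod_stan, prob=0.9)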
\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - x_i\beta \right)^2 + \lambda ||\beta||_{1} \right] \]
\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ ||Y - X\beta||_2^2 + \lambda||\beta||_1 \right] \]
\[ \min_{\beta \in \mathbb{R}^{p+1}} ||Y - X\beta||_2^2 \\ \text{subject to } ||\beta||_1 \le t \]
\[ (Y - X\beta)^T(Y - X\beta) \\ (Y^T - \beta^TX^T)(Y - X\beta) \\ Y^TY - Y^TX\beta - \beta^TX^TY + \beta^TX^TX\beta \\ Y^TY - 2Y^TX\beta + \beta^TX^TX\beta \\ Y^TY+ \beta^TX^TX\beta - 2Y^TX\beta \]
\[ \min_{x} \frac{1}{2} x^TVV^Tx + c^Tx \\ Ax = b \\ l \preceq x \preceq u \]
\[ \min_{\beta^+, \beta^-} \left[(\beta^+ - \beta^-)^TX^TX(\beta^+ - \beta^-) - 2Y^TX(\beta^+ - \beta^-) \right] \\ \text{subject to } \sum_{j=1}^p \left( \beta_j^+ + \beta_j^- \right) \le t \\ \beta^+ \succeq 0, \beta^- \succeq 0 \]
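Getting to this quadratic program from the constrained least-squares form relies on splitting each coefficient into nonnegative positive and negative parts; writing the substitution out shows why the L1 constraint becomes a plain sum:
\[ \beta_j = \beta_j^+ - \beta_j^-, \quad \beta_j^+ \ge 0, \ \beta_j^- \ge 0 \\ |\beta_j| = \beta_j^+ + \beta_j^- \quad \text{(at the optimum at most one of the pair is nonzero)} \\ ||\beta||_1 = \sum_{j=1}^p |\beta_j| = \sum_{j=1}^p \left( \beta_j^+ + \beta_j^- \right) \le t \]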
lasso.qp <- function(x, y, t)
{
    y <- as.vector(y)
    p <- ncol(x)
    # build X[I, -I, 0]: positive part, negative part, and a slack column
    Vn <- x %*% cbind(diag(p), -diag(p), 0)
    # compute -X'y, used for the linear term
    YX <- colSums(x * -y)
    # linear term: -2Y'X for beta+, +2Y'X for beta-, 0 for the slack variable
    Zn <- c(2*YX, -2*YX, 0)
    # OLS fit gives an upper bound for the coefficients
    mod <- lm.fit(x, y)
    # replace any NAs with the maximum of all coefs
    b_ols <- mod$coefficients %>% tidyr::replace_na(max(mod$coefficients, na.rm=TRUE))
    # upper bounds, equality constraint and its right-hand side
    u <- c(abs(b_ols), abs(b_ols), sum(abs(b_ols)))
    A <- matrix(c(rep(1, 2*p), 1), nrow=1)
    b <- c(min(t, sum(abs(b_ols))))
    # solve the quadratic program
    soln <- LowRankQP::LowRankQP(Vmat=sqrt(2)*t(Vn), dvec=Zn, Amat=A,
                                 bvec=b, uvec=u, method="LU")
    # recover beta = beta+ - beta-
    return(round(soln$alpha[1:p] - soln$alpha[(p+1):(2*p)], digits=5))
}
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_quad <- lasso.qp(x=x_train_dense, y=y_train_dense, t=5)
LowRankQP CONVERGED IN 27 ITERATIONS
Primal Feasibility = 6.6547305e-11
Dual Feasibility = 3.1086245e-14
Complementarity Value = 3.4538714e-08
Duality Gap = 3.4540108e-08
Termination Condition = 1.8830090e-11
tibble(Coefficient=colnames(x_train_dense), Value=mod_quad) %>%
filter(Value != 0) %>%
mutate(HighInner=NA_real_, LowInner=NA_real_,
HighOuter=NA_real_, LowOuter=NA_real_, Model='model') %>%
mutate(Coefficient=reorder(Coefficient, Value)) %>% coefplot()
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_keras <- keras_model_sequential() %>%
layer_dense(units = 1, activation = "linear",
kernel_regularizer=regularizer_l1(l=0.02),
input_shape = dim(x_train_dense)[2])
mod_keras
Model
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #
================================================================================
dense_1 (Dense)                     (None, 1)                       64
================================================================================
Total params: 64
Trainable params: 64
Non-trainable params: 0
________________________________________________________________________________
mod_keras %>% compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)
history <- mod_keras %>%
fit(
x=x_train_dense,
y=y_train_dense,
epochs = 30,
batch_size=64,
validation_split = 0.1,
view_metrics=TRUE,
callbacks=list(
callback_early_stopping(monitor='val_loss',
patience=10),
callback_reduce_lr_on_plateau(monitor='val_loss',
factor=0.1,
patience=5),
callback_tensorboard("logs/run_lasso")
)
)
plot(history)
get_weights(mod_keras)[[1]] %>% as_data_frame() %>% rename(Value=V1) %>%
mutate(Coefficient=colnames(x_train_dense)) %>% filter(abs(Value) > 1E-3) %>%
mutate(HighInner=NA_real_, LowInner=NA_real_,
HighOuter=NA_real_, LowOuter=NA_real_, Model='model') %>%
mutate(Coefficient=reorder(Coefficient, Value)) %>% coefplot()
# test matrices
x_test <- prepped %>% bake(all_predictors(), composition='matrix', new_data=ames_test)
y_test <- prepped %>% bake(all_outcomes(), composition='matrix', new_data=ames_test)
predictions <- list(
glmnet=predict(mod_glmnet, newx=x_test, s=0.0004),
xgboost=predict(mod_xgboost, newdata=x_test[, mod_xgboost$feature_names]),
lars=predict(mod_lars, newx=x_test, s=50)$fit,
stan=predict(mod_stan, newdata=stan_recipe %>%
bake(new_data=ames_test, composition='tibble')),
quad=x_test %*% mod_quad,
keras=predict(mod_keras, x=x_test)
)
predictions <- predictions %>%
map(~data.frame(Pred=as.numeric(.))) %>%
map_df(. %>% mutate(Actual=as.numeric(y_test)) %>% mutate(.Resid=Actual-Pred),
.id='Model')
ggplot(predictions, aes(x=Actual, y=Pred, color=Model)) +
    geom_abline(color='grey50', linetype=2) +
    geom_point(alpha=1/2, shape=1, size=1) +
    geom_smooth() +
    facet_wrap(~Model) +
    theme(legend.position='none')
error_measures <- predictions %>%
    group_by(Model) %>%
    summarize(RMSE=sqrt(mean(.Resid^2))) %>%
    arrange(RMSE) %>%
    mutate(Model=reorder(Model, RMSE))
ggplot(error_measures, aes(x=Model, y=RMSE)) + geom_col(width=0.5, fill='lightblue')
mod_tidy_glmnet <- linear_reg(penalty=0.004) %>%
set_engine('glmnet') %>%
fit_xy(x_train_dense, y_train)
mod_tidy_spark <- linear_reg(penalty=0.004) %>%
set_engine('spark') %>%
fit(theFormula, data=train_stan)
mod_tidy_stan <- linear_reg(penalty=0.004) %>%
set_engine('stan') %>%
fit(theFormula, data=train_stan)
mod_tidy_keras <- linear_reg(penalty=0.004) %>%
set_engine('keras') %>%
fit_xy(x_train_dense, y_train)
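Whichever engine is chosen, parsnip exposes one prediction interface; a minimal sketch (not in the original slides) using the glmnet-backed fit above against the test matrix built earlier:
# predict() on a parsnip model returns a tibble with a .pred column
tidy_preds <- predict(mod_tidy_glmnet, new_data=as.data.frame(x_test))
head(tidy_preds$.pred)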
[1] "AmesHousing" "bayesplot" "coefplot" "distr" [5] "DT" "ggplot2" "glmnet" "here" [9] "keras" "knitr" "lars" "leaflet" [13] "leaflet.extras" "LowRankQP" "magrittr" "parsnip" [17] "purrr" "recipes" "rsample" "rstanarm" [21] "stringr" "tibble" "useful" "xgboost"
Thank You