8 Competitor Methods

We also provide implementations of some competitor feature selection methods. We used these in the simulation studies in our paper to compare cluster stability selection to the protolasso (Reid and Tibshirani, 2016) and the cluster representative lasso (Bühlmann et al., 2013), two other feature selection methods that are designed for data with clustered features. These feature selection methods are in some ways closely related, so their implementations share helper functions.

protolasso()
- processClusterLassoInputs() checks and formats the function inputs
- getXglmnet() converts the provided design matrix into the matrix Xglmnet that is passed to the lasso as implemented by glmnet (for the protolasso, this means discarding all features from each cluster except the one most highly correlated with the response; for the cluster representative lasso, this means replacing the clustered features with a simple average of the cluster members).
  - checkGetXglmnetInputs() verifies the inputs to getXglmnet()
- Finally, getClusterSelsFromGlmnet() extracts the relevant output from the results yielded by a glmnet lasso fit.
  - getSelectedSets() takes in a single selected set from Xglmnet and yields a selected feature set in the original feature space (with each selected cluster from Xglmnet replaced by its prototype) as well as a selected set of clusters.
clusterRepLasso()

protolasso():

#' Select features via the protolasso (Reid and Tibshirani 2016)
#'
#' Validates the inputs, builds a design matrix in which every cluster is
#' represented by its prototype, fits a lasso path on that matrix with glmnet,
#' and converts the distinct active sets found along the path into selected
#' sets of features and of clusters.
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors.
#' @param y The response; a length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' Default is list() (so no clusters are specified).
#' CAUTION: if the provided X is a data.frame that contains a categorical
#' feature with more than two levels, then the resulting matrix made from
#' model.matrix will have a different number of columns than the provided
#' data.frame, some of the feature numbers will change, and the clusters
#' argument will not work properly (in the current version of the package).
#' To get correct results in this case, please use model.matrix to convert
#' the data.frame to a numeric matrix on your own, then provide this matrix
#' and cluster assignments with respect to this matrix.
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Default is 100 (following the default for glmnet). For
#' now, nlambda must be at least 2 (using a single lambda is not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the protolasso. (If no set of size k was selected, entry k
#' will be empty.)} \item{selected_clusts_list}{A list; each element of the list
#' is a named list of selected clusters. (That is, if a selected set of size k
#' was yielded by the protolasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. In particular, selected_clusts_list[[k]][[j]] will be
#' the cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The
#' beta output from glmnet when the lasso was estimated on a matrix of
#' prototypes. (See documentation for the function glmnet from the glmnet
#' package for details.)}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
#' @export
protolasso <- function(X, y, clusters=list(), nlambda=100){

    # Validate everything up front; this also yields the formatted clusters,
    # one prototype per cluster, and the original feature names (if any)
    inputs <- processClusterLassoInputs(X, y, clusters, nlambda)

    # One column per cluster: the cluster's prototype
    proto_design <- getXglmnet(inputs$x, inputs$clusters, type="protolasso",
        prototypes=inputs$prototypes)

    # Lasso path on the prototypes; keep only the distinct active sets
    lasso_fit <- glmnet::glmnet(x=proto_design, y=y, family="gaussian",
        nlambda=nlambda)
    active_sets <- unique(glmnet::predict.glmnet(lasso_fit, type="nonzero"))

    # Translate the active sets (indexed by column of proto_design) back to
    # selected features and selected clusters, one entry per model size
    sel <- getClusterSelsFromGlmnet(active_sets, inputs$clusters,
        inputs$prototypes, inputs$var_names)

    list(selected_sets=sel$selected_sets,
        selected_clusts_list=sel$selected_clusts_list,
        beta=lasso_fit$beta)
}

processClusterLassoInputs():

#' Check the inputs to protolasso and clusterRepLasso, format clusters, and
#' identify prototypes for each cluster
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors. Must not contain missing values.
#' @param y The response; a length n numeric (or integer) real-valued vector
#' with no missing values.
#' @param clusters A list of integer vectors; each vector should contain the
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' If X is a data.frame whose number of columns changes when it is converted
#' by model.matrix, clusters must be empty (otherwise an error is raised).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Must be a single whole number that is at least 2 (using
#' a single lambda is not supported).
#' @return A list with four elements. \item{x}{The provided X, converted to a
#' matrix if it was provided as a data.frame, and with column names removed.}
#' \item{clusters}{A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters are
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features will be put in their own "cluster" of size 1).}
#' \item{prototypes}{An integer vector whose length is equal to the number of
#' clusters. Entry i is the index of the feature belonging to cluster i that is
#' most highly correlated with y (that is, the prototype for the cluster, as in
#' the protolasso; see Reid and Tibshirani 2016).} \item{var_names}{If the
#' provided X matrix had column names, the names of the features in the provided
#' X matrix. If no names were provided, var_names will be NA.}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
processClusterLassoInputs <- function(X, y, clusters, nlambda){

    # NOTE: the expressions inside stopifnot() are intentionally left exactly
    # as they were (including the elementwise `|`), because stopifnot() builds
    # its error message from the expression text and callers/tests match on it.
    stopifnot(is.matrix(X) | is.data.frame(X))

    # If X is a data.frame, convert it to a matrix.
    if(is.data.frame(X)){
        p <- ncol(X)

        X <- stats::model.matrix(~ ., X)
        # drop=FALSE keeps X a matrix even when a single column remains after
        # removing the intercept; without it X collapses to a vector and the
        # comparison below fails with "argument is of length zero".
        X <- X[, colnames(X) != "(Intercept)", drop=FALSE]

        if(p != ncol(X) && length(clusters) > 0){
            stop("When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.")
        }
    }

    stopifnot(is.matrix(X))
    stopifnot(all(!is.na(X)))

    # Remember the feature names (if any) before stripping them from X
    feat_names <- as.character(NA)
    if(!is.null(colnames(X))){
        feat_names <- colnames(X)
        if(any(is.na(feat_names))){
            stop("Some features in provided X matrix had valid names and some had NA names; please either name all features in X or remove the names altogether.")
        }
    }

    n <- nrow(X)

    colnames(X) <- character()

    stopifnot(is.numeric(y) | is.integer(y))
    stopifnot(n == length(y))
    stopifnot(all(!is.na(y)))

    # Check clusters argument
    clusters <- checkCssClustersInput(clusters)

    # Format clusters into a list where all features are in exactly one
    # cluster (any unclustered features are put in their own "cluster" of size
    # 1). Cluster names are only carried over when clusters is a named list.
    clust_names <- as.character(NA)
    if(!is.null(names(clusters)) && is.list(clusters)){
        clust_names <- names(clusters)
    }

    cluster_results <- formatClusters(clusters, p=ncol(X),
        clust_names=clust_names, get_prototypes=TRUE, x=X, y=y)

    clusters <- cluster_results$clusters
    prototypes <- cluster_results$prototypes

    rm(cluster_results)

    # Exactly one prototype per cluster
    stopifnot(length(clusters) == length(prototypes))

    # nlambda: a single non-missing whole number >= 2. The order of these
    # checks determines which message a bad nlambda produces.
    stopifnot(is.numeric(nlambda) | is.integer(nlambda))
    stopifnot(length(nlambda) == 1)
    stopifnot(!is.na(nlambda))
    stopifnot(nlambda >= 2)
    stopifnot(nlambda == round(nlambda))

    return(list(x=X, clusters=clusters, prototypes=prototypes,
        var_names=feat_names))
}

Tests for processClusterLassoInputs():

testthat::test_that("processClusterLassoInputs works", {
  set.seed(82612)

  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)

  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)

  out_names <- c("x", "clusters", "prototypes", "var_names")

  ncol_changed_msg <- "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix."

  # The returned object is a list with the four documented elements
  expect_valid_output <- function(out){
    testthat::expect_true(is.list(out))
    testthat::expect_identical(names(out), out_names)
  }

  # The returned design matrix is a complete numeric matrix of the expected
  # dimensions whose entries agree with the reference matrix
  expect_valid_x <- function(out_x, n_col, n_row, reference){
    testthat::expect_true(is.matrix(out_x))
    testthat::expect_true(all(!is.na(out_x)))
    testthat::expect_true(is.numeric(out_x))
    testthat::expect_equal(ncol(out_x), n_col)
    testthat::expect_equal(nrow(out_x), n_row)
    testthat::expect_true(all(abs(out_x - reference) < 10^(-9)))
  }

  # The returned feature names are the expected character vector
  expect_valid_var_names <- function(out_var_names, n_names, reference){
    testthat::expect_equal(length(out_var_names), n_names)
    testthat::expect_true(is.character(out_var_names))
    testthat::expect_identical(out_var_names, reference)
  }

  # Design matrix that model.matrix builds from a data.frame, minus intercept
  design_matrix <- function(df){
    mm <- stats::model.matrix(~ ., df)
    mm[, colnames(mm) != "(Intercept)"]
  }

  ##### Unnamed matrix with two named clusters

  ret <- processClusterLassoInputs(X=x, y=y, clusters=good_clusters, nlambda=10)

  expect_valid_output(ret)
  expect_valid_x(ret$x, 11, 15, x)

  # clusters: the two provided ones plus three singletons, all uniquely named
  ret_clust_names <- names(ret$clusters)
  testthat::expect_true(is.list(ret$clusters))
  testthat::expect_equal(length(ret$clusters), 5)
  testthat::expect_equal(5, length(ret_clust_names))
  testthat::expect_equal(5, length(unique(ret_clust_names)))
  testthat::expect_true("red_cluster" %in% ret_clust_names)
  testthat::expect_true("green_cluster" %in% ret_clust_names)
  testthat::expect_true(all(!is.na(ret_clust_names)))
  testthat::expect_true(all(!is.null(ret_clust_names)))
  testthat::expect_true(all(ret_clust_names != ""))

  # Each cluster is a duplicate-free integer vector, disjoint from the
  # clusters seen so far, and equal to the expected membership
  seen_feats <- integer()
  expected_members <- list(1:4, 5:8, 9, 10, 11)
  for(k in seq_along(ret$clusters)){
    members_k <- ret$clusters[[k]]
    testthat::expect_true(is.integer(members_k))
    testthat::expect_equal(length(intersect(seen_feats, members_k)), 0)
    testthat::expect_true(all(members_k %in% 1:11))
    testthat::expect_equal(length(members_k), length(unique(members_k)))
    testthat::expect_true(all(members_k == expected_members[[k]]))
    seen_feats <- c(seen_feats, members_k)
  }

  # Together the clusters cover features 1 through 11 exactly once
  testthat::expect_equal(length(seen_feats), 11)
  testthat::expect_equal(11, length(unique(seen_feats)))
  testthat::expect_equal(11, length(intersect(seen_feats, 1:11)))

  # prototypes: one per cluster, drawn from that cluster
  testthat::expect_true(is.integer(ret$prototypes))
  testthat::expect_true(all(ret$prototypes %in% 1:11))
  testthat::expect_equal(length(ret$prototypes), 5)
  testthat::expect_true(ret$prototypes[1] %in% 1:4)
  testthat::expect_true(ret$prototypes[2] %in% 5:8)
  # Singleton clusters (features 9, 10, 11) are their own prototypes
  for(k in 3:5){
    testthat::expect_equal(ret$prototypes[k], k + 6)
  }

  # var_names: a single NA because x has no column names
  testthat::expect_equal(length(ret$var_names), 1)
  testthat::expect_true(is.na(ret$var_names))

  ##### X as a data.frame (all columns numeric)

  X_df <- datasets::mtcars
  res <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
                                   clusters=1:3, nlambda=10)

  expect_valid_output(res)

  X_df_model <- design_matrix(X_df)

  expect_valid_x(res$x, ncol(X_df_model), nrow(X_df), X_df_model)
  expect_valid_var_names(res$var_names, ncol(X_df_model), colnames(X_df_model))

  ##### X as a data.frame with factors

  # The design matrix has more columns than df2 because some of the factors
  # (cyl, gear, and carb) have more than two levels
  df2 <- X_df
  factor_cols <- c("cyl", "vs", "am", "gear", "carb")
  df2[factor_cols] <- lapply(df2[factor_cols], as.factor)

  # Providing clusters must fail, since the column indices are ambiguous
  testthat::expect_error(
    processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)), clusters=1:3,
                              nlambda=10),
    ncol_changed_msg, fixed=TRUE)

  # Without clusters the same data.frame is accepted
  res <- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
                                   clusters=list(), nlambda=10)

  expect_valid_output(res)

  X_df_model <- design_matrix(df2)

  expect_valid_x(res$x, ncol(X_df_model), nrow(X_df), X_df_model)
  expect_valid_var_names(res$var_names, ncol(X_df_model), colnames(X_df_model))

  ##### X as a matrix with column names

  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  ret <- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters, nlambda=10)

  expect_valid_output(ret)
  expect_valid_x(ret$x, 11, 15, x)
  expect_valid_var_names(ret$var_names, ncol(x2), LETTERS[1:11])

  ##### Bad inputs

  # X of the wrong type
  testthat::expect_error(
    processClusterLassoInputs(X="x", y=y[1:10], clusters=good_clusters,
                              nlambda=10),
    "is.matrix(X) | is.data.frame(X) is not TRUE", fixed=TRUE)

  # y of the wrong length
  testthat::expect_error(
    processClusterLassoInputs(X=x, y=y[1:10], clusters=good_clusters,
                              nlambda=10),
    "n == length(y) is not TRUE", fixed=TRUE)

  # Invalid clusters: overlapping, duplicated, containing NA, and containing a
  # repeated feature
  bad_cluster_cases <- list(
    list(clusters=list(1:4, 4:6),
         msg="Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2."),
    list(clusters=list(2:3, 2:3),
         msg="length(clusters) == length(unique(clusters)) is not TRUE"),
    list(clusters=list(1:4, as.integer(NA)),
         msg="!is.na(clusters) are not all TRUE"),
    list(clusters=list(2:3, c(4, 4, 5)),
         msg="length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE")
  )
  for(bad_case in bad_cluster_cases){
    testthat::expect_error(
      processClusterLassoInputs(X=x, y=y, clusters=bad_case$clusters,
                                nlambda=10),
      bad_case$msg, fixed=TRUE)
  }

  # Invalid nlambda: too small, not a scalar, not numeric, and not whole
  bad_nlambda_cases <- list(
    list(nlambda=1, msg="nlambda >= 2 is not TRUE"),
    list(nlambda=x, msg="length(nlambda) == 1 is not TRUE"),
    list(nlambda="nlambda",
         msg="is.numeric(nlambda) | is.integer(nlambda) is not TRUE"),
    list(nlambda=10.5, msg="nlambda == round(nlambda) is not TRUE")
  )
  for(bad_case in bad_nlambda_cases){
    testthat::expect_error(
      processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
                                nlambda=bad_case$nlambda),
      bad_case$msg, fixed=TRUE)
  }

})

## Test passed 🥇

getXglmnet():

#' Converts the provided design matrix to an appropriate format for either the
#' protolasso or the cluster representative lasso.
#'
#' Creates design matrix for glmnet by dealing with clusters (for
#' type="protolasso", discards all cluster members except prototype; for
#' type="clusterRepLasso", replaces all cluster members with a simple
#' average of all the cluster members). Clusters of size 1 are passed through
#' unchanged for both types. The output has one column per cluster, in the
#' same order as clusters.
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1).
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only used by this function for type "protolasso". An
#' integer vector whose length is equal to the number of clusters. Entry i
#' should be the prototype for cluster i (the feature belonging to cluster i
#' that is most highly correlated with y; see Reid and Tibshirani 2016).
#' @return A numeric matrix with n rows, one column per cluster, and no column
#' names; the design matrix as required for the protolasso or cluster
#' representative lasso, prepared for input to glmnet.
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
getXglmnet <- function(x, clusters, type, prototypes=NA){

    # Check inputs
    checkGetXglmnetInputs(x, clusters, type, prototypes)

    n <- nrow(x)
    p <- ncol(x)

    # Collect one column per cluster in a preallocated list and bind them
    # once at the end (rather than growing a matrix with cbind in the loop)
    cols <- vector("list", length(clusters))

    for(i in seq_along(clusters)){
        cluster_i <- clusters[[i]]

        if(length(cluster_i) == 1){
            # Unclustered feature: it represents itself
            X_glmnet_i <- x[, cluster_i]
        } else{
            stopifnot(length(cluster_i) > 1)

            if(type == "protolasso"){
                # Keep only the (unique) prototype belonging to this cluster
                prototype_ind_i <- which(prototypes %in% cluster_i)
                stopifnot(length(prototype_ind_i) == 1)
                prototype_i <- prototypes[prototype_ind_i]
                X_glmnet_i <- x[, prototype_i]
            } else {
                stopifnot(type == "clusterRepLasso")
                # drop=FALSE so that rowMeans still receives a matrix when x
                # has a single row
                X_glmnet_i <- rowMeans(x[, cluster_i, drop=FALSE])
            }
        }

        stopifnot(length(X_glmnet_i) == n)

        cols[[i]] <- X_glmnet_i
    }

    X_glmnet <- do.call(cbind, cols)

    stopifnot(ncol(X_glmnet) == length(clusters))
    colnames(X_glmnet) <- character()

    # Check output
    stopifnot(is.matrix(X_glmnet))
    stopifnot(nrow(X_glmnet) == n)
    stopifnot(ncol(X_glmnet) <= p)
    stopifnot(ncol(X_glmnet) >= 1)

    return(X_glmnet)
}

checkGetXglmnetInputs():

#' Verifies the inputs for getXglmnet.
#'
#' Stops with an error if any input is invalid; otherwise returns NULL
#' invisibly.
#'
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1). Must
#' contain at least one cluster, and no cluster may be empty.
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only required for type "protolasso". An integer vector
#' whose length is equal to the number of clusters. Entry i should be the
#' prototype for cluster i (the feature belonging to cluster i that is most
#' highly correlated with y; see Reid and Tibshirani 2016). For type
#' "clusterRepLasso" a single NA (the default used by getXglmnet) is accepted
#' and means "not provided"; any other value is validated as for the
#' protolasso.
#' @return NULL, invisibly. Called for its side effect of raising an error on
#' invalid input.
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
checkGetXglmnetInputs <- function(x, clusters, type, prototypes){
    # NOTE: pre-existing stopifnot() expressions are kept verbatim, since
    # stopifnot() derives its error message from the expression text.
    stopifnot(is.matrix(x))

    stopifnot(is.list(clusters))
    stopifnot(all(lengths(clusters) >= 1))

    stopifnot(length(type) == 1)
    stopifnot(is.character(type))
    stopifnot(!is.na(type))
    stopifnot(type %in% c("protolasso", "clusterRepLasso"))

    # An empty list of clusters cannot produce a design matrix (previously
    # this surfaced later as a "subscript out of bounds" error)
    stopifnot(length(clusters) >= 1)

    # The cluster representative lasso does not use prototypes, so the
    # "not provided" default (a single NA) is acceptable for that type
    prototypes_omitted <- length(prototypes) == 1 && is.na(prototypes)
    if(type == "clusterRepLasso" && prototypes_omitted){
        return(invisible(NULL))
    }

    stopifnot(!is.na(prototypes))
    stopifnot(is.integer(prototypes))
    stopifnot(all(!is.na(prototypes)))
    stopifnot(length(prototypes) == length(unique(prototypes)))
    stopifnot(all(prototypes %in% 1:ncol(x)))

    # Every cluster must contain exactly one of the prototypes
    for(i in seq_along(clusters)){
        cluster_i <- clusters[[i]]
        stopifnot(sum(prototypes %in% cluster_i) == 1)
    }

    invisible(NULL)
}

Tests for checkGetXglmnetInputs():

testthat::test_that("checkGetXglmnetInputs works", {
  set.seed(82612)

  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)

  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)

  ncol_changed_msg <- "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix."

  process <- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
                                       nlambda=10)

  # Valid inputs are accepted (no error) for both supported types
  for(valid_type in c("protolasso", "clusterRepLasso")){
    checkGetXglmnetInputs(x=process$x, clusters=process$clusters,
                          type=valid_type, prototypes=process$prototypes)
  }

  ##### X as a data.frame (all columns numeric)

  X_df <- datasets::mtcars
  res <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
                                   clusters=1:3, nlambda=10)

  checkGetXglmnetInputs(x=res$x, clusters=res$clusters, type="clusterRepLasso",
                        prototypes=res$prototypes)

  ##### X as a data.frame with factors

  # The design matrix has more columns than df2 because some of the factors
  # (cyl, gear, and carb) have more than two levels
  df2 <- X_df
  factor_cols <- c("cyl", "vs", "am", "gear", "carb")
  df2[factor_cols] <- lapply(df2[factor_cols], as.factor)

  # Providing clusters must fail, since the column indices are ambiguous
  testthat::expect_error(
    processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)), clusters=1:3,
                              nlambda=10),
    ncol_changed_msg, fixed=TRUE)

  # Without clusters the processed inputs pass the check
  res <- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
                                   clusters=list(), nlambda=10)

  checkGetXglmnetInputs(x=res$x, clusters=res$clusters, type="protolasso",
                        prototypes=res$prototypes)

  ##### X as a matrix with column names

  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  ret <- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters, nlambda=10)

  checkGetXglmnetInputs(x=ret$x, clusters=ret$clusters, type="clusterRepLasso",
                        prototypes=ret$prototypes)

  ##### Bad type inputs

  # Misspelled type: the error message contains quotation marks, so only the
  # presence of an error is checked
  testthat::expect_error(
    checkGetXglmnetInputs(x=process$x, clusters=process$clusters,
                          type="clsterRepLasso",
                          prototypes=process$prototypes))

  # type of the wrong length, of the wrong class, and missing
  bad_type_cases <- list(
    list(type=c("clusterRepLasso", "protolasso"),
         msg="length(type) == 1 is not TRUE"),
    list(type=2, msg="is.character(type) is not TRUE"),
    list(type=as.character(NA), msg="!is.na(type) is not TRUE")
  )
  for(bad_case in bad_type_cases){
    testthat::expect_error(
      checkGetXglmnetInputs(x=process$x, clusters=process$clusters,
                            type=bad_case$type,
                            prototypes=process$prototypes),
      bad_case$msg, fixed=TRUE)
  }

})

## Test passed 🥇

Tests for getXglmnet():

testthat::test_that("getXglmnet works", {
  set.seed(82612)
  
  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)
  
  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)
  
  process <- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
                                       nlambda=10)

  res <- getXglmnet(x=process$x, clusters=process$clusters,
                               type="protolasso", prototypes=process$prototypes)
  
  testthat::expect_true(is.matrix(res))
  testthat::expect_true(is.numeric(res))
  testthat::expect_true(is.null(colnames(res)))
  testthat::expect_true(nrow(res) == 15)
  # Each column of res should be one of the prototypes. Features 9 - 11 are
  # in clusters by themselves and are therefore their own prototypes.
  testthat::expect_true(ncol(res) == 5)
  for(i in 1:length(good_clusters)){
    proto_i_found <- FALSE
    cluster_i <- good_clusters[[i]]
    for(j in 1:length(cluster_i)){
      proto_i_found <- proto_i_found | all(abs(res[, i] - x[, cluster_i[j]]) <
                                             10^(-9))
    }
    testthat::expect_true(proto_i_found)
  }
  testthat::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
  
  res <- getXglmnet(x=process$x, clusters=process$clusters,
                    type="clusterRepLasso", prototypes=process$prototypes)
  
  testthat::expect_true(is.matrix(res))
  testthat::expect_true(is.numeric(res))
  testthat::expect_true(is.null(colnames(res)))
  testthat::expect_true(nrow(res) == 15)
  # Each column of res should be one of the cluster representatives. Features 9
  # - 11 are in clusters by themselves and are therefore their own cluster
  # representatives.
  testthat::expect_true(ncol(res) == 5)
  for(i in 1:length(good_clusters)){
    cluster_i <- good_clusters[[i]]
    clus_rep_i <- rowMeans(x[, cluster_i])
    testthat::expect_true(all(abs(res[, i] - clus_rep_i) <
                                             10^(-9)))
  }
  testthat::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
  
  # X as a data.frame
  X_df <- datasets::mtcars
  res <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
                                   clusters=1:3, nlambda=10)

  ret_df <- getXglmnet(x=res$x, clusters=res$clusters, type="protolasso",
                       prototypes=res$prototypes)
  
  X_df_model <- stats::model.matrix(~ ., X_df)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
  
  testthat::expect_true(is.matrix(ret_df))
  testthat::expect_true(is.numeric(ret_df))
  testthat::expect_true(is.null(colnames(ret_df)))
  testthat::expect_true(nrow(ret_df) == nrow(X_df))
  # Each column of ret_df should be one of the prototypes.
  testthat::expect_true(ncol(ret_df) == ncol(X_df_model) - 3 + 1)

  proto_found <- FALSE
  for(j in 1:3){
    proto_found <- proto_found | all(abs(ret_df[, 1] - X_df_model[, j]) < 10^(-9))
  }
  testthat::expect_true(proto_found)

  for(j in 4:ncol(X_df_model)){
    testthat::expect_true(all(abs(ret_df[, j - 2] - X_df_model[, j]) < 10^(-9)))
  }
  
  ret_df <- getXglmnet(x=res$x, clusters=res$clusters, type="clusterRepLasso",
                       prototypes=res$prototypes)
  
  testthat::expect_true(is.matrix(ret_df))
  testthat::expect_true(is.numeric(ret_df))
  testthat::expect_true(is.null(colnames(ret_df)))
  testthat::expect_true(nrow(ret_df) == nrow(X_df))
  # Each column of ret_df should be one of the prototypes.
  testthat::expect_true(ncol(ret_df) == ncol(X_df_model) - 3 + 1)

  proto_found <- FALSE
  clus_rep <- rowMeans(X_df_model[, 1:3])
  testthat::expect_true(all(abs(ret_df[, 1] - clus_rep) < 10^(-9)))

  for(j in 4:ncol(X_df_model)){
    testthat::expect_true(all(abs(ret_df[, j - 2] - X_df_model[, j]) < 10^(-9)))
  }

  # X as a dataframe with factors (number of columns of final design matrix
  # after one-hot encoding factors won't match number of columns of df2)
  # cyl, gear, and carb are factors with more than 2 levels
  df2 <- X_df
  df2$cyl <- as.factor(df2$cyl)
  df2$vs <- as.factor(df2$vs)
  df2$am <- as.factor(df2$am)
  df2$gear <- as.factor(df2$gear)
  df2$carb <- as.factor(df2$carb)

  res <- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
                                   clusters=list(), nlambda=10)

  ret_df <- getXglmnet(x=res$x, clusters=res$clusters, type="protolasso",
                       prototypes=res$prototypes)

  X_df_model <- stats::model.matrix(~ ., df2)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  testthat::expect_true(is.matrix(ret_df))
  testthat::expect_true(is.numeric(ret_df))
  testthat::expect_true(is.null(colnames(ret_df)))
  testthat::expect_true(nrow(ret_df) == nrow(X_df))
  # Each column of ret_df should be one of the prototypes.
  testthat::expect_true(ncol(ret_df) == ncol(X_df_model))

  for(j in 1:ncol(X_df_model)){
    testthat::expect_true(all(abs(ret_df[, j] - X_df_model[, j]) < 10^(-9)))
  }

  ret_df <- getXglmnet(x=res$x, clusters=res$clusters, type="clusterRepLasso",
                       prototypes=res$prototypes)

  testthat::expect_true(is.matrix(ret_df))
  testthat::expect_true(is.numeric(ret_df))
  testthat::expect_true(is.null(colnames(ret_df)))
  testthat::expect_true(nrow(ret_df) == nrow(X_df))
  # Each column of ret_df should be one of the prototypes.
  testthat::expect_true(ncol(ret_df) == ncol(X_df_model))

  for(j in 1:ncol(X_df_model)){
    testthat::expect_true(all(abs(ret_df[, j] - X_df_model[, j]) < 10^(-9)))
  }

  # X as a matrix with column names (returned X shouldn't have column names)
  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  process <- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters,
                                       nlambda=10)

  res <- getXglmnet(x=process$x, clusters=process$clusters,
                               type="protolasso", prototypes=process$prototypes)
  
  testthat::expect_true(is.matrix(res))
  testthat::expect_true(is.numeric(res))
  testthat::expect_true(is.null(colnames(res)))
  testthat::expect_true(nrow(res) == 15)
  # Each column of res should be one of the prototypes. Features 9 - 11 are
  # in clusters by themselves and are therefore their own prototypes.
  testthat::expect_true(ncol(res) == 5)
  for(i in 1:length(good_clusters)){
    proto_i_found <- FALSE
    cluster_i <- good_clusters[[i]]
    for(j in 1:length(cluster_i)){
      proto_i_found <- proto_i_found | all(abs(res[, i] - x[, cluster_i[j]]) <
                                             10^(-9))
    }
    testthat::expect_true(proto_i_found)
  }
  testthat::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
  
  res <- getXglmnet(x=process$x, clusters=process$clusters,
                    type="clusterRepLasso", prototypes=process$prototypes)
  
  testthat::expect_true(is.matrix(res))
  testthat::expect_true(is.numeric(res))
  testthat::expect_true(is.null(colnames(res)))
  testthat::expect_true(nrow(res) == 15)
  # Each column of res should be one of the cluster representatives. Features 9
  # - 11 are in clusters by themselves and are therefore their own cluster
  # representatives.
  testthat::expect_true(ncol(res) == 5)
  for(i in 1:length(good_clusters)){
    cluster_i <- good_clusters[[i]]
    clus_rep_i <- rowMeans(x[, cluster_i])
    testthat::expect_true(all(abs(res[, i] - clus_rep_i) <
                                             10^(-9)))
  }
  testthat::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
  testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))

  # Bad prototype inputs
  # Error has quotation marks
  testthat::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
                                    type="clsterRepLasso",
                                    prototypes=process$prototypes))

  testthat::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
                                    type=c("clusterRepLasso", "protolasso"),
                                    prototypes=process$prototypes),
                         "length(type) == 1 is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
                                    type=2, prototypes=process$prototypes),
                         "is.character(type) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
                                    type=as.character(NA),
                                    prototypes=process$prototypes),
                         "!is.na(type) is not TRUE",
                         fixed=TRUE)
  
})

## Test passed 🥇

getClusterSelsFromGlmnet():

#' Extracts selected clusters and cluster prototypes from the glmnet lasso
#' output
#'
#' For every model size j from 1 up to the size of the largest set in
#' lasso_sets, the first set of size j (if there is one) is translated back to
#' the original feature space by getSelectedSets().
#'
#' @param lasso_sets A list of integer vectors. Each vector represents a set of
#' features selected by the lasso for a given value of the penalty parameter
#' lambda.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso; see Reid and Tibshirani 2016).
#' @param feat_names Character vector; the names of the features in X. (If the
#' X provided to protolasso or clusterRepLasso did not have feature names,
#' feat_names will be NA.) If any name is provided, no name may be missing.
#' @return A list containing the following items: \item{selected_sets}{A list of
#' integer vectors. Entry k of this list contains a selected set of size k
#' yielded by glmnet--each member of the set is the index of a single feature
#' from a cluster selected by either the protolasso or the cluster
#' representative lasso (the prototype from that cluster--the cluster member
#' most highly correlated with y). (If no set of size k was selected, entry k
#' will be NULL.)} \item{selected_clusts_list}{A list of lists; entry k of this
#' list is a list of length k of clusters (the clusters that were selected by
#' the cluster representative lasso). Again, if no set of size k was selected,
#' entry k will be NULL.} If lasso_sets is empty or contains only empty sets,
#' both items are empty lists.
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}. \cr Bühlmann, P.,
#' Rütimann, P., van de Geer, S., & Zhang, C. H. (2013). Correlated variables in
#' regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}.
getClusterSelsFromGlmnet <- function(lasso_sets, clusters, prototypes,
    feat_names){

    # Feature names must be either all missing or all present
    if(any(!is.na(feat_names))){
        stopifnot(all(!is.na(feat_names)))
    }

    # Size of every selected set in lasso_sets
    set_sizes <- lengths(lasso_sets)

    # Largest selected set among all those in lasso_sets. The 0L floor keeps
    # this well defined when lasso_sets is empty (max() of nothing would be
    # -Inf with a warning).
    max_length <- max(0L, set_sizes)

    # Preparing lists to store the results; slot j holds the model of size j
    selected_sets <- list()
    selected_clusts_list <- list()

    # seq_len() (rather than 1:max_length) so that nothing is done when no
    # non-empty set was selected; 1:0 would iterate over c(1, 0) and try to
    # process an empty selected set.
    for(j in seq_len(max_length)){
        # Position of the first lasso selected set of size j (NA if there is
        # none, in which case slot j in the output lists is left empty)
        first_j <- match(j, set_sizes)
        if(!is.na(first_j)){

            lasso_set_j <- lasso_sets[[first_j]]
            stopifnot(length(lasso_set_j) == j)

            ret <- getSelectedSets(lasso_set=lasso_set_j, clusters=clusters,
                prototypes=prototypes, feat_names=feat_names)

            selected_sets[[j]] <- ret$selected_set
            selected_clusts_list[[j]] <- ret$selected_clusts_list
        }
    }

    stopifnot(length(selected_sets) <= max_length)
    stopifnot(length(selected_clusts_list) <= max_length)

    return(list(selected_sets=selected_sets,
        selected_clusts_list=selected_clusts_list))
}

getSelectedSets():

#' Maps a set selected on X_glmnet back to the original feature space of X.
#'
#' Each entry of lasso_set indexes a cluster. The cluster itself is recorded,
#' and one feature is chosen to stand for it: the lone member for a cluster of
#' size 1, otherwise the single entry of prototypes that lies in the cluster.
#'
#' @param lasso_set A non-empty vector of distinct indices of selected cluster
#' representatives or prototypes; no index may exceed the number of clusters.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso).
#' @param feat_names Character vector; the names of the features in X. If every
#' entry is NA, the returned selected_set is left unnamed.
#' @return A list containing two items: \item{selected_set}{A vector of feature
#' indices in the original X matrix with the same length as lasso_set. (A
#' selection in lasso_set corresponding to a cluster is replaced by that
#' cluster's prototype from X.) Named by the matching entries of feat_names
#' when feature names are available.} \item{selected_clusts_list}{A list of
#' integer vectors with the same length as selected_set.
#' selected_clusts_list[[k]] contains the indices of the features in X that are
#' in the cluster containing selected_set[k].}
#' @author Gregory Faletto, Jacob Bien
getSelectedSets <- function(lasso_set, clusters, prototypes, feat_names){

    # The selection must be non-empty, free of repeats, and must only refer to
    # clusters that exist
    model_size <- length(lasso_set)
    stopifnot(model_size > 0)
    stopifnot(length(unique(lasso_set)) == model_size)
    stopifnot(all(lasso_set <= length(clusters)))

    selected_clusts_list <- list()
    selected_set <- integer()

    # Walk through the selected clusters in order, recording each cluster and
    # the feature that represents it
    for(k in seq_len(model_size)){
        selected_cluster_k <- clusters[[lasso_set[k]]]
        stopifnot(is.integer(selected_cluster_k))
        selected_clusts_list[[k]] <- selected_cluster_k

        if(length(selected_cluster_k) != 1){
            # Look up the one prototype that belongs to this cluster
            sel_prototype <- which(prototypes %in% selected_cluster_k)
            stopifnot(length(sel_prototype) == 1)
            stopifnot(!(prototypes[sel_prototype] %in% selected_set))
            selected_set <- c(selected_set, prototypes[sel_prototype])
        } else{
            # A cluster of size 1 is represented by its only member
            stopifnot(!(selected_cluster_k %in% selected_set))
            selected_set <- c(selected_set, selected_cluster_k)
        }
    }

    # One distinct feature per selected cluster
    stopifnot(length(selected_set) == model_size)
    stopifnot(length(unique(selected_set)) == model_size)

    # Attach feature names unless none are available
    if(!all(is.na(feat_names))){
        names(selected_set) <- feat_names[selected_set]
    }

    # One cluster per selection, and no feature shared between clusters
    stopifnot(length(selected_clusts_list) == model_size)
    all_feats <- unlist(selected_clusts_list)
    stopifnot(length(all_feats) == length(unique(all_feats)))

    list(selected_set=selected_set,
        selected_clusts_list=selected_clusts_list)
}

Tests for getSelectedSets():

testthat::test_that("getSelectedSets works", {
  set.seed(82612)

  # Structural checks shared by every scenario below. res is the output of
  # getSelectedSets(), process is the output of processClusterLassoInputs()
  # whose clusters and prototypes were passed to getSelectedSets(), and n_feats
  # is the number of columns of the design matrix.
  checkSelectedSetsOutput <- function(res, process, n_feats){
    testthat::expect_true(is.list(res))
    testthat::expect_identical(names(res), c("selected_set",
                                             "selected_clusts_list"))

    # selected_set
    testthat::expect_true(is.integer(res$selected_set))
    testthat::expect_true(all(!is.na(res$selected_set)))
    testthat::expect_true(all(res$selected_set %in% process$prototypes))

    # selected_clusts_list
    testthat::expect_true(is.list(res$selected_clusts_list))
    testthat::expect_equal(length(res$selected_set),
                           length(res$selected_clusts_list))
    sel_feats <- unlist(res$selected_clusts_list)
    testthat::expect_true(all(sel_feats %in% seq_len(n_feats)))
    # Every selected cluster must be one of the clusters in process$clusters
    for(clust_i in res$selected_clusts_list){
      clust_i_found <- any(vapply(process$clusters, function(clust_j){
        identical(clust_i, clust_j)
      }, logical(1)))
      testthat::expect_true(clust_i_found)
    }
  }

  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)

  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)

  process <- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
                                       nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  # Pick an arbitrary lasso set
  lasso_set <- lasso_sets[[5]]

  res <- getSelectedSets(lasso_set, process$clusters, process$prototypes,
                         process$var_names)

  checkSelectedSetsOutput(res, process, n_feats=11)

  # Try again with cluster representative lasso
  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="clusterRepLasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  # Pick an arbitrary lasso set
  lasso_set <- lasso_sets[[5]]

  res <- getSelectedSets(lasso_set, process$clusters, process$prototypes,
                         process$var_names)

  checkSelectedSetsOutput(res, process, n_feats=11)

  # X as a data.frame
  X_df <- datasets::mtcars

  X_df_model <- stats::model.matrix(~ ., X_df)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  process <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
                                       clusters=1:3, nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=stats::rnorm(nrow(X_df)),
                        family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  # Pick an arbitrary lasso set
  lasso_set <- lasso_sets[[min(length(lasso_sets), 3)]]

  res <- getSelectedSets(lasso_set, process$clusters, process$prototypes,
                         process$var_names)

  checkSelectedSetsOutput(res, process, n_feats=ncol(X_df_model))

  # X as a dataframe with factors (number of columns of final design matrix
  # after one-hot encoding factors won't match number of columns of df2)
  # cyl, gear, and carb are factors with more than 2 levels
  df2 <- X_df
  df2$cyl <- as.factor(df2$cyl)
  df2$vs <- as.factor(df2$vs)
  df2$am <- as.factor(df2$am)
  df2$gear <- as.factor(df2$gear)
  df2$carb <- as.factor(df2$carb)

  X_df_model <- stats::model.matrix(~ ., df2)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  # Should throw an error if we assign clusters because df2 contains factors
  # with more than two levels
  testthat::expect_error(processClusterLassoInputs(X=df2,
                                                   y=stats::rnorm(nrow(df2)),
                                                   clusters=1:3, nlambda=100),
                         "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)

  # Should be fine if no clusters are provided
  process <- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
                                       clusters=list(), nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="clusterRepLasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=stats::rnorm(nrow(df2)),
                        family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  # Pick an arbitrary lasso set
  lasso_set <- lasso_sets[[min(length(lasso_sets), 3)]]

  res <- getSelectedSets(lasso_set, process$clusters, process$prototypes,
                         process$var_names)

  checkSelectedSetsOutput(res, process, n_feats=ncol(X_df_model))

  # X as a matrix with column names
  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  process <- processClusterLassoInputs(X=x2, y=y,
                                       clusters=good_clusters, nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian",
                        nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  # Pick an arbitrary lasso set
  lasso_set <- lasso_sets[[min(length(lasso_sets), 3)]]

  res <- getSelectedSets(lasso_set, process$clusters, process$prototypes,
                         process$var_names)

  checkSelectedSetsOutput(res, process, n_feats=11)

})

## Test passed 😸

Tests for getClusterSelsFromGlmnet():

testthat::test_that("getClusterSelsFromGlmnet works", {
  set.seed(61282)
  
  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)
  
  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)
  
  process <- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
                                       nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)
  
  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
  
  res <- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
                                  process$prototypes, process$var_names)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list"))
  
  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }
  

  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:11))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(process$clusters)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     process$clusters[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # Try again with cluster representative lasso

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="clusterRepLasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

  res <- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
                                  process$prototypes, process$var_names)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list"))
  
  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:11))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(process$clusters)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     process$clusters[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  
  
  
  
  # X as a data.frame
  X_df <- datasets::mtcars

  X_df_model <- stats::model.matrix(~ ., X_df)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  process <- processClusterLassoInputs(X=X_df, y=rnorm(nrow(X_df)),
                                       clusters=1:3, nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(X_df)), family="gaussian",
                        nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

  res <- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
                                  process$prototypes, process$var_names)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list"))
  
  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(process$clusters)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     process$clusters[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # X as a dataframe with factors (number of columns of final design matrix
  # after one-hot encoding factors won't match number of columns of df2)
  # cyl, gear, and carb are factors with more than 2 levels
  df2 <- X_df
  df2$cyl <- as.factor(df2$cyl)
  df2$vs <- as.factor(df2$vs)
  df2$am <- as.factor(df2$am)
  df2$gear <- as.factor(df2$gear)
  df2$carb <- as.factor(df2$carb)

  X_df_model <- stats::model.matrix(~ ., df2)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  process <- processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
                                       clusters=list(), nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="clusterRepLasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(df2)), family="gaussian",
                        nlambda=100)
  
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

  res <- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
                                  process$prototypes, process$var_names)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list"))
  
  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(process$clusters)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     process$clusters[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }



  # X as a matrix with column names
  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  process <- processClusterLassoInputs(X=x2, y=y,
                                       clusters=good_clusters, nlambda=100)

  X_glmnet <- getXglmnet(x=process$x, clusters=process$clusters,
                         type="protolasso", prototypes=process$prototypes)

  fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian",
                        nlambda=100)
  lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

  res <- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
                                  process$prototypes, process$var_names)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list"))
  
  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:11))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(process$clusters)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     process$clusters[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }
  
})

## Test passed 🥳

Finally, tests for protolasso():

testthat::test_that("protolasso works", {
  set.seed(61282)

  # ---- Helpers shared by every scenario below ------------------------------
  # Each scenario fits protolasso() on a different kind of input and then runs
  # the same structural checks on the result, so the checks live here once.

  # The result must be a list with exactly these three elements, in this order.
  expect_result_names <- function(res){
    testthat::expect_true(is.list(res))
    testthat::expect_identical(names(res), c("selected_sets",
                                             "selected_clusts_list", "beta"))
  }

  # Checks res$selected_sets. prototypes is the vector of prototype indices
  # returned by formatClusters(). feat_names is the set of allowed names for
  # the selected features; if NULL, the selected sets must be unnamed.
  expect_valid_selected_sets <- function(res, prototypes, feat_names=NULL){
    sel_sets <- res$selected_sets
    testthat::expect_true(is.list(sel_sets))
    # Selected models should have one of each size without repetition
    set_sizes <- lengths(sel_sets)
    set_sizes <- set_sizes[set_sizes != 0]
    testthat::expect_identical(set_sizes, unique(set_sizes))
    # seq_along() rather than 1:length() so that an empty list is skipped
    # instead of being indexed at positions 1 and 0
    for(i in seq_along(sel_sets)){
      set_i <- sel_sets[[i]]
      if(is.null(set_i)){
        # No selected set of size i; nothing to check
        next
      }
      testthat::expect_true(is.integer(set_i))
      testthat::expect_true(all(!is.na(set_i)))
      testthat::expect_true(all(set_i %in% prototypes))
      testthat::expect_equal(length(set_i), i)
      if(is.null(feat_names)){
        testthat::expect_true(is.null(names(set_i)))
      } else{
        testthat::expect_true(all(names(set_i) %in% feat_names))
      }
    }
  }

  # Checks res$selected_clusts_list. clusters_formatted is the list of
  # clusters returned by formatClusters(); p is the number of features in the
  # design matrix.
  expect_valid_selected_clusts <- function(res, clusters_formatted, p){
    sel_clusts <- res$selected_clusts_list
    testthat::expect_true(is.list(sel_clusts))
    # Selected models should have one of each size without repetition
    clust_sizes <- lengths(sel_clusts)
    clust_sizes <- clust_sizes[clust_sizes != 0]
    testthat::expect_identical(clust_sizes, unique(clust_sizes))

    for(k in seq_along(sel_clusts)){
      clusts_k <- sel_clusts[[k]]
      if(is.null(clusts_k)){
        # No selected set of k clusters; nothing to check
        next
      }
      testthat::expect_true(is.list(clusts_k))
      testthat::expect_equal(length(res$selected_sets[[k]]), length(clusts_k))
      testthat::expect_equal(length(clusts_k), k)
      sel_feats <- unlist(clusts_k)
      testthat::expect_true(all(sel_feats %in% seq_len(p)))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      # Every selected cluster must be identical to one of the formatted
      # clusters
      for(i in seq_along(clusts_k)){
        clust_i_found <- any(vapply(clusters_formatted, identical, logical(1),
                                    clusts_k[[i]]))
        testthat::expect_true(clust_i_found)
      }
    }
  }

  # Checks res$beta: a dgCMatrix with n_beta_rows rows and at most
  # max_n_lambda columns.
  expect_valid_beta <- function(res, n_beta_rows, max_n_lambda){
    testthat::expect_true(inherits(res$beta, "dgCMatrix"))
    testthat::expect_true(nrow(res$beta) == n_beta_rows)
    testthat::expect_true(ncol(res$beta) <= max_n_lambda)
  }

  # Runs all of the checks above on one protolasso() result.
  expect_valid_protolasso_res <- function(res, format_clust_res, p,
                                          n_beta_rows, max_n_lambda,
                                          feat_names=NULL){
    expect_result_names(res)
    expect_valid_selected_sets(res, format_clust_res$prototypes, feat_names)
    expect_valid_selected_clusts(res, format_clust_res$clusters, p)
    expect_valid_beta(res, n_beta_rows, max_n_lambda)
  }

  # ---- X as a matrix without column names ----------------------------------

  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)

  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)

  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=good_clusters, p=11,
                                     clust_names=names(good_clusters),
                                     get_prototypes=TRUE, x=x, y=y)

  res <- protolasso(x, y, good_clusters, nlambda=60)

  # 11 features, 8 of which are collapsed into 2 clusters
  expect_valid_protolasso_res(res, format_clust_res, p=11,
                              n_beta_rows=11 - 8 + 2, max_n_lambda=60)

  # ---- X as a data.frame ---------------------------------------------------

  X_df <- datasets::mtcars

  X_df_model <- stats::model.matrix(~ ., X_df)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  y_df <- stats::rnorm(nrow(X_df))

  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=1:3, p=ncol(X_df_model),
                                     get_prototypes=TRUE, x=X_df_model, y=y_df)

  res <- protolasso(X_df, y_df, 1:3, nlambda=80)

  # 3 features are collapsed into 1 cluster
  expect_valid_protolasso_res(res, format_clust_res, p=ncol(X_df_model),
                              n_beta_rows=ncol(X_df_model) - 3 + 1,
                              max_n_lambda=80,
                              feat_names=colnames(X_df_model))

  # ---- X as a data.frame with factors --------------------------------------
  # (number of columns of final design matrix after one-hot encoding factors
  # won't match number of columns of df2)
  # cyl, gear, and carb are factors with more than 2 levels
  df2 <- X_df
  df2$cyl <- as.factor(df2$cyl)
  df2$vs <- as.factor(df2$vs)
  df2$am <- as.factor(df2$am)
  df2$gear <- as.factor(df2$gear)
  df2$carb <- as.factor(df2$carb)

  X_df_model <- stats::model.matrix(~ ., df2)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  # Should get an error if we try to call protolasso on df2 with clusters
  # because df2 contains factors with more than two levels
  testthat::expect_error(protolasso(df2, y_df, 4:6, nlambda=70),
                         "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)

  # Should be fine if no clusters are provided
  res <- protolasso(df2, y_df, nlambda=70)

  expect_result_names(res)

  # Should also be fine with clusters once X has been converted to a matrix
  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=4:6, p=ncol(X_df_model),
                                     get_prototypes=TRUE, x=X_df_model, y=y_df)

  res <- protolasso(X_df_model, y_df, 4:6, nlambda=70)

  expect_valid_protolasso_res(res, format_clust_res, p=ncol(X_df_model),
                              n_beta_rows=ncol(X_df_model) - 3 + 1,
                              max_n_lambda=70,
                              feat_names=colnames(X_df_model))

  # ---- X as a matrix with column names -------------------------------------

  x2 <- x
  colnames(x2) <- LETTERS[1:11]

  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=good_clusters, p=11,
                                     clust_names=names(good_clusters),
                                     get_prototypes=TRUE, x=x2, y=y)

  res <- protolasso(x2, y, good_clusters, nlambda=50)

  expect_valid_protolasso_res(res, format_clust_res, p=11,
                              n_beta_rows=11 - 8 + 2, max_n_lambda=50,
                              feat_names=LETTERS[1:11])

  # ---- Bad inputs ----------------------------------------------------------

  testthat::expect_error(protolasso(X="x", y=y[1:10], clusters=good_clusters,
                                    nlambda=10),
                         "is.matrix(X) | is.data.frame(X) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y[1:10], clusters=good_clusters,
                                    nlambda=10),
                         "n == length(y) is not TRUE", fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=list(1:4, 4:6),
                                    nlambda=10),
                         "Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.", fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=list(2:3, 2:3),
                                    nlambda=10),
                         "length(clusters) == length(unique(clusters)) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y,
                                    clusters=list(1:4, as.integer(NA)),
                                    nlambda=10),
                         "!is.na(clusters) are not all TRUE", fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=list(2:3, c(4, 4, 5)),
                                    nlambda=10),
                         "length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=1), "nlambda >= 2 is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=x),
                         "length(nlambda) == 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
                                    nlambda="nlambda"),
                         "is.numeric(nlambda) | is.integer(nlambda) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=10.5),
                         "nlambda == round(nlambda) is not TRUE",
                         fixed=TRUE)

})

## Test passed 🥇

clusterRepLasso():

#' Select features via the cluster representative lasso (Bühlmann et al. 2013)
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors.
#' @param y The response; a length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors, each holding the indices of one
#' cluster of features (a subset of 1:p). (If there is only one cluster,
#' clusters can either be a list of length 1 or an integer vector.) The
#' provided clusters must be non-overlapping. Any feature that does not appear
#' in a cluster is treated as unclustered (that is, as a "cluster" containing
#' only itself). CAUTION: if the provided X is a data.frame that contains a
#' categorical feature with more than two levels, then the matrix produced by
#' model.matrix will have a different number of columns than the provided
#' data.frame, some of the feature numbers will change, and the clusters
#' argument will not work properly (in the current version of the package).
#' To get correct results in this case, please use model.matrix to convert
#' the data.frame to a numeric matrix on your own, then provide this matrix
#' and cluster assignments with respect to this matrix. Default is list() (so
#' no clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the cluster representative lasso. Default is 100 (following the default
#' for glmnet). For now, nlambda must be at least 2 (using a single lambda is
#' not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the lasso. Each member of the set is the index of a single
#' feature from a cluster selected by the cluster representative lasso (the
#' prototype from that cluster, i.e. the cluster member most highly correlated
#' with y). (If no set of size k was selected, entry k will be empty.)}
#' \item{selected_clusts_list}{A list; each element of the list is a named list
#' of selected clusters. (That is, if a selected set of size k was yielded by
#' the cluster representative lasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. Note that selected_clusts_list[[k]][[j]] will be the
#' cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The beta
#' output from glmnet when the lasso was estimated on the matrix of cluster
#' representatives. (See documentation for the function glmnet from the glmnet
#' package for details.)}
#' @references Bühlmann, P., Rütimann, P., van de Geer, S., & Zhang, C. H.
#' (2013). Correlated variables in regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}. \cr Jerome Friedman, Trevor
#' Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear
#' Models via Coordinate Descent. \emph{Journal of Statistical Software}, 33(1),
#' 1-22. URL \url{https://www.jstatsoft.org/v33/i01/}.
clusterRepLasso <- function(X, y, clusters=list(), nlambda=100){

    # Check and format the inputs; this step also provides the cluster
    # prototypes and the feature names
    inputs <- processClusterLassoInputs(X, y, clusters, nlambda)

    # Build the design matrix that glmnet will see under the cluster
    # representative lasso procedure
    X_glmnet <- getXglmnet(inputs$x, inputs$clusters, type="clusterRepLasso",
        prototypes=inputs$prototypes)

    # Fit the lasso on the cluster representatives and keep the distinct
    # selected sets along the lambda path
    fit <- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=nlambda)
    lasso_sets <- unique(glmnet::predict.glmnet(fit, type="nonzero"))

    # Translate the lasso selections into sets of selected clusters (one for
    # each observed model size) and the corresponding sets of selected
    # features
    sel_res <- getClusterSelsFromGlmnet(lasso_sets, inputs$clusters,
        inputs$prototypes, inputs$var_names)

    return(list(selected_sets=sel_res$selected_sets,
        selected_clusts_list=sel_res$selected_clusts_list,
        beta=fit$beta))
}

Tests for clusterRepLasso():

# TODO(gregfaletto): deal with the fact that clusters argument doesn't work
# for a data.frame input that has a categorical random variable with more than
# two levels (because then p, and the numbering of the features, changes)
testthat::test_that("clusterRepLasso works", {
  set.seed(61282)
  
  x <- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
  y <- stats::rnorm(15)
  
  good_clusters <- list(red_cluster=1L:4L, green_cluster=5L:8L)
  
  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=good_clusters, p=11,
                                     clust_names=names(good_clusters),
                                     get_prototypes=TRUE, x=x, y=y)
  
  prototypes <- format_clust_res$prototypes
  clus_formatted <- format_clust_res$clusters
  
  res <- clusterRepLasso(x, y, good_clusters, nlambda=60)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list", "beta"))

  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
      testthat::expect_true(is.null(names(res$selected_sets[[i]])))
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:11))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(clus_formatted)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     clus_formatted[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # beta
  testthat::expect_true(grepl("dgCMatrix", class(res$beta)))
  testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
  testthat::expect_true(ncol(res$beta) <= 60)


  # X as a data.frame
  X_df <- datasets::mtcars

  X_df_model <- stats::model.matrix(~ ., X_df)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]

  y_df <- rnorm(nrow(X_df))

  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=1:3, p=ncol(X_df_model),
                                     get_prototypes=TRUE, x=X_df_model, y=y_df)

  prototypes <- format_clust_res$prototypes
  clus_formatted <- format_clust_res$clusters

  res <- clusterRepLasso(X_df, y_df, 1:3, nlambda=80)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list", "beta"))

  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
      testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
                                  colnames(X_df_model)))
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(clus_formatted)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     clus_formatted[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # beta
  testthat::expect_true(grepl("dgCMatrix", class(res$beta)))
  testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
  testthat::expect_true(ncol(res$beta) <= 80)

  # X as a dataframe with factors (number of columns of final design matrix
  # after one-hot encoding factors won't match number of columns of df2)
  # cyl, gear, and carb are factors with more than 2 levels
  df2 <- X_df
  df2$cyl <- as.factor(df2$cyl)
  df2$vs <- as.factor(df2$vs)
  df2$am <- as.factor(df2$am)
  df2$gear <- as.factor(df2$gear)
  df2$carb <- as.factor(df2$carb)
  
  # Should get an error if we try to call clusterRepLasso on df2 with clusters
  # because df2 contains factors with more than two levels
  testthat::expect_error(clusterRepLasso(df2, y_df, 4:6, nlambda=70),
                         "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
  
  # Should be fine if no clusters are provided
  res <- clusterRepLasso(df2, y_df, nlambda=70)
  
  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list", "beta"))

  X_df_model <- stats::model.matrix(~ ., df2)
  X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]


  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=4:6, p=ncol(X_df_model),
                                     get_prototypes=TRUE, x=X_df_model, y=y_df)

  prototypes <- format_clust_res$prototypes
  clus_formatted <- format_clust_res$clusters

  res <- clusterRepLasso(X_df_model, y_df, 4:6, nlambda=70)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list", "beta"))


  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
      testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
                                  colnames(X_df_model)))
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))
  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(clus_formatted)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     clus_formatted[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # beta
  testthat::expect_true(grepl("dgCMatrix", class(res$beta)))
  testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
  testthat::expect_true(ncol(res$beta) <= 70)



  # X as a matrix with column names
  x2 <- x
  colnames(x2) <- LETTERS[1:11]


  # Get properly formatted clusters and prototypes for testing
  format_clust_res <- formatClusters(clusters=good_clusters, p=11,
                                     clust_names=names(good_clusters),
                                     get_prototypes=TRUE, x=x2, y=y)

  prototypes <- format_clust_res$prototypes
  clus_formatted <- format_clust_res$clusters

  res <- clusterRepLasso(x2, y, good_clusters, nlambda=50)

  testthat::expect_true(is.list(res))
  testthat::expect_identical(names(res), c("selected_sets",
                                           "selected_clusts_list", "beta"))

  # selected_sets
  testthat::expect_true(is.list(res$selected_sets))
  # Selected models should have one of each size without repetition
  lengths <- lengths(res$selected_sets)
  lengths <- lengths[lengths != 0]
  testthat::expect_identical(lengths, unique(lengths))
  for(i in 1:length(res$selected_sets)){
    if(!is.null(res$selected_sets[[i]])){
      testthat::expect_true(is.integer(res$selected_sets[[i]]))
      testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
      testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
      testthat::expect_equal(length(res$selected_sets[[i]]), i)
      testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
                                  LETTERS[1:11]))
    } else{
      testthat::expect_true(is.null(res$selected_sets[[i]]))
    }
  }


  # selected_clusts_list
  testthat::expect_true(is.list(res$selected_clusts_list))
  # Selected models should have one of each size without repetition
  clust_lengths <- lengths(res$selected_clusts_list)
  clust_lengths <- clust_lengths[clust_lengths != 0]
  testthat::expect_identical(clust_lengths, unique(clust_lengths))

  for(k in 1:length(res$selected_clusts_list)){
    if(!is.null(res$selected_clusts_list[[k]])){
      testthat::expect_true(is.list(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_sets[[k]]),
                             length(res$selected_clusts_list[[k]]))
      testthat::expect_equal(length(res$selected_clusts_list[[k]]), k)
      sel_feats <- unlist(res$selected_clusts_list[[k]])
      testthat::expect_true(all(sel_feats %in% 1:11))
      testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
      n_clusts <- k
      for(i in 1:n_clusts){
        clust_i_found <- FALSE
        clust_i <- res$selected_clusts_list[[k]][[i]]
        for(j in 1:length(clus_formatted)){
          clust_i_found <- clust_i_found | identical(clust_i,
                                                     clus_formatted[[j]])
        }
        testthat::expect_true(clust_i_found)
      }
    } else{
      testthat::expect_true(is.null(res$selected_clusts_list[[k]]))
    }
  }

  # beta
  testthat::expect_true(grepl("dgCMatrix", class(res$beta)))
  testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
  testthat::expect_true(ncol(res$beta) <= 50)

  # Bad inputs
  testthat::expect_error(clusterRepLasso(X="x", y=y[1:10], clusters=good_clusters,
                                    nlambda=10),
                         "is.matrix(X) | is.data.frame(X) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y[1:10], clusters=good_clusters,
                                    nlambda=10),
                         "n == length(y) is not TRUE", fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=list(1:4, 4:6),
                                    nlambda=10),
                         "Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.", fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=list(2:3, 2:3),
                                    nlambda=10),
                         "length(clusters) == length(unique(clusters)) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y,
                                    clusters=list(1:4, as.integer(NA)),
                                    nlambda=10),
                         "!is.na(clusters) are not all TRUE", fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=list(2:3, c(4, 4, 5)),
                                    nlambda=10),
                         "length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=1), "nlambda >= 2 is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=x),
                         "length(nlambda) == 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
                                    nlambda="nlambda"),
                         "is.numeric(nlambda) | is.integer(nlambda) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
                                    nlambda=10.5),
                         "nlambda == round(nlambda) is not TRUE",
                         fixed=TRUE)
  
})

## Test passed 🥇