8 Competitor Methods
We also provide implementations of some competitor feature selection methods. We used these in the simulation studies in our paper to compare cluster stability selection to the protolasso (Reid and Tibshirani, 2016) and the cluster representative lasso (Bühlmann et. al. 2013), two other feature selection methods that are designed for data with clustered features. These feature selection methods are in some ways closely related, so their implementations share helper functions.
protolasso()
processClusterLassoInputs()
checks and formats the function inputsgetXglmnet()
formats the provided design matrixXglmnet
for the lasso as implemented byglmnet
(for the protolasso, this means discarding all features from each cluster except the one most highly correlated with the response; for the cluster representative lasso, this means replacing the clustered features with a simple average of the cluster members).checkGetXglmnetInputs()
verifies the inputs togetXglmnet()
- Finally,
getClusterSelsFromGlmnet()
extracts the relevant output from the results yielded by aglmnet
lasso fit.getSelectedSets()
takes in a single selected set fromXglmnet
and yields a selected feature set in the original feature space (with each selected cluster fromXglmnet
replaced by its prototype) as well as a selected set of clusters.
clusterRepLasso()
protolasso()
:
#' Select features via the protolasso (Reid and Tibshirani 2016)
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' #' CAUTION: if the provided X is a data.frame that contains a categorical
#' feature with more than two levels, then the resulting matrix made from
#' model.matrix will have a different number of columns than the provided
#' data.frame, some of the feature numbers will change, and the clusters
#' argument will not work properly (in the current version of the package).
#' To get correct results in this case, please use model.matrix to convert
#' the data.frame to a numeric matrix on your own, then provide this matrix
#' and cluster assignments with respect to this matrix. Default is list() (so no
#' clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Default is 100 (following the default for glmnet). For
#' now, nlambda must be at least 2 (using a single lambda is not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the protolasso (If no set of size k was selected, entry k
#' will be empty.)} \item{selected_clusts_list}{A list; each element of the list
#' is a named list of selected clusters. (That is, if a selected set of size k
#' was yielded by the protolasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. In particular, selected_clusts_lists[[k]][[j]] will be
#' the cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The
#' beta output from glmnet when the lasso was estimated on a matrix of
#' prototypes. (See documentation for the function glmnet from the glmnet
#' package for details.)}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
#' @export
protolasso <- function(X, y, clusters=list(), nlambda=100){
# Handle and format inputs; get cluster prototypes
<- processClusterLassoInputs(X, y, clusters, nlambda)
ret
<- ret$x
x <- ret$clusters
clusters <- ret$prototypes
prototypes <- ret$var_names
feat_names
rm(ret)
# Format the design matrix for glmnet according to the protolasso procedure
<- getXglmnet(x, clusters, type="protolasso",
X_glmnet prototypes=prototypes)
# Estimate the lasso on the cluster prototypes
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=nlambda)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
# Finally, obtain a tidy list of selected sets--one for each model size
<- getClusterSelsFromGlmnet(lasso_sets, clusters,
cluster_sel_results
prototypes, feat_names)
return(list(selected_sets=cluster_sel_results$selected_sets,
selected_clusts_list=cluster_sel_results$selected_clusts_list,
beta=fit$beta))
}
#' Check the inputs to protolasso and clusterRepLasso, format clusters, and
#' identify prototypes for each cluster
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' Default is list() (so no clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the protolasso. Default is 100 (following the default for glmnet). For
#' now, nlambda must be at least 2 (using a single lambda is not supported).
#' @return A list with four elements. \item{x}{The provided X, converted to a
#' matrix if it was provided as a data.frame, and with column names removed.}
#' \item{clusters}{A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters are
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features will be put in their own "cluster" of size 1).}
#' \item{prototypes}{An integer vector whose length is equal to the number of
#' clusters. Entry i is the index of the feature belonging to cluster i that is
#' most highly correlated with y (that is, the prototype for the cluster, as in
#' the protolasso; see Reid and Tibshirani 2016).} \item{var_names}{If the
#' provided X matrix had column names, the names of the featurrs in the provided
#' X matrix. If no names were provided, feat_names will be NA.}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
processClusterLassoInputs <- function(X, y, clusters, nlambda){
stopifnot(is.matrix(X) | is.data.frame(X))
# Check if x is a matrix; if it's a data.frame, convert to matrix.
if(is.data.frame(X)){
<- ncol(X)
p
<- stats::model.matrix(~ ., X)
X <- X[, colnames(X) != "(Intercept)"]
X
if(p != ncol(X) & length(clusters) > 0){
stop("When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.")
}
}
stopifnot(is.matrix(X))
stopifnot(all(!is.na(X)))
<- as.character(NA)
feat_names if(!is.null(colnames(X))){
<- colnames(X)
feat_names if(any(is.na(feat_names))){
stop("Some features in provided X matrix had valid names and some had NA names; please neither name all features in X or remove the names altogether.")
}
}
<- nrow(X)
n
colnames(X) <- character()
stopifnot(is.numeric(y) | is.integer(y))
stopifnot(n == length(y))
stopifnot(all(!is.na(y)))
# Check clusters argument
<- checkCssClustersInput(clusters)
clusters
# Format clusters into a list where all features are in exactly one
# cluster (any unclustered features are put in their own "cluster" of size
# 1).
<- as.character(NA)
clust_names if(!is.null(names(clusters)) & is.list(clusters)){
<- names(clusters)
clust_names
}
<- formatClusters(clusters, p=ncol(X),
cluster_results clust_names=clust_names, get_prototypes=TRUE, x=X, y=y)
<- cluster_results$clusters
clusters <- cluster_results$prototypes
prototypes
rm(cluster_results)
stopifnot(length(clusters) == length(prototypes))
stopifnot(is.numeric(nlambda) | is.integer(nlambda))
stopifnot(length(nlambda) == 1)
stopifnot(!is.na(nlambda))
stopifnot(nlambda >= 2)
stopifnot(nlambda == round(nlambda))
return(list(x=X, clusters=clusters, prototypes=prototypes,
var_names=feat_names))
}
Tests for processClusterLassoInputs()
:
::test_that("processClusterLassoInputs works", {
testthatset.seed(82612)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
<- processClusterLassoInputs(X=x, y=y, clusters=good_clusters, nlambda=10)
ret
::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("x", "clusters", "prototypes",
testthat"var_names"))
# X
::expect_true(is.matrix(ret$x))
testthat::expect_true(all(!is.na(ret$x)))
testthat::expect_true(is.numeric(ret$x))
testthat::expect_equal(ncol(ret$x), 11)
testthat::expect_equal(nrow(ret$x), 15)
testthat::expect_true(all(abs(ret$x - x) < 10^(-9)))
testthat
# clusters
::expect_true(is.list(ret$clusters))
testthat::expect_equal(length(ret$clusters), 5)
testthat::expect_equal(5, length(names(ret$clusters)))
testthat::expect_equal(5, length(unique(names(ret$clusters))))
testthat::expect_true("red_cluster" %in% names(ret$clusters))
testthat::expect_true("green_cluster" %in% names(ret$clusters))
testthat::expect_true(all(!is.na(names(ret$clusters))))
testthat::expect_true(all(!is.null(names(ret$clusters))))
testthat::expect_true(all(names(ret$clusters) != ""))
testthat
<- integer()
clust_feats <- list(1:4, 5:8, 9, 10, 11)
true_list for(i in 1:length(ret$clusters)){
::expect_true(is.integer(ret$clusters[[i]]))
testthat::expect_equal(length(intersect(clust_feats, ret$clusters[[i]])), 0)
testthat::expect_true(all(ret$clusters[[i]] %in% 1:11))
testthat::expect_equal(length(ret$clusters[[i]]),
testthatlength(unique(ret$clusters[[i]])))
::expect_true(all(ret$clusters[[i]] == true_list[[i]]))
testthat<- c(clust_feats, ret$clusters[[i]])
clust_feats
}
::expect_equal(length(clust_feats), 11)
testthat::expect_equal(11, length(unique(clust_feats)))
testthat::expect_equal(11, length(intersect(clust_feats, 1:11)))
testthat
# prototypes
::expect_true(is.integer(ret$prototypes))
testthat::expect_true(all(ret$prototypes %in% 1:11))
testthat::expect_equal(length(ret$prototypes), 5)
testthat::expect_true(ret$prototypes[1] %in% 1:4)
testthat::expect_true(ret$prototypes[2] %in% 5:8)
testthat::expect_equal(ret$prototypes[3], 9)
testthat::expect_equal(ret$prototypes[4], 10)
testthat::expect_equal(ret$prototypes[5], 11)
testthat
# var_names
::expect_equal(length(ret$var_names), 1)
testthat::expect_true(is.na(ret$var_names))
testthat
# X as a data.frame
<- datasets::mtcars
X_df <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
res clusters=1:3, nlambda=10)
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("x", "clusters", "prototypes",
testthat"var_names"))
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# X
::expect_true(is.matrix(res$x))
testthat::expect_true(all(!is.na(res$x)))
testthat::expect_true(is.numeric(res$x))
testthat::expect_equal(ncol(res$x), ncol(X_df_model))
testthat::expect_equal(nrow(res$x), nrow(X_df))
testthat::expect_true(all(abs(res$x - X_df_model) < 10^(-9)))
testthat
# var_names
::expect_equal(length(res$var_names), ncol(X_df_model))
testthat::expect_true(is.character(res$var_names))
testthat::expect_identical(res$var_names, colnames(X_df_model))
testthat
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
# Should get error if I try to use clusters because df2 contains factors with
# more than two levels
::expect_error(processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
testthatclusters=1:3, nlambda=10), "When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.",
fixed=TRUE)
# Should be fine with no clusters
<- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
res clusters=list(), nlambda=10)
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("x", "clusters", "prototypes",
testthat"var_names"))
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# X
::expect_true(is.matrix(res$x))
testthat::expect_true(all(!is.na(res$x)))
testthat::expect_true(is.numeric(res$x))
testthat::expect_equal(ncol(res$x), ncol(X_df_model))
testthat::expect_equal(nrow(res$x), nrow(X_df))
testthat::expect_true(all(abs(res$x - X_df_model) < 10^(-9)))
testthat
# var_names
::expect_equal(length(res$var_names), ncol(X_df_model))
testthat::expect_true(is.character(res$var_names))
testthat::expect_identical(res$var_names, colnames(X_df_model))
testthat
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
<- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters, nlambda=10)
ret
::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("x", "clusters", "prototypes",
testthat"var_names"))
# X
::expect_true(is.matrix(ret$x))
testthat::expect_true(all(!is.na(ret$x)))
testthat::expect_true(is.numeric(ret$x))
testthat::expect_equal(ncol(ret$x), 11)
testthat::expect_equal(nrow(ret$x), 15)
testthat::expect_true(all(abs(ret$x - x) < 10^(-9)))
testthat
# var_names
::expect_equal(length(ret$var_names), ncol(x2))
testthat::expect_true(is.character(ret$var_names))
testthat::expect_identical(ret$var_names, LETTERS[1:11])
testthat
# Bad inputs
::expect_error(processClusterLassoInputs(X="x", y=y[1:10],
testthatclusters=good_clusters,
nlambda=10),
"is.matrix(X) | is.data.frame(X) is not TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y[1:10],
testthatclusters=good_clusters,
nlambda=10),
"n == length(y) is not TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=list(1:4, 4:6),
nlambda=10),
"Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=list(2:3, 2:3),
nlambda=10),
"length(clusters) == length(unique(clusters)) is not TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=list(1:4,
as.integer(NA)),
nlambda=10),
"!is.na(clusters) are not all TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=list(2:3,
c(4, 4, 5)),
nlambda=10),
"length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=good_clusters,
nlambda=1),
"nlambda >= 2 is not TRUE", fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=good_clusters,
nlambda=x),
"length(nlambda) == 1 is not TRUE", fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=good_clusters,
nlambda="nlambda"),
"is.numeric(nlambda) | is.integer(nlambda) is not TRUE",
fixed=TRUE)
::expect_error(processClusterLassoInputs(X=x, y=y,
testthatclusters=good_clusters,
nlambda=10.5),
"nlambda == round(nlambda) is not TRUE",
fixed=TRUE)
})
## Test passed 🥇
getXglmnet()
:
#' Converts the provided design matrix to an appropriate format for either the
#' protolasso or the cluster representative lasso.
#'
#' Creates design matrix for glmnet by dealing with clusters (for
#' type="protolasso", discards all cluster members except prototype; for
#' type="clusterRepLasso", replaces all cluster members with a simple
#' average of all the cluster members).
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1).
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only required for type "protolasso". An integer vector
#' whose length is equal to the number of clusters. Entry i should be the
#' prototype for cluster i (the feature belonging to cluster i that is most
#' highly correlated with y; see Reid and Tibshirani 2016).
#' @return A numeric matrix; the design matrix as required for the protolasso or
#' cluster representative lasso, prepared for input to glmnet.
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
getXglmnet <- function(x, clusters, type, prototypes=NA){
# Check inputs
checkGetXglmnetInputs(x, clusters, type, prototypes)
<- nrow(x)
n <- ncol(x)
p
for(i in 1:length(clusters)){
<- clusters[[i]]
cluster_i
if(length(cluster_i) == 1){
<- x[, cluster_i]
X_glmnet_i else{
} stopifnot(length(cluster_i) > 1)
if(type == "protolasso"){
<- which(prototypes %in% cluster_i)
prototype_ind_i stopifnot(length(prototype_ind_i) == 1)
<- prototypes[prototype_ind_i]
prototype_i <- x[, prototype_i]
X_glmnet_i else {
} stopifnot(type == "clusterRepLasso")
<- rowMeans(x[, cluster_i])
X_glmnet_i
}
}
stopifnot(length(X_glmnet_i) == n)
if(i == 1){
<- as.matrix(X_glmnet_i)
X_glmnet else{
} <- cbind(X_glmnet, X_glmnet_i)
X_glmnet
}
}
stopifnot(ncol(X_glmnet) == length(clusters))
stopifnot(ncol(X_glmnet) == length(clusters))
colnames(X_glmnet) <- character()
# Check output
stopifnot(is.matrix(X_glmnet))
stopifnot(nrow(X_glmnet) == n)
stopifnot(ncol(X_glmnet) <= p)
stopifnot(ncol(X_glmnet) >= 1)
return(X_glmnet)
}
#' Verifies the inputs for getXglmnet.
#'
#' @param x A numeric matrix; the provided matrix with n observations and p
#' features.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters should
#' be equal to the number of clusters.) All identified clusters should be
#' non-overlapping. All features should appear in exactly one cluster (any
#' unclustered features should be put in their own "cluster" of size 1).
#' @param type Character; "protolasso" for the protolasso or "clusterRepLasso"
#' for the cluster representative lasso.
#' @param prototypes Only required for type "protolasso". An integer vector
#' whose length is equal to the number of clusters. Entry i should be the
#' prototype for cluster i (the feature belonging to cluster i that is most
#' highly correlated with y; see Reid and Tibshirani 2016).
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}.
checkGetXglmnetInputs <- function(x, clusters, type, prototypes){
stopifnot(is.matrix(x))
stopifnot(is.list(clusters))
stopifnot(all(lengths(clusters) >= 1))
stopifnot(length(type) == 1)
stopifnot(is.character(type))
stopifnot(!is.na(type))
stopifnot(type %in% c("protolasso", "clusterRepLasso"))
stopifnot(!is.na(prototypes))
stopifnot(is.integer(prototypes))
stopifnot(all(!is.na(prototypes)))
stopifnot(length(prototypes) == length(unique(prototypes)))
stopifnot(all(prototypes %in% 1:ncol(x)))
for(i in 1:length(clusters)){
<- clusters[[i]]
cluster_i stopifnot(sum(prototypes %in% cluster_i) == 1)
} }
Tests for checkGetXglmnetInputs()
:
::test_that("checkGetXglmnetInputs works", {
testthatset.seed(82612)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
<- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
process nlambda=10)
checkGetXglmnetInputs(x=process$x, clusters=process$clusters,
type="protolasso", prototypes=process$prototypes)
checkGetXglmnetInputs(x=process$x, clusters=process$clusters,
type="clusterRepLasso",
prototypes=process$prototypes)
# X as a data.frame
<- datasets::mtcars
X_df <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
res clusters=1:3, nlambda=10)
checkGetXglmnetInputs(x=res$x, clusters=res$clusters, type="clusterRepLasso",
prototypes=res$prototypes)
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
# Should get an error if clusters are provided since df2 contains factors
# with more than two levels
::expect_error(processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
testthatclusters=1:3, nlambda=10),
"When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
# Should be fine if no clusters are provided
<- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
res clusters=list(), nlambda=10)
checkGetXglmnetInputs(x=res$x, clusters=res$clusters, type="protolasso",
prototypes=res$prototypes)
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
<- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters, nlambda=10)
ret
checkGetXglmnetInputs(x=ret$x, clusters=ret$clusters, type="clusterRepLasso",
prototypes=ret$prototypes)
# Bad prototype inputs
# Error has quotation marks
::expect_error(checkGetXglmnetInputs(x=process$x,
testthatclusters=process$clusters,
type="clsterRepLasso",
prototypes=process$prototypes))
::expect_error(checkGetXglmnetInputs(x=process$x,
testthatclusters=process$clusters,
type=c("clusterRepLasso",
"protolasso"),
prototypes=process$prototypes),
"length(type) == 1 is not TRUE",
fixed=TRUE)
::expect_error(checkGetXglmnetInputs(x=process$x,
testthatclusters=process$clusters,
type=2,
prototypes=process$prototypes),
"is.character(type) is not TRUE",
fixed=TRUE)
::expect_error(checkGetXglmnetInputs(x=process$x,
testthatclusters=process$clusters,
type=as.character(NA),
prototypes=process$prototypes),
"!is.na(type) is not TRUE",
fixed=TRUE)
})
## Test passed 🥇
Tests for getXglmnet()
:
::test_that("getXglmnet works", {
testthatset.seed(82612)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
<- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
process nlambda=10)
<- getXglmnet(x=process$x, clusters=process$clusters,
res type="protolasso", prototypes=process$prototypes)
::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_true(is.null(colnames(res)))
testthat::expect_true(nrow(res) == 15)
testthat# Each column of res should be one of the prototypes. Features 9 - 11 are
# in clusters by themselves and are therefore their own prototypes.
::expect_true(ncol(res) == 5)
testthatfor(i in 1:length(good_clusters)){
<- FALSE
proto_i_found <- good_clusters[[i]]
cluster_i for(j in 1:length(cluster_i)){
<- proto_i_found | all(abs(res[, i] - x[, cluster_i[j]]) <
proto_i_found 10^(-9))
}::expect_true(proto_i_found)
testthat
}::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
testthat
<- getXglmnet(x=process$x, clusters=process$clusters,
res type="clusterRepLasso", prototypes=process$prototypes)
::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_true(is.null(colnames(res)))
testthat::expect_true(nrow(res) == 15)
testthat# Each column of res should be one of the cluster representatives. Features 9
# - 11 are in clusters by themselves and are therefore their own cluster
# representatives.
::expect_true(ncol(res) == 5)
testthatfor(i in 1:length(good_clusters)){
<- good_clusters[[i]]
cluster_i <- rowMeans(x[, cluster_i])
clus_rep_i ::expect_true(all(abs(res[, i] - clus_rep_i) <
testthat10^(-9)))
}::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
testthat
# X as a data.frame
<- datasets::mtcars
X_df <- processClusterLassoInputs(X=X_df, y=stats::rnorm(nrow(X_df)),
res clusters=1:3, nlambda=10)
<- getXglmnet(x=res$x, clusters=res$clusters, type="protolasso",
ret_df prototypes=res$prototypes)
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
::expect_true(is.matrix(ret_df))
testthat::expect_true(is.numeric(ret_df))
testthat::expect_true(is.null(colnames(ret_df)))
testthat::expect_true(nrow(ret_df) == nrow(X_df))
testthat# Each column of ret_df should be one of the prototypes.
::expect_true(ncol(ret_df) == ncol(X_df_model) - 3 + 1)
testthat
<- FALSE
proto_found for(j in 1:3){
<- proto_found | all(abs(ret_df[, 1] - X_df_model[, j]) < 10^(-9))
proto_found
}::expect_true(proto_found)
testthat
for(j in 4:ncol(X_df_model)){
::expect_true(all(abs(ret_df[, j - 2] - X_df_model[, j]) < 10^(-9)))
testthat
}
<- getXglmnet(x=res$x, clusters=res$clusters, type="clusterRepLasso",
ret_df prototypes=res$prototypes)
::expect_true(is.matrix(ret_df))
testthat::expect_true(is.numeric(ret_df))
testthat::expect_true(is.null(colnames(ret_df)))
testthat::expect_true(nrow(ret_df) == nrow(X_df))
testthat# Each column of ret_df should be one of the prototypes.
::expect_true(ncol(ret_df) == ncol(X_df_model) - 3 + 1)
testthat
<- FALSE
proto_found <- rowMeans(X_df_model[, 1:3])
clus_rep ::expect_true(all(abs(ret_df[, 1] - clus_rep) < 10^(-9)))
testthat
for(j in 4:ncol(X_df_model)){
::expect_true(all(abs(ret_df[, j - 2] - X_df_model[, j]) < 10^(-9)))
testthat
}
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
<- processClusterLassoInputs(X=df2, y=stats::rnorm(nrow(df2)),
res clusters=list(), nlambda=10)
<- getXglmnet(x=res$x, clusters=res$clusters, type="protolasso",
ret_df prototypes=res$prototypes)
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
::expect_true(is.matrix(ret_df))
testthat::expect_true(is.numeric(ret_df))
testthat::expect_true(is.null(colnames(ret_df)))
testthat::expect_true(nrow(ret_df) == nrow(X_df))
testthat# Each column of ret_df should be one of the prototypes.
::expect_true(ncol(ret_df) == ncol(X_df_model))
testthat
for(j in 1:ncol(X_df_model)){
::expect_true(all(abs(ret_df[, j] - X_df_model[, j]) < 10^(-9)))
testthat
}
<- getXglmnet(x=res$x, clusters=res$clusters, type="clusterRepLasso",
ret_df prototypes=res$prototypes)
::expect_true(is.matrix(ret_df))
testthat::expect_true(is.numeric(ret_df))
testthat::expect_true(is.null(colnames(ret_df)))
testthat::expect_true(nrow(ret_df) == nrow(X_df))
testthat# Each column of ret_df should be one of the prototypes.
::expect_true(ncol(ret_df) == ncol(X_df_model))
testthat
for(j in 1:ncol(X_df_model)){
::expect_true(all(abs(ret_df[, j] - X_df_model[, j]) < 10^(-9)))
testthat
}
# X as a matrix with column names (returned X shouldn't have column names)
<- x
x2 colnames(x2) <- LETTERS[1:11]
<- processClusterLassoInputs(X=x2, y=y, clusters=good_clusters,
process nlambda=10)
<- getXglmnet(x=process$x, clusters=process$clusters,
res type="protolasso", prototypes=process$prototypes)
::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_true(is.null(colnames(res)))
testthat::expect_true(nrow(res) == 15)
testthat# Each column of res should be one of the prototypes. Features 9 - 11 are
# in clusters by themselves and are therefore their own prototypes.
::expect_true(ncol(res) == 5)
testthatfor(i in 1:length(good_clusters)){
<- FALSE
proto_i_found <- good_clusters[[i]]
cluster_i for(j in 1:length(cluster_i)){
<- proto_i_found | all(abs(res[, i] - x[, cluster_i[j]]) <
proto_i_found 10^(-9))
}::expect_true(proto_i_found)
testthat
}::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
testthat
<- getXglmnet(x=process$x, clusters=process$clusters,
res type="clusterRepLasso", prototypes=process$prototypes)
::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_true(is.null(colnames(res)))
testthat::expect_true(nrow(res) == 15)
testthat# Each column of res should be one of the cluster representatives. Features 9
# - 11 are in clusters by themselves and are therefore their own cluster
# representatives.
::expect_true(ncol(res) == 5)
testthatfor(i in 1:length(good_clusters)){
<- good_clusters[[i]]
cluster_i <- rowMeans(x[, cluster_i])
clus_rep_i ::expect_true(all(abs(res[, i] - clus_rep_i) <
testthat10^(-9)))
}::expect_true(all(abs(res[, 3] - x[, 9]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 4] - x[, 10]) < 10^(-9)))
testthat::expect_true(all(abs(res[, 5] - x[, 11]) < 10^(-9)))
testthat
# Bad prototype inputs
# Error has quotation marks
::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
testthattype="clsterRepLasso",
prototypes=process$prototypes))
::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
testthattype=c("clusterRepLasso", "protolasso"),
prototypes=process$prototypes),
"length(type) == 1 is not TRUE",
fixed=TRUE)
::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
testthattype=2, prototypes=process$prototypes),
"is.character(type) is not TRUE",
fixed=TRUE)
::expect_error(getXglmnet(x=process$x, clusters=process$clusters,
testthattype=as.character(NA),
prototypes=process$prototypes),
"!is.na(type) is not TRUE",
fixed=TRUE)
})
## Test passed 🥇
#' Extracts selected clusters and cluster prototypes from the glmnet lasso
#' output
#'
#' @param lasso_sets A list of integer vectors. Each vector represents a set of
#' features selected by the lasso for a given value of the penalty parameter
#' lambda.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso; see Reid and Tibshirani 2016).
#' @param feat_names Character vector; the names of the features in X. (If the
#' X provided to protolasso or clusterRepLasso did not have feature names,
#' feat_names will be NA.)
#' @return A list containing the following items: \item{selected_sets}{A list of
#' integer vectors. Entry k of this list contains a selected set of size k
#' yielded by glmnet--each member of the set is the index of a single feature
#' from a cluster selected by either the protolasso or the cluster
#' representative lasso (the prototype from that cluster--the cluster member
#' most highly correlated with y). (If no set of size k was selected, entry k
#' will be NULL.)} \item{selected_clusts_list}{A list of lists; entry k of this
#' list is a list of length k of clusters (the clusters that were selected by
#' the cluster representative lasso). Again, if no set of size k was selected,
#' entry k will be NULL.}
#' @author Gregory Faletto, Jacob Bien
#' @references Reid, S., & Tibshirani, R. (2016). Sparse regression and marginal
#' testing using cluster prototypes. \emph{Biostatistics}, 17(2), 364–376.
#' \url{https://doi.org/10.1093/biostatistics/kxv049}. \cr Bühlmann, P.,
#' Rütimann, P., van de Geer, S., & Zhang, C. H. (2013). Correlated variables in
#' regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}.
getClusterSelsFromGlmnet <- function(lasso_sets, clusters, prototypes,
feat_names){
if(any(!is.na(feat_names))){
stopifnot(all(!is.na(feat_names)))
}
# Largest selected set among all those in lasso_sets
<- max(vapply(lasso_sets, length, integer(1)))
max_length
# Preparing lists to store
<- list()
selected_sets <- list()
selected_clusts_list
for(j in 1:max_length){
# Lasso selected set of size j
<- lasso_sets[lapply(lasso_sets, length) == j]
lasso_sets_j # Are there any lasso selected sets of size j? (If not, we will skip to
# the next j, and slot j in the list will be empty.)
if(length(lasso_sets_j) > 0){
# Select the first set of size j
<- lasso_sets_j[[1]]
lasso_set_j stopifnot(length(lasso_set_j) == j)
<- getSelectedSets(lasso_set=lasso_set_j, clusters=clusters,
ret prototypes=prototypes, feat_names=feat_names)
<- ret$selected_set
selected_sets[[j]] <- ret$selected_clusts_list
selected_clusts_list[[j]]
rm(ret)
}
}
stopifnot(length(selected_sets) <= max_length)
stopifnot(length(selected_clusts_list) <= max_length)
return(list(selected_sets=selected_sets,
selected_clusts_list=selected_clusts_list))
}
#' Converts a selected set from X_glmnet to selected sets and selected clusters
#' from the original feature space of X.
#'
#' @param lasso_set A vector containing the indices of selected cluster
#' representatives or prototypes.
#' @param clusters A named list where each entry is an integer vector of indices
#' of features that are in a common cluster. (The length of list clusters is
#' equal to the number of clusters.) All identified clusters must be
#' non-overlapping. All features appear in exactly one cluster (any unclustered
#' features must be in their own "cluster" of size 1).
#' @param prototypes An integer vector whose length must be equal to the number
#' of clusters. Entry i should be the index of the feature belonging to cluster
#' i that is most highly correlated with y (that is, the prototype for the
#' cluster, as in the protolasso).
#' @param feat_names Character vector; the names of the features in X.
#' @return A list containing two items: \item{selected_set}{An integer vector
#' with length equal to lasso_set containing a set of selected features in the
#' original X matrix. (Selections in lasso_set corresponding to a cluster will
#' be replaced by the cluster's prototype from X.)}
#' \item{selected_clusts_list}{A named list of integer vectors with length equal
#' to selected_set. selected_clusts_list[[k]] will be an integer vector
#' containing the indices of the features in X that are in the cluster
#' containing prototype selected_set[k].}
#' @author Gregory Faletto, Jacob Bien
getSelectedSets <- function(lasso_set, clusters, prototypes, feat_names){
<- length(lasso_set)
model_size stopifnot(model_size > 0)
stopifnot(length(unique(lasso_set)) == model_size)
stopifnot(all(lasso_set <= length(clusters)))
<- integer()
selected_set <- list()
selected_clusts_list # Recover features from original feature space
for(k in 1:model_size){
<- clusters[[lasso_set[k]]]
selected_cluster_k stopifnot(is.integer(selected_cluster_k))
<- selected_cluster_k
selected_clusts_list[[k]]
if(length(selected_cluster_k) == 1){
stopifnot(!(selected_cluster_k %in% selected_set))
<- c(selected_set, selected_cluster_k)
selected_set else{
} <- which(prototypes %in% selected_cluster_k)
sel_prototype stopifnot(length(sel_prototype) == 1)
stopifnot(!(prototypes[sel_prototype] %in% selected_set))
<- c(selected_set, prototypes[sel_prototype])
selected_set
}
}
stopifnot(length(selected_set) == model_size)
stopifnot(length(unique(selected_set)) == model_size)
if(any(!is.na(feat_names))){
names(selected_set) <- feat_names[selected_set]
}
stopifnot(length(selected_clusts_list) == model_size)
<- unlist(selected_clusts_list)
all_feats stopifnot(length(all_feats) == length(unique(all_feats)))
return(list(selected_set=selected_set,
selected_clusts_list=selected_clusts_list))
}
Tests for getSelectedSets()
:
::test_that("getSelectedSets works", {
testthatset.seed(82612)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
<- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
process nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[5]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
# Try again with cluster representative lasso
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="clusterRepLasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[5]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
# X as a data.frame
<- datasets::mtcars
X_df
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
<- processClusterLassoInputs(X=X_df, y=rnorm(nrow(X_df)),
process clusters=1:3, nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(X_df)), family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[min(length(lasso_sets), 3)]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# Should throw an error if we assign clusters because df2 contains factors
# with more than two levels
::expect_error(processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
testthatclusters=1:3, nlambda=100),
"When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
# Should be fine if no clusters are provided
<- processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
process clusters=list(), nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="clusterRepLasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(df2)), family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[min(length(lasso_sets), 3)]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# Should throw an error if we assign clusters because df2 contains factors
# with more than two levels
::expect_error(processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
testthatclusters=1:3, nlambda=100),
"When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
# Should be fine if no clusters are provided
<- processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
process clusters=list(), nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="clusterRepLasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(df2)), family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[min(length(lasso_sets), 3)]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
<- processClusterLassoInputs(X=x2, y=y,
process clusters=good_clusters, nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets # Pick an arbitrary lasso set
<- lasso_sets[[min(length(lasso_sets), 3)]]
lasso_set
<- getSelectedSets(lasso_set, process$clusters, process$prototypes,
res $var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_set",
testthat"selected_clusts_list"))
# selected_set
::expect_true(is.integer(res$selected_set))
testthat::expect_true(all(!is.na(res$selected_set)))
testthat::expect_true(all(res$selected_set %in% process$prototypes))
testthat
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat::expect_equal(length(res$selected_set),
testthatlength(res$selected_clusts_list))
<- unlist(res$selected_clusts_list)
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat<- length(res$selected_clusts_list)
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i, process$clusters[[j]])
clust_i_found
}::expect_true(clust_i_found)
testthat
}
})
## Test passed 😸
Tests for getClusterSelsFromGlmnet()
:
::test_that("getClusterSelsFromGlmnet works", {
testthatset.seed(61282)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
<- processClusterLassoInputs(X=x, y=y, clusters=good_clusters,
process nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
<- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
res $prototypes, process$var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i,
clust_i_found $clusters[[j]])
process
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# Try again with cluster representative lasso
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="clusterRepLasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=100)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
<- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
res $prototypes, process$var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i,
clust_i_found $clusters[[j]])
process
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# X as a data.frame
<- datasets::mtcars
X_df
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
<- processClusterLassoInputs(X=X_df, y=rnorm(nrow(X_df)),
process clusters=1:3, nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(X_df)), family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
<- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
res $prototypes, process$var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i,
clust_i_found $clusters[[j]])
process
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
<- processClusterLassoInputs(X=df2, y=rnorm(nrow(df2)),
process clusters=list(), nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="clusterRepLasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=rnorm(nrow(df2)), family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
<- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
res $prototypes, process$var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i,
clust_i_found $clusters[[j]])
process
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
<- processClusterLassoInputs(X=x2, y=y,
process clusters=good_clusters, nlambda=100)
<- getXglmnet(x=process$x, clusters=process$clusters,
X_glmnet type="protolasso", prototypes=process$prototypes)
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian",
fit nlambda=100)
<- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
<- getClusterSelsFromGlmnet(lasso_sets, process$clusters,
res $prototypes, process$var_names)
process
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% process$prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(process$clusters)){
<- clust_i_found | identical(clust_i,
clust_i_found $clusters[[j]])
process
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
})
## Test passed 🥳
Finally, tests for protolasso()
:
::test_that("protolasso works", {
testthatset.seed(61282)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=good_clusters, p=11,
format_clust_res clust_names=names(good_clusters),
get_prototypes=TRUE, x=x, y=y)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- protolasso(x, y, good_clusters, nlambda=60)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(is.null(names(res$selected_sets[[i]])))
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
testthat::expect_true(ncol(res$beta) <= 60)
testthat
# X as a data.frame
<- datasets::mtcars
X_df
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
<- rnorm(nrow(X_df))
y_df
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=1:3, p=ncol(X_df_model),
format_clust_res get_prototypes=TRUE, x=X_df_model, y=y_df)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- protolasso(X_df, y_df, 1:3, nlambda=80)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthatcolnames(X_df_model)))
else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
testthat::expect_true(ncol(res$beta) <= 80)
testthat
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# Should get an error if we try to call protolasso on df2 with clusters
# because df2 contains factors with more than two levels
::expect_error(protolasso(df2, y_df, 4:6, nlambda=70),
testthat"When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
# Should be fine if no clusters are provided
<- protolasso(df2, y_df, nlambda=70)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=4:6, p=ncol(X_df_model),
format_clust_res get_prototypes=TRUE, x=X_df_model, y=y_df)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- protolasso(X_df_model, y_df, 4:6, nlambda=70)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthatcolnames(X_df_model)))
else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
testthat::expect_true(ncol(res$beta) <= 70)
testthat
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=good_clusters, p=11,
format_clust_res clust_names=names(good_clusters),
get_prototypes=TRUE, x=x2, y=y)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- protolasso(x2, y, good_clusters, nlambda=50)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthat1:11]))
LETTERS[else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
testthat::expect_true(ncol(res$beta) <= 50)
testthat
# Bad inputs
::expect_error(protolasso(X="x", y=y[1:10], clusters=good_clusters,
testthatnlambda=10),
"is.matrix(X) | is.data.frame(X) is not TRUE",
fixed=TRUE)
::expect_error(protolasso(X=x, y=y[1:10], clusters=good_clusters,
testthatnlambda=10),
"n == length(y) is not TRUE", fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=list(1:4, 4:6),
testthatnlambda=10),
"Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.", fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=list(2:3, 2:3),
testthatnlambda=10),
"length(clusters) == length(unique(clusters)) is not TRUE",
fixed=TRUE)
::expect_error(protolasso(X=x, y=y,
testthatclusters=list(1:4, as.integer(NA)),
nlambda=10),
"!is.na(clusters) are not all TRUE", fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=list(2:3, c(4, 4, 5)),
testthatnlambda=10),
"length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=1), "nlambda >= 2 is not TRUE",
fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=x),
"length(nlambda) == 1 is not TRUE", fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
testthatnlambda="nlambda"),
"is.numeric(nlambda) | is.integer(nlambda) is not TRUE",
fixed=TRUE)
::expect_error(protolasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=10.5),
"nlambda == round(nlambda) is not TRUE",
fixed=TRUE)
})
## Test passed 🥇
#' Select features via the cluster representative lasso (Bühlmann et. al. 2013)
#'
#' @param X An n x p numeric matrix (preferably) or a data.frame (which will
#' be coerced internally to a matrix by the function model.matrix) containing
#' p >= 2 features/predictors
#' @param y The response; A length n numeric (or integer) real-valued vector.
#' @param clusters A list of integer vectors; each vector should contain the
#' indices of a cluster of features (a subset of 1:p). (If there is only one
#' cluster, clusters can either be a list of length 1 or an integer vector.)
#' All of the provided clusters must be non-overlapping. Every feature not
#' appearing in any cluster will be assumed to be unclustered (that is, they
#' will be treated as if they are in a "cluster" containing only themselves).
#' CAUTION: if the provided X is a data.frame that contains a categorical
#' feature with more than two levels, then the resulting matrix made from
#' model.matrix will have a different number of columns than the provided
#' data.frame, some of the feature numbers will change, and the clusters
#' argument will not work properly (in the current version of the package).
#' To get correct results in this case, please use model.matrix to convert
#' the data.frame to a numeric matrix on your own, then provide this matrix
#' and cluster assignments with respect to this matrix. Default is list() (so no
#' clusters are specified).
#' @param nlambda Integer; the number of lambda values to use in the lasso fit
#' for the cluster representative lasso. Default is 100 (following the default
#' for glmnet). For now, nlambda must be at least 2 (using a single lambda is
#' not supported).
#' @return A list with three elements. \item{selected_sets}{A list of integer
#' vectors. Entry k of this list contains a selected set (an integer vector) of
#' size k yielded by the lasso--each member of the set is the index of a single
#' feature from a cluster selected by the cluster representative lasso (the
#' prototype from that cluster--the cluster member most highly correlated with
#' y). (If no set of size k was selected, entry k will be empty.)}
#' \item{selected_clusts_list}{A list; each element of the list is a named list
#' of selected clusters. (That is, if a selected set of size k was yielded by
#' the cluster representative lasso, then selected_clusts_list[[k]] is a named
#' list of length k, where each member of the list is an integer vector
#' of cluster members. Note that selected_clusts_lists[[k]][[j]] will be the
#' cluster that contains feature selected_sets[[k]][j].)} \item{beta}{The beta
#' output from glmnet when the lasso was estimated on a matrix of prototypes.
#' (See documentation for the function glmnet from the glmnet package for
#' details.)}
#' @references Bühlmann, P., Rütimann, P., van de Geer, S., & Zhang, C. H.
#' (2013). Correlated variables in regression: Clustering and sparse estimation.
#' \emph{Journal of Statistical Planning and Inference}, 143(11), 1835–1858.
#' \url{https://doi.org/10.1016/j.jspi.2013.05.019}. \cr Jerome Friedman, Trevor
#' Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear
#' Models via Coordinate Descent. \emph{Journal of Statistical Software}, 33(1)
#' ' 1-22. URL \url{https://www.jstatsoft.org/v33/i01/}.
clusterRepLasso <- function(X, y, clusters=list(), nlambda=100){
# Handle and format inputs; get cluster prototypes
<- processClusterLassoInputs(X, y, clusters, nlambda)
ret
<- ret$x
x <- ret$clusters
clusters <- ret$prototypes
prototypes <- ret$var_names
feat_names
rm(ret)
# Format the design matrix for glmnet according to the cluster
# representative lasso procedure
<- getXglmnet(x, clusters, type="clusterRepLasso",
X_glmnet prototypes=prototypes)
# Estimate the lasso on the cluster representatives
<- glmnet::glmnet(x=X_glmnet, y=y, family="gaussian", nlambda=nlambda)
fit <- unique(glmnet::predict.glmnet(fit, type="nonzero"))
lasso_sets
# Finally, extract the desired information from the lasso fit--all the
# sets of selected clusters (one for each observed model size), and
# corresponding sets of selected features
<- getClusterSelsFromGlmnet(lasso_sets, clusters,
cluster_sel_results
prototypes, feat_names)
return(list(selected_sets=cluster_sel_results$selected_sets,
selected_clusts_list=cluster_sel_results$selected_clusts_list,
beta=fit$beta))
}
Tests for clusterRepLasso()
:
# TODO(gregfaletto): deal with the fact that clusters argument doesn't work
# for a data.frame input that has a categorical random variable with more than
# two levels (because then p, and the numbering of the features, changes)
::test_that("clusterRepLasso works", {
testthatset.seed(61282)
<- matrix(stats::rnorm(15*11), nrow=15, ncol=11)
x <- stats::rnorm(15)
y
<- list(red_cluster=1L:4L, green_cluster=5L:8L)
good_clusters
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=good_clusters, p=11,
format_clust_res clust_names=names(good_clusters),
get_prototypes=TRUE, x=x, y=y)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- clusterRepLasso(x, y, good_clusters, nlambda=60)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(is.null(names(res$selected_sets[[i]])))
testthatelse{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
testthat::expect_true(ncol(res$beta) <= 60)
testthat
# X as a data.frame
<- datasets::mtcars
X_df
<- stats::model.matrix(~ ., X_df)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
<- rnorm(nrow(X_df))
y_df
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=1:3, p=ncol(X_df_model),
format_clust_res get_prototypes=TRUE, x=X_df_model, y=y_df)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- clusterRepLasso(X_df, y_df, 1:3, nlambda=80)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthatcolnames(X_df_model)))
else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
testthat::expect_true(ncol(res$beta) <= 80)
testthat
# X as a dataframe with factors (number of columns of final design matrix
# after one-hot encoding factors won't match number of columns of df2)
# cyl, gear, and carb are factors with more than 2 levels
<- X_df
df2 $cyl <- as.factor(df2$cyl)
df2$vs <- as.factor(df2$vs)
df2$am <- as.factor(df2$am)
df2$gear <- as.factor(df2$gear)
df2$carb <- as.factor(df2$carb)
df2
# Should get an error if we try to call clusterRepLasso on df2 with clusters
# because df2 contains factors with more than two levels
::expect_error(clusterRepLasso(df2, y_df, 4:6, nlambda=70),
testthat"When stats::model.matrix converted the provided data.frame X to a matrix, the number of columns changed (probably because the provided data.frame contained a factor variable with at least three levels). Please convert X to a matrix yourself using model.matrix and provide cluster assignments according to the columns of the new matrix.", fixed=TRUE)
# Should be fine if no clusters are provided
<- clusterRepLasso(df2, y_df, nlambda=70)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
<- stats::model.matrix(~ ., df2)
X_df_model <- X_df_model[, colnames(X_df_model) != "(Intercept)"]
X_df_model
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=4:6, p=ncol(X_df_model),
format_clust_res get_prototypes=TRUE, x=X_df_model, y=y_df)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- clusterRepLasso(X_df_model, y_df, 4:6, nlambda=70)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthatcolnames(X_df_model)))
else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthatfor(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:ncol(X_df_model)))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == ncol(X_df_model) - 3 + 1)
testthat::expect_true(ncol(res$beta) <= 70)
testthat
# X as a matrix with column names
<- x
x2 colnames(x2) <- LETTERS[1:11]
# Get properly formatted clusters and prototypes for testing
<- formatClusters(clusters=good_clusters, p=11,
format_clust_res clust_names=names(good_clusters),
get_prototypes=TRUE, x=x2, y=y)
<- format_clust_res$prototypes
prototypes <- format_clust_res$clusters
clus_formatted
<- clusterRepLasso(x2, y, good_clusters, nlambda=50)
res
::expect_true(is.list(res))
testthat::expect_identical(names(res), c("selected_sets",
testthat"selected_clusts_list", "beta"))
# selected_sets
::expect_true(is.list(res$selected_sets))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_sets)
lengths <- lengths[lengths != 0]
lengths ::expect_identical(lengths, unique(lengths))
testthatfor(i in 1:length(res$selected_sets)){
if(!is.null(res$selected_sets[[i]])){
::expect_true(is.integer(res$selected_sets[[i]]))
testthat::expect_true(all(!is.na(res$selected_sets[[i]])))
testthat::expect_true(all(res$selected_sets[[i]] %in% prototypes))
testthat::expect_equal(length(res$selected_sets[[i]]), i)
testthat::expect_true(all(names(res$selected_sets[[i]]) %in%
testthat1:11]))
LETTERS[else{
} ::expect_true(is.null(res$selected_sets[[i]]))
testthat
}
}
# selected_clusts_list
::expect_true(is.list(res$selected_clusts_list))
testthat# Selected models should have one of each size without repetition
<- lengths(res$selected_clusts_list)
clust_lengths <- clust_lengths[clust_lengths != 0]
clust_lengths ::expect_identical(clust_lengths, unique(clust_lengths))
testthat
for(k in 1:length(res$selected_clusts_list)){
if(!is.null(res$selected_clusts_list[[k]])){
::expect_true(is.list(res$selected_clusts_list[[k]]))
testthat::expect_equal(length(res$selected_sets[[k]]),
testthatlength(res$selected_clusts_list[[k]]))
::expect_equal(length(res$selected_clusts_list[[k]]), k)
testthat<- unlist(res$selected_clusts_list[[k]])
sel_feats ::expect_true(all(sel_feats %in% 1:11))
testthat::expect_equal(length(sel_feats), length(unique(sel_feats)))
testthat<- k
n_clusts for(i in 1:n_clusts){
<- FALSE
clust_i_found <- res$selected_clusts_list[[k]][[i]]
clust_i for(j in 1:length(clus_formatted)){
<- clust_i_found | identical(clust_i,
clust_i_found
clus_formatted[[j]])
}::expect_true(clust_i_found)
testthat
}else{
} ::expect_true(is.null(res$selected_clusts_list[[k]]))
testthat
}
}
# beta
::expect_true(grepl("dgCMatrix", class(res$beta)))
testthat::expect_true(nrow(res$beta) == 11 - 8 + 2)
testthat::expect_true(ncol(res$beta) <= 50)
testthat
# Bad inputs
::expect_error(clusterRepLasso(X="x", y=y[1:10], clusters=good_clusters,
testthatnlambda=10),
"is.matrix(X) | is.data.frame(X) is not TRUE",
fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y[1:10], clusters=good_clusters,
testthatnlambda=10),
"n == length(y) is not TRUE", fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=list(1:4, 4:6),
testthatnlambda=10),
"Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.", fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=list(2:3, 2:3),
testthatnlambda=10),
"length(clusters) == length(unique(clusters)) is not TRUE",
fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y,
testthatclusters=list(1:4, as.integer(NA)),
nlambda=10),
"!is.na(clusters) are not all TRUE", fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=list(2:3, c(4, 4, 5)),
testthatnlambda=10),
"length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=1), "nlambda >= 2 is not TRUE",
fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=x),
"length(nlambda) == 1 is not TRUE", fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
testthatnlambda="nlambda"),
"is.numeric(nlambda) | is.integer(nlambda) is not TRUE",
fixed=TRUE)
::expect_error(clusterRepLasso(X=x, y=y, clusters=good_clusters,
testthatnlambda=10.5),
"nlambda == round(nlambda) is not TRUE",
fixed=TRUE)
})
## Test passed 🥇