9 Tests for selection and prediction functions
Tests for checkCutoff():
testthat::test_that("checkCutoff works", {
testthat::expect_null(checkCutoff(0))
testthat::expect_null(checkCutoff(0.2))
testthat::expect_null(checkCutoff(1))
testthat::expect_error(checkCutoff(-.2), "cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkCutoff(2), "cutoff <= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkCutoff(".3"),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkCutoff(matrix(1:12, nrow=4, ncol=3)),
"length(cutoff) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkCutoff(numeric()),
"length(cutoff) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkCutoff(as.numeric(NA)),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
})## Test passed with 9 successes.
Tests for checkWeighting():
testthat::test_that("checkWeighting works", {
testthat::expect_null(checkWeighting("sparse"))
testthat::expect_null(checkWeighting("simple_avg"))
testthat::expect_null(checkWeighting("weighted_avg"))
testthat::expect_error(checkWeighting(c("sparse", "simple_avg")),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkWeighting(NA), "!is.na(weighting) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkWeighting(1), "Weighting must be a character",
fixed=TRUE)
testthat::expect_error(checkWeighting("spasre"),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
})## Test passed with 7 successes.
Tests for checkMinNumClusts():
testthat::test_that("checkMinNumClusts works", {
testthat::expect_null(checkMinNumClusts(1, 5, 4))
testthat::expect_null(checkMinNumClusts(6, 6, 6))
testthat::expect_null(checkMinNumClusts(3, 1932, 3))
# min_num_clusts = 0 is now allowed (empty threshold selection)
testthat::expect_null(checkMinNumClusts(0, 13, 7))
testthat::expect_error(checkMinNumClusts(c(2, 4), 5, 4),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkMinNumClusts("3", "1932", "3"),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMinNumClusts(NA, NA, NA),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMinNumClusts(as.numeric(NA), as.numeric(NA),
as.numeric(NA)),
"!is.na(min_num_clusts) is not TRUE", fixed=TRUE)
testthat::expect_error(checkMinNumClusts(-1, 13, 7),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkMinNumClusts(-1, 9, 8),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkMinNumClusts(6, 5, 5),
"min_num_clusts <= p is not TRUE", fixed=TRUE)
testthat::expect_error(checkMinNumClusts(6, 7, 5),
"min_num_clusts <= n_clusters is not TRUE", fixed=TRUE)
})## Test passed with 12 successes.
Tests for checkMaxNumClusts():
testthat::test_that("checkMaxNumClusts works", {
testthat::expect_equal(checkMaxNumClusts(max_num_clusts=4, min_num_clusts=1,
p=5, n_clusters=4), 4)
testthat::expect_equal(checkMaxNumClusts(max_num_clusts=5, min_num_clusts=1,
p=5, n_clusters=4), 4)
testthat::expect_true(is.na(checkMaxNumClusts(max_num_clusts=NA,
min_num_clusts=3, p=5,
n_clusters=4)))
testthat::expect_error(checkMaxNumClusts(max_num_clusts="4", min_num_clusts=1,
p=5, n_clusters=4),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=3.2, min_num_clusts=2,
p=5, n_clusters=4),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=1, min_num_clusts=2,
p=5, n_clusters=4),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=c(3, 4),
min_num_clusts=2,
p=5, n_clusters=4),
"length(max_num_clusts) == 1 is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts="4",
min_num_clusts="2",
p="5", n_clusters="4"),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=-1, min_num_clusts=2,
p=5, n_clusters=4),
"max_num_clusts >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=6, min_num_clusts=2,
p=5, n_clusters=4),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
testthat::expect_error(checkMaxNumClusts(max_num_clusts=1, min_num_clusts=2,
p=5, n_clusters=4),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
})## Test passed with 11 successes.
Test for checkSelectedClusters():
testthat::test_that("checkSelectedClusters works", {
testthat::expect_null(checkSelectedClusters(n_sel_clusts=5, min_num_clusts=1,
max_num_clusts=NA, max_sel_prop=.8))
testthat::expect_null(checkSelectedClusters(n_sel_clusts=5, min_num_clusts=2,
max_num_clusts=5, max_sel_prop=.3))
testthat::expect_null(checkSelectedClusters(n_sel_clusts=2, min_num_clusts=2,
max_num_clusts=5, max_sel_prop=.3))
testthat::expect_error(checkSelectedClusters(n_sel_clusts=0, min_num_clusts=2,
max_num_clusts=5,
max_sel_prop=.6),
"No clusters selected with this cutoff (try a cutoff below the maximum cluster selection proportion, 0.6)",
fixed=TRUE)
testthat::expect_warning(checkSelectedClusters(n_sel_clusts=1,
min_num_clusts=2,
max_num_clusts=5,
max_sel_prop=.6),
"Returning fewer than min_num_clusts = 2 clusters because decreasing the cutoff any further would require returning more than max_num_clusts = 5 clusters",
fixed=TRUE)
testthat::expect_warning(checkSelectedClusters(n_sel_clusts=6,
min_num_clusts=2,
max_num_clusts=5,
max_sel_prop=.6),
"Returning more than max_num_clusts = 5 clusters because increasing the cutoff any further would require returning 0 clusters",
fixed=TRUE)
})## Test passed with 6 successes.
Tests for getClustWeights():
testthat::test_that("getClustWeights works", {
sel_props <- c(0.1, 0.3, 0.5, 0.7, 0.9)
# sparse
testthat::expect_identical(getClustWeights(cluster_i=c(3L, 4L, 5L),
weighting="sparse",
feat_sel_props=sel_props),
c(0, 0, 1))
# weighted_avg
cluster=c(1L, 3L, 5L)
true_weights <- sel_props[cluster]/sum(sel_props[cluster])
testthat::expect_identical(getClustWeights(cluster_i=cluster,
weighting="weighted_avg",
feat_sel_props=sel_props),
true_weights)
# simple_avg
testthat::expect_identical(getClustWeights(cluster_i=c(2L, 3L, 4L, 5L),
weighting="simple_avg",
feat_sel_props=sel_props),
rep(0.25, 4))
# (#105, L11) Zero-selection fallback: when every member of the cluster has
# selection proportion 0 (so sum(sel_props) == 0), both "sparse" and
# "weighted_avg" must fall back to equal weights rep(1/n, n) rather than
# dividing by zero.
zero_props <- c(0, 0, 0, 0.5, 0.8)
testthat::expect_identical(getClustWeights(cluster_i=c(1L, 2L, 3L),
weighting="sparse",
feat_sel_props=zero_props),
rep(1/3, 3))
testthat::expect_identical(getClustWeights(cluster_i=c(1L, 2L, 3L),
weighting="weighted_avg",
feat_sel_props=zero_props),
rep(1/3, 3))
# (#105, L11) Sparse tie-split: when two cluster members tie for the top
# selection proportion, "sparse" splits the weight equally between them (and
# gives 0 to the rest) rather than arbitrarily picking one.
tie_props <- c(0.9, 0.9, 0.3, 0.5, 0.8)
testthat::expect_identical(getClustWeights(cluster_i=c(1L, 2L, 3L),
weighting="sparse",
feat_sel_props=tie_props),
c(0.5, 0.5, 0))
})## Test passed with 6 successes.
Tests for getAllClustWeights():
testthat::test_that("getAllClustWeights works", {
set.seed(1872)
x <- matrix(stats::rnorm(10*5), nrow=10, ncol=5)
y <- stats::rnorm(10)
clust_names <- letters[1:3]
good_clusters <- list(1:2, 3:4, 5)
names(good_clusters) <- clust_names
res <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, fitfun = cssLasso,
sampling_type = "SS", B = 10, prop_feats_remove = 0, train_inds = integer(),
num_cores = 1L)
sel_props <- colMeans(res$feat_sel_mat)
sel_clusts <- list(1L:2L, 3L:4L)
names(sel_clusts) <- clust_names[1:2]
# sparse
true_weights <- list()
for(i in 1:2){
weights_i <- sel_props[sel_clusts[[i]]]/sum(sel_props[sel_clusts[[i]]])
true_weights[[i]] <- rep(0, length(weights_i))
true_weights[[i]][weights_i == max(weights_i)] <- 1
true_weights[[i]] <- true_weights[[i]]/sum(true_weights[[i]])
}
names(true_weights) <- clust_names[1:2]
testthat::expect_identical(getAllClustWeights(res,
colMeans(res$clus_sel_mat[, 1:2]),
"sparse"), true_weights)
# weighted_avg
true_weights <- list()
for(i in 1:2){
true_weights[[i]] <- sel_props[sel_clusts[[i]]]/sum(sel_props[unlist(sel_clusts[[i]])])
}
names(true_weights) <- clust_names[1:2]
testthat::expect_identical(getAllClustWeights(res,
colMeans(res$clus_sel_mat[, 1:2]),
"weighted_avg"), true_weights)
# simple_avg
true_weights <- list()
for(i in 1:2){
n_weights_i <- length(sel_clusts[[i]])
true_weights[[i]] <- rep(1/n_weights_i, n_weights_i)
}
names(true_weights) <- clust_names[1:2]
testthat::expect_identical(getAllClustWeights(res,
colMeans(res$clus_sel_mat[, 1:2]),
"simple_avg"), true_weights)
# Errors
# css_results not correct (error has quotation marks)
testthat::expect_error(getAllClustWeights(1:4, colMeans(res$clus_sel_mat[,
1:2]),
"simple_avg"))
bad_sel_clusts <- colMeans(res$clus_sel_mat[, 1:2])
names(bad_sel_clusts) <- c("apple", "banana")
testthat::expect_error(getAllClustWeights(res, bad_sel_clusts, "sparse"),
"all(names(sel_clusters) %in% names(clusters)) is not TRUE",
fixed=TRUE)
testthat::expect_error(getAllClustWeights(res, colMeans(res$clus_sel_mat[,
1:2]),
c("sparse", "simple_avg")),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getAllClustWeights(res, colMeans(res$clus_sel_mat[,
1:2]),
NA),
"!is.na(weighting) is not TRUE", fixed=TRUE)
testthat::expect_error(getAllClustWeights(res, colMeans(res$clus_sel_mat[,
1:2]),
1),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(getAllClustWeights(res, colMeans(res$clus_sel_mat[,
1:2]),
"spasre"),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
})## Test passed with 9 successes.
Tests for checkGetSelectedClustersOutput():
testthat::test_that("checkGetSelectedClustersOutput works", {
sel_clusts <- 0.1*(1:9)
names(sel_clusts) <- letters[1:9]
weights <- list()
for(i in 1:8){
weights[[i]] <- c(0.2, 0.3)
}
weights[[9]] <- 0.4
names(weights) <- letters[1:9]
sel_feats <- 10:26
names(sel_feats) <- LETTERS[10:26]
testthat::expect_null(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30))
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=letters[1:4],
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"is.numeric(selected_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=-sel_clusts,
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"all(selected_clusts >= 0) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=10*sel_clusts,
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"all(selected_clusts <= 1) is not TRUE",
fixed=TRUE)
# An empty selection is now allowed (min_num_clusts=0), so length 0 no longer
# trips a guard; an unnamed empty selected_clusts is still rejected (it fails
# the names check before the length check would matter).
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=numeric(),
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"!is.null(names(selected_clusts)) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=sel_feats,
weights=weights,
n_clusters=8, p=30),
"length(selected_clusts) <= n_clusters is not TRUE",
fixed=TRUE)
bad_clusts <- sel_clusts
names(bad_clusts) <- rep("a", length(bad_clusts))
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=bad_clusts,
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"length(names(selected_clusts)) == length(unique(names(selected_clusts))) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=unname(sel_clusts),
selected_feats=sel_feats,
weights=weights,
n_clusters=10, p=30),
"!is.null(names(selected_clusts)) is not TRUE",
fixed=TRUE)
bad_clusts <- sel_clusts
names(bad_clusts)[1] <- ""
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=bad_clusts,
selected_feats=sel_feats, weights=weights,
n_clusters=10, p=30),
"all(!is.na(names(selected_clusts)) & names(selected_clusts) != .... is not TRUE",
fixed=TRUE)
names(bad_clusts)[1] <- as.character(NA)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=bad_clusts,
selected_feats=sel_feats, weights=weights,
n_clusters=10, p=30),
"all(!is.na(names(selected_clusts)) & names(selected_clusts) != .... is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=0.1,
weights=weights,
n_clusters=10, p=30),
"is.integer(selected_feats) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=c(1L,
rep(2L,
2)),
weights=weights,
n_clusters=10, p=30),
"length(selected_feats) == length(unique(selected_feats)) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=sel_feats, weights=weights,
n_clusters=10, p=25),
"all(selected_feats %in% 1:p) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetSelectedClustersOutput(selected_clusts=sel_clusts,
selected_feats=sel_feats[1:8], weights=weights,
n_clusters=10, p=25),
"length(selected_clusts) <= length(selected_feats) is not TRUE",
fixed=TRUE)
})## Test passed with 14 successes.
Tests for getSelectedClusters()
testthat::test_that("getSelectedClusters works", {
set.seed(26717)
x <- matrix(stats::rnorm(10*5), nrow=10, ncol=5)
y <- stats::rnorm(10)
good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)
css_res <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res <- getSelectedClusters(css_res, weighting="sparse", cutoff=0.05,
min_num_clusts=1, max_num_clusts=NA)
testthat::expect_true(is.list(res))
testthat::expect_equal(length(res), 3)
testthat::expect_identical(names(res), c("selected_clusts", "selected_feats",
"weights"))
testthat::expect_true(length(res$selected_clusts) <=
length(res$selected_feats))
testthat::expect_true(is.numeric(res$selected_clusts))
testthat::expect_true(length(res$selected_clusts) >= 1)
testthat::expect_equal(length(names(res$selected_clusts)),
length(res$selected_clusts))
testthat::expect_equal(length(names(res$selected_clusts)),
length(unique(names(res$selected_clusts))))
testthat::expect_true(all(res$selected_clusts >= 0))
testthat::expect_true(all(res$selected_clusts <= 1))
testthat::expect_true(is.integer(res$selected_feats))
testthat::expect_true(length(res$selected_feats) >= 1)
testthat::expect_equal(length(names(res$selected_feats)),
length(unique(names(res$selected_feats))))
testthat::expect_true(all(res$selected_feats >= 1))
testthat::expect_true(all(res$selected_feats <= 5))
testthat::expect_equal(length(res$selected_feats),
length(unique(res$selected_feats)))
testthat::expect_equal(length(res$selected_clusts), length(res$weights))
for(i in 1:length(res$weights)){
weights_i <- res$weights[[i]]
num_nonzero_weights <- sum(weights_i > 0)
# For "sparse" weighting, either there should only be one nonzero weight and
# it should equal 1 (if there were no ties in selection proportions among
# cluster members) or the nonzero weights should all be
# 1/num_nonzero_weights
testthat::expect_true(all(weights_i[weights_i > 0] == 1/num_nonzero_weights))
}
# weighted_avg
res_weighted <- getSelectedClusters(css_res, weighting="weighted_avg",
cutoff=0.05, min_num_clusts=1,
max_num_clusts=NA)
testthat::expect_equal(length(res_weighted$selected_clusts),
length(res_weighted$weights))
for(i in 1:length(res_weighted$weights)){
weights_i <- res_weighted$weights[[i]]
testthat::expect_true(all(weights_i >= 0))
testthat::expect_true(all(weights_i <= 1))
}
# simple_avg
res_simple <- getSelectedClusters(css_res, weighting="simple_avg",
cutoff=0.05, min_num_clusts=1,
max_num_clusts=NA)
testthat::expect_equal(length(res_simple$selected_clusts),
length(res_simple$weights))
for(i in 1:length(res_simple$weights)){
weights_i <- res_simple$weights[[i]]
testthat::expect_equal(length(unique(weights_i)), 1)
testthat::expect_equal(length(weights_i), sum(weights_i > 0))
}
# Test min_num_clusts
res2 <- getSelectedClusters(css_res, weighting="weighted_avg", cutoff=1,
min_num_clusts=3, max_num_clusts=NA)
testthat::expect_true(is.list(res2))
testthat::expect_equal(length(res2$selected_clusts), 3)
res3 <- getSelectedClusters(css_res, weighting="sparse", cutoff=1,
min_num_clusts=2, max_num_clusts=NA)
testthat::expect_true(length(res3$selected_clusts) >= 2)
# Regression test (#10): the cutoff is adjusted by repeated +/- 1/B in the
# min/max loops, accumulating floating-point error, so a cluster sitting
# exactly at the threshold could be dropped -- breaking the max_num_clusts
# loop early and returning MORE clusters than max_num_clusts. With B=10 and
# cluster proportions (0, 0.3, 0.2), cutoff=0.1 / min=1 / max=1 must return a
# single cluster (the 0.3 one); pre-fix it returned two (0.3 was excluded by
# the float-accumulated cutoff 0.30000000000000004).
csm <- cbind(as.integer(rep(0, 10)),
as.integer(c(rep(1, 3), rep(0, 7))),
as.integer(c(rep(1, 2), rep(0, 8))))
clus_mat <- csm
colnames(clus_mat) <- c("c1", "c2", "c3")
feat_mat <- csm
colnames(feat_mat) <- c("f1", "f2", "f3")
mock_css <- list(feat_sel_mat=feat_mat, clus_sel_mat=clus_mat,
X=matrix(stats::rnorm(30), nrow=10, ncol=3,
dimnames=list(NULL, c("f1", "f2", "f3"))),
y=stats::rnorm(10),
clusters=list(c1=1L, c2=2L, c3=3L),
train_inds=integer())
class(mock_css) <- "cssr"
res_tie <- getSelectedClusters(mock_css, weighting="simple_avg", cutoff=0.1,
min_num_clusts=1, max_num_clusts=1)
testthat::expect_true(length(res_tie$selected_clusts) <= 1)
testthat::expect_identical(names(res_tie$selected_clusts), "c2")
# Test max_num_clusts
# Ensure there is at least one relevant feature
x2 <- x
x2[, 5] <- y
css_res2 <- css(X=x2, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res4 <- getSelectedClusters(css_res2, weighting="simple_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=1)
testthat::expect_true(is.list(res4))
testthat::expect_equal(length(res4$selected_clusts), 1)
res5 <- getSelectedClusters(css_res, weighting="weighted_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=2)
testthat::expect_true(length(res5$selected_clusts) <= 2)
# Name features
colnames(x) <- LETTERS[1:ncol(x)]
css_res3 <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res <- getSelectedClusters(css_res3, weighting="sparse", cutoff=0.05,
min_num_clusts=1, max_num_clusts=NA)
testthat::expect_true(is.list(res))
testthat::expect_equal(length(res), 3)
testthat::expect_identical(names(res), c("selected_clusts", "selected_feats",
"weights"))
testthat::expect_true(length(res$selected_clusts) <=
length(res$selected_feats))
testthat::expect_true(is.numeric(res$selected_clusts))
testthat::expect_true(length(res$selected_clusts) >= 1)
testthat::expect_equal(length(names(res$selected_clusts)),
length(res$selected_clusts))
testthat::expect_equal(length(names(res$selected_clusts)),
length(unique(names(res$selected_clusts))))
testthat::expect_true(all(res$selected_clusts >= 0))
testthat::expect_true(all(res$selected_clusts <= 1))
testthat::expect_true(is.integer(res$selected_feats))
testthat::expect_true(length(res$selected_feats) >= 1)
testthat::expect_equal(length(names(res$selected_feats)),
length(unique(names(res$selected_feats))))
testthat::expect_true(all(res$selected_feats >= 1))
testthat::expect_true(all(res$selected_feats <= 5))
testthat::expect_equal(length(res$selected_feats),
length(unique(res$selected_feats)))
testthat::expect_equal(length(names(res$selected_feats)),
length(res$selected_feats))
testthat::expect_equal(length(names(res$selected_feats)),
length(unique(names(res$selected_feats))))
})
testthat::test_that("getSelectedClusters does not hang when min_num_clusts > n_clusters (#69)", {
# Minimal cssr object (same pattern as the #42 test) with 2 clusters.
# min_num_clusts = 5 exceeds the 2 available clusters: pre-#69 the min-loop
# decrements the cutoff forever; post-#69 the guard breaks once all clusters
# are selected, so checkSelectedClusters warns and the function returns them.
B <- 10
clusters <- list(c1 = 1:2, c2 = 3:4)
clus_sel_mat <- cbind(c1 = rep(1, B), c2 = c(rep(1, B - 1), 0))
feat_sel_mat <- cbind(X1 = rep(1, B), X2 = rep(1, B),
X3 = c(rep(1, B - 1), 0), X4 = c(rep(1, B - 1), 0))
obj <- structure(list(feat_sel_mat = feat_sel_mat, clus_sel_mat = clus_sel_mat,
clusters = clusters), class = "cssr")
testthat::expect_warning(
res <- getSelectedClusters(obj, weighting = "simple_avg", cutoff = 0,
min_num_clusts = 5L, max_num_clusts = NA),
"Returning fewer than min_num_clusts", fixed = TRUE)
testthat::expect_equal(length(res$selected_clusts), 2L) # both clusters, no hang
})
testthat::test_that("getSelectedClusters max_num_clusts handles proportion-1.0 ties (#42)", {
# Minimal cssr object: cluster c1 at selection proportion 1.0, cluster c2 at
# (B - 1)/B (just below 1.0). With max_num_clusts = 1 the cutoff loop must
# raise the threshold to 1.0 to drop c2 and keep only the proportion-1.0
# cluster. The cutoff accumulates +1/B, and for these B the cumulative sum
# floats just above 1 (e.g. B = 9: 1.0000000000000002), so the old
# `if(cutoff > 1) break` fired before the cutoff == 1 filter ran and wrongly
# kept c2 (returning 2 clusters for max_num_clusts = 1). The `+ tol` guard
# fixes this.
make_obj <- function(B){
clusters <- list(c1 = 1:2, c2 = 3:4)
clus_sel_mat <- cbind(c1 = rep(1, B), c2 = c(rep(1, B - 1), 0))
feat_sel_mat <- cbind(X1 = rep(1, B), X2 = rep(1, B),
X3 = c(rep(1, B - 1), 0), X4 = c(rep(1, B - 1), 0))
obj <- list(feat_sel_mat = feat_sel_mat, clus_sel_mat = clus_sel_mat,
clusters = clusters)
class(obj) <- "cssr"
obj
}
for(B in c(9, 11, 20)){
res <- getSelectedClusters(make_obj(B), weighting = "simple_avg", cutoff = 0,
min_num_clusts = 1, max_num_clusts = 1)
# Only the proportion-1.0 cluster c1 survives; c2 at (B-1)/B is below 1.
testthat::expect_identical(names(res$selected_clusts), "c1")
testthat::expect_equal(unname(res$selected_clusts), 1)
}
})## Test passed with 60 successes.
## Test passed with 2 successes.
## Test passed with 6 successes.
Finally, tests for getCssSelections()
testthat::test_that("getCssSelections works", {
set.seed(26717)
x <- matrix(stats::rnorm(10*7), nrow=10, ncol=7)
y <- stats::rnorm(10)
good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)
css_res <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res <- getCssSelections(css_res)
testthat::expect_true(is.list(res))
testthat::expect_equal(length(res), 3)
testthat::expect_identical(names(res), c("selected_clusts", "selected_feats",
"weights"))
testthat::expect_true(length(res$selected_clusts) <=
length(res$selected_feats))
testthat::expect_true(is.list(res$selected_clusts))
testthat::expect_equal(length(names(res$selected_clusts)),
length(res$selected_clusts))
testthat::expect_equal(length(names(res$selected_clusts)),
length(unique(names(res$selected_clusts))))
already_used_feats <- integer()
for(i in 1:length(res$selected_clusts)){
sels_i <- res$selected_clusts[[i]]
testthat::expect_true(length(sels_i) >= 1)
testthat::expect_true(is.integer(sels_i))
testthat::expect_true(all(sels_i %in% 1:11))
testthat::expect_equal(length(sels_i), length(unique(sels_i)))
testthat::expect_equal(length(intersect(already_used_feats, sels_i)), 0)
already_used_feats <- c(already_used_feats, sels_i)
}
testthat::expect_true(length(already_used_feats) <= 11)
testthat::expect_equal(length(already_used_feats),
length(unique(already_used_feats)))
testthat::expect_true(all(already_used_feats %in% 1:11))
testthat::expect_true(is.integer(res$selected_feats))
testthat::expect_true(length(res$selected_feats) >= 1)
testthat::expect_equal(length(names(res$selected_feats)),
length(unique(names(res$selected_feats))))
testthat::expect_true(all(res$selected_feats >= 1))
testthat::expect_true(all(res$selected_feats <= 7))
testthat::expect_equal(length(res$selected_feats),
length(unique(res$selected_feats)))
testthat::expect_equal(length(res$selected_clusts), length(res$weights))
for(i in 1:length(res$weights)){
weights_i <- res$weights[[i]]
num_nonzero_weights <- sum(weights_i > 0)
# For "sparse" weighting, either there should only be one nonzero weight and
# it should equal 1 (if there were no ties in selection proportions among
# cluster members) or the nonzero weights should all be
# 1/num_nonzero_weights
testthat::expect_true(all(weights_i[weights_i > 0] == 1/num_nonzero_weights))
}
# Test min_num_clusts (should be 5 clusters--3 named ones, plus last two get
# put in their own unnamed clusters automatically by css)
res2 <- getCssSelections(css_res, weighting="weighted_avg", cutoff=1,
min_num_clusts=5, max_num_clusts=NA)
testthat::expect_true(is.list(res2))
testthat::expect_equal(length(res2$selected_clusts), 5)
res3 <- getCssSelections(css_res, weighting="sparse", cutoff=1,
min_num_clusts=3, max_num_clusts=NA)
testthat::expect_true(length(res3$selected_clusts) >= 3)
# Test max_num_clusts
# Ensure there is at least one relevant feature
x2 <- x
x2[, 5] <- y
css_res2 <- css(X=x2, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res4 <- getCssSelections(css_res2, weighting="simple_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=1)
testthat::expect_true(is.list(res4))
testthat::expect_equal(length(res4$selected_clusts), 1)
res5 <- getCssSelections(css_res, weighting="weighted_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=2)
testthat::expect_true(length(res5$selected_clusts) <= 2)
# Name features
colnames(x) <- LETTERS[1:ncol(x)]
css_res3 <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)
res <- getCssSelections(css_res3, weighting="sparse", cutoff=0.05,
min_num_clusts=1, max_num_clusts=NA)
testthat::expect_true(is.list(res))
testthat::expect_equal(length(res), 3)
testthat::expect_identical(names(res), c("selected_clusts", "selected_feats",
"weights"))
testthat::expect_true(length(res$selected_clusts) <=
length(res$selected_feats))
testthat::expect_equal(length(names(res$selected_feats)),
length(res$selected_feats))
testthat::expect_equal(length(names(res$selected_feats)),
length(unique(names(res$selected_feats))))
# Bad inputs
# Error has quotation marks in it
testthat::expect_error(getCssSelections("css_results"))
testthat::expect_error(getCssSelections(css_res, weighting="spasre"),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(getCssSelections(css_res, cutoff=-.5),
"cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssSelections(css_res, min_num_clusts=-1),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssSelections(css_res, max_num_clusts=50),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
testthat::expect_error(getCssSelections(css_res, max_num_clusts=4.5),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
})## Test passed with 65 successes.
Test for min_num_clusts = 0 (empty threshold selection, issue #107):
testthat::test_that("min_num_clusts = 0 allows an empty threshold selection", {
# Deterministic empty-producing fixture: a fixed seed, all-singleton clusters
# (clusters = list(), so every feature is its own cluster), and a large lambda
# so the base lasso selects almost nothing. We assert that the maximum cluster
# selection proportion is below 1 - 1/(2*B) so that cutoff = 1 genuinely
# selects no cluster (otherwise the test would not exercise the empty path).
set.seed(8)
B <- 10
x <- matrix(stats::rnorm(30*8), nrow=30, ncol=8)
y <- stats::rnorm(30)
css_res <- css(X=x, y=y, lambda=0.5, clusters=list(), B=B)
testthat::expect_true(max(colMeans(css_res$clus_sel_mat)) < 1 - 1/(2*B))
# Selection path: getCssSelections returns a clean empty result. (weights is a
# NAMED empty list, so assert via length/type rather than expect_equal(.,list()).)
empty_sel <- getCssSelections(css_res, cutoff=1, min_num_clusts=0)
testthat::expect_length(empty_sel$selected_clusts, 0)
testthat::expect_true(is.list(empty_sel$selected_clusts))
testthat::expect_length(empty_sel$selected_feats, 0)
testthat::expect_true(is.integer(empty_sel$selected_feats))
testthat::expect_length(empty_sel$weights, 0)
testthat::expect_true(is.list(empty_sel$weights))
# Design / prediction paths: one clear error (no design/predictions possible).
testthat::expect_error(getCssDesign(css_res, newX=x, cutoff=1,
min_num_clusts=0),
"No clusters were selected", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x, trainX=x, trainY=y,
cutoff=1, min_num_clusts=0),
"No clusters were selected", fixed=TRUE)
# Legacy regression: the default min_num_clusts = 1 still forces >= 1 cluster
# on the SAME fixture (the floor of 1 is unchanged).
legacy_sel <- getCssSelections(css_res, cutoff=1, min_num_clusts=1)
testthat::expect_equal(length(legacy_sel$selected_clusts), 1)
})## Test passed with 10 successes.
Tests for checkXInputResults()
testthat::test_that("checkXInputResults works", {
set.seed(72617)
x_select <- matrix(stats::rnorm(10*5), nrow=10, ncol=5)
x_new <- matrix(stats::rnorm(8*5), nrow=8, ncol=5)
y_select <- stats::rnorm(10)
y_new <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- checkXInputResults(x_new, css_res$X)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("feat_names", "newx"))
testthat::expect_true(is.character(res$feat_names))
testthat::expect_true(is.na(res$feat_names))
testthat::expect_true(is.numeric(res$newx))
testthat::expect_true(is.matrix(res$newx))
testthat::expect_equal(nrow(res$newx), 8)
testthat::expect_equal(ncol(res$newx), 5)
testthat::expect_null(colnames(res$newx))
# Try naming variables
colnames(x_select) <- LETTERS[1:5]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(checkXInputResults(x_new, css_res_named$X),
"New X provided had no variable names (column names) even though the X provided to css did.",
fixed=TRUE)
# Try mismatching variable names
colnames(x_new) <- LETTERS[2:6]
testthat::expect_error(checkXInputResults(x_new, css_res_named$X),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_new) <- LETTERS[1:5]
res_named <- checkXInputResults(x_new, css_res_named$X)
testthat::expect_true(is.list(res_named))
testthat::expect_identical(names(res_named), c("feat_names", "newx"))
testthat::expect_true(is.character(res_named$feat_names))
testthat::expect_identical(res_named$feat_names, LETTERS[1:5])
# Try data.frame input to css and checkXInputResults
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/2)
fit_inds <- setdiff(1:n, selec_inds)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkXInputResults(X_df[fit_inds, ], css_res_df$X)
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("feat_names", "newx"))
testthat::expect_true(is.character(res_df$feat_names))
testthat::expect_identical(res_df$feat_names, colnames(css_res_df$X))
testthat::expect_identical(res_df$feat_names, colnames(X_df))
testthat::expect_true(is.numeric(res_df$newx))
testthat::expect_true(is.matrix(res_df$newx))
testthat::expect_null(colnames(res_df$newx))
testthat::expect_equal(ncol(res_df$newx), ncol(css_res_df$X))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
# cyl, gear, and carb are factors with more than 2 levels
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkXInputResults(X_df[fit_inds, ], css_res_df$X)
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("feat_names", "newx"))
testthat::expect_true(is.character(res_df$feat_names))
testthat::expect_identical(res_df$feat_names, colnames(css_res_df$X))
mat <- model.matrix( ~., X_df)
mat <- mat[, colnames(mat) != "(Intercept)"]
testthat::expect_identical(res_df$feat_names, colnames(mat))
testthat::expect_true(is.numeric(res_df$newx))
testthat::expect_true(is.matrix(res_df$newx))
testthat::expect_null(colnames(res_df$newx))
testthat::expect_equal(ncol(res_df$newx), ncol(css_res_df$X))
})## Test passed with 33 successes.
Tests for checkNewXProvided()
testthat::test_that("checkNewXProvided works", {
set.seed(2673)
x_select <- matrix(stats::rnorm(10*5), nrow=10, ncol=5)
x_new <- matrix(stats::rnorm(8*5), nrow=8, ncol=5)
y_select <- stats::rnorm(10)
y_new <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- checkNewXProvided(x_new, css_res)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newX", "newXProvided"))
testthat::expect_true(is.numeric(res$newX))
testthat::expect_true(is.matrix(res$newX))
testthat::expect_equal(nrow(res$newX), 8)
testthat::expect_equal(ncol(res$newX), 5)
testthat::expect_null(colnames(res$newX))
testthat::expect_true(is.logical(res$newXProvided))
testthat::expect_equal(length(res$newXProvided), 1)
testthat::expect_true(!is.na(res$newXProvided))
testthat::expect_true(res$newXProvided)
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- checkNewXProvided(x_new, css_res_train)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newX", "newXProvided"))
testthat::expect_true(all(abs(x_new - res$newX) < 10^(-9)))
testthat::expect_true(res$newXProvided)
# Things should still work if new x is not provided
res <- checkNewXProvided(NA, css_res_train)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newX", "newXProvided"))
testthat::expect_true(is.numeric(res$newX))
testthat::expect_true(is.matrix(res$newX))
testthat::expect_equal(nrow(res$newX), 5)
testthat::expect_equal(ncol(res$newX), 5)
testthat::expect_null(colnames(res$newX))
testthat::expect_false(res$newXProvided)
# An NA-containing newX was previously read as "not provided" (the
# all(!is.na) sentinel) and silently replaced by the train_inds data; it is
# now treated as provided and rejected by checkNoNAs (#71).
x_na <- x_new
x_na[3, 2] <- NA
testthat::expect_error(checkNewXProvided(x_na, css_res_train),
"must not contain missing", fixed = TRUE)
# Try not providing training indices and omitting newx--should get error
testthat::expect_error(checkNewXProvided(NA, css_res),
"css was not provided with indices to set aside for model training (train_inds), so must provide new X in order to generate a design matrix", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:5]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(checkNewXProvided(x_new, css_res_named),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_new) <- LETTERS[2:6]
testthat::expect_error(checkNewXProvided(x_new, css_res_named),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_new) <- LETTERS[1:5]
res_named <- checkNewXProvided(x_new, css_res_named)
testthat::expect_true(is.list(res_named))
testthat::expect_identical(names(res_named), c("newX", "newXProvided"))
testthat::expect_true(all(abs(x_new - res_named$newX) < 10^(-9)))
testthat::expect_true(res_named$newXProvided)
# Try data.frame input to css and checkNewXProvided
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/2)
fit_inds <- setdiff(1:n, selec_inds)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkNewXProvided(X_df[fit_inds, ], css_res_df)
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("newX", "newXProvided"))
testthat::expect_true(is.numeric(res_df$newX))
testthat::expect_true(is.matrix(res_df$newX))
testthat::expect_equal(nrow(res_df$newX), length(fit_inds))
testthat::expect_equal(ncol(res_df$newX), ncol(css_res_df$X))
testthat::expect_null(colnames(res_df$newX))
testthat::expect_true(is.logical(res_df$newXProvided))
testthat::expect_equal(length(res_df$newXProvided), 1)
testthat::expect_true(!is.na(res_df$newXProvided))
testthat::expect_true(res_df$newXProvided)
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkNewXProvided(X_df[fit_inds, ], css_res_df)
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("newX", "newXProvided"))
testthat::expect_true(is.numeric(res_df$newX))
testthat::expect_true(is.matrix(res_df$newX))
testthat::expect_equal(nrow(res_df$newX), length(fit_inds))
testthat::expect_equal(ncol(res_df$newX), ncol(css_res_df$X))
testthat::expect_null(colnames(res_df$newX))
testthat::expect_true(is.logical(res_df$newXProvided))
testthat::expect_equal(length(res_df$newXProvided), 1)
testthat::expect_true(!is.na(res_df$newXProvided))
testthat::expect_true(res_df$newXProvided)
# (#104, L5) Dispatch on the NA-sentinel default, not a length() heuristic.
# css_res_train has train_inds, so the OLD length()-based code silently fell
# back to that training data for these misclassified inputs; they must now be
# treated as "provided".
# A bare numeric vector (length > 1) is not a valid design matrix and now
# errors clearly instead of failing later on a cryptic is.matrix() stopifnot.
testthat::expect_error(
checkNewXProvided(stats::rnorm(5), css_res_train),
"newX must be a matrix or data.frame", fixed=TRUE)
# A one-column data.frame (length 1) used to be misread as "not provided" and
# silently replaced by the train_inds data; it now enters the provided branch
# and errors (here on a feature-name/column mismatch) rather than falling back.
testthat::expect_error(
checkNewXProvided(data.frame(a=stats::rnorm(8)), css_res_train))
})## Test passed with 55 successes.
Tests for checkFormCssDesignInputs()
testthat::test_that("checkFormCssDesignInputs works", {
set.seed(72617)
x_select <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
x_new <- matrix(stats::rnorm(8*6), nrow=8, ncol=6)
y_select <- stats::rnorm(10)
y_new <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- checkFormCssDesignInputs(css_results=css_res, weighting="sparse",
cutoff=0.5, min_num_clusts=1,
max_num_clusts=NA, newx=x_new)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res$newx))
testthat::expect_true(is.matrix(res$newx))
testthat::expect_equal(nrow(res$newx), 8)
testthat::expect_equal(ncol(res$newx), 6)
testthat::expect_null(colnames(res$newx))
testthat::expect_true(all(abs(x_new - res$newX) < 10^(-9)))
testthat::expect_equal(length(res$max_num_clusts), 1)
testthat::expect_true(is.na(res$max_num_clusts))
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B=10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- checkFormCssDesignInputs(css_results=css_res_train,
weighting="weighted_avg", cutoff=0,
min_num_clusts=2, max_num_clusts=NA,
newx=x_new)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res$newx))
testthat::expect_true(is.matrix(res$newx))
testthat::expect_equal(nrow(res$newx), 8)
testthat::expect_equal(ncol(res$newx), 6)
testthat::expect_null(colnames(res$newx))
testthat::expect_true(all(abs(x_new - res$newX) < 10^(-9)))
# Things should still work if new x is not provided
res <- checkFormCssDesignInputs(css_results=css_res_train, weighting="sparse",
cutoff=1, min_num_clusts=3,
max_num_clusts=NA, newx=NA)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res$newx))
testthat::expect_true(is.matrix(res$newx))
testthat::expect_equal(nrow(res$newx), length(6:10))
testthat::expect_equal(ncol(res$newx), 6)
testthat::expect_null(colnames(res$newx))
testthat::expect_true(all(abs(x_select[1:5, ] - res$newX) < 10^(-9)))
# Try not providing training indices and omitting newx--should get error
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="sparse",
cutoff=0.5, min_num_clusts=1,
max_num_clusts=5, newx=NA),
"If css was not provided with indices to set aside for model training, then newx must be provided to formCssDesign", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:6]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(checkFormCssDesignInputs(css_results=css_res_named,
weighting="simple_avg",
cutoff=0.9,
min_num_clusts=1,
max_num_clusts=3,
newx=x_new),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_new) <- LETTERS[2:7]
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res_named,
weighting="weighted_avg",
cutoff=0.2, min_num_clusts=1,
max_num_clusts=1,
newx=x_new),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_new) <- LETTERS[1:6]
res_named <- checkFormCssDesignInputs(css_results=css_res_named,
weighting="sparse", cutoff=0.5,
min_num_clusts=2, max_num_clusts=NA,
newx=x_new)
testthat::expect_true(is.list(res_named))
testthat::expect_identical(names(res_named), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res_named$newx))
testthat::expect_true(is.matrix(res_named$newx))
testthat::expect_equal(nrow(res_named$newx), 8)
testthat::expect_equal(ncol(res_named$newx), 6)
testthat::expect_null(colnames(res_named$newx))
testthat::expect_identical(colnames(css_res_named$X), LETTERS[1:6])
testthat::expect_true(all(abs(x_new - res_named$newX) < 10^(-9)))
# Try data.frame input to css and checkFormCssDesignInputs
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/2)
fit_inds <- setdiff(1:n, selec_inds)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkFormCssDesignInputs(css_results=css_res_df,
weighting="simple_avg", cutoff=0.7,
min_num_clusts=3, max_num_clusts=NA,
newx=X_df[fit_inds, ])
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res_df$newx))
testthat::expect_true(is.matrix(res_df$newx))
testthat::expect_null(colnames(res_df$newx))
testthat::expect_equal(nrow(res_df$newx), length(fit_inds))
testthat::expect_equal(ncol(res_df$newx), ncol(css_res_df$X))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkFormCssDesignInputs(css_results=css_res_df,
weighting="weighted_avg", cutoff=0.3,
min_num_clusts=1, max_num_clusts=4,
newx=X_df[fit_inds, ])
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("newx", "max_num_clusts"))
testthat::expect_true(is.numeric(res_df$newx))
testthat::expect_true(is.matrix(res_df$newx))
testthat::expect_null(colnames(res_df$newx))
testthat::expect_equal(nrow(res_df$newx), length(fit_inds))
testthat::expect_equal(ncol(res_df$newx), ncol(css_res_df$X))
##### Try other bad inputs
colnames(x_new) <- NULL
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=-0.3, min_num_clusts=1,
max_num_clusts=4,
newx=x_new),
"cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="sparse",
cutoff="0.5",
min_num_clusts=1,
max_num_clusts=NA, newx=x_new),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="sparse",
cutoff=as.numeric(NA),
min_num_clusts=1,
max_num_clusts=NA,
newx=x_new),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting=c("sparse",
"simple_avg"),
cutoff=0.2,
min_num_clusts=1,
max_num_clusts=NA,
newx=x_new),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting=1,
cutoff=0.2,
min_num_clusts=1,
max_num_clusts=NA,
newx=x_new),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="spasre",
cutoff=0.2,
min_num_clusts=1,
max_num_clusts=NA,
newx=x_new),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=c(1, 2),
max_num_clusts=NA,
newx=x_new),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts="3",
max_num_clusts=NA,
newx=x_new),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=-1,
max_num_clusts=NA,
newx=x_new),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=6,
max_num_clusts=NA,
newx=x_new),
"min_num_clusts <= n_clusters is not TRUE", fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=1,
max_num_clusts="4",
newx=x_new),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=1,
max_num_clusts=3.5,
newx=x_new),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=2,
max_num_clusts=1,
newx=x_new),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(checkFormCssDesignInputs(css_results=css_res,
weighting="weighted_avg",
cutoff=0.2,
min_num_clusts=2,
max_num_clusts=8,
newx=x_new),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
})## Test passed with 66 successes.
Tests for formCssDesign()
testthat::test_that("formCssDesign works", {
set.seed(17230)
x_select <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
x_new <- matrix(stats::rnorm(8*6), nrow=8, ncol=6)
y_select <- stats::rnorm(10)
y_new <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- formCssDesign(css_res, newx=x_new)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 8)
testthat::expect_equal(ncol(res), length(css_res$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res$clusters)))
testthat::expect_true(all(names(css_res$clusters) %in% colnames(res)))
# Value-level checks: pin every entry of the design matrix to a hand-computed
# value across all three weighting schemes (#101). The structural checks above
# would pass for a wrong-but-well-shaped design (column misalignment,
# transposed/reversed weights, or one cluster's weights applied to another).
feat_sel_props <- colMeans(css_res$feat_sel_mat)
for (w in c("simple_avg", "weighted_avg", "sparse")) {
des <- formCssDesign(css_res, weighting = w, cutoff = 0, min_num_clusts = 1, newx = x_new)
wts <- getSelectedClusters(css_res, weighting = w, cutoff = 0,
min_num_clusts = 1, max_num_clusts = NA)$weights
# exported-path equivalence (getCssDesign just forwards to formCssDesign)
testthat::expect_equal(
getCssDesign(css_res, newX = x_new, weighting = w, cutoff = 0, min_num_clusts = 1), des)
for (name in names(wts)) {
clust <- css_res$clusters[[name]]
w_i <- wts[[name]]
sp <- feat_sel_props[clust]
# (1) APPLICATION: weights applied to the correct columns, correct orientation, correct cluster
testthat::expect_equal(des[, name],
as.numeric(x_new[, clust, drop = FALSE] %*% w_i), info = paste(w, name))
# (2) SEMANTIC anchor per scheme (independent of getSelectedClusters$weights)
if (w == "simple_avg") {
testthat::expect_equal(des[, name], rowMeans(x_new[, clust, drop = FALSE]), info = name)
} else if (w == "weighted_avg") {
if (sum(sp) > 0) {
testthat::expect_equal(unname(w_i), unname(sp / sum(sp)), info = name)
} else {
testthat::expect_equal(unname(w_i), rep(1 / length(clust), length(clust)), info = name)
}
} else { # sparse: equal weight on the member(s) tied at max(sp); else equal weights
if (sum(sp) > 0) {
maxes <- sp == max(sp)
testthat::expect_equal(des[, name],
as.numeric(rowMeans(x_new[, clust[maxes], drop = FALSE])), info = name)
} else {
testthat::expect_equal(des[, name], rowMeans(x_new[, clust, drop = FALSE]), info = name)
}
}
}
}
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B=10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- formCssDesign(css_results=css_res_train, weighting="weighted_avg",
cutoff=0, min_num_clusts=2, max_num_clusts=NA,
newx=x_new)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 8)
testthat::expect_equal(ncol(res), length(css_res_train$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res_train$clusters)))
testthat::expect_true(all(names(css_res_train$clusters) %in% colnames(res)))
# Things should still work if new x is not provided
res <- formCssDesign(css_results=css_res_train, weighting="weighted_avg",
cutoff=0, min_num_clusts=2, max_num_clusts=NA)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 5)
testthat::expect_equal(ncol(res), length(css_res_train$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res_train$clusters)))
testthat::expect_true(all(names(css_res_train$clusters) %in% colnames(res)))
# Try not providing training indices and omitting newx--should get error
testthat::expect_error(formCssDesign(css_results=css_res, weighting="sparse",
cutoff=0.5, min_num_clusts=1,
max_num_clusts=5, newx=NA),
"If css was not provided with indices to set aside for model training, then newx must be provided to formCssDesign", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:6]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(formCssDesign(css_results=css_res_named,
weighting="simple_avg", cutoff=0.9,
min_num_clusts=1, max_num_clusts=3,
newx=x_new),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_new) <- LETTERS[2:7]
testthat::expect_error(formCssDesign(css_results=css_res_named,
weighting="weighted_avg", cutoff=0.2,
min_num_clusts=1, max_num_clusts=1,
newx=x_new),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_new) <- LETTERS[1:6]
res_named <- formCssDesign(css_results=css_res_named,
weighting="sparse", cutoff=0.5,
min_num_clusts=2, max_num_clusts=NA,
newx=x_new)
testthat::expect_true(is.matrix(res_named))
testthat::expect_true(is.numeric(res_named))
testthat::expect_equal(nrow(res_named), 8)
testthat::expect_true(ncol(res_named) <= length(css_res_named$clusters))
testthat::expect_true(all(colnames(res_named) %in% names(css_res_named$clusters)))
# Try data.frame input to css and formCssDesign
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/2)
fit_inds <- setdiff(1:n, selec_inds)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- formCssDesign(css_results=css_res_df, weighting="simple_avg",
cutoff=0.7, min_num_clusts=3, max_num_clusts=NA,
newx=X_df[fit_inds, ])
testthat::expect_true(is.matrix(res_df))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(nrow(res_df), length(fit_inds))
testthat::expect_true(ncol(res_df) <= length(css_res_df$clusters))
testthat::expect_true(all(colnames(res_df) %in% names(css_res_df$clusters)))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- formCssDesign(css_results=css_res_df, weighting="weighted_avg",
cutoff=0.3, min_num_clusts=1, max_num_clusts=4,
newx=X_df[fit_inds, ])
testthat::expect_true(is.matrix(res_df))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(nrow(res_df), length(fit_inds))
testthat::expect_true(ncol(res_df) <= length(css_res_df$clusters))
testthat::expect_true(all(colnames(res_df) %in% names(css_res_df$clusters)))
##### Try other bad inputs
colnames(x_new) <- NULL
testthat::expect_error(formCssDesign(css_results=css_res, cutoff=-0.3,
newx=x_new), "cutoff >= 0 is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, cutoff="0.5",
newx=x_new),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res,
cutoff=as.numeric(NA), newx=x_new),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res,
weighting=c("sparse", "simple_avg"),
newx=x_new),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, weighting=1,
newx=x_new),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, weighting="spasre",
newx=x_new),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res,
min_num_clusts=c(1, 2), newx=x_new),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, min_num_clusts="3",
newx=x_new),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, min_num_clusts=-1,
newx=x_new),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, min_num_clusts=6,
newx=x_new),
"min_num_clusts <= n_clusters is not TRUE", fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, max_num_clusts="4",
newx=x_new),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, max_num_clusts=3.5,
newx=x_new),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, min_num_clusts=2,
max_num_clusts=1, newx=x_new),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(formCssDesign(css_results=css_res, max_num_clusts=8,
newx=x_new),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
})## Test passed with 77 successes.
Finally, tests for getCssDesign()
testthat::test_that("getCssDesign works", {
set.seed(23170)
x_select <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
x_new <- matrix(stats::rnorm(8*6), nrow=8, ncol=6)
y_select <- stats::rnorm(10)
y_new <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- getCssDesign(css_res, newX=x_new)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 8)
testthat::expect_equal(ncol(res), length(css_res$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res$clusters)))
testthat::expect_true(all(names(css_res$clusters) %in% colnames(res)))
# A single-row newX is accepted (#44): getCssDesign used to require > 1 row,
# while getCssPreds/cssPredict already accept a 1-row test set.
res_1row <- getCssDesign(css_res, newX = x_new[1, , drop = FALSE])
testthat::expect_true(is.matrix(res_1row))
testthat::expect_equal(nrow(res_1row), 1)
testthat::expect_equal(ncol(res_1row), length(css_res$clusters))
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B=10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- getCssDesign(css_results=css_res_train, weighting="weighted_avg",
min_num_clusts=2, newX=x_new)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 8)
testthat::expect_equal(ncol(res), length(css_res_train$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res_train$clusters)))
testthat::expect_true(all(names(css_res_train$clusters) %in% colnames(res)))
# Things should still work if new x is not provided
res <- getCssDesign(css_results=css_res_train, min_num_clusts=2)
testthat::expect_true(is.matrix(res))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(nrow(res), 5)
testthat::expect_equal(ncol(res), length(css_res_train$clusters))
testthat::expect_true(all(colnames(res) %in% names(css_res_train$clusters)))
testthat::expect_true(all(names(css_res_train$clusters) %in% colnames(res)))
# Try not providing training indices and omitting newX--should get error
testthat::expect_error(getCssDesign(css_results=css_res, weighting="sparse",
cutoff=0.5, min_num_clusts=1,
max_num_clusts=5, newX=NA),
"css was not provided with indices to set aside for model training (train_inds), so must provide new X in order to generate a design matrix", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:6]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(getCssDesign(css_results=css_res_named,
weighting="simple_avg", cutoff=0.9,
min_num_clusts=1, max_num_clusts=3,
newX=x_new),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_new) <- LETTERS[2:7]
testthat::expect_error(getCssDesign(css_results=css_res_named,
weighting="weighted_avg", cutoff=0.2,
min_num_clusts=1, max_num_clusts=1,
newX=x_new),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_new) <- LETTERS[1:6]
res_named <- getCssDesign(css_results=css_res_named, weighting="sparse",
cutoff=0.5, min_num_clusts=2, max_num_clusts=NA,
newX=x_new)
testthat::expect_true(is.matrix(res_named))
testthat::expect_true(is.numeric(res_named))
testthat::expect_equal(nrow(res_named), 8)
testthat::expect_true(ncol(res_named) <= length(css_res_named$clusters))
testthat::expect_true(all(colnames(res_named) %in% names(css_res_named$clusters)))
# Try data.frame input to css and getCssDesign
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/2)
fit_inds <- setdiff(1:n, selec_inds)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- getCssDesign(css_results=css_res_df, weighting="simple_avg",
cutoff=0.7, min_num_clusts=3, max_num_clusts=NA,
newX=X_df[fit_inds, ])
testthat::expect_true(is.matrix(res_df))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(nrow(res_df), length(fit_inds))
testthat::expect_true(ncol(res_df) <= length(css_res_df$clusters))
testthat::expect_true(all(colnames(res_df) %in% names(css_res_df$clusters)))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- getCssDesign(css_results=css_res_df, weighting="weighted_avg",
cutoff=0.3, min_num_clusts=1, max_num_clusts=4,
newX=X_df[fit_inds, ])
testthat::expect_true(is.matrix(res_df))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(nrow(res_df), length(fit_inds))
testthat::expect_true(ncol(res_df) <= length(css_res_df$clusters))
testthat::expect_true(all(colnames(res_df) %in% names(css_res_df$clusters)))
##### Try other bad inputs
colnames(x_new) <- NULL
testthat::expect_error(getCssDesign(css_results=css_res, cutoff=-0.3,
newX=x_new), "cutoff >= 0 is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, cutoff="0.5",
newX=x_new),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res,
cutoff=as.numeric(NA), newX=x_new),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res,
weighting=c("sparse", "simple_avg"),
newX=x_new),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, weighting=1,
newX=x_new),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, weighting="spasre",
newX=x_new),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res,
min_num_clusts=c(1, 2), newX=x_new),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, min_num_clusts="3",
newX=x_new),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, min_num_clusts=-1,
newX=x_new),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, min_num_clusts=6,
newX=x_new),
"min_num_clusts <= n_clusters is not TRUE", fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, max_num_clusts="4",
newX=x_new),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, max_num_clusts=3.5,
newX=x_new),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, min_num_clusts=2,
max_num_clusts=1, newX=x_new),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssDesign(css_results=css_res, max_num_clusts=8,
newX=x_new),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
})## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) getCssDesign(...)
## 8. \-litr (local) checkXInputResults(newX, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) getCssDesign(...)
## 8. \-litr (local) formCssDesign(...)
## 9. \-litr (local) checkFormCssDesignInputs(...)
## 10. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) checkXInputResults(newX, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) formCssDesign(...)
## 3. \-litr (local) checkFormCssDesignInputs(...)
## 4. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) checkXInputResults(newX, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) formCssDesign(...)
## 3. \-litr (local) checkFormCssDesignInputs(...)
## 4. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) checkXInputResults(newX, css_results$X)
## -- Warning: getCssDesign works -------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssDesign(...)
## 2. \-litr (local) formCssDesign(...)
## 3. \-litr (local) checkFormCssDesignInputs(...)
## 4. \-litr (local) checkXInputResults(newx, css_results$X)
## Test passed with 53 successes.
Tests for checkGetCssPredsInputs()
testthat::test_that("checkGetCssPredsInputs works", {
set.seed(17081)
x_select <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
x_train <- matrix(stats::rnorm(8*6), nrow=8, ncol=6)
x_pred <- matrix(stats::rnorm(7*6), nrow=7, ncol=6)
y_select <- stats::rnorm(10)
y_train <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- checkGetCssPredsInputs(css_res, testX=x_pred, weighting="simple_avg",
cutoff=0.05, min_num_clusts=1,
max_num_clusts=NA, trainX=x_train,
trainY=y_train)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("trainXProvided", "trainX", "testX",
"feat_names", "max_num_clusts"))
testthat::expect_true(!is.na(res$trainXProvided))
testthat::expect_equal(length(res$trainXProvided), 1)
testthat::expect_true(is.logical(res$trainXProvided))
testthat::expect_true(res$trainXProvided)
testthat::expect_true(all(!is.na(res$trainX)))
testthat::expect_true(is.matrix(res$trainX))
testthat::expect_true(is.numeric(res$trainX))
testthat::expect_equal(nrow(res$trainX), 8)
testthat::expect_equal(ncol(res$trainX), 6)
testthat::expect_true(all(abs(x_train - res$trainX) < 10^(-9)))
testthat::expect_true(all(!is.na(res$testX)))
testthat::expect_true(is.matrix(res$testX))
testthat::expect_true(is.numeric(res$testX))
testthat::expect_equal(nrow(res$testX), 7)
testthat::expect_equal(ncol(res$testX), 6)
testthat::expect_true(all(abs(x_pred - res$testX) < 10^(-9)))
testthat::expect_true(is.character(res$feat_names))
testthat::expect_true(is.na(res$feat_names))
testthat::expect_true(is.na(res$max_num_clusts))
testthat::expect_true(length(res$max_num_clusts) == 1)
##### Try other bad inputs
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="weighted_avg",
cutoff=-0.5, min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff="0.3", min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff=as.numeric(NA),
min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting=c("sparse",
"simple_avg"),
cutoff=0.1,
min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting=2, cutoff=0.1,
min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="spasre", cutoff=0.1,
min_num_clusts=1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse", cutoff=0.1,
min_num_clusts=c(1, 2),
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="weighted_avg",
cutoff=0.1, min_num_clusts="2",
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="simple_avg",
cutoff=0.1, min_num_clusts=-1,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="weighted_avg",
cutoff=0.1, min_num_clusts=10,
max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"min_num_clusts <= p is not TRUE", fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="simple_avg",
cutoff=0.1, min_num_clusts=1,
max_num_clusts="5",
trainX=x_train, trainY=y_train),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff=0.1, min_num_clusts=1,
max_num_clusts=4.5,
trainX=x_train, trainY=y_train),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff=0.1, min_num_clusts=3,
max_num_clusts=2,
trainX=x_train, trainY=y_train),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff=0.1, min_num_clusts=1,
max_num_clusts=10,
trainX=x_train, trainY=y_train),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B=10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- checkGetCssPredsInputs(css_res_train, testX=x_pred,
weighting="weighted_avg",
cutoff=0, min_num_clusts=1,
max_num_clusts=NA, trainX=x_train,
trainY=y_train)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("trainXProvided", "trainX", "testX",
"feat_names", "max_num_clusts"))
testthat::expect_true(!is.na(res$trainXProvided))
testthat::expect_equal(length(res$trainXProvided), 1)
testthat::expect_true(is.logical(res$trainXProvided))
testthat::expect_true(res$trainXProvided)
testthat::expect_true(all(!is.na(res$trainX)))
testthat::expect_true(is.matrix(res$trainX))
testthat::expect_true(is.numeric(res$trainX))
testthat::expect_equal(nrow(res$trainX), 8)
testthat::expect_equal(ncol(res$trainX), 6)
testthat::expect_true(all(abs(x_train - res$trainX) < 10^(-9)))
testthat::expect_true(all(!is.na(res$testX)))
testthat::expect_true(is.matrix(res$testX))
testthat::expect_true(is.numeric(res$testX))
testthat::expect_equal(nrow(res$testX), 7)
testthat::expect_equal(ncol(res$testX), 6)
testthat::expect_true(all(abs(x_pred - res$testX) < 10^(-9)))
testthat::expect_true(is.character(res$feat_names))
testthat::expect_true(is.na(res$feat_names))
testthat::expect_true(is.na(res$max_num_clusts))
testthat::expect_true(length(res$max_num_clusts) == 1)
# Things should still work if new x is not provided
res <- checkGetCssPredsInputs(css_res_train, testX=x_pred,
weighting="weighted_avg",
cutoff=0, min_num_clusts=1,
max_num_clusts=NA, trainX=NA, trainY=NA)
testthat::expect_true(is.list(res))
testthat::expect_identical(names(res), c("trainXProvided", "trainX", "testX",
"feat_names", "max_num_clusts"))
testthat::expect_true(!is.na(res$trainXProvided))
testthat::expect_equal(length(res$trainXProvided), 1)
testthat::expect_true(is.logical(res$trainXProvided))
testthat::expect_true(!res$trainXProvided)
testthat::expect_true(all(!is.na(res$trainX)))
testthat::expect_true(is.matrix(res$trainX))
testthat::expect_true(is.numeric(res$trainX))
testthat::expect_equal(nrow(res$trainX), 5)
testthat::expect_equal(ncol(res$trainX), 6)
testthat::expect_true(all(abs(x_select[6:10, ] - res$trainX) < 10^(-9)))
testthat::expect_true(all(!is.na(res$testX)))
testthat::expect_true(is.matrix(res$testX))
testthat::expect_true(is.numeric(res$testX))
testthat::expect_equal(nrow(res$testX), 7)
testthat::expect_equal(ncol(res$testX), 6)
testthat::expect_true(all(abs(x_pred - res$testX) < 10^(-9)))
testthat::expect_true(is.character(res$feat_names))
testthat::expect_true(is.na(res$feat_names))
testthat::expect_true(is.na(res$max_num_clusts))
testthat::expect_true(length(res$max_num_clusts) == 1)
# Try not providing training indices and omitting newX--should get error
testthat::expect_error(checkGetCssPredsInputs(css_res, testX=x_pred,
weighting="sparse",
cutoff=0, min_num_clusts=1,
max_num_clusts=NA, trainX=NA, trainY=NA),
"css was not provided with indices to set aside for model training (train_inds), so must provide new X in order to generate a design matrix", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:6]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(checkGetCssPredsInputs(css_res_named, testX=x_pred,
weighting="simple_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_train) <- LETTERS[2:7]
colnames(x_pred) <- LETTERS[1:6]
testthat::expect_error(checkGetCssPredsInputs(css_res_named, testX=x_pred,
weighting="weighted_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_train) <- LETTERS[1:6]
colnames(x_pred) <- LETTERS[2:7]
testthat::expect_error(checkGetCssPredsInputs(css_res_named, testX=x_pred,
weighting="sparse", cutoff=0,
min_num_clusts=1, max_num_clusts=NA,
trainX=x_train, trainY=y_train),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_pred) <- LETTERS[1:6]
res_named <- checkGetCssPredsInputs(css_res_named, testX=x_pred,
weighting="simple_avg", cutoff=0,
min_num_clusts=1, max_num_clusts=NA,
trainX=x_train, trainY=y_train)
testthat::expect_true(is.list(res_named))
testthat::expect_identical(names(res_named), c("trainXProvided", "trainX", "testX",
"feat_names", "max_num_clusts"))
testthat::expect_true(all(!is.na(res_named$trainX)))
testthat::expect_true(is.matrix(res_named$trainX))
testthat::expect_true(is.numeric(res_named$trainX))
testthat::expect_equal(nrow(res_named$trainX), 8)
testthat::expect_equal(ncol(res_named$trainX), 6)
testthat::expect_true(all(abs(x_train - res_named$trainX) < 10^(-9)))
testthat::expect_true(is.character(res_named$feat_names))
testthat::expect_identical(res_named$feat_names, LETTERS[1:6])
# Try data.frame input to css and checkGetCssPredsInputs
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/3)
train_inds <- (max(selec_inds) + 1):(2*round(n/3))
test_inds <- setdiff(1:n, c(selec_inds, train_inds))
css_res_df <- css(X=X_df[c(selec_inds, train_inds), ],
y=y[c(selec_inds, train_inds)], lambda=0.01, B = 10,
train_inds=train_inds)
res_df <- checkGetCssPredsInputs(css_res_df, testX=X_df[test_inds, ],
weighting="sparse", cutoff=0,
min_num_clusts=1, max_num_clusts=NA,
trainX=NA, trainY=NA)
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("trainXProvided", "trainX",
"testX","feat_names",
"max_num_clusts"))
testthat::expect_true(all(!is.na(res_df$trainX)))
testthat::expect_true(is.matrix(res_df$trainX))
testthat::expect_true(is.numeric(res_df$trainX))
testthat::expect_equal(nrow(res_df$trainX), length(train_inds))
stopifnot(nrow(css_res_df$X) >= max(train_inds))
train_mat <- css_res_df$X[train_inds, ]
testthat::expect_equal(ncol(res_df$trainX), ncol(train_mat))
testthat::expect_true(all(abs(train_mat - res_df$trainX) < 10^(-9)))
testthat::expect_identical(colnames(res_df$trainX), colnames(train_mat))
testthat::expect_true(all(!is.na(res_df$testX)))
testthat::expect_true(is.matrix(res_df$testX))
testthat::expect_true(is.numeric(res_df$testX))
testthat::expect_equal(nrow(res_df$testX), length(test_inds))
test_mat <- stats::model.matrix(~ ., X_df[test_inds, ])
test_mat <- test_mat[, colnames(test_mat) != "(Intercept)"]
testthat::expect_equal(ncol(res_df$testX), ncol(test_mat))
testthat::expect_true(all(abs(test_mat - res_df$testX) < 10^(-9)))
testthat::expect_identical(colnames(res_df$testX), colnames(test_mat))
testthat::expect_identical(colnames(res_df$testX), colnames(res_df$trainX))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- checkGetCssPredsInputs(css_res_df, testX=X_df[test_inds, ],
weighting="simple_avg", cutoff=0.3,
min_num_clusts=1, max_num_clusts=4,
trainX=X_df[train_inds, ],
trainY=y[train_inds])
testthat::expect_true(is.list(res_df))
testthat::expect_identical(names(res_df), c("trainXProvided", "trainX",
"testX","feat_names",
"max_num_clusts"))
testthat::expect_true(all(!is.na(res_df$trainX)))
testthat::expect_true(is.matrix(res_df$trainX))
testthat::expect_true(is.numeric(res_df$trainX))
testthat::expect_equal(nrow(res_df$trainX), length(train_inds))
train_mat <- stats::model.matrix(~ ., X_df[train_inds, ])
train_mat <- train_mat[, colnames(train_mat) != "(Intercept)"]
testthat::expect_equal(ncol(res_df$trainX), ncol(train_mat))
testthat::expect_true(all(abs(train_mat - res_df$trainX) < 10^(-9)))
testthat::expect_true(all(!is.na(res_df$testX)))
testthat::expect_true(is.matrix(res_df$testX))
testthat::expect_true(is.numeric(res_df$testX))
testthat::expect_equal(nrow(res_df$testX), length(test_inds))
test_mat <- stats::model.matrix(~ ., X_df[test_inds, ])
test_mat <- test_mat[, colnames(test_mat) != "(Intercept)"]
testthat::expect_equal(ncol(res_df$testX), ncol(test_mat))
testthat::expect_true(all(abs(test_mat - res_df$testX) < 10^(-9)))
})## -- Warning: checkGetCssPredsInputs works ---------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) checkGetCssPredsInputs(...)
## 8. \-litr (local) checkXInputResults(testX, css_results$X)
## -- Warning: checkGetCssPredsInputs works ---------------------------------------
## Column names were provided for testX but not for trainX (are you sure they both contain identical features in the same order?)
## Backtrace:
## x
## 1. \-litr (local) checkGetCssPredsInputs(...)
## -- Warning: checkGetCssPredsInputs works ---------------------------------------
## Column names were provided for testX but not for trainX (are you sure they both contain identical features in the same order?)
## Backtrace:
## x
## 1. \-litr (local) checkGetCssPredsInputs(...)
## Test passed with 125 successes.
Finally, tests for getCssPreds()
testthat::test_that("getCssPreds works", {
set.seed(70811)
x_select <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
x_train <- matrix(stats::rnorm(8*6), nrow=8, ncol=6)
x_pred <- matrix(stats::rnorm(7*6), nrow=7, ncol=6)
y_select <- stats::rnorm(10)
y_train <- stats::rnorm(8)
good_clusters <- list("red"=1:2, "blue"=3:4, "green"=5)
css_res <- css(X=x_select, y=y_select, lambda=0.01, clusters=good_clusters,
B = 10)
res <- getCssPreds(css_res, testX=x_pred, trainX=x_train, trainY=y_train)
testthat::expect_true(all(!is.na(res)))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(length(res), 7)
# (#104, L6) The OLS adequacy guard must account for the lm() intercept. At the
# default cutoff the training design has one column per cluster; css_res has 4
# clusters (red, blue, green, and feature 6 as a singleton). With exactly 4
# training rows, lm(y ~ .) is rank-deficient (4 observations vs 4 cluster
# coefficients + an intercept), so getCssPreds must now error rather than
# silently fit a rank-deficient model with NA coefficients.
x_train_eq <- matrix(stats::rnorm(4*6), nrow=4, ncol=6)
y_train_eq <- stats::rnorm(4)
testthat::expect_error(
getCssPreds(css_res, testX=x_pred, trainX=x_train_eq, trainY=y_train_eq),
"one more training observation than the number of clusters", fixed=TRUE)
# One extra training row (5 observations vs 4 coefficients + intercept) is
# enough, and predictions are produced normally.
x_train_ok <- matrix(stats::rnorm(5*6), nrow=5, ncol=6)
y_train_ok <- stats::rnorm(5)
res_ok <- getCssPreds(css_res, testX=x_pred, trainX=x_train_ok,
trainY=y_train_ok)
testthat::expect_true(all(!is.na(res_ok)))
testthat::expect_equal(length(res_ok), 7)
# (#105, L9) The value-producing calls above only exercise the default
# weighting="weighted_avg"; cover "sparse" and "simple_avg" too on the
# trainX/trainY path. Each weighting must return a finite numeric vector of
# length nrow(x_pred).
preds_by_w <- list()
for(w in c("weighted_avg", "sparse", "simple_avg")){
preds_w <- getCssPreds(css_res, testX=x_pred, weighting=w, trainX=x_train,
trainY=y_train)
testthat::expect_true(is.numeric(preds_w))
testthat::expect_true(all(!is.na(preds_w)))
testthat::expect_equal(length(preds_w), nrow(x_pred))
preds_by_w[[w]] <- preds_w
}
# getCssPreds already enforces finiteness internally, so "didn't error" is
# nearly vacuous; pin the behavior by requiring the three weightings to
# produce mutually different predictions.
testthat::expect_false(isTRUE(all.equal(preds_by_w[["weighted_avg"]],
preds_by_w[["sparse"]])))
testthat::expect_false(isTRUE(all.equal(preds_by_w[["weighted_avg"]],
preds_by_w[["simple_avg"]])))
testthat::expect_false(isTRUE(all.equal(preds_by_w[["sparse"]],
preds_by_w[["simple_avg"]])))
# (#105, L10a) trainY OLS-response contract. A non-numeric trainY is caught
# upstream by checkGetCssPredsInputs' stopifnot(is.numeric(trainY)) (which
# shadows getCssPreds' own "must be real-valued" guard -- that guard is dead
# code, flagged for a future cleanup), so the message is the stopifnot text.
testthat::expect_error(
getCssPreds(css_res, testX=x_pred, trainX=x_train,
trainY=factor(sample(letters[1:3], nrow(x_train),
replace=TRUE))),
"is.numeric(trainY) is not TRUE", fixed=TRUE)
##### Try other bad inputs
testthat::expect_error(getCssPreds(css_res, testX=x_pred, cutoff=-0.5,
trainX=x_train, trainY=y_train),
"cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, cutoff="0.3",
trainX=x_train, trainY=y_train),
"is.numeric(cutoff) | is.integer(cutoff) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred,
cutoff=as.numeric(NA), trainX=x_train,
trainY=y_train),
"!is.na(cutoff) is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred,
weighting=c("sparse", "simple_avg"),
trainX=x_train, trainY=y_train),
"length(weighting) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, weighting=2,
trainX=x_train, trainY=y_train),
"Weighting must be a character", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, weighting="spasre",
trainX=x_train, trainY=y_train),
"Weighting must be a character and one of sparse, simple_avg, or weighted_avg",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred,
min_num_clusts=c(1, 2), trainX=x_train,
trainY=y_train),
"length(min_num_clusts) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, min_num_clusts="2",
trainX=x_train, trainY=y_train),
"is.numeric(min_num_clusts) | is.integer(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, min_num_clusts=-1,
trainX=x_train, trainY=y_train),
"min_num_clusts >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, min_num_clusts=10,
trainX=x_train, trainY=y_train),
"min_num_clusts <= p is not TRUE", fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, max_num_clusts="5",
trainX=x_train, trainY=y_train),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, max_num_clusts=4.5,
trainX=x_train, trainY=y_train),
"max_num_clusts == round(max_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, min_num_clusts=3,
max_num_clusts=2, trainX=x_train,
trainY=y_train),
"max_num_clusts >= min_num_clusts is not TRUE",
fixed=TRUE)
testthat::expect_error(getCssPreds(css_res, testX=x_pred, max_num_clusts=10,
trainX=x_train, trainY=y_train),
"max_num_clusts <= p is not TRUE", fixed=TRUE)
# Add training indices
css_res_train <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B=10, train_inds=6:10)
# Training indices should be ignored if new x is provided
res <- getCssPreds(css_res_train, testX=x_pred, trainX=x_train,
trainY=y_train)
testthat::expect_true(all(!is.na(res)))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(length(res), 7)
# Things should still work if new x is not provided
res <- getCssPreds(css_res_train, testX=x_pred)
testthat::expect_true(all(!is.na(res)))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(length(res), 7)
# (#105, L9) Same weighting coverage ("sparse"/"simple_avg" plus the default)
# on the train_inds (no-trainX) path, and the same mutual-difference pin.
preds_by_w_ti <- list()
for(w in c("weighted_avg", "sparse", "simple_avg")){
preds_w_ti <- getCssPreds(css_res_train, testX=x_pred, weighting=w)
testthat::expect_true(is.numeric(preds_w_ti))
testthat::expect_true(all(!is.na(preds_w_ti)))
testthat::expect_equal(length(preds_w_ti), nrow(x_pred))
preds_by_w_ti[[w]] <- preds_w_ti
}
testthat::expect_false(isTRUE(all.equal(preds_by_w_ti[["weighted_avg"]],
preds_by_w_ti[["sparse"]])))
testthat::expect_false(isTRUE(all.equal(preds_by_w_ti[["weighted_avg"]],
preds_by_w_ti[["simple_avg"]])))
testthat::expect_false(isTRUE(all.equal(preds_by_w_ti[["sparse"]],
preds_by_w_ti[["simple_avg"]])))
# Try not providing training indices and omitting newX--should get error
testthat::expect_error(getCssPreds(css_res, testX=x_pred),
"css was not provided with indices to set aside for model training (train_inds), so must provide new X in order to generate a design matrix", fixed=TRUE)
# Try naming variables
colnames(x_select) <- LETTERS[1:6]
css_res_named <- css(X=x_select, y=y_select, lambda=0.01,
clusters=good_clusters, B = 10)
# Named variables for css matrix but not new one--should get a warning
testthat::expect_warning(getCssPreds(css_res_named, testX=x_pred,
trainX=x_train, trainY=y_train),
"New X provided had no variable names (column names) even though the X provided to css did.", fixed=TRUE)
# Try mismatching variable names
colnames(x_train) <- LETTERS[2:7]
colnames(x_pred) <- LETTERS[1:6]
testthat::expect_error(getCssPreds(css_res_named, testX=x_pred,
trainX=x_train, trainY=y_train),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_train) <- LETTERS[1:6]
colnames(x_pred) <- LETTERS[2:7]
testthat::expect_error(getCssPreds(css_res_named, testX=x_pred,
trainX=x_train, trainY=y_train),
"identical(feat_names, colnames(css_X)) is not TRUE",
fixed=TRUE)
colnames(x_pred) <- LETTERS[1:6]
res_named <- getCssPreds(css_res_named, testX=x_pred, trainX=x_train,
trainY=y_train)
testthat::expect_true(all(!is.na(res)))
testthat::expect_true(is.numeric(res))
testthat::expect_equal(length(res), 7)
# Try data.frame input to css and getCssPreds
X_df <- datasets::mtcars
n <- nrow(X_df)
y <- stats::rnorm(n)
selec_inds <- 1:round(n/3)
# 18 (not 17) training rows: once the factor columns below are one-hot encoded
# the design has 17 cluster representatives, and the OLS adequacy guard (#104,
# L6) now requires strictly more training rows than clusters (the lm()
# intercept), so 17 == 17 would (correctly) error.
train_inds <- (max(selec_inds) + 1):(max(selec_inds) + 18)
test_inds <- setdiff(1:n, c(selec_inds, train_inds))
css_res_df <- css(X=X_df[c(selec_inds, train_inds), ],
y=y[c(selec_inds, train_inds)], lambda=0.01, B = 10,
train_inds=train_inds)
res_df <- getCssPreds(css_res_df, testX=X_df[test_inds, ])
testthat::expect_true(all(!is.na(res_df)))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(length(res_df), length(test_inds))
# Try again with X as a dataframe with factors (number of columns of final
# design matrix after one-hot encoding factors won't match number of columns
# of X_df)
X_df$cyl <- as.factor(X_df$cyl)
X_df$vs <- as.factor(X_df$vs)
X_df$am <- as.factor(X_df$am)
X_df$gear <- as.factor(X_df$gear)
X_df$carb <- as.factor(X_df$carb)
css_res_df <- css(X=X_df[selec_inds, ], y=y[selec_inds], lambda=0.01, B = 10)
res_df <- getCssPreds(css_res_df, testX=X_df[test_inds, ],
trainX=X_df[train_inds, ], trainY=y[train_inds])
# TODO(gregfaletto): known issue--the above code produces the following
# undesired warnings:
# 1: In checkGetCssPredsInputs(css_results, testX, weighting, cutoff, :
# Column names were provided for testX but not for trainX (are you sure they both contain identical features in the same order?)
# 2: In checkXInputResults(newx, css_results$X) :
# New X provided had no variable names (column names) even though the X provided to css did.
testthat::expect_true(all(!is.na(res_df)))
testthat::expect_true(is.numeric(res_df))
testthat::expect_equal(length(res_df), length(test_inds))
# (#105, L10b) Reachable real-valued-y guard on the train_inds (no-trainX)
# path: when css was fit on a non-real-valued y (here a character response,
# which css accepts because validation is delegated to fitfun), getCssPreds
# cannot fit the OLS model from css_results$y[train_inds] and must error. The
# custom fitfun's formals must match cssLasso's (X, y, lambda) exactly. Seeded
# last so this RNG-consuming css() call does not shift any earlier draws.
set.seed(10510)
n_chr <- 12
X_chr <- matrix(stats::rnorm(n_chr*6), nrow=n_chr, ncol=6)
y_chr <- as.character(sample(letters, n_chr, replace=TRUE))
fitfun_chr <- function(X, y, lambda) 1:2
css_res_chr <- css(X_chr, y_chr, lambda=0.01, train_inds=8:12,
fitfun=fitfun_chr, B=10)
testthat::expect_error(
getCssPreds(css_res_chr, testX=matrix(stats::rnorm(3*6), nrow=3, ncol=6)),
"Can't generate predictions from the data", fixed=TRUE)
})## -- Warning: getCssPreds works --------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) getCssPreds(...)
## 8. \-litr (local) checkGetCssPredsInputs(...)
## 9. \-litr (local) checkXInputResults(testX, css_results$X)
## -- Warning: getCssPreds works --------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) getCssPreds(...)
## 8. \-litr (local) formCssDesign(...)
## 9. \-litr (local) checkFormCssDesignInputs(...)
## 10. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssPreds works --------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. +-testthat::expect_warning(...)
## 2. | \-testthat:::expect_condition_matching_(...)
## 3. | \-testthat:::quasi_capture(...)
## 4. | +-testthat (local) .capture(...)
## 5. | | \-base::withCallingHandlers(...)
## 6. | \-rlang::eval_bare(quo_get_expr(.quo), quo_get_env(.quo))
## 7. \-litr (local) getCssPreds(...)
## 8. \-litr (local) formCssDesign(...)
## 9. \-litr (local) checkFormCssDesignInputs(...)
## 10. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssPreds works --------------------------------------------------
## Column names were provided for testX but not for trainX (are you sure they both contain identical features in the same order?)
## Backtrace:
## x
## 1. \-litr (local) getCssPreds(...)
## 2. \-litr (local) checkGetCssPredsInputs(...)
## -- Warning: getCssPreds works --------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssPreds(...)
## 2. \-litr (local) formCssDesign(...)
## 3. \-litr (local) checkFormCssDesignInputs(...)
## 4. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssPreds works --------------------------------------------------
## Column names were provided for testX but not for trainX (are you sure they both contain identical features in the same order?)
## Backtrace:
## x
## 1. \-litr (local) getCssPreds(...)
## 2. \-litr (local) checkGetCssPredsInputs(...)
## -- Warning: getCssPreds works --------------------------------------------------
## New X provided had no variable names (column names) even though the X provided to css did.
## Backtrace:
## x
## 1. \-litr (local) getCssPreds(...)
## 2. \-litr (local) formCssDesign(...)
## 3. \-litr (local) checkFormCssDesignInputs(...)
## 4. \-litr (local) checkXInputResults(newx, css_results$X)
## -- Warning: getCssPreds works --------------------------------------------------
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Backtrace:
## x
## 1. \-litr (local) getCssPreds(...)
## 2. \-stats::predict.lm(model, newdata = df_test)
## Test passed with 65 successes.
A cross-cutting integration test pinning the bug fixed in #99: a single
non-finite (Inf) cell used to slip silently through checkNoNAs()
(is.na(Inf) is FALSE) into glmnet and the cluster-representative averaging
– corrupting selection proportions and predictions with no error. Both the
css() selection path and the getCssPreds() prediction path must now error at
the gate.
testthat::test_that("css and getCssPreds reject non-finite (Inf) inputs (#99)", {
set.seed(99001)
X <- matrix(stats::rnorm(10 * 6), nrow = 10, ncol = 6)
y <- stats::rnorm(10)
good_clusters <- list("red" = 1:2, "blue" = 3:4, "green" = 5)
# One Inf cell in the css design matrix now errors (previously: silent
# corruption of the selection proportions).
X_inf <- X
X_inf[3, 2] <- Inf
testthat::expect_error(
css(X = X_inf, y = y, lambda = 0.01, clusters = good_clusters, B = 10),
"must not contain missing", fixed = TRUE)
# A valid css fit; then an Inf in testX passed to getCssPreds must error too.
# testX is gated via checkGetCssPredsInputs -> checkXInputResults(testX) ->
# checkNoNAs(newx, "newx"), so the message names "newx", not "testX"; assert
# the stable non-finite substring instead.
css_res <- css(X = X, y = y, lambda = 0.01, clusters = good_clusters, B = 10)
x_train <- matrix(stats::rnorm(8 * 6), nrow = 8, ncol = 6)
y_train <- stats::rnorm(8)
x_pred <- matrix(stats::rnorm(7 * 6), nrow = 7, ncol = 6)
# Sanity: clean inputs still predict finite values.
preds_ok <- getCssPreds(css_res, testX = x_pred, trainX = x_train,
trainY = y_train)
testthat::expect_true(all(is.finite(preds_ok)))
x_pred_inf <- x_pred
x_pred_inf[1, 1] <- Inf
testthat::expect_error(
getCssPreds(css_res, testX = x_pred_inf, trainX = x_train,
trainY = y_train),
"must not contain missing", fixed = TRUE)
})## Test passed with 3 successes.
The lasso entry points reject non-finite y (#100). Companion to the #99 test
above (which guards X):
testthat::test_that("the lasso entry points reject non-finite y (#100)", {
set.seed(100001)
X <- matrix(stats::rnorm(15 * 6), nrow = 15, ncol = 6)
y <- stats::rnorm(15)
good_clusters <- list("red" = 1:3, "green" = 4:6)
test_X <- matrix(stats::rnorm(7 * 6), nrow = 7, ncol = 6)
y_na <- y; y_na[4] <- NA
y_inf <- y; y_inf[9] <- Inf
# --- getLassoLambda nondeterminism pin -------------------------------------
# getLassoLambda fits cv.glmnet on a RANDOM subsample (sample(1:n, ...)), so
# before this fix a non-finite y in a single cell errored only on the draws
# that happened to include it. checkFiniteY runs BEFORE that sample(), so it
# must now error under EVERY seed -- never returning a stray lambda.
for(s in c(1L, 2L, 7L, 13L, 101L, 2024L)){
set.seed(s)
testthat::expect_error(getLassoLambda(X = X, y = y_na, nfolds = 4),
"must not contain missing", fixed = TRUE)
set.seed(s)
testthat::expect_error(getLassoLambda(X = X, y = y_inf, nfolds = 4),
"must not contain missing", fixed = TRUE)
}
# A clean y still returns a valid lambda (no success -> error regression).
set.seed(100002)
lam_ok <- getLassoLambda(X = X, y = y, nfolds = 4)
testthat::expect_true(is.numeric(lam_ok) && length(lam_ok) == 1 &&
!is.na(lam_ok) && lam_ok >= 0)
# --- entry-point integration ----------------------------------------------
testthat::expect_error(cssSelect(X = X, y = y_na),
"must not contain missing", fixed = TRUE)
testthat::expect_error(cssPredict(X_train_selec = X, y_train_selec = y_na,
X_test = test_X),
"must not contain missing", fixed = TRUE)
testthat::expect_error(getModelSize(X = X, y = y_inf, clusters = good_clusters),
"must not contain missing", fixed = TRUE)
# protolasso / clusterRepLasso route through processClusterLassoInputs, whose
# stopifnot(all(is.finite(y))) now catches Inf (previously all(!is.na(y))).
testthat::expect_error(protolasso(X = X, y = y_inf, clusters = good_clusters),
"is.finite", fixed = TRUE)
testthat::expect_error(clusterRepLasso(X = X, y = y_inf,
clusters = good_clusters),
"is.finite", fixed = TRUE)
# css()'s default cssLasso path validates each random subsample via
# checkCssLassoInputs. A single non-finite cell would fire only on the
# subsamples that draw it; set enough Inf cells (> n - floor(n/2)) that EVERY
# size-floor(n/2) subsample must contain one, so css errors deterministically.
y_many_inf <- y; y_many_inf[1:9] <- Inf
testthat::expect_error(
css(X = X, y = y_many_inf, lambda = 0.01, clusters = good_clusters, B = 10),
"non-finite", fixed = TRUE)
# Clean finite y still selects without error.
res_ok <- cssSelect(X = X, y = y, clusters = good_clusters)
testthat::expect_true(is.list(res_ok))
})## -- Warning: the lasso entry points reject non-finite y (#100) ------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getLassoLambda(X = X, y = y, nfolds = 4)
## 2. \-glmnet::cv.glmnet(...)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: the lasso entry points reject non-finite y (#100) ------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) cssSelect(X = X, y = y, clusters = good_clusters)
## 2. \-litr (local) getLassoLambda(X, y, alpha = alpha)
## 3. \-glmnet::cv.glmnet(...)
## 4. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: the lasso entry points reject non-finite y (#100) ------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) cssSelect(X = X, y = y, clusters = good_clusters)
## 2. \-litr (local) getModelSize(X, y, css_results$clusters, alpha = alpha)
## 3. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 4. \-glmnet:::cv.glmnet.raw(...)
## Test passed with 20 successes.