10 Tests for other useful functions
Tests for checkGenClusteredDataInputs()
testthat::test_that("checkGenClusteredDataInputs works", {
set.seed(7612)
# Should get no error
checkGenClusteredDataInputs(p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
checkGenClusteredDataInputs(p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=1, sigma_eps_sq=NA)
# sig_clusters
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters="2", rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"is.numeric(sig_clusters) | is.integer(sig_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=4, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"sig_clusters <= n_clusters is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=-1, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"sig_clusters >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=.6, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"sig_clusters == round(sig_clusters) is not TRUE",
fixed=TRUE)
# n_clusters
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5,
n_clusters="3",
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"is.numeric(n_clusters) | is.integer(n_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3.2,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"n_clusters == round(n_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=0,
sig_clusters=0, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"n_clusters >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=1, n_clusters=3,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"cluster_size >= 2 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=16, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"p >= n_clusters * cluster_size + k_unclustered is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"rho > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=0,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"beta_latent != 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=0, snr=1,
sigma_eps_sq=NA),
"beta_unclustered != 0 is not TRUE", fixed=TRUE)
# k_unclustered
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered="2",
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"is.numeric(k_unclustered) | is.integer(k_unclustered) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=0,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"k_unclustered >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2.2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"k_unclustered == round(k_unclustered) is not TRUE",
fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=NA,
sigma_eps_sq=NA),
"Must specify one of snr or sigma_eps_sq", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=-.2,
sigma_eps_sq=NA),
"snr > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=NA,
sigma_eps_sq=-.3),
"sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)
})## Test passed with 18 successes.
Tests for getNoiseVar():
testthat::test_that("getNoiseVar works", {
# getNoiseVar(cor) returns the noise variance v such that a proxy Z + N(0, v)
# has correlation cor with Z, i.e. v = 1/cor^2 - 1 (vectorized; cor in (0, 1]).
testthat::expect_equal(getNoiseVar(1), 0)
testthat::expect_equal(getNoiseVar(0.5), 3)
testthat::expect_equal(getNoiseVar(0.9), 1/0.81 - 1)
testthat::expect_equal(getNoiseVar(c(0.5, 1, 0.9)), c(3, 0, 1/0.81 - 1))
# The defining property: a proxy with this noise variance has correlation cor.
testthat::expect_equal(1/sqrt(1 + getNoiseVar(0.8)), 0.8)
# cor must be numeric, non-NA, non-empty, and in (0, 1] (checks are vectorized).
testthat::expect_error(getNoiseVar(0), "all(cor > 0) is not TRUE", fixed=TRUE)
testthat::expect_error(getNoiseVar(-0.5), "all(cor > 0) is not TRUE",
fixed=TRUE)
testthat::expect_error(getNoiseVar(c(0.5, 0)), "all(cor > 0) is not TRUE",
fixed=TRUE)
testthat::expect_error(getNoiseVar(1.5), "all(cor <= 1) is not TRUE",
fixed=TRUE)
testthat::expect_error(getNoiseVar(NA_real_), "all(!is.na(cor)) is not TRUE",
fixed=TRUE)
testthat::expect_error(getNoiseVar("0.5"),
"is.numeric(cor) | is.integer(cor) is not TRUE", fixed=TRUE)
testthat::expect_error(getNoiseVar(numeric(0)), "length(cor) >= 1 is not TRUE",
fixed=TRUE)
})## Test passed with 12 successes.
Finally, tests for genClusteredData()
testthat::test_that("genClusteredData works", {
set.seed(23478)
ret <- genClusteredData(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho=.99,
beta_latent=1.5, beta_unclustered=-2, snr=NA,
sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$X))
testthat::expect_true(is.matrix(ret$X))
testthat::expect_equal(ncol(ret$X), 19)
testthat::expect_equal(nrow(ret$X), 25)
# X is Gaussian with mean 0 and variance 4; expect all observations to lie
# within 5 standard deviations of mean
testthat::expect_true(all(abs(ret$X) < 5*2))
# Test that clusters are correlated--within-cluster correlation should be
# high, correlation with other features should be low
testthat::expect_true(min(cor(ret$X[, 1:5])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)
testthat::expect_true(min(cor(ret$X[, 6:10])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 6:10],
ret$X[, c(1:5, 11:19)]))) < .6)
testthat::expect_true(min(cor(ret$X[, 11:15])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 11:15],
ret$X[, c(1:10, 16:19)]))) < .6)
cor_indeps <- cor(ret$X[, 16:19])
testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)
testthat::expect_true(is.numeric(ret$y))
testthat::expect_equal(length(ret$y), 25)
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 25)
testthat::expect_equal(ncol(ret$Z), 3)
testthat::expect_true(is.numeric(ret$mu))
testthat::expect_equal(length(ret$mu), 25)
# Because y is Gaussian with mean mu and standard deviation .5 conditional on
# mu, expect all observations to lie within 5 sds of mu
testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))
# Specify SNR instead of sigma_eps_sq
ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=1, sigma_eps_sq=NA)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
# Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
# the two may be given.
testthat::expect_error(
genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=.01, sigma_eps_sq=.25),
"Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)
# Try a single latent variable (z should be a one-column matrix)
ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
n_clusters=1, sig_clusters=1, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 5)
testthat::expect_equal(ncol(ret$Z), 1)
# Bad inputs
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters="2", rho=.8,
beta_latent=1.5, beta_unclustered=-2,
snr=1, sigma_eps_sq=NA),
"is.numeric(sig_clusters) | is.integer(sig_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=4, rho=.8,
beta_latent=1.5, beta_unclustered=-2,
snr=1, sigma_eps_sq=NA),
"sig_clusters <= n_clusters is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=-1, rho=.8,
beta_latent=1.5, beta_unclustered=-2,
snr=1, sigma_eps_sq=NA),
"sig_clusters >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=.6, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"sig_clusters == round(sig_clusters) is not TRUE",
fixed=TRUE)
# n_clusters
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5,
n_clusters="3",
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"is.numeric(n_clusters) | is.integer(n_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3.2,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"n_clusters == round(n_clusters) is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=0,
sig_clusters=0, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"n_clusters >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=1, n_clusters=3,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"cluster_size >= 2 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(p=16, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"p >= n_clusters * cluster_size + k_unclustered is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"rho > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=0,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"beta_latent != 0 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=0, snr=1,
sigma_eps_sq=NA),
"beta_unclustered != 0 is not TRUE", fixed=TRUE)
# k_unclustered
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered="2",
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"is.numeric(k_unclustered) | is.integer(k_unclustered) is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=0,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"k_unclustered >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2.2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=1,
sigma_eps_sq=NA),
"k_unclustered == round(k_unclustered) is not TRUE",
fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=NA,
sigma_eps_sq=NA),
"Must specify one of snr or sigma_eps_sq", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=-.2,
sigma_eps_sq=NA),
"snr > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3,
sig_clusters=2, rho=0.8,
beta_latent=1.5,
beta_unclustered=-2, snr=NA,
sigma_eps_sq=-.3),
"sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)
# sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq=0)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(ret$y, ret$mu)
# k_unclustered = 1 is allowed (issue #13)
ret <- genClusteredData(n=5, p=19, k_unclustered=1, cluster_size=5,
n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(nrow(ret$X), 5)
testthat::expect_equal(ncol(ret$X), 19)
})## Test passed with 57 successes.
Tests for genClusteredDataWeighted()
testthat::test_that("genClusteredDataWeighted works", {
set.seed(23478)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$X))
testthat::expect_true(is.matrix(ret$X))
testthat::expect_equal(ncol(ret$X), 19)
testthat::expect_equal(nrow(ret$X), 25)
# X is Gaussian with mean 0 and variance 4; expect all observations to lie
# within 5 standard deviations of mean
testthat::expect_true(all(abs(ret$X) < 5*2))
# Test that clusters are correlated--within-cluster correlation should be
# high, correlation with other features should be low
testthat::expect_true(min(cor(ret$X[, 1:3])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)
testthat::expect_true(min(cor(ret$X[, 6:8])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 6:10],
ret$X[, c(1:5, 11:19)]))) < .6)
testthat::expect_true(min(cor(ret$X[, 11:13])) > .9)
testthat::expect_true(max(abs(cor(ret$X[, 11:15],
ret$X[, c(1:10, 16:19)]))) < .7)
cor_indeps <- cor(ret$X[, 16:19])
testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)
testthat::expect_true(is.numeric(ret$y))
testthat::expect_equal(length(ret$y), 25)
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 25)
testthat::expect_equal(ncol(ret$Z), 3)
testthat::expect_true(is.numeric(ret$mu))
testthat::expect_equal(length(ret$mu), 25)
# Because y is Gaussian with mean mu and standard deviation .5 conditional on
# mu, expect all observations to lie within 5 sds of mu
testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))
# Specify SNR instead of sigma_eps_sq
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=1, sigma_eps_sq=NA)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
# Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
# the two may be given.
testthat::expect_error(
genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=.01, sigma_eps_sq=.25),
"Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)
# Try a single latent variable (z should be a one-column matrix)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=1,
sig_clusters=1, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 25)
testthat::expect_equal(ncol(ret$Z), 1)
# sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=0)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(ret$y, ret$mu)
# k_unclustered = 1 is allowed (issue #13)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=1, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(ncol(ret$X), 19)
})## Test passed with 38 successes.
Tests for genClusteredDataWeightedRandom()
testthat::test_that("genClusteredDataWeightedRandom works", {
set.seed(23478)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3,
sig_clusters=2, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$X))
testthat::expect_true(is.matrix(ret$X))
testthat::expect_equal(ncol(ret$X), 19)
testthat::expect_equal(nrow(ret$X), 25)
# X is Gaussian with mean 0 and variance 4; expect all observations to lie
# within 5 standard deviations of mean
testthat::expect_true(all(abs(ret$X) < 5*2))
# Test that clusters are correlated--within-cluster correlation should be
# high, correlation with other features should be low
testthat::expect_true(min(cor(ret$X[, 1:3])) > .2)
testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)
testthat::expect_true(min(cor(ret$X[, 6:8])) > .2)
testthat::expect_true(max(abs(cor(ret$X[, 6:10],
ret$X[, c(1:5, 11:19)]))) < .6)
testthat::expect_true(min(cor(ret$X[, 11:13])) > .2)
testthat::expect_true(max(abs(cor(ret$X[, 11:15],
ret$X[, c(1:10, 16:19)]))) < .7)
cor_indeps <- cor(ret$X[, 16:19])
testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)
testthat::expect_true(is.numeric(ret$y))
testthat::expect_equal(length(ret$y), 25)
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 25)
testthat::expect_equal(ncol(ret$Z), 3)
testthat::expect_true(is.numeric(ret$mu))
testthat::expect_equal(length(ret$mu), 25)
# Because y is Gaussian with mean mu and standard deviation .5 conditional on
# mu, expect all observations to lie within 5 sds of mu
testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))
# Specify SNR instead of sigma_eps_sq
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3,
sig_clusters=2, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=1, sigma_eps_sq=NA)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
# Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
# the two may be given.
testthat::expect_error(
genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3,
sig_clusters=2, rho_high=.99, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=.01, sigma_eps_sq=.25),
"Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)
# Try a single latent variable (z should be a one-column matrix)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=1,
sig_clusters=1, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_true(is.numeric(ret$Z))
testthat::expect_true(is.matrix(ret$Z))
testthat::expect_equal(nrow(ret$Z), 25)
testthat::expect_equal(ncol(ret$Z), 1)
# sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3,
sig_clusters=2, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=0)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(ret$y, ret$mu)
# k_unclustered = 1 is allowed (issue #13)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=1, cluster_size=5,
n_clusters=3,
sig_clusters=2, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_true(is.list(ret))
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
testthat::expect_equal(ncol(ret$X), 19)
})## Test passed with 38 successes.
Finally, characterization snapshots that pin the exact generated output of all
three genClusteredData* generators. The “… works” tests above only check
distributional properties (correlations, ranges), not the exact RNG draw order
(which differs across the three: bulk rnorm vs per-column vs random per
column). These snapshots make the #16 consolidation of the generators and their
validators provably behavior-preserving; regenerate only if the data-generating
algorithm intentionally changes.
# Characterization snapshots: pin the EXACT generated values (X / y / Z / mu)
# under fixed seeds so the #16 consolidation of the genClusteredData* functions
# and their validators is provably behavior-preserving. The existing "... works"
# tests only check distributional properties (correlations, ranges), NOT the
# exact RNG draw order, which differs across the three generators (bulk vs
# per-column vs random-per-column). Regenerate ONLY if the data-generating
# algorithm intentionally changes.
testthat::test_that("genClusteredData output is byte-stable (characterization snapshot, #16)", {
set.seed(23478)
ret <- genClusteredData(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho=.99, beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.02641989570528,
-0.113749750828501, -0.0540798505887575, -0.719018251689331,
-1.02002920513898, -0.133167075802704, -0.228504275273568,
-0.594853445315136, -1.3743321671407, -0.691923420713347, -0.11653157345392,
-0.949501763500117, -1.09911307059764, -0.234373142554013,
-0.0610136281766905, -0.614831158775111, -0.875588846521461,
-0.267096146835881, 0.243882464912086, -0.525573959766865, 0.38889466127034,
-0.196999108158916, -0.548580993298435, 1.29708342622908), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
0.0881535457958548, 1.21771079504409))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538,
0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
dim = 4:3))
testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
-0.106753094432726, 1.52394871772745))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = 7.85857128981734, sumY = 13.5810460445817, sumZ = 3.15823002771053,
sumMu = 9.93005749962466))
# single latent variable (n_clusters = 1): Z is a one-column matrix, and the
# proxy block adds bulk noise to the raw vector Z.
set.seed(23478)
ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
n_clusters=1, sig_clusters=1, rho=.8, beta_latent=1.5, beta_unclustered=-2,
snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-0.00161764234876749,
-0.557794858227907, -0.827386446973581, -0.460377784885637, -2.67619233265038,
0.671711827459298, 1.65236925524076, -0.600115109545631, -0.280741265470752,
-1.67026187144442, -0.0473803806232191, 0.0826508004462609, 0.170200984778884,
0.976190994120857, -0.499749723868518, -0.474295388720649, -1.17346153974724,
-0.574655697546479, -0.120791507647279, -0.567678976454219, -1.3596277359555,
-0.0619416164679036, 0.538496824968004, 0.51591446765952), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(-0.558336409902388, -2.73369761842788,
-0.884087327686163, -3.17343267752372))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538),
dim = c(4L, 1L)))
testthat::expect_equal(ret$mu[1:4], c(0.195781398124046, -1.51661095607912,
-0.722190912632797, -2.90119777610212))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = -5.88971766152033, sumY = -7.36101068003228, sumZ = -2.4782435456402,
sumMu = -5.08843939501818))
})
testthat::test_that("genClusteredDataWeighted output is byte-stable (characterization snapshot, #16)", {
set.seed(23478)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.02641989570528,
-0.113749750828501, -0.0540798505887575, -0.719018251689331,
-1.02002920513898, -0.133167075802704, -0.228504275273568,
-0.594853445315136, -1.3743321671407, -0.691923420713347, -0.11653157345392,
-0.949501763500117, -0.748805376677026, 0.925796953277494, -0.766705940388196,
-0.348877774329742, 1.96822115764394, 0.528035720534176, 2.93942827641455,
0.736079254765787, 0.38889466127034, -0.196999108158916, -0.548580993298435,
1.29708342622908), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
0.0881535457958548, 1.21771079504409))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538,
0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
dim = 4:3))
testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
-0.106753094432726, 1.52394871772745))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = 15.3829830134111, sumY = 13.5810460445817, sumZ = 3.15823002771053,
sumMu = 9.93005749962466))
# single latent variable (n_clusters = 1)
set.seed(23478)
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=1, sig_clusters=1, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.07114417787482,
-0.368013007285506, 0.014438919713251, -0.845503147886515, -1.23739163512048,
-0.174811089926991, -0.0829364724064908, -0.479069797377207, -1.12138274604786,
-0.402828108682171, 0.104291325515473, -0.844159398768831, 0.352430473911601,
-1.4960796572646, -1.23544829374631, 0.148162490964714, -1.03021388828778,
0.576416731940481, -0.513340292522171, -1.14008022670496, 0.402628944106856,
-0.0568963388614887, -0.533358544824988, 1.23055042537552), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(-3.91149964768012, -4.73964429926795,
0.513991031913566, -2.95972727928473))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538),
dim = c(4L, 1L)))
testthat::expect_equal(ret$mu[1:4], c(-3.20232310305676, -3.49765450037239,
0.564187042362111, -2.19774596140566))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = -24.0234160115409, sumY = -10.2198990332918, sumZ = 0.586596841562748,
sumMu = -5.15679097838167))
})
testthat::test_that("genClusteredDataWeightedRandom output is byte-stable (characterization snapshot, #16)", {
set.seed(23478)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2,
cluster_size=5, n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.1897691657824,
-0.00982041549935259, 0.665172439447775, -0.756543570417209, 1.15949678203449,
-2.91344510959381, 0.491240389956153, -1.30476423524191, -0.485131132932798,
0.707045727376943, 3.2222522503854, -3.27350445562509, -1.85428176680484,
-0.0656090373877866, -0.551130785327929, -1.90090513408521, -1.29616909361208,
-0.144918159176611, 0.544180897409365, -0.50148178532334, 0.627200376817289,
0.171599744439457, -1.07507438640901, 1.5693792481444), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
0.0881535457958548, 1.21771079504409))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538,
0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
dim = 4:3))
testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
-0.106753094432726, 1.52394871772745))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = 3.23025166163967, sumY = 13.5810460445817, sumZ = 3.15823002771053,
sumMu = 9.93005749962466))
# single latent variable (n_clusters = 1)
set.seed(23478)
ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2,
cluster_size=5, n_clusters=1, sig_clusters=1, rho_high=1, rho_low=.5,
beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.36526166060881,
0.586409931283558, 1.2227469255493, -2.17655371414979, 0.326418806550987,
-1.09714072579561, 1.4239012653582, -0.760011975131564, -0.227256340192126,
-1.84845832529723, 0.184388770399019, -0.394752299747441, -1.47062212216842,
-0.12215942773548, -0.0749451055463801, -0.842364831584477, -1.41223753691949,
-1.68487949598676, 0.247717311839652, -1.86068883953814, 0.402628944106856,
-0.0568963388614887, -0.533358544824988, 1.23055042537552), dim = c(4L, 6L)))
testthat::expect_equal(ret$y[1:4], c(-3.91149964768012, -4.73964429926795,
0.513991031913566, -2.95972727928473))
testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
-0.338373896594486, 0.00224652036037424, -0.638671932314538),
dim = c(4L, 1L)))
testthat::expect_equal(ret$mu[1:4], c(-3.20232310305676, -3.49765450037239,
0.564187042362111, -2.19774596140566))
testthat::expect_equal(
c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
c(sumX = -28.8273582900822, sumY = -10.2198990332918, sumZ = 0.586596841562748,
sumMu = -5.15679097838167))
})## Test passed with 10 successes.
## Test passed with 10 successes.
## Test passed with 10 successes.
The two weighted generators now run the same snr / sigma_eps_sq type checks
as genClusteredData() (#35): a non-numeric snr / sigma_eps_sq is rejected
up front rather than silently passing the old lean check and failing later
inside genZmuY().
testthat::test_that("genClusteredData* reject n < 2 and non-integer cluster_size / p (#70)", {
ok <- list(n = 10, p = 8, k_unclustered = 2, cluster_size = 3, n_clusters = 1,
sig_clusters = 1, sigma_eps_sq = 1)
testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(n = 1L))),
"n >= 2", fixed = TRUE)
testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(cluster_size = 2.5))),
"cluster_size == round(cluster_size)", fixed = TRUE)
testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(p = 8.5))),
"p == round(p)", fixed = TRUE)
# n = 2 is the tight boundary -- it must still WORK (locks the threshold
# against a future "n >= 3" / "> 2" regression).
res2 <- do.call(genClusteredData, utils::modifyList(ok, list(n = 2L)))
testthat::expect_equal(nrow(res2$X), 2L)
# The shared genZmuY n-guard and the shared Pre cluster_size check cover the
# weighted generators too:
wok <- list(n = 1L, p = 8, k_unclustered = 2, cluster_size = 3, n_clusters = 1,
n_strong_cluster_vars = 2, sig_clusters = 1, rho_high = 0.8,
rho_low = 0.2, sigma_eps_sq = 1)
testthat::expect_error(do.call(genClusteredDataWeighted, wok),
"n >= 2", fixed = TRUE)
testthat::expect_error(
do.call(genClusteredDataWeighted, utils::modifyList(wok, list(n = 10L, cluster_size = 2.5))),
"cluster_size == round(cluster_size)", fixed = TRUE)
})
testthat::test_that("genClusteredDataWeighted/Random validate snr/sigma_eps_sq type (#35)", {
# Newly rejected (these previously slipped past the weighted validators' lean
# checks and only failed later, cryptically, inside genZmuY).
testthat::expect_error(
genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr="1",
sigma_eps_sq=NA),
"is.numeric(snr) | is.integer(snr) is not TRUE", fixed=TRUE)
testthat::expect_error(
genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA,
sigma_eps_sq="0.5"),
"is.numeric(sigma_eps_sq) | is.integer(sigma_eps_sq) is not TRUE", fixed=TRUE)
testthat::expect_error(
genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
beta_unclustered=-2, snr="1", sigma_eps_sq=NA),
"is.numeric(snr) | is.integer(snr) is not TRUE", fixed=TRUE)
testthat::expect_error(
genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq="0.5"),
"is.numeric(sigma_eps_sq) | is.integer(sigma_eps_sq) is not TRUE", fixed=TRUE)
# Range checks (snr > 0, sigma_eps_sq >= 0) are unchanged by #35.
testthat::expect_error(
genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=-.2,
sigma_eps_sq=NA),
"snr > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(
genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
beta_unclustered=-2, snr=NA, sigma_eps_sq=-.3),
"sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)
# A valid scalar numeric snr / sigma_eps_sq is still accepted.
ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
})## Test passed with 6 successes.
## Test passed with 7 successes.
Tests for getLassoLambda():
testthat::test_that("getLassoLambda works", {
set.seed(7252)
x <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
y <- stats::rnorm(10)
ret <- getLassoLambda(X=x, y=y, lambda_choice="1se", nfolds=4)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_true(ret >= 0)
ret <- getLassoLambda(X=x, y=y, lambda_choice="min", nfolds=5)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_true(ret >= 0)
# Different mixing parameter
ret <- getLassoLambda(X=x, y=y, lambda_choice="min", nfolds=5, alpha=0.5)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_true(ret >= 0)
# data.frame X is now accepted (coerced internally), like the sibling exports (#51)
ret_df <- getLassoLambda(X = as.data.frame(x), y = y, lambda_choice = "1se", nfolds = 4)
testthat::expect_true(is.numeric(ret_df) | is.integer(ret_df))
testthat::expect_true(!is.na(ret_df))
testthat::expect_equal(length(ret_df), 1)
testthat::expect_true(ret_df >= 0)
# Bad inputs
testthat::expect_error(getLassoLambda(X="x", y=y), "is.matrix(X) is not TRUE",
fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x[1:9, ], y=y),
"n == length(y) is not TRUE", fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=TRUE),
"must be a numeric (real-valued) vector",
fixed=TRUE)
# Error has quotation marks in it
testthat::expect_error(getLassoLambda(X=x, y=y, lambda_choice="mni"))
testthat::expect_error(getLassoLambda(X=x, y=y,
lambda_choice=c("min", "1se")),
"length(lambda_choice) == 1 is not TRUE",
fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, lambda_choice=1),
"is.character(lambda_choice) is not TRUE",
fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds="5"),
"is.numeric(nfolds) | is.integer(nfolds) is not TRUE",
fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=1:4),
"length(nfolds) == 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=3.2),
"nfolds == round(nfolds) is not TRUE", fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=3),
"nfolds > 3 is not TRUE", fixed=TRUE)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=4, alpha=1.2),
"alpha <= 1 is not TRUE", fixed=TRUE)
# alpha = 0 (degenerate ridge) is now rejected (previously allowed)
testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=4, alpha=0),
"alpha > 0 is not TRUE", fixed=TRUE)
# An NA in the design matrix now yields a friendly message (#56) rather than
# falling through to a cryptic glmnet-level error.
x_na <- x; x_na[2, 1] <- NA
testthat::expect_error(getLassoLambda(X=x_na, y=y),
"must not contain missing", fixed=TRUE)
# A data.frame with a numeric NA must error with the same message BEFORE
# coercion: model.matrix's na.action=na.omit would otherwise silently drop
# the NA row and the failure would surface downstream (length mismatch), not
# as this message. This case pins the pre-coercion placement of checkNoNAs.
df_na <- as.data.frame(x); df_na[2, 1] <- NA
testthat::expect_error(getLassoLambda(X=df_na, y=y),
"must not contain missing", fixed=TRUE)
# (#104, L7) When n is too small for even 3-fold cross-validation, error
# clearly here instead of crashing inside cv.glmnet ("nfolds must be bigger
# than 3"). With n = 4 and the default nfolds = 10, n_sample rounds to 2 (< 3).
testthat::expect_error(
getLassoLambda(X=matrix(stats::rnorm(4*5), nrow=4, ncol=5),
y=stats::rnorm(4)),
"is too small to choose lambda", fixed=TRUE)
})## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getLassoLambda(X = x, y = y, lambda_choice = "1se", nfolds = 4)
## 2. \-glmnet::cv.glmnet(...)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getLassoLambda(X = x, y = y, lambda_choice = "min", nfolds = 5)
## 2. \-glmnet::cv.glmnet(...)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getLassoLambda(...)
## 2. \-glmnet::cv.glmnet(...)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getLassoLambda(...)
## 2. \-glmnet::cv.glmnet(...)
## 3. \-glmnet:::cv.glmnet.raw(...)
## Test passed with 31 successes.
Out-of-range cluster indices are caught up front (#104, L3) through the public
css() and protolasso() entry points – covering a bare-vector clusters
argument (which is wrapped into a list after the per-element validation loop) and
a list clusters argument:
testthat::test_that("out-of-range cluster indices error via css/protolasso (#104)", {
set.seed(104105)
X <- matrix(stats::rnorm(20*8), nrow=20, ncol=8)
y <- stats::rnorm(20)
# Bare-vector clusters with an index beyond p (= 5 here).
testthat::expect_error(
css(X[, 1:5], y, lambda=0.01, clusters=c(1, 2, 99)),
"Cluster index 99 exceeds the number of features (p = 5).", fixed=TRUE)
# List clusters routed through protolasso.
testthat::expect_error(
protolasso(X[, 1:5], y, clusters=list(c(1, 2, 99))),
"Cluster index 99 exceeds the number of features (p = 5).", fixed=TRUE)
})## Test passed with 2 successes.
css() reports a too-small sample size clearly (#104, L4) instead of the
misleading “Too many training indices provided” when no training indices were
supplied:
testthat::test_that("css reports a too-small sample size clearly (#104)", {
set.seed(104104)
# No train_inds: the message must blame the sample size, not training indices.
testthat::expect_error(
css(matrix(stats::rnorm(15), nrow=3, ncol=5), stats::rnorm(3), lambda=0.01),
"is too small", fixed=TRUE)
# When train_inds leave fewer than 4 observations for selection, the original
# "Too many training indices" branch still fires.
testthat::expect_error(
css(matrix(stats::rnorm(10*5), nrow=10, ncol=5), stats::rnorm(10),
lambda=0.01, train_inds=1:7),
"Too many training indices provided", fixed=TRUE)
})## Test passed with 2 successes.
Tests for getModelSize():
testthat::test_that("getModelSize works", {
set.seed(1723)
data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
n_clusters=2, sigma_eps_sq=10^(-6))
x <- data$X
y <- data$y
good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
ret <- getModelSize(X=x, y=y, clusters=good_clusters)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_equal(ret, round(ret))
testthat::expect_true(ret >= 1)
# 11 features, but two clusters of size 3, so maximum size should
# be 11 - 2 - 2 = 7
testthat::expect_true(ret <= 7)
## Trying other inputs
unnamed_clusters <- list(1L:3L, 5L:8L)
ret <- getModelSize(X=x, y=y, clusters=unnamed_clusters)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_equal(ret, round(ret))
testthat::expect_true(ret >= 1)
# 11 features, but 3 in one cluster and 4 in another, so maximum size should
# be 11 - 2 - 3 = 6
testthat::expect_true(ret <= 6)
# Single cluster
ret <- getModelSize(X=x, y=y, clusters=2:5)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_equal(ret, round(ret))
testthat::expect_true(ret >= 1)
# 11 features, but 4 in one cluster, so maximum size should be 11 - 3 = 8
testthat::expect_true(ret <= 8)
# Regression test: when prototype reduction leaves a single column (all
# provided features in one cluster), getModelSize must return 1L rather than
# crashing in cv.glmnet. Previously X_size[, -feats_to_drop] dropped to a
# vector when one column remained.
testthat::expect_equal(getModelSize(X=x[, 1:3, drop=FALSE], y=y,
clusters=list(1:3)), 1L)
# Intentionally don't provide clusters for all feature, mix up formatting,
# etc.
good_clusters <- list(red_cluster=1:3, 5:8)
ret <- getModelSize(X=x, y=y, clusters=good_clusters)
testthat::expect_true(is.numeric(ret) | is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_equal(ret, round(ret))
testthat::expect_true(ret >= 1)
# 11 features, but 3 in one cluster and 4 in another, so maximum size should
# be 11 - 2 - 3 = 6
testthat::expect_true(ret <= 6)
## Trying bad inputs
testthat::expect_error(getModelSize(X="x", y=y, clusters=good_clusters),
"is.matrix(X) | is.data.frame(X) is not TRUE", fixed=TRUE)
testthat::expect_error(getModelSize(X=x[1:5, ], y=y, clusters=good_clusters),
"length(y) == n is not TRUE", fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=FALSE, clusters=good_clusters),
"must be a numeric (real-valued) vector",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=list(3:7, 7:10)),
"Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=list(5:8, 5:8)),
"length(clusters) == length(unique(clusters)) is not TRUE",
fixed=TRUE)
# Out-of-range cluster index now caught up front by checkFormatClustersInput
# (#104, L3); x has 11 columns, so index 50 is out of range.
testthat::expect_error(getModelSize(X=x, y=y, clusters=6:50),
"Cluster index 50 exceeds the number of features (p = 11).",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y,
clusters=list(2:3, as.integer(NA))),
"!is.na(clusters) are not all TRUE",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=list(2:3, c(4, 4, 5))),
"length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=list(1:4, -1)),
"all(clusters[[i]] >= 1) is not TRUE",
fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=list(1:4,
c(2.3, 1.2))),
"all(clusters[[i]] == round(clusters[[i]])) is not TRUE",
fixed=TRUE)
## alpha (#21): the elastic-net mixing parameter now drives the model-size
## estimate (the internal cv.glmnet), keeping it consistent with elastic-net
## feature selection.
# Validation, mirroring getLassoLambda: alpha must be in (0, 1].
testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=0),
"alpha > 0 is not TRUE", fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=1.5),
"alpha <= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=NA),
"is.numeric(alpha) | is.integer(alpha) is not TRUE",
fixed=TRUE)
# Back-compat: default is pure lasso (alpha = 1). cv.glmnet uses random CV
# folds, so seed before each call to compare apples-to-apples.
set.seed(7); a_default <- getModelSize(X=x, y=y, clusters=good_clusters)
set.seed(7); a_alpha1 <- getModelSize(X=x, y=y, clusters=good_clusters, alpha=1)
testthat::expect_equal(a_default, a_alpha1)
# Headline: on a design with a block of highly correlated relevant features
# (each feature its own singleton cluster, so cv.glmnet sees the whole block
# with no prototype reduction), a lower alpha (elastic net) keeps more of the
# correlated features than pure lasso, giving a strictly larger estimated
# size. On main, getModelSize has no alpha argument, so this cannot happen.
set.seed(2718)
n_h <- 120
z_h <- stats::rnorm(n_h)
X_block <- matrix(rep(z_h, 10), nrow=n_h) +
matrix(stats::rnorm(n_h * 10, sd=0.05), nrow=n_h)
X_noise <- matrix(stats::rnorm(n_h * 4), nrow=n_h)
X_h <- cbind(X_block, X_noise)
y_h <- 2 * z_h + stats::rnorm(n_h, sd=0.5)
singletons_h <- as.list(1:ncol(X_h))
set.seed(11)
size_lasso <- getModelSize(X=X_h, y=y_h, clusters=singletons_h, alpha=1)
set.seed(11)
size_enet <- getModelSize(X=X_h, y=y_h, clusters=singletons_h, alpha=0.2)
testthat::expect_true(size_enet >= size_lasso)
testthat::expect_true(size_enet > size_lasso)
})## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = unnamed_clusters)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = 2:5)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
## x
## 1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters, alpha = 1)
## 2. \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
## 3. \-glmnet:::cv.glmnet.raw(...)
## Test passed with 41 successes.
Tests for getSelectionPrototypes():
testthat::test_that("getSelectionPrototypes works", {
set.seed(67234)
data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
n_clusters=2, sig_clusters=1, sigma_eps_sq=10^(-6))
x <- data$X
y <- data$y
good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
css_results <- css(X=x, y=y, lambda=0.01, clusters=good_clusters)
ret <- getSelectionPrototypes(css_results, selected_clusts=good_clusters)
testthat::expect_true(is.integer(ret))
testthat::expect_true(all(!is.na(ret)))
testthat::expect_equal(length(ret), length(good_clusters))
testthat::expect_equal(length(ret), length(unique(ret)))
for(i in 1:length(ret)){
testthat::expect_true(ret[i] %in% good_clusters[[i]])
# Find the largest selection proportion of any feature in cluster i
max_prop <- max(colMeans(css_results$feat_sel_mat[, good_clusters[[i]]]))
# Find the selection proportion of the identified prototype
proto_prop <- colMeans(css_results$feat_sel_mat)[ret[i]]
testthat::expect_equal(max_prop, proto_prop)
}
# Try with only one selected cluster (still should be in a list)
ret <- getSelectionPrototypes(css_results,
selected_clusts=list(red_cluster=1L:3L))
testthat::expect_true(is.integer(ret))
testthat::expect_true(!is.na(ret))
testthat::expect_equal(length(ret), 1)
testthat::expect_true(ret %in% 1L:3L)
# Find the largest selection proportion of any feature in the cluster
max_prop <- max(colMeans(css_results$feat_sel_mat[, 1L:3L]))
# Find the selection proportion of the identified prototype
proto_prop <- colMeans(css_results$feat_sel_mat)[ret]
testthat::expect_equal(max_prop, proto_prop)
## Trying bad inputs
# Error contains quotation marks
testthat::expect_error(getSelectionPrototypes(x, good_clusters))
testthat::expect_error(getSelectionPrototypes(css_results, 1L:3L),
"is.list(selected_clusts) is not TRUE", fixed=TRUE)
testthat::expect_error(getSelectionPrototypes(css_results, list()),
"n_selected_clusts >= 1 is not TRUE", fixed=TRUE)
testthat::expect_error(getSelectionPrototypes(css_results,
list(red_cluster=1L:3L,
green_cluster=4L:6L,
bad_cluster=integer())),
"all(lengths(selected_clusts) >= 1) is not TRUE",
fixed=TRUE)
})
testthat::test_that("getSelectionPrototypes handles a constant tied column without crashing (#68)", {
set.seed(680)
n <- 20L; p <- 4L
X <- matrix(stats::rnorm(n * p), nrow = n, ncol = p)
X[, 2] <- 3.5 # feature 2 is a CONSTANT column
colnames(X) <- paste0("V", 1:p)
y <- X[, 1] + stats::rnorm(n) # y correlated with feature 1 (varying)
# feat_sel_mat: features 1 and 2 tie at selection proportion 0.5 (both
# "selected" in the same 5 of 10 subsamples), forcing the tie-break path.
B <- 10L
M <- matrix(0L, nrow = B, ncol = p)
M[1:5, 1] <- 1L
M[1:5, 2] <- 1L
css_results <- structure(list(X = X, y = y, feat_sel_mat = M), class = "cssr")
# Pre-fix: cor(X[, c(1,2)], y) -> c(<corr>, NA) -> max NA -> stopifnot abort.
proto <- getSelectionPrototypes(css_results, selected_clusts = list(c(1L, 2L)))
testthat::expect_false(is.na(proto))
testthat::expect_identical(unname(proto), 1L) # the varying, y-correlated member
# All-constant tied cluster: must not crash, picks the first deterministically.
# (suppressWarnings: the names-assignment emits a pre-existing "number of items
# to replace..." warning when proto_i stays length>1; not introduced by this
# fix, unrelated to #68.)
X2 <- X; X2[, 3] <- 2.0; X2[, 4] <- 9.0
M2 <- matrix(0L, nrow = B, ncol = p); M2[1:5, 3] <- 1L; M2[1:5, 4] <- 1L
css2 <- structure(list(X = X2, y = y, feat_sel_mat = M2), class = "cssr")
proto2 <- suppressWarnings(
getSelectionPrototypes(css2, selected_clusts = list(c(3L, 4L))))
testthat::expect_identical(unname(proto2), 3L)
})## Test passed with 17 successes.
## Test passed with 3 successes.
Tests for printCssDf():
testthat::test_that("printCssDf works", {
set.seed(67234)
data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
n_clusters=2, sig_clusters=1, sigma_eps_sq=10^(-6))
x <- data$X
y <- data$y
good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
css_results <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B=10)
ret <- printCssDf(css_results)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
"ClustSelProp", "ClustSize"))
# Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
testthat::expect_equal(nrow(ret), 7)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_true(all(names(good_clusters) %in% ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.integer(ret$ClustProtoNum))
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoNum"] %in% 1L:3L)
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoNum"] %in% 4L:6L)
other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
length(unique(ret[other_rows, "ClustProtoNum"])))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
# Try again naming features
x_named <- x
colnames(x_named) <- LETTERS[1:11]
css_results_name_feats <- css(X=x_named, y=y, lambda=0.01,
clusters=good_clusters, B=10)
ret <- printCssDf(css_results_name_feats)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoName",
"ClustProtoNum", "ClustSelProp",
"ClustSize"))
# Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
testthat::expect_equal(nrow(ret), 7)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_true(all(names(good_clusters) %in% ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.character(ret$ClustProtoName))
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoName"] %in% LETTERS[1:3])
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoName"] %in% LETTERS[4:6])
other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
testthat::expect_true(all(ret[other_rows, "ClustProtoName"] %in% LETTERS[7:11]))
testthat::expect_true(length(ret[other_rows, "ClustProtoName"]) ==
length(unique(ret[other_rows, "ClustProtoName"])))
testthat::expect_true(is.integer(ret$ClustProtoNum))
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoNum"] %in% 1L:3L)
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoNum"] %in% 4L:6L)
testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
length(unique(ret[other_rows, "ClustProtoNum"])))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
# Unnamed clusters
unnamed_clusters <- list(1:3, 4:6)
css_results_unnamed <- css(X=x, y=y, lambda=0.01, clusters=unnamed_clusters,
B=10)
ret <- printCssDf(css_results_unnamed)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
"ClustSelProp", "ClustSize"))
# Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
testthat::expect_equal(nrow(ret), 7)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.integer(ret$ClustProtoNum))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
# Try other settings for cutoff, min_num_clusts, max_num_clusts, etc.
ret <- printCssDf(css_results, max_num_clusts=3)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
"ClustSelProp", "ClustSize"))
testthat::expect_true(nrow(ret) <= 3)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.integer(ret$ClustProtoNum))
other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
length(unique(ret[other_rows, "ClustProtoNum"])))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
if("red_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoNum"] %in% 1L:3L)
testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
}
if("green_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoNum"] %in% 4L:6L)
testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
}
ret <- printCssDf(css_results, min_num_clusts=2, cutoff=1)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
"ClustSelProp", "ClustSize"))
# Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
testthat::expect_true(nrow(ret) >= 2)
testthat::expect_true(nrow(ret) <= 7)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.integer(ret$ClustProtoNum))
other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
length(unique(ret[other_rows, "ClustProtoNum"])))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
if("red_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoNum"] %in% 1L:3L)
testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
}
if("green_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoNum"] %in% 4L:6L)
testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
}
#
ret <- printCssDf(css_results, cutoff=1)
testthat::expect_true(is.data.frame(ret))
testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
"ClustSelProp", "ClustSize"))
testthat::expect_true(nrow(ret) >= 1)
testthat::expect_true(nrow(ret) <= 7)
testthat::expect_true(is.character(ret$ClustName))
testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
testthat::expect_true(is.integer(ret$ClustProtoNum))
other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
length(unique(ret[other_rows, "ClustProtoNum"])))
testthat::expect_true(is.numeric(ret$ClustSelProp))
testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
decreasing=TRUE))
testthat::expect_true(is.integer(ret$ClustSize))
testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
if("red_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="red_cluster",
"ClustProtoNum"] %in% 1L:3L)
testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
}
if("green_cluster" %in% ret$ClustName){
testthat::expect_true(ret[ret$ClustName=="green_cluster",
"ClustProtoNum"] %in% 4L:6L)
testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
}
## Trying bad inputs
# Error has quotation marks in it
testthat::expect_error(printCssDf("css_results"))
testthat::expect_error(printCssDf(css_results, cutoff=-.1),
"cutoff >= 0 is not TRUE", fixed=TRUE)
testthat::expect_error(printCssDf(css_results, min_num_clusts=3.2),
"min_num_clusts == round(min_num_clusts) is not TRUE",
fixed=TRUE)
testthat::expect_error(printCssDf(css_results, max_num_clusts="5"),
"is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
fixed=TRUE)
})## Test passed with 98 successes.
Tests for print.cssr() (and a minimal-B smoke test for css()):
testthat::test_that("print.cssr works", {
set.seed(26717)
x <- matrix(stats::rnorm(10*7), nrow=10, ncol=7)
y <- stats::rnorm(10)
good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)
css_res <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)
# print() dispatches to print.cssr, which returns its argument invisibly (the
# standard print-method convention; see print.cssr's @return / #16).
testthat::expect_invisible(print(css_res))
ret <- print(css_res)
testthat::expect_identical(ret, css_res)
# It prints the printCssDf summary table (column headers included).
testthat::expect_output(print(css_res), "ClustName")
testthat::expect_output(print(css_res), "ClustSize")
# max_num_clusts routes through to printCssDf (at most that many cluster rows).
out_all <- utils::capture.output(print(css_res))
out_one <- utils::capture.output(print(css_res, max_num_clusts=1))
testthat::expect_true(length(out_one) <= length(out_all))
})
testthat::test_that("css runs with B = 1 (minimal subsampling)", {
set.seed(26717)
x <- matrix(stats::rnorm(10*7), nrow=10, ncol=7)
y <- stats::rnorm(10)
good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)
# B = 1 is allowed (checkB requires only B > 0) but warns that small B is poor.
testthat::expect_warning(
css_b1 <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 1),
"Small values of B may lead to poor results.", fixed=TRUE)
testthat::expect_true(inherits(css_b1, "cssr"))
# The minimal-B object is still well-formed downstream.
sel <- getCssSelections(css_b1)
testthat::expect_identical(names(sel),
c("selected_clusts", "selected_feats", "weights"))
})## ClustName ClustProtoNum ClustSelProp ClustSize
## 1 banana 4 1.00 2
## 2 cantaloupe 5 0.80 1
## 3 c4 6 0.60 1
## 4 apple 1 0.55 2
## 5 c5 7 0.30 1
## ClustName ClustProtoNum ClustSelProp ClustSize
## 1 banana 4 1.00 2
## 2 cantaloupe 5 0.80 1
## 3 c4 6 0.60 1
## 4 apple 1 0.55 2
## 5 c5 7 0.30 1
## Test passed with 5 successes.
## Test passed with 3 successes.