10 Tests for other useful functions

Tests for checkGenClusteredDataInputs()

testthat::test_that("checkGenClusteredDataInputs works", {
  set.seed(7612)

  # Should get no error
  checkGenClusteredDataInputs(p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
                           sig_clusters=2, rho=.8, beta_latent=1.5,
                           beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  
  checkGenClusteredDataInputs(p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
                           sig_clusters=2, rho=.8, beta_latent=1.5,
                           beta_unclustered=-2, snr=1, sigma_eps_sq=NA)
  
  # sig_clusters
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters="2", rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "is.numeric(sig_clusters) | is.integer(sig_clusters) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=4, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "sig_clusters <= n_clusters is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=-1, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "sig_clusters >= 0 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=.6, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "sig_clusters == round(sig_clusters) is not TRUE",
                         fixed=TRUE)
  
  
  # n_clusters
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5,
                                                  n_clusters="3",
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "is.numeric(n_clusters) | is.integer(n_clusters) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3.2,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "n_clusters == round(n_clusters) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=0,
                                                  sig_clusters=0, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "n_clusters >= 1 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=1, n_clusters=3,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "cluster_size >= 2 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=16, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "p >= n_clusters * cluster_size + k_unclustered is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "rho > 0 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=0,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "beta_latent != 0 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=0, snr=1,
                                                  sigma_eps_sq=NA),
                         "beta_unclustered != 0 is not TRUE", fixed=TRUE)
  
  # k_unclustered
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered="2",
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "is.numeric(k_unclustered) | is.integer(k_unclustered) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=0,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "k_unclustered >= 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2.2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "k_unclustered == round(k_unclustered) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=NA,
                                                  sigma_eps_sq=NA),
                         "Must specify one of snr or sigma_eps_sq", fixed=TRUE)

  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=-.2,
                                                  sigma_eps_sq=NA),
                         "snr > 0 is not TRUE", fixed=TRUE)

  testthat::expect_error(checkGenClusteredDataInputs(p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=NA,
                                                  sigma_eps_sq=-.3),
                         "sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)
  
})
## Test passed with 18 successes.

Tests for getNoiseVar():

testthat::test_that("getNoiseVar works", {
  # getNoiseVar(cor) returns the noise variance v such that a proxy Z + N(0, v)
  # has correlation cor with Z, i.e. v = 1/cor^2 - 1 (vectorized; cor in (0, 1]).
  testthat::expect_equal(getNoiseVar(1), 0)
  testthat::expect_equal(getNoiseVar(0.5), 3)
  testthat::expect_equal(getNoiseVar(0.9), 1/0.81 - 1)
  testthat::expect_equal(getNoiseVar(c(0.5, 1, 0.9)), c(3, 0, 1/0.81 - 1))
  # The defining property: a proxy with this noise variance has correlation cor.
  testthat::expect_equal(1/sqrt(1 + getNoiseVar(0.8)), 0.8)

  # cor must be numeric, non-NA, non-empty, and in (0, 1] (checks are vectorized).
  testthat::expect_error(getNoiseVar(0), "all(cor > 0) is not TRUE", fixed=TRUE)
  testthat::expect_error(getNoiseVar(-0.5), "all(cor > 0) is not TRUE",
    fixed=TRUE)
  testthat::expect_error(getNoiseVar(c(0.5, 0)), "all(cor > 0) is not TRUE",
    fixed=TRUE)
  testthat::expect_error(getNoiseVar(1.5), "all(cor <= 1) is not TRUE",
    fixed=TRUE)
  testthat::expect_error(getNoiseVar(NA_real_), "all(!is.na(cor)) is not TRUE",
    fixed=TRUE)
  testthat::expect_error(getNoiseVar("0.5"),
    "is.numeric(cor) | is.integer(cor) is not TRUE", fixed=TRUE)
  testthat::expect_error(getNoiseVar(numeric(0)), "length(cor) >= 1 is not TRUE",
    fixed=TRUE)
})
## Test passed with 12 successes.

Finally, tests for genClusteredData()

testthat::test_that("genClusteredData works", {
  set.seed(23478)

  ret <- genClusteredData(n=25, p=19, k_unclustered=2, cluster_size=5,
                          n_clusters=3, sig_clusters=2, rho=.99,
                          beta_latent=1.5, beta_unclustered=-2, snr=NA,
                          sigma_eps_sq=.5)
  
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  
  testthat::expect_true(is.numeric(ret$X))
  testthat::expect_true(is.matrix(ret$X))
  testthat::expect_equal(ncol(ret$X), 19)
  testthat::expect_equal(nrow(ret$X), 25)
  # X is Gaussian with mean 0 and variance 4; expect all observations to lie
  # within 5 standard deviations of mean
  testthat::expect_true(all(abs(ret$X) < 5*2))
  # Test that clusters are correlated--within-cluster correlation should be
  # high, correlation with other features should be low
  testthat::expect_true(min(cor(ret$X[, 1:5])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)
  
  testthat::expect_true(min(cor(ret$X[, 6:10])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 6:10],
                                    ret$X[, c(1:5, 11:19)]))) < .6)
  
  testthat::expect_true(min(cor(ret$X[, 11:15])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 11:15],
                                    ret$X[, c(1:10, 16:19)]))) < .6)

  cor_indeps <- cor(ret$X[, 16:19])
  testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)

  testthat::expect_true(is.numeric(ret$y))
  testthat::expect_equal(length(ret$y), 25)

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 25)
  testthat::expect_equal(ncol(ret$Z), 3)

  testthat::expect_true(is.numeric(ret$mu))
  testthat::expect_equal(length(ret$mu), 25)
  # Because y is Gaussian with mean mu and standard deviation .5 conditional on
  # mu, expect all observations to lie within 5 sds of mu
  testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))

  # Specify SNR instead of sigma_eps_sq
  ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
                          n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
                          beta_unclustered=-2, snr=1, sigma_eps_sq=NA)
  
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  
  # Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
  # the two may be given.
  testthat::expect_error(
    genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5, n_clusters=3,
                     sig_clusters=2, rho=.8, beta_latent=1.5,
                     beta_unclustered=-2, snr=.01, sigma_eps_sq=.25),
    "Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)
  
  # Try a single latent variable (z should be a one-column matrix)
  ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
                          n_clusters=1, sig_clusters=1, rho=.8, beta_latent=1.5,
                          beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 5)
  testthat::expect_equal(ncol(ret$Z), 1)
  
  # Bad inputs
  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                          cluster_size=5, n_clusters=3,
                                          sig_clusters="2", rho=.8,
                                          beta_latent=1.5, beta_unclustered=-2,
                                          snr=1, sigma_eps_sq=NA),
                         "is.numeric(sig_clusters) | is.integer(sig_clusters) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                          cluster_size=5, n_clusters=3,
                                          sig_clusters=4, rho=.8,
                                          beta_latent=1.5, beta_unclustered=-2,
                                          snr=1, sigma_eps_sq=NA),
                         "sig_clusters <= n_clusters is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                          cluster_size=5, n_clusters=3,
                                          sig_clusters=-1, rho=.8,
                                          beta_latent=1.5, beta_unclustered=-2,
                                          snr=1, sigma_eps_sq=NA),
                         "sig_clusters >= 0 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=.6, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "sig_clusters == round(sig_clusters) is not TRUE",
                         fixed=TRUE)


  # n_clusters
  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5,
                                                  n_clusters="3",
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "is.numeric(n_clusters) | is.integer(n_clusters) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3.2,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "n_clusters == round(n_clusters) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=0,
                                                  sig_clusters=0, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "n_clusters >= 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=1, n_clusters=3,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "cluster_size >= 2 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(p=16, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "p >= n_clusters * cluster_size + k_unclustered is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "rho > 0 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=0,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "beta_latent != 0 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=0, snr=1,
                                                  sigma_eps_sq=NA),
                         "beta_unclustered != 0 is not TRUE", fixed=TRUE)

  # k_unclustered
  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered="2",
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "is.numeric(k_unclustered) | is.integer(k_unclustered) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=0,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "k_unclustered >= 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2.2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=1,
                                                  sigma_eps_sq=NA),
                         "k_unclustered == round(k_unclustered) is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=NA,
                                                  sigma_eps_sq=NA),
                         "Must specify one of snr or sigma_eps_sq", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=-.2,
                                                  sigma_eps_sq=NA),
                         "snr > 0 is not TRUE", fixed=TRUE)

  testthat::expect_error(genClusteredData(n=5, p=19, k_unclustered=2,
                                                  cluster_size=5, n_clusters=3,
                                                  sig_clusters=2, rho=0.8,
                                                  beta_latent=1.5,
                                                  beta_unclustered=-2, snr=NA,
                                                  sigma_eps_sq=-.3),
                         "sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)

  # sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
  ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
                          n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
                          beta_unclustered=-2, snr=NA, sigma_eps_sq=0)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(ret$y, ret$mu)

  # k_unclustered = 1 is allowed (issue #13)
  ret <- genClusteredData(n=5, p=19, k_unclustered=1, cluster_size=5,
                          n_clusters=3, sig_clusters=2, rho=.8, beta_latent=1.5,
                          beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(nrow(ret$X), 5)
  testthat::expect_equal(ncol(ret$X), 19)

})
## Test passed with 57 successes.

Tests for genClusteredDataWeighted()

testthat::test_that("genClusteredDataWeighted works", {
  set.seed(23478)

  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_strong_cluster_vars=3, n_clusters=3,
                                  sig_clusters=2, rho_high=.99, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  testthat::expect_true(is.numeric(ret$X))
  testthat::expect_true(is.matrix(ret$X))
  testthat::expect_equal(ncol(ret$X), 19)
  testthat::expect_equal(nrow(ret$X), 25)
  # X is Gaussian with mean 0 and variance 4; expect all observations to lie
  # within 5 standard deviations of mean
  testthat::expect_true(all(abs(ret$X) < 5*2))
  # Test that clusters are correlated--within-cluster correlation should be
  # high, correlation with other features should be low
  testthat::expect_true(min(cor(ret$X[, 1:3])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)

  testthat::expect_true(min(cor(ret$X[, 6:8])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 6:10],
                                    ret$X[, c(1:5, 11:19)]))) < .6)

  testthat::expect_true(min(cor(ret$X[, 11:13])) > .9)
  testthat::expect_true(max(abs(cor(ret$X[, 11:15],
                                    ret$X[, c(1:10, 16:19)]))) < .7)

  cor_indeps <- cor(ret$X[, 16:19])
  testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)

  testthat::expect_true(is.numeric(ret$y))
  testthat::expect_equal(length(ret$y), 25)

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 25)
  testthat::expect_equal(ncol(ret$Z), 3)

  testthat::expect_true(is.numeric(ret$mu))
  testthat::expect_equal(length(ret$mu), 25)
  # Because y is Gaussian with mean mu and standard deviation .5 conditional on
  # mu, expect all observations to lie within 5 sds of mu
  testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))

  # Specify SNR instead of sigma_eps_sq
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_strong_cluster_vars=3, n_clusters=3,
                                  sig_clusters=2, rho_high=.99, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=1, sigma_eps_sq=NA)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  # Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
  # the two may be given.
  testthat::expect_error(
    genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
                             n_strong_cluster_vars=3, n_clusters=3,
                             sig_clusters=2, rho_high=.99, rho_low=.5,
                             beta_latent=1.5, beta_unclustered=-2,
                             snr=.01, sigma_eps_sq=.25),
    "Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)

  # Try a single latent variable (z should be a one-column matrix)
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_strong_cluster_vars=3, n_clusters=1,
                                  sig_clusters=1, rho_high=.99, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 25)
  testthat::expect_equal(ncol(ret$Z), 1)

  # sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_strong_cluster_vars=3, n_clusters=3,
                                  sig_clusters=2, rho_high=.99, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=0)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(ret$y, ret$mu)

  # k_unclustered = 1 is allowed (issue #13)
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=1, cluster_size=5,
                                  n_strong_cluster_vars=3, n_clusters=3,
                                  sig_clusters=2, rho_high=.99, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(ncol(ret$X), 19)

})
## Test passed with 38 successes.

Tests for genClusteredDataWeightedRandom()

testthat::test_that("genClusteredDataWeightedRandom works", {
  set.seed(23478)

  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_clusters=3,
                                  sig_clusters=2, rho_high=1, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  testthat::expect_true(is.numeric(ret$X))
  testthat::expect_true(is.matrix(ret$X))
  testthat::expect_equal(ncol(ret$X), 19)
  testthat::expect_equal(nrow(ret$X), 25)
  # X is Gaussian with mean 0 and variance 4; expect all observations to lie
  # within 5 standard deviations of mean
  testthat::expect_true(all(abs(ret$X) < 5*2))
  # Test that clusters are correlated--within-cluster correlation should be
  # high, correlation with other features should be low
  testthat::expect_true(min(cor(ret$X[, 1:3])) > .2)
  testthat::expect_true(max(abs(cor(ret$X[, 1:5], ret$X[, 6:19]))) < .6)

  testthat::expect_true(min(cor(ret$X[, 6:8])) > .2)
  testthat::expect_true(max(abs(cor(ret$X[, 6:10],
                                    ret$X[, c(1:5, 11:19)]))) < .6)

  testthat::expect_true(min(cor(ret$X[, 11:13])) > .2)
  testthat::expect_true(max(abs(cor(ret$X[, 11:15],
                                    ret$X[, c(1:10, 16:19)]))) < .7)

  cor_indeps <- cor(ret$X[, 16:19])
  testthat::expect_true(max(abs(cor_indeps[lower.tri(cor_indeps)])) < .6)

  testthat::expect_true(is.numeric(ret$y))
  testthat::expect_equal(length(ret$y), 25)

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 25)
  testthat::expect_equal(ncol(ret$Z), 3)

  testthat::expect_true(is.numeric(ret$mu))
  testthat::expect_equal(length(ret$mu), 25)
  # Because y is Gaussian with mean mu and standard deviation .5 conditional on
  # mu, expect all observations to lie within 5 sds of mu
  testthat::expect_true(all(abs(ret$y - ret$mu) < 5*.5))

  # Specify SNR instead of sigma_eps_sq
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_clusters=3,
                                  sig_clusters=2, rho_high=1, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=1, sigma_eps_sq=NA)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  # Specifying both snr and sigma_eps_sq is an error (issue #13): only one of
  # the two may be given.
  testthat::expect_error(
    genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
                                   n_clusters=3,
                                   sig_clusters=2, rho_high=.99, rho_low=.5,
                                   beta_latent=1.5, beta_unclustered=-2,
                                   snr=.01, sigma_eps_sq=.25),
    "Only one of snr and sigma_eps_sq may be specified", fixed=TRUE)

  # Try a single latent variable (z should be a one-column matrix)
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_clusters=1,
                                  sig_clusters=1, rho_high=1, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)

  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))

  testthat::expect_true(is.numeric(ret$Z))
  testthat::expect_true(is.matrix(ret$Z))
  testthat::expect_equal(nrow(ret$Z), 25)
  testthat::expect_equal(ncol(ret$Z), 1)

  # sigma_eps_sq = 0 is allowed (issue #13) and yields noiseless data (y == mu)
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
                                  n_clusters=3,
                                  sig_clusters=2, rho_high=1, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=0)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(ret$y, ret$mu)

  # k_unclustered = 1 is allowed (issue #13)
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=1, cluster_size=5,
                                  n_clusters=3,
                                  sig_clusters=2, rho_high=1, rho_low=.5,
                                  beta_latent=1.5, beta_unclustered=-2,
                                  snr=NA, sigma_eps_sq=.5)
  testthat::expect_true(is.list(ret))
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
  testthat::expect_equal(ncol(ret$X), 19)

})
## Test passed with 38 successes.

Finally, characterization snapshots that pin the exact generated output of all three genClusteredData* generators. The “… works” tests above only check distributional properties (correlations, ranges), not the exact RNG draw order (which differs across the three: bulk rnorm vs per-column vs random per column). These snapshots make the #16 consolidation of the generators and their validators provably behavior-preserving; regenerate only if the data-generating algorithm intentionally changes.

# Characterization snapshots: pin the EXACT generated values (X / y / Z / mu)
# under fixed seeds so the #16 consolidation of the genClusteredData* functions
# and their validators is provably behavior-preserving. The existing "... works"
# tests only check distributional properties (correlations, ranges), NOT the
# exact RNG draw order, which differs across the three generators (bulk vs
# per-column vs random-per-column). Regenerate ONLY if the data-generating
# algorithm intentionally changes.

testthat::test_that("genClusteredData output is byte-stable (characterization snapshot, #16)", {
  set.seed(23478)
  ret <- genClusteredData(n=25, p=19, k_unclustered=2, cluster_size=5,
    n_clusters=3, sig_clusters=2, rho=.99, beta_latent=1.5, beta_unclustered=-2,
    snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.02641989570528,
    -0.113749750828501, -0.0540798505887575, -0.719018251689331,
    -1.02002920513898, -0.133167075802704, -0.228504275273568,
    -0.594853445315136, -1.3743321671407, -0.691923420713347, -0.11653157345392,
    -0.949501763500117, -1.09911307059764, -0.234373142554013,
    -0.0610136281766905, -0.614831158775111, -0.875588846521461,
    -0.267096146835881, 0.243882464912086, -0.525573959766865, 0.38889466127034,
    -0.196999108158916, -0.548580993298435, 1.29708342622908), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
    0.0881535457958548, 1.21771079504409))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538,
    0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
    0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
    dim = 4:3))
  testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
    -0.106753094432726, 1.52394871772745))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = 7.85857128981734, sumY = 13.5810460445817, sumZ = 3.15823002771053,
      sumMu = 9.93005749962466))

  # single latent variable (n_clusters = 1): Z is a one-column matrix, and the
  # proxy block adds bulk noise to the raw vector Z.
  set.seed(23478)
  ret <- genClusteredData(n=5, p=19, k_unclustered=2, cluster_size=5,
    n_clusters=1, sig_clusters=1, rho=.8, beta_latent=1.5, beta_unclustered=-2,
    snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-0.00161764234876749,
    -0.557794858227907, -0.827386446973581, -0.460377784885637, -2.67619233265038,
    0.671711827459298, 1.65236925524076, -0.600115109545631, -0.280741265470752,
    -1.67026187144442, -0.0473803806232191, 0.0826508004462609, 0.170200984778884,
    0.976190994120857, -0.499749723868518, -0.474295388720649, -1.17346153974724,
    -0.574655697546479, -0.120791507647279, -0.567678976454219, -1.3596277359555,
    -0.0619416164679036, 0.538496824968004, 0.51591446765952), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(-0.558336409902388, -2.73369761842788,
    -0.884087327686163, -3.17343267752372))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538),
    dim = c(4L, 1L)))
  testthat::expect_equal(ret$mu[1:4], c(0.195781398124046, -1.51661095607912,
    -0.722190912632797, -2.90119777610212))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = -5.88971766152033, sumY = -7.36101068003228, sumZ = -2.4782435456402,
      sumMu = -5.08843939501818))
})

testthat::test_that("genClusteredDataWeighted output is byte-stable (characterization snapshot, #16)", {
  set.seed(23478)
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
    n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
    rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.02641989570528,
    -0.113749750828501, -0.0540798505887575, -0.719018251689331,
    -1.02002920513898, -0.133167075802704, -0.228504275273568,
    -0.594853445315136, -1.3743321671407, -0.691923420713347, -0.11653157345392,
    -0.949501763500117, -0.748805376677026, 0.925796953277494, -0.766705940388196,
    -0.348877774329742, 1.96822115764394, 0.528035720534176, 2.93942827641455,
    0.736079254765787, 0.38889466127034, -0.196999108158916, -0.548580993298435,
    1.29708342622908), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
    0.0881535457958548, 1.21771079504409))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538,
    0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
    0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
    dim = 4:3))
  testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
    -0.106753094432726, 1.52394871772745))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = 15.3829830134111, sumY = 13.5810460445817, sumZ = 3.15823002771053,
      sumMu = 9.93005749962466))

  # single latent variable (n_clusters = 1)
  set.seed(23478)
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
    n_strong_cluster_vars=3, n_clusters=1, sig_clusters=1, rho_high=.99,
    rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.07114417787482,
    -0.368013007285506, 0.014438919713251, -0.845503147886515, -1.23739163512048,
    -0.174811089926991, -0.0829364724064908, -0.479069797377207, -1.12138274604786,
    -0.402828108682171, 0.104291325515473, -0.844159398768831, 0.352430473911601,
    -1.4960796572646, -1.23544829374631, 0.148162490964714, -1.03021388828778,
    0.576416731940481, -0.513340292522171, -1.14008022670496, 0.402628944106856,
    -0.0568963388614887, -0.533358544824988, 1.23055042537552), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(-3.91149964768012, -4.73964429926795,
    0.513991031913566, -2.95972727928473))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538),
    dim = c(4L, 1L)))
  testthat::expect_equal(ret$mu[1:4], c(-3.20232310305676, -3.49765450037239,
    0.564187042362111, -2.19774596140566))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = -24.0234160115409, sumY = -10.2198990332918, sumZ = 0.586596841562748,
      sumMu = -5.15679097838167))
})

testthat::test_that("genClusteredDataWeightedRandom output is byte-stable (characterization snapshot, #16)", {
  set.seed(23478)
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2,
    cluster_size=5, n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5,
    beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.1897691657824,
    -0.00982041549935259, 0.665172439447775, -0.756543570417209, 1.15949678203449,
    -2.91344510959381, 0.491240389956153, -1.30476423524191, -0.485131132932798,
    0.707045727376943, 3.2222522503854, -3.27350445562509, -1.85428176680484,
    -0.0656090373877866, -0.551130785327929, -1.90090513408521, -1.29616909361208,
    -0.144918159176611, 0.544180897409365, -0.50148178532334, 0.627200376817289,
    0.171599744439457, -1.07507438640901, 1.5693792481444), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(1.78818155412284, 2.55825580753328,
    0.0881535457958548, 1.21771079504409))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538,
    0.402628944106856, -0.0568963388614887, -0.533358544824988, 1.23055042537552,
    0.495888208134907, 2.19477907424054, 0.357725198858587, -0.863633909554439),
    dim = 4:3))
  testthat::expect_equal(ret$mu[1:4], c(1.12211695300006, 3.29507281236772,
    -0.106753094432726, 1.52394871772745))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = 3.23025166163967, sumY = 13.5810460445817, sumZ = 3.15823002771053,
      sumMu = 9.93005749962466))

  # single latent variable (n_clusters = 1)
  set.seed(23478)
  ret <- genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2,
    cluster_size=5, n_clusters=1, sig_clusters=1, rho_high=1, rho_low=.5,
    beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_equal(ret$X[1:4, 1:6], structure(c(-1.36526166060881,
    0.586409931283558, 1.2227469255493, -2.17655371414979, 0.326418806550987,
    -1.09714072579561, 1.4239012653582, -0.760011975131564, -0.227256340192126,
    -1.84845832529723, 0.184388770399019, -0.394752299747441, -1.47062212216842,
    -0.12215942773548, -0.0749451055463801, -0.842364831584477, -1.41223753691949,
    -1.68487949598676, 0.247717311839652, -1.86068883953814, 0.402628944106856,
    -0.0568963388614887, -0.533358544824988, 1.23055042537552), dim = c(4L, 6L)))
  testthat::expect_equal(ret$y[1:4], c(-3.91149964768012, -4.73964429926795,
    0.513991031913566, -2.95972727928473))
  testthat::expect_equal(ret$Z[1:4, , drop = FALSE], structure(c(-1.13051559031852,
    -0.338373896594486, 0.00224652036037424, -0.638671932314538),
    dim = c(4L, 1L)))
  testthat::expect_equal(ret$mu[1:4], c(-3.20232310305676, -3.49765450037239,
    0.564187042362111, -2.19774596140566))
  testthat::expect_equal(
    c(sumX=sum(ret$X), sumY=sum(ret$y), sumZ=sum(ret$Z), sumMu=sum(ret$mu)),
    c(sumX = -28.8273582900822, sumY = -10.2198990332918, sumZ = 0.586596841562748,
      sumMu = -5.15679097838167))
})
## Test passed with 10 successes.
## Test passed with 10 successes.
## Test passed with 10 successes.

The two weighted generators now run the same snr / sigma_eps_sq type checks as genClusteredData() (#35): a non-numeric snr / sigma_eps_sq is rejected up front rather than silently passing the old lean check and failing later inside genZmuY().

testthat::test_that("genClusteredData* reject n < 2 and non-integer cluster_size / p (#70)", {
  ok <- list(n = 10, p = 8, k_unclustered = 2, cluster_size = 3, n_clusters = 1,
             sig_clusters = 1, sigma_eps_sq = 1)
  testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(n = 1L))),
                         "n >= 2", fixed = TRUE)
  testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(cluster_size = 2.5))),
                         "cluster_size == round(cluster_size)", fixed = TRUE)
  testthat::expect_error(do.call(genClusteredData, utils::modifyList(ok, list(p = 8.5))),
                         "p == round(p)", fixed = TRUE)
  # n = 2 is the tight boundary -- it must still WORK (locks the threshold
  # against a future "n >= 3" / "> 2" regression).
  res2 <- do.call(genClusteredData, utils::modifyList(ok, list(n = 2L)))
  testthat::expect_equal(nrow(res2$X), 2L)
  # The shared genZmuY n-guard and the shared Pre cluster_size check cover the
  # weighted generators too:
  wok <- list(n = 1L, p = 8, k_unclustered = 2, cluster_size = 3, n_clusters = 1,
              n_strong_cluster_vars = 2, sig_clusters = 1, rho_high = 0.8,
              rho_low = 0.2, sigma_eps_sq = 1)
  testthat::expect_error(do.call(genClusteredDataWeighted, wok),
                         "n >= 2", fixed = TRUE)
  testthat::expect_error(
    do.call(genClusteredDataWeighted, utils::modifyList(wok, list(n = 10L, cluster_size = 2.5))),
    "cluster_size == round(cluster_size)", fixed = TRUE)
})

testthat::test_that("genClusteredDataWeighted/Random validate snr/sigma_eps_sq type (#35)", {
  # Newly rejected (these previously slipped past the weighted validators' lean
  # checks and only failed later, cryptically, inside genZmuY).
  testthat::expect_error(
    genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
      rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr="1",
      sigma_eps_sq=NA),
    "is.numeric(snr) | is.integer(snr) is not TRUE", fixed=TRUE)
  testthat::expect_error(
    genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
      rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA,
      sigma_eps_sq="0.5"),
    "is.numeric(sigma_eps_sq) | is.integer(sigma_eps_sq) is not TRUE", fixed=TRUE)
  testthat::expect_error(
    genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
      beta_unclustered=-2, snr="1", sigma_eps_sq=NA),
    "is.numeric(snr) | is.integer(snr) is not TRUE", fixed=TRUE)
  testthat::expect_error(
    genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
      beta_unclustered=-2, snr=NA, sigma_eps_sq="0.5"),
    "is.numeric(sigma_eps_sq) | is.integer(sigma_eps_sq) is not TRUE", fixed=TRUE)

  # Range checks (snr > 0, sigma_eps_sq >= 0) are unchanged by #35.
  testthat::expect_error(
    genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
      rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=-.2,
      sigma_eps_sq=NA),
    "snr > 0 is not TRUE", fixed=TRUE)
  testthat::expect_error(
    genClusteredDataWeightedRandom(n=25, p=19, k_unclustered=2, cluster_size=5,
      n_clusters=3, sig_clusters=2, rho_high=1, rho_low=.5, beta_latent=1.5,
      beta_unclustered=-2, snr=NA, sigma_eps_sq=-.3),
    "sigma_eps_sq >= 0 is not TRUE", fixed=TRUE)

  # A valid scalar numeric snr / sigma_eps_sq is still accepted.
  ret <- genClusteredDataWeighted(n=25, p=19, k_unclustered=2, cluster_size=5,
    n_strong_cluster_vars=3, n_clusters=3, sig_clusters=2, rho_high=.99,
    rho_low=.5, beta_latent=1.5, beta_unclustered=-2, snr=NA, sigma_eps_sq=.5)
  testthat::expect_identical(names(ret), c("X", "y", "Z", "mu"))
})
## Test passed with 6 successes.
## Test passed with 7 successes.

Tests for getLassoLambda():

testthat::test_that("getLassoLambda works", {
  set.seed(7252)
  
  x <- matrix(stats::rnorm(10*6), nrow=10, ncol=6)
  y <- stats::rnorm(10)

  ret <- getLassoLambda(X=x, y=y, lambda_choice="1se", nfolds=4)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_true(ret >= 0)
  
  ret <- getLassoLambda(X=x, y=y, lambda_choice="min", nfolds=5)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_true(ret >= 0)
  
  # Different mixing parameter
  
   ret <- getLassoLambda(X=x, y=y, lambda_choice="min", nfolds=5, alpha=0.5)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_true(ret >= 0)

  # data.frame X is now accepted (coerced internally), like the sibling exports (#51)
  ret_df <- getLassoLambda(X = as.data.frame(x), y = y, lambda_choice = "1se", nfolds = 4)
  testthat::expect_true(is.numeric(ret_df) | is.integer(ret_df))
  testthat::expect_true(!is.na(ret_df))
  testthat::expect_equal(length(ret_df), 1)
  testthat::expect_true(ret_df >= 0)

  # Bad inputs
  testthat::expect_error(getLassoLambda(X="x", y=y), "is.matrix(X) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x[1:9, ], y=y),
                         "n == length(y) is not TRUE", fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=TRUE),
                         "must be a numeric (real-valued) vector",
                         fixed=TRUE)

  # Error has quotation marks in it
  testthat::expect_error(getLassoLambda(X=x, y=y, lambda_choice="mni"))

  testthat::expect_error(getLassoLambda(X=x, y=y,
                                        lambda_choice=c("min", "1se")),
                         "length(lambda_choice) == 1 is not TRUE",
                         fixed=TRUE)

  testthat::expect_error(getLassoLambda(X=x, y=y, lambda_choice=1),
                         "is.character(lambda_choice) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds="5"),
                         "is.numeric(nfolds) | is.integer(nfolds) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=1:4),
                         "length(nfolds) == 1 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=3.2),
                         "nfolds == round(nfolds) is not TRUE", fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=3),
                         "nfolds > 3 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=4, alpha=1.2),
                         "alpha <= 1 is not TRUE", fixed=TRUE)

  # alpha = 0 (degenerate ridge) is now rejected (previously allowed)
  testthat::expect_error(getLassoLambda(X=x, y=y, nfolds=4, alpha=0),
                         "alpha > 0 is not TRUE", fixed=TRUE)

  # An NA in the design matrix now yields a friendly message (#56) rather than
  # falling through to a cryptic glmnet-level error.
  x_na <- x; x_na[2, 1] <- NA
  testthat::expect_error(getLassoLambda(X=x_na, y=y),
                         "must not contain missing", fixed=TRUE)
  # A data.frame with a numeric NA must error with the same message BEFORE
  # coercion: model.matrix's na.action=na.omit would otherwise silently drop
  # the NA row and the failure would surface downstream (length mismatch), not
  # as this message. This case pins the pre-coercion placement of checkNoNAs.
  df_na <- as.data.frame(x); df_na[2, 1] <- NA
  testthat::expect_error(getLassoLambda(X=df_na, y=y),
                         "must not contain missing", fixed=TRUE)

  # (#104, L7) When n is too small for even 3-fold cross-validation, error
  # clearly here instead of crashing inside cv.glmnet ("nfolds must be bigger
  # than 3"). With n = 4 and the default nfolds = 10, n_sample rounds to 2 (< 3).
  testthat::expect_error(
    getLassoLambda(X=matrix(stats::rnorm(4*5), nrow=4, ncol=5),
                   y=stats::rnorm(4)),
    "is too small to choose lambda", fixed=TRUE)

})
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getLassoLambda(X = x, y = y, lambda_choice = "1se", nfolds = 4)
##  2.   \-glmnet::cv.glmnet(...)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getLassoLambda(X = x, y = y, lambda_choice = "min", nfolds = 5)
##  2.   \-glmnet::cv.glmnet(...)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getLassoLambda(...)
##  2.   \-glmnet::cv.glmnet(...)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getLassoLambda works -----------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getLassoLambda(...)
##  2.   \-glmnet::cv.glmnet(...)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## Test passed with 31 successes.

Out-of-range cluster indices are caught up front (#104, L3) through the public css() and protolasso() entry points – covering a bare-vector clusters argument (which is wrapped into a list after the per-element validation loop) and a list clusters argument:

testthat::test_that("out-of-range cluster indices error via css/protolasso (#104)", {
  set.seed(104105)
  X <- matrix(stats::rnorm(20*8), nrow=20, ncol=8)
  y <- stats::rnorm(20)

  # Bare-vector clusters with an index beyond p (= 5 here).
  testthat::expect_error(
    css(X[, 1:5], y, lambda=0.01, clusters=c(1, 2, 99)),
    "Cluster index 99 exceeds the number of features (p = 5).", fixed=TRUE)

  # List clusters routed through protolasso.
  testthat::expect_error(
    protolasso(X[, 1:5], y, clusters=list(c(1, 2, 99))),
    "Cluster index 99 exceeds the number of features (p = 5).", fixed=TRUE)
})
## Test passed with 2 successes.

css() reports a too-small sample size clearly (#104, L4) instead of the misleading “Too many training indices provided” when no training indices were supplied:

testthat::test_that("css reports a too-small sample size clearly (#104)", {
  set.seed(104104)

  # No train_inds: the message must blame the sample size, not training indices.
  testthat::expect_error(
    css(matrix(stats::rnorm(15), nrow=3, ncol=5), stats::rnorm(3), lambda=0.01),
    "is too small", fixed=TRUE)

  # When train_inds leave fewer than 4 observations for selection, the original
  # "Too many training indices" branch still fires.
  testthat::expect_error(
    css(matrix(stats::rnorm(10*5), nrow=10, ncol=5), stats::rnorm(10),
        lambda=0.01, train_inds=1:7),
    "Too many training indices provided", fixed=TRUE)
})
## Test passed with 2 successes.

Tests for getModelSize():

testthat::test_that("getModelSize works", {
  set.seed(1723)
  
  data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
                        n_clusters=2, sigma_eps_sq=10^(-6))
  
  x <- data$X
  y <- data$y
  
  good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
  
  ret <- getModelSize(X=x, y=y, clusters=good_clusters)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_equal(ret, round(ret))
  testthat::expect_true(ret >= 1)
  # 11 features, but two clusters of size 3, so maximum size should
  # be 11 - 2 - 2 = 7
  testthat::expect_true(ret <= 7)
  
  ## Trying other inputs

  unnamed_clusters <- list(1L:3L, 5L:8L)
  
  ret <- getModelSize(X=x, y=y, clusters=unnamed_clusters)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_equal(ret, round(ret))
  testthat::expect_true(ret >= 1)
  # 11 features, but 3 in one cluster and 4 in another, so maximum size should
  # be 11 - 2 - 3 = 6
  testthat::expect_true(ret <= 6)
  
  # Single cluster
  ret <- getModelSize(X=x, y=y, clusters=2:5)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_equal(ret, round(ret))
  testthat::expect_true(ret >= 1)
  # 11 features, but 4 in one cluster, so maximum size should be 11 - 3 = 8
  testthat::expect_true(ret <= 8)

  # Regression test: when prototype reduction leaves a single column (all
  # provided features in one cluster), getModelSize must return 1L rather than
  # crashing in cv.glmnet. Previously X_size[, -feats_to_drop] dropped to a
  # vector when one column remained.
  testthat::expect_equal(getModelSize(X=x[, 1:3, drop=FALSE], y=y,
                                      clusters=list(1:3)), 1L)


  # Intentionally don't provide clusters for all feature, mix up formatting,
  # etc.
  good_clusters <- list(red_cluster=1:3, 5:8)
  
  ret <- getModelSize(X=x, y=y, clusters=good_clusters)
  
  testthat::expect_true(is.numeric(ret) | is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_equal(ret, round(ret))
  testthat::expect_true(ret >= 1)
  # 11 features, but 3 in one cluster and 4 in another, so maximum size should
  # be 11 - 2 - 3 = 6
  testthat::expect_true(ret <= 6)
  
  ## Trying bad inputs
  
  testthat::expect_error(getModelSize(X="x", y=y, clusters=good_clusters),
                         "is.matrix(X) | is.data.frame(X) is not TRUE", fixed=TRUE)
  
  testthat::expect_error(getModelSize(X=x[1:5, ], y=y, clusters=good_clusters),
                         "length(y) == n is not TRUE", fixed=TRUE)

  testthat::expect_error(getModelSize(X=x, y=FALSE, clusters=good_clusters),
                         "must be a numeric (real-valued) vector",
                         fixed=TRUE)

  testthat::expect_error(getModelSize(X=x, y=y, clusters=list(3:7, 7:10)),
                         "Overlapping clusters detected; clusters must be non-overlapping. Overlapping clusters: 1, 2.",
                         fixed=TRUE)
  
  testthat::expect_error(getModelSize(X=x, y=y, clusters=list(5:8, 5:8)),
                         "length(clusters) == length(unique(clusters)) is not TRUE",
                         fixed=TRUE)

  # Out-of-range cluster index now caught up front by checkFormatClustersInput
  # (#104, L3); x has 11 columns, so index 50 is out of range.
  testthat::expect_error(getModelSize(X=x, y=y, clusters=6:50),
                         "Cluster index 50 exceeds the number of features (p = 11).",
                         fixed=TRUE)

  testthat::expect_error(getModelSize(X=x, y=y,
                                      clusters=list(2:3, as.integer(NA))),
                         "!is.na(clusters) are not all TRUE",
                         fixed=TRUE)

  testthat::expect_error(getModelSize(X=x, y=y, clusters=list(2:3, c(4, 4, 5))),
                         "length(clusters[[i]]) == length(unique(clusters[[i]])) is not TRUE",
                         fixed=TRUE)

   testthat::expect_error(getModelSize(X=x, y=y, clusters=list(1:4, -1)),
                         "all(clusters[[i]] >= 1) is not TRUE",
                         fixed=TRUE)

   testthat::expect_error(getModelSize(X=x, y=y, clusters=list(1:4,
                                                               c(2.3, 1.2))),
                         "all(clusters[[i]] == round(clusters[[i]])) is not TRUE",
                         fixed=TRUE)

  ## alpha (#21): the elastic-net mixing parameter now drives the model-size
  ## estimate (the internal cv.glmnet), keeping it consistent with elastic-net
  ## feature selection.

  # Validation, mirroring getLassoLambda: alpha must be in (0, 1].
  testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=0),
                         "alpha > 0 is not TRUE", fixed=TRUE)
  testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=1.5),
                         "alpha <= 1 is not TRUE", fixed=TRUE)
  testthat::expect_error(getModelSize(X=x, y=y, clusters=good_clusters, alpha=NA),
                         "is.numeric(alpha) | is.integer(alpha) is not TRUE",
                         fixed=TRUE)

  # Back-compat: default is pure lasso (alpha = 1). cv.glmnet uses random CV
  # folds, so seed before each call to compare apples-to-apples.
  set.seed(7); a_default <- getModelSize(X=x, y=y, clusters=good_clusters)
  set.seed(7); a_alpha1 <- getModelSize(X=x, y=y, clusters=good_clusters, alpha=1)
  testthat::expect_equal(a_default, a_alpha1)

  # Headline: on a design with a block of highly correlated relevant features
  # (each feature its own singleton cluster, so cv.glmnet sees the whole block
  # with no prototype reduction), a lower alpha (elastic net) keeps more of the
  # correlated features than pure lasso, giving a strictly larger estimated
  # size. On main, getModelSize has no alpha argument, so this cannot happen.
  set.seed(2718)
  n_h <- 120
  z_h <- stats::rnorm(n_h)
  X_block <- matrix(rep(z_h, 10), nrow=n_h) +
    matrix(stats::rnorm(n_h * 10, sd=0.05), nrow=n_h)
  X_noise <- matrix(stats::rnorm(n_h * 4), nrow=n_h)
  X_h <- cbind(X_block, X_noise)
  y_h <- 2 * z_h + stats::rnorm(n_h, sd=0.5)
  singletons_h <- as.list(1:ncol(X_h))
  set.seed(11)
  size_lasso <- getModelSize(X=X_h, y=y_h, clusters=singletons_h, alpha=1)
  set.seed(11)
  size_enet <- getModelSize(X=X_h, y=y_h, clusters=singletons_h, alpha=0.2)
  testthat::expect_true(size_enet >= size_lasso)
  testthat::expect_true(size_enet > size_lasso)

})
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = unnamed_clusters)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = 2:5)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## -- Warning: getModelSize works -------------------------------------------------
## Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
## Backtrace:
##     x
##  1. \-litr (local) getModelSize(X = x, y = y, clusters = good_clusters, alpha = 1)
##  2.   \-glmnet::cv.glmnet(x = X_size, y = y, family = "gaussian", alpha = alpha)
##  3.     \-glmnet:::cv.glmnet.raw(...)
## Test passed with 41 successes.

Tests for getSelectionPrototypes():

testthat::test_that("getSelectionPrototypes works", {
  set.seed(67234)
  
  data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
                        n_clusters=2, sig_clusters=1, sigma_eps_sq=10^(-6))
  
  x <- data$X
  y <- data$y
  
  good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
  
  css_results <- css(X=x, y=y, lambda=0.01, clusters=good_clusters)
  
  ret <- getSelectionPrototypes(css_results, selected_clusts=good_clusters)
  
  testthat::expect_true(is.integer(ret))
  testthat::expect_true(all(!is.na(ret)))
  testthat::expect_equal(length(ret), length(good_clusters))
  testthat::expect_equal(length(ret), length(unique(ret)))
  for(i in 1:length(ret)){
    testthat::expect_true(ret[i] %in% good_clusters[[i]])
    # Find the largest selection proportion of any feature in cluster i
    max_prop <- max(colMeans(css_results$feat_sel_mat[, good_clusters[[i]]]))
    # Find the selection proportion of the identified prototype
    proto_prop <- colMeans(css_results$feat_sel_mat)[ret[i]]
    testthat::expect_equal(max_prop, proto_prop)
  }
  
  # Try with only one selected cluster (still should be in a list)

  ret <- getSelectionPrototypes(css_results,
                                selected_clusts=list(red_cluster=1L:3L))

  testthat::expect_true(is.integer(ret))
  testthat::expect_true(!is.na(ret))
  testthat::expect_equal(length(ret), 1)
  testthat::expect_true(ret %in% 1L:3L)
  # Find the largest selection proportion of any feature in the cluster
  max_prop <- max(colMeans(css_results$feat_sel_mat[, 1L:3L]))
  # Find the selection proportion of the identified prototype
  proto_prop <- colMeans(css_results$feat_sel_mat)[ret]
  testthat::expect_equal(max_prop, proto_prop)

  ## Trying bad inputs

  # Error contains quotation marks
  testthat::expect_error(getSelectionPrototypes(x, good_clusters))

  testthat::expect_error(getSelectionPrototypes(css_results, 1L:3L),
                         "is.list(selected_clusts) is not TRUE", fixed=TRUE)

  testthat::expect_error(getSelectionPrototypes(css_results, list()),
                         "n_selected_clusts >= 1 is not TRUE", fixed=TRUE)

  testthat::expect_error(getSelectionPrototypes(css_results,
                                                list(red_cluster=1L:3L,
                                                     green_cluster=4L:6L,
                                                     bad_cluster=integer())),
                         "all(lengths(selected_clusts) >= 1) is not TRUE",
                         fixed=TRUE)

})

testthat::test_that("getSelectionPrototypes handles a constant tied column without crashing (#68)", {
  set.seed(680)
  n <- 20L; p <- 4L
  X <- matrix(stats::rnorm(n * p), nrow = n, ncol = p)
  X[, 2] <- 3.5                       # feature 2 is a CONSTANT column
  colnames(X) <- paste0("V", 1:p)
  y <- X[, 1] + stats::rnorm(n)       # y correlated with feature 1 (varying)
  # feat_sel_mat: features 1 and 2 tie at selection proportion 0.5 (both
  # "selected" in the same 5 of 10 subsamples), forcing the tie-break path.
  B <- 10L
  M <- matrix(0L, nrow = B, ncol = p)
  M[1:5, 1] <- 1L
  M[1:5, 2] <- 1L
  css_results <- structure(list(X = X, y = y, feat_sel_mat = M), class = "cssr")
  # Pre-fix: cor(X[, c(1,2)], y) -> c(<corr>, NA) -> max NA -> stopifnot abort.
  proto <- getSelectionPrototypes(css_results, selected_clusts = list(c(1L, 2L)))
  testthat::expect_false(is.na(proto))
  testthat::expect_identical(unname(proto), 1L)   # the varying, y-correlated member

  # All-constant tied cluster: must not crash, picks the first deterministically.
  # (suppressWarnings: the names-assignment emits a pre-existing "number of items
  # to replace..." warning when proto_i stays length>1; not introduced by this
  # fix, unrelated to #68.)
  X2 <- X; X2[, 3] <- 2.0; X2[, 4] <- 9.0
  M2 <- matrix(0L, nrow = B, ncol = p); M2[1:5, 3] <- 1L; M2[1:5, 4] <- 1L
  css2 <- structure(list(X = X2, y = y, feat_sel_mat = M2), class = "cssr")
  proto2 <- suppressWarnings(
      getSelectionPrototypes(css2, selected_clusts = list(c(3L, 4L))))
  testthat::expect_identical(unname(proto2), 3L)
})
## Test passed with 17 successes.
## Test passed with 3 successes.

Tests for printCssDf():

testthat::test_that("printCssDf works", {
  set.seed(67234)
  
  data <- genClusteredData(n=15, p=11, k_unclustered=2, cluster_size=3,
                        n_clusters=2, sig_clusters=1, sigma_eps_sq=10^(-6))
  
  x <- data$X
  y <- data$y
  
  good_clusters <- list(red_cluster=1L:3L, green_cluster=4L:6L)
  
  css_results <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B=10)
  
  ret <- printCssDf(css_results)
  
  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
                                              "ClustSelProp", "ClustSize"))
  
  # Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
  testthat::expect_equal(nrow(ret), 7)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_true(all(names(good_clusters) %in% ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
  
  testthat::expect_true(is.integer(ret$ClustProtoNum))
  testthat::expect_true(ret[ret$ClustName=="red_cluster",
                            "ClustProtoNum"] %in% 1L:3L)
  testthat::expect_true(ret[ret$ClustName=="green_cluster",
                            "ClustProtoNum"] %in% 4L:6L)
  other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
  testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
  testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
                          length(unique(ret[other_rows, "ClustProtoNum"])))
  
  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))
  
  testthat::expect_true(is.integer(ret$ClustSize))
  testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
  testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
  testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
  
  # Try again naming features
  
  x_named <- x
  colnames(x_named) <- LETTERS[1:11]
  
  css_results_name_feats <- css(X=x_named, y=y, lambda=0.01,
                                clusters=good_clusters, B=10)
  
  ret <- printCssDf(css_results_name_feats)
  
  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoName",
                                              "ClustProtoNum", "ClustSelProp",
                                              "ClustSize"))
  
  # Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
  testthat::expect_equal(nrow(ret), 7)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_true(all(names(good_clusters) %in% ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
  
  testthat::expect_true(is.character(ret$ClustProtoName))
  testthat::expect_true(ret[ret$ClustName=="red_cluster",
                            "ClustProtoName"] %in% LETTERS[1:3])
  testthat::expect_true(ret[ret$ClustName=="green_cluster",
                            "ClustProtoName"] %in% LETTERS[4:6])
  other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
  testthat::expect_true(all(ret[other_rows, "ClustProtoName"] %in% LETTERS[7:11]))
  testthat::expect_true(length(ret[other_rows, "ClustProtoName"]) ==
                          length(unique(ret[other_rows, "ClustProtoName"])))
  
  testthat::expect_true(is.integer(ret$ClustProtoNum))
  testthat::expect_true(ret[ret$ClustName=="red_cluster",
                            "ClustProtoNum"] %in% 1L:3L)
  testthat::expect_true(ret[ret$ClustName=="green_cluster",
                            "ClustProtoNum"] %in% 4L:6L)
  testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
  testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
                          length(unique(ret[other_rows, "ClustProtoNum"])))
  
  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))
  
  testthat::expect_true(is.integer(ret$ClustSize))
  testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
  testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
  testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
  
  # Unnamed clusters
  
  unnamed_clusters <- list(1:3, 4:6)
  
  css_results_unnamed <- css(X=x, y=y, lambda=0.01, clusters=unnamed_clusters,
                             B=10)
  
  ret <- printCssDf(css_results_unnamed)
  
  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
                                              "ClustSelProp", "ClustSize"))
  
  # Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
  testthat::expect_equal(nrow(ret), 7)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
  
  testthat::expect_true(is.integer(ret$ClustProtoNum))
  
  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))
  
  testthat::expect_true(is.integer(ret$ClustSize))
  
  # Try other settings for cutoff, min_num_clusts, max_num_clusts, etc.
  
  ret <- printCssDf(css_results, max_num_clusts=3)
  
  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
                                              "ClustSelProp", "ClustSize"))

  testthat::expect_true(nrow(ret) <= 3)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
  
  testthat::expect_true(is.integer(ret$ClustProtoNum))
  other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
  testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
  testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
                          length(unique(ret[other_rows, "ClustProtoNum"])))
  
  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))
  
  testthat::expect_true(is.integer(ret$ClustSize))
  testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
  
  if("red_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="red_cluster",
                              "ClustProtoNum"] %in% 1L:3L)
    testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
  }
  
  if("green_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="green_cluster",
                              "ClustProtoNum"] %in% 4L:6L)
    testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
  }
  
  ret <- printCssDf(css_results, min_num_clusts=2, cutoff=1)
  
  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
                                              "ClustSelProp", "ClustSize"))
  
  # Total number of clusters is 11 - (3 - 1) - (3 - 1) = 7
  testthat::expect_true(nrow(ret) >= 2)
  testthat::expect_true(nrow(ret) <= 7)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))
  
  testthat::expect_true(is.integer(ret$ClustProtoNum))
  other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
  testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
  testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
                          length(unique(ret[other_rows, "ClustProtoNum"])))
  
  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))
  
  testthat::expect_true(is.integer(ret$ClustSize))
  testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))
  
  if("red_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="red_cluster",
                              "ClustProtoNum"] %in% 1L:3L)
    testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
  }
  
  if("green_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="green_cluster",
                              "ClustProtoNum"] %in% 4L:6L)
    testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
  }
  
  #
  ret <- printCssDf(css_results, cutoff=1)

  testthat::expect_true(is.data.frame(ret))
  testthat::expect_identical(colnames(ret), c("ClustName", "ClustProtoNum",
                                              "ClustSelProp", "ClustSize"))

  testthat::expect_true(nrow(ret) >= 1)
  testthat::expect_true(nrow(ret) <= 7)

  testthat::expect_true(is.character(ret$ClustName))
  testthat::expect_equal(length(ret$ClustName), length(unique(ret$ClustName)))

  testthat::expect_true(is.integer(ret$ClustProtoNum))
  other_rows <- !(ret$ClustName %in% c("red_cluster", "green_cluster"))
  testthat::expect_true(all(ret[other_rows, "ClustProtoNum"] %in% 7L:11L))
  testthat::expect_true(length(ret[other_rows, "ClustProtoNum"]) ==
                          length(unique(ret[other_rows, "ClustProtoNum"])))

  testthat::expect_true(is.numeric(ret$ClustSelProp))
  testthat::expect_identical(ret$ClustSelProp, sort(ret$ClustSelProp,
                                                    decreasing=TRUE))

  testthat::expect_true(is.integer(ret$ClustSize))
  testthat::expect_true(all(ret[other_rows, "ClustSize"] == 1))

  if("red_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="red_cluster",
                              "ClustProtoNum"] %in% 1L:3L)
    testthat::expect_equal(ret[ret$ClustName=="red_cluster", "ClustSize"], 3)
  }

  if("green_cluster" %in% ret$ClustName){
    testthat::expect_true(ret[ret$ClustName=="green_cluster",
                              "ClustProtoNum"] %in% 4L:6L)
    testthat::expect_equal(ret[ret$ClustName=="green_cluster", "ClustSize"], 3)
  }

  ## Trying bad inputs

  # Error has quotation marks in it
  testthat::expect_error(printCssDf("css_results"))
  
  testthat::expect_error(printCssDf(css_results, cutoff=-.1),
                         "cutoff >= 0 is not TRUE", fixed=TRUE)
  
  testthat::expect_error(printCssDf(css_results, min_num_clusts=3.2),
                         "min_num_clusts == round(min_num_clusts) is not TRUE",
                         fixed=TRUE)
  
  testthat::expect_error(printCssDf(css_results, max_num_clusts="5"),
                         "is.numeric(max_num_clusts) | is.integer(max_num_clusts) is not TRUE",
                         fixed=TRUE)
})
## Test passed with 98 successes.

Tests for print.cssr() (and a minimal-B smoke test for css()):

testthat::test_that("print.cssr works", {
  set.seed(26717)
  x <- matrix(stats::rnorm(10*7), nrow=10, ncol=7)
  y <- stats::rnorm(10)
  good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)
  css_res <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 10)

  # print() dispatches to print.cssr, which returns its argument invisibly (the
  # standard print-method convention; see print.cssr's @return / #16).
  testthat::expect_invisible(print(css_res))
  ret <- print(css_res)
  testthat::expect_identical(ret, css_res)

  # It prints the printCssDf summary table (column headers included).
  testthat::expect_output(print(css_res), "ClustName")
  testthat::expect_output(print(css_res), "ClustSize")

  # max_num_clusts routes through to printCssDf (at most that many cluster rows).
  out_all <- utils::capture.output(print(css_res))
  out_one <- utils::capture.output(print(css_res, max_num_clusts=1))
  testthat::expect_true(length(out_one) <= length(out_all))
})

testthat::test_that("css runs with B = 1 (minimal subsampling)", {
  set.seed(26717)
  x <- matrix(stats::rnorm(10*7), nrow=10, ncol=7)
  y <- stats::rnorm(10)
  good_clusters <- list("apple"=1:2, "banana"=3:4, "cantaloupe"=5)

  # B = 1 is allowed (checkB requires only B > 0) but warns that small B is poor.
  testthat::expect_warning(
    css_b1 <- css(X=x, y=y, lambda=0.01, clusters=good_clusters, B = 1),
    "Small values of B may lead to poor results.", fixed=TRUE)
  testthat::expect_true(inherits(css_b1, "cssr"))

  # The minimal-B object is still well-formed downstream.
  sel <- getCssSelections(css_b1)
  testthat::expect_identical(names(sel),
    c("selected_clusts", "selected_feats", "weights"))
})
##    ClustName ClustProtoNum ClustSelProp ClustSize
## 1     banana             4         1.00         2
## 2 cantaloupe             5         0.80         1
## 3         c4             6         0.60         1
## 4      apple             1         0.55         2
## 5         c5             7         0.30         1
##    ClustName ClustProtoNum ClustSelProp ClustSize
## 1     banana             4         1.00         2
## 2 cantaloupe             5         0.80         1
## 3         c4             6         0.60         1
## 4      apple             1         0.55         2
## 5         c5             7         0.30         1
## Test passed with 5 successes.
## Test passed with 3 successes.