trace tracker logo

Least cost sample size with unknown cluster sizes

Calculate least-cost sample sizes for 2-stage surveys for demonstrating disease freedom, where cluster sizes are unknown. This analysis calculates the number of clusters and the number of units within each cluster to be tested to provide a specified system sensitivity (probability of detecting disease) for the given unit and cluster-level design prevalences and test sensitivity, where actual cluster sizes are unknown. Test specificity is assumed to be 100% (or follow-up testing of any positive will be undertaken to confirm or exclude disease).

Sample sizes are optimised to minimise overall cost for given cluster and unit-level testing costs. A maximum sample size per cluster must be specified and either the number of cluster in the population or a maximum number of clusters to be tested must be specified.

Numbers of units to test in each cluster are calculated using assumed binomial sampling (sample size is small relative to cluster sizes), while numbers of clusters to test are calculated using the hypergeometric distribution approximation (sampling without replacement) if the number of clusters in the population is specified or assuming binomial sampling if not.

Design prevalence (specified level of disease to be detected) must be specified at both unit and cluster levels. Design prevalence can be specified as either:

  • a proportion of the population infected; or
  • a specific (integer) number of clusters infected (for cluster-level prevalence only and only if the number of clusters in the population is specified).


Inputs required include:

  • unit-level design prevalence (as a proportion only);
  • cluster-level design prevalence as either a proportion or an integer number of herds;
  • the estimated test sensitivity;
  • the relative cost of testing at both cluster and unit levels;
  • the target system sensitivity (SSe) which is the probability of detecting disease if it is present at the specified design prevalences;
  • the maximum sample size to be tested per clusters; and
  • The number of clusters in the population OR the maximum number of clusters to be tested.


Outputs from the analysis include:

  • A summary of the total numbers of clusters and units to be sampled, target number of units to test per cluster, estimated SeH per cluster and the achieved SSe;
  • A summary of numbers of clusters to be tested and corresponding numbers of units to test in each cluster, the estimated SeH and the relative cost for each option; and
  • An excel spreadsheet and graph of the summary results.

If it is not possible to achieve the desired system sensitivity by testing the specified maximum number of units in all (or the specified maximum number) of the clusters, a message will be returned, along with a summary of the achieved mean SeH and SSe if the maximum numbers of animals and herds were tested.

No results

No example available
No references available
# Program to calculate sample size for 2-stage freedom survey assuming perfect test specificity
# uses RSurveillance package, 

rm(list = ls())

# cat("
Test:",length(commandArgs())) test<- ifelse(length(commandArgs()) < 3, TRUE, FALSE) fpath<- ifelse(test, "webRootUrl", "rtoolsPath") # load header scripts source(paste(fpath, "R/epi_head.R", sep = "")) source(paste(fpath, "R/HTMLStream.R", sep = "")) source(paste(fpath, "R/epitools_functions.r", sep = "")) # extract command arguments # 1 = animal design prevalence, 2 = dp.a format (0=proportion, 1=number), # 3 = herd design prevalence, 4 = dp.h format (0=proportion, 1=number), # 5 = test sensitivity, 6 = test specificity = 1, 7 = cost per herd, 8 = cost per animals # 9 = target SSe, 10 = maximum sample size, 11 = use max ss (0 = F (Inf), 1 =T), # 12 = number of herds, 13 = average herd size a1<- type.convert(a0[8:16]) # cat("
", a1) dp.a<- a1[1]<- F dp.h<- a1[2]<- ifelse(dp.h <1, F, T) sens<- a1[3] spec<- 1 herd.cost<- a1[4] animal.cost<- a1[5] target.sse<- a1[6]<- a1[7] max.herds<- a1[8] herds<- a1[9] herd.n<- 0 # a1[10] heading<- "2-Stage Freedom Sample Size" heading2<- "Summary of least-cost option" heading3<- "Summary of sample size options" digits<- 4 d1<- paste(substr(date(), 1, 10), substr(date(), 20,24), " @", substr(date(), 11, 16))<- ifelse ( > 0,, Inf) # cat("
", 46) filename<- digest(Sys.time) tmp.path<- paste(fpath, "tmp/", sep = "") tmp.file<- paste(fpath, "tmp/", filename, sep = "") sinkfile<- paste(fpath, "tmp/", filename, ".txt", sep="") # fpath, # set up table of inputs inputs<- array("", dim = c(9, 1)) rownames(inputs)<- c("Unit-level design prevalence", "Cluster-level design prevalence", "No. clusters", "Test sensitivity", "Relative testing cost per cluster", "Relative testing cost per unit", "Target system sensitivity", "Maximum sample size per cluster", "Maximum number of clusters to test") inputs[1, 1]<- ifelse(, paste(dp.a, " animal(s)"), paste(dp.a*100, "%", sep = "")) inputs[2, 1]<- ifelse(, paste(dp.h, " herd(s)"), paste(dp.h*100, "%", sep = "")) inputs[3, 1]<- ifelse(herds > 0, herds, "Unknown") inputs[4:8, 1]<- a1[3:7] inputs[9, 1]<- max.herds # cat("
", 65) # calculate herd seh and corrresponding sample sizes to achieve target n.herds<- ifelse(herds > 0, herds, max.herds) # max number of herds to test herds<- ifelse(herds > 0, herds, NA) # herds in population<- 1:n.herds # seh required for numbers of of herds tested<- numeric(n.herds) for (h in 1:n.herds) { for (s in 1:100/100) { sse<- sep(herds, h, dp.h, s) if (sse >= target.sse) {[h]<- s # need SeH 0f s to acheve target SSe break } } if (s == 1 & sse < target.sse)[h]<- NA } # cat("
", 84) # sample size for given Seh<- numeric(n.herds)[]<- NA se<- which(! # index for non-NA results max.n<- ifelse(herd.n > 0, herd.n, herd.n<- ifelse(herd.n > 0, herd.n, NA) for (s in se) { for (an in 1:max.n) { seh<- sep(herd.n, an, dp.a, sens) if (seh >=[s]) {[s]<- an break } } if ((an == max.n & seh <[s]) | an >[s]<- NA } if (sum(! == 0) { # unable to achieve result by testing max numbers # tmp<-,, sens, dp.a,, dp.h,, target.sse, digits) seh.max<- sep(herd.n,, dp.a, sens) sse.max<- sep(n.herds, max.herds, dp.h, seh.max) results<- array("", dim = c(6, 1)) rownames(results)<- c("Clusters to test", "Units per cluster", "Total sample size (units)", "Estimated cluster sensitivity (SeH)", "Target SSe", "Achieved SSe") results[1]<- paste("

Unable to achieve desired SSe by testing maximum number of units in", ifelse(max.herds > 0, max.herds, paste("all", herds)), "clusters.

") results[2]<- results[3]<- n.herds * results[4]<- round(seh.max, digits) results[5]<- round(target.sse, digits) results[6]<- round(sse.max, digits) d1<- paste(substr(date(), 1, 10), substr(date(), 20,24), " @", substr(date(), 11, 16)) # reformat headers output<- paste("

", heading, "

\n") output<- paste(output, "

", "Analysed: ", d1, "

\n", sep="") output<- paste(output, "


\n") output<- paste(output, HTMLStream(inputs, cellborder = 0, classfirstline = "mbg", classfirstcolumn = "mbg", classcellinside = "left_mar", cellalign = "center", align = "left")) output<- paste(output, "


\n") output<- paste(output, "

", heading2, "

\n") output<- paste(output, HTMLStream(results, cellborder = 0, classfirstline = "mbg", classfirstcolumn = "mbg", classcellinside = "left_mar", cellalign = "center", align = "left")) cat(output) quit() } else { cost<- (herd.cost + animal.cost** # cat("
", test) tmp<- na.omit(cbind(,,, cost)) sam.sizes<- na.omit(unique( ss.first<- match(sam.sizes, ss.combinations<- cbind([ss.first],[ss.first],[ss.first], cost[ss.first]) ss.combinations[,4]<- ss.combinations[,4]/max(ss.combinations[,4]) colnames(ss.combinations)<- c("Number of clusters", "Cluster sensitivity (SeH)", "Units sampled per cluster", "Relative cost") min.cost<- ss.combinations[which.min(ss.combinations[,4]),] # cat("
", min.cost) target.herds<- min.cost[1] target.seh<- min.cost[2] target.n<- min.cost[3] sse<- sep(herds, target.herds, dp.h, target.seh) results<- array("", dim = c(5, 1)) rownames(results)<- c("Clusters to test", "Units to test per cluster", "Total sample size (units)", "Estimated cluster sensitivity (SeH)", "Achieved system sensitivity (SSe)") results[1]<- target.herds results[2]<- target.n results[3]<- target.herds*target.n results[4]<- round(min.cost[2], digits) results[5]<- round(sse, digits) } result<- results colnames(result)<- "Value" sink(sinkfile) graphfile<- paste(tmp.file, "_freedom_sample_size.png", sep="") cols<- c("darkblue", "darkgreen", "red") OpenGraphOutput(graphfile, pointsize = 12, ht = 8, wd = 8) par(mar = c(5, 4, 4, 4) + 0.1) plot(ss.combinations[,3], ss.combinations[,1], col = cols[1], type = "b", pch = 1, xlab = "Units per cluster", ylab = "Clusters to test", main = "SeH, number of clusters to test, and relative cost", cex.main = 1, bty = "n") par(new = T) plot(ss.combinations[,3], ss.combinations[,2], col = cols[2], type = "b", pch = 2, bty = "n", axes = F, xlab = "", ylab = "", ylim = range(ss.combinations[,c(2,4)])) points(ss.combinations[,3], ss.combinations[,4], col = cols[3], type = "b", pch = 3, bty = "n") abline(v = target.n, col = "purple") Axis(ss.combinations[,c(2,4)], side = 4, ylab = "") legend("right", c("Clusters to test", "Cluster sensitivity", "Relative cost"), cex = 0.8, col = cols, pch = 1:3, bty = "n") mtext("for varying numbers of units tested per cluster", side = 3, line = 0.8, cex = 0.8) mtext("vertical line shows least-cost option", cex = 0.7, side = 3, line = 0) mtext("SeH / Relative cost", side = 4, line = 2.5) CloseGraphOutput("R") sink() # write to html and file subheadings<- c(heading2, heading3) result.txt<- "" output<- html.output(heading, subheadings, inputs, results = list(result, ss.combinations), graphs = graphfile, graph.headings = "Plot of sampling options and relative costs", show.inputs = T, show.graphs = T, tmp.file, result.txt = result.txt) write.html(output, tmp.file) cat(output)