Least-cost sample size with sampling frame of cluster sizes

Calculate least-cost sample sizes for 2-stage surveys for demonstrating disease freedom. This analysis calculates the number of clusters and the number of units within each cluster to be tested to provide a specified system sensitivity (probability of detecting disease) for the given unit and cluster-level design prevalences and test sensitivity. Calculations are based on actual cluster sizes provided (for the entire population) and a list of randomly selected clusters, along with the number of units to sample for each selected cluster is included in the outputs. Test specificity is assumed to be 100% (or follow-up testing of any positive will be undertaken to confirm or exclude disease).

Sample sizes are optimised to minimise overall cost for given cluster and unit-level testing costs. A maximum sample size per cluster can be specified, if desired and calculations can be specified to ensure either a fixed sample size per cluster or a fixed (minimum) cluster sensitivity.

Sample sizes are calculated using the hypergeometric probability approximation (assuming sampling without replacement).

Design prevalence (specified level of disease to be detected) must be specified, at both unit and cluster levels. Design prevalence can be specified as either:

a proportion of the population infected; or
a specific (integer) number of clusters or units (within clusters) infected.

Inputs

Inputs required include:

unit-level design prevalence as either a proportion or an integer number of units;
cluster-level design prevalence as either a proportion or an integer number of clusters;
the estimated test sensitivity;
the relative cost of testing at both cluster and unit levels;
the target system sensitivity (SSe) which is the probability of detecting disease if it is present at the specified design prevalences;
an optional maximum sample size to be tested per cluster;
whether calculations are to be based on maintaining a fixed sample size per cluster or a fixed (minimum) cluster sensitivity (SeH);
sampling frame data for all clusters in the population, including cluster id (labelled "HerdID") and cluster size (labelled "HerdSize").

Note: Any clusters with a cluster size < 1 or where clusters size is missing are excluded from the calculations.

Outputs

Outputs from the analysis include:

A summary of the total numbers of clusters and units to be sampled, target number of units to test per selected cluster, mean SeH and achieved SSe;
A list of clusters randomly selected for testing, the number of units to be tested for each cluster and the corresponding SeH;
A graph of required numbers of clusters to test, SeH and relative costs for varying numbers of units tested per cluster; and
An excel spreadsheet of the summary results and herd list.

If it is not possible to achieve the desired system sensitivity by testing all (or the specified maximum number of) units in all of the herds, a message will be returned, along with a summary of the achieved mean SeH and SSe if all units were tested. In this case a list of all clusters, and the SeH achieved if all or the maximum number of units were tested.

No results

No example available

No references available

				######################################
# Program to calculate sample size for 2-stage freedom survey assuming perfect test specificity
######################################
# uses RSurveillance package, 

 rm(list = ls())

# cat("
Test:",length(commandArgs()))
test<- ifelse(length(commandArgs()) < 3, TRUE, FALSE)
fpath<- ifelse(test, "webRootUrl", "rtoolsPath")

# load header scripts
  source(paste(fpath, "R/epi_head.R", sep = ""))
  source(paste(fpath, "R/HTMLStream.R", sep = ""))
  source(paste(fpath, "R/epitools_functions.r", sep = ""))

# extract command arguments 
# 1 = animal design prevalence, 2 = dp.a format (0=proportion, 1=number), 
# 3 = herd design prevalence, 4 = dp.h format (0=proportion, 1=number), 
# 5 = test sensitivity, 6 = test specificity = 1, 7 = cost per herd, 8 = cost per animals 
# 9 = target SSe, 10 = maximum sample size, 11 = use constant SS

    a1<- type.convert(a0[8:15])

 cat("
", a1)

dp.a<- a1[1] 
dp.a.int<- ifelse(dp.a <1, F, T)          
dp.h<- a1[2]           
dp.h.int<- ifelse(dp.h <1, F, T)          
sens<- a1[3]
spec<- 1
herd.cost<- a1[4]     
animal.cost<- a1[5]   
target.sse<- a1[6] 
max.ss<- a1[7]
ss.constant<- a1[8]
heading<- "2-Stage Freedom Sample Size"
heading2<- "Testing summary"
heading3<- "Details of selected clusters"
digits<- 4
use.max.ss<- ifelse(max.ss > 0, 1, 0)
if (!use.max.ss) max.ss<- Inf

filename<- digest(Sys.time)
tmp.path<- paste(fpath, "tmp/", sep = "")
tmp.file<- paste(fpath, "tmp/", filename, sep = "")
sinkfile<- paste(fpath, "tmp/", filename, ".txt", sep="")   # fpath,
data.file<- ifelse(test, paste(fpath, "docs/2StageFreedomSSDemo.txt", sep = ""), a0[length(a0)])              #test name
 cat("
", data.file)
dat1<- read.csv(data.file)       # first clumn = cluster id, 2nd = test result (T/F or 1/0)
dat<- dat1[dat1$HerdSize >= 1 & !is.na(dat1$HerdSize),]
herds<- nrow(dat)
herd.n<- round(mean(dat$HerdSize), 0)
cat("
", colnames(dat))
cat("
", nrow(dat), ncol(dat))
# set up table of inputs
inputs<- array("", dim = c(11, 1))
rownames(inputs)<- c("Unit-level design prevalence", "Cluster-level design prevalence", 
                  "No. clusters (cluster size >= 1)", "Mean cluster size",
                  "Test sensitivity", "Relative testing cost per cluster", 
                  "Relative testing cost per unit", "Target system sensitivity", 
                  "Maximum sample size per unit", "Sampling target", "Clusters excluded (cluster size < 1)")
inputs[1, 1]<- ifelse(dp.a.int, paste(dp.a, "unit(s)"), paste(dp.a*100, "%", sep = ""))
inputs[2, 1]<- ifelse(dp.h.int, paste(dp.h, "cluster(s)"), paste(dp.h*100, "%", sep = ""))  
inputs[3, 1]<- herds
inputs[4, 1]<- herd.n
inputs[5:9, 1]<- a1[3:(length(a1)-1)]
inputs[nrow(inputs)-1, 1]<- ifelse(ss.constant, "Fixed sample size", "Fixed (minimum) SeH")
inputs[nrow(inputs), 1]<- nrow(dat1) - nrow(dat)

 cat("
", herd.n)

# calculate least cost sample size for average herd size
attach(dat)
# cat(names(dat))
if (!exists("HerdSize") || !exists("HerdID"))  {
      cat("
I cannot do the analysis because columns labeled 'HerdID' and 'HerdSize' are not included in the data. 
Please correct the data and re-try the analysis.
")
      quit()
    }
if (!exists("HerdSize"))  {
      cat("
I cannot do the analysis because there is no column labeled 'HerdSize' in the data. 
Please correct the data and re-try the analysis.
")
      quit()
    }
if (!exists("HerdID"))  {
      cat("
I cannot do the analysis because there is no column labeled 'HerdID' in the data. 
Please correct the data and re-try the analysis.
")
      quit()
    }
 cat(94)

# select sample with fixed sample size 
fixed.ss<- function(dat, herds, target.n, cp, sens, sp, dp.a, dp.a.int, dp.h, dp.h.int, target.sse, digits) {
  herd.list<- sample(1:herds, herds, replace = F)
  h.lst<- dat$HerdID[herd.list]
  herd.sizes<- dat$HerdSize[herd.list]     
	cat("
", target.n)
  sample.sizes<- pmin(herd.sizes, target.n)
  herd.seh<- sep(N=herd.sizes, n=sample.sizes, pstar=dp.a, se=sens)
  avg.seh<- cumsum(herd.seh)/(1:herds)
  cum.sse<- sep(rep(herds, herds), 1:herds, pstar=dp.h, se=avg.seh) 
	cat("
", which(cum.sse >= target.sse)[1])
  to.test<- which(cum.sse >= target.sse)[1]
	cat("
", to.test)
  tmp<- cbind(h.lst[1:herds.to.test], herd.sizes[1:herds.to.test], sample.sizes[1:herds.to.test],
              herd.seh[1:herds.to.test], avg.seh[1:herds.to.test], cum.sse[1:herds.to.test])
	cat("
", sens)
  herd.order<- order(herd.list[1:herds.to.test])
  test.list<- data.frame("HerdID" = h.lst[herd.order], "HerdSize" = herd.sizes[herd.order], "SampleSize" = sample.sizes[herd.order],
                         "HerdSeH" = herd.seh[herd.order])
  
  results<- array("", dim = c(5, 1))
  rownames(results)<- c("Clusters to test", "Total sample size (units)", "Samples per cluster", 
                        "Mean cluster sensitivity (SeH)", "Achieved system sensitivity (SSe)")
  results[1]<- nrow(test.list)
  results[2]<- sum(test.list$SampleSize)
  results[3]<- target.n
  results[4]<- round(mean(test.list$HerdSeH), digits)
  results[5]<- round(cum.sse[herds.to.test], digits)
  test.list$HerdSeH<- ifelse(test.list$HerdSeH == rep(0, results[1]), 0, ifelse(test.list$HerdSeH > rep(10, results[1]), round(test.list$HerdSeH, 0), ifelse(test.list$HerdSeH >= rep(1, results[1]), round(test.list$HerdSeH, 1), ifelse(test.list$HerdSeH < 10^-digits, paste("<", format(10^-digits, scientific = F), sep = ""), format(round(test.list$HerdSeH, digits), scientific = F)))))
  
  return(list(results, test.list))
} # end of fixed.ss

# calculate for fixed seh per herd
fixed.seh<- function(dat, herds, target.seh, cp, sens, sp, dp.a, dp.a.int, dp.h, dp.h.int, max.ss, target.sse, target.n, digits) {
  herd.list<- sample(1:herds, herds, replace = F)
  h.lst<- dat$HerdID[herd.list]
  herd.sizes<- dat$HerdSize[herd.list]  
  herd.seh<- numeric(herds) 
  ss<- numeric(herds)        
  for (h in 1:herds) {
    seh.list<- sep(N=rep(herd.sizes[h], min(max.ss, herd.sizes[h])), n=1:min(max.ss, herd.sizes[h]), se=sens, pstar=dp.a)
    tmp<- which(seh.list >= target.seh)
    if(length(tmp) == 0) {      # unable to achieve target by testing to max
      herd.seh[h]<- seh.list[length(seh.list)]
      ss[h]<- length(seh.list)
    } else {
      herd.seh[h]<- seh.list[tmp[1]]
      ss[h]<- tmp[1]
    }
  }
  avg.seh<- cumsum(herd.seh)/(1:herds)
  cum.sse<- sep(rep(herds, herds), 1:herds, se=avg.seh, pstar=dp.h) 
  herds.to.test<- which(cum.sse >= target.sse)[1]
  #    tmp<- cbind(h.lst[1:herds.to.test], herd.sizes[1:herds.to.test], ss[1:herds.to.test],
  #                      herd.seh[1:herds.to.test], avg.seh[1:herds.to.test], cum.sse[1:herds.to.test])
  herd.order<- order(herd.list[1:herds.to.test])
  test.list<- data.frame("HerdID" = h.lst[herd.order], "HerdSize" = herd.sizes[herd.order], "SampleSize" = ss[herd.order],
                         "HerdSeH" = herd.seh[herd.order])
  
  results<- array("", dim = c(6, 1))
  rownames(results)<- c("Clusters to test", "Total sample size (units)", "Samples per cluster", 
                        "Mean cluster sensitivity (SeH)", "Target SeH", "Achieved system sensitivity (SSe)")
  results[1]<- nrow(test.list)
  results[2]<- sum(test.list$SampleSize)
  results[3]<- target.n
  results[4]<- round(mean(test.list$HerdSeH), digits)
  results[5]<- round(target.seh, digits)
  results[6]<- round(cum.sse[herds.to.test], digits)
  test.list$HerdSeH<- ifelse(test.list$HerdSeH == rep(0, results[1]), 0, ifelse(test.list$HerdSeH > rep(10, results[1]), round(test.list$HerdSeH, 0), ifelse(test.list$HerdSeH >= rep(1, results[1]), round(test.list$HerdSeH, 1), ifelse(test.list$HerdSeH < 10^-digits, paste("<", format(10^-digits, scientific = F), sep = ""), format(round(test.list$HerdSeH, digits), scientific = F)))))
  return(list(results, test.list))
} # end of fixed.seh

# function to calculate achieved SeH and SSe if unable to achieve target
sse.fail<- function(dat, max.ss, sens, dp.a, dp.a.int, dp.h, dp.h.int, target.sse, digits) {
  seh.list<- sep(N = dat$HerdSize, n = pmin(max.ss, dat$HerdSize), se=sens, pstar=dp.a)
  mean.seh<- mean(seh.list)
  sse<- sep(N = herds, n = herds, se=mean.seh, pstar=dp.h)
  test.list<- data.frame("HerdID" = dat$HerdID, "HerdSize" = dat$HerdSize, "SampleSize" = pmin(max.ss, dat$HerdSize), "HerdSeH" = seh.list)
  results<- array("", dim = c(5, 1))
  rownames(results)<- c("Clusters to test", "Total sample size (units)", 
                        "Mean cluster sensitivity (SeH)", "Target SSe", "Achieved SSe")
  results[1]<- paste("Unable to achieve desired SSe by testing", ifelse(use.max.ss, "maximum number of", "all"), "units in all", herds, "clusters.")
  results[2]<- sum(test.list$SampleSize)
  results[3]<- round(mean(test.list$HerdSeH), digits)
  results[4]<- round(target.sse, digits)
  results[5]<- round(sse, digits)
  test.list$HerdSeH<- ifelse(test.list$HerdSeH == rep(0, herds), 0, ifelse(test.list$HerdSeH > rep(10, herds), round(test.list$HerdSeH, 0), ifelse(test.list$HerdSeH >= rep(1, herds), round(test.list$HerdSeH, 1), ifelse(test.list$HerdSeH < 10^-digits, paste("<", format(10^-digits, scientific = F), sep = ""), format(round(test.list$HerdSeH, digits), scientific = F)))))
  return(list(results, test.list))
} # end of fixed.seh

cat(177)
# calculate average SeH required to achieve target sse depending on number of herds tested
n.herds<- ifelse(herds > 0, herds, max.herds)       # max number of herds to test
herds<- ifelse(herds > 0, herds, NA)                # herds in population
herd.ss<- 1:n.herds
# seh required for numbers of  of herds tested
herd.se<- numeric(n.herds)
for (h in 1:n.herds) {
  for (s in 1:100/100) {
    sse<- sep(herds, h, dp.h, s)
    if (sse >= target.sse) {
      herd.se[h]<- s                    # need SeH 0f s to acheve target SSe
      break
    }
  }
  if (s == 1 & sse < target.sse)  herd.se[h]<- NA
}
cat(194)
# sample size for given Seh
an.ss<- numeric(n.herds)
an.ss[is.na(herd.se)]<- NA
se<- which(!is.na(herd.se))         # index for non-NA results
max.n<- ifelse(herd.n > 0, herd.n, max.ss)
herd.n<- ifelse(herd.n > 0, herd.n, NA)
for (s in se) {
  for (an in 1:max.n) {
    seh<- sep(herd.n, an, dp.a, sens)
    if (seh >= herd.se[s]) {
      an.ss[s]<- an
      break
    }
  }
  if ((an == max.n & seh < herd.se[s]) | an > max.ss)  an.ss[s]<- NA
}

cat("
", sum(!is.na(an.ss)))          
# calculate herd seh and corrresponding sample sizes to achieve target
if (sum(!is.na(an.ss)) == 0) {
    tmp<- sse.fail(dat, max.ss, sens, dp.a, dp.a.int, dp.h, dp.h.int, target.sse, digits) 
    fail<- T
} else {
    fail<- F
    cost<- (herd.cost + animal.cost*an.ss)*herd.ss
    tmp<- na.omit(cbind(herd.ss, herd.se, an.ss, cost))
    min.cost<- tmp[which.min(tmp[,4]),]
 cat("
", min.cost)
    sam.sizes<- na.omit(unique(an.ss))
    ss.first<- match(sam.sizes, an.ss)
    ss.combinations<- cbind(herd.ss[ss.first], herd.se[ss.first], an.ss[ss.first], cost[ss.first])
    ss.combinations[,4]<- ss.combinations[,4]/max(ss.combinations[,4])
    colnames(ss.combinations)<- c("Number of clusters", "Cluster sensitivity (SeH)", "Units sampled per cluster", "Relative cost") 
    target.herds<- min.cost[1]
    target.seh<- min.cost[2]
    target.n<- min.cost[3]
 cat("
", ss.constant)
    # calculate sample sizes and seh for fixed sample size per herd and select sample
    if (ss.constant) {
cat("
", herds, target.n, sens, dp.a, dp.a.int, dp.h, dp.h.int, target.sse, digits)
        tmp<- fixed.ss(dat, herds, target.n, cp=1, sens, sp=1, dp.a, dp.a.int, dp.h, dp.h.int, target.sse, digits)
 cat("
", test)
    } else {
        tmp<- fixed.seh(dat, herds, target.seh, cp=1, sens, sp=1, dp.a, dp.a.int, dp.h, dp.h.int, max.ss, target.sse, target.n, digits)
    }
}
 cat("
", test)
result<- tmp[[1]]
test.list<- tmp[[2]]
 cat(248)

if (!fail) {
# sink(sinkfile)
graphfile<- paste(tmp.file, "_freedom_sample_size.png", sep="")
cols<- c("darkblue", "darkgreen", "red")
OpenGraphOutput(graphfile, pointsize = 12, ht = 8, wd = 8)
    par(mar = c(5, 4, 4, 4) + 0.1)
    plot(ss.combinations[,3], ss.combinations[,1], col = cols[1], type = "b", pch = 1,
          xlab = "Animals per herd", ylab = "Herds to test", 
          main = "SeH, number of herds to test, and relative cost", 
          cex.main = 1, bty = "n")
    par(new = T)
    plot(ss.combinations[,3], ss.combinations[,2], col = cols[2], type = "b",
          pch = 2, bty = "n", axes = F, xlab = "", ylab = "", ylim = range(ss.combinations[,c(2,4)]))
    points(ss.combinations[,3], ss.combinations[,4], col = cols[3], type = "b",
          pch = 3, bty = "n")
    abline(v = target.n, col = "purple")
    Axis(ss.combinations[,c(2,4)], side = 4, ylab = "")
    legend("right", c("Herds to test", "Herd sensitivity", "Relative cost"), cex = 0.8, 
          col = cols, pch = 1:3, bty = "n") 
    mtext("for varying numbers of animals tested per herd", side = 3, line = 0.8, cex = 0.8)
    mtext("vertical line shows least-cost option", cex = 0.7, side = 3, line = 0)
    mtext("SeH / Relative cost", side = 4, line = 2.5)
CloseGraphOutput("R")
sink()
}
 cat(275)

# write to html and file
subheadings<- c(heading2, heading3)
result.txt<- ""
output<- html.output(heading, subheadings, inputs, results = list(result, test.list), graphs = graphfile, graph.headings = "Plot of sampling options and relative costs", show.inputs = T, show.graphs = T, tmp.file, result.txt = result.txt) 
cat(280)
write.html(output, tmp.file)
cat(dim(output))
cat(output)

email.results(email, output, fpath, filename, "2StageFreedomSS")