Attribution/DESCRIPTION00006660000000106413046326303010563 0ustar00Package: Attribution Type: Package Title: Bayesian source attribution from relative prevalence data Version: 1.0 Date: 2016-08-12 Author: Dr Simon EF Spencer Maintainer: Dr Simon EF Spencer Description: This package uses Markov chain Monte Carlo methods to fit a source attribution model from relative prevalance data. It also provides several functions to analyse the output. License: GPL-3 LazyData: false Depends: mvtnorm, DirichletReg RoxygenNote: 6.0.0 NeedsCompilation: no Packaged: 2017-02-07 11:17:23 UTC; simon Attribution/NAMESPACE00006660000000034213046326303010272 0ustar00# Generated by roxygen2: do not edit by hand export(casesBarplot) export(casesPlot) export(casesTable) export(plotFitsFile) export(plotParameterFile) export(plotPrevalenceFile) export(sourceAttribution) export(top) Attribution/R/00007770000000000013046326303007255 5ustar00Attribution/R/casesBarplot.R00006660000000311213046326303012017 0ustar00#' Plot cases barplot #' #' Produce pdf document containing a barplot showing the posterior expected number of cases attributed to each source. For use with the single attribution model. The vertical lines indicate 95\% credible intervals. #' @param o The output list from the function \link{sourceAttribution}. #' @param file Filename of the output. Put file=NULL to plot to the current graphics device. #' @param burnin Number of samples to discard as burnin (plus 1). #' @param col Vector of colours for the different sources. #' @details Source labels are taken from the attribute \code{rownames(X)}. #' @export casesBarplot<-function(o,file="casesBarplot.pdf",burnin=501,col=NULL) { srcs<-dim(o$X)[1] typs<-dim(o$X)[2] yrs<-dim(o$Y)[1] if (!is.null(file)) {pdf(file,width=8,height=6)} if (is.null(col)) {col<-c("darkorange","blue","black","red","darkgreen","purple","grey")} if (length(col)1) { qfits<-apply(fits[,,pts],c(1,2),quantile,probs=c(0.025,0.5,0.975)) } else { qfits<-array(apply(fits[1,,pts],1,quantile,probs=c(0.025,0.5,0.975)),c(3,1,typs)) } cat(dim(qfits),"\n") m<-max(o$Y) x<-0:100*m/100 cols<-topo.colors(yrs) pdf(file,width=8,height=8) par(mfrow=c(1,1),mar=c(4, 4, 1, 1) + 0.1) plot(c(0,m),t="n",col="grey",xlab="Posterior median cases",ylab="Observed cases",main="",xlim=c(0,m),ylim=c(0,m)) polygon(c(x,rev(x)),c(qpois(0.025,x),rev(qpois(0.975,x))),col="lightgrey",border=NA) lines(c(0,m),c(0,m),col="darkgrey") if (length(dim(o$a.stored))==2) { text(qfits[2,1,],m,colnames(o$X),cex=0.5,adj=c(1,0.25),srt=90) } segments(qfits[1,,],o$Y,qfits[3,,],o$Y,col=cols) points(qfits[2,,],o$Y,pch=16,col=cols) legend("bottomright",rownames(o$Y),fill=cols) dev.off() }Attribution/R/plotParameterFile.R00006660000000501313046326303013016 0ustar00#' Plot model parameters #' #' Produce pdf document containing trace plots and histograms of the source-specific factors, the strain-specific factors and the precision of the strain-specific factors in the posterior distribution. #' @param o The output list from the function \link{sourceAttribution}. #' @param file Filename of the output. #' @param burnin Number of samples to discard as burnin (plus 1). #' @details The prior density is drawn in red over the histogram of posterior samples. #' @export plotParameterFile<-function(o,file="parameters.pdf",burnin=501) { srcs<-dim(o$X)[1] typs<-dim(o$X)[2] yrs<-dim(o$Y)[1] pdf(file,paper="a4",height=12,width=8) par(mfrow=c(srcs,2),mar=c(4, 4, 2, 1) + 0.1) pts<-burnin:length(o$theta.stored) if (length(dim(o$a.stored))==2) { for (j in 1:srcs) { plot(pts,o$a.stored[j,pts],t="l",ylab=round(o$acceptance$a.accept[j]/(o$acceptance$a.accept[j]+o$acceptance$a.reject[j]),3),xlab="",main=paste("Source",j,rownames(o$X)[j])) hist(o$a.stored[j,pts],probability=TRUE,breaks=40,xlab="",main=paste("Source",j,rownames(o$X)[j])) ax<-seq(min(o$a.stored[j,pts]),max(o$a.stored[j,pts]),length.out=101) lines(ax,dexp(ax,o$priors$alpha[j]),col="red") } } else { for (k in 1:yrs) { for (j in 1:srcs) { plot(pts,o$a.stored[k,j,pts],t="l",ylab=round(o$acceptance$a.accept[k,j]/(o$acceptance$a.accept[k,j]+o$acceptance$a.reject[k,j]),3),xlab="",main=paste("Source",j,rownames(o$X)[j],rownames(o$Y)[k])) hist(o$a.stored[k,j,pts],probability=TRUE,breaks=40,xlab="",main=paste("Source",j,rownames(o$X)[j],rownames(o$Y)[k])) ax<-seq(min(o$a.stored[k,j,pts]),max(o$a.stored[k,j,pts]),length.out=101) lines(ax,dexp(ax,o$priors$alpha[j]),col="red") } } } plot(pts,o$theta.stored[pts],t="l",ylab="",xlab="",main="Precision of random effects theta") hist(o$theta.stored[pts],probability=TRUE,breaks=40,xlab="",main="Precision of random effects theta") ax<-seq(min(o$theta.stored[pts]),max(o$theta.stored[pts]),length.out=101) lines(ax,dgamma(ax,o$priors$theta.prior[1],o$priors$theta.prior[2]),col="red") for (i in 1:typs) { plot(pts,o$q.stored[i,pts],t="l",ylab="",xlab="",main=paste("Type",i,colnames(o$X)[i])) hist(o$q.stored[i,pts],probability=TRUE,breaks=40,xlab="",main=paste("Type",i,colnames(o$X)[i])) ax<-seq(min(o$q.stored[i,pts]),max(o$q.stored[i,pts]),length.out=101) lines(ax,dgamma(ax,median(o$theta.stored),median(o$theta.stored)),col="red") } dev.off() }Attribution/R/plotPrevalenceFile.R00006660000000225213046326303013164 0ustar00#' Plot relative prevalences #' #' Produce pdf document containing trace plots and histograms of the relative prevalences in the posterior distribution. #' @param o The output list from the function \link{sourceAttribution}. #' @param file Filename of the output. #' @param burnin Number of samples to discard as burnin (plus 1). #' @details The prior density is drawn in red over the histogram of posterior samples. #' @export plotPrevalenceFile<-function(o,file="prevalence.pdf",burnin=501) { srcs<-dim(o$X)[1] typs<-dim(o$X)[2] pdf(file,paper="a4",height=12,width=8) par(mfrow=c(7,2),mar=c(4, 4, 2, 1) + 0.1) pts<-burnin:length(o$theta.stored) for (j in 1:srcs) { for (i in 1:typs) { plot(pts,o$r.stored[j,i,pts],t="l",ylab=round(o$acceptance$r.accept[j,i]/(o$acceptance$r.accept[j,i]+o$acceptance$r.reject[j,i]),3),xlab="",main=paste("Source",j,rownames(o$X)[j])) hist(o$r.stored[j,i,pts],probability=TRUE,breaks=40,xlab="",main=paste("Type",i,colnames(o$X)[i])) ax<-seq(min(o$r.stored[j,i,pts]),max(o$r.stored[j,i,pts]),length.out=101) lines(ax,dbeta(ax,1+o$X[j,i],typs-1+sum(o$X[j,-i])),col="red") } } dev.off() }Attribution/R/sourceAttribution.r00006660000003316513046326303013175 0ustar00#' Bayesian source attribution modelling #' #' Attribute sources from relative prevalence data. This function fits a Poisson regression model using Markov chain Monte Carlo (MCMC) methods. #' @param X a sources x types matrix of source data (counts). The attributes \code{rownames(X)} and \code{colnames(X)} provide the labels to the sources and types respectively. #' @param Y a years x types matrix of human data (counts). The attribute \code{rownames(Y)} provides the labels to the years. #' @param temporal Run temporal attribution model (ie different attribution for each year)? #' @param unobserved Include unobserved source? #' @param iters Number of samples to draw from MCMC. #' @param thinning Number of MCMC updates between samples. Default is 20 for temporal model and 100 for single attribution model. #' @param r.prior Prior parameter for relative prevalence (\eqn{\gamma}). #' @param theta.prior Vector of two parameters for precision of strain specific random effects (\eqn{\alpha^{(\theta)},\beta^{(\theta)}}). #' @param alpha Prior parameters for \eqn{a}. #' @param aSigma Initial covariance matrix for the joint proposals for \eqn{a} (on the log scale). This should adapt. #' @param thetaSigma Initial proposal variance for \eqn{\theta} (adapts). #' @param reps Initial number of repeats of the pairwise updates for \eqn{r} (adapts). #' @param adapt.reps Minimum acceptance probability for \eqn{r} - if lower than this value \code{reps} is increased. #' @return list giving full details of MCMC input and output, including #' \tabular{ll}{ #' \code{X} \tab Source input matrix X\cr #' \code{Y} \tab Human case input matrix Y\cr #' \code{a.stored} \tab Array of MCMC samples for the source-specific factors \eqn{a}\cr #' \code{r.stored} \tab Array of MCMC samples for the relative prevalences \eqn{r}\cr #' \code{q.stored} \tab Array of MCMC samples for the strain-specific factors \eqn{q}\cr #' \code{theta.stored} \tab Matrix of MCMC samples for precision of the strain-specific random effects, \eqn{\theta}\cr #' \code{priors} \tab list of priors\cr #' \code{acceptance} \tab list of acceptance probabilities\cr #' } #' @examples #' data(campyNZ) #' # thinning=100 will give better convergence, but makes the package check take a long time. #' output <- sourceAttribution(campyNZ$X, campyNZ$Y, temporal=FALSE, thinning=1) #' #' # Examine output (in order of importance) #' #' # casesPlot(output) # for temporal attribution #' casesBarplot(output) # for single attribution #' plotPrevalenceFile(output) # histograms that look different from their prior (in red) indicate lack-of-fit #' plotParameterFile(output) # check convergence #' top(output) # interesting which types appear in unobserved source #' casesTable(output) # useful summary for paper #' plotFitsFile(output) # points appearing outside grey band (Poisson variation) indicate lack-of-fit. NB converse does not hold! #' @references Ahlstrom, C., Muellner, P., Spencer, S.E.F., \emph{et al.} 2017 Inferring source attribution from a multi-year multi-source #' dataset of Salmonella in Minnesota. \emph{Zoonoses and public health} #' @export sourceAttribution<-function(X,Y,temporal=TRUE,unobserved=FALSE,iters=2500, thinning=20+(1-temporal)*80,r.prior=1,theta.prior=c(1,1),alpha=NULL,aSigma=diag(rep(0.005,srcs)), thetaSigma=1, reps=NULL, adapt.reps=0.05) { srcs<-dim(X)[1] typs<-dim(X)[2] yrs<-dim(Y)[1] yr<-rownames(Y) if (dim(Y)[2]!=typs) {stop("Number of types in X and Y mismatched.\n")} if (unobserved) { # add unobserved source X<-rbind(X,rep(0,typs)) rownames(X)[srcs+1]<-"unobserved" srcs<-srcs+1 } if (dim(aSigma)[1]!=srcs | dim(aSigma)[2]!=srcs) {aSigma<-diag(rep(0.005,srcs))} # revert to default if user has mis-specified. src.labs<-rownames(X) if (is.null(alpha)) { # use data to generate prior for a, if alpha is not already specified. if (temporal) { alpha<-matrix(rep(srcs/apply(Y,1,sum),srcs),yrs,srcs) } else { alpha<-rep(srcs*yrs/sum(Y),srcs) } } r.prior<-matrix(r.prior,srcs,typs) if (is.null(reps)) {reps<-rep(0,srcs)} # can be used to expend more effort on updating p. If convergence is poor in a source increase number in reps. Should adapt by itself if adapt.reps<1. # Define monitors theta.accept<-0 theta.reject<-0 theta.stored<-rep(NA,iters) q.stored<-matrix(NA,typs,iters) if (temporal) { a.accept<-matrix(0,yrs,srcs) a.reject<-matrix(0,yrs,srcs) a.stored<-array(NA,c(yrs,srcs,iters)) } else { a.accept<-matrix(0,1,srcs) a.reject<-matrix(0,1,srcs) a.stored<-matrix(NA,srcs,iters) } r.accept<-matrix(0,srcs,typs) r.reject<-matrix(0,srcs,typs) r.stored<-array(NA,c(srcs,typs,iters)) cases<-array(NA,c(yrs,srcs,iters))# number of cases from each source # # Initialise parameters at sensible values # r<-0.01*rep(1/typs,typs)+0.99*X/c(X%*%matrix(1,typs,1)) r[which(is.nan(r))]<-1/typs a<-rexp(length(alpha),alpha) dim(a)<-dim(alpha) q<-rep(1,typs) theta<-1 # # Begin MCMC # for (it in 1:iters) { for (th in 1:thinning) { # update q = type specific factor (with a Gibbs step, ie sample from full conditional distribution) if (temporal) { q<-rgamma(typs,theta+matrix(1,1,yrs)%*%Y,theta+matrix(1,1,yrs)%*%a%*%r) } else { q<-rgamma(typs,theta+matrix(1,1,yrs)%*%Y,theta+yrs*(a%*%r)) } # update theta = precision of random effects (with a Metropolis Hastings log-Gaussian random walk proposal) proposal<-exp(rnorm(1,log(theta),thetaSigma)) #proposal ap<-dgamma(proposal,theta.prior[1],theta.prior[2],log=TRUE)-dgamma(theta,theta.prior[1],theta.prior[2],log=TRUE) #prior ap<-ap+log(proposal)-log(theta)#Jacobian term ap<-ap+sum(dgamma(q,proposal,proposal,log=TRUE)-dgamma(q,theta,theta,log=TRUE)) #likelihood u<-runif(1) if (u<=exp(ap)) { # accept theta.accept<-theta.accept+1 theta<-proposal } else { # reject theta.reject<-theta.reject+1 } # update a = source specific factor if (temporal) { for (k in 1:yrs) { proposal<-exp(rmvnorm(1,log(a[k,]),aSigma)) # Hetropolis Hastings log-multivariate-Gaussian random walk proposal on srcs dimensions. ap<-sum(dexp(proposal,alpha[k,],log=TRUE)-dexp(a[k,],alpha[k,],log=TRUE))# prior ap<-ap+sum(log(proposal)-log(a[k,])) # Jacobian term ap<-ap+sum(dpois(Y[k,],q*(matrix(proposal,1,srcs)%*%r),log=TRUE)-dpois(Y[k,],q*(a[k,]%*%r),log=TRUE)) # likelihood u<-runif(1) if (u<=exp(ap)) { # accept a.accept[k,]<-a.accept[k,]+1 a[k,]<-proposal } else { a.reject[k,]<-a.reject[k,]+1 } # previous update method performs poorly for small a's, but this perform great for small a's because there is only a tiny contribution to the likelihood for (j in 1:srcs) { proposal<-a[k,] proposal[j]<-rexp(1,alpha[k,j]) # independence sampler from the prior ap<-sum(dpois(Y[k,],q*(matrix(proposal,1,srcs)%*%r),log=TRUE)-dpois(Y[k,],q*(a[k,]%*%r),log=TRUE)) # likelihood u<-runif(1) if (u<=exp(ap)) { # accept a.accept[k,j]<-a.accept[k,j]+1 a[k,j]<-proposal[j] } else { a.reject[k,j]<-a.reject[k,j]+1 } } } } else { # update a (same again), but for single attribution model proposal<-exp(rmvnorm(1,log(a),aSigma))# Hetropolis Hastings log-multivariate-Gaussian random walk proposal on srcs dimensions. ap<-sum(dexp(proposal,alpha,log=TRUE)-dexp(a,alpha,log=TRUE))#prior ap<-ap+sum(log(proposal)-log(a))#Jacobian term ap<-ap+sum(dpois(Y,rep(q*(matrix(proposal,1,srcs)%*%r),each=yrs),log=TRUE)-dpois(Y,rep(q*(a%*%r),each=yrs),log=TRUE))#likelihood u<-runif(1) if (u<=exp(ap)) { # accept a.accept<-a.accept+1 a<-proposal } else { # reject a.reject<-a.reject+1 } # previous update method performs poorly for small a's, but this perform great for small a's because there is only a tiny contribution to the likelihood for (j in 1:srcs) { proposal<-a proposal[j]<-rexp(1,alpha[j])#independence sampler from the prior ap<-sum(dpois(Y,rep(q*(matrix(proposal,1,srcs)%*%r),each=yrs),log=TRUE)-dpois(Y,rep(q*(a%*%r),each=yrs),log=TRUE))#likelihood u<-runif(1) if (u<=exp(ap)) { # accept a.accept[j]<-a.accept[j]+1 a[j]<-proposal[j] } else { # reject a.reject[j]<-a.reject[j]+1 } } } ### update r from prior for (j in 1:srcs) { proposal<-r proposal[j,]<-rdirichlet(1,r.prior[j,]+X[j,]) if (temporal) {#likelihood ap<-sum(dpois(Y,rep(q,each=yrs)*(a%*%proposal),log=TRUE)-dpois(Y,rep(q,each=yrs)*(a%*%r),log=TRUE)) } else { ap<-sum(dpois(Y,rep(q*(a%*%proposal),each=yrs),log=TRUE)-dpois(Y,rep(q*(a%*%r),each=yrs),log=TRUE)) } u<-runif(1) if (u<=exp(ap)) { # accept r.accept[j,]<-r.accept[j,]+1 r[j,]<-proposal[j,] } else { # reject r.reject[j,]<-r.reject[j,]+1 } } ### update r in pairs for (j in which(reps>0)) { for (rp in 1:reps[j]) { proposal<-r id<-sample(1:typs,2) # choose 2 types at random to update L<-r[j,id[1]]+r[j,id[2]] # amount of probability attached to the 2 types stays fixed. # beta proposal from prior (the conditional of 2 components of a dirichlet is a beta times L) proposal[j,id[1]]<-L*rbeta(1,r.prior[j,id[1]]+X[j,id[1]],r.prior[j,id[2]]+X[j,id[2]]) proposal[j,id[2]]<-L-proposal[j,id[1]] if (temporal) {#likelihood ap<-sum(dpois(Y[,id],rep(q[id],each=yrs)*(a%*%proposal[,id]),log=TRUE)-dpois(Y[,id],rep(q[id],each=yrs)*(a%*%r[,id]),log=TRUE)) } else { ap<-sum(dpois(Y[,id],rep(q[id]*(a%*%proposal[,id]),each=yrs),log=TRUE)-dpois(Y[,id],rep(q[id]*(a%*%r[,id]),each=yrs),log=TRUE)) } u<-runif(1) if (u<=exp(ap)) { # accept r.accept[j,id]<-r.accept[j,id]+1 r[j,id]<-proposal[j,id] } else { # reject r.reject[j,id]<-r.reject[j,id]+1 } } } } # take a sample if (temporal) { a.stored[,,it]<-a cases[,,it]<-a*t(r%*%matrix(q,typs,yrs)) } else { a.stored[,it]<-a cases[,,it]<-rep(a,each=yrs)*t(r%*%matrix(q,typs,yrs)) } r.stored[,,it]<-r q.stored[,it]<-q theta.stored[it]<-theta # # This section of the code detects poor mixing and adapts to improve it # # adapt number of reps for relative prevalances wh<-which(apply(r.accept/(r.accept+r.reject),1,min)0) { reps[wh]<-reps[wh]+1 if (max(reps)>19) {cat("Increasing update effort for relative prevalences in sources:",wh,"to",reps[wh],"\n")} } # adapt theta if ((theta.accept/(theta.accept+theta.reject)<0.15 | theta.accept/(theta.accept+theta.reject)>0.5) & theta.accept+theta.reject>=100) { if (theta.accept==0) { thetaSigma<-thetaSigma/2 } else if (theta.reject==0) { thetaSigma<-thetaSigma*2 } else { thetaSigma<-thetaSigma*qnorm(0.45/2)/qnorm(theta.accept/(theta.accept+theta.reject)/2) } theta.accept<-0 theta.reject<-0 cat("Rescaling proposal sd for theta to",thetaSigma,"\n") } # adapt a min.a.accept<-min(a.accept/(a.accept+a.reject)) if (min(a.accept+a.reject)>=100 && (min.a.accept<0.15 | min.a.accept>0.5)) { if (min(a.accept)==0) { aSigma<-aSigma/4 } else if (min(a.reject)==0) { aSigma<-aSigma*4 } else { aSigma<-aSigma*qnorm(0.234/2)^2/qnorm(min.a.accept/2)^2 } a.accept<-0*a.accept a.reject<-0*a.reject cat("Rescaling proposal variance for a to",diag(aSigma),"\n") } if (it%%100==0) { # plot interim output cat("\nAfter iteration",it,"/",iters,"\n") cat("minimum acceptance rate for a:",apply(a.accept/(a.accept+a.reject),1,min),"\n") cat("minimum acceptance rate for r:",apply(r.accept/(r.accept+r.reject),1,min),"\n") cat("acceptance rate for theta:",theta.accept/(theta.accept+theta.reject),"\n") if (min(a.accept/(a.accept+a.reject))<0.1) {cat("aSigma needs reducing: smallest acceptance rate for a:",min(a.accept/(a.accept+a.reject)),"\n")} par(mfrow=c(ceiling((srcs+1)/2),2),mar=0.1+c(2,4,3,1)) k<-sample.int(yrs,1) for (j in 1:srcs) { if (temporal) { plot(1:iters,a.stored[k,j,],t="l",ylab=round(a.accept[k,j]/(a.accept[k,j]+a.reject[k,j]),3),main=paste(yr[k],src.labs[j])) } else { plot(1:iters,a.stored[j,],t="l",ylab=round(a.accept[j]/(a.accept[j]+a.reject[j]),3),main=src.labs[j]) } } plot(1:iters,theta.stored,t="l",ylab=round(theta.accept/(theta.accept+theta.reject),3),main="Random effect parameter theta") } } # output results priors<-list(r.prior=r.prior,theta.prior=theta.prior,alpha=alpha) acceptance<-list(a.accept=a.accept,a.reject=a.reject,r.accept=r.accept,r.reject=r.reject,theta.accept=theta.accept,theta.reject=theta.reject) output<-list(X=X,Y=Y,a.stored=a.stored,r.stored=r.stored,q.stored=q.stored,theta.stored=theta.stored,cases=cases,priors=priors,acceptance=acceptance) return(output) } Attribution/R/top.R00006660000000165213046326303010206 0ustar00#' Count top strains in each source #' #' Count the most represented strains in each source. Useful for exploring which strains contribute to the unobserved source. #' @param o The output list from the function \link{sourceAttribution}. #' @param tops Number of strains to show (default is the 5 most represented strains). #' @param burnin Number of samples to discard as burnin (plus 1). #' @details Strain labels are taken from the attribute \code{colnames(X)}. #' @export top<-function(o,tops=5, burnin=501) { srcs<-dim(o$X)[1] typs<-dim(o$X)[2] yrs<-dim(o$Y)[1] pts<-burnin:length(o$theta.stored) rw<-o$X/c(o$X%*%matrix(1,typs,1)) for (j in 1:srcs) { mp<-apply(o$r.stored[j,,pts],1,mean) or<-order(mp,decreasing=TRUE) cat(rownames(o$X)[j],":\n") for (k in 1:tops) { cat(" ",round(mp[or[k]]*100,1),"%",colnames(o$X)[or[k]],"(raw",round(100*rw[j,or[k]],1),"%)\n") } } }Attribution/data/00007770000000000013046326303007765 5ustar00Attribution/data/campyNZ.rda00006660000000153013046326303012035 0ustar00BZh91AY&SY,̸>Ap./ z82S)mF14 0&MM4# i` 244Lڒ{Rzhh@440& T M$DhhPd4 J4*zdLDIsXp( Ė@DD,0Fj ;Ts'[YC]'@E23Z V*OBDCR bݺNs}NIz};@51D5jIg d PTv91O