nnonmissing = function(x){return(sum(!is.na(x)))} normalise = function(x){return(x/sum(x))} cluster.multiSNP=function(multifile,singlefile){ x = read.table(multifile,header=T) rs = read.table(singlefile,header=T) pos = rs[,2] nsnp = apply(x,1,nnonmissing)-1 L = max(nsnp) NSNP = max(x[,-(1:2)],na.rm=T) prior = normalise((0.5^(1:L))[nsnp]/choose(NSNP,nsnp)) post = normalise(10^x[,1]*prior) #print(lapply(split(post,nsnp),sum)) # posterior on different numbers of SNPs pairp = matrix(0,nrow=NSNP,ncol=NSNP) contains = matrix(0,nrow=NSNP,ncol=length(post)) for(i in 1:NSNP){ contains[i,] = apply(x[,-(1:2)]==i,1,sum,na.rm=T) } for(i in 1:NSNP){ for(j in 1:NSNP){ pairp[i,j] = sum(post[contains[i,] & contains[j,]]) } } marginalp = diag(pairp) indepp = marginalp %*% t(marginalp) #d1 = dor(i,j) - dindep(i,j) # where dor is big if i and j tend to occur together # and dindep is big if i and j do not tend to occur together (ie less often than expected by chance) #so d is small if i and j tend not to occur together d = pairp^2 - 4*(indepp > pairp)*(indepp - pairp)^2 #d = pairp - 4*(indepp > pairp) * (indepp - pairp) diag(d) = 0 a= cutree(hclust(as.dist(d),method="complete"),h=0) nclust = max(a) clusterprobs = rep(0,nclust) for(i in 1:nclust){ members = (1:NSNP)[a==i] present = apply(x[,-(1:2)],2,is.element,members) clusterprobs[i] = sum(post[apply(present,1,max)==1]) } return(list(id = as.character(rs[,1]),pos=pos,marginalp=marginalp,cluster = (rank(-unlist(clusterprobs)))[a],clusterprobs=sort(clusterprobs,decreasing=TRUE))) } plot.multiSNP=function(mSNP,minp=0.05,interactive=FALSE){ par(mfcol=c(2,1)) plot(mSNP$pos,mSNP$marginalp,type="h",lwd=2,ylim=c(0,1),col=mSNP$cluster,ylab="Posterior Prob", xlab="Position",main="Marginal Probability of Each SNP") text(mSNP$pos[mSNP$marginalp>minp],mSNP$marginalp[mSNP$marginalp>minp],mSNP$id[mSNP$marginalp>minp],srt=90,cex=0.8,pos=2,col=mSNP$cluster[mSNP$marginalp>minp]) if(interactive==TRUE){identify(mSNP$pos,mSNP$marginalp,mSNP$id)} barplot(mSNP$clusterprobs,col=1:length(mSNP$clusterprobs),ylim=c(0,1),main="Probability for each cluster of SNPs") }