Last updated: 2017-08-03

Code version: 14f04a0

The database plantTFDB provides transcription factors of more than 160 species, including main lineages of green plants. What is special is that each species has specific background probability of A,C,G,T. Here, we firstly apply dash to the PFM and then plot the logo plot. Notice that only PWMs are provided so we first multiply the PWM with the nsites and the round to the nearest integer of each entry as the PFM. We compare the logo plots of the original PWM, the PWM from dash and the PWM from ‘aggregated’ dash. In the aggregated dash, we first combine all the PWMs of one species into a large matrix with four rows(of A,C,G,T) and \(N_p\) columns where \(N_p\) is the sum of the number of positions of each PWM.

We studied the TF binding motifs from 5 species with varying background probabilities, including
Vigna radiata Eucalyptus grandis Arabidopsis thaliana Actinidia chinensis.

library(Logolas)
# a function to get pwms, pfms, nsites etc from http://planttfdb.cbi.pku.edu.cn/
#'@ species: the name of speices, use abbreviation. Ach - Actinidia chinensis
#
plantTF=function(species){
  library(dash)
  #read all the TF names from the infomation page
  nam=readLines(paste('http://planttfdb.cbi.pku.edu.cn/download/motif/',species,'_TF_binding_motifs_information.txt',sep=''))
  
#record the original pwm from the website as a list
pwm_original=list()
#record the transformed pfm as a list
pfm_indi=list()
#record the pwm after applying the dash
pwm_dash=list()
#a large mattrix combine all the pfms
pfm_cb=c()
#record the number of positions for each pwm
npos=c()
#record the number of sequences of each pfm
site=c()

for(i in 2:length(nam)){
  #read each pwm file from the website 
  readfile=readLines(paste('http://planttfdb.cbi.pku.edu.cn/motif/Ach/',substring(nam[i],1,10),'.meme',sep = ''))
  #extract the pwm
  pwm=matrix(as.numeric(as.character(unlist(strsplit(readfile[13:(length(readfile)-3)],split = '\t')))),nrow = 4,byrow = F)
  rownames(pwm)=c('A','C','G','T')
  colnames(pwm)=1:ncol(pwm)
  pwm_original[[i-1]]=pwm
  #extract the number of sites
  nsites=as.numeric(as.character(unlist(strsplit(readfile[12],split = ' '))[grep('nsites',unlist(strsplit(readfile[12],split = ' ')))+1]))
  site[i-1]=nsites
  #extract the background probabilities
  bg=as.numeric(strsplit(readfile[8],split = ' ')[[1]][c(2,4,6,8)])
  #
  pfm=round(pwm*nsites)
  pfm_cb=cbind(pfm_cb,pfm)
  npos[i-1]=ncol(pwm)
  pfm_indi[[i-1]]=pfm
  pwm=dash(t(pfm),optmethod = 'mixEM',mode = bg)
  pwm=t(pwm$posmean)
  rownames(pwm)=c('A','C','G','T')
  colnames(pwm)=1:ncol(pwm)
  pwm_dash[[i-1]]=pwm
}
pwm_adash=dash(t(pfm_cb),mode=bg,optmethod = 'mixEM')
pwm_adash=t(pwm_adash$posmean)
pwm_agg=list()
for(i in 1:length(npos)){
  pwm=pwm_adash[,1:npos[i]]
  rownames(pwm)=c('A','C','G','T')
  colnames(pwm)=1:ncol(pwm)
  pwm_adash=pwm_adash[,-(1:npos[i])]
  pwm_agg[[i]]=pwm
}

results=list(pfm=pfm_indi,pwm_original=pwm_original,pwm_dash=pwm_dash,pwm_cbdash=pwm_agg,bg=bg,nseq=site,npos=npos)

return(results)

}

Then we show the logo plots and negative logo plots for the three PWMs from Actinidia chinensis. We pick the PWMs whose the number of sequences are small.

Ach=plantTF('Ach')

color_profile = list("type" = "per_row", 
                     "col" = RColorBrewer::brewer.pal(4,name ="Spectral"))
set.seed(1)
for (i in sample(which(Ach$nseq<=20),5)){
  
   logomaker(Ach$pwm_original[[i]],color_profile = color_profile,frame_width = 1,bg=Ach$bg,pop_name = paste('original pwm',i))
  logomaker(Ach$pwm_dash[[i]],color_profile = color_profile,frame_width = 1,bg=Ach$bg,pop_name = paste('dash pwm',i))
  logomaker(Ach$pwm_cbdash[[i]],color_profile = color_profile,frame_width = 1,bg=Ach$bg,pop_name = paste('combined dash pwm',i))
  
nlogomaker(Ach$pwm_original[[i]],color_profile = color_profile,frame_width = 1,logoheight = 'log_odds',bg=Ach$bg,control = list(logscale = 0.2, quant = 0.5,depletion_weight = 0.5),pop_name = paste('original pwm',i))
  nlogomaker(Ach$pwm_dash[[i]],color_profile = color_profile,frame_width = 1,logoheight = 'log_odds',bg=Ach$bg,control = list(logscale = 0.2, quant = 0.5,depletion_weight = 0.5),pop_name = paste('dash pwm',i))
  nlogomaker(Ach$pwm_cbdash[[i]],color_profile = color_profile,frame_width = 1,logoheight = 'log_odds',bg=Ach$bg,control = list(logscale = 0.2, quant = 0.5,depletion_weight = 0.5),pop_name = paste('combined dash pwm',i))
 
}

Session information

sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 15063)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dash_0.99.0      SQUAREM_2016.8-2 Logolas_1.1.2   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.11         digest_0.6.12        rprojroot_1.2       
 [4] grid_3.4.0           backports_1.0.5      git2r_0.18.0        
 [7] magrittr_1.5         evaluate_0.10        stringi_1.1.5       
[10] LaplacesDemon_16.0.1 rmarkdown_1.6        RColorBrewer_1.1-2  
[13] tools_3.4.0          stringr_1.2.0        parallel_3.4.0      
[16] yaml_2.1.14          compiler_3.4.0       htmltools_0.3.5     
[19] knitr_1.15.1        

This R Markdown site was created with workflowr