Last updated: 2017-08-07

Code version: 3847bbf

Another application of logo plot is for the protein seqeunce motif. Portein is composed of 20 amino acid.

Data are from the 3FDB. The data in ‘weighted observed percentages rounded down’ are simply the sample proportion. Here, we firstly transform it back to the pfm by multiplying the ‘Number of reference sequence used to generate final profile’, then apply dash to the pfm. The background frequency of the amino acid is from BLOSUM62.(See here for details)

library(Logolas)
cols1 <- c(rev(RColorBrewer::brewer.pal(12, "Paired"))[c(3,4,7,8,11,12,5,6,9,10)],
RColorBrewer::brewer.pal(12, "Set3")[c(1,2,5,8,9)],
RColorBrewer::brewer.pal(9, "Set1")[c(9,7)],
RColorBrewer::brewer.pal(8, "Dark2")[c(3,4,8)])
color_profile <- list("type" = "per_row",
"col" = cols1)


# a function to read pwm from 3FDB.
# then apply dash to the pwm.
bg=c(0.074,0.052,0.045,0.054,0.025,0.034,0.054,0.074,0.026,0.068,0.099,0.058,0.025,0.047,0.039,0.057,0.051,0.013,0.034,0.073)
readprotein=function(file,skip=3,nrows,nsites,bg=bg,adash){
  library(dash)
  rawdata=read.table(file = file,skip = skip,nrows = nrows)
  pwm=as.matrix(rawdata[,23:42]/100)
  pfm=round(pwm*nsites)
  if(adash==T){
    pwm=dash(as.matrix(pfm),mode = bg,optmethod = 'mixEM')$posmean
  }
  colnames(pwm)=c('A' ,'R','N','D',   'C' ,  'Q',   'E' ,  'G',   'H' ,  'I',   'L' , 'K'  , 'M' ,  'F',   'P' ,  'S'  , 'T' ,  'W',   'Y',   'V')
  rownames(pwm)=1:nrow(pwm)
  return(pwm)
}

#plot the logo plot

plotprotein=function(file,motif,original,pdash,nrows,nsites,bg){
  
  pwm_dash=readprotein(file,nrows = nrows,nsites = nsites,adash = T,bg=bg)
  
  pwm_ori=readprotein(file,nrows = nrows,nsites = nsites,adash =F,bg=bg)
  
  if(original==T){
    
    logomaker(t(pwm_ori[motif[1]:(sum(motif)-1),]),color_profile = color_profile,frame_width = 1,bg=bg,pop_name = 'original logo')
    nlogomaker(t(pwm_ori[motif[1]:(sum(motif)-1),]),logoheight = 'log_odds',color_profile = color_profile,frame_width = 1,bg=bg,pop_name = 'original neg logo')
    
  
  if(pdash==T){
    logomaker(t(pwm_dash[motif[1]:(sum(motif)-1),]),color_profile = color_profile,frame_width = 1,bg=bg,pop_name = 'dash logo')
    nlogomaker(t(pwm_dash[motif[1]:(sum(motif)-1),]),logoheight = 'log_odds',color_profile = color_profile,frame_width = 1,bg=bg,pop_name = 'dash neg logo')
  }
  
  }
}

#EX: Zona-pellucida-binding protein (Sp38) , sample size=5

plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF07354&data_dir=SDB_folder',c(82,8),T,T,272,5,bg)

#EX 0: Zeta toxin , sample size = 9

plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF06414&data_dir=SDB_folder',c(1,8),T,T,227,9,bg)

#EX 1:Alpha-2,8-polysialyltransferase (POLYST), sample size=19

plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF07388&data_dir=ADB_folder',c(306,8),T,T,467,19,bg)

#EX 2: ATPase family associated with various cellular activities (AAA) , sample size=35

plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF07728&data_dir=SDB_folder',c(24,8),T,T,145,35,bg)

#EX 2:  A1 Propeptide, sample size = 85
plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF07966&data_dir=SDB_folder',c(19,8),T,T,29,85,bg)

#EX 3: ABC 3 transport family ,samplesize=904

#error: LaplacesDemon::ddirichlet(rep(1, 2), alpha = c(sum(conc_mat[k,  : 
#  alpha must be positive.
#plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF00950&data_dir=ADB_folder',c(22,20),T,T,50,904,bg)

#EX: Acetyl-CoA hydrolase/transferase N-terminal domain , sample size=306

plotprotein('http://caps.ncbs.res.in/cgi-bin/mini/databases/3pfdb/3pfdb_pssm_download.cgi?id=PF02550&data_dir=ADB_folder',c(148,11),T,T,176,306,bg)

Session information

sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 15063)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dash_0.99.0      SQUAREM_2016.8-2 Logolas_1.1.2   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.11         digest_0.6.12        rprojroot_1.2       
 [4] grid_3.4.0           backports_1.0.5      git2r_0.18.0        
 [7] magrittr_1.5         evaluate_0.10        stringi_1.1.5       
[10] LaplacesDemon_16.0.1 rmarkdown_1.6        RColorBrewer_1.1-2  
[13] tools_3.4.0          stringr_1.2.0        parallel_3.4.0      
[16] yaml_2.1.14          compiler_3.4.0       htmltools_0.3.5     
[19] knitr_1.15.1        

This R Markdown site was created with workflowr