Package Biskit :: Package Mod :: Module Analyse
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.Analyse

  1  ## Automatically adapted for numpy.oldnumeric Mar 26, 2007 by alter_code1.py 
  2   
  3  ## 
  4  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  5  ## Copyright (C) 2004-2006 Raik Gruenberg & Johan Leckner 
  6  ## 
  7  ## This program is free software; you can redistribute it and/or 
  8  ## modify it under the terms of the GNU General Public License as 
  9  ## published by the Free Software Foundation; either version 2 of the 
 10  ## License, or any later version. 
 11  ## 
 12  ## This program is distributed in the hope that it will be useful, 
 13  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 15  ## General Public License for more details. 
 16  ## 
 17  ## You find a copy of the GNU General Public License in the file 
 18  ## license.txt along with this program; if not, write to the Free 
 19  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 20  ## 
 21   
 22  ## Contributions: olivier PERIN 
 23  ## 
 24  ## last $Author: graik $ 
 25  ## last $Date: 2007/04/06 10:16:09 $ 
 26  ## $Revision: 1.12 $ 
 27   
 28  """ 
 29  Analyze model quality 
 30  """ 
 31   
 32  import Biskit.tools as T 
 33  from Biskit.PDBModel import PDBModel 
 34  from Biskit.Mod.Benchmark import Benchmark 
 35  from Biskit.Mod.ValidationSetup import ValidationSetup as VS 
 36  from Biskit.Mod.CheckIdentities import CheckIdentities as CI 
 37  from Biskit.Mod.Modeller import Modeller 
 38   
 39  import os 
 40  from string import * 
 41  import numpy.oldnumeric as N 
 42   
 43   
44 -class Analyse:
45 """ 46 Create a folder named analyse in the project root folder 47 that contains the following results: 48 49 GLOBAL: global rmsd: all atoms, c-alpha only, percentage of 50 identities, Modeller score, and the number of templates 51 52 LOCAL: results of the cross validation, rmsd per residue c-alpha 53 only for each templates and the mean rmsd 54 55 3D structure: pickle down the final.pdb that is the best model of 56 the project and the mean rmsd is set inside (temperature_factor) 57 """ 58 59 F_RESULT_FOLDER = '/analyse' 60 F_TEMPLATE_FOLDER = VS.F_RESULT_FOLDER 61 62 F_PDBModels = Benchmark.F_PDBModels_OUT 63 F_MODELS = Modeller.F_RESULT_FOLDER + Modeller.F_PDBModels 64 65 F_INPUT_ALNS= '/t_coffee/final.pir_aln' 66 67 F_INPUT_RMSD = Benchmark.F_RESULT_FOLDER 68 F_RMSD_AA = Benchmark.F_RMSD_AA 69 F_RMSD_CA = Benchmark.F_RMSD_CA 70 71 72 F_OUTPUT_VALUES = F_RESULT_FOLDER + '/global_results.out' 73 F_CROSS_VAL = F_RESULT_FOLDER + '/local_results.out' 74 F_FINAL_PDB = F_RESULT_FOLDER + '/final.pdb' 75 76
77 - def __init__( self, outFolder, log=None ):
78 """ 79 @param outFolder: base folder for output 80 @type outFolder: str 81 @param log: None reports to STDOUT 82 @type log: LogFile instance or None 83 """ 84 self.outFolder = T.absfile( outFolder ) 85 self.log = log 86 87 self.prepareFolders()
88 89
90 - def prepareFolders( self ):
91 """ 92 Create folders needed by this class. 93 """ 94 if not os.path.exists( self.outFolder + self.F_RESULT_FOLDER ): 95 os.mkdir( self.outFolder + self.F_RESULT_FOLDER )
96 97
98 - def parseFile( self, name ):
99 """ 100 Parse a identity matrix file 101 102 @param name: file to parse 103 @type name: str 104 105 @return: contents of parsed file 106 @rtype: [[str]] 107 """ 108 f = open( name, 'r') 109 result = [] 110 lines = f.readlines() 111 112 for l in lines: 113 if not l[0] == '#': 114 r =[] 115 for s in l.split(): 116 try: 117 r += [float(s)] 118 except: 119 pass 120 if len(r)>=1: 121 result += [ r ] 122 123 f.close() 124 return result
125 126
127 - def __listDir( self, path ):
128 """ 129 List all the files and folders in a directory 130 with the exceprion of ... 131 132 @param path: dir to list 133 @type path: str 134 135 @return: list of files 136 @rtype: [str] 137 """ 138 files = os.listdir( path ) 139 if 'CVS' in files: 140 files.remove('CVS') 141 142 return files
143 144 145 ###################################################################### 146 #### GLOBAL RESTULTS: RMSD_AA, RMSD_CA, 147 #### %ID(mean of the templates), 148 #### Nb of Templates 149
150 - def global_rmsd_aa(self, validation_folder= None):
151 """ 152 Global RMSD values. 153 154 @param validation_folder: folder vith validation data 155 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 156 @type validation_folder: str 157 158 @return: two dictionaries: 159 - rmsd_aa_wo_if: global all atom rmsd for each 160 template without iterative fitting 161 - rmsd_aa_if: global all atom rmsd for each 162 templates with iterative fitting 163 @rtype: dict, dict 164 """ 165 validation_folder = validation_folder or self.outFolder + \ 166 self.F_TEMPLATE_FOLDER 167 168 folders = self.__listDir(validation_folder) 169 170 rmsd_aa_wo_if = {} 171 rmsd_aa_if = {} 172 173 for folder in folders: 174 175 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_AA) 176 lst = self.parseFile( file ) 177 178 rmsd_aa_wo_if[folder] = [ lst[0][0] ] 179 rmsd_aa_if[folder] = [ lst[0][1], lst[0][2]*100.] 180 181 return rmsd_aa_wo_if, rmsd_aa_if
182 183
184 - def global_rmsd_ca(self, validation_folder= None):
185 """ 186 Global RMSD CA values. 187 188 @param validation_folder: folder vith validation data 189 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 190 @type validation_folder: str 191 192 @return: two dictionaries: 193 - rmsd_ca_wo_if: global CA rmsd for each template 194 without iterative fitting 195 - rmsd_ca_if: global CA rmsd for each template 196 with iterative fitting 197 @rtype: dict, dict 198 """ 199 validation_folder = validation_folder or self.outFolder + \ 200 self.F_TEMPLATE_FOLDER 201 202 folders = self.__listDir(validation_folder) 203 204 rmsd_ca_wo_if = {} 205 rmsd_ca_if = {} 206 207 for folder in folders: 208 209 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_CA) 210 211 lst = self.parseFile( file ) 212 213 rmsd_ca_wo_if[folder] = [ lst[0][0] ] 214 rmsd_ca_if[folder] = [ lst[0][1], lst[0][2]*100.] 215 216 return rmsd_ca_wo_if, rmsd_ca_if
217 218
219 - def get_identities(self, nb_templates, validation_folder = None):
220 """ 221 Calculate the mean of the percentage of identities for each 222 template with the others. 223 224 @param nb_templates: number of templates used in the cross-validation 225 @type nb_templates: int 226 @param validation_folder: folder vith validation data 227 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 228 @type validation_folder: str 229 230 @return: dictionary with mean percent identities for each template 231 @rtype: {str:float} 232 """ 233 234 validation_folder = validation_folder or self.outFolder + \ 235 self.F_TEMPLATE_FOLDER 236 237 folders = self.__listDir(validation_folder) 238 identities = {} 239 240 for folder in folders: 241 file = "%s/%s"%(validation_folder, folder + \ 242 CI.F_OUTPUT_IDENTITIES_COV) 243 244 lst = self.parseFile( file ) 245 246 ## identity to mean template 247 identities[folder] = N.sum(lst[0][1:])/nb_templates 248 249 return identities
250 251
252 - def get_score(self, validation_folder = None):
253 """ 254 Get the best global modeller score for each template re-modeled 255 256 @param validation_folder: folder vith validation data 257 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 258 @type validation_folder: str 259 260 @return: dictionary with modeller score for each template 261 @rtype: {str:float} 262 """ 263 validation_folder = validation_folder or self.outFolder + \ 264 self.F_TEMPLATE_FOLDER 265 266 folders = self.__listDir(validation_folder) 267 score = {} 268 269 for folder in folders: 270 file = "%s/%s"%(validation_folder, folder + Modeller.F_SCORE_OUT) 271 272 file = open(file, 'r') 273 string_lines = file.readlines()[3] 274 score[folder] = float( split(string_lines)[1] ) 275 276 return score
277 278
279 - def output_values(self, rmsd_aa_wo_if, rmsd_aa_if, rmsd_ca_wo_if, 280 rmsd_ca_if, identities, score, nb_templates, 281 output_file = None):
282 """ 283 Write result to file. 284 285 @param rmsd_aa_wo_if: Rmsd for heavy atoms and normal fit. 286 Data should be a dictionary mapping pdb 287 codes to a list containing the rmsd value 288 and the percent of discharded atoms in 289 the rmsd calculation. 290 @type rmsd_aa_wo_if: {str:[float,float]} 291 @param rmsd_aa_if: Rmsd for heavy atoms, iterative fit. 292 @type rmsd_aa_if: {str:[float,float]} 293 @param rmsd_ca_wo_if: rmsd for only CA, normal fit. 294 @type rmsd_ca_wo_if: {str:[float,float]} 295 @param rmsd_ca_if: Rmsd for only CA, iterative fit. 296 @type rmsd_ca_if: {str:[float,float]} 297 @param identities: mean identity to template, dictionary 298 mapping pdb codes to identity values 299 @type identities: {str:float} 300 @param score: score calculated by Modeller 301 @type score: float 302 @param nb_templates: number of templates used for re-modeling 303 @type nb_templates: int 304 @param output_file: file to write 305 (default: None S{->} outFolder/L{F_OUTPUT_VALUES}) 306 @type output_file: str 307 """ 308 output_file = output_file or self.outFolder + self.F_OUTPUT_VALUES 309 310 file = open( output_file, 'w' ) 311 file.write("PROPERTIES OF RE-MODELED TEMPLATES:\n\n") 312 file.write(" | NORMAL FIT | ITERATIVE FIT | IDENTITY | SCORE | NR\n" ) 313 file.write("PDB | heavy CA | heavy percent CA percent | mean to | mod8 | of\n") 314 file.write("code | rmsd rmsd | rmsd discarded rmsd discarded | templates | | templates\n") 315 316 for key, value in rmsd_aa_wo_if.items(): 317 318 file.write("%4s %6.2f %6.2f %6.2f %8.1f %7.2f %7.1f %10.1f %9i %6i\n"%\ 319 (key, value[0], rmsd_ca_wo_if[key][0], rmsd_aa_if[key][0], 320 rmsd_aa_if[key][1], rmsd_ca_if[key][0], rmsd_ca_if[key][1], 321 identities[key], score[key], nb_templates)) 322 323 file.close()
324 325 326 ######################################################################## 327 ######### LOCAL RESULTS: Cross Validation ---- RMSD / Res 328
329 - def get_aln_info(self, output_folder = None):
330 """ 331 Collect alignment information. 332 333 @param output_folder: output folder (default: None S{->} outFolder) 334 @type output_folder: str 335 336 @return: aln_dictionary, contains information from the alignment 337 between the target and its templates 338 e.g. {'name':'target, 'seq': 'sequence of the target'} 339 @rtype: dict 340 """ 341 output_folder = output_folder or self.outFolder 342 343 ci = CI(outFolder=output_folder) 344 345 string_lines = ci.get_lines() 346 aln_length = ci.search_length(string_lines) 347 aln_dictionnary = ci.get_aln_sequences(string_lines, aln_length) 348 aln_dictionnary = ci.get_aln_templates(string_lines, aln_dictionnary, 349 aln_length) 350 aln_dictionnary = ci.identities(aln_dictionnary) 351 352 return aln_dictionnary
353 354
355 - def get_templates_rmsd(self, templates):
356 """ 357 Collect RMSD values between all the templates. 358 359 @param templates: name of the different templates 360 @type templates: [str] 361 362 @return: template_rmsd_dic, contains all the rmsd per residues 363 of all the templates 364 @rtype: dict 365 """ 366 template_rmsd_dic = {} 367 for template in templates: 368 369 pdb_list = self.outFolder + self.F_TEMPLATE_FOLDER \ 370 + "/%s"%template + self.F_PDBModels 371 372 pdb_list = T.Load(pdb_list) 373 374 template_rmsd_dic[template] = \ 375 pdb_list[0].compress(pdb_list[0].maskCA()).aProfiles["rmsd2ref_if"] 376 377 return template_rmsd_dic
378 379
380 - def templates_profiles(self, templates, aln_dic, template_rmsd_dic):
381 """ 382 Collect RMSD profiles of each template with the target and their %ID. 383 384 @param templates: name of the different templates 385 @type templates: [str] 386 @param aln_dic: contains all the informations between the 387 target and its templates from the alignment 388 @type aln_dic: dict 389 @param template_rmsd_dic: contains all the rmsd per residues of all 390 the templates 391 @type template_rmsd_dic: dict 392 393 @return: template_profiles, contains all the profile rmsd of 394 each template with the target and their %ID 395 @rtype: dict 396 """ 397 templates_profiles = {} 398 399 target_aln = aln_dic["target"]["seq"] 400 for template in templates: 401 template_aln = [] 402 template_profile = [] 403 template_info = {} 404 405 template_rmsd = template_rmsd_dic[template] 406 for key in aln_dic: 407 if(key[:4] == template): 408 template_aln = aln_dic[key]["seq"] 409 410 no_res = -1 411 for i in range(len(target_aln)): 412 if(template_aln[i] is not '-'): 413 no_res += 1 414 415 if(target_aln[i] != '-' and template_aln[i] != '-'): 416 template_profile.append(template_rmsd[no_res]) 417 418 if(target_aln[i] != '-' and template_aln[i] == '-'): 419 template_profile.append(-1) 420 421 template_info["rProfile"] = template_profile 422 423 for key in aln_dic["target"]["cov_ID"]: 424 if(key[:4] == template): 425 template_info["cov_ID"] = \ 426 aln_dic["target"]["cov_ID"][key] 427 428 templates_profiles[template] = template_info 429 430 return templates_profiles
431 432 433
434 - def output_cross_val(self, aln_dic, templates_profiles, 435 templates, model, output_file=None):
436 """ 437 Calculates the mean rmsd of the model to the templates and 438 write the result to a file. 439 440 @param aln_dic: contains all the informations between the 441 target and its templates from the alignment 442 @type aln_dic: dict 443 @param templates_profiles: contains all the profile rmsd of 444 each template with the target and their %ID 445 @type templates_profiles: dict 446 @param templates: name of the different templates 447 @type templates: [str] 448 @param model: model 449 @type model: PDBModel 450 @param output_file: output file 451 (default: None S{->} outFolder/L{F_CROSS_VAL}) 452 @type output_file: str 453 454 @return: mean_rmsd, dictionary with the mean rmsd of the model 455 to the templates. 456 @rtype: dict 457 """ 458 output_file = output_file or self.outFolder + self.F_CROSS_VAL 459 460 mean, sum, values = 0, 0, 0 461 mean_rmsd = [] 462 463 for k,v in aln_dic["target"]["cov_ID"].items(): 464 465 if (k != "target"): 466 sum += aln_dic["target"]["cov_ID"][k] 467 values +=1 468 469 ## cov_id_target = float(sum/values) 470 471 for i in range(len(templates_profiles[templates[0]]["rProfile"])): 472 473 mean = 0 474 sum = 0 475 n_values = 0 476 477 for k in templates_profiles: 478 479 if(templates_profiles[k]["rProfile"][i] != -1): 480 sum += templates_profiles[k]["rProfile"][i] 481 n_values += 1 482 483 if(n_values != 0): 484 mean = float(sum) / float(n_values) 485 486 else: mean = -1 487 488 mean_rmsd.append(mean) 489 490 ## write header 491 file = open (output_file, 'w') 492 file.write("Mean rmsd of model to templates and the residue rmsd.\n") 493 494 ## write pdb code 495 file.write(" "*7) 496 for k in templates_profiles.keys(): 497 file.write("%6s"%k) 498 file.write(" mean\n") 499 500 ## write mean rmsd 501 file.write(" "*7) 502 for k in templates_profiles.keys(): 503 file.write("%6.2f"%templates_profiles[k]["cov_ID"]) 504 file.write("\n%s\n"%('='*70)) 505 506 ## write rmsd residue profiles 507 res_nr = model.compress( model.maskCA()).aProfiles['residue_number'] 508 res_na = model.compress( model.maskCA()).aProfiles['residue_name'] 509 for i in range(len(templates_profiles[templates[0]]["rProfile"])): 510 file.write("%3i %3s"%(res_nr[i], 511 res_na[i])) 512 for k in templates_profiles: 513 file.write("%6.2f"%(templates_profiles[k]["rProfile"][i])) 514 515 file.write("%6.2f\n"%(mean_rmsd[i])) 516 517 file.close() 518 519 return mean_rmsd
520 521 522 ################################################# 523 ###### 3D Structure: mean RMSD 524 525
526 - def updatePDBs_charge(self, mean_rmsd_atoms, model):
527 """ 528 pickle down the final.pdb which is judged to be the best model 529 of the project. The mean rmsd to the templates is written to the 530 temperature_factor column. 531 532 @param mean_rmsd_atoms: mean rmsd for each atom of the 533 target's model 534 @type mean_rmsd_atoms: [int] 535 @param model: target's model with the highest modeller score 536 @type model: PDBModel 537 """ 538 model['temperature_factor'] = v 539 540 model.writePdb(self.outFolder + self.F_FINAL_PDB)
541 542 543 544 ###################### 545 ### LAUNCH FUNCTION ## 546 ###################### 547
548 - def go(self, output_folder = None, template_folder = None):
549 """ 550 Run analysis of models. 551 552 @param output_folder: folder for result files 553 (default: None S{->} outFolder/L{F_RESULT_FOLDER}) 554 @type output_folder: str 555 @param template_folder: folder with template structures 556 (default: None S{->} outFolder/L{VS.F_RESULT_FOLDER}) 557 @type template_folder: str 558 """ 559 ## 560 pdb_list = T.Load(self.outFolder + self.F_MODELS) 561 model = PDBModel(pdb_list[0]) 562 563 ## 564 output_folder = output_folder or self.outFolder + self.F_RESULT_FOLDER 565 template_folder = template_folder or self.outFolder +VS.F_RESULT_FOLDER 566 567 templates = self.__listDir(template_folder) 568 569 ## 570 global_rmsd_aa_wo_if, global_rmsd_aa_if = self.global_rmsd_aa() 571 global_rmsd_ca_wo_if, global_rmsd_ca_if = self.global_rmsd_ca() 572 nb_templates = len(templates)-1 573 574 identities = self.get_identities(nb_templates) 575 score = self.get_score() 576 577 self.output_values(global_rmsd_aa_wo_if, global_rmsd_aa_if, 578 global_rmsd_ca_wo_if, global_rmsd_ca_if, 579 identities, score, nb_templates) 580 581 ## 582 aln_dic = self.get_aln_info(output_folder=self.outFolder) 583 584 template_rmsd_dic = self.get_templates_rmsd(templates) 585 templates_profiles = self.templates_profiles(templates, 586 aln_dic, 587 template_rmsd_dic) 588 mean_rmsd = self.output_cross_val(aln_dic, templates_profiles, 589 templates, model) 590 591 ## 592 mean_rmsd_atoms = model.res2atomProfile(mean_rmsd) 593 self.updatePDBs_charge(mean_rmsd_atoms, model)
594 595 596 597 ############# 598 ## TESTING 599 ############# 600 import Biskit.test as BT 601
602 -class Test( BT.BiskitTest ):
603 """ 604 Test class 605 """ 606
607 - def prepare(self):
608 import tempfile 609 import shutil 610 611 ## collect the input files needed 612 self.outfolder = tempfile.mkdtemp( '_test_Analyse' ) 613 614 ## data from validation sub-projects 615 dir = '/Mod/project/validation/' 616 for v in ['1DT7', '1J55']: 617 618 os.makedirs( self.outfolder +'/validation/%s/modeller'%v ) 619 shutil.copy( T.testRoot() +dir+'/%s/modeller/Modeller_Score.out'%v, 620 self.outfolder + '/validation/%s/modeller'%v) 621 622 shutil.copy( T.testRoot() + dir + '/%s/identities_cov.out'%v, 623 self.outfolder + '/validation/%s'%v ) 624 625 os.mkdir( self.outfolder +'/validation/%s/benchmark'%v ) 626 for f in [ 'rmsd_aa.out', 'rmsd_ca.out', 'PDBModels.list' ]: 627 shutil.copy( T.testRoot() + dir + '/%s/benchmark/%s'%(v,f), 628 self.outfolder + '/validation/%s/benchmark'%v ) 629 630 631 ## data from main project 632 os.mkdir( self.outfolder +'/modeller' ) 633 shutil.copy( T.testRoot() + '/Mod/project/modeller/PDBModels.list', 634 self.outfolder + '/modeller' ) 635 636 os.mkdir( self.outfolder +'/t_coffee' ) 637 shutil.copy( T.testRoot() + '/Mod/project/t_coffee/final.pir_aln', 638 self.outfolder + '/t_coffee' )
639 640
641 - def test_Analyzer(self):
642 """Mod.Analyzer test""" 643 644 self.a = Analyse( outFolder = self.outfolder ) 645 self.a.go() 646 647 if self.local and self.DEBUG: 648 self.log.add( 649 'The result from the analysis is in %s/analyse'%self.outfolder)
650
651 - def cleanUp(self):
652 T.tryRemove( self.outfolder, tree=1 )
653 654
655 -class ProjectAnalyzeTest( BT.BiskitTest ):
656 """ 657 Test case for analyzing a complete modeling project in test/Mod/project 658 """ 659 660 ## rename to test_Analyze to include this test
661 - def t_Analyze( self ):
662 """ 663 Mod.Analyze full test/Mod/project test 664 """ 665 self.outfolder = T.testRoot() + '/Mod/project' 666 667 self.a = Analyse( outFolder = self.outfolder ) 668 self.a.go() 669 670 if self.local: 671 print 'The result from the analysis can be found in %s/analyse'%outfolder
672 673 if __name__ == '__main__': 674 675 BT.localTest() 676