Package Biskit :: Package Mod :: Module Modeller
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.Modeller

  1  ## 
  2  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  3  ## Copyright (C) 2004-2005 Raik Gruenberg & Johan Leckner 
  4  ## 
  5  ## This program is free software; you can redistribute it and/or 
  6  ## modify it under the terms of the GNU General Public License as 
  7  ## published by the Free Software Foundation; either version 2 of the 
  8  ## License, or any later version. 
  9  ## 
 10  ## This program is distributed in the hope that it will be useful, 
 11  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 13  ## General Public License for more details. 
 14  ## 
 15  ## You find a copy of the GNU General Public License in the file 
 16  ## license.txt along with this program; if not, write to the Free 
 17  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 18  ## 
 19  ## 
 20  ## original authors: Raik Gruenberg, Johan Leckner 
 21  ## contributing    : Olivier Perin (post-processing) 
 22  ## 
 23  ## last $Author: leckner $ 
 24  ## last $Date: 2006/09/13 10:28:14 $ 
 25  ## $Revision: 2.8 $ 
 26  """ 
 27  Interface to Modeller 
 28  """ 
 29   
 30  import re, os, os.path, subprocess 
 31  import commands 
 32  import linecache 
 33  from string import * 
 34   
 35  import settings 
 36  import Biskit.tools as T 
 37   
 38  from Biskit.Mod.TemplateCleaner import TemplateCleaner as TC 
 39  from Biskit.Mod.SequenceSearcher import SequenceSearcher as SS 
 40  from Biskit.Mod.CheckIdentities import Check_Identities as CI 
 41  from Biskit.Mod.Aligner import Aligner 
 42  from Biskit.ModelList import ModelList 
 43  from Biskit.DictList import DictList 
 44   
 45  from Biskit import Executor, TemplateError 
 46   
 47  from Biskit import PDBModel 
 48  import glob 
 49   
 50  from Biskit import StdLog, EHandler 
 51   
 52   
53 -class ModellerError( Exception ):
54 pass
55 56
57 -class Modeller( Executor ):
58 """ 59 Run a Modeller job 60 ================== 61 62 Take Alignment from t_coffee and template PDBs. Creates a modeller-script 63 and runs modeller and post-processes the results. The Modeller output ends 64 up in the output folder. 65 66 Example usage 67 ------------- 68 >>> m = Modeller( outfolder ) 69 >>> m.run() 70 71 References 72 ---------- 73 - U{http://www.salilab.org/modeller/} 74 """ 75 76 MODELLER_TEMPLATE = """ 77 # Homology modelling by the MODELLER TOP routine 'model'. 78 79 INCLUDE # Include the predefined TOP routines 80 81 SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file 82 SET ALNFILE = '%(f_pir)s' # alignment filename 83 SET KNOWNS = %(template_ids)s 84 SET SEQUENCE = '%(target_id)s' # code of the target 85 SET ATOM_FILES_DIRECTORY = '%(template_folder)s'# directories for input atom files 86 SET STARTING_MODEL= %(starting_model)i # index of the first model 87 SET ENDING_MODEL = %(ending_model)i # index of the last model 88 #(determines how many models to calculate) 89 90 CALL ROUTINE = 'model' # do homology modelling 91 """ 92 93 F_RESULT_FOLDER = '/modeller' 94 F_MOD_SCRIPT = 'modeller.top' 95 96 F_INPUT_FOLDER = F_RESULT_FOLDER 97 98 ## default modeller inp file 99 F_INP = F_RESULT_FOLDER + '/' + F_MOD_SCRIPT 100 101 ## standard file name output for PDBModels 102 F_PDBModels = '/PDBModels.list' 103 104 ## standard file name output for Objective Function 105 F_SCORE_OUT = F_RESULT_FOLDER + '/Modeller_Score.out' 106 107
108 - def __init__( self, outFolder='.', log=None, fasta_target=None, 109 f_pir=None, template_folder=None, fout=None, 110 starting_model=1, ending_model=10, verbose=0, 111 host=None, **kw ):
112 """ 113 @param outFolder: base folder for Modeller output 114 (default: L{F_RESULT_FOLDER}) 115 @type outFolder: str 116 @param log: log file instance, if None, STDOUT is used (default: None) 117 @type log: LogFile 118 @param f_pir: PIR alignment file (default: None -> L{Aligner.F_FINAL_ALN}) 119 @type f_pir: str 120 @param template_folder: folder with template coordinate files 121 (default: None -> L{TC.F_MODELLER}) 122 @type template_folder: str 123 @param fout: name of modeller script (default: None -> L{F_INP}) 124 @type fout: None or str 125 @param starting_model: first model (default: 1) 126 @type starting_model: int 127 @param ending_model: last model (default: 10) 128 @type ending_model: int 129 @param verbose: verbosity level (default: 1) 130 @type verbose: 1|0 131 @param host: host for calculation (None->no ssh) (default: None) 132 @type host: str 133 """ 134 self.outFolder = T.absfile( outFolder ) 135 self.log = log 136 137 self.verbose = verbose 138 139 self.f_inp = None 140 141 self.f_pir = f_pir 142 self.template_folder = template_folder 143 self.starting_model = starting_model 144 self.ending_model = ending_model 145 146 f_top = '%s%s'%(self.outFolder, self.F_INP) 147 148 self.template_ids = '' 149 self.target_id = '' 150 151 ## sequence ids from the last prepared alignment 152 self.pir_ids = [] 153 154 Executor.__init__( self, 'modeller', args=f_top, f_in=f_top, 155 template=self.MODELLER_TEMPLATE, 156 cwd='%s%s'%(self.outFolder, self.F_RESULT_FOLDER), 157 verbose=verbose, node=host, **kw )
158 159
160 - def prepareFolders( self ):
161 """ 162 Create needed output folders if they don't exist. 163 """ 164 if not os.path.exists( self.outFolder + self.F_RESULT_FOLDER ): 165 os.mkdir( self.outFolder + self.F_RESULT_FOLDER ) 166 if self.verbose: 167 self.logWrite( 'Creating '+self.outFolder + self.F_RESULT_FOLDER)
168 169
170 - def prepare( self ):
171 """ 172 Create needed folders and get information needed by Modeller 173 """ 174 self.prepareFolders() 175 176 self.prepare_modeller()
177 178
179 - def generateInp(self):
180 """ 181 Convert the list of template id's into a string in addition 182 L{Executor.generateInp} functionality. 183 184 Replace formatstr place holders in inp by fields of this class. 185 186 @return: input file name OR (if pipes=1) content of input file 187 @rtype: str 188 189 @raise TemplateError: if error while creating template file 190 """ 191 ## we need to convert the list of template id's into a string 192 self.template_ids = [ "'%s'"%id for id in self.template_ids ] 193 self.template_ids = ' '.join( self.template_ids ) 194 195 try: 196 inp = None 197 198 if self.template: 199 inp = self.fillTemplate() 200 201 return self.convertInput( inp ) 202 203 except Exception, why: 204 s = "Error while creating template file." 205 s += "\n template file: " + str( self.template ) 206 s += "\n why: " + str( why ) 207 s += "\n Error:\n " + t.lastError() 208 raise TemplateError, s
209 210
211 - def logWrite( self, msg, force=1 ):
212 """ 213 Write message to log. 214 215 @param msg: message to print 216 @type msg: str 217 """ 218 if self.log: 219 self.log.add( msg ) 220 else: 221 if force: 222 print msg
223 224
225 - def get_template_ids( self ):
226 """ 227 Extract names of all PDB files in a folder (without .PDB) 228 229 @return: list of PDB names 230 @rtype: [str] 231 """ 232 fs = os.listdir( self.template_folder ) 233 return [ f[:-4] for f in fs if f[-4:].upper()=='.PDB' ]
234 235
236 - def get_target_id( self, f_fasta ):
237 """ 238 Extract (first) sequence id from fasta file. Make intelligent guess 239 if there is no exact match between target fasta and alignment file. 240 241 @param f_fasta: fasta file with target sequence 242 @type f_fasta: str 243 244 @raise ModellerError: if target ID is not found in alignment 245 """ 246 ex_fasta_id = re.compile('^>([A-Za-z0-9_]+)') 247 248 title = open( f_fasta ).readline() 249 250 try: 251 id_fasta = ex_fasta_id.findall( title )[0] 252 except IndexError, why: 253 raise ModellerError( "Can't extract sequence id from "+f_fasta ) 254 255 if self.pir_ids: 256 ## the id matches one previously found in the alignment 257 if id_fasta in self.pir_ids: 258 return id_fasta 259 260 guesses = [ pid for pid in self.pir_ids 261 if pid[:len(id_fasta)] == id_fasta ] 262 263 if len( guesses ) == 1: 264 return guesses[0] 265 266 raise ModellerError("Target ID %s is not found in alignment." %\ 267 id_fasta ) 268 269 self.logWrite('Warning: target sequence ID extracted without check'+ 270 ' against alignment file. Use prepare_alignment first.') 271 272 title.close() 273 274 return id_fasta
275 276
277 - def prepare_alignment( self ):
278 """ 279 Convert t_coffee - generated PIR into PIR for Modeller. 280 On the way all sequence ids are extracted into self.pir_ids. 281 282 @raise ModellerError: if can't rename file 283 """ 284 ex_title = re.compile( '^>P1;([A-Za-z0-9_]+)' ) 285 ex_description = re.compile( '^sequence|^structure' ) 286 287 self.pir_ids = [] 288 289 try: 290 bak_fname = self.f_pir + '_original' 291 os.rename( self.f_pir, bak_fname ) 292 except OSError, why: 293 raise ModellerError( "Can't rename " + self.f_pir + " to "+ bak_fname ) 294 295 lines = open( bak_fname ).readlines() 296 297 for i in range( len( lines ) ): 298 current = lines[ i ] 299 300 if ex_title.match( current ): 301 id = ex_title.findall( current )[0] 302 303 self.pir_ids += [ id ] 304 305 prefix = 'sequence' 306 if id in self.template_ids: 307 prefix = 'structure' 308 ## if lines[i + 2][0] != '*': 309 ## lines[i + 2] = '*' + lines[i + 2] 310 311 lines[i + 1] = "%s:%s: : : : : : : : \n" \ 312 % (prefix, id) 313 314 else: 315 if not ex_description.match( current ): 316 lines[ i ] = current.upper() 317 318 open( self.f_pir, 'w' ).writelines( lines )
319 320
321 - def prepare_modeller( self, fasta_target=None, fout=None ):
322 """ 323 Prepare information needed by modeller. 324 325 @param fasta_target: target fasta file 326 (default: None -> L{SS.F_FASTA_TARGET}) 327 @type fasta_target: str 328 @param fout: name of modeller script (default: None -> L{F_INP}) 329 @type fout: None or str 330 """ 331 fasta_target = fasta_target or self.outFolder + SS.F_FASTA_TARGET 332 self.template_folder = self.template_folder or self.outFolder + TC.F_MODELLER 333 self.f_pir = self.f_pir or self.outFolder + Aligner.F_FINAL_ALN 334 335 self.template_ids = self.get_template_ids( ) 336 337 ## add Modeller-style lines and extract sequence ids 338 self.prepare_alignment( ) 339 340 ## guess target seq id within alignment 341 self.target_id = self.get_target_id( fasta_target )
342 343
344 - def finish( self ):
345 """ 346 Overrides Executor method 347 """ 348 Executor.finish( self ) 349 350 self.postProcess()
351 352 353 #################### 354 ## Post processing 355
356 - def pdb_list(self, model_folder):
357 """ 358 Compile a list with the names of all models created by Modeller. 359 360 @param model_folder: folder containing model PDBs 361 (output from Modeller) 362 @type model_folder: str 363 364 @return: pdb files from modeller 365 @rtype: [str] 366 """ 367 return glob.glob('%s/target.B*'%(model_folder))
368 369
370 - def extract_modeller_score(self, f_model, model):
371 """ 372 Extract the modeller score from the models 373 374 @param f_model: file name of model pdb 375 @type f_model: str 376 @param model: model 377 @type model: PDBModel 378 379 @return: modeller score for the models 380 @rtype: float 381 """ 382 r1 = re.compile(r'\w\d.*\d') 383 384 line_of_interest = linecache.getline(f_model, 2) 385 OF = r1.findall(line_of_interest) 386 387 file = open(model.validSource(),'r') 388 string = file.readlines()[:2] 389 model.info["headlines"] = string 390 file.close() 391 392 return float(OF[0])
393 394
395 - def output_score(self, pdb_list, model_folder):
396 """ 397 Write a list of model scores to file L{F_SCORE_OUT} 398 399 @param pdb_list: list of models 400 @type pdb_list: ModelList 401 @param model_folder: ouput folder 402 @type model_folder: str 403 """ 404 file_output = open(self.outFolder+self.F_SCORE_OUT,'w') 405 file_output.write("The models from modeller with their Score:\n") 406 407 for model in pdb_list: 408 409 file_output.write('%s\t%6.2f\n'%(T.stripFilename( 410 model.validSource()), model.info["mod_score"])) 411 412 file_output.close()
413 414
415 - def update_PDB(self, pdb_list, cwd, model_folder):
416 """ 417 Extract number of templates per residue position from alignment. 418 Write new PDBs with number of templates in occupancy column. 419 420 @param pdb_list: list of models 421 @type pdb_list: ModelList 422 @param cwd: current project directory 423 @type cwd: str 424 @param model_folder: folder with models 425 @type model_folder: str 426 """ 427 cwd = cwd or self.outFolder 428 429 ci = CI(cwd) 430 aln_dictionnary = ci.go() 431 432 template_info = aln_dictionnary["target"]["template_info"] 433 434 for model in pdb_list: 435 436 model.setResProfile("n_templates", template_info, 437 comment="number of templates for each residue") 438 439 rmask = pdb_list[0].profile2mask("n_templates", 1,1000) 440 amask = pdb_list[0].res2atomMask(rmask) 441 442 443 for i in range(len(pdb_list)): 444 445 for a,v in zip(pdb_list[i].atoms, amask): 446 a['occupancy'] = v 447 448 t = [] 449 450 for string in pdb_list[i].info["headlines"]: 451 452 tuple = () 453 tuple = string.split()[0],join(string.split()[1:]) 454 t.append(tuple) 455 456 pdb_list[i].writePdb('%s/model_%02i.pdb'%(model_folder,i), 457 headlines = t)
458 459
460 - def update_rProfiles(self, pdb_list):
461 """ 462 Read Modeller score from the 'temperature_factor' column and 463 create a profile from it. 464 465 @param pdb_list: list of models 466 @type pdb_list: ModelList 467 """ 468 for model in pdb_list: 469 470 m = model.compress( model.maskCA()) 471 472 atoms = DictList( m.atoms ) 473 prof = atoms.valuesOf( "temperature_factor") 474 475 model.setResProfile('mod_score', prof)
476 477
478 - def write_PDBModels(self, pdb_list, model_folder = None):
479 """ 480 Dump the list of PDBModels. 481 482 @param pdb_list: list of models 483 @type pdb_list: ModelList 484 @param model_folder: ouput folder (default: None -> L{F_PDBModels}) 485 @type model_folder: str 486 """ 487 T.Dump(pdb_list, '%s'%(model_folder + self.F_PDBModels))
488 489
490 - def postProcess(self, model_folder=None):
491 """ 492 Rename model PDBs to 'model_01.pdb' - 'model_xx.pdb' according to their 493 score, create and dump a ModelList instance containing these models 494 and info-records and profiles with the Modeller score. 495 496 @param model_folder: folder containing model PDBs 497 (default: None -> L{F_INPUT_FOLDER}) 498 @type model_folder: str 499 """ 500 501 model_folder = model_folder or self.outFolder + self.F_INPUT_FOLDER 502 503 model_files = self.pdb_list(model_folder) 504 505 pdb_list = ModelList( model_files ) 506 507 for model in pdb_list: 508 509 model.info["mod_score"] = self.extract_modeller_score( 510 model.validSource(), model) 511 512 pdb_list = pdb_list.sortBy("mod_score") 513 514 self.update_PDB(pdb_list, self.outFolder, model_folder) 515 516 self.output_score(pdb_list,model_folder) 517 518 self.update_rProfiles(pdb_list) 519 520 self.write_PDBModels(pdb_list, model_folder)
521 522 523 524 ############# 525 ## TESTING 526 ############# 527
528 -class Test:
529 """ 530 Test class 531 """ 532
533 - def run( self, local=0, run=0 ):
534 """ 535 run function test 536 537 @param local: transfer local variables to global and perform 538 other tasks only when run locally 539 @type local: 1|0 540 541 @param run: run the full test (call external application) or not 542 @type run: 1|0 543 544 @return: 1 545 @rtype: int 546 """ 547 import tempfile 548 import shutil 549 550 ## collect the input files needed 551 outfolder = tempfile.mkdtemp( '_test_Modeller' ) 552 os.mkdir( outfolder +'/templates' ) 553 os.mkdir( outfolder +'/t_coffee' ) 554 555 shutil.copytree( T.testRoot() + '/Mod/project/templates/modeller', 556 outfolder + '/templates/modeller' ) 557 558 shutil.copy( T.testRoot() + '/Mod/project/t_coffee/final.pir_aln', 559 outfolder + '/t_coffee' ) 560 561 shutil.copy( T.testRoot() + '/Mod/project/target.fasta', 562 outfolder ) 563 564 m = Modeller( outfolder, verbose=local ) 565 566 if run: 567 print 'Running modeller job ...' 568 m.run() 569 print 'The modelling result can be found in %s/modeller'%outfolder 570 571 if local: 572 globals().update( locals() ) 573 574 ## cleanup 575 T.tryRemove( outfolder, tree=1 ) 576 577 return 1
578 579
580 - def expected_result( self ):
581 """ 582 Precalculated result to check for consistent performance. 583 584 @return: 1 585 @rtype: int 586 """ 587 return 1
588 589 590 if __name__ == '__main__': 591 592 test = Test() 593 594 assert test.run( run=1, local=1 ) == test.expected_result() 595