Package Biskit :: Package Mod :: Module Aligner
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.Aligner

  1  ## 
  2  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  3  ## Copyright (C) 2004-2006 Raik Gruenberg & Johan Leckner 
  4  ## 
  5  ## This program is free software; you can redistribute it and/or 
  6  ## modify it under the terms of the GNU General Public License as 
  7  ## published by the Free Software Foundation; either version 2 of the 
  8  ## License, or any later version. 
  9  ## 
 10  ## This program is distributed in the hope that it will be useful, 
 11  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 13  ## General Public License for more details. 
 14  ## 
 15  ## You find a copy of the GNU General Public License in the file 
 16  ## license.txt along with this program; if not, write to the Free 
 17  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 18  ## 
 19  ## 
 20  ## last $Author: graik $ 
 21  ## last $Date: 2007/03/20 21:40:26 $ 
 22  ## $Revision: 2.16 $ 
 23  """ 
 24  Sequence Alignment 
 25  """ 
 26   
 27  import re, os 
 28   
 29  from TemplateCleaner import TemplateCleaner as TC 
 30  from SequenceSearcher import SequenceSearcher as SS 
 31   
 32  import Biskit.tools as T 
 33  from Biskit.Errors import * 
 34   
 35  from Biskit import Executor, TemplateError 
 36   
 37   
38 -class AlignerError( BiskitError ):
39 pass
40 41
42 -class TCoffee( Executor ):
43 """ 44 Execute a t_coffee command 45 """ 46
47 - def __init__(self, cmd, host=None, **kw ):
48 """ 49 @param cmd: command sequence 50 @type cmd: str 51 @param host: host name for remote execution 52 (default: None=local execution) 53 @type host: str 54 """ 55 Executor.__init__( self, 't_coffee', cmd, catch_out=1, 56 node=host, **kw )
57 58
59 -class Aligner:
60 """ 61 Take template CA traces and template sequences plus non-template 62 sequences. 63 64 Returns a T-Coffee script or T-Coffee alignment 65 """ 66 67 F_RESULT_FOLDER = '/t_coffee' 68 69 F_FAST_LIB = F_RESULT_FOLDER + '/fast_pair.lib' 70 F_LALIGN_LIB = F_RESULT_FOLDER + '/lalign_id_pair.lib' 71 F_SAP_LIB = F_RESULT_FOLDER + '/sap_pair.lib' 72 F_STRUCT_ALN = F_RESULT_FOLDER + '/struct.aln' 73 F_FINAL = F_RESULT_FOLDER + '/final' 74 F_COFFEE_LOG = F_RESULT_FOLDER + '/t_coffee.log' 75 F_COFFEE_INP = F_RESULT_FOLDER + '/t_coffee.inp' 76 F_FAST_TREE = F_RESULT_FOLDER + '/fast_tree.dnd' 77 F_SEQ_TREE = F_RESULT_FOLDER + '/sequence.dnd' 78 79 ## default result alignment file. Not actually needed in Aligner itself. 80 F_FINAL_ALN = F_FINAL + '.pir_aln' 81
82 - def __init__( self, outFolder='.', log=None, verbose=1, sap=1 ):
83 """ 84 @param outFolder: base folder for t_coffee output 85 (default: L{F_RESULT_FOLDER}) 86 @type outFolder: str 87 @param log: log file instance, if None, STDOUT is used (default: None) 88 @type log: LogFile 89 @param verbose: be verbose 90 @type verbose: 1 | 0 91 @param sap: perform structural alignment 92 @type sap: 1 | 0 93 """ 94 self.log = log 95 self.outFolder = T.absfile( outFolder ) 96 self.verbose = verbose 97 self.sap = sap 98 99 ## recognize file types for adding t_coffee type code 100 self.ex_inp_types = { 'P' : re.compile('.+\.[Pp][Dd][Bb]$|'+ 101 '.+\.[Aa][Ll][Pp][Hh][Aa]$'), 102 'L' : re.compile('.+\.[Ll][Ii][Bb]$'), 103 'S' : re.compile('.*\.[Ff][Aa][Ss][Tt][Aa]$') 104 } 105 106 self.commands = [] ## list of t_coffee commands to run 107 108 fn = self.outFolder + self.F_RESULT_FOLDER 109 if not os.path.exists( fn ): 110 os.mkdir( fn ) 111 if verbose: self.logWrite( 'Creating new output folder '+ fn )
112 113
114 - def logWrite( self, msg, force=1 ):
115 """ 116 Write message to log. 117 118 @param msg: message to print 119 @type msg: str 120 """ 121 if self.log: 122 self.log.writeln( msg ) 123 else: 124 if force: 125 print msg
126 127
128 - def __add_type_code( self, inp ):
129 """ 130 Try guessing a t_coffee type code for file name and add type code 131 (P for structurem S for sequence and L for library). 132 133 @param inp: file name 134 @type inp: str 135 136 @return: type_code + file name OR file name unchanged 137 @rtype: str 138 """ 139 for code, pattern in self.ex_inp_types.items(): 140 if pattern.match( inp ): 141 return code + inp 142 return inp
143 144
145 - def coffee_align_inp( self, input, method=None, out_lib=None, 146 output=None, outfile=None, **kw ):
147 """ 148 Create a single t_coffee command. 149 150 @param input: list of input files 151 @type input: [str] 152 @param method: t_coffee method ('M' is added automatically) 153 (default:None) 154 @type method: str 155 @param output: output format(s) (default:None) 156 @type output: [str] 157 @param out_lib: output library file if needed (default:None) 158 @type out_lib: str 159 @param outfile: output file (base) if needed (default:None) 160 @type outfile: str 161 @param kw: additional param=value pairs 162 @type kw: param=value 163 164 @return: t_coffee command 165 @rtype: str 166 """ 167 r = '' 168 169 input = [ self.__add_type_code( i ) for i in T.toList(input) ] 170 171 if input or method: 172 r += ' -in' 173 for i in input: 174 r += ' ' + i 175 if method: 176 r += ' M' + method 177 178 if out_lib: 179 r += ' -out_lib ' + out_lib 180 181 if output: 182 r += ' -output' 183 for o in T.toList( output ): 184 r += ' ' + o 185 186 if not outfile: 187 raise AlignerError, 'T-Coffee -output also requires -outfile' 188 189 if outfile: 190 r += ' -outfile ' + outfile 191 192 if not output: 193 r += ' -lib_only' 194 195 r += ' -template_file No' ## MSAP runs generate unwanted *.template_file 196 197 for param, value in kw.items(): 198 r += ' -'+param + ' ' + str( value ) 199 200 return r
201 202
203 - def __default_input_files( self, pdbFiles=None, fasta_templates=None, 204 fasta_sequences=None, fasta_target=None ):
205 """ 206 Fetch missing fasta and pdb files from standard locations. 207 208 @param pdbFiles: fetch pdb file from L{TC.F_COFFEE} (default:None) 209 @type pdbFiles: [str] 210 @param fasta_templates: fetch fasta template file from L{TC.F_FASTA} 211 (default:None) 212 @type fasta_templates: [str] 213 @param fasta_sequences: fetch fasta sequence file from 214 L{SS.F_FASTA_NR} (default:None) 215 @type fasta_sequences: [str] 216 @param fasta_target: fetch fasta target file from 217 L{SS.F_FASTA_TARGET} (default:None) 218 @type fasta_target: [str] 219 220 @return: dictionary mapping input file class to path(s) 221 @rtype: {str:[str]} 222 """ 223 r = {} 224 225 ## collect CA trace PDBs from default location 226 if not pdbFiles: 227 folder = self.outFolder + TC.F_COFFEE 228 fs = os.listdir( folder ) 229 pdbFiles= [ folder + f for f in fs if f[-6:].upper()=='.ALPHA' ] 230 231 r['pdbs'] = T.toList( pdbFiles ) 232 r['templates'] = fasta_templates or self.outFolder + TC.F_FASTA 233 r['sequences'] = fasta_sequences or self.outFolder + SS.F_FASTA_NR 234 r['target'] = fasta_target or self.outFolder + SS.F_FASTA_TARGET 235 236 return r
237 238
239 - def align_for_modeller_inp( self, pdbFiles=None, fasta_templates=None, 240 fasta_sequences=None, fasta_target=None, 241 f_fast_tree=None, f_sequence_tree=None ):
242 """ 243 Prepare alignment commands for homology modeling. 244 245 @note: If verbose==1, the commands are mirrored to t_coffee.inp. 246 247 @param pdbFiles: template PDBs from L{TC.F_COFFEE} (default:None) 248 @type pdbFiles: [str] 249 @param fasta_templates: template sequences from 250 templates/templates.fasta (default:None) 251 @type fasta_templates: [str] 252 @param fasta_sequences: more sequences from L{SS.F_FASTA_NR} 253 (default:None) 254 @type fasta_sequences: [str] 255 @param fasta_target: fasta target files 256 L{SS.F_FASTA_TARGET} (default:None) 257 @type fasta_target: str 258 @param f_fast_tree: filename of clustalW style guide tree (.dnd) 259 @type f_fast_tree: str 260 @param f_sequence_tree: filename of clustalW style guide tree (.dnd) 261 @type f_sequence_tree: str 262 """ 263 ## Fetch default files where needed 264 d = self.__default_input_files( pdbFiles, fasta_templates, 265 fasta_sequences, fasta_target ) 266 267 pdbFiles = d['pdbs'] 268 f_templates = d['templates'] 269 f_sequences = d['sequences'] 270 f_target = d['target'] 271 272 pdbFiles = [ T.absfile( f ) for f in pdbFiles ] 273 274 f_templates = T.absfile( f_templates ) 275 f_sequences = T.absfile( f_sequences ) 276 277 ## create output file names 278 f_fast_lib = self.outFolder + self.F_FAST_LIB 279 f_lalign_lib = self.outFolder + self.F_LALIGN_LIB 280 f_sap_lib = self.outFolder + self.F_SAP_LIB 281 f_struct_aln = self.outFolder + self.F_STRUCT_ALN 282 f_final_aln = self.outFolder + self.F_FINAL 283 f_coffee_log= self.outFolder + self.F_COFFEE_LOG 284 f_fast_tree= f_fast_tree or self.outFolder + self.F_FAST_TREE 285 f_sequence_tree= f_sequence_tree or self.outFolder + self.F_SEQ_TREE 286 287 ## fix target file 288 self.repair_target_fasta( f_target ) 289 290 if len( pdbFiles ) == 0: 291 raise AlignerError( "\n No templates avaliable. ABORTING" ) 292 293 ## SAP will not run if there is only one template 294 if (not self.sap) or ( len( pdbFiles ) == 1 ): 295 if self.sap: 296 self.logWrite('WARNING! Only one template avaliable:' +\ 297 str(pdbFiles) ) 298 self.logWrite('Structural alignment (SAP) will not be performed.') 299 300 r = [ 301 ## fast global pair-wise alignment 302 ## why not use slow pair? better for distant sequences 303 self.coffee_align_inp( [ f_sequences, f_templates, f_target ], 304 method='fast_pair', 305 out_lib=f_fast_lib, 306 quiet=f_coffee_log+'_1', convert='' ), 307 308 ## uses LALIGN to find the best pair-wise local alignments 309 self.coffee_align_inp( [ f_sequences, f_templates, f_target ], 310 method='lalign_id_pair', 311 out_lib=f_lalign_lib, 312 quiet=f_coffee_log+'_2', convert='' ), 313 314 ## combine alignments to one 315 self.coffee_align_inp( [ f_fast_lib, f_lalign_lib], 316 clean_aln=0, newtree=f_fast_tree, 317 output=['clustalw','phylip', 318 'score_html', 'pir_aln'], 319 ## run_name=T.stripFileName(f_final_aln), 320 outfile=f_final_aln, 321 quiet=f_coffee_log+'_4') 322 ] 323 324 ## Normal alignment run with structural alignment 325 ## (more than one template) 326 else: 327 r = [ 328 ## fast global pair-wise alignment 329 ## why not use slow pair? better for distant sequences 330 self.coffee_align_inp( [ f_sequences, f_templates, f_target ], 331 method='fast_pair', 332 out_lib=f_fast_lib, 333 quiet=f_coffee_log+'_1', convert='' ), 334 335 ## uses LALIGN to find the best pair-wise local alignments 336 self.coffee_align_inp( [ f_sequences, f_templates, f_target ], 337 method='lalign_id_pair', 338 out_lib=f_lalign_lib, 339 quiet=f_coffee_log+'_2', convert='' ), 340 341 ## uses SAP to do pairwise structural alignments 342 self.coffee_align_inp( pdbFiles, newtree=f_sequence_tree, 343 method='sap_pair', 344 weight=1000, 345 out_lib=f_sap_lib, 346 output='fasta_aln', 347 outfile=f_struct_aln, 348 quiet=f_coffee_log+'_3' ), 349 350 ## fix lib file 351 (f_sap_lib, f_struct_aln), 352 353 ## combine alignments to one 354 self.coffee_align_inp( [ f_fast_lib, f_lalign_lib, f_sap_lib], 355 clean_aln=0, newtree=f_fast_tree, 356 output=['clustalw','phylip', 357 'score_html', 358 'pir_aln', 'score_ascii'], 359 ## run_name=T.stripFileName(f_final_aln), 360 outfile=f_final_aln, 361 quiet=f_coffee_log+'_4') 362 ] 363 364 ## add to internal command 'queue' 365 self.commands += r 366 367 if self.verbose: 368 f = open( self.outFolder + self.F_COFFEE_INP, 'w' ) 369 for cmd in r: 370 f.write( str(cmd) + '\n\n' ) 371 f.close()
372 373
374 - def repair_lib_file(self, fname ):
375 """ 376 Msap_pair alignments mess up sequence ids of certain 377 (single-chain?) structures. 378 @note: Dirty hack.. 379 380 @param fname: Msap_pair file to repair 381 @type fname: str 382 """ 383 ex_sapfix = re.compile('_[A-Z]{0,1}_[A-Z]') 384 ex_alpha = re.compile('.alpha') ## new from T-Coffe ver. 3.32 385 bak_fname = fname+'_original' 386 387 os.rename( fname, bak_fname) 388 389 fout = open( fname, 'w' ) 390 for l in open( bak_fname ): 391 l = re.sub( ex_alpha, '', l ) 392 fout.write( re.sub( ex_sapfix, '_', l ) ) 393 394 fout.close()
395 396
397 - def repair_target_fasta( self, fname ):
398 """ 399 Change target ID to 'target' and remove '|' etc. that mess up 400 things in the alignments. 401 402 @param fname: fasta file to repair 403 @type fname: str 404 405 @raise AlignerError: if cannot fix target sequence file 406 """ 407 bak_fname = fname+'_original' 408 os.rename( fname, bak_fname) 409 410 try: 411 fout = open( fname, 'w' ) 412 fin = open( bak_fname ) 413 414 l0 = fin.readline() #skipp header line 415 416 fout.write( '>target\n' ) 417 418 for l in fin: 419 fout.write( l ) 420 421 fin.close() 422 fout.close() 423 424 except Exception, why: 425 print T.lastError() 426 raise AlignerError("Cannot fix target sequence file: "+str(why))
427 428
429 - def go( self, host=None ):
430 """ 431 Run the previously added commands, delete internal command list. 432 433 @param host: host name for remote execution 434 (default: None=local execution) 435 @type host: str 436 437 @raise AlignerError: if T_Coffee execution failed 438 """ 439 try: 440 if self.verbose: 441 self.logWrite('\nALIGNING...\n') 442 443 for cmd in self.commands: 444 445 if type( cmd ) == tuple: 446 for f in cmd: 447 self.repair_lib_file( f ) 448 else: 449 450 if host: 451 tc = TCoffee( cmd, host ) 452 else: 453 tc = TCoffee( cmd ) 454 455 self.logWrite( 'Running t_coffee .. ') 456 self.logWrite( cmd ) 457 458 ## run t_coffee 459 output, error, returncod = tc.run() 460 461 self.logWrite( '..done: ' + str( output ) + '\n' ) 462 print "DEBUG: ", os.path.exists('1MHO_.template_file') 463 464 except EnvironmentError, why: 465 self.logWrite("ERROR: Can't run t_coffee: %s Error: %s"\ 466 %( why, error ) ) 467 raise AlignerError( "Can't run t_coffee: %s Error: %"\ 468 %( why, error ) )
469 470 471 ############# 472 ## TESTING 473 ############# 474 import Biskit.test as BT 475
476 -class Test(BT.BiskitTest):
477 """ 478 Base Test class (does not actually run any test) 479 """
480 - def prepare(self):
481 import tempfile 482 import shutil 483 from Biskit import LogFile, StdLog 484 485 ## collect the input files needed 486 487 self.outfolder = tempfile.mkdtemp( '_test_Aligner' ) 488 489 ## self.outfolder = '/tmp/test_aligner' 490 ## try: os.mkdir(self.outfolder) 491 ## except: pass 492 493 os.mkdir( self.outfolder +'/templates' ) 494 os.mkdir( self.outfolder +'/sequences/' ) 495 496 shutil.copytree( T.testRoot() + '/Mod/project/templates/t_coffee', 497 self.outfolder + '/templates/t_coffee' ) 498 499 shutil.copy( T.testRoot() + '/Mod/project/templates/templates.fasta', 500 self.outfolder + '/templates' ) 501 502 shutil.copy( T.testRoot() + '/Mod/project/sequences/nr.fasta', 503 self.outfolder + '/sequences/' ) 504 505 shutil.copy( T.testRoot() + '/Mod/project/target.fasta', 506 self.outfolder ) 507 508 if not self.local: 509 self.a_log = LogFile( self.outfolder + '/Aligner.log' ) 510 else: 511 self.a_log = StdLog()
512 513
514 - def t_Aligner(self, run=True ):
515 516 self.a = Aligner( outFolder=self.outfolder, verbose=self.local, 517 log=self.a_log) 518 519 self.a.align_for_modeller_inp() 520 521 if run: 522 self.a.go() 523 if self.local: 524 self.log.add( 525 'The alignment result is in %s/t_coffee' % self.outfolder)
526
527 - def cleanUp(self):
528 T.tryRemove( self.outfolder, tree=1 )
529 530
531 -class TestDry( Test ):
532 """Test case dry run -- only initialize without running t-coffee""" 533 534 TAGS = [] 535
536 - def test_AlignerDryRun(self):
537 """Mod.Aligner dry run test""" 538 self.t_Aligner(run=False)
539 540
541 -class TestLong( Test ):
542 """Complete test case for Aligner""" 543 544 TAGS = [BT.EXE, BT.LONG] 545
546 - def test_Aligner(self):
547 """Mod.Aligner test""" 548 self.t_Aligner(run=True)
549 550 if __name__ == '__main__': 551 552 BT.localTest(debug=0) 553