Biskit.Hmmer

1 ## Automatically adapted for numpy.oldnumeric Mar 26, 2007 by alter_code1.py 2 3 ## Class conservation 4 ## 5 ## Biskit, a toolkit for the manipulation of macromolecular structures 6 ## Copyright (C) 2004-2006 Raik Gruenberg & Johan Leckner 7 ## 8 ## This program is free software; you can redistribute it and/or 9 ## modify it under the terms of the GNU General Public License as 10 ## published by the Free Software Foundation; either version 2 of the 11 ## License, or any later version. 12 ## 13 ## This program is distributed in the hope that it will be useful, 14 ## but WITHOUT ANY WARRANTY; without even the implied warranty of 15 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 ## General Public License for more details. 17 ## 18 ## You find a copy of the GNU General Public License in the file 19 ## license.txt along with this program; if not, write to the Free 20 ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 21 ## 22 ## 23 ## $Revision: 2.19 $ 24 ## last $Author: graik $ 25 ## last $Date: 2007/03/26 18:40:40 $ 26 27 """ 28 Search Hmmer Pfam database and retrieve conservation data. 29 """ 30 31 import tempfile 32 import re, string 33 import types, os.path 34 import numpy.oldnumeric as N 35 import molUtils 36 import settings 37 38 import Biskit.mathUtils as math 39 from Biskit.Errors import BiskitError 40 import Biskit.tools as T 41 import Biskit.molTools as MT 42 import Biskit.molUtils as MU 43 from Biskit import Executor, TemplateError, PDBModel, StdLog, EHandler 44 45 46 ## executables 47 hmmpfamExe = settings.hmmpfam_bin 48 hmmfetchExe = settings.hmmfetch_bin 49 hmmalignExe = settings.hmmalign_bin 50 hmmindexExe = settings.hmmindex_bin 51 hmmDatabase = settings.hmm_db 52 53

54 -class HmmerError( BiskitError ):

55 pass

56 57

58 -class HmmerdbIndex( Executor ):

59 """ 60 Checks if the hmm database has been indexed or not, 61 if not indexing will be done. 62 """ 63

64 - def __init__( self, hmmdb, **kw ):

65 """ 66 @param hmmdb: Pfam hmm database 67 @type hmmdb: str 68 """ 69 Executor.__init__( self, 'hmmindex', args='%s'%hmmdb, **kw ) 70 71 if not os.path.exists(hmmdb+'.ssi'): 72 if self.verbose: 73 self.log.writeln( 74 'HMMINDEX: Indexing hmm database. This will take a while') 75 76 self.run()

77 78

79 -class HmmerSearch( Executor ):

80 """ 81 Search hmm database (using the hmmpfam program) with a sequence 82 in fasta format. 83 If the profile names have been provided - skip the search and 84 only write the temporary sequence files. 85 """ 86

87 - def __init__( self, target, hmmdb, noSearch=None, **kw ):

88 """ 89 @param target: sequence 90 @type target: PDBModel or fasta file 91 @param hmmdb: Pfam hmm database 92 @type hmmdb: str 93 @param noSearch: don't perform a seach 94 @type noSearch: 1 OR None 95 """ 96 self.fName = tempfile.mktemp('.fasta') 97 self.hmmdb = hmmdb 98 99 Executor.__init__( self, 'hmmpfam', catch_out=1, 100 args=' %s %s'%(hmmdb, self.fName), **kw ) 101 102 self.target = target 103 104 self.fastaID = '' 105 106 if noSearch: 107 if self.verbose: 108 self.log.writeln( 109 'Profiles provided - No search will be performed.')

110 111

112 - def __verify_fasta(self, target ):

113 """ 114 Verify that a given file or string is in Fasta format. 115 The definition used for a fasta file here is that: 116 - first line starts with '>' 117 - the following sequence lines are not longer that 80 characters 118 - the characters has to belong to the standard amino acid codes 119 120 @param target: name of fasta file OR file contents as list of strings 121 @type target: str OR [str] 122 123 @return: conforms to the fsata format 124 @rtype: True/False 125 """ 126 if not type(target) == types.ListType: 127 if os.path.exists( target ): 128 f = open( target, 'r' ) 129 target = f.readlines() 130 131 if not target[0][0] == '>': 132 self.log.writeln('Fasta format does not contain description line.') 133 return False 134 135 for i in range( 1, len(target) ): 136 if len( target[i] ) >= 80: 137 self.log.writeln( 138 'Fasta sequence lines longer that 80 characters') 139 return False 140 141 for j in target[i]: 142 aa_codes = MU.aaDicStandard.values() + [ '\n' ] 143 if not j.upper() in aa_codes: 144 self.log.writeln('Invalid amino acid code: %s'%j.upper()) 145 return False 146 147 return True

148 149

150 - def prepare( self ):

151 """ 152 If the target id a PDBModel its sequence will be written 153 to disc as a fasta file. 154 If it is a fasta file the path to the file will be passed on. 155 """ 156 ## if target is a PDBModel 157 if isinstance( self.target, PDBModel): 158 fastaSeq, self.fastaID = MT.fasta( self.target ) 159 ## write fasta sequence file 160 seq = open( self.fName, 'w' ) 161 seq.write( fastaSeq ) 162 seq.close() 163 164 ## else assume it is a fasta sequence file 165 else: 166 if self.__verify_fasta(self.target): 167 self.fName = self.target

168 169

170 - def parse_result( self ):

171 """ 172 Parse the output from hmmpfam. 173 174 @return: dictionary witn profile names as keys and a list of 175 lists containing information about the range where the 176 profile matches the sequence 177 @rtype: dict, [list] 178 """ 179 matches = {} 180 hits = [] 181 182 ## check that the outfut file is there and seems valid 183 if not os.path.exists( self.f_out ): 184 raise HmmerError,\ 185 'Hmmersearch result file %s does not exist.'%self.f_out 186 187 if T.fileLength( self.f_out ) < 10: 188 raise HmmerError,\ 189 'Hmmersearch result file %s seems incomplete.'%self.f_out 190 191 try: 192 out = open( self.f_out, 'r' ) 193 while 1: 194 l = out.readline() 195 ## get names and descriptions of matching profiles 196 if re.match('^-{8}\s{7,8}-{11}.+-{3}$', l): 197 m = string.split( out.readline() ) 198 while len(m) != 0: 199 matches[m[0]] = m[1:] 200 m = string.split( out .readline() ) 201 202 ## get hits, scores and alignment positions 203 if re.match('^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l): 204 h = string.split( out.readline() ) 205 while len(h) != 0: 206 hits += [ h ] 207 h = string.split( out.readline() ) 208 break 209 210 except: 211 raise HmmerError,\ 212 'ERROR parsing hmmpfam search result: %s'%self.f_out 213 214 out.close() 215 216 return matches, hits

217 218

219 - def finish( self ):

220 """ 221 Overrides Executor method 222 """ 223 Executor.finish( self ) 224 self.result = self.parse_result( )

225 226 227

228 -class HmmerProfile( Executor ):

229 """ 230 Get the hmm profile with name hmmName from hmm databse 231 (using the program hmmfetch). 232 """ 233

234 - def __init__( self, hmmName, hmmdb, **kw ):

235 """ 236 @param hmmName: hmm profile name 237 @type hmmName: str 238 @param hmmdb: Pfam hmm database 239 @type hmmdb: str 240 """ 241 self.hmmName = hmmName 242 243 Executor.__init__( self, 'hmmfetch', 244 args=' %s %s'%(hmmdb, hmmName), **kw ) 245 246 self.f_out = tempfile.mktemp('.hmm')

247 248

249 - def parse_result( self ):

250 """ 251 Extract some information about the profile as well as the 252 match state emmission scores. Keys of the returned dictionary:: 253 'AA', 'name', 'NrSeq', 'emmScore', 'accession', 254 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum' 255 256 @return: dictionary with warious information about the profile 257 @rtype: dict 258 """ 259 ## check that the outfut file is there and seems valid 260 if not os.path.exists( self.f_out ): 261 raise HmmerError,\ 262 'Hmmerfetch result file %s does not exist.'%self.f_out 263 264 if T.fileLength( self.f_out ) < 10: 265 raise HmmerError,\ 266 'Hmmerfetch result file %s seems incomplete.'%self.f_out 267 268 profileDic = {} 269 270 ## read result 271 hmm = open( self.f_out, 'r') 272 out = hmm.read() 273 hmm.close() 274 275 ## collect some data about the hmm profile 276 profileDic['name'] = self.hmmName 277 profileDic['profLength'] = \ 278 int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] ) 279 profileDic['accession'] = \ 280 string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1] 281 profileDic['NrSeq'] = \ 282 int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] ) 283 profileDic['AA'] = \ 284 string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:] 285 286 ## collect null emmission scores 287 pattern = 'NULE[ ]+' + '[-0-9]+[ ]+'*20 288 nullEmm = [ float(j) for j in string.split(re.findall(pattern, out)[0])[1:] ] 289 290 ## get emmision scores 291 prob=[] 292 for i in range(1, profileDic['profLength']+1): 293 pattern = "[ ]+%i"%i + "[ ]+[-0-9]+"*20 294 e = [ float(j) for j in string.split(re.findall(pattern, out)[0]) ] 295 prob += [ e ] 296 297 profileDic['seqNr'] = N.transpose( N.take( prob, (0,),1 ) ) 298 profileDic['emmScore'] = N.array(prob)[:,1:] 299 300 ## calculate emission probablitities 301 emmProb, nullProb = self.hmmEmm2Prob( nullEmm, profileDic['emmScore']) 302 303 ent = [ N.resize( self.entropy(e, nullProb), (1,20) )[0] for e in emmProb ] 304 profileDic['ent'] = N.array(ent) 305 306 profileDic['ent'] = N.array(ent) 307 308 ###### TEST ##### 309 310 proba = N.array(prob)[:,1:] 311 312 ## # test set all to max score 313 ## p = proba 314 ## p1 = [] 315 ## for i in range( len(p) ): 316 ## p1 += [ N.resize( p[i][N.argmax( N.array( p[i] ) )] , N.shape( p[i] ) ) ] 317 ## profileDic['maxAll'] = p1 318 319 # test set all to N.sum( abs( probabilities ) ) 320 p = proba 321 p2 = [] 322 for i in range( len(p) ) : 323 p2 += [ N.resize( N.sum( N.absolute( p[i] )), N.shape( p[i] ) ) ] 324 profileDic['absSum'] = p2 325 326 # set all to normalized max score 327 p = proba 328 p4 = [] 329 for i in range( len(p) ) : 330 p_scale = (p[i] - N.average(p[i]) )/ math.SD(p[i]) 331 p4 += [ N.resize( p_scale[N.argmax( N.array(p_scale) )] , 332 N.shape( p[i] ) ) ] 333 profileDic['maxAllScale'] = p4 334 335 return self.f_out, profileDic

336 337

338 - def hmmEmm2Prob( self, nullEmm, emmScore ):

339 """ 340 Convert HMM profile emmisiion scores into emmission probabilities 341 342 @param nullEmm: null scores 343 @type nullEmm: array 344 @param emmScore: emmission scores 345 @type emmScore: array 346 347 @return: null and emmission probabilities, for each amino acid 348 in each position 349 @rtype: array( len_seq x 20 ), array( 1 x 20 ) 350 """ 351 ## Null probabilities: prob = 2 ^ (nullEmm / 1000) * 1/len(alphabet) 352 nullProb = N.power( 2, N.array( nullEmm )/1000.0 )*(1./20) 353 354 ## Emmission probabilities: prob = nullProb 2 ^ (nullEmm / 1000) 355 emmProb = nullProb * N.power( 2, ( emmScore/1000.0) ) 356 357 return emmProb, nullProb

358 359

360 - def entropy( self, emmProb, nullProb ):

361 """ 362 calculate entropy for normalized probabilities scaled by aa freq. 363 emmProb & nullProb is shape 1,len(alphabet) 364 365 @param emmProb: emmission probabilities 366 @type emmProb: array 367 @param nullProb: null probabilities 368 @type nullProb: array 369 370 @return: entropy value 371 @rtype: float 372 """ 373 ## remove zeros to avoid log error 374 emmProb = N.clip(emmProb, 1.e-10, 1.) 375 376 return N.sum( emmProb * N.log(emmProb/nullProb) )

377 378

379 - def cleanup( self ):

380 """ 381 Clean up after external program has finished (failed or not). 382 Override, but call in child method! 383 """ 384 if not self.keep_inp and not self.debug: 385 T.tryRemove( self.f_in ) 386 387 if self.f_err and not self.debug: 388 T.tryRemove( self.f_err )

389 390

391 - def fail( self ):

392 """ 393 Called if external program failed, override! 394 """ 395 raise HmmerError,\ 396 'Hmmerfetch failed retrieving profile: %s'%self.hmmName

397 398

399 - def finish( self ):

400 """ 401 Overrides Executor method 402 """ 403 Executor.finish( self ) 404 self.result = self.parse_result( )

405 406 407

408 -class HmmerAlign( Executor ):

409 """ 410 Align fasta formated sequence to hmm profile (using hmmalign). 411 """ 412

413 - def __init__( self, hmmFile, fastaFile, fastaID, **kw ):

414 """ 415 @param hmmFile: path to hmm file (profile) 416 @type hmmFile: str 417 @param fastaFile: path to fasta search sequence 418 @type fastaFile: str 419 @param fastaID: fasta id of search sequence 420 @type fastaID: str 421 """ 422 self.fastaID = fastaID 423 424 Executor.__init__( self, 'hmmalign', 425 args=' -q %s %s'%(hmmFile, fastaFile), **kw )

426 427

428 - def parse_result( self ):

429 """ 430 Align fasta formated sequence to hmm profile. 431 432 @return: alignment and matching hmm positions with gaps 433 @rtype: str, str 434 """ 435 ## check that the outfut file is there and seems valid 436 if not os.path.exists( self.f_out ): 437 raise HmmerError,\ 438 'Hmmeralign result file %s does not exist.'%self.f_out 439 440 if T.fileLength( self.f_out ) < 1: 441 raise HmmerError,\ 442 'Hmmeralign result file %s seems incomplete.'%self.f_out 443 444 ## read result 445 hmm = open( self.f_out, 'r') 446 out = hmm.read() 447 hmm.close() 448 449 ## extract search sequence 450 fastaSeq = re.findall( self.fastaID + '[ ]+[-a-yA-Y]+', out ) 451 fastaSeq = string.join([ string.split(i)[1] for i in fastaSeq ], '') 452 453 ## extract hmm sequence 454 hmmSeq = re.findall( '#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out ) 455 hmmSeq = string.join([ string.strip( string.split(i)[2] ) for i in hmmSeq ], '') 456 457 return fastaSeq, hmmSeq

458 459

460 - def fail( self ):

461 """ 462 Called if external program failed, override! 463 """ 464 raise HmmerError,\ 465 'hmmeralign failed aligning sequence file %s with profile %s'\ 466 %(self.fastaFile ,self.hmmFile)

467 468

469 - def finish( self ):

470 """ 471 Overrides Executor method 472 """ 473 Executor.finish( self ) 474 self.result = self.parse_result( )

475 476

477 -class Hmmer:

478 """ 479 Search Hmmer Pfam database and retrieve conservation score for model 480 """ 481

482 - def __init__(self, hmmdb = hmmDatabase, verbose=1, log=StdLog() ):

483 """ 484 @param hmmdb: Pfam hmm database 485 @type hmmdb: str 486 @param verbose: verbosity level (default: 1) 487 @type verbose: 1|0 488 @param log: Log file for messages [STDOUT] 489 @type log: Biskit.LogFile 490 """ 491 self.hmmdb = hmmdb 492 self.verbose = verbose 493 self.log = log or StdLog() 494 self.tempDir = settings.tempDirShared 495 496 self.fastaID = '' 497 498 self.hmmFile = '' 499 self.fastaFile = tempfile.mktemp('.fasta', dir=self.tempDir) 500 self.sub_fastaFile = tempfile.mktemp('_sub.fasta', dir=self.tempDir)

501 502

503 - def checkHmmdbIndex( self ):

504 """ 505 Checks if the hmm database has been indexed or not, 506 if not indexing will be done. 507 """ 508 idx = HmmerdbIndex( self.hmmdb, verbose=self.verbose, log=self.log )

509 510

511 - def searchHmmdb( self, target, noSearch=None ):

512 """ 513 Search hmm database with a sequence in fasta format. 514 If the profile names have been provided - skip the search and 515 only write the temporary sequence files. 516 517 @param target: sequence 518 @type target: PDBModel or fasta file 519 @param noSearch: don't perform a seach 520 @type noSearch: 1 OR None 521 522 @return: dictionary witn profile names as keys and a list of 523 lists containing information about the range where the 524 profile matches the sequence 525 @rtype: dict, [list] 526 """ 527 if self.verbose: 528 self.log.writeln( 529 '\nSearching hmm database, this will take a while...') 530 531 search = HmmerSearch( target, self.hmmdb, verbose=self.verbose, 532 log=self.log ) 533 matches, hits = search.run() 534 535 return matches, hits

536 537

538 - def selectMatches( self, matches, hits, score_cutoff=60 , 539 eValue_cutoff = 1e-8 ):

540 """ 541 select what hmm profiles to use based on score and e-Value cutoff 542 543 @param matches: output from L{searchHmmdb} 544 @type matches: dict 545 @param hits: output from L{searchHmmdb} 546 @type hits: [list] 547 @param score_cutoff: cutoff value for an acceptable score 548 @type score_cutoff: float 549 @param eValue_cutoff: cutoff value for an acceptable e-value 550 @type eValue_cutoff: float 551 552 @return: dictionary with good matches {hmm_name : [[start,stop],[..]]} 553 @rtype: dict 554 """ 555 ## Print list of matching profiles 556 if len(matches) == 0: 557 if self.verbose: self.log.writeln('\nNo matching profiles found') 558 return None 559 if len(matches) == 1: 560 if self.verbose: 561 self.log.writeln('\nFound only one matching profile.') 562 if len(matches) > 1: 563 if self.verbose: self.log.writeln('\nFound more than one hit.\n') 564 565 try: 566 if self.verbose: 567 self.log.writeln( 568 '\tName -- Nr hits -- Score -- E-value -- Description') 569 for k in matches.keys(): 570 self.log.writeln( '\t'+k+' --'+matches[k][-1] +' -- '+\ 571 matches[k][-3]+ ' -- '+ matches[k][-2]+' -- '+\ 572 string.join(matches[k][0:-3])) 573 except: 574 pass 575 576 ## select to use profiles from searchHits given the given criteria 577 hmmHits = {} 578 579 for h in range( len(hits) ): 580 581 hmmName = hits[h][0] 582 score = float( hits[h][-2] ) 583 eValue = float( hits[h][-1] ) 584 585 ## print warning message if profile doesn't meet criteria 586 if score < score_cutoff and eValue > eValue_cutoff \ 587 and self.verbose: 588 self.log.writeln('\nWARNING: Bad scores! Profile NOT used.') 589 self.log.writeln('\t Name: %s - Score: %f E-value: %f ' \ 590 %( hmmName, score, eValue ) ) 591 592 ## add profile to list of profiles to use 593 else: 594 seqStart = int(hits[h][2]) 595 seqEnd = int(hits[h][3]) 596 if hmmHits.has_key(hmmName): 597 hmmHits[hmmName] = hmmHits[hmmName] +[[seqStart, seqEnd ]] 598 599 else: 600 hmmHits[hmmName] = [ [ seqStart, seqEnd ] ] 601 602 if self.verbose: 603 self.log.writeln( '\nWill use profile(s):\n '+str( hmmHits )) 604 605 return hmmHits

606 607

608 - def getHmmProfile( self, hmmName ):

609 """ 610 Get the hmm profile with name hmmName from hmm databse. 611 Extract some information about the profile as well as the 612 match state emmission scores. Keys of the returned dictionary:: 613 'AA', 'name', 'NrSeq', 'emmScore', 'accession', 614 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum' 615 616 @param hmmName: hmm profile name 617 @type hmmName: str 618 619 @return: dictionary with warious information about the profile 620 @rtype: dict 621 """ 622 profile = HmmerProfile( hmmName, self.hmmdb, verbose=self.verbose, 623 log=self.log) 624 self.hmmFile, profileDic = profile.run() 625 626 return profileDic

627 628

629 - def align( self, model, hits ):

630 """ 631 Performs alignment 632 If there is more than one hit with the profile, the sequence will 633 be subdevided and the alignment will be performed on each part. 634 a final merger profile for the profile will be returned. 635 636 @param model: model 637 @type model: PDBModel 638 @param hits: list with matching sections from L{searchHmmdb} 639 @type hits: [[int,int]] 640 641 @return: fastaSeq hmmSeq repete hmmGap:: 642 fastaSeq - sequence 643 hmmSeq - matching positions in profile 644 repete - number of repetes of the profile 645 hmmGap - list with gaps (deletions in search sequence) for 646 each repete 647 @rtype: str, str, int, [int] 648 """ 649 try: 650 651 fastaSeq, self.fastaID = MT.fasta( model ) 652 fastaSeq = fastaSeq.split()[1] 653 l = len(fastaSeq) 654 655 ## sub sequence alignment 656 j = 0 657 repete = 0 658 hmmGap = [] 659 660 for h in hits: 661 start = h[0]-1 662 stop = h[1] 663 sub_fastaSeq, self.fastaID = MT.fasta( model, start, stop ) 664 665 ## write sub-fasta sequence file 666 fName = self.sub_fastaFile 667 sub_seq = open( fName, 'w' ) 668 sub_seq.write( sub_fastaSeq ) 669 sub_seq.close() 670 671 ## get sub-alignmnet 672 align = HmmerAlign( self.hmmFile, self.sub_fastaFile, 673 self.fastaID, verbose=self.verbose, 674 log=self.log) 675 676 sub_fastaSeq, sub_hmmSeq = align.run() 677 678 ## remove positions corresponding to insertions in 679 ## search sequence 680 sub_fastaSeq, sub_hmmSeq, del_hmm = \ 681 self.removeGapInSeq( sub_fastaSeq, sub_hmmSeq ) 682 hmmGap += [ del_hmm ] 683 684 ## rebuild full lenght hmmSeq from sub_hmmSeq 685 sub_hmmSeq = '.'*start + sub_hmmSeq + '.'*(l-stop) 686 687 if j == 0: 688 hmmSeq = sub_hmmSeq 689 if j != 0: 690 hmmSeq = self.mergeHmmSeq( hmmSeq, sub_hmmSeq ) 691 692 j+= 1 693 repete += 1 694 695 return fastaSeq, hmmSeq, repete, hmmGap 696 697 except IOError, e: 698 raise HmmerError, "Error creating temporary file %r. "%e.filename+\ 699 "Check that Biskit.settings.temDirShared is accessible!\n"+\ 700 "See also .biskit/settings.cfg!"

701 702

703 - def removeGapInSeq( self, fasta, hmm ):

704 """ 705 Removes position scorresponding to insertions in search sequence 706 707 @param fasta: search sequence 708 @type fasta: str 709 @param hmm: sequence, matching hmm positions 710 @type hmm: str 711 712 @return: search sequence and profile match sequence with insertions 713 and deletions removed and a list with the deleted positions 714 @rtype: str, str, [int] 715 """ 716 new_hmm = [] 717 new_fasta = [] 718 del_pos = [] 719 720 j = 0 721 for i in range( len(fasta) ): 722 if string.upper( fasta[i] ) in molUtils.allAA(): 723 new_hmm += hmm[i] 724 new_fasta += fasta[i] 725 ## if there is an insertion 726 if fasta[i] == '-': 727 del_pos += [j] 728 ## if there is a deletion 729 if hmm[i] == '.': 730 j -= 1 731 j += 1 732 733 return ''.join(new_fasta), ''.join(new_hmm), del_pos

734 735

736 - def mergeHmmSeq( self, seq1, seq2 ):

737 """ 738 Merges two sequence files into one. 739 Multilple hits with one profile cannot overlap!! Overlap == ERROR 740 741 @param seq1: sequence 742 @type seq1: str 743 @param seq2: sequence 744 @type seq2: str 745 746 @return: merged sequence or None 747 @rtype: str OR None 748 """ 749 if len(seq1) != len(seq2): 750 EHandler.warning( 'ERR in mergeHmmSeq:\n' +\ 751 '\tSequences of different lengths cannot be merged') 752 return None 753 else: 754 result = '' 755 for i in range( len(seq1) ): 756 ## no match in either 757 if seq1[i] == seq2[i] == '.': 758 result += '.' 759 ## match in seq1 760 if seq1[i] > seq2[i]: 761 result += seq1[i] 762 ## match in seq2 763 if seq1[i] < seq2[i]: 764 result += seq2[i] 765 766 return result

767 768

769 - def subAlign( self, file ):

770 """ 771 Align fasta formated sequence to hmm profile. 772 773 @param file: path to hmmalign output file 774 @type file: str 775 776 @return: alignment and matching hmm positions with gaps 777 @rtype: str, str 778 """ 779 ## align sequence to hmm profile 780 out = os.popen( hmmalignExe + ' -q ' + self.hmmFile + \ 781 ' ' + file ).read() 782 783 ## extract search sequence 784 fastaSeq = re.findall( self.fastaID + '[ ]+[-a-yA-Y]+', out ) 785 fastaSeq = string.join([ string.split(i)[1] for i in fastaSeq ], '') 786 787 ## extract hmm sequence 788 hmmSeq = re.findall( '#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out ) 789 hmmSeq = string.join( 790 [ string.strip( string.split(i)[2] ) for i in hmmSeq ], '') 791 792 return fastaSeq, hmmSeq

793 794

795 - def castHmmDic( self, hmmDic, repete, hmmGap, key ):

796 """ 797 Blow up hmmDic to the number of repetes of the profile used. 798 Correct scores for possible deletions in the search sequence. 799 800 @param hmmDic: dictionary from L{getHmmProfile} 801 @type hmmDic: dict 802 @param repete: repete information from L{align} 803 @type repete: int 804 @param hmmGap: information about gaps from L{align} 805 @type hmmGap: [int] 806 @param key: name of scoring method to adjust for gaps and repetes 807 @type key: str 808 809 @return: dictionary with information about the profile 810 @rtype: dict 811 """ 812 s = hmmDic[key] 813 814 for i in range( repete ): 815 mask = N.ones( len(s) ) 816 N.put( mask, hmmGap[i], 0 ) 817 if i == 0: 818 score = N.compress( mask, s, 0 ) 819 if i > 0: 820 score = N.concatenate( ( N.compress( mask, s, 0 ), score ) ) 821 822 hmmDic[key] = score 823 824 return hmmDic

825 826

827 - def matchScore( self, fastaSeq, hmmSeq, profileDic, key ):

828 """ 829 Get match emmision score for residues in search sequence 830 831 @param fastaSeq: search sequence 832 @type fastaSeq: str 833 @param hmmSeq: sequence, matching hmm positions 834 @type hmmSeq: str 835 @param profileDic: from L{castHmmDic} 836 @type profileDic: dict 837 @param key: name of scoring method 838 @type key: str 839 840 @return: list of emmision scores for sequence 841 @rtype: [float] 842 """ 843 cons = [] 844 hmmShift = 0 845 846 for i in range( len( fastaSeq ) ): 847 848 ## if aa is not in hmm profile 849 if hmmSeq[i] == '.': 850 cons += [0] 851 hmmShift += 1 852 853 ## extract emmission value for residue in position i 854 if fastaSeq[i] in profileDic['AA'] and hmmSeq[i] != '.': 855 j = profileDic['AA'].index( fastaSeq[i] ) 856 cons += [ profileDic[key][i - hmmShift][j] ] 857 858 return cons

859 860

861 - def __score( self, model, key, hmmNames=None ):

862 """ 863 Get match emmission scores for search sequence. 864 If profile name(s) are provided, no search will be performed. 865 866 - If names and positions of hmm profiles is B{NOT} provided 867 B{search performed} -> score (array), hmmNames (dictionary) 868 869 - If names and positions of hmm profiles is provided 870 B{NO search performed} -> score (array), hmmNames 871 (dictionary - same as input) 872 873 @param model: model 874 @type model: PDBModel 875 @param key: name of scoring method 876 @type key: str 877 @param hmmNames: profile name OR None (default: None) 878 @type hmmNames: str OR None 879 880 @return: score, hmmNames 881 @rtype: array, dict 882 """ 883 if not hmmNames: 884 ## get profile for model 885 searchResult, searchHits = self.searchHmmdb( model ) 886 hmmNames = self.selectMatches( searchResult, searchHits ) 887 else: 888 searchResult = self.searchHmmdb( model, noSearch=1 ) 889 hmmNames = hmmNames 890 891 ## retrieve hmm model(s) 892 result = None 893 894 for name in hmmNames.keys(): 895 ## retrieve hmm model 896 hmmDic = self.getHmmProfile( name ) 897 if self.verbose: 898 self.log.writeln('Hmm profile ' + str(name) +\ 899 ' with accession number ' \ 900 + str(hmmDic['accession']) + ' retrieved') 901 902 ## align sequence with model 903 fastaSeq, hmmSeq, repete, hmmGap = self.align( model, 904 hmmNames[ name ] ) 905 ## cast hmm model 906 hmmDic = self.castHmmDic( hmmDic, repete, hmmGap, key ) 907 908 ## Hmmer profile match scores for sequence 909 cons = self.matchScore( fastaSeq, hmmSeq, hmmDic, key ) 910 911 if result is not None: 912 result = self.mergeProfiles( result, cons ) 913 else: 914 result = cons 915 916 return result, hmmNames

917 918

919 - def scoreAbsSum( self, model, hmmNames=None ):

920 return self.__score( model, key='absSum', hmmNames=hmmNames )

921 922

923 - def scoreMaxAll( self, model, hmmNames=None ):

924 return self.__score( model, key='maxAllScale', hmmNames=hmmNames )

925 926

927 - def scoreEntropy( self, model, hmmNames=None ):

928 return self.__score( model, key='ent', hmmNames=hmmNames )

929 930

931 - def __list2array( self, lstOrAr ):

932 if type( lstOrAr ) == list: 933 return N.array( lstOrAr ) 934 return lstOrAr

935 936

937 - def mergeProfiles( self, p0, p1, maxOverlap=3 ):

938 """ 939 Merge profile p0 with profile p1, as long as they overlap in 940 at most maxOverlap positions 941 942 @param p0: profile 943 @type p0: [float] 944 @param p1: profile 945 @type p1: [float] 946 @param maxOverlap: maximal allowed overlap between profiles 947 @type maxOverlap: int 948 949 @return: array 950 @rtype: 951 """ 952 p0 = self.__list2array( p0 ) 953 p1 = self.__list2array( p1 ) 954 955 overlap = N.greater( N.greater(p0,0) + N.greater(p1,0), 1 ) 956 957 if N.sum( overlap ) <= maxOverlap: 958 ## one of the two profiles will in most cases not belong to these 959 ## positions. We can't decide which one is wrong, let's eliminate 960 ## both values. Alternatively we could keep one, or the average, .. 961 N.put( p1, N.nonzero( overlap ), 0 ) 962 N.put( p0, N.nonzero( overlap ), 0 ) 963 964 p0 = p0 + p1 965 966 return p0

967 968

969 - def cleanup( self ):

970 """ 971 remove temp files 972 """ 973 del_lst = [ self.sub_fastaFile, self.hmmFile, self.fastaFile ] 974 for d in del_lst: 975 try: 976 os.remove( d ) 977 except: 978 pass

979 980 981 ############# 982 ## TESTING 983 ############# 984 import Biskit.test as BT 985

986 -class Test(BT.BiskitTest):

987 """Hmmer test""" 988 989 TAGS = [ BT.EXE ] 990

991 - def test_Hmmer( self):

992 """Hmmer test """ 993 994 from Biskit import PDBModel 995 import Biskit.tools as T 996 997 if self.local: print "Loading PDB...", 998 self.m = PDBModel( T.testRoot()+'/lig/1A19.pdb') 999 self.model = self.m.compress( self.m.maskProtein() ) 1000 if self.local: print "Done" 1001 1002 ## initiate and check database status 1003 self.hmmer = Hmmer( hmmdb=settings.hmm_db, verbose=self.local, 1004 log=self.log ) 1005 self.hmmer.checkHmmdbIndex() 1006 1007 ## scoring methods to use 1008 method = [ 'emmScore', 'ent', 'maxAll', 'absSum', 'maxAllScale' ] 1009 1010 ## search 1011 searchMatches, searchHits = self.hmmer.searchHmmdb( self.model ) 1012 hmmNames = self.hmmer.selectMatches( searchMatches, searchHits ) 1013 ## hmmNames = {'Barstar': [[1, 89]]} 1014 1015 self.cons = [] 1016 self.result = None 1017 1018 for name in hmmNames.keys(): 1019 1020 ## retrieve hmm model 1021 hmmDic = self.hmmer.getHmmProfile( name ) 1022 1023 ## align sequence with model 1024 fastaSeq, hmmSeq, repete, hmmGap = \ 1025 self.hmmer.align( self.model, hmmNames[ name ] ) 1026 1027 ## cast hmm model 1028 hmmDic_cast = \ 1029 self.hmmer.castHmmDic( hmmDic, repete, hmmGap, method[0] ) 1030 1031 ## Hmmer profile match scores for sequence 1032 self.cons = self.hmmer.matchScore( fastaSeq, hmmSeq, 1033 hmmDic_cast, method[0] ) 1034 1035 ## If there are more than one profile in the model, merge to one. 1036 if self.result: 1037 self.result = self.hmmer.mergeProfiles( self.result, self.cons ) 1038 else: 1039 self.result = self.cons 1040 1041 self.hmmer.cleanup() 1042 1043 self.assertEqual( self.result, self.EXPECTED )

1044 1045 1046 #: Hmmer emission scores 1047 EXPECTED = [2581.0, 3583.0, 1804.0, 2596.0, 3474.0, 2699.0, 3650.0, 2087.0, 2729.0, 2450.0, 2412.0, 2041.0, 3474.0, 1861.0, 2342.0, 2976.0, 5124.0, 2729.0, 2202.0, 2976.0, 3583.0, 2202.0, 2103.0, 2976.0, 1922.0, 2132.0, 4122.0, 2403.0, 4561.0, 4561.0, 3650.0, 2087.0, 4001.0, 2976.0, 3860.0, 3260.0, 2976.0, 6081.0, 3860.0, 5611.0, 2976.0, 3609.0, 3650.0, 6081.0, 3343.0, 2403.0, 3288.0, 4122.0, 2976.0, 2322.0, 2976.0, 1995.0, 4378.0, 2706.0, 2665.0, 4186.0, 3539.0, 2692.0, 3270.0, 2302.0, 2604.0, 2132.0, 2118.0, 2380.0, 2614.0, 2170.0, 3260.0, 2403.0, 1964.0, 3343.0, 2976.0, 2643.0, 3343.0, 2714.0, 2591.0, 3539.0, 3260.0, 2410.0, 1809.0, 3539.0, 2111.0, -774.0, 3860.0, 2450.0, 2063.0, 3474.0, 3474.0, 2057.0, 1861.0]

1048 1049 1050 if __name__ == '__main__': 1051 1052 BT.localTest() 1053

Source Code for Module Biskit.Hmmer