Package Biskit :: Package Mod :: Module SequenceSearcher
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.SequenceSearcher

  1  ## 
  2  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  3  ## Copyright (C) 2004-2005 Raik Gruenberg & Johan Leckner 
  4  ## 
  5  ## This program is free software; you can redistribute it and/or 
  6  ## modify it under the terms of the GNU General Public License as 
  7  ## published by the Free Software Foundation; either version 2 of the 
  8  ## License, or any later version. 
  9  ## 
 10  ## This program is distributed in the hope that it will be useful, 
 11  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 13  ## General Public License for more details. 
 14  ## 
 15  ## You find a copy of the GNU General Public License in the file 
 16  ## license.txt along with this program; if not, write to the Free 
 17  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 18  ## 
 19  ## 
 20  ## last $Author: leckner $ 
 21  ## last $Date: 2006/12/21 10:07:51 $ 
 22  ## $Revision: 2.12 $ 
 23  """ 
 24  Search one or more sequence databases (SwissProt, Tremble) with a sequence 
 25  using the method of choice (Blast , FastA) 
 26  """ 
 27   
 28  import Bio.SwissProt.SProt 
 29  from Bio.Blast import NCBIWWW 
 30  from Bio.Blast import NCBIXML 
 31  from Bio.Blast import NCBIStandalone 
 32  from Bio import Fasta 
 33   
 34  import Biskit.tools as T 
 35  from Biskit.Errors import * 
 36  from Biskit import StdLog, EHandler 
 37   
 38  from Biskit.Mod import settings 
 39   
 40  import re, os 
 41  import string, copy 
 42  import commands 
 43  import copy 
 44  import sys 
 45   
46 -class BlastError( BiskitError ):
47 pass
48
49 -class InternalError( BiskitError ):
50 pass
51
52 -class SequenceSearcher:
53 """ 54 Take a sequence and return a list of nonredundant homolog 55 sequences as fasta file. The search can be performed using three 56 different methods: 57 58 1. localBlast 59 Uses Bio.Blast.NCBIStandalone.blastall (Biopython) to perform 60 the search. 61 62 2. localPSIBlast 63 Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) 64 65 3. remoteBlast 66 Uses Bio.Blast.NCBIWWW.qblast (Biopython) which performs 67 a BLAST search using the QBLAST server at NCBI. 68 69 @note: the blast sequence database has to be built with -o potion 70 e.g. cat db1.dat db2.dat | formatdb -i stdin -o T -n indexed_db 71 72 @todo: copy blast output 73 """ 74 75 F_RESULT_FOLDER = '/sequences' 76 77 ## standard file name for unclusterd sequences 78 F_FASTA_ALL = F_RESULT_FOLDER + '/all.fasta' 79 80 ## standard file name for non-redundant seqs 81 F_FASTA_NR = F_RESULT_FOLDER + '/nr.fasta' 82 83 ## clusters 84 F_CLUSTER_RAW = F_RESULT_FOLDER + '/cluster_raw.out' 85 F_CLUSTER_LOG = F_RESULT_FOLDER + '/cluster_result.out' 86 87 ## blast alignments 88 F_BLAST_OUT = F_RESULT_FOLDER + '/blast.out' 89 F_CLUSTER_BLAST_OUT = F_RESULT_FOLDER + '/cluster_blast.out' 90 91 ## default location of existing fasta file with target sequence 92 F_FASTA_TARGET = '/target.fasta' 93 94
95 - def __init__(self, outFolder='.', clusterLimit=50, verbose=0, log=None ):
96 """ 97 @param outFolder: project folder (results are put into subfolder) ['.'] 98 @type outFolder: str 99 @param clusterLimit: maximal number of returned sequence clusters 100 (default: 50) 101 @type clusterLimit: int 102 @param verbose: keep temporary files (default: 0) 103 @type verbose: 1|0 104 @param log: log file instance, if None, STDOUT is used (default: None) 105 @type log: LogFile 106 """ 107 self.outFolder = T.absfile( outFolder ) 108 109 ## 110 ## NOTE: If you get errors of the type "Couldn't find ID in" 111 ## check these regexps (see getSequenceIDs function)! 112 ## 113 114 self.ex_gi = re.compile( '^>(?P<db>ref|gb|ngb|emb|dbj|prf)\|{1,2}(?P<id>[A-Z_0-9.]{4,9})\|' ) 115 self.ex_swiss = re.compile( '^>sp\|(?P<id>[A-Z0-9_]{5,7})\|' ) 116 self.ex_swiss2= re.compile( '^>.*swiss.*\|(?P<id>[A-Z0-9_]{5,7})\|' ) 117 self.ex_pdb = re.compile( '^>pdb\|(?P<id>[A-Z0-9]{4})\|' ) 118 self.ex_all = re.compile( '^>.*\|(?P<id>[A-Z0-9_.]{4,14})\|' ) 119 120 ## 121 ## RegExp for the remoteBlast searches. Somehow the 122 ## NCBI GI identifier is written before the identifier 123 ## we want (even though this is explicitly turned off in 124 ## SequenceSearcher.remoteBlast). 125 ## 126 127 gi = '^gi\|(?P<gi>[0-9]+)\|' 128 self.ex_Rgi_1 = re.compile( gi+'(?P<db>ref|sp|pdb|gb|ngb|emb|dbj|prf|pir)\|(?P<id>[A-Z_0-9.]+)\|' ) 129 self.ex_Rgi_2 = re.compile( gi+'(?P<db>ref|sp|pdb|gb|ngb|emb|dbj|prf|pir)\|{2}(?P<id>[A-Z_0-9.]+)' ) 130 self.ex_Rswiss = re.compile( gi+'sp\|(?P<id>[A-Z0-9_]{5,7})\|' ) 131 self.ex_Rpdb = re.compile( gi+'pdb\|(?P<id>[A-Z0-9]{4})\|' ) 132 133 134 135 self.verbose = verbose 136 self.log = log or StdLog() 137 138 self.record_dic = None ## dict ID - Bio.Fasta.Record 139 self.clusters = None ## list of lists of ids 140 self.bestOfCluster = None ## list of non-redundant ids 141 142 ## the maximal number of clusters to return 143 self.clusterLimit = clusterLimit 144 145 self.clustersCurrent = None ## current number of clusters 146 147 self.prepareFolders()
148 149
150 - def prepareFolders( self ):
151 """ 152 Create needed output folders if not there. 153 """ 154 if not os.path.exists(self.outFolder + self.F_RESULT_FOLDER): 155 os.mkdir( self.outFolder + self.F_RESULT_FOLDER )
156 157
158 - def getRecords( self ):
159 """ 160 Get all homologues. 161 162 @return: list of Bio.Fasta.Record with all found protein homologues 163 @rtype: [Bio.Fasta.Record] 164 165 @raise BlastError: if no sequences found 166 """ 167 if not self.record_dic: 168 raise BlastError( "No sequences found (yet)." ) 169 170 return self.record_dic.values()
171 172
173 - def getClusteredRecords( self ):
174 """ 175 Get representative of each cluster. 176 177 @return: list with Bio.Fasta.Record with best record of each cluster 178 @rtype: [Bio.Fasta.Record] 179 180 @raise BlastError: if called before clustering 181 """ 182 if not self.clusters: 183 raise BlastError( "Sequences are not yet clustered." ) 184 185 return [ self.record_dic[id] for id in self.bestOfCluster ]
186 187
188 - def __blast2dict( self, parsed_blast, db ):
189 """ 190 Convert parsed blast result into dictionary of FastaRecords indexed 191 by sequence ID. 192 Writes all the sequences in fasta format to L{F_FASTA_ALL} and the 193 raw blast result to L{F_BLAST_OUT}. 194 195 @param parsed_blast: parsed blast result 196 @type parsed_blast: Bio.Blast.Record.Blast 197 @param db: database 198 @type db: str 199 """ 200 ## fastaFromIds( db, ids ) -> { str:Bio.Fasta.Record } 201 ids = self.getSequenceIDs( parsed_blast ) 202 self.record_dic = self.fastaFromIds( db, ids ) 203 204 if self.verbose: 205 self.writeFasta( self.record_dic.values(), 206 self.outFolder + self.F_FASTA_ALL ) 207 208 self.__writeBlastResult( parsed_blast, 209 self.outFolder + self.F_BLAST_OUT)
210 211 212
213 - def remoteBlast( self, seqFile, db, method, e=0.01, **kw ):
214 """ 215 Perform a remote BLAST search using the QBLAST server at NCBI. 216 Uses Bio.Blast.NCBIWWW.qblast (Biopython) for the search 217 218 @param seqFile: file name with search sequence as FASTA 219 @type seqFile: str 220 @param db: database(s) to search in, e.g. ['swissprot', 'pdb'] 221 @type db: [str] 222 @param method: search method, e.g. 'blastp', 'fasta' 223 @type method: str 224 @param e: expectation value cutoff 225 @type e: float 226 @param kw: optional keywords:: 227 program BLASTP, BLASTN, BLASTX, TBLASTN, or TBLASTX. 228 database Which database to search against. 229 sequence The sequence to search. 230 ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 231 (default: FALSE) 232 descriptions Number of descriptions to show. Def 500. 233 alignments Number of alignments to show. Def 500. 234 expect An expect value cutoff. Def 10.0. 235 matrix Specify an alt. matrix 236 (PAM30, PAM70, BLOSUM80, BLOSUM45). 237 filter 'none' turns off filtering. Default uses 'seg' 238 or 'dust'. 239 format_type 'HTML', 'Text', 'ASN.1', or 'XML'. Def. 'HTML 240 @type kw: any 241 242 243 @note: Using the remoteBlast is asking for trouble, as 244 every change in the output file might kill the 245 parser. If you still want to use remoteBlast we 246 strongly recomend that you install BioPython 247 from CVS. Information on how to do this you 248 will find on the BioPython homepage. 249 250 @todo: Remote Blasting is running as expected but sequences 251 are still retrieved from a local database. Implement remote 252 collection of fasta seuqences from NCBI (there should be 253 something like in Biopython). Otherwise something like this 254 will also work:: 255 256 ## collect the entry with gi 87047648 257 url = 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?CMD=text&db=protein&dopt=FASTA&dispmax=20&uid=87047648' 258 import urllib 259 handle = urllib.urlopen(url) 260 lines = handle.readlines() 261 seq = '' 262 for i in range(len(lines)): 263 if re.match( "[<]pre[>][<]div class='recordbody'[>]{2}gi", l[i] ): 264 i+= 1 265 while not re.match( "^[<]/pre[>][<]/div[>]", l[i]): 266 seq += l[i][:-1] 267 i+= 1 268 print seq 269 """ 270 fasta = Fasta.Iterator( open(seqFile) ) 271 query = fasta.next() 272 273 blast_result = NCBIWWW.qblast( program=method, database=db, 274 sequence=query, expect=e, 275 ncbi_gi='FALSE', **kw ) 276 277 p = NCBIXML.BlastParser() 278 279 parsed = p.parse( blast_result ) 280 281 ## self.__remote_blast2dict( parsed, db ) 282 self.__blast2dict( parsed, db )
283
284 - def localBlast( self, seqFile, db, method='blastp', 285 resultOut=None, e=0.01, **kw ):
286 """ 287 Performa a local blast search (requires that the blast binaries 288 and databases are installed localy). 289 Uses Bio.Blast.NCBIStandalone.blastall (Biopython) for the search. 290 291 @param seqFile: file name with search sequence in FASTA format 292 @type seqFile: str 293 @param db: database(s) to search, e.g. ['swissprot', 'pdb'] 294 @type db: [str] 295 @param method: search program to use, e.g. 'blastp', 'fasta' 296 (default: blastp) 297 @type method: str 298 @param e: expectation value cutoff 299 @type e: float 300 @param resultOut: save blast output to this new file 301 @type resultOut: str 302 @param kw: optional keywords:: 303 --- Scoring --- 304 matrix Matrix to use (default BLOSUM62). 305 gap_open Gap open penalty (default 0). 306 gap_extend Gap extension penalty (default 0). 307 308 --- Algorithm --- 309 gapped Whether to do a gapped alignment. T/F 310 (default T) 311 wordsize Word size (blastp default 11). 312 keep_hits Number of best hits from a region to keep 313 (default off). 314 xdrop Dropoff value (bits) for gapped alignments 315 (blastp default 25). 316 hit_extend Threshold for extending hits (blastp default 11) 317 318 --- Processing --- 319 filter Filter query sequence? (T/F, default F) 320 restrict_gi Restrict search to these GI's. 321 believe_query Believe the query defline? (T/F, default F) 322 nprocessors Number of processors to use (default 1). 323 324 --- Formatting --- 325 alignments Number of alignments. (default 250) 326 @type kw: any 327 328 @raise BlastError: if program call failes 329 """ 330 results = err = p = None 331 try: 332 self.l = {"blast.bin":settings.blast_bin, 333 "m":method, 334 "db":db, 335 "s":seqFile, 336 "e":e, 337 "k":kw} 338 339 results, err = NCBIStandalone.blastall( settings.blast_bin, 340 method, db, seqFile, 341 expectation=e, **kw) 342 343 p = NCBIStandalone.BlastParser() 344 345 ## print "Kicking out" 346 ## raise Exception, 'kick out to recover full result file' 347 348 parsed = p.parse( results ) 349 350 self.__blast2dict( parsed, db ) 351 352 except Exception, why: 353 self.error = { 'results':results, 354 ## 'err':err.read(), 355 'parser':p, 356 'blast_bin':settings.blast_bin, 'seqFile':seqFile, 357 'method':method, 'db':db,'kw':kw, 358 'exception':str(why) } 359 print T.lastErrorTrace() 360 raise BlastError( str(why) + "\n" + 361 str( self.error ) )
362 363
364 - def localPSIBlast( self, seqFile, db, resultOut=None, e=0.01, **kw ):
365 """ 366 Performa a local psi-blast search (requires that the blast binaries 367 and databases are installed localy). 368 Uses Bio.Blast.NCBIStandalone.blastpgp (Biopython) for the search 369 370 @param seqFile: file name with search sequence in FASTA format 371 @type seqFile: str 372 @param db: database(s) to search e.g. ['swissprot', 'pdb'] 373 @type db: [str] 374 @param e: expectation value cutoff (default: 0.001) 375 @type e: float 376 @param resultOut: save blast output to this new file 377 @type resultOut: str 378 379 @param kw: optional keywords:: 380 --- Scoring --- 381 matrix Matrix to use (default BLOSUM62). 382 gap_open Gap open penalty (default 11). 383 gap_extend Gap extension penalty (default 1). 384 window_size Multiple hits window size (default 40). 385 npasses Number of passes (default 1). 386 passes Hits/passes (Integer 0-2, default 1). 387 388 --- Algorithm --- 389 gapped Whether to do a gapped alignment (T/F, default T). 390 wordsize Word size (default 3). 391 keep_hits Number of beset hits from a region to keep (def 0) 392 xdrop Dropoff value (bits) for gapped alignments 393 (def 15) 394 hit_extend Threshold for extending hits (default 11). 395 nbits_gapping Number of bits to trigger gapping (default 22). 396 pseudocounts Pseudocounts constants for multiple passes 397 (def 9). 398 xdrop_final X dropoff for final gapped alignment (default 25). 399 xdrop_extension Dropoff for blast extensions (default 7). 400 model_threshold E-value threshold to include in multipass model 401 (default 0.005). 402 required_start Start of required region in query (default 1). 403 required_end End of required region in query (default -1). 404 405 --- Processing --- 406 filter Filter query sequence with SEG? (T/F, default F) 407 believe_query Believe the query defline? (T/F, default F) 408 nprocessors Number of processors to use (default 1). 409 410 --- Formatting --- 411 alignments Number of alignments (default 250). 412 @type kw: any 413 414 @raise BlastError: if program call failes 415 """ 416 results = err = p = None 417 try: 418 results, err = NCBIStandalone.blastpgp( settings.psi_blast_bin, 419 db, seqFile, 420 expectation=e, **kw) 421 422 p = NCBIStandalone.PSIBlastParser() 423 424 parsed = p.parse( results ) 425 426 self.__blast2dict( parsed.rounds[-1], db ) 427 428 except Exception, why: 429 self.error = { 'results':results,'err':err.read(), 'parser':p, 430 'blast_bin':settings.blast_bin, 'seqFile':seqFile, 431 'db':db,'kw':kw,'exception':str(why) } 432 T.errWriteln( "BlastError " + T.lastErrorTrace() ) 433 print T.lastErrorTrace() 434 raise BlastError( str(why) + "\n" + 435 str( self.error ) )
436 437 438
439 - def getSequenceIDs( self, blast_records ):
440 """ 441 Extract sequence ids from BlastParser result. 442 443 Use:: 444 getSequenceIDs( Bio.Blast.Record.Blast ) -> [ str ] 445 446 @param blast_records: blast search result 447 @type blast_records: Bio.Blast.Record.Blast 448 449 @return: list of sequence IDs 450 @rtype: [str] 451 452 @raise BlastError: if can't find ID 453 """ 454 result = [] 455 456 for a in blast_records.alignments: 457 for pattern in [self.ex_gi, self.ex_swiss, self.ex_swiss2, 458 self.ex_pdb, self.ex_all, 459 self.ex_Rgi_1, self.ex_Rgi_2, 460 self.ex_Rswiss, self.ex_Rpdb]: 461 462 match = re.search( pattern, a.title ) 463 464 if match: 465 break 466 467 if (not match) or (not match.group('id')): 468 raise BlastError( "Couldn't find ID in " + a.title +\ 469 "with pattern " + str(pattern)) 470 471 result += [ match.group('id') ] 472 473 return result
474 475
476 - def fastaRecordFromId( self, db, id ):
477 """ 478 Use:: 479 fastaRecordFromId( db, id ) -> Bio.Fasta.Record 480 481 @param db: database 482 @type db: str 483 @param id: sequence database ID 484 @type id: str 485 486 @return: fasta record 487 @rtype: Bio.Fasta.Record 488 489 @raise BlastError: if can't fetch fasta record from database 490 """ 491 cmd = settings.fastacmd_bin + ' -d %s -s %s' % (db, id) 492 493 err, o = commands.getstatusoutput( cmd ) 494 if err: 495 EHandler.warning('%s returned error: %r' % (cmd, err) ) 496 raise BlastError( err ) 497 498 frecord = Fasta.Record() 499 frecord.title = id 500 501 try: 502 for line in o.split('\n'): 503 if line[0] == '>': 504 frecord.annotation = line[1:] 505 else: 506 frecord.sequence += line.strip() 507 508 except IndexError: 509 raise InternalError, "Couldn't fetch fasta record %s from database %s" \ 510 % (id,db) 511 512 return frecord
513 514
515 - def fastaFromIds( self, db, id_lst, fastaOut=None ):
516 """ 517 Use:: 518 fastaFromIds( id_lst, fastaOut ) -> { str: Bio.Fasta.Record } 519 520 @param db: database 521 @type db: str 522 @param id_lst: sequence database IDs 523 @type id_lst: [str] 524 525 @return: dictionary mapping IDs to Bio.Fasta.Records 526 @rtype: {str: Bio.Fasta.Record} 527 528 @raise BlastError: if couldn't fetch record 529 """ 530 result = {} 531 for i in id_lst: 532 try: 533 r = self.fastaRecordFromId( db, i ) 534 result[i] = r 535 except BlastError, why: 536 EHandler.warning("couldn't fetch %s"%str(i),trace=0 ) 537 538 return result
539 540
541 - def copyClusterOut( self, raw=None):
542 """ 543 Write clustering results to file. 544 545 @param raw: write raw clustering result to disk (default: None) 546 @type raw: 1|0 547 """ 548 if self.verbose and raw: 549 f = open( self.outFolder + self.F_CLUSTER_RAW, 'w', 1) 550 551 f.write(raw) 552 f.close()
553 554
555 - def reportClustering( self, raw=None ):
556 """ 557 Report the clustering result. 558 559 Writes: 560 - clustering results to L{F_CLUSTER_LOG} 561 - blast records to L{F_BLAST_OUT} 562 - blast records of centers to L{F_CLUSTER_BLAST_OUT} 563 - raw clustering results to L{F_CLUSTER_RAW} if raw not None 564 565 @param raw: write raw clustering result to disk (default: None) 566 @type raw: 1|0 567 """ 568 try: 569 if self.verbose: 570 f = open( self.outFolder + self.F_CLUSTER_LOG, 'w', 1) 571 572 for cluster in self.clusters: 573 574 f.write( "%i\t%s\n" % ( len( cluster ), str( cluster ))) 575 576 f.close() 577 578 ## write blast records of centers to disc 579 centers = [ c[0] for c in self.clusters ] 580 581 self.writeClusteredBlastResult( \ 582 self.outFolder + self.F_BLAST_OUT, 583 self.outFolder + self.F_CLUSTER_BLAST_OUT, centers ) 584 585 586 self.copyClusterOut( raw=raw ) 587 588 except IOError, why: 589 EHandler.warning( "Can't write cluster report." + str(why) )
590 591
592 - def clusterFasta( self, fastaIn=None, simCut=1.75, lenCut=0.9, ncpu=1 ):
593 """ 594 Cluster sequences. The input fasta titles must be the IDs. 595 fastaClust( fastaIn [, simCut, lenCut, ncpu] ) 596 597 @param fastaIn: name of input fasta file 598 @type fastaIn: str 599 @param simCut: similarity threshold (score < 3 or %identity) 600 (default: 1.75) 601 @type simCut: double 602 @param lenCut: length threshold (default: 0.9) 603 @type lenCut: double 604 @param ncpu: number of CPUs 605 @type ncpu: int 606 607 @raise BlastError: if fastaIn is empty 608 """ 609 fastaIn = fastaIn or self.outFolder + self.F_FASTA_ALL 610 611 if T.fileLength( fastaIn ) < 1: 612 raise IOError( "File %s empty. Nothing to cluster"%fastaIn ) 613 614 self.log.add( "\nClustering sequences:\n%s"%('-'*20) ) 615 616 cmd = settings.blastclust_bin + ' -i %s -S %f -L %f -a %i' %\ 617 (fastaIn, simCut, lenCut, ncpu) 618 619 if self.verbose: 620 self.log.add("- Command: %s"%cmd) 621 622 ## bugfix: at all cost prevent blastclust from using shared temp folder 623 tmp = os.environ.get( 'TMPDIR', None ) 624 if tmp: 625 del os.environ['TMPDIR'] 626 627 err, o = commands.getstatusoutput( cmd ) 628 if err: 629 raise BlastError( err ) 630 631 if tmp: 632 os.environ['TMPDIR'] = tmp 633 634 ## blastclust might write errors to file, if so the errors 635 ## occur before the dateline 636 lines = [ l.split() for l in o.split('\n') ] 637 dateline = [ l[-1] for l in lines ].index('queries') 638 self.clusters = lines[dateline+1:] 639 640 self.reportClustering( raw=o ) 641 642 self.bestOfCluster = [ self.selectFasta( ids ) 643 for ids in self.clusters ]
644 645
646 - def clusterFastaIterative(self, fastaIn=None, simCut=1.75, lenCut=0.9, 647 ncpu=1 ):
648 """ 649 Run cluterFasta iteratively, with tighter clustering settings, until 650 the number of clusters are less than self.clusterLimit. 651 652 @note: Older versions of T-Coffee can't handle more that approx. 653 50 sequences (out of memory). This problem is taken care of 654 in versions > 3.2 of T-Coffee. 655 656 @param fastaIn: name of input fasta file 657 @type fastaIn: str 658 @param simCut: similarity threshold (score < 3 or %identity) 659 (default: 1.75) 660 @type simCut: double 661 @param lenCut: length threshold (default: 0.9) 662 @type lenCut: double 663 @param ncpu: number of CPUs 664 @type ncpu: int 665 """ 666 iter = 1 667 while self.clustersCurrent > self.clusterLimit \ 668 or self.clustersCurrent == None: 669 670 self.clusterFasta( fastaIn, simCut, lenCut, ncpu ) 671 self.clustersCurrent = len( self.clusters ) 672 673 self.log.add( "- Clustering iteration %i produced %i clusters." \ 674 %( iter, len( self.clusters) )) 675 self.log.add( "\tusing a simCut of %.2f and a lenCut of %.3f.\n" \ 676 %( simCut, lenCut ) ) 677 678 simCut -= 0.15 679 lenCut -= 0.015 680 iter += 1
681 682
683 - def selectFasta( self, ids_from_cluster ):
684 """ 685 Select one member of cluster of sequences. 686 687 @param ids_from_cluster: list of sequence ids defining the cluster 688 @type ids_from_cluster: [str] 689 @return: id of best in cluster 690 @rtype: str 691 """ 692 return ids_from_cluster[0]
693 694
695 - def writeFasta( self, frecords, fastaOut ):
696 """ 697 Create fasta file for given set of records. 698 699 @param frecords: list of Bio.Blast.Records 700 @type frecords: [Bio.Blast.Record] 701 @param fastaOut: file name 702 @type fastaOut: str 703 """ 704 f = open( T.absfile(fastaOut), 'w' ) 705 for r in frecords: 706 f.write( str( r ) + '\n' ) 707 f.close()
708 709
710 - def writeFastaAll( self, fastaOut=None ):
711 """ 712 Write all found template sequences to fasta file. 713 714 @param fastaOut: write all fasta records to file 715 (default: L{F_FASTA_ALL}) 716 @type fastaOut: str OR None 717 """ 718 fastaOut = fastaOut or self.outFolder + self.F_FASTA_ALL 719 self.writeFasta( self.frecords, fastaOut )
720 721
722 - def writeFastaClustered( self, fastaOut=None ):
723 """ 724 Write non-redundant set of template sequences to fasta file. 725 726 @param fastaOut: write non-redundant fasta records to file 727 (default: L{F_FASTA_NR}) 728 @type fastaOut: str 729 """ 730 fastaOut = fastaOut or self.outFolder + self.F_FASTA_NR 731 732 self.writeFasta( self.getClusteredRecords(), fastaOut )
733 734
735 - def __writeBlastResult( self, parsed_blast, outFile):
736 """ 737 Write the result from the blast search to file (similar to the 738 output produced by a regular blast run). 739 740 writeBlastResult( parsed_blast, outFile ) 741 742 @param parsed_blast: Bio.Blast.Record.Blast 743 @type parsed_blast: Bio.Blast.Record.Blast 744 @param outFile: file to write the blast result to 745 @type outFile: str 746 """ 747 f = open( T.absfile( outFile ), 'w' ) 748 749 i=1 750 for alignment in parsed_blast.alignments: 751 for hsp in alignment.hsps: 752 s = string.replace(alignment.title,'\n',' ') 753 s = string.replace(s, 'pdb|', '\npdb|') 754 f.write('Sequence %i: %s\n'%(i,s)) 755 f.write('Length: %i \tScore: %3.1f \tE-value: %2.1e\n'\ 756 %(hsp.identities[1], hsp.score, hsp.expect)) 757 f.write( 'Identities: %i \tPositives: %i \tGaps: %i\n'\ 758 %(hsp.identities[0], hsp.positives[0], 759 hsp.gaps[0] or 0 )) 760 761 f.write( '%s\n'%hsp.query ) 762 f.write( '%s\n'%hsp.match ) 763 f.write( '%s\n\n'%hsp.sbjct ) 764 i += 1 765 f.close()
766 767
768 - def writeClusteredBlastResult( self, allFile, clustFile, selection ):
769 """ 770 Reads the blast.out file and keeps only centers. 771 772 @param allFile: all blast results 773 @type allFile: file 774 @param clustFile: output file name 775 @type clustFile: str 776 @param selection: write only sequences in list 777 @type selection: list 778 """ 779 f = open( clustFile, 'w' ) 780 781 b = open( allFile, 'r' ) 782 line = b.readlines() 783 784 j=0 785 sel = copy.copy(selection) 786 for l in line: 787 for s in sel: 788 ## search for center ID, remove from search list if found 789 if re.search( s, l[:30]): 790 sel.remove(s) 791 f.write( '\nCluster center %i:\n'%(selection.index(s)+1) ) 792 f.writelines( l ) 793 794 ## add following lines containing alignment info 795 while j < len(line)-1: 796 j+=1 797 ## break if empty line 798 if len( line[j] )<4: 799 break 800 ## skipp if additional pdb identifier 801 if line[j][:4] == 'pdb|': 802 pass 803 else: 804 f.writelines( line[j] ) 805 806 f.close() 807 b.close()
808 809 810 811 812 ############# 813 ## TESTING 814 ############# 815
816 -class Test:
817 """ 818 Test class 819 """ 820
821 - def run( self, local=0, flavour='blastp' ):
822 """ 823 run function test 824 825 @param local: transfer local variables to global and perform 826 other tasks only when run locally 827 @type local: 1|0 828 @param flavour: flavour of blast to test, blastp for localBlast 829 OR blastpgp for localPSIBlast (default: blastp) 830 @type flavour: str 831 832 @return: 1 833 @rtype: int 834 """ 835 import tempfile 836 import shutil 837 from Biskit.LogFile import LogFile 838 839 query = T.testRoot() + '/Mod/project/target.fasta' 840 outfolder = tempfile.mkdtemp( '_test_SequenceSearcher' ) 841 shutil.copy( query, outfolder ) 842 843 ## log file 844 f_out = outfolder+'/SequenceSearcher.log' 845 l = LogFile( f_out, mode='w') 846 847 searcher = SequenceSearcher( outFolder=outfolder, verbose=1, log=l ) 848 849 ## blast db has to be built with -o potion 850 ## e.g. cat db1.dat db2.dat | formatdb -i stdin -o T -n indexed_db 851 db = settings.db_swiss 852 853 f_target = searcher.outFolder + searcher.F_FASTA_TARGET 854 855 if local: 856 globals().update( locals() ) 857 858 ## skipp remote blast for now 859 # self.searcher.remoteBlast( f_target, 'nr', 'blastp', alignments=50 ) 860 861 if flavour == 'blastpgp': 862 searcher.localPSIBlast( f_target, db, 'blastpgp', npasses=2 ) 863 else: 864 searcher.localBlast( f_target, db, 'blastp', alignments=500, e=0.01 ) 865 866 searcher.clusterFasta() ## expects all.fasta 867 868 searcher.writeFastaClustered() 869 870 if local: 871 print '\nThe clustered result from the search can be found in %s'%f_out 872 print '\nSequenceSearche log file written to: %s'%f_out 873 globals().update( locals() ) 874 875 ## cleanup 876 T.tryRemove( outfolder, tree=1 ) 877 878 return 1
879 880
881 - def expected_result( self ):
882 """ 883 Precalculated result to check for consistent performance. 884 885 @return: 1 886 @rtype: int 887 """ 888 return 1
889 890 891 892 if __name__ == '__main__': 893 894 test = Test() 895 896 ## test localBlast 897 assert test.run( local=1 ) == test.expected_result() 898 899 ## test localPSIBlast 900 assert test.run( local=1, flavour='blastpgp') == test.expected_result() 901