Package Biskit :: Package Mod :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6   
   7  """ 
   8  This module provides code to work with the standalone version of 
   9  BLAST, either blastall or blastpgp, provided by the NCBI. 
  10  http://www.ncbi.nlm.nih.gov/BLAST/ 
  11   
  12  Classes: 
  13  LowQualityBlastError     Except that indicates low quality query sequences. 
  14  BlastParser              Parses output from blast. 
  15  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  16  PSIBlastParser           Parses output from psi-blast. 
  17  Iterator                 Iterates over a file of blast results. 
  18   
  19  _Scanner                 Scans output from standalone BLAST. 
  20  _BlastConsumer           Consumes output from blast. 
  21  _PSIBlastConsumer        Consumes output from psi-blast. 
  22  _HeaderConsumer          Consumes header information. 
  23  _DescriptionConsumer     Consumes description information. 
  24  _AlignmentConsumer       Consumes alignment information. 
  25  _HSPConsumer             Consumes hsp information. 
  26  _DatabaseReportConsumer  Consumes database report information. 
  27  _ParametersConsumer      Consumes parameters information. 
  28   
  29  Functions: 
  30  blastall        Execute blastall. 
  31  blastpgp        Execute blastpgp. 
  32  rpsblast        Execute rpsblast. 
  33   
  34  """ 
  35   
  36  from __future__ import generators 
  37  import os 
  38  import re 
  39   
  40  from Bio import File 
  41  from Bio.ParserSupport import * 
  42  from Bio.Blast import Record 
  43   
  44   
45 -class LowQualityBlastError(Exception):
46 """Error caused by running a low quality sequence through BLAST. 47 48 When low quality sequences (like GenBank entries containing only 49 stretches of a single nucleotide) are BLASTed, they will result in 50 BLAST generating an error and not being able to perform the BLAST. 51 search. This error should be raised for the BLAST reports produced 52 in this case. 53 """ 54 pass
55
56 -class ShortQueryBlastError(Exception):
57 """Error caused by running a short query sequence through BLAST. 58 59 If the query sequence is too short, BLAST outputs warnings and errors: 60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 61 [blastall] ERROR: [000.000] AT1G08320: Blast: 62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 63 done 64 65 This exception is raised when that condition is detected. 66 67 """ 68 pass
69 70
71 -class _Scanner:
72 """Scan BLAST output from blastall or blastpgp. 73 74 Tested with blastall and blastpgp v2.0.10, v2.0.11 75 76 Methods: 77 feed Feed data into the scanner. 78 79 """
80 - def feed(self, handle, consumer):
81 """S.feed(handle, consumer) 82 83 Feed in a BLAST report for scanning. handle is a file-like 84 object that contains the BLAST report. consumer is a Consumer 85 object that will receive events as the report is scanned. 86 87 """ 88 if isinstance(handle, File.UndoHandle): 89 uhandle = handle 90 else: 91 uhandle = File.UndoHandle(handle) 92 93 # Try to fast-forward to the beginning of the blast report. 94 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 95 # Now scan the BLAST report. 96 self._scan_header(uhandle, consumer) 97 self._scan_rounds(uhandle, consumer) 98 self._scan_database_report(uhandle, consumer) 99 self._scan_parameters(uhandle, consumer)
100
101 - def _scan_header(self, uhandle, consumer):
102 # BLASTP 2.0.10 [Aug-26-1999] 103 # 104 # 105 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 106 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 107 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 108 # programs", Nucleic Acids Res. 25:3389-3402. 109 # 110 # Query= test 111 # (140 letters) 112 # 113 # Database: sdqib40-1.35.seg.fa 114 # 1323 sequences; 223,339 total letters 115 # 116 117 consumer.start_header() 118 119 read_and_call(uhandle, consumer.version, contains='BLAST') 120 read_and_call_while(uhandle, consumer.noevent, blank=1) 121 122 # Read the reference lines and the following blank line. 123 # There might be a <pre> line, for qblast output. 124 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 125 read_and_call(uhandle, consumer.reference, start='Reference') 126 while 1: 127 line = uhandle.readline() 128 if is_blank_line(line) or line.startswith("RID"): 129 consumer.noevent(line) 130 read_and_call_while(uhandle, consumer.noevent, blank=1) 131 break 132 consumer.reference(line) 133 134 # blastpgp has a Reference for composition-based statistics. 135 if attempt_read_and_call( 136 uhandle, consumer.reference, start="Reference"): 137 read_and_call_until(uhandle, consumer.reference, blank=1) 138 read_and_call_while(uhandle, consumer.noevent, blank=1) 139 140 # Read the Query lines and the following blank line. 141 read_and_call(uhandle, consumer.query_info, start='Query=') 142 read_and_call_until(uhandle, consumer.query_info, blank=1) 143 read_and_call_while(uhandle, consumer.noevent, blank=1) 144 145 # Read the database lines and the following blank line. 146 read_and_call_until(uhandle, consumer.database_info, end='total letters') 147 read_and_call(uhandle, consumer.database_info, contains='sequences') 148 read_and_call_while(uhandle, consumer.noevent, blank=1) 149 150 consumer.end_header()
151
152 - def _scan_rounds(self, uhandle, consumer):
153 # Scan a bunch of rounds. 154 # Each round begins with either a "Searching......" line 155 # or a 'Score E' line followed by descriptions and alignments. 156 # The email server doesn't give the "Searching....." line. 157 # If there is no 'Searching.....' line then you'll first see a 158 # 'Results from round' line 159 160 while 1: 161 line = safe_peekline(uhandle) 162 if (not line.startswith('Searching') and 163 not line.startswith('Results from round') and 164 re.search(r"Score +E", line) is None and 165 line.find('No hits found') == -1): 166 break 167 168 self._scan_descriptions(uhandle, consumer) 169 self._scan_alignments(uhandle, consumer)
170
171 - def _scan_descriptions(self, uhandle, consumer):
172 # Searching..................................................done 173 # Results from round 2 174 # 175 # 176 # Sc 177 # Sequences producing significant alignments: (b 178 # Sequences used in model and found again: 179 # 180 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 181 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 182 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 183 # 184 # Sequences not found previously or not previously below threshold: 185 # 186 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 187 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 188 # 189 190 # If PSI-BLAST, may also have: 191 # 192 # CONVERGED! 193 194 consumer.start_descriptions() 195 196 # Read 'Searching' 197 # This line seems to be missing in BLASTN 2.1.2 (others?) 198 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 199 200 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 201 # If this happens, the handle will yield no more information. 202 if not uhandle.peekline(): 203 raise SyntaxError, "Unexpected end of blast report. " + \ 204 "Looks suspiciously like a PSI-BLAST crash." 205 206 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 207 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 208 # [blastall] ERROR: [000.000] AT1G08320: Blast: 209 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 210 # done 211 # Reported by David Weisman. 212 # Check for these error lines and ignore them for now. Let 213 # the BlastErrorParser deal with them. 214 line = uhandle.peekline() 215 if line.find("ERROR:") != -1 or line.startswith("done"): 216 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 217 read_and_call(uhandle, consumer.noevent, start="done") 218 219 # Check to see if this is PSI-BLAST. 220 # If it is, the 'Searching' line will be followed by: 221 # (version 2.0.10) 222 # Searching............................. 223 # Results from round 2 224 # or (version 2.0.11) 225 # Searching............................. 226 # 227 # 228 # Results from round 2 229 230 # Skip a bunch of blank lines. 231 read_and_call_while(uhandle, consumer.noevent, blank=1) 232 # Check for the results line if it's there. 233 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 234 read_and_call_while(uhandle, consumer.noevent, blank=1) 235 236 # Three things can happen here: 237 # 1. line contains 'Score E' 238 # 2. line contains "No hits found" 239 # 3. no descriptions 240 # The first one begins a bunch of descriptions. The last two 241 # indicates that no descriptions follow, and we should go straight 242 # to the alignments. 243 if not attempt_read_and_call( 244 uhandle, consumer.description_header, 245 has_re=re.compile(r'Score +E')): 246 # Either case 2 or 3. Look for "No hits found". 247 attempt_read_and_call(uhandle, consumer.no_hits, 248 contains='No hits found') 249 read_and_call_while(uhandle, consumer.noevent, blank=1) 250 251 consumer.end_descriptions() 252 # Stop processing. 253 return 254 255 # Read the score header lines 256 read_and_call(uhandle, consumer.description_header, 257 start='Sequences producing') 258 259 # If PSI-BLAST, read the 'Sequences used in model' line. 260 attempt_read_and_call(uhandle, consumer.model_sequences, 261 start='Sequences used in model') 262 read_and_call_while(uhandle, consumer.noevent, blank=1) 263 264 # Read the descriptions and the following blank lines, making 265 # sure that there are descriptions. 266 if not uhandle.peekline().startswith('Sequences not found'): 267 read_and_call_until(uhandle, consumer.description, blank=1) 268 read_and_call_while(uhandle, consumer.noevent, blank=1) 269 270 # If PSI-BLAST, read the 'Sequences not found' line followed 271 # by more descriptions. However, I need to watch out for the 272 # case where there were no sequences not found previously, in 273 # which case there will be no more descriptions. 274 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 275 start='Sequences not found'): 276 # Read the descriptions and the following blank lines. 277 read_and_call_while(uhandle, consumer.noevent, blank=1) 278 l = safe_peekline(uhandle) 279 # Brad -- added check for QUERY. On some PSI-BLAST outputs 280 # there will be a 'Sequences not found' line followed by no 281 # descriptions. Check for this case since the first thing you'll 282 # get is a blank line and then 'QUERY' 283 if not l.startswith('CONVERGED') and l[0] != '>' \ 284 and not l.startswith('QUERY'): 285 read_and_call_until(uhandle, consumer.description, blank=1) 286 read_and_call_while(uhandle, consumer.noevent, blank=1) 287 288 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 289 read_and_call_while(uhandle, consumer.noevent, blank=1) 290 291 consumer.end_descriptions()
292
293 - def _scan_alignments(self, uhandle, consumer):
294 # qblast inserts a helpful line here. 295 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 296 297 # First, check to see if I'm at the database report. 298 line = safe_peekline(uhandle) 299 if line.startswith(' Database'): 300 return 301 elif line[0] == '>': 302 # XXX make a better check here between pairwise and masterslave 303 self._scan_pairwise_alignments(uhandle, consumer) 304 else: 305 # XXX put in a check to make sure I'm in a masterslave alignment 306 self._scan_masterslave_alignment(uhandle, consumer)
307
308 - def _scan_pairwise_alignments(self, uhandle, consumer):
309 while 1: 310 line = safe_peekline(uhandle) 311 if line[0] != '>': 312 break 313 self._scan_one_pairwise_alignment(uhandle, consumer)
314
315 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
316 consumer.start_alignment() 317 318 self._scan_alignment_header(uhandle, consumer) 319 320 # Scan a bunch of score/alignment pairs. 321 while 1: 322 line = safe_peekline(uhandle) 323 if not line.startswith(' Score'): 324 break 325 self._scan_hsp(uhandle, consumer) 326 consumer.end_alignment()
327
328 - def _scan_alignment_header(self, uhandle, consumer):
329 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 330 # stearothermophilus] 331 # Length = 81 332 # 333 read_and_call(uhandle, consumer.title, start='>') 334 while 1: 335 line = safe_readline(uhandle) 336 if line.lstrip().startswith('Length ='): 337 consumer.length(line) 338 break 339 elif is_blank_line(line): 340 # Check to make sure I haven't missed the Length line 341 raise SyntaxError, "I missed the Length in an alignment header" 342 consumer.title(line) 343 344 # Older versions of BLAST will have a line with some spaces. 345 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 346 if not attempt_read_and_call(uhandle, consumer.noevent, 347 start=' '): 348 read_and_call(uhandle, consumer.noevent, blank=1)
349
350 - def _scan_hsp(self, uhandle, consumer):
351 consumer.start_hsp() 352 self._scan_hsp_header(uhandle, consumer) 353 self._scan_hsp_alignment(uhandle, consumer) 354 consumer.end_hsp()
355
356 - def _scan_hsp_header(self, uhandle, consumer):
357 # Score = 22.7 bits (47), Expect = 2.5 358 # Identities = 10/36 (27%), Positives = 18/36 (49%) 359 # Strand = Plus / Plus 360 # Frame = +3 361 # 362 363 read_and_call(uhandle, consumer.score, start=' Score') 364 read_and_call(uhandle, consumer.identities, start=' Identities') 365 # BLASTN 366 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 367 # BLASTX, TBLASTN, TBLASTX 368 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 369 read_and_call(uhandle, consumer.noevent, blank=1)
370
371 - def _scan_hsp_alignment(self, uhandle, consumer):
372 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 373 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 374 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 375 # 376 # Query: 64 AEKILIKR 71 377 # I +K 378 # Sbjct: 70 PNIIQLKD 77 379 # 380 381 while 1: 382 # Blastn adds an extra line filled with spaces before Query 383 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 384 read_and_call(uhandle, consumer.query, start='Query') 385 read_and_call(uhandle, consumer.align, start=' ') 386 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 387 read_and_call_while(uhandle, consumer.noevent, blank=1) 388 line = safe_peekline(uhandle) 389 # Alignment continues if I see a 'Query' or the spaces for Blastn. 390 if not (line.startswith('Query') or line.startswith(' ')): 391 break
392
393 - def _scan_masterslave_alignment(self, uhandle, consumer):
394 consumer.start_alignment() 395 while 1: 396 line = safe_readline(uhandle) 397 # Check to see whether I'm finished reading the alignment. 398 # This is indicated by 1) database section, 2) next psi-blast 399 # round, which can also be a 'Results from round' if no 400 # searching line is present 401 # patch by chapmanb 402 if line.startswith('Searching') or \ 403 line.startswith('Results from round'): 404 uhandle.saveline(line) 405 break 406 elif line.startswith(' Database'): 407 uhandle.saveline(line) 408 break 409 elif is_blank_line(line): 410 consumer.noevent(line) 411 else: 412 consumer.multalign(line) 413 read_and_call_while(uhandle, consumer.noevent, blank=1) 414 consumer.end_alignment()
415
416 - def _scan_database_report(self, uhandle, consumer):
417 # Database: sdqib40-1.35.seg.fa 418 # Posted date: Nov 1, 1999 4:25 PM 419 # Number of letters in database: 223,339 420 # Number of sequences in database: 1323 421 # 422 # Lambda K H 423 # 0.322 0.133 0.369 424 # 425 # Gapped 426 # Lambda K H 427 # 0.270 0.0470 0.230 428 # 429 430 consumer.start_database_report() 431 432 # Subset of the database(s) listed below 433 # Number of letters searched: 562,618,960 434 # Number of sequences searched: 228,924 435 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 436 read_and_call(uhandle, consumer.noevent, contains="letters") 437 read_and_call(uhandle, consumer.noevent, contains="sequences") 438 read_and_call(uhandle, consumer.noevent, start=" ") 439 440 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 441 # was missing the "Database" stanza completely. 442 while attempt_read_and_call(uhandle, consumer.database, 443 start=' Database'): 444 # BLAT output ends abruptly here, without any of the other 445 # information. Check to see if this is the case. If so, 446 # then end the database report here gracefully. 447 if not uhandle.peekline(): 448 consumer.end_database_report() 449 return 450 451 # Database can span multiple lines. 452 read_and_call_until(uhandle, consumer.database, start=' Posted') 453 read_and_call(uhandle, consumer.posted_date, start=' Posted') 454 read_and_call(uhandle, consumer.num_letters_in_database, 455 start=' Number of letters') 456 read_and_call(uhandle, consumer.num_sequences_in_database, 457 start=' Number of sequences') 458 read_and_call(uhandle, consumer.noevent, start=' ') 459 460 line = safe_readline(uhandle) 461 uhandle.saveline(line) 462 if line.find('Lambda') != -1: 463 break 464 465 read_and_call(uhandle, consumer.noevent, start='Lambda') 466 read_and_call(uhandle, consumer.ka_params) 467 read_and_call(uhandle, consumer.noevent, blank=1) 468 469 # not BLASTP 470 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 471 # not TBLASTX 472 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 473 read_and_call(uhandle, consumer.ka_params_gap) 474 475 # Blast 2.2.4 can sometimes skip the whole parameter section. 476 # Thus, I need to be careful not to read past the end of the 477 # file. 478 try: 479 read_and_call_while(uhandle, consumer.noevent, blank=1) 480 except SyntaxError, x: 481 if str(x) != "Unexpected end of stream.": 482 raise 483 consumer.end_database_report()
484
485 - def _scan_parameters(self, uhandle, consumer):
486 # Matrix: BLOSUM62 487 # Gap Penalties: Existence: 11, Extension: 1 488 # Number of Hits to DB: 50604 489 # Number of Sequences: 1323 490 # Number of extensions: 1526 491 # Number of successful extensions: 6 492 # Number of sequences better than 10.0: 5 493 # Number of HSP's better than 10.0 without gapping: 5 494 # Number of HSP's successfully gapped in prelim test: 0 495 # Number of HSP's that attempted gapping in prelim test: 1 496 # Number of HSP's gapped (non-prelim): 5 497 # length of query: 140 498 # length of database: 223,339 499 # effective HSP length: 39 500 # effective length of query: 101 501 # effective length of database: 171,742 502 # effective search space: 17345942 503 # effective search space used: 17345942 504 # T: 11 505 # A: 40 506 # X1: 16 ( 7.4 bits) 507 # X2: 38 (14.8 bits) 508 # X3: 64 (24.9 bits) 509 # S1: 41 (21.9 bits) 510 # S2: 42 (20.8 bits) 511 512 # Blast 2.2.4 can sometimes skip the whole parameter section. 513 # Thus, check to make sure that the parameter section really 514 # exists. 515 if not uhandle.peekline(): 516 return 517 518 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 519 # "Number of Sequences" lines. 520 consumer.start_parameters() 521 522 # Matrix line may be missing in BLASTN 2.2.9 523 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 524 # not TBLASTX 525 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 526 527 attempt_read_and_call(uhandle, consumer.num_sequences, 528 start='Number of Sequences') 529 read_and_call(uhandle, consumer.num_hits, 530 start='Number of Hits') 531 attempt_read_and_call(uhandle, consumer.num_sequences, 532 start='Number of Sequences') 533 read_and_call(uhandle, consumer.num_extends, 534 start='Number of extensions') 535 read_and_call(uhandle, consumer.num_good_extends, 536 start='Number of successful') 537 538 read_and_call(uhandle, consumer.num_seqs_better_e, 539 start='Number of sequences') 540 541 # not BLASTN, TBLASTX 542 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 543 start="Number of HSP's better"): 544 545 # BLASTN 2.2.9 546 if attempt_read_and_call(uhandle, consumer.noevent, 547 start="Number of HSP's gapped:"): 548 read_and_call(uhandle, consumer.noevent, 549 start="Number of HSP's successfully") 550 read_and_call(uhandle, consumer.noevent, 551 start="Number of extra gapped extensions") 552 else: 553 read_and_call(uhandle, consumer.hsps_prelim_gapped, 554 start="Number of HSP's successfully") 555 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 556 start="Number of HSP's that") 557 read_and_call(uhandle, consumer.hsps_gapped, 558 start="Number of HSP's gapped") 559 560 # in blastall 2.2.15 561 attempt_read_and_call(uhandle, consumer.noevent, 562 start="Number of HSP's gapped:") 563 564 attempt_read_and_call(uhandle, consumer.noevent, 565 start="Number of HSP's successfully") 566 567 # not in blastx 2.2.1 568 attempt_read_and_call(uhandle, consumer.query_length, 569 has_re=re.compile(r"[Ll]ength of query")) 570 read_and_call(uhandle, consumer.database_length, 571 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 572 573 # BLASTN 2.2.9 574 attempt_read_and_call(uhandle, consumer.noevent, 575 start="Length adjustment") 576 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 577 start='effective HSP') 578 # Not in blastx 2.2.1 579 attempt_read_and_call( 580 uhandle, consumer.effective_query_length, 581 has_re=re.compile(r'[Ee]ffective length of query')) 582 read_and_call( 583 uhandle, consumer.effective_database_length, 584 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 585 # Not in blastx 2.2.1, added a ':' to distinguish between 586 # this and the 'effective search space used' line 587 attempt_read_and_call( 588 uhandle, consumer.effective_search_space, 589 has_re=re.compile(r'[Ee]ffective search space:')) 590 # Does not appear in BLASTP 2.0.5 591 attempt_read_and_call( 592 uhandle, consumer.effective_search_space_used, 593 has_re=re.compile(r'[Ee]ffective search space used')) 594 595 # BLASTX, TBLASTN, TBLASTX 596 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 597 # not in BLASTN 2.2.9 598 attempt_read_and_call(uhandle, consumer.threshold, start='T') 599 attempt_read_and_call(uhandle, consumer.window_size, start='A') 600 ## renamed in BLASTALL 2.2.15 601 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring') 602 attempt_read_and_call(uhandle, consumer.window_size, start='Window') 603 604 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 605 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 606 # not BLASTN, TBLASTX 607 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 608 start='X3') 609 read_and_call(uhandle, consumer.gap_trigger, start='S1') 610 # not in blastx 2.2.1 611 # first we make sure we have additional lines to work with, if 612 # not then the file is done and we don't have a final S2 613 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 614 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 615 616 consumer.end_parameters()
617
618 -class BlastParser(AbstractParser):
619 """Parses BLAST data into a Record.Blast object. 620 621 """
622 - def __init__(self):
623 """__init__(self)""" 624 self._scanner = _Scanner() 625 self._consumer = _BlastConsumer()
626
627 - def parse(self, handle):
628 """parse(self, handle)""" 629 self._scanner.feed(handle, self._consumer) 630 return self._consumer.data
631
632 -class PSIBlastParser(AbstractParser):
633 """Parses BLAST data into a Record.PSIBlast object. 634 635 """
636 - def __init__(self):
637 """__init__(self)""" 638 self._scanner = _Scanner() 639 self._consumer = _PSIBlastConsumer()
640
641 - def parse(self, handle):
642 """parse(self, handle)""" 643 self._scanner.feed(handle, self._consumer) 644 return self._consumer.data
645
646 -class _HeaderConsumer:
647 - def start_header(self):
648 self._header = Record.Header()
649
650 - def version(self, line):
651 c = line.split() 652 self._header.application = c[0] 653 self._header.version = c[1] 654 self._header.date = c[2][1:-1]
655
656 - def reference(self, line):
657 if line.startswith('Reference: '): 658 self._header.reference = line[11:] 659 else: 660 self._header.reference = self._header.reference + line
661
662 - def query_info(self, line):
663 if line.startswith('Query= '): 664 self._header.query = line[7:] 665 elif not line.startswith(' '): # continuation of query_info 666 self._header.query = "%s%s" % (self._header.query, line) 667 else: 668 letters, = _re_search( 669 r"([0-9,]+) letters", line, 670 "I could not find the number of letters in line\n%s" % line) 671 self._header.query_letters = _safe_int(letters)
672
673 - def database_info(self, line):
674 line = line.rstrip() 675 if line.startswith('Database: '): 676 self._header.database = line[10:] 677 elif not line.endswith('total letters'): 678 self._header.database = self._header.database + line.strip() 679 else: 680 sequences, letters =_re_search( 681 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 682 "I could not find the sequences and letters in line\n%s" %line) 683 self._header.database_sequences = _safe_int(sequences) 684 self._header.database_letters = _safe_int(letters)
685
686 - def end_header(self):
687 # Get rid of the trailing newlines 688 self._header.reference = self._header.reference.rstrip() 689 self._header.query = self._header.query.rstrip()
690
691 -class _DescriptionConsumer:
692 - def start_descriptions(self):
693 self._descriptions = [] 694 self._model_sequences = [] 695 self._nonmodel_sequences = [] 696 self._converged = 0 697 self._type = None 698 self._roundnum = None 699 700 self.__has_n = 0 # Does the description line contain an N value?
701
702 - def description_header(self, line):
703 if line.startswith('Sequences producing'): 704 cols = line.split() 705 if cols[-1] == 'N': 706 self.__has_n = 1
707
708 - def description(self, line):
709 dh = self._parse(line) 710 if self._type == 'model': 711 self._model_sequences.append(dh) 712 elif self._type == 'nonmodel': 713 self._nonmodel_sequences.append(dh) 714 else: 715 self._descriptions.append(dh)
716
717 - def model_sequences(self, line):
718 self._type = 'model'
719
720 - def nonmodel_sequences(self, line):
721 self._type = 'nonmodel'
722
723 - def converged(self, line):
724 self._converged = 1
725
726 - def no_hits(self, line):
727 pass
728
729 - def round(self, line):
730 if not line.startswith('Results from round'): 731 raise SyntaxError, "I didn't understand the round line\n%s" % line 732 self._roundnum = _safe_int(line[18:].strip())
733
734 - def end_descriptions(self):
735 pass
736
737 - def _parse(self, description_line):
738 line = description_line # for convenience 739 dh = Record.Description() 740 741 # I need to separate the score and p-value from the title. 742 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 743 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 744 # special cases to handle: 745 # - title must be preserved exactly (including whitespaces) 746 # - score could be equal to e-value (not likely, but what if??) 747 # - sometimes there's an "N" score of '1'. 748 cols = line.split() 749 if len(cols) < 3: 750 raise SyntaxError, \ 751 "Line does not appear to contain description:\n%s" % line 752 if self.__has_n: 753 i = line.rfind(cols[-1]) # find start of N 754 i = line.rfind(cols[-2], 0, i) # find start of p-value 755 i = line.rfind(cols[-3], 0, i) # find start of score 756 else: 757 i = line.rfind(cols[-1]) # find start of p-value 758 i = line.rfind(cols[-2], 0, i) # find start of score 759 if self.__has_n: 760 dh.title, dh.score, dh.e, dh.num_alignments = \ 761 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 762 else: 763 dh.title, dh.score, dh.e, dh.num_alignments = \ 764 line[:i].rstrip(), cols[-2], cols[-1], 1 765 dh.num_alignments = _safe_int(dh.num_alignments) 766 dh.score = _safe_int(dh.score) 767 dh.e = _safe_float(dh.e) 768 return dh
769
770 -class _AlignmentConsumer:
771 # This is a little bit tricky. An alignment can either be a 772 # pairwise alignment or a multiple alignment. Since it's difficult 773 # to know a-priori which one the blast record will contain, I'm going 774 # to make one class that can parse both of them.
775 - def start_alignment(self):
776 self._alignment = Record.Alignment() 777 self._multiple_alignment = Record.MultipleAlignment()
778
779 - def title(self, line):
780 self._alignment.title = "%s%s" % (self._alignment.title, 781 line.lstrip())
782
783 - def length(self, line):
784 self._alignment.length = line.split()[2] 785 self._alignment.length = _safe_int(self._alignment.length)
786
787 - def multalign(self, line):
788 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 789 if line.startswith('QUERY') or line.startswith('blast_tmp'): 790 # If this is the first line of the multiple alignment, 791 # then I need to figure out how the line is formatted. 792 793 # Format of line is: 794 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 795 try: 796 name, start, seq, end = line.split() 797 except ValueError: 798 raise SyntaxError, "I do not understand the line\n%s" \ 799 % line 800 self._start_index = line.index(start, len(name)) 801 self._seq_index = line.index(seq, 802 self._start_index+len(start)) 803 # subtract 1 for the space 804 self._name_length = self._start_index - 1 805 self._start_length = self._seq_index - self._start_index - 1 806 self._seq_length = line.rfind(end) - self._seq_index - 1 807 808 #self._seq_index = line.index(seq) 809 ## subtract 1 for the space 810 #self._seq_length = line.rfind(end) - self._seq_index - 1 811 #self._start_index = line.index(start) 812 #self._start_length = self._seq_index - self._start_index - 1 813 #self._name_length = self._start_index 814 815 # Extract the information from the line 816 name = line[:self._name_length] 817 name = name.rstrip() 818 start = line[self._start_index:self._start_index+self._start_length] 819 start = start.rstrip() 820 if start: 821 start = _safe_int(start) 822 end = line[self._seq_index+self._seq_length:].rstrip() 823 if end: 824 end = _safe_int(end) 825 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 826 # right pad the sequence with spaces if necessary 827 if len(seq) < self._seq_length: 828 seq = seq + ' '*(self._seq_length-len(seq)) 829 830 # I need to make sure the sequence is aligned correctly with the query. 831 # First, I will find the length of the query. Then, if necessary, 832 # I will pad my current sequence with spaces so that they will line 833 # up correctly. 834 835 # Two possible things can happen: 836 # QUERY 837 # 504 838 # 839 # QUERY 840 # 403 841 # 842 # Sequence 504 will need padding at the end. Since I won't know 843 # this until the end of the alignment, this will be handled in 844 # end_alignment. 845 # Sequence 403 will need padding before being added to the alignment. 846 847 align = self._multiple_alignment.alignment # for convenience 848 align.append((name, start, seq, end))
849 850 # This is old code that tried to line up all the sequences 851 # in a multiple alignment by using the sequence title's as 852 # identifiers. The problem with this is that BLAST assigns 853 # different HSP's from the same sequence the same id. Thus, 854 # in one alignment block, there may be multiple sequences with 855 # the same id. I'm not sure how to handle this, so I'm not 856 # going to. 857 858 # # If the sequence is the query, then just add it. 859 # if name == 'QUERY': 860 # if len(align) == 0: 861 # align.append((name, start, seq)) 862 # else: 863 # aname, astart, aseq = align[0] 864 # if name != aname: 865 # raise SyntaxError, "Query is not the first sequence" 866 # aseq = aseq + seq 867 # align[0] = aname, astart, aseq 868 # else: 869 # if len(align) == 0: 870 # raise SyntaxError, "I could not find the query sequence" 871 # qname, qstart, qseq = align[0] 872 # 873 # # Now find my sequence in the multiple alignment. 874 # for i in range(1, len(align)): 875 # aname, astart, aseq = align[i] 876 # if name == aname: 877 # index = i 878 # break 879 # else: 880 # # If I couldn't find it, then add a new one. 881 # align.append((None, None, None)) 882 # index = len(align)-1 883 # # Make sure to left-pad it. 884 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 885 # 886 # if len(qseq) != len(aseq) + len(seq): 887 # # If my sequences are shorter than the query sequence, 888 # # then I will need to pad some spaces to make them line up. 889 # # Since I've already right padded seq, that means aseq 890 # # must be too short. 891 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 892 # aseq = aseq + seq 893 # if astart is None: 894 # astart = start 895 # align[index] = aname, astart, aseq 896
897 - def end_alignment(self):
898 # Remove trailing newlines 899 if self._alignment: 900 self._alignment.title = self._alignment.title.rstrip() 901 902 # This code is also obsolete. See note above. 903 # If there's a multiple alignment, I will need to make sure 904 # all the sequences are aligned. That is, I may need to 905 # right-pad the sequences. 906 # if self._multiple_alignment is not None: 907 # align = self._multiple_alignment.alignment 908 # seqlen = None 909 # for i in range(len(align)): 910 # name, start, seq = align[i] 911 # if seqlen is None: 912 # seqlen = len(seq) 913 # else: 914 # if len(seq) < seqlen: 915 # seq = seq + ' '*(seqlen - len(seq)) 916 # align[i] = name, start, seq 917 # elif len(seq) > seqlen: 918 # raise SyntaxError, \ 919 # "Sequence %s is longer than the query" % name 920 921 # Clean up some variables, if they exist. 922 try: 923 del self._seq_index 924 del self._seq_length 925 del self._start_index 926 del self._start_length 927 del self._name_length 928 except AttributeError: 929 pass
930
931 -class _HSPConsumer:
932 - def start_hsp(self):
933 self._hsp = Record.HSP()
934
935 - def score(self, line):
936 self._hsp.bits, self._hsp.score = _re_search( 937 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 938 "I could not find the score in line\n%s" % line) 939 self._hsp.score = _safe_float(self._hsp.score) 940 self._hsp.bits = _safe_float(self._hsp.bits) 941 942 x, y = _re_search( 943 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 944 "I could not find the expect in line\n%s" % line) 945 if x: 946 self._hsp.num_alignments = _safe_int(x) 947 else: 948 self._hsp.num_alignments = 1 949 self._hsp.expect = _safe_float(y)
950
951 - def identities(self, line):
952 x, y = _re_search( 953 r"Identities = (\d+)\/(\d+)", line, 954 "I could not find the identities in line\n%s" % line) 955 self._hsp.identities = _safe_int(x), _safe_int(y) 956 957 if line.find('Positives') != -1: 958 x, y = _re_search( 959 r"Positives = (\d+)\/(\d+)", line, 960 "I could not find the positives in line\n%s" % line) 961 self._hsp.positives = _safe_int(x), _safe_int(y) 962 963 if line.find('Gaps') != -1: 964 x, y = _re_search( 965 r"Gaps = (\d+)\/(\d+)", line, 966 "I could not find the gaps in line\n%s" % line) 967 self._hsp.gaps = _safe_int(x), _safe_int(y)
968 969
970 - def strand(self, line):
971 self._hsp.strand = _re_search( 972 r"Strand = (\w+) / (\w+)", line, 973 "I could not find the strand in line\n%s" % line)
974
975 - def frame(self, line):
976 # Frame can be in formats: 977 # Frame = +1 978 # Frame = +2 / +2 979 if line.find('/') != -1: 980 self._hsp.frame = _re_search( 981 r"Frame = ([-+][123]) / ([-+][123])", line, 982 "I could not find the frame in line\n%s" % line) 983 else: 984 self._hsp.frame = _re_search( 985 r"Frame = ([-+][123])", line, 986 "I could not find the frame in line\n%s" % line)
987 988 # Match a space, if one is available. Masahir Ishikawa found a 989 # case where there's no space between the start and the sequence: 990 # Query: 100tt 101 991 # line below modified by Yair Benita, Sep 2004 992 _query_re = re.compile(r"Query: \s*(\d+)\s*(.+) (\d+)")
993 - def query(self, line):
994 m = self._query_re.search(line) 995 if m is None: 996 raise SyntaxError, "I could not find the query in line\n%s" % line 997 998 # line below modified by Yair Benita, Sep 2004. 999 # added the end attribute for the query 1000 start, seq, end = m.groups() 1001 self._hsp.query = self._hsp.query + seq 1002 if self._hsp.query_start is None: 1003 self._hsp.query_start = _safe_int(start) 1004 1005 # line below added by Yair Benita, Sep 2004. 1006 # added the end attribute for the query 1007 self._hsp.query_end = _safe_int(end) 1008 self._query_start_index = m.start(2) 1009 self._query_len = len(seq)
1010
1011 - def align(self, line):
1012 seq = line[self._query_start_index:].rstrip() 1013 if len(seq) < self._query_len: 1014 # Make sure the alignment is the same length as the query 1015 seq = seq + ' ' * (self._query_len-len(seq)) 1016 elif len(seq) < self._query_len: 1017 raise SyntaxError, "Match is longer than the query in line\n%s" % \ 1018 line 1019 self._hsp.match = self._hsp.match + seq
1020
1021 - def sbjct(self, line):
1022 # line below modified by Yair Benita, Sep 2004 1023 # added the end group and the -? to allow parsing 1024 # of BLAT output in BLAST format. 1025 start, seq, end = _re_search( 1026 r"Sbjct:\s*(-?\d+)\s*(.+) (-?\d+)", line, 1027 "I could not find the sbjct in line\n%s" % line) 1028 #mikep 26/9/00 1029 #On occasion, there is a blast hit with no subject match 1030 #so far, it only occurs with 1-line short "matches" 1031 #I have decided to let these pass as they appear 1032 if not seq.strip(): 1033 seq = ' ' * self._query_len 1034 self._hsp.sbjct = self._hsp.sbjct + seq 1035 if self._hsp.sbjct_start is None: 1036 self._hsp.sbjct_start = _safe_int(start) 1037 1038 self._hsp.sbjct_end = _safe_int(end) 1039 if len(seq) != self._query_len: 1040 raise SyntaxError, \ 1041 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1042 % line 1043 1044 del self._query_start_index # clean up unused variables 1045 del self._query_len
1046
1047 - def end_hsp(self):
1048 pass
1049
1050 -class _DatabaseReportConsumer:
1051
1052 - def start_database_report(self):
1053 self._dr = Record.DatabaseReport()
1054
1055 - def database(self, line):
1056 m = re.search(r"Database: (.+)$", line) 1057 if m: 1058 self._dr.database_name.append(m.group(1)) 1059 elif self._dr.database_name: 1060 # This must be a continuation of the previous name. 1061 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1062 line.strip())
1063
1064 - def posted_date(self, line):
1065 self._dr.posted_date.append(_re_search( 1066 r"Posted date:\s*(.+)$", line, 1067 "I could not find the posted date in line\n%s" % line))
1068
1069 - def num_letters_in_database(self, line):
1070 letters, = _get_cols( 1071 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1072 self._dr.num_letters_in_database.append(_safe_int(letters))
1073
1074 - def num_sequences_in_database(self, line):
1075 sequences, = _get_cols( 1076 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1077 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1078
1079 - def ka_params(self, line):
1080 x = line.split() 1081 self._dr.ka_params = map(_safe_float, x)
1082
1083 - def gapped(self, line):
1084 self._dr.gapped = 1
1085
1086 - def ka_params_gap(self, line):
1087 x = line.split() 1088 self._dr.ka_params_gap = map(_safe_float, x)
1089
1090 - def end_database_report(self):
1091 pass
1092
1093 -class _ParametersConsumer:
1094 - def start_parameters(self):
1095 self._params = Record.Parameters()
1096
1097 - def matrix(self, line):
1098 self._params.matrix = line[8:].rstrip()
1099
1100 - def gap_penalties(self, line):
1101 x = _get_cols( 1102 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1103 self._params.gap_penalties = map(_safe_float, x)
1104
1105 - def num_hits(self, line):
1106 if line.find('1st pass') != -1: 1107 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1108 self._params.num_hits = _safe_int(x) 1109 else: 1110 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1111 self._params.num_hits = _safe_int(x)
1112
1113 - def num_sequences(self, line):
1114 if line.find('1st pass') != -1: 1115 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1116 self._params.num_sequences = _safe_int(x) 1117 else: 1118 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1119 self._params.num_sequences = _safe_int(x)
1120
1121 - def num_extends(self, line):
1122 if line.find('1st pass') != -1: 1123 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1124 self._params.num_extends = _safe_int(x) 1125 else: 1126 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1127 self._params.num_extends = _safe_int(x)
1128
1129 - def num_good_extends(self, line):
1130 if line.find('1st pass') != -1: 1131 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1132 self._params.num_good_extends = _safe_int(x) 1133 else: 1134 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1135 self._params.num_good_extends = _safe_int(x)
1136
1137 - def num_seqs_better_e(self, line):
1138 self._params.num_seqs_better_e, = _get_cols( 1139 line, (-1,), ncols=7, expected={2:"sequences"}) 1140 self._params.num_seqs_better_e = _safe_int( 1141 self._params.num_seqs_better_e)
1142
1143 - def hsps_no_gap(self, line): ## this one doesn't handle blastp 2.2.15 ??
1144 self._params.hsps_no_gap, = _get_cols( 1145 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1146 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1147
1148 - def hsps_prelim_gapped(self, line):
1149 self._params.hsps_prelim_gapped, = _get_cols( 1150 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1151 self._params.hsps_prelim_gapped = _safe_int( 1152 self._params.hsps_prelim_gapped)
1153
1154 - def hsps_prelim_gapped_attempted(self, line):
1155 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1156 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1157 self._params.hsps_prelim_gapped_attempted = _safe_int( 1158 self._params.hsps_prelim_gapped_attempted)
1159
1160 - def hsps_gapped(self, line):
1161 self._params.hsps_gapped, = _get_cols( 1162 line, (-1,), ncols=6, expected={3:"gapped"}) 1163 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1164
1165 - def query_length(self, line):
1166 self._params.query_length, = _get_cols( 1167 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1168 self._params.query_length = _safe_int(self._params.query_length)
1169
1170 - def database_length(self, line):
1171 self._params.database_length, = _get_cols( 1172 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1173 self._params.database_length = _safe_int(self._params.database_length)
1174
1175 - def effective_hsp_length(self, line):
1176 self._params.effective_hsp_length, = _get_cols( 1177 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1178 self._params.effective_hsp_length = _safe_int( 1179 self._params.effective_hsp_length)
1180
1181 - def effective_query_length(self, line):
1182 self._params.effective_query_length, = _get_cols( 1183 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1184 self._params.effective_query_length = _safe_int( 1185 self._params.effective_query_length)
1186
1187 - def effective_database_length(self, line):
1188 self._params.effective_database_length, = _get_cols( 1189 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1190 self._params.effective_database_length = _safe_int( 1191 self._params.effective_database_length)
1192
1193 - def effective_search_space(self, line):
1194 self._params.effective_search_space, = _get_cols( 1195 line, (-1,), ncols=4, expected={1:"search"}) 1196 self._params.effective_search_space = _safe_int( 1197 self._params.effective_search_space)
1198
1199 - def effective_search_space_used(self, line):
1200 self._params.effective_search_space_used, = _get_cols( 1201 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1202 self._params.effective_search_space_used = _safe_int( 1203 self._params.effective_search_space_used)
1204
1205 - def frameshift(self, line):
1206 self._params.frameshift = _get_cols( 1207 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1208
1209 - def threshold(self, line):
1210 self._params.threshold, = _get_cols( 1211 line, (-1,) ) ## , ncols=2, expected={0:"T:"}) ## doesn't work with 2.2.15 1212 self._params.threshold = _safe_int(self._params.threshold)
1213
1214 - def window_size(self, line):
1215 self._params.window_size, = _get_cols( 1216 line, (-1,) ) ## , ncols=2, expected={0:"A:"}) ## doesn't work with 2.2.15 1217 self._params.window_size = _safe_int(self._params.window_size)
1218
1219 - def dropoff_1st_pass(self, line):
1220 score, bits = _re_search( 1221 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1222 "I could not find the dropoff in line\n%s" % line) 1223 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1224
1225 - def gap_x_dropoff(self, line):
1226 score, bits = _re_search( 1227 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1228 "I could not find the gap dropoff in line\n%s" % line) 1229 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1230
1231 - def gap_x_dropoff_final(self, line):
1232 score, bits = _re_search( 1233 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1234 "I could not find the gap dropoff final in line\n%s" % line) 1235 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1236
1237 - def gap_trigger(self, line):
1238 score, bits = _re_search( 1239 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1240 "I could not find the gap trigger in line\n%s" % line) 1241 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1242
1243 - def blast_cutoff(self, line):
1244 score, bits = _re_search( 1245 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1246 "I could not find the blast cutoff in line\n%s" % line) 1247 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1248
1249 - def end_parameters(self):
1250 pass
1251 1252
1253 -class _BlastConsumer(AbstractConsumer, 1254 _HeaderConsumer, 1255 _DescriptionConsumer, 1256 _AlignmentConsumer, 1257 _HSPConsumer, 1258 _DatabaseReportConsumer, 1259 _ParametersConsumer 1260 ):
1261 # This Consumer is inherits from many other consumer classes that handle 1262 # the actual dirty work. An alternate way to do it is to create objects 1263 # of those classes and then delegate the parsing tasks to them in a 1264 # decorator-type pattern. The disadvantage of that is that the method 1265 # names will need to be resolved in this classes. However, using 1266 # a decorator will retain more control in this class (which may or 1267 # may not be a bad thing). In addition, having each sub-consumer as 1268 # its own object prevents this object's dictionary from being cluttered 1269 # with members and reduces the chance of member collisions.
1270 - def __init__(self):
1271 self.data = None
1272
1273 - def round(self, line):
1274 # Make sure nobody's trying to pass me PSI-BLAST data! 1275 raise ValueError, \ 1276 "This consumer doesn't handle PSI-BLAST data"
1277
1278 - def start_header(self):
1279 self.data = Record.Blast() 1280 _HeaderConsumer.start_header(self)
1281
1282 - def end_header(self):
1283 _HeaderConsumer.end_header(self) 1284 self.data.__dict__.update(self._header.__dict__)
1285
1286 - def end_descriptions(self):
1287 self.data.descriptions = self._descriptions
1288
1289 - def end_alignment(self):
1290 _AlignmentConsumer.end_alignment(self) 1291 if self._alignment.hsps: 1292 self.data.alignments.append(self._alignment) 1293 if self._multiple_alignment.alignment: 1294 self.data.multiple_alignment = self._multiple_alignment
1295
1296 - def end_hsp(self):
1297 _HSPConsumer.end_hsp(self) 1298 try: 1299 self._alignment.hsps.append(self._hsp) 1300 except AttributeError: 1301 raise SyntaxError, "Found an HSP before an alignment"
1302
1303 - def end_database_report(self):
1304 _DatabaseReportConsumer.end_database_report(self) 1305 self.data.__dict__.update(self._dr.__dict__)
1306
1307 - def end_parameters(self):
1308 _ParametersConsumer.end_parameters(self) 1309 self.data.__dict__.update(self._params.__dict__)
1310
1311 -class _PSIBlastConsumer(AbstractConsumer, 1312 _HeaderConsumer, 1313 _DescriptionConsumer, 1314 _AlignmentConsumer, 1315 _HSPConsumer, 1316 _DatabaseReportConsumer, 1317 _ParametersConsumer 1318 ):
1319 - def __init__(self):
1320 self.data = None
1321
1322 - def start_header(self):
1323 self.data = Record.PSIBlast() 1324 _HeaderConsumer.start_header(self)
1325
1326 - def end_header(self):
1327 _HeaderConsumer.end_header(self) 1328 self.data.__dict__.update(self._header.__dict__)
1329
1330 - def start_descriptions(self):
1331 self._round = Record.Round() 1332 self.data.rounds.append(self._round) 1333 _DescriptionConsumer.start_descriptions(self)
1334
1335 - def end_descriptions(self):
1336 _DescriptionConsumer.end_descriptions(self) 1337 self._round.number = self._roundnum 1338 if self._descriptions: 1339 self._round.new_seqs.extend(self._descriptions) 1340 self._round.reused_seqs.extend(self._model_sequences) 1341 self._round.new_seqs.extend(self._nonmodel_sequences) 1342 if self._converged: 1343 self.data.converged = 1
1344
1345 - def end_alignment(self):
1346 _AlignmentConsumer.end_alignment(self) 1347 if self._alignment.hsps: 1348 self._round.alignments.append(self._alignment) 1349 if self._multiple_alignment: 1350 self._round.multiple_alignment = self._multiple_alignment
1351
1352 - def end_hsp(self):
1353 _HSPConsumer.end_hsp(self) 1354 try: 1355 self._alignment.hsps.append(self._hsp) 1356 except AttributeError: 1357 raise SyntaxError, "Found an HSP before an alignment"
1358
1359 - def end_database_report(self):
1360 _DatabaseReportConsumer.end_database_report(self) 1361 self.data.__dict__.update(self._dr.__dict__)
1362
1363 - def end_parameters(self):
1364 _ParametersConsumer.end_parameters(self) 1365 self.data.__dict__.update(self._params.__dict__)
1366
1367 -class Iterator:
1368 """Iterates over a file of multiple BLAST results. 1369 1370 Methods: 1371 next Return the next record from the stream, or None. 1372 1373 """
1374 - def __init__(self, handle, parser=None):
1375 """__init__(self, handle, parser=None) 1376 1377 Create a new iterator. handle is a file-like object. parser 1378 is an optional Parser object to change the results into another form. 1379 If set to None, then the raw contents of the file will be returned. 1380 1381 """ 1382 try: 1383 handle.readline 1384 except AttributeError: 1385 raise ValueError( 1386 "I expected a file handle or file-like object, got %s" 1387 % type(handle)) 1388 self._uhandle = File.UndoHandle(handle) 1389 self._parser = parser
1390
1391 - def next(self):
1392 """next(self) -> object 1393 1394 Return the next Blast record from the file. If no more records, 1395 return None. 1396 1397 """ 1398 lines = [] 1399 while 1: 1400 line = self._uhandle.readline() 1401 if not line: 1402 break 1403 # If I've reached the next one, then put the line back and stop. 1404 if lines and (line.startswith('BLAST') 1405 or line.startswith('BLAST', 1) 1406 or line.startswith('<?xml ')): 1407 self._uhandle.saveline(line) 1408 break 1409 lines.append(line) 1410 1411 if not lines: 1412 return None 1413 1414 data = ''.join(lines) 1415 if self._parser is not None: 1416 return self._parser.parse(File.StringHandle(data)) 1417 return data
1418
1419 - def __iter__(self):
1420 return iter(self.next, None)
1421
1422 -def blastall(blastcmd, program, database, infile, **keywds):
1423 """blastall(blastcmd, program, database, infile, **keywds) -> 1424 read, error Undohandles 1425 1426 Execute and retrieve data from blastall. blastcmd is the command 1427 used to launch the 'blastall' executable. program is the blast program 1428 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1429 to search against. infile is the path to the file containing 1430 the sequence to search with. 1431 1432 You may pass more parameters to **keywds to change the behavior of 1433 the search. Otherwise, optional values will be chosen by blastall. 1434 1435 Scoring 1436 matrix Matrix to use. 1437 gap_open Gap open penalty. 1438 gap_extend Gap extension penalty. 1439 nuc_match Nucleotide match reward. (BLASTN) 1440 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1441 query_genetic_code Genetic code for Query. 1442 db_genetic_code Genetic code for database. (TBLAST[NX]) 1443 1444 Algorithm 1445 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1446 expectation Expectation value cutoff. 1447 wordsize Word size. 1448 strands Query strands to search against database.([T]BLAST[NX]) 1449 keep_hits Number of best hits from a region to keep. 1450 xdrop Dropoff value (bits) for gapped alignments. 1451 hit_extend Threshold for extending hits. 1452 region_length Length of region used to judge hits. 1453 db_length Effective database length. 1454 search_length Effective length of search space. 1455 1456 Processing 1457 filter Filter query sequence? T/F 1458 believe_query Believe the query defline. T/F 1459 restrict_gi Restrict search to these GI's. 1460 nprocessors Number of processors to use. 1461 1462 Formatting 1463 html Produce HTML output? T/F 1464 descriptions Number of one-line descriptions. 1465 alignments Number of alignments. 1466 align_view Alignment view. Integer 0-6. 1467 show_gi Show GI's in deflines? T/F 1468 seqalign_file seqalign file to output. 1469 1470 """ 1471 att2param = { 1472 'matrix' : '-M', 1473 'gap_open' : '-G', 1474 'gap_extend' : '-E', 1475 'nuc_match' : '-r', 1476 'nuc_mismatch' : '-q', 1477 'query_genetic_code' : '-Q', 1478 'db_genetic_code' : '-D', 1479 1480 'gapped' : '-g', 1481 'expectation' : '-e', 1482 'wordsize' : '-W', 1483 'strands' : '-S', 1484 'keep_hits' : '-K', 1485 'xdrop' : '-X', 1486 'hit_extend' : '-f', 1487 'region_length' : '-L', 1488 'db_length' : '-z', 1489 'search_length' : '-Y', 1490 1491 'program' : '-p', 1492 'database' : '-d', 1493 'infile' : '-i', 1494 'filter' : '-F', 1495 'believe_query' : '-J', 1496 'restrict_gi' : '-l', 1497 'nprocessors' : '-a', 1498 1499 'html' : '-T', 1500 'descriptions' : '-v', 1501 'alignments' : '-b', 1502 'align_view' : '-m', 1503 'show_gi' : '-I', 1504 'seqalign_file' : '-O' 1505 } 1506 1507 if not os.path.exists(blastcmd): 1508 raise ValueError, "blastall does not exist at %s" % blastcmd 1509 1510 params = [] 1511 1512 params.extend([att2param['program'], program]) 1513 params.extend([att2param['database'], database]) 1514 params.extend([att2param['infile'], infile]) 1515 1516 for attr in keywds.keys(): 1517 params.extend([att2param[attr], str(keywds[attr])]) 1518 1519 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1520 w.close() 1521 return File.UndoHandle(r), File.UndoHandle(e)
1522 1523
1524 -def blastpgp(blastcmd, database, infile, **keywds):
1525 """blastpgp(blastcmd, database, infile, **keywds) -> 1526 read, error Undohandles 1527 1528 Execute and retrieve data from blastpgp. blastcmd is the command 1529 used to launch the 'blastpgp' executable. database is the path to the 1530 database to search against. infile is the path to the file containing 1531 the sequence to search with. 1532 1533 You may pass more parameters to **keywds to change the behavior of 1534 the search. Otherwise, optional values will be chosen by blastpgp. 1535 1536 Scoring 1537 matrix Matrix to use. 1538 gap_open Gap open penalty. 1539 gap_extend Gap extension penalty. 1540 window_size Multiple hits window size. 1541 npasses Number of passes. 1542 passes Hits/passes. Integer 0-2. 1543 1544 Algorithm 1545 gapped Whether to do a gapped alignment. T/F 1546 expectation Expectation value cutoff. 1547 wordsize Word size. 1548 keep_hits Number of beset hits from a region to keep. 1549 xdrop Dropoff value (bits) for gapped alignments. 1550 hit_extend Threshold for extending hits. 1551 region_length Length of region used to judge hits. 1552 db_length Effective database length. 1553 search_length Effective length of search space. 1554 nbits_gapping Number of bits to trigger gapping. 1555 pseudocounts Pseudocounts constants for multiple passes. 1556 xdrop_final X dropoff for final gapped alignment. 1557 xdrop_extension Dropoff for blast extensions. 1558 model_threshold E-value threshold to include in multipass model. 1559 required_start Start of required region in query. 1560 required_end End of required region in query. 1561 1562 Processing 1563 XXX should document default values 1564 program The blast program to use. (PHI-BLAST) 1565 filter Filter query sequence with SEG? T/F 1566 believe_query Believe the query defline? T/F 1567 nprocessors Number of processors to use. 1568 1569 Formatting 1570 html Produce HTML output? T/F 1571 descriptions Number of one-line descriptions. 1572 alignments Number of alignments. 1573 align_view Alignment view. Integer 0-6. 1574 show_gi Show GI's in deflines? T/F 1575 seqalign_file seqalign file to output. 1576 align_outfile Output file for alignment. 1577 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1578 restart_infile Input file for PSI-BLAST restart. 1579 hit_infile Hit file for PHI-BLAST. 1580 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1581 align_infile Input alignment file for PSI-BLAST restart. 1582 1583 """ 1584 att2param = { 1585 'matrix' : '-M', 1586 'gap_open' : '-G', 1587 'gap_extend' : '-E', 1588 'window_size' : '-A', 1589 'npasses' : '-j', 1590 'passes' : '-P', 1591 1592 'gapped' : '-g', 1593 'expectation' : '-e', 1594 'wordsize' : '-W', 1595 'keep_hits' : '-K', 1596 'xdrop' : '-X', 1597 'hit_extend' : '-f', 1598 'region_length' : '-L', 1599 'db_length' : '-Z', 1600 'search_length' : '-Y', 1601 'nbits_gapping' : '-N', 1602 'pseudocounts' : '-c', 1603 'xdrop_final' : '-Z', 1604 'xdrop_extension' : '-y', 1605 'model_threshold' : '-h', 1606 'required_start' : '-S', 1607 'required_end' : '-H', 1608 1609 'program' : '-p', 1610 'database' : '-d', 1611 'infile' : '-i', 1612 'filter' : '-F', 1613 'believe_query' : '-J', 1614 'nprocessors' : '-a', 1615 1616 'html' : '-T', 1617 'descriptions' : '-v', 1618 'alignments' : '-b', 1619 'align_view' : '-m', 1620 'show_gi' : '-I', 1621 'seqalign_file' : '-O', 1622 'align_outfile' : '-o', 1623 'checkpoint_outfile' : '-C', 1624 'restart_infile' : '-R', 1625 'hit_infile' : '-k', 1626 'matrix_outfile' : '-Q', 1627 'align_infile' : '-B' 1628 } 1629 1630 if not os.path.exists(blastcmd): 1631 raise ValueError, "blastpgp does not exist at %s" % blastcmd 1632 1633 params = [] 1634 1635 params.extend([att2param['database'], database]) 1636 params.extend([att2param['infile'], infile]) 1637 1638 for attr in keywds.keys(): 1639 params.extend([att2param[attr], str(keywds[attr])]) 1640 1641 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1642 w.close() 1643 return File.UndoHandle(r), File.UndoHandle(e)
1644 1645
1646 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1647 """rpsblast(blastcmd, database, infile, **keywds) -> 1648 read, error Undohandles 1649 1650 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1651 command used to launch the 'rpsblast' executable. database is the path 1652 to the database to search against. infile is the path to the file 1653 containing the sequence to search with. 1654 1655 You may pass more parameters to **keywds to change the behavior of 1656 the search. Otherwise, optional values will be chosen by rpsblast. 1657 1658 Please note that this function will give XML output by default, by 1659 setting align_view to seven (i.e. command line option -m 7). 1660 You should use the NCBIXML.BlastParser() to read the resulting output. 1661 This is because NCBIStandalone.BlastParser() does not understand the 1662 plain text output format from rpsblast. 1663 1664 WARNING - The following text and associated parameter handling has not 1665 received extensive testing. Please report any errors we might have made... 1666 1667 Algorithm/Scoring 1668 gapped Whether to do a gapped alignment. T/F 1669 multihit 0 for multiple hit (default), 1 for single hit 1670 expectation Expectation value cutoff. 1671 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1672 0 in 'start' refers to the beginning of the sequence 1673 0 in 'stop' refers to the end of the sequence 1674 Default = 0,0 1675 xdrop Dropoff value (bits) for gapped alignments. 1676 xdrop_final X dropoff for final gapped alignment (in bits). 1677 xdrop_extension Dropoff for blast extensions (in bits). 1678 search_length Effective length of search space. 1679 nbits_gapping Number of bits to trigger gapping. 1680 protein Query sequence is protein. T/F 1681 db_length Effective database length. 1682 1683 Processing 1684 filter Filter query sequence with SEG? T/F 1685 case_filter Use lower case filtering of FASTA sequence T/F, default F 1686 believe_query Believe the query defline. T/F 1687 nprocessors Number of processors to use. 1688 logfile Name of log file to use, default rpsblast.log 1689 1690 Formatting 1691 html Produce HTML output? T/F 1692 descriptions Number of one-line descriptions. 1693 alignments Number of alignments. 1694 align_view Alignment view. Integer 0-9. 1695 show_gi Show GI's in deflines? T/F 1696 seqalign_file seqalign file to output. 1697 align_outfile Output file for alignment. 1698 1699 """ 1700 att2param = { 1701 'multihit' : '-P', 1702 'gapped' : '-g', 1703 'expectation' : '-e', 1704 'range_restriction' : '-L', 1705 'xdrop' : '-X', 1706 'xdrop_final' : '-Z', 1707 'xdrop_extension' : '-y', 1708 'search_length' : '-Y', 1709 'nbits_gapping' : '-N', 1710 'protein' : '-p', 1711 'db_length' : '-z', 1712 1713 'database' : '-d', 1714 'infile' : '-i', 1715 'filter' : '-F', 1716 'case_filter' : '-U', 1717 'believe_query' : '-J', 1718 'nprocessors' : '-a', 1719 'logfile' : '-l', 1720 1721 'html' : '-T', 1722 'descriptions' : '-v', 1723 'alignments' : '-b', 1724 'align_view' : '-m', 1725 'show_gi' : '-I', 1726 'seqalign_file' : '-O', 1727 'align_outfile' : '-o' 1728 } 1729 1730 if not os.path.exists(blastcmd): 1731 raise ValueError, "rpsblast does not exist at %s" % blastcmd 1732 1733 params = [] 1734 1735 params.extend([att2param['database'], database]) 1736 params.extend([att2param['infile'], infile]) 1737 params.extend([att2param['align_view'], align_view]) 1738 1739 for attr in keywds.keys(): 1740 params.extend([att2param[attr], str(keywds[attr])]) 1741 1742 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1743 w.close() 1744 return File.UndoHandle(r), File.UndoHandle(e)
1745
1746 -def _re_search(regex, line, error_msg):
1747 m = re.search(regex, line) 1748 if not m: 1749 raise SyntaxError, error_msg 1750 return m.groups()
1751
1752 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1753 cols = line.split() 1754 1755 # Check to make sure number of columns is correct 1756 if ncols is not None and len(cols) != ncols: 1757 raise SyntaxError, "I expected %d columns (got %d) in line\n%s" % \ 1758 (ncols, len(cols), line) 1759 1760 # Check to make sure columns contain the correct data 1761 for k in expected.keys(): 1762 if cols[k] != expected[k]: 1763 raise SyntaxError, "I expected '%s' in column %d in line\n%s" % ( 1764 expected[k], k, line) 1765 1766 # Construct the answer tuple 1767 results = [] 1768 for c in cols_to_get: 1769 results.append(cols[c]) 1770 return tuple(results)
1771
1772 -def _safe_int(str):
1773 try: 1774 return int(str) 1775 except ValueError: 1776 # Something went wrong. Try to clean up the string. 1777 # Remove all commas from the string 1778 str = str.replace(',', '') 1779 try: 1780 # try again. 1781 return int(str) 1782 except ValueError: 1783 pass 1784 # If it fails again, maybe it's too long? 1785 # XXX why converting to float? 1786 return long(float(str))
1787
1788 -def _safe_float(str):
1789 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1790 # float('e-172') does not produce an error on his platform. Thus, 1791 # we need to check the string for this condition. 1792 1793 # Sometimes BLAST leaves of the '1' in front of an exponent. 1794 if str and str[0] in ['E', 'e']: 1795 str = '1' + str 1796 try: 1797 return float(str) 1798 except ValueError: 1799 # Remove all commas from the string 1800 str = str.replace(',', '') 1801 # try again. 1802 return float(str)
1803
1804 -class _BlastErrorConsumer(_BlastConsumer):
1805 - def __init__(self):
1807 - def noevent(self, line):
1808 if line.find("Query must be at least wordsize") != -1: 1809 raise ShortQueryBlastError, "Query must be at least wordsize" 1810 # Now pass the line back up to the superclass. 1811 method = getattr(_BlastConsumer, 'noevent', 1812 _BlastConsumer.__getattr__(self, 'noevent')) 1813 method(line)
1814
1815 -class BlastErrorParser(AbstractParser):
1816 """Attempt to catch and diagnose BLAST errors while parsing. 1817 1818 This utilizes the BlastParser module but adds an additional layer 1819 of complexity on top of it by attempting to diagnose SyntaxError's 1820 that may actually indicate problems during BLAST parsing. 1821 1822 Current BLAST problems this detects are: 1823 o LowQualityBlastError - When BLASTing really low quality sequences 1824 (ie. some GenBank entries which are just short streches of a single 1825 nucleotide), BLAST will report an error with the sequence and be 1826 unable to search with this. This will lead to a badly formatted 1827 BLAST report that the parsers choke on. The parser will convert the 1828 SyntaxError to a LowQualityBlastError and attempt to provide useful 1829 information. 1830 1831 """
1832 - def __init__(self, bad_report_handle = None):
1833 """Initialize a parser that tries to catch BlastErrors. 1834 1835 Arguments: 1836 o bad_report_handle - An optional argument specifying a handle 1837 where bad reports should be sent. This would allow you to save 1838 all of the bad reports to a file, for instance. If no handle 1839 is specified, the bad reports will not be saved. 1840 """ 1841 self._bad_report_handle = bad_report_handle 1842 1843 #self._b_parser = BlastParser() 1844 self._scanner = _Scanner() 1845 self._consumer = _BlastErrorConsumer()
1846
1847 - def parse(self, handle):
1848 """Parse a handle, attempting to diagnose errors. 1849 """ 1850 results = handle.read() 1851 1852 try: 1853 self._scanner.feed(File.StringHandle(results), self._consumer) 1854 except SyntaxError, msg: 1855 # if we have a bad_report_file, save the info to it first 1856 if self._bad_report_handle: 1857 # send the info to the error handle 1858 self._bad_report_handle.write(results) 1859 1860 # now we want to try and diagnose the error 1861 self._diagnose_error( 1862 File.StringHandle(results), self._consumer.data) 1863 1864 # if we got here we can't figure out the problem 1865 # so we should pass along the syntax error we got 1866 raise 1867 return self._consumer.data
1868
1869 - def _diagnose_error(self, handle, data_record):
1870 """Attempt to diagnose an error in the passed handle. 1871 1872 Arguments: 1873 o handle - The handle potentially containing the error 1874 o data_record - The data record partially created by the consumer. 1875 """ 1876 line = handle.readline() 1877 1878 while line: 1879 # 'Searchingdone' instead of 'Searching......done' seems 1880 # to indicate a failure to perform the BLAST due to 1881 # low quality sequence 1882 if line.startswith('Searchingdone'): 1883 raise LowQualityBlastError("Blast failure occured on query: ", 1884 data_record.query) 1885 line = handle.readline()
1886