Package Biskit :: Package Mod :: Module CheckIdentities
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.CheckIdentities

  1  ## 
  2  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  3  ## Copyright (C) 2004-2005 Raik Gruenberg & Johan Leckner 
  4  ## 
  5  ## This program is free software; you can redistribute it and/or 
  6  ## modify it under the terms of the GNU General Public License as 
  7  ## published by the Free Software Foundation; either version 2 of the 
  8  ## License, or any later version. 
  9  ## 
 10  ## This program is distributed in the hope that it will be useful, 
 11  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 13  ## General Public License for more details. 
 14  ## 
 15  ## You find a copy of the GNU General Public License in the file 
 16  ## license.txt along with this program; if not, write to the Free 
 17  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 18   
 19  ## Contributions: Olivier PERIN 
 20  ## last $Author: leckner $ 
 21  ## last $Date: 2006/09/13 10:28:14 $ 
 22  ## $Revision: 2.6 $ 
 23   
 24  """ 
 25  Check sequence identity between templates. 
 26  """ 
 27   
 28  import os 
 29  import re 
 30  import linecache 
 31  import Numeric as N 
 32   
 33  import Biskit.tools as T 
 34   
 35  from Biskit import StdLog, EHandler 
 36   
 37   
38 -class Check_Identities:
39 """ 40 This class regroups the different methods to prevent 41 the use of templates with a too high percentage of identities 42 """ 43 # standard directory for input 44 F_INPUT_FOLDER = '/t_coffee' 45 46 # Standard Input Files 47 # t_coffee final alignment 48 F_INPUT_ALNS= F_INPUT_FOLDER +'/final.pir_aln' 49 50 # Standard Ouput Files 51 F_OUTPUT_IDENTITIES = '/identities.out' 52 F_OUTPUT_IDENTITIES_INF = '/identities_info.out' 53 F_OUTPUT_IDENTITIES_COV = '/identities_cov.out' 54 55
56 - def __init__(self, outFolder='.', verbose=1):
57 """ 58 @param outFolder: base folder 59 @type outFolder: str 60 @param verbose: write intermediary files (default: 0) 61 @type verbose: 1|0 62 """ 63 self.outFolder = T.absfile( outFolder ) 64 self.verbose = verbose 65 self.sequences_name=["target"]
66 67
68 - def get_lines(self, aln_file = None):
69 """ 70 Retrieve the lines from an aln file 71 72 @param aln_file: aln file (default: None -> L{F_INPUT_ALNS}) 73 @type aln_file: str 74 75 @return: file as list of strings 76 @rtype: [str] 77 """ 78 aln_file = aln_file or self.outFolder + self.F_INPUT_ALNS 79 file = open('%s'%aln_file, 'r') 80 string_lines = file.readlines() 81 file.close() 82 83 return string_lines
84 85
86 - def search_length(self, string_lines):
87 """ 88 This function returns the length an alignment in the 89 file 'final.pir_aln' 90 91 @param string_lines: aln file as list of string 92 @type string_lines: [str] 93 94 @return: length of alignment 95 @rtype: int 96 """ 97 end_of_sequence=1 98 99 for x in string_lines: 100 if(x == '*\n'): 101 endof_1st_sequence = end_of_sequence 102 break 103 104 end_of_sequence+=1 105 106 return end_of_sequence - 3
107 108
109 - def get_aln_sequences(self, string_lines, aln_length):
110 """ 111 Create a dictionary with the name of the target (i.e 'target') 112 and sequence from the final output from T-Coffee (final.pir_aln). 113 114 @param string_lines: aln file as list of string 115 @type string_lines: [str] 116 @param aln_length: length of alignment 117 @type aln_length: int 118 119 @return: alignment dictionary 120 e.g. {'name':'target, 'seq': 'sequence of the target'} 121 @rtype: dict 122 """ 123 aln_dictionary = {} 124 target_sequence = "" 125 126 y=0 127 for x in string_lines: 128 if(x == '>P1;target\n'): 129 break 130 y+=1 131 132 for s in string_lines[y+2:y+2+aln_length]: 133 target_sequence += s[:-1] 134 135 aln_dictionary["target"] = {'name':'target','seq':target_sequence} 136 137 return aln_dictionary
138 139
140 - def get_aln_templates(self, string_lines, aln_dict, aln_length):
141 """ 142 Add information about the name of the template sequences and 143 the sequence that is aligned to the template. Data taken from 144 the T-Coffee alignment (final.pir_aln). 145 146 @param string_lines: aln file as list of string 147 @type string_lines: [str] 148 @param aln_dict: alignment dictionary 149 @type aln_dict: dict 150 151 @return: template alignment dictionary 152 e.g. { str :{'name':str, 'seq': str} } 153 @rtype: dict 154 """ 155 r1 = re.compile(r'>P1;\d') 156 157 z = 0 158 for string in string_lines[:]: 159 if(r1.match(string)): 160 161 pdb_name ="" 162 for w in string[4:]: 163 164 for letters in w: 165 if(letters!='\n'): 166 pdb_name += w 167 else: break 168 169 template_sequence = "" 170 for lines in string_lines[z+2:z+2+aln_length]: 171 template_sequence += lines[:-1] 172 173 self.sequences_name.append("%s"%pdb_name) 174 aln_dict["%s"%pdb_name] = {'name':'%s'%pdb_name, 175 'seq':template_sequence} 176 177 z+=1 178 179 return aln_dict
180 181
182 - def identities(self, aln_dictionary):
183 """ 184 Create a dictionary that contains information about all the 185 alignments in the aln_dictionar using pairwise comparisons. 186 187 @param aln_dictionary: alignment dictionary 188 @type aln_dictionary: dict 189 190 @return: a dictionary of dictionaries with the sequence name as the 191 top key. Each sub dictionary then has the keys: 192 - 'name' - str, sequence name 193 - 'seq' - str, sequence of 194 - 'template_info' - list of the same length as the 'key' 195 sequence excluding deletions. The number of sequences 196 in tha multiple alignmentthat contain information at 197 this position. 198 - 'ID' - dict, sequence identity in precent comparing the 199 'key' sequence all other sequences (excluding deletions) 200 - 'info_ID' - dict, same as 'ID' but compared to the template 201 sequence length (i.e excluding deletions and insertions 202 in the 'key' sequence ) 203 - 'cov_ID' - dict, same as 'info_ID' but insertions are defined 204 comparing to all template sequences (i.e where 205 'template_info' is zero ) 206 @rtype: dict 207 """ 208 ## loop over all sequences in alignment 209 for i in self.sequences_name: 210 template_names = [] 211 212 ## don't compare to self, remove current sequence 213 for name in self.sequences_name: 214 if(name is not i): 215 template_names.append(name) 216 217 ## loop over all sequences in alignment 218 info_ID, ID, cov_ID = {}, {}, {} 219 for y in self.sequences_name: 220 identity = 0 221 info_identity = 0 222 cov_identity = 0 223 nb_of_identities = 0 224 nb_of_template = 0 225 template_info = [] 226 nb_of_residues = 0 227 228 ## loop over the full length of the alignment 229 for w in range(len(aln_dictionary["target"]["seq"])): 230 231 ## skipp deletions 232 nb_of_info_res=0 233 if(aln_dictionary[i]["seq"][w] is not '-'): 234 nb_of_residues += 1 235 236 ## count identities 237 if(aln_dictionary[i]["seq"][w] == \ 238 aln_dictionary[y]["seq"][w]): 239 nb_of_identities += 1 240 241 ## length excluding insertions 242 if(aln_dictionary[y]["seq"][w] is not '-'): 243 nb_of_template += 1 244 245 ## loop over all sequences but self 246 for z in template_names: 247 ## count how many sequences contain alignment 248 ## information at this position 249 if(aln_dictionary[z]["seq"][w] is not '-'): 250 nb_of_info_res += 1 251 252 template_info.append(nb_of_info_res) 253 254 ## number of positions in which any other sequence 255 ## contains alignment information 256 nb_cov_res = N.sum( N.greater(template_info, 0) ) 257 258 ## calculate identities 259 info_ID[y] = 100. * nb_of_identities / nb_of_template 260 ID[y] = 100. * nb_of_identities / nb_of_residues 261 cov_ID[y] = 100. * nb_of_identities / nb_cov_res 262 263 aln_dictionary[i]["info_ID"] = info_ID 264 aln_dictionary[i]["ID"] = ID 265 aln_dictionary[i]["cov_ID"] = cov_ID 266 aln_dictionary[i]["template_info"] = template_info 267 268 return aln_dictionary
269 270
271 - def __writeId( self, name, dic, key, description ):
272 """ 273 Write an sequence identity matrix to file. 274 275 @param name: file name 276 @type name: str 277 @param dic: alignment dictionary 278 @type dic: dict 279 @param key: key in dictionary to write 280 @type key: key 281 @param description: description to go into file (first line) 282 @type description: str 283 """ 284 f = open( name, 'w' ) 285 f.write( description +'\n\n') 286 287 ## write header 288 f.write('%s'%(' '*5)) 289 for s in self.sequences_name: 290 f.write('%8s'%s) 291 f.write('\n') 292 293 ## write matrix 294 for s in self.sequences_name: 295 f.write('%5s'%s) 296 for t in self.sequences_name: 297 f.write("%8.2f"%dic[s][key][t]) 298 f.write('\n')
299 300
301 - def output_identities(self, aln_dictionary, identities_file = None, 302 identities_info_file = None, 303 identities_cov_file = None):
304 """ 305 Writes three files to disk with identity info about the current 306 multiple alignment. 307 308 @param aln_dictionary: alignment dictionary 309 @type aln_dictionary: dict 310 @param identities_file: name for file with sequence identity 311 in percent comparing a sequence to another 312 (excluding deletions in the first sequence) 313 (default: None -> L{F_OUTPUT_IDENTITIES}) 314 @type identities_file: str 315 @param identities_info_file: name for file with sequence identity in 316 percent comparing a sequence to another 317 (excluding deletions and insertions in 318 the first sequence) 319 (default: None -> L{F_OUTPUT_IDENTITIES_INF}) 320 @type identities_info_file: str 321 @param identities_cov_file: name for file with sequence identity in 322 percent comparing a sequence to another 323 (excluding deletions and insertions in 324 the first sequence but only when the 325 first sequence doesn't match any other 326 sequence in the multiple alignment) 327 (default: None -> L{F_OUTPUT_IDENTITIES_COV}) 328 @type identities_cov_file: str 329 """ 330 ## filenames to create 331 identities_file = identities_file or \ 332 self.outFolder + self.F_OUTPUT_IDENTITIES 333 identities_info_file = identities_info_file or \ 334 self.outFolder + self.F_OUTPUT_IDENTITIES_INF 335 identities_cov_file = identities_cov_file or \ 336 self.outFolder + self.F_OUTPUT_IDENTITIES_COV 337 338 head_ID = "Sequence identity in percent comparing a sequence to another \n(excluding deletions in the first sequence)" 339 340 head_info_ID ="Sequence identity in percent comparing a sequence to another \n(excluding deletions and insertions in the first sequence)" 341 342 head_conv_ID ="Sequence identity in percent comparing a sequence to another \n(excluding deletions and insertions in the first sequence but only \nwhen the first sequence doesn't match any other sequence in the \nmultiple alignment )" 343 344 self.__writeId( identities_file, aln_dictionary, 345 'ID' , head_ID ) 346 self.__writeId( identities_info_file, aln_dictionary, 347 'info_ID' , head_info_ID ) 348 self.__writeId( identities_cov_file, aln_dictionary, 349 'cov_ID', head_conv_ID )
350 351
352 - def go(self, output_folder = None):
353 """ 354 Perform sequence comparison. 355 356 @param output_folder: output folder 357 @type output_folder: str 358 """ 359 output_folder = output_folder or self.outFolder 360 361 string_lines = self.get_lines() 362 363 aln_length = self.search_length( string_lines ) 364 365 ## get information about the target sequence from the alignment 366 aln_dictionary = self.get_aln_sequences( string_lines, 367 aln_length ) 368 369 ## add information about the aligned templates from the alignment 370 aln_dictionary = self.get_aln_templates( string_lines, 371 aln_dictionary, 372 aln_length ) 373 374 aln_dictionary = self.identities( aln_dictionary ) 375 376 self.output_identities( aln_dictionary ) 377 378 return aln_dictionary
379 380 381 382 ############# 383 ## TESTING 384 ############# 385
386 -class Test:
387 """ 388 Test class 389 """ 390
391 - def run( self, local=0, validate_testRoot=0 ):
392 """ 393 run function test 394 395 @param local: transfer local variables to global and perform 396 other tasks only when run locally 397 @type local: 1|0 398 @param validate_testRoot: check a validation project residing in 399 the test root (default: 0) 400 @type validate_testRoot: 1|0 401 402 @return: 1 403 @rtype: int 404 """ 405 import tempfile 406 import shutil 407 408 if validate_testRoot: 409 self.m = Check_Identities( T.testRoot() + '/Mod/project') 410 self.m.go() 411 412 val_root = T.testRoot() + '/Mod/project/validation' 413 folders = os.listdir( val_root ) 414 415 for f in folders: 416 self.m = Check_Identities( outFolder = val_root + '/' + f ) 417 self.m.go() 418 419 if local: 420 globals().update( locals() ) 421 422 else: 423 ## collect the input files needed 424 outfolder = tempfile.mkdtemp( '_test_CheckIdentities' ) 425 os.mkdir( outfolder +'/t_coffee' ) 426 427 shutil.copy( T.testRoot() + '/Mod/project/t_coffee/final.pir_aln', 428 outfolder + '/t_coffee' ) 429 430 self.m = Check_Identities( outfolder ) 431 self.m.go() 432 433 if local: 434 globals().update( locals() ) 435 print """ 436 The result from the template comparison can be found in the three files %s, %s and %s that reside in the folder %s"""\ 437 %(self.m.F_OUTPUT_IDENTITIES[1:], 438 self.m.F_OUTPUT_IDENTITIES_INF[1:], 439 self.m.F_OUTPUT_IDENTITIES_COV[1:], 440 outfolder ) 441 442 ## cleanup 443 T.tryRemove( outfolder, tree=1 ) 444 445 return 1
446 447
448 - def expected_result( self ):
449 """ 450 Precalculated result to check for consistent performance. 451 452 @return: 1 453 @rtype: int 454 """ 455 return 1
456 457 458 if __name__ == '__main__': 459 460 test = Test() 461 462 assert test.run( local=1 ) == test.expected_result() 463