Biskit.Statistics.Density

1 ## 2 ## Biskit, a toolkit for the manipulation of macromolecular structures 3 ## Copyright (c) 2004 Wolfgang Rieping & Michael Habeck 4 ## Copyright (c) 2005 Raik Gruenberg 5 ## 6 ## This program is free software; you can redistribute it and/or 7 ## modify it under the terms of the GNU General Public License as 8 ## published by the Free Software Foundation; either version 2 of the 9 ## License, or any later version. 10 ## 11 ## This program is distributed in the hope that it will be useful, 12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 ## General Public License for more details. 15 ## 16 ## You find a copy of the GNU General Public License in the file 17 ## license.txt along with this program; if not, write to the Free 18 ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 20 ## Contributions: Wolfgang Rieping, Michael Habeck, Raik Gruenberg 21 ## last $Author: leckner $ 22 ## last $Date: 2006/09/13 10:28:14 $ 23 ## $Revision: 2.5 $ 24 25 """ 26 Analyze a density distribution of values. 27 """ 28 29 import Numeric as N 30 import math 31 import Biskit.hist as H 32

33 -class Density:

34 """ 35 Analyze a density distribution of values. 36 Can be created from a list of samples or from a discrete distribution:: 37 38 Density( values = [ float ] ) 39 or 40 Density( p = array(2xN,'f') ) 41 """ 42

43 - def __init__(self, p = None, values = None, bins = 20):

44 """ 45 @param p: discrete distribution, array (2 x len(data) ) with 46 start of bin and witdh of bin (default: None) 47 @type p: array 48 @param values: list of samples (default: None) 49 @type values: [float] 50 @param bins: number of bins (default: 20) 51 @type bins: int 52 """ 53 if p is None and values is None: 54 raise ValueError, 'Either a discrete distribution or ' + \ 55 'a list of samples must be specified' 56 57 if values is not None: 58 59 p = H.density(values, bins, steps = 0) 60 61 self.store(p) 62 63 self.verbose = 0

64 65

66 - def store(self, val):

67 """ 68 Analyze distribution data. 69 70 @param val: array (2 x len(data) ) with start of bin and witdh 71 of bin (default: None) 72 @type val: array 73 """ 74 self.val = N.array(val, N.Float) 75 self.x = self.val[:,0] 76 self.p = self.val[:,1] 77 78 self.delta_x = abs(self.x[0] - self.x[1]) 79 80 Z = N.sum(self.p) * self.delta_x 81 82 self.p /= Z

83 84

85 - def get(self):

86 return self.val

87 88

89 - def __getslice__(self, *args, **kw):

90 from operator import getslice 91 return getslice(self.val, *args, **kw)

92 93

94 - def __getitem__(self, *args, **kw):

95 from operator import getitem 96 return getitem(self.val, *args, **kw)

97 98

99 - def confidenceInterval(self, level):

100 """ 101 confidenceInterval(self, level) 102 103 @param level: confidence level (e.g. 0.68 for stdev interval) 104 @type level: float 105 106 @return: start and end of the confidence interval 107 containing |level|*100 % of the probability 108 @rtype: float, float 109 """ 110 order = N.argsort(self.p).tolist() 111 cumulative = N.add.accumulate(N.take(self.p, order)) * self.delta_x 112 113 ind = N.nonzero(N.greater_equal(cumulative, 1. - level)) 114 115 sub_set = order[ind[0]:] 116 117 intervals = self.__find_intervals(sub_set) 118 119 boundaries = [(self.x[i[0]], self.x[i[-1]]) for i in intervals] 120 121 return tuple(boundaries)

122 123

124 - def findConfidenceInterval(self, x):

125 """ 126 findConfidenceInterval(self, x) 127 Find the smallest possible density interval that still includes x. 128 129 @param x: value 130 @type x: float 131 132 @return: convidence level, interval start and end 133 @rtype: float, (float,float) 134 """ 135 closest = N.argmin(abs(self.x - x)) 136 137 ind = N.nonzero(N.greater_equal(self.p, self.p[closest])).tolist() 138 139 intervals = self.__find_intervals(ind) 140 141 lens = N.array([len(i) for i in intervals]) 142 levels = [N.sum(N.take(self.p, i)) for i in intervals] 143 level = N.sum(levels) * self.delta_x 144 145 boundaries = [(self.x[i[0]], self.x[i[-1]]) for i in intervals] 146 147 return level, tuple(boundaries)

148 149

150 - def median(self):

151 """ 152 Median of distribution. 153 """ 154 cum = N.add.accumulate(self.p) * self.delta_x 155 index = N.argmin(abs(cum - 0.5)) 156 157 return self.x[index]

158 159

160 - def average(self):

161 """ 162 Average of distribution. 163 """ 164 return self.delta_x * N.sum(self.p * self.x)

165 166

167 - def max(self):

168 """ 169 Max height of distribution. 170 """ 171 index = N.argmax(self.p) 172 return self.x[index]

173 174

175 - def __find_intervals(self, l):

176 l = N.array(l) 177 l = N.take(l, N.argsort(l)) 178 179 break_points = N.nonzero(N.greater(l[1:] - l[:-1], 1)) 180 181 start = 0 182 intervals = [] 183 184 for i in range(len(break_points)): 185 index = break_points[i] 186 intervals.append(tuple(N.take(l, range(start, index + 1)))) 187 start = index + 1 188 189 intervals.append(tuple(l[start:])) 190 191 return intervals

192 193

194 -def p_lognormal(x, alpha, beta):

195 """ 196 Get the probability of x from a lognormal density distribution that 197 is described by two parameters alpha and beta. Alpha and beta are not 198 the usual mean and standard dev of the lognormal distribution itself but 199 are the mean and stdev of the distribution after log-transformation. 200 The two parameters can hence be calculated from n sample values v:: 201 202 alpha = 1/n N.sum( ln(vi) ) 203 beta = N.sqrt( 1/(n-1) N.sum( ln(vi) - alpha )^2 ) 204 205 @param x: value 206 @type x: float 207 @param alpha: mean of the log-transformed random variable 208 @type alpha: float 209 @param beta: stdev of the log-transformed random variable 210 @type beta: float 211 212 @return: probability of x 213 @rtype: float 214 """ 215 return 1. / math.sqrt(2. * math.pi) / beta / x * \ 216 math.exp(-0.5 / beta ** 2 * (math.log(x) - alpha) ** 2)

217 218

219 -def logConfidence( x, R, clip=1e-32 ):

220 """ 221 Estimate the probability of x NOT beeing a random observation from a 222 lognormal distribution that is described by a set of random values. 223 The exact solution to this problem is in L{Biskit.Statistics.lognormal}. 224 225 @param x: observed value 226 @type x: float 227 @param R: sample of random values; 0 -> don't clip (default: 1e-32) 228 @type R: [float] 229 @param clip: clip zeros at this value 230 @type clip: float 231 232 @return: confidence that x is not random, mean of random distrib. 233 @rtype: (float, float) 234 """ 235 if clip and 0 in R: 236 R = N.clip( R, clip, max( R ) ) 237 ## get mean and stdv of log-transformed random sample 238 mean = N.average( N.log( R ) ) 239 240 n = len( R ) 241 242 stdv = N.sqrt(N.sum(N.power(N.log( R ) - mean, 2)) / (n - 1.)) 243 244 ## create dense lognormal distribution representing the random sample 245 stop = max( R ) * 50.0 246 step = stop / 100000 247 start = step / 10.0 248 249 X = [(v, p_lognormal(v, mean, stdv) ) for v in N.arange(start, stop, step)] 250 251 ## analyse distribution 252 d = Density( X ) 253 254 return d.findConfidenceInterval( x * 1.0 )[0], d.average()

255 256 257 ############# 258 ## TESTING 259 ############# 260

261 -class Test:

262 """ 263 Test class 264 """ 265

266 - def run( self, local=0 ):

267 """ 268 run function test 269 270 @param local: transfer local variables to global and perform 271 other tasks only when run locally 272 @type local: 1|0 273 274 @return: 1 275 @rtype: int 276 """ 277 import random 278 279 ## a proper lognormal density distribution the log of which has mean 1.0 280 ## and stdev 0.5 281 X = [ (x, p_lognormal(x, 1.0, 0.5)) for x in N.arange(0.00001, 50, 0.001)] 282 283 for i in range( 5 ): 284 ## Some random values drawn from the same lognormal distribution 285 286 alpha = 2. 287 beta = 0.6 288 289 R = [ random.lognormvariate( alpha, beta ) for i in range( 10000 ) ] 290 291 if local: print logConfidence( 6.0, R )[0]#, area(6.0, alpha, beta) 292 293 if local: 294 globals().update( locals() ) 295 296 return 1

297 298

299 - def expected_result( self ):

300 """ 301 Precalculated result to check for consistent performance. 302 303 @return: 1 304 @rtype: int 305 """ 306 return 1

307 308 309 if __name__ == '__main__': 310 311 test = Test() 312 313 assert test.run( local=1 ) == test.expected_result() 314

Source Code for Module Biskit.Statistics.Density