Package astLib :: Module astStats
[hide private]
[frames] | [no frames]

Source Code for Module astLib.astStats

   1  """module for performing statistical calculations. 
   2   
   3  (c) 2007-2012 Matt Hilton 
   4   
   5  (c) 2013-2014 Matt Hilton & Steven Boada 
   6   
   7  U{http://astlib.sourceforge.net} 
   8   
   9  This module (as you may notice) provides very few statistical routines. It 
  10  does, however, provide biweight (robust) estimators of location and scale, as 
  11  described in Beers et al. 1990 (AJ, 100, 32), in addition to a robust least 
  12  squares fitting routine that uses the biweight transform. 
  13   
  14  Some routines may fail if they are passed lists with few items and encounter a 
  15  `divide by zero' error. Where this occurs, the function will return None. An 
  16  error message will be printed to the console when this happens if 
  17  astStats.REPORT_ERRORS=True (the default). Testing if an astStats function 
  18  returns None can be used to handle errors in scripts. 
  19   
  20  For extensive statistics modules, the Python bindings for GNU R 
  21  (U{http://rpy.sourceforge.net}), or SciPy (U{http://www.scipy.org}) are 
  22  suggested. 
  23   
  24  """ 
  25   
  26  import math 
  27  import numpy 
  28  import collections 
  29   
  30  REPORT_ERRORS = True 
  31   
  32   
  33  #----------------------------------------------------------------------------- 
def mean(dataList):
    """Calculates the mean average of a list of numbers.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: mean average

    """
    # Delegate straight to numpy; accepts both plain lists and arrays.
    return numpy.asarray(dataList).mean()
44 45 46 #-----------------------------------------------------------------------------
def weightedMean(dataList):
    """Calculates the weighted mean average of a two dimensional list (value,
    weight) of numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format
    [value, weight]
    @rtype: float
    @return: weighted mean average

    """
    pairs = numpy.asarray(dataList)
    values = pairs[:, 0]
    weights = pairs[:, 1]
    return numpy.average(values, weights=weights)
63 64 65 #-----------------------------------------------------------------------------
def stdev(dataList):
    """Calculates the standard deviation of a list of numbers.

    Uses the numpy.std default convention (ddof=0, i.e. normalised by N).

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: standard deviation

    """
    return numpy.asarray(dataList).std()
76 77 78 #-----------------------------------------------------------------------------
def rms(dataList):
    """Calculates the root mean square of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: root mean square

    """
    # Mean of the squared values, then back to the original scale.
    meanOfSquares = numpy.mean([item ** 2 for item in dataList])
    return math.sqrt(meanOfSquares)
96 97 98 #-----------------------------------------------------------------------------
def weightedStdev(dataList):
    """Calculates the weighted (sample) standard deviation of a list of
    numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format
    [value, weight]
    @rtype: float
    @return: weighted standard deviation

    @note: Returns None if an error occurs.

    """

    dataList = numpy.asarray(dataList)
    # Fix: the guard used to test < 1, contradicting its own "< 2 items"
    # message (a spread needs at least two values).
    if dataList.shape[0] < 2:
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedStdev() : "
                  "dataList contains < 2 items. ")
        return None

    # Fix: previously this called weightedMean(..., weights=...), but
    # weightedMean() takes a single two-column argument, so every call
    # raised TypeError. Compute the weighted mean directly instead.
    listMean = numpy.average(dataList[:, 0], weights=dataList[:, 1])
    variance = numpy.average((dataList[:, 0] - listMean)**2,
                             weights=dataList[:, 1])

    return numpy.sqrt(variance)
125 126 127 #-----------------------------------------------------------------------------
def median(dataList):
    """Calculates the median of a list of numbers.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median average

    """
    # numpy handles both odd and even length input (interpolating for even).
    return numpy.median(numpy.asarray(dataList))
138 139 140 #-----------------------------------------------------------------------------
def modeEstimate(dataList):
    """Returns an estimate of the mode of a set of values by
    mode=(3*median)-(2*mean).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: estimate of mode average

    """
    # Pearson's empirical mode approximation.
    return 3.0 * numpy.median(dataList) - 2.0 * numpy.mean(dataList)
154 155 156 #-----------------------------------------------------------------------------
def MAD(dataList):
    """Calculates the Median Absolute Deviation of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median absolute deviation

    """
    # Convert up front so list input works with the vectorised subtraction.
    dataArr = numpy.asarray(dataList)
    med = numpy.median(dataArr)
    return numpy.median(numpy.fabs(dataArr - med))
183 184 185 #-----------------------------------------------------------------------------
def normalizdMAD(dataList):
    """ Calculates the normalized Median Absolute Deviation of a list of
    numbers which, for a Gaussian distribution, is related to the standard
    deviation by 1.4826.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: normalized median absolute deviation

    """
    # Median absolute deviation, scaled to the Gaussian-equivalent sigma.
    dataArr = numpy.asarray(dataList)
    med = numpy.median(dataArr)
    madValue = numpy.median(numpy.fabs(dataArr - med))
    return 1.4826 * madValue
199 200 201 #-----------------------------------------------------------------------------
def biweightLocation(dataList, tuningConstant=6.0):
    """Calculates the biweight location estimator (like a robust average) of a
    list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended.
    @rtype: float
    @return: biweight location

    @note: Returns None if an error occurs.

    """
    # Nothing to estimate on empty input.
    if not len(dataList):
        return numpy.nan

    C = tuningConstant
    listMedian = numpy.median(dataList)
    # Median absolute deviation (MAD helper inlined).
    dataArr = numpy.asarray(dataList)
    listMAD = numpy.median(numpy.fabs(dataArr - listMedian))

    if listMAD == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats: biweightLocation() : MAD() returned 0.")
        return None

    # u values of equation (5) in Beers et al. 1990.
    uValues = [(item - listMedian) / (C * listMAD) for item in dataList]

    numerator = 0.0
    denominator = 0.0
    for u, item in zip(uValues, dataList):
        # Only points with |u| <= 1 contribute.
        if math.fabs(u) <= 1.0:
            weight = (1.0 - (u * u)) * (1.0 - (u * u))
            numerator += (item - listMedian) * weight
            denominator += weight

    return listMedian + (numerator / denominator)
248 249 250 #-----------------------------------------------------------------------------
def biweightScale(dataList, tuningConstant=9.0):
    """Calculates the biweight scale estimator (like a robust standard
    deviation) of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant

    # Centre and robust spread (median/MAD helpers inlined).
    listMedian = numpy.median(dataList)
    listMAD = numpy.median(numpy.fabs(numpy.asarray(dataList) - listMedian))

    # |x - M| values.
    diffModuli = [math.fabs(item - listMedian) for item in dataList]

    try:
        uValues = [(item - listMedian) / (C * listMAD) for item in dataList]
    except ZeroDivisionError:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale(): divide by zero error.")
        return None

    numerator = 0.0    # equation (9) Beers et al. 1990
    denominator = 0.0
    valCount = 0       # only points with |u| <= 1 contribute

    for u, diff in zip(uValues, diffModuli):
        if math.fabs(u) <= 1.0:
            u2Term = 1.0 - math.pow(u, 2)
            numerator += math.pow(diff, 2) * math.pow(u2Term, 4)
            denominator += u2Term * (1.0 - (5.0 * math.pow(u, 2)))
            valCount += 1

    return math.pow(float(valCount), 0.5) * (
        math.sqrt(numerator) / math.fabs(denominator))
300 301 302 #-----------------------------------------------------------------------------
def biweightScale_test(dataList, tuningConstant=9.0):
    """Calculates the biweight scale estimator (like a robust standard
    deviation) of a list of numbers.

    Variant of L{biweightScale} with an alternative denominator
    normalisation, sqrt(D*(D-1)) instead of |D|.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant

    # Centre and robust spread (median/MAD helpers inlined).
    listMedian = numpy.median(dataList)
    listMAD = numpy.median(numpy.fabs(numpy.asarray(dataList) - listMedian))

    # |x - M| values.
    diffModuli = [math.fabs(item - listMedian) for item in dataList]

    try:
        uValues = [(item - listMedian) / (C * listMAD) for item in dataList]
    except ZeroDivisionError:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale(): divide by zero error.")
        return None

    numerator = 0.0    # equation (9) Beers et al. 1990
    denominator = 0.0
    valCount = 0       # only points with |u| <= 1 contribute

    for u, diff in zip(uValues, diffModuli):
        if math.fabs(u) <= 1.0:
            u2Term = 1.0 - math.pow(u, 2)
            numerator += math.pow(diff, 2) * math.pow(u2Term, 4)
            denominator += u2Term * (1.0 - (5.0 * math.pow(u, 2)))
            valCount += 1

    numerator = math.sqrt(numerator)
    denominator = math.sqrt(denominator * (denominator - 1.))

    return math.pow(float(valCount), 0.5) * (numerator / denominator)
353 354 355 #-----------------------------------------------------------------------------
def biweightClipped(dataList, tuningConstant, sigmaCut):
    """Iteratively calculates biweight location and scale, using sigma
    clipping, for a list of values. The calculation is performed on the first
    column of a multi-dimensional list; other columns are ignored.

    @type dataList: list
    @param dataList: input data
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: estimate of biweight location, scale, and list of non-clipped
    data, in the format {'biweightLocation', 'biweightScale', 'dataList'}

    @note: Returns None if an error occurs (including fewer than 6 input
    values).

    """

    # Extract the first column when rows are lists.
    clippedValues = []
    for row in dataList:
        if type(row) == list:
            clippedValues.append(row[0])
        else:
            clippedValues.append(row)

    # Fix: with <= 5 values the loop below never ran and cbi/sbi/clippedData
    # were referenced before assignment (NameError at the return statement).
    if len(clippedValues) <= 5:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightClipped(): "
                  "dataList contains <= 5 items.")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 5:

        cbi = biweightLocation(clippedValues, tuningConstant)
        sbi = biweightScale(clippedValues, tuningConstant)

        # check for either biweight routine falling over
        # happens when feed in lots of similar numbers
        # e.g. when bootstrapping with a small sample
        if cbi is None or sbi is None:
            if REPORT_ERRORS:
                print("ERROR: astStats.biweightClipped(): "
                      "divide by zero error.")
            return None

        # Re-clip against the current location/scale estimate.
        clippedValues = []
        clippedData = []
        for row in dataList:
            if type(row) == list:
                if row[0] > cbi - (sigmaCut * sbi) and \
                        row[0] < cbi + (sigmaCut * sbi):
                    clippedValues.append(row[0])
                    clippedData.append(row)
            else:
                if row > cbi - (sigmaCut * sbi) and \
                        row < cbi + (sigmaCut * sbi):
                    clippedValues.append(row)
                    clippedData.append(row)

        iterations = iterations + 1

    return {'biweightLocation': cbi,
            'biweightScale': sbi,
            'dataList': clippedData}
420 421 422 #-----------------------------------------------------------------------------
def biweightTransform(dataList, tuningConstant):
    """Calculates the biweight transform for a set of values. Useful for using
    as weights in robust line fitting.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for scale estimates
    @rtype: list
    @return: list of biweights

    """
    # Cutoff beyond which points receive zero weight.
    cutoff = tuningConstant * abs(numpy.median(dataList))

    biweights = []
    for item in dataList:
        if abs(item) < cutoff:
            scaled = item / cutoff
            term = 1.0 - (scaled * scaled)
            biweights.append([item, term * term])
        else:
            biweights.append([item, 0.0])

    return biweights
451 452 #----------------------------------------------------------------------------- 453 454
def gapperEstimator(dataList):
    """ Calculates the Gapper Estimator (like a robust standard deviation) on a
    list of numbers. Beers et al. 1990 recommends this for small number
    statistics as it is insensitive to outliers and more accurately reproduces
    the true dispersion of the system when compared to the canonical rms
    standard deviation. See Hou et al. 2009 for a comparison.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: The dispersion of the dataList

    """

    # make a copy to ensure no changes to original list
    dataList = numpy.copy(dataList)

    # sort (in place on the copy)
    dataList.sort()

    n = len(dataList)

    # gaps between neighboring sorted elements: g_i = x_(i+1) - x_(i)
    gaps = numpy.diff(dataList)

    # Weight the i-th gap by i*(n-i) for i = 1..n-1 (Beers et al. 1990).
    # Fix: the 0-based enumerate index was previously used directly, which
    # gave the first gap zero weight and shifted all the others.
    weightedGaps = [(i + 1) * (n - (i + 1)) * g for i, g in enumerate(gaps)]

    return math.sqrt(math.pi) / (n * (n - 1)) * sum(weightedGaps)
483 484 #----------------------------------------------------------------------------- 485 486
def OLSFit(dataList):
    """Performs an ordinary least squares fit on a two dimensional list of
    numbers. Minimum number of data points is 3.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [x,y]
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs. Parameter errors that cannot be
    computed are set to numpy.nan.

    """
    sumX = 0
    sumY = 0
    sumXY = 0
    sumXX = 0
    n = float(len(dataList))
    if n > 2:
        for item in dataList:
            sumX = sumX + item[0]
            sumY = sumY + item[1]
            sumXY = sumXY + (item[0] * item[1])
            sumXX = sumXX + (item[0] * item[0])
        # Closed-form OLS solution for slope and intercept.
        m = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX))
        c = ((sumXX * sumY) - (sumX * sumXY)) / ((n * sumXX) - (sumX * sumX))

        # Residual scatter about the fitted line.
        sumRes = 0
        for item in dataList:
            sumRes = sumRes + ((item[1] - (m * item[0]) - c) *
                               (item[1] - (m * item[0]) - c))

        sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

        # Fix: narrowed from bare except clauses. Only a zero divisor or a
        # negative sqrt argument are expected failure modes here.
        try:
            mSigma = (sigma *
                      math.sqrt(n)) / math.sqrt((n * sumXX) - (sumX * sumX))
        except (ZeroDivisionError, ValueError):
            mSigma = numpy.nan
        try:
            cSigma = (
                sigma *
                math.sqrt(sumXX)) / math.sqrt((n * sumXX) - (sumX * sumX))
        except (ZeroDivisionError, ValueError):
            cSigma = numpy.nan
    else:
        if REPORT_ERRORS:
            print("ERROR: astStats.OLSFit(): dataList contains < 3 items.")

        return None

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
543 544 545 #-----------------------------------------------------------------------------
def clippedMeanStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the sigma-clipped mean and stdev of a list of numbers.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMean', 'clippedStdev', 'numPoints'}

    @note: With fewer than 5 input points no clipping is performed and the
    returned statistics are numpy.nan.

    """

    # Copy so the caller's data is never modified.
    listCopy = numpy.array(list(dataList))

    # Fix: m, s were previously unbound (NameError) when the loop never ran.
    m = numpy.nan
    s = numpy.nan

    iterations = 0
    while iterations < maxIterations and len(listCopy) > 4:

        m = listCopy.mean()
        s = listCopy.std()

        # Keep points within sigmaCut standard deviations of the mean.
        # Fix: the old criterion compared |x| against |mean + sigmaCut*stdev|,
        # which mishandles negative values and data offset from zero.
        listCopy = listCopy[numpy.less(numpy.abs(listCopy - m),
                                       sigmaCut * s)]

        iterations = iterations + 1

    return {'clippedMean': m,
            'clippedStdev': s,
            'numPoints': listCopy.shape[0]}
579 580 581 #-----------------------------------------------------------------------------
def clippedMedianStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the sigma-clipped median and stdev of a list of numbers.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMedian', 'clippedStdev', 'numPoints'}

    @note: With fewer than 5 input points no clipping is performed and the
    returned statistics are numpy.nan.

    """

    # Copy so the caller's data is never modified.
    listCopy = numpy.array(list(dataList))

    # Fix: m, s were previously unbound (NameError) when the loop never ran.
    m = numpy.nan
    s = numpy.nan

    iterations = 0
    while iterations < maxIterations and len(listCopy) > 4:

        m = numpy.median(listCopy)
        s = listCopy.std()

        # Keep points within sigmaCut standard deviations of the median.
        # Fix: the old criterion compared |x| against |median + sigmaCut*stdev|,
        # which mishandles negative values and data offset from zero.
        listCopy = listCopy[numpy.less(numpy.abs(listCopy - m),
                                       sigmaCut * s)]
        iterations += 1

    return {'clippedMedian': m,
            'clippedStdev': s,
            'numPoints': listCopy.shape[0]}
613 614 615 #-----------------------------------------------------------------------------
def clippedWeightedLSFit(dataList, sigmaCut):
    """Performs a weighted least squares fit on a list of numbers with sigma
    clipping. Minimum number of data points is 5.

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y weight]
    @type sigmaCut: float
    @param sigmaCut: clipping (in units of each point's y error) about the
    fitted line
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError',
    'numDataPoints'}

    @note: Returns None if an error occurs (including fewer than 5 input
    points).

    """

    clippedValues = list(dataList)

    # Fix: with < 5 points the loop below never ran and fitResults was
    # referenced before assignment (NameError); report and bail out instead.
    if len(clippedValues) < 5:
        if REPORT_ERRORS:
            print("ERROR: astStats.clippedWeightedLSFit(): "
                  "dataList contains < 5 items.")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 4:

        fitResults = weightedLSFit(clippedValues, "errors")

        # weightedLSFit may return None on failure; guard both shapes.
        if fitResults is None or fitResults['slope'] is None:
            if REPORT_ERRORS:
                print("ERROR: astStats.clippedWeightedLSFit(): "
                      "divide by zero error.")
            return None

        # Trim points more than sigmaCut*sigma away from the fitted line
        clippedValues = []
        for row in dataList:
            fit = fitResults['slope'] * row[0] + fitResults['intercept']
            res = row[1] - fit
            if abs(res) / row[2] < sigmaCut:
                clippedValues.append(row)

        iterations = iterations + 1

    # store the number of values that made it through the clipping process
    fitResults['numDataPoints'] = len(clippedValues)

    return fitResults
665 666 667 #-----------------------------------------------------------------------------
def weightedLSFit(dataList, weightType):
    """Performs a weighted least squares fit on a three dimensional list of
    numbers [x, y, y error].

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y error]
    @type weightType: string
    @param weightType: if "errors", weights are calculated assuming the input
    data is in the format [x, y, error on y]; if "weights", the weights are
    assumed to be already calculated and stored in a fourth column [x, y, error
    on y, weight] (as used by e.g. L{astStats.biweightLSFit})
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """
    # NOTE(review): if weightType is neither "weights" nor "errors", the
    # return statement below references m/c/mSigma/cSigma unbound (NameError)
    # — confirm callers only ever pass these two strings.
    if weightType == "weights":
        # Weighted normal-equation sums (weights taken from column 3).
        sumW = 0
        sumWX = 0
        sumWY = 0
        sumWXY = 0
        sumWXX = 0
        n = float(len(dataList))
        if n > 4:
            for item in dataList:
                W = item[3]
                sumWX = sumWX + (W * item[0])
                sumWY = sumWY + (W * item[1])
                sumWXY = sumWXY + (W * item[0] * item[1])
                sumWXX = sumWXX + (W * item[0] * item[0])
                sumW = sumW + W
            #print sumW, sumWXX, sumWX

            # Slope from the weighted normal equations; a singular system
            # (e.g. all weights zero) raises ZeroDivisionError here.
            try:
                m = ((sumW * sumWXY) - (sumWX * sumWY)) / ((sumW * sumWXX) -
                                                           (sumWX * sumWX))
            except ZeroDivisionError:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): divide by zero error.")
                return None

            # Intercept from the same system.
            try:
                c = ((sumWXX * sumWY) - (sumWX * sumWXY)) / ((sumW * sumWXX) -
                                                             (sumWX * sumWX))
            except ZeroDivisionError:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): divide by zero error.")
                return None

            # Unweighted residual scatter about the fitted line.
            sumRes = 0
            for item in dataList:

                sumRes = sumRes + ((item[1] - (m * item[0]) - c) * (item[1] -
                                   (m * item[0]) - c))

            sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

            # Can get div0 errors here so check
            # When biweight fitting converges this shouldn't happen
            if (n * sumWXX) - (sumWX * sumWX) > 0.0:

                mSigma = (sigma * math.sqrt(n)) / math.sqrt((n * sumWXX) -
                                                            (sumWX * sumWX))

                cSigma = (sigma * math.sqrt(sumWXX)) / math.sqrt((n * sumWXX) -
                                                                 (sumWX * sumWX))

            else:

                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): "
                          "divide by zero error.")
                return None

        else:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit(): "
                      "dataList contains < 5 items.")
            return None

    elif weightType == "errors":
        # Inverse-variance weighting: each point weighted by 1/err^2.
        # NOTE(review): no guard against item[2] == 0 or a singular system
        # (delta == 0) — both raise ZeroDivisionError here, unlike the
        # "weights" branch which catches it. Confirm whether that asymmetry
        # is intentional.
        sumX = 0
        sumY = 0
        sumXY = 0
        sumXX = 0
        sumSigma = 0
        n = float(len(dataList))
        for item in dataList:
            sumX = sumX + (item[0] / (item[2] * item[2]))
            sumY = sumY + (item[1] / (item[2] * item[2]))
            sumXY = sumXY + ((item[0] * item[1]) / (item[2] * item[2]))
            sumXX = sumXX + ((item[0] * item[0]) / (item[2] * item[2]))
            sumSigma = sumSigma + (1.0 / (item[2] * item[2]))
        delta = (sumSigma * sumXX) - (sumX * sumX)
        m = ((sumSigma * sumXY) - (sumX * sumY)) / delta
        c = ((sumXX * sumY) - (sumX * sumXY)) / delta
        mSigma = math.sqrt(sumSigma / delta)
        cSigma = math.sqrt(sumXX / delta)

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
774 775 776 #-----------------------------------------------------------------------------
def biweightLSFit(dataList, tuningConstant, sigmaCut=None):
    """Performs a weighted least squares fit, where the weights used are the
    biweight transforms of the residuals to the previous best fit .i.e. the
    procedure is iterative, and converges very quickly (iterations is set to 10
    by default). Minimum number of data points is 10.

    This seems to give slightly different results to the equivalent R routine,
    so use at your own risk!

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y weight]
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply (set to None if not required)
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """

    # Shallow copy: the clipping below deletes rows, and must not mutate
    # the caller's list.
    dataCopy = []
    for row in dataList:
        dataCopy.append(row)

    # First perform unweighted fit, then calculate residuals
    # NOTE(review): if OLSFit() returns None (fewer than 3 points) the
    # subscripts below raise TypeError — confirm callers pre-check the size.
    results = OLSFit(dataCopy)
    for k in range(10):
        m = results['slope']
        c = results['intercept']
        # Residuals of each point with respect to the current fit.
        res = []
        for item in dataCopy:
            res.append((m * item[0] + c) - item[1])

        # With <= 5 residuals no refit happens; the last results stand.
        if len(res) > 5:
            # For clipping, trim away things >3 sigma
            # away from median
            if sigmaCut is not None:
                absRes = []
                for item in res:
                    absRes.append(abs(item))
                sigma = stdev(absRes)
                count = 0
                for item in absRes:
                    # len(dataCopy) > 2 keeps enough points for a fit.
                    if item > (sigmaCut * sigma) and len(dataCopy) > 2:
                        del dataCopy[count]
                        del res[count]

                        # Index of datalist gets out of
                        # sync with absRes as we delete
                        # items
                        count -= 1

                    count += 1

            # Biweight transform residuals
            weights = biweightTransform(res, tuningConstant)

            # Perform weighted fit, using biweight transforms
            # of residuals as weight
            wData = []
            for i in range(len(dataCopy)):
                wData.append([dataCopy[i][0], dataCopy[i][1], dataCopy[i][2],
                              weights[i][1]])

            results = weightedLSFit(wData, "weights")

    return results
850 851 852 #-----------------------------------------------------------------------------
def cumulativeBinner(data, binMin, binMax, binTotal):
    """Bins the input data cumulatively.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal
    totalItems = len(data)

    # For each bin record the fraction of items above its lower edge.
    bins = [0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        for item in data:
            if item > lowerEdge:
                bins[i] = bins[i] + 1.0 / totalItems

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), bins[i]]
            for i in range(binTotal)]
883 884 885 #-----------------------------------------------------------------------------
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    # Count items falling in each half-open interval (lower, upper].
    counts = [0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        for item in data:
            if lowerEdge < item <= upperEdge:
                counts[i] += 1

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), counts[i]]
            for i in range(binTotal)]
916 917 918 #-----------------------------------------------------------------------------
def weightedBinner(data, weights, binMin, binMax, binTotal):
    """Bins the input data; the recorded frequency is the sum of the weights
    of the items falling in each bin.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    # Accumulate weights falling in each half-open interval (lower, upper].
    sums = [0.0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        for item, weight in zip(data, weights):
            if lowerEdge < item <= upperEdge:
                sums[i] += weight

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), sums[i]]
            for i in range(binTotal)]
949 950 #----------------------------------------------------------------------------- 951 952
def bootstrap(data,
              statistic,
              resamples=1000,
              alpha=0.05,
              output='ci',
              **kwargs):
    """ Returns the bootstrap estimate of the confidence interval for the given
    statistic. The confidence interval is given by 100*(1-alpha). Passes a 1d
    array to the function, statistic. Any arguments needed by statistic are
    passed by **kwargs.

    @type data: list
    @param data: The data on which the given statistic is calculated
    @type statistic: function
    @param statistic: The statistic desired
    @type resamples: int
    @param resamples: The number of bootstrap resamplings
    @type alpha: float
    @param alpha: The confidence interval given by 100*(1-alpha), 95% default
    @param output: The format of the output. 'ci' gives the confidence
    interval, and 'errorbar' gives the length of the errorbar suitable for
    plotting with matplotlib.
    @type kwargs: Keywords
    @param kwargs: Arguments needed by the statistic function
    @rtype: tuple
    @return: (Lower Interval, Upper Interval)

    """

    # Resample with replacement: each row is one bootstrap realisation.
    resampled = numpy.random.choice(data,
                                    size=(resamples, len(data)),
                                    replace=True)
    stats = numpy.sort([statistic(row, **kwargs) for row in resampled])

    lowIndex = int((alpha / 2.0) * resamples)
    highIndex = int((1 - alpha / 2.0) * resamples)

    if output == 'ci':
        return (stats[lowIndex], stats[highIndex])
    elif output == 'errorbar':
        point = statistic(data, **kwargs)
        return numpy.abs(point - numpy.array([stats[lowIndex],
                                              stats[highIndex]]))
    else:
        raise ValueError("Output option {0} is not supported.".format(output))
995 #----------------------------------------------------------------------------- 996 997
def runningStatistic(x, y, statistic='mean', binNumber=10, **kwargs):
    """ Calculates the value given by statistic in bins of x. Useful for
    plotting a running mean value for a scatter plot, for example. This
    function allows the computation of the sum, mean, median, std, or other
    statistic of the values within each bin.

    NOTE: if the statistic is a callable function and there are empty data bins
    those bins will be skipped to keep the function from falling over.

    @type x: numpy array
    @param x: data over which the bins are calculated
    @type y: numpy array
    @param y: values for corresponding x values
    @type statistic: string or function
    @param statistic: The statistic to compute (default is 'mean'). Acceptable
    values are 'mean', 'median', 'sum', 'std', and callable function. Extra
    arguments are passed as kwargs.
    @type binNumber: int
    @param binNumber: The desired number of bins for the x data.
    @rtype: tuple
    @return: A tuple of two lists containing the left bin edges and the value
    of the statistic in each of the bins.

    """

    # Validate the statistic argument up front.
    # Fix: this used isinstance(statistic, collections.Callable), an alias
    # removed in Python 3.10; the callable() builtin is the supported spelling.
    if type(statistic) == str:
        if statistic not in ['mean', 'median', 'sum', 'std']:
            raise ValueError('unrecognized statistic "%s"' % statistic)
    elif callable(statistic):
        pass
    else:
        raise ValueError("statistic not understood")

    if not isinstance(x, numpy.ndarray):
        x = numpy.asarray(x)
    if not isinstance(y, numpy.ndarray):
        y = numpy.asarray(y)

    try:
        bins = numpy.linspace(x.min(), x.max(), binNumber)
        centers = (bins[:-1] + bins[1:]) / 2.
        index = numpy.digitize(x, bins)
    except TypeError:
        # binNumber was an array of bin edges rather than a count.
        bins = binNumber
        centers = (bins[:-1] + bins[1:]) / 2.
        index = numpy.digitize(x, binNumber)
        binNumber = len(binNumber)

    if statistic == 'mean':
        running = [numpy.mean(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'median':
        running = [numpy.median(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'sum':
        running = [numpy.sum(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'std':
        running = [numpy.std(y[index == k]) for k in range(1, binNumber)]
    elif callable(statistic):
        # Skip empty bins so user-supplied functions don't fall over.
        running = [statistic(y[index == k], **kwargs)
                   for k in range(1, binNumber) if not len(y[index == k]) == 0]
    return centers, running
1058 1059 #----------------------------------------------------------------------------- 1060 1061
def slice_sampler(px, N=1, x=None):
    """ Provides N samples from a user-defined discrete distribution.

    >>> slice_sampler(px, N=1, x=None)

    If x=None (default) or if len(x) != len(px), it will return an array of
    integers between 0 and len(px)-1. If x is given, it will return the
    samples from x according to the distribution px.

    Originally written by Adam Laiacano, 2011

    @type px: numpy array or list
    @param px: A discrete probability distribution
    @type N: int
    @param N: The number of samples to return, default is 1
    @type x: numpy array or list
    @param x: Optional array/list of observation values to return, where
    prob(x) = px
    @rtype: numpy.array
    @return: The desired number of samples drawn from the distribution

    """

    from random import sample

    # Fix: numpy.int was removed in NumPy 1.20; the builtin int is equivalent.
    values = numpy.zeros(N, dtype=int)
    samples = numpy.arange(len(px))
    px = numpy.array(px) / (1. * sum(px))
    u = numpy.random.uniform(0, max(px))
    for n in range(N):
        # Slice step: restrict to bins with probability >= current level u.
        included = px >= u
        choice = sample(list(range(numpy.sum(included))), 1)[0]
        values[n] = samples[included][choice]
        u = numpy.random.uniform(0, px[included][choice])

    # Fix: the x=None default previously crashed with TypeError at len(x);
    # only attempt the mapping when x was actually supplied.
    if x is not None:
        if len(x) == len(px):
            x = numpy.array(x)
            values = x[values]
        else:
            print("px and x are different lengths. "
                  "Returning index locations for px.")
    if N == 1:
        return values[0]
    return values
1106