Package astLib :: Module astStats
[hide private]
[frames] | [no frames]

Source Code for Module astLib.astStats

   1  """module for performing statistical calculations. 
   2   
   3  (c) 2007-2012 Matt Hilton 
   4   
   5  (c) 2013-2014 Matt Hilton & Steven Boada 
   6   
   7  U{http://astlib.sourceforge.net} 
   8   
   9  This module (as you may notice) provides very few statistical routines. It 
  10  does, however, provide biweight (robust) estimators of location and scale, as 
  11  described in Beers et al. 1990 (AJ, 100, 32), in addition to a robust least 
  12  squares fitting routine that uses the biweight transform. 
  13   
  14  Some routines may fail if they are passed lists with few items and encounter a 
  15  `divide by zero' error. Where this occurs, the function will return None. An 
  16  error message will be printed to the console when this happens if 
  17  astStats.REPORT_ERRORS=True (the default). Testing if an astStats function 
  18  returns None can be used to handle errors in scripts. 
  19   
  20  For extensive statistics modules, the Python bindings for GNU R 
  21  (U{http://rpy.sourceforge.net}), or SciPy (U{http://www.scipy.org}) are 
  22  suggested. 
  23   
  24  """ 
  25   
  26  import math 
  27  import numpy 
  28  import collections 
  29   
  30  REPORT_ERRORS = True 
  31   
  32   
  33  #----------------------------------------------------------------------------- 
def mean(dataList):
    """Calculates the mean average of a list of numbers.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: mean average

    """
    # Delegate straight to numpy; accepts both plain lists and arrays.
    return numpy.asarray(dataList).mean()
44 45 46 #-----------------------------------------------------------------------------
def weightedMean(dataList):
    """Calculates the weighted mean average of a two dimensional list (value,
    weight) of numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format
    [value, weight]
    @rtype: float
    @return: weighted mean average

    """
    pairs = numpy.asarray(dataList)
    values = pairs[:, 0]
    weights = pairs[:, 1]
    return numpy.average(values, weights=weights)
63 64 65 #-----------------------------------------------------------------------------
def stdev(dataList):
    """Calculates the standard deviation of a list of numbers.

    Uses the numpy.std default convention (ddof=0, i.e. normalised by N).

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: standard deviation

    """
    return numpy.asarray(dataList).std()
76 77 78 #-----------------------------------------------------------------------------
def rms(dataList):
    """Calculates the root mean square of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: root mean square

    """
    # Mean of the squared values, then back to the original scale.
    meanOfSquares = numpy.mean([item ** 2 for item in dataList])
    return math.sqrt(meanOfSquares)
96 97 98 #-----------------------------------------------------------------------------
def weightedStdev(dataList):
    """Calculates the weighted (sample) standard deviation of a list of
    numbers.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format
    [value, weight]
    @rtype: float
    @return: weighted standard deviation

    @note: Returns None if an error occurs.

    """

    dataList = numpy.asarray(dataList)
    # Fix: the guard used to test < 1, contradicting its own "< 2 items"
    # message (a spread needs at least two values).
    if dataList.shape[0] < 2:
        if REPORT_ERRORS:
            print("ERROR: astStats.weightedStdev() : "
                  "dataList contains < 2 items. ")
        return None

    # Fix: previously this called weightedMean(..., weights=...), but
    # weightedMean() takes a single two-column argument, so every call
    # raised TypeError. Compute the weighted mean directly instead.
    listMean = numpy.average(dataList[:, 0], weights=dataList[:, 1])
    variance = numpy.average((dataList[:, 0] - listMean)**2,
                             weights=dataList[:, 1])

    return numpy.sqrt(variance)
125 126 127 #-----------------------------------------------------------------------------
def median(dataList):
    """Calculates the median of a list of numbers.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median average

    """
    # numpy handles both odd and even length input (interpolating for even).
    return numpy.median(numpy.asarray(dataList))
138 139 140 #-----------------------------------------------------------------------------
def modeEstimate(dataList):
    """Returns an estimate of the mode of a set of values by
    mode=(3*median)-(2*mean).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: estimate of mode average

    """
    # Pearson's empirical mode approximation.
    return 3.0 * numpy.median(dataList) - 2.0 * numpy.mean(dataList)
154 155 156 #-----------------------------------------------------------------------------
def MAD(dataList):
    """Calculates the Median Absolute Deviation of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median absolute deviation

    """
    # Convert up front so list input works with the vectorised subtraction.
    dataArr = numpy.asarray(dataList)
    med = numpy.median(dataArr)
    return numpy.median(numpy.fabs(dataArr - med))
183 184 185 #-----------------------------------------------------------------------------
def normalizdMAD(dataList):
    """ Calculates the normalized Median Absolute Deviation of a list of
    numbers which, for a Gaussian distribution, is related to the standard
    deviation by 1.4826.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: normalized median absolute deviation

    """
    # Median absolute deviation, scaled to the Gaussian-equivalent sigma.
    dataArr = numpy.asarray(dataList)
    med = numpy.median(dataArr)
    madValue = numpy.median(numpy.fabs(dataArr - med))
    return 1.4826 * madValue
199 200 201 #-----------------------------------------------------------------------------
def biweightLocation(dataList, tuningConstant=6.0):
    """Calculates the biweight location estimator (like a robust average) of a
    list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended.
    @rtype: float
    @return: biweight location

    @note: Returns None if an error occurs.

    """
    # Nothing to estimate on empty input.
    if not len(dataList):
        return numpy.nan

    C = tuningConstant
    listMedian = numpy.median(dataList)
    # Median absolute deviation (MAD helper inlined).
    dataArr = numpy.asarray(dataList)
    listMAD = numpy.median(numpy.fabs(dataArr - listMedian))

    if listMAD == 0:
        if REPORT_ERRORS:
            print("ERROR: astStats: biweightLocation() : MAD() returned 0.")
        return None

    # u values of equation (5) in Beers et al. 1990.
    uValues = [(item - listMedian) / (C * listMAD) for item in dataList]

    numerator = 0.0
    denominator = 0.0
    for u, item in zip(uValues, dataList):
        # Only points with |u| <= 1 contribute.
        if math.fabs(u) <= 1.0:
            weight = (1.0 - (u * u)) * (1.0 - (u * u))
            numerator += (item - listMedian) * weight
            denominator += weight

    return listMedian + (numerator / denominator)
248 249 250 #-----------------------------------------------------------------------------
def biweightScale(dataList, tuningConstant=9.0):
    """Calculates the biweight scale estimator (like a robust standard
    deviation) of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant

    # Centre and robust spread (median/MAD helpers inlined).
    listMedian = numpy.median(dataList)
    listMAD = numpy.median(numpy.fabs(numpy.asarray(dataList) - listMedian))

    # |x - M| values.
    diffModuli = [math.fabs(item - listMedian) for item in dataList]

    try:
        uValues = [(item - listMedian) / (C * listMAD) for item in dataList]
    except ZeroDivisionError:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale(): divide by zero error.")
        return None

    numerator = 0.0    # equation (9) Beers et al. 1990
    denominator = 0.0
    valCount = 0       # only points with |u| <= 1 contribute

    for u, diff in zip(uValues, diffModuli):
        if math.fabs(u) <= 1.0:
            u2Term = 1.0 - math.pow(u, 2)
            numerator += math.pow(diff, 2) * math.pow(u2Term, 4)
            denominator += u2Term * (1.0 - (5.0 * math.pow(u, 2)))
            valCount += 1

    return math.pow(float(valCount), 0.5) * (
        math.sqrt(numerator) / math.fabs(denominator))
300 301 302 #-----------------------------------------------------------------------------
def biweightScale_test(dataList, tuningConstant=9.0):
    """Calculates the biweight scale estimator (like a robust standard
    deviation) of a list of numbers.

    Variant of L{biweightScale} with an alternative denominator
    normalisation, sqrt(D*(D-1)) instead of |D|.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs.

    """
    C = tuningConstant

    # Centre and robust spread (median/MAD helpers inlined).
    listMedian = numpy.median(dataList)
    listMAD = numpy.median(numpy.fabs(numpy.asarray(dataList) - listMedian))

    # |x - M| values.
    diffModuli = [math.fabs(item - listMedian) for item in dataList]

    try:
        uValues = [(item - listMedian) / (C * listMAD) for item in dataList]
    except ZeroDivisionError:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightScale(): divide by zero error.")
        return None

    numerator = 0.0    # equation (9) Beers et al. 1990
    denominator = 0.0
    valCount = 0       # only points with |u| <= 1 contribute

    for u, diff in zip(uValues, diffModuli):
        if math.fabs(u) <= 1.0:
            u2Term = 1.0 - math.pow(u, 2)
            numerator += math.pow(diff, 2) * math.pow(u2Term, 4)
            denominator += u2Term * (1.0 - (5.0 * math.pow(u, 2)))
            valCount += 1

    numerator = math.sqrt(numerator)
    denominator = math.sqrt(denominator * (denominator - 1.))

    return math.pow(float(valCount), 0.5) * (numerator / denominator)
353 354 355 #-----------------------------------------------------------------------------
def biweightClipped(dataList, tuningConstant, sigmaCut):
    """Iteratively calculates biweight location and scale, using sigma
    clipping, for a list of values. The calculation is performed on the first
    column of a multi-dimensional list; other columns are ignored.

    @type dataList: list
    @param dataList: input data
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: estimate of biweight location, scale, and list of non-clipped
    data, in the format {'biweightLocation', 'biweightScale', 'dataList'}

    @note: Returns None if an error occurs (including fewer than 6 input
    values).

    """

    # Extract the first column when rows are lists.
    clippedValues = []
    for row in dataList:
        if type(row) == list:
            clippedValues.append(row[0])
        else:
            clippedValues.append(row)

    # Fix: with <= 5 values the loop below never ran and cbi/sbi/clippedData
    # were referenced before assignment (NameError at the return statement).
    if len(clippedValues) <= 5:
        if REPORT_ERRORS:
            print("ERROR: astStats.biweightClipped(): "
                  "dataList contains <= 5 items.")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 5:

        cbi = biweightLocation(clippedValues, tuningConstant)
        sbi = biweightScale(clippedValues, tuningConstant)

        # check for either biweight routine falling over
        # happens when feed in lots of similar numbers
        # e.g. when bootstrapping with a small sample
        if cbi is None or sbi is None:
            if REPORT_ERRORS:
                print("ERROR: astStats.biweightClipped(): "
                      "divide by zero error.")
            return None

        # Re-clip against the current location/scale estimate.
        clippedValues = []
        clippedData = []
        for row in dataList:
            if type(row) == list:
                if row[0] > cbi - (sigmaCut * sbi) and \
                        row[0] < cbi + (sigmaCut * sbi):
                    clippedValues.append(row[0])
                    clippedData.append(row)
            else:
                if row > cbi - (sigmaCut * sbi) and \
                        row < cbi + (sigmaCut * sbi):
                    clippedValues.append(row)
                    clippedData.append(row)

        iterations = iterations + 1

    return {'biweightLocation': cbi,
            'biweightScale': sbi,
            'dataList': clippedData}
420 421 422 #-----------------------------------------------------------------------------
def biweightTransform(dataList, tuningConstant):
    """Calculates the biweight transform for a set of values. Useful for using
    as weights in robust line fitting.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for scale estimates
    @rtype: list
    @return: list of biweights

    """
    # Cutoff beyond which points receive zero weight.
    cutoff = tuningConstant * abs(numpy.median(dataList))

    biweights = []
    for item in dataList:
        if abs(item) < cutoff:
            scaled = item / cutoff
            term = 1.0 - (scaled * scaled)
            biweights.append([item, term * term])
        else:
            biweights.append([item, 0.0])

    return biweights
451 452 #----------------------------------------------------------------------------- 453 454
def gapperEstimator(dataList):
    """ Calculates the Gapper Estimator (like a robust standard deviation) on a
    list of numbers. Beers et al. 1990 recommends this for small number
    statistics as it is insensitive to outliers and more accurately reproduces
    the true dispersion of the system when compared to the canonical rms
    standard deviation. See Hou et al. 2009 for a comparison.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: The dispersion of the dataList

    """

    # make a copy to ensure no changes to original list
    dataList = numpy.copy(dataList)

    # sort (in place on the copy)
    dataList.sort()

    n = len(dataList)

    # gaps between neighboring sorted elements: g_i = x_(i+1) - x_(i)
    gaps = numpy.diff(dataList)

    # Weight the i-th gap by i*(n-i) for i = 1..n-1 (Beers et al. 1990).
    # Fix: the 0-based enumerate index was previously used directly, which
    # gave the first gap zero weight and shifted all the others.
    weightedGaps = [(i + 1) * (n - (i + 1)) * g for i, g in enumerate(gaps)]

    return math.sqrt(math.pi) / (n * (n - 1)) * sum(weightedGaps)
483 484 #----------------------------------------------------------------------------- 485 486
def OLSFit(dataList):
    """Performs an ordinary least squares fit on a two dimensional list of
    numbers. Minimum number of data points is 3.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [x,y]
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs. Parameter errors that cannot be
    computed are set to numpy.nan.

    """
    sumX = 0
    sumY = 0
    sumXY = 0
    sumXX = 0
    n = float(len(dataList))
    if n > 2:
        for item in dataList:
            sumX = sumX + item[0]
            sumY = sumY + item[1]
            sumXY = sumXY + (item[0] * item[1])
            sumXX = sumXX + (item[0] * item[0])
        # Closed-form OLS solution for slope and intercept.
        m = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX))
        c = ((sumXX * sumY) - (sumX * sumXY)) / ((n * sumXX) - (sumX * sumX))

        # Residual scatter about the fitted line.
        sumRes = 0
        for item in dataList:
            sumRes = sumRes + ((item[1] - (m * item[0]) - c) *
                               (item[1] - (m * item[0]) - c))

        sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

        # Fix: narrowed from bare except clauses. Only a zero divisor or a
        # negative sqrt argument are expected failure modes here.
        try:
            mSigma = (sigma *
                      math.sqrt(n)) / math.sqrt((n * sumXX) - (sumX * sumX))
        except (ZeroDivisionError, ValueError):
            mSigma = numpy.nan
        try:
            cSigma = (
                sigma *
                math.sqrt(sumXX)) / math.sqrt((n * sumXX) - (sumX * sumX))
        except (ZeroDivisionError, ValueError):
            cSigma = numpy.nan
    else:
        if REPORT_ERRORS:
            print("ERROR: astStats.OLSFit(): dataList contains < 3 items.")

        return None

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
543 544 545 #-----------------------------------------------------------------------------
def clippedMeanStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the sigma-clipped mean and stdev of a list of numbers.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMean', 'clippedStdev', 'numPoints'}

    @note: With fewer than 5 input points no clipping is performed and the
    returned statistics are numpy.nan.

    """

    # Copy so the caller's data is never modified.
    listCopy = numpy.array(list(dataList))

    # Fix: m, s were previously unbound (NameError) when the loop never ran.
    m = numpy.nan
    s = numpy.nan

    iterations = 0
    while iterations < maxIterations and len(listCopy) > 4:

        m = listCopy.mean()
        s = listCopy.std()

        # Keep points within sigmaCut standard deviations of the mean.
        # Fix: the old criterion compared |x| against |mean + sigmaCut*stdev|,
        # which mishandles negative values and data offset from zero.
        listCopy = listCopy[numpy.less(numpy.abs(listCopy - m),
                                       sigmaCut * s)]

        iterations = iterations + 1

    return {'clippedMean': m,
            'clippedStdev': s,
            'numPoints': listCopy.shape[0]}
579 580 581 #-----------------------------------------------------------------------------
def clippedMedianStdev(dataList, sigmaCut=3.0, maxIterations=10.0):
    """Calculates the sigma-clipped median and stdev of a list of numbers.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMedian', 'clippedStdev', 'numPoints'}

    @note: With fewer than 5 input points no clipping is performed and the
    returned statistics are numpy.nan.

    """

    # Copy so the caller's data is never modified.
    listCopy = numpy.array(list(dataList))

    # Fix: m, s were previously unbound (NameError) when the loop never ran.
    m = numpy.nan
    s = numpy.nan

    iterations = 0
    while iterations < maxIterations and len(listCopy) > 4:

        m = numpy.median(listCopy)
        s = listCopy.std()

        # Keep points within sigmaCut standard deviations of the median.
        # Fix: the old criterion compared |x| against |median + sigmaCut*stdev|,
        # which mishandles negative values and data offset from zero.
        listCopy = listCopy[numpy.less(numpy.abs(listCopy - m),
                                       sigmaCut * s)]
        iterations += 1

    return {'clippedMedian': m,
            'clippedStdev': s,
            'numPoints': listCopy.shape[0]}
613 614 615 #-----------------------------------------------------------------------------
def clippedWeightedLSFit(dataList, sigmaCut):
    """Performs a weighted least squares fit on a list of numbers with sigma
    clipping. Minimum number of data points is 5.

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y weight]
    @type sigmaCut: float
    @param sigmaCut: clipping (in units of each point's y error) about the
    fitted line
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError',
    'numDataPoints'}

    @note: Returns None if an error occurs (including fewer than 5 input
    points).

    """

    clippedValues = list(dataList)

    # Fix: with < 5 points the loop below never ran and fitResults was
    # referenced before assignment (NameError); report and bail out instead.
    if len(clippedValues) < 5:
        if REPORT_ERRORS:
            print("ERROR: astStats.clippedWeightedLSFit(): "
                  "dataList contains < 5 items.")
        return None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 4:

        fitResults = weightedLSFit(clippedValues, "errors")

        # weightedLSFit may return None on failure; guard both shapes.
        if fitResults is None or fitResults['slope'] is None:
            if REPORT_ERRORS:
                print("ERROR: astStats.clippedWeightedLSFit(): "
                      "divide by zero error.")
            return None

        # Trim points more than sigmaCut*sigma away from the fitted line
        clippedValues = []
        for row in dataList:
            fit = fitResults['slope'] * row[0] + fitResults['intercept']
            res = row[1] - fit
            if abs(res) / row[2] < sigmaCut:
                clippedValues.append(row)

        iterations = iterations + 1

    # store the number of values that made it through the clipping process
    fitResults['numDataPoints'] = len(clippedValues)

    return fitResults
665 666 667 #-----------------------------------------------------------------------------
def weightedLSFit(dataList, weightType):
    """Performs a weighted least squares fit on a three dimensional list of
    numbers [x, y, y error].

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y error]
    @type weightType: string
    @param weightType: if "errors", weights are calculated assuming the input
    data is in the format [x, y, error on y]; if "weights", the weights are
    assumed to be already calculated and stored in a fourth column [x, y, error
    on y, weight] (as used by e.g. L{astStats.biweightLSFit})
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """
    # NOTE(review): if weightType is neither "weights" nor "errors", the
    # return statement below references m/c/mSigma/cSigma unbound (NameError)
    # — confirm callers only ever pass these two strings.
    if weightType == "weights":
        # Weighted normal-equation sums (weights taken from column 3).
        sumW = 0
        sumWX = 0
        sumWY = 0
        sumWXY = 0
        sumWXX = 0
        n = float(len(dataList))
        if n > 4:
            for item in dataList:
                W = item[3]
                sumWX = sumWX + (W * item[0])
                sumWY = sumWY + (W * item[1])
                sumWXY = sumWXY + (W * item[0] * item[1])
                sumWXX = sumWXX + (W * item[0] * item[0])
                sumW = sumW + W
            #print sumW, sumWXX, sumWX

            # Slope from the weighted normal equations; a singular system
            # (e.g. all weights zero) raises ZeroDivisionError here.
            try:
                m = ((sumW * sumWXY) - (sumWX * sumWY)) / ((sumW * sumWXX) -
                                                           (sumWX * sumWX))
            except ZeroDivisionError:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): divide by zero error.")
                return None

            # Intercept from the same system.
            try:
                c = ((sumWXX * sumWY) - (sumWX * sumWXY)) / ((sumW * sumWXX) -
                                                             (sumWX * sumWX))
            except ZeroDivisionError:
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): divide by zero error.")
                return None

            # Unweighted residual scatter about the fitted line.
            sumRes = 0
            for item in dataList:

                sumRes = sumRes + ((item[1] - (m * item[0]) - c) * (item[1] -
                                   (m * item[0]) - c))

            sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

            # Can get div0 errors here so check
            # When biweight fitting converges this shouldn't happen
            if (n * sumWXX) - (sumWX * sumWX) > 0.0:

                mSigma = (sigma * math.sqrt(n)) / math.sqrt((n * sumWXX) -
                                                            (sumWX * sumWX))

                cSigma = (sigma * math.sqrt(sumWXX)) / math.sqrt((n * sumWXX) -
                                                                 (sumWX * sumWX))

            else:

                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit(): "
                          "divide by zero error.")
                return None

        else:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit(): "
                      "dataList contains < 5 items.")
            return None

    elif weightType == "errors":
        # Inverse-variance weighting: each point weighted by 1/err^2.
        # NOTE(review): no guard against item[2] == 0 or a singular system
        # (delta == 0) — both raise ZeroDivisionError here, unlike the
        # "weights" branch which catches it. Confirm whether that asymmetry
        # is intentional.
        sumX = 0
        sumY = 0
        sumXY = 0
        sumXX = 0
        sumSigma = 0
        n = float(len(dataList))
        for item in dataList:
            sumX = sumX + (item[0] / (item[2] * item[2]))
            sumY = sumY + (item[1] / (item[2] * item[2]))
            sumXY = sumXY + ((item[0] * item[1]) / (item[2] * item[2]))
            sumXX = sumXX + ((item[0] * item[0]) / (item[2] * item[2]))
            sumSigma = sumSigma + (1.0 / (item[2] * item[2]))
        delta = (sumSigma * sumXX) - (sumX * sumX)
        m = ((sumSigma * sumXY) - (sumX * sumY)) / delta
        c = ((sumXX * sumY) - (sumX * sumXY)) / delta
        mSigma = math.sqrt(sumSigma / delta)
        cSigma = math.sqrt(sumXX / delta)

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
774 775 776 #-----------------------------------------------------------------------------
def biweightLSFit(dataList, tuningConstant, sigmaCut=None):
    """Performs a weighted least squares fit, where the weights used are the
    biweight transforms of the residuals to the previous best fit .i.e. the
    procedure is iterative, and converges very quickly (iterations is set to 10
    by default). Minimum number of data points is 10.

    This seems to give slightly different results to the equivalent R routine,
    so use at your own risk!

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x,
    y, y weight]
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
    recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply (set to None if not required)
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the
    format {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs.

    """

    # Shallow copy: the clipping below deletes rows, and must not mutate
    # the caller's list.
    dataCopy = []
    for row in dataList:
        dataCopy.append(row)

    # First perform unweighted fit, then calculate residuals
    # NOTE(review): if OLSFit() returns None (fewer than 3 points) the
    # subscripts below raise TypeError — confirm callers pre-check the size.
    results = OLSFit(dataCopy)
    for k in range(10):
        m = results['slope']
        c = results['intercept']
        # Residuals of each point with respect to the current fit.
        res = []
        for item in dataCopy:
            res.append((m * item[0] + c) - item[1])

        # With <= 5 residuals no refit happens; the last results stand.
        if len(res) > 5:
            # For clipping, trim away things >3 sigma
            # away from median
            if sigmaCut is not None:
                absRes = []
                for item in res:
                    absRes.append(abs(item))
                sigma = stdev(absRes)
                count = 0
                for item in absRes:
                    # len(dataCopy) > 2 keeps enough points for a fit.
                    if item > (sigmaCut * sigma) and len(dataCopy) > 2:
                        del dataCopy[count]
                        del res[count]

                        # Index of datalist gets out of
                        # sync with absRes as we delete
                        # items
                        count -= 1

                    count += 1

            # Biweight transform residuals
            weights = biweightTransform(res, tuningConstant)

            # Perform weighted fit, using biweight transforms
            # of residuals as weight
            wData = []
            for i in range(len(dataCopy)):
                wData.append([dataCopy[i][0], dataCopy[i][1], dataCopy[i][2],
                              weights[i][1]])

            results = weightedLSFit(wData, "weights")

    return results
850 851 852 #-----------------------------------------------------------------------------
def cumulativeBinner(data, binMin, binMax, binTotal):
    """Bins the input data cumulatively.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal
    totalItems = len(data)

    # For each bin record the fraction of items above its lower edge.
    bins = [0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        for item in data:
            if item > lowerEdge:
                bins[i] = bins[i] + 1.0 / totalItems

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), bins[i]]
            for i in range(binTotal)]
883 884 885 #-----------------------------------------------------------------------------
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    # Count items falling in each half-open interval (lower, upper].
    counts = [0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        for item in data:
            if lowerEdge < item <= upperEdge:
                counts[i] += 1

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), counts[i]]
            for i in range(binTotal)]
916 917 918 #-----------------------------------------------------------------------------
def weightedBinner(data, weights, binMin, binMax, binTotal):
    """Bins the input data; the recorded frequency is the sum of the weights
    of the items falling in each bin.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    # Accumulate weights falling in each half-open interval (lower, upper].
    sums = [0.0] * binTotal
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        for item, weight in zip(data, weights):
            if lowerEdge < item <= upperEdge:
                sums[i] += weight

    # Gnuplot requires points at bin midpoints
    return [[binMin + ((i + 0.5) * binStep), sums[i]]
            for i in range(binTotal)]
949 950 #----------------------------------------------------------------------------- 951 952
def bootstrap(data,
              statistic,
              resamples=1000,
              alpha=0.05,
              output='ci',
              **kwargs):
    """ Returns the bootstrap estimate of the confidence interval for the given
    statistic. The confidence interval is given by 100*(1-alpha). Passes a 1d
    array to the function, statistic. Any arguments needed by statistic are
    passed by **kwargs.

    @type data: list
    @param data: The data on which the given statistic is calculated
    @type statistic: function
    @param statistic: The statistic desired
    @type resamples: int
    @param resamples: The number of bootstrap resamplings
    @type alpha: float
    @param alpha: The confidence interval given by 100*(1-alpha), 95% default
    @param output: The format of the output. 'ci' gives the confidence
    interval, and 'errorbar' gives the length of the errorbar suitable for
    plotting with matplotlib.
    @type kwargs: Keywords
    @param kwargs: Arguments needed by the statistic function
    @rtype: tuple
    @return: (Lower Interval, Upper Interval)

    """

    # Resample with replacement: each row is one bootstrap realisation.
    resampled = numpy.random.choice(data,
                                    size=(resamples, len(data)),
                                    replace=True)
    stats = numpy.sort([statistic(row, **kwargs) for row in resampled])

    lowIndex = int((alpha / 2.0) * resamples)
    highIndex = int((1 - alpha / 2.0) * resamples)

    if output == 'ci':
        return (stats[lowIndex], stats[highIndex])
    elif output == 'errorbar':
        point = statistic(data, **kwargs)
        return numpy.abs(point - numpy.array([stats[lowIndex],
                                              stats[highIndex]]))
    else:
        raise ValueError("Output option {0} is not supported.".format(output))
995 #----------------------------------------------------------------------------- 996 997
def runningStatistic(x, y, statistic='mean', binNumber=10, **kwargs):
    """ Calculates the value given by statistic in bins of x. Useful for
    plotting a running mean value for a scatter plot, for example. This
    function allows the computation of the sum, mean, median, std, or other
    statistic of the values within each bin.

    NOTE: if the statistic is a callable function and there are empty data bins
    those bins will be skipped to keep the function from falling over.

    @type x: numpy array
    @param x: data over which the bins are calculated
    @type y: numpy array
    @param y: values for corresponding x values
    @type statistic: string or function
    @param statistic: The statistic to compute (default is 'mean'). Acceptable
    values are 'mean', 'median', 'sum', 'std', and callable function. Extra
    arguments are passed as kwargs.
    @type binNumber: int
    @param binNumber: The desired number of bins for the x data.
    @rtype: tuple
    @return: A tuple of two lists containing the left bin edges and the value
    of the statistic in each of the bins.

    """

    # Validate the statistic argument up front.
    # Fix: this used isinstance(statistic, collections.Callable), an alias
    # removed in Python 3.10; the callable() builtin is the supported spelling.
    if type(statistic) == str:
        if statistic not in ['mean', 'median', 'sum', 'std']:
            raise ValueError('unrecognized statistic "%s"' % statistic)
    elif callable(statistic):
        pass
    else:
        raise ValueError("statistic not understood")

    if not isinstance(x, numpy.ndarray):
        x = numpy.asarray(x)
    if not isinstance(y, numpy.ndarray):
        y = numpy.asarray(y)

    try:
        bins = numpy.linspace(x.min(), x.max(), binNumber)
        centers = (bins[:-1] + bins[1:]) / 2.
        index = numpy.digitize(x, bins)
    except TypeError:
        # binNumber was an array of bin edges rather than a count.
        bins = binNumber
        centers = (bins[:-1] + bins[1:]) / 2.
        index = numpy.digitize(x, binNumber)
        binNumber = len(binNumber)

    if statistic == 'mean':
        running = [numpy.mean(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'median':
        running = [numpy.median(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'sum':
        running = [numpy.sum(y[index == k]) for k in range(1, binNumber)]
    elif statistic == 'std':
        running = [numpy.std(y[index == k]) for k in range(1, binNumber)]
    elif callable(statistic):
        # Skip empty bins so user-supplied functions don't fall over.
        running = [statistic(y[index == k], **kwargs)
                   for k in range(1, binNumber) if not len(y[index == k]) == 0]
    return centers, running
1058 1059 #----------------------------------------------------------------------------- 1060 1061
def slice_sampler(px, N=1, x=None):
    """ Provides N samples from a user-defined discrete distribution.

    >>> slice_sampler(px, N=1, x=None)

    If x=None (default) or if len(x) != len(px), it will return an array of
    integers between 0 and len(px)-1. If x is given, it will return the
    samples from x according to the distribution px.

    Originally written by Adam Laiacano, 2011

    @type px: numpy array or list
    @param px: A discrete probability distribution
    @type N: int
    @param N: The number of samples to return, default is 1
    @type x: numpy array or list
    @param x: Optional array/list of observation values to return, where
    prob(x) = px
    @rtype: numpy.array
    @return: The desired number of samples drawn from the distribution

    """

    from random import sample

    # Fix: numpy.int was removed in NumPy 1.20; the builtin int is equivalent.
    values = numpy.zeros(N, dtype=int)
    samples = numpy.arange(len(px))
    px = numpy.array(px) / (1. * sum(px))
    u = numpy.random.uniform(0, max(px))
    for n in range(N):
        # Slice step: restrict to bins with probability >= current level u.
        included = px >= u
        choice = sample(list(range(numpy.sum(included))), 1)[0]
        values[n] = samples[included][choice]
        u = numpy.random.uniform(0, px[included][choice])

    # Fix: the x=None default previously crashed with TypeError at len(x);
    # only attempt the mapping when x was actually supplied.
    if x is not None:
        if len(x) == len(px):
            x = numpy.array(x)
            values = x[values]
        else:
            print("px and x are different lengths. "
                  "Returning index locations for px.")
    if N == 1:
        return values[0]
    return values
1106