acgc.stats.bivariate
Bivariate statistics
Statistical measures of relationships between two populations
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3""" Bivariate statistics 4 5Statistical measures of relationships between two populations 6""" 7 8import numpy as np 9from scipy import stats 10from .bivariate_lines import sen, sma, bivariate_line_equation 11# import xarray as xr 12 13__all__ = [ 14 "BivariateStatistics", 15 "nmb", 16 "nmae", 17 "nmbf", 18 "nmaef" 19] 20 21def nmb( x0, x1 ): 22 '''Compute Normalized Mean Bias (NMB) 23 24 NMB = ( mean(x1) - mean(x0) ) / mean(x0) 25 26 Parameters 27 ---------- 28 x0 : array_like 29 reference values 30 x1 : array_like 31 experiment values 32 ''' 33 34 assert (len(x0) == len(x1)), \ 35 "Parameters x0 and x1 must have the same length" 36 37 # Mean values 38 x0_mean = np.mean(x0) 39 x1_mean = np.mean(x1) 40 41 # Metric value 42 return x1_mean / x0_mean - 1 43 44def nmae( x0, x1 ): 45 '''Compute Normalized Mean Absolute Error (NMAE) 46 47 NMAE = mean(abs(x1 - x0)) / abs(mean(x0)) 48 49 Parameters 50 --------- 51 x0 : array_like 52 reference values 53 x1 : array_like 54 experiment values 55 ''' 56 57 # Mean values 58 x0_mean = np.mean(x0) 59 60 # Mean absolute difference 61 abs_diff = np.mean( np.abs(x1 - x0) ) 62 63 # Metric value 64 return abs_diff / np.abs( x0_mean ) 65 66 67def nmbf( x0, x1 ): 68 '''Compute Normalized Mean Bias Factor (NMBF) 69 70 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 71 72 Parameters 73 ---------- 74 x0 : array_like 75 reference values 76 x1 : array_like 77 experiment values 78 ''' 79 80 # Ensure that arguments have the same length 81 assert (len(x0) == len(x1)), \ 82 "Parameters x0 and x1 must have the same length" 83 84 # Mean values 85 x0_mean = np.mean(x0) 86 x1_mean = np.mean(x1) 87 88 # Metric value 89 if x1_mean >= x0_mean: 90 result = x1_mean / x0_mean - 1 91 else: 92 result= 1 - x0_mean / x1_mean 93 # Equivalent (faster?) implementation 94 #S = (mMean - oMean) / np.abs(mMean - oMean) 95 #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 ) 96 97 return result 98 99def nmaef( x0, x1 ): 100 '''Compute Normalized Mean Absolute Error Factor (NMAEF) 101 102 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 103 104 Parameters 105 ---------- 106 x0 : array_like 107 reference values 108 x1 : array_like 109 experiment values 110 ''' 111 112 # Ensure that arguments have the same length 113 assert (len(x0) == len(x1)), \ 114 "Parameters x0 and x1 must have the same length" 115 116 # Mean values 117 x0_mean = np.mean(x0) 118 x1_mean = np.mean(x1) 119 120 # Mean absolute difference 121 abs_diff = np.mean( np.abs(x1 - x0)) 122 123 # Metric value 124 if x1_mean >= x0_mean: 125 result = abs_diff / x0_mean 126 else: 127 result = abs_diff / x1_mean 128 # Equivalent (faster?) implementation 129 #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean) 130 #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) ) 131 132 return result 133 134def _texify_name(name): 135 '''Return a LaTex formatted string for some variables 136 137 Parameters 138 ---------- 139 name : str 140 141 Returns 142 ------- 143 pretty_name : str 144 ''' 145 if name.lower()=='n': 146 pretty_name = r'$n$' 147 elif name=='R2': 148 pretty_name = f'$R^2$' 149 elif name=='r2': 150 pretty_name = f'$r^2$' 151 elif name.lower()=='y_ols': 152 pretty_name = r'$y_{\rm OLS}$' 153 elif name.lower()=='y_sma': 154 pretty_name = r'$y_{\rm SMA}$' 155 elif name.lower()=='y_sen': 156 pretty_name = r'$y_{\rm Sen}$' 157 else: 158 pretty_name = name 159 return pretty_name 160 161def _number2str(value, 162 intformat='{:d}', 163 floatformat='{:.4f}'): 164 '''Format number as string using integer and float format specifiers 165 166 Parameters 167 ---------- 168 value : numeric, str 169 value to be converted 170 intformat : str, default='{:d}' 171 format specifier for integer types 172 floatformat : str, default='{:.4f}' 173 format specifier for float types 174 175 Returns 176 ------- 177 str 178 ''' 179 if isinstance(value,str): 180 pass 181 elif isinstance(value,(int,np.integer)): 182 value = intformat.format(value) 183 else: 184 value = floatformat.format(value) 185 return value 186 187class BivariateStatistics: 188 '''A suite of common statistics to quantify bivariate relationships 189 190 Class method 'summary' provides a formatted summary of these statistics 191 192 Attributes 193 ---------- 194 count, n : int 195 number of valid (not NaN) data value pairs 196 xmean, ymean : float 197 mean of x and y variables 198 xmedian, ymedian :float 199 median of x and y variables 200 xstd, ystd : float 201 standard deviation of x and y variables 202 mean_difference, md : float 203 ymean - xmean 204 std_difference, stdd : float 205 std( y - x ) 206 mean_absolute_difference, mad : float 207 mean( |y-x| ) 208 relative_mean_difference, rmd : float 209 md / xmean 210 relative_mean_absolute_difference, rmad :float 211 mad / xmean 212 standardized_mean_difference, smd : float 213 md / xstd 214 standardized_mean_absolute_difference, smad : float 215 mad /xstd 216 mean_relative_difference, mrd : float 217 mean(y/x) - 1 218 mean_absolute_relative_difference, mrd : float 219 mean(abs(y/x - 1)) 220 mean_log10_ratio, mlr : float 221 mean( log10(y/x) ) 222 std_log10_ratio, stdlr : float 223 std( log10(y/x) ) 224 mean_absolute_log10_ratio, malr : float 225 mean( abs( log10(y/x) ) ) 226 median_difference, medd : float 227 median(y-x) 228 median_absolute_difference, medad : float 229 median(|y-x|) 230 relative_median_difference, rmedd : float 231 median(y-x) / xmedian 232 relative_median_absolute_difference, rmedad : float 233 median(|y-x|) / xmedian 234 median_relative_difference, medianrd, medrd : float 235 median(y/x)-1 236 median_log10_ratio, medlr : float 237 median( log10(y/x) ) 238 median_absolute_log10_ratio, medalr : float 239 median( abs( log10(y/x) ) ) 240 normalized_mean_bias_factor, nmbf : float 241 see `nmbf` 242 normalized_mean_absolute_error_factor, nmaef : float 243 see `nmaef` 244 root_mean_square_difference, rmsd : float 245 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 246 root_mean_square_log10_ratio, rmslr : float 247 $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$ 248 covariance : float 249 cov(x,y) 250 correlation_pearson, correlation, pearsonr, R, r : float 251 Pearson linear correlation coefficient 252 correlation_spearman, spearmanr : float 253 Spearman, non-parametric rank correlation coefficient 254 R2, r2 : float 255 Linear coefficient of determination, $R^2$ 256 ''' 257 258 def __init__(self,x,y,w=None,dropna=False,data=None): 259 '''Compute suite of bivariate statistics during initialization 260 261 Statistic values are saved in attributes. 262 CAUTION: Weights w are ignored except in SMA fit 263 264 Parameters 265 ---------- 266 x : ndarray or str 267 independent variable values 268 y : ndarray or str 269 dependent variable values, same size as x 270 w : ndarray or str, optional 271 weights for points (x,y), same size as x and y 272 dropna : bool, optional (default=False) 273 drops NaN values from x, y, and w 274 data : dict-like, optional 275 if x, y, or w are str, then they should be keys in data 276 ''' 277 278 # Get values from data if needed 279 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 280 raise ValueError( 'Data argument must be used if x, y, or w is a string') 281 if isinstance(x,str): 282 x = data[x] 283 if isinstance(y,str): 284 y = data[y] 285 if isinstance(w,str): 286 w = data[w] 287 288 #Ensure that x and y have same length 289 if len(x) != len(y): 290 raise ValueError( 'Arguments x and y must have the same length' ) 291 if w is None: 292 w = np.ones_like(x) 293 if len(w) != len(x): 294 raise ValueError( 'Argument w (if present) must have the same length as x' ) 295 296 # Drop NaN values 297 if dropna: 298 isna = np.isnan(x*y*w) 299 x = x[~isna] 300 y = y[~isna] 301 w = w[~isna] 302 303 # Differences and ratios used repeatedly 304 diff = y - x 305 absdiff = np.abs( y - x ) 306 # Ignore divide by zero and 0/0 while dividing 307 old_settings = np.seterr(divide='ignore',invalid='ignore') 308 ratio = y/x 309 log10ratio = np.log10(ratio) 310 np.seterr(**old_settings) 311 312 # Number of data points 313 self.count = self.n = len(x) 314 315 # Means, medians, and standard deviations 316 self.xmean = np.mean(x) 317 self.ymean = np.mean(y) 318 self.xmedian = np.median(x) 319 self.ymedian = np.median(y) 320 self.xstd = np.std(x) 321 self.ystd = np.std(y) 322 323 # Save values for use later 324 self._x = x 325 self._y = y 326 self._w = w 327 328 # Mean and mean absolute differences 329 self.mean_difference = self.md = self.ymean - self.xmean 330 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 331 self.std_difference = self.stdd = np.std( diff ) 332 333 # Relative and standardized differences 334 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 335 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 336 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 337 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 338 339 # Mean and median relative differences 340 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 341 self.mean_absolute_relative_difference = self.mard = np.mean( np.abs( ratio - 1 ) ) 342 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 343 self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) ) 344 345 # Median and median absolute differences 346 self.median_difference = self.medd = np.median( diff ) 347 self.median_absolute_difference = self.medad = np.median( absdiff ) 348 349 # Relative median differences 350 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 351 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 352 353 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 354 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 355 356 # Mean and mean absolute log ratio 357 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 358 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 359 self.std_log10_ratio = self.stdlr= np.std( log10ratio ) 360 361 # Median and median absolute log ratio 362 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 363 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 364 365 # RMS difference 366 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 367 # RMS log ratio 368 self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 ))) 369 370 # Covariance, correlation 371 self.covariance = self.cov = np.cov(x,y)[0][1] 372 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 373 np.corrcoef(x,y)[0][1] 374 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 375 self.R2 = self.r2 = self.R**2 376 377 def __getitem__(self,key): 378 '''Accesses attribute values via object['key']''' 379 return getattr(self,key) 380 381 def fitline(self,method='sma',intercept=True,**kwargs): 382 '''Compute bivariate line fit 383 384 Parameters 385 ---------- 386 method : str 387 line fitting method: sma (default), ols, wls, York, sen, siegel 388 intercept : bool 389 defines whether non-zero intercept should be fitted 390 **kwargs 391 passed to `acgc.stats.sma` (e.g. robust=True) 392 393 Returns 394 ------- 395 result : dict 396 dictionary with keys: 397 - slope (float) 398 slope of fitted line 399 - intercept (float) 400 intercept of fitted line 401 - fittedvalues (array (N,)) 402 values on fit line 403 - residuals (array (N,)) 404 residual from fit line 405 ''' 406 407 fitintercept = intercept 408 409 if method.lower()=='sma': 410 fit = sma( self._x, 411 self._y, 412 self._w, 413 intercept=fitintercept, 414 **kwargs) 415 slope = fit['slope'] 416 intercept= fit['intercept'] 417 418 elif method.lower()=='ols': 419 if fitintercept: 420 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 421 self._y, rcond=None ) 422 else: 423 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 424 slope = ols[0][0] 425 intercept = ols[0][1] 426 427 elif method.lower() in ['theil','sen','theilsen']: 428 fitintercept = True 429 fit = sen( self._x, 430 self._y, 431 **kwargs) 432 slope = fit['slope'] 433 intercept = fit['intercept'] 434 435 elif method.lower()=='siegel': 436 fitintercept = True 437 siegel = stats.siegelslopes( self._x, 438 self._y ) 439 slope = siegel.slope 440 intercept = siegel.intercept 441 442 elif method.lower()=='wls': 443 raise NotImplementedError('WLS regression not implemented yet') 444 445 elif method.lower()=='york': 446 raise NotImplementedError('York regression not implemented yet') 447 448 else: 449 raise ValueError('Undefined method '+method) 450 451 line = dict( slope = slope, 452 intercept = intercept, 453 fittedvalues = slope * self._x + intercept, 454 residuals = self._y - ( slope * self._x + intercept ), 455 method = method, 456 fitintercept = fitintercept ) 457 458 return line 459 460 def slope(self,method='sma',intercept=True,**kwargs): 461 '''Compute slope of bivariate line fit 462 463 Parameters 464 ---------- 465 method : str 466 line fitting method: sma (default), ols, wls 467 intercept : bool 468 defines whether non-zero intercept should be fitted 469 **kwargs 470 passed to `fitline` 471 472 Returns 473 ------- 474 slope : float 475 value of y intercept 476 ''' 477 return self.fitline(method,intercept,**kwargs)['slope'] 478 479 def intercept(self,method='sma',intercept=True,**kwargs): 480 '''Compute intercept of bivariate line fit 481 482 Parameters 483 ---------- 484 method : str 485 line fitting method: sma (default) or ols 486 intercept : bool 487 defines whether non-zero intercept should be fitted 488 **kwargs 489 passed to `fitline` 490 491 Returns 492 ------- 493 intercept : float 494 value of y intercept 495 ''' 496 return self.fitline(method,intercept,**kwargs)['intercept'] 497 498 def _expand_variables(self,variables): 499 '''Expand special strings into a list of variables 500 501 Parameter 502 --------- 503 variables : list or str, default='common' 504 Special strings ("all","common") will be expanded to a list of variables 505 list arguments will not be modified 506 507 Returns 508 ------- 509 list 510 variable names 511 ''' 512 if variables is None: 513 variables='common' 514 if variables=='all': 515 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 516 'MLR','MALR', 517 'MedD','MedAD','RMedD','RMedAD','MedRD', 518 'MedLR','MedALR', 519 'NMBF','NMAEF','RMSD','cov', 520 'R','R2','spearmanr','slope','intercept', 521 'fitline','n'] 522 elif variables=='common': 523 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n'] 524 if not isinstance(variables,list): 525 raise ValueError( 526 'variables must be a list, None, or one of these strings: "all","common"') 527 528 return variables 529 530 def _summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 531 '''Summarize bivariate statistics into a dict 532 533 Parameters 534 ---------- 535 vars : list or str, default='common' 536 names of attribute variables to include in summary 537 names are case insensitive 538 The following strings are also accepted in place of a list 539 "all" (displays all variables) 540 "common" (displays all measures of mean difference) 541 fitline_kw : dict, default=None 542 keywords passed to `fitline` 543 floatformat_fiteqn : str, default=floatformat 544 format specifier for slope and intercept (a,b) in y = a x + b 545 546 Returns 547 ------- 548 summary : dict 549 names and values of variables 550 ''' 551 552 # List of variables 553 variables = self._expand_variables(variables) 554 555 if fitline_kw is None: 556 fitline_kw = {'method':'sma', 557 'intercept':True} 558 559 # Construct the dict 560 summary = {} 561 for v in variables: 562 if v in ['slope','intercept']: 563 # These variables are object methods 564 func = getattr(self,v) 565 value = func(**fitline_kw) 566 elif v == 'fitline': 567 line = self.fitline(**fitline_kw) 568 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 569 else: 570 # Retrieve values 571 value = getattr(self,v.lower()) 572 573 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 574 summary[v] = value 575 576 return summary 577 578 def summary(self, variables=None, fitline_kw=None, 579 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 580 stringlength=None ): 581 '''Summarize bivariate statistics 582 583 Parameters 584 ---------- 585 vars : list or str, default='common' 586 names of attribute variables to include in summary 587 names are case insensitive 588 The following strings are also accepted in place of a list 589 "all" (displays all variables) 590 "common" (displays all measures of mean difference) 591 fitline_kw : dict, default=None 592 keywords passed to `fitline` 593 intformat : str, default='{:d}' 594 format specifier for integer values 595 floatformat : str, default='{:.4f}' 596 format specifier for floating point values 597 floatformat_fiteqn : str, default=floatformat 598 format specifier for slope and intercept (a,b) in y = a x + b 599 stringlength : int, default=None 600 length of the variables on output 601 default (None) is to use the length of the longest variable name 602 603 Returns 604 ------- 605 summary : str 606 names and values of variables 607 ''' 608 # List of variables 609 variables = self._expand_variables(variables) 610 611 if floatformat_fiteqn is None: 612 floatformat_fiteqn = floatformat 613 if stringlength is None: 614 stringlength = np.max([len(v) for v in variables]) 615 stringformat = '{:'+str(stringlength)+'s}' 616 617 # Get a dict containing the needed variables 618 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 619 620 # Extract length of the float numbers from floatformat 621 # import re 622 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 623 # floatformat )[0] ) ).astype(int) 624 625 # summary = (stringformat+'{:>10s}').format('Variable','Value') 626 summarytext = '' 627 for k,v in summarydict.items(): 628 vstr = _number2str(v,intformat,floatformat) 629 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 630 631 return summarytext 632 633 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 634 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 635 loc=None, loc_units='axes', 636 **kwargs): 637 '''Display bivariate statistics as a table inset on a plot axis 638 639 Parameters 640 ---------- 641 ax : matplotlib.Figure.Axis 642 axis where the table will be displayed 643 variables : list or str, default='common' 644 names of attribute variables to include in summary 645 names are case insensitive 646 The following strings are also accepted in place of a list 647 "all" (displays all variables) 648 "common" (displays all measures of mean difference) 649 fitline_kw : dict, default=None 650 keywords passed to `fitline` 651 intformat : str, default='{:d}' 652 format specifier for integer values 653 floatformat : str, default='{:.3f}' 654 format specifier for floating point values 655 floatformat_fiteqn : str, default=floatformat 656 format specifier for slope and intercept (a,b) in y = a x + b 657 loc : tuple (x0,y0), default=(0.85, 0.05) 658 location on the axis where the table will be drawn 659 can be in data units or axes units [0-1] 660 loc_units : {'axes' (default), 'data'} 661 specifies whether loc has 'data' units or 'axes' units [0-1] 662 663 Returns 664 ------- 665 text1, text2 : matplotlib text object 666 Artist for the two text boxes 667 ''' 668 # List of variables 669 variables = self._expand_variables(variables) 670 671 if floatformat_fiteqn is None: 672 floatformat_fiteqn = floatformat 673 674 # Default location in lower right corner 675 if loc is None: 676 loc = (0.8,0.05) 677 678 # Coordinates for loc 679 if loc_units.lower()=='data': 680 coord=ax.transData 681 elif loc_units.lower() in ['axes','axis']: 682 coord=ax.transAxes 683 else: 684 raise ValueError('Display units should be "Data" or "Axes"') 685 686 # Get a dict containing the needed variables 687 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 688 689 # Column of label text 690 label_text = '\n'.join([_texify_name(key) 691 for key in summarydict]) 692 # Column of value text 693 value_text = '\n'.join([_number2str(v,intformat,floatformat) 694 for v in summarydict.values()]) 695 696 # Check if horizontal alignment keyword is used 697 ha='' 698 try: 699 ha = kwargs['ha'] 700 except KeyError: 701 pass 702 try: 703 ha = kwargs['horizontalalignment'] 704 except KeyError: 705 pass 706 707 # For right alignment, align on values first 708 # Otherwise, align on labels 709 if ha=='right': 710 first_text = value_text 711 second_text = label_text 712 sign = -1 713 else: 714 first_text = label_text 715 second_text = value_text 716 sign = +1 717 718 # Add first column of text 719 t1=ax.text(loc[0],loc[1], 720 first_text, 721 transform=coord, 722 **kwargs 723 ) 724 725 # Get width of first text column 726 bbox = t1.get_window_extent().transformed(coord.inverted()) 727 width = bbox.x1-bbox.x0 728 729 # Add second column of text 730 t2 = ax.text(loc[0]+width*1.05*sign,loc[1], 731 second_text, 732 transform=coord, 733 **kwargs 734 ) 735 736 ################################## 737 # Early version of this function using matplotlib.table.table() 738 739 # if isinstance(loc,(tuple,list)): 740 # # Create an inset axis to contain the table 741 # tableaxis = ax.inset_axes(loc) 742 # table_width=1 743 # else: 744 # tableaxis = ax 745 746 # # Display the table on the axis 747 # return mtable.table( 748 # tableaxis, 749 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 750 # rowLabels=[texify_name(key) for key in summarydict], 751 # colWidths=[table_width/2]*2, 752 # edges=edges, 753 # loc=loc, bbox=bbox 754 # ) 755 756 return [t1,t2]
188class BivariateStatistics: 189 '''A suite of common statistics to quantify bivariate relationships 190 191 Class method 'summary' provides a formatted summary of these statistics 192 193 Attributes 194 ---------- 195 count, n : int 196 number of valid (not NaN) data value pairs 197 xmean, ymean : float 198 mean of x and y variables 199 xmedian, ymedian :float 200 median of x and y variables 201 xstd, ystd : float 202 standard deviation of x and y variables 203 mean_difference, md : float 204 ymean - xmean 205 std_difference, stdd : float 206 std( y - x ) 207 mean_absolute_difference, mad : float 208 mean( |y-x| ) 209 relative_mean_difference, rmd : float 210 md / xmean 211 relative_mean_absolute_difference, rmad :float 212 mad / xmean 213 standardized_mean_difference, smd : float 214 md / xstd 215 standardized_mean_absolute_difference, smad : float 216 mad /xstd 217 mean_relative_difference, mrd : float 218 mean(y/x) - 1 219 mean_absolute_relative_difference, mrd : float 220 mean(abs(y/x - 1)) 221 mean_log10_ratio, mlr : float 222 mean( log10(y/x) ) 223 std_log10_ratio, stdlr : float 224 std( log10(y/x) ) 225 mean_absolute_log10_ratio, malr : float 226 mean( abs( log10(y/x) ) ) 227 median_difference, medd : float 228 median(y-x) 229 median_absolute_difference, medad : float 230 median(|y-x|) 231 relative_median_difference, rmedd : float 232 median(y-x) / xmedian 233 relative_median_absolute_difference, rmedad : float 234 median(|y-x|) / xmedian 235 median_relative_difference, medianrd, medrd : float 236 median(y/x)-1 237 median_log10_ratio, medlr : float 238 median( log10(y/x) ) 239 median_absolute_log10_ratio, medalr : float 240 median( abs( log10(y/x) ) ) 241 normalized_mean_bias_factor, nmbf : float 242 see `nmbf` 243 normalized_mean_absolute_error_factor, nmaef : float 244 see `nmaef` 245 root_mean_square_difference, rmsd : float 246 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 247 root_mean_square_log10_ratio, rmslr : float 248 $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$ 249 covariance : float 250 cov(x,y) 251 correlation_pearson, correlation, pearsonr, R, r : float 252 Pearson linear correlation coefficient 253 correlation_spearman, spearmanr : float 254 Spearman, non-parametric rank correlation coefficient 255 R2, r2 : float 256 Linear coefficient of determination, $R^2$ 257 ''' 258 259 def __init__(self,x,y,w=None,dropna=False,data=None): 260 '''Compute suite of bivariate statistics during initialization 261 262 Statistic values are saved in attributes. 263 CAUTION: Weights w are ignored except in SMA fit 264 265 Parameters 266 ---------- 267 x : ndarray or str 268 independent variable values 269 y : ndarray or str 270 dependent variable values, same size as x 271 w : ndarray or str, optional 272 weights for points (x,y), same size as x and y 273 dropna : bool, optional (default=False) 274 drops NaN values from x, y, and w 275 data : dict-like, optional 276 if x, y, or w are str, then they should be keys in data 277 ''' 278 279 # Get values from data if needed 280 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 281 raise ValueError( 'Data argument must be used if x, y, or w is a string') 282 if isinstance(x,str): 283 x = data[x] 284 if isinstance(y,str): 285 y = data[y] 286 if isinstance(w,str): 287 w = data[w] 288 289 #Ensure that x and y have same length 290 if len(x) != len(y): 291 raise ValueError( 'Arguments x and y must have the same length' ) 292 if w is None: 293 w = np.ones_like(x) 294 if len(w) != len(x): 295 raise ValueError( 'Argument w (if present) must have the same length as x' ) 296 297 # Drop NaN values 298 if dropna: 299 isna = np.isnan(x*y*w) 300 x = x[~isna] 301 y = y[~isna] 302 w = w[~isna] 303 304 # Differences and ratios used repeatedly 305 diff = y - x 306 absdiff = np.abs( y - x ) 307 # Ignore divide by zero and 0/0 while dividing 308 old_settings = np.seterr(divide='ignore',invalid='ignore') 309 ratio = y/x 310 log10ratio = np.log10(ratio) 311 np.seterr(**old_settings) 312 313 # Number of data points 314 self.count = self.n = len(x) 315 316 # Means, medians, and standard deviations 317 self.xmean = np.mean(x) 318 self.ymean = np.mean(y) 319 self.xmedian = np.median(x) 320 self.ymedian = np.median(y) 321 self.xstd = np.std(x) 322 self.ystd = np.std(y) 323 324 # Save values for use later 325 self._x = x 326 self._y = y 327 self._w = w 328 329 # Mean and mean absolute differences 330 self.mean_difference = self.md = self.ymean - self.xmean 331 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 332 self.std_difference = self.stdd = np.std( diff ) 333 334 # Relative and standardized differences 335 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 336 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 337 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 338 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 339 340 # Mean and median relative differences 341 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 342 self.mean_absolute_relative_difference = self.mard = np.mean( np.abs( ratio - 1 ) ) 343 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 344 self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) ) 345 346 # Median and median absolute differences 347 self.median_difference = self.medd = np.median( diff ) 348 self.median_absolute_difference = self.medad = np.median( absdiff ) 349 350 # Relative median differences 351 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 352 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 353 354 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 355 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 356 357 # Mean and mean absolute log ratio 358 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 359 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 360 self.std_log10_ratio = self.stdlr= np.std( log10ratio ) 361 362 # Median and median absolute log ratio 363 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 364 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 365 366 # RMS difference 367 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 368 # RMS log ratio 369 self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 ))) 370 371 # Covariance, correlation 372 self.covariance = self.cov = np.cov(x,y)[0][1] 373 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 374 np.corrcoef(x,y)[0][1] 375 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 376 self.R2 = self.r2 = self.R**2 377 378 def __getitem__(self,key): 379 '''Accesses attribute values via object['key']''' 380 return getattr(self,key) 381 382 def fitline(self,method='sma',intercept=True,**kwargs): 383 '''Compute bivariate line fit 384 385 Parameters 386 ---------- 387 method : str 388 line fitting method: sma (default), ols, wls, York, sen, siegel 389 intercept : bool 390 defines whether non-zero intercept should be fitted 391 **kwargs 392 passed to `acgc.stats.sma` (e.g. robust=True) 393 394 Returns 395 ------- 396 result : dict 397 dictionary with keys: 398 - slope (float) 399 slope of fitted line 400 - intercept (float) 401 intercept of fitted line 402 - fittedvalues (array (N,)) 403 values on fit line 404 - residuals (array (N,)) 405 residual from fit line 406 ''' 407 408 fitintercept = intercept 409 410 if method.lower()=='sma': 411 fit = sma( self._x, 412 self._y, 413 self._w, 414 intercept=fitintercept, 415 **kwargs) 416 slope = fit['slope'] 417 intercept= fit['intercept'] 418 419 elif method.lower()=='ols': 420 if fitintercept: 421 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 422 self._y, rcond=None ) 423 else: 424 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 425 slope = ols[0][0] 426 intercept = ols[0][1] 427 428 elif method.lower() in ['theil','sen','theilsen']: 429 fitintercept = True 430 fit = sen( self._x, 431 self._y, 432 **kwargs) 433 slope = fit['slope'] 434 intercept = fit['intercept'] 435 436 elif method.lower()=='siegel': 437 fitintercept = True 438 siegel = stats.siegelslopes( self._x, 439 self._y ) 440 slope = siegel.slope 441 intercept = siegel.intercept 442 443 elif method.lower()=='wls': 444 raise NotImplementedError('WLS regression not implemented yet') 445 446 elif method.lower()=='york': 447 raise NotImplementedError('York regression not implemented yet') 448 449 else: 450 raise ValueError('Undefined method '+method) 451 452 line = dict( slope = slope, 453 intercept = intercept, 454 fittedvalues = slope * self._x + intercept, 455 residuals = self._y - ( slope * self._x + intercept ), 456 method = method, 457 fitintercept = fitintercept ) 458 459 return line 460 461 def slope(self,method='sma',intercept=True,**kwargs): 462 '''Compute slope of bivariate line fit 463 464 Parameters 465 ---------- 466 method : str 467 line fitting method: sma (default), ols, wls 468 intercept : bool 469 defines whether non-zero intercept should be fitted 470 **kwargs 471 passed to `fitline` 472 473 Returns 474 ------- 475 slope : float 476 value of y intercept 477 ''' 478 return self.fitline(method,intercept,**kwargs)['slope'] 479 480 def intercept(self,method='sma',intercept=True,**kwargs): 481 '''Compute intercept of bivariate line fit 482 483 Parameters 484 ---------- 485 method : str 486 line fitting method: sma (default) or ols 487 intercept : bool 488 defines whether non-zero intercept should be fitted 489 **kwargs 490 passed to `fitline` 491 492 Returns 493 ------- 494 intercept : float 495 value of y intercept 496 ''' 497 return self.fitline(method,intercept,**kwargs)['intercept'] 498 499 def _expand_variables(self,variables): 500 '''Expand special strings into a list of variables 501 502 Parameter 503 --------- 504 variables : list or str, default='common' 505 Special strings ("all","common") will be expanded to a list of variables 506 list arguments will not be modified 507 508 Returns 509 ------- 510 list 511 variable names 512 ''' 513 if variables is None: 514 variables='common' 515 if variables=='all': 516 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 517 'MLR','MALR', 518 'MedD','MedAD','RMedD','RMedAD','MedRD', 519 'MedLR','MedALR', 520 'NMBF','NMAEF','RMSD','cov', 521 'R','R2','spearmanr','slope','intercept', 522 'fitline','n'] 523 elif variables=='common': 524 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n'] 525 if not isinstance(variables,list): 526 raise ValueError( 527 'variables must be a list, None, or one of these strings: "all","common"') 528 529 return variables 530 531 def _summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 532 '''Summarize bivariate statistics into a dict 533 534 Parameters 535 ---------- 536 vars : list or str, default='common' 537 names of attribute variables to include in summary 538 names are case insensitive 539 The following strings are also accepted in place of a list 540 "all" (displays all variables) 541 "common" (displays all measures of mean difference) 542 fitline_kw : dict, default=None 543 keywords passed to `fitline` 544 floatformat_fiteqn : str, default=floatformat 545 format specifier for slope and intercept (a,b) in y = a x + b 546 547 Returns 548 ------- 549 summary : dict 550 names and values of variables 551 ''' 552 553 # List of variables 554 variables = self._expand_variables(variables) 555 556 if fitline_kw is None: 557 fitline_kw = {'method':'sma', 558 'intercept':True} 559 560 # Construct the dict 561 summary = {} 562 for v in variables: 563 if v in ['slope','intercept']: 564 # These variables are object methods 565 func = getattr(self,v) 566 value = func(**fitline_kw) 567 elif v == 'fitline': 568 line = self.fitline(**fitline_kw) 569 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 570 else: 571 # Retrieve values 572 value = getattr(self,v.lower()) 573 574 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 575 summary[v] = value 576 577 return summary 578 579 def summary(self, variables=None, fitline_kw=None, 580 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 581 stringlength=None ): 582 '''Summarize bivariate statistics 583 584 Parameters 585 ---------- 586 vars : list or str, default='common' 587 names of attribute variables to include in summary 588 names are case insensitive 589 The following strings are also accepted in place of a list 590 "all" (displays all variables) 591 "common" (displays all measures of mean difference) 592 fitline_kw : dict, default=None 593 keywords passed to `fitline` 594 intformat : str, default='{:d}' 595 format specifier for integer values 596 floatformat : str, default='{:.4f}' 597 format specifier for floating point values 598 floatformat_fiteqn : str, default=floatformat 599 format specifier for slope and intercept (a,b) in y = a x + b 600 stringlength : int, default=None 601 length of the variables on output 602 default (None) is to use the length of the longest variable name 603 604 Returns 605 ------- 606 summary : str 607 names and values of variables 608 ''' 609 # List of variables 610 variables = self._expand_variables(variables) 611 612 if floatformat_fiteqn is None: 613 floatformat_fiteqn = floatformat 614 if stringlength is None: 615 stringlength = np.max([len(v) for v in variables]) 616 stringformat = '{:'+str(stringlength)+'s}' 617 618 # Get a dict containing the needed variables 619 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 620 621 # Extract length of the float numbers from floatformat 622 # import re 623 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 624 # floatformat )[0] ) ).astype(int) 625 626 # summary = (stringformat+'{:>10s}').format('Variable','Value') 627 summarytext = '' 628 for k,v in summarydict.items(): 629 vstr = _number2str(v,intformat,floatformat) 630 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 631 632 return summarytext 633 634 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 635 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 636 loc=None, loc_units='axes', 637 **kwargs): 638 '''Display bivariate statistics as a table inset on a plot axis 639 640 Parameters 641 ---------- 642 ax : matplotlib.Figure.Axis 643 axis where the table will be displayed 644 variables : list or str, default='common' 645 names of attribute variables to include in summary 646 names are case insensitive 647 The following strings are also accepted in place of a list 648 "all" (displays all variables) 649 "common" (displays all measures of mean difference) 650 fitline_kw : dict, default=None 651 keywords passed to `fitline` 652 intformat : str, default='{:d}' 653 format specifier for integer values 654 floatformat : str, default='{:.3f}' 655 format specifier for floating point values 656 floatformat_fiteqn : str, default=floatformat 657 format specifier for slope and intercept (a,b) in y = a x + b 658 loc : tuple (x0,y0), default=(0.85, 0.05) 659 location on the axis where the table will be drawn 660 can be in data units or axes units [0-1] 661 loc_units : {'axes' (default), 'data'} 662 specifies whether loc has 'data' units or 'axes' units [0-1] 663 664 Returns 665 ------- 666 text1, text2 : matplotlib text object 667 Artist for the two text boxes 668 ''' 669 # List of variables 670 variables = self._expand_variables(variables) 671 672 if floatformat_fiteqn is None: 673 floatformat_fiteqn = floatformat 674 675 # Default location in lower right corner 676 if loc is None: 677 loc = (0.8,0.05) 678 679 # Coordinates for loc 680 if loc_units.lower()=='data': 681 coord=ax.transData 682 elif loc_units.lower() in ['axes','axis']: 683 coord=ax.transAxes 684 else: 685 raise ValueError('Display units should be "Data" or "Axes"') 686 687 # Get a dict containing the needed variables 688 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 689 690 # Column of label text 691 label_text = '\n'.join([_texify_name(key) 692 for key in summarydict]) 693 # Column of value text 694 value_text = '\n'.join([_number2str(v,intformat,floatformat) 695 for v in summarydict.values()]) 696 697 # Check if horizontal alignment keyword is used 698 ha='' 699 try: 700 ha = kwargs['ha'] 701 except KeyError: 702 pass 703 try: 704 ha = kwargs['horizontalalignment'] 705 except KeyError: 706 pass 707 708 # For right alignment, align on values first 709 # Otherwise, align on labels 710 if ha=='right': 711 first_text = value_text 712 second_text = label_text 713 sign = -1 714 else: 715 first_text = label_text 716 second_text = value_text 717 sign = +1 718 719 # Add first column of text 720 t1=ax.text(loc[0],loc[1], 721 first_text, 722 transform=coord, 723 **kwargs 724 ) 725 726 # Get width of first text column 727 bbox = t1.get_window_extent().transformed(coord.inverted()) 728 width = bbox.x1-bbox.x0 729 730 # Add second column of text 731 t2 = ax.text(loc[0]+width*1.05*sign,loc[1], 732 second_text, 733 transform=coord, 734 **kwargs 735 ) 736 737 ################################## 738 # Early version of this function using matplotlib.table.table() 739 740 # if isinstance(loc,(tuple,list)): 741 # # Create an inset axis to contain the table 742 # tableaxis = ax.inset_axes(loc) 743 # table_width=1 744 # else: 745 # tableaxis = ax 746 747 # # Display the table on the axis 748 # return mtable.table( 749 # tableaxis, 750 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 751 # rowLabels=[texify_name(key) for key in summarydict], 752 # colWidths=[table_width/2]*2, 753 # edges=edges, 754 # loc=loc, bbox=bbox 755 # ) 756 757 return [t1,t2]
A suite of common statistics to quantify bivariate relationships
Class method 'summary' provides a formatted summary of these statistics
Attributes
- count, n (int): number of valid (not NaN) data value pairs
- xmean, ymean (float): mean of x and y variables
- xmedian, ymedian (float): median of x and y variables
- xstd, ystd (float): standard deviation of x and y variables
- mean_difference, md (float): ymean - xmean
- std_difference, stdd (float): std( y - x )
- mean_absolute_difference, mad (float): mean( |y-x| )
- relative_mean_difference, rmd (float): md / xmean
- relative_mean_absolute_difference, rmad (float): mad / xmean
- standardized_mean_difference, smd (float): md / xstd
- standardized_mean_absolute_difference, smad (float): mad /xstd
- mean_relative_difference, mrd (float): mean(y/x) - 1
- mean_absolute_relative_difference, mrd (float): mean(abs(y/x - 1))
- mean_log10_ratio, mlr (float): mean( log10(y/x) )
- std_log10_ratio, stdlr (float): std( log10(y/x) )
- mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
- median_difference, medd (float): median(y-x)
- median_absolute_difference, medad (float): median(|y-x|)
- relative_median_difference, rmedd (float): median(y-x) / xmedian
- relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
- median_relative_difference, medianrd, medrd (float): median(y/x)-1
- median_log10_ratio, medlr (float): median( log10(y/x) )
- median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
- normalized_mean_bias_factor, nmbf (float):
see
nmbf - normalized_mean_absolute_error_factor, nmaef (float):
see
nmaef - root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
- root_mean_square_log10_ratio, rmslr (float): $\sqrt{ \langle \log_{10}(y/x)^2 \rangle }$
- covariance (float): cov(x,y)
- correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
- correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
- R2, r2 (float): Linear coefficient of determination, $R^2$
259 def __init__(self,x,y,w=None,dropna=False,data=None): 260 '''Compute suite of bivariate statistics during initialization 261 262 Statistic values are saved in attributes. 263 CAUTION: Weights w are ignored except in SMA fit 264 265 Parameters 266 ---------- 267 x : ndarray or str 268 independent variable values 269 y : ndarray or str 270 dependent variable values, same size as x 271 w : ndarray or str, optional 272 weights for points (x,y), same size as x and y 273 dropna : bool, optional (default=False) 274 drops NaN values from x, y, and w 275 data : dict-like, optional 276 if x, y, or w are str, then they should be keys in data 277 ''' 278 279 # Get values from data if needed 280 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 281 raise ValueError( 'Data argument must be used if x, y, or w is a string') 282 if isinstance(x,str): 283 x = data[x] 284 if isinstance(y,str): 285 y = data[y] 286 if isinstance(w,str): 287 w = data[w] 288 289 #Ensure that x and y have same length 290 if len(x) != len(y): 291 raise ValueError( 'Arguments x and y must have the same length' ) 292 if w is None: 293 w = np.ones_like(x) 294 if len(w) != len(x): 295 raise ValueError( 'Argument w (if present) must have the same length as x' ) 296 297 # Drop NaN values 298 if dropna: 299 isna = np.isnan(x*y*w) 300 x = x[~isna] 301 y = y[~isna] 302 w = w[~isna] 303 304 # Differences and ratios used repeatedly 305 diff = y - x 306 absdiff = np.abs( y - x ) 307 # Ignore divide by zero and 0/0 while dividing 308 old_settings = np.seterr(divide='ignore',invalid='ignore') 309 ratio = y/x 310 log10ratio = np.log10(ratio) 311 np.seterr(**old_settings) 312 313 # Number of data points 314 self.count = self.n = len(x) 315 316 # Means, medians, and standard deviations 317 self.xmean = np.mean(x) 318 self.ymean = np.mean(y) 319 self.xmedian = np.median(x) 320 self.ymedian = np.median(y) 321 self.xstd = np.std(x) 322 self.ystd = np.std(y) 323 324 # Save values for use later 325 self._x = x 326 self._y = y 327 self._w = w 328 329 # Mean and mean absolute differences 330 self.mean_difference = self.md = self.ymean - self.xmean 331 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 332 self.std_difference = self.stdd = np.std( diff ) 333 334 # Relative and standardized differences 335 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 336 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 337 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 338 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 339 340 # Mean and median relative differences 341 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 342 self.mean_absolute_relative_difference = self.mard = np.mean( np.abs( ratio - 1 ) ) 343 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 344 self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) ) 345 346 # Median and median absolute differences 347 self.median_difference = self.medd = np.median( diff ) 348 self.median_absolute_difference = self.medad = np.median( absdiff ) 349 350 # Relative median differences 351 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 352 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 353 354 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 355 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 356 357 # Mean and mean absolute log ratio 358 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 359 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 360 self.std_log10_ratio = self.stdlr= np.std( log10ratio ) 361 362 # Median and median absolute log ratio 363 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 364 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 365 366 # RMS difference 367 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 368 # RMS log ratio 369 self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 ))) 370 371 # Covariance, correlation 372 self.covariance = self.cov = np.cov(x,y)[0][1] 373 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 374 np.corrcoef(x,y)[0][1] 375 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 376 self.R2 = self.r2 = self.R**2
Compute suite of bivariate statistics during initialization
Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit
Parameters
- x (ndarray or str): independent variable values
- y (ndarray or str): dependent variable values, same size as x
- w (ndarray or str, optional): weights for points (x,y), same size as x and y
- dropna (bool, optional (default=False)): drops NaN values from x, y, and w
- data (dict-like, optional): if x, y, or w are str, then they should be keys in data
382 def fitline(self,method='sma',intercept=True,**kwargs): 383 '''Compute bivariate line fit 384 385 Parameters 386 ---------- 387 method : str 388 line fitting method: sma (default), ols, wls, York, sen, siegel 389 intercept : bool 390 defines whether non-zero intercept should be fitted 391 **kwargs 392 passed to `acgc.stats.sma` (e.g. robust=True) 393 394 Returns 395 ------- 396 result : dict 397 dictionary with keys: 398 - slope (float) 399 slope of fitted line 400 - intercept (float) 401 intercept of fitted line 402 - fittedvalues (array (N,)) 403 values on fit line 404 - residuals (array (N,)) 405 residual from fit line 406 ''' 407 408 fitintercept = intercept 409 410 if method.lower()=='sma': 411 fit = sma( self._x, 412 self._y, 413 self._w, 414 intercept=fitintercept, 415 **kwargs) 416 slope = fit['slope'] 417 intercept= fit['intercept'] 418 419 elif method.lower()=='ols': 420 if fitintercept: 421 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 422 self._y, rcond=None ) 423 else: 424 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 425 slope = ols[0][0] 426 intercept = ols[0][1] 427 428 elif method.lower() in ['theil','sen','theilsen']: 429 fitintercept = True 430 fit = sen( self._x, 431 self._y, 432 **kwargs) 433 slope = fit['slope'] 434 intercept = fit['intercept'] 435 436 elif method.lower()=='siegel': 437 fitintercept = True 438 siegel = stats.siegelslopes( self._x, 439 self._y ) 440 slope = siegel.slope 441 intercept = siegel.intercept 442 443 elif method.lower()=='wls': 444 raise NotImplementedError('WLS regression not implemented yet') 445 446 elif method.lower()=='york': 447 raise NotImplementedError('York regression not implemented yet') 448 449 else: 450 raise ValueError('Undefined method '+method) 451 452 line = dict( slope = slope, 453 intercept = intercept, 454 fittedvalues = slope * self._x + intercept, 455 residuals = self._y - ( slope * self._x + intercept ), 456 method = method, 457 fitintercept = fitintercept ) 458 459 return line
Compute bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
acgc.stats.sma(e.g. robust=True)
Returns
- result (dict):
dictionary with keys:
- slope (float) slope of fitted line
- intercept (float) intercept of fitted line
- fittedvalues (array (N,)) values on fit line
- residuals (array (N,)) residual from fit line
461 def slope(self,method='sma',intercept=True,**kwargs): 462 '''Compute slope of bivariate line fit 463 464 Parameters 465 ---------- 466 method : str 467 line fitting method: sma (default), ols, wls 468 intercept : bool 469 defines whether non-zero intercept should be fitted 470 **kwargs 471 passed to `fitline` 472 473 Returns 474 ------- 475 slope : float 476 value of y intercept 477 ''' 478 return self.fitline(method,intercept,**kwargs)['slope']
Compute slope of bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- slope (float): value of y intercept
480 def intercept(self,method='sma',intercept=True,**kwargs): 481 '''Compute intercept of bivariate line fit 482 483 Parameters 484 ---------- 485 method : str 486 line fitting method: sma (default) or ols 487 intercept : bool 488 defines whether non-zero intercept should be fitted 489 **kwargs 490 passed to `fitline` 491 492 Returns 493 ------- 494 intercept : float 495 value of y intercept 496 ''' 497 return self.fitline(method,intercept,**kwargs)['intercept']
Compute intercept of bivariate line fit
Parameters
- method (str): line fitting method: sma (default) or ols
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- intercept (float): value of y intercept
579 def summary(self, variables=None, fitline_kw=None, 580 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 581 stringlength=None ): 582 '''Summarize bivariate statistics 583 584 Parameters 585 ---------- 586 vars : list or str, default='common' 587 names of attribute variables to include in summary 588 names are case insensitive 589 The following strings are also accepted in place of a list 590 "all" (displays all variables) 591 "common" (displays all measures of mean difference) 592 fitline_kw : dict, default=None 593 keywords passed to `fitline` 594 intformat : str, default='{:d}' 595 format specifier for integer values 596 floatformat : str, default='{:.4f}' 597 format specifier for floating point values 598 floatformat_fiteqn : str, default=floatformat 599 format specifier for slope and intercept (a,b) in y = a x + b 600 stringlength : int, default=None 601 length of the variables on output 602 default (None) is to use the length of the longest variable name 603 604 Returns 605 ------- 606 summary : str 607 names and values of variables 608 ''' 609 # List of variables 610 variables = self._expand_variables(variables) 611 612 if floatformat_fiteqn is None: 613 floatformat_fiteqn = floatformat 614 if stringlength is None: 615 stringlength = np.max([len(v) for v in variables]) 616 stringformat = '{:'+str(stringlength)+'s}' 617 618 # Get a dict containing the needed variables 619 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 620 621 # Extract length of the float numbers from floatformat 622 # import re 623 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 624 # floatformat )[0] ) ).astype(int) 625 626 # summary = (stringformat+'{:>10s}').format('Variable','Value') 627 summarytext = '' 628 for k,v in summarydict.items(): 629 vstr = _number2str(v,intformat,floatformat) 630 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 631 632 return summarytext
Summarize bivariate statistics
Parameters
- vars (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference) - fitline_kw (dict, default=None):
keywords passed to
fitline - intformat : str, default='{ (d}'): format specifier for integer values
- floatformat : str, default='{ (.4f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
- summary (str): names and values of variables
634 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 635 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 636 loc=None, loc_units='axes', 637 **kwargs): 638 '''Display bivariate statistics as a table inset on a plot axis 639 640 Parameters 641 ---------- 642 ax : matplotlib.Figure.Axis 643 axis where the table will be displayed 644 variables : list or str, default='common' 645 names of attribute variables to include in summary 646 names are case insensitive 647 The following strings are also accepted in place of a list 648 "all" (displays all variables) 649 "common" (displays all measures of mean difference) 650 fitline_kw : dict, default=None 651 keywords passed to `fitline` 652 intformat : str, default='{:d}' 653 format specifier for integer values 654 floatformat : str, default='{:.3f}' 655 format specifier for floating point values 656 floatformat_fiteqn : str, default=floatformat 657 format specifier for slope and intercept (a,b) in y = a x + b 658 loc : tuple (x0,y0), default=(0.85, 0.05) 659 location on the axis where the table will be drawn 660 can be in data units or axes units [0-1] 661 loc_units : {'axes' (default), 'data'} 662 specifies whether loc has 'data' units or 'axes' units [0-1] 663 664 Returns 665 ------- 666 text1, text2 : matplotlib text object 667 Artist for the two text boxes 668 ''' 669 # List of variables 670 variables = self._expand_variables(variables) 671 672 if floatformat_fiteqn is None: 673 floatformat_fiteqn = floatformat 674 675 # Default location in lower right corner 676 if loc is None: 677 loc = (0.8,0.05) 678 679 # Coordinates for loc 680 if loc_units.lower()=='data': 681 coord=ax.transData 682 elif loc_units.lower() in ['axes','axis']: 683 coord=ax.transAxes 684 else: 685 raise ValueError('Display units should be "Data" or "Axes"') 686 687 # Get a dict containing the needed variables 688 summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn ) 689 690 # Column of label text 691 label_text = '\n'.join([_texify_name(key) 692 for key in summarydict]) 693 # Column of value text 694 value_text = '\n'.join([_number2str(v,intformat,floatformat) 695 for v in summarydict.values()]) 696 697 # Check if horizontal alignment keyword is used 698 ha='' 699 try: 700 ha = kwargs['ha'] 701 except KeyError: 702 pass 703 try: 704 ha = kwargs['horizontalalignment'] 705 except KeyError: 706 pass 707 708 # For right alignment, align on values first 709 # Otherwise, align on labels 710 if ha=='right': 711 first_text = value_text 712 second_text = label_text 713 sign = -1 714 else: 715 first_text = label_text 716 second_text = value_text 717 sign = +1 718 719 # Add first column of text 720 t1=ax.text(loc[0],loc[1], 721 first_text, 722 transform=coord, 723 **kwargs 724 ) 725 726 # Get width of first text column 727 bbox = t1.get_window_extent().transformed(coord.inverted()) 728 width = bbox.x1-bbox.x0 729 730 # Add second column of text 731 t2 = ax.text(loc[0]+width*1.05*sign,loc[1], 732 second_text, 733 transform=coord, 734 **kwargs 735 ) 736 737 ################################## 738 # Early version of this function using matplotlib.table.table() 739 740 # if isinstance(loc,(tuple,list)): 741 # # Create an inset axis to contain the table 742 # tableaxis = ax.inset_axes(loc) 743 # table_width=1 744 # else: 745 # tableaxis = ax 746 747 # # Display the table on the axis 748 # return mtable.table( 749 # tableaxis, 750 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 751 # rowLabels=[texify_name(key) for key in summarydict], 752 # colWidths=[table_width/2]*2, 753 # edges=edges, 754 # loc=loc, bbox=bbox 755 # ) 756 757 return [t1,t2]
Display bivariate statistics as a table inset on a plot axis
Parameters
- ax (matplotlib.Figure.Axis): axis where the table will be displayed
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference) - fitline_kw (dict, default=None):
keywords passed to
fitline - intformat : str, default='{ (d}'): format specifier for integer values
- floatformat : str, default='{ (.3f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- loc (tuple (x0,y0), default=(0.85, 0.05)): location on the axis where the table will be drawn can be in data units or axes units [0-1]
- loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
- text1, text2 (matplotlib text object): Artist for the two text boxes
22def nmb( x0, x1 ): 23 '''Compute Normalized Mean Bias (NMB) 24 25 NMB = ( mean(x1) - mean(x0) ) / mean(x0) 26 27 Parameters 28 ---------- 29 x0 : array_like 30 reference values 31 x1 : array_like 32 experiment values 33 ''' 34 35 assert (len(x0) == len(x1)), \ 36 "Parameters x0 and x1 must have the same length" 37 38 # Mean values 39 x0_mean = np.mean(x0) 40 x1_mean = np.mean(x1) 41 42 # Metric value 43 return x1_mean / x0_mean - 1
Compute Normalized Mean Bias (NMB)
NMB = ( mean(x1) - mean(x0) ) / mean(x0)
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
45def nmae( x0, x1 ): 46 '''Compute Normalized Mean Absolute Error (NMAE) 47 48 NMAE = mean(abs(x1 - x0)) / abs(mean(x0)) 49 50 Parameters 51 --------- 52 x0 : array_like 53 reference values 54 x1 : array_like 55 experiment values 56 ''' 57 58 # Mean values 59 x0_mean = np.mean(x0) 60 61 # Mean absolute difference 62 abs_diff = np.mean( np.abs(x1 - x0) ) 63 64 # Metric value 65 return abs_diff / np.abs( x0_mean )
Compute Normalized Mean Absolute Error (NMAE)
NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
68def nmbf( x0, x1 ): 69 '''Compute Normalized Mean Bias Factor (NMBF) 70 71 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 72 73 Parameters 74 ---------- 75 x0 : array_like 76 reference values 77 x1 : array_like 78 experiment values 79 ''' 80 81 # Ensure that arguments have the same length 82 assert (len(x0) == len(x1)), \ 83 "Parameters x0 and x1 must have the same length" 84 85 # Mean values 86 x0_mean = np.mean(x0) 87 x1_mean = np.mean(x1) 88 89 # Metric value 90 if x1_mean >= x0_mean: 91 result = x1_mean / x0_mean - 1 92 else: 93 result= 1 - x0_mean / x1_mean 94 # Equivalent (faster?) implementation 95 #S = (mMean - oMean) / np.abs(mMean - oMean) 96 #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 ) 97 98 return result
Compute Normalized Mean Bias Factor (NMBF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
100def nmaef( x0, x1 ): 101 '''Compute Normalized Mean Absolute Error Factor (NMAEF) 102 103 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 104 105 Parameters 106 ---------- 107 x0 : array_like 108 reference values 109 x1 : array_like 110 experiment values 111 ''' 112 113 # Ensure that arguments have the same length 114 assert (len(x0) == len(x1)), \ 115 "Parameters x0 and x1 must have the same length" 116 117 # Mean values 118 x0_mean = np.mean(x0) 119 x1_mean = np.mean(x1) 120 121 # Mean absolute difference 122 abs_diff = np.mean( np.abs(x1 - x0)) 123 124 # Metric value 125 if x1_mean >= x0_mean: 126 result = abs_diff / x0_mean 127 else: 128 result = abs_diff / x1_mean 129 # Equivalent (faster?) implementation 130 #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean) 131 #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) ) 132 133 return result
Compute Normalized Mean Absolute Error Factor (NMAEF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values