{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module    : Statistics.Sample
-- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Commonly used sample statistics, also known as descriptive
-- statistics.

module Statistics.Sample
    (
    -- * Types
      Sample
    , WeightedSample
    -- * Descriptive functions
    , range

    -- * Statistics of location
    , mean
    , welfordMean
    , meanWeighted
    , harmonicMean
    , geometricMean

    -- * Statistics of dispersion
    -- $variance

    -- ** Functions over central moments
    , centralMoment
    , centralMoments
    , skewness
    , kurtosis

    -- ** Two-pass functions (numerically robust)
    -- $robust
    , variance
    , varianceUnbiased
    , meanVariance
    , meanVarianceUnb
    , stdDev
    , varianceWeighted
    , stdErrMean

    -- ** Single-pass functions (faster, less safe)
    -- $cancellation
    , fastVariance
    , fastVarianceUnbiased
    , fastStdDev

    -- * Joint distributions
    , covariance
    , correlation
    , pair
    -- * References
    -- $references
    ) where

import Statistics.Function (minMax)
import Statistics.Sample.Internal (robustSumVar, sum)
import Statistics.Types.Internal  (Sample,WeightedSample)
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

-- Operator ^ will be overridden
import Prelude hiding ((^), sum)

-- | /O(n)/ Range. The difference between the largest and smallest
-- elements of a sample.
range :: (G.Vector v Double) => v Double -> Double
range :: v Double -> Double
range v Double
s = Double
hi Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
lo
    where (Double
lo , Double
hi) = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
minMax v Double
s
{-# INLINE range #-}

-- | /O(n)/ Arithmetic mean.  This uses Kahan-Babuška-Neumaier
-- summation, so is more accurate than 'welfordMean' unless the input
-- values are very large.
mean :: (G.Vector v Double) => v Double -> Double
mean :: v Double -> Double
mean v Double
xs = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
sum v Double
xs Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs)
{-# SPECIALIZE mean :: U.Vector Double -> Double #-}
{-# SPECIALIZE mean :: V.Vector Double -> Double #-}

-- | /O(n)/ Arithmetic mean.  This uses Welford's algorithm to provide
-- numerical stability, using a single pass over the sample data.
--
-- Compared to 'mean', this loses a surprising amount of precision
-- unless the inputs are very large.
welfordMean :: (G.Vector v Double) => v Double -> Double
welfordMean :: v Double -> Double
welfordMean = T -> Double
fini (T -> Double) -> (v Double -> T) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (T -> Double -> T) -> T -> v Double -> T
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' T -> Double -> T
go (Double -> Int -> T
T Double
0 Int
0)
  where
    fini :: T -> Double
fini (T Double
a Int
_) = Double
a
    go :: T -> Double -> T
go (T Double
m Int
n) Double
x = Double -> Int -> T
T Double
m' Int
n'
        where m' :: Double
m' = Double
m Double -> Double -> Double
forall a. Num a => a -> a -> a
+ (Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n'
              n' :: Int
n' = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
{-# SPECIALIZE welfordMean :: U.Vector Double -> Double #-}
{-# SPECIALIZE welfordMean :: V.Vector Double -> Double #-}

-- | /O(n)/ Arithmetic mean for weighted sample. It uses a single-pass
-- algorithm analogous to the one used by 'welfordMean'.
meanWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double
meanWeighted :: v (Double, Double) -> Double
meanWeighted = V -> Double
fini (V -> Double)
-> (v (Double, Double) -> V) -> v (Double, Double) -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (V -> (Double, Double) -> V) -> V -> v (Double, Double) -> V
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' V -> (Double, Double) -> V
go (Double -> Double -> V
V Double
0 Double
0)
    where
      fini :: V -> Double
fini (V Double
a Double
_) = Double
a
      go :: V -> (Double, Double) -> V
go (V Double
m Double
w) (Double
x,Double
xw) = Double -> Double -> V
V Double
m' Double
w'
          where m' :: Double
m' | Double
w' Double -> Double -> Bool
forall a. Eq a => a -> a -> Bool
== Double
0   = Double
0
                   | Bool
otherwise = Double
m Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
xw Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
w'
                w' :: Double
w' = Double
w Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
xw
{-# INLINE meanWeighted #-}

-- | /O(n)/ Harmonic mean.  This algorithm performs a single pass over
-- the sample.
harmonicMean :: (G.Vector v Double) => v Double -> Double
harmonicMean :: v Double -> Double
harmonicMean = T -> Double
fini (T -> Double) -> (v Double -> T) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (T -> Double -> T) -> T -> v Double -> T
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' T -> Double -> T
go (Double -> Int -> T
T Double
0 Int
0)
  where
    fini :: T -> Double
fini (T Double
b Int
a) = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
a Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
b
    go :: T -> Double -> T
go (T Double
x Int
y) Double
n = Double -> Int -> T
T (Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
+ (Double
1Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
n)) (Int
yInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
{-# INLINE harmonicMean #-}

-- | /O(n)/ Geometric mean of a sample containing no negative values.
geometricMean :: (G.Vector v Double) => v Double -> Double
geometricMean :: v Double -> Double
geometricMean = Double -> Double
forall a. Floating a => a -> a
exp (Double -> Double) -> (v Double -> Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean (v Double -> Double)
-> (v Double -> v Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map Double -> Double
forall a. Floating a => a -> a
log
{-# INLINE geometricMean #-}

-- | Compute the /k/th central moment of a sample.  The central moment
-- is also known as the moment about the mean.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
centralMoment :: (G.Vector v Double) => Int -> v Double -> Double
centralMoment :: Int -> v Double -> Double
centralMoment Int
a v Double
xs
    | Int
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0  = [Char] -> Double
forall a. HasCallStack => [Char] -> a
error [Char]
"Statistics.Sample.centralMoment: negative input"
    | Int
a Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Double
1
    | Int
a Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = Double
0
    | Bool
otherwise = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
sum ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map Double -> Double
go v Double
xs) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs)
  where
    go :: Double -> Double
go Double
x = (Double
xDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
m) Double -> Int -> Double
^ Int
a
    m :: Double
m    = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
xs
{-# SPECIALIZE centralMoment :: Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE centralMoment :: Int -> V.Vector Double -> Double #-}

-- | Compute the /k/th and /j/th central moments of a sample.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double)
centralMoments :: Int -> Int -> v Double -> (Double, Double)
centralMoments Int
a Int
b v Double
xs
    | Int
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2 Bool -> Bool -> Bool
|| Int
b Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2 = (Int -> v Double -> Double
forall (v :: * -> *). Vector v Double => Int -> v Double -> Double
centralMoment Int
a v Double
xs , Int -> v Double -> Double
forall (v :: * -> *). Vector v Double => Int -> v Double -> Double
centralMoment Int
b v Double
xs)
    | Bool
otherwise      = V -> (Double, Double)
fini (V -> (Double, Double))
-> (v Double -> V) -> v Double -> (Double, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (V -> Double -> V) -> V -> v Double -> V
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' V -> Double -> V
go (Double -> Double -> V
V Double
0 Double
0) (v Double -> (Double, Double)) -> v Double -> (Double, Double)
forall a b. (a -> b) -> a -> b
$ v Double
xs
  where go :: V -> Double -> V
go (V Double
i Double
j) Double
x = Double -> Double -> V
V (Double
i Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
dDouble -> Int -> Double
^Int
a) (Double
j Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
dDouble -> Int -> Double
^Int
b)
            where d :: Double
d  = Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m
        fini :: V -> (Double, Double)
fini (V Double
i Double
j) = (Double
i Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
n , Double
j Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
n)
        m :: Double
m            = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
xs
        n :: Double
n            = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs)
{-# SPECIALIZE
    centralMoments :: Int -> Int -> U.Vector Double -> (Double, Double) #-}
{-# SPECIALIZE
    centralMoments :: Int -> Int -> V.Vector Double -> (Double, Double) #-}

-- | Compute the skewness of a sample. This is a measure of the
-- asymmetry of its distribution.
--
-- A sample with negative skew is said to be /left-skewed/.  Most of
-- its mass is on the right of the distribution, with the tail on the
-- left.
--
-- > skewness $ U.to [1,100,101,102,103]
-- > ==> -1.497681449918257
--
-- A sample with positive skew is said to be /right-skewed/.
--
-- > skewness $ U.to [1,2,3,4,100]
-- > ==> 1.4975367033335198
--
-- A sample's skewness is not defined if its 'variance' is zero.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
skewness :: (G.Vector v Double) => v Double -> Double
skewness :: v Double -> Double
skewness v Double
xs = Double
c3 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
c2 Double -> Double -> Double
forall a. Floating a => a -> a -> a
** (-Double
1.5)
    where (Double
c3 , Double
c2) = Int -> Int -> v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
Int -> Int -> v Double -> (Double, Double)
centralMoments Int
3 Int
2 v Double
xs
{-# SPECIALIZE skewness :: U.Vector Double -> Double #-}
{-# SPECIALIZE skewness :: V.Vector Double -> Double #-}

-- | Compute the excess kurtosis of a sample.  This is a measure of
-- the \"peakedness\" of its distribution.  A high kurtosis indicates
-- that more of the sample's variance is due to infrequent severe
-- deviations, rather than more frequent modest deviations.
--
-- A sample's excess kurtosis is not defined if its 'variance' is
-- zero.
--
-- This function performs two passes over the sample, so is not subject
-- to stream fusion.
--
-- For samples containing many values very close to the mean, this
-- function is subject to inaccuracy due to catastrophic cancellation.
kurtosis :: (G.Vector v Double) => v Double -> Double
kurtosis :: v Double -> Double
kurtosis v Double
xs = Double
c4 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
c2 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
c2) Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
3
    where (Double
c4 , Double
c2) = Int -> Int -> v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
Int -> Int -> v Double -> (Double, Double)
centralMoments Int
4 Int
2 v Double
xs
{-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-}
{-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-}

-- $variance
--
-- The variance — and hence the standard deviation — of a
-- sample of fewer than two elements are both defined to be zero.

-- $robust
--
-- These functions use the compensated summation algorithm of Chan et
-- al. for numerical robustness, but require two passes over the
-- sample data as a result.
--
-- Because of the need for two passes, these functions are /not/
-- subject to stream fusion.

data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | Maximum likelihood estimate of a sample's variance.  Also known
-- as the population variance, where the denominator is /n/.
variance :: (G.Vector v Double) => v Double -> Double
variance :: v Double -> Double
variance v Double
samp
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> v Double -> Double
robustSumVar (v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
samp) v Double
samp Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n
    | Bool
otherwise = Double
0
    where
      n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
samp
{-# SPECIALIZE variance :: U.Vector Double -> Double #-}
{-# SPECIALIZE variance :: V.Vector Double -> Double #-}


-- | Unbiased estimate of a sample's variance.  Also known as the
-- sample variance, where the denominator is /n/-1.
varianceUnbiased :: (G.Vector v Double) => v Double -> Double
varianceUnbiased :: v Double -> Double
varianceUnbiased v Double
samp
    | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> v Double -> Double
robustSumVar (v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
samp) v Double
samp Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1)
    | Bool
otherwise = Double
0
    where
      n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
samp
{-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-}
{-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-}

-- | Calculate mean and maximum likelihood estimate of variance. This
-- function should be used if both mean and variance are required
-- since it will calculate mean only once.
meanVariance ::  (G.Vector v Double) => v Double -> (Double,Double)
meanVariance :: v Double -> (Double, Double)
meanVariance v Double
samp
  | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = (Double
m, Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> v Double -> Double
robustSumVar Double
m v Double
samp Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n)
  | Bool
otherwise = (Double
m, Double
0)
    where
      n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
samp
      m :: Double
m = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
samp
{-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-}
{-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-}

-- | Calculate mean and unbiased estimate of variance. This
-- function should be used if both mean and variance are required
-- since it will calculate mean only once.
meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double)
meanVarianceUnb :: v Double -> (Double, Double)
meanVarianceUnb v Double
samp
  | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = (Double
m, Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> v Double -> Double
robustSumVar Double
m v Double
samp Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1))
  | Bool
otherwise = (Double
m, Double
0)
    where
      n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
samp
      m :: Double
m = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
samp
{-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-}
{-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-}

-- | Standard deviation.  This is simply the square root of the
-- unbiased estimate of the variance.
stdDev :: (G.Vector v Double) => v Double -> Double
stdDev :: v Double -> Double
stdDev = Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> (v Double -> Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
varianceUnbiased
{-# SPECIALIZE stdDev :: U.Vector Double -> Double #-}
{-# SPECIALIZE stdDev :: V.Vector Double -> Double #-}

-- | Standard error of the mean. This is the standard deviation
-- divided by the square root of the sample size.
stdErrMean :: (G.Vector v Double) => v Double -> Double
stdErrMean :: v Double -> Double
stdErrMean v Double
samp = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
stdDev v Double
samp Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> (v Double -> Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> (v Double -> Int) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length) v Double
samp
{-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-}
{-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-}

robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V
robustSumVarWeighted :: v (Double, Double) -> V
robustSumVarWeighted v (Double, Double)
samp = (V -> (Double, Double) -> V) -> V -> v (Double, Double) -> V
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' V -> (Double, Double) -> V
go (Double -> Double -> V
V Double
0 Double
0) v (Double, Double)
samp
    where
      go :: V -> (Double, Double) -> V
go (V Double
s Double
w) (Double
x,Double
xw) = Double -> Double -> V
V (Double
s Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
xwDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
dDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
d) (Double
w Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
xw)
          where d :: Double
d = Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m
      m :: Double
m = v (Double, Double) -> Double
forall (v :: * -> *).
Vector v (Double, Double) =>
v (Double, Double) -> Double
meanWeighted v (Double, Double)
samp
{-# INLINE robustSumVarWeighted #-}

-- | Weighted variance. This is biased estimation.
varianceWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double
varianceWeighted :: v (Double, Double) -> Double
varianceWeighted v (Double, Double)
samp
    | v (Double, Double) -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v (Double, Double)
samp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 = V -> Double
fini (V -> Double) -> V -> Double
forall a b. (a -> b) -> a -> b
$ v (Double, Double) -> V
forall (v :: * -> *).
Vector v (Double, Double) =>
v (Double, Double) -> V
robustSumVarWeighted v (Double, Double)
samp
    | Bool
otherwise         = Double
0
    where
      fini :: V -> Double
fini (V Double
s Double
w) = Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
w
{-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-}
{-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-}

-- $cancellation
--
-- The functions prefixed with the name @fast@ below perform a single
-- pass over the sample data using Knuth's algorithm. They usually
-- work well, but see below for caveats. These functions are subject
-- to array fusion.
--
-- /Note/: in cases where most sample data is close to the sample's
-- mean, Knuth's algorithm gives inaccurate results due to
-- catastrophic cancellation.

fastVar :: (G.Vector v Double) => v Double -> T1
fastVar :: v Double -> T1
fastVar = (T1 -> Double -> T1) -> T1 -> v Double -> T1
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
G.foldl' T1 -> Double -> T1
go (Int -> Double -> Double -> T1
T1 Int
0 Double
0 Double
0)
  where
    go :: T1 -> Double -> T1
go (T1 Int
n Double
m Double
s) Double
x = Int -> Double -> Double -> T1
T1 Int
n' Double
m' Double
s'
      where n' :: Int
n' = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
            m' :: Double
m' = Double
m Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
d Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n'
            s' :: Double
s' = Double
s Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
d Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m')
            d :: Double
d  = Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
m

-- | Maximum likelihood estimate of a sample's variance.
fastVariance :: (G.Vector v Double) => v Double -> Double
fastVariance :: v Double -> Double
fastVariance = T1 -> Double
fini (T1 -> Double) -> (v Double -> T1) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> T1
forall (v :: * -> *). Vector v Double => v Double -> T1
fastVar
  where fini :: T1 -> Double
fini (T1 Int
n Double
_m Double
s)
          | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n
          | Bool
otherwise = Double
0
{-# INLINE fastVariance #-}

-- | Unbiased estimate of a sample's variance.
fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double
fastVarianceUnbiased :: v Double -> Double
fastVarianceUnbiased = T1 -> Double
fini (T1 -> Double) -> (v Double -> T1) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> T1
forall (v :: * -> *). Vector v Double => v Double -> T1
fastVar
  where fini :: T1 -> Double
fini (T1 Int
n Double
_m Double
s)
          | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1     = Double
s Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
          | Bool
otherwise = Double
0
{-# INLINE fastVarianceUnbiased #-}

-- | Standard deviation.  This is simply the square root of the
-- maximum likelihood estimate of the variance.
fastStdDev :: (G.Vector v Double) => v Double -> Double
fastStdDev :: v Double -> Double
fastStdDev = Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> (v Double -> Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
fastVariance
{-# INLINE fastStdDev #-}

-- | Covariance of sample of pairs. For empty sample it's set to
--   zero
covariance :: (G.Vector v (Double,Double), G.Vector v Double)
           => v (Double,Double)
           -> Double
covariance :: v (Double, Double) -> Double
covariance v (Double, Double)
xy
  | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0    = Double
0
  | Bool
otherwise = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double -> Double -> Double) -> v Double -> v Double -> v Double
forall (v :: * -> *) a b c.
(Vector v a, Vector v b, Vector v c) =>
(a -> b -> c) -> v a -> v b -> v c
G.zipWith Double -> Double -> Double
forall a. Num a => a -> a -> a
(*)
                         ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (\Double
x -> Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
muX) v Double
xs)
                         ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (\Double
y -> Double
y Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
muY) v Double
ys)
  where
    n :: Int
n       = v (Double, Double) -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v (Double, Double)
xy
    (v Double
xs,v Double
ys) = v (Double, Double) -> (v Double, v Double)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v (a, b) -> (v a, v b)
G.unzip v (Double, Double)
xy
    muX :: Double
muX     = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
xs
    muY :: Double
muY     = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean v Double
ys
{-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-}
{-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-}

-- | Correlation coefficient for sample of pairs. Also known as
--   Pearson's correlation. For empty sample it's set to zero.
correlation :: (G.Vector v (Double,Double), G.Vector v Double)
           => v (Double,Double)
           -> Double
correlation :: v (Double, Double) -> Double
correlation v (Double, Double)
xy
  | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0    = Double
0
  | Bool
otherwise = Double
cov Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double -> Double
forall a. Floating a => a -> a
sqrt (Double
varX Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
varY)
  where
    n :: Int
n       = v (Double, Double) -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v (Double, Double)
xy
    (v Double
xs,v Double
ys) = v (Double, Double) -> (v Double, v Double)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v (a, b) -> (v a, v b)
G.unzip v (Double, Double)
xy
    (Double
muX,Double
varX) = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
meanVariance v Double
xs
    (Double
muY,Double
varY) = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
meanVariance v Double
ys
    cov :: Double
cov = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
mean (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double -> Double -> Double) -> v Double -> v Double -> v Double
forall (v :: * -> *) a b c.
(Vector v a, Vector v b, Vector v c) =>
(a -> b -> c) -> v a -> v b -> v c
G.zipWith Double -> Double -> Double
forall a. Num a => a -> a -> a
(*)
            ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (\Double
x -> Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
muX) v Double
xs)
            ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (\Double
y -> Double
y Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
muY) v Double
ys)
{-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-}
{-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-}


-- | Pair two samples. It's like 'G.zip' but requires that both
--   samples have equal size.
pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b)
pair :: v a -> v b -> v (a, b)
pair v a
va v b
vb
  | v a -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v a
va Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== v b -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v b
vb = v a -> v b -> v (a, b)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v a -> v b -> v (a, b)
G.zip v a
va v b
vb
  | Bool
otherwise = [Char] -> v (a, b)
forall a. HasCallStack => [Char] -> a
error [Char]
"Statistics.Sample.pair: vector must have same length"
{-# INLINE pair #-}

------------------------------------------------------------------------
-- Helper code. Monomorphic unpacked accumulators.

-- (^) operator from Prelude is just slow.
(^) :: Double -> Int -> Double
Double
x ^ :: Double -> Int -> Double
^ Int
1 = Double
x
Double
x ^ Int
n = Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Double
x Double -> Int -> Double
^ (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1))
{-# INLINE (^) #-}

-- don't support polymorphism, as we can't get unboxed returns if we use it.
data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int

data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double

{-

Consider this core:

with data T a = T !a !Int

$wfold :: Double#
               -> Int#
               -> Int#
               -> (# Double, Int# #)

and without,

$wfold :: Double#
               -> Int#
               -> Int#
               -> (# Double#, Int# #)

yielding to boxed returns and heap checks.

-}

-- $references
--
-- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae
--   and a pairwise algorithm for computing sample
--   variances. Technical Report STAN-CS-79-773, Department of
--   Computer Science, Stanford
--   University. <ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf>
--
-- * Knuth, D.E. (1998) The art of computer programming, volume 2:
--   seminumerical algorithms, 3rd ed., p. 232.
--
-- * Welford, B.P. (1962) Note on a method for calculating corrected
--   sums of squares and products. /Technometrics/
--   4(3):419&#8211;420. <http://www.jstor.org/stable/1266577>
--
-- * West, D.H.D. (1979) Updating mean and variance estimates: an
--   improved method. /Communications of the ACM/
--   22(9):532&#8211;535. <http://doi.acm.org/10.1145/359146.359153>