{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
-- |
-- Module    : Statistics.Distribution
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Type classes for probability distributions

module Statistics.Distribution
    (
      -- * Type classes
      Distribution(..)
    , DiscreteDistr(..)
    , ContDistr(..)
      -- ** Distribution statistics
    , MaybeMean(..)
    , Mean(..)
    , MaybeVariance(..)
    , Variance(..)
    , MaybeEntropy(..)
    , Entropy(..)
    , FromSample(..)
      -- ** Random number generation
    , ContGen(..)
    , DiscreteGen(..)
    , genContinuous
      -- * Helper functions
    , findRoot
    , sumProbabilities
    ) where

import Prelude hiding (sum)
import Statistics.Function        (square)
import Statistics.Sample.Internal (sum)
import System.Random.Stateful     (StatefulGen, uniformDouble01M)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G


-- | Type class common to all distributions. Only c.d.f. could be
-- defined for both discrete and continuous distributions.
class Distribution d where
    -- | Cumulative distribution function.  The probability that a
    -- random variable /X/ is less or equal than /x/,
    -- i.e. P(/X/≤/x/). Cumulative should be defined for
    -- infinities as well:
    --
    -- > cumulative d +∞ = 1
    -- > cumulative d -∞ = 0
    cumulative :: d -> Double -> Double
    cumulative d
d Double
x = Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- d -> Double -> Double
forall d. Distribution d => d -> Double -> Double
complCumulative d
d Double
x
    -- | One's complement of cumulative distribution:
    --
    -- > complCumulative d x = 1 - cumulative d x
    --
    -- It's useful when one is interested in P(/X/>/x/) and
    -- expression on the right side begin to lose precision. This
    -- function have default implementation but implementors are
    -- encouraged to provide more precise implementation.
    complCumulative :: d -> Double -> Double
    complCumulative d
d Double
x = Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- d -> Double -> Double
forall d. Distribution d => d -> Double -> Double
cumulative d
d Double
x
    {-# MINIMAL (cumulative | complCumulative) #-}


-- | Discrete probability distribution.
class Distribution  d => DiscreteDistr d where
    -- | Probability of n-th outcome.
    probability :: d -> Int -> Double
    probability d
d = Double -> Double
forall a. Floating a => a -> a
exp (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Int -> Double
forall d. DiscreteDistr d => d -> Int -> Double
logProbability d
d
    -- | Logarithm of probability of n-th outcome
    logProbability :: d -> Int -> Double
    logProbability d
d = Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Int -> Double
forall d. DiscreteDistr d => d -> Int -> Double
probability d
d
    {-# MINIMAL (probability | logProbability) #-}

-- | Continuous probability distribution.
--
--   Minimal complete definition is 'quantile' and either 'density' or
--   'logDensity'.
class Distribution d => ContDistr d where
    -- | Probability density function. Probability that random
    -- variable /X/ lies in the infinitesimal interval
    -- [/x/,/x+/δ/x/) equal to /density(x)/⋅δ/x/
    density :: d -> Double -> Double
    density d
d = Double -> Double
forall a. Floating a => a -> a
exp (Double -> Double) -> (Double -> Double) -> Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
logDensity d
d
    -- | Natural logarithm of density.
    logDensity :: d -> Double -> Double
    logDensity d
d = Double -> Double
forall a. Floating a => a -> a
log (Double -> Double) -> (Double -> Double) -> Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
density d
d
    -- | Inverse of the cumulative distribution function. The value
    -- /x/ for which P(/X/≤/x/) = /p/. If probability is outside
    -- of [0,1] range function should call 'error'
    quantile :: d -> Double -> Double
    quantile d
d Double
x = d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
complQuantile d
d (Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
x)
    -- | 1-complement of @quantile@:
    --
    -- > complQuantile x ≡ quantile (1 - x)
    complQuantile :: d -> Double -> Double
    complQuantile d
d Double
x = d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
quantile d
d (Double
1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
x)
    {-# MINIMAL (density | logDensity), (quantile | complQuantile) #-}

-- | Type class for distributions with mean. 'maybeMean' should return
--   'Nothing' if it's undefined for current value of data
class Distribution d => MaybeMean d where
    maybeMean :: d -> Maybe Double

-- | Type class for distributions with mean. If a distribution has
--   finite mean for all valid values of parameters it should be
--   instance of this type class.
class MaybeMean d => Mean d where
    mean :: d -> Double



-- | Type class for distributions with variance. If variance is
--   undefined for some parameter values both 'maybeVariance' and
--   'maybeStdDev' should return Nothing.
--
--   Minimal complete definition is 'maybeVariance' or 'maybeStdDev'
class MaybeMean d => MaybeVariance d where
    maybeVariance :: d -> Maybe Double
    maybeVariance = (Double -> Double) -> Maybe Double -> Maybe Double
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Double -> Double
square (Maybe Double -> Maybe Double)
-> (d -> Maybe Double) -> d -> Maybe Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Maybe Double
forall d. MaybeVariance d => d -> Maybe Double
maybeStdDev
    maybeStdDev   :: d -> Maybe Double
    maybeStdDev   = (Double -> Double) -> Maybe Double -> Maybe Double
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Double -> Double
forall a. Floating a => a -> a
sqrt (Maybe Double -> Maybe Double)
-> (d -> Maybe Double) -> d -> Maybe Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Maybe Double
forall d. MaybeVariance d => d -> Maybe Double
maybeVariance
    {-# MINIMAL (maybeVariance | maybeStdDev) #-}

-- | Type class for distributions with variance. If distribution have
--   finite variance for all valid parameter values it should be
--   instance of this type class.
--
--   Minimal complete definition is 'variance' or 'stdDev'
class (Mean d, MaybeVariance d) => Variance d where
    variance :: d -> Double
    variance d
d = Double -> Double
square (d -> Double
forall d. Variance d => d -> Double
stdDev d
d)
    stdDev   :: d -> Double
    stdDev = Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> (d -> Double) -> d -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. d -> Double
forall d. Variance d => d -> Double
variance
    {-# MINIMAL (variance | stdDev) #-}


-- | Type class for distributions with entropy, meaning Shannon entropy
--   in the case of a discrete distribution, or differential entropy in the
--   case of a continuous one.  'maybeEntropy' should return 'Nothing' if
--   entropy is undefined for the chosen parameter values.
class (Distribution d) => MaybeEntropy d where
  -- | Returns the entropy of a distribution, in nats, if such is defined.
  maybeEntropy :: d -> Maybe Double

-- | Type class for distributions with entropy, meaning Shannon
--   entropy in the case of a discrete distribution, or differential
--   entropy in the case of a continuous one.  If the distribution has
--   well-defined entropy for all valid parameter values then it
--   should be an instance of this type class.
class (MaybeEntropy d) => Entropy d where
  -- | Returns the entropy of a distribution, in nats.
  entropy :: d -> Double

-- | Generate discrete random variates which have given
--   distribution.
class Distribution d => ContGen d where
  genContVar :: (StatefulGen g m) => d -> g -> m Double

-- | Generate discrete random variates which have given
--   distribution. 'ContGen' is superclass because it's always possible
--   to generate real-valued variates from integer values
class (DiscreteDistr d, ContGen d) => DiscreteGen d where
  genDiscreteVar :: (StatefulGen g m) => d -> g -> m Int

-- | Estimate distribution from sample. First parameter in sample is
--   distribution type and second is element type.
class FromSample d a where
  -- | Estimate distribution from sample. Returns nothing is there's
  --   not enough data to estimate or sample clearly doesn't come from
  --   distribution in question. For example if there's negative
  --   samples in exponential distribution.
  fromSample :: G.Vector v a => v a -> Maybe d


-- | Generate variates from continuous distribution using inverse
--   transform rule.
genContinuous :: (ContDistr d, StatefulGen g m) => d -> g -> m Double
genContinuous :: d -> g -> m Double
genContinuous d
d g
gen = do
  Double
x <- g -> m Double
forall g (m :: * -> *). StatefulGen g m => g -> m Double
uniformDouble01M g
gen
  Double -> m Double
forall (m :: * -> *) a. Monad m => a -> m a
return (Double -> m Double) -> Double -> m Double
forall a b. (a -> b) -> a -> b
$! d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
quantile d
d Double
x

data P = P {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | Approximate the value of /X/ for which P(/x/>/X/)=/p/.
--
-- This method uses a combination of Newton-Raphson iteration and
-- bisection with the given guess as a starting point.  The upper and
-- lower bounds specify the interval in which the probability
-- distribution reaches the value /p/.
findRoot :: ContDistr d =>
            d                   -- ^ Distribution
         -> Double              -- ^ Probability /p/
         -> Double              -- ^ Initial guess
         -> Double              -- ^ Lower bound on interval
         -> Double              -- ^ Upper bound on interval
         -> Double
findRoot :: d -> Double -> Double -> Double -> Double -> Double
findRoot d
d Double
prob = Int -> Double -> Double -> Double -> Double -> Double
loop Int
0 Double
1
  where
    loop :: Int -> Double -> Double -> Double -> Double -> Double
loop !(Int
i::Int) !Double
dx !Double
x !Double
lo !Double
hi
      | Double -> Double
forall a. Num a => a -> a
abs Double
dx Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
accuracy Bool -> Bool -> Bool
|| Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
maxIters = Double
x
      | Bool
otherwise                           = Int -> Double -> Double -> Double -> Double -> Double
loop (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1) Double
dx'' Double
x'' Double
lo' Double
hi'
      where
        err :: Double
err                   = d -> Double -> Double
forall d. Distribution d => d -> Double -> Double
cumulative d
d Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
prob
        P Double
lo' Double
hi' | Double
err Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
0   = Double -> Double -> P
P Double
x Double
hi
                  | Bool
otherwise = Double -> Double -> P
P Double
lo Double
x
        pdf :: Double
pdf                   = d -> Double -> Double
forall d. ContDistr d => d -> Double -> Double
density d
d Double
x
        P Double
dx' Double
x' | Double
pdf Double -> Double -> Bool
forall a. Eq a => a -> a -> Bool
/= Double
0   = Double -> Double -> P
P (Double
err Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
pdf) (Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
dx)
                 | Bool
otherwise  = Double -> Double -> P
P Double
dx Double
x
        P Double
dx'' Double
x''
            | Double
x' Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
lo' Bool -> Bool -> Bool
|| Double
x' Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
hi' Bool -> Bool -> Bool
|| Double
pdf Double -> Double -> Bool
forall a. Eq a => a -> a -> Bool
== Double
0 = let y :: Double
y = (Double
lo' Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double
hi') Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
2
                                                 in  Double -> Double -> P
P (Double
yDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
x) Double
y
            | Bool
otherwise                        = Double -> Double -> P
P Double
dx' Double
x'
    accuracy :: Double
accuracy = Double
1e-15
    maxIters :: Int
maxIters = Int
150

-- | Sum probabilities in inclusive interval.
sumProbabilities :: DiscreteDistr d => d -> Int -> Int -> Double
sumProbabilities :: d -> Int -> Int -> Double
sumProbabilities d
d Int
low Int
hi =
  -- Return value is forced to be less than 1 to guard against roundoff errors.
  -- ATTENTION! this check should be removed for testing or it could mask bugs.
  Double -> Double -> Double
forall a. Ord a => a -> a -> a
min Double
1 (Double -> Double)
-> (Vector Int -> Double) -> Vector Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
sum (Vector Double -> Double)
-> (Vector Int -> Vector Double) -> Vector Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Double) -> Vector Int -> Vector Double
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
U.map (d -> Int -> Double
forall d. DiscreteDistr d => d -> Int -> Double
probability d
d) (Vector Int -> Double) -> Vector Int -> Double
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Vector Int
forall a. (Unbox a, Enum a) => a -> a -> Vector a
U.enumFromTo Int
low Int
hi