% $Cambridge: hermes/doc/antiforgery/stats.tex,v 1.9 2009/02/23 18:32:47 fanf2 Exp $

\documentclass[a4paper]{article}

\usepackage{fullpage}
\usepackage{url}

\DeclareUrlCommand\email{%
  \def\UrlLeft{<}%
  \def\UrlRight{>}%
  \urlstyle{tt}%
}

\setlength{\parindent}{0.0in}
\setlength{\parskip}{0.0in}

% rubric

\author{
  Tony~Finch\\
  \email{fanf2@cam.ac.uk} \email{dot@dotat.at} \\
  University of Cambridge Computing Service\\
}

\title{
  Incremental calculation of weighted mean and variance
}

\date{February 2009}

\begin{document}

\maketitle

\begin{abstract}
  In these notes I explain how to derive formulae for numerically
  stable calculation of the mean and standard deviation, which are
  also suitable for incremental on-line calculation. I then generalize
  these formulae to weighted means and standard deviations. I unpick
  the difficulties that arise when generalizing further to normalized
  weights. Finally I show that the exponentially weighted moving
  average is a special case of the incremental normalized weighted
  mean formula, and derive a formula for the exponentially weighted
  moving standard deviation.
\end{abstract}

%

\section{Simple mean}

Straightforward translation of equation~\ref{mean-simple} into code
can suffer from loss of precision because of the difference in
magnitude between a sample and the sum of all samples.
Equation~\ref{mean-incr} calculates the mean in a way that is more
numerically stable because it avoids accumulating large sums.
\begin{eqnarray}
  \mu_n &=& \frac{1}{n}         \sum_{i=1}^{n} x_i      \label{mean-simple}\\
        &=& \frac{1}{n} ( x_n + \sum_{i=1}^{n-1} x_i  ) \\
        &=& \frac{1}{n} ( x_n + (n-1) \mu_{n-1}       ) \\
        &=& \mu_{n-1} + \frac{1}{n} ( x_n - \mu_{n-1} ) \label{mean-incr}
\end{eqnarray}

This formula also provides us with some useful identities.
\begin{eqnarray}
    x_n - \mu_{n-1} &=&            n  ( \mu_n - \mu_{n-1} )           \label{simple-id-2}\\
    x_n - \mu_n     &=&            n  ( \mu_n - \mu_{n-1} ) - \mu_n + \mu_{n-1} \nonumber\\
                    &=&      (n-1)    ( \mu_n - \mu_{n-1} )           \label{simple-id-3}
\end{eqnarray}

%

\section{Simple variance}

The definition of the standard deviation in equation~\ref{var-def}
below requires us to already know the mean, which implies two passes
over the data. This isn't feasible for online algorithms that need to
produce incremental results after each sample becomes available.
Equation~\ref{var-simple} solves this problem since it allows us to
calculate the standard deviation from two running sums.

\begin{eqnarray}
  \sigma^2 &=& \frac{1}{n} \sum_{i=1}^{n} (x_i - \mu)^2          \label{var-def}\\
           &=& \frac{1}{n} \sum_{i=1}^{n} (x_i^2 - 2 x_i \mu + \mu^2) \\
           &=& \frac{1}{n} \sum_{i=1}^{n} x_i^2
       - 2 \mu \frac{1}{n} \sum_{i=1}^{n} x_i
       + \mu^2 \frac{1}{n} \sum_{i=1}^{n} 1                      \\
           &=& \frac{1}{n} \sum_{i=1}^{n} x_i^2
               - 2 \mu \mu + \mu^2 \frac{n}{n}                   \\
           &=& \frac{1}{n} \sum_{i=1}^{n} x_i^2 - \mu^2          \\
           &=& \left( \frac{1}{n} \sum_{i=1}^{n} x_i^2 \right)
             - \left( \frac{1}{n} \sum_{i=1}^{n} x_i   \right)^2 \label{var-simple}
\end{eqnarray}

%

\section{Incremental variance}

Knuth notes \cite{var-incr-knuth} that equation~\ref{var-simple} is
prone to loss of precision because it takes the difference between two
large sums of similar size, and suggests equation~\ref{var-incr} as an
alternative that avoids this problem. However he does not say how it
is derived. In the following, equation~\ref{var-incr-mean} is derived
from the previous step using equation~\ref{simple-id-2}.
\begin{eqnarray}
\mbox{Let } S_n &=& n\sigma^2_n                                                     \\
                &=& \textstyle \sum_{i=1}^{n} (x_i - \mu_n)^2                       \\
                &=& \textstyle \sum_{i=1}^{n} x_i^2 - n \mu_n^2                     \\
  S_n - S_{n-1} &=&            \sum_{i=1}^{n}   x_i^2 -  n    \mu_n^2
                             - \sum_{i=1}^{n-1} x_i^2 + (n-1) \mu_{n-1}^2           \\
                &=& x_n^2 - n \mu_n^2 + (n-1) \mu_{n-1}^2                           \\
                &=& x_n^2 - \mu_{n-1}^2 + n (\mu_{n-1}^2 - \mu_n^2)                 \\
                &=& x_n^2 - \mu_{n-1}^2 + n (\mu_{n-1} - \mu_n)(\mu_{n-1} + \mu_n)  \\
                &=& x_n^2 - \mu_{n-1}^2 + (\mu_{n-1} - x_n)(\mu_{n-1} + \mu_n)      \label{var-incr-mean}\\
                &=& x_n^2 - \mu_{n-1}^2 + \mu_{n-1}^2 - x_n\mu_n - x_n\mu_{n-1} + \mu_n\mu_{n-1} \\
                &=& x_n^2 - x_n\mu_n - x_n\mu_{n-1} + \mu_n\mu_{n-1}                \\
                &=& ( x_n - \mu_{n-1} ) ( x_n - \mu_n )                             \\
            S_n &=& S_{n-1} + ( x_n - \mu_{n-1} ) ( x_n - \mu_n )                   \label{var-incr}\\
       \sigma_n &=& \sqrt{ S_n / n }
\end{eqnarray}

Mathworld \cite{mathworld-var-incr} has an alternative derivation of a
similar formula, which in our notation is as follows.
\begin{eqnarray}
S_n &=& \sum_{i=1}^{n}   (x_i - \mu_n)^2 \\
    &=& \sum_{i=1}^{n} ( (x_i - \mu_{n-1}) - (\mu_n - \mu_{n-1}) )^2 \\
    &=& \sum_{i=1}^{n}   (x_i - \mu_{n-1})^2
      + \sum_{i=1}^{n}   (\mu_n - \mu_{n-1})^2
    - 2 \sum_{i=1}^{n}   (x_i - \mu_{n-1})(\mu_n - \mu_{n-1})
\end{eqnarray}

Simplify the first summation:
\begin{eqnarray}
        \sum_{i=1}^{n}   (x_i - \mu_{n-1})^2
    &=&                  (x_n - \mu_{n-1})^2
      + \sum_{i=1}^{n-1} (x_i - \mu_{n-1})^2 \\
    &=&                  (x_n - \mu_{n-1})^2 + S_{n-1} \\
    &=& S_{n-1} +    n^2 (\mu_n - \mu_{n-1})^2
\end{eqnarray}

Simplify the second summation:
\begin{eqnarray}
        \sum_{i=1}^{n}   (\mu_n - \mu_{n-1})^2
    &=&              n   (\mu_n - \mu_{n-1})^2
\end{eqnarray}

Simplify the third summation:
\begin{eqnarray}
                          \sum_{i=1}^{n}   (x_i - \mu_{n-1})(\mu_n - \mu_{n-1})
&=&   (\mu_n - \mu_{n-1}) \sum_{i=1}^{n}   (x_i - \mu_{n-1}) \\
&=&   (\mu_n - \mu_{n-1}) \left(            x_n - \mu_{n-1}
                        + \sum_{i=1}^{n-1} (x_i - \mu_{n-1}) \right) \\
&=&   (\mu_n - \mu_{n-1}) \left(            x_n - \mu_{n-1}
         -(n-1)\mu_{n-1} +\sum_{i=1}^{n-1}  x_i              \right) \\
&=&   (\mu_n - \mu_{n-1}) ( x_n  - \mu_{n-1}
         -(n-1)\mu_{n-1} +(n-1)\mu_{n-1} ) \\
&=&   (\mu_n - \mu_{n-1}) ( x_n  - \mu_{n-1} ) \\
&=& n (\mu_n - \mu_{n-1})^2
\end{eqnarray}

Back to the complete formula:
\begin{eqnarray}
S_n &=& S_{n-1} +    n^2 (\mu_n - \mu_{n-1})^2 +  n   (\mu_n - \mu_{n-1})^2 - 2 n                 (\mu_n - \mu_{n-1})^2 \\
    &=& S_{n-1} +    n^2 (\mu_n - \mu_{n-1})^2 -  n   (\mu_n - \mu_{n-1})^2 \\
    &=& S_{n-1} + n(n-1) (\mu_n - \mu_{n-1})^2
\end{eqnarray}

We can use equations \ref{simple-id-3} and \ref{simple-id-2} to
show this is equivalent to equation~\ref{var-incr}.
\begin{eqnarray}
S_n &=& S_{n-1} + n(n-1) (\mu_n - \mu_{n-1})^2 \\
    &=& S_{n-1} + n (\mu_n - \mu_{n-1}) (x_n - \mu_n) \\
    &=& S_{n-1} + (x_n - \mu_{n-1}) (x_n - \mu_n)
\end{eqnarray}

%

\section{Weighted mean} \label{wmean}

The weighted mean is defined as follows.
\begin{equation}
  \mu = \frac{ \sum_{i=1}^n w_i x_i }
             { \sum_{i=1}^n w_i     }
\end{equation}
It is equivalent to the simple mean when all the weights are equal,
since
\begin{equation}
  \mu = \frac{ \sum_{i=1}^n w x_i }
             { \sum_{i=1}^n w     }
      = \frac{ w \sum_{i=1}^n x_i } { n w }
      = \frac{1}{n} \sum_{i=1}^n x_i
\end{equation}
If the samples are all different, then weights can be thought of as
sample frequencies, or they can be used to calculate probabilities
where $p_i = w_i / \sum w_i$. The following derivation of the
incremental formula (equation~\ref{wmean-incr}) follows the same
pattern as the derivation of equation~\ref{mean-incr}. For brevity we
also define $W_n$ as the sum of the weights.
\begin{eqnarray}
    W_n &=& \sum_{i=1}^{n} w_i                                        \\
  \mu_n &=& \frac{1}{W_n} \sum_{i=1}^{n} w_i x_i                      \\
        &=& \frac{1}{W_n} \left( w_n x_n + \sum_{i=1}^{n-1} w_i x_i \right) \label{vw-a1}\\
        &=& \frac{1}{W_n} ( w_n x_n + W_{n-1} \mu_{n-1} )             \\
        &=& \frac{1}{W_n} ( w_n x_n + (W_n-w_n) \mu_{n-1} )           \label{vw-a2}\\
        &=& \frac{1}{W_n} ( W_n \mu_{n-1} + w_n x_n - w_n \mu_{n-1} ) \\
        &=& \mu_{n-1} + \frac{w_n}{W_n} ( x_n - \mu_{n-1} )           \label{wmean-incr}
\end{eqnarray}

Useful identities derived from this formula are:
\begin{eqnarray}
  W_n ( \mu_{n-1} - \mu_n ) &=& w_n ( \mu_{n-1} - x_n ) \\
  \frac{W_n}{w_n} ( \mu_n - \mu_{n-1} ) &=& x_n - \mu_{n-1} \\
  x_n - \mu_n &=& \frac{W_n}    {w_n} ( \mu_n - \mu_{n-1} ) - \mu_n + \mu_{n-1} \nonumber\\
              &=& \frac{W_n-w_n}{w_n} ( \mu_n - \mu_{n-1} ) \\
              &=& \frac{W_n-w_n}{W_n} (   x_n - \mu_{n-1} )
\end{eqnarray}

%

\section{Weighted variance}

Similarly, we derive a numerically stable formula for calculating the
weighted variance (equation~\ref{wvar-incr}) using the same pattern as
the derivation of the unweighted version (equation~\ref{var-incr}).
\begin{eqnarray}
       \sigma^2 &=& \frac{1}{W_n} \sum_{i=1}^{n} w_i (x_i - \mu)^2
                 =  \frac{1}{W_n} \sum_{i=1}^{n} w_i x_i^2 - \mu^2 \\
\mbox{Let } S_n &=& W_n \sigma^2_n \\
                &=& \sum_{i=1}^{n}   w_i x_i^2 - W_n     \mu_n^2 \\
  S_n - S_{n-1} &=& \sum_{i=1}^{n}   w_i x_i^2 - W_n     \mu_n^2
                  - \sum_{i=1}^{n-1} w_i x_i^2 + W_{n-1} \mu_{n-1}^2 \label{vw-b1}\\
                &=& w_n x_n^2 - W_n \mu_n^2 +   W_{n-1}  \mu_{n-1}^2 \\
                &=& w_n x_n^2 - W_n \mu_n^2 + (W_n-w_n)  \mu_{n-1}^2 \label{vw-b2}\\
                &=& w_n \left( x_n^2 - \mu_{n-1}^2 \right) + W_n (\mu_{n-1}^2 - \mu_n^2) \\
                &=& w_n \left( x_n^2 - \mu_{n-1}^2 \right) + W_n (\mu_{n-1} - \mu_n)(\mu_{n-1} + \mu_n) \\
                &=& w_n \left( x_n^2 - \mu_{n-1}^2 + (\mu_{n-1} - x_n)(\mu_{n-1} + \mu_n) \right) \\
                &=& w_n ( x_n - \mu_{n-1} ) ( x_n - \mu_n ) \\
            S_n &=& S_{n-1} + w_n ( x_n - \mu_{n-1} ) ( x_n - \mu_n ) \label{wvar-incr}\\
       \sigma_n &=& \sqrt{ S_n / W_n }
\end{eqnarray}

%

\section{Variable weights}

In the previous three sections, I have assumed that weights are
constant once assigned. However a common requirement is to normalize
the weights, such that
\begin{equation}
  W = \sum_{i=1}^n w_i = 1
\end{equation}

If we are repeatedly adding new data to our working set, then we can't
have both constant weights and normalized weights. To allow us to keep
weights normalized, we need to allow the weight of each sample to
change as the set of samples changes. To indicate this we will give
weights two indices, the first identifying the set of samples using
the sample count (as we have been doing for $\mu_n$ etc.) and the
second being the index of the sample in that set. We will not make any
assumptions about the sum of the weights, that is we will not require
them to be normalized. For example,
\begin{equation}
     W_n =               \sum_{i=1}^{n} w_{n,i}
,\ \mu_n = \frac{1}{W_n} \sum_{i=1}^{n} w_{n,i} x_i
\end{equation}

Having done this we need to re-examine some of the logical steps in
the previous sections to ensure they are still valid. In equations
\ref{vw-a1}--\ref{vw-a2}, we used the fact that in the fixed-weight
setting,
\begin{equation}
  \sum_{i=1}^{n-1} w_i x_i =  W_{n-1}  \mu_{n-1} = (W_n-w_n) \mu_{n-1} \label{vw-d}
\end{equation}

In the new setting, this equality is fairly obviously no longer
true. (For example, if we are keeping weights normalized then
$W_n = W_{n-1} = 1$.) Fortunately there is a different middle
step which justifies equation~\ref{vw-d} when weights vary, so the
results of section~\ref{wmean} remain valid.
\begin{eqnarray}
         \sum_{i=1}^{n-1} w_{n,i}   x_i  &=&  (W_n - w_{n,n}) \mu_{n-1} \\
         \sum_{i=1}^{n-1} w_{n,i}   x_i  &=&
         \sum_{i=1}^{n-1} w_{n,i}
  \frac{ \sum_{i=1}^{n-1} w_{n-1,i} x_i }
       { \sum_{i=1}^{n-1} w_{n-1,i}     } \\
  \frac{ \sum_{i=1}^{n-1} w_{n,i}   x_i }
       { \sum_{i=1}^{n-1} w_{n,i}       } &=&
  \frac{ \sum_{i=1}^{n-1} w_{n-1,i} x_i }
       { \sum_{i=1}^{n-1} w_{n-1,i}     } \label{norm-mean}\\
         \sum_{i=1}^{n-1}
  \frac{                  w_{n,i}       }
       { \sum_{i=1}^{n-1} w_{n,i}       } \,x_i &=&
         \sum_{i=1}^{n-1}
  \frac{                  w_{n-1,i}     }
       { \sum_{i=1}^{n-1} w_{n-1,i}     } \,x_i \\
  \frac{                  w_{n,j}       }
       { \sum_{i=1}^{n-1} w_{n,i}       } &=&
  \frac{                  w_{n-1,j}     }
       { \sum_{i=1}^{n-1} w_{n-1,i}     }
  \mbox{ where } 1 \le j \le n-1          \label{norm-weights}
\end{eqnarray}

This says that for the weighted mean formulae to remain valid the new
and old weights should be consistent. Equation \ref{norm-mean} says
that we get the same result when we calculate the mean of the previous
working set whether we use the old weights or the new weights.
Equation \ref{norm-weights} says that when we normalize the weights
across the previous set (up to $n-1$) we get the same set of weights
whether we start from the old weights or the new ones. This
requirement isn't enough by itself to make the weighted variance
formulae work, so we will examine it again below.

%

\section{The expectation function}

At this point it is worth defining some better notation to reduce the
number of summations we need to write. The expectation function is a
generalized version of the mean, whose argument is some arbitrary
function of each sample.
\begin{eqnarray}
     E_n(f(x)) &=& \frac{1}{W_n} \sum_{i=1}^n w_{n,i} f(x_i) \\
        E_n(k) &=& k \\
    E_n(af(x)) &=& a E_n(f(x)) \\
E_n(f(x)+g(x)) &=& E_n(f(x)) + E_n(g(x)) \\
         \mu_n &=& E_n(x) \\
    \sigma_n^2 &=& E_n( (x - \mu_n)^2 ) \\
               &=& E_n( x^2 + \mu_n^2 - 2 \mu_n x ) \\
               &=& E_n(x^2) + \mu_n^2 - 2 \mu_n E_n(x) \\
               &=& E_n(x^2) - \mu_n^2 \\
               &=& E_n(x^2) - E_n(x)^2
\end{eqnarray}

The incremental formula is derived in the usual way.
Equation \ref{exp-id} is particularly useful.
\begin{eqnarray}
W_n E_n(f(x)) &=& \sum_{i=1}^{n} w_{n,i} f(x_i) \\
              &=& w_{n,n} f(x_n) + \sum_{i=1}^{n-1} w_{n,i} f(x_i) \\
              &=& w_{n,n} f(x_n) + (W_n-w_{n,n}) \frac{ \sum_{i=1}^{n-1} w_{n,i}   f(x_i) }
                                                      { \sum_{i=1}^{n-1} w_{n,i}          } \\
              &=& w_{n,n} f(x_n) + (W_n-w_{n,n}) \frac{ \sum_{i=1}^{n-1} w_{n-1,i} f(x_i) }
                                                      { \sum_{i=1}^{n-1} w_{n-1,i}        } \\
              &=& w_{n,n} f(x_n) + (W_n-w_{n,n}) E_{n-1}(f(x)) \label{exp-id}\\
E_n(f(x)) &=& E_{n-1}(f(x)) + \frac{w_{n,n}}{W_n} \left( f(x_n) - E_{n-1}(f(x)) \right)
\end{eqnarray}

%

\section{Variable-weight variance}

In equations \ref{vw-b1}--\ref{vw-b2} we made the following
assumptions which are not true when weights can vary.
\begin{eqnarray*}
& &  \sum_{i=1}^{n}   w_{n,i}   x_i^2 - W_n     \mu_n^2
   - \sum_{i=1}^{n-1} w_{n-1,i} x_i^2 + W_{n-1} \mu_{n-1}^2 \\
&\ne& w_{n,n} x_n^2 - W_n \mu_n^2 +   W_{n-1}   \mu_{n-1}^2 \\
&\ne& w_{n,n} x_n^2 - W_n \mu_n^2 + (W_n-w_{n,n}) \mu_{n-1}^2
\end{eqnarray*}

If we try to re-do the short derivation of the incremental standard
deviation formula starting from $S_n - S_{n-1}$ then we soon get
stuck. Fortunately the longer derivation shows how to make it work.
\begin{eqnarray}
S_n &=& W_n \sigma_n^2 \\
    &=&         W_n E_n \left( (  x            -  \mu_n            )^2 \right) \\
    &=&         W_n E_n \left( ( [x-\mu_{n-1}] - [\mu_n-\mu_{n-1}] )^2 \right) \\
    &=&         W_n E_n \left(   [x-\mu_{n-1}]^2
                                               + [\mu_n-\mu_{n-1}]^2
                             - 2 [x-\mu_{n-1}]   [\mu_n-\mu_{n-1}]   \right) \\
    &=&         W_n E_n \left(   [x-\mu_{n-1}]^2                       \right)
              + W_n E_n \left(                   [\mu_n-\mu_{n-1}]^2 \right)
            - 2 W_n E_n \left(   [x-\mu_{n-1}]   [\mu_n-\mu_{n-1}]   \right)
\end{eqnarray}

Simplify the first term:
\begin{eqnarray}
W_n E_n \left( [x - \mu_{n-1}]^2 \right)
   &=& w_{n,n} [x_n-\mu_{n-1}]^2 + (W_n-w_{n,n}) E_{n-1} \left( [x-\mu_{n-1}]^2 \right) \\
   &=& w_{n,n} [x_n-\mu_{n-1}]^2 + (W_n-w_{n,n}) \frac{S_{n-1}}{W_{n-1}} \\
   &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1} + \frac{W_n^2}{w_{n,n}} [\mu_n-\mu_{n-1}]^2
 \end{eqnarray}

Simplify the second term:
\begin{eqnarray}
  W_n E_n \left( [\mu_n-\mu_{n-1}]^2 \right) &=& W_n [\mu_n-\mu_{n-1}]^2
\end{eqnarray}

Simplify the third term:
\begin{eqnarray}
& &                   W_n E_n ( [x-\mu_{n-1}][\mu_n-\mu_{n-1}] ) \nonumber\\
&=& [\mu_n-\mu_{n-1}] W_n E_n   [x-\mu_{n-1}]                    \\
&=& [\mu_n-\mu_{n-1}] \left( w_{n,n} [x_n-\mu_{n-1}] + (W_n-w_{n,n}) E_{n-1}[  x  -         \mu_{n-1}  ] \right) \\
&=& [\mu_n-\mu_{n-1}] \left( w_{n,n} [x_n-\mu_{n-1}] + (W_n-w_{n,n}) [ E_{n-1}(x) - E_{n-1}(\mu_{n-1}) ] \right) \\
&=& [\mu_n-\mu_{n-1}] \left( w_{n,n} [x_n-\mu_{n-1}] + (W_n-w_{n,n}) [ \mu_{n-1}  -         \mu_{n-1}  ] \right) \\
&=& [\mu_n-\mu_{n-1}]        w_{n,n} [x_n-\mu_{n-1}] \\
&=& W_n [\mu_n-\mu_{n-1}]^2
\end{eqnarray}

Back to the complete formula:
\begin{eqnarray}
S_n &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1}
      + \frac{W_n^2}      {w_{n,n}} [\mu_n-\mu_{n-1}]^2
                              + W_n [\mu_n-\mu_{n-1}]^2
                            - 2 W_n [\mu_n-\mu_{n-1}]^2 \\
    &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1}
      + \frac{W_n^2}      {w_{n,n}} [\mu_n-\mu_{n-1}]^2
      - \frac{W_nw_{n,n}} {w_{n,n}} [\mu_n-\mu_{n-1}]^2 \\
    &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1}
+ (W_n-w_{n,n}) \frac{W_n}{w_{n,n}} [\mu_n-\mu_{n-1}]^2 \\
    &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1}
  + (W_n-w_{n,n}) [\mu_n-\mu_{n-1}] (x_n - \mu_{n-1}) \\
S_n &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1}
  + w_{n,n} (x_n - \mu_n) (x_n - \mu_{n-1})
\end{eqnarray}

This is the same as equation \ref{wvar-incr}, except for the multiplier
$\frac{W_n-w_{n,n}}{W_{n-1}}$ which captures the change in weights
between the old and new sets.
\begin{equation}
  \frac{W_n-w_{n,n}}{W_{n-1}}
= \frac{ \sum_{i=1}^{n-1} w_{n,i}   }
       { \sum_{i=1}^{n-1} w_{n-1,i} }
= \frac{ w_{n,j}   }
       { w_{n-1,j} } \mbox{ where } 1 \le j \le n-1
\end{equation}

Now that we know the rescaling trick which makes it work,
we can write down the short version.
\begin{eqnarray}
& & S_n - \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1} \nonumber\\
&=& W_n \left( E_n(x^2) - \mu_n^2 \right) - (W_n-w_{n,n}) \left( E_{n-1}(x^2) - \mu_{n-1}^2 \right) \\
&=& W_n \left( E_n(x^2) - \mu_n^2 \right) - W_n E_n(x^2) + w_{n,n} x_n^2 + (W_n-w_{n,n}) \mu_{n-1}^2 \\
&=& w_{n,n}   x_n^2 - W_n \mu_n^2 + (W_n-w_{n,n}) \mu_{n-1}^2 \\
&=& w_{n,n} \left( x_n^2 - \mu_{n-1}^2 \right) + W_n ( \mu_{n-1}^2 - \mu_n^2 ) \\
&=& w_{n,n} \left( x_n^2 - \mu_{n-1}^2 \right) + W_n ( \mu_{n-1} - \mu_n ) ( \mu_{n-1} + \mu_n ) \\
&=& w_{n,n} \left( x_n^2 - \mu_{n-1}^2 + ( \mu_{n-1} - x_n ) ( \mu_{n-1} + \mu_n ) \right) \\
&=& w_{n,n} ( x_n - \mu_n ) ( x_n - \mu_{n-1} )
\end{eqnarray}

%

\section{Exponentially-weighted mean and variance}

Starting from equation \ref{wmean-incr}, let's set $ w_{n,n} / W_n $
to a constant $ 0 < \alpha < 1 $ and let $ a = 1 - \alpha $. This
produces the standard formula for the exponentially weighted moving
average.
\begin{eqnarray}
  \mu_n &=& \mu_{n-1} + \alpha ( x_n - \mu_{n-1} ) \\
        &=& (1 - \alpha) \mu_{n-1} + \alpha  x_n \\
        &=& a \mu_{n-1} + (1 - a) x_n
\end{eqnarray}

In the following it's more convenient to use a lower bound of 0 instead of 1,
i.e. $ 0 \le i \le n $.
We are going to show that the weights are renormalized each time a
datum is added. First, we expand out the inductive definition of the
mean.
\begin{eqnarray}
  \mu_n &=& a \mu_{n-1} + (1 - a) x_n \\
        &=& a^2 \mu_{n-2} + a (1 - a) x_{n-1} + (1 - a) x_n \\
        &=& a^3 \mu_{n-3} + a^2 (1 - a) x_{n-2} + a (1 - a) x_{n-1} + (1 - a) x_n \\
  \mu_n &=& a^n x_0 + \sum_{i=1}^{n} a^{n-i} (1-a) x_i
\end{eqnarray}

This allows us to write down the weights directly.
Note that $w_{n,n}$ is independent of $n$.
\begin{eqnarray}
w_{n,0} &=& a^n \\
w_{n,i} &=& a^{n-i} (1-a)  ,\mbox{ for } 1 \le i \le n \\
w_{n,n} &=& 1 - a \ = \ \alpha
\end{eqnarray}

Since $w_{n,n} = \alpha = w_{n,n}/W_n$
we can see that $\forall n.\ W_n = 1$,
that is, the weights are always normalized. We can get the same result by
summing the geometric series.
\begin{eqnarray}
&& \sum_{i=1}^{n} a^{n-i} = \sum_{j=0}^{n-1} a^{j} = \frac{1-a^n}{1-a} \\
&& \sum_{i=1}^{n} w_{n,i} = \sum_{i=1}^{n} a^{n-i} (1-a) = 1-a^n \\
&& W_n = w_{n,0} + \sum_{i=1}^{n} w_{n,i} = a^n + (1 - a^n) = 1
\end{eqnarray}

These weights satisfy the consistency requirement because
\begin{equation}
    \frac{                    w_{n,j}   }
         { \sum_{i=1}^{n-1}   w_{n,i}   }
  = \frac{                  a w_{n-1,j} }
         { \sum_{i=1}^{n-1} a w_{n-1,i} }
  = \frac{                    w_{n-1,j} }
         { \sum_{i=1}^{n-1}   w_{n-1,i} }
\end{equation}

We can use the expectation function to write down the na\"ive formula
for the variance.
\begin{eqnarray}
 E_n(f(x)) &=& E_{n-1}(f(x)) + \frac{w_{n,n}}{W_n} ( f(x_n) - E_{n-1}(f(x)) ) \\
           &=& E_{n-1}(f(x)) + \alpha              ( f(x_n) - E_{n-1}(f(x)) ) \\
  E_n(x^2) &=& E_{n-1}(x^2) + \alpha ( x_n^2 - E_{n-1}(x^2) ) \\
\sigma_n^2 &=& E_n(x^2) - \mu_n^2
\end{eqnarray}

So using the formula from the previous section we can write the incremental version:
\begin{eqnarray}
     S_n &=& \frac{W_n-w_{n,n}}{W_{n-1}} S_{n-1} + w_{n,n} (x_n - \mu_n) (x_n - \mu_{n-1}) \\
     S_n &=& \frac{1  - \alpha}{1}       S_{n-1} + \alpha  (x_n - \mu_n) (x_n - \mu_{n-1}) \\
\sigma_n^2 = \frac{S_n}{W_n} = S_n &=& a S_{n-1} +   (1-a) (x_n - \mu_n) (x_n - \mu_{n-1}) \\
                                   &=& (1-\alpha)( S_{n-1} + \alpha (x_n - \mu_{n-1})^2 )
\end{eqnarray}

This latter form is slightly more convenient for code:
\begin{verbatim}
  diff := x - mean
  incr := alpha * diff
  mean := mean + incr
  variance := (1 - alpha) * (variance + diff * incr)
\end{verbatim}

%

\bibliography{stats}
\bibliographystyle{plain}

\end{document}

% eof
