\documentclass[nojss]{jss}

\usepackage{amsmath,amsfonts,amsthm,enumerate,bm}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\S}{\mathbb{S}}
\newcommand{\pFq}[2]{{}_{#1}F_{#2}}
\newcommand{\Ies}{I^\vee}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\argmax}{\mathop{\mathrm{arg max}}}

\newcommand{\fct}[1]{{\texttt{#1()}\index{#1@\texttt{#1()}}}}

\newenvironment{FIXME}{
  \begin{quote}\strut\marginpar{FIXME}}{\end{quote}}

%% \usepackage{Sweave}
\SweaveOpts{eps=false, keep.source=true, width=8, height=4}
\setkeys{Gin}{width=\textwidth}

\author{Kurt Hornik\\WU Wirtschaftsuniversit\"at Wien \And
  Bettina Gr\"un\\WU Wirtschaftsuniversit\"at Wien}

\Plainauthor{Kurt Hornik, Bettina Gr\"un}

\title{\pkg{movMF}: An \proglang{R} Package for Fitting Mixtures of von Mises-Fisher Distributions}
\Plaintitle{movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions}

\Keywords{EM algorithm, finite mixture, hypergeometric function
  $\pFq{0}{1}$, modified Bessel function ratio, \proglang{R}, von
  Mises-Fisher distribution}

\Plainkeywords{EM algorithm, finite mixture, hypergeometric function
  0F1, modified Bessel function ratio, R, von Mises-Fisher
  distribution}

\Abstract{

  This article is a (slightly) modified and shortened version of
  \cite{vMF:Hornik+Gruen:2014b}, published in the \emph{Journal of
    Statistical Software}.
  
  Finite mixtures of von Mises-Fisher distributions allow to apply
  model-based clustering methods to data which is of standardized
  length, i.e., all data points lie on the unit sphere.  The
  \proglang{R} package \pkg{movMF} contains functionality to draw
  samples from finite mixtures of von Mises-Fisher distributions
  and to fit these models using the expectation-maximization algorithm
  for maximum likelihood estimation. Special features are the
  possibility to use sparse matrix representations for the input data,
  different variants of the expectation-maximization algorithm,
  different methods for determining the concentration parameters in the
  M-step and to impose constraints on the concentration parameters
  over the components. 
  
  In this paper we describe the main fitting function of the package
  and illustrate its application. We also discuss he
  resolution of several numerical issues which occur for estimating the
  concentration parameters and for determining the normalizing
  constant of the von Mises-Fisher distribution.
  
}

\Address{
  Kurt Hornik\\
  Institute for Statistics and Mathematics\\
  WU Wirtschaftsuniversit\"at Wien\\
  Welthandelsplatz 1\\
  1020 Wien, Austria\\
  E-mail: \email{Kurt.Hornik@R-project.org}\\
  URL: \url{https://statmath.wu.ac.at/~hornik/}
  
  Bettina Gr\"un\\
  Institute for Statistics and Mathematics\\
  Wirtschaftsuniversit{\"a}t Wien\\
  Welthandelsplatz 1\\
  1020 Wien, Austria\\
  E-mail: \email{Bettina.Gruen@R-project.org}\\
}
<<echo=false, results=hide>>=
set.seed(1234)
options(width=65, prompt = "R> ", continue = "+  ", useFancyQuotes = FALSE)
cache <- TRUE
library("slam")
library("lattice")
library("movMF")
palette(colorspace::heat_hcl(4))
ltheme <- canonical.theme("pdf", FALSE)
for (i in grep("padding", names(ltheme$layout.heights))) {
  ltheme$layout.heights[[i]] <- 0.2
}
for (i in grep("padding", names(ltheme$layout.widths))) {
  ltheme$layout.widths[[i]] <- 0
}
lattice.options(default.theme = ltheme)
@ 

<<echo=false>>=
rotationMatrix <- function(mu) {
    beta <- asin(mu[1])
    alpha <- atan( - mu[2] / mu[3]) + sign(mu[2] * mu[3]) * (mu[3] < 0) * pi
    cosa <- cos(alpha); cosb <- cos(beta)
    sina <- sin(alpha); sinb <- sin(beta)
    matrix(c(cosb, sina * sinb, - cosa * sinb,
             0, cosa, sina,
             sinb, - sina * cosb, cosa * cosb),
           nrow = 3)
}

getIsolines <- function(d, mu, length = 100) {
    theta <- seq(0, 2*pi, length = length)
    sinphi <- sin(acos(d))
    tcrossprod(cbind(cos(theta) * sinphi, sin(theta) * sinphi, d),
               rotationMatrix(mu))
}

plotGlobe <- function(x, class, pars = NULL, main = "", theta = 0, phi = 0,
                      Q = c(0.5, 0.95), n = 100000,
                      xlim = c(-1, 1), ylim = c(-1, 1), zlim = c(-1, 1), ...) {
    pmat <- persp(0:1, 0:1, matrix(, 2, 2), theta = theta, phi = phi, 
                  xlim = xlim, ylim = ylim, zlim = zlim, scale = FALSE,
                  xlab="x", ylab="y", zlab="z", main = main, ...)
    trans3d <- function(x, y, z) {
        tr <- cbind(x, y, z, 1) %*% pmat
        list(x = tr[, 1]/tr[, 4],
             y = tr[, 2]/tr[, 4])
    }
    x <- x / row_norms(x)
    x3D <- trans3d(x[,1], x[,2], x[,3])
    
    theta <- seq(0, 2*pi, length = 41)
    phi <- seq(0, pi/2, length = 21)
    x <- cos(theta) %o% sin(phi)
    y <- sin(theta) %o% sin(phi)
    z <- rep(1, length(theta)) %o% cos(phi)
    for (j in seq(phi)[-1]) for (i in seq(theta)[-1]) {
        idx <- rbind(c(i-1,j-1), c(i,j-1), c(i,j), c(i-1,j))
        polygon(trans3d(x[idx], y[idx], z[idx]), border = "grey")
    }
    points(x3D$x, x3D$y, col = class, pch = 20)

    if (!is.null(pars)) {
        kappa <- row_norms(pars)
        d <- sapply(kappa, function(K) rmovMF(n, K * c(1, 0, 0))[,1])
        mu <- pars / kappa
        for (i in seq_along(Q)) {
            M <- apply(d, 2, quantile, 1-Q[i])
            isos <- lapply(seq_len(nrow(mu)), function(i) getIsolines(M[i], mu[i,]))
            isostrans <- lapply(isos, function(x) trans3d(x[,1], x[,2], x[,3]))
            lapply(seq_along(isostrans), function(j) 
                   polygon(isostrans[[j]]$x, isostrans[[j]]$y, border = j, lwd = 2, lty = i))
        }
        mu3D <- trans3d(mu[,1], mu[,2], mu[,3])
        points(mu3D$x, mu3D$y, pch = 4, lwd = 2, col = seq_len(nrow(mu)))
    }
    invisible(pmat)
}
@ 

<<corpus, echo=false, results=hide>>=
data("useR_2008_abstracts", package = "movMF")
@ 

\begin{document}
\sloppy

%\VignetteIndexEntry{movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions}
%\VignetteDepends{}
%\VignetteKeywords{EM algorithm, finite mixture, R, von Mises-Fisher distribution}
%\VignettePackage{movMF}

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Introduction}\label{sec:introduction}

Finite mixture models allow to cluster observations by assuming that
for each component a suitable parametric distribution can be specified and
that the mixture distribution is derived by convex combination of the
component distributions.  \citet{vMF:McLachlan+Peel:2000} and
\citet{vMF:Fruehwirth-Schnatter:2006} present overviews on the
estimation of these models in a maximum likelihood as well as a
Bayesian setting together with different applications of finite
mixture models.

If the support of the data is given by the unit sphere, a natural
choice for the component distributions is the von Mises-Fisher (vMF)
distribution. The special case where data is in $\mathbb{R}^2$, i.e.,
the observations lie on the unit circle, is referred to as von Mises
distribution. The vMF distribution has concentric contour lines
similar to the multivariate normal distribution with the
variance-covariance matrix being a multiple of the identity matrix. In
fact \citet[][page~173]{vMF:Mardia+Jupp:1999} point out that if $X$
follows a multivariate normal distribution with mean parameter $\mu$
which has length one and variance-covariance matrix $\kappa^{-1}I$,
then the conditional distribution of $X$ given that it has length one
is a vMF distribution with mean direction parameter~$\mu$ and
concentration parameter~$\kappa$.

Finite mixtures of vMF distributions were introduced in
\citet{vMF:Banerjee+Dhillon+Ghosh:2005} to cluster data on the unit
sphere. They propose to use the expectation-maximization (EM) algorithm
for maximum likelihood (ML) estimation and present two variants which
they refer to as hard and soft clustering. The areas of application of
the presented examples include text mining where abstracts of scientific
journals, news articles and newsgroup messages are categorized, and
bioinformatics using a yeast gene expression data set.  In these
applications, available data is typically high-dimensional.  The
estimation methods employed need to take this into account.  For finite
mixtures of vMF distributions this is particularly relevant for the
determination of the concentration parameters.

\citet{vMF:Tang+Chu+Huang:2009} use finite mixtures of vMF
distributions for speaker clustering. The data is pre-processed by
fitting a Gaussian mixture model (GMM) to the utterances and stacking
the mean vectors of the mixture components to form the GMM mean
supervector which is then used as input for the mixture model aiming
at clustering the speakers. They compare the performance of finite
mixtures of Gaussian distributions with those of vMF distributions and
conclude that for this application the latter outperform finite
mixtures of multivariate normal distributions. Further possible areas
of application for finite mixtures of vMF distributions are segmentation in high
angular resolution diffusion imaging
\citep{vMF:McGraw+Vemuri+Yezierski:2006} and clustering treatment beam
directions in external radiation therapy
\citep{vMF:Bangert+Hennig+Oelfke:2010}.

This paper is structured as follows.
Section~\ref{sec:vmf-distribution} introduces the vMF distribution and
shows how to draw samples from this distribution and how to determine
the ML estimates of its parameters.  The extension to finite mixture
models is covered in Section~\ref{sec:finite-mixtures-vmf} including
again the methods for drawing samples and determining ML
estimates. The \proglang{R} \citep{vMF:Team:2014} package \pkg{movMF}
is presented in Section~\ref{sec:software} by describing the main
fitting function \fct{movMF} in detail. The package is available from
the Comprehensive \proglang{R} Archive Network at
\url{https://CRAN.R-project.org/package=movMF}. Numerical issues when
evaluating the density or determining the ML estimates are discussed in
Section~\ref{sec:numerics}. An illustrative application of the package
using the abstracts from the talks at ``use\proglang{R}!  2008'', the
3rd international \proglang{R} user conference, in Dortmund, Germany,
is given in Section~\ref{sec:appl-user-2008}. The paper concludes with
a summary and an outlook on possible extensions.

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{The vMF distribution}\label{sec:vmf-distribution}

A random unit length vector in $\R^d$ has a \emph{von Mises-Fisher}
(or Langevin) distribution with parameter~$\theta \in \R^d$ if its
density with respect to the uniform distribution on the unit sphere
$\S^{d-1} = \{x \in \mathbb{R}^d: \|x\| = 1 \}$ is given by
\begin{displaymath}
  f(x|\theta) = e^{\theta^{\top} x} / \pFq{0}{1}(; d/2; \|\theta\|^2 / 4),
\end{displaymath}
where
\begin{displaymath}
  \pFq{0}{1}(; \nu; z)
  = \sum_{n=0}^\infty \frac{\Gamma(\nu)}{\Gamma(\nu+n)}\frac{z^n}{n!}
\end{displaymath}
is the confluent hypergeometric limit function and related to the
modified Bessel function of the first kind $I_\nu$ via
\begin{equation}
  \label{eq:0F1-and-I}
  \pFq{0}{1}(; \nu + 1; z^2 / 4)
  = \frac{I_\nu(z) \Gamma(\nu + 1)}{(z / 2)^\nu}
\end{equation}
\citep[e.g.,][page 168]{vMF:Mardia+Jupp:1999}.

The vMF distribution is commonly parametrized as $\theta
= \kappa \mu$, where $\kappa = \|\theta\|$ and $\mu \in \S^{d-1}$ are
the \emph{concentration} and \emph{mean direction} parameters,
respectively (if $\theta \ne 0$, $\mu$ is uniquely determined as
$\theta / \|\theta\|$).  

In what follows, it will be convenient to write
\begin{displaymath}
  C_d(\kappa)
  = 1 / \pFq{0}{1}(; d/2; \kappa^2 / 4)
\end{displaymath}
so that the density is given by
\begin{displaymath}
  f(x | \theta) = C_d(\|\theta\|) e^{\theta^{\top}x}.
\end{displaymath}

\subsection{Simulating vMF distributions}
\label{sec:simul-vmf-distr}

The following algorithm provides a rejecting sampling scheme for
drawing a sample from the vMF distribution with modal direction
$(0,\ldots,0,1)$ and concentration parameter $\kappa \geq 0$
\citep[see Algorithm VM$^*$ in][]{vMF:Wood:1994}. The extension for
simulating from the matrix Bingham vMF distribution is described in
\citet{vMF:Hoff:2009} and also available in \proglang{R} through
package \pkg{rstiefel} \citep{vMF:Hoff:2012}.

\begin{enumerate}[Step 1.]
\item Calculate $b$ using
  \begin{align*}
    b = \frac{d-1}{2 \kappa + \sqrt{4 \kappa^2 + (d - 1)^2}}.
  \end{align*}
  Note that this calculation of $b$ is algebraically equivalent to the
  one proposed in \citet{vMF:Wood:1994}, but numerically more stable.
  
  Put $x_0 = (1 - b) / (1 + b)$ and $c = \kappa x_0 + (d - 1)
  \log(1-x^2_0)$.
\item\label{item:z} Generate $Z \sim \text{Beta}((d-1)/2, (d-1)/2)$
  and $U \sim \text{Unif}([0, 1])$ and calculate
  \begin{align*}
    W &= \frac{1 - (1 + b)Z}{1 - (1 - b)Z}.
  \end{align*}
\item If
  \begin{align*}
    \kappa W + (d - 1) \log(1 - x_0 W) - c & < \log(U),
  \end{align*}
  go to Step~\ref{item:z}.
\item Generate a uniform $(d-1)$-dimensional unit vector $V$ and
  return
  \begin{align*}
    X &= (\sqrt{1 - W^2} V^{\top}, W)^{\top}.
  \end{align*}
\end{enumerate}

The uniform $(d-1)$-dimensional unit vector $V$ can be generated by
simulating independent standard normal random variables and normalizing
them \citep[see for example][]{vMF:Ulrich:1984}. To get samples from a
vMF distribution with arbitrary mean direction parameter $\mu$, $X$ is
multiplied from the left with a matrix where the first $(d-1)$ columns
consist of unitary basis vectors of the subspace orthogonal to $\mu$ and
the last column is equal to $\mu$.

\pagebreak
\subsection{Estimating the parameters of the vMF distribution}\label{sec:estim-param-vmf}
Using the common parametrization by $\kappa$ and $\mu$, the
log-likelihood of a sample $x_1, \ldots, x_n$ from the vMF distribution
is given by
\begin{displaymath}
  n \log(C_d(\kappa)) + \kappa \mu^{\top} r,
\end{displaymath}
where $r = \sum_{i=1}^n x_i$ is the resultant vector (sum) of the $x_i$.
The maximum likelihood estimates (MLEs) are obtained by solving the
likelihood equations
\begin{displaymath}
  \hat{\mu} = r / \|r\|, \qquad
  - \frac{C_d'(\hat{\kappa})}{C_d(\hat{\kappa})} = \|r\| / n.
\end{displaymath}
Writing $A_d(\kappa) = - C_d'(\kappa) / C_d(\kappa)$ for the logarithmic
derivative of $1 / C_d(\kappa)$ and $\rho = \|r\| / n$ for the average
resultant length, the equation for the MLE of $\kappa$ becomes
\begin{displaymath}
  A_d(\hat{\kappa}) = \rho.
\end{displaymath}
Using recursions for the modified Bessel function \citep[e.g.,][page
71]{vMF:Watson:1995}, one can establish that
\begin{displaymath}
  A_d(\kappa)
  = - \frac{C_d'(\kappa)}{C_d(\kappa)}
  = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)}.
\end{displaymath}
It can be shown \citep[see for example][]{vMF:Schou:1978} that $A_d$ is
a strictly increasing function which maps the interval $[0, \infty)$
onto the interval $[0, 1)$ and satisfies the Riccati equation
$A_d'(\kappa) = 1 - A_d(\kappa)^2 - \frac{d-1}{\kappa} A_d(\kappa)$.  As
$A_d$ and hence its derivatives can efficiently be computed using
continued fractions (see Section~\ref{sec:numerics} for details), the
equation $A_d(\hat{\kappa}) = \rho$ can efficiently be solved by
standard iterative techniques provided that good starting approximations
are employed.

\citet{vMF:Dhillon+Sra:2003} and subsequently
\citet{vMF:Banerjee+Dhillon+Ghosh:2005} suggest the approximation
\begin{equation}
  \label{eq:kappa_hat_approx}
  \hat{\kappa}
  \approx
  \frac{\rho ( d - \rho^2)}{1 - \rho^2}
\end{equation}
obtained by truncating the Gauss continued fraction representation of
$A_d$ and adding a correction term ``determined empirically''.  The
former reference also suggests using this approximation as the starting
point of a Newton-Raphson iteration, using the above expression for
$A_d'$.

\citet{vMF:Tanabe+Fukumizu+Oba:2007} show that
\begin{displaymath}
  \hat{\kappa} =
  \frac{\rho (d - c)}{1 - \rho^2}
\end{displaymath}
for some suitable $0 \le c \le 2$.  The approximation in
Equation~\ref{eq:kappa_hat_approx} corresponds to $c \approx \rho^2$.
They also suggest to determine $\hat{\kappa}$ via the fixed point
iteration
\begin{displaymath}
  \kappa_{t+1} = \kappa_t \rho / A_d(\kappa_t)
\end{displaymath}
with a starting value in the solution range, e.g., using $c = 1$ or $c
= \rho^2$.  

\citet{vMF:Sra:2012} introduces a ``truncated Newton approximation''
based on performing exactly two Newton iterations
\begin{displaymath}
  \kappa_{t+1} =
  \kappa_t - \frac{A_d(\kappa_t) - \rho}{A_d'(\kappa_t)}
\end{displaymath}
with $\kappa_0$ determined via the $c = \rho^2$ approximation.

\citet{vMF:Song+Liu+Wang:2012} suggest to use a ``truncated Halley
approximation'' based on performing exactly two Halley iterations
\begin{displaymath}
  \kappa_{t+1} =
  \kappa_t - \frac{2 (A_d(\kappa_t) - \rho)A_d'(\kappa_t)}
  {2 A_d'(\kappa_t)^2 - (A_d(\kappa_t) - \rho)A_d''(\kappa_t)}
\end{displaymath}
with $\kappa_0$ determined via the $c = \rho^2$ approximation and
using that the second derivative can be given as a function of
$A_d(\kappa_t)$, i.e.,
\begin{displaymath}
  A_d''(\kappa_t) = 
  2 A_d(\kappa_t)^3 + \frac{3(d - 1)}{\kappa} A_d(\kappa_t)^2 + 
  \frac{d^2 - d - 2 \kappa^2}{\kappa^2} A_d(\kappa_t) - \frac{d - 1}{\kappa}.
\end{displaymath}

The results in \citet{vMF:Hornik+Gruen:2014a} yield the substantially
improved bounds
\begin{equation}
  \label{eq:bounds-for-kappa-hat}
  \max \left( F_{d/2-1,d/2+1}(\rho), F_{(d-1)/2,\sqrt{d^2-1}/4}(\rho) \right)
  \le \hat{\kappa}
  \le F_{(d-1)/2,(d+1)/2}(\rho),
\end{equation}
valid for $0 \le \rho < 1$, where 
\begin{displaymath}
  F_{\alpha,\beta}(\rho) = 
  \frac{\rho}{1 - \rho^2}
  \left( 
    \alpha + \sqrt{\rho^2 \alpha^2 + (1 - \rho^2) \beta^2}
  \right).
\end{displaymath}
The difference between the upper and lower bound is at most $3\rho
/ 2$ for all $0 \le \rho < 1$, and the difference between the lower
bound and $\hat{\kappa}$ tends to 0 as $\rho \to 1-$.

Convex combinations of the lower and the upper bounds can be employed
as starting values for the above iteration schemes (which require a
single starting point).  In addition, as these bounds actually give an
interval known to contain the unique root $\hat{\kappa}$ of the
function $\kappa \mapsto A_d(\kappa) - \rho$, one can use them as
starting values for root finding methods which iteratively refine
intervals containing the solution, such as simple bisection (as
provided by \fct{uniroot} in \proglang{R}), hybrid algorithms
combining derivative-based (Newton or Halley) and bisection steps
\citep[e.g.][page~366]{vMF:Press+Teukolsky+Vetterling:2002}, or the
Newton-Fourier method \citep[e.g.][pages~62--64]{vMF:Atkinson:1989}.
One can show that $A_d$ is concave \citep[e.g., using Theorem~11 in][
which establishes that $A_d = R_{d/2-1}$ is the pointwise minimum of
concave functions, and hence concave]{vMF:Hornik+Gruen:2013}: hence,
employing the above bounds and a variant of the Newton-Fourier method
for strictly increasing concave functions yields a quadratically
convergent iterative scheme for determining $\hat{\kappa}$.

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-data}

To illustrate the use of the vMF distribution to model data on the
sphere we use the \code{household} data set from package \pkg{HSAUR3}
\citep{vMF:Everitt+Hothorn:2018}. The data are part of a data set
collected from a survey on household expenditures and give the
expenses of 20 single men and 20 single women on four commodity
groups. In the following we will focus only on three of those
commodity groups (housing, food and service) to have 3-dimensional
data which is easier to visualize.  
%
<<illustration1, echo=false>>=
data("household", package = "HSAUR3")
x <- as.matrix(household[, c(1:2, 4)])
gender <- household$gender

theta <- rbind(female = movMF(x[gender == "female", ], k = 1)$theta, 
               male = movMF(x[gender == "male", ], k = 1)$theta)
   
set.seed(2008)
vMFs <- lapply(1:5, function(K) 
  movMF(x, k = K, control= list(nruns = 20)))
@ 
%
<<echo=false>>=
kappa <- row_norms(theta)
tab2 <- table(max.col(vMFs[[2]]$P), household$gender)
mc2 <- min(c(sum(diag(tab2)), sum(tab2) - sum(diag(tab2))))
tab3 <- table(max.col(vMFs[[3]]$P), household$gender)
@ 
%
The data points are projected onto the sphere by normalizing them to
have length one. Thus, in the following analysis we are interested in
finding groups of households which lie in a similar direction, i.e.,
the angle between the observations is small and we are not interested
in differences in their total absolute expenses.

The data points on the sphere are visualized in
Figure~\ref{fig:household-example} on the top left. Using the gender
information, vMF distributions are fitted to the male and the female
observations separately. The fitted distributions are visualized
together with the mean direction indicated by a cross and with confidence
circles of probability 50\% (full lines) and 95\% (dashed
lines). Clearly the females have a smaller dispersion as indicated by
the estimated $\kappa$ which is equal to
\Sexpr{round(kappa[1],digits=1)}, as compared to the $\kappa$ of the
males which is given by \Sexpr{round(kappa[2],digits=1)}.

\begin{figure}[t!]
  \centering
<<fig=true, echo=false, results=hide, width=8, height=6>>=
par(mar = 0.1 + c(0, 0.5, 2, 0), mfrow = c(2, 2))
plotGlobe(x, household$gender, main = "Data", 
          theta = -30, phi = 10, zlim = c(0, 1))

plotGlobe(x, household$gender, theta, main = "Known group membership", 
          theta = -30, phi = 10, zlim = c(0, 1))

mu <- lapply(vMFs, function(x) x$theta / row_norms(x$theta))
ordering <- lapply(mu, function(x) order(x[,1], decreasing = TRUE))
class <- alpha <- kappa <- theta <- vector("list", length(ordering))
for (i in seq_along(ordering)) {
   alpha[[i]] <- vMFs[[i]]$alpha[ordering[[i]]]
   mu[[i]] <- mu[[i]][ordering[[i]],,drop=FALSE]
   theta[[i]] <- vMFs[[i]]$theta[ordering[[i]],,drop=FALSE]
   kappa[[i]] <- row_norms(vMFs[[i]]$theta[ordering[[i]],,drop=FALSE])
   class[[i]] <- max.col(vMFs[[i]]$P[,ordering[[i]]])
}
	  
for (k in 2:3) {
  plotGlobe(x, class[[k]], theta[[k]], 
            main = paste("Mixtures of vMFs with K =", k),
            theta = -30, phi = 10, zlim = c(0, 1))
}
@ 
\setkeys{Gin}{width=0.8\textwidth}
\caption{Household expenses data with gender indicated by color after
  projection to the sphere at the top left, estimated vMF
  distributions with confidence circles for each gender group
  separately at the top right and the estimated mixtures of vMF
  distributions with $K = 2$ and $K = 3$ with confidence circles at
  the bottom.}
  \label{fig:household-example}
\end{figure}

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Finite mixtures of vMF distributions}\label{sec:finite-mixtures-vmf}

The mixture model with $K$ components is given by
\begin{align*}
  h(x | \Theta) & = \sum_{k = 1}^K \alpha_k f(x | \theta_k),
\end{align*}
where $h(\cdot | \cdot)$ is the mixture density, $\Theta$ the vector
with all $\alpha$ and $\theta$ parameters, and $f(y | \theta_k)$
the density of the vMF distribution with parameter $\theta_k$.
Furthermore, the component weights $\alpha_k$ are positive for all $k$
and sum to one.

\subsection{Simulating mixtures of vMF distributions}

This is straightforwardly achieved by first sampling class ids $z \in
\{1, \ldots, K\}$ with the mixture class probabilities $\alpha_1,
\ldots, \alpha_K$, and then sampling the data from the respective vMF
distributions with parameter $\theta_k$.

\subsection{Estimating the parameters of mixtures of vMF distributions}\label{sec:estim-param-mixt}

EM algorithms for ML estimation of the parameters of mixtures of vMF
distributions are given in \citet{vMF:Dhillon+Sra:2003} and
\citet{vMF:Banerjee+Dhillon+Ghosh:2005}. The EM algorithm exploits the
fact that the complete-data log-likelihood where the component
memberships of the observations are known is easier to maximize than
the observed-data log-likelihood. 

The EM algorithm for fitting mixtures of vMF distributions consists of
the following steps:
\begin{enumerate}
\item Initialization: Either of the following two:
  \begin{enumerate}[(a)]
  \item Assign values to $\alpha_k$ and $\theta_k$ for $k=1,\ldots,K$,
    where $\alpha_k > 0$ and $\sum_{k=1}^K \alpha_k = 1$ and $\theta_k \neq
    \theta_l$ for all $k \neq l$ and $k,l=1,\ldots,K$.
    
    Start the EM algorithm with an E-step.
  \item Assign (probabilities of) component memberships to each of the $n$
    observations. E.g., the output from spherical $k$-means
    \citep{vMF:Hornik+Feinerer+Kober:2012} can be used.

    Start the EM algorithm with an M-step.
  \end{enumerate}
\item Repeat the following steps until the maximum number of
  iterations is reached or the convergence criterion is met.
  \begin{description}
  \item[E-step:] Because the complete-data log-likelihood is linear in
    the missing data which correspond to the component memberships,
    the E-step only consists of calculating the a-posteriori
    probabilities, the probabilities of belonging to a component
    conditional on the observed values, using
    \begin{align*}
      p(k|x_i, \Theta) &\propto \alpha_k f(x_i | \theta_k).
    \end{align*}
  \item[M-step:] Maximize the expected complete-data log-likelihood by
    determining separately for each $k$:
    \begin{align*}
      \hat{\alpha}_k &= \frac{1}{n} \sum_{i=1}^n p(k|x_i, \Theta),\\
      \hat{\mu}_k &= \frac{\sum_{i=1}^n p(k|x_i, \Theta) x_i}{\|\sum_{i=1}^n p(k|x_i, \Theta) x_i\|},&
      - \frac{C_d'(\hat{\kappa}_k)}{C_d(\hat{\kappa}_k)} &=
      \frac{\|\sum_{i=1}^n p(k|x_i, \Theta) x_i\|}{\sum_{i=1}^n p(k|x_i, \Theta)}. 
    \end{align*}
    $\hat{\kappa}_k$ can be determined via the approximation of
    Equation~\ref{eq:kappa_hat_approx}, or one of the improved
    methods discussed in Section~\ref{sec:estim-param-vmf}.
  \item[Convergence check:] Assess convergence by checking (either or both)
    \begin{enumerate}[(a)]
    \item if the relative absolute change in the
      log-likelihood values is smaller than a threshold
      $\epsilon_1$;
    \item if the relative absolute change in parameters is smaller
      than a threshold $\epsilon_2$.
    \end{enumerate}
    If converged, stop the algorithm.   
  \end{description}
\end{enumerate}

This corresponds to the soft-movMF algorithm on page 1357 in
\citet{vMF:Banerjee+Dhillon+Ghosh:2005}. In addition they propose the
hard-movMF algorithm on page 1358. The algorithm above can be modified
to the hard-movMF algorithm by adding a hardening step between E- and
M-step:
\begin{description}
\item[H-step:] Replace the a-posteriori probabilities by assigning
  each observation with probability 1 to one of the components where
  its a-posteriori probability is maximum. Assuming the maximum is
  unique, this corresponds to
  \begin{align*}
    p(k|x_i, \Theta) &= \left\{\begin{array}{rl}
        1,&\text{if } k = \argmax_h p(h|x_i, \Theta),\\
        0,&\text{otherwise}.
      \end{array}
      \right.
  \end{align*}
  If the maximum is not unique, assignment is randomly with equal
  probability to one of the $k \in \argmax_h p(h|x_i, \Theta)$, i.e., ties are
  broken at random.
\end{description}
This algorithm is also referred to as classification EM algorithm in
the literature \citep{vMF:Celeux+Govaert:1992}. A further variant of
the EM algorithm also considered for example in
\cite{vMF:Celeux+Govaert:1992} would be the stochastic EM where
instead of an hardening step a stochastic step is added between E- and
M-step:
\begin{description}
\item[S-step:] Assign at random each observation to one component with
  probability equal to its a-posteriori probability.
\end{description}

The algorithm above determines the parameter estimates if the
concentration parameters are allowed to vary freely over
components. An alternative model specification could be to impose the
restriction that the concentration parameters are the same for all
components. This has the advantage that the clusters will be of
comparable compactness and that spurious solutions containing small
components with very large concentration parameters are eliminated. In
the following we derive how the M-step needs to be modified if the
concentration parameters are restricted to be the same over
components.

From Appendix A.2 of \citet{vMF:Banerjee+Dhillon+Ghosh:2005} the
optimal unconstrained $\kappa_k$ can be obtained by solving
\begin{displaymath}
  A_d(\kappa_k) =
  \frac{\left\| \sum_i p(k|x_i, \Theta) x_i\right\|}{\sum_i p(k|x_i,
    \Theta)}.
\end{displaymath}
If the $\kappa_k$ are constrained to be equal (but are not given), the
optimal common $\kappa$ can be obtained as follows.  Using Equation
A.12 in \citet{vMF:Banerjee+Dhillon+Ghosh:2005}, the modified
Lagrangian becomes
\begin{eqnarray*}
  \lefteqn{
    \sum_k
    \left(
      \sum_i \log(C_d(\kappa)) p(k|x_i,\Theta) +
      \sum_i \kappa \mu_k^{\top} x_i p(k|x_i,\Theta)
    \right)
    + \lambda_k (1 - \mu_k^{\top} \mu_k)} \\
  &=& \log(C_d(\kappa)) \sum_{k,i} p(k|x_i,\Theta)
  + \kappa \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta)
  + \lambda_k (1 - \mu_k^{\top} \mu_k) \\
  &=& n \log(C_d(\kappa))
  + \kappa \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta)
  + \lambda_k (1 - \mu_k^{\top} \mu_k).
\end{eqnarray*}
Setting partials with respect to $\mu_k$ and $\lambda_k$ to zero as in
the reference gives (again)
\begin{displaymath}
  \mu_k =
  \frac{\sum_i x_i p(k|x_i,\Theta)}{\|\sum_i x_i p(k|x_i,\Theta)\|}
\end{displaymath}
and with these $\mu_k$ we obtain for $\kappa$ that
\begin{displaymath}
  0 =
  - A_d(\kappa) n + \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta) =
  - A_d(\kappa) n + \sum_k \left\|\sum_i x_i p(k|x_i,\Theta)\right\|,
\end{displaymath}
i.e., $\kappa$ needs to solve the equation
\begin{displaymath}
  A_d(\kappa) =
  \frac{1}{n} \sum_k \left\|\sum_i x_i p(k|x_i,\Theta)\right\|.
\end{displaymath}

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-movMF}

In the following the gender information is not used and it is
investigated if finite mixtures allow to unravel a distinction between
male and female respondents in their household expenses. Assuming that
it is known that there are two underlying unobserved groups, a mixture
with two components is fitted. The results are visualized in
Figure~\ref{fig:household-example} at the bottom left. The colors are
according to assignments to components using the maximum a-posteriori
probabilities.  These assignments lead to \Sexpr{c("one")[mc2]}
misclassification, i.e., one female is assigned to the component with
the higher dispersion. The estimated parameters and the BIC value of
the model are given in Table~\ref{tab:household-expenses}.

If the number of components is assumed to be unknown, the Bayesian
information criterion (BIC) can be used to select a suitable number of
components \citep[see for example][Chapter
6]{vMF:McLachlan+Peel:2000}. In this case the minimum BIC is obtained
for three components if models in the range of $K = 1,\ldots,5$ are
considered. The results of the mixture with $K = 3$ components are
visualized in Figure~\ref{fig:household-example} at the bottom right
and the estimated parameters and the BIC value are given in
Table~\ref{tab:household-expenses}.  In this case the male respondents
are split into two groups with less dispersion each and different mean
directions. The \proglang{R} code for reproducing these results is
provided in Section~\ref{sec:illustr-exampl-hous-code} after
introducing package \pkg{movMF}.
%
<<echo=false>>=
X2 <- cbind(alpha[[2]], mu[[2]], kappa[[2]], c(BIC(vMFs[[2]]), NA))
X2 <- format(round(X2, digits = 2), nsmall = 2)
X2[1,6] <- paste("$", X2[1,6], "$", sep = "")
X2 <- gsub("NA", "  ", X2)
X2 <- apply(cbind(c("$K = 2$", ""), X2), 1, paste, collapse = "&")

X3 <- cbind(alpha[[3]], mu[[3]], kappa[[3]], c(BIC(vMFs[[3]]), rep(NA, 2)))
X3 <- format(round(X3, digits = 2), nsmall = 2)
X3[1,6] <- paste("$", X3[1,6], "$", sep = "")
X3 <- gsub("NA", "  ", X3)
X3 <- apply(cbind(c("$K = 3$", rep("", 2)), X3), 1, paste, collapse = "&")
@ 

\begin{table}[t!]
  \centering
  \begin{tabular}{lccccrc}
    \hline
    &$\alpha$&\Sexpr{paste(colnames(x), collapse = "&")}&\multicolumn{1}{c}{$\kappa$}&BIC\\
    \hline
    \Sexpr{X2[1]}\\
    \Sexpr{X2[2]}\\
    \hline
    \Sexpr{X3[1]}\\
    \Sexpr{X3[2]}\\
    \Sexpr{X3[3]}\\
    \hline
  \end{tabular}
  \caption{Results of fitting mixtures of vMF distributions to the household expenses example.}
  \label{tab:household-expenses}
\end{table}
\pagebreak
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section[Software]{Software}\label{sec:software}

\subsection[Main fitting function movMF()]{Main fitting function
  \fct{movMF}}

The main function in package \pkg{movMF} for fitting mixtures of vMF
distributions is \fct{movMF}, with synopsis
%
<<echo=false>>=
cat(paste("R> ", prompt(movMF, filename = NA)$usage[[2]]))
@ 
%
The arguments for this function are as follows.
\begin{description}
\item[\normalfont \code{x}:] A numeric data matrix, with rows corresponding to
  observations. If necessary the data is standardized to unit row
  lengths. Furthermore, the matrix can be either stored as a dense
  matrix, a simple triplet matrix (defined in package \pkg{slam}), or
  a general sparse triplet matrix of class `\code{dgtMatrix}' (from
  package \pkg{Matrix}).
\item[\normalfont \code{k}:] An integer indicating the number of components.
\item[\normalfont \code{control}:] A list of control parameters consisting of
 \begin{description}
  \item[\code{E}:] Specifies the variant of the EM algorithm used with
   possible values \code{"softmax"} (default), \code{"hardmax"}
   (classification EM), and \code{"stochmax"} (stochastic EM).
  \item[\code{kappa}:] This argument allows to specify how to determine
   the concentration parameters.
   \begin{itemize}
    \item If numbers are given, the concentration parameters are assumed
     to be fixed and are not estimated in the EM algorithm.
   \item The method for solving for the concentration parameters can
     be \mbox{specified} by one of \code{"Banerjee_et_al_2005"},
     \code{"Tanabe_et_al_2007"}, \code{"Sra_2012"},
     \code{"Song_et_al_2012"}, \code{"uniroot"}, \code{"Newton"},
     \code{"Halley"}, \code{"hybrid"} and \code{"Newton_Fourier"}
     (default). For more details on these methods see
     Section~\ref{sec:estim-param-vmf}.
    \item For common concentration parameters a list with elements
     \code{common = TRUE} and a character string giving the estimation
     method needs to be provided.
   \end{itemize}
 \item[\code{converge}:] Logical indicating if convergence of the
   algorithm should be checked and if in this case the algorithm
   should be stopped before the maximum number of iterations is
   reached. For \code{E} equal to \code{"softmax"} this argument is
   set by default to \code{FALSE}. Note that only condition (a) of the
   convergence check (see Section~\ref{sec:estim-param-mixt}) is
   assessed, i.e., the relative change in the log-likelihood values.
  \item[\code{maxiter}:] Integer indicating the maximum number of
   iterations of the EM algorithm.
   (Default: 100.)
  \item[\code{reltol}:] If the relative change in the log-likelihood
   falls below this threshold the EM algorithm is stopped if
   \code{converge} is \code{TRUE}.
   (Default: \verb|sqrt(.Machine$double.eps)|.)
  \item[\code{verbose}:] Logical indicating if information on the
    progress of the fitting process shall be printed during the
    estimation. 
  \item[\code{ids}:] Indicates either the class memberships of the
    observations or if equal to \code{TRUE} the class memberships are
    obtained from the attributes of the data. In this way the class
    memberships are for example stored if data is generated using
    function \code{rmovMF()}. If this argument is specified, the EM
    algorithm is stopped after one iteration, i.e., the parameter
    estimates are determined conditional on the known true class
    memberships.
  \item[\code{start}:] Allows to specify the starting values used for
    initializing the EM algorithm which then starts with an M-step. It
    can either be a list of matrices where each matrix contains the
    a-posteriori probabilities of the observations or a list of
    vectors containing component assignments for the
    observations. Alternatively it can be a character vector with
    entries \code{"i"}, \code{"p"}, \code{"S"} or \code{"s"}. The
    length of the vector specifies how many different initializations
    are made. \code{"i"} indicates to randomly assign component
    memberships to the observations. The latter three draw
    observations as prototypes and determine a-posteriori
    probabilities by taking the implied cosine dissimilarities between
    observations and prototypes. \code{"p"} randomly picks
    observations as prototypes, \code{"S"} takes the first prototype
    to minimize the total cosine dissimilarity to the observations,
    and then successively picks observations farthest away from the
    already picked prototypes.  For \code{"s"} one takes a randomly
    chosen observation as the first prototype, and then proceeds as
    for \code{"S"}. For more details on these initialization methods
    see package \pkg{skmeans} \citep{vMF:Hornik+Feinerer+Kober:2012}
    which uses the same initialization schemes.
  \item[\code{nruns}:] An integer indicating the number of repeated
    runs of the EM algorithm with random initialization. This argument
    is ignored if either \code{ids} or \code{start} are specified.
  \item[\code{minalpha}:] Components with size below the
    threshold indicated by \code{minalpha} are omitted during the
    estimation with the EM algorithm. This avoids estimation problems
    in the M-step if only very few observations are assigned to
    one component. The disadvantage is that the initial number of
    components is not necessarily equal to the number of components of
    the returned model.
  \end{description}
\end{description}

\subsection[Additional functionality in  movMF]{Additional functionality in \pkg{movMF}}

The object returned by \fct{movMF} has an \proglang{S}3 class called
`\code{movMF}' with methods \fct{print}, \fct{coef}, \fct{logLik} and
\fct{predict} (yields either the component assignments or the matrix
of a-posteriori probabilities). Additional functionality available in
the package includes \fct{rmovMF} for drawing from a mixture of vMF
distributions and \fct{dmovMF} for evaluating the mixture density.

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-code}

In the following the code for reproducing the results presented in
Section~\ref{sec:illustr-exampl-hous-movMF} is provided.  After
loading the dataset the three columns of interest are selected from
the expenses and stored in variable \code{x}. Then the classification
variable \code{gender} is also extracted. First \code{movMF()} is used
to fit a single vMF distribution to the two separate sub-samples and
then mixtures are fitted with number of components varying from 1 to
5. To avoid reporting sub-optimal solutions where the EM algorithm was
trapped in a local optimum, the best result from 20 random
initializations is returned.
%
<<eval=false>>=
<<illustration1>>
@ 
%
The BIC values for the different mixtures can be compared
using
<<>>=
sapply(vMFs, BIC)
@ 

\section{Numerical issues}
\label{sec:numerics}

In what follows it will be convenient to write
\begin{displaymath}
  H_\nu(\kappa)
  = \pFq{0}{1}(; \nu + 1; \kappa^2 / 4)
  = \frac{\Gamma(\nu + 1)}{(\kappa / 2)^\nu} I_\nu(\kappa).
\end{displaymath}
As shown in Section~\ref{sec:vmf-distribution}, computing
log-likelihoods for (mixtures of) vMF distributions on $\R^d$ requires
the computation of $\log(H_{d/2-1})$, and ML estimation of concentration
parameters amounts to solving equations of the form $A_d(\kappa)=\rho$,
where
\begin{displaymath}
  A_d(\kappa)
  = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)}
  = \frac{\kappa}{d} \frac{H_{d/2}(\kappa)}{H_{d/2-1}(\kappa)}  
\end{displaymath}
is the logarithmic derivative of $H_{d/2-1}$.

As $H_\nu(\kappa) \to \infty$ as $\kappa \to \infty$ (in fact, quite
rapidly, see below), it clearly is a bad idea to try to compute
$\log(H)$ as the logarithm of $H$, or $A$ as the ratio of $H$
functions.  Similar considerations apply for using logarithms or
ratios of incomplete modified Bessel functions~$I$.  One might wonder
whether one could successfully take advantage of the fact that the
Bessel functions provided by \proglang{R} are based on the
\pkg{SPECFUN} package of \citet{vMF:Cody:1993} and hence also provide
the exponentially scaled modified Bessel function $e^{-\kappa}
I_\nu(\kappa)$ (the scaling is motivated by the asymptotic expansion
$I_\nu(\kappa) \sim e^\kappa (2\pi\kappa)^{-1/2} \sum_m a_m(\nu)
/ \kappa^m$ for $\kappa \to \infty$).  However, exponential scaling
does not help in situations where $1 \ll \kappa \ll \nu$: e.g., for
$\kappa = 6000$ and $\nu = 10000$, $H$ overflows whereas $I$
underflows (even though \proglang{R} gives \verb|Inf| and \verb|0| for
the cases without and with exponential scaling, respectively).

\subsection[Computing A]{Computing $A_d$}

One can use the Gauss continued fraction
\begin{displaymath}
  \frac{I_\nu(z)}{I_{\nu - 1}(z)}
  =
  \frac{1}{2\nu/z + {}} \frac{1}{2(\nu+1)/z + {}}
  \frac{1}{2(\nu+2)/z + {}} \cdots
\end{displaymath}
for the ratio of modified Bessel functions
(\url{https://dlmf.nist.gov/10.33.E1}) to compute $A$ as
\begin{displaymath}
  A_d(\kappa)
  = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)}
  =
  \frac{1}{d/\kappa + {}}\frac{1}{(d+2)/\kappa + {}}
  \frac{1}{(d+4)/\kappa + {}} \cdots
\end{displaymath}
\citep[Equation~4.3 in][]{vMF:Banerjee+Dhillon+Ghosh:2005}, using,
e.g., Steed's method (e.g., \url{https://dlmf.nist.gov/3.10}) for
evaluation.  However, as pointed out by
\citet{vMF:Gautschi+Slavik:1978} and \citet{vMF:Tretter+Walster:1980}
\citep[and, quite recently, re-iterated by][]{vMF:Song+Liu+Wang:2012},
the Perron continued fraction
\begin{displaymath}
  \frac{I_\nu(z)}{I_{\nu - 1}(z)}
  =
  \frac{z}{2\nu + z - {}}
  \frac{(2\nu+1)z}{2 \nu + 1 + 2z - {}}
  \frac{(2\nu+3)z}{2 \nu + 2 + 2z - {}}
  \frac{(2\nu+5)z}{2 \nu + 3 + 2z - {}}
  \cdots  
\end{displaymath}
is numerically more stable (as computing it by forward recursion only
accumulates positive terms, whereas for the Gauss continued fraction
the terms alternate in sign), and converges substantially faster for
positive $z \gg \nu$.  Hence, by default we compute $A$ via the Perron
continued fraction \citep[with implementation based on Equation~3.3$'$
in][]{vMF:Gautschi+Slavik:1978}, and additionally provide computation
via the Gauss continued fraction \citep[with implementation based on
Equation~3.2$'$ in][]{vMF:Gautschi+Slavik:1978} and using
exponentiation of $\log(H)$ differences as alternatives.

For $\kappa$ close to zero it is better (and
necessary for $\kappa = 0$) to use the approximation
\begin{displaymath}
  A_d(\kappa) =
  \frac{1}{d} \kappa - \frac{1}{d^2(d+2)} \kappa^3 + O(\kappa^5),
  \qquad
  \kappa \to 0
\end{displaymath}
\citep[Equation 5]{vMF:Schou:1978}.  The $O(\kappa^5)$ can be made more
precise by using the series representation
$I_\nu(z) = \sum_{n=0}^\infty (z/2)^{2n + \nu} / (n! \Gamma(n + \nu +
1))$ so that for $\kappa \to 0$,
\begin{eqnarray*}
  A_d(\kappa)
  &=& \frac{\displaystyle (\kappa/2)^{d/2}
    \left(
      \frac{1}{\Gamma(d/2 + 1)} +
      \frac{\kappa^2/4}{\Gamma(d/2 + 2)} +
      \frac{\kappa^4/32}{\Gamma(d/2 + 3)} +
      O(\kappa^6)
    \right)}{\displaystyle
    (\kappa/2)^{d/2-1}
    \left(
      \frac{1}{\Gamma(d/2)} +
      \frac{\kappa^2/4}{\Gamma(d/2 + 1)} +
      \frac{\kappa^2/32}{\Gamma(d/2 + 2)} +
      O(\kappa^6)
    \right)} \\
  &=& \frac{\displaystyle \kappa}{2}
  \frac{\frac{2}{d} +
    \frac{\kappa^2}{d(d+2)} +
    \frac{\kappa^4}{4d(d+2)(d+4)} +
    O(\kappa^6)}{\displaystyle
    1 +
    \frac{\kappa^2}{2d} +
    \frac{\kappa^4}{8d(d+2)} +
    O(\kappa^6)}
\end{eqnarray*}
from which the coefficient of $\kappa^5$ can straightforwardly be
determined as
\begin{eqnarray*}
    \frac{1}{2}
    \left(
      \frac{1}{4d(d+2)(d+4)} -
      \frac{1}{2d^2(d+2)} -
      \frac{2}{8d^2(d+2)} +
      \frac{2}{4d^3}
    \right)
  &=& \frac{2}{d^3(d+2)(d+4)}
\end{eqnarray*}
so that
\begin{displaymath}
  A_d(\kappa) =
  \frac{1}{d} \kappa
  - \frac{1}{d^2(d+2)} \kappa^3
  + \frac{2}{d^3(d+2)(d+4)} \kappa^5
  + O(\kappa^7),
  \qquad
  \kappa \to 0.
\end{displaymath}

From this approximations for $A'$ and $A''$ can be obtained by
term-wise differentiation and also used for $\kappa$ close to 0.

\subsection[Computing log(H)]{Computing $\log(H_{\nu})$}

Write
\begin{displaymath}
  R_\nu(\kappa) =
  \frac{I_{\nu+1}(\kappa)}{I_\nu(\kappa)}
\end{displaymath}
for the Bessel function ratio (so that $A_d = R_{d/2 - 1}$) and 
\begin{displaymath}
  G_{\alpha,\beta}(\kappa) =
  \frac{\kappa}{\alpha + \sqrt{\kappa^2 + \beta^2}}.
\end{displaymath}
\citet[][Equations 11 and 16]{vMF:Amos:1974} shows that for all
non-negative $\kappa$ and $\nu$, $G_{\nu+1/2,\nu+3/2}(\kappa) \le
R_\nu(\kappa) \le G_{\nu,\nu+2}(\kappa)$. With $\beta_{SS}(\nu) =
\sqrt{(\nu+1/2)(\nu+3/2)}$, Theorem~2 of
\citet{vMF:Simpson+Spector:1984} implies the upper bound $R_\nu(\kappa)
\le G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)$ for all non-negative $\kappa$
and $\nu$.  We thus have
\begin{equation}
  \label{eq:bounds-for-R}
  G_{\nu+1/2,\nu+3/2}(\kappa) \le
  R_\nu(\kappa) \le
  \min \left( G_{\nu,\nu+2}(\kappa), G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)
  \right)
\end{equation}
(from which the bounds of Equation~\ref{eq:bounds-for-kappa-hat} are
obtained by inversion).  In addition, using results in
\citet[][Equations~5 and 6]{vMF:Schou:1978}, one can show that
$G_{\nu,\nu+2}$ and $G_{\nu+1/2,\beta_{SS}(\nu)}$ are second order
exact approximations for $\kappa \to 0$ and $\kappa \to \infty$,
respectively \citep{vMF:Hornik+Gruen:2013}.

As $\log(H_\nu)' = R_\nu$ (and $H_\nu(0) = 1$), integration gives
\begin{displaymath}
  \log(H_\nu(\kappa)) = \int_0^\kappa R_\nu(t) \,dt.
\end{displaymath}
Thus, the bounds for the Bessel function ratio $R_\nu$ can be used to
obtain bounds for $H_\nu$.  Writing
\begin{displaymath}
  S_{\alpha,\beta}(\kappa) 
  = \sqrt{\kappa^2 + \beta^2}
  - \alpha \log(\alpha + \sqrt{\kappa^2 + \beta^2})
  - \beta + \alpha \log(\alpha + \beta),
\end{displaymath}
it is easily verified that $S_{\alpha,\beta}' = G_{\alpha,\beta}$ and
$S_{\alpha,\beta}(0) = 0$.  Using the Amos-type bounds from
Equation~\ref{eq:bounds-for-R}, we thus obtain that for $\nu \ge 0$ and
$\kappa \ge 0$,
\begin{equation}
  \label{eq:bounds-for-log-H}
  S_{\nu+1/2,\nu+3/2}(\kappa) \le
  \log(H_\nu(\kappa)) \le
  \min 
  \left(
    S_{\nu,\nu+2}(\kappa), 
    S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)
  \right).
\end{equation}

Where a single approximating value is sought, we prefer to use the upper
bound which is based on the combination of upper Amos-type bounds which
are second order exact at zero and infinity.  Again, the bounds in
Equation~\ref{eq:bounds-for-log-H} are surprisingly tight.

\par\medskip\noindent
\textbf{Result.}
\begin{em}
Let 
\begin{displaymath}
  s(\nu) = (\nu+3/2) - \beta_{SS}(\nu)
  - (\nu+1/2)
  \log \frac{2 (\nu + 1)}{\nu + 1/2 + \beta_{SS}(\nu)},
\end{displaymath}
with $\nu \geq 0$.

Then $s$ is non-increasing on $[0, \infty)$ with $s(0) = (3 - \sqrt{3} +
\log((1 + \sqrt{3}) / 4)) / 2 = 0.4433537$ and $\lim_{\nu \to \infty}
s(\nu) = 1/4$.  For $\nu_0 \ge 0$,
\begin{displaymath}
  \sup_{\kappa \ge 0, \nu \ge \nu_0}
  (S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa) - S_{\nu+1/2,\nu+3/2}(\kappa))
  = s(\nu_0).
\end{displaymath}
For $\beta_{SS}(\nu) \le \beta \le \nu + 3/2$,
\begin{displaymath}
  \sup_{\kappa \ge 0} |\log(H_\nu(\kappa)) - S_{\nu+1/2,\beta}(\kappa)|
  \le s(\nu).
\end{displaymath}
\end{em}
\begin{proof}
For simplicity, write $\alpha = \nu + 1/2$, $\beta_L = \nu + 3/2$ and
$\beta_U = \beta_{SS}(\nu)$, omitting the dependence on $\nu$.  We have
$\beta_U \le \beta_L$ and $G_{\alpha,\beta_U} \ge G_{\alpha,\beta_L}$.
Hence,
\begin{displaymath}
  S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa)
  = \int_0^\kappa (G_{\alpha,\beta_U}(t) - G_{\alpha,\beta_L}(t)) \, dt
\end{displaymath}
is non-decreasing in $\kappa$, and attains its supremum for $\kappa \to
\infty$.  Now,
\begin{eqnarray*}
  \lefteqn{S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa)} \\
  &=& \sqrt{\kappa^2 + \beta_U^2} - \sqrt{\kappa^2 + \beta_L^2} 
  - \alpha \log \left(
    \frac{\alpha + \sqrt{\kappa^2 + \beta_U^2}}{\alpha + \sqrt{\kappa^2 + \beta_L^2}}
  \right)
  + (\beta_L - \beta_U) - \alpha \log \frac{\alpha +
    \beta_L}{\alpha + \beta_U}.
\end{eqnarray*}
As
\begin{displaymath}
  \sqrt{\kappa^2 + \beta_U^2} - \sqrt{\kappa^2 + \beta_L^2} =
  \frac{(\kappa^2 + \beta_U^2) - (\kappa^2 + \beta_L^2)}{
    \sqrt{\kappa^2 + \beta_U^2} + \sqrt{\kappa^2 + \beta_L^2}} =
  \frac{\beta_U^2 - \beta_L^2}{
    \sqrt{\kappa^2 + \beta_U^2} + \sqrt{\kappa^2 + \beta_L^2}},
\end{displaymath}
we have
\begin{displaymath}
  \lim_{\kappa \to \infty}
  (S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa))
  = (\beta_L - \beta_U) - \alpha \log \frac{\alpha +
    \beta_L}{\alpha + \beta_U}
  = s(\nu).
\end{displaymath}

The value of $s(0)$ is obtained by insertion, and $\lim_{\nu \to \infty}
s(\nu)$ can be obtained as follows.  We have
\begin{displaymath}
  \beta_L - \beta_U
  = \sqrt{\alpha + 1} (\sqrt{\alpha + 1} - \sqrt{\alpha})
  = \frac{\sqrt{\alpha + 1}}{\sqrt{\alpha + 1} + \sqrt{\alpha}}
  \to 1/2
\end{displaymath}
as $\nu \to \infty$, and
\begin{eqnarray*}
  \frac{\alpha + \beta_U}{\alpha + \beta_L}
  &=& \frac{\alpha + \sqrt{\alpha (\alpha + 1)}}{2\alpha + 1} \\
  &=& \frac{(1 + \sqrt{1 + 1 / \alpha}) / 2}{1 + 1 / (2\alpha)} \\
  &=& \frac{1}{2} \left(1 + 1 + \frac{1}{2\alpha} + O(\alpha^{-2}) \right)
  \left(1 - \frac{1}{2\alpha} + O(\alpha^{-2}) \right) \\
  &=& 1 - \frac{1}{4\alpha} + O(\alpha^{-2})
\end{eqnarray*}
so that
\begin{displaymath}
  \alpha \log \frac{\alpha + \beta_U}{\alpha + \beta_L}
  = \alpha \left( - \frac{1}{4\alpha} + O(\alpha^{-2}) \right)
  = - \frac{1}{4} + O(\alpha^{-1})
  \to -1/4
\end{displaymath}
as $\nu \to \infty$.  Hence, $\lim_{\nu \to \infty} s(\nu) = 1/4$.

To show that $s$ is non-increasing, note that we have
\begin{displaymath}
  \alpha + \beta_L = 2 \alpha + 1, \qquad
  \frac{d\alpha}{d\nu} = \frac{d\beta_L}{d\nu} = 1, \qquad
  \frac{d\beta_U}{d\nu} = \frac{2\alpha + 1}{2\beta_U}
\end{displaymath}
and hence
\begin{displaymath}
  \frac{ds}{d\nu} =
  1
  - \frac{2\alpha + 1}{2\beta_U}
  - \log \frac{2\alpha + 1}{\alpha + \beta_U}
  - \alpha \left(
    \frac{2}{2\alpha+1}
    - \frac{1}{\alpha + \beta_U}
    \left( 1 + \frac{2\alpha + 1}{2\beta_U} \right)
  \right).
\end{displaymath}
For non-negative $t$ and $\tau$,
\begin{displaymath}
  \log \frac{t+\tau}{t}
  = \int_0^\tau \frac{1}{t+s} \, ds
  \ge \frac{1}{t + \tau} \int_0^\tau ds
  = \frac{\tau}{t + \tau}.
\end{displaymath}
Hence, with $t = \alpha + \beta_U$ and $\tau = \beta_L - \beta_U$,
\begin{displaymath}
  \log \frac{2\alpha + 1}{\alpha + \beta_U}
  \ge \frac{\beta_L - \beta_U}{2\alpha + 1}
\end{displaymath}
and
\begin{eqnarray*}
  \frac{ds}{d\nu}
  &\le&
  1
  - \frac{2\alpha + 1}{2\beta_U}
  + \frac{\beta_U - \beta_L}{2\alpha + 1}
  + \frac{\alpha}{\alpha + \beta_U} \left( 1 + \frac{2\alpha +
      1}{2\beta_U} \right) - \frac{2\alpha}{2\alpha+1} \\
  &=& \frac{2\alpha+1}{2\beta_U}
  \left( \frac{\alpha}{\alpha + \beta_U} - 1 \right)
  + \frac{\alpha}{\alpha + \beta_U}
  + \frac{2\alpha + 1 + \beta_U - \beta_L - 2\alpha}{2\alpha + 1} \\
  &=& - \frac{2\alpha + 1}{2(\alpha + \beta_U)}
  + \frac{\alpha}{\alpha + \beta_U}
  + \frac{\beta_U - \alpha}{2\alpha + 1} \\
  &=& - \frac{1}{2(\alpha + \beta_U)}
  + \frac{\beta_U - \alpha}{2\alpha + 1} \\
  &=& \frac{-(2\alpha + 1) + 2 (\beta_U^2 - \alpha^2)}
  {2(\alpha + \beta_U)(2\alpha + 1)} \\
  &=& \frac{-1}{2(\alpha + \beta_U)(2\alpha + 1)} \\
  &\le& 0,
\end{eqnarray*}
establishing that $s$ is non-increasing.

Finally, for $\beta_{SS}(\nu) \le \beta \le \nu + 3/2$, both
$\log(H_\nu)$ and $S_{\nu+1/2,\beta}$ are bounded below by
$S_{\nu+1/2,\nu+3/2}$ and above by $S_{\nu+1/2,\beta_{SS}(\nu)}$, so that
$| \log(H_\nu) - S_{\nu+1/2,\beta} | \le (S_{\nu+1/2,\beta_{SS}(\nu)} -
S_{\nu+1/2,\nu+3/2}) \le s(\nu)$,
completing the proof.
\end{proof}

These results can be used to derive the following approach to computing
$\log(H_\nu)$.  Choose a threshold $\theta$ such that $e^\theta$ does
not overflow and $e^{-\theta}$ does not underflow.  Using IEEE 754
double precision floating point computations, we can take $\theta =
700$.  Choose an approximation $L_\nu$ for $\log(H_\nu)$ for which
$S_{\nu+1/2,\nu+3/2} \le L_\nu \le S_{\nu+1/2,\beta_{SS}(\nu)}$ for all
$\kappa \ge 0$.  By the above, this has approximation error at most
$s(0) < 1/2$.  Possible choices for $L_\nu$ are convex combinations of
the lower and upper bounds in Equation~\ref{eq:bounds-for-log-H} (e.g.,
simply take the upper bound) or  $S_{\nu+1/2,\beta}(\kappa)$ with some
$\beta_{SS}(\nu) \le \beta \le \nu + 3/2$ (e.g, $\beta = \nu + 1$); in
the package we use
\begin{eqnarray*}
  L_\nu(\kappa)
  &=& \int_0^\kappa 
  \min(G_{\nu,\nu+2}(t), G_{\nu+1/2,\beta_{SS}(\nu)}(t)) \, dt \\
  &=& S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)
  + (S_{\nu,\nu+2}(\min(\kappa,\kappa_\nu)) -
  S_{\nu+1/2,\beta_{SS}(\nu)}(\min(\kappa,\kappa_\nu))),
\end{eqnarray*}
where $\kappa_\nu = \sqrt{(3\nu+11/2) (\nu+3/2)}$ is the positive root
of the equation $G_{\nu,\nu+2}(\kappa) =
G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)$.  Then if $L_\nu(\kappa) \le \theta 
- 1/2$ (so that $\log(H_\nu(\kappa)) \le \theta$), compute
$H_\nu(\kappa)$ by its series expansion, and take the logarithm of this.  
If $\theta - 1 / 2< L_\nu(\kappa) \le 2 \theta - 1$ so that
\begin{displaymath}
  |\log(H_\nu(\kappa)) - L_\nu(\kappa) / 2|
  \le |\log(H_\nu(\kappa)) - L_\nu(\kappa)| + L_\nu(\kappa) / 2 \le
  \theta,
\end{displaymath}
and hence $e^{-L_\nu(\kappa)/2} H_\nu(\kappa)$ does not over- or
underflow, write
\begin{displaymath}
  H_\nu(\kappa) =
  e^{L_\nu(\kappa) / 2} e^{- L_\nu(\kappa) / 2} H_\nu(\kappa) =
  e^{L_\nu(\kappa) / 2}
  \sum_{m=0}^\infty
  \frac{e^{-L_\nu(\kappa)/2}\Gamma(\nu+1)}{\Gamma(\nu+1+m)}\frac{(\kappa^2/4)^m}{m!},
\end{displaymath}
and compute $\log(H_\nu(\kappa))$ as $L_\nu(\kappa)/2$ plus the
logarithm of the sum of the scaled series.  Otherwise, use the
approximation $L_\nu(\kappa)$ (note that with $\theta = 700$, this has
a relative approximation error of about at most $1/2 \cdot 1/1400 \le
0.0004$).  This is the approach for computing $\log(H_\nu)$ currently
implemented in package \pkg{movMF}.

Alternatively, one can use the bounds to obtain refined strategies of
computing $\log(H_\nu)$ using available codes for modified Bessel
functions.  We have
\begin{displaymath}
  \log(H_\nu(\kappa))
  = \log(I_\nu(\kappa)) - \nu \log(\kappa/2) + \log(\Gamma(\nu + 1))
\end{displaymath}
and hence, using Stirling's approximation $\log(\Gamma(z)) \approx (z -
1/2) \log(z) - z + \log(2\pi) / 2$ and $L_\nu = S_{\nu+1/2,\nu+1}$ for
notational convenience, gives
\begin{eqnarray*}
  \lefteqn{\log(I_\nu(\kappa))} \\
  &=& \log(H_\nu(\kappa)) + \nu \log(\kappa/2) - \log(\Gamma(\nu + 1))
  \\
  &\approx& L_\nu(\kappa) + \nu \log(\kappa/2)
  - (\nu + 1/2) \log(\nu + 1) + (\nu + 1) - \frac{\log(2\pi)}{2} 
  \\
  &=& \sqrt{\kappa^2 + (\nu + 1)^2}
  + (\nu + 1/2) \log
  \frac{\kappa}{\nu + 1/2 + \sqrt{\kappa^2 + (\nu + 1)^2}}
  \\
  & & \quad - \, \frac{\log(\kappa/2)}{2} 
  + (\nu + 1/2) \log\frac{2\nu + 3/2}{2(\nu + 1)}
  - \frac{\log(2\pi)}{2}.
\end{eqnarray*}
This implies
\begin{eqnarray*}
  \log(I_\nu(\kappa))
  &=& \sqrt{\kappa^2 + (\nu + 1)^2}
  + (\nu + 1/2) \log
  \frac{\kappa}{\nu + 1/2 + \sqrt{\kappa^2 + (\nu + 1)^2}} 
  - \, \frac{\log(\kappa)}{2} + O(1),
\end{eqnarray*}
where the $O(1)$ can be made more precise, giving a first-order variant
of the large $\nu$ uniform asymptotic approximation for $I_\nu$ given by
\citet{vMF:Olver:1954}.  (For $\nu \to \infty$, the error in the
approximation for $\log(\Gamma)$ tends to zero, and as $(\nu + 1/2) \log
((2\nu + 3/2) / (2 \nu + 2)) \to -1/4$, the $O(1)$ becomes $-\log(\pi)/2
- 1 / 4 + o(1)$ (modulo the error in the approximation of
$\log(H_\nu(\kappa))$ by $L_\nu(\kappa)$ which is at most $1/4 + o(1)$).

From the above, we can see that when $\kappa \gg \nu$, $I_\nu(\kappa)$
overflows quite rapidly; on the other hand, for $\kappa = o(\nu)$ and
$\nu \to \infty$, $I_\nu(\kappa)$ underflows quite rapidly.  For
computing logarithms, overflow can be avoided by employing the
exponentially scaled modified Bessel function $e^{-\kappa}
I_\nu(\kappa)$ \citep[as commonly available in codes for computing
Bessel functions, such as the \pkg{SPECFUN} package][used by
\proglang{R}]{vMF:Cody:1993} and computing $\log(I_\nu(\kappa)) =
\kappa + \log(e^{-\kappa} I_\nu(\kappa))$.  However, this clearly does
not help avoiding underflow.

This motivates the following strategy for computing $\log(I_\nu(\kappa))$ for
a wide range of values.  Start by computing a quick approximation
$T_\nu(\kappa)$ to $\log(I_\nu)$, either using the above
$L_\nu(\kappa) + \nu \log(\kappa/2) - \log(\Gamma(\nu + 1))$ or the
leading term of the large $\nu$ uniform asymptotic approximation
(in \proglang{R}, the latter is available via function
\verb|besselI.nuAsym()| in package
\pkg{Bessel}; \citealt{vMF:Maechler:2012}).  If this is ``sufficiently
small'' in absolute value (so that $I_\nu(\kappa)$ will neither over-
nor underflow), compute $\log(I_\nu(\kappa))$ directly as the logarithm
of $I_\nu(\kappa)$.  Otherwise, if the approximation is ``too large'',
but $T_\nu(\kappa) - \kappa$ is sufficiently small in absolute value
so that the exponentially scaled $e^{-\kappa} I_\nu(\kappa)$ can be
computed without over- or underflow, compute $\log(I_\nu(\kappa))$ as
$\kappa + \log(e^{-\kappa} I_\nu(\kappa))$.  Otherwise, use the quick
approximation, or the large $\nu$ uniform asymptotic approximation
with additional terms (package \pkg{Bessel} allows up to 4 additional
terms).  This strategy can readily be translated into a strategy for
computing $\log(H_\nu)$.

\newpage
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\setkeys{Gin}{width=\textwidth}
\section{Application: useR! 2008 abstracts}\label{sec:appl-user-2008}

In 2008 the ``use\proglang{R}! 2008'', the 3rd international
\proglang{R} user conference, took place in Dortmund, Germany. In
total \Sexpr{nrow(useR_2008_abstracts)} abstracts were submitted and
accepted for presentation at the conference. The abstracts with
additional information such as title, author, session, and keywords
were originally made available in the \proglang{R} data package
\pkg{corpus.useR.2008.abstracts} available from the repository at
\url{https://datacube.wu.ac.at/}. The dataset is also included in
package \pkg{movMF}.

The following code loads the data contained in package \pkg{movMF}.
%
<<eval=false>>=
<<corpus>>
@
%
Using the \pkg{tm} package \citep{vMF:Feinerer+Hornik+Meyer:2008,
  vMF:Feinerer:2012} the data can be pre-processed by (1) generating a
corpus from the vector of abstracts and (2) building a document-term
matrix from the corpus. For constructing the document-term matrix each
abstract needs to be tokenized (i.e., split into words, e.g., by using
white space characters as separators) and transformed to lower case.
Punctuation as well as numbers can be removed and the words can be
stemmed (i.e., inflected words are reduced to a base form). In
addition a minimum length can be imposed on the words as well as a
minimum and maximum frequency within an abstract required.

The vector of abstracts is transformed to an object which has an
extended class of `\code{Source}' using \fct{VectorSource}. This
object is used as input for \fct{Corpus} to generate the corpus. The
map from the corpus to the document-term matrix is performed using
\fct{DocumentTermMatrix}. The \code{control} argument of
\fct{DocumentTermMatrix} specifies which pre-processing steps are
applied to determine the frequency vectors of term occurrences in each
abstract. We use the titles and the abstracts together to
construct the document-term matrix.
%
<<>>=
library("tm")
abstracts_titles <- 
  apply(useR_2008_abstracts[,c("Title", "Abstract")],
        1, paste, collapse = " ")
useR_2008_abstracts_corpus <- VCorpus(VectorSource(abstracts_titles))
useR_2008_abstracts_DTM <- 
  DocumentTermMatrix(useR_2008_abstracts_corpus,
                     control = list(
                       tokenize = "MC",
                       stopwords = TRUE,
                       stemming = TRUE,
                       wordLengths = c(3, Inf)))
@ 

Method \code{"MC"} was used for tokenizing. This method aims at
producing the same results as the \pkg{MC} toolkit for creating vector
models from text documents \citep{vMF:Dhillon+Modha:2001,
  vMF:Dhillon+Fan+Guan:2001}. In addition the words are stemmed, a set
of stop words are removed and all words are kept which have a length of
at least 3.

The resulting document-term matrix has
\Sexpr{nrow(useR_2008_abstracts_DTM)} rows and
\Sexpr{ncol(useR_2008_abstracts_DTM)} columns.  The ten most frequent
terms (occurring in different abstracts) are the following.
%
<<>>=
library("slam")
ColSums <- col_sums(useR_2008_abstracts_DTM > 0)
sort(ColSums, decreasing = TRUE)[1:10]
@ 
%
To reduce the dimension of the problem and omit terms which occur too
frequently or too infrequently in the corpus to be of use when
clustering the abstracts, we omit all terms which occur in less than 5
abstracts and more than 90 abstracts.
%
<<>>=
useR_2008_abstracts_DTM <- 
  useR_2008_abstracts_DTM[, ColSums >= 5 & ColSums <= 90]
useR_2008_abstracts_DTM
@ 
%
The data is transformed using TF-IDF weighting.
%
<<>>=
useR_2008_abstracts_DTM <- weightTfIdf(useR_2008_abstracts_DTM)
@ 
%
In the following different mixtures of vMF distributions are fitted to
training data using 10-fold cross-validation and are compared
based on the predictive log-likelihoods on the hold-out data to select
a suitable model. The numbers of components are varied and the
mixtures are fitted with concentration parameters constrained to be
the same over components as well as where the concentration parameters
are allowed to freely vary over components. For each training data set
the EM algorithm is repeated 20 times with different random
initializations.
%
<<fit-movMF, eval=false>>=
set.seed(2008)
library("movMF")
Ks <- c(1:5, 10, 20)
splits <- sample(rep(1:10, length.out = nrow(useR_2008_abstracts_DTM)))
useR_2008_movMF <- 
  lapply(Ks, function(k) 
         sapply(1:10, function(s) {
           m <- movMF(useR_2008_abstracts_DTM[splits != s,], 
                      k = k, nruns = 20)
           logLik(m, useR_2008_abstracts_DTM[splits == s,])
         }))
useR_2008_movMF_common <- 
  lapply(Ks, function(k) 
         sapply(1:10, function(s) {
           m <- movMF(useR_2008_abstracts_DTM[splits != s,], 
                      k = k, nruns = 20,
                      kappa = list(common = TRUE))
           logLik(m, useR_2008_abstracts_DTM[splits == s,])
         }))
@ 
%
<<echo=false, results=hide>>=
if(cache & file.exists("movMF.rda")) {
  load("movMF.rda")
  library("movMF")
  Ks <- c(1:5, 10, 20)
} else {
<<fit-movMF>>
if(cache) {
  save(useR_2008_movMF, useR_2008_movMF_common,
  file = "movMF.rda")
} else {
  if(file.exists("movMF.rda")) file.remove("movMF.rda")
}
}
@
%
In Figure~\ref{fig:logliks} the fitted models are compared using the
cross-validated predictive log-likelihoods. The results for the models
with free concentration parameters are on the left, for the models with
common concentration parameters on the right. The predictive
log-likelihoods on the hold-out data indicate that the best solutions
have between 1 and 5 components and that the models with more
components have rather bad predictive log-likelihoods.

\begin{figure}[t!]
  \centering
<<fig=true, echo=false, results=hide>>=
logLiks <- data.frame(logLik = c(unlist(useR_2008_movMF),
                        unlist(useR_2008_movMF_common)),
                      K = c(rep(Ks, sapply(useR_2008_movMF, length)),
                        rep(Ks, sapply(useR_2008_movMF_common, length))),
                      Dataset = seq_len(length(useR_2008_movMF[[1]])),
                      Method = factor(rep(1:2, each = length(unlist(useR_2008_movMF))),
                        1:2, c("free", "common")))
logLiks$logLik <- logLiks$logLik - rep(rep(with(logLiks, tapply(logLik, Dataset, mean)), length(Ks)), 2)
print(xyplot(logLik ~ K | Method, data = logLiks, groups = Dataset, type = "l", lty = 1, 
             xlab = "Number of components", ylab = "Predictive log-likelihood",           
             strip = strip.custom(factor.levels  = 
               expression(paste("Free ", kappa), paste("Common ", kappa)))))
@ 
\caption{Predictive log-likelihoods for different and common
  concentration parameters $\kappa$ for the fitted mixtures of vMF
  distributions to the ``use\proglang{R}!  2008'' abstracts. }
  \label{fig:logliks}
\end{figure}

Following the conclusions of the comparison of the predictive
log-likelihoods values we further investigate the model where the
concentration parameters are constrained to be equal over components
and the number of components is equal to 2.
%
<<>>=
set.seed(2008)
best_model <- movMF(useR_2008_abstracts_DTM, k = 2, nruns = 20,
                    kappa = list(common = TRUE))
@ 
%
In the following we look at the 10 most important words of each fitted
component:
%
<<>>=
apply(coef(best_model)$theta, 1, function(x) 
      colnames(coef(best_model)$theta)[order(x, decreasing = TRUE)[1:10]])
@ 
%
Clearly one component deals with issues related to infrastructure or
implementation, while the other component is focusing more on
statistical modeling.

The clustering obtained from the a-posteriori probabilities is
analyzed by comparing the cluster membership with the keywords
assigned to an abstract. Because each abstract might have several
keywords assigned, the abstracts and their cluster assignments are
suitably repeated.
%
<<>>=
clustering <- predict(best_model)
keywords <- useR_2008_abstracts[, "Keywords"]
keywords <- sapply(keywords, 
                   function(x) sapply(strsplit(x, ", ")[[1]], function(y) 
                                      strsplit(y, "-")[[1]][1]))
tab <- table(Keyword = unlist(keywords), 
             Component = rep(clustering, sapply(keywords, length)))
@ 
%
In the following only keywords are shown which have more than 8
abstracts assigned to them.
%
<<>>=
(tab <- tab[rowSums(tab) > 8, ])
@ 
%
The table is also visualized in Figure~\ref{fig:mosaic}. Abstracts
where the keywords assigned relate to infrastructure or
implementational issues such as connectivity, high performance
computing and user interfaces are associated with one component,
whereas abstracts which are related to statistical modeling issues
such as bioinformatics, biostatistics and modeling are more likely to
be assigned to the other component.

\begin{figure}[t!]
  \centering
<<fig=true, height=6, echo=false, width=10>>=
library("vcd")
mosaic(tab, shade = TRUE, 
       labeling_args = list(rot_labels = 0, just_labels = c("center", "right"), 
         pos_varnames = c("left", "right"), rot_varnames = 0))
@   
\caption{Cross-tabulation of the keywords in each session and the
  cluster assignment for keywords which were assigned to more than 8
  abstracts.}
  \label{fig:mosaic}
\end{figure}
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Summary}\label{sec:summary}

An \proglang{R} package for fitting finite mixtures of vMF
distributions is presented. Special focus has been given on numerical
issues when evaluating the log-likelihood as well as on methods for
ML estimation of the concentration parameter.

A possible extension for package \pkg{movMF} would be to allow for
more flexible distributions in the components which also have as
support the unit sphere. The vMF distribution is the analogue of the
isotropic multivariate normal distribution, i.e., where the
variance-covariance matrix is a multiple of the identity matrix. In
$\mathbb{R}^3$ the generalization of the vMF distribution which is the
analogue to the general multivariate normal distribution on the
two-dimensional unit sphere is the Fisher-Bingham (or Kent)
distribution \citep{vMF:Kent:1982}. Finite mixtures of Fisher-Bingham
distributions were used in \cite{vMF:Peel+Whiten+McLachlan:2001} to
identify joint sets present in rock mass. They also allowed for an
additional component which followed the uniform distribution on the
unit sphere. However, no generalization for higher dimensions are
available and \cite{vMF:Dortet-Bernadet+Wicker:2008} propose to use
inverse stereographic projections of multivariate normal distributions
to cluster gene expression profiles.

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section*{Acknowledgments}

This research was supported by the Austrian Science Fund (FWF)
under Elise-Richter grant V170-N18.

We would like to thank Achim Zeileis and Uwe Ligges for helping
generating the corpus of ``use\proglang{R}! 2008'' abstracts.
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\bibliography{vMF}
\end{document}
