library(tidyverse)
library(mixtools)
knitr::opts_chunk$set(cache = TRUE)
buffalo <- scan("buffsnow.txt")
It is very straightforward to see that:
$$\theta_k = \theta_1 + (\theta_2 - \theta_1) + (\theta_3 - \theta_2) + \cdots + (\theta_k - \theta_{k-1})$$
Since each of the brackets is just $\phi_j$:
$$\theta_k = \theta_1 + \sum_{i=2}^{k}\phi_i$$
We compute the partial derivative of $g(\theta)$ with respect to $\theta_1$:
$$\frac{\partial g(\cdot)}{\partial \theta_1} = -2\sum_{i=1}^{n}\left(y_i - \theta_1 - \sum_{j=2}^{i}\phi_j\right)$$
Setting this to zero, $\frac{\partial g(\cdot)}{\partial \theta_1} = 0$, we obtain:
$$\hat{\theta}_1 = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \sum_{j=2}^{i}\phi_j\right)$$
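This update is easy to express in R. Below is a minimal sketch, assuming $\phi$ is stored as a length-$n$ vector with phi[1] = 0 so that cumsum(phi)[i] equals $\sum_{j=2}^{i}\phi_j$; the function name update_theta1 is my own.

update_theta1 <- function(y, phi) {
  # theta_1 is the mean of the partial residuals y_i - sum_{j=2}^{i} phi_j
  mean(y - cumsum(phi))
}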
Taking the partial derivative with respect to $\phi_j$:
$$\frac{\partial g(\cdot)}{\partial \phi_j} = \frac{\partial}{\partial \phi_j}\left(\sum_{i=1}^{n}\left(y_i - \theta_1 - \sum_{j=2}^{i}\phi_j\right)^2 + \lambda\sum_{j=2}^{n}|\phi_j|\right) = -2\sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2}^{i}\phi_k\right) + \lambda\frac{\partial|\phi_j|}{\partial\phi_j}$$
where $\frac{\partial|\phi_j|}{\partial\phi_j}$ will be replaced by a subgradient of the absolute value.
$$\lambda\frac{\partial|\phi_j|}{\partial\phi_j} = \begin{cases} \lambda & \phi_j > 0 \\ [-\lambda, \lambda] & \phi_j = 0 \\ -\lambda & \phi_j < 0 \end{cases}$$
Since the subgradient can take any value in $[-\lambda, \lambda]$ when $\phi_j = 0$, the overall derivative can equal zero at $\phi_j = 0$ only if the remaining term also lies in this set. Applying that constraint we see:
$$-2\sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2}^{i}\phi_k\right) \in [-\lambda, \lambda] \iff \left|\sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2}^{i}\phi_k\right)\right| \le \frac{\lambda}{2}$$
Now, if $\phi_j \neq 0$ we can see that:
$$\phi_j = \frac{1}{n-j+1}\left(\sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2,\,k\neq j}^{i}\phi_k\right) - \frac{\lambda}{2}\right) \quad \text{for} \quad \sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2,\,k\neq j}^{i}\phi_k\right) > \frac{\lambda}{2}$$
and the opposite case:
$$\phi_j = \frac{1}{n-j+1}\left(\sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2,\,k\neq j}^{i}\phi_k\right) + \frac{\lambda}{2}\right) \quad \text{for} \quad \sum_{i=j}^{n}\left(y_i - \theta_1 - \sum_{k=2,\,k\neq j}^{i}\phi_k\right) < -\frac{\lambda}{2}$$
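Putting the three cases together gives a soft-thresholding update for each $\phi_j$. Here is a minimal sketch in R, under the same convention as above (phi is a length-$n$ vector with phi[1] = 0); the function name update_phi_j is my own.

update_phi_j <- function(y, theta1, phi, lambda, j) {
  n <- length(y)
  # Partial fit excluding phi_j: theta_1 plus the cumulative sum of the other phi's
  phi_excl <- phi
  phi_excl[j] <- 0
  fit_excl <- theta1 + cumsum(phi_excl)
  # S = sum_{i=j}^{n} (y_i - theta_1 - sum_{k=2, k != j}^{i} phi_k)
  S <- sum(y[j:n] - fit_excl[j:n])
  # Soft-threshold S at lambda / 2
  if (S > lambda / 2) {
    (S - lambda / 2) / (n - j + 1)
  } else if (S < -lambda / 2) {
    (S + lambda / 2) / (n - j + 1)
  } else {
    0
  }
}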
Let θ represent the vector of parameters.
First we want to find the log-likelihood (the complete-data log-likelihood, with $u_{ik}$ the component membership indicators):
$$\log(L(\theta)) = \sum_{i=1}^{n}\sum_{k=1}^{m} u_{ik}\log f_k(x_i) + \sum_{i=1}^{n}\sum_{k=1}^{m} u_{ik}\log(\lambda_k)$$
Only the right-hand sum depends on $\lambda$. Maximizing it under the constraint that $\sum_{k=1}^{m}\lambda_k = 1$, it is easy to see that:
$$\hat{\lambda}_k = \frac{1}{n}\sum_{i=1}^{n} u_{ik}$$
Similarly, plugging the Normal density in for $f_k(\cdot)$ and solving for $\mu_k$ and $\sigma_k^2$, we obtain the other two results:
$$\frac{\partial l(\theta)}{\partial \mu_k} = \frac{1}{\sigma_k^2}\sum_{i=1}^{n} u_{ik}(x_i - \mu_k)$$
$$\frac{\partial l(\theta)}{\partial \sigma_k^2} = -\frac{1}{2\sigma_k^2}\sum_{i=1}^{n} u_{ik} + \frac{1}{2(\sigma_k^2)^2}\sum_{i=1}^{n} u_{ik}(x_i - \mu_k)^2$$
Setting both to 0 and solving produces:
$$\hat{\mu}_k = \frac{\sum_{i=1}^{n} u_{ik}\,x_i}{\sum_{i=1}^{n} u_{ik}}$$
$$\hat{\sigma}_k^2 = \frac{\sum_{i=1}^{n} u_{ik}\,(x_i - \hat{\mu}_k)^2}{\sum_{i=1}^{n} u_{ik}}$$
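Before the full EM implementation, here is a minimal sketch of just these M-step formulas in R; the name m_step and the assumption that u is the $n \times m$ matrix of responsibilities $u_{ik}$ are mine.

m_step <- function(x, u) {
  lambda_hat <- colMeans(u)                          # lambda_k = (1/n) sum_i u_ik
  mu_hat     <- colSums(u * x) / colSums(u)          # weighted means
  sigma2_hat <- sapply(seq_len(ncol(u)), function(k) {
    sum(u[, k] * (x - mu_hat[k])^2) / sum(u[, k])    # weighted variances
  })
  list(lambda = lambda_hat, mu = mu_hat, sigma2 = sigma2_hat)
}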
normalmixture <- function(x, k, mu, sigma, lambda, em.iter = 50, eps = 10^-5, debug = FALSE) {
  n <- length(x)
  # Add a tiny perturbation to x to break ties
  x <- x + rnorm(length(x), mean = 0, sd = eps)
  x <- sort(x)
  vars <- sigma^2
  means <- mu
  lam <- lambda / sum(lambda)
  delta <- matrix(rep(0, n * k), ncol = k)
  loglik <- NULL
  converged <- FALSE
  iter <- 0
  while (!converged) {
    loglik_new <- 0
    # E-step: compute the responsibilities delta[i, j]
    for (i in 1:n) {
      xi <- x[i]
      # Mixture density at xi (denominator of the responsibilities)
      denom <- 0
      for (u in 1:k) {
        denom <- denom + lam[u] * dnorm(xi, means[u], sqrt(vars[u]))
      }
      for (j in 1:k) {
        delta[i, j] <- lam[j] * dnorm(xi, means[j], sqrt(vars[j])) / denom
      }
      loglik_new <- loglik_new + log(denom)
    }
    # M-step: update the mixing proportions, means, and variances
    for (j in 1:k) {
      deltaj <- as.vector(delta[, j])
      lambda[j] <- mean(deltaj)
      means[j] <- weighted.mean(x = x, w = deltaj)
      vars[j] <- weighted.mean((x - means[j])^2, w = deltaj)
    }
    # Standardize lambdas
    lambda <- lambda / sum(lambda)
    lam <- lambda
    # Iterate again
    iter <- iter + 1
    # Print iterations for debugging
    if (debug) {cat("Iteration: ", iter, "\n", "Log Likelihood: ", loglik_new, "\n")}
    if (is.null(loglik)) {
      # Store the first log likelihood
      if (debug) {print("loglik empty reached")}
      loglik <- loglik_new
    } else if (abs(tail(loglik, n = 1) - loglik_new) < eps) {
      # Check for convergence
      if (debug) {print("convergence reached")}
      converged <- TRUE
    }
    # Check for the maximum number of iterations
    if (iter == em.iter) {
      print("Number of iterations reached")
      converged <- TRUE
    }
    # If the likelihood is infinite then return the params early
    if (is.infinite(loglik_new)) {
      cat("Infinite Likelihood", "\n")
      r <- list(mu = means,
                var = vars,
                delta = delta,
                lambda = lambda,
                loglik = loglik,
                iterations = iter)
      return(r)
    }
    # Append the new log likelihood
    loglik <- c(loglik, loglik_new)
  }
  # Return the fitted parameters, responsibilities, and log-likelihood trace
  r <- list(mu = means,
            var = vars,
            delta = delta,
            lambda = lambda,
            loglik = loglik,
            iterations = iter)
  return(r)
}
df <- tibble(x = buffalo)
df %>% ggplot(aes(x = x, y = ..density..)) +
  geom_density(bw = 5, fill = "red", alpha = 0.3) +
  labs(title = "Density plot")
# From density plot:
breaks2 <- c(95)
breaks3 <- c(65, 105)
# Split into normals:
df <- df %>% mutate(bin2 = if_else(x < breaks2[1], 1, 2),
bin3 = if_else(x < breaks3[1], 1,
if_else(x < breaks3[2], 2, 3)))
# Calculate the means and variances of each component normals:
norms2 <- df %>% group_by(bin2) %>% summarize(mu = mean(x),
sd = sqrt(var(x)),
count = n())
norms3 <- df %>% group_by(bin3) %>% summarize(mu = mean(x),
sd = sqrt(var(x)),
count = n())
# run this on m = 2
mu_start <- as.vector(norms2$mu)
var_start <- as.vector(norms2$sd)
lambda_start <- as.vector(norms2$count) / length(df$x)
density2 <- normalmixture(df$x, 2, mu_start, var_start, lambda_start, em.iter = 1000, debug = FALSE)
# run this on m = 3
mu_start <- as.vector(norms3$mu)
var_start <- as.vector(norms3$sd)
lambda_start <- as.vector(norms3$count) / length(df$x)
density3 <- normalmixture(df$x, 3, mu_start, var_start, lambda_start, em.iter = 1000, debug = FALSE)
# Create comparison plot against standard density estimate
density1 <- density(df$x)
## new data:
df2 <- tibble( x = seq(from = min(df$x), to = max(df$x), length.out = 10000))
df2$density2 <- density2$lambda[1] * dnorm(df2$x, mean = density2$mu[1], sd = sqrt(density2$var[1])) +
density2$lambda[2] * dnorm(df2$x, mean = density2$mu[2], sd = sqrt(density2$var[2]))
df2$density3 <- density3$lambda[1] * dnorm(df2$x, mean = density3$mu[1], sd = sqrt(density3$var[1])) +
density3$lambda[2] * dnorm(df2$x, mean = density3$mu[2], sd = sqrt(density3$var[2])) +
density3$lambda[3] * dnorm(df2$x, mean = density3$mu[3], sd = sqrt(density3$var[3]))
# Make the plot:
ggplot(data = df2, aes(x = x)) +
  geom_point(aes(y = density2), color = "red") +
  geom_point(aes(y = density3), color = "blue") +
  geom_density(data = df, mapping = aes(x = x), fill = "green", alpha = 0.5) +
  labs(title = "Density Estimates at m = 2 (red) and m = 3 (blue)")
This tiny spike in the three-component fit looks weird, so I will do a sanity check with the mixtools package to see if my results seem reasonable.
mu_start <- as.vector(norms2$mu)
var_start <- as.vector(norms2$sd)
lambda_start <- as.vector(norms2$count) / length(df$x)
density4 <- mixtools::normalmixEM(df$x, k = 2,
mu = mu_start,
sigma = var_start,
lambda = lambda_start)
## number of iterations= 431
mu_start <- as.vector(norms3$mu)
var_start <- as.vector(norms3$sd)
lambda_start <- as.vector(norms3$count) / length(df$x)
density5 <- mixtools::normalmixEM(df$x, k = 3,
mu = mu_start,
sigma = var_start,
lambda = lambda_start)
## number of iterations= 609
print("Mixture of two parameter comparison (mine vs mixtools)")
## [1] "Mixture of two parameter comparison (mine vs mixtools)"
cat("Mean \n",
density2$mu, "\n",
density4$mu, "\n")
## Mean
## 82.33512 150.314
## 82.41374 153.7458
cat("Variance \n",
density2$var, "\n",
density4$sigma^2, "\n")
## Variance
## 581.1774 1173.335
## 583.5366 1079.92
cat("Lambda \n",
density2$lambda, "\n",
density4$lambda, "\n")
## Lambda
## 0.9681888 0.03181124
## 0.970787 0.02921304
print("Mixture of three parameter comparison (mine vs mixtools)")
## [1] "Mixture of three parameter comparison (mine vs mixtools)"
cat("Mean \n",
density3$mu, "\n",
density5$mu, "\n")
## Mean
## 77.98207 112.3513 120.9678
## 78.01955 112.3522 121.6513
cat("Variance \n",
density3$var, "\n",
density5$sigma^2, "\n")
## Variance
## 471.8441 3.386749 1403.283
## 473.245 3.383962 1396.149
cat("Lambda \n",
density3$lambda, "\n",
density5$lambda, "\n")
## Lambda
## 0.8340415 0.07175839 0.09420006
## 0.836254 0.0716751 0.09207088
Looks good to me.
Comparison by AIC:
$$\mathrm{AIC} = 2k - 2\log(\hat{L})$$
For $m=2$ we have 2 means, 2 variances, and 1 free mixing parameter (since the other is just 1 minus the first), for a total of $k=5$ parameters.
For $m=3$ we similarly get $k=8$.
Then AIC for both models is:
$$\mathrm{AIC}_2 = 10 + 2 \cdot 628.5453 = 1267.091$$
$$\mathrm{AIC}_3 = 16 + 2 \cdot 624.3508 = 1264.702$$
Since the AIC values are nearly identical, the two models are very comparable. However, given the shape of the plots I would prefer the two-component mixture, as it produces a much smoother curve; it also fits significantly faster.
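For reference, a minimal sketch of how these AIC values can be computed from the fitted objects above, taking the last element of loglik as the maximized log-likelihood and using k = 5 and k = 8 as counted above; the helper name aic is my own.

aic <- function(fit, k) {
  # AIC = 2k - 2 * maximized log-likelihood
  2 * k - 2 * tail(fit$loglik, n = 1)
}
aic(density2, k = 5)
aic(density3, k = 8)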