EM算法公式推导

0. 前言

以下针对《统计学习方法(第二版)》中的内容

1. 背景介绍

2. EM公式推导

记可观测变量为 $Y$,隐变量为 $Z$,参数为 $\theta$。根据对 $Y$ 的 $n$ 次观测,估计 $\theta$

  1. 最大似然MLE:

    θ^=argmaxθ j=1nP(yjθ)=argmaxθ j=1nlogP(yjθ)(2.1)\begin{aligned} \hat{\theta} & = \mathop{\arg\max}\limits_{\theta} \ \prod \limits_{j=1}^{n} P(y_j |\theta) \\ & = \mathop{\arg\max}\limits_{\theta} \ \sum \limits_{j=1}^{n} \log P(y_j |\theta) \\ \end{aligned} \tag{2.1}

  2. 记第j个样本对应的似然函数 Lj(θ)=logP(yjθ)L_j(\theta) = \log P(y_j |\theta)

    Lj(θ)Lj(θ(i))=logP(yjθ)logP(yjθ(i))=logzjP(yj,zjθ)logP(yjθ(i))=log(zjP(zjyj,θ(i))P(yj,zjθ)P(zjyj,θ(i)))logP(yjθ(i))(2.2)\begin{aligned} L_j(\theta) - L_j(\theta^{(i)}) & = \log P(y_j |\theta) - \log P(y_j |\theta^{(i)}) \\ & = \log \sum \limits_{z_j} P(y_j, z_j |\theta) - \log P(y_j |\theta^{(i)}) \\ & = \log \Big( \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \frac{ P(y_j, z_j |\theta)}{P(z_j | y_j, \theta^{(i)})} \Big) - \log P(y_j |\theta^{(i)}) \\ \end{aligned} \tag{2.2}

  3. 应用Jensen不等式,见书《统计学习方法》p179

    logiλi yiiλi log(yi)\log \sum \limits_i \lambda_i \ y_i \geqslant \sum \limits_i \lambda_i \ \log(y_i)

    其中 $\sum \limits_i \lambda_i = 1$,$\lambda_i \geqslant 0$。代入(2.2)得到

\begin{aligned} L_j(\theta) - L_j(\theta^{(i)}) & = \log \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \frac{ P(y_j, z_j |\theta)}{P(z_j | y_j, \theta^{(i)})} - \log P(y_j |\theta^{(i)}) \\ & \geqslant \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(z_j | y_j, \theta^{(i)})} - \log P(y_j |\theta^{(i)}) \\ & = \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(z_j | y_j, \theta^{(i)})} - \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j |\theta^{(i)}) \\ & = \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(z_j | y_j, \theta^{(i)}) \cdot P(y_j |\theta^{(i)})} \\ & = \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(y_j, z_j |\theta^{(i)})} \end{aligned} \tag{2.3}

  1. 记 $L_j(\theta)$ 的下界为 $B_j(\theta , \theta^{(i)})$

    Bj(θ,θ(i))=Lj(θ(i))+zjP(zjyj,θ(i))logP(yj,zjθ)P(yj,zjθ(i))(2.4)\begin{aligned} B_j(\theta , \theta^{(i)}) & = L_j(\theta^{(i)}) + \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(y_j, z_j |\theta^{(i)})} \\ \end{aligned} \tag{2.4}

    通过提升下界 $B_j(\theta , \theta^{(i)})$ 的方式,提升 $L_j(\theta)$,即

    \begin{aligned} L_j(\theta) \geqslant B_j(\theta , \theta^{(i)}), \qquad L_j(\theta^{(i)}) = B_j(\theta^{(i)} , \theta^{(i)}) \end{aligned}

    于是,

    \begin{aligned} \theta^{(i+1)} & = \mathop{\arg\max}\limits_{\theta} B_j(\theta , \theta^{(i)}) \\ & = \mathop{\arg\max}\limits_{\theta} \bigg( L_j(\theta^{(i)}) + \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log \frac{ P(y_j, z_j |\theta)}{P(y_j, z_j |\theta^{(i)})} \bigg) \\ & = \mathop{\arg\max}\limits_{\theta} \bigg( L_j(\theta^{(i)}) + \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j |\theta) - \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j |\theta^{(i)}) \bigg) \end{aligned}

    其中,当 $\theta^{(i)}$ 给定时,$L_j(\theta^{(i)})$、$P(z_j | y_j, \theta^{(i)})$ 与 $P(y_j, z_j |\theta^{(i)})$ 均为定值

    所以,

    \begin{aligned} \theta^{(i+1)} & = \mathop{\arg\max}\limits_{\theta} \sum \limits_{j=1}^{n} \bigg( \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j |\theta) \bigg) \\ & = \mathop{\arg\max}\limits_{\theta} E_{Z|Y,\ \theta^{(i)}} \big[ \log P(Y, Z |\theta) \big] \end{aligned} \tag{2.5}

  2. 最终简化得到的函数为Q(θ,θ(i))Q(\theta , \theta^{(i)})

    Q(θ,θ(i))=j=1n(zjP(zjyj,θ(i))logP(yj,zjθ))=EZY, θ(i)[logP(Y,Zθ)](2.6)\begin{aligned} Q(\theta , \theta^{(i)}) & = \sum \limits_{j=1}^{n} \bigg( \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j |\theta) \bigg) \\ & = E_{Z|Y,\ \theta^{(i)}} \big[ \log P(Y, Z |\theta) \big] \end{aligned} \tag{2.6}

3. 三硬币问题分析

  1. $P(z_j = B) = \pi , \ P(y_j = 1 | z_j = B) = p, \ P(y_j = 1 | z_j = C) = q$

    Q(θ,θ(i))=j=1n(zjP(zjyj,θ(i))logP(yj,zjθ))=j=1n(P(zj=Byj,θ(i))logP(yj,zj=Bθ)+P(zj=Cyj,θ(i))logP(yj,zj=Cθ))(3.1)\begin{aligned} Q(\theta , \theta^{(i)}) & = \sum \limits_{j=1}^{n} \bigg( \sum \limits_{z_j} P(z_j | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j |\theta) \bigg) \\ & = \sum \limits_{j=1}^{n} \bigg( P(z_j = B | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j = B |\theta) + P(z_j = C | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j = C |\theta) \bigg) \\ \end{aligned} \tag{3.1}

  2. μj(i+1)=P(zj=Byj,θ(i))\mu_j^{(i+1)} = P(z_j = B | y_j, \theta^{(i)}), (表示根据第i次得到的参数值,在第i+1次迭代中,在yjy_j条件下确定zjz_j的概率),则1μj(i+1)=P(zj=Cyj,θ(i))1- \mu_j^{(i+1)} = P(z_j = C | y_j, \theta^{(i)}).

    μj(i+1)=P(zj=Byj,θ(i))=P(yjzj=B,θ(i))P(zj=Bθ(i))P(yjzj=B,θ(i))P(zj=Bθ(i))+P(yjzj=C,θ(i))P(zj=Cθ(i))(3.2)\begin{aligned} \mu_j^{(i+1)} & = P(z_j = B | y_j, \theta^{(i)}) \\ & = \frac{P(y_j | z_j = B, \theta^{(i)}) \cdot P(z_j = B | \theta^{(i)})}{P(y_j | z_j = B, \theta^{(i)}) \cdot P(z_j = B | \theta^{(i)}) + P(y_j | z_j = C, \theta^{(i)}) \cdot P(z_j = C | \theta^{(i)})} \end{aligned} \tag{3.2}

    注意,因为 θ(i)\theta^{(i)}给定,所以μj(i+1)\mu_j^{(i+1)}也是确定的

  3. $P(y_j | z_j = B, \theta^{(i)})$ 的计算($y_j = 1$ 表示硬币正面朝上)

    P(yjzj=B,θ(i))={p(i),yj=11p(i),yj=0=(p(i))yj(1p(i))1yj\begin{aligned} P(y_j | z_j = B, \theta^{(i)}) & =\left\{ \begin{array}{rcl} p^{(i)} & , & {y_j = 1}\\ 1-p^{(i)} & , & {y_j = 0}\\ \end{array} \right. \\ & = (p^{(i)})^{y_j}(1-p^{(i)})^{1-y_j} \end{aligned}

    同理,

    P(yjzj=C,θ(i))=(q(i))yj(1q(i))1yjP(y_j | z_j = C, \theta^{(i)}) = (q^{(i)})^{y_j}(1-q^{(i)})^{1-y_j}

    代入(3.2)得到,

    μj(i+1)=P(yjzj=B,θ(i))P(zj=Bθ(i))P(yjzj=B,θ(i))P(zj=Bθ(i))+P(yjzj=C,θ(i))P(zj=Cθ(i))=π(i)(p(i))yj(1p(i))1yjπ(i)(p(i))yj(1p(i))1yj+(1π(i))(q(i))yj(1q(i))1yj\begin{aligned} \mu_j^{(i+1)} & = \frac{P(y_j | z_j = B, \theta^{(i)}) \cdot P(z_j = B | \theta^{(i)})}{P(y_j | z_j = B, \theta^{(i)}) \cdot P(z_j = B | \theta^{(i)}) + P(y_j | z_j = C, \theta^{(i)}) \cdot P(z_j = C | \theta^{(i)})} \\ & = \frac{\pi^{(i)} \cdot (p^{(i)})^{y_j}(1-p^{(i)})^{1-y_j}}{\pi^{(i)} \cdot (p^{(i)})^{y_j}(1-p^{(i)})^{1-y_j} + (1 - \pi^{(i)}) \cdot (q^{(i)})^{y_j}(1-q^{(i)})^{1-y_j}} \end{aligned}

  4. 计算 P(yj,zj=Bθ)P(y_j, z_j = B |\theta)

    \begin{aligned} P(y_j, z_j = B |\theta) & = P(y_j | z_j = B , \theta) \cdot P(z_j = B | \theta) \\ & = p^{y_j} \cdot (1-p)^{1-y_j} \cdot \pi \end{aligned}

    同理,

    \begin{aligned} P(y_j, z_j = C |\theta) & = P(y_j | z_j = C , \theta) \cdot P(z_j = C | \theta) \\ & = q^{y_j} \cdot (1-q)^{1-y_j} \cdot (1-\pi) \end{aligned}

  5. 将第2,3,4步结果代入(3.1)

    Q(θ,θ(i))=j=1n(P(zj=Byj,θ(i))logP(yj,zj=Bθ)+P(zj=Cyj,θ(i))logP(yj,zj=Cθ))=j=1n(μj(i+1)log(pyj(1p)1yjπ)+(1μj(i+1))log(qyj(1q)1yj(1π)))\begin{aligned} Q(\theta , \theta^{(i)}) & = \sum \limits_{j=1}^{n} \bigg( P(z_j = B | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j = B |\theta) + P(z_j = C | y_j, \theta^{(i)}) \cdot \log P(y_j, z_j = C |\theta) \bigg) \\ & = \sum \limits_{j=1}^{n} \bigg( \mu_j^{(i+1)} \cdot \log \big( p^{y_j} \cdot (1-p)^{1-y_j} \cdot \pi \big) + (1 - \mu_j^{(i+1)}) \cdot \log \big( q^{y_j} \cdot (1-q)^{1-y_j} \cdot (1-\pi) \big) \bigg) \\ \end{aligned}

  6. 求导

    1. Qπ=0\frac{\partial Q}{\partial \pi} = 0

      Qπ=j=1n(μj(i+1)π+(1)1μj(i+1)1π)=0\begin{aligned} \frac{\partial Q}{\partial \pi} = \sum \limits_{j=1}^{n} \bigg( \frac{\mu_j^{(i+1)}}{\pi} + (-1) \cdot \frac{1 - \mu_j^{(i+1)}}{1 - \pi} \bigg) = 0 \\ \end{aligned}

      可得,

      π(i+1)=j=1nμj(i+1)n\begin{aligned} \pi^{(i+1)} = \frac{\sum \limits_{j=1}^{n} \mu_j^{(i+1)}}{n} \\ \end{aligned}

    2. Qp=0\frac{\partial Q}{\partial p} = 0

      Qp=j=1n(μj(i+1)log(pyj(1p)1yjπ))p=j=1n(μj(i+1)(yjlogp+(1yj)log(1p)+logπ))p=j=1n(μj(i+1)(yjp1yj1p))=0\begin{aligned} \frac{\partial Q}{\partial p} & = \frac{\partial \sum \limits_{j=1}^{n} \bigg( \mu_j^{(i+1)} \cdot \log \big( p^{y_j} \cdot (1-p)^{1-y_j} \cdot \pi \big) \bigg)}{\partial p} \\ & = \frac{\partial \sum \limits_{j=1}^{n} \bigg( \mu_j^{(i+1)} \cdot \big( y_j \cdot \log p + (1 - y_j) \cdot \log (1 - p) + \log\pi \big) \bigg)}{\partial p} \\ & = \sum \limits_{j=1}^{n} \bigg( \mu_j^{(i+1)} \cdot \big( \frac{y_j}{p} - \frac{1 - y_j}{1 - p}\big) \bigg) = 0 \\ \end{aligned}

      可得,

      p(i+1)=j=1nμj(i+1)yjj=1nμj(i+1)\begin{aligned} p^{(i+1)} = \frac{\sum \limits_{j=1}^{n} \mu_j^{(i+1)} \cdot y_j}{\sum \limits_{j=1}^{n} \mu_j^{(i+1)}} \\ \end{aligned}

    3. Qq=0\frac{\partial Q}{\partial q} = 0

      同理,

      \begin{aligned} \frac{\partial Q}{\partial q} & = \frac{\partial \sum \limits_{j=1}^{n} \bigg( (1 - \mu_j^{(i+1)}) \cdot \big( y_j \cdot \log q + (1 - y_j) \cdot \log (1 - q) + \log (1 - \pi) \big) \bigg)}{\partial q} \\ & = \sum \limits_{j=1}^{n} \bigg( (1 - \mu_j^{(i+1)}) \cdot \big( \frac{y_j}{q} - \frac{1 - y_j}{1 - q}\big) \bigg) = 0 \end{aligned}

      可得,

      q(i+1)=j=1n(1μj(i+1))yjj=1n(1μj(i+1))\begin{aligned} q^{(i+1)} = \frac{\sum \limits_{j=1}^{n} (1 - \mu_j^{(i+1)}) \cdot y_j}{\sum \limits_{j=1}^{n} (1 - \mu_j^{(i+1)})} \\ \end{aligned}

4. GMM高斯混合模型应用EM估计参数

  1. 模型定义

    P(yjθ)=k=1Kakϕ(yjθk) \begin{aligned} P(y_j | \theta) = \sum \limits_{k=1}^{K} a_k \cdot \phi(y_j | \theta_k) \end{aligned}

    $\phi(y_j | \theta_k)$ 表示由第 $k$ 个高斯分布生成 $y_j$ 的概率。为方便,以下简记为 $\phi_{jk}$

    \begin{aligned} \phi(y_j | \theta_k) = \frac{1}{\sqrt{2\pi\sigma_k^2}} e^{-\frac{(y_j - \mu_k)^2}{2\sigma_k^2}} \end{aligned}

  2. Q函数

    Q(θ,θ(i))=j=1Nk=1KP(rj=rjkyj,θ(i))logP(rj=rjk,yjθ)(4.1) \begin{aligned} Q(\theta, \theta^{(i)}) = \sum \limits_{j=1}^{N} \sum \limits_{k=1}^{K} P(r_j = r_{jk} | y_j, \theta^{(i)}) \cdot \log P(r_j = r_{jk} , y_j | \theta) \end{aligned} \tag{4.1}

  3. 计算 $ P(r_j = r_{jk} | y_j, \theta^{(i)}) $, 记为 $ \hat r_{jk} $

    r^jk=P(rj=rjkyj,θ(i))=P(yjrj=rjk,θ(i))P(rj=rjkθ(i))k=1KP(yjrj=rjk,θ(i))P(rj=rjkθ(i))=ϕjk(i)ak(i)k=1Kϕjk(i)ak(i)(4.2) \begin{aligned} \hat r_{jk} & = P(r_j = r_{jk} | y_j, \theta^{(i)}) \\ & = \frac{ P(y_j | r_j = r_{jk}, \theta^{(i)}) \cdot P(r_j = r_{jk} | \theta^{(i)})}{\sum \limits_{k'=1}^{K} P(y_j | r_j = r_{jk'}, \theta^{(i)}) \cdot P(r_j = r_{jk'} | \theta^{(i)})} \\ & = \frac{\phi_{jk}^{(i)} \cdot a_{k}^{(i)}}{\sum \limits_{k'=1}^{K} \phi_{jk'}^{(i)} \cdot a_{k'}^{(i)}} \end{aligned} \tag{4.2}

    分析 $\hat r_{jk}$:$n_k$ 表示 N 个样本中由第 $k$ 个分量生成的样本个数,后文用 $\hat n_k = \sum \limits_{j=1}^{N} \hat r_{jk}$ 表示其期望

    nk=j=1Nrjk(4.3) \begin{aligned} n_k = \sum \limits_{j=1}^{N} r_{jk} \end{aligned} \tag{4.3}

    N=k=1Knk(4.4) \begin{aligned} N = \sum \limits_{k=1}^{K} n_k \end{aligned} \tag{4.4}

  4. 计算P(rj=rjk,yjθ)P(r_j = r_{jk} , y_j | \theta)

    P(rj=rjk,yjθ)=P(yjrj=rjk,θ)P(rj=rjkθ)=ϕjkak(4.5) \begin{aligned} P(r_j = r_{jk} , y_j | \theta) & = P(y_j | r_j = r_{jk} , \theta) \cdot P(r_j = r_{jk} | \theta) \\ & = \phi_{jk} \cdot a_k \\ \end{aligned} \tag{4.5}

  5. 将第3,4步代入(4.1)

    Q(θ,θ(i))=j=1Nk=1KP(rj=rjkyj,θ(i))logP(rj=rjk,yjθ)=j=1Nk=1Kr^jklog(ϕjkak)=k=1Kj=1N(r^jklogak+r^jklogϕjk)=k=1K(j=1Nr^jklogak+j=1Nr^jklogϕjk)=k=1K(n^klogak+j=1Nr^jklogϕjk)=k=1K(n^klogak+j=1Nr^jk(12log2π12logσk2(yjμk)22σk2))(4.6) \begin{aligned} Q(\theta, \theta^{(i)}) & = \sum \limits_{j=1}^{N} \sum \limits_{k=1}^{K} P(r_j = r_{jk} | y_j, \theta^{(i)}) \cdot \log P(r_j = r_{jk} , y_j | \theta) \\ & = \sum \limits_{j=1}^{N} \sum \limits_{k=1}^{K} \hat r_{jk} \cdot \log (\phi_{jk} \cdot a_k) \\ & = \sum \limits_{k=1}^{K} \sum \limits_{j=1}^{N} (\hat r_{jk} \cdot \log a_k + \hat r_{jk} \cdot \log \phi_{jk}) \\ & = \sum \limits_{k=1}^{K} \big( \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \log a_k + \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \log \phi_{jk} \big) \\ & = \sum \limits_{k=1}^{K} \big( \hat n_k \cdot \log a_k + \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \log \phi_{jk} \big) \\ & = \sum \limits_{k=1}^{K} \Big( \hat n_k \cdot \log a_k + \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \big( -\frac{1}{2} \log 2\pi - \frac{1}{2} \log \sigma_k^2 - \frac{(y_j - \mu_k)^2}{2\sigma_k^2} \big) \Big) \\ \end{aligned} \tag{4.6}

    考虑约束条件 k=1Kak=1\sum \limits_{k=1}^{K} a_k = 1,使用拉格朗日算子

    Q(θ,θ(i))=k=1K(n^klogak+j=1Nr^jk(12log2π12logσk2(yjμk)22σk2))+λ(k=1Kak1)(4.7)\begin{aligned} Q'(\theta, \theta^{(i)}) & = \sum \limits_{k=1}^{K} \Big( \hat n_k \cdot \log a_k + \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \big( -\frac{1}{2} \log 2\pi - \frac{1}{2} \log \sigma_k^2 - \frac{(y_j - \mu_k)^2}{2\sigma_k^2} \big) \Big) + \lambda (\sum \limits_{k=1}^{K} a_k - 1) \\ \end{aligned} \tag{4.7}

  6. 求导

    1. Qak=0\frac{\partial Q'}{\partial a_k} = 0

      \begin{aligned} \frac{\partial Q'}{\partial a_k} & = \frac{\hat n_k}{a_k} + \lambda = 0 \end{aligned}

      一个可行的解

      n^kak+λ=0n^k=λak(4.8)\begin{aligned} \frac{\hat n_k}{a_k} + \lambda & = 0 \\ \hat n_k & = - \lambda a_k \\ \end{aligned} \tag{4.8}

      对(4.8) 两边求和

      N=k=1Kn^k=λk=1Kak=λ\begin{aligned} N = \sum \limits_{k=1}^{K} \hat n_k & = - \lambda \sum \limits_{k=1}^{K} a_k = -\lambda\\ \end{aligned}

      因此 $\hat \lambda = - N$,代入(4.8)可得到

      a^k=n^kN\hat a_k = \frac{\hat n_k}{N}

    2. Qμk=0\frac{\partial Q'}{\partial \mu_k} = 0

      \begin{aligned} \frac{\partial Q'}{\partial \mu_k} & = \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \frac{(y_j - \mu_k)}{\sigma_k^2} \\ & = \frac{1}{\sigma_k^2} \sum \limits_{j=1}^{N} ( \hat r_{jk} y_j - \hat r_{jk} \mu_k) \\ & = \frac{1}{\sigma_k^2} \Big( \sum \limits_{j=1}^{N} \hat r_{jk} y_j - \mu_k \hat n_{k} \Big) = 0 \end{aligned}

      一个可行的解,

      μ^k=j=1Nr^jkyjn^k \hat \mu_k = \frac{\sum \limits_{j=1}^{N} \hat r_{jk} y_j}{\hat n_{k}}

    3. Qσk2=0\frac{\partial Q'}{\partial \sigma_k^2} = 0

      \begin{aligned} \frac{\partial Q'}{\partial \sigma_k^2} & = \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \Big( -\frac{1}{2\sigma_k^2} + \frac{(y_j - \mu_k)^2}{2\sigma_k^4} \Big) = 0 \end{aligned}

      t=σk2t = \sigma_k^2, 则

      \begin{aligned} \frac{\partial Q'}{\partial t} & = \sum \limits_{j=1}^{N} \hat r_{jk} \cdot \Big( -\frac{1}{2t} + \frac{(y_j - \mu_k)^2}{2t^2} \Big) \\ & = \frac{\sum \limits_{j=1}^{N} \hat r_{jk}(y_j - \mu_k)^2 - \sum \limits_{j=1}^{N} \hat r_{jk} \, t}{2t^2} = 0 \end{aligned}

      一个可行的解(此处代入前面求得的 $\hat\mu_k$)

      t^=σ^k2=j=1Nr^jk(yjμ^k)2j=1Nr^jk=j=1Nr^jk(yjμ^k)2n^k \hat t = \hat \sigma_k^2 = \frac{\sum \limits_{j=1}^{N} \hat r_{jk}(y_j - \hat\mu_k)^2}{\sum \limits_{j=1}^{N} \hat r_{jk}} = \frac{\sum \limits_{j=1}^{N} \hat r_{jk}(y_j - \hat\mu_k)^2}{\hat n_{k}}

参考

  1. https://www.bilibili.com/video/BV1me4y1j7Uz?p=1