import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
plt.rcParams['font.family'] = 'IPAPGothic'  # 全体のフォントを設定


df = pd.read_csv("data/data4mixed_model.csv")
sns.pairplot(x_vars=['mobile_addr'], y_vars=['pedestrian'], data=df, hue='sensor_name', height=6)

<seaborn.axisgrid.PairGrid at 0x7f3e5a54c910>


# ライブラリのインポート
import itertools
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import numpy as np


# 計画行列(design matrix)の作成
# 特徴ベクトル（一つのデータ)を１行とし、各データを縦に並べた行列
from functools import reduce
class PolynomialFeatures(object):

    def __init__(self, degree=2):
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        if x.ndim == 1:
            x = x[:, None]
        x_t = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(x_t, degree):
                features.append(reduce(lambda x, y: x * y, items))
        return np.asarray(features).transpose()


import copy
# 条件付き混合線形回帰の本体
class ConditionalMixtureModel(object):

    def __init__(self, n_models=2):
        # モデルの個数を指定
        self.n_models = n_models

    # 最尤推定を行うメソッド
    def fit(self, X, t, n_iter=100):
        # 推定したいパラメータ（重み係数、混合係数、分散）の初期化
        coef = np.linalg.pinv(X).dot(t)
        print(coef)
        self.coef = coef + np.random.normal(size=len(coef))
        for i in range(self.n_models - 1): # バグ修正 (3以上のモデル数に対応)
            self.coef = np.vstack((self.coef,
                                   coef + np.random.normal(size=len(coef))))
        self.coef = self.coef.T
        self.var = np.mean(np.square(X.dot(coef) - t))
        self.weight = np.ones(self.n_models) / self.n_models
        self.rec_coef = [{'step':0, 'coef': copy.deepcopy(self.coef)}]

        # EMステップを指定した回数だけ繰り返す
        for i in range(n_iter):
            # Eステップ 負担率E[z_nk]を各データ、各モデルごとに計算
            self.resp = self._expectation(X, t)          
            # Mステップ パラメータについて最大化
            self._maximization(X, t, self.resp)
            # 係数の履歴を記録
            self.rec_coef.append({'step': i+1, 'coef': copy.deepcopy(self.coef)})  
    # ガウス関数
    def _gauss(self, x, y):
        return np.exp(-(x - y) ** 2 / (2 * self.var))

    # Eステップ 負担率gamma_nk=E[z_nk]を計算
    def _expectation(self, X, t):
        # PRML式(14.37)
        # print(X.dot(self.coef))
        responsibility = self.weight * self._gauss(X.dot(self.coef), t[:, None])
        responsibility /= np.sum(responsibility, axis=1, keepdims=True)
        return responsibility

    # Mステップ パラメータについて最大化
    def _maximization(self, X, t, resp):
        # PRML式(14.38) 混合係数の計算
        self.weight = np.mean(resp, axis=0)
        for m in range(self.n_models):
            # PRML式(14.42) 各モデルごとに係数を計算
            self.coef[:, m] = np.linalg.inv((X.T * resp[:, m]).dot(X)).dot(X.T * resp[:, m]).dot(t)

        # PRML式(14.44) 分散の計算
        self.var = np.mean(
            np.sum(resp * np.square(t[:, None] - X.dot(self.coef)), axis=1))

    def predict(self, X):
        return X.dot(self.coef)
    
    def getResponsibility(self):
        return self.resp


# サンプルデータ作成関数 (オリジナル版)
def create_toy_data(sample_size=20, std=0.1):
    x = np.linspace(-1, 1, sample_size)
    y = (x > 0.).astype(np.float) * 2 - 1
    y = x * y
    y += np.random.normal(scale=std, size=sample_size)
    return x, y


# サンプルデータ作成関数 その2 (3種類データ)
def create_toy_data2(sample_size=30, std=0.2):
    x = np.linspace(0, 3, sample_size)
    x = np.hstack((x,np.linspace(0, 3, sample_size)))
    x = np.hstack((x,np.linspace(0, 3, sample_size)))
    y = np.zeros(sample_size*3)
    y[:sample_size] = x[:sample_size] + 0.1
    y[sample_size: sample_size*2] = 2.0* x[sample_size: sample_size*2] -0.5
    y[sample_size*2:] = 3.0 * x[sample_size*2:]
    y += np.random.normal(scale=std, size=sample_size*3)
    return x, y


x_train,y_train = create_toy_data2(30, std=0.5)
plt.figure(figsize=(6,6))
plt.scatter(x_train,y_train)

<matplotlib.collections.PathCollection at 0x7fd7f2b5f8e0>


import math

# PolynomialFeaturesはべき基底関数をつくるための関数！
# たとえば、degree=2とすると、y = a x + b x**2 のような2次式のフィッティングを行うために、1, x, x**2を列とする(N,3)行列の計画行列になる
# 本研究の場合は線形なのでdegree=1

feature = PolynomialFeatures(degree=1)
X_train = feature.transform(x_train)

# 条件付き混合モデルの用意（モデル数を指定) 
n_models = 3
model = ConditionalMixtureModel(n_models=n_models)

# 学習：パラメータの最尤推定を行う
model.fit(X_train, y_train,n_iter=500)

# 予測
x = np.linspace(0, 3, 100)
X = feature.transform(x)
Y = model.predict(X)

# Figureの初期化
fig = plt.figure(figsize=(6, 6))
# Figure内にAxesを追加()
ax = fig.add_subplot(111) #...2

# トレーニングデータの描画
for i, c in enumerate(model.getResponsibility()): # データ点ごとに負担率(responsibility)で色付け描画
    if len(c) == 2:
        color = [c[0], c[1], 0]
    else:
        color = c
    ax.scatter([x_train[i]], [y_train[i]], color=color)

# 回帰線の描画
for i in range(n_models):
    ax.plot(x, Y[:, i])

# 標準偏差の幅で色付けしてみる
std_dev = math.sqrt(model.var)
for i in range(n_models):
    ax.fill([x[0],x[0],x[-1],x[-1]],
            [Y[0,i]-std_dev, Y[0,i]+std_dev, Y[-1,i]+std_dev, Y[-1,i]-std_dev], alpha=0.2)
plt.show()


df = pd.read_csv("data/data4mixed_model.csv")
df.head()


sns.pairplot(x_vars=['mobile_addr'], y_vars=['pedestrian'], data=df, hue='sensor_name', height=6)

<seaborn.axisgrid.PairGrid at 0x7fd8102bc3d0>


import math

x_train = df['mobile_addr'].values
y_train = df['pedestrian'].values

# 特徴ベクトルの定義（次数1）
feature = PolynomialFeatures(degree=1)

# 計画行列への変換
X_train = feature.transform(x_train)
#print(X_train)


# 条件付き混合モデルの用意（モデル数3) 
model = ConditionalMixtureModel(n_models=3)

# パラメータの最尤推定を行う
model.fit(X_train, y_train,n_iter=500)

# 結果を図示
x = np.linspace(0, 3500, 200)
X = feature.transform(x)
Y = model.predict(X)
# Figureの初期化
fig = plt.figure(figsize=(8, 8))
# Figure内にAxesを追加()
ax = fig.add_subplot(111) #...2

for i, c in enumerate(model.getResponsibility()): # データ点ごとに負担率(responsibility)で色付け描画
    ax.scatter([x_train[i]], [y_train[i]], color=c)

# 回帰線の描画
ax.plot(x, Y[:, 0])
ax.plot(x, Y[:, 1])
ax.plot(x, Y[:, 2])
std_dev = math.sqrt(model.var)
# 標準偏差の幅で色付けしてみる
ax.fill([x[0],x[0],x[-1],x[-1]],
        [Y[0,0]-std_dev, Y[0,0]+std_dev, Y[-1,0]+std_dev, Y[-1,0]-std_dev], alpha=0.2)
ax.fill([x[0],x[0],x[-1],x[-1]],
        [Y[0,1]-std_dev, Y[0,1]+std_dev, Y[-1,1]+std_dev, Y[-1,1]-std_dev], alpha=0.2)
ax.fill([x[0],x[0],x[-1],x[-1]],
        [Y[0,2]-std_dev, Y[0,2]+std_dev, Y[-1,2]+std_dev, Y[-1,2]-std_dev], alpha=0.2)
plt.show()
print(model.coef) # 回帰係数
print(model.var)  # 分散
# print(model.getResponsibility())

[[7.62914484e+01 5.40527021e+01 2.81175125e+02]
 [7.84904007e-02 2.50731834e-01 2.27399650e-01]]
3334.588704964099


df_temperature = pd.read_csv("data/temperature_data_kofu.csv")
df_temperature.head()


df_temperature['date'] = pd.to_datetime(df_temperature['年月'])
df_temperature['year'] = df_temperature['date'].dt.year
# 1920以降のデータを採用
df_temperature = df_temperature[df_temperature['year']>=1920]
df_temperature.head()


df_annual = df_temperature[["year","平均気温(℃)"]].groupby('year').mean().reset_index()
df_annual = df_annual[df_annual["year"]!=1945]
df_annual.plot(x="year", y="平均気温(℃)")

<AxesSubplot: xlabel='year'>


import math
n_models=2

# スケール変換 (切片が0付近でないと数値エラーになる。ガウシアンの計算上において)
syear = 1910
eyear = 2025
base_temp = 12.0
scaler_temp = 10.0

x_train = df_annual['year'].values - syear
y_train = (df_annual['平均気温(℃)'].values - base_temp)*scaler_temp
   
# 特徴ベクトルの定義（次数1）
feature = PolynomialFeatures(degree=1)
# 計画行列への変換
X_train = feature.transform(x_train)


# 条件付き混合モデルの用意（モデル数2) 
model = ConditionalMixtureModel(n_models=n_models)
# パラメータの最尤推定を行う
model.fit(X_train, y_train,n_iter=1000)

# 結果を図示
x = np.linspace(0, eyear - syear, int(eyear - syear))
X = feature.transform(x)
Y = model.predict(X)

#print(X)

[6.33362827 0.23927175]


# Figureの初期化
fig = plt.figure(figsize=(7, 5))
# Figure内にAxesを追加()
ax = fig.add_subplot(111) #...2

for i, c in enumerate(model.getResponsibility()): # データ点ごとに負担率(responsibility)で色付け描画
    if len(c) == 2:
        color = [c[0], 0, c[1]]
    else:
        color = c
    ax.scatter([x_train[i]+syear], [y_train[i]/scaler_temp + base_temp], color=color)

# 回帰線の描画
ax.plot(x+syear, Y[:, 0]/scaler_temp + base_temp, "r-")
ax.plot(x+syear, Y[:, 1]/scaler_temp + base_temp, "b-")

#ax.plot(x, Y[:, 2])
std_dev = math.sqrt(model.var)
# 標準偏差の幅で色付けしてみる
#ax.fill([x[0],x[0],x[-1],x[-1]],
#        [Y[0,0]-std_dev, Y[0,0]+std_dev, Y[-1,0]+std_dev, Y[-1,0]-std_dev], alpha=0.2)
#ax.fill([x[0],x[0],x[-1],x[-1]],
#        [Y[0,1]-std_dev, Y[0,1]+std_dev, Y[-1,1]+std_dev, Y[-1,1]-std_dev], alpha=0.2)
#ax.fill([x[0],x[0],x[-1],x[-1]],
#        [Y[0,2]-std_dev, Y[0,2]+std_dev, Y[-1,2]+std_dev, Y[-1,2]-std_dev], alpha=0.2)
plt.show()
print(model.coef) # 回帰係数
print(model.var)  # 分散
# print(model.getResponsibility())

[[14.32098097  5.49560094]
 [ 0.03267888  0.26301454]]
12.686178557962997

	年月	平均気温(℃)
0	1895/1	0.2
1	1895/2	1.0
2	1895/3	7.2
3	1895/4	12.7
4	1895/5	17.4

	年月	平均気温(℃)	date	year
300	1920/1/1	2.5	1920-01-01	1920
301	1920/2/1	2.0	1920-02-01	1920
302	1920/3/1	7.2	1920-03-01	1920
303	1920/4/1	13.1	1920-04-01	1920
304	1920/5/1	17.5	1920-05-01	1920

Table of Contents

混合モデルの一例：「条件付き混合線形回帰」¶

事例¶

条件付き混合線形回帰¶

条件付き混合線形回帰を行う関数の定義¶

サンプルデータの作成とテスト¶

サンプルデータの作成¶

サンプルデータへの条件付き混合モデルの適用¶

実践的な事例¶

データの散布図¶

混合線形モデルによる分析¶

実践的な例２：甲府の気温データ¶

方法についての概念的解説¶

詳細¶

Eステップ¶

Mステップ¶

	sensor_name	date	hour	mobile_addr	pedestrian
0	H	20181130	10	144	67
1	H	20181130	11	239	90
2	H	20181130	12	173	110
3	H	20181130	13	184	70
4	H	20181130	14	277	89