# Load Kubo's sample data set (data4a.csv) into a DataFrame.
# Put the CSV file in your own data/ folder after downloading it from
# https://kuboweb.github.io/-kubo/ce/EesLecture2008.html#toc5
import pandas as pd
df = pd.read_csv("data/data4a.csv")
# Alternatively, read Kubo's data directly from its URL:
#df = pd.read_csv("https://kuboweb.github.io/-kubo/stat/2008/d/fig/data4a.csv")
df  # bare expression: displays the table when run as a notebook cell


import matplotlib.pyplot as plt

# Pick a font that can render the Japanese axis labels below.
plt.rcParams['font.family'] = 'IPAGothic'  # Japanese-capable font on Linux
# plt.rcParams['font.family'] = 'MS Gothic'  # use on Windows if Japanese text does not render

# Split the observations by the value of column 'f' ('C' or 'T').
df_C = df.loc[df['f'] == 'C']
df_T = df.loc[df['f'] == 'T']

# One scatter series per group, drawn in the same order as the legend entries.
for group_name, group in (("C", df_C), ("T", df_T)):
    plt.scatter(group['x'], group['y'], label=group_name)

plt.legend()
plt.xlabel("親のサイズ x")
plt.ylabel("発芽数 y")
plt.show()


import numpy as np

# Sample the logistic function 1 / (1 + exp(-x)) at 100 points on [-4, 4].
x = np.linspace(-4, 4, num=100)
y = np.reciprocal(1. + np.exp(-x))

# Draw the curve.
plt.plot(x, y)
plt.title("Logistic function")
plt.show()


from sklearn.linear_model import LogisticRegression

# One classifier per group; both use the same iteration cap.
lr_C = LogisticRegression(max_iter=500)
lr_T = LogisticRegression(max_iter=500)

# The features must be 2-D, hence the double brackets selecting column 'x'
# as a one-column DataFrame; the target is the 'y' column.
for classifier, group in ((lr_C, df_C), (lr_T, df_T)):
    classifier.fit(group[['x']], group['y'])
# Per the original author's note: fitting may emit a warning, but the fitted
# models are still usable afterwards.

LogisticRegression(max_iter=500)


# Predict the count y for a grid of parent sizes x, once per fitted model.
import numpy as np

# 120 evenly spaced parent sizes on [6, 12]; kept as an ndarray for plotting.
x_test = np.linspace(6,12.0, 120)

# The models were fitted on a DataFrame with a column named 'x'. Passing a bare
# ndarray makes scikit-learn emit "X does not have valid feature names", so
# wrap the grid in a DataFrame carrying the same column name.
x_test_frame = pd.DataFrame({'x': x_test})
predicted_C = lr_C.predict(x_test_frame)
predicted_T = lr_T.predict(x_test_frame)

/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(


# Observed counts (out of eight) as points, with each model's prediction drawn
# as a line over the parent-size grid. The predicted value is the class with
# the highest probability.
# The first group uses matplotlib's default colour; the second is orange.
for group, fitted, style in (
    (df_C, predicted_C, {}),
    (df_T, predicted_T, {'color': '#ff7f00'}),
):
    plt.scatter(group['x'], group['y'], **style)
    plt.plot(x_test, fitted, **style)
plt.show()


# Five parent sizes at which to inspect the predicted class probabilities.
X = np.linspace(7.0, 13.0, 5)
# lr_C was fitted on a DataFrame with a column named 'x'; pass the same column
# name here so scikit-learn does not warn about missing feature names.
prob_C = lr_C.predict_proba(pd.DataFrame({'x': X}))

/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(


# Possible counts 0..8; also used by the binomial pmf plot further down.
size = np.linspace(0,8,9)

# The columns of predict_proba follow lr_C.classes_. Plot against those labels
# rather than assuming all nine counts 0..8 occurred in the training data
# (a missing count would make the x and y lengths disagree).
count_classes = lr_C.classes_

# One probability curve per parent size in X.
for parent_size, class_probs in zip(X, prob_C):
    plt.plot(count_classes, class_probs, label=parent_size)
plt.legend(title="親のサイズ")
plt.xlabel("発芽数(max=8)")
plt.ylabel("確率")
plt.show()


from scipy.stats import binom

# Binomial pmf for 8 trials, evaluated over `size`, at several success
# probabilities; one line per probability.
for success_prob in (0.1, 0.3, 0.6, 0.9):
    plt.plot(size, binom.pmf(size, 8, success_prob), label=f"p={success_prob}")
plt.legend()
plt.xlabel("n")
plt.ylabel("$P_p(n)$")
plt.show()


import statsmodels.formula.api as smf
import statsmodels.api as sm

# Number of trials per observation, referenced as N inside the formula.
# NOTE(review): df also has an 'N' column; the formula presumably resolves N
# from the data before this variable - confirm.
N = 8

# Binomial GLM: explanatory variables x and f, response given as the pair
# (successes y, failures N - y).
binomial_glm = smf.glm(
    formula='y + I(N-y) ~ x +f',
    data=df,
    family=sm.families.Binomial(),
)
result = binomial_glm.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:      ['y', 'I(N - y)']   No. Observations:                  100
Model:                            GLM   Df Residuals:                       97
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -133.11
Date:                Sat, 15 Jul 2023   Deviance:                       123.03
Time:                        14:27:38   Pearson chi2:                     109.
No. Iterations:                     6   Pseudo R-squ. (CS):             0.9768
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -19.5361      1.414    -13.818      0.000     -22.307     -16.765
f[T.T]         2.0215      0.231      8.740      0.000       1.568       2.475
x              1.9524      0.139     14.059      0.000       1.680       2.225
==============================================================================


# Store the GLM's fitted value for every row as a new column.
df['predict'] = result.predict()

# Plot each group's observations and fitted curve.
# Sort by x so that the fitted line is drawn left to right.
df_C = df[df['f']=="C"].sort_values('x')
df_T = df[df['f']=="T"].sort_values('x')
for group, color in ((df_C, 'b'), (df_T, '#ff7f00')):
    # Observed data points.
    plt.scatter(group['x'], group['y'], color=color)
    # Fitted curve on the count scale: the fitted value times the row's own
    # number of trials N, instead of a hard-coded 8.0.
    plt.plot(group['x'], group['predict'] * group['N'], color=color)
plt.show()


# Poisson distribution: values to be plotted
import numpy as np
from scipy.stats import poisson

# Mean of the distribution.
m = 5
# Support to draw: the integers 0 .. 3*m - 1.
x = np.arange(3 * m)
# Poisson probability mass at each point of the support.
y = poisson(m).pmf(x)

# Figure with a single Axes
fig, ax = plt.subplots()
ax.grid()
ax.set(title="Poisson distribution", xlabel="$k$", ylabel="$P(k)$")
# Plot the pmf values as a blue line with a marker at each point
ax.plot(x, y, marker="o", color="blue")
plt.show()


import pandas as pd

# Source of the bicyclist-count data (CSV fetched over HTTP).
url = "https://gist.githubusercontent.com/sachinsdate/c17931a3f000492c1c42cf78bf4ce9fe/raw/7a5131d3f02575668b3c7e8c146b6a285acd2cd7/nyc_bb_bicyclist_counts.csv"
bike_data = pd.read_csv(url)

# Convert the Date strings to datetime, then derive calendar features from them.
bike_data["Date"] = pd.to_datetime(bike_data["Date"])
date_parts = bike_data["Date"].dt
bike_data["DOW"] = date_parts.dayofweek  # day of week
bike_data["MONTH"] = date_parts.month
bike_data.head()


from sklearn.model_selection import train_test_split

# Explanatory columns and the target column (BB_COUNT).
feature_columns = ["HIGH_T", "LOW_T", "PRECIP", "DOW"]
features = bike_data[feature_columns]
targets = bike_data["BB_COUNT"]

# Split into training data (80%) and test data, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    train_size=0.8,
    random_state=0,
)

# Feature standardization (currently disabled)
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)


# Build four regressors, fit each on the training split, and record its
# predictions on the test split next to the observed values.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.neural_network import MLPRegressor

# Fitted estimators, keyed by display name (insertion order is the plot order).
# The stochastic models are seeded so reruns give the same scores, matching
# the seeded train/test split.
models = {
    "Linear": LinearRegression(),                    # (1) multiple linear regression
    "Poisson": PoissonRegressor(),                   # (2) Poisson regression
    "NeuralNetwork": MLPRegressor(max_iter=5000, random_state=0),  # (3) neural network
    "RandomForest": RandomForestRegressor(random_state=0),         # (4) random forest
}

# Test-set results per model: {"obs": observed values, "predict": predictions}.
predict_obs = {}

# The observed test targets are identical for every model; convert them once.
observed = y_test.to_numpy()

# Fit and predict. The loop variables are deliberately not named `m` or
# `result`, so they do not overwrite the Poisson mean `m` or the fitted GLM
# `result` defined earlier in this file.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predict_obs[model_name] = {"obs": observed, "predict": model.predict(X_test)}

/opt/anaconda3/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:692: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.
  warnings.warn(


# Coefficient of determination (R^2) of each model's test-set predictions.
from sklearn.metrics import r2_score
for model_name, outcome in predict_obs.items():
    score = r2_score(outcome["obs"], outcome["predict"])
    print("model: %s,  R2=%.3f" % (model_name, score))

model: Linear,  R2=0.356
model: Poisson,  R2=0.385
model: NeuralNetwork,  R2=0.212
model: RandomForest,  R2=0.442


# 2x2 grid of predicted-vs-observed scatter plots, one panel per model.
fig = plt.figure(figsize=(7,7))
plt.subplots_adjust(wspace=0.4, hspace=0.4) # without this the axis labels overlap
# Subplot positions are 1-based, so number the panels from 1.
for fig_no, (k, r) in enumerate(predict_obs.items(), start=1):
    ax = fig.add_subplot(2,2,fig_no)
    ax.scatter(r["predict"], r['obs'])
    # Reference diagonal (predicted == observed) over the fixed range 0..5000.
    ax.plot([0,5000],[0,5000], "y")
    ax.set_title(k)
    ax.set_xlabel("推定値")
    ax.set_ylabel("観測値")
# Show the figure explicitly, as every other figure in this file does.
plt.show()


# Horizontal bar charts of the binomial pmf for 8 trials, one panel per success
# probability p; the figure is also saved as a PNG.
import os

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binom

# Possible counts 0..8.
size = np.linspace(0,8,9)
fig = plt.figure(figsize=(8,3))
plt.subplots_adjust(wspace=0.5)
for i, p in enumerate([0.1, 0.3, 0.6, 0.9]):
    ax = fig.add_subplot(1, 4, i+1)
    ax.barh(size, binom.pmf(size,8,p), label="p=" + str(p))
    ax.set_ylabel("発芽種子数 ($n$)")
    # Math delimiters added so P_p(n) renders as a subscripted symbol, like the
    # "$P_p(n)$" label on the earlier binomial line plot.
    ax.set_xlabel("出現確率 $P_p(n)$")
    ax.set_title("$p=" + str(p) + "$")

# savefig does not create missing directories; make sure the target exists.
out_path = "images/binomial4.png"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path, bbox_inches='tight')
plt.show()


# Horizontal bar charts of the binomial pmf for 10 trials, one panel per
# success probability p; the figure is also saved as a PNG.
import os

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binom

# Possible counts 0..10.
size = np.linspace(0,10,11)
fig = plt.figure(figsize=(8,3))
plt.subplots_adjust(wspace=0.5)
for i, p in enumerate([0.1, 0.3, 0.6, 0.9]):
    ax = fig.add_subplot(1, 4, i+1)
    ax.barh(size, binom.pmf(size,10,p), label="p=" + str(p))
    ax.set_ylabel("合格者数$n$ (10人中)")
    # Math delimiters added so P_p(n) renders as a subscripted symbol, like the
    # "$P_p(n)$" label on the earlier binomial line plot.
    ax.set_xlabel("出現確率 $P_p(n)$")
    ax.set_title("$p=" + str(p) + "$")

# savefig does not create missing directories; make sure the target exists.
out_path = "images/binomial_max10.png"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path, bbox_inches='tight')
plt.show()

	N	y	x	f
0	8	1	9.76	C
1	8	6	10.48	C
2	8	5	10.83	C
3	8	6	10.94	C
4	8	1	9.37	C
...	...	...	...	...
95	8	7	10.45	T
96	8	0	8.94	T
97	8	5	8.94	T
98	8	8	10.14	T
99	8	1	8.50	T

	Date	HIGH_T	LOW_T	PRECIP	BB_COUNT	DOW	MONTH
0	2017-04-01	46.0	37.0	0.00	606	5	4
1	2017-04-02	62.1	41.0	0.00	2021	6	4
2	2017-04-03	63.0	50.0	0.03	2470	0	4
3	2017-04-04	51.1	46.0	1.18	723	1	4
4	2017-04-05	63.0	46.0	0.00	2807	2	4

Logistic Regression (an example)

サンプルデータ

scikit-learnでのLogistic Regression

statsmodelsによる回帰分析

Poisson Regression

事例：橋を通行する自転車数と気温、雨量の関係

参考