import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold


x_dic = {}
for i in range(1, 101):
    if i % 10 ==0:
        print(i)
    df = pd.read_excel('41586_2009_BFnature07634_MOESM271_ESM.xls', sheet_name = i, skiprows = 1)
    xi = df['Mid-Atlantic Region'].tolist()
    x_dic['x'+str(i)] = xi


df = pd.read_excel('41586_2009_BFnature07634_MOESM271_ESM.xls', sheet_name=1, header = 1)

# Combine 45 queries
dict = {'date': df['Date'].tolist()}
for i in range(1, 46):
    if i % 5 ==0:
        print(i)
    df = pd.read_excel('41586_2009_BFnature07634_MOESM271_ESM.xls', sheet_name=i, header = 1)
    dict['query'+str(i)] = df['Mid-Atlantic Region'].tolist()
dat = pd.DataFrame.from_dict(dict)
dat.head()

5
10
15
20
25
30
35
40
45


dat['date'] = pd.to_datetime(dat['date'])


df0 = pd.read_csv('FluView_LineChart_Data-0.csv', skiprows = 1)
df1 = pd.read_csv('FluView_LineChart_Data-1.csv', skiprows = 1)
df2 = pd.read_csv('FluView_LineChart_Data-2.csv', skiprows = 1)
df3 = pd.read_csv('FluView_LineChart_Data-3.csv', skiprows = 1)
df4 = pd.read_csv('FluView_LineChart_Data-4.csv', skiprows = 1)
df5 = pd.read_csv('FluView_LineChart_Data-5.csv', skiprows = 1)
df6 = pd.read_csv('FluView_LineChart_Data-6.csv', skiprows = 1)

y = pd.concat([df0, df1, df2, df3, df4, df5, df6])
y = y[y['YEAR'] > 2002]
y = y[((y['YEAR']==2003)&(y['WEEK'] > 22)) | (y['YEAR'] > 2003) ][:dat['date'].size]

y = y['% WEIGHTED ILI']


plt.plot(dat['date'], y);


plt.plot(dat['date'], dat['query1'], label = 'query1')
plt.plot(dat['date'], dat['query2'], label = 'query2')
plt.plot(dat['date'], dat['query3'], label = 'query3')
plt.plot(dat['date'], y,  label = 'CDC ILI')
plt.legend()
plt.show()


dat['y'] = y.tolist()


for i in range(1, 8):
    dat["lag_{}".format(i)] = dat['y'].shift(i)
print("done")
dat=dat.fillna(0)

done


y = dat['y']
date = dat['date']
X = dat.drop(['y', 'date'], axis = 1)


N = 50
X_train = X.iloc[:N,]
X_test = X.iloc[N:,]
y_train = y[:N]
y_test = y[N:]

# 利用弹性网络
from sklearn.model_selection import cross_val_score
cv_model = ElasticNetCV(l1_ratio=0.5, eps=1e-3, n_alphas=200, fit_intercept=True, 
                        normalize=True, precompute='auto', max_iter=200, tol=0.006, cv=10, 
                        copy_X=True, verbose=0, n_jobs=-1, positive=False, random_state=0)

# 训练模型              
cv_model.fit(X_train, y_train)

# 计算最佳迭代次数、alpha和ratio
print('最佳 alpha: %.8f'%cv_model.alpha_)
print('最佳 l1_ratio: %.3f'%cv_model.l1_ratio_)
print('迭代次数 %d'%cv_model.n_iter_)

最佳 alpha: 0.00182915
最佳 l1_ratio: 0.500
迭代次数 194


# 输出结果
y_train_pred = cv_model.predict(X_train)
y_pred = cv_model.predict(X_test)
print('Train r2 score: ', r2_score(y_train_pred, y_train))
print('Test r2 score: ', r2_score(y_test, y_pred))
train_mse = mean_squared_error(y_train_pred, y_train)
test_mse = mean_squared_error(y_pred, y_test)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
print('Train RMSE: %.4f' % train_rmse)
print('Test RMSE: %.4f' % test_rmse)

Train r2 score:  0.9873244857086348
Test r2 score:  0.8287650579933128
Train RMSE: 0.1404
Test RMSE: 0.4257


import datetime
plt.style.use('ggplot')

plt.rcParams.update({'figure.figsize': (15, 5)})

plt.plot(date, y)
plt.plot(date[N:], y_pred)

plt.show()

	date	query1	query2	query3	query4	query5	query6	query7	query8	query9	...	query36	query37	query38	query39	query40	query41	query42	query43	query44	query45
0	2003-06-01	0.990	6.471	6.899	0.990	1.448	1.581	0.118	0.517	0.813	...	0.414	0.103	0.606	0.163	0.355	0.827	0.163	0.295	2.822	2.245
1	2003-06-08	0.806	6.063	6.992	0.964	1.227	1.647	0.123	0.280	0.596	...	0.543	0.158	0.561	0.210	0.210	1.577	0.228	0.193	2.664	2.050
2	2003-06-15	0.892	5.044	6.130	0.912	0.912	1.765	0.019	0.485	0.640	...	0.582	0.194	0.737	0.175	0.310	1.377	0.136	0.330	3.240	2.541
3	2003-06-22	1.149	5.033	5.054	0.710	0.668	2.068	0.104	0.230	0.564	...	0.272	0.125	0.522	0.167	0.313	1.587	0.230	0.292	2.318	2.088
4	2003-06-29	0.768	4.920	5.128	0.685	0.727	1.868	0.021	0.270	0.374	...	0.415	0.104	0.374	0.083	0.145	1.100	0.187	0.353	2.553	2.284