Sklearn NN回归考勤预测答案

【问题标题】：Sklearn NN regression Attendance prediction（Sklearn NN 回归出勤预测）
【发布时间】：2018-11-23 15:14:36
【问题描述】：

我之前就同样的问题问过一个问题，但由于我的方法发生了变化，我现在有不同的问题。

我当前的代码：

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Predict daily attendance with a small MLP regressor, plot the predictions
# against the held-out measurements and print the test mean squared error.

#Set sizes
rowSize = 200    # number of spreadsheet rows to read
numColumns = 4   # feature columns; the target sits in the column right after them

# read  from excel file
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

# Load the first rowSize rows in one pass. Columns 1..numColumns are the
# features (date, day, rain, temp) and column numColumns + 1 is the
# attendance to predict. openpyxl cell coordinates are 1-based.
data = np.array(
    [
        [sheet_1.cell(row=r, column=c).value for c in range(1, numColumns + 2)]
        for r in range(1, rowSize + 1)
    ],
    dtype=float,
)

X = data[:, :numColumns]
# Output
# Kept 1-D on purpose: predict() returns a 1-D array, and subtracting it from
# an (n, 1) column would broadcast to an (n, n) matrix and corrupt the MSE.
y = data[:, numColumns]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# MLPs are sensitive to feature scale (day-of-year spans hundreds while the
# flags are 0/1). Fit the scaler on the training split only so no information
# from the test split leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

####Neural Net
nn = MLPRegressor(
    hidden_layer_sizes=(3,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(X_train_scaled, y_train)

y_pred = nn.predict(X_test_scaled)

###Linear Regression
# For a linear baseline, replace the network above with
# LinearRegression().fit(X_train_scaled, y_train) and predict on X_test_scaled.

# Plot against the unscaled first feature so the x axis stays readable.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(X_test[:, 0], y_test, s=10, c='r', marker="o", label='real')
ax1.scatter(X_test[:, 0], y_pred, s=1, c='b', marker="s", label='NN Prediction')
ax1.legend()  # without this call the labels above are never shown
plt.show()

#Calc MSE
mse = np.square(y_test - y_pred).mean()

print(mse)

由此得出的结果表明，模型对测试数据的预测非常糟糕。因为我是新手，所以我不确定问题出在我的数据、模型还是代码上。根据散点图，我认为模型与数据不匹配（模型的预测似乎接近线性或二次曲线，而实际数据则分散得多）。

以下是一些数据点，各列依次为：一年中的第几天（2 表示 1 月 2 日）、工作日（1）/周末（0）、下雨（1）/不下雨（0）、华氏温度（°F）、出勤人数（即输出）

2   0   0   51  1366
4   0   0   62  538
5   1   0   71  317
6   1   0   76  174
7   1   0   78  176
8   1   0   68  220
12  1   1   64  256
13  1   1   60  379
14  1   0   64  316
18  0   0   72  758
19  1   0   72  1038
20  1   0   72  405
21  1   0   71  326
24  0   0   74  867
26  1   1   68  521
27  1   0   71  381
28  1   0   72  343
29  1   1   68  266
30  0   1   57  479
31  0   1   57  717
33  1   0   70  542
34  1   0   73  220
35  1   0   74  360
36  1   0   79  444
42  1   0   78  534
45  0   0   80  1572
52  0   0   76  1236
55  1   1   64  689
56  1   0   69  726
59  0   0   67  1188
60  0   0   74  1140
61  1   1   63  979
62  1   1   62  657
63  1   0   67  687
64  1   0   72  615
67  0   0   80  1074
68  1   0   81  1261
71  1   0   83  1332
73  0   0   85  1259
74  0   0   86  1142
76  1   0   88  1207
77  1   1   78  1438
82  1   0   85  1251
83  1   0   83  1019
85  1   0   86  1178
86  0   0   92  1306
87  0   0   92  1273
89  1   0   93  1101
90  1   0   92  1274
93  0   0   83  1548
94  0   0   86  1318
96  1   0   83  1395
97  1   0   81  1338
98  1   0   75  1240
100 0   0   84  1335
102 0   0   83  931
103 1   0   87  746
104 1   0   91  746
105 1   0   81  600
106 1   0   72  852
108 0   1   87  1204
109 0   0   89  1191
110 1   0   90  769
111 1   0   88  642
112 1   0   86  743
114 0   1   75  1085
115 0   1   78  1109
117 1   0   84  871
120 1   0   96  599
123 0   0   93  651
129 0   0   74  1325
133 1   0   88  637
134 1   0   84  470
135 0   1   73  980
136 0   0   72  1096
137 0   0   83  792
138 1   0   87  565
139 1   0   84  501
141 1   0   88  615
142 0   0   79  722
143 0   0   80  1363
144 0   0   82  1506
146 1   0   93  626
147 1   0   94  415
148 1   0   95  596
149 0   0   100 532
150 0   0   102 784
154 1   0   99  514
155 1   0   94  495
156 0   1   87  689
157 0   1   94  931
158 0   0   97  618
161 1   0   92  451
162 1   0   97  574
164 0   0   102 898
165 0   0   104 746
166 1   0   109 587
167 1   0   109 465
174 1   0   108 514
175 1   0   109 572
179 0   0   107 811
181 1   0   104 423
182 1   0   103 526
184 0   1   97  849
185 0   0   103 852
189 1   0   106 728
191 0   0   101 577
194 1   0   105 511
198 0   1   101 616
199 0   1   97  1056
200 0   0   94  740
202 1   0   103 498
205 0   0   101 610
206 0   0   106 944
207 0   0   105 769
208 1   0   103 551
209 1   0   103 624
210 1   0   97  513
212 0   1   107 561
213 0   0   100 905
214 0   0   105 767
215 1   0   107 510
216 1   0   108 406
217 1   0   109 439
218 1   0   103 427
219 0   1   104 460
224 1   0   105 213
227 0   0   112 834
228 0   0   109 615
229 1   0   105 216
230 1   0   104 213
231 1   0   104 256
232 1   0   104 282
235 0   0   104 569
238 1   0   103 165
239 1   1   105 176
241 0   1   108 727
242 0   1   105 652
243 1   1   103 231
244 1   0   96  117
245 1   1   98  168
246 1   1   97  113
247 0   0   95  227
248 0   0   92  1050
249 0   0   101 1274
250 1   1   95  1148
254 0   0   99  180
255 0   0   104 557
258 1   0   94  228
260 1   0   95  133
263 0   0   100 511
264 1   1   89  249
265 1   1   90  245
267 1   0   101 390
272 1   0   100 223
273 1   0   103 194
274 1   0   103 150
275 0   0   95  224
276 0   0   92  705
277 0   1   92  504
279 1   1   77  331
281 1   0   89  268
284 0   0   95  566
285 1   0   94  579
286 1   0   95  420
288 1   0   93  392
289 0   1   94  525
290 0   1   86  670
291 0   1   89  488
294 1   1   74  295
296 0   0   81  314
299 1   0   88  211
301 1   0   84  246
303 0   1   76  433
304 0   0   80  216
307 1   1   80  275
308 1   1   66  319
312 0   0   80  413
313 1   0   78  278
316 1   0   74  305
320 1   1   57  323
324 0   0   76  220
326 0   0   77  461
327 1   0   78  510
331 0   0   60  1701
334 1   0   58  237
335 1   0   62  355
336 1   0   68  266
338 0   0   70  246
342 1   0   72  109
343 1   0   70  103
347 0   0   58  486
349 1   0   52  144
350 1   0   53  209
351 1   0   55  289
354 0   0   62  707
355 1   0   59  903
359 0   0   58  481
360 0   0   53  1342
364 1   0   57  1624

我总共有超过一千个数据点，但我并没有将它们全部用于训练/测试。一个想法是我需要更多，另一个想法是我需要更多因素，因为温度/下雨/星期几对出勤率的影响不够。

图表如下：

我可以做些什么来使我的模型更准确并提供更好的预测？

谢谢

编辑：我添加了更多数据点和另一个因素。我似乎无法上传 excel 文件，所以我将数据放在这里，并更好地解释了它的格式

编辑：这是最新的代码：

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
# Classify attendance into four bands with a scaled SVC and report the
# leave-one-out cross-validated accuracy.

#Set sizes
rowSize = 500      # number of spreadsheet rows to read
numColumns = 254   # feature columns; the target sits in the column right after them

# read  from excel file
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

# Read features and target in one pass (openpyxl coordinates are 1-based).
# The array is deliberately not called `input`, which would shadow the builtin.
data = np.array(
    [
        [sheet_1.cell(row=r, column=c).value for c in range(1, numColumns + 2)]
        for r in range(1, rowSize + 1)
    ],
    dtype=float,
)

X = data[:, :numColumns]
# Output
# Sliced (not indexed) so the raw target keeps its (rowSize, 1) column shape.
y = data[:, numColumns:]

print(X)
print(y)

# Bin the raw attendance into classes 0-3. The masks are evaluated against the
# untouched raw values, so the result does not depend on the order of the
# assignments or on the labels staying below the first threshold. Values that
# match no band (NaN from empty cells) stay NaN and are rejected by scikit-learn.
attendance = y.ravel()
labels = np.full(attendance.shape, np.nan)
labels[attendance < 500] = 0
labels[np.logical_and(attendance >= 500, attendance <= 1000)] = 1
labels[np.logical_and(attendance > 1000, attendance <= 1200)] = 2
labels[attendance > 1200] = 3
y = labels

# Use cross-validation
# LeaveOneOut fits the pipeline once per sample; a shuffled
# KFold(n_splits=10, shuffle=True, random_state=0) is a cheaper alternative.
loo = LeaveOneOut()
# Try different models
clf = svm.SVC()
scaler = StandardScaler()
# Scaling lives inside the pipeline so it is re-fit on each training fold.
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

accuracy = cross_val_score(pipe, X, y, cv = loo, scoring = "accuracy")
print(accuracy.mean())

# For a confusion matrix, predict with the whole pipeline (not the bare clf):
# confusion_matrix(y, cross_val_predict(pipe, X, y, cv = loo))

这里是最新数据，我可以添加尽可能多的特征。请注意，这是来自完整数据的随机样本：

Link to sample data

当前输出： 0.6230954290296712

我的最终目标是达到 90% 或更高的准确率……我不相信我能找到更多的特征，但如果有帮助，我会继续尽可能多地收集

【问题讨论】：

这个任务可能很适合使用线性回归，是什么让你做一个神经网络？
这似乎是一个学习神经网络的好机会。另外，我认为这不是线性的，但我上统计课已经是 2 年前的事了，也许是我理解错了
@ChrisM 你能添加python_excel_read.xlsx 数据吗？那么我可以提供如何提高预测性能的答案。
@seralouk 我在问题中添加了更多数据并上传了我最近的代码/结果。我不确定如何上传文件，因为 SO 没有文件托管服务。我想我可以使用一个并在需要时提供链接
完美。您的目标是预测输出。您是否尝试过使用 MLPRegressor 以外的任何其他东西？

标签： python numpy scikit-learn neural-network prediction

【解决方案1】：

您的问题很笼统，但我有一些建议。您可以使用cross-validation 并尝试不同的模型。就个人而言，我会尝试SVR,RandomForests，作为最后的选择，我会使用MLPR。

我对您的代码进行了一些修改以显示一个简单的示例：

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
import pandas as pd
from sklearn.decomposition import PCA

# Classify attendance into four bands with a scaled linear SVC and report the
# leave-one-out cross-validated accuracy.

# read the data
# header = None makes pandas treat the first spreadsheet row as data rather
# than as column names.
df = pd.read_excel('python_excel_read.xlsx', header = None)
rows, cols = df.shape

# Every column except the last is a feature; the last column is the target.
X = df.iloc[: , 0:(cols - 1)].to_numpy()
# Take an independent NumPy copy: relabelling a Series sliced out of df in
# place is chained assignment, and Series.ravel() is deprecated in pandas.
y = df.iloc[: , cols - 1 ].to_numpy(copy=True)
print(X.shape)
print(y.shape)

# Compute every band mask from the raw values first so the relabelling below
# cannot be affected by labels written in an earlier step.
low = y < 500
medium = np.logical_and(y >= 500, y <= 1000)
high = np.logical_and(y > 1000, y <= 1200)
very_high = y > 1200
y[low] = 0
y[medium] = 1
y[high] = 2
y[very_high] = 3
print(np.unique(y))

# Optional dimensionality reduction: add ('pca', PCA(n_components=2)) to the
# pipeline after the scaler so it is re-fit inside every fold instead of being
# fitted on the full data set up front.

# Use cross-validation
# LeaveOneOut fits the pipeline once per sample; a shuffled
# KFold(n_splits = 10, shuffle = True, random_state = 0) is a cheaper alternative.
loo = LeaveOneOut()
# Try different models
clf = svm.SVC(kernel = 'linear')
scaler = StandardScaler()
# Scaling lives inside the pipeline so it is re-fit on each training fold.
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

accuracy = cross_val_score(pipe, X, y, cv = loo, scoring = "accuracy")
print(accuracy.mean())

# For a confusion matrix, predict with the whole pipeline (not the bare clf):
# confusion_matrix(y, cross_val_predict(pipe, X, y, cv = loo))

【讨论】：

我试过了，按照你的备忘单（顺便说一句，非常有用），我还尝试了岭回归（Ridge）、集成回归和 Lasso。所有模型给出的 MSE 都在 -120k 左右。对我要做的事情来说，MSE 是不是一个不合适的衡量标准？您认为我还应该尝试哪些其他模型？
我还会计算 RMSE，如果这对这些模型也不利，那么我会转向更复杂的模型（神经网络、非线性模型、作为预处理的 PCA）。我会尝试使用您的数据并让您知道
谢谢。根据我的测试，Lasso 给了我（勉强）最好的结果，mse 的绝对值为 125k。我现在将尝试您建议的其他一些方法
我可能应该提到我的最终目标是预测实际出勤率的 10% 以内。我不知道这是否可能/是否会发生，但尽可能接近会很棒。实际上，我在我的数据中添加了另一个特征，并且使用之前的 MLPRegression 得到了一些形状很好的东西，但是预测仍然相差太多
要达到如此高的性能，我坚信您需要更多特征（最好是信息量大的特征）。