# --- Hyperparameters --------------------------------------------------------
PRICE_FEATURE = False  # include the monthly average price as a second channel?
NORMALIZATION = False  # min-max scale the model inputs?
TIMESTEPS = 28         # window length in months (valid range: 1-33)

# One (units, dropout) tuple per stacked LSTM layer, in order.
LAYER_PARAM = [(64, 0.5), (64, 0.5)]

# Keyword arguments forwarded verbatim to model.fit().
TRAIN_PARAM = {
    "batch_size": 128,
    "verbose": 2,
    "epochs": 60,
    "validation_split": 0.1,
}

ROUND = False  # snap near-integer predictions to integers?
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import LSTM
from keras.layers.core import Dense, Dropout, Activation
# Load the competition data files. `date` is parsed as datetime; the raw CSV
# stores dates day-first (dd.mm.yyyy).
sales = pd.read_csv('data/sales_train.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
test = pd.read_csv('data/test.csv')
shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
item_cats = pd.read_csv('data/item_categories.csv')
sales.head()  # notebook-style peek; value is discarded when run as a script
# Global seaborn/matplotlib styling for all figures below.
sns.set(rc={'figure.figsize':(20, 10)})
sns.set_context("talk", font_scale=1.5)
# EDA: total items sold per month (date_block_num indexes the months).
sales_month = pd.DataFrame(sales.groupby(['date_block_num']).sum().item_cnt_day).reset_index()
sales_month.head()
sales_month.columns = ['date_block_num', 'item_sales']
sns.barplot(x ='date_block_num', y='item_sales', data=sales_month);
plt.plot(sales_month.item_sales)
plt.title('Distribution of item sales per month')
del sales_month  # free the temporary frame
# Scatter plots used below to pick the outlier thresholds by eye.
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_cnt_day)
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_price)
# Find and drop outliers: extreme daily counts, extreme prices, and
# non-positive prices (thresholds chosen from the scatter plots above).
outliers = sales.loc[(sales.item_cnt_day > 1500) | (sales.item_price > 55000) | (sales.item_price <= 0)]
sales = sales.drop(outliers.index)
print(outliers)
# Build a (item_id, shop_id) x month matrix of monthly sales totals;
# months with no sales are filled with 0.
df_cnt = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).sum().reset_index()
df_cnt = df_cnt[['date','item_id','shop_id','item_cnt_day']]
df_cnt = df_cnt.pivot_table(index=['item_id','shop_id'], columns='date', values='item_cnt_day', fill_value=0).reset_index()
df_cnt = pd.merge(test, df_cnt, on=['item_id','shop_id'], how='left').fillna(0.) # Merge the test items with sale history
df_cnt = df_cnt.drop(labels=['ID', 'shop_id', 'item_id'], axis=1) # Remove the categorical data
df_cnt.head()
if PRICE_FEATURE:
    # Same pivot as df_cnt, but with the monthly *mean* price per
    # (item_id, shop_id) pair; unknown prices become 0.
    df_pri = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).mean().reset_index()
    df_pri = df_pri[['date','item_id','shop_id','item_price']]
    df_pri = df_pri.pivot_table(index=['item_id','shop_id'], columns='date', values='item_price', fill_value=0).reset_index()
    df_pri = pd.merge(test, df_pri, on=['item_id','shop_id'], how='left').fillna(0.) # Merge the test items with sale history
    df_pri = df_pri.drop(labels=['ID', 'shop_id', 'item_id'], axis=1) # Remove the categorical data
    df_pri.head()
# --- Assemble the model inputs ----------------------------------------------
# Each pivot table becomes a 3-D array of shape (samples, months, 1) so the
# optional price channel can be concatenated along the last axis.
if PRICE_FEATURE:
    pri = df_pri.values
    if NORMALIZATION:
        # Proper min-max scaling: (x - min) / (max - min).
        pri_min = np.min(pri)
        pri_gap = np.max(pri) - pri_min
        pri = (pri - pri_min) / pri_gap
    pri = pri.reshape((pri.shape[0], pri.shape[1], 1))
cnt = df_cnt.values
if NORMALIZATION:
    cnt_min = np.min(cnt)
    cnt_gap = np.max(cnt) - cnt_min
    # Bug fix: the original only divided by the gap, but the predictions are
    # later denormalized as `pred_y * cnt_gap + cnt_min`, which assumes the
    # min was subtracted here. Subtract it so forward and inverse transforms
    # are consistent (identical behavior when cnt_min happens to be 0).
    cnt = (cnt - cnt_min) / cnt_gap
cnt = cnt.reshape((cnt.shape[0], cnt.shape[1], 1))
if PRICE_FEATURE:
    features = np.append(cnt, pri, axis=2) # merge count and price channels
else:
    features = cnt
print(features.shape)
# Build supervised windows from the monthly timelines. Each training sample
# is TIMESTEPS consecutive months of features; its target is the sales count
# of the month immediately after the window. The last TIMESTEPS months of
# every timeline become the test input (predicting the next, unseen month).
train_X, train_y, test_X = [], [], []
end = features.shape[1]  # total number of months available
for timeline, counts in zip(features, cnt):
    for start in range(end - TIMESTEPS):
        train_X.append(timeline[start:start + TIMESTEPS])
        train_y.append(counts[start + TIMESTEPS])
    test_X.append(timeline[end - TIMESTEPS:end])
train_X = np.array(train_X)
train_y = np.array(train_y)
test_X = np.array(test_X)
print('train_X', train_X.shape)
print('train_y', train_y.shape)
print('test_X', test_X.shape)
# Build the LSTM model: one LSTM layer per LAYER_PARAM entry, then a
# single-unit dense regression head.
model = Sequential()
for i, (units, dropout) in enumerate(LAYER_PARAM): # add LSTM layers
    return_sequences = (i != len(LAYER_PARAM)- 1) # return_sequences = False at last layer
    if i == 0: # First layer require input_shape
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences,
                       input_shape=(train_X.shape[1],train_X.shape[2])))
    else:
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences))
model.add(Dense(1))
def rmse(y_true, y_pred):
    """Root-mean-square error metric, computed with Keras backend ops."""
    squared_error = backend.square(y_pred - y_true)
    return backend.sqrt(backend.mean(squared_error, axis=-1))
# Train with MSE loss; the rmse metric is tracked for readability.
model.compile(optimizer='adam', loss = 'mse', metrics=[rmse])
model.summary()
# Stop early once validation loss has not improved for 3 epochs.
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
callbacks_list = [earlystop]
model_info = model.fit(train_X, train_y, callbacks=callbacks_list, **TRAIN_PARAM)
# Get the test set predictions
pred_y = model.predict(test_X)
if NORMALIZATION:
    # Invert the min-max scaling back to raw sales counts.
    pred_y = pred_y * cnt_gap + cnt_min
pred_y = pred_y.clip(0., 20.) # Required by kaggle
if ROUND:
    # Snap predictions that are within 0.05 of an integer to that integer;
    # leave everything else unchanged.
    round_y = lambda x: x if np.abs(x-np.round(x)) > 0.05 else np.round(x)
    vfunc = np.vectorize(round_y)
    pred_y = vfunc(pred_y)
# Write to submission file
preds = pd.DataFrame(pred_y, columns=['item_cnt_month'])
preds.to_csv('submission.csv', index_label='ID')
# Plot training vs. validation RMSE per epoch.
plt.figure(figsize=(10,4))
# Bug fix: `uzi_a` / `uzi_b` were undefined names (NameError at runtime).
# Use the metric history recorded by model.fit(), as the previously
# commented-out lines intended (metric keys match the metric function name).
plt.plot(model_info.history['rmse'], label='rmse')
plt.plot(model_info.history['val_rmse'], label='val_rmse')
plt.xlabel("Epochs")
plt.ylabel("Root Mean Square Error")
plt.legend()
# Save before show(): with non-interactive backends, show() releases the
# current figure, so calling savefig afterwards writes a blank image.
plt.savefig("Train and Validation RMSE Progression.png")
plt.show()
# Show a sample
plt.figure(figsize=(10,5))
for i in range(10, 11):  # single sample (index 10); widen the range for more
    # Append the predicted next month to the item's observed history.
    item_sales = np.append(features[i],pred_y[i])
    date_block = range(len(item_sales))
    plt.plot(date_block, item_sales)
    # Mark the final point (the prediction) with a blue square.
    plt.plot(date_block[-1], item_sales[-1], 'bs', label='Prediction')
plt.xlabel("date_block_number")
plt.ylabel("Sales")
plt.legend()
plt.show()