Load raw data

In [1]:
# Hyperparameters
PRICE_FEATURE = False     # Add the monthly average price as a feature?
NORMALIZATION = False     # Min-max normalize the inputs?
TIMESTEPS = 28            # Window length in months (1-33)

LAYER_PARAM = [(64, 0.5), (64, 0.5)]  # LSTM layers' parameters (units, dropout)

TRAIN_PARAM = {"batch_size":128, # Training parameters
               "verbose":2,
               "epochs":60,
               "validation_split":0.1}
ROUND = False             # Round near-integer predictions?
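
The window length is bounded by the data: there are 34 monthly blocks, so a TIMESTEPS-month window leaves 34 - TIMESTEPS labelled months per series. A small guard (an added sketch, not in the original run) makes that explicit:

assert 1 <= TIMESTEPS <= 33, "TIMESTEPS must leave at least one month as the label"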
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras import backend
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import LSTM
from keras.layers import Dense

sales = pd.read_csv('data/sales_train.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
test = pd.read_csv('data/test.csv')

shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
item_cats = pd.read_csv('data/item_categories.csv')

sales.head()
Using TensorFlow backend.
Out[2]:
        date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0 2013-01-02               0       59    22154      999.00           1.0
1 2013-01-03               0       25     2552      899.00           1.0
2 2013-01-05               0       25     2552      899.00          -1.0
3 2013-01-06               0       25     2554     1709.05           1.0
4 2013-01-15               0       25     2555     1099.00           1.0
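
A quick sanity check (an added sketch, not part of the original run) confirms the 34 monthly blocks (date_block_num 0-33, 2013-01 through 2015-10) that the windowing below relies on:

print(sales.date_block_num.nunique())    # expected: 34
print(sales.date.min(), sales.date.max())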

Explore data

In [3]:
sns.set(rc={'figure.figsize':(20, 10)})
sns.set_context("talk", font_scale=1.5)
sales_month = pd.DataFrame(sales.groupby(['date_block_num']).sum().item_cnt_day).reset_index()
sales_month.head()
sales_month.columns = ['date_block_num', 'item_sales']
sns.barplot(x ='date_block_num', y='item_sales', data=sales_month);
plt.plot(sales_month.item_sales)
plt.title('Distribution of item sales per month')
del sales_month
In [4]:
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_cnt_day)  # Daily counts per item; exposes the extreme item_cnt_day outlier
Out[4]:
<matplotlib.collections.PathCollection at 0x24f422e6358>
In [5]:
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_price)  # Prices per item; exposes the extreme and negative prices
Out[5]:
<matplotlib.collections.PathCollection at 0x24f5054c358>

Detect and remove outliers

In [6]:
# Find and drop outliers (thresholds read off the scatter plots above)
outliers = sales.loc[(sales.item_cnt_day > 1500) | (sales.item_price > 55000) | (sales.item_price <= 0)]
sales = sales.drop(outliers.index)
print(outliers)
              date  date_block_num  shop_id  item_id     item_price  item_cnt_day
484683  2013-05-15               4       32     2973      -1.000000           1.0
885138  2013-09-17               8       12    11365   59200.000000           1.0
1163158 2013-12-13              11       12     6066  307980.000000           1.0
2909818 2015-10-28              33       12    11373       0.908714        2169.0
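
The thresholds above are read off the scatter plots by eye. A percentile-based alternative (a hypothetical sketch; the cut-offs it picks would differ from the hand-tuned ones) would look like:

# Hypothetical alternative: flag the most extreme counts and prices by quantile
cnt_hi = sales.item_cnt_day.quantile(0.99999)
pri_hi = sales.item_price.quantile(0.99999)
outliers_q = sales.loc[(sales.item_cnt_day > cnt_hi) |
                       (sales.item_price > pri_hi) |
                       (sales.item_price <= 0)]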

Reshape the dataframe

In [7]:
# Aggregate daily sales into monthly counts per (item_id, shop_id), then pivot months into columns
df_cnt = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')), 'item_id', 'shop_id']).sum().reset_index()
df_cnt = df_cnt[['date', 'item_id', 'shop_id', 'item_cnt_day']]
df_cnt = df_cnt.pivot_table(index=['item_id','shop_id'], columns='date', values='item_cnt_day', fill_value=0).reset_index()
df_cnt = pd.merge(test, df_cnt, on=['item_id','shop_id'], how='left').fillna(0.)  # Keep only the (shop, item) pairs in the test set
df_cnt = df_cnt.drop(labels=['ID', 'shop_id', 'item_id'], axis=1)  # Drop the identifier columns
df_cnt.head()
Out[7]:
2013-01 2013-02 2013-03 2013-04 2013-05 2013-06 2013-07 2013-08 2013-09 2013-10 ... 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07 2015-08 2015-09 2015-10
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 1.0 1.0 1.0 3.0 1.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 3.0 2.0 0.0 1.0 3.0 1.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 34 columns
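
Each test (shop_id, item_id) pair now has one row with a full 34-month history, zero-filled where the pair never appeared in training. A shape assertion (added sketch) guards the merge:

assert df_cnt.shape == (len(test), 34)   # one row per test pair, one column per month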

In [8]:
if PRICE_FEATURE:
    # Same reshaping as above, but with the monthly mean price
    df_pri = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')), 'item_id', 'shop_id']).mean().reset_index()
    df_pri = df_pri[['date', 'item_id', 'shop_id', 'item_price']]
    df_pri = df_pri.pivot_table(index=['item_id','shop_id'], columns='date', values='item_price', fill_value=0).reset_index()
    df_pri = pd.merge(test, df_pri, on=['item_id','shop_id'], how='left').fillna(0.)  # Keep only the (shop, item) pairs in the test set
    df_pri = df_pri.drop(labels=['ID', 'shop_id', 'item_id'], axis=1)  # Drop the identifier columns

Normalization

In [9]:
if PRICE_FEATURE:
    pri = df_pri.values
    if NORMALIZATION:
        pri_min = np.min(pri)
        pri_gap = np.max(pri) - pri_min
        pri = (pri - pri_min) / pri_gap  # Min-max scale to [0, 1]
    pri = pri.reshape((pri.shape[0], pri.shape[1], 1))

cnt = df_cnt.values

if NORMALIZATION:
    cnt_min = np.min(cnt)
    cnt_gap = np.max(cnt) - cnt_min
    cnt = (cnt - cnt_min) / cnt_gap  # Min-max scale; inverted after prediction
cnt = cnt.reshape((cnt.shape[0], cnt.shape[1], 1))
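
The prediction cell later inverts the scaling with x = x_scaled * gap + min, so the forward transform here must subtract the minimum as well (the original only divided). A round-trip check (added sketch):

if NORMALIZATION:
    # Scaling then unscaling should recover the original monthly counts
    assert np.allclose(cnt.reshape(cnt.shape[0], -1) * cnt_gap + cnt_min, df_cnt.values)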
In [10]:
if PRICE_FEATURE:
    features = np.append(cnt, pri, axis=2)  # Stack count and price along the feature axis
else:
    features = cnt
print(features.shape)
(214200, 34, 1)

Reshape the training data

In [11]:
# Slide a TIMESTEPS-month window over each series: the window is the input,
# the following month is the label; the last TIMESTEPS months become the test input
train_X = []
train_y = []
test_X = []
end = features.shape[1]
for fea_timeline, cnt_timeline in zip(features, cnt):
    for i in range(end - TIMESTEPS):
        train_X.append(fea_timeline[i:i + TIMESTEPS])
        train_y.append(cnt_timeline[i + TIMESTEPS])
    test_X.append(fea_timeline[end - TIMESTEPS:end])

train_X = np.array(train_X)
train_y = np.array(train_y)
test_X = np.array(test_X)
print('train_X', train_X.shape)
print('train_y', train_y.shape)
print('test_X', test_X.shape)
train_X (1285200, 28, 1)
train_y (1285200, 1)
test_X (214200, 28, 1)
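
The shapes follow from the windowing arithmetic: each of the 214,200 series contributes 34 - TIMESTEPS = 6 (window, label) pairs, hence 214,200 × 6 = 1,285,200 training rows, and one test window per series. As a check (added sketch):

n_series, n_months, n_feats = features.shape
assert train_X.shape == (n_series * (n_months - TIMESTEPS), TIMESTEPS, n_feats)
assert test_X.shape == (n_series, TIMESTEPS, n_feats)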

Build the model

In [17]:
# Build the LSTM model
model = Sequential()
for i, (units, dropout) in enumerate(LAYER_PARAM):  # Add the LSTM layers
    return_sequences = (i != len(LAYER_PARAM) - 1)  # return_sequences=False on the last layer
    if i == 0:  # The first layer requires input_shape
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences,
                       input_shape=(train_X.shape[1], train_X.shape[2])))
    else:
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences))
model.add(Dense(1))

# Note: with a single output and axis=-1, this reduces to the per-sample absolute
# error averaged over the batch (an MAE-like value), which is why it does not equal
# sqrt(loss) in the training log below; the loss itself is plain MSE.
def rmse(y_true, y_pred):
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

model.compile(optimizer='adam', loss='mse', metrics=[rmse])
model.summary() 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_3 (LSTM)                (None, 28, 64)            16896     
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
=================================================================
Total params: 49,985
Trainable params: 49,985
Non-trainable params: 0
_________________________________________________________________
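
The parameter counts can be verified by hand: a Keras LSTM layer holds 4 * (units * (input_dim + units) + units) weights, one input kernel, recurrent kernel, and bias per gate. A quick arithmetic check (added sketch):

def lstm_params(input_dim, units):
    # 4 gates, each with an input kernel, a recurrent kernel, and a bias
    return 4 * (units * (input_dim + units) + units)

print(lstm_params(1, 64))    # 16896 -- first LSTM, 1 input feature
print(lstm_params(64, 64))   # 33024 -- second LSTM, fed 64 units
print(64 * 1 + 1)            # 65    -- Dense: 64 weights + 1 bias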

Train

In [18]:
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')  # Stop after 3 epochs without val_loss improvement
callbacks_list = [earlystop]
model_info = model.fit(train_X, train_y, callbacks=callbacks_list, **TRAIN_PARAM)
Train on 1156680 samples, validate on 128520 samples
Epoch 1/60
 - 432s - loss: 6.6006 - rmse: 0.3842 - val_loss: 0.5928 - val_rmse: 0.2062
Epoch 2/60
 - 462s - loss: 6.0368 - rmse: 0.3743 - val_loss: 0.7262 - val_rmse: 0.1894
Epoch 3/60
 - 461s - loss: 5.7656 - rmse: 0.3705 - val_loss: 0.5526 - val_rmse: 0.1602
Epoch 4/60
 - 453s - loss: 5.6582 - rmse: 0.3633 - val_loss: 0.6669 - val_rmse: 0.1671
Epoch 5/60
 - 463s - loss: 5.7804 - rmse: 0.3640 - val_loss: 0.7341 - val_rmse: 0.1735
Epoch 6/60
 - 470s - loss: 5.6366 - rmse: 0.3601 - val_loss: 0.6205 - val_rmse: 0.2915
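
val_loss bottomed out at epoch 3 (0.5526); with patience=3 the run stops at epoch 6, leaving the final weights three epochs past the best. If the installed Keras is 2.2.3 or newer (an assumption about the environment), restore_best_weights rolls back automatically; on older versions a ModelCheckpoint achieves the same:

# Sketch, assuming Keras >= 2.2.3 for the restore_best_weights argument
earlystop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)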

Predict and write to CSV

In [19]:
# Get the test set predictions
pred_y = model.predict(test_X)
if NORMALIZATION:
    pred_y = pred_y * cnt_gap + cnt_min  # Invert the min-max scaling

pred_y = pred_y.clip(0., 20.)  # Kaggle clips the true targets into [0, 20]

if ROUND:
    # Snap predictions that lie within 0.05 of an integer
    round_y = lambda x: x if np.abs(x - np.round(x)) > 0.05 else np.round(x)
    vfunc = np.vectorize(round_y)
    pred_y = vfunc(pred_y)

# Write the submission file
preds = pd.DataFrame(pred_y, columns=['item_cnt_month'])
preds.to_csv('submission.csv', index_label='ID')
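
A final check (added sketch) that the file matches what Kaggle expects, i.e. one prediction per test ID with columns ID and item_cnt_month:

sub = pd.read_csv('submission.csv')
assert list(sub.columns) == ['ID', 'item_cnt_month']
assert len(sub) == len(test)   # 214,200 rows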
In [20]:
plt.figure(figsize=(10,4))
plt.plot(model_info.history['rmse'], label='rmse')
plt.plot(model_info.history['val_rmse'], label='val_rmse')
plt.xlabel("Epochs")
plt.ylabel("Root Mean Square Error")
plt.legend()
plt.savefig("Train and Validation RMSE Progression.png")  # Save before show(), which clears the figure
plt.show()
In [21]:
# Show a sample series with its predicted next month
plt.figure(figsize=(10,5))
i = 10  # Sample index
item_sales = np.append(features[i], pred_y[i])
date_block = range(len(item_sales))
plt.plot(date_block, item_sales)
plt.plot(date_block[-1], item_sales[-1], 'bs', label='Prediction')
plt.xlabel("date_block_number")
plt.ylabel("Sales")
plt.legend()
plt.show()