Load raw data

In [1]:
# Hyperparameters
PRICE_FEATURE = False     # Add the monthly average price as a feature?
NORMALIZATION = False     # Min-max normalize the inputs?
TIMESTEPS = 28            # Window length in months (1-33)

LAYER_PARAM = [(64, 0.5), (64, 0.5)]  # LSTM layers' parameters (units, dropout)

TRAIN_PARAM = {"batch_size":128, # Training parameters
               "verbose":2,
               "epochs":60,
               "validation_split":0.1}
ROUND = False             # Round near-integer predictions?
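
The window length is bounded by the data: there are 34 monthly blocks, so a TIMESTEPS-month window leaves 34 - TIMESTEPS labelled months per series. A small guard (an added sketch, not in the original run) makes that explicit:

assert 1 <= TIMESTEPS <= 33, "TIMESTEPS must leave at least one month as the label"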
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras import backend
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import LSTM
from keras.layers import Dense

sales = pd.read_csv('data/sales_train.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
test = pd.read_csv('data/test.csv')

shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
item_cats = pd.read_csv('data/item_categories.csv')

sales.head()
Using TensorFlow backend.
Out[2]:
        date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0 2013-01-02               0       59    22154      999.00           1.0
1 2013-01-03               0       25     2552      899.00           1.0
2 2013-01-05               0       25     2552      899.00          -1.0
3 2013-01-06               0       25     2554     1709.05           1.0
4 2013-01-15               0       25     2555     1099.00           1.0
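
A quick sanity check (an added sketch, not part of the original run) confirms the 34 monthly blocks (date_block_num 0-33, 2013-01 through 2015-10) that the windowing below relies on:

print(sales.date_block_num.nunique())    # expected: 34
print(sales.date.min(), sales.date.max())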

Explore data

In [3]:
sns.set(rc={'figure.figsize':(20, 10)})
sns.set_context("talk", font_scale=1.5)
sales_month = pd.DataFrame(sales.groupby(['date_block_num']).sum().item_cnt_day).reset_index()
sales_month.head()
sales_month.columns = ['date_block_num', 'item_sales']
sns.barplot(x ='date_block_num', y='item_sales', data=sales_month);
plt.plot(sales_month.item_sales)
plt.title('Distribution of item sales per month')
del sales_month
In [4]:
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_cnt_day)  # Daily counts per item; exposes the extreme item_cnt_day outlier
Out[4]:
<matplotlib.collections.PathCollection at 0x24f422e6358>
In [5]:
plt.figure(figsize=(20,10))
plt.scatter(sales.item_id, sales.item_price)  # Prices per item; exposes the extreme and negative prices
Out[5]:
<matplotlib.collections.PathCollection at 0x24f5054c358>

Detect and remove outliers

In [6]:
# Find and drop outliers (thresholds read off the scatter plots above)
outliers = sales.loc[(sales.item_cnt_day > 1500) | (sales.item_price > 55000) | (sales.item_price <= 0)]
sales = sales.drop(outliers.index)
print(outliers)
              date  date_block_num  shop_id  item_id     item_price  item_cnt_day
484683  2013-05-15               4       32     2973      -1.000000           1.0
885138  2013-09-17               8       12    11365   59200.000000           1.0
1163158 2013-12-13              11       12     6066  307980.000000           1.0
2909818 2015-10-28              33       12    11373       0.908714        2169.0
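
The thresholds above are read off the scatter plots by eye. A percentile-based alternative (a hypothetical sketch; the cut-offs it picks would differ from the hand-tuned ones) would look like:

# Hypothetical alternative: flag the most extreme counts and prices by quantile
cnt_hi = sales.item_cnt_day.quantile(0.99999)
pri_hi = sales.item_price.quantile(0.99999)
outliers_q = sales.loc[(sales.item_cnt_day > cnt_hi) |
                       (sales.item_price > pri_hi) |
                       (sales.item_price <= 0)]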

Reshape the dataframe

In [7]:
# Aggregate daily sales into monthly counts per (item_id, shop_id), then pivot months into columns
df_cnt = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')), 'item_id', 'shop_id']).sum().reset_index()
df_cnt = df_cnt[['date', 'item_id', 'shop_id', 'item_cnt_day']]
df_cnt = df_cnt.pivot_table(index=['item_id','shop_id'], columns='date', values='item_cnt_day', fill_value=0).reset_index()
df_cnt = pd.merge(test, df_cnt, on=['item_id','shop_id'], how='left').fillna(0.)  # Keep only the (shop, item) pairs in the test set
df_cnt = df_cnt.drop(labels=['ID', 'shop_id', 'item_id'], axis=1)  # Drop the identifier columns
df_cnt.head()
Out[7]:
2013-01 2013-02 2013-03 2013-04 2013-05 2013-06 2013-07 2013-08 2013-09 2013-10 ... 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07 2015-08 2015-09 2015-10
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 1.0 1.0 1.0 3.0 1.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 3.0 2.0 0.0 1.0 3.0 1.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 34 columns
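
Each test (shop_id, item_id) pair now has one row with a full 34-month history, zero-filled where the pair never appeared in training. A shape assertion (added sketch) guards the merge:

assert df_cnt.shape == (len(test), 34)   # one row per test pair, one column per month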

In [8]:
if PRICE_FEATURE:
    # Same reshaping as above, but with the monthly mean price
    df_pri = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')), 'item_id', 'shop_id']).mean().reset_index()
    df_pri = df_pri[['date', 'item_id', 'shop_id', 'item_price']]
    df_pri = df_pri.pivot_table(index=['item_id','shop_id'], columns='date', values='item_price', fill_value=0).reset_index()
    df_pri = pd.merge(test, df_pri, on=['item_id','shop_id'], how='left').fillna(0.)  # Keep only the (shop, item) pairs in the test set
    df_pri = df_pri.drop(labels=['ID', 'shop_id', 'item_id'], axis=1)  # Drop the identifier columns

Normalization

In [9]:
if PRICE_FEATURE:
    pri = df_pri.values
    if NORMALIZATION:
        pri_min = np.min(pri)
        pri_gap = np.max(pri) - pri_min
        pri = (pri - pri_min) / pri_gap  # Min-max scale to [0, 1]
    pri = pri.reshape((pri.shape[0], pri.shape[1], 1))

cnt = df_cnt.values

if NORMALIZATION:
    cnt_min = np.min(cnt)
    cnt_gap = np.max(cnt) - cnt_min
    cnt = (cnt - cnt_min) / cnt_gap  # Min-max scale; inverted after prediction
cnt = cnt.reshape((cnt.shape[0], cnt.shape[1], 1))
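
The prediction cell later inverts the scaling with x = x_scaled * gap + min, so the forward transform here must subtract the minimum as well (the original only divided). A round-trip check (added sketch):

if NORMALIZATION:
    # Scaling then unscaling should recover the original monthly counts
    assert np.allclose(cnt.reshape(cnt.shape[0], -1) * cnt_gap + cnt_min, df_cnt.values)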
In [10]:
if PRICE_FEATURE:
    features = np.append(cnt, pri, axis=2)  # Stack count and price along the feature axis
else:
    features = cnt
print(features.shape)
(214200, 34, 1)

Reshape the training data

In [11]:
# Slide a TIMESTEPS-month window over each series: the window is the input,
# the following month is the label; the last TIMESTEPS months become the test input
train_X = []
train_y = []
test_X = []
end = features.shape[1]
for fea_timeline, cnt_timeline in zip(features, cnt):
    for i in range(end - TIMESTEPS):
        train_X.append(fea_timeline[i:i + TIMESTEPS])
        train_y.append(cnt_timeline[i + TIMESTEPS])
    test_X.append(fea_timeline[end - TIMESTEPS:end])

train_X = np.array(train_X)
train_y = np.array(train_y)
test_X = np.array(test_X)
print('train_X', train_X.shape)
print('train_y', train_y.shape)
print('test_X', test_X.shape)
train_X (1285200, 28, 1)
train_y (1285200, 1)
test_X (214200, 28, 1)
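
The shapes follow from the windowing arithmetic: each of the 214,200 series contributes 34 - TIMESTEPS = 6 (window, label) pairs, hence 214,200 × 6 = 1,285,200 training rows, and one test window per series. As a check (added sketch):

n_series, n_months, n_feats = features.shape
assert train_X.shape == (n_series * (n_months - TIMESTEPS), TIMESTEPS, n_feats)
assert test_X.shape == (n_series, TIMESTEPS, n_feats)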

Build the model

In [17]:
# Build the LSTM model
model = Sequential()
for i, (units, dropout) in enumerate(LAYER_PARAM):  # Add the LSTM layers
    return_sequences = (i != len(LAYER_PARAM) - 1)  # return_sequences=False on the last layer
    if i == 0:  # The first layer requires input_shape
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences,
                       input_shape=(train_X.shape[1], train_X.shape[2])))
    else:
        model.add(LSTM(units, dropout=dropout, return_sequences=return_sequences))
model.add(Dense(1))

# Note: with a single output and axis=-1, this reduces to the per-sample absolute
# error averaged over the batch (an MAE-like value), which is why it does not equal
# sqrt(loss) in the training log below; the loss itself is plain MSE.
def rmse(y_true, y_pred):
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true), axis=-1))

model.compile(optimizer='adam', loss='mse', metrics=[rmse])
model.summary() 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_3 (LSTM)                (None, 28, 64)            16896     
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
=================================================================
Total params: 49,985
Trainable params: 49,985
Non-trainable params: 0
_________________________________________________________________
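
The parameter counts can be verified by hand: a Keras LSTM layer holds 4 * (units * (input_dim + units) + units) weights, one input kernel, recurrent kernel, and bias per gate. A quick arithmetic check (added sketch):

def lstm_params(input_dim, units):
    # 4 gates, each with an input kernel, a recurrent kernel, and a bias
    return 4 * (units * (input_dim + units) + units)

print(lstm_params(1, 64))    # 16896 -- first LSTM, 1 input feature
print(lstm_params(64, 64))   # 33024 -- second LSTM, fed 64 units
print(64 * 1 + 1)            # 65    -- Dense: 64 weights + 1 bias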

Train

In [18]:
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')  # Stop after 3 epochs without val_loss improvement
callbacks_list = [earlystop]
model_info = model.fit(train_X, train_y, callbacks=callbacks_list, **TRAIN_PARAM)
Train on 1156680 samples, validate on 128520 samples
Epoch 1/60
 - 432s - loss: 6.6006 - rmse: 0.3842 - val_loss: 0.5928 - val_rmse: 0.2062
Epoch 2/60
 - 462s - loss: 6.0368 - rmse: 0.3743 - val_loss: 0.7262 - val_rmse: 0.1894
Epoch 3/60
 - 461s - loss: 5.7656 - rmse: 0.3705 - val_loss: 0.5526 - val_rmse: 0.1602
Epoch 4/60
 - 453s - loss: 5.6582 - rmse: 0.3633 - val_loss: 0.6669 - val_rmse: 0.1671
Epoch 5/60
 - 463s - loss: 5.7804 - rmse: 0.3640 - val_loss: 0.7341 - val_rmse: 0.1735
Epoch 6/60
 - 470s - loss: 5.6366 - rmse: 0.3601 - val_loss: 0.6205 - val_rmse: 0.2915
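
val_loss bottomed out at epoch 3 (0.5526); with patience=3 the run stops at epoch 6, leaving the final weights three epochs past the best. If the installed Keras is 2.2.3 or newer (an assumption about the environment), restore_best_weights rolls back automatically; on older versions a ModelCheckpoint achieves the same:

# Sketch, assuming Keras >= 2.2.3 for the restore_best_weights argument
earlystop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)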

Predict and write to CSV

In [19]:
# Get the test set predictions
pred_y = model.predict(test_X)
if NORMALIZATION:
    pred_y = pred_y * cnt_gap + cnt_min  # Invert the min-max scaling

pred_y = pred_y.clip(0., 20.)  # Kaggle clips the true targets into [0, 20]

if ROUND:
    # Snap predictions that lie within 0.05 of an integer
    round_y = lambda x: x if np.abs(x - np.round(x)) > 0.05 else np.round(x)
    vfunc = np.vectorize(round_y)
    pred_y = vfunc(pred_y)

# Write the submission file
preds = pd.DataFrame(pred_y, columns=['item_cnt_month'])
preds.to_csv('submission.csv', index_label='ID')
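
A final check (added sketch) that the file matches what Kaggle expects, i.e. one prediction per test ID with columns ID and item_cnt_month:

sub = pd.read_csv('submission.csv')
assert list(sub.columns) == ['ID', 'item_cnt_month']
assert len(sub) == len(test)   # 214,200 rows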
In [20]:
plt.figure(figsize=(10,4))
plt.plot(model_info.history['rmse'], label='rmse')
plt.plot(model_info.history['val_rmse'], label='val_rmse')
plt.xlabel("Epochs")
plt.ylabel("Root Mean Square Error")
plt.legend()
plt.savefig("Train and Validation RMSE Progression.png")  # Save before show(), which clears the figure
plt.show()
In [21]:
# Show a sample series with its predicted next month
plt.figure(figsize=(10,5))
i = 10  # Sample index
item_sales = np.append(features[i], pred_y[i])
date_block = range(len(item_sales))
plt.plot(date_block, item_sales)
plt.plot(date_block[-1], item_sales[-1], 'bs', label='Prediction')
plt.xlabel("date_block_number")
plt.ylabel("Sales")
plt.legend()
plt.show()