LSTM train data resample

Question:

I try train LSTM model with time series data.

If I load it as they are everything is ok. Model work with good accuracy and loss function.

But if I resample it for example 1H or 24H it stop work. Accuracy drops extremely down and lost function is nan

Epoch 5/5
110/110 [==============================] - 4s 32ms/step - loss: nan - accuracy: 0.4437 - val_loss: nan - val_accuracy: 0.3453

What I am doing wrong ?

My loading function:

def readCSV(path, candleTime):
    # Load data
    #print("Loading: " + path)
    data = pd.read_csv(path, names=['Date_Time', 'open', 'high', 'low', 'close', 'volume'], sep=";", index_col=0)

    # Convert the index to datetime
    data.index = pd.to_datetime(data.index, format='%Y%m%d %H%M%S%f')

    if(candleTime == CandleTime.hour):
        data = data.resample('1H').agg({'open': 'first', 
                                 'high': 'max', 
                                 'low': 'min', 
                                 'close': 'last'})
    if(candleTime == CandleTime.day):
        data = data.resample('24H').agg({'open': 'first', 
                            'high': 'max', 
                            'low': 'min', 
                            'close': 'last'})
            
    return data

Edit

Whole train function.
Note: that now I have only one iteration.

def trainModel(trainCandles, prediction_minutes = 60, model_name = 'lstm_1m_10_model'):
tf.keras.backend.clear_session()
#Prepare Data
print("Preparing data..")
x_train = []
y_train = []
normalizedCandles = trainCandles[['open', 'high', 'low', 'close']].to_numpy(copy=True)
for x in range(prediction_minutes, len(normalizedCandles)):
    xdata = normalizedCandles[x-prediction_minutes:x]
    predictionData = []
    for candleX in xdata:
        predictionData.append([candleX[0], candleX[1], candleX[2], candleX[3]])
    candleY = normalizedCandles[x]
    x_train.append(predictionData)
    y_train.append([candleY[0], candleY[1], candleY[2], candleY[3]])

print("Spliting..")
# split train and test
x_toSplit, y_toSplit = x_train, y_train
sizeOf70percentage = int(len(x_toSplit)/100*70)
x_test = np.array(x_toSplit[sizeOf70percentage:len(x_toSplit)])
y_test = np.array(y_toSplit[sizeOf70percentage:len(x_toSplit)])
x_train = np.array(x_toSplit[0: sizeOf70percentage])
y_train = np.array(y_toSplit[0: sizeOf70percentage])


print("Total size of samples: " + str(len(x_train)))
model=None

if (os.path.isdir(model_name)): # you won't have a model for first iteration
    print("Loading model..")
    model = load_model(model_name)
else:
    print("Creatng model..")
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50))
    model.add(Dropout(0.2))
    model.add(Dense(units=4))
    model.compile(optimizer='Adam', loss='mean_squared_error', metrics=["accuracy"])


history = model.fit(
    x_train, 
    y_train, 
    validation_data=(x_test, y_test), 
    epochs=5, 
    batch_size=32)

model.save(model_name)
Asked By: Luboš Hájek

||

Answers:

My problem was input data which contained null data. dropna() was solution for me.

Answered By: Luboš Hájek