Keras Reshape Layer Error: Total Size of New Array Must Be Unchanged

Question:

This is my model:

import torch
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import mediapipe as mp
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# Define the path to your training and testing data directories
train_data_dir = 'train_frames'
test_data_dir = 'test_frames'

# Extract action names (class names) from subfolder names in the training data directory
class_names = sorted(os.listdir(train_data_dir))

# Determine the number of classes
num_classes = len(class_names)

# Parameters
image_size = (240, 240)
batch_size = 32
epochs = 1

train_datagen = ImageDataGenerator(
    rotation_range=15,
    horizontal_flip=True
)

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical'
)

# Initialize MediaPipe holistic module
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Function to perform MediaPipe detection
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False  # Image is no longer writeable
    results = model.process(image)  # Make prediction
    image.flags.writeable = True  # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
    return image, results


# Function to draw styled landmarks
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)  # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)  # Draw right hand connections

# Function to draw styled landmarks with custom styles
def draw_styled_landmarks(image, results):
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
                              )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
                              )

# Function to extract keypoints from results
def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in
                   results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in
                   results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([lh, rh])


# Open the webcam for capturing video
cap = cv2.VideoCapture(0)

# Set up MediaPipe holistic model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        # Read frame from the webcam
        ret, frame = cap.read()
        if not ret:  # Stop if the frame could not be read
            break

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


# Define the neural network architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(240, 240, 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Reshape((30, -1)),  # Reshape to (timesteps, input_features)
    tf.keras.layers.LSTM(264, return_sequences=True),  # First LSTM layer with return_sequences=True
    tf.keras.layers.LSTM(128, return_sequences=True),  # Second LSTM layer, also returning sequences
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(128, return_sequences=False),  # Third LSTM layer with return_sequences=False (default)
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_generator, epochs=epochs)

# Save the trained model
model.save('cnn_rnn_model.h5')

print("Training complete. Model saved as 'cnn_rnn_model.h5'")


def prob_viz(res, actions, input_frame):
    output_frame = input_frame.copy()
    for num, (action, prob) in enumerate(zip(actions, res)):
        cv2.putText(output_frame, f'{action}: {prob:.2f}', (10, 30 + num * 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    return output_frame


sequence = []
sentence = []
predictions = []
threshold = 0.5

cap = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # Stop if the frame could not be read
            break
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert the frame to grayscale
        image, results = mediapipe_detection(gray_frame, holistic)
        draw_styled_landmarks(image, results)
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        if len(sequence) == 30:
            action_probabilities = model.predict(np.expand_dims(sequence, axis=0))[0]
            predicted_action = class_names[np.argmax(action_probabilities)]
            print(predicted_action)
            predictions.append(np.argmax(action_probabilities))
            if np.unique(predictions[-10:])[0] == np.argmax(action_probabilities):
                if action_probabilities[np.argmax(action_probabilities)] > threshold:
                    if len(sentence) > 0:
                        if predicted_action != sentence[-1]:
                            sentence.append(predicted_action)
                    else:
                        sentence.append(predicted_action)
            if len(sentence) > 5:
                sentence = sentence[-5:]
            cv2.putText(image, ' '.join(sentence), (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

The input images are grayscale, 240x240 with 1 channel, and each video has 200 frames. I don't know how to write the input shape for the LSTM layer as (timesteps, input_features). I could use some assistance with my neural network.
I am building a real-time sign language detection system using video sequences.
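
To make the shapes concrete, here is a minimal sketch with a dummy NumPy array standing in for one of my videos (if every flattened frame is treated as one timestep, this appears to give the (200, 57600) layout that shows up in the traceback below):

import numpy as np

# One grayscale video: 200 frames of 240x240 pixels, 1 channel
video = np.zeros((200, 240, 240, 1), dtype=np.float32)

# An LSTM expects each sample as (timesteps, input_features).
# If every flattened frame is one timestep, that would be:
sequence = video.reshape(200, 240 * 240)
print(sequence.shape)  # (200, 57600)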

tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE SSE2 SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
  File "C:UsersAdministratorDesktopSign language RecognitionModel.py", line 105, in <module>
    model = tf.keras.models.Sequential([
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:Program FilesPython311Libsite-packagestensorflowpythontrackablebase.py", line 204, in _method_wrapper
    result = method(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:Program FilesPython311Libsite-packageskerassrcutilstraceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:Program FilesPython311Libsite-packageskerassrclayersreshapingreshape.py", line 118, in _fix_unknown_dimension
    raise ValueError(msg)
ValueError: Exception encountered when calling layer "reshape" (type Reshape).

total size of new array must be unchanged, input_shape = [115200], output_shape = [200, 57600]

Call arguments received by layer "reshape" (type Reshape):
  • inputs=tf.Tensor(shape=(None, 115200), dtype=float32)

Process finished with exit code 1

Asked By: BASIL K AJI


Answers:

After your last MaxPooling layer you have a data shape of (30, 30, 128): with padding='same', each of the three pooling layers halves the spatial size, 240 → 120 → 60 → 30. The Flatten layer turns that into (115200,), which matches the input_shape = [115200] in your traceback. The reshape target shown there, [200, 57600], would need 200 * 57600 = 11,520,000 values, so the total size is not unchanged and Keras raises the error. Choose a target shape whose element count equals 115200; for example, 30 timesteps works, since 115200 / 30 = 3840.
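
As a quick sanity check (a minimal sketch using the layer sizes from the posted model), you can verify the element counts before building the model:

flat_size = 30 * 30 * 128          # Flatten output per sample: 115200 values
timesteps = 30

# A reshape is only valid if the total element count is unchanged
assert flat_size % timesteps == 0, "timesteps must divide the flattened size"
features = flat_size // timesteps  # 115200 // 30 = 3840

# Equivalent options inside the model:
#   tf.keras.layers.Reshape((timesteps, features))
#   tf.keras.layers.Reshape((timesteps, -1))  # Keras infers 3840
print(timesteps, features)  # 30 3840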

Answered By: mhenning