Problem completing BERT model for sentiment classification

Question:

I am trying to implement sentiment classification on movie reviews using BERT, transformers, and tensorflow. This is the code I currently have:

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel


def read_dataset(filename, model_name="bert-base-uncased"):
    """Reads a dataset from the specified path and returns sentences and labels"""

    tokenizer = BertTokenizer.from_pretrained(model_name)
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
        # preallocate memory for the data
        sents, labels = list(), np.empty((len(lines), 1), dtype=int)

        for i, line in enumerate(lines):
            text, str_label, _ = line.split("\t")
            labels[i] = int(str_label.split("=")[1] == "POS")
            sents.append(text)
    return dict(tokenizer(sents, padding=True, truncation=True, return_tensors="tf")), labels


class BertMLP(tf.keras.Model):
    def __init__(self, embed_batch_size=100, model_name="bert-base-cased"):
        super(BertMLP, self).__init__()
        self.bs = embed_batch_size
        self.model = TFBertModel.from_pretrained(model_name)
        self.classification_head = tf.keras.models.Sequential(
            layers = [
                tf.keras.Input(shape=(self.model.config.hidden_size,)),
                tf.keras.layers.Dense(350, activation="tanh"),
                tf.keras.layers.Dense(200, activation="tanh"),
                tf.keras.layers.Dense(50, activation="tanh"),
                tf.keras.layers.Dense(1, activation="sigmoid", use_bias=False)
            ]
        )

    def call(self, inputs):
        outputs = self.model(inputs)
        return outputs

def evaluate(model, inputs, labels, loss_func):
    mean_loss = tf.keras.metrics.Mean(name="train_loss")
    accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")

    predictions = model(inputs)
    mean_loss(loss_func(labels, predictions))
    accuracy(labels, predictions)

    return mean_loss.result(), accuracy.result() * 100


if __name__ == "__main__":
    train = read_dataset("datasets/rt-polarity.train.vecs")
    dev = read_dataset("datasets/rt-polarity.dev.vecs")
    test = read_dataset("datasets/rt-polarity.test.vecs")

    mlp = BertMLP()
    mlp.compile(tf.keras.optimizers.SGD(learning_rate=0.01), loss='mse')
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("Before training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
    mlp.fit(*train, epochs=10, batch_size=10)
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("After training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")

However, when I run this code, I get an error:

Traceback (most recent call last):

  File "C:Usershomeanaconda3libsite-packagesspyder_kernelspy3compat.py", line 356, in compat_exec
    exec(code, globals, locals)

  File "c:usershomedownloadsmlp.py", line 60, in <module>
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())

  File "c:usershomedownloadsmlp.py", line 46, in evaluate
    predictions = model(inputs)

  File "C:Usershomeanaconda3libsite-packageskerasutilstraceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None

  File "c:usershomedownloadsmlp.py", line 39, in call
    outputs = self.model(inputs)

  File "C:Usershomeanaconda3libsite-packagestransformersmodeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
    return func(self, **unpacked_inputs)

  File "C:Usershomeanaconda3libsite-packagestransformersmodelsbertmodeling_tf_bert.py", line 1108, in call
    outputs = self.bert(

  File "C:Usershomeanaconda3libsite-packagestransformersmodeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
    return func(self, **unpacked_inputs)

  File "C:Usershomeanaconda3libsite-packagestransformersmodelsbertmodeling_tf_bert.py", line 781, in call
    embedding_output = self.embeddings(

  File "C:Usershomeanaconda3libsite-packagestransformersmodelsbertmodeling_tf_bert.py", line 203, in call
    inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFBertEmbeddings).

indices[1174,8] = 29550 is not in [0, 28996) [Op:ResourceGather]

Call arguments received:
  • input_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

I googled for a while and couldn't find anything conclusive. I am pretty sure it has something to do with this part:

def call(self, inputs):
    outputs = self.model(inputs)
    return outputs

I have also tried a lot of different things, including limiting the dataset size and installing different versions of transformers and tensorflow, but to no avail. Please let me know what I’m doing wrong. Thank you!

Asked By: Rewaster


Answers:

OP was using bert-base-cased for the model (the default in BertMLP) but bert-base-uncased for the tokenizer (the default in read_dataset). The uncased tokenizer has a vocabulary of 30522 tokens, while the cased model's embedding matrix has only 28996 rows, so any token id of 28996 or above (here, 29550) is out of range for the embedding lookup, which is exactly what the InvalidArgumentError reports. The fix is to pass the same checkpoint name to both.
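A minimal sketch of the fix, keeping the rest of the code as posted (MODEL_NAME is just an illustrative constant; either checkpoint works, as long as the tokenizer and the model match):

MODEL_NAME = "bert-base-uncased"  # one name for both tokenizer and model

train = read_dataset("datasets/rt-polarity.train.vecs", model_name=MODEL_NAME)
dev = read_dataset("datasets/rt-polarity.dev.vecs", model_name=MODEL_NAME)
test = read_dataset("datasets/rt-polarity.test.vecs", model_name=MODEL_NAME)

mlp = BertMLP(model_name=MODEL_NAME)

You can also confirm the mismatch directly, since both the tokenizer and the model report their vocabulary size:

from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertModel.from_pretrained("bert-base-cased")

print(tokenizer.vocab_size)     # 30522 (uncased vocabulary)
print(model.config.vocab_size)  # 28996 (cased vocabulary)
# Any id in [28996, 30522) emitted by the uncased tokenizer, such as
# the 29550 in the traceback, is out of range for the cased embedding.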

Answered By: DWKOT