UnparsedFlagAccessError: Trying to access flag --preserve_unused_tokens before flags were parsed. BERT
Question:
I want to use the BERT language model to train a multi-class text classification task.
Previously I trained with an LSTM without any errors, but BERT gives me this error.
I get the following error and I really don’t know how to solve it. Can anyone help me, please?
Unfortunately, there is very little documentation on using BERT with the Keras library.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tensorflow_hub as hub
from bert import tokenization
# Load the pretrained BERT encoder from TF Hub as a trainable Keras layer.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
# Read the vocabulary file path and the lower-casing setting stored on the
# loaded module so the tokenizer is configured from the module itself.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from that vocabulary and casing setting.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to leave room for the two special
    tokens, wrapped in ``[CLS]`` ... ``[SEP]``, converted to ids and padded
    with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row, including ``[CLS]`` and ``[SEP]``.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. Masks are 1 on real tokens and 0
        on padding; segment ids are all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because ``[CLS]`` and
            ``[SEP]`` alone already need two positions.
    """
    if max_len < 2:
        # With max_len < 2 the slice below would use a negative bound and
        # produce rows longer than max_len instead of failing clearly.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    for text in texts:
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        all_tokens.append(
            tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        )
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)

    # Explicit shape and dtype keep the result two-dimensional and integer
    # even when `texts` is empty.
    shape = (len(all_tokens), max_len)
    return (
        np.array(all_tokens, dtype=int).reshape(shape),
        np.array(all_masks, dtype=int).reshape(shape),
        np.zeros(shape, dtype=int),
    )
def build_model(bert_layer, max_len=512, num_classes=3, learning_rate=1e-5):
    """Build and compile a BERT-based softmax classifier.

    Three int32 inputs of length ``max_len`` (word ids, mask, segment ids) are
    fed to ``bert_layer``. The first position of its sequence output is passed
    through two dense layers with dropout and a final softmax layer.

    Args:
        bert_layer: Callable layer that takes ``[word_ids, mask, segment_ids]``
            and returns a ``(pooled_output, sequence_output)`` pair.
        max_len: Length of every input sequence.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` model.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is ignored.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Take the vector at position 0 of every sequence as the classifier input.
    clf_output = sequence_output[:, 0, :]

    # Hidden layers use ReLU: softmax belongs only on the output layer, since
    # applying it to hidden features forces them to sum to one.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
# Cap every encoded row at 150 positions instead of bert_encode's default of 512.
max_len = 150
# NOTE(review): `data` is not defined in this snippet — presumably a DataFrame
# with a `text_cleaned` column; confirm. The result is the 3-tuple of arrays
# returned by bert_encode.
train_input = bert_encode(data.text_cleaned, tokenizer, max_len=max_len)
The error is as follows:
UnparsedFlagAccessError Traceback (most recent call last)
<ipython-input-175-fd64df42591d> in <module>()
1 import sys
2 max_len = 150
----> 3 train_input = bert_encode(o.text_cleaned, tokenizer, max_len=max_len)
4 frames
/usr/local/lib/python3.7/dist-packages/absl/flags/_flagvalues.py in __getattr__(self, name)
496 # get too much noise.
497 logging.error(error_message)
--> 498 raise _exceptions.UnparsedFlagAccessError(error_message)
499
500 def __setattr__(self, name, value):
UnparsedFlagAccessError: Trying to access flag --preserve_unused_tokens before flags were parsed.
Answers:
Based on this issue, you have to downgrade bert-tensorflow to 1.0.1. Check this answer to find a solution. If you are following this tutorial, downgrade bert-tensorflow and use !wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
as suggested, because inside the Python code the author has changed tf.gfile.GFile(vocab_file, "r")
to tf.io.gfile.GFile(vocab_file, "r").
After that, the code runs successfully. Ping me if you want anything else.
Instead of downgrading, you can set FLAGS in your code and keep using the latest version of BERT.
import sys
from absl import flags
# Replace the process arguments and run absl's flag parser over them so that
# later flag reads no longer raise UnparsedFlagAccessError.
# NOTE(review): absl appears to treat argv[0] as the program name, so this
# string is presumably never parsed as a flag and preserve_unused_tokens keeps
# its default value — confirm against the absl documentation.
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)
Good luck!
I want to use the BERT language model to train a multi-class text classification task.
Previously I trained with an LSTM without any errors, but BERT gives me this error.
I get the following error and I really don’t know how to solve it. Can anyone help me, please?
Unfortunately, there is very little documentation on using BERT with the Keras library.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tensorflow_hub as hub
from bert import tokenization
# Load the pretrained BERT encoder from TF Hub as a trainable Keras layer.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
# Read the vocabulary file path and the lower-casing setting stored on the
# loaded module so the tokenizer is configured from the module itself.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from that vocabulary and casing setting.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to leave room for the two special
    tokens, wrapped in ``[CLS]`` ... ``[SEP]``, converted to ids and padded
    with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row, including ``[CLS]`` and ``[SEP]``.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. Masks are 1 on real tokens and 0
        on padding; segment ids are all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because ``[CLS]`` and
            ``[SEP]`` alone already need two positions.
    """
    if max_len < 2:
        # With max_len < 2 the slice below would use a negative bound and
        # produce rows longer than max_len instead of failing clearly.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    for text in texts:
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        all_tokens.append(
            tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        )
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)

    # Explicit shape and dtype keep the result two-dimensional and integer
    # even when `texts` is empty.
    shape = (len(all_tokens), max_len)
    return (
        np.array(all_tokens, dtype=int).reshape(shape),
        np.array(all_masks, dtype=int).reshape(shape),
        np.zeros(shape, dtype=int),
    )
def build_model(bert_layer, max_len=512, num_classes=3, learning_rate=1e-5):
    """Build and compile a BERT-based softmax classifier.

    Three int32 inputs of length ``max_len`` (word ids, mask, segment ids) are
    fed to ``bert_layer``. The first position of its sequence output is passed
    through two dense layers with dropout and a final softmax layer.

    Args:
        bert_layer: Callable layer that takes ``[word_ids, mask, segment_ids]``
            and returns a ``(pooled_output, sequence_output)`` pair.
        max_len: Length of every input sequence.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` model.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is ignored.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Take the vector at position 0 of every sequence as the classifier input.
    clf_output = sequence_output[:, 0, :]

    # Hidden layers use ReLU: softmax belongs only on the output layer, since
    # applying it to hidden features forces them to sum to one.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
# Cap every encoded row at 150 positions instead of bert_encode's default of 512.
max_len = 150
# NOTE(review): `data` is not defined in this snippet — presumably a DataFrame
# with a `text_cleaned` column; confirm. The result is the 3-tuple of arrays
# returned by bert_encode.
train_input = bert_encode(data.text_cleaned, tokenizer, max_len=max_len)
The error is as follows:
UnparsedFlagAccessError Traceback (most recent call last)
<ipython-input-175-fd64df42591d> in <module>()
1 import sys
2 max_len = 150
----> 3 train_input = bert_encode(o.text_cleaned, tokenizer, max_len=max_len)
4 frames
/usr/local/lib/python3.7/dist-packages/absl/flags/_flagvalues.py in __getattr__(self, name)
496 # get too much noise.
497 logging.error(error_message)
--> 498 raise _exceptions.UnparsedFlagAccessError(error_message)
499
500 def __setattr__(self, name, value):
UnparsedFlagAccessError: Trying to access flag --preserve_unused_tokens before flags were parsed.
Based on this issue, you have to downgrade bert-tensorflow to 1.0.1. Check this answer to find a solution. If you are following this tutorial, downgrade bert-tensorflow and use !wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
as suggested, because inside the Python code the author has changed tf.gfile.GFile(vocab_file, "r")
to tf.io.gfile.GFile(vocab_file, "r").
After that, the code runs successfully. Ping me if you want anything else.
Instead of downgrading, you can set FLAGS in your code and keep using the latest version of BERT.
import sys
from absl import flags
# Replace the process arguments and run absl's flag parser over them so that
# later flag reads no longer raise UnparsedFlagAccessError.
# NOTE(review): absl appears to treat argv[0] as the program name, so this
# string is presumably never parsed as a flag and preserve_unused_tokens keeps
# its default value — confirm against the absl documentation.
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)
Good luck!