How to generate word embeddings in Portuguese using Gensim?

Question:

I have the following problem:

In English, my code successfully generates word embeddings with Gensim, and similar phrases end up close to each other in terms of cosine distance:

The angle between “Response time and error measurement” and “Relation of user perceived response time to error measurement” is very small, thus they are the most similar phrases in the set.

[figure: 2-D plot of the English phrase vectors, with the two similar phrases close together]
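
For reference, the cosine similarity behind that comparison can be computed directly from two phrase vectors; a minimal NumPy sketch (the vectors u and v below are illustrative placeholders, not the actual coordinates from the plot):

import numpy as np

def cosine_similarity(u, v):
    # cos(theta) = (u . v) / (|u| * |v|); values near 1.0 mean a very small angle
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# hypothetical 2-D coordinates for two phrases
u = np.array([0.95, 0.10])
v = np.array([0.90, 0.12])
print(cosine_similarity(u, v))  # close to 1.0 -> very similar phrases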

However, when I use the same phrases in Portuguese, it doesn’t work:

[figure: 2-D plot of the Portuguese phrase vectors, where the similar phrases are not close together]

My code is as follows:

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import matplotlib.pyplot as plt
from gensim import corpora
documents = ["Interface máquina humana para aplicações computacionais de laboratório abc",
          "Um levantamento da opinião do usuário sobre o tempo de resposta do sistema informático",
           "O sistema de gerenciamento de interface do usuário EPS",
           "Sistema e testes de engenharia de sistemas humanos de EPS",
           "Relação do tempo de resposta percebido pelo usuário para a medição de erro",
           "A geração de árvores não ordenadas binárias aleatórias",
           "O gráfico de interseção dos caminhos nas árvores",
           "Gráfico de menores IV Largura de árvores e bem quase encomendado",
           "Gráficos menores Uma pesquisa"]

# NOTE: this stoplist contains only English stopwords even though the documents
# are Portuguese (see the Portuguese stopword sketch after this script)
stoplist = set('for a of the and to in on'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
texts

from collections import defaultdict
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1
frequency

# retokenize the raw documents with NLTK's Portuguese tokenizer
# (note: this overwrites the stoplist-filtered texts built above)
from nltk import tokenize
texts = [tokenize.word_tokenize(documents[i], language='portuguese') for i in range(0, len(documents))]

from pprint import pprint
pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary)

print(dictionary.token2id)


# VECTOR
new_doc = "Tempo de resposta e medição de erro"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

## VECTOR OF PHRASES
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  
print(corpus)

from gensim import corpora, models, similarities
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

### PHRASE COORDINATES
frase=tfidf[new_vec]
print(frase)

corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]

lsi.print_topics(2)

## TEXT COORDINATES
todas=[]
for doc in corpus_lsi:
    todas.append(doc)
todas

from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

doc = new_doc
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)

p=[]
for i in range(0,len(documents)):
    doc1 = documents[i]
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2]
    p.append(vec_lsi2)

p

index = similarities.MatrixSimilarity(lsi[corpus])

index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

sims = index[vec_lsi]
print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims) 

#################

import gensim
import numpy as np
import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib as mpl

# convert the sparse LSI vectors into a dense 2 x N matrix, then prepend a
# (0, 0) origin to each phrase so it can be drawn as an arrow from the origin
matrix1 = gensim.matutils.corpus2dense(p, num_terms=2)
matrix3 = matrix1.T
matrix3[0]
ss = []
for i in range(0, 9):
    ss.append(np.insert(matrix3[i], 0, [0, 0]))
matrix4 = ss
matrix4

matrix2 = gensim.matutils.corpus2dense([vec_lsi], num_terms=2)
matrix2=np.insert(matrix2,0,[0,0])
matrix2

DATA=np.insert(matrix4,0,matrix2)
DATA=DATA.reshape(10,4)
DATA

names=np.array(documents)
names=np.insert(names,0,new_doc)
new_doc
cmap = plt.cm.jet

cNorm  = colors.Normalize(vmin=np.min(DATA[:,3])+.2, vmax=np.max(DATA[:,3]))

scalarMap = cmx.ScalarMappable(norm=cNorm,cmap=cmap)
len(DATA[:,1])

plt.subplots()
plt.figure(figsize=(12,9))
plt.scatter(matrix1[0],matrix1[1],s=60)
plt.scatter(matrix2[2],matrix2[3],color='r',s=95)
for idx in range(0,len(DATA[:,1])):
    colorVal = scalarMap.to_rgba(DATA[idx,3])
    plt.arrow(DATA[idx,0],
          DATA[idx,1], 
          DATA[idx,2], 
          DATA[idx,3], 
          color=colorVal,head_width=0.002, head_length=0.001)
for i, name in enumerate(names):
    plt.annotate(name, (DATA[i][2], DATA[i][3]), va='top')
plt.title("PHRASE SIMILARITY - LSI with GENSIM library")
plt.xlim(min(DATA[:,2]-.2),max(DATA[:,2]+1))
plt.ylim(min(DATA[:,3]-.2),max(DATA[:,3]+.3))
plt.show()
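
As an aside, the stoplist in the script above contains only English stopwords even though the documents are Portuguese. A minimal sketch of filtering with NLTK's Portuguese stopword list instead (assumes the NLTK stopwords corpus is available):

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # one-time download of the stopword corpus
stoplist = set(stopwords.words('portuguese'))  # includes 'de', 'a', 'o', 'para', ...
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]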

My question is: is there any additional setup required for Gensim to generate proper word embeddings in Portuguese, or does Gensim not support this language?

Asked By: razimbres


Answers:

One year and 10 months later, I found the answer myself: use BERT embeddings in PyTorch. The bert-base-multilingual-uncased model covers Portuguese, so the same extraction pipeline works across languages:

Phrases:

[image: the list of test phrases used]

I adapted the PyTorch extract_features.py script from https://github.com/ethanjperez/pytorch-pretrained-BERT/blob/master/examples/extract_features.py:

import collections
import json
import logging

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

# read_examples and convert_examples_to_features are helper functions
# defined in the original extract_features.py script linked above
logger = logging.getLogger(__name__)

class Main:
    def main(self,input_file,output_file):
        self.input_file=input_file
        self.output_file=output_file
        self.bert_model='bert-base-multilingual-uncased'
        self.do_lower_case=True
        self.layers="-1"
        self.max_seq_length=128
        self.batch_size=32
        self.local_rank=-1
        self.no_cuda=False

        if self.local_rank == -1 or self.no_cuda:
            device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1
            # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
            torch.distributed.init_process_group(backend='nccl')
        logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(self.local_rank != -1)))

        layer_indexes = [int(x) for x in self.layers.split(",")]

        tokenizer = BertTokenizer.from_pretrained(self.bert_model, do_lower_case=self.do_lower_case)

        examples = read_examples(self.input_file)

        features = convert_examples_to_features(
            examples=examples, seq_length=self.max_seq_length, tokenizer=tokenizer)

        unique_id_to_feature = {}
        for feature in features:
            unique_id_to_feature[feature.unique_id] = feature

        model = BertModel.from_pretrained(self.bert_model)
        model.to(device)

        if self.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[self.local_rank],
                                                            output_device=self.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
        if self.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size)

        model.eval()
        with open(self.output_file, "w", encoding='utf-8') as writer:
            for input_ids, input_mask, example_indices in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)

                all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

                for b, example_index in enumerate(example_indices):
                    feature = features[example_index.item()]
                    unique_id = int(feature.unique_id)
                    # feature = unique_id_to_feature[unique_id]
                    output_json = collections.OrderedDict()
                    output_json["linex_index"] = unique_id
                    all_out_features = []
                    for (i, token) in enumerate(feature.tokens):
                        all_layers = []
                        for (j, layer_index) in enumerate(layer_indexes):
                            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                            layer_output = layer_output[b]
                            layers = collections.OrderedDict()
                            layers["index"] = layer_index
                            print(layer_output.shape)
                            layers["values"] = [
                                round(x.item(), 6) for x in layer_output[i]
                            ]
                            all_layers.append(layers)
                        out_features = collections.OrderedDict()
                        out_features["token"] = token
                        out_features["layers"] = all_layers
                        all_out_features.append(out_features)
                    output_json["features"] = all_out_features
                    writer.write(json.dumps(output_json) + "\n")

And then run:

# 'extrair' is the module that contains the Main class above;
# gensim.csv holds the input phrases, one per line
embeddings = extrair.Main()
embeddings.main(input_file='gensim.csv', output_file='gensim.json')

Parsing the JSON file:

import json
from pprint import pprint
import numpy as np

# each line of the output file is one JSON record per input phrase
data = [json.loads(line) for line in open('gensim.json', 'r')]

# build one vector per phrase by mean-pooling its token vectors from the last layer
xx = []
for parte in range(0, len(data)):
    xx.append(np.mean([data[parte]['features'][i]['layers'][0]['values']
                       for i in range(0, len(data[parte]['features']))], axis=0))

from scipy.spatial.distance import cosine as cos

# scipy's cosine() returns the cosine *distance* (1 - similarity),
# so smaller values mean more similar phrases
for i in range(0, len(xx)):
    print(cos(xx[2], xx[i]))

Getting as output:

[image: cosine distances between the reference phrase and each phrase in the set]
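
To rank the phrases by similarity instead of distance, the same distances can be converted and sorted; a small sketch reusing the variables above:

# cosine similarity = 1 - cosine distance; sort most-similar first
ranked = sorted(((1 - cos(xx[2], xx[i]), i) for i in range(len(xx))), reverse=True)
for score, i in ranked:
    print(i, round(score, 4))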

Answered By: razimbres