【Question Title】: How to generate word embeddings in Portuguese using Gensim?
【Posted】: 2023-11-16 17:40:01
【Question Description】:

I have the following problem:

In English, my code generates word embeddings successfully with Gensim, and, judging by cosine distance, similar phrases end up close to each other:

The angle between "Response time and error measurement" and "Relation of user perceived response time to error measurement" is very small, so they are the most similar phrases in the set.
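What I mean by "close" is that the cosine of the angle between the two phrases' 2-dimensional LSI coordinates is near 1. A minimal sketch with purely illustrative vectors (not the real coordinates from my run):

import numpy as np

# illustrative 2-D LSI coordinates only, not the actual values
v1 = np.array([0.46, 0.20])  # "Response time and error measurement"
v2 = np.array([0.40, 0.17])  # "Relation of user perceived response time to error measurement"

# cosine similarity is close to 1.0 when the angle between the vectors is small
cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos_sim)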

However, when I use the same phrases in Portuguese, it does not work:

My code is as follows:

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import matplotlib.pyplot as plt
from gensim import corpora
documents = ["Interface máquina humana para aplicações computacionais de laboratório abc",
          "Um levantamento da opinião do usuário sobre o tempo de resposta do sistema informático",
           "O sistema de gerenciamento de interface do usuário EPS",
           "Sistema e testes de engenharia de sistemas humanos de EPS",
           "Relação do tempo de resposta percebido pelo usuário para a medição de erro",
           "A geração de árvores não ordenadas binárias aleatórias",
           "O gráfico de interseção dos caminhos nas árvores",
           "Gráfico de menores IV Largura de árvores e bem quase encomendado",
           "Gráficos menores Uma pesquisa"]

# filter out stopwords (the stoplist here contains English stopwords)
stoplist = set('for a of the and to in on'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
texts

from collections import defaultdict
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1
frequency

from nltk import tokenize
# re-tokenize the raw documents with NLTK's Portuguese tokenizer
# (this overwrites the stoplist-filtered texts above)
texts = [tokenize.word_tokenize(doc, language='portuguese') for doc in documents]

from pprint import pprint
pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary)

print(dictionary.token2id)


# BAG-OF-WORDS VECTOR OF A NEW QUERY PHRASE
new_doc = "Tempo de resposta e medição de erro"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

## VECTORS OF THE PHRASES
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  
print(corpus)

from gensim import corpora, models, similarities
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

### PHRASE COORDINATES
frase=tfidf[new_vec]
print(frase)

corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]

lsi.print_topics(2)

## TEXT COORDINATES
todas=[]
for doc in corpus_lsi:
    todas.append(doc)
todas

from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

doc = new_doc
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)

# LSI coordinates of every document
p = []
for doc1 in documents:
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2]
    p.append(vec_lsi2)

p

index = similarities.MatrixSimilarity(lsi[corpus])

index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

sims = index[vec_lsi]
print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims) 

#################

import gensim
import numpy as np
import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib as mpl

matrix1 = gensim.matutils.corpus2dense(p, num_terms=2)
matrix3=matrix1.T
matrix3[0]
ss=[]
for i in range(0,9):
    ss.append(np.insert(matrix3[i],0,[0,0]))
matrix4=ss
matrix4

matrix2 = gensim.matutils.corpus2dense([vec_lsi], num_terms=2)
matrix2=np.insert(matrix2,0,[0,0])
matrix2

DATA=np.insert(matrix4,0,matrix2)
DATA=DATA.reshape(10,4)
DATA

names=np.array(documents)
names=np.insert(names,0,new_doc)
new_doc
cmap = plt.cm.jet

cNorm  = colors.Normalize(vmin=np.min(DATA[:,3])+.2, vmax=np.max(DATA[:,3]))

scalarMap = cmx.ScalarMappable(norm=cNorm,cmap=cmap)
len(DATA[:,1])

plt.subplots()
plt.figure(figsize=(12,9))
plt.scatter(matrix1[0],matrix1[1],s=60)
plt.scatter(matrix2[2],matrix2[3],color='r',s=95)
for idx in range(0,len(DATA[:,1])):
    colorVal = scalarMap.to_rgba(DATA[idx,3])
    plt.arrow(DATA[idx,0],
          DATA[idx,1], 
          DATA[idx,2], 
          DATA[idx,3], 
          color=colorVal,head_width=0.002, head_length=0.001)
for i, name in enumerate(names):
    plt.annotate(name, (DATA[i][2], DATA[i][3]), va='top')
plt.title("PHRASE SIMILARITY - WORD2VEC with GENSIM library")
plt.xlim(min(DATA[:,2]-.2),max(DATA[:,2]+1))
plt.ylim(min(DATA[:,3]-.2),max(DATA[:,3]+.3))
plt.show()

My question is: is there any extra setting needed for Gensim to generate correct Portuguese word embeddings, or does Gensim simply not support this language?
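For example, is something along these lines required, replacing the English stoplist above with NLTK's Portuguese stopword list? (Just a sketch of the kind of setting I mean, assuming the NLTK stopwords corpus has been downloaded.)

from nltk.corpus import stopwords

# nltk.download('stopwords')  # needed once
stoplist_pt = set(stopwords.words('portuguese'))
texts = [[word for word in document.lower().split() if word not in stoplist_pt]
         for document in documents]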

【Question Comments】:

    Tags: python nlp nltk gensim


    【Solution 1】:

    One year and ten months later, I got the answer myself: use BERT embeddings in PyTorch.

    The phrases:

    I adapted PyTorch's extract_features.py from https://github.com/ethanjperez/pytorch-pretrained-BERT/blob/master/examples/extract_features.py
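
    The class below needs the imports and the helper functions read_examples and convert_examples_to_features from that script; a minimal sketch of the setup, assuming the pytorch-pretrained-bert package is installed and the two helpers are copied verbatim from the linked file:

    import collections
    import json
    import logging
    import torch
    from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
    from torch.utils.data.distributed import DistributedSampler
    from pytorch_pretrained_bert.tokenization import BertTokenizer
    from pytorch_pretrained_bert.modeling import BertModel

    # read_examples and convert_examples_to_features are assumed to be copied
    # verbatim from the linked extract_features.py into this same module
    logger = logging.getLogger(__name__)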

    class Main:
        def main(self,input_file,output_file):
            self.input_file=input_file
            self.output_file=output_file
            self.bert_model='bert-base-multilingual-uncased'
            self.do_lower_case=True
            self.layers="-1"
            self.max_seq_length=128
            self.batch_size=32
            self.local_rank=-1
            self.no_cuda=False
    
            if self.local_rank == -1 or self.no_cuda:
                device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu")
                n_gpu = torch.cuda.device_count()
            else:
                device = torch.device("cuda", self.local_rank)
                n_gpu = 1
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
                torch.distributed.init_process_group(backend='nccl')
            logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(self.local_rank != -1)))
    
            layer_indexes = [int(x) for x in self.layers.split(",")]
    
            tokenizer = BertTokenizer.from_pretrained(self.bert_model, do_lower_case=self.do_lower_case)
    
            examples = read_examples(self.input_file)
    
            features = convert_examples_to_features(
                examples=examples, seq_length=self.max_seq_length, tokenizer=tokenizer)
    
            unique_id_to_feature = {}
            for feature in features:
                unique_id_to_feature[feature.unique_id] = feature
    
            model = BertModel.from_pretrained(self.bert_model)
            model.to(device)
    
            if self.local_rank != -1:
                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[self.local_rank],
                                                                output_device=self.local_rank)
            elif n_gpu > 1:
                model = torch.nn.DataParallel(model)
    
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
            if self.local_rank == -1:
                eval_sampler = SequentialSampler(eval_data)
            else:
                eval_sampler = DistributedSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size)
    
            model.eval()
            with open(self.output_file, "w", encoding='utf-8') as writer:
                for input_ids, input_mask, example_indices in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
    
                    all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
                    all_encoder_layers = all_encoder_layers
    
                    for b, example_index in enumerate(example_indices):
                        feature = features[example_index.item()]
                        unique_id = int(feature.unique_id)
                        # feature = unique_id_to_feature[unique_id]
                        output_json = collections.OrderedDict()
                        output_json["linex_index"] = unique_id
                        all_out_features = []
                        for (i, token) in enumerate(feature.tokens):
                            all_layers = []
                            for (j, layer_index) in enumerate(layer_indexes):
                                layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                                layer_output = layer_output[b]
                                layers = collections.OrderedDict()
                                layers["index"] = layer_index
                                print(layer_output.shape)
                                layers["values"] = [
                                    round(x.item(), 6) for x in layer_output[i]
                                ]
                                all_layers.append(layers)
                            out_features = collections.OrderedDict()
                            out_features["token"] = token
                            out_features["layers"] = all_layers
                            all_out_features.append(out_features)
                        output_json["features"] = all_out_features
                        writer.write(json.dumps(output_json) + "\n")
    

    Then run:

    # 'extrair' is the module containing the adapted script above;
    # the input file is expected to hold the phrases, one per line
    embeddings = extrair.Main()
    embeddings.main(input_file='gensim.csv', output_file='gensim.json')
    

    Parse the JSON file:

    import json
    from pprint import pprint
    import numpy as np
    
    data = [json.loads(line) for line in open('gensim.json', 'r')]
    
    # average the last-layer token vectors to get one embedding per phrase
    xx = []
    for parte in range(0, len(data)):
        xx.append(np.mean([data[parte]['features'][i]['layers'][0]['values']
                           for i in range(0, len(data[parte]['features']))], axis=0))
    
    from scipy.spatial.distance import cosine as cos

    # cosine distance (1 - cosine similarity) between phrase index 2 and each phrase
    for i in range(0, len(xx)):
        print(cos(xx[2], xx[i]))
    

    And get the output (the cosine distance between phrase 2 and each of the phrases):

    【Comments】: