下面的方法计算量非常大:它先在 Series 的所有元素两两之间计算 normalized similarity score,然后把每行得到的相似度列表转换为字符串,并按该字符串对元素进行分组。具体如下:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
def convert_tag(tag):
    """Convert a Penn Treebank POS tag to a WordNet POS constant.

    Args:
        tag: Penn Treebank tag string (e.g. 'NN', 'VBD').

    Returns:
        One of 'n', 'a', 'r', 'v' for noun/adjective/adverb/verb tags,
        or None for tags with no WordNet equivalent (including an
        empty tag string).
    """
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    # dict.get replaces the try/except KeyError idiom; the truthiness
    # guard also fixes the uncaught IndexError the original raised
    # when tag was an empty string.
    return tag_dict.get(tag[0]) if tag else None
def doc_to_synsets(doc):
    """Return the first WordNet synset for each word in *doc*.

    Tokenizes and POS-tags the document, then looks up the first synset
    for each (word, converted tag) combination. Words for which no
    synset is found are skipped.

    Args:
        doc: string to be converted

    Returns:
        list of synsets

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'),
              Synset('friend.n.01')]
    """
    tokens = nltk.word_tokenize(doc)
    synsets = []
    for word, tag in nltk.pos_tag(tokens):
        # Only the first (most common) synset is kept. The explicit
        # emptiness check replaces the original bare `except:`, which
        # silently swallowed *every* error, not just the failed [0].
        matches = wn.synsets(word, convert_tag(tag))
        if matches:
            synsets.append(matches[0])
    return synsets
def similarity_score(s1, s2):
    """Calculate the normalized similarity score of s1 onto s2.

    For each synset in s1, finds the synset in s2 with the largest path
    similarity, then averages those per-synset maxima.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2, or 0 when no pair
        of synsets is comparable (including empty inputs)

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    best_scores = []
    for syn1 in s1:
        best = 0
        for syn2 in s2:
            # path_similarity returns None for incomparable synsets;
            # checking explicitly replaces the original bare `except:`
            # that hid the resulting `None > int` TypeError (and would
            # have hidden any genuine bug as well).
            score = syn1.path_similarity(syn2)
            if score is not None and score > best:
                best = score
        if best > 0:
            best_scores.append(best)
    return sum(best_scores) / len(best_scores) if best_scores else 0
def document_path_similarity(doc1, doc2):
    """Symmetric similarity between two documents.

    Converts each document to its synset list, then averages the
    similarity of doc1 onto doc2 with the similarity of doc2 onto doc1.
    """
    first = doc_to_synsets(doc1)
    second = doc_to_synsets(doc2)
    forward = similarity_score(first, second)
    backward = similarity_score(second, first)
    return (forward + backward) / 2
def similarity(x, df):
    """Return the similarity of question *x* against every entry of
    df['Questions'], as a list in row order."""
    return [document_path_similarity(x, question) for question in df['Questions']]
利用上面定义的这些方法,我们现在可以这样做:
# Precompute each question's synsets ONCE. The original
# `apply(lambda x: similarity(x, df))` re-ran doc_to_synsets (tokenize +
# POS-tag, the expensive NLP step) n times for every document, i.e. O(n^2)
# NLP passes; caching the synsets drops that to O(n) while the printed
# grouping stays identical (str of the per-row list of scores).
question_synsets = [doc_to_synsets(q) for q in df['Questions']]
df['similarity'] = [
    str([(similarity_score(row, col) + similarity_score(col, row)) / 2
         for col in question_synsets])
    for row in question_synsets
]
for _, group in df.groupby('similarity')['Questions']:
    print(group, '\n')
输出:
6 我们见面好吗?
Name: Questions, dtype: object
3 你叫什么名字?
4 你的昵称是什么?
5 你的全名是什么?
Name: Questions, dtype: object
0 你在做什么?
1 今晚你在做什么?
2 你现在在做什么?
7 你好吗?
Name: Questions, dtype: object
这并不是解决该问题的最佳方法,而且运行速度非常慢。如果有更好的新方法,将不胜感激。