1、序言
Python基于词频的新词发现代码实现,探索高频新词以更新词库
源码下载地址:https://gitee.com/arye/dl/tree/master/NLP/新词发现
2、极简代码展示
2.1、【HMM】法
仅探索新词
import os
import pandas as pd, jieba
from os.path import dirname
from re import sub
from collections import Counter

# Locate jieba's bundled default dictionary in a platform-independent way.
# (The original '\dict.txt' string concatenation only works on Windows and
# relies on '\d' not being a recognized escape sequence.)
jieba_dict = os.path.join(dirname(jieba.__file__), 'dict.txt')
df = pd.read_table(jieba_dict, sep=' ', header=None)[[0]]
# Known-word set; include ' ' so the whitespace separator is never counted.
s = set(df.values.reshape(-1)) | {' '}

# Read the corpus, collapsing every non-CJK run into a single space.
with open('三国演义.txt', encoding='utf-8') as f:
    text = sub('[^\u4e00-\u9fa5]+', ' ', f.read())

# HMM-based discovery: any segment jieba emits that is absent from its own
# dictionary is a candidate new word. Rank by frequency, save to Excel.
os.makedirs('save', exist_ok=True)  # ensure the output directory exists
counter = Counter(w for w in jieba.cut(text) if w not in s)
pd.DataFrame(counter.most_common(), columns=['word', 'freq']).\
    to_excel('save/new_word.xlsx', index=False)
探索新词+词性
import os
import pandas as pd, jieba
from jieba.posseg import cut
from os.path import dirname
from re import sub
from collections import Counter

# Locate jieba's bundled default dictionary portably
# (concatenating '\dict.txt' is Windows-only).
jieba_dict = os.path.join(dirname(jieba.__file__), 'dict.txt')
df = pd.read_table(jieba_dict, sep=' ', header=None)[[0]]
# Known-word set; include ' ' so the whitespace separator is never counted.
s = set(df.values.reshape(-1)) | {' '}

# Read the corpus, collapsing every non-CJK run into a single space.
with open('三国演义.txt', encoding='utf-8') as f:
    text = sub('[^\u4e00-\u9fa5]+', ' ', f.read())

# Count (word, flag) tuples rather than jieba's `pair` objects: `pair`
# defines no __hash__/__eq__, so counting the objects directly hashes by
# identity and every occurrence would end up with frequency 1, defeating
# most_common(). Rank by frequency (descending) and save to Excel.
counter = Counter((w.word, w.flag) for w in cut(text) if w.word not in s)
rows = [(word, flag, freq) for (word, flag), freq in counter.most_common()]
os.makedirs('save', exist_ok=True)  # ensure the output directory exists
pd.DataFrame(rows, columns=['word', 'flag', 'freq']).\
    to_excel('save/new_word_flag.xlsx', index=False)
2.2、【无脑遍历】法
简单版
import os
import re
import pandas as pd, jieba
from os.path import dirname
from collections import Counter

# Locate jieba's bundled default dictionary portably
# (concatenating '\dict.txt' is Windows-only).
jieba_dict = os.path.join(dirname(jieba.__file__), 'dict.txt')
df = pd.read_table(jieba_dict, sep=' ', header=None)[[0]]
# Known-word set; include ' ' so the whitespace separator is never counted.
s = set(df.values.reshape(-1)) | {' '}

# Read the corpus, collapsing every non-CJK run into a single space.
with open('三国演义.txt', encoding='utf-8') as f:
    text = re.sub('[^\u4e00-\u9fa5]+', ' ', f.read())
le = len(text)

# Brute force: slide an n-gram window over the text for n = 2, 3, 4 and keep
# windows made entirely of CJK characters that jieba's dictionary lacks.
os.makedirs('save', exist_ok=True)  # ensure the output directory exists
# Context manager replaces writer.save(), which was removed in pandas 2.0.
with pd.ExcelWriter('save/new_words.xlsx') as writer:
    for n in (2, 3, 4):
        cn = re.compile('[\u4e00-\u9fa5]{%d}' % n)
        # range(le - n + 1) includes the final window; the original
        # `range(le - n)` silently dropped the last n-gram.
        counter = Counter(text[i: i + n] for i in range(le - n + 1)
                          if cn.fullmatch(text[i: i + n]) and
                          text[i: i + n] not in s)
        # One sheet per n-gram length, top 9999 candidates by frequency.
        pd.DataFrame(counter.most_common(9999), columns=['w', 'f'])\
            .to_excel(writer, sheet_name=str(n), index=False)
提升版
import os
import re
import pandas as pd, jieba
from os.path import dirname
from collections import Counter

# Locate jieba's bundled default dictionary portably
# (concatenating '\dict.txt' is Windows-only).
jieba_dict = os.path.join(dirname(jieba.__file__), 'dict.txt')
df = pd.read_table(jieba_dict, sep=' ', header=None)[[0]]
# Known-word set; include ' ' so the whitespace separator is never counted.
s = set(df.values.reshape(-1)) | {' '}

# Read the corpus, collapsing every non-CJK run into a single space.
with open('三国演义.txt', encoding='utf-8') as f:
    text = re.sub('[^\u4e00-\u9fa5]+', ' ', f.read())
le = len(text)

# Scan long n-grams first (6 down to 2). A shorter candidate is discarded
# when it is a substring of an already-accepted longer word with a similar
# frequency — it is then just a fragment of that longer word rather than an
# independent word.
counter = Counter()
for n in (6, 5, 4, 3, 2):
    cn = re.compile('[\u4e00-\u9fa5]{%d}' % n)
    # range(le - n + 1) includes the final window; the original
    # `range(le - n)` silently dropped the last n-gram.
    mc = Counter(text[i: i + n] for i in range(le - n + 1)
                 if cn.fullmatch(text[i: i + n]) and
                 text[i: i + n] not in s).most_common(int(9999 / n))  # keep fewer candidates for longer n
    for word, freq in mc:                   # candidate (shorter) words
        for w, f in counter.most_common():  # already-accepted (longer) words
            # .9 ** (length gap) loosens the frequency-ratio threshold as
            # the length difference grows.
            if word in w and\
               f / freq > .9 ** (len(w) - len(word)):
                print(w, word, f / freq)
                break  # contained in a longer word with a similar frequency: skip
        else:
            counter[word] = freq
# Shorter words are discovered more often, so longer words rank lower here.
os.makedirs('save', exist_ok=True)  # ensure the output directory exists
# Note: the original passed sheet_name=str(n), leaking the stale loop
# variable (always "2"); the default sheet name is used instead.
pd.DataFrame(counter.most_common(9999), columns=['w', 'f'])\
    .to_excel('save/new_words_pro.xlsx', index=False)