-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec_data.py
More file actions
71 lines (57 loc) · 2.15 KB
/
word2vec_data.py
File metadata and controls
71 lines (57 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import csv
import jieba.posseg as pseg
import jieba.analyse
import numpy as np
from gensim.models import Word2Vec
from pymongo import MongoClient
# Stopword list: one entry per line of ../config/stopword, surrounding
# whitespace stripped. Read inside a ``with`` block so the file handle is
# closed as soon as the list has been built.
with open("../config/stopword", 'r', encoding='utf-8') as _stopword_file:
    stopwords = [line.strip() for line in _stopword_file]
def get_data():
    """Yield the chat messages stored in MongoDB as cleaned strings.

    Connects to the MongoDB server at 127.0.0.1:27017 and iterates over
    every document of the ``message`` collection in ``customer_service_db``.

    Yields:
        str: the document's ``msg`` value converted with ``str`` and with
        all newline and space characters removed. Documents whose ``msg``
        is missing or falsy are skipped.
    """
    # The context manager closes the client (and its sockets) once the
    # generator is exhausted or closed, instead of leaking the connection.
    with MongoClient('127.0.0.1', 27017) as conn:
        message_collection = conn.customer_service_db.message
        # No query filter is applied. NOTE(review): the original author left
        # {"oper_code": 2002} next to this call, presumably as an optional
        # filter -- confirm before enabling it.
        for single_document in message_collection.find():
            # .get() so a document without a "msg" field is skipped like an
            # empty message rather than aborting the export with KeyError.
            msg = single_document.get("msg")
            if not msg:
                continue
            yield str(msg).replace('\n', '').replace(' ', '')
# Store the raw messages, one message per CSV row.
with open("data/word2_original_data.csv", "w", newline="", encoding='utf-8') as f:
    wr = csv.writer(f, lineterminator='\n')
    for val in get_data():
        wr.writerow([val])

# The user dictionary must be loaded before any segmentation happens.
jieba.load_userdict("../config/dict")

# Set copy of the stopword list for O(1) membership tests in the token loop.
_stopword_set = set(stopwords)
# Part-of-speech flags that are dropped from the segmented output.
_ignored_flags = {'w', 'x'}

# Word segmentation: one space-joined sentence per CSV row.
with open("data/word2_original_data_cut.csv", "w", newline="", encoding='utf-8') as f:
    wr = csv.writer(f, lineterminator='\n')
    for val in get_data():
        sentence = []
        for t in pseg.cut(val):
            word = t.word.strip()
            # Stopword filtering must compare the token *text*: the objects
            # produced by pseg.cut are (word, flag) pairs and never compare
            # equal to a plain string, so testing the pair itself against
            # the stopword list would let every stopword through.
            # Tokens that are empty after stripping are dropped as well so
            # they cannot introduce empty words into the joined sentence.
            if word and word not in _stopword_set and t.flag not in _ignored_flags:
                sentence.append(word)
        if sentence:
            wr.writerow([' '.join(sentence)])
def buildWordVector(text, size, imdb_w2v):
    """Return the mean word vector of a sentence as a ``(1, size)`` array.

    Args:
        text: iterable of words (tokens) making up the sentence.
        size: dimensionality of the word vectors.
        imdb_w2v: source of word vectors. Either an object exposing the
            vectors through a ``wv`` attribute (e.g. a trained Word2Vec
            model) or any mapping-like object that supports
            ``obj[word]`` and raises ``KeyError`` for unknown words.

    Returns:
        numpy.ndarray of shape ``(1, size)`` holding the average of the
        vectors of all words that were found. Unknown words are ignored;
        if no word is found the result is all zeros.
    """
    # Indexing the model object directly (``model[word]``) is deprecated in
    # gensim 3.x and removed in gensim 4, so look words up through ``.wv``
    # when it exists and fall back to the object itself otherwise.
    vectors = getattr(imdb_w2v, "wv", imdb_w2v)

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the mean.
            continue
        count += 1
    if count:
        vec /= count
    return vec
# Load the segmented corpus: one sentence per CSV row, tokens separated by
# spaces. The file was produced with csv.writer, so it is read back with
# csv.reader (which also undoes any quoting). split() without an argument
# keeps the line terminator from sticking to the last token of each
# sentence and never yields empty tokens.
with open("data/word2_original_data_cut.csv", 'r', newline="", encoding='utf-8') as f:
    x_train = [row[0].split() for row in csv.reader(f) if row]

# Dimensionality of the word vectors.
n_dim = 300

# NOTE(review): ``size=`` and ``.iter`` are the gensim 3.x names; gensim 4
# renamed them to ``vector_size=`` and ``.epochs`` -- adjust when upgrading.
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.iter)

# One averaged sentence vector per row, stacked into an (n_sentences, n_dim)
# array and saved for later use.
train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
np.save('data/word2_original_data_vector.npy', train_vecs)

# Sanity check: print the nearest neighbours of a sample word.
for vocad in imdb_w2v.wv.most_similar(['贵宾厅']):
    print(vocad)