# nlp_train_example/word2vec_data.py (master branch of MachineLearning-Team/nlp_train_example on GitHub)

import csv

import jieba.posseg as pseg
import jieba.analyse
import numpy as np
from gensim.models import Word2Vec
from pymongo import MongoClient

# Stop-word list: one entry per line of ../config/stopword, with surrounding
# whitespace stripped.  The context manager closes the file handle as soon as
# the list is built instead of leaving that to the garbage collector.
with open("../config/stopword", 'r', encoding='utf-8') as _stopword_file:
    stopwords = [line.strip() for line in _stopword_file]


def get_data():
    """Yield the chat messages stored in MongoDB, one cleaned string at a time.

    Connects to the MongoDB server at 127.0.0.1:27017 and iterates over every
    document of the ``message`` collection in ``customer_service_db``.
    Documents whose ``msg`` field is missing or falsy are skipped; every other
    value is converted to ``str`` and has its newlines and spaces removed.

    Yields:
        str: the cleaned message text.
    """
    # The ``with`` block closes the client when the generator is exhausted,
    # closed, or garbage-collected, so the connection is no longer leaked.
    with MongoClient('127.0.0.1', 27017) as conn:
        message_collection = conn.customer_service_db.message

        # No filter is applied, so every document is read.  A query such as
        # {"oper_code": 2002} could be passed to find() to narrow the corpus.
        for document in message_collection.find():
            # .get() keeps one malformed document (no "msg" key) from
            # aborting the whole export with a KeyError.
            msg = document.get("msg")
            if not msg:
                continue
            yield str(msg).replace('\n', '').replace(' ', '')


# Persist the raw corpus: every message returned by get_data() becomes one
# single-column CSV row.
with open("data/word2_original_data.csv", "w", newline="", encoding='utf-8') as raw_file:
    raw_writer = csv.writer(raw_file, lineterminator='\n')
    raw_writer.writerows([message] for message in get_data())

# Register the user dictionary at ../config/dict with jieba before segmenting.
jieba.load_userdict("../config/dict")

# Word segmentation: cut every message with jieba's POS tagger and write the
# surviving tokens, space-separated, as one single-column CSV row per message.
stopword_set = set(stopwords)  # built once so each membership test is O(1)
skipped_flags = ('w', 'x')  # POS tags to drop (presumably punctuation / non-word tokens)
with open("data/word2_original_data_cut.csv", "w", newline="", encoding='utf-8') as f:
    wr = csv.writer(f, lineterminator='\n')
    for val in get_data():
        sentence = []
        for token in pseg.cut(val):
            word = token.word.strip()
            # Stop-word filtering must compare the token's *text*: pseg.cut
            # yields (word, flag) pair objects, and a pair never equals a
            # plain string, so testing the pair itself lets every stop word
            # through.  Tokens that are empty after stripping are dropped too.
            if word and word not in stopword_set and token.flag not in skipped_flags:
                sentence.append(word)
        # Messages with no surviving tokens produce no row at all.
        if sentence:
            wr.writerow([' '.join(sentence)])


def buildWordVector(text, size, imdb_w2v):
    """Return the mean word vector of a tokenised sentence, zero-filled if empty.

    Args:
        text: iterable of tokens (words) making up one sentence.
        size: dimensionality of the word vectors.
        imdb_w2v: source of word vectors.  Either an object exposing them
            through a ``wv`` attribute (as the trained model used later in
            this file does) or any mapping that can be indexed by token
            directly.  A missing token must raise ``KeyError``.

    Returns:
        numpy.ndarray of shape ``(1, size)``: the element-wise mean of the
        vectors of all tokens that were found, or all zeros when none of the
        tokens is known (or ``text`` is empty).
    """
    # Prefer the keyed-vector container when the model has one; indexing the
    # model object itself is the deprecated access path.  Plain mappings have
    # no ``wv`` attribute and are used as-is.
    vectors = getattr(imdb_w2v, "wv", imdb_w2v)

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        # Keep the try body to the lookup only, so nothing but an unknown
        # token is ever silently skipped.
        try:
            word_vec = vectors[word]
        except KeyError:
            continue
        vec += word_vec.reshape((1, size))
        count += 1
    if count:
        vec /= count
    return vec


# Load the segmented corpus back as one token list per CSV row.
# csv.reader mirrors the csv.writer that produced the file (it undoes any
# quoting), and str.split() discards the line terminator, which a plain
# split(" ") on raw lines would leave glued to the last token of every
# sentence.  Rows are kept one-to-one so the saved vectors line up with the
# rows of the CSV.
x_train = []
with open("data/word2_original_data_cut.csv", 'r', newline="", encoding='utf-8') as cut_file:
    for row in csv.reader(cut_file):
        x_train.append(row[0].split() if row else [])

n_dim = 300  # dimensionality of the word vectors
# NOTE(review): `size=` and `.iter` are the gensim 3.x spellings; newer gensim
# releases renamed them -- confirm the pinned gensim version before upgrading.
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.iter)

# One averaged (1, n_dim) vector per sentence, stacked into a 2-D array and
# saved next to the corpus files.
train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
np.save('data/word2_original_data_vector.npy', train_vecs)

# Sanity check: print the nearest neighbours of a sample word.
for vocad in imdb_w2v.wv.most_similar(['贵宾厅']):
    print(vocad)