#!/usr/bin/python3
# coding: utf-8
"""
Authors: Jiajun Bao, Meng Li, Jane Liu
Classes:
Tokenizer:
Accepts a list of words and outputs tokenized text.
RemoveStopWords:
Accepts a list of tokens and removes stop words
Preprocessor:
Calls Tokenizer, RemoveStopWords, and lemmatizes the text. Finds the top 100 unigrams and bigrams.
Unigrams:
Returns the top 100 unigrams and prints to the console and ngrams.txt.
Bigrams:
Returns the top 100 bigrams and prints to the console and ngrams.txt.
"""
import re
import string
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams


class Tokenizer:
    def __init__(self, text):
        self._text = text
        self._tokenized_text = []

    def tokenize(self):
        # Remove punctuation, then drop empty tokens, numbers, and dates (tokens that start with a digit)
        pattern = re.compile('[0-9]+')
        self._tokenized_text = [''.join(c for c in s if c not in string.punctuation) for s in self._text]
        self._tokenized_text[:] = [word for word in self._tokenized_text if not pattern.match(word) and word != '']
        return self._tokenized_text
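
# Example (illustrative assumption, not in the original):
#     Tokenizer(['Great', 'food!', '2019']).tokenize()  ->  ['Great', 'food']
# Punctuation is stripped and the all-digit token is dropped.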


class RemoveStopWords:
    def __init__(self, text):
        self._text = text
        self._stopwords = []

    def removestopwords(self):
        with open('data/stopwords.txt', 'r') as g:
            self._stopwords = g.read().splitlines()
        # Filter against a set in a single pass rather than rebuilding the list once per stop word
        stopwords = set(self._stopwords)
        self._text = [value for value in self._text if value.lower() not in stopwords]
        return self._text


class Preprocessor:
    def __init__(self):
        self._reviews = []
        self._cleantext = []
        self._temptext = []
        self._preprocessedlist = []
        self._unigrams = []
        self._bigrams = []
        self._ngrams = []

    def preprocess(self):
        with open('data/reviewContent.txt', 'r') as f:
            self._reviews = f.read().split()

        # Tokenize the text file
        self._temptext = Tokenizer(self._reviews)
        self._cleantext = self._temptext.tokenize()

        # Remove stop words
        self._temptext = RemoveStopWords(self._cleantext)
        self._cleantext = self._temptext.removestopwords()

        # Lemmatize the text
        lemmatizer = WordNetLemmatizer()
        self._preprocessedlist = [lemmatizer.lemmatize(word) for word in self._cleantext]

        # Collect the top unigrams and bigrams (the docstring promises both)
        unitemp = Unigrams(self._preprocessedlist)
        self._unigrams = unitemp.get_top_unigrams()
        self._ngrams.extend(self._unigrams)
        bitemp = Bigrams(self._preprocessedlist)
        self._bigrams = bitemp.get_top_bigrams()
        self._ngrams.extend(self._bigrams)
        return self._ngrams


class Unigrams:
    def __init__(self, topics):
        self._topics = topics
        self._unigrams = []

    def get_top_unigrams(self):
        # Find the most frequently occurring unigrams
        word_freq = Counter(self._topics)
        common_words = word_freq.most_common(100)
        # Keep just the words, in descending order of frequency
        self._unigrams = [word for word, count in common_words]
        return self._unigrams


class Bigrams:
    def __init__(self, topics):
        self._topics = topics
        self._output = []
        self._bigrams = []

    def get_top_bigrams(self):
        # Generate bigrams and find the most frequently occurring ones
        self._output = list(ngrams(self._topics, 2))
        word_freq = Counter(self._output)
        common_words = word_freq.most_common(100)
        # Join each bigram tuple into a single space-separated string
        self._bigrams = [' '.join(tup) for tup, count in common_words]
        return self._bigrams
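
# Example (illustrative assumption, not in the original):
#     Bigrams(['good', 'food', 'good', 'food']).get_top_bigrams()  ->  ['good food', 'food good']
# ('good', 'food') occurs twice and ('food', 'good') once, so both survive the top-100 cut.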


def ngram_print(ngram_list):
    # Write the n-grams to ngrams.txt as a comma-separated list.
    # The parameter was renamed from 'ngrams' so it no longer shadows nltk.util.ngrams.
    with open('ngrams.txt', 'w') as g:
        for item in ngram_list:
            g.write(str(item))
            g.write(', ')
        g.write('\n\n')
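

# A minimal usage sketch (an assumption, not part of the original file): it shows how the
# pipeline above could be driven end to end, provided the 'data/' input files exist.
if __name__ == '__main__':
    preprocessor = Preprocessor()
    top_ngrams = preprocessor.preprocess()  # top unigrams and bigrams
    print(top_ngrams)                       # echo to the console
    ngram_print(top_ngrams)                 # write them to ngrams.txt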