#!/usr/bin/python3
# coding: utf-8
"""
Authors: Jiajun Bao, Meng Li, Jane Liu
Classes:
Tokenizer:
Accepts a list of words and outputs tokenized text.
RemoveStopWords:
Accepts a list of tokens and removes stop words
Preprocessor:
Calls Tokenizer, RemoveStopWords, and lemmatizes the text. Finds the top 100 unigrams and bigrams.
Unigrams:
Returns the top 100 unigrams and prints to the console and ngrams.txt.
Bigrams:
Returns the top 100 bigrams and prints to the console and ngrams.txt.
"""
import re
import string
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams


class Tokenizer:
    def __init__(self, text):
        self._text = text
        self._tokenized_text = []

    def tokenize(self):
        # Remove punctuation, then drop empty tokens, numbers, and dates (tokens that start with a digit)
        pattern = re.compile('[0-9]+')
        self._tokenized_text = [''.join(c for c in s if c not in string.punctuation) for s in self._text]
        self._tokenized_text[:] = [word for word in self._tokenized_text if not pattern.match(word) and word != '']
        return self._tokenized_text
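
# Example (illustrative assumption, not in the original):
#     Tokenizer(['Great', 'food!', '2019']).tokenize()  ->  ['Great', 'food']
# Punctuation is stripped and the all-digit token is dropped.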


class RemoveStopWords:
    def __init__(self, text):
        self._text = text
        self._stopwords = []

    def removestopwords(self):
        with open('data/stopwords.txt', 'r') as g:
            self._stopwords = g.read().splitlines()
        # Filter against a set in a single pass rather than rebuilding the list once per stop word
        stopwords = set(self._stopwords)
        self._text = [value for value in self._text if value.lower() not in stopwords]
        return self._text


class Preprocessor:
    def __init__(self):
        self._reviews = []
        self._cleantext = []
        self._temptext = []
        self._preprocessedlist = []
        self._unigrams = []
        self._bigrams = []
        self._ngrams = []

    def preprocess(self):
        with open('data/reviewContent.txt', 'r') as f:
            self._reviews = f.read().split()

        # Tokenize the text file
        self._temptext = Tokenizer(self._reviews)
        self._cleantext = self._temptext.tokenize()

        # Remove stop words
        self._temptext = RemoveStopWords(self._cleantext)
        self._cleantext = self._temptext.removestopwords()

        # Lemmatize the text
        lemmatizer = WordNetLemmatizer()
        self._preprocessedlist = [lemmatizer.lemmatize(word) for word in self._cleantext]

        # Collect the top unigrams and bigrams (the docstring promises both)
        unitemp = Unigrams(self._preprocessedlist)
        self._unigrams = unitemp.get_top_unigrams()
        self._ngrams.extend(self._unigrams)
        bitemp = Bigrams(self._preprocessedlist)
        self._bigrams = bitemp.get_top_bigrams()
        self._ngrams.extend(self._bigrams)
        return self._ngrams


class Unigrams:
    def __init__(self, topics):
        self._topics = topics
        self._unigrams = []

    def get_top_unigrams(self):
        # Find the most frequently occurring unigrams
        word_freq = Counter(self._topics)
        common_words = word_freq.most_common(100)
        # Keep just the words, in descending order of frequency
        self._unigrams = [word for word, count in common_words]
        return self._unigrams


class Bigrams:
    def __init__(self, topics):
        self._topics = topics
        self._output = []
        self._bigrams = []

    def get_top_bigrams(self):
        # Generate bigrams and find the most frequently occurring ones
        self._output = list(ngrams(self._topics, 2))
        word_freq = Counter(self._output)
        common_words = word_freq.most_common(100)
        # Join each bigram tuple into a single space-separated string
        self._bigrams = [' '.join(tup) for tup, count in common_words]
        return self._bigrams
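
# Example (illustrative assumption, not in the original):
#     Bigrams(['good', 'food', 'good', 'food']).get_top_bigrams()  ->  ['good food', 'food good']
# ('good', 'food') occurs twice and ('food', 'good') once, so both survive the top-100 cut.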


def ngram_print(ngram_list):
    # Write the n-grams to ngrams.txt as a comma-separated list.
    # The parameter was renamed from 'ngrams' so it no longer shadows nltk.util.ngrams.
    with open('ngrams.txt', 'w') as g:
        for item in ngram_list:
            g.write(str(item))
            g.write(', ')
        g.write('\n\n')
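

# A minimal usage sketch (an assumption, not part of the original file): it shows how the
# pipeline above could be driven end to end, provided the 'data/' input files exist.
if __name__ == '__main__':
    preprocessor = Preprocessor()
    top_ngrams = preprocessor.preprocess()  # top unigrams and bigrams
    print(top_ngrams)                       # echo to the console
    ngram_print(top_ngrams)                 # write them to ngrams.txt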