-
Notifications
You must be signed in to change notification settings - Fork 183
Expand file tree
/
Copy pathcleanup.py
More file actions
195 lines (158 loc) · 5.92 KB
/
cleanup.py
File metadata and controls
195 lines (158 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Prepares passphrase cracking lists for use with the hashcat rules at
github.com/initstring/passphrase-wordlist
"""
import sys
import re
import urllib.parse
import html
import os
import time
import argparse
from datetime import timedelta
# Inclusive bounds on the character length of a passphrase candidate.
# Change these if you want shorter or longer phrases to be kept.
MIN_LENGTH = 8
MAX_LENGTH = 40
# Regex patterns are compiled once at import time because they are applied
# to every candidate phrase.
# Two allowed (lowercase) characters separated by a single space.
MULTIWORD_PATTERN = re.compile('[a-z0-9\'&] [a-z0-9\'&]')
# Any character that is NOT a letter, digit, space, apostrophe or ampersand.
ALLOWED_CHARS_PATTERN = re.compile("[^a-zA-Z0-9 '&]")
# A run of two or more whitespace characters.
MULTIPLE_SPACES_PATTERN = re.compile(r'\s\s+')
# A single-quoted segment with a space on each side; group 1 is the text
# between the quotes.
QUOTE_REMOVAL_PATTERN = re.compile(r" '([^']*)' ")
# A run of one or more whitespace characters.
WHITESPACE_PATTERN = re.compile(r'\s+')
# A single hyphen or underscore.
HYPHEN_UNDERSCORE_PATTERN = re.compile(r'[-_]')
# A single apostrophe.
APOSTROPHE_REMOVAL_PATTERN = re.compile("'")
# The word "and" with a space on each side.
AND_TO_AMPERSAND_PATTERN = re.compile(' and ')
# A single ampersand.
AMPERSAND_TO_AND_PATTERN = re.compile('&')
# Accented character patterns. Only lowercase forms are listed; each
# pattern matches one accented variant of the letter in its name.
ACCENTED_A_PATTERN = re.compile('[àáâãäå]')
ACCENTED_E_PATTERN = re.compile('[èéêë]')
ACCENTED_I_PATTERN = re.compile('[ìíîï]')
ACCENTED_O_PATTERN = re.compile('[òóôõö]')
ACCENTED_U_PATTERN = re.compile('[ùúûü]')
ACCENTED_N_PATTERN = re.compile('[ñ]')
ACCENTED_C_PATTERN = re.compile('[ç]')
ACCENTED_Y_PATTERN = re.compile('[ÿ]')
# Delimiters used to split one input line into several candidates:
# semicolon, comma or period.
SPLIT_PATTERN = re.compile(r';|,|\.')
def parse_arguments(argv=None):
    """
    Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace with the string attributes `infile` and
        `outfile`.

    Raises:
        SystemExit: with status 1 when the input path is a directory or is
            not readable. argparse itself exits with status 2 when the
            arguments are malformed.
    """
    desc = 'Transforms text files in passphrase lists.'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('infile', type=str, action='store',
                        help='Input file.')
    parser.add_argument('outfile', type=str, action='store',
                        help='Output file.')
    args = parser.parse_args(argv)

    # A directory passes os.access(..., R_OK) but cannot be opened for
    # reading, so reject it here instead of crashing later.
    unreadable = (os.path.isdir(args.infile)
                  or not os.access(args.infile, os.R_OK))
    if unreadable:
        # Report on stderr and exit non-zero so shells and scripts can
        # detect the failure.
        print("[!] Cannot access input file, exiting", file=sys.stderr)
        sys.exit(1)

    return args
def build_buffer(infile):
    """
    Load the input file and return its lines broken into raw candidates.

    Each line is passed through escape_encoding() and then split on
    SPLIT_PATTERN; every resulting piece is whitespace-stripped and
    collected, in file order, into one flat list.
    """
    size_label = str(int(os.path.getsize(infile) / 1000000)) + " MB"
    print("Reading from {}: {}".format(infile, size_label))

    fragments = []
    # Bytes that are not valid UTF-8 are dropped rather than raising.
    with open(infile, encoding='utf-8', errors='ignore') as source:
        for raw_line in source:
            # Encoding is undone first, so the split below sees the
            # decoded text.
            decoded = escape_encoding(raw_line)
            pieces = SPLIT_PATTERN.split(decoded)
            fragments.extend(piece.strip() for piece in pieces)

    return fragments
def handle_punctuation(line):
    """
    Clean the punctuation of a phrase and build its spelling variants.

    Characters other than ASCII letters, digits, spaces, apostrophes and
    ampersands are removed, runs of whitespace are collapsed, and quotes
    around the phrase or around an internal segment are dropped.

    Args:
        line: Candidate phrase as a string.

    Returns:
        A list of strings: the optional variants (apostrophes removed,
        ' and ' rewritten as ' & ', '&' rewritten as ' and ') followed by
        the cleaned phrase itself. Every entry has single spaces between
        words and no leading or trailing whitespace.
    """
    # Gets rid of any remaining special characters in the name
    line = ALLOWED_CHARS_PATTERN.sub('', line)

    # Shrinks down multiple spaces
    line = MULTIPLE_SPACES_PATTERN.sub(' ', line)

    # Strip quotes around line
    line = line.strip('\'"')

    # Remove quotes around internal segments
    line = QUOTE_REMOVAL_PATTERN.sub(r' \1 ', line)

    # Deleting characters can leave whitespace at the edges (for example
    # "hello world !" becomes "hello world "). Trim it so the caller's
    # length check and de-duplication see the phrase that is written out.
    line = line.strip()

    variants = []

    # If line has an apostrophe, add a duplicate without it
    if "'" in line:
        variants.append(APOSTROPHE_REMOVAL_PATTERN.sub("", line))

    # Duplicate phrases containing and / & with the other spelling
    if ' and ' in line:
        variants.append(AND_TO_AMPERSAND_PATTERN.sub(' & ', line))
    if '&' in line:
        variants.append(AMPERSAND_TO_AND_PATTERN.sub(' and ', line))

    # The cleaned phrase itself always comes last
    variants.append(line)

    # The substitutions above can create doubled or edge spaces (removing
    # a lone apostrophe, expanding "&"), so normalize every variant.
    return [WHITESPACE_PATTERN.sub(' ', variant).strip()
            for variant in variants]
# Maps hyphen/underscore to a space and each supported lowercase accented
# letter to its plain ASCII letter, so one str.translate() call performs
# every single-character replacement.
_CHAR_FOLD_TABLE = str.maketrans({
    source: target
    for target, sources in (
        (' ', '-_'),
        ('a', 'àáâãäå'),
        ('e', 'èéêë'),
        ('i', 'ìíîï'),
        ('o', 'òóôõö'),
        ('u', 'ùúûü'),
        ('n', 'ñ'),
        ('c', 'ç'),
        ('y', 'ÿ'),
    )
    for source in sources
})


def escape_encoding(line):
    """
    Undo common encodings and fold accented characters to ASCII.

    Args:
        line: Raw text line as a string.

    Returns:
        The line with URL and HTML encoding decoded, whitespace runs
        collapsed to one space and trimmed, converted to lowercase, with
        '-' and '_' turned into spaces and the supported accented letters
        replaced by their unaccented form.
    """
    line = urllib.parse.unquote(line)   # convert URL encoding like %27
    line = html.unescape(line)          # convert HTML encoding like &#39;
    line = ' '.join(line.split())       # collapse and trim whitespace

    # Lowercase first so uppercase accented letters are folded as well.
    # Accents are removed because the tool is focused on English-language
    # passwords. A '-' or '_' between spaces leaves a run of spaces here.
    return line.lower().translate(_CHAR_FOLD_TABLE)
def choose_candidates(line):
    """
    Decide whether a cleaned phrase should be kept.

    Returns True only when the phrase matches MULTIWORD_PATTERN (so
    single-word candidates are thrown out) and its length lies between
    MIN_LENGTH and MAX_LENGTH inclusive; otherwise returns False.
    """
    has_multiple_words = MULTIWORD_PATTERN.search(line) is not None
    fits_length = MIN_LENGTH <= len(line) <= MAX_LENGTH
    return has_multiple_words and fits_length
def write_file(buffer, outfile):
    """
    Write the chosen candidates to an output file, one per line.

    Args:
        buffer: Iterable of candidate strings.
        outfile: Path of the file to create or overwrite.

    The candidates are whitespace-stripped, de-duplicated and sorted
    before being written as UTF-8. The size of the resulting file, in
    whole megabytes, is printed afterwards.
    """
    # Strip BEFORE de-duplicating and sorting. Otherwise entries that
    # differ only in surrounding whitespace are written as duplicate
    # lines, and a leading space puts a line out of order.
    unique_lines = sorted({line.strip() for line in buffer})

    # Explicit encoding so the output does not depend on the platform's
    # locale default.
    with open(outfile, 'w', encoding='utf-8') as file_handler:
        file_handler.writelines(line + '\n' for line in unique_lines)

    outfile_size = str(int(os.path.getsize(outfile) / 1000000))
    print("Wrote to {}: {} MB".format(outfile, outfile_size))
def main():
    """
    Run the whole pipeline and report how long it took.

    Reads the input file into raw candidates, expands each one with
    handle_punctuation(), keeps the variants accepted by
    choose_candidates(), and writes the survivors to the output file.
    """
    started_at = time.time()
    options = parse_arguments()

    # A set is used so that repeated phrases are stored only once.
    kept = set()
    for raw_phrase in build_buffer(options.infile):
        variants = handle_punctuation(raw_phrase)
        kept.update(filter(choose_candidates, variants))

    # Writes final set out to file
    write_file(kept, options.outfile)

    runtime = timedelta(seconds=time.time() - started_at)
    print("Elapsed time: " + str(runtime))


if __name__ == "__main__":
    main()