2020-10-21 22:57:48 +00:00
|
|
|
from argparse import ArgumentParser as ap
|
|
|
|
from datetime import timedelta
|
2020-10-21 20:01:26 +00:00
|
|
|
from secrets import choice
|
2020-10-21 22:57:48 +00:00
|
|
|
from string import ascii_letters
|
2020-10-21 20:01:26 +00:00
|
|
|
from timeit import default_timer as timer
|
|
|
|
|
|
|
|
|
|
|
|
def reset_stopwatch() -> float:
    """Return the current timer reading, to be used as a start mark.

    Returns:
        The value of ``timeit.default_timer()`` at the moment of the call.
        Hand it to ``get_elapsed`` later to find out how much time went by.
    """
    return timer()
|
|
|
|
|
|
|
|
|
|
|
|
def get_elapsed(starttime) -> timedelta:
    """Return how much time has passed since *starttime*.

    Args:
        starttime: An earlier reading of the same timer, in seconds (for
            example the value returned by ``reset_stopwatch``).

    Returns:
        A ``datetime.timedelta`` equal to the current timer reading minus
        *starttime*.
    """
    return timedelta(seconds=timer() - starttime)
|
2020-10-21 20:01:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
def randstring(strlen: int = 64) -> str:
    """Return a random string of ASCII letters.

    Args:
        strlen: Number of characters to generate (default 64). Zero or a
            negative value produces an empty string.

    Returns:
        A string of *strlen* characters, each drawn independently from
        ``string.ascii_letters`` with ``secrets.choice``.
    """
    return ''.join(choice(ascii_letters) for _ in range(strlen))
|
|
|
|
|
|
|
|
|
|
|
|
def spawn(listlen=100):
    """Build a list of random email addresses that contains duplicates.

    ``listlen`` fresh addresses of the form
    ``<10 letters>.<10 letters>@<15 letters>.com`` are generated first. Then
    ``listlen`` more entries are picked at random (with replacement) from
    those fresh addresses, so every pick repeats an address already present.

    Args:
        listlen: Number of fresh addresses to generate (default 100).

    Returns:
        A list of ``2 * listlen`` strings in which fresh addresses and
        randomly picked repeats alternate: fresh, repeat, fresh, repeat, ...
        A ``listlen`` of zero or less yields an empty list.
    """
    base_list = [
        f"{randstring(10)}.{randstring(10)}@{randstring(15)}.com"
        for _ in range(listlen)
    ]
    # One random pick per fresh address; nothing is picked when base_list is
    # empty, so choice() is never called on an empty sequence.
    dup_list = [choice(base_list) for _ in base_list]

    # Interleave the two lists so the repeats are spread through the result
    # instead of being bunched together at the end.
    final_list = []
    for fresh, repeat in zip(base_list, dup_list):
        final_list.append(fresh)
        final_list.append(repeat)
    return final_list
|
|
|
|
|
|
|
|
|
|
|
|
def dups(biglist):
    """Split out the distinct items of *biglist*, in two orderings.

    Args:
        biglist: An iterable of hashable items (e.g. email strings).

    Returns:
        A 2-tuple ``(unordered, ordered)`` of lists. Both hold exactly the
        same distinct items of *biglist*:

        * ``unordered`` is built from the tracking set, so its order is
          arbitrary;
        * ``ordered`` keeps each item at the position of its first
          occurrence in *biglist*.

        Note that the first list is *not* limited to items that occurred
        more than once; it contains every distinct item.
    """
    seen = set()
    uneek = []
    for item in biglist:
        if item in seen:
            # Already recorded on an earlier occurrence.
            continue
        seen.add(item)
        uneek.append(item)
    return list(seen), uneek
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE:
# If you do not need both lists, there is a much simpler, more "pythonic"
# way to do the pruning in Python:
def prune(biglist):
    """Return the items of *biglist* with repeats removed.

    Args:
        biglist: An iterable of hashable items.

    Returns:
        A new list holding each distinct item once, in the order of its
        first occurrence (dict keys keep insertion order).
    """
    return list(dict.fromkeys(biglist))
|
2020-10-21 20:01:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line options: how many fresh emails to generate, and whether to
    # echo the generated and de-duplicated lists to the console.
    parser = ap()
    parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",
                        help='The number of emails to generate (default=1000)', required=False)
    parser.add_argument('-d', '--dump', action="store_true",
                        help='Dump the email list to the console (Default=no)', required=False)
    args = parser.parse_args()
    email_count = args.emails

    # NOTE: The spawning process takes an enormous amount of time,
    # but since the challenge didn't say anything about how long it takes to
    # generate 100,000 emails (only how long it takes to de-dupe them), I
    # didn't do much to try to optimize the creation of the list of emails.
    # But I will say that I kept it entirely in memory, to avoid having to
    # deal with disk i/o.
    start = reset_stopwatch()
    list_with_dups = spawn(email_count)
    # Read the stopwatch before any console output so that printing (and in
    # particular --dump) is not counted as generation time.
    t1 = get_elapsed(start)
    print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")
    if args.dump:
        for email in list_with_dups:
            print(email)
    print("Elapsed Time: ", t1)

    # This is the part we really care about. This step takes the generated
    # list and runs it through the de-duplicator, which returns two lists
    # holding the same distinct addresses: the first in arbitrary (set)
    # order, the second in first-seen order. As returned, the lists are
    # guaranteed to match in LENGTH only, because the first one is left
    # unsorted, according to the requirements. If both were sorted, they
    # could be shown to be identical in content as well.
    start = reset_stopwatch()
    dup_list, orig_list = dups(list_with_dups)
    # As above: stop the clock before printing so only the de-dupe is timed.
    t2 = get_elapsed(start)
    print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")
    if args.dump:
        for email in dup_list:
            print(email)
    print("Elapsed time: ", t2)

    print("\n\n")
    print(f"TOTAL ELAPSED TIME: {t1 + t2}")
|