email-prune/email_pruner.py

89 lines
2.9 KiB
Python
Raw Normal View History

2020-10-21 22:57:48 +00:00
from argparse import ArgumentParser as ap
from datetime import timedelta
2020-10-21 20:01:26 +00:00
from secrets import choice
2020-10-21 22:57:48 +00:00
from string import ascii_letters
2020-10-21 20:01:26 +00:00
from timeit import default_timer as timer
def reset_stopwatch():
return timer()
def get_elapsed(starttime):
end = timer()
2020-10-21 22:57:48 +00:00
return timedelta(seconds=end - starttime)
2020-10-21 20:01:26 +00:00
def randstring(strlen=64):
return ''.join(choice(ascii_letters) for _ in range(strlen))
def spawn(listlen=100):
2020-10-21 22:57:48 +00:00
base_list = [randstring(10) + "." + randstring(10) + "@" + randstring(15) + ".com" for _ in range(listlen)]
2020-10-21 20:01:26 +00:00
dup_list = [choice(base_list) for _ in range(len(base_list))]
final_list = []
for i in range(listlen):
final_list.append(base_list[i])
final_list.append(dup_list[i])
return final_list
def dups(biglist):
seen = set()
uneek = []
for x in biglist:
if x not in seen:
uneek.append(x)
seen.add(x)
2020-10-21 22:57:48 +00:00
return list(seen), uneek
# NOTE:
# In the event that you do not need both lists,
# there is a much simpler, more "pythonic", way
# to do the pruning with python:
def prune(biglist):
return list(dict.fromkeys(biglist))
2020-10-21 20:01:26 +00:00
if __name__ == "__main__":
2020-10-21 22:57:48 +00:00
parser = ap()
parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",
help='The number of emails to generate (default=1000)', required=False)
parser.add_argument('-d', '--dump', action="store_true",
help='Dump the email list to the console (Default=no)', required=False)
2020-10-21 22:57:48 +00:00
args = parser.parse_args()
email_count = args.emails
# NOTE: The spawning process takes an enormous amount of time,
# but since the challenge didn't say anything about how long it takes to
# generate 100,000 emails (only how long it takes to de-dupe them), I
# didn't do much to try to optimize the creation of the list of emails.
# But I will say, that I kept it entirely in memory, to avoid having to
# deal with disk i/o.
2020-10-21 20:01:26 +00:00
start = reset_stopwatch()
2020-10-21 22:57:48 +00:00
list_with_dups = spawn(email_count)
2020-10-21 20:01:26 +00:00
print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")
if args.dump:
[print(i) for i in list_with_dups]
2020-10-21 20:01:26 +00:00
t1 = get_elapsed(start)
print("Elapsed Time: ", t1)
2020-10-21 22:57:48 +00:00
# This is the part we really care about. This step takes the generated list,
# and runs it through the de-duplicator, returning two lists: the originals,
# and the duplicates. Note, that these lists are identical in LENGTH ONLY,
# because the bifurcation process leaves them unsorted, according to the
# requirements. If sorted, they could be shown to be identical in content
# as well.
2020-10-21 20:01:26 +00:00
start = reset_stopwatch()
2020-10-21 22:57:48 +00:00
dup_list, orig_list = dups(list_with_dups)
2020-10-21 20:01:26 +00:00
print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")
if args.dump:
[print(i) for i in dup_list]
2020-10-21 20:01:26 +00:00
t2 = get_elapsed(start)
print("Elapsed time: ", t2)
2020-10-21 22:57:48 +00:00
print("\n\n")
print(f"TOTAL ELAPSED TIME: {t1 + t2}")