email-prune/email_pruner.py

from argparse import ArgumentParser as ap
from datetime import timedelta
from secrets import choice
from string import ascii_letters
from timeit import default_timer as timer


def reset_stopwatch():
    """Return the current timer reading, to be used as a start mark.

    The value is a number of seconds and is meant to be handed to
    get_elapsed() later on.
    """
    started_at = timer()
    return started_at


def get_elapsed(starttime):
    """Return the time that has passed since *starttime* as a ``timedelta``.

    *starttime* is a timer reading in seconds, such as the value returned
    by reset_stopwatch().
    """
    elapsed_seconds = timer() - starttime
    return timedelta(seconds=elapsed_seconds)


def randstring(strlen=64):
    """Return a string of *strlen* randomly chosen ASCII letters.

    Each character is drawn independently from ``ascii_letters`` (both
    cases) with ``secrets.choice``.  A *strlen* of zero or less yields an
    empty string.
    """
    letters = [choice(ascii_letters) for _ in range(strlen)]
    return ''.join(letters)


def spawn(listlen=100):
    """Build a list of random ".com" e-mail addresses that contains repeats.

    First *listlen* random addresses of the form
    ``<10 letters>.<10 letters>@<15 letters>.com`` are generated.  Then
    *listlen* more entries are picked at random (with replacement) from
    those addresses.  The result interleaves the two: generated address,
    picked address, generated address, picked address, ...

    Returns a list of ``2 * listlen`` strings (empty when *listlen* <= 0).
    """
    originals = []
    for _ in range(listlen):
        local_part = randstring(10) + "." + randstring(10)
        originals.append(local_part + "@" + randstring(15) + ".com")

    # One random pick from the generated addresses per generated address.
    repeats = [choice(originals) for _ in originals]

    interleaved = []
    for original, repeat in zip(originals, repeats):
        interleaved += [original, repeat]
    return interleaved


def dups(biglist):
    """Split *biglist* into its repeated entries and its distinct entries.

    Args:
        biglist: An iterable of hashable items (e-mail strings in this
            script).

    Returns:
        A ``(duplicates, unique)`` tuple of lists, both in input order:

        * ``duplicates`` holds every occurrence of an item after its first
          one, so an item that appears only once is never listed here.
        * ``unique`` holds the first occurrence of each distinct item.

        Together the two lists account for every item of the input.
    """
    seen = set()
    duplicates = []
    unique = []
    for item in biglist:
        if item in seen:
            # Already encountered earlier: this occurrence is a duplicate.
            duplicates.append(item)
        else:
            seen.add(item)
            unique.append(item)
    return duplicates, unique


# NOTE:
# If you only need the de-duplicated list (and not both of the
# lists returned by dups), there is a much simpler, more
# "pythonic" way to do the pruning:
def prune(biglist):
    """Return the distinct items of *biglist* as a list, in first-seen order."""
    # dict keys are unique and keep insertion order, so the keys of this
    # throwaway dict are exactly the de-duplicated items.
    first_seen = dict.fromkeys(biglist)
    return [*first_seen]


def main():
    """Generate random e-mails with repeats, de-duplicate them, report timings.

    Command-line options:
        -e / --emails N   number passed to spawn() (default 1000)
        -d / --dump       also print the generated list and the list of
                          duplicates to the console
    """
    parser = ap()
    parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",
                        help='The number of emails to generate (default=1000)', required=False)
    parser.add_argument('-d', '--dump', action="store_true",
                        help='Dump the email list to the console (Default=no)', required=False)
    args = parser.parse_args()
    email_count = args.emails

    # NOTE: The spawning process takes an enormous amount of time, but since
    # the challenge only asked how long the de-duplication takes (not the
    # generation), little was done to optimize the creation of the list.
    # Everything is kept in memory to avoid dealing with disk I/O.
    start = reset_stopwatch()
    list_with_dups = spawn(email_count)
    # Read the clock before any console output, so that printing (especially
    # with --dump) is not counted as generation time.
    t1 = get_elapsed(start)
    print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")
    if args.dump:
        for email in list_with_dups:
            print(email)
    print("Elapsed Time: ", t1)

    # This is the part we really care about: run the generated list through
    # dups(), which returns two lists.  The first is what gets reported below
    # as the duplicates; the second holds the distinct e-mails in first-seen
    # order and is not used any further here.
    start = reset_stopwatch()
    dup_list, _orig_list = dups(list_with_dups)
    # As above: stop the clock before printing so only dups() is timed.
    t2 = get_elapsed(start)
    print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")
    if args.dump:
        for email in dup_list:
            print(email)
    print("Elapsed time: ", t2)

    print("\n\n")
    print(f"TOTAL ELAPSED TIME: {t1 + t2}")


if __name__ == "__main__":
    main()
final commit 2020-10-21 22:57:48 +00:00			`from argparse import ArgumentParser as ap`
			`from datetime import timedelta`
initial commit 2020-10-21 20:01:26 +00:00			`from secrets import choice`
final commit 2020-10-21 22:57:48 +00:00			`from string import ascii_letters`
initial commit 2020-10-21 20:01:26 +00:00			`from timeit import default_timer as timer`


			`def reset_stopwatch():`
			`return timer()`


			`def get_elapsed(starttime):`
			`end = timer()`
final commit 2020-10-21 22:57:48 +00:00			`return timedelta(seconds=end - starttime)`
initial commit 2020-10-21 20:01:26 +00:00

			`def randstring(strlen=64):`
			`return ''.join(choice(ascii_letters) for _ in range(strlen))`


			`def spawn(listlen=100):`
final commit 2020-10-21 22:57:48 +00:00			`base_list = [randstring(10) + "." + randstring(10) + "@" + randstring(15) + ".com" for _ in range(listlen)]`
initial commit 2020-10-21 20:01:26 +00:00			`dup_list = [choice(base_list) for _ in range(len(base_list))]`
			`final_list = []`
			`for i in range(listlen):`
			`final_list.append(base_list[i])`
			`final_list.append(dup_list[i])`
			`return final_list`


			`def dups(biglist):`
			`seen = set()`
			`uneek = []`
			`for x in biglist:`
			`if x not in seen:`
			`uneek.append(x)`
			`seen.add(x)`
final commit 2020-10-21 22:57:48 +00:00			`return list(seen), uneek`


			`# NOTE:`
			`# In the event that you do not need both lists,`
			`# there is a much simpler, more "pythonic", way`
			`# to do the pruning with python:`
			`def prune(biglist):`
			`return list(dict.fromkeys(biglist))`
initial commit 2020-10-21 20:01:26 +00:00

			`if __name__ == "__main__":`
final commit 2020-10-21 22:57:48 +00:00			`parser = ap()`
			`parser.add_argument('-e', '--emails', type=int, default=1000, metavar="emails",`
			`help='The number of emails to generate (default=1000)', required=False)`
Add option to dump the email list to the console 2020-10-21 23:14:51 +00:00			`parser.add_argument('-d', '--dump', action="store_true",`
			`help='Dump the email list to the console (Default=no)', required=False)`
final commit 2020-10-21 22:57:48 +00:00			`args = parser.parse_args()`
			`email_count = args.emails`

			`# NOTE: The spawning process takes an enormous amount of time,`
			`# but since the challenge didn't say anything about how long it takes to`
			`# generate 100,000 emails (only how long it takes to de-dupe them), I`
			`# didn't do much to try to optimize the creation of the list of emails.`
			`# But I will say, that I kept it entirely in memory, to avoid having to`
			`# deal with disk i/o.`
initial commit 2020-10-21 20:01:26 +00:00			`start = reset_stopwatch()`
final commit 2020-10-21 22:57:48 +00:00			`list_with_dups = spawn(email_count)`
initial commit 2020-10-21 20:01:26 +00:00			`print(f"GENERATED COMPLETE LIST WITH DUPLICATES: (count = {len(list_with_dups)})")`
Add option to dump the email list to the console 2020-10-21 23:14:51 +00:00			`if args.dump:`
			`[print(i) for i in list_with_dups]`
initial commit 2020-10-21 20:01:26 +00:00			`t1 = get_elapsed(start)`
			`print("Elapsed Time: ", t1)`

final commit 2020-10-21 22:57:48 +00:00
			`# This is the part we really care about. This step takes the generated list,`
			`# and runs it through the de-duplicator, returning two lists: the originals,`
			`# and the duplicates. Note, that these lists are identical in LENGTH ONLY,`
			`# because the bifurcation process leaves them unsorted, according to the`
			`# requirements. If sorted, they could be shown to be identical in content`
			`# as well.`
initial commit 2020-10-21 20:01:26 +00:00			`start = reset_stopwatch()`
final commit 2020-10-21 22:57:48 +00:00			`dup_list, orig_list = dups(list_with_dups)`
initial commit 2020-10-21 20:01:26 +00:00			`print(f"IDENTIFIED DUPLICATES IN COMPLETE LIST: (count = {len(dup_list)})")`
Add option to dump the email list to the console 2020-10-21 23:14:51 +00:00			`if args.dump:`
			`[print(i) for i in dup_list]`
initial commit 2020-10-21 20:01:26 +00:00			`t2 = get_elapsed(start)`
			`print("Elapsed time: ", t2)`

final commit 2020-10-21 22:57:48 +00:00			`print("\n\n")`
			`print(f"TOTAL ELAPSED TIME: {t1 + t2}")`