
import csv
from tqdm import tqdm 
import pickle

def main():
	"""Aggregate baby-name counts from the CSV and pickle the results.

	Reads ``data/baby_names.csv`` using its ``Year``, ``Name`` and ``Count``
	columns, skips every row whose year is before 1914, and sums ``Count``
	over all rows that share the same (name, year) pair.

	Writes three pickle files:

	* ``data/count_map.pkl`` -- dict mapping ``(name, year)`` to the summed count
	* ``data/all_years.pkl`` -- list of the years seen, sorted ascending
	* ``data/all_names.pkl`` -- set of the names seen

	Finally prints the number of entries in the count map and in the name set.

	Raises:
		OSError: if the input file cannot be opened or an output file cannot
			be written.
		KeyError: if a row lacks one of the ``Year``/``Name``/``Count`` columns.
		ValueError: if ``Year`` or ``Count`` is not an integer literal.
	"""
	# dictionary from (name, year) to counts
	count_map = {}
	# every name in the dataset
	all_names = set()
	# every year in the dataset (sorted into a list after the scan)
	all_years = set()

	# `with` closes the CSV handle even if a row fails to parse;
	# newline='' lets the csv module do its own newline handling.
	with open('data/baby_names.csv', newline='') as csv_file:
		for row in tqdm(csv.DictReader(csv_file)):
			year = int(row['Year'])
			# social security applications before 1914 are very biased
			if year < 1914:
				continue

			name = row['Name']
			all_names.add(name)
			all_years.add(year)

			# the same (name, year) can occur on several rows; add them up
			key = (name, year)
			count_map[key] = count_map.get(key, 0) + int(row['Count'])

	# sort years from lowest to highest
	all_years = sorted(all_years)

	# save data; each file is closed (and therefore flushed) before the next
	outputs = (
		('data/count_map.pkl', count_map),
		('data/all_years.pkl', all_years),
		('data/all_names.pkl', all_names),
	)
	for path, payload in outputs:
		with open(path, 'wb') as out_file:
			pickle.dump(payload, out_file)

	print('count_map', len(count_map))
	print('all_names', len(all_names))



# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()