# Import Packages
import pandas as pd
import matplotlib.pyplot as pp
import zipfile
# Running a matplotlib style
from jupyterthemes import jtplot
# Select the 'chesterish' jupyterthemes theme for matplotlib figures.
jtplot.style(theme='chesterish')
# Second call: 'talk' context, spines and ticks enabled, empty gridline style.
# NOTE(review): this call does not pass theme=...; confirm that it does not
# replace the 'chesterish' theme chosen by the call above.
jtplot.style(context='talk', spines=True, ticks=True, gridlines='')
# Data Load
## read in the names.zip and extract everything
# Open the archive in a context manager so the file handle is closed as soon
# as extraction finishes (the bare ZipFile(...) call never closed it).
with zipfile.ZipFile('names.zip') as archive:
    archive.extractall('.')
Let's start by looking at the top 2014 names as an example of common names millennials are giving their babies. Here, we will use raw Pandas syntax (later we will use custom functions).
# Load the 2014 file; it has no header row, so column names are supplied here.
names2014 = pd.read_csv(
    'names/yob2014.txt',
    names=['name', 'sex', 'frequency'],
)
# First ten rows for each sex, in file order, with 'sex' as the index.
names2014.set_index('sex').loc['F'].head(10)
names2014.set_index('sex').loc['M'].head(10)
Millennials are naming their babies Emma, Olivia, Noah, and Liam. My name, Michael, remains number seven on the male list.
Tables of Most Popular Female and Male Millennial Names
Let's create functions to show popular names as tables. This method will enable us to see names from 2008 to 2014 to ensure these millennial baby names were not one-off (i.e., names popular for one year, perhaps due to something occurring in pop culture).
# Read every yearly file (1880 through 2014) and tag each frame with its year
# so the frames can be stacked into a single panel data frame.
names_all = []
for year in range(1880, 2014 + 1):
    yearly = pd.read_csv(
        'names/yob{}.txt'.format(year),
        names=['name', 'sex', 'frequency'],
    )
    yearly['year'] = year  # year becomes the last column
    names_all.append(yearly)

# Stack the yearly frames, then index by (sex, name, year) and sort the index.
allyears = pd.concat(names_all)
allyears_indexed = allyears.set_index(['sex', 'name', 'year']).sort_index()
# create a function to do the top ten names for any year based on gender
def topnames(sex, year, limit=10):
    """Return the most frequent names for one sex and year as a ranked table.

    Reads the module-level ``allyears_indexed`` frame, which is indexed by
    (sex, name, year) and carries a ``frequency`` column.

    Args:
        sex: Value of the ``sex`` index level to select (e.g. ``'F'``).
        year: Value of the ``year`` index level to select.
        limit: Maximum number of names to return.

    Returns:
        A one-column DataFrame whose column label is ``year``, whose values
        are names ordered by descending frequency, and whose index is the
        rank, starting at 1.
    """
    # Give .loc an explicit row key and column slice. The shorthand
    # ``.loc[sex, :, year]`` passes more indexers than the frame has axes,
    # which pandas documents as ambiguous for MultiIndex slicing.
    selected = allyears_indexed.loc[(sex, slice(None), year), :]
    ranked = selected.sort_values(by='frequency', ascending=False).head(limit)
    # Take the names from the index level directly instead of resetting the
    # index and dropping columns, so the result does not depend on which
    # levels survive the selection.
    table = pd.DataFrame(
        {year: ranked.index.get_level_values('name').to_numpy()}
    )
    table.index = table.index + 1  # ranks start at 1, not 0
    return table
# create a function that gives the top names over a range of years
def top_names_range(sex, start_year, end_year):
    """Return side-by-side top-name tables for every year in a range.

    Calls ``topnames(sex, year)`` for each year from ``start_year`` through
    ``end_year`` inclusive and joins the resulting one-column tables on
    their index, so each year becomes one column.
    """
    tables = []
    for yr in range(start_year, end_year + 1):
        tables.append(topnames(sex, yr))
    first_table = tables[0]
    remaining = tables[1:]
    return first_table.join(remaining)
# Top-ten female names for each year from 2008 through 2014, one column per year.
top_names_range('F',2008,2014)
# Same table for male names.
top_names_range('M',2008,2014)
Emma and Olivia appear to have been popular for a while. Noah and Liam appear to be more recent.
# Top-ten male names for 1988.
topnames('M',1988)
The year I was born, Michael was the most popular boy's name. In fact, I was one of four Michaels in my 60-student elementary class.
# create a function to plot a name across years
def plotname(sex, name):
    """Plot the recorded values for one (sex, name) pair with pyplot.

    Looks up the rows of the module-level ``allyears_indexed`` frame for
    ``sex`` and ``name`` and plots their values against their index
    (presumably the remaining ``year`` level). Figure creation, titles,
    legends and ``show()`` are left to the caller.
    """
    selection = allyears_indexed.loc[sex, name]
    x_values = selection.index
    y_values = selection.values
    pp.plot(x_values, y_values)
# One wide figure with all four millennial names; girls are drawn first,
# then boys, so the legend order matches the plotting order.
pp.figure(figsize=(12, 3))
females = ['Emma', 'Olivia']
males = ['Noah', 'Liam']
for sex, group in (('F', females), ('M', males)):
    for name in group:
        plotname(sex, name)
pp.title('Top Millennial Baby Names')
pp.legend(females + males)
pp.show()
We can see that Emma was the only name with popularity in the past. Liam is the latest to gain popularity.
First, I will look at each name independently to see how its popularity has changed over time.
# One separate figure per name, each titled with the name itself.
for sex, name in [('M', 'Michael'), ('F', 'Laura'), ('F', 'Elise'), ('M', 'Emil')]:
    plotname(sex, name)
    pp.title(name)
    pp.show()
Next, I will plot the whole family on the same y-axis to show how each name compares in magnitude against one another.
# All four names on shared axes so their magnitudes can be compared directly.
pp.figure(figsize=(12, 3))
family = [('M', 'Michael'), ('F', 'Laura'), ('F', 'Elise'), ('M', 'Emil')]
for sex, name in family:
    plotname(sex, name)
pp.title('Siebel Family')
pp.legend([member for _, member in family])
pp.show()
Finally, what I care about the most, let's look at my children's names on the same y-axis.
# Only the two children's names, again on shared axes.
pp.figure(figsize=(12, 3))
children = [('F', 'Elise'), ('M', 'Emil')]
for sex, name in children:
    plotname(sex, name)
pp.title('Siebel Children')
pp.legend([child for _, child in children])
pp.show()
This tells me that my daughter may have another Elise in her class - certainly less likely than in my case, where I had three other Michaels in mine.
However, my son is very unlikely to have another Emil in his class.