#Load libraries and set working directory - import whole libraries to avoid confusion
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn import linear_model
#did not work when imported just sklearn
%matplotlib inline
#to make the plots show up
#Set the correct working directory
os.chdir(r"C:\Users\rf\Google Drive\Education\R\Rwork")
#Read the data and review the basic descriptive information
inputdata='titanic_data.csv'
t=pd.read_csv(inputdata) #load data into a pandas dataframe
# Automatically produced summary of the data set
print "Hello, world! Let's review my dataset."
print "\n"
print 'My data set is',inputdata
print 'I imported it into a',type(t)
print 'It has',t.shape[1],'columns and',t.shape[0],'rows.'
print "Here are the columns and their data types in the dataset:"
print t.dtypes
print "\n"
print ("Here are a few rows from this data frame:")
print t.head()
print "\n"
print "Also let's create a variable Minor - if Age <18. It will have missing values when Age does."
#how to make it missing? multi step process but works
t['Minor']=t['Age']
t.loc[(t['Age'] >=0), 'Minor'] = False
t.loc[(t['Age'] >=0) & (t['Age'] <18), 'Minor'] = True
print "\n"
print "Also add dummies for gender which will be useful later."
sdummies = pd.get_dummies(t['Sex'])
t = t.join(sdummies)
print ("Summary statistics for the whole data set")
print t.describe()
print "\n"
print ("Survived will be my Y, Gender is the first X, Age is continuous but has missing values (I will not impute though)")
print "\n"
print ("Quick means (ignoring the missing values by default)")
print t.mean(axis=0)
print "\n"
print ("Let's also look at the distribution by requesting custom percentiles.")
print t.describe(percentiles=[0.01,.25, .5, .75, 0.99])
print "\n"
print ("Let's also describe the categorical variables.")
print t.describe(include=['object'])
print ("Count of missing Age and also same for minor")
survived = t.loc[:,['Survived']]
sur_count= survived.apply(pd.value_counts).fillna(0)
print sur_count
age = t.loc[:,['Age']]
print ("How many rows have missing for Age?")
print age.shape[0] - age.dropna().shape[0]
minor = t.loc[:,['Minor']]
print ("How many rows have missing for Minor?")
print minor.shape[0] - minor.dropna().shape[0]
print ("OK, just as expected.")
print "\n"
print ("Frequency of categoricals")
sex = t.loc[:,['Sex']]
sex_count = sex.apply(pd.value_counts).fillna(0)
print sex_count
pclass = t.loc[:,['Pclass']]
pclass_count = pclass.apply(pd.value_counts).fillna(0)
print pclass_count
print t['Survived'].plot(kind='hist',bins=2,xticks=np.arange(0, 2, 1)
,title='Frequency of Survived',legend=True,color='grey')
print t['Pclass'].plot(kind='hist',bins=3,xticks=np.arange(1, 4, 1)
,title='Frequency of Pclass',legend=True,color='blue')
print sex_count.plot(kind='bar',title='Gender distribution',legend=True,color='pink')
sur_by_sex = pd.crosstab(t.Survived,t.Sex)
sur_by_sex
sur_by_sex.plot(kind='bar', stacked=True)
sur_by_sex_pcts = sur_by_sex.div(sur_by_sex.sum(1).astype(float), axis=0)
sur_by_sex_pcts.plot(kind='bar', stacked=True)
print t['Age'].hist(bins=50)
hist_age = t['Age'].hist(bins=50 , normed=True, color='red')
dens_age = t['Age'].plot(kind='kde' , color='blue')
print hist_age.set_xlim((0,70))
print sur_count
print "\n"
print pd.crosstab(t.Survived, t.Sex, margins=True)
print "\n"
print pd.crosstab(t.Survived, t.Pclass, margins=True)
print "\n"
print t.pivot_table(['Age'], index=['Survived','Sex'], columns='Pclass',margins=True)
spct = np.round((sur_count / sur_count.sum() * 100))
print sur_count, "\n","\n", spct
print spct.plot(kind='bar')
sg = pd.DataFrame(pd.crosstab(t.Survived, t.Sex))
sg2 = sg / float(sur_count.sum())
print sg2.plot(kind='bar')
g = sns.FacetGrid(t, row="Sex", col="Survived", margin_titles=True, row_order=["female", "male"])
age_bins = np.arange(0, 70, 5)
g.map(plt.hist, 'Age', bins=age_bins, color="steelblue")
age_gr = t['Age'].groupby([t['Sex'], t['Survived']]).mean().unstack()
print (age_gr)
s1 = t[t.Survived.isin([1])]
s0 = t[t.Survived.isin([0])]
#refuses to subset
t2 = t.loc[t.Age.notnull()]
age_gr2 = t2.Age.groupby([t2.Sex, t2.Survived]).groups
age_gr2
age_list = np.array(t['Age']) #need to load into np.array, and indexes look at the original t - dataframe!
m1 = age_list[age_gr2['male',1]]
m0 = age_list[age_gr2['male',0]]
f1 = age_list[age_gr2['female',1]]
f0 = age_list[age_gr2['female',0]]
print ("First Anova test for the survivors.")
print sp.stats.f_oneway(m1,f1)
print ("No significant difference for survivors.")
print "\n"
print sp.stats.f_oneway(m0,f0)
print ("But significant for the dead - the men who died were older than the women who died.")
print ("Create a frequency table first.")
freq01 = pd.crosstab(t.Sex, t.Survived, margins=True)
print freq01
freq01t = sp.stats.chi2_contingency(freq01)
print "\n"
print ("Here are the expected values.")
print freq01t[3]
print "\n"
print ("Here is the pvalue.")
print freq01t[1]
print ("Yes, definitely significant difference in survival between the two sexes.")
t3 = t.loc[t.Minor==True]
freq02 = pd.crosstab(t3.Sex, t3.Survived, margins=True)
print freq02
print "\n"
print ("Here are the expected values.")
freq02t = sp.stats.chi2_contingency(freq02)
print freq02t[3]
print "\n"
print ("Here is the pvalue.")
print freq02t[1]
print ("Yes, also significant.")
print "\n"
print ("Calculate Survived percentages by gender.")
fs = round((freq02.ix['female'][1]) / (freq02.ix['female']['All'] * 1.00) * 100)
ms = round((freq02.ix['male'][1]) / (freq02.ix['male']['All'] * 1.00) * 100)
print "Survival for women was at",fs,"percent."
print "Survival for men was at",ms,"percent."
g3 = sns.FacetGrid(t3, row="Sex", col="Survived", margin_titles=True, row_order=["female", "male"])
age_bins3 = np.arange(0, 18, 5)
g3.map(plt.hist, 'Age', bins=age_bins3, color="steelblue")
sgm = pd.DataFrame(pd.crosstab(t3.Survived, t3.Sex))
sgm2 = sg / float(sur_count.sum())
print sgm2.plot(kind='bar')
print ("Overall linear probability plot.")
sns.regplot(x='Age', y='Survived', data=t, y_jitter=0.1, lowess=True)
sns.despine()
tm = t.loc[t.Sex=='male']
tf = t.loc[t.Sex=='female']
print ("For females")
sns.regplot(x='Age', y='Survived', data=tf, y_jitter=0.1, lowess=True)
sns.despine()
print ("For males")
sns.regplot(x='Age', y='Survived', data=tm, y_jitter=0.1, lowess=True)
sns.despine()
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
res = pd.stats.api.ols(y=t2.Survived, x=t2[['Age','Pclass']])
print res
res2 = pd.stats.api.ols(y=t2.Survived, x=t2[['Age','Pclass','female']])
print res2
print ("Check trained model coefficients - Age, Pclass, Female.")
logreg = linear_model.LogisticRegression()
Yvar = t2[['Survived']]
Xvar2 = t2[['Age','Pclass','female']]
log02 = logreg.fit(Xvar2, Yvar.values.ravel())
print(log02.coef_)
print ("The same signs as the linear model. That's good.")
print ("Need to get exponentiate the parameters to get the odds ratios.")
coefs=log02.coef_
#not vectorized fn?
print np.exp(coefs)
sur_count.plot(kind='bar',title='Survival Distribution',legend=True,color='grey')
#use the plt fn from matplotlib for additional customization
plt.ylabel('Frequency (Count)')
plt.xlabel('Survived (0=no, 1=yes')
sex_count.plot(kind='bar',title='Gender distribution',legend=True,color='pink')
#use the plt fn from matplotlib for additional customization
plt.ylabel('Frequency (Count)')
print ("Here are the actual percentages for the charts above.")
print sur_count, "\n","\n", np.round(sur_count / sur_count.sum() * 100)
print "\n"
print sex_count, "\n","\n", np.round(sex_count / sex_count.sum() * 100)
g = sns.FacetGrid(t, row="Sex", col="Survived", margin_titles=True, row_order=["female", "male"])
age_bins = np.arange(0, 70, 5)
g.map(plt.hist, 'Age', bins=age_bins, color="steelblue")
plt.ylabel('Frequency (Count)')
plt.xlabel('Age (Years)')
plt.title('Distribution of Age by Gender and Survival')
plt.show()
#Hmm - title is only for the last chart? oh, well
g = sns.FacetGrid(t, col="Sex", margin_titles=True)
g.map(sns.regplot, "Age", "Survived", lowess=True, y_jitter=.1);
Issues: What we have is an incomplete data set - obviously there were more than 891 passengers on Titanic. We can make no assumptions about whether we got a random sample or a biased one. Therefore we cannot really make any general conclusions but we can speculate about the results.
There is missing data for Age. I did no imputations because I cannot make any assumptions whether the values are MCAR (missing completely at random). Those observations were just dropped by the R’s procedures when Age variable was invovled.
My exploratory data analysis suggested a link between survival and gender, so I pursued this and conducted statistical tests. Chi-squared test of the frequency of survival based on gender (Sex) returned a very low p-value meaning this pattern could not have occurred by chance.
Both the linear and logistic probability models returned negative estimates for being male on survival. The odds ratio of survival of being female comparing to being a male was 11.2. This means that females were 11 times more likely to survive that males.
This pattern holds even for minors (< 18 years old) - females were still more likely to survive. This seems to suggest that in the good old saying “women and children first” boys of 15 years or older do not count as “children.”
Next steps? In the future I could review other tragic events and disasters and analyze factors that influence the probability of survival.
'''
Data source: https://www.kaggle.com/c/titanic/data
VARIABLE DESCRIPTIONS:
survival Survival
(0 = No; 1 = Yes)
pclass Passenger Class
(1 = 1st; 2 = 2nd; 3 = 3rd)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare
cabin Cabin
embarked Port of Embarkation
(C = Cherbourg; Q = Queenstown; S = Southampton)
'''
;