#importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#some visualization settings
%matplotlib inline
sns.set_style('darkgrid')


#loading the dataset
no_show = pd.read_csv('noshowappointments-kagglev2-may-2016.csv')
no_show.head()


#no_show.info()


no_show.shape

(110527, 14)


no_show.isnull().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


no_show.describe()


#histogram of dataframe
no_show.hist(figsize=(12, 12), color="r");


#converting titles to lowercase, and renaming the no-show column to follow the python naming convention
no_show.columns = no_show.columns.str.lower()
no_show.rename(columns={'no-show':'no_show'}, inplace=True)
#fixing the spelling error in the 'hipertension' and 'handcap' columns
no_show.rename(columns={'hipertension':'hypertension'}, inplace=True)
no_show.rename(columns={'handcap':'handicap'}, inplace=True)


#getting an idea of the distribution of ages within the age column
no_show['age'].hist( figsize=(8, 8), color="g");


#getting all rows that have the age value set to any number less than 0
no_show.query('age < 0')


#dropping that row
no_show.drop(index= no_show.index[99832], inplace=True)
#confirming what we did
no_show.query('age < 0')


#checking for duplicate rows
no_show.duplicated().sum()

0


#confirming our changes
no_show.head(3)


#getting the total number of rows in the dataFrame
no_show.shape[0]

110526


no_show['appointmentday'].duplicated().sum()

110499


no_show['scheduledday'].duplicated().sum()

6978


(no_show['appointmentday'].duplicated().sum()) / len(no_show) * 100

99.97557135877531


(no_show['scheduledday'].duplicated().sum()) / len(no_show) * 100

6.313446609847456


no_show.drop(['appointmentid', 'patientid', 'neighbourhood', 'scheduledday', 'appointmentday'], axis=1, inplace=True)


#confirming that those columns have indeed been dropped.
no_show.head(3)


#broadly checking if our variables are quantitative or categorical
no_show.nunique()

gender            2
age             103
scholarship       2
hypertension      2
diabetes          2
alcoholism        2
handicap          5
sms_received      2
no_show           2
dtype: int64


#exploring the handicap column
no_show.query('handicap > 1')


#dropping those rows
no_show.drop(no_show.index[no_show['handicap'] > 1], inplace=True)
#confirming what we just did
no_show.query('handicap > 1')


no_show.head()


#confirming that the datatypes of all relevant columns are appropriate.
no_show.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110327 entries, 0 to 110526
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   gender        110327 non-null  object
 1   age           110327 non-null  int64 
 2   scholarship   110327 non-null  int64 
 3   hypertension  110327 non-null  int64 
 4   diabetes      110327 non-null  int64 
 5   alcoholism    110327 non-null  int64 
 6   handicap      110327 non-null  int64 
 7   sms_received  110327 non-null  int64 
 8   no_show       110327 non-null  object
dtypes: int64(7), object(2)
memory usage: 8.4+ MB


#Starting off with the visualization of the relationship between the 'gender' and the 'no_show' columns
ax = sns.countplot(x=no_show['gender'], hue=no_show['no_show'], palette='Set2')
ax.set_title('The relationship between gender and no-show')
xlabels=['Female', 'Male']
ax.set_xticklabels(xlabels)
plt.show()


# getting the total counts for no-show for both genders
gender_counts = no_show.groupby(['gender']).count()['no_show']
gender_counts

gender
F    71730
M    38597
Name: no_show, dtype: int64


#getting the total number of males in the dataFrame
male_count = no_show[no_show['gender'] == 'M']
male_count = no_show.query('gender == "M" ')
male_count = male_count['gender'].count()
male_count

38597


#getting the total number of females in the dataFrame
female_count = no_show[no_show['gender'] == 'F']
female_count = no_show.query('gender == "F" ')
female_count =female_count['gender'].count()
female_count

71730


#finding out the number of males that showed up 
show_male = no_show[no_show['gender'] == 'M']
show_male = no_show.query('gender == "M" and no_show == "No"')
show_male = show_male['no_show'].count()
show_male

30887


#finding out the number of males that didn't show up
no_show_male = no_show[no_show['gender'] == 'M']
no_show_male = no_show.query('gender == "M" and no_show == "Yes"')
no_show_male = no_show_male['no_show'].count()
no_show_male

7710


##finding out the number of females that showed up
show_female = no_show[no_show['gender'] == 'F']
show_female = no_show.query('gender == "F" and no_show == "No"')
show_female = show_female['no_show'].count()
show_female

57162


#finding out the number of females that didn't show up
no_show_female = no_show[no_show['gender'] == 'F']
no_show_female = no_show.query('gender == "F" and no_show == "Yes"')
no_show_female = no_show_female['no_show'].count()
no_show_female

14568


def no_show_percentage(show, no_show, total):
    #creating a function that finds out the no-show percentage
    show_percent = round(show / total * 100)
    no_show_percent = round(no_show / total * 100)
    return "show {}%, no-show {}% ".format(show_percent, no_show_percent)


#getting the percentages for males
male_no_show_percentage = no_show_percentage(show_male, no_show_male, male_count)
male_no_show_percentage

'show 80%, no-show 20% '


#getting the percentages for females
female_no_show_percentage = no_show_percentage(show_female, no_show_female, female_count)
female_no_show_percentage

'show 80%, no-show 20% '


#grouping the values in the age column into 3, and creating a variable that holds the age groups 
age_bins = pd.cut(no_show['age'], bins=3, labels=('young','middle-aged', 'old'))


#confirming that the no. of values in our 'age_bins' column tallies with the no. of rows in the dataFrame 
age_bins.count() == no_show['age'].count()

True


#countplot visualization using seaborn
ax = sns.countplot(x=age_bins, hue=no_show['no_show'])
ax.set_title('The relationship between age groups and No-show')
##x_tick_labels are not necessary here because our values represent labels for the 3 age groups
plt.show()


#a function to get the proportions of patients that showed up and those that didn't
def variable_proportions(column, DataFrame):
    TotalValues_forEachCategory = DataFrame.groupby([column])['no_show'].count()
    valuesFor_Show_noShow = DataFrame.groupby([column, 'no_show'])['no_show'].count()
    proportions = valuesFor_Show_noShow / TotalValues_forEachCategory
    return proportions


#a function to plot the proportions against the no_show column
def proportions_plot(proportions, column_name, color1, color2, xLabel_1, xLabel_2, main_xlabel):
    # Proportions come from get_propotion function.
    proportions.unstack().plot(kind='bar', color=[color1, color2]);
    plt.ylabel('proprtion of patients')
    plt.xlabel(main_xlabel);
    plt.title(f'The relationship between {column_name} and no-show', fontsize=15, )
    plt.legend(labels=['Show', 'no-show'], fontsize=16 , bbox_to_anchor=(1.05, 0.7));
    plt.xticks([0, 1], [xLabel_1, xLabel_2], rotation=0);


#getting the proportions
proportions = variable_proportions('scholarship', no_show)
#plotting
proportions_plot(proportions, 'scholarship', 'g', 'r', 'no', 'yes', 'scholarship')


#diabetes and no-show
#getting the proportions
proportions = variable_proportions('diabetes', no_show)
#plotting
proportions_plot(proportions, 'diabetes', 'g', 'r', 'no', 'yes', 'Diabetes')


#hypertension and no-show
#getting the proportions
proportions = variable_proportions('hypertension', no_show)
#plotting
proportions_plot(proportions, 'hypertension', 'g', 'r', 'no', 'yes', 'Hypertension')


#alcoholism and no-show
#getting the proportions
proportions = variable_proportions('alcoholism', no_show)
#plotting
proportions_plot(proportions, 'alcoholism', 'g', 'r', 'no', 'yes', 'Alcoholism')


#handicap and no-show
#getting the proportions
proportions = variable_proportions('handicap', no_show)
#plotting
proportions_plot(proportions, 'handicap', 'g', 'r', 'no', 'yes', 'Handicap')

	PatientId	AppointmentID	Gender	ScheduledDay	AppointmentDay	Age	Neighbourhood	Hipertension	Diabetes	No-show
0	2.987250e+13	5642903	F	2016-04-29T18:38:08Z	2016-04-29T00:00:00Z	62	JARDIM DA PENHA	1	0	No
1	5.589978e+14	5642503	M	2016-04-29T16:08:27Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	0	0	No
2	4.262962e+12	5642549	F	2016-04-29T16:19:04Z	2016-04-29T00:00:00Z	62	MATA DA PRAIA	0	0	No
3	8.679512e+11	5642828	F	2016-04-29T17:29:31Z	2016-04-29T00:00:00Z	8	PONTAL DE CAMBURI	0	0	No
4	8.841186e+12	5642494	F	2016-04-29T16:07:23Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	1	1	No

	PatientId	AppointmentID	Age	Scholarship	Hipertension	Diabetes	Alcoholism	Handcap	SMS_received
count	1.105270e+05	1.105270e+05	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000
mean	1.474963e+14	5.675305e+06	37.088874	0.098266	0.197246	0.071865	0.030400	0.022248	0.321026
std	2.560949e+14	7.129575e+04	23.110205	0.297675	0.397921	0.258265	0.171686	0.161543	0.466873
min	3.921784e+04	5.030230e+06	-1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	4.172614e+12	5.640286e+06	18.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	3.173184e+13	5.680573e+06	37.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	9.439172e+13	5.725524e+06	55.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
max	9.999816e+14	5.790484e+06	115.000000	1.000000	1.000000	1.000000	1.000000	4.000000	1.000000

Project: Investigate a Dataset - No-show Appointments¶

Table of Contents¶

Introduction¶

Let's begin our anaysis by importing all the packages that would aid in the exploration and visualization of our dataset.¶

Data wrangling¶

Some questions¶

Cleaning the dataFrame¶

Now, let's drop some irrelevant columns to reduce the noise in our dataFrame.¶

Exploratory data analysis¶

Henceforth, I'll be checking the relationship between column to uncover any trends/patterns that may enable us to be able to predict patients that may miss thier appointments in the future.¶

Let's begin by finding out the relationship between the 'gender' and the 'no_show' columns¶

Next, we would like to visualize the relationship between the ages of the patients and the 'no_show' columns¶

Let's create simple functions to help us get proportions of our variables and plot them against the no_show column¶

visualizing the correlation between the 'scholarship' and 'no_show' columns¶

visualizing any correlations between chronic medical status of patients and the 'no_show' column¶

Conclusion¶

Limitations¶

	gender	age	scholarship	hypertension	diabetes	alcoholism	handicap	sms_received	no_show
946	M	94	0	1	1	0	2	1	No
1665	M	64	0	1	0	1	2	1	No
1666	M	64	0	1	0	1	2	0	No
2071	M	64	0	1	0	1	2	0	No
2091	F	11	0	0	0	0	2	0	No
...	...	...	...	...	...	...	...	...	...
108376	F	44	0	1	1	0	2	1	No
109484	M	64	0	1	1	0	2	0	No
109733	F	34	0	0	0	0	2	1	No
109975	M	39	1	0	0	0	2	1	No
110107	F	44	0	0	0	0	2	1	No