Exploratory data analysis exercise 1
Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Getting Dataset
# Load the student performance CSV into a DataFrame.
# NOTE(review): absolute Windows path - the file must exist at D:/ for this to run.
df = pd.read_csv("D:/StudentsPerformance.csv")
Let's see the top 5 rows
# First rows of the table; head() returns 5 rows when no count is given.
df.head()
Let's see the last 5 rows
# Last rows of the table; tail() returns 5 rows when no count is given.
df.tail()
Let's gather some basic information
# Dimensions of the table as a (rows, columns) tuple.
df.shape
# Print a summary of the columns: names, non-null counts and dtypes.
df.info()
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()
# Labels of all columns.
df.columns
# Data type of each column.
df.dtypes
# Number of distinct values in each column.
df.nunique()
Checking for missing values
# Percentage of missing entries per column.
# isnull() marks every missing cell as True; the mean of a boolean column is
# the fraction of True values, so multiplying by 100 gives the percentage.
missing_percentage = df.isnull().mean() * 100

# One row per feature, with the most incomplete features listed first.
missingPercentageDataset = pd.DataFrame(
    {
        "Feature Name": list(missing_percentage.index),
        "Percentage of missing values": list(missing_percentage),
    }
).sort_values(by="Percentage of missing values", ascending=False)

# Lift the row display limit so that no feature is hidden when the table is shown.
pd.set_option("display.max_rows", None)
missingPercentageDataset
Let's plot a heatmap to see the correlation between the features
# Correlate only the numeric columns. The table also holds text columns
# (e.g. "gender"), and recent pandas versions raise an error on them
# instead of silently leaving them out.
corr = df.select_dtypes(include="number").corr()

# Annotated correlation matrix; `linewidths` is seaborn's documented keyword
# for the width of the lines that separate the cells.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=0.7,
    annot=True,
)
plt.show()
Let's create a pair plot
# Grid of pairwise relationship plots for the variables in df.
sns.pairplot(df)
Creating bar plots of the math, reading and writing scores by gender
# Draw the three gender comparisons side by side in one figure.
# The figure size has to be passed to plt.figure(); assigning a plain
# variable named `figsize` has no effect on matplotlib.
plt.figure(figsize=(15, 5))

score_columns = ["math score", "reading score", "writing score"]
for position, column in enumerate(score_columns, start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x="gender", y=column, data=df)
    plt.xlabel("Gender")
    # "math score" -> "Math Score", and likewise for the other columns.
    plt.ylabel(column.title())

# Show once, after all panels are drawn: calling plt.show() after each
# subplot finishes the figure early and scatters the panels over
# separate figures.
plt.tight_layout()
plt.show()
Let's see the maximum math, reading and writing scores for male and female students separately
# Split the rows by gender, then take the highest value of each score
# column within every group.
score_columns = ["math score", "reading score", "writing score"]
df1 = df.groupby(["gender"])
df2 = df1[score_columns].max()
df2
Let's see the mean math, reading and writing scores for male and female students separately
Male
# Boolean mask that is True for the rows of male students.
male = df["gender"] == "male"

# Column-wise average of the three score columns over the masked rows.
score_columns = ["math score", "reading score", "writing score"]
male_scores = df.loc[male, score_columns]
male_mean = male_scores.mean()
male_mean.head()
Female
# Boolean mask that is True for the rows of female students.
female = df["gender"] == "female"

# Column-wise average of the three score columns over the masked rows.
score_columns = ["math score", "reading score", "writing score"]
female_scores = df.loc[female, score_columns]
female_mean = female_scores.mean()
female_mean
Let's see the 5 largest values in the math, reading and writing score columns for male and female students
Male
# Rows of male students, restricted to the three score columns.
male = df["gender"] == "male"
male_dt = df.loc[male, ["math score", "reading score", "writing score"]]

# Take the 5 largest values of each column, as the section heading states.
# Every column is ranked on its own, so a row of the result does not
# describe one student: row k holds the k-th best score of each subject.
ml = {column: male_dt[column].nlargest(5).tolist() for column in male_dt.columns}
male_largest = pd.DataFrame(ml)
male_largest
Female
# Rows of female students, restricted to the three score columns.
female = df["gender"] == "female"
female_dt = df.loc[female, ["math score", "reading score", "writing score"]]

# Take the 5 largest values of each column, as the section heading states.
# Every column is ranked on its own, so a row of the result does not
# describe one student: row k holds the k-th best score of each subject.
fml = {column: female_dt[column].nlargest(5).tolist() for column in female_dt.columns}
fml_largest = pd.DataFrame(fml)
fml_largest