data analysis using pandas practical code 3
Dataset Link
Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Data Collection and Data Processing
# Location of the tips dataset on the local machine.
TIPS_CSV_PATH = "D:/tips.csv"

# Load the CSV into the frame that every later step works on.
df = pd.read_csv(TIPS_CSV_PATH)

# Preview the first rows (shown when evaluated interactively).
df.head()
Gathering some information
# Quick structural overview of `df`. Apart from df.info(), which prints its
# report itself, these are bare expressions: their values are only shown when
# each one is evaluated interactively (e.g. as the last line of a notebook cell).

# (number of rows, number of columns)
df.shape
# Prints column names, non-null counts, dtypes and memory usage.
df.info()
# Summary statistics of the numeric columns.
df.describe()
# Column labels.
df.columns
# Data type of each column.
df.dtypes
# Number of distinct values in each column.
df.nunique()
Checking for missing values
# Percentage of missing values per column, highest first.
# isnull().sum() counts the missing entries column by column; dividing by the
# row count (df.shape[0]) and scaling by 100 turns the counts into percentages.
missing_pct = df.isnull().sum() / df.shape[0] * 100

missingPercentageDataset = pd.DataFrame(
    {
        "Feature Name": missing_pct.index,
        "Percentage of missing values": missing_pct.values,
    }
).sort_values(by="Percentage of missing values", ascending=False)

# Lift the row limit so the whole table is displayed. This is a global pandas
# display option and stays in effect for the rest of the session.
pd.set_option("display.max_rows", None)
missingPercentageDataset
Let's plot a heatmap to see the correlation between the numeric columns
# Correlation heatmap of the numeric columns.
# `df` also holds string columns (e.g. "sex", "day", "time"); on pandas >= 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so the
# numeric columns are selected explicitly before correlating.
corr = df.select_dtypes(include="number").corr()

# `linewidths` is seaborn's documented parameter for the gap between cells.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=0.7,
    annot=True,
)
plt.show()
Let's create a pair plot
# Pairwise scatter plots / histograms of the columns seaborn can plot from `df`.
sns.pairplot(df)
# Render the figure explicitly, as every other plotting step in this file does,
# so the plot also appears when the code is run as a plain script.
plt.show()
Let's compute the average bill, tip and size for male and female customers
# Mean of the bill, tip and size columns, one row per value of "sex".
mean_columns = ["total_bill", "tip", "size"]
df1 = df.groupby(by=["sex"])
df2 = df1[mean_columns].mean()
df2
Let's compute the maximum bill, tip and size for male and female customers
# Maximum of the bill, tip and size columns, one row per value of "sex".
max_columns = ["total_bill", "tip", "size"]
df1 = df.groupby(by=["sex"])
df2 = df1[max_columns].max()
df2
Let's compute the minimum bill, tip and size for male and female customers
# Minimum of the bill, tip and size columns, one row per value of "sex".
min_columns = ["total_bill", "tip", "size"]
df1 = df.groupby(by=["sex"])
df2 = df1[min_columns].min()
df2
Let's see the top 5 largest values of the total_bill, tip, and size columns for male customers
# Top 5 values of total_bill, tip and size among the rows where sex == "Male".
male = df.sex == "Male"
male_dt = df.loc[male, ["total_bill", "tip", "size"]]

# nlargest is applied to each column independently, so row k of the result
# holds the k-th largest value of every column; the values in one row do not
# have to come from the same original record.
# The section is about the "top 5", so 5 values are taken (the code used 6).
ml = {column: list(male_dt[column].nlargest(5)) for column in male_dt.columns}
male_largest = pd.DataFrame(ml)
male_largest
Let's see the top 5 largest values of the total_bill, tip, and size columns for female customers
# Top 5 values of total_bill, tip and size among the rows where sex == "Female".
# The results get their own `female*` names; previously they were stored under
# the `male*` names and overwrote the table computed for male customers.
female = df.sex == "Female"
female_dt = df.loc[female, ["total_bill", "tip", "size"]]

# nlargest is applied to each column independently, so row k of the result
# holds the k-th largest value of every column; the values in one row do not
# have to come from the same original record.
# The section is about the "top 5", so 5 values are taken (the code used 6).
fl = {column: list(female_dt[column].nlargest(5)) for column in female_dt.columns}
female_largest = pd.DataFrame(fl)
female_largest
Let's create a bar plot to see on which day the number of customers is highest (counting the `time` entries recorded for each day)
# Number of non-null "time" entries recorded for each day.
df1 = df.groupby(["day"])
df2 = df1[["time"]].count()
df2

# Take the bar labels and heights from the computed counts instead of typing
# them in by hand, so the chart always matches the data that was loaded.
# Bars appear in the order of the groupby result (sorted day labels).
days = list(df2.index)
time = list(df2["time"])

plt.bar(days, time, width=0.5, color="cyan", edgecolor="blue", linewidth=2, linestyle="-")
plt.title("day vs time count")
plt.xlabel("Days")
plt.ylabel("Time")
plt.show()
Let's see who (male or female) paid the bill most of the time
# Number of non-null "total_bill" entries recorded for each sex.
df1 = df.groupby(["sex"])
df2 = df1[["total_bill"]].count()
df2

# Take the bar labels and heights from the computed counts instead of typing
# them in by hand, so the chart always matches the data that was loaded.
# Dedicated names are used so the day-level `days` / `time` lists are not
# overwritten with per-sex values.
sexes = list(df2.index)
bill_counts = list(df2["total_bill"])

plt.bar(sexes, bill_counts, width=0.5, color="cyan", edgecolor="blue", linewidth=2, linestyle="-")
plt.title("Sex vs Total bill")
plt.xlabel("Sex")
plt.ylabel("Total bill")
plt.show()
Let's see how the size varies by day and time
# Bar plot of "size" for each "time" value, with one bar per "day" (hue).
sns.barplot(
    x="time",
    y="size",
    data=df,
    hue="day",
    alpha=0.6,
    # Line width is a number of points; it was previously passed as the
    # string "4", which only worked through an implicit conversion.
    linewidth=4,
    linestyle=":",
)
plt.title("Bar plot", fontsize=15)
plt.xlabel("Time", fontsize=12)
plt.ylabel("Size", fontsize=15)
plt.show()