data analysis using pandas practical code 2
Dataset Link
Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Data Collection and Data Processing
# Load the market sales CSV into the DataFrame used by the rest of the analysis.
# NOTE(review): the absolute "D:/" path is machine-specific — adjust it to
# wherever the dataset is stored before running.
df = pd.read_csv("D:/market sales.csv")
# Preview the first rows to confirm the file was parsed as expected.
df.head()
Gathering some information
# Quick structural overview of the frame. Apart from df.info(), which prints
# its summary itself, these are bare expressions: their values are displayed
# only when each one is evaluated as the last statement of a notebook cell.
# Dimensions of the frame as (rows, columns).
df.shape
# Print the per-column summary.
df.info()
# Descriptive statistics for the frame's columns.
df.describe()
# Column labels.
df.columns
# Data type of each column.
df.dtypes
# Number of distinct values in each column.
df.nunique()
Checking For missing values
# Percentage of rows with a missing value, computed for every column at once.
missing_share = df.isnull().sum() / df.shape[0] * 100

# Two-column report (feature name, percentage of missing values), ordered so
# that the most incomplete features come first.
missingPercentageDataset = pd.DataFrame(
    {
        "Feature Name": list(missing_share.index),
        "Percentage of missing values": list(missing_share),
    }
).sort_values(by="Percentage of missing values", ascending=False)

# Remove the row-display limit so the whole report is shown.
pd.set_option("display.max_rows", None)
missingPercentageDataset
Data Cleaning
For simplicity, we just drop the columns that contain NaN values.
# Remove, in place and one after the other, the two columns this analysis
# discards instead of imputing.
for sparse_column in ("Outlet_Size", "Item_Weight"):
    df.drop(columns=[sparse_column], inplace=True)
Let's plot a heatmap to see correlation between data
# Correlate the numeric columns only. The frame still contains text columns
# at this point, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
corr = df.select_dtypes(include="number").corr()

# Annotated heatmap of the correlation matrix; `linewidths` is seaborn's
# documented keyword for the width of the lines separating the cells.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=0.7,
    annot=True,
)
plt.show()
Let's create a pair plot
# Draw seaborn's pair plot for the cleaned frame.
sns.pairplot(df)
Let's see the unique values of all categorical columns
# Labels of the columns stored with the generic ``object`` dtype.
categorical_column = df.select_dtypes(include="object").columns
categorical_column

# Distinct values of each categorical column, in the same order as
# `categorical_column`.
unique_li = [list(df[column].unique()) for column in categorical_column]

# Map every categorical column to its distinct values. Pairing names and value
# lists with zip covers all columns; the earlier index-based loop started at 1
# and therefore left the first categorical column out of the mapping.
un_name = dict(zip(categorical_column, unique_li))
un_name
Let's see which item types contain low-fat and regular-fat products
Regular fat
# Boolean mask of the rows whose fat content is exactly "Regular".
# NOTE(review): only this exact label is matched — confirm the column holds
# no variant spellings that should be counted as well.
regular_ft = df["Item_Fat_Content"] == "Regular"

# Item types of the matching rows, kept as a one-column frame.
rf_dt = df.loc[regular_ft, ["Item_Type"]]

# Distinct item types in order of first appearance: dict keys deduplicate
# while preserving insertion order.
regular_fat_item = list(dict.fromkeys(rf_dt["Item_Type"]))
regular_fat_item
Low fat
# Boolean mask of the rows whose fat content is exactly "Low Fat".
# NOTE(review): only this exact label is matched — confirm the column holds
# no variant spellings that should be counted as well.
low_Fat = df["Item_Fat_Content"] == "Low Fat"

# Item types of the matching rows, kept as a one-column frame.
lf_dt = df.loc[low_Fat, ["Item_Type"]]

# Distinct item types in order of first appearance: dict keys deduplicate
# while preserving insertion order.
low_fat_item = list(dict.fromkeys(lf_dt["Item_Type"]))
low_fat_item
Let's see the average, maximum, minimum, and count of outlet sales for each item type
# Group the rows by item type, then summarise Item_Outlet_Sales per group.
df1 = df.groupby(["Item_Type"])
df2 = df1["Item_Outlet_Sales"].mean()
df3 = df1["Item_Outlet_Sales"].max()
df4 = df1["Item_Outlet_Sales"].min()
df5 = df1["Item_Outlet_Sales"].count()

# One column per statistic, indexed by item type. "count" must come from df5;
# it was previously filled from df4 and so repeated the minimum.
dfd = {"mean": df2, "max": df3, "min": df4, "count": df5}
sells_info = pd.DataFrame(dfd)
sells_info
Let's create a bar chart of Item_Type and Item_Outlet_Sales
# Bar chart of outlet sales per item type.
ax = sns.barplot(x="Item_Type", y="Item_Outlet_Sales", data=df)

# Rotate the tick labels that seaborn derives from the data so the long
# category names stay readable. Overwriting them with a hand-typed list would
# mislabel the bars whenever the category order in the data differs.
ax.tick_params(axis="x", labelrotation=90)

plt.xlabel("Item_Type")
plt.ylabel("Item_Outlet_Sales")
plt.show()
Getting Dataset