In this first post I will analyze the classic diamonds dataset.
The purpose of this analysis is to predict the price of diamonds based on their characteristics.
To do so, I will carry out exploratory data analysis, feature engineering and feature standardization, and I will test several regression models: linear regression, decision trees, random forests, etc.
In [1]:
#DIAMONDS PRICE
#Import libraries
import numpy as np
import pandas as pd
from math import *
import sys
!{sys.executable} -m pip install missingno
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Visualisation Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pylab as pylab
import missingno as msno
plt.style.use( 'ggplot' )
%matplotlib inline
#Import preprocessing libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer, LabelEncoder, Normalizer, scale
#Import model selection and feature selection libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score
# Regression libraries
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
#Import classification libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
#Import evaluation metrics
# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score, mean_absolute_error
# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
In [2]:
#Importing the data set
df = pd.read_csv(r'C:\Users\Hector\datasetsblog\diamonds.csv')
diamonds = df.copy()
In [3]:
#we drop a column that is used as index
diamonds.drop(['Unnamed: 0'],inplace=True, axis=1)
#Exploring the data set
diamonds.head(5)
Out[3]:
In [4]:
diamonds.shape
Out[4]:
In [5]:
diamonds.info()
#there aren't any NaN in the data set
df.isnull().sum()
Out[5]:
In [6]:
#checking for nulls
# percentage of nan
# number of nan
msno.matrix(diamonds);
msno.bar(diamonds);
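The comments above mention the number and percentage of NaN values, but the cell only draws the missingno charts. Here is a minimal sketch of how those figures could be computed, assuming the same diamonds DataFrame; it is an illustrative addition, not part of the original analysis.
# sketch: number and percentage of NaN values per column
nan_counts = diamonds.isnull().sum()
nan_pct = (diamonds.isnull().sum() / len(diamonds) * 100).round(2)
pd.DataFrame({'n_missing': nan_counts, 'pct_missing': nan_pct})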
In [7]:
diamonds.describe()
Out[7]:
In [8]:
#there are some values that are wrong, such as those that are 0 for x, y, z
#the dimensions cannot be 0
df.loc[(df['x']==0) | (df['y']==0) | (df['z']==0)]
Out[8]:
In [9]:
len(df[(df['x']==0) | (df['y']==0) | (df['z']==0)])
# there are 20 records that have 0 in their dimensions
# we will need to drop these values
Out[9]:
In [10]:
# dropping the records with zero dimensions from both data frames
df = df[(df[['x','y','z']] != 0).all(axis=1)]
diamonds = diamonds[(diamonds[['x','y','z']] != 0).all(axis=1)]
In [11]:
# Lets start with the Exploratory data analysis
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['price'].dropna(), bins = 100);
plt.xlabel('price'); plt.ylabel('Number of diamonds');
plt.title('Diamonds price Distribution');
# we can see that very few diamonds are of very high value
In [50]:
#let's analyze how carat is distributed
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['carat'].dropna(), bins = 100);
plt.xlabel('carat'); plt.ylabel('Number of diamonds');
plt.title('Diamonds carat Distribution');
# we can see that very few diamonds have a very high carat
In [64]:
#let's analyze how price varies with carat, and let's also take a look at their distributions
g=sns.JointGrid(x='carat',y='price',data=diamonds,size=10)
g=g.plot(sns.regplot,sns.distplot);
# we can see that very few diamonds have big carats, and that those are priced higher
# Nevertheless the relation is not linear. High-carat diamonds are extremely rare, and thus
# they receive very high prices. A quick log-log check follows below.
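To illustrate the claim that the relation is not linear, one quick check (an illustrative sketch, not part of the original notebook) is to plot price against carat with both axes on a logarithmic scale; an approximately straight cloud there points to a power-law rather than a linear relation.
# sketch: price vs carat on a log-log scale
plt.figure(figsize=(12,10))
plt.scatter(diamonds['carat'], diamonds['price'], s=5, alpha=0.3)
plt.xscale('log'); plt.yscale('log')
plt.xlabel('carat (log scale)'); plt.ylabel('price (log scale)')
plt.title('Price vs carat, log-log scale');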
In [12]:
# let's analyze how the price varies with the clarity
# Create a list of clarity types
types = diamonds.dropna(subset=['clarity'])
types = types['clarity'].value_counts()
types = list(types.index)
# Plot of distribution of prices for clarity categories
plt.figure(figsize=(12,10))
# Plot each clarity
for b_type in types:
    subset = diamonds[diamonds['clarity'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20);
plt.title('Density Plot of diamonds by Clarity Type', size = 28);
#we can see that diamonds whose clarity is rarer have greater value in the market
In [13]:
# Create a list of colors types
types = diamonds.dropna(subset=['color'])
types = types['color'].value_counts()
types = list(types.index)
# Plot of distribution of prices for color categories
plt.figure(figsize=(12,10))
# Plot each color
for b_type in types:
    subset = diamonds[diamonds['color'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20);
plt.title('Density Plot of diamonds by Color Type', size = 28);
#we can see that those diamonds whose color is rare have higher value in the market
In [14]:
# Create a list of cut types
types = diamonds.dropna(subset=['cut'])
types = types['cut'].value_counts()
types = list(types.index)
# Plot of distribution of prices for cut categories
plt.figure(figsize=(12,10))
# Plot each cut
for b_type in types:
    subset = diamonds[diamonds['cut'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20)
plt.ylabel('Density', size = 20)
plt.title('Density Plot of diamonds by Cut Type', size = 28);
#we can see that the effect of cut on the price may depend on other parameters, as some fair-cut diamonds
#reach higher prices despite having a fair cut instead of premium or ideal
In [15]:
#let's see how the price is distributed by color
plt.figure(figsize=(12,10))
sns.boxplot(x='color',y='price',data=diamonds)
plt.xlabel('color', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by color Type', size = 28);
# clearly I and J colors are priced more highly
In [16]:
#let's see how the price is distributed by cut
plt.figure(figsize=(12,10))
sns.boxplot(x='cut',y='price',data=diamonds)
plt.xlabel('cut', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by cut Type', size = 28);
# we can see that the cut has less influence on the price.
In [17]:
#let's see how the price is distributed by clarity
plt.figure(figsize=(12,10))
sns.boxplot(x='clarity',y='price',data=diamonds)
plt.xlabel('clarity', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by clarity Type', size = 28);
# we can see that the clarity has a big influence on the price.
In [18]:
# let's analyze how the variables are correlated with the price and with each other
plt.figure(figsize=(12,10))
sns.heatmap(diamonds.corr(),cmap='coolwarm',annot=True);
plt.title('Correlations', size = 28);
# in order of importance we have: carat and then the dimensions x, y, z
# we can see that the dimensions are very correlated with each other and with the carat
# to avoid collinearity we will later combine x, y and z into a single volume feature
# we should use one-hot encoding to incorporate categorical features into this chart
In [19]:
correlations_data = diamonds.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data)
Out[19]:
In [20]:
# let's do one-hot encoding for the categorical variables and
# rerun the correlation matrix
# One-hot encode
# let's split into categorical and non-categorical columns
categorical_subset = diamonds[['color', 'cut','clarity']]
numeric_subset=diamonds[['carat','depth','table','price','x','y','z']]
categorical_subset = pd.get_dummies(categorical_subset)
# let's join the categorical and non-categorical subsets
# Make sure to use axis = 1 to perform a column bind
diamonds_encoded = pd.concat([numeric_subset, categorical_subset], axis = 1)
In [21]:
plt.figure(figsize=(12,10))
sns.heatmap(diamonds_encoded.corr(),cmap='coolwarm',annot=False);
plt.title('Correlations', size = 28);
#we can see that color I, clarity SI2 and premium cut have some influence on the price
In [22]:
#top 10 variables by correlation with the price
correlations_data2 = diamonds_encoded.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data2).head(10)
Out[22]:
In [23]:
#Now let's explore the relationships between pairs of variables
#Relationship between price, carat and clarity
sns.lmplot(x='carat',y='price',data=diamonds,hue='clarity',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat', size = 28);
# we can see that the relationship between price and carat is not linear.
# Diamonds with high carat are extremely rare, and so their price is higher in the market.
# There is also a big influence of the clarity on the price, with SI2 being the most expensive one.
In [24]:
#Now let's explore the relationships between pairs of variables
#Relationship between price, carat and color
sns.lmplot(x='carat',y='price',data=diamonds,hue='color',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat and color', size = 28);
# Here again we see the strong dependence of price on carat. Color also has a big influence.
# Colors D and E are the most appreciated colors in diamonds.
In [25]:
# let's now draw some pair plots to see the correlations and distributions
# between other important variables
# we have already seen that the dimensions x, y, z are strongly correlated with each other.
# they are also strongly correlated with the carat, because the carat is the weight and
# the three dimensions multiplied by each other give an approximation of the volume
# Let's now study the shape of the diamond. The shape is critical because a diamond should capture
# the light and increase its brilliance when the light hits its facets.
# Two characteristics are therefore critical: the table, which is the upper surface that lets the light
# into the diamond, and the depth of the diamond itself.
# Extract the columns to plot
pairplot_data = diamonds[['price','carat','depth','table']]
# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes, size = 20)
g=sns.PairGrid(pairplot_data,size = 3)
g.map_diag(plt.hist,color = 'red')
g.map_upper(plt.scatter,alpha = 0.6,color = 'red')
g.map_lower(sns.kdeplot,cmap = plt.cm.Reds)
g.map_lower(corr_func);
In [26]:
# we can see that the carat, which is the weight, is highly correlated with the price: the higher the weight,
# the higher the price.
# we can also see that the depth is inversely correlated with the price. The bigger the depth, the lower the
# brilliance of a diamond and thus the lower the price.
# when it comes to the table, which is the horizontal surface of the diamond, it seems to follow a normal
# distribution with very short tails. This parameter seems to be almost fixed: the surface should be big enough
# to let the light in and increase the reflection of the light.
In [27]:
# I am going to do some feature engineering now.
# as we have seen, x, y and z are the dimensions and are highly correlated.
# as the carat represents the weight, it would be good to create a new variable for the volume
# and then drop the three dimensions to avoid collinearity
diamonds_encoded['volume']=diamonds_encoded['x']*diamonds_encoded['y']*diamonds_encoded['z']
diamonds_encoded.drop(['x','y','z'], axis=1, inplace= True)
In [28]:
diamonds_encoded.head(5)
Out[28]:
In [29]:
#TRAINING DIFFERENT MODELS AND COMPARING THEM
#before training the models we need to divide the data set into train and test
#Dividing the data into training and test
#I will use 75% of data for training and 25% for testing
y = diamonds_encoded['price']
X = diamonds_encoded.drop(['price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = 0.25,
random_state = 33)
In [69]:
# before applying the different models, we need to create a baseline
# baselines are used to compare the different models, and our models
# should beat this threshold to be accepted
# For regression problems, a reasonable naive baseline is to guess the
# median value of the target on the training set for all the examples in the test set.
# I will use the MAE (mean absolute error) as the metric
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))
baseline_guess = np.median(y_train)
print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
In [30]:
# As we don't have nulls we will skip the imputation step
# if we had had nulls I would have used an imputer from scikit-learn
# Applying Feature Scaling (StandardScaler)
# some algorithms such as k-nearest neighbours and support vector machines depend on the distance between features
# as we have features on different scales (prices in the thousands, dimensions in the tens), we
# need to apply feature scaling to later compare the accuracy of the different algorithms
# We will apply the standardization method, in which we turn every feature into a standard distribution
# with mean 0 and standard deviation 1 (subtracting the mean and dividing by the standard deviation)
# You could also apply MinMaxScaler (normalization), which puts every feature into the range 0 to 1
# by subtracting the minimum value and dividing by the range (maximum minus minimum);
# a sketch of that alternative follows below
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# we are now ready to go and implement the algorithms
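As mentioned in the comments above, imputation and MinMaxScaler are alternatives that this dataset did not need. A minimal sketch of both follows, assuming the older scikit-learn API already imported at the top (where Imputer still exists); it would replace the StandardScaler step above rather than run after it.
# sketch: alternative preprocessing (not used in this analysis)
imp = Imputer(strategy='mean')       # fill missing values with the column mean
X_train_alt = imp.fit_transform(X_train)
X_test_alt = imp.transform(X_test)
mms = MinMaxScaler()                 # scale every feature into the [0, 1] range
X_train_alt = mms.fit_transform(X_train_alt)
X_test_alt = mms.transform(X_test_alt)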
In [44]:
#I will use the MAE (mean absolute error) as a metric for the regressions and also to compare different algorithms
#I will use R2 to compare all the models; an optional helper wrapping these metrics is sketched below
# Collect all R2 Scores.
R2_Scores = []
models = ['Linear Regression' , 'Lasso Regression' , 'AdaBoost Regression' , 'Ridge Regression' , 'GradientBoosting Regression',
'RandomForest Regression' ,
'KNeighbours Regression']
MAE_Scores=[]
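Each of the model cells below repeats the same block of metric calls. As an optional sketch (the helper name evaluate_model is my own, not from the original notebook), the pattern can be wrapped once and reused; the cells below keep the original explicit style.
def evaluate_model(name, model, X_test, y_test):
    # predict on the test set and print the usual regression metrics
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print('####### %s #######' % name)
    print('MSE  : %0.2f' % mse)
    print('MAE  : %0.2f' % mean_absolute_error(y_test, y_pred))
    print('RMSE : %0.2f' % (mse ** 0.5))
    print('R2   : %0.2f' % r2)
    return r2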
In [32]:
#LINEAR REGRESSION
#https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
#https://scikit-learn.org/stable/modules/linear_model.html#lasso
clf_lr = LinearRegression()
clf_lr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_lr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_lr.predict(X_test)
print('')
print('####### Linear Regression #######')
print('Score : %.4f' % clf_lr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [83]:
#we will take a look at the residuals
Residuals=y_test-y_pred
Residuals.to_frame(name='Residuals').hist(bins=100,figsize=(12,10));
#they seem normally distributed although the positive tail seems bigger
#the model seems to underestimate the price for some diamonds
In [33]:
#LASSO REGRESSION
clf_la = Lasso(normalize=True)
clf_la.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_la, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_la.predict(X_test)
print('')
print('###### Lasso Regression ######')
print('Score : %.4f' % clf_la.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [34]:
#ADABOOST REGRESSION
clf_ar = AdaBoostRegressor(n_estimators=1000)
clf_ar.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_ar, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_ar.predict(X_test)
print('')
print('###### AdaBoost Regression ######')
print('Score : %.4f' % clf_ar.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [35]:
#RIDGE REGRESSION
clf_rr = Ridge(normalize=True)
clf_rr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rr.predict(X_test)
print('')
print('###### Ridge Regression ######')
print('Score : %.4f' % clf_rr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [36]:
#GRADIENT BOOSTING REGRESSION
clf_gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls',verbose = 1)
clf_gbr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_gbr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_gbr.predict(X_test)
print('')
print('###### Gradient Boosting Regression #######')
print('Score : %.4f' % clf_gbr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [37]:
#RANDOM FOREST
clf_rf = RandomForestRegressor()
clf_rf.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rf.predict(X_test)
print('')
print('###### Random Forest ######')
print('Score : %.4f' % clf_rf.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
In [38]:
#TUNING PARAMETERS
no_of_test=[100]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='r2')
clf_rf.fit(X_train,y_train)
print('Score : %.4f' % clf_rf.score(X_test, y_test))
pred=clf_rf.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [39]:
#KNEIGHBOURS REGRESSION
clf_knn = KNeighborsRegressor()
clf_knn.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_knn, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_knn.predict(X_test)
print('')
print('###### KNeighbours Regression ######')
print('Score : %.4f' % clf_knn.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
In [40]:
#TUNING PARAMETERS
n_neighbors=[]
for i in range(0, 50, 5):
    if i != 0:
        n_neighbors.append(i)
params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]}
clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='r2')
clf_knn.fit(X_train,y_train)
print('Score : %.4f' % clf_knn.score(X_test, y_test))
pred=clf_knn.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [41]:
#VISUALIZING R2 SCORES
compare = pd.DataFrame({'Algorithms' : models , 'R2-Scores' : R2_Scores})
compare.sort_values(by='R2-Scores' ,ascending=False)
Out[41]:
In [92]:
plt.figure(figsize=(12,10))
sns.barplot(x='R2-Scores' , y='Algorithms' , data=compare);
plt.title('Models comparison', size = 28);
In [43]:
sns.factorplot(x='Algorithms', y='R2-Scores' , data=compare, size=6 , aspect=4);
plt.title('Models comparison by R2 score', size = 28);
In [94]:
!jupyter nbconvert --to html "Diamonds Price.ipynb"