Diamonds price

In this first post I will analyze the classical Diamond´s dataset.

The purpose of this analysis is determining the price of diamonds based on their characteristics.

To do so, I will do a exploratory data analysis, features engineering, features standardization and I will test several regression models: linear regresions, decision trees, random forest etc

Diamonds Price
In [1]:
#DIAMONDS PRICE

#Import libraries
import numpy as np
import pandas as pd
from math import *
import sys
!{sys.executable} -m pip install missingno

# Ignore warnings 
import warnings
warnings.filterwarnings('ignore')

# Visualisation Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pylab as pylab
import missingno as msno
plt.style.use( 'ggplot' )
%matplotlib inline

#Import preprocessing libraries
from sklearn.preprocessing import MinMaxScaler , StandardScaler, Imputer, LabelEncoder


#Import feature selection libraries

from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score


# Regression libraries
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor 
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

#Import classification libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis

#Import evaluation metrics


# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score, mean_absolute_error 

# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score 
Requirement already satisfied: missingno in c:\users\hector\anaconda2\lib\site-packages
Requirement already satisfied: numpy in c:\users\hector\anaconda2\lib\site-packages (from missingno)
Requirement already satisfied: scipy in c:\users\hector\anaconda2\lib\site-packages (from missingno)
Requirement already satisfied: matplotlib in c:\users\hector\anaconda2\lib\site-packages (from missingno)
Requirement already satisfied: seaborn in c:\users\hector\anaconda2\lib\site-packages (from missingno)
Requirement already satisfied: six>=1.10 in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: backports.functools_lru_cache in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: pytz in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: cycler>=0.10 in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\users\hector\anaconda2\lib\site-packages (from matplotlib->missingno)
Requirement already satisfied: pandas in c:\users\hector\anaconda2\lib\site-packages (from seaborn->missingno)
You are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
In [2]:
#Importing the data set
df = pd.read_csv('C:\Users\Hector\datasetsblog\diamonds.csv')
diamonds = df.copy()
In [3]:
#we drop a column that is used as index
diamonds.drop(['Unnamed: 0'],inplace=True, axis=1)
#Exploring the data set
diamonds.head(5)
Out[3]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [4]:
diamonds.shape
Out[4]:
(53940, 10)
In [5]:
diamonds.info()
#there arent any nan in the data set

df.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
price      53940 non-null int64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB
Out[5]:
Unnamed: 0    0
carat         0
cut           0
color         0
clarity       0
depth         0
table         0
price         0
x             0
y             0
z             0
dtype: int64
In [6]:
#checking for nulls
# percentage of nan
# number of nan
msno.matrix(diamonds);
msno.bar(diamonds);
In [7]:
diamonds.describe()
Out[7]:
carat depth table price x y z
count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
mean 0.797940 61.749405 57.457184 3932.799722 5.731157 5.734526 3.538734
std 0.474011 1.432621 2.234491 3989.439738 1.121761 1.142135 0.705699
min 0.200000 43.000000 43.000000 326.000000 0.000000 0.000000 0.000000
25% 0.400000 61.000000 56.000000 950.000000 4.710000 4.720000 2.910000
50% 0.700000 61.800000 57.000000 2401.000000 5.700000 5.710000 3.530000
75% 1.040000 62.500000 59.000000 5324.250000 6.540000 6.540000 4.040000
max 5.010000 79.000000 95.000000 18823.000000 10.740000 58.900000 31.800000
In [8]:
#there are some values that are wrong such as those that are 0 for x, y, z
#the dimensions can not be 0


df.loc[(df['x']==0) | (df['y']==0) | (df['z']==0)]
Out[8]:
Unnamed: 0 carat cut color clarity depth table price x y z
2207 2208 1.00 Premium G SI2 59.1 59.0 3142 6.55 6.48 0.0
2314 2315 1.01 Premium H I1 58.1 59.0 3167 6.66 6.60 0.0
4791 4792 1.10 Premium G SI2 63.0 59.0 3696 6.50 6.47 0.0
5471 5472 1.01 Premium F SI2 59.2 58.0 3837 6.50 6.47 0.0
10167 10168 1.50 Good G I1 64.0 61.0 4731 7.15 7.04 0.0
11182 11183 1.07 Ideal F SI2 61.6 56.0 4954 0.00 6.62 0.0
11963 11964 1.00 Very Good H VS2 63.3 53.0 5139 0.00 0.00 0.0
13601 13602 1.15 Ideal G VS2 59.2 56.0 5564 6.88 6.83 0.0
15951 15952 1.14 Fair G VS1 57.5 67.0 6381 0.00 0.00 0.0
24394 24395 2.18 Premium H SI2 59.4 61.0 12631 8.49 8.45 0.0
24520 24521 1.56 Ideal G VS2 62.2 54.0 12800 0.00 0.00 0.0
26123 26124 2.25 Premium I SI1 61.3 58.0 15397 8.52 8.42 0.0
26243 26244 1.20 Premium D VVS1 62.1 59.0 15686 0.00 0.00 0.0
27112 27113 2.20 Premium H SI1 61.2 59.0 17265 8.42 8.37 0.0
27429 27430 2.25 Premium H SI2 62.8 59.0 18034 0.00 0.00 0.0
27503 27504 2.02 Premium H VS2 62.7 53.0 18207 8.02 7.95 0.0
27739 27740 2.80 Good G SI2 63.8 58.0 18788 8.90 8.85 0.0
49556 49557 0.71 Good F SI2 64.1 60.0 2130 0.00 0.00 0.0
49557 49558 0.71 Good F SI2 64.1 60.0 2130 0.00 0.00 0.0
51506 51507 1.12 Premium G I1 60.4 59.0 2383 6.71 6.67 0.0
In [9]:
len(df[(df['x']==0) | (df['y']==0) | (df['z']==0)])
# there are 20 recorst that has 0 in their dimensions
# we will need to drop this values
Out[9]:
20
In [10]:
# dropping the records
df = df[(df[['x','y','z']] != 0).all(axis=1)]
In [11]:
# Lets start with the Exploratory data analysis
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['price'].dropna(), bins = 100);
plt.xlabel('price'); plt.ylabel('Number of diamonds'); 
plt.title('Diamonds price Distribution');
# we can see that very few diamonds are of very high value
In [50]:
#lets analysis how the carats are distributed
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['carat'].dropna(), bins = 100);
plt.xlabel('carat'); plt.ylabel('Number of diamonds'); 
plt.title('Diamonds carat Distribution');
# we can see that very few diamonds are of very high carat
In [64]:
#lets analyze how prices varies with the carat and lets take a look also to their distributions
g=sns.JointGrid(x='carat',y='price',data=diamonds,size=10)
g=g.plot(sns.regplot,sns.distplot);
# we can see that very few diamonds have big carats, and that those are piced higher
# Nevertheless the relation is not linear. High carat diamonds are extremenly rare, and thus
# they receive very high prices.
In [12]:
# lets analysize how the prices varies with the clarity
# Create a list of clarity types
types = diamonds.dropna(subset=['clarity'])

types = types['clarity'].value_counts()

types = list(types[types.index ].index)

# Plot of distribution of prices for clarity categories
plt.figure(figsize=(12,10))

# Plot each clarity
for b_type in types:
    subset = diamonds[diamonds['clarity'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
            label = b_type, shade = False, alpha = 0.8);
    
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of diamonds by Clarity Type', size = 28);
#we can see that those diamonds whose clarity is rare, have greater value in the market
In [13]:
# Create a list of colors types
types = diamonds.dropna(subset=['color'])

types = types['color'].value_counts()

types = list(types[types.index ].index)

# Plot of distribution of prices for color categories

plt.figure(figsize=(12,10))
# Plot each color
for b_type in types:
    subset = diamonds[diamonds['color'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
            label = b_type, shade = False, alpha = 0.8);
    
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of diamonds by Color Type', size = 28);
#we can see that those diamonds whose color is rare have higher value in the market
In [14]:
# Create a list of cut types
types = diamonds.dropna(subset=['cut'])

types = types['cut'].value_counts()

types = list(types[types.index ].index)

# Plot of distribution of prices for cut categories

plt.figure(figsize=(12,10))
# Plot each cut
for b_type in types:
    subset = diamonds[diamonds['cut'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
            label = b_type, shade = False, alpha = 0.8);
    
# label the plot

plt.xlabel('price', size = 20)
plt.ylabel('Density', size = 20) 
plt.title('Density Plot of diamonds by Cut Type', size = 28);
#we can see that the cut in the diamond is something that may depend on other parameters as some fair cuts
#reach higher prices despite having air cut instead of premium or ideal
In [15]:
#let´s see how are the distributions of price by color
plt.figure(figsize=(12,10))
sns.boxplot(x='color',y='price',data=diamonds)
plt.xlabel('color', size = 20)
plt.ylabel('price', size = 20) 
plt.title('Price by color Type', size = 28);
# clearly I and J colores are priced more highly
In [16]:
#let´s see how are the distributions of price by cut
plt.figure(figsize=(12,10))
sns.boxplot(x='cut',y='price',data=diamonds)
plt.xlabel('cut', size = 20)
plt.ylabel('price', size = 20) 
plt.title('Price by cut Type', size = 28);
# we can see that the cut has less influence in the price.
In [17]:
#let´s see how are the distributions of price by clarity
plt.figure(figsize=(12,10))
sns.boxplot(x='clarity',y='price',data=diamonds)
plt.xlabel('clarity', size = 20)
plt.ylabel('price', size = 20) 
plt.title('Price by clarity Type', size = 28);
# we can see that the clarity has a big influence in the price.
In [18]:
# let's analyze how the variables are correlated with the price and with each other
plt.figure(figsize=(12,10))
sns.heatmap(diamonds.corr(),cmap='coolwarm',annot=True);
plt.title('Correlations', size = 28);
# in order of importance we have: carat and then the dimensions x, y,z
# we can see that the dimensions are very correlated with each other and with thee carat
# to avoid colinearity we will use only carat in our models
# we should use one hot encoding to incorporate categorical featurees into this chart
In [19]:
correlations_data = diamonds.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data)
Out[19]:
price
price 1.000000
carat 0.921591
x 0.884435
y 0.865421
z 0.861249
table 0.127134
depth -0.010647
In [20]:
# lets do one hot encoding for the categorical variable and
# lets run again the correlation matrix
# One hot encode
# lets split into categorical and not categorical
categorical_subset = diamonds[['color', 'cut','clarity']]
numeric_subset=diamonds[['carat','depth','table','price','x','y','z']]
categorical_subset = pd.get_dummies(categorical_subset)
# lets join the categorical and non categorical

# Make sure to use axis = 1 to perform a column bind
diamonds_encoded = pd.concat([numeric_subset, categorical_subset], axis = 1)
In [21]:
plt.figure(figsize=(12,10))
sns.heatmap(diamonds_encoded.corr(),cmap='coolwarm',annot=False);
plt.title('Correlations', size = 28);
#we can see that color I , clarity SI2 and premium cut have some influence in the price
In [22]:
#top 10 variables y correlation with the price
correlations_data2 = diamonds_encoded.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data2).head(10)
Out[22]:
price
price 1.000000
carat 0.921591
x 0.884435
y 0.865421
z 0.861249
clarity_SI2 0.128420
table 0.127134
color_I 0.097125
cut_Premium 0.095706
color_J 0.081710
In [23]:
#Now lets explore the relationships between two variables
#Relationship between Price and carat and clarity
sns.lmplot(x='carat',y='price',data=diamonds,hue='clarity',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat', size = 28);
# we can see that thee relationship between Price and Carat is not linear.
# Diamonds with high Carat are extremely rare, ans so its pricees is higher in the market.
# There is also a big influence of the clarity in the price, being SI2 the more expensive one.
In [24]:
#Now lets explore the relationships between two variables
#Relationship between Price and carat and colort
sns.lmplot(x='carat',y='price',data=diamonds,hue='color',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat and color', size = 28);
# Here again we see the strong dependencey of price when it comes to Carat. Colo also has a big influence
# Color D and E are the most appreciated colors in diamonds.
In [25]:
# lets draw now some pair plots to see the relationships between correlations and distributions 
# between other important variables
# already have seen that the dimmension x,y,z are strongly correlated between them.
# they are also strongly correlated with the carat, because the carat is the weight and 
# the three dimensions multiplied by each other could give us an approximation to the the volumne
# Lest know study the shape of the diamond. The shape is critical because a diamond should capture
# the light and should increase the bright when the light impacts in this facets.
# Then two characteristics are critical, the table, that is the upper surface that allows thee light to
# come into the diamond, and the depth of the diamond it self.
# Extract the columns to  plot
pairplot_data = diamonds[['price','carat','depth','table']]

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,size = 20)
    

g=sns.PairGrid(pairplot_data,size = 3)
g.map_diag(plt.hist,color = 'red')
g.map_upper(plt.scatter,alpha = 0.6,color = 'red')
g.map_lower(sns.kdeplot,cmap = plt.cm.Reds)
g.map_lower(corr_func);
In [26]:
# we can see that the carat, that is the weight is higly correlated with the price....the higher the weight
# the higher the price.
# we can also see that the depth is inversely correlated with the price. The bigger the depth, the lower the
# bright of a diamond and thus the lower the price.
# when it comes to the table, that is the horizontal suface of the diamond, it seems to follow a normal distribution
# with very low tails. this parameter seems to be almost fix. The surface should be enough big to let the ligth in
# and increase the reflection of the light.
In [27]:
# I am going to do now some feature ingenieering.
# as we have seen, x,y and z are the dimensions and are highly correlated.
# as the carat represent the weight, it would be good to create a new variable for the volume
# and drop then the three dimensions to avoid colinearity
diamonds_encoded['volume']=diamonds_encoded['x']*diamonds_encoded['y']*diamonds_encoded['z']
diamonds_encoded.drop(['x','y','z'], axis=1, inplace= True)
In [28]:
diamonds_encoded.head(5)
Out[28]:
carat depth table price color_D color_E color_F color_G color_H color_I cut_Very Good clarity_I1 clarity_IF clarity_SI1 clarity_SI2 clarity_VS1 clarity_VS2 clarity_VVS1 clarity_VVS2 volume
0 0.23 61.5 55.0 326 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 38.202030
1 0.21 59.8 61.0 326 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 34.505856
2 0.23 56.9 65.0 327 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 38.076885
3 0.29 62.4 58.0 334 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 46.724580
4 0.31 63.3 58.0 335 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 51.917250

5 rows Ă— 25 columns

In [29]:
#TRAINING DIFFERENT MODELS AND COMPARING THEM
#before training the models we need to divide the data set into train and test
#Dividing the data into training and test
#I will use 75% of data for training and 25% for testing

y = diamonds_encoded['price']
X = diamonds_encoded.drop(['price'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        test_size = 0.25, 
                                        random_state = 33)
In [69]:
# before applying the different models, we need to create a baseline
# baselines are used to to compare the different models, and our models
# should beat this thresshold to be accepted
# For regression problems, a reasonable naive baseline is to guess the 
# median value of the target on the training set for all the examples in the test set.
# I will use the MAE mean absolute error as a metric
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

baseline_guess = np.median(y_train)

print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
The baseline guess is a score of 2409.00
Baseline Performance on the test set: MAE = 2817.5057
In [30]:
# As we don´t have nulls we will skip the stept of imputation
# if we had had nulls i would have used imputer function from scikit learn

# Applying Feature Scaling ( StandardScaler )
# some algorithms such as k neightbours and support vector machines depend on the distant between features
# as we have features in different scales : prices in the thousends and dimensions in the tenhts, we 
# need to apply feeature scaling to later on compare the accuracy of different algorithms
# We will apply the standardization method, in wich we will turn every feature into a standard distribution
# with average 0 and standard deviation of 1 (substracting the average and dividing by the standard deviation)
# You can also Apply MinMaxScaler.(normalization)
# MinMaxScaler put every feature into the range 0 to 1 by substracting the minimum value and dividing
# by the range (maximum minus minimum)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# are readdy to go now and imprement the algorithms
In [44]:
#I will use the MAE (mean absolute error) as a metric for the regresions and also to compare different algoritmns
#I will use R2 to compare all the models
# Collect all R2 Scores.
R2_Scores = []
models = ['Linear Regression' , 'Lasso Regression' , 'AdaBoost Regression' , 'Ridge Regression' , 'GradientBoosting Regression',
          'RandomForest Regression' ,
         'KNeighbours Regression']
MAE_Scores=[]
In [32]:
#LINEAR REGRESSION
#https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
#https://scikit-learn.org/stable/modules/linear_model.html#lasso
clf_lr = LinearRegression()
clf_lr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_lr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_lr.predict(X_test)
print('')
print('####### Linear Regression #######')
print('Score : %.4f' % clf_lr.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)

R2_Scores.append(r2)
####### Linear Regression #######
Score : 0.9136
[0.91463745 0.91552206 0.91704936 0.91569356 0.91952022]

MSE    : 1384444.53 
MAE    : 809.99 
RMSE   : 1176.62 
R2     : 0.91 
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
In [83]:
#we will take a look to the residuals

Residuals=y_test-y_pred
Residuals.to_frame(name='Residuals').hist(bins=100,figsize=(12,10));
#they seems normally distributed although the positive tails seems bigger 
#the model seems to understimate the price for some diamonds
In [33]:
#LASSO REGRESSION
clf_la = Lasso(normalize=True)
clf_la.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_la, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_la.predict(X_test)
print('')
print('###### Lasso Regression ######')
print('Score : %.4f' % clf_la.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)

R2_Scores.append(r2)
###### Lasso Regression ######
Score : 0.8745
[0.87963496 0.88305402 0.88215532 0.87841246 0.88846729]

MSE    : 2010375.16 
MAE    : 874.41 
RMSE   : 1417.88 
R2     : 0.87 
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished
In [34]:
#ADABOSST REGRESSION
clf_ar = AdaBoostRegressor(n_estimators=1000)
clf_ar.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_ar, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_ar.predict(X_test)
print('')
print('###### AdaBoost Regression ######')
print('Score : %.4f' % clf_ar.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)

R2_Scores.append(r2)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished
###### AdaBoost Regression ######
Score : 0.7448
[0.73252728 0.81310399 0.66717286 0.869231   0.89731401]

MSE    : 4087102.78 
MAE    : 1856.52 
RMSE   : 2021.66 
R2     : 0.74 
In [35]:
#RIDGE REGRESSION
clf_rr = Ridge(normalize=True)
clf_rr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rr.predict(X_test)
print('')
print('###### Ridge Regression ######')
print('Score : %.4f' % clf_rr.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)

R2_Scores.append(r2)
###### Ridge Regression ######
Score : 0.7619
[0.75833744 0.76383095 0.75960351 0.74872032 0.76777029]

MSE    : 3812731.32 
MAE    : 1311.25 
RMSE   : 1952.62 
R2     : 0.76 
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
In [36]:
#GRADIENT BOOSTING REGRESSION


clf_gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls',verbose = 1)
clf_gbr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_gbr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_gbr.predict(X_test)
print('')
print('###### Gradient Boosting Regression #######')
print('Score : %.4f' % clf_gbr.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)

R2_Scores.append(r2)
      Iter       Train Loss   Remaining Time 
         1    14048565.3433            1.58s
         2    12467409.0047            2.30s
         3    11135080.8887            2.04s
         4     9974751.2920            1.90s
         5     9000021.0479            1.79s
         6     8134410.4248            1.96s
         7     7400614.5739            1.87s
         8     6763121.8767            1.81s
         9     6217070.7743            1.74s
        10     5739758.0484            1.69s
        20     3206334.9421            1.50s
        30     2391985.6387            1.31s
        40     2105174.2630            1.12s
        50     1944563.2067            0.92s
        60     1825293.1551            0.74s
        70     1734634.7264            0.55s
        80     1660592.4716            0.37s
        90     1597432.3516            0.18s
       100     1543363.5456            0.00s
      Iter       Train Loss   Remaining Time 
         1    14075006.4522            1.49s
         2    12479107.9896            1.52s
         3    11145742.5466            1.49s
         4     9979299.9129            1.87s
         5     8999983.5431            1.77s
         6     8132242.8850            1.96s
         7     7396885.7364            1.86s
         8     6755596.8710            1.79s
         9     6210609.6502            1.73s
        10     5731322.1660            1.68s
        20     3185250.0349            1.56s
        30     2366942.0596            1.27s
        40     2082200.1276            1.08s
        50     1924837.2192            0.89s
        60     1806955.7145            0.74s
        70     1717395.1325            0.54s
        80     1644482.2326            0.36s
        90     1581789.7524            0.18s
       100     1528036.0007            0.00s
      Iter       Train Loss   Remaining Time 
         1    14230495.0350            3.07s
         2    12620946.2204            2.30s
         3    11270738.8662            2.00s
         4    10091686.9900            1.87s
         5     9098482.4790            1.77s
         6     8224023.2201            1.96s
         7     7479014.6783            1.86s
         8     6832239.3316            1.98s
         9     6278738.9065            1.89s
        10     5796648.3636            1.83s
        20     3228342.0926            1.62s
        30     2404161.1508            1.31s
        40     2114488.2392            1.10s
        50     1954221.2864            0.91s
        60     1835142.3150            0.72s
        70     1744522.8005            0.54s
        80     1670547.4400            0.36s
        90     1607495.1175            0.18s
       100     1553134.4529            0.00s
      Iter       Train Loss   Remaining Time 
         1    14135921.0920            1.58s
         2    12540386.3801            1.57s
         3    11202436.2001            1.52s
         4    10029852.5557            1.90s
         5     9049081.2320            1.79s
         6     8178504.5493            1.72s
         7     7439129.4932            1.87s
         8     6794491.5529            1.81s
         9     6244974.1839            1.74s
        10     5763670.5234            1.69s
        20     3207364.8428            1.63s
        30     2389438.9183            1.28s
        40     2103751.9658            1.10s
        50     1944317.1668            0.92s
        60     1826021.2942            0.73s
        70     1736449.6384            0.54s
        80     1662607.6293            0.37s
        90     1599947.8284            0.18s
       100     1546341.8022            0.00s
      Iter       Train Loss   Remaining Time 
         1    13894567.8494            3.07s
         2    12336212.6597            2.30s
         3    11017777.6642            2.00s
         4     9871054.6765            1.87s
         5     8905481.1781            1.79s
         6     8055406.9888            1.96s
         7     7330116.0897            1.86s
         8     6698545.1775            1.79s
         9     6158650.1191            1.74s
        10     5686963.7273            1.68s
        20     3184478.8888            1.50s
        30     2378858.9046            1.28s
        40     2092982.3902            1.10s
        50     1931928.4268            0.94s
        60     1811045.9846            0.75s
        70     1719827.7597            0.56s
        80     1646197.6886            0.37s
        90     1583705.9729            0.18s
       100     1529994.0731            0.00s
      Iter       Train Loss   Remaining Time 
         1    13903033.4852            1.58s
         2    12358508.2447            1.52s
         3    11033405.8706            1.52s
         4     9897712.2552            1.51s
         5     8923070.7435            1.48s
         6     8077099.0148            1.47s
         7     7360649.9612            1.46s
         8     6720724.4701            1.62s
         9     6188058.9126            1.58s
        10     5710128.8988            1.69s
        20     3208944.2749            1.50s
        30     2406516.3637            1.24s
        40     2121865.9672            1.08s
        50     1960654.2543            0.94s
        60     1840485.7799            0.74s
        70     1749661.6479            0.55s
        80     1674684.0446            0.36s
        90     1610695.6707            0.18s
       100     1555994.2019            0.00s

###### Gradient Boosting Regression #######
Score : 0.8993
[0.89723396 0.90007127 0.89977537 0.90184883 0.90801916]

MSE    : 1612663.29 
MAE    : 730.46 
RMSE   : 1269.91 
R2     : 0.90 
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.5s finished
In [37]:
#RANDOM FOREST
clf_rf = RandomForestRegressor()
clf_rf.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rf.predict(X_test)
print('')
print('###### Random Forest ######')
print('Score : %.4f' % clf_rf.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)
###### Random Forest ######
Score : 0.9785
[0.97880452 0.97661042 0.98007484 0.97908359 0.97938782]

MSE    : 344717.00 
MAE    : 298.15 
RMSE   : 587.13 
R2     : 0.98 
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.7s finished
In [38]:
#TUNNING PARAMETERS
no_of_test=[100]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='r2')
clf_rf.fit(X_train,y_train)
print('Score : %.4f' % clf_rf.score(X_test, y_test))
pred=clf_rf.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2     : %0.2f ' % r2)
R2_Scores.append(r2)
Score : 0.9800
R2     : 0.98 
In [39]:
#KNEIGHBOURS REGRESSION
clf_knn = KNeighborsRegressor()
clf_knn.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_knn, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_knn.predict(X_test)
print('')
print('###### KNeighbours Regression ######')
print('Score : %.4f' % clf_knn.score(X_test, y_test))
print(accuracies)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)

print('')
print('MSE    : %0.2f ' % mse)
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.2f ' % r2)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.4s finished
###### KNeighbours Regression ######
Score : 0.9399
[0.94162878 0.93982351 0.93666624 0.93363019 0.93802947]

MSE    : 962671.04 
MAE    : 494.66 
RMSE   : 981.16 
R2     : 0.94 
In [40]:
#TUNNING PARAMETERS


n_neighbors=[]
for i in range (0,50,5):
    if(i!=0):
        n_neighbors.append(i)
params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]}
clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='r2')
clf_knn.fit(X_train,y_train)
print('Score : %.4f' % clf_knn.score(X_test, y_test))
pred=clf_knn.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2     : %0.2f ' % r2)
R2_Scores.append(r2)
Score : 0.9399
R2     : 0.94 
In [41]:
#VISUALIZING R2 SCORES


compare = pd.DataFrame({'Algorithms' : models , 'R2-Scores' : R2_Scores})
compare.sort_values(by='R2-Scores' ,ascending=False)
Out[41]:
Algorithms R2-Scores
5 RandomForest Regression 0.980033
6 KNeighbours Regression 0.939890
0 Linear Regression 0.913554
4 GradientBoosting Regression 0.899303
1 Lasso Regression 0.874470
3 Ridge Regression 0.761928
2 AdaBoost Regression 0.744796
In [92]:
sns.barplot(x='R2-Scores' , y='Algorithms' , data=compare);
plt.title('Models comparison', size = 28);
plt.figure(figsize=(12,10));
<matplotlib.figure.Figure at 0x11da6e48>
In [43]:
sns.factorplot(x='Algorithms', y='R2-Scores' , data=compare, size=6 , aspect=4);
plt.title('Price Vs Carat and color', size = 28);
In [94]:
jupyter nbconvert --to html Diamonds Price.ipynb  
  File "<ipython-input-94-f5e42659c20e>", line 1
    jupyter nbconvert --to html Diamonds Price.ipynb
                    ^
SyntaxError: invalid syntax