In this first post I will analyze the classic diamonds dataset.
The purpose of this analysis is to predict the price of diamonds based on their characteristics.
To do so, I will carry out exploratory data analysis, feature engineering and feature standardization, and I will test several regression models: linear regression, decision trees, random forests, etc.
In [1]:
#DIAMONDS PRICE
#Import libraries
import numpy as np
import pandas as pd
from math import *
import sys
!{sys.executable} -m pip install missingno
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Visualisation Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pylab as pylab
import missingno as msno
plt.style.use( 'ggplot' )
%matplotlib inline
#Import preprocessing libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer, LabelEncoder, Normalizer, scale
#Import model selection and feature selection libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score
# Regression libraries
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
#Import classification libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
#Import evaluation metrics
# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score, mean_absolute_error
# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
In [2]:
#Importing the data set
df = pd.read_csv(r'C:\Users\Hector\datasetsblog\diamonds.csv')
diamonds = df.copy()
In [3]:
#we drop a column that is used as index
diamonds.drop(['Unnamed: 0'],inplace=True, axis=1)
#Exploring the data set
diamonds.head(5)
Out[3]:
In [4]:
diamonds.shape
Out[4]:
In [5]:
diamonds.info()
#there aren't any NaN in the data set
df.isnull().sum()
Out[5]:
In [6]:
#checking for nulls
# percentage of nan
# number of nan
msno.matrix(diamonds);
msno.bar(diamonds);
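The comments above mention the number and percentage of NaN values, but the cell only draws the missingno charts. Here is a minimal sketch of how those figures could be computed, assuming the same diamonds DataFrame; it is an illustrative addition, not part of the original analysis.
# sketch: number and percentage of NaN values per column
nan_counts = diamonds.isnull().sum()
nan_pct = (diamonds.isnull().sum() / len(diamonds) * 100).round(2)
pd.DataFrame({'n_missing': nan_counts, 'pct_missing': nan_pct})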
In [7]:
diamonds.describe()
Out[7]:
In [8]:
#there are some values that are wrong, such as those that are 0 for x, y, z
#the dimensions cannot be 0
df.loc[(df['x']==0) | (df['y']==0) | (df['z']==0)]
Out[8]:
In [9]:
len(df[(df['x']==0) | (df['y']==0) | (df['z']==0)])
# there are 20 records that have 0 in their dimensions
# we will need to drop these values
Out[9]:
In [10]:
# dropping the records with zero dimensions from both data frames
df = df[(df[['x','y','z']] != 0).all(axis=1)]
diamonds = diamonds[(diamonds[['x','y','z']] != 0).all(axis=1)]
In [11]:
# Lets start with the Exploratory data analysis
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['price'].dropna(), bins = 100);
plt.xlabel('price'); plt.ylabel('Number of diamonds');
plt.title('Diamonds price Distribution');
# we can see that very few diamonds are of very high value
In [50]:
#let's analyze how carat is distributed
plt.style.use('ggplot')
plt.figure(figsize=(12,10))
plt.hist(diamonds['carat'].dropna(), bins = 100);
plt.xlabel('carat'); plt.ylabel('Number of diamonds');
plt.title('Diamonds carat Distribution');
# we can see that very few diamonds have a very high carat
In [64]:
#let's analyze how price varies with carat, and let's also take a look at their distributions
g=sns.JointGrid(x='carat',y='price',data=diamonds,size=10)
g=g.plot(sns.regplot,sns.distplot);
# we can see that very few diamonds have big carats, and that those are priced higher
# Nevertheless the relation is not linear. High-carat diamonds are extremely rare, and thus
# they receive very high prices. A quick log-log check follows below.
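To illustrate the claim that the relation is not linear, one quick check (an illustrative sketch, not part of the original notebook) is to plot price against carat with both axes on a logarithmic scale; an approximately straight cloud there points to a power-law rather than a linear relation.
# sketch: price vs carat on a log-log scale
plt.figure(figsize=(12,10))
plt.scatter(diamonds['carat'], diamonds['price'], s=5, alpha=0.3)
plt.xscale('log'); plt.yscale('log')
plt.xlabel('carat (log scale)'); plt.ylabel('price (log scale)')
plt.title('Price vs carat, log-log scale');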
In [12]:
# let's analyze how the price varies with the clarity
# Create a list of clarity types
types = diamonds.dropna(subset=['clarity'])
types = types['clarity'].value_counts()
types = list(types.index)
# Plot of distribution of prices for clarity categories
plt.figure(figsize=(12,10))
# Plot each clarity
for b_type in types:
    subset = diamonds[diamonds['clarity'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20);
plt.title('Density Plot of diamonds by Clarity Type', size = 28);
#we can see that diamonds whose clarity is rarer have greater value in the market
In [13]:
# Create a list of colors types
types = diamonds.dropna(subset=['color'])
types = types['color'].value_counts()
types = list(types.index)
# Plot of distribution of prices for color categories
plt.figure(figsize=(12,10))
# Plot each color
for b_type in types:
    subset = diamonds[diamonds['color'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20); plt.ylabel('Density', size = 20);
plt.title('Density Plot of diamonds by Color Type', size = 28);
#we can see that those diamonds whose color is rare have higher value in the market
In [14]:
# Create a list of cut types
types = diamonds.dropna(subset=['cut'])
types = types['cut'].value_counts()
types = list(types.index)
# Plot of distribution of prices for cut categories
plt.figure(figsize=(12,10))
# Plot each cut
for b_type in types:
    subset = diamonds[diamonds['cut'] == b_type]
    sns.kdeplot(subset['price'].dropna(),
                label = b_type, shade = False, alpha = 0.8);
# label the plot
plt.xlabel('price', size = 20)
plt.ylabel('Density', size = 20)
plt.title('Density Plot of diamonds by Cut Type', size = 28);
#we can see that the effect of cut on the price may depend on other parameters, as some fair-cut diamonds
#reach higher prices despite having a fair cut instead of premium or ideal
In [15]:
#let's see how the price is distributed by color
plt.figure(figsize=(12,10))
sns.boxplot(x='color',y='price',data=diamonds)
plt.xlabel('color', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by color Type', size = 28);
# clearly I and J colors are priced more highly
In [16]:
#let's see how the price is distributed by cut
plt.figure(figsize=(12,10))
sns.boxplot(x='cut',y='price',data=diamonds)
plt.xlabel('cut', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by cut Type', size = 28);
# we can see that the cut has less influence on the price.
In [17]:
#let's see how the price is distributed by clarity
plt.figure(figsize=(12,10))
sns.boxplot(x='clarity',y='price',data=diamonds)
plt.xlabel('clarity', size = 20)
plt.ylabel('price', size = 20)
plt.title('Price by clarity Type', size = 28);
# we can see that the clarity has a big influence on the price.
In [18]:
# let's analyze how the variables are correlated with the price and with each other
plt.figure(figsize=(12,10))
sns.heatmap(diamonds.corr(),cmap='coolwarm',annot=True);
plt.title('Correlations', size = 28);
# in order of importance we have: carat and then the dimensions x, y, z
# we can see that the dimensions are very correlated with each other and with the carat
# to avoid collinearity we will later combine x, y and z into a single volume feature
# we should use one-hot encoding to incorporate categorical features into this chart
In [19]:
correlations_data = diamonds.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data)
Out[19]:
In [20]:
# let's do one-hot encoding for the categorical variables and
# rerun the correlation matrix
# One-hot encode
# let's split into categorical and non-categorical columns
categorical_subset = diamonds[['color', 'cut','clarity']]
numeric_subset=diamonds[['carat','depth','table','price','x','y','z']]
categorical_subset = pd.get_dummies(categorical_subset)
# let's join the categorical and non-categorical subsets
# Make sure to use axis = 1 to perform a column bind
diamonds_encoded = pd.concat([numeric_subset, categorical_subset], axis = 1)
In [21]:
plt.figure(figsize=(12,10))
sns.heatmap(diamonds_encoded.corr(),cmap='coolwarm',annot=False);
plt.title('Correlations', size = 28);
#we can see that color I, clarity SI2 and premium cut have some influence on the price
In [22]:
#top 10 variables by correlation with the price
correlations_data2 = diamonds_encoded.corr()['price'].sort_values(ascending=False)
pd.DataFrame(correlations_data2).head(10)
Out[22]:
In [23]:
#Now let's explore the relationships between pairs of variables
#Relationship between price, carat and clarity
sns.lmplot(x='carat',y='price',data=diamonds,hue='clarity',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat', size = 28);
# we can see that the relationship between price and carat is not linear.
# Diamonds with high carat are extremely rare, and so their price is higher in the market.
# There is also a big influence of the clarity on the price, with SI2 being the most expensive one.
In [24]:
#Now let's explore the relationships between pairs of variables
#Relationship between price, carat and color
sns.lmplot(x='carat',y='price',data=diamonds,hue='color',fit_reg=False, size=10, scatter_kws={'s':10} );
plt.title('Price Vs Carat and color', size = 28);
# Here again we see the strong dependence of price on carat. Color also has a big influence.
# Colors D and E are the most appreciated colors in diamonds.
In [25]:
# let's now draw some pair plots to see the correlations and distributions
# between other important variables
# we have already seen that the dimensions x, y, z are strongly correlated with each other.
# they are also strongly correlated with the carat, because the carat is the weight and
# the three dimensions multiplied by each other give an approximation of the volume
# Let's now study the shape of the diamond. The shape is critical because a diamond should capture
# the light and increase its brilliance when the light hits its facets.
# Two characteristics are therefore critical: the table, which is the upper surface that lets the light
# into the diamond, and the depth of the diamond itself.
# Extract the columns to plot
pairplot_data = diamonds[['price','carat','depth','table']]
# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes, size = 20)
g=sns.PairGrid(pairplot_data,size = 3)
g.map_diag(plt.hist,color = 'red')
g.map_upper(plt.scatter,alpha = 0.6,color = 'red')
g.map_lower(sns.kdeplot,cmap = plt.cm.Reds)
g.map_lower(corr_func);
In [26]:
# we can see that the carat, which is the weight, is highly correlated with the price: the higher the weight,
# the higher the price.
# we can also see that the depth is inversely correlated with the price. The bigger the depth, the lower the
# brilliance of a diamond and thus the lower the price.
# when it comes to the table, which is the horizontal surface of the diamond, it seems to follow a normal
# distribution with very short tails. This parameter seems to be almost fixed: the surface should be big enough
# to let the light in and increase the reflection of the light.
In [27]:
# I am going to do some feature engineering now.
# as we have seen, x, y and z are the dimensions and are highly correlated.
# as the carat represents the weight, it would be good to create a new variable for the volume
# and then drop the three dimensions to avoid collinearity
diamonds_encoded['volume']=diamonds_encoded['x']*diamonds_encoded['y']*diamonds_encoded['z']
diamonds_encoded.drop(['x','y','z'], axis=1, inplace= True)
In [28]:
diamonds_encoded.head(5)
Out[28]:
In [29]:
#TRAINING DIFFERENT MODELS AND COMPARING THEM
#before training the models we need to divide the data set into train and test
#Dividing the data into training and test
#I will use 75% of data for training and 25% for testing
y = diamonds_encoded['price']
X = diamonds_encoded.drop(['price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = 0.25,
random_state = 33)
In [69]:
# before applying the different models, we need to create a baseline
# baselines are used to compare the different models, and our models
# should beat this threshold to be accepted
# For regression problems, a reasonable naive baseline is to guess the
# median value of the target on the training set for all the examples in the test set.
# I will use the MAE (mean absolute error) as the metric
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))
baseline_guess = np.median(y_train)
print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
In [30]:
# As we don't have nulls we will skip the imputation step
# if we had had nulls I would have used an imputer from scikit-learn
# Applying Feature Scaling (StandardScaler)
# some algorithms such as k-nearest neighbours and support vector machines depend on the distance between features
# as we have features on different scales (prices in the thousands, dimensions in the tens), we
# need to apply feature scaling to later compare the accuracy of the different algorithms
# We will apply the standardization method, in which we turn every feature into a standard distribution
# with mean 0 and standard deviation 1 (subtracting the mean and dividing by the standard deviation)
# You could also apply MinMaxScaler (normalization), which puts every feature into the range 0 to 1
# by subtracting the minimum value and dividing by the range (maximum minus minimum);
# a sketch of that alternative follows below
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# we are now ready to go and implement the algorithms
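As mentioned in the comments above, imputation and MinMaxScaler are alternatives that this dataset did not need. A minimal sketch of both follows, assuming the older scikit-learn API already imported at the top (where Imputer still exists); it would replace the StandardScaler step above rather than run after it.
# sketch: alternative preprocessing (not used in this analysis)
imp = Imputer(strategy='mean')       # fill missing values with the column mean
X_train_alt = imp.fit_transform(X_train)
X_test_alt = imp.transform(X_test)
mms = MinMaxScaler()                 # scale every feature into the [0, 1] range
X_train_alt = mms.fit_transform(X_train_alt)
X_test_alt = mms.transform(X_test_alt)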
In [44]:
#I will use the MAE (mean absolute error) as a metric for the regressions and also to compare different algorithms
#I will use R2 to compare all the models; an optional helper wrapping these metrics is sketched below
# Collect all R2 Scores.
R2_Scores = []
models = ['Linear Regression' , 'Lasso Regression' , 'AdaBoost Regression' , 'Ridge Regression' , 'GradientBoosting Regression',
'RandomForest Regression' ,
'KNeighbours Regression']
MAE_Scores=[]
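Each of the model cells below repeats the same block of metric calls. As an optional sketch (the helper name evaluate_model is my own, not from the original notebook), the pattern can be wrapped once and reused; the cells below keep the original explicit style.
def evaluate_model(name, model, X_test, y_test):
    # predict on the test set and print the usual regression metrics
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print('####### %s #######' % name)
    print('MSE  : %0.2f' % mse)
    print('MAE  : %0.2f' % mean_absolute_error(y_test, y_pred))
    print('RMSE : %0.2f' % (mse ** 0.5))
    print('R2   : %0.2f' % r2)
    return r2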
In [32]:
#LINEAR REGRESSION
#https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
#https://scikit-learn.org/stable/modules/linear_model.html#lasso
clf_lr = LinearRegression()
clf_lr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_lr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_lr.predict(X_test)
print('')
print('####### Linear Regression #######')
print('Score : %.4f' % clf_lr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [83]:
#we will take a look at the residuals
Residuals=y_test-y_pred
Residuals.to_frame(name='Residuals').hist(bins=100,figsize=(12,10));
#they seem normally distributed although the positive tail seems bigger
#the model seems to underestimate the price for some diamonds
In [33]:
#LASSO REGRESSION
clf_la = Lasso(normalize=True)
clf_la.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_la, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_la.predict(X_test)
print('')
print('###### Lasso Regression ######')
print('Score : %.4f' % clf_la.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [34]:
#ADABOOST REGRESSION
clf_ar = AdaBoostRegressor(n_estimators=1000)
clf_ar.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_ar, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_ar.predict(X_test)
print('')
print('###### AdaBoost Regression ######')
print('Score : %.4f' % clf_ar.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [35]:
#RIDGE REGRESSION
clf_rr = Ridge(normalize=True)
clf_rr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rr.predict(X_test)
print('')
print('###### Ridge Regression ######')
print('Score : %.4f' % clf_rr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [36]:
#GRADIENT BOOSTING REGRESSION
clf_gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls',verbose = 1)
clf_gbr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_gbr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_gbr.predict(X_test)
print('')
print('###### Gradient Boosting Regression #######')
print('Score : %.4f' % clf_gbr.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [37]:
#RANDOM FOREST
clf_rf = RandomForestRegressor()
clf_rf.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rf.predict(X_test)
print('')
print('###### Random Forest ######')
print('Score : %.4f' % clf_rf.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
In [38]:
#TUNING PARAMETERS
no_of_test=[100]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='r2')
clf_rf.fit(X_train,y_train)
print('Score : %.4f' % clf_rf.score(X_test, y_test))
pred=clf_rf.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [39]:
#KNEIGHBOURS REGRESSION
clf_knn = KNeighborsRegressor()
clf_knn.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_knn, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_knn.predict(X_test)
print('')
print('###### KNeighbours Regression ######')
print('Score : %.4f' % clf_knn.score(X_test, y_test))
print(accuracies)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
print('')
print('MSE : %0.2f ' % mse)
print('MAE : %0.2f ' % mae)
print('RMSE : %0.2f ' % rmse)
print('R2 : %0.2f ' % r2)
In [40]:
#TUNING PARAMETERS
n_neighbors=[]
for i in range(0, 50, 5):
    if i != 0:
        n_neighbors.append(i)
params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]}
clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='r2')
clf_knn.fit(X_train,y_train)
print('Score : %.4f' % clf_knn.score(X_test, y_test))
pred=clf_knn.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2 : %0.2f ' % r2)
R2_Scores.append(r2)
In [41]:
#VISUALIZING R2 SCORES
compare = pd.DataFrame({'Algorithms' : models , 'R2-Scores' : R2_Scores})
compare.sort_values(by='R2-Scores' ,ascending=False)
Out[41]:
In [92]:
plt.figure(figsize=(12,10))
sns.barplot(x='R2-Scores' , y='Algorithms' , data=compare);
plt.title('Models comparison', size = 28);
In [43]:
sns.factorplot(x='Algorithms', y='R2-Scores' , data=compare, size=6 , aspect=4);
plt.title('Models comparison by R2 score', size = 28);
In [94]:
!jupyter nbconvert --to html "Diamonds Price.ipynb"