DATA301_projects/project_1_eda_and_model_tra...

2636 lines
1.0 MiB
Plaintext
Raw Permalink Normal View History

2022-12-06 19:29:33 -05:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "ZSBjGiLtkL8Z",
"tags": []
},
"source": [
"# Task 1: Define the Problem"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "SRffAEZMkikd"
},
"outputs": [],
"source": [
"# predict the price of a diamond"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SYOM7cJ_kiY1"
},
"source": [
"# Task 2a: Install the Needed Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "ft8kSvTUkiUd"
},
"outputs": [],
"source": [
"# pass"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OZNxHMuSkiON"
},
"source": [
"# Task 2b: Import the Needed Libraries"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "HPsfw4s7kiKk"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn import metrics\n",
"from joblib import dump, load # sklearn's replacement for pickle"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "umeaYS-SkiGU"
},
"source": [
"# Task 3: Load the Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "loYU2Pg3kiCS"
},
"outputs": [],
"source": [
"df = pd.read_csv('./diamonds.csv') # the data is in the current directory"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OTaEVKXAkh-B"
},
"source": [
"# Task 4: Perform Exploratory Data Analysis (EDA)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2FyPB8fMkh6U"
},
"source": [
"## Show the Data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "x9efqRclkh14"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head() # what are x, y and z?"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yOaCGYX5khxz"
},
"source": [
"## Get Data Info"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "YbjJUPR4khtR"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 53940 entries, 0 to 53939\n",
"Data columns (total 10 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 carat 53940 non-null float64\n",
" 1 cut 53940 non-null object \n",
" 2 color 53940 non-null object \n",
" 3 clarity 53940 non-null object \n",
" 4 depth 53940 non-null float64\n",
" 5 table 53940 non-null float64\n",
" 6 price 53940 non-null int64 \n",
" 7 x 53940 non-null float64\n",
" 8 y 53940 non-null float64\n",
" 9 z 53940 non-null float64\n",
"dtypes: float64(6), int64(1), object(3)\n",
"memory usage: 4.1+ MB\n"
]
}
],
"source": [
"df.info() # 'cut', 'color' and 'clarity' are strings, the rest are numbers"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6GZsnSiMkhpo"
},
"source": [
"## Find Missing Values"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"carat 0\n",
"cut 0\n",
"color 0\n",
"clarity 0\n",
"depth 0\n",
"table 0\n",
"price 0\n",
"x 0\n",
"y 0\n",
"z 0\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna().sum() # no missing values"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# let's take a look at the string values"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Ideal 21551\n",
"Premium 13791\n",
"Very Good 12082\n",
"Good 4906\n",
"Fair 1610\n",
"Name: cut, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['cut'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_cut"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"G 11292\n",
"E 9797\n",
"F 9542\n",
"H 8304\n",
"D 6775\n",
"I 5422\n",
"J 2808\n",
"Name: color, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['color'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_color"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SI1 13065\n",
"VS2 12258\n",
"SI2 9194\n",
"VS1 8171\n",
"VVS2 5066\n",
"VVS1 3655\n",
"IF 1790\n",
"I1 741\n",
"Name: clarity, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['clarity'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_clarity"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7f8105d7faf0>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAHpCAYAAABN+X+UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA3DUlEQVR4nO3dfXRU1b3/8c9IHoA0DCQ0GSIRogaKJiKChNAKUZ6vMXpZq9RCU1wXQa8CpsKlUtpL6rJB6SrQC5UiF4EKlN61Kl56ayOhKNTyEAwkkogRawRCE0JtMgEbkhj27w9/nDLJACEkmR3yfq01azH7fOfM3pnoJ2fPPue4jDFGAADASjcFugMAAODyCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqJvJGKPq6mpx2jkAoD0R1M109uxZud1unT17NtBdAQB0IgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsFhQoDsA6fz588rNzfVpGz58uLp27RqgHgEAbEFQWyA3N1cZL7+hnn1vlyRVlX6sFZJGjRoV0H4BAAKPoLZEz763K2rAkEB3AwBgGb6jBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsFPKhPnTql73znO4qMjFT37t119913Ky8vz9lujFFmZqZiYmLUrVs3paSkqKioyGcftbW1mjNnjnr37q2wsDClpaWptLTUp6ayslLp6elyu91yu91KT09XVVVVewwRAIAWC2hQV1ZW6utf/7qCg4P1hz/8QR988IF+9rOfqWfPnk7N0qVLtWzZMq1atUoHDx6Ux+PRuHHjdPbsWacmIyND27Zt09atW/Xuu+/q3LlzSk1NVUNDg1MzdepU5efnKzs7W9nZ2crPz1d6enp7DhcAgGsWFMg3f+mllxQbG6v169c7bf3793f+bYzRihUrtGjRIk2ePFmStHHjRkVHR2vLli164okn5PV6tW7dOr322msaO3asJGnTpk2KjY3Vzp07NWHCBB09elTZ2dnav3+/kpKSJElr165VcnKyiouLNXDgwCZ9q62tVW1trfO8urq6LX4EAABcUUCPqLdv365hw4bpm9/8pqKiojRkyBCtXbvW2V5SUqLy8nKNHz/eaQsNDdXo0aO1d+9eSVJeXp7q6+t9amJiYpSQkODU7Nu3T2632wlpSRoxYoTcbrdT09iSJUucaXK3263Y2NhWHTsAAM0R0KD+5JNPtHr1asXHx+utt97Sk08+qblz5+pXv/qVJKm8vFySFB0d7fO66OhoZ1t5eblCQkLUq1evK9ZERUU1ef+oqCinprGFCxfK6/U6j5MnT17fYAEAaIGATn1fuHBBw4YNU1ZWliRpyJAhKioq0urVq/Xd737XqXO5XD6vM8Y0aWuscY2/+ivtJzQ0VKGhoc0eCwAAbSGgR9R9+vTRHXfc4dM2aNAgnThxQpLk8XgkqclRb0VFhXOU7fF4VFdXp8rKyivWnD59usn7nzlzpsnROgAANgloUH/9619XcXGxT9tHH32kfv36SZLi4uLk8XiUk5PjbK+rq9Pu3bs1cuRISdLQoUMVHBzsU1NWVqbCwkKnJjk5WV6vV7m5uU7NgQMH5PV6nRoAAGwU0Knv733vexo5cqSysrI0ZcoU5ebm6pVXXtErr7wi6cvp6oyMDGVlZSk+Pl7x8fHKyspS9+7dNXXqVEmS2+3WjBkzNG/ePEVGRioiIkLz589XYmKiswp80KBBmjhxombOnKk1a9ZIkmbNmqXU1FS/K74BALBFQIP63nvv1bZt27Rw4UI9//zziouL04oVKzRt2jSnZsGCBaqpqdFTTz2lyspKJSUlaceOHQoPD3dqli9frqCgIE2ZMkU1NTUaM2aMNmzYoC5dujg1mzdv1ty5c53V4WlpaVq1alX7DRYAgBZwGWNMoDvREVRXV8vtdsvr9apHjx6tuu89e/Yoc3uhogYMkSRVfHRYmWkJGjVqVKu+DwCg4wn4JUQBAMDlEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgsaBAdwBNNXxRr4KCgibtw4cPV9euXQPQIwBAoBDUFjpb/qlWl5yX53gXp62q9GOtkDRq1KiA9QsA0P4IakuF97lVUQOGBLobAIAA4ztqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsFtCgzszMlMvl8nl4PB5nuzFGmZmZiomJUbdu3ZSSkqKioiKffdTW1mrOnDnq3bu3wsLClJaWptLSUp+ayspKpaeny+12y+12Kz09XVVVVe0xRAAArkvAj6jvvPNOlZWVOY8jR44425YuXaply5Zp1apVOnjwoDwej8aNG6ezZ886NRkZGdq2bZu2bt2qd999V+fOnVNqaqoaGhqcmqlTpyo/P1/Z2dnKzs5Wfn6+0tPT23WcAAC0RFDAOxAU5HMUfZExRitWrNCiRYs0efJkSdLGjRsVHR2tLVu26IknnpDX69W6dev02muvaezYsZKkTZs2KTY2Vjt37tSECRN09OhRZWdna//+/UpKSpIkrV27VsnJySouLtbAgQP99qu2tla1tbXO8+rq6tYeOgAAVxXwI+pjx44pJiZGcXFxevTRR/XJJ59IkkpKSlReXq7x48c7taGhoRo9erT27t0rScrLy1N9fb1PTUxMjBISEpyaffv2ye12OyEtSSNGjJDb7XZq/FmyZIkzVe52uxUbG9uq4wYAoDkCGtRJSUn61a9+pbfeektr165VeXm5Ro4cqc8++0zl5eWSpOjoaJ/XREdHO9vKy8sVEhKiXr16XbEmKiqqyXtHRUU5Nf4sXLhQXq/XeZw8efK6xgoAQEsEdOp70qRJzr8TExOVnJys2267TRs3btSIESMkSS6Xy+c1xpgmbY01rvFXf7X9hIaGKjQ0tFnjAACgrQR86vtSYWFhSkxM1LFjx5zvrRsf9VZUVDhH2R6PR3V1daqsrLxizenTp5u815kzZ5ocrQMAYBurgrq2tlZHjx5Vnz59FBcXJ4/Ho5ycHGd7XV2ddu/erZEjR0qShg4dquDgYJ+asrIyFRYWOjXJycnyer3Kzc11ag4cOCCv1+vUAABgq4BOfc+fP18PPfSQbrnlFlVUVOiFF15QdXW1pk+fLpfLpYyMDGVlZSk+Pl7x8fHKyspS9+7dNXXqVEmS2+3WjBkzNG/ePEVGRioiIkLz589XYmKiswp80KBBmjhxombOnKk1a9ZIkmbNmqXU1NTLrvgGAMAWAQ3q0tJSffvb39bf/vY3ffWrX9WIESO0f/9+9evXT5K0YMEC1dTU6KmnnlJlZaWSkpK0Y8cOhYeHO/tYvny5goKCNGXKFNXU1GjMmDHasGGDunTp4tRs3rxZc+fOdVaHp6WladWqVe07WAAAWsBljDGB7kRHUF1dLbfbLa/Xqx49erTqvvfs2aPM7YWKGjBEkvTRrv9RcM8Yxd3zDaem4qPDykxL0KhRo1r1vQEAdrPqO2oAAOCLoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlA
"text/plain": [
"<Figure size 500x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# what about price? Let's see the distribution\n",
"\n",
"sns.displot(df['price'], kind = \"hist\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HvFrMzIGmMOl"
},
"source": [
"# Task 5: Perform Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "S6cyPRAAmMLI"
},
"outputs": [],
"source": [
"# convert non-numeric values to numbers\n",
"# each dictionary has quality in descending order"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# 'cut'\n",
"\n",
"dict_cut = {'Ideal': 4, 'Premium': 3, 'Very Good': 2, 'Good': 1, 'Fair': 0}\n",
"df['cut'] = df['cut'].replace(dict_cut)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# 'color'\n",
"\n",
"dict_color = {'D': 6, 'E': 5, 'F': 4, 'G': 3, 'H': 2, 'I': 1, 'J': 0}\n",
"df['color'] = df['cut'].replace(dict_color)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# 'clarity'\n",
"\n",
"dict_clarity = {'IF': 7, 'VVS1': 6, 'VVS2': 5, 'VS1': 4, 'VS2': 3, 'SI1': 2, 'SI2': 1, 'I1': 0}\n",
"df['clarity'] = df['cut'].replace(dict_clarity)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cut color clarity\n",
"0 4 4 4\n",
"1 3 3 3\n",
"2 1 1 1\n",
"3 3 3 3\n",
"4 1 1 1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[['cut', 'color', 'clarity']].head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.21</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.23</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 4 4 4 61.5 55.0 326 3.95 3.98 2.43\n",
"1 0.21 3 3 3 59.8 61.0 326 3.89 3.84 2.31\n",
"2 0.23 1 1 1 56.9 65.0 327 4.05 4.07 2.31\n",
"3 0.29 3 3 3 62.4 58.0 334 4.20 4.23 2.63\n",
"4 0.31 1 1 1 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head() # now the whole dataset is numeric"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# drop x, y, z\n",
"\n",
"df = df.drop(columns = ['x', 'y', 'z'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.21</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.23</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price\n",
"0 0.23 4 4 4 61.5 55.0 326\n",
"1 0.21 3 3 3 59.8 61.0 326\n",
"2 0.23 1 1 1 56.9 65.0 327\n",
"3 0.29 3 3 3 62.4 58.0 334\n",
"4 0.31 1 1 1 63.3 58.0 335"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" <td>53940.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.80</td>\n",
" <td>2.90</td>\n",
" <td>2.90</td>\n",
" <td>2.90</td>\n",
" <td>61.75</td>\n",
" <td>57.46</td>\n",
" <td>3932.80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.47</td>\n",
" <td>1.12</td>\n",
" <td>1.12</td>\n",
" <td>1.12</td>\n",
" <td>1.43</td>\n",
" <td>2.23</td>\n",
" <td>3989.44</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.20</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>43.00</td>\n",
" <td>43.00</td>\n",
" <td>326.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.40</td>\n",
" <td>2.00</td>\n",
" <td>2.00</td>\n",
" <td>2.00</td>\n",
" <td>61.00</td>\n",
" <td>56.00</td>\n",
" <td>950.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.70</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>61.80</td>\n",
" <td>57.00</td>\n",
" <td>2401.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.04</td>\n",
" <td>4.00</td>\n",
" <td>4.00</td>\n",
" <td>4.00</td>\n",
" <td>62.50</td>\n",
" <td>59.00</td>\n",
" <td>5324.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>5.01</td>\n",
" <td>4.00</td>\n",
" <td>4.00</td>\n",
" <td>4.00</td>\n",
" <td>79.00</td>\n",
" <td>95.00</td>\n",
" <td>18823.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price\n",
"count 53940.00 53940.00 53940.00 53940.00 53940.00 53940.00 53940.00\n",
"mean 0.80 2.90 2.90 2.90 61.75 57.46 3932.80\n",
"std 0.47 1.12 1.12 1.12 1.43 2.23 3989.44\n",
"min 0.20 0.00 0.00 0.00 43.00 43.00 326.00\n",
"25% 0.40 2.00 2.00 2.00 61.00 56.00 950.00\n",
"50% 0.70 3.00 3.00 3.00 61.80 57.00 2401.00\n",
"75% 1.04 4.00 4.00 4.00 62.50 59.00 5324.25\n",
"max 5.01 4.00 4.00 4.00 79.00 95.00 18823.00"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe().round(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Look for outliers"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# data frame of features only\n",
"df2 = df.drop(['price'], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[<AxesSubplot:title={'center':'carat'}>,\n",
" <AxesSubplot:title={'center':'cut'}>],\n",
" [<AxesSubplot:title={'center':'color'}>,\n",
" <AxesSubplot:title={'center':'clarity'}>],\n",
" [<AxesSubplot:title={'center':'depth'}>,\n",
" <AxesSubplot:title={'center':'table'}>]], dtype=object)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABlgAAATDCAYAAAAazjjkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAADrF0lEQVR4nOz9fZhV9X0v/L9HGEYgMOEhMMwRlbSEarCJByOiacQAg1akOTYlCT0TTSyaWyPhRo6J8eRkTCM0aNS70BhjvdSIhvx6WxKrljCeKB4u8ImEJBhrk/vgU8OIieOASIcJ7t8fln0cB5SlzGxgXq/rmkv22p+913d91nfG+c57r72rSqVSKQAAAAAAAOyzwyo9AAAAAAAAgIONgAUAAAAAAKAgAQsAAAAAAEBBAhYAAAAAAICCBCwAAAAAAAAFCVgAAAAAAAAKErAAAAAAAAAUJGABAAAAAAAoSMACAAAAAABQkIAFgIpZuHBhfvCDH1R6GAAAAL3Ot771rdxyyy2VHgbAQa2qVCqVKj0IAHqnd73rXfn4xz/ul3oAAIAeNn78+AwfPjwPPPBApYcCcNByBQsA+8WuXbvS3t5e6WEAAAAAQI8QsAD0Mv/yL/+ST33qUxk5cmRqampy5JFH5tOf/nTa29vzwgsv5MILL8yxxx6bd73rXRkxYkQ++tGP5n/9r//V6TmeeuqpVFVVZfHixfn617+eMWPGpKamJvfff3/+/d//PZdcckk++MEPpra2NkOHDs2kSZPywx/+sNNzVFVVZfv27bn11ltTVVWVqqqqTJ48uQc7AQAAcPB5szVdU1NTqqqqujzmlltuSVVVVZ566qkkydFHH53HH388q1evLq/Hjj766J49EIBDQN9KDwCAnvOzn/0sH/7whzN8+PB87Wtfy9ixY7N58+bcdddd2blzZ1588cUkyVe/+tXU1dXl5ZdfzooVKzJ58uT8z//5P7sEIH/7t3+b973vfbn66qszePDgjB07Nu3t7XnxxRezYMGC/Kf/9J+yc+fO3HfffTn77LNz880359Of/nSSZN26dfnoRz+a0047LV/5yleSJIMHD+7RfgAAABxM3mpNt69WrFiRj3/846mtrc23vvWtJElNTU13DRvgkOUzWAB6kSlTpuQnP/lJ/vVf/zXvec973rJ+165dKZVKOf300zN48OD84z/+Y5LXrmAZM2ZM/uAP/iBPPPFEqqur3/I5Pve5z+UnP/lJfvKTn5Tv8xksAAAA++6t1nRNTU254oor8sY/991yyy35zGc+k02bNpWvVPEZLADvnLcIA+glXnnllaxevTqzZs1603Dl29/+dv7zf/7POfzww9O3b99UV1fnf/7P/5knnniiS+3MmTP3GK78wz/8Q0455ZS8613vKj/HTTfdtMfnAAAA4K3t65oOgJ4jYAHoJVpbW7Nr164cccQRe6255ppr8n/9X/9XJk6cmDvvvDMPPfRQHn300Zx++unZsWNHl/pRo0Z12faP//iPmTVrVv7Tf/pPWbZsWdatW5dHH300n/3sZ/Pv//7v+/WYAAAAeot9WdMB0LN8BgtALzF06ND06dMnzz333F5rli1blsmTJ+f666/vtH3btm17rN/ThycuW7YsY8aMyfe///1O97e3t7/NkQMAALAva7rDDz88yWvrr9d/pspvf/vbbh8fQG/kChaAXqJ///459dRT8w//8A97/eW6qqqqywcb/vznP8+6dev2eT9VVVXp169fp3ClpaUlP/zhD7vU1tTU7PHKGAAAADrblzXd7s9X+fnPf95p+z/90z91qbUeA3jnBCwAvcg111yTjo6OTJw4MTfeeGPuv//+LF++PLNnz862bdsyY8aMrFq1Kl/96lfz4x//ONdff32mT5+eMWPG7PM+ZsyYkSeffDIXXnhhfvzjH+fWW2/Nhz/84T2+ndhxxx2XBx54IP/0T/+Uxx57LE8++eT+PFwAAIBDylut6f70T/80Q4cOzXnnnZcf/OAHufvuu/Pxj388zz77bJfnOu644/Kzn/0s3//+9/Poo4/mF7/4RQWOCODgVlUqlUqVHgQAPeeJJ54oByjbtm1LXV1dPvrRj+bb3/52qqqqcvnll+d73/tefve73+XYY4/N//gf/yMrVqzIAw88kKeeeipJ8tRTT2XMmDG56qqrsmDBgi77+MY3vpFvf/vb2bx5c9773vdm/vz5ee6553LFFVfk9f/b+dnPfpaLLrooP/3pT/PKK6/k1FNPzQMPPNBDnQAAADj4vNmarqamJo8++mjmzZuXn/3sZ3n3u9+dv/qrv8ro0aPzV3/1V9m0aVP5Kpenn346559/ftatW5dt27blqKOOKq/5ANg3AhYAAAAAAICCvEUYAAAAAABAQQIWAAAAAACAggQsAAAAAAAABQlYAAAAAAAAChKwAAAAAAAAFFQ4YHnwwQdz1llnpb6+PlVVVfnBD35Qvq+joyNf/OIXc9xxx2XgwIGpr6/Ppz/96fzmN7/p9Bzt7e25+OKLM3z48AwcODAzZ87Mc88916mmtbU1jY2Nqa2tTW1tbRobG/PSSy91qnnmmWdy1llnZeDAgRk+fHjmzp2bnTt3Fj0kAAAAAACAQvoWfcD27dvzgQ98IJ/5zGfy53/+553ue+WVV/KTn/wkX/nKV/KBD3wgra2tmTdvXmbOnJnHHnusXDdv3rz80z/9U5YvX55hw4blkksuyYwZM7J+/fr06dMnSTJ79uw899xzWblyZZLk/PPPT2NjY/7pn/4pSbJr166ceeaZec973pM1a9bkd7/7Xc4555yUSqUsWbJkn47l1VdfzW9+85sMGjQoVVVVRVsBAABvS6lUyrZt21JfX5/DDnNROeyJ9RoAAJVQaL1WegeSlFasWPGmNY888kgpSenpp58ulUql0ksvvVSqrq4uLV++vFzzb//2b6XDDjustHLlylKpVCr98pe/LCUpPfTQQ+WadevWlZKU/uVf/qVUKpVK9957b+mwww4r/du//Vu55nvf+16ppqam1NbWtk/jf/bZZ0tJfPny5cuXL1++fPmqyNezzz67T7+3Qm9kvebLly9fvnz58uWrkl/7sl4rfAVLUW1tbamqqsq73/3uJMn69evT0dGRhoaGck19fX3Gjx+ftWvXZvr06Vm3bl1qa2szceLEcs1JJ52U2trarF27NuPGjcu6desyfvz41NfXl2umT5+e9vb2rF+/PqeddlqXsbS3t6e9vb18u1QqJUk2bdqUQYMGJXntbc7uv//+nHbaaamurt6vveDA5/z3Xs597+Xc927Of+9V6XO/bdu2jBkzpvw7KNDV7u+PZ599NoMHD+7RfXd0dGTVqlVpaGjw/4d9oF/F6VlxelaMfhWnZ8XoV3F6Vkwl+7V169aMHj16n9Zr3Rqw/Pu//3u+9KUvZfbs2eVfiFtaWtKvX78MGTKkU+3IkSPT0tJSrhkxYkSX5xsxYkSnmpEjR3a6f8iQIenXr1+55o0WLVqUK664osv2devWZcCAAeXbAwYMyMMPP1zgSDmUOP+9l3Pfezn3vZvz33tV8ty/8sorSeJtj+BN7P7+GDx4cEUClgEDBmTw4MH+ALIP9Ks4PStOz4rRr+L0rBj9Kk7PijkQ+rUv67VuC1g6OjryyU9+Mq+++mq+9a1vvWV9qVTqNOA9Df7t1LzeZZddlvnz55dv706iGhoayr+wd3R0pLm5OdOmTTPReyHnv/dy7nsv5753c/57r0qf+61bt/b4PgEAANi/uiVg6ejoyKxZs7Jp06b8+Mc/7vRqo7q6uuzcuTOtra2drmLZsmVLTj755HLN888/3+V5X3jhhfJVK3V1dV1ecdja2pqOjo4uV7bsVlNTk5qami7bq6uruyys97SN3sP5772c+97Lue/dnP/eq1Ln3nwDAAA4+B22v59wd7jyq1/9Kvfdd1+GDRvW6f4JEyakuro6zc3N5W2bN2/Oxo0bywHLpEmT0tbWlkceeaRc8/DDD6etra1TzcaNG7N58+ZyzapVq1JTU5MJEybs78MCAAAAAAAoK3wFy8svv5xf//rX5dubNm3Khg0bMnT
"text/plain": [
"<Figure size 2000x1500 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df2.hist(bins = 50, figsize = (20, 15)) # histograms for each feature"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGdCAYAAABO2DpVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjbElEQVR4nO3df3BU9f3v8ddmE/KLJAVChEiAQCpYE6T1RwoSfohhDD9MDOnYWp2Oo52pReYqiSg4KOlY0hahXnWkSsfxemcE27CEDgQmQSEs9RsVKjOgYIHySww/EusubMKy2ez9w5voSpDs5pPdbPJ8zDCYs2f3vMPM7j495+xZi8/n8wkAAMCAqHAPAAAA+g7CAgAAGENYAAAAYwgLAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMZEh3qDbW1t+uKLL5SUlCSLxRLqzQMAgCD4fD5duHBB6enpioq6+n6JkIfFF198oYyMjFBvFgAAGHDq1CmNGDHiqreHPCySkpIkfT1YcnJyqDcPoAd5PB7V1NRo1qxZiomJCfc4AAxyOp3KyMjoeB+/mpCHRfvhj+TkZMIC6GM8Ho8SEhKUnJxMWAB91LVOY+DkTQAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwBGeL1e1dXVadeuXaqrq5PX6w33SADCgLAA0G02m01ZWVnKz8/X6tWrlZ+fr6ysLNlstnCPBiDEAgqL5cuXy2Kx+P0ZNmxYT80GIALYbDaVlJQoJydHdrtd69atk91uV05OjkpKSogLoJ8J+LtCbrrpJm3fvr3jZ6vVanQgAJHD6/WqtLRUc+fOVVVVlbxer5qampSbm6uqqioVFRWprKxMhYWFvFYA/UTAYREdHc1eCgCSJLvdruPHj2vdunWKioryO68iKipKS5Ys0eTJk2W32zV9+vTwDQogZAIOi8OHDys9PV2xsbHKzc3VihUrNGbMmKuu73a75Xa7O352Op2Svv4WRI/HE8TIAHqLU6dOSZLGjRvn95xu/3vcuHEd6/F8ByJbV5/DAYVFbm6u3nrrLd1www06e/asnn/+eU2ePFmffPKJhgwZ0ul9KioqVF5efsXympoaJSQkBLJ5AL3MiRMnJElr167tiAhJqq2tlSQdOnSoY73q6urQDwjAmObm5i6tZ/H5fL5gN+JyuTR27FgtXrxYixYt6nSdzvZYZGRkqLGxUcnJycFuGkAv4PV6deONN+qmm27Shg0b5PV6VVtbq/z8fFmtVs2fP1+ffvqpPv30U86xACKc0+lUamqqHA7H975/B3wo5NsSExOVk5Ojw4cPX3Wd2NhYxcbGXrE8JiZGMTEx3dk8gDCLiYnRqlWrVFJSop/97Gd68skn1dLSor1792rlypWqrq5WZWWl4uLiwj0qgG7q6nt2t8LC7Xbr4MGDysvL687DAIhgxcXFqqysVGlpqaZOndqxPDMzU5WVlSouLg7jdABCLaBDIWVlZZo3b55Gjhypc+fO6fnnn1ddXZ3279+vUaNGdekxnE6nUlJSrrkrBUBk8Xq92rFjh7Zu3aqCggLNmDGDwx9AH9LV9++A9lh8/vnn+sUvfqHGxkYNHTpUP/3pT1VfX9/lqADQd1mtVk2bNk0ul0vTpk0jKoB+KqCwWL9+fU/NAQAA+gC+KwQAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAjPB6vaqrq9OuXbtUV1cnr9cb7pEAhAFhAaDbbDabsrKylJ+fr9WrVys/P19ZWVmy2WzhHg1AiBEWALrFZrOppKREOTk5stvtWrdunex2u3JyclRSUkJcAP2Mxefz+UK5QafTqZSUFDkcDiUnJ4dy0wAM83q9ysrKUk5OjqqqquT1elVdXa3Zs2fLarWqqKhIBw4c0OHDh2W1WsM9LoBu6Or7N3ssAATNbrfr+PHjWrp0qaKi/F9OoqKitGTJEh07dkx2uz1MEwIINcICQNAaGhokSdnZ2Z3e3r68fT0AfR9hASBow4cPlyQdOHCg09vbl7evB6DvIywABC0vL0+jR4/WihUr1NbW5ndbW1ubKioqlJmZqby8vDBNCCDUCAsAQbNarVq1apU2b96soqIi1dfXq6WlRfX19SoqKtLmzZv1wgsvcOIm0I9Eh3sAAJGtuLhYlZWVKi0t1dSpUzuWZ2ZmqrKyUsXFxWGcDkCo8XFTAEZ4vV7t2LFDW7duVUFBgWbMmMGeCqAP6er7N3ssABhhtVo1bdo0uVwuTZs2jagA+inOsQAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAABgDGEBAACMISwAGOH1elVXV6ddu3aprq5OXq833CMBCAPCAkC32Ww2ZWVlKT8/X6tXr1Z+fr6ysrJks9nCPRqAECMsAHSLzWZTSUmJcnJyZLfbtW7dOtntduXk5KikpIS4APoZi8/n84Vyg06nUykpKXI4HEpOTg7lpgEY5vV6lZWVpZycHFVVVcnr9aq6ulqzZ8+W1WpVUVGRDhw4oMOHD8tqtYZ7XADd0NX3b/ZYAAia3W7X8ePHtXTpUkVF+b+cREVFacmSJTp27JjsdnuYJgQQaoQFgKA1NDRIkrKzszu9vX15+3oA+j7CAkDQhg8fLkk6cOBAp7e3L29fD0DfR1gACFpeXp5Gjx6tFStWqK2tze+2trY2VVRUKDMzU3l5eWGaEECoRYd7AACRy2q1atWqVSopKdE999yjzMxM/fvf/9b27dt17NgxVVdXq7KykhM3gX6ET4UA6LaioiJt2rTpiuWFhYWqqqoK/UAAjAvJp0IqKipksVj0+OOPd+dhAESwxYsXa9OmTbJYLH7LLRaLNm3apMWLF4dpMgDhEHRYfPTRR3r99dc1YcIEk/MAiCCXL1/WqlWrJElz5szxu0DWnDlzJEmrVq3S5cuXwzkmgBAKKiwuXryoX/7yl1q7dq0GDRpkeiYAEeKVV15RW1ubbr75Zm3atEm5ubmKj49Xbm6uNm3apAkTJqitrU2vvPJKuEcFECJBnby5YMECzZkzR3fddZeef/75713X7XbL7XZ3/Ox0OiVJHo9HHo8nmM0D6CXq6uokSeXl5fJ6vR3P6fa/n3vuOc2fP191dXVauHBh2OYE0H1dfc8OOCzWr1+vf/3rX/roo4+6tH5FRYXKy8uvWF5TU6OEhIRANw+gF3E4HJKkLVu2+C2vra2VJFVXV3es1/7fACJTc3Nzl9YL6FMhp06d0q233qqamhrdfPPNkqTp06dr4sSJevHFFzu9T2d7LDIyMtTY2MinQoAIt337ds2ePVuDBg3S6dOn5fP5VFtbq/z8fFksFl1//fX673//q+rqat11113hHhdANzidTqWmpl7zUyEBhUVVVZXuvfdev8+ke71eWSwWRUVFye12X/Pz6nzcFOg7vF6vhgwZIofDobS0NC1fvlxxcXG6dOmSli9frnPnziklJUVNTU1cywKIcF19/w7oUMjMmTO1f/9+v2UPPfSQxo8fr6eeeooXDqCfsVqteuONNzR//nydP39ev/3tbztua//46RtvvMFrA9CPBPSpkKSkJGVnZ/v9SUxM1JAhQ676JUQA+rbi4mJt2LBBI0eO9Fs+atQobdiwQcXFxWGaDEA4cElvAN1WXFyswsJC7dixQ1u3blVBQYFmzJjBngqgH+p2WOzcudPAGAAindVq1bRp0+RyuTRt2jSiAuin+HZTAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAABgDGEBAACMISwAAIAxhAUAI7xer+rq6rRr1y7V1dXJ6/WGeyQAYUBYAOg2m82mrKws5efna/Xq1crPz1dWVpZsNlu
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 'carat' distribution is not normal\n",
"# let's see the box plot\n",
"\n",
"df.boxplot(['carat'])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>53940.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.797940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.474011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.700000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.040000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>5.010000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat\n",
"count 53940.000000\n",
"mean 0.797940\n",
"std 0.474011\n",
"min 0.200000\n",
"25% 0.400000\n",
"50% 0.700000\n",
"75% 1.040000\n",
"max 5.010000"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# deal with 'carat' outliers\n",
"\n",
"df[['carat']].describe()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.24000000000000002\n",
"1.2\n"
]
}
],
"source": [
"# outliers, continued\n",
"\n",
"Q1 = df['carat'].quantile(0.25)\n",
"Q3 = df['carat'].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"lower_limit = Q1 - 0.25 * IQR # define boundary for the lower end\n",
"upper_limit = Q3 + 0.25 * IQR # define boundary for the upper end\n",
"\n",
"print(lower_limit)\n",
"print(upper_limit)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"573"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many outliers on the low end?\n",
"\n",
"len(df[df.carat < lower_limit])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9155"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many outliers on the high end?\n",
"\n",
"\n",
"len(df[df.carat > upper_limit])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# drop the outliers\n",
"\n",
"df = df[~((df.carat < lower_limit) | (df.carat > upper_limit))]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.26</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>61.9</td>\n",
" <td>55.0</td>\n",
" <td>337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.30</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>64.0</td>\n",
" <td>55.0</td>\n",
" <td>339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0.31</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>62.2</td>\n",
" <td>54.0</td>\n",
" <td>344</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price\n",
"3 0.29 3 3 3 62.4 58.0 334\n",
"4 0.31 1 1 1 63.3 58.0 335\n",
"7 0.26 2 2 2 61.9 55.0 337\n",
"10 0.30 1 1 1 64.0 55.0 339\n",
"13 0.31 4 4 4 62.2 54.0 344"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head() # new dataframe having dropped outliers"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nNu-6-3emMH5"
},
"source": [
"# Task 6: Visualize Cleaned Up Dataset"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.PairGrid at 0x7f8103fb17e0>"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABrkAAAa5CAYAAAD/yGElAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeXxU5b3/32eZfUlCQliUQCAouwZR0CRal9YNWyz39hZtCwRrW8W2t7/WFdSqbbXLvb1u3RT1toq993rVinaz2gq0atXcyqpEIqAikJBkMtuZOcvvjzNzmMnMJGAFJunzfr14kZw55znPk/N8n+eZ830+369kWZaFQCAQCAQCgUAgEAgEAoFAIBAIBAKBQDCEkI92BQQCgUAgEAgEAoFAIBAIBAKBQCAQCASCQ0U4uQQCgUAgEAgEAoFAIBAIBAKBQCAQCARDDuHkEggEAoFAIBAIBAKBQCAQCAQCgUAgEAw5hJNLIBAIBAKBQCAQCAQCgUAgEAgEAoFAMOQQTi6BQCAQCAQCgUAgEAgEAoFAIBAIBALBkEM4uQQCgUAgEAgEAoFAIBAIBAKBQCAQCARDDuHkEggEAoFAIBAIBAKBQCAQCAQCgUAgEAw5hJNLIBAIBAKBQCAQCAQCgUAgEAgEAoFAMOQQTq4PEcuyiEQiWJZ1tKsiEAhKIOxUIBgaCFsVCIYGwlYFgqGBsFWBYGggbFUgGBoIWxUIygvh5PoQ6evro6Kigr6+vqNdFYFAUAJhpwLB0EDYqkAwNBC2KhAMDYStCgRDA2GrAsHQQNiqQFBeCCeXQCAQCAQCgUAgEAgEAoFAIBAIBAKBYMghnFwCgUAgEAgEAoFAIBAIBAKBQCAQCASCIYdwcgkEAoFAIBAIBAKBQCAQCAQCgUAgEAiGHMLJJRAIBAKBQCAQCAQCgUAgEAgEAoFAIBhyCCeXQCAQCAQCgUAgEAgEAoFAIBAIBAKBYMihHu0KHA5eeOEFvve97/Hqq6+ye/duHn/8cRYsWFDy/P/93//lRz/6Ef/3f/+HpmlMnz6dm2++mXPPPffIVVogEJQN73TH6UvqRBJpKnwugl6VY6v8R7tah4xoR/kwHNoA5deOcqvPB0W0o3wYDm2A8mtHudXngzIc2jEc2gCiHYLyZk8kSXcsRUzTGRFwoxkmWGABaV1nhN9DyrRIpA20tEGV3z6nL6ET8qq4VZmeeIqAR8WlyCS0FJU+DwYQSxn0JdKEfC78bgUXoJkW8bRBXDMI+1Q8qoxlQVdUI+hzEXApxNMGfUmdsFfFm/k9mkxTHfCQMkwimc/cqkzaMDEt6EvoBDwKAY9Kpc9Fhd99lP+yguFEbzxFZzRFJJkm7HNRE3BT4XcXjIsVXhUD8o4F3ArxtIbf5SGaMpzjIa/9ijPS71yvqaPJKn2Zc8M+F0G3ghfQIO942K2ALOWVkR2b+9ct5FaQgZ6cOpQ816siAb1Fyi32twCK/n1K/d0E/1i82x0nktSJa2lGhrz23JBME/bafVs3LWIpg3jKoNLvosLnQksZAEiSrXixAAnbMWACSqZskw9HEZMtvxjpzP1kQM/8LGV+NjPXugAjU78UoJgWliwhZc5V+t1Dz6l3GpBMC1OW0EyDgKw4ZaaKtDFbj1jaQJHBpyj2PJ0y0A2DmoDHqZdhWliAZYFuGQRUe4yKpwxiyTQjgx4kWXKeSYXfRcjjIpEyCmy8K5bCsCxckmSXmSk3ZRjIsoRblkmkDWIpg0qfi9qQp+A604KEnnnWmXP6jwkHO8YA9MTTxFJ63j2HyhgzLJ1csViME044gaVLl7Jw4cJBz3/hhRf46Ec/yre//W0qKyt54IEHuOiii3jppZdobGw8AjUWCATlwo6uGNc/voH17V3OseaGar518UzGVweOYs0ODdGO8mE4tAHKrx3lVp8PimhH+TAc2gDl145yq88HZTi0Yzi0AUQ7BOXNzsxzfW1nD3cuauSeP7ZzydzxPLC+g51dcX6+7BR29SS4+/l22nb2cPcljfz7799kbU4/aGqoZmlTPZ+5/2XOOK6G686bSsKwuOlXG/P6y9lTRrJy/jRufHJj3vUtk2u46qwGkimTiJbg/nUdBf2stbkeCYl/fza/zJaGGq48cxKtD71CPPNCtKmhmqvOmsz4EX7GVPoO559P8A/Cez0JrnnsddZu63SOfW5eHctaJuaNi+OqfPx82Snc8ER+P71gxiiuPm8q1/YbQ4v13+aGar61YCbffGoTz23d55zb3FDNbQtm8t3fbOGZjXsOlDG5his+Moll/cu4OL8Mv1vh/sVzuPf59jz7y577P3/dyV1/3H5Q5966ZjPPbtmbV4crz2yg9cG/OnU4fXINty2YwS39zj19cg23L5zFWGGb/zBk1w9vvN/Hw5fN47oia4mbPz6dxQ+8TDxlcM8ls+mJa1QHPRmnloQOWJaFW5JJc8A5kOaAc0nJ+f9Q0bGdTzKFjq4koGBhIaFxwMGlYTufDMvEI0mkkFCxSCCRSCUIuH1IloWEhJq5h5qpY9bBZWTaEE0mCHh93PfCW3yxZRK6Ba5MWbpl2k4+Sca0TFRJRgNuWbOJ1tPqGTfC58zTO7vi/GLZKWimiQV09mkAmBb88q87+ddzjiNpmNz0q0207ezhx5eeRNqyuOlxe8zyuxXuXNTIA+s3O8/I71ZYteRk7nthO5fOG4/ffcDdZlrws7Xb+ey8CXhdMnc/3z7gdaZF3jlgjx935IwJ/cfbbDn3PNfO2vYDxx5cejKmCXc9v23A8soZybIs62hX4nAiSdKgSq5iTJ8+nX/5l3/hxhtvPOhrIpEIFRUV9Pb2Eg6HD7GmAoHgSDCQnb7THeeax17PG9CzNDdUc/vCWUNid+1waseNT25k2tgKGsdVoukmXpfCazu72fJeL9/8xIyyb8dwehZHuh3CVodWO4StlgfCVg8Pw6Edw8FOYXg8Cyg/WxX8/eyJJIlruuNwWn5WA207u2msq6JtZzfr27v41fImtu6OoJsWo8JeXIqMqkj8+a0uVq3rcF5mg+1YaqyrYk5dFaMqPNz29JaC/pK9R7F+1NJQw5VnTeLu59pZV/Tzas6fOYbrH99Y8Fn23nc/1553bP6ssVwwY/SQ2dE9VBnuttobT7F8dVuegwvgj18/gxVPbMzrr79a3sQdv9la0MfvXzyHVes7ivb9Yv23uaGaE/sdyx5f2lTPsodeOeQyBrK/5oZqvnPxTFq+98eDOrdY3Q6lHadPruGuRY3CNo8wR8NW3+2Oc3Vm/TCQHWT7dtuuHuqqfEyoCWBaFpU+NxYWvYk0Yyt8SHx4yq2DJZYyCLht11natHDJkvPz3r6krXJ0q0RTOkG3ytr2TloaaoilDNyqjCJLBfXNLSd7/rWPb+BbC2ZgWZDUDYJulXd7EwDUhrzs7UtSG/IiASue3EhjXRUXzhxN284ent6wm/XtXTy1vIlYSgdg+76Yc7+nN+zmmvOmEE/p3PWc7WRaflYDzQ3Vzu9Q3Pazx06eMIKxFV5HGZYtt7GuirEVXqcO/a/Lfp49v9jzz44JSd3k6//1f3nO9Ww5bTt7aG2up3FcJSMCbrbujrBmkPLKfYwZlkquvxfTNOnr62PEiBEDnqdpGpqmOb9HIpHDXTWBQHCIHIqd9iX1ogM6wLr2LvqS+odev8PBcGlHVNOdna/9v2QvbaonqpV/O4bLszgS7RC2ms9Qaoew1fJB2OrhYTi0YzjYKQyPZwHlZ6uCv4+dXTGue3wDrU31zoukxnGV3P1cO61N9Y7NuVSZE8dV8c01m/Kef1NDNXcuauTLq9scR9f69i6WNU+krtpPImUU7S/ZexRjbXsnV59/fFEHl/15F0ua6ot+tr69i9Z+n2WPdUZTZf+Sa6jxj2arndFUgYMLIJE2C/qrqshF+35t2FtyDC3Wf9e1d7G0SH9f197FNedP+UBlDGR/69q7iOY4rQc7t1jdDqUdL2zrFLZ5BCgHW43krB9qw54B1xLXnD+FOXVVjAy7kSUZy8qE2UMi5HURTxuUkr1IEiU/O5hzS10vZWIN5tqHlvN5wKMiSZL9eeb/sRU
"text/plain": [
"<Figure size 1750x1750 with 56 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.pairplot(df) # plot every column against every column\n",
"\n",
"# 'carat' and 'price' have a linear relationship"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAg8AAAGiCAYAAABgTyUPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAACXwUlEQVR4nOzdd1hURxfA4d/SERSUZkWxRsSusWLssSRG/WIvsWtULNhjN4lYEjVq1BhbjD3GmtiIvWHFEntBEekIqIDU/f4grlkWEJZdWs7rc5+HHWbungF3OTszd65CqVQqEUIIIYTIIIOcDkAIIYQQeYskD0IIIYTIFEkehBBCCJEpkjwIIYQQIlMkeRBCCCFEpkjyIIQQQohMkeRBCCGEEJkiyYMQQgghMkWSByGEEEJkiiQPQgghhMgUSR6EEEKIXOLUqVN8+umnFC9eHIVCwZ49e97b5uTJk9SuXRszMzPKli3LqlWr9B6nJA9CCCFELhEVFUX16tVZvnx5hur7+PjQrl07XF1d8fb25quvvmLUqFH8/vvveo1TITfGEkIIIXIfhULB7t276dixY5p1Jk2axL59+7hz546qbNiwYVy/fp3z58/rLTYZeRBCCCH0KDY2lpcvX6odsbGxOjn3+fPnad26tVrZxx9/zOXLl4mPj9fJc6TGSG9nzqT40Mc5HYJeta85PKdDEOK9PjKwyekQ9K5KbP4fbG3zZ9ecDkHvzGp31Ov5dfk3yWP5RmbPnq1WNnPmTGbNmpXlcwcGBuLg4KBW5uDgQEJCAqGhoRQrVizLz5GaXJM8CCGEELlGUqLOTjVlyhTc3d3VykxNTXV2foVCofb47WqElOW6JMmDEEIIoUempqY6TRb+rWjRogQGBqqVBQcHY2RkhI2N/kYSJXkQQgghUlIm5XQEGdKgQQP279+vVnbkyBHq1KmDsbGx3p5XFkwKIYQQKSUl6e7IhNevX3Pt2jWuXbsGJF+Kee3aNXx9fYHkKZC+ffuq6g8bNoynT5/i7u7OnTt3WLduHWvXrmX8+PE6+1GkRkYehBBCiBSUOTTycPnyZZo1a6Z6/HatxBdffMGGDRsICAhQJRIATk5OHDhwgLFjx/Ljjz9SvHhxli5dyv/+9z+9xinJgxBCCJFLNG3alPS2X9qwYYNG2UcffcTVq1f1GJUmSR6EEEKIlDI53fBfI8mDEEIIkVIeWTCZU2TBpBBCCCEyRUYehBBCiJR0uElUfqTVyEPz5s2JiIjQKH/58iXNmzfPakxCCCFEzlIm6e7Ih7RKHk6cOEFcXJxG+Zs3bzh9+nSWgxJCCCFE7pWpaYsbN26ovr59+7balpiJiYkcOnSIEiVK6C46IYQQIifI1RbpylTyUKNGDRQKBQqFItXpCXNzc5YtW6az4IQQQoickFObROUVmUoefHx8UCqVlC1blosXL2JnZ6f6nomJCfb29hgaGuo8SCGEEELkHplKHkqXLg1AkgznCCGEyM/k71y6snSp5u3bt/H19dVYPNmhQ4csBSWEEELkKJm2SJdWycPjx4/p1KkTN2/eRKFQqPbhVigUQPLiSSGEECLPkn0e0qXVpZqjR4/GycmJoKAgChQowK1btzh16hR16tThxIkTOg5RCCGEELmJViMP58+f59ixY9jZ2WFgYICBgQGNGzfGw8ODUaNG4e3tres4hRBCiOwj0xbp0ip5SExMxNLSEgBbW1v8/f2pVKkSpUuX5t69ezoNUJcuX7vJ+i07uX33ISFhL/jBYzotmjTM6bAyrM/Y3rTr1RZLK0vuet9j+bQfeXr/aZr1S1csTd9xfahQtQJFSzmwctYqdq/do1bHwNCAvu59aN6xGYXtC/Mi6AVHfvNky9Kt6d4WVl+kj5pycx9r92lJ/aHtsbSzJuTBczxn/8qzS2m/BzjW+4CW03tjV6EEr4Ij8Fr1B1c3H1V9v1KbOjQa8RmFSztgYGxIuE8QXj8f4O/dZ1R1FIYGNBn7P1w6NsTCzprXwRHc+O0UZ5btgWz4fTr1a0n54Z9gZm/Nq3vPuTljI2EXUu+zqb01LrN6YV3NCcuyRXm85jA3Z/yqUa/c4DaU+aIlBUrYEvviFf5/XOD23O0kxcbruzup2u55ng1/nCQ04hXlSjgwse+n1PrAKc36f57xZsMfJ/ENDMWygBkNq1VkXK/2WBe0UNXZdPA0O/7yIjA0AuuCFrSqV5VR3dpgamKcHV3KPFkwmS6tpi1cXFxUG0bVq1ePBQsWcPbsWebMmUPZsmV1GqAuxcS8oVL5snzlPjynQ8m0rl92ofPgTiyftgK3T0YRHvKCeVvmYm5hnmYbU3NTAn0DWTdvHWFBL1Kt0214V9r3bsfy6SsY1GwIa+aupcuwz/msf/YvepU+pi639rHyJ/VpNaMPZ5fvZU37qTy7eJfuv0ykUHGbVOtblbKj24YJPLt4lzXtp3Lux720ntWXSm3rqurERERxdvleNnSexc8fT+H6byf59LshlG1SVVWn4ZefUqtXCw7P+IWfWkzgmMdW6g9tT91+rfXW17dKfFafqnP6cn/JHo63+oqwC3dpsGUS5iVS77OhqRFxYa+4/8NeIm/5plqnZOdGOE/tzr3vd3G0yXi83VdT4rMGOH/VTZ9dSdOh89dZsHE/gzs2Z/vcUdT6oAzD568jIDQ81fpX7/owbeV2Ojaty+8L3Fk4qhe3Hvsx6+ffVXX+POPND9sOMaxzS3Z/N45ZQz7n8PnrLN1+KLu6JXRMq5GHadOmERUVBcA333zDJ598gqurKzY2Nmzfvl2nAeqSa4O6uDao+/6KuVCngZ3YumwbZw+dBWDh2O/ZfnUrzTs248/NB1Jtc//6fe5fvw/AgMkDUq1TuVZlzh/x4uKxiwAE+QXR9LOmVKxWUQ+9SJ/0MW/1sd6gtlzbfoJr204A4DlnE2U/qkat3i05sUDzfaBWrxa89A/Dc84mAMIe+lOsalnqD2nPvYOXAPD1uqPW5tL6w1T73JVSdSvx+NRNAErUqsB9zys8PHYNgEi/UKp0aECxavr/4FJuaDuebj3B0y0nALg541fsm1bD6YuW3J6r2efoZ6HcnL4RgNI9Pkr1nEXqVODFpfv47T6navN8zzmsa5TTTyfe49cDp+nUtC6dm30IwMS+HTh34z47/vJidPe2GvVvPvSluF1herVpBEBJ+yJ83qIeG/afVNW5/uApNSqWpl2jmgCUsCtCm4Y1+PvRs2zokZZk2iJdWo08fPzxx3Tu3BmAsmXLcvv2bUJDQwkODpYbY+lBUcei2DgU4cqpq6qy+Lh4bly4iXPtylk6961Lt6jRqAYlnJK3FS9b2QmXulW4ePxSls6bWdLHvNVHA2NDilV1wuf0TbXyx6duUrJ2hVTblKxVQZUAvKt/g2JVnTAwSn1zuTKNqlCkbDF8L9xVlT27dI8yDatQxKkoAPaVHSlZpxIPj1/LQo/eT2FsiHU1J4JP3FArDz55kyJ1tU/Swi7cw7qaE9Y1k5OFAo72ODSvQdBf2b92LD4hgTs+z2lQTf132KBqRa6nMbVWvWJpgl5Ectr7LkqlkrDIV/x14SauNT9Q1alZqQx3fJ5z82FysuAXFMaZa3fV6uQ6SUm6O/KhTI88JCQkYGZmxrVr13BxcVGVFylSJMPniI2NJTY2Vq3MIDYWU1PTzIbzn1DErjAA4SmGDSNCwrEv6ZClc29fsQOLghasPfEzSYlJGBgasGHBL5zYeyJL580s6WPe6mOBwgUxMDLkdWikWnlUaCSWdlaptrGwsyIqRf3XoZEYGhtRoEhBXgdHAGBa0JxRF5ZjaGKEMjGJQ9M34HPmb1Wb8yv3Y1qwAMOOLVT19cTC37i977xuO5mCaZHkPseGqPchNiQS0zT6nBHP957H1LYgTfbOBAUYGBvxeIMnD5bvz2rImRb+KprEpCRsrCzVym2sLAmNfJVqmxoVy+AxojsTl20mLj6BhMQkmtZ2ZvIXn6nqtG1Yg/BXUfSbvRJQkpCYRNeW9RnYoZk+uyP0KNPJg5GREaVLl87SXg4eHh7Mnj1brWzahFHMmDha63PmJ807NmP0vFGqx9P
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.heatmap(df.corr(), annot = True) # what is the relationship between columns?\n",
"\n",
"# there's a strong correlation between 'carat' and 'price'"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4k8nJyFemMBY"
},
"source": [
"# Task 7: Conceptualize the Problem"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"id": "gu4nQbmQm9z8",
"tags": []
},
"outputs": [],
"source": [
"# 'carat' is a unit of mass and has a positive correlation with 'price', which makes sense:\n",
"# the larger the diamond, the higher the price.\n",
"\n",
"# Predicting 'price' is a typical regression problem. So let's use three different models:\n",
"\n",
"# Simple Linear Regression\n",
"# Multiple Linear Regression\n",
"# Decision Tree Regression"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E9xy8hetm9wG"
},
"source": [
"# Task 8: Perform Data Split"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"id": "Swr_A0p1m9r6"
},
"outputs": [],
"source": [
"X = df[['carat']] # only one feature\n",
"y = df['price']"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(29622, 1)\n",
"(14590, 1)\n",
"(29622,)\n",
"(14590,)\n"
]
}
],
"source": [
"print(X_train.shape)\n",
"print(X_test.shape)\n",
"print(y_train.shape)\n",
"print(y_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CCJo3OYWm9oQ"
},
"source": [
"# Task 9: Choose the ML Method to Follow"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"id": "7UJWBSspm9lM"
},
"outputs": [],
"source": [
"model = LinearRegression() # instantiate the model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dmo0wF-vnXzD"
},
"source": [
"# Task 10: Train the ML Model"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"id": "AUVPWfp9nXvl"
},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\
],
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_train, y_train) # fit the model"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['simple_regression.joblib']"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# save the model for later\n",
"dump(model, 'simple_regression.joblib')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RYSfWcCcnXsb"
},
"source": [
"# Task 11: Test the Model"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"id": "2ZDwzmMvnXmA",
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[6692.49012456]\n"
]
}
],
"source": [
"print(model.coef_) # the slope"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# The coefficient is positive. Therefore, 1 unit increase in carat is associated with\n",
"# more than $6600 increase in price."
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"id": "ufa3Ch6inXcK"
},
"outputs": [],
"source": [
"# predictions\n",
"y_pred = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7f8100363490>"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjEAAAGdCAYAAADjWSL8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAABv1klEQVR4nO3deXxTdb4//lfSJulim260aaWUihu1uFAFCgoqizhidZy5LkBHf+NFXBGXEXHufCnXe0Fwrnrv4IKMywyozL13ZIQrVosoCLQUwTqUggO1bNJQuqWlW9rk/P4oCUma5ZzsJ3k9H48+HpCcnPM5yTnNp5/P5/1+KwRBEEBEREQkM8pQN4CIiIjIG+zEEBERkSyxE0NERESyxE4MERERyRI7MURERCRL7MQQERGRLLETQ0RERLLETgwRERHJUmyoGxAoZrMZp06dQlJSEhQKRaibQ0RERCIIgoDOzk7k5ORAqXQ/1hKxnZhTp04hNzc31M0gIiIiL5w4cQLDhw93u03EdmKSkpIADL4JycnJIW4NERERidHR0YHc3Fzr97g7EduJsUwhJScnsxNDREQkM2KWgnBhLxEREckSOzFEREQkS+zEEBERkSyxE0NERESyxE4MERERyRI7MURERCRL7MQQERGRLLETQ0RERLIUscnuSByTWUB1QyuaOnuRmRSHcflpiFEqnD4OAFX1Laj8sRmAAsWj0nHdyDTsPdY25PVSjgfAaRtsGQfM+NOuo9j9YzP0HX3QxCoxPDUeo3XJ6DQOQDADHb1GmAXgdHsPjracRdPZfiRqYjHlkgyMHHYBGprPYufhFnT09kMdo0BSXAz6zQqoY5S4efQwdPeZ8Y/Tnejo6UeMUkBHrwndfQOAIKDLaIZJAGKVgEkY/FFg8EepBDISVUiOV+FUew96+wUoFECMAjCbgf5z2ybFx2BkagK6+k0wCcCEkWkYnp6AmuPt6Ok3YcyFKZh4UTqgAHbVN6PmeBuOt3bDJAAj0xIQp1LiaHMXOvsGAABJcWrkZyTgAk0MegcEZCXH4ZoRqchJice4/DQYB8xYtrkOR1u6MTI9Ac/OuBz/u/cEjrZ0wWQW0NU7gJ4BE64bmY65E/Kw73gbKutbAAgovigDE0alD/lsivJS7T5v2/9nJGoABdB8ts/63O4fW/C/e0/ip/ZuXJiSgMuzk9BythdbDp5B/4AZWVoN7r0uDxemxgMCoO/oRc2JNpgFAQoAyXFqKJVA8UUZuC5/8FrTd/Si9Wwf0hLVyLhAg4OnOrDnWCu6jSZcmavFxIsyAACV9S2oOdGGvgETctMS8YuxwzHx4gy316flGnW8zidclG69L1w953GfP7YMeX+dvU7ytiLa4+o+98Tb+zVQx7a8zva5jAs0gAA0d/VJ2r8/20qhoxAEQQh1IwKho6MDWq0WBoOBGXtdKK9txNJNdWg09Fofy9bGoeSqbGz8vtHu8ZQEFYwDZnQbTXb7UCgA2ysoWxuHJbcXYGZhtqjjpSSoAADt3f0u97F8cx3e/qYBkXmlBoYmVom+AbNP+0hQx0Adq7T7bJQKwGzzOTj+P5AUAHw9VKI6Bv9x91VOr09g8Bp9/uP9ducMDF6n91w7HH/59qTT5166a4xX+3R8XSC2dXWfu7pPbffvzf3qaR/eHtvyOgBDnrMlZv9Sjyl1X+QbKd/f7MREqfLaRjyybp/PXwqOLH+zvDl37JBfuGKPZ7uP7463YfX2Bj+3kqLdWw7XJzB4jT68bl/Q92l5XSC2BeD0vnN1n9q225v7Vcw97+2xxXZiPe3fGW/bSoEh5fuba2KikMksYOmmOr93YIDzv2SWbqqD6dyf6FKPZ7uPt9mBoQCwvT6BwWu0bOOBAOyzzuPryjYegHHALHFbz21duqkOZRsPOL3vnN2ntu329n4Vc897e2xf2uOOt22l8MBOTBSqbmh1ORTrDwKARkMvqhtavT6eZR/8tUGBYHt9AoPXqL6jLwD79Hzd6zv6sLbyqMRtPbe10dDrdjvH+9S23d7er2LveX8eW8r+HZnMAt7f2eBVWyk8cGFvFGrqDFwHxtlxgnU8Iilsr0t/XaPe7vNYa3dAthXDsZ2+vBdS73l/HlvM/m05WwPj7b4odNiJiUKZSXFBPU6wjkckhe116a9r9Gjz+Q6GlH3mpSUEZFsxHNvpy3sh9Z7357HF7N/CmzWB/D0WnjidFIXG5achWxuHQAUOKjC4qt8Sjhno4xFJZXt9AoPXqC5Z4/N+1+85bl07MbhPz198umQNSotHStzWc1uztXHQJWtc3neO96mFN/er1Hven8eWsn9A+pofd/ui0GMnJgrFKBXWUEV/dyws+1tye4E1v4LleFzfQuHC9voEBq/RspIrfN6v7dqJwX0WeHzNbWOysfdYG/7frNEety0ruQLqWKWott573QjMLNQ5ve+c3acWtr8fxBLg/J63PZbUY7t6nbPnxO5fzBoYsfui8MBOTJSaWZiNN+eOhU5r/9dftjYO8yfnI9vh8dQEFRLUMUP2o3C4r3XaOKfhiDMLs/HrSSP90nYKHcdf43L7va4AMH9yvtNw2ZmF2Xhr7lhrLhRv2a6dcLdPy1v3zs6juG9NFV789CDmT853um1KgsouhNvdfhM1MUhJUOHVLf/A+7uOOW2jq/vUtt1vzh2LlHhx78WvJ410es87+x0j9tjOXvfW3LF4y8lzYvZfXtuI61dsxYufHhR1TmLaSqHHPDFRLpgZeyvrW3Dfmqognh352+JbL8OVw1O9ytirALDnWLvoY00bPQwxCiU+rzvt13NQwH3eD8csuO3dRqzbfVz0/j+aNwHF57Id2+3zXBbe+jOd+Kx26DlZ7prXZ4+FNkHlVcbeWCXw2pdHPLbxjdlj8bMrPX8x7zzcjDnv7Pa4nbNztm1jqDP2erMG5ne3jcYDk/I5AhMCTHYHdmL8zfEXimP6eTG/mExmAdev2Ao9Q6dlSakADr14K9Sx5wdwxX5BWT57McP4Cgz+BbztNzdhystf+T0dgO3+xVzDxgEzLvuXz0Rds9naOOxYdLPLe8HT+2Bpm7t9uCL2PZZyDE/3rC/tDRYp1x4gj3OKdFK+vxmdRB45C0V0TDcvJj23Zb77kXX7/JJCnoIrThWDrYdOe5XOXmr+jyW3F2DvsbaA5DOy5P2YsHwLWrs8p8/feug0EtQx6HIoueFIAc9rJ6TkT3E1suHtvr05hrt7Vi7rRaRce3I5JzqPa2LILcswrOMvAcfklXpDLx5Ztw/ltY1u9+dqvpu/L8Jfj9Fk/YxdXReurgMpOTYeOrdmRexrEp2s1RLDtgMDOG+75Tw9dWCyRa6d8DZ/SiBeI3Z7b9e2hAsp74tczonO40gMuSQlFFHA4F8xSzfVYXqBzu1fMTMLszG9QGc3DXF1bgrGLatAZ6/7LwsKHctnPJjyXuEyTbuz60BKjo2N3zfiuZmjRb/m7dJrUdfYgX/fLH7BpjOObce5f7u7/lPiVXh99liXa1YceZs/JRCvkbK9s3tWLhWexZ4n18DIE0diyCWpUwBS0nPHKAcXBt9x9YUoHpWOmhPtPnVgVLySg0LAYOp7dynyLddB1Y8t1scs+T/EsFxDYnKGWKJz5k7IQ1qib1FFwPm2v7+zAbuONHu8/tt7+qFUKkR/8XnKHeNLThKx77G3x3C8Z+XyZS82Zw07MPLEX/3kkrdptoMxFO6o3+zTyykAHvvg/NSM1NwjTZ29ol7T3t2POe/sxhVLyodMD/nixU8P4lfvVovadkudXvR+K+r06B1w3ln3dT2G5f0S88poWvPhbc4akgd2Ysglb9NsB2MonMJfe0+/3RqTmYXZeGraJaJea7keZhZm46HJ+R63F1tgOD1RLW5DiF94/s7Oox7XggHn19e0dzvvbKUkqHxej2FZv+JqREbs2p1II/d1PeQa18SQS5ZhWLEh0ZbQxKK8VFTWt4gKu7XN9aBL1uB0Rx+jltyIUymwe/F01DV2oLG9B0s2HUBn70DQjm/5FKV8RrbrYx6/+RJ8VH3C7XRUaoIK3xxuws4jzbguLxX//e1Jr9ublqjGf95
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(y_test, y_pred) # how close are our predictions?"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7f8100362320>"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAHpCAYAAABN+X+UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA0EElEQVR4nO3de3RU5b3G8WckIYQ0jISYTCIRsEYEA0rBhlCVIPcacyhdRaCmsIpc1ARSoChqDzkuS6y2QE8o1LI4gFwa16minlMaDCK0FMIlkBouUqygAXIBGybgiZMQ3vOHi12HXAghJC/y/ay112L2+5s9v51F5sne8+7ZLmOMEQAAsNJNrd0AAACoH0ENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBHUjGWNUUVEhLjsHALQkgrqRzp49K7fbrbNnz7Z2KwCAGwhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYLGA1m4AuBYSh45QcenpOseiIsO1JTenhTsCgKYhqPG1VFx6Wn1Ss+oc27c4rYW7AYCm49Q3AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFmvVoF66dKl69+6tDh06qEOHDkpISNCf/vQnZ9wYo4yMDEVHRys4OFiJiYk6cOCA3zZ8Pp/S0tIUHh6ukJAQJScn6/jx43415eXlSklJkdvtltvtVkpKis6cOdMSuwgAwFVp1aDu3LmzXnrpJe3Zs0d79uzRQw89pH/7t39zwvjll1/WggULtHjxYu3evVsej0dDhw7V2bNnnW2kp6dr/fr1ys7O1rZt23Tu3DklJSWppqbGqRk/frwKCgqUk5OjnJwcFRQUKCUlpcX3FwCAK+UyxpjWbuKrwsLC9Morr+jHP/6xoqOjlZ6erqefflrSl0fPkZGR+sUvfqGpU6fK6/Xqlltu0erVq/Xoo49Kkk6ePKmYmBht2LBBw4cP16FDh9SzZ0/l5eUpPj5ekpSXl6eEhAR9+OGH6t69e6P6qqiokNvtltfrVYcOHa7NzqPZdO/dT31Ss+oc27c4TYc/2NPCHQFA01jzGXVNTY2ys7P1+eefKyEhQUePHlVJSYmGDRvm1AQFBWngwIHavn27JCk/P1/V1dV+NdHR0YqLi3NqduzYIbfb7YS0JPXv319ut9upqYvP51NFRYXfAgBAS2v1oC4sLNQ3vvENBQUFadq0aVq/fr169uypkpISSVJkZKRffWRkpDNWUlKitm3bqmPHjg3WRERE1HrdiIgIp6YumZmZzmfabrdbMTExV7WfAAA0RasHdffu3VVQUKC8vDw98cQTmjBhgg4ePOiMu1wuv3pjTK11l7q0pq76y21n7ty58nq9zlJUVNTYXQIAoNm0elC3bdtWd9xxh/r166fMzEzdc889+vWvfy2PxyNJtY56y8rKnKNsj8ejqqoqlZeXN1hTWlpa63VPnTpV62j9q4KCgpzZ6BcXAABaWqsH9aWMMfL5fOrWrZs8Ho9yc3OdsaqqKm3dulUDBgyQJPXt21eBgYF+NcXFxdq/f79Tk5CQIK/Xq127djk1O3fulNfrdWoAALBVQGu++LPPPquRI0cqJiZGZ8+eVXZ2trZs2aKcnBy5XC6lp6dr/vz5io2NVWxsrObPn6/27dtr/PjxkiS3261JkyZp1qxZ6tSpk8LCwjR79mz16tVLQ4YMkST16NFDI0aM0OTJk/Xqq69KkqZMmaKkpKRGz/gGAKC1tGpQl5aWKiUlRcXFxXK73erdu7dycnI0dOhQSdKcOXNUWVmpJ598UuXl5YqPj9e7776r0NBQZxsLFy5UQECAxowZo8rKSg0ePFgrV65UmzZtnJq1a9dq+vTpzuzw5ORkLV68uGV3FgCAJrDuOmpbcR319YXrqAF8XVj3GTUAAPgXghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFgto7QaAlnb8eJG69+5X51hUZLi25Oa0cEcAUD+CGjecGiP1Sc2qc2zf4rQW7gYAGsapbwAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGLM+sZ1KXHoCBWXnq53/MTJk+rTgv0AwLVCUOO6VFx6ut5LrCTpk6dHtVwzAHANceobAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwWKsGdWZmpu677z6FhoYqIiJCo0aN0uHDh/1qJk6cKJfL5bf079/fr8bn8yktLU3h4eEKCQlRcnKyjh8/7ldTXl6ulJQUud1uud1upaSk6MyZM9d6FwEAuCqtGtRbt27VU089pby8POXm5ur8+fMaNmyYPv/8c7+6ESNGqLi42Fk2bNjgN56enq7169crOztb27Zt07lz55SUlKSamhqnZvz48SooKFBOTo5ycnJUUFCglJSUFtlPAACaqlWvo87J8b/v74oVKxQREaH8/Hw9+OCDzvqgoCB5PJ46t+H1erV8+XKtXr1aQ4YMkSStWbNGMTEx2rRpk4YPH65Dhw4pJydHeXl5io+PlyQtW7ZMCQkJOnz4sLp3715ruz6fTz6fz3lcUVFx1fsLAMCVsuozaq/XK0kKCwvzW79lyxZFRETozjvv1OTJk1VWVuaM5efnq7q6WsOGDXPWRUdHKy4uTtu3b5ck7dixQ2632wlpSerfv7/cbrdTc6nMzEznNLnb7VZMTEyz7ScAAI1lTVAbYzRz5kzdf//9iouLc9aPHDlSa9eu1ebNm/WrX/1Ku3fv1kMPPeQc7ZaUlKht27bq2LGj3/YiIyNVUlLi1ERERNR6zYiICKfmUnPnzpXX63WWoqKi5tpVAAAazZqvEE1NTdUHH3ygbdu2+a1/9NFHnX/HxcWpX79+6tKli/74xz9q9OjR9W7PGCOXy+U8/uq/66v5qqCgIAUFBV3pbgAA0KysOKJOS0vTO++8o/fff1+dO3dusDYqKkpdunTRkSNHJEkej0dVVVUqLy/3qysrK1NkZKRTU1paWmtbp06dcmoAALBRqwa1MUapqal68803tXnzZnXr1u2yz/nss89UVFSkqKgoSVLfvn0VGBio3Nxcp6a4uFj79+/XgAEDJEkJCQnyer3atWuXU7Nz5055vV6nBgAAG7Xqqe+nnnpK69at09tvv63Q0FDn82K3263g4GCdO3dOGRkZ+v73v6+oqCgdO3ZMzz77rMLDw/W9733PqZ00aZJmzZqlTp06KSwsTLNnz1avXr2cWeA9evTQiBEjNHnyZL366quSpClTpigpKanOGd8AANiiVYN66dKlkqTExES/9StWrNDEiRPVpk0bFRYW6rXXXtOZM2cUFRWlQYMG6fXXX1doaKhTv3DhQgUEBGjMmDGqrKzU4MGDtXLlSrVp08apWbt2raZPn+7MDk9OTtbixYuv/U4CAHAVWjWojTENjgcHB2vjxo2X3U67du2UlZWlrKz6708cFhamNWvWXHGPAAC0JismkwEAgLoR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDG
"text/plain": [
"<Figure size 500x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.displot((y_test - y_pred), bins = 50) # residual plot"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mse = metrics.mean_squared_error(y_test, y_pred) # mean squared error"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1032.6823495276928"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sqrt(mse) # root mean squared error; this is our performance measure"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.26</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>61.9</td>\n",
" <td>55.0</td>\n",
" <td>337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.30</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>64.0</td>\n",
" <td>55.0</td>\n",
" <td>339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0.31</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>62.2</td>\n",
" <td>54.0</td>\n",
" <td>344</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price\n",
"3 0.29 3 3 3 62.4 58.0 334\n",
"4 0.31 1 1 1 63.3 58.0 335\n",
"7 0.26 2 2 2 61.9 55.0 337\n",
"10 0.30 1 1 1 64.0 55.0 339\n",
"13 0.31 4 4 4 62.2 54.0 344"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# next model: multiple regression\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler() # we're going to scale the features"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.26</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>61.9</td>\n",
" <td>55.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.30</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>64.0</td>\n",
" <td>55.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0.31</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>62.2</td>\n",
" <td>54.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table\n",
"3 0.29 3 3 3 62.4 58.0\n",
"4 0.31 1 1 1 63.3 58.0\n",
"7 0.26 2 2 2 61.9 55.0\n",
"10 0.30 1 1 1 64.0 55.0\n",
"13 0.31 4 4 4 62.2 54.0"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features = df.drop(['price'], axis = 1) # all columns except 'price'\n",
"features.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table'], dtype='object')"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns[0:6]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"scaled_features = scaler.fit_transform(features)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numpy.ndarray"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(scaled_features)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# save the X array for later\n",
"np.save('features.npy', scaled_features)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.208360</td>\n",
" <td>0.058916</td>\n",
" <td>0.058916</td>\n",
" <td>0.058916</td>\n",
" <td>0.462594</td>\n",
" <td>0.294839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-1.138663</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>1.104188</td>\n",
" <td>0.294839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-1.312906</td>\n",
" <td>-0.833437</td>\n",
" <td>-0.833437</td>\n",
" <td>-0.833437</td>\n",
" <td>0.106153</td>\n",
" <td>-1.052933</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-1.173512</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>1.603205</td>\n",
" <td>-1.052933</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-1.138663</td>\n",
" <td>0.951268</td>\n",
" <td>0.951268</td>\n",
" <td>0.951268</td>\n",
" <td>0.320018</td>\n",
" <td>-1.502191</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table\n",
"0 -1.208360 0.058916 0.058916 0.058916 0.462594 0.294839\n",
"1 -1.138663 -1.725789 -1.725789 -1.725789 1.104188 0.294839\n",
"2 -1.312906 -0.833437 -0.833437 -0.833437 0.106153 -1.052933\n",
"3 -1.173512 -1.725789 -1.725789 -1.725789 1.603205 -1.052933\n",
"4 -1.138663 0.951268 0.951268 0.951268 0.320018 -1.502191"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = pd.DataFrame(scaled_features, columns = df.columns[0:6])\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3 334\n",
"4 335\n",
"7 337\n",
"10 339\n",
"13 344\n",
"Name: price, dtype: int64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.head() # y stays the same"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# save Y for later\n",
"np.save('price.npy', y)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"model = LinearRegression() # instantiate the model"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\
],
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_train, y_train) # fit the model"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['multiple_regression.joblib']"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# save the model for later\n",
"dump(model, 'multiple_regression.joblib')"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"id": "2ZDwzmMvnXmA",
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1963.25110751 45.01046146 45.01046146 45.01046146 -71.28639623\n",
" -86.94439747]\n"
]
}
],
"source": [
"print(model.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"id": "ufa3Ch6inXcK"
},
"outputs": [],
"source": [
"# predictions\n",
"y_pred = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7f81000b7c40>"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjEAAAGdCAYAAADjWSL8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAABt9UlEQVR4nO3de3xU1bk//s8kJJMLyc7NZBJACBiRGJCLQgJWkGuQm/V7igqmWi0qCoji0aLlAEcLYr9H7O+gFCkFKyI936OoCKYmRUEkEJoYIQQRMdxnCJBkEkJuZPbvjzhD5r733PfM5/165aXMrOxZeybJfvZaz3qWShRFEUREREQKE+bvDhARERG5gkEMERERKRKDGCIiIlIkBjFERESkSAxiiIiISJEYxBAREZEiMYghIiIiRWIQQ0RERIrUzd8d8BaDwYDz588jLi4OKpXK390hIiIiCURRRGNjIzIyMhAW5nisJWiDmPPnz6NXr17+7gYRERG54MyZM+jZs6fDNrKCmD59+uDUqVNWjz/11FN46623IIoili9fjnfeeQd1dXUYMWIE3nrrLdx6662mtq2trXj++efxwQcfoLm5GePGjcPbb79t1tG6ujosWLAAn376KQBg+vTp+O///m8kJCRI7mtcXByAzjchPj5ezmkSERGRnzQ0NKBXr16m67gjKjl7J128eBEdHR2mf1dWVmLChAn48ssvMWbMGKxatQp/+MMfsGnTJtx888149dVXsWfPHhw7dszUmblz52L79u3YtGkTkpOTsWjRItTW1qKsrAzh4eEAgMmTJ+Ps2bN45513AACPP/44+vTpg+3bt8t6EwRBgF6vZxBDRESkELKu36IbnnnmGbFfv36iwWAQDQaDqNFoxNdee830fEtLiygIgvjnP/9ZFEVRrK+vFyMiIsStW7ea2pw7d04MCwsTCwsLRVEUxaqqKhGAuH//flObkpISEYD4/fffS+6bXq8XAYh6vd6dUyQiIiIfknP9dnl1UltbGzZv3oxHH30UKpUK1dXV0Ol0mDhxoqmNWq3G6NGjsW/fPgBAWVkZ2tvbzdpkZGQgJyfH1KakpASCIGDEiBGmNrm5uRAEwdTGltbWVjQ0NJh9ERERUfByOYj5+OOPUV9fj0ceeQQAoNPpAABpaWlm7dLS0kzP6XQ6REZGIjEx0WGb1NRUq9dLTU01tbFl5cqVEATB9MWkXiIiouDmchCzYcMGTJ48GRkZGWaPWy5nFkXR6RJnyza22js7zuLFi6HX601fZ86ckXIaREREpFAuBTGnTp1CcXExfvvb35oe02g0AGA1WlJTU2MandFoNGhra0NdXZ3DNhcuXLB6zYsXL1qN8nSlVqsRHx9v9kVERETBy6UgZuPGjUhNTcWUKVNMj2VmZkKj0aCoqMj0WFtbG3bv3o2RI0cCAIYNG4aIiAizNlqtFpWVlaY2eXl50Ov1KC0tNbU5cOAA9Hq9qQ0RERGR7GJ3BoMBGzduxMMPP4xu3a5/u0qlwsKFC7FixQpkZWUhKysLK1asQExMDGbNmgUAEAQBjz32GBYtWoTk5GQkJSXh+eefx8CBAzF+/HgAwIABA5Cfn485c+Zg3bp1ADqXWE+dOhX9+/f3xDkTERFREJAdxBQXF+P06dN49NFHrZ574YUX0NzcjKeeespU7O6LL74wK1izevVqdOvWDTNnzjQVu9u0aZOpRgwAvP/++1iwYIFpFdP06dOxZs0aV86PiIiIgpSsYndKwmJ3RIGpwyCitLoWNY0tSI2LwvDMJISHcX8zIuok5/odtHsnEVHgKazUYvn2Kmj1LabH0oUoLJ2WjfycdD/2jIiUyOUl1kREchRWajF3c7lZAAMAOn0L5m4uR2Gl1k89IyKl4kgMkQcZp0p0DS2ovdKKpNhIaITokJ8y6TCIWL69CrbmrkUAKgDLt1dhQrbGpffJl1NUnA4jChwMYohcZHkxq2tqxSs7jlqNNACcMimtrrX5vhiJALT6FpRW1yKvX7KsY/tyiorTYUSBhUEMkQRyAhZbtPoWPLm5HH9+aKisi52/7/o99fo1jdLeJ6ntjIxTVJYjPLqf3+9nx2ehT0qsR947R681d3M51sr8bInIfQxiiOD4Yr3zkBa//6QStU1tbr/O7z46jDh1BC41tTq9sPr7rt+Tr58aF+XRdkDnZ7bs0yN2p6gAYHXxcdNjiTER+MO9ObhnUIaN77A+dtefh2G9E706HUbW/B3AkzJwiTUFJTl/AB1drL89XYd1e6q91k97QYG9u36jyTkaPJTbG7l9k+2el6sXgQ6DiDW7fsTq4h+snjN+t5RRh66vnxKrxqL/9x0uNLTYPCcVAI0Qhd3/fjfKTtVJ6vOfio/b7KMzT9yVicX3ZNt93tbPQ1JspKQg9oM5ubKnw8iavwN48i85128GMeRTvri7kvMH0F6woALsBhCeZCso6DCIuHPVLklTVQkxEXjtvoE2z2vZp1XQNVw/hiY+Csum278IdAYvx/HXb6qhb77msM8aIQp7XxwrKzCMiQzH1bYOq/fWeITH78rEp99pJX9uT24ut9tHZ96eNRT3DLJ+H5wFj8786YHBmDG4h8v9Ise/k4C0AFoqjvYEJgYxYBATiHxxd7XzkBZPbbG+uLkbLHhbUmwElky9FZr4KBhEEbP/ckDW93fNtXF2gbeVl1NYqcXvPjqM+qvtkl9zyZQBeGRUptUffbmBQJgKGDcgFcVVNZIuXB0GEaNe+yd0Da2S+2opKTYSB18eb9Z3T/w8BOpIjD8v1nJe29lnYGvELqW7GhAhaYq2K472BC4GMWAQE2h8cXe189B5zPvgWxjs/ERbjiCUnLiMB9fvd+s1vSGymwpt1+T9Wqb/fF4AMOzVIofBSGJMBP71+wmmP/TujD5Y/tH3RmDorc/NMuBw97jJsZEotQiMvE1KgODPi7Xc15b6GTia3pNybr4c7SH55Fy/WeyOvM5ZjRCgMymyw170IUFhpRZPbbEfwBhfy7iMF4DZVEsgkRvAANfPa/9Pl52OptRdbcfSTyrRds2AtmsGvLSt0uXpE8tCdc6WUrvC8nOTu4LJHsvjuHvcGYMzfBrAFFZqceeqXXhw/X48s7UCD67fjztX7TIrGujPAoOuvLbUz8BRfpKzc/PF3yPyHQYx5HVSa4Rs+qYan1ScQ8mJy1Z/QDoMIkpOXDZ73vjYtm/P4aVthyX3p6axBYWVWrzy2RFXTykg1TS2oOTEZUltNx84jf5LPsfAZf9wa9WV5R99TwUYthiPLWcFkyOWx3H3uONuSXPr+41s/axbkhIg+PNi7epre+KzdXT8DoOITd9US/57xEAm8HGJNXmd1AvbKzuOmv6/65CwrSHp7upuMIgirrZ1yO7PyUtNeLP4uE8Sd30ppbsaxy80Sm4vikDrNYPbr9t1pMRTAYYtxmMPz0yCJl7tVk5MutA59dLV8MwkpAtR0Oltr6ByxuCBmXkp0y9Sqx/HqSM8WmBQTm6Lq8UN3f0MHB3f1nvryCs7juIve6uZIxPgOBJDXufKhU378x3lyp1VNu84r7RecymASRei8EHp6aALYABg0f9U+PX1dQ0tMIgiEqIjPH5sTbwaBlHEJxXnUFpdi/+Yan+JtBRLp2VbXYDDw1RYOq3zuK5MCh34ebrLVVKnX6QGCCU/XZL0ulJuMqRMXck9pq127n4G9o5v7711hvt6BT4GMeR1xrsruX+URADrv672aMAxql+yW3fwgUzX0Io1X57w2+v/xyeVmP2XA6hvlr7CSaqWawbM/ssB0wX0lR1H8cRdmUiIkRcwJcZEOKyanJ+TjrUPDYVGcGVEyfWfVDnTL9Kn7KT9xjm7yXAlt8Wd4ob2PoOkWPnBcWpclMP31hnmyAQ+TieR1xnvruZuLpddf8WTfzdiIsPwv+XnPHdAMtPYYr+2jLssk5V1+ha8s6cab80aCiE6AvtOXML5+mZkJEYjt08yoALeP3AKXx6rQWuXRGl1N+f3bROyNYhTdx5zU8lJNLVKG/FzJ6lXzvSL1AAhr18yPiw/a3dqxrjqy3JarStXN+50Ni3k7LXzc9I
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(y_test, y_pred) # how close are our predictions?"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7f810027ba60>"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAesAAAHqCAYAAAAkr2YEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA0M0lEQVR4nO3de3RU5b3/8c+UXAhpGAkxmUQQYkUEA0rBQqhKkHuNlNJVBGoKqwh4SSAFiqL2R+riEKst0BMKtSwOIBfjOlWq55QGgwgtBwISSA0XKa2gAXIBGybgiZOQPL8/POw65EIIIfMg79daey1mP9+957tjnE/2ZfZ2GWOMAACAtb4W6AYAAEDjCGsAACxHWAMAYDnCGgAAyxHWAABYjrAGAMByhDUAAJYjrAEAsBxh3UTGGFVUVIh7yAAAWhth3UTnzp2T2+3WuXPnAt0KAOAGQ1gDAGA5whoAAMsR1gAAWI6wBgDAcoQ1AACWI6wBALAcYQ0AgOUIawAALEdYAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAliOsAQCwHGENAIDlCGsAACxHWAMAYDnCGgAAyxHWAABYLijQDQDNkTRspIpLzzQ4HhsTpW25Oa3YEQBcO4Q1rkvFpWfUJzWrwfH9S9NasRsAuLY4DA4AgOUIawAALEdYAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAliOsAQCwHGENAIDlAhrWy5cvV+/evdW+fXu1b99eiYmJ+tOf/uSMG2OUkZGhuLg4hYWFKSkpSQcPHvRbh8/nU1pamqKiohQeHq7Ro0frxIkTfjXl5eVKSUmR2+2W2+1WSkqKzp492xqbCADAVQtoWHfq1Ekvvvii9u7dq7179+rBBx/Ud7/7XSeQX3rpJS1atEhLly7V+++/L4/Ho2HDhuncuXPOOtLT07Vx40ZlZ2drx44dOn/+vJKTk1VTU+PUTJw4UQUFBcrJyVFOTo4KCgqUkpLS6tsLAEBzuIwxJtBNfFlkZKRefvll/fjHP1ZcXJzS09P19NNPS/piLzomJka/+MUvNH36dHm9Xt18881au3atHnnkEUnSqVOn1LlzZ23atEkjRozQ4cOH1bNnT+Xl5al///6SpLy8PCUmJurDDz9U9+7dm9RXRUWF3G63vF6v2rdvf202Hk3WvXe/y94b/MgHe1uxIwC4dqw5Z11TU6Ps7Gx99tlnSkxM1LFjx1RSUqLhw4c7NaGhoRo0aJB27twpScrPz1d1dbVfTVxcnBISEpyaXbt2ye12O0EtSQMGDJDb7XZqAACwWcCfulVYWKjExER9/vnn+vrXv66NGzeqZ8+eTpDGxMT41cfExOjjjz+WJJWUlCgkJEQdOnSoU1NSUuLUREdH13nf6Ohop6Y+Pp9PPp/PeV1RUdG8DQQA4CoFfM+6e/fuKigoUF5enp544glNmjRJhw4dcsZdLpdfvTGmzrxLXVpTX/3l1pOZmelckOZ2u9W5c+embhIAAC0q4GEdEhKi22+/Xf369VNmZqbuvvtu/frXv5bH45GkOnu/ZWVlzt62x+NRVVWVysvLG60pLS2t876nT5+us9f+ZfPmzZPX63WmoqKiq9pOAACaK+BhfSljjHw+n+Lj4+XxeJSbm+uMVVVVafv27Ro4cKAkqW/fvgoODvarKS4u1oEDB5yaxMREeb1e7dmzx6nZvXu3vF6vU1Of0NBQ5ytlFycAAAIhoOesn332WY0aNUqdO3fWuXPnlJ2drW3btiknJ0cul0vp6elauHChunXrpm7dumnhwoVq166dJk6cKElyu92aMmWKZs+erY4dOyoyMlJz5sxRr169NHToUElSjx49NHLkSE2dOlWvvPKKJGnatGlKTk5u8pXgAAAEUkDDurS0VCkpKSouLpbb7Vbv3r2Vk5OjYcOGSZLmzp2ryspKPfnkkyovL1f//v31zjvvKCIiwlnH4sWLFRQUpHHjxqmyslJDhgzR6tWr1aZNG6dm/fr1mjFjhnPV+OjRo7V06dLW3VgAAJrJuu9Z24rvWduF71kDuJFYd84aAAD4I6wBALAcYQ0AgOUIawAALEdYAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAliOsAQCwHGENAIDlCGsAACxHWAMAYDnCGgAAyxHWAABYjrAGAMByhDUAAJYjrAEAsBxhDQCA5QhrAAAsR1gDAGA5whoAAMsR1gAAWI6wBgDAcoQ1AACWI6wBALAcYQ0AgOUIawAALEdYAwBgOcIaAADLBQW6AeBaOHGiSN1796t3LDYmSttyc1q5IwBoPsIaX0k1RuqTmlXv2P6laa3cDQBcHQ6DAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAliOsAQCwHGENAIDlCGsAACxHWAMAYDnCGgAAyxHWAABYjrAGAMByhDUAAJYjrAEAsBxhDQCA5QhrAAAsR1gDAGA5whoAAMsR1gAAWI6wBgDAcoQ1AACWI6wBALBcQMM6MzNT9957ryIiIhQdHa0xY8boyJEjfjWTJ0+Wy+XymwYMGOBX4/P5lJaWpqioKIWHh2v06NE6ceKEX015eblSUlLkdrvldruVkpKis2fPXutNBADgqgU0rLdv366nnnpKeXl5ys3N1YULFzR8+HB99tlnfnUjR45UcXGxM23atMlvPD09XRs3blR2drZ27Nih8+fPKzk5WTU1NU7NxIkTVVBQoJycHOXk5KigoEApKSmtsp0AAFyNoEC+eU5Ojt/rVatWKTo6Wvn5+XrggQec+aGhofJ4PPWuw+v1auXKlVq7dq2GDh0qSVq3bp06d+6sLVu2aMSIETp8+LBycnKUl5en/v37S5JWrFihxMREHTlyRN27d79GWwgAwNWz6py11+uVJEVGRvrN37Ztm6Kjo3XHHXdo6tSpKisrc8by8/NVXV2t4cOHO/Pi4uKUkJCgnTt3SpJ27dolt9vtBLUkDRgwQG6326kBAMBWAd2z/jJjjGbNmqX77rtPCQkJzvxRo0bpBz/4gbp06aJjx47pZz/7mR588EHl5+crNDRUJSUlCgkJUYcOHfzWFxMTo5KSEklSSUmJoqOj67xndHS0U3Mpn88nn8/nvK6oqGiJzQQA4IpZE9apqan64IMPtGPHDr/5jzzyiPPvhIQE9evXT126dNEf//hHjR07tsH1GWPkcrmc11/+d0M1X5aZmamf//znV7oZAAC0OCsOg6elpentt9/We++9p06dOjVaGxsbqy5duujo0aOSJI/Ho6qqKpWXl/vVlZWVKSYmxqkpLS2ts67Tp087NZeaN2+evF6vMxUVFTVn0wAAuGoBDWtjjFJTU/Xmm29q69atio+Pv+wyn376qYqKihQbGytJ6tu3r4KDg5Wbm+vUFBcX68CBAxo4cKAkKTExUV6vV3v27HFqdu/eLa/X69RcKjQ0VO3bt/ebAAAIhIAeBn/qqae0YcMGvfXWW4qIiHDOH7vdboWFhen8+fPKyMjQ97//fcXGxur48eN69tlnFRUVpe9973tO7ZQpUzR79mx17NhRkZGRmjNnjnr16uVcHd6jRw+NHDlSU6dO1SuvvCJJmjZtmpKTk7kSHABgvYCG9fLlyyVJSUlJfvNXrVqlyZMnq02bNiosLNSrr76qs2fPKjY2VoMHD9brr7+uiIgIp37x4sUKCgrSuHHjVFlZqSFDhmj16tVq06aNU7N+/XrNmDHDuWp89OjRWrp06bXfSAAArlJAw9oY0+h4WFiYNm/efNn1tG3bVllZWcrKymqwJjIyUuvWrbviHgEACDQrLjADAAANI6wBALAcYQ0AgOUIawAALEdYAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAlrPmEZnApZKGjVRx6Zl6x06eOqU+rdwPAAQKYQ1rFZeeUZ/U+m8h+/HTY1q3GQAIIA6DAwBgOcIaAADLEdYAAFiOsAYAwHKENQAAliOsAQCwHGENAIDlCGsAACxHWAMAYDnCGgAAyxHWAABYjrAGAMByhDUAAJYjrAEAsBxhDQCA5QhrAAAsR1gDAGA5whoAAMsR1gAAWI6wBgDAcoQ1AACWI6wBALAcYQ0
"text/plain": [
"<Figure size 500x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.displot((y_test - y_pred), bins = 50) # residual plot"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mse = metrics.mean_squared_error(y_test, y_pred) # mean squared error"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1013.4172836449808"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sqrt(mse) # root mean squared error"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.208360</td>\n",
" <td>0.058916</td>\n",
" <td>0.058916</td>\n",
" <td>0.058916</td>\n",
" <td>0.462594</td>\n",
" <td>0.294839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-1.138663</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>1.104188</td>\n",
" <td>0.294839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-1.312906</td>\n",
" <td>-0.833437</td>\n",
" <td>-0.833437</td>\n",
" <td>-0.833437</td>\n",
" <td>0.106153</td>\n",
" <td>-1.052933</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-1.173512</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>-1.725789</td>\n",
" <td>1.603205</td>\n",
" <td>-1.052933</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-1.138663</td>\n",
" <td>0.951268</td>\n",
" <td>0.951268</td>\n",
" <td>0.951268</td>\n",
" <td>0.320018</td>\n",
" <td>-1.502191</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table\n",
"0 -1.208360 0.058916 0.058916 0.058916 0.462594 0.294839\n",
"1 -1.138663 -1.725789 -1.725789 -1.725789 1.104188 0.294839\n",
"2 -1.312906 -0.833437 -0.833437 -0.833437 0.106153 -1.052933\n",
"3 -1.173512 -1.725789 -1.725789 -1.725789 1.603205 -1.052933\n",
"4 -1.138663 0.951268 0.951268 0.951268 0.320018 -1.502191"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# next model: decision tree regression\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"model = DecisionTreeRegressor() # instantiate the model"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-3 {color: black;background-color: white;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\
],
"text/plain": [
"DecisionTreeRegressor()"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_train, y_train) # fit the model"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['decision_tree.joblib']"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# save the model for later\n",
"dump(model, 'decision_tree.joblib')"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"id": "ufa3Ch6inXcK",
"tags": []
},
"outputs": [],
"source": [
"# predictions\n",
"y_pred = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.collections.PathCollection at 0x7f80fb767c70>"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAGdCAYAAAAbudkLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2T0lEQVR4nO3deXxU9b0//tckJJPFMCSEZCbKklJAYpAlKgRUFtk3qVplMRdaG1dEFG6R+uUC1wWtbW1/pVqkbldAenvdoGBqKCgiCdhA1BCggGGfIZBlAoFMQub8/ohnnH3OmTmznJnX8/GgNTNnznzObOd9Pp/35/3RCIIggIiIiCgKxYW7AURERETBwkCHiIiIohYDHSIiIopaDHSIiIgoajHQISIioqjFQIeIiIiiFgMdIiIiiloMdIiIiChqdQp3A8LJarXi7NmzSEtLg0ajCXdziIiISAJBEHDx4kXk5OQgLs57n01MBzpnz55F9+7dw90MIiIi8sOpU6dw3XXXed0mpgOdtLQ0AB0vVOfOncPcGiIiIpKiqakJ3bt3t53HvYnpQEccrurcuTMDHSIiIpWRknbCZGQiIiKKWgx0iIiIKGox0CEiIqKoxUCHiIiIohYDHSIiIopaDHSIiIgoajHQISIioqjFQIeIiIiiVkwXDCRy1m4VsLemHrUXW5CVloRbcjMQH8d10IiI1IqBDtH3SqqMWLm5GkZzi+02gy4Jy6flYWK+IYwtIyIif3HoiggdQc4j6/Y5BDkAYDK34JF1+1BSZQxTy4iIKBAMdCjmtVsFrNxcDcHNfeJtKzdXo93qbgsiIopkDHQo5u2tqXfpybEnADCaW7C3pj50jSIiIkUw0KGYV3vRc5Djz3ZERBQ5GOhQzMtKS1J0OyIiihycdUVRIZBp4bfkZsCgS4LJ3OI2TwcAuqYmoqBnunINDhJ3rwMATpknopjFQIdUL9Bp4fFxGiyflodH1u2DBnAb7NQ1t2Lkyzsieqq5u9ehS0oCAKDxcpvtNk6ZJ6JYwqErUjWlpoVPzDfgtfuHQK/zPDwVyVPNPb0OjZfbHIIcILKPg4hIaQx0SLWUnhY+Md+Az/9zNDJSE9zeH6lTzb29Du5E6nEQEQUDAx1SrWBMC6840YD65jaP90fiVHNfr4M7kXgcRETBwECHVCsY08LVONU8kLZE0nEQEQUDAx1SrWBMC1fjVPNA2hJJx0FEFAwMdEi1xGnhniZKa9Axw0icYh2ufQabrza7E4nHQUQUDLIDnZ07d2LatGnIycmBRqPBRx995HC/RqNx++/ll1+2bTNq1CiX+2fOnOmwn4aGBhQVFUGn00Gn06GoqAiNjY0O25w8eRLTpk1DamoqMjMzsWDBArS2tso9JFIpcVo4AJeTvPj38ml5smrGBGOfweatze5E6nEQEQWD7ECnubkZAwcOxOrVq93ebzQaHf69+eab0Gg0uPvuux22Ky4udthuzZo1DvfPnj0blZWVKCkpQUlJCSorK1FUVGS7v729HVOmTEFzczN27dqFjRs34v3338eiRYvkHhKpmKdp4XpdEl67f4hftWKCsc9g89Tm9JQEWy0dUSQfBxGR0jSCIPg9v1Sj0eDDDz/EjBkzPG4zY8YMXLx4Ef/85z9tt40aNQqDBg3C73//e7ePOXjwIPLy8lBeXo6hQ4cCAMrLy1FYWIhDhw6hX79++OSTTzB16lScOnUKOTk5AICNGzdi3rx5qK2tRefOnX22v6mpCTqdDmazWdL2FLkCqYwcyn0GGysjE1EskHP+Dmpl5HPnzmHLli145513XO5bv3491q1bh+zsbEyaNAnLly9HWloaAKCsrAw6nc4W5ADAsGHDoNPpsHv3bvTr1w9lZWXIz8+3BTkAMGHCBFgsFlRUVGD06NEuz2mxWGCxWGx/NzU1KXm4FEbxcRoU9u4a8fsMNk9tVttxEBEpJaiBzjvvvIO0tDTcddddDrfPmTMHubm50Ov1qKqqwtKlS/H111+jtLQUAGAymZCVleWyv6ysLJhMJts22dnZDvenp6cjMTHRto2zVatWYeXKlUocGhEREalAUAOdN998E3PmzEFSkmPeQHFxse2/8/Pz0adPH9x0003Yt28fhgwZAqBjWMyZIAgOt0vZxt7SpUvx1FNP2f5uampC9+7d5R0UERERqUbQppd/8cUXOHz4MH7xi1/43HbIkCFISEjAkSNHAAB6vR7nzp1z2e78+fO2Xhy9Xu/Sc9PQ0IC2tjaXnh6RVqtF586dHf4RERFR9ApaoPPGG2+goKAAAwcO9LntgQMH0NbWBoOhYxZIYWEhzGYz9u7da9tmz549MJvNGD58uG2bqqoqGI0/LEz46aefQqvVoqCgQOGjISIiIjWSPXR16dIlHD161PZ3TU0NKisrkZGRgR49egDoGBL629/+ht/+9rcujz927BjWr1+PyZMnIzMzE9XV1Vi0aBEGDx6MESNGAAD69++PiRMnori42Dbt/MEHH8TUqVPRr18/AMD48eORl5eHoqIivPzyy6ivr8fixYtRXFzMnhoiIiLqIMi0Y8cOAR1rAjr8mzt3rm2bNWvWCMnJyUJjY6PL40+ePCncfvvtQkZGhpCYmCj07t1bWLBggVBXV+ewXV1dnTBnzhwhLS1NSEtLE+bMmSM0NDQ4bHPixAlhypQpQnJyspCRkSHMnz9faGlpkXwsZrNZACCYzWZZrwERERGFj5zzd0B1dNSOdXSIiIjUR875m2tdERERUdRioENERERRi4EOERERRS0GOkRERBS1GOgQERFR1GKgQ0RERFGLgQ4RERFFLQY6REREFLUY6BAREVHUkr3WFRFFtnargL019ai92IKstCTckpuB+DhNuJtFRBQWDHSIokhJlRErN1fDaG6x3WbQJWH5tDxMzDeEsWVEROHBoSuiKFFSZcQj6/Y5BDkAYDK34JF1+1BSZQxTy4iIwoeBDlEUaLcKWLm5Gu5W6BVvW7m5Gu3WmF3Dl4hiFAMdoiiwt6bepSfHngDAaG7B3pr60DWKiCgCMNAhigK1Fz0HOf5sR0QULZiMTBRGSs2QykpLUnQ7IqJowUCHKEyUnCF1S24GDLokmMwtbvN0NAD0uo5AiogolnDoiigMlJ4hFR+nwfJpeQA6ghp74t/Lp+Wxng4RxRwGOkQhFqwZUhPzDXjt/iHQ6xyHp/S6JLx2/xDW0SGimMShK6IQkzNDqrB3V1n7nphvwLg8PSsjExF9j4EOUYgFe4ZUfJxGdoBERBStOHRFFGKcIUVEFDoMdIhCTJwh5WkwSYOO2VecIUVEFDgGOkQhxhlSREShw0CHolq7VUDZsTp8XHkGZcfqbDOZPN0eKpwhRUQUGkxGpqjlqSDf9IEGbPraqEihvkBwhhQRUfBpBEGI2eWMm5qaoNPpYDab0blz53A3hxQkFuST+uEWQwv2phARRT45528OXVHU8VaQz5NACvUREVHk4tAVRR1fBfk8CaRQXyRQaoHQcFBz2yn28POqLgx0KOr4W2hPqceHg5ILhIaamttOsYefV/Xh0BVFnUAL7amtUJ/SC4SGkprbTrGHn1d1YqBDUcdXQT5P1FioL1gLhIaCmttOsYefV/VioENRx1tBPk/UWqhPzgKhkUbNbafYw8+rejHQoajkqSCfQZeEh27PhSFKCvUFe4HQYFJz2yn28POqXkxGpqjlrSDfLyf2j4pZE2peIFTNbafYw8+rejHQoagWH6dxO1Xc0+1qI+YjmcwtbnMHNOjorYrEvCM1t51iDz+v6iV76Grnzp2YNm0acnJyoNFo8NFHHzncP2/ePGg0God/w4YNc9jGYrHg8ccfR2ZmJlJTUzF9+nScPn3aYZuGhgYUFRVBp9NBp9OhqKgIjY2NDtucPHkS06ZNQ2pqKjIzM7FgwQK0trbKPSQi1VLzAqFqbjvFHn5e1Ut2oNPc3IyBAwdi9erVHreZOHEijEaj7d/WrVsd7l+4cCE+/PBDbNy4Ebt27cKlS5cwdepUtLe327aZPXs2KisrUVJSgpKSElRWVqKoqMh2f3t7O6ZMmYLm5mbs2rULGzduxPvvv49
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.scatter(y_test, y_pred) # how close are our predictions?"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7f8100279210>"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAHpCAYAAABN+X+UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA2LklEQVR4nO3de3RV9Z3//9cpuQAx2UJizuFohNimEQwIBRuCY4nlPsaMy64ixmboFAEHE4zAoFQ7pI4NiFOgY6oiiwHKpenqKI7fX2kkeKHFJFwiGblJdUAFzAVsOAlOTDB8fn+42PWQC5BA8gGej7X2Wp79eZ99Pp+PCa/s29keY4wRAACw0je6ugMAAKB1BDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1OfJGKPa2lpx2zkAoDMR1Oeprq5OjuOorq6uq7sCALiKENQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWC+nqDgC4sqSOGa+KquOttvfxxujtosJO7BFweSOoAVxUFVXHNSTruVbbd+Vnd2JvgMsfh74BALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFujSo+/XrJ4/H02x5+OGHJUnGGOXm5srv96tHjx5KTU3V3r17g7bR0NCg7OxsxcTEKCIiQunp6Tpy5EhQTU1NjTIzM+U4jhzHUWZmpk6cONFZwwQAoN26NKh37NihiooKdykqKpIk/fCHP5QkLVq0SIsXL1Z+fr527Nghn8+nMWPGqK6uzt1GTk6ONmzYoIKCAm3dulUnT55UWlqampqa3JqMjAyVl5ersLBQhYWFKi8vV2ZmZucOFgCAdujSh3Jcd911Qa8XLlyob37zmxo5cqSMMVq6dKmeeOIJ3XvvvZKk1atXy+v1av369Zo+fboCgYBWrFihNWvWaPTo0ZKktWvXKi4uTps3b9a4ceO0f/9+FRYWqrS0VMnJyZKk5cuXKyUlRQcOHFBiYmLnDhoAgAtgzTnqxsZGrV27Vj/5yU/k8Xh06NAhVVZWauzYsW5NeHi4Ro4cqeLiYklSWVmZTp06FVTj9/uVlJTk1pSUlMhxHDekJWn48OFyHMetaUlDQ4Nqa2uDFgAAOps1Qf3qq6/qxIkT+vGPfyxJqqyslCR5vd6gOq/X67ZVVlYqLCxMvXr1arMmNja22efFxsa6NS1ZsGCBe07bcRzFxcW1e2wAALSXNUG9YsUKTZgwQX6/P2i9x+MJem2MabbubGfXtFR/ru3MmzdPgUDAXQ4fPnw+wwAA4KKyIqg//vhjbd68WQ8++KC7zufzSVKzvd7q6mp3L9vn86mxsVE1NTVt1lRVVTX7zGPHjjXbW/+68PBwRUVFBS0AAHQ2K4J65cqVio2N1V133eWui4+Pl8/nc68El746j71lyxaNGDFCkjR06FCFhoYG1VRUVGjPnj1uTUpKigKBgLZv3+7WbNu2TYFAwK0BAMBWXXrVtySdPn1aK1eu1OTJkxUS8rfueDwe5eTkKC8vTwkJCUpISFBeXp569uypjIwMSZLjOJoyZYpmz56t6Oho9e7dW3PmzNHAgQPdq8D79++v8ePHa+rUqVq2bJkkadq0aUpLS+OKbwCA9bo8qDdv3qxPPvlEP/nJT5q1zZ07V/X19ZoxY4ZqamqUnJysTZs2KTIy0q1ZsmSJQkJCNHHiRNXX12vUqFFatWqVunXr5tasW7dOM2fOdK8OT09PV35+/qUfHAAAHeQxxpiu7sTloLa2Vo7jKBAIcL4aaEPioGEakvVcq+278rN14L2dndgj4PJmxTlqAADQMoIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAW6/KgPnr0qH70ox8pOjpaPXv21ODBg1VWVua2G2OUm5srv9+vHj16KDU1VXv37g3aRkNDg7KzsxUTE6OIiAilp6fryJEjQTU1NTXKzMyU4zhyHEeZmZk6ceJEZwwRAIB269Kgrqmp0e23367Q0FD98Y9/1L59+/TLX/5S1157rVuzaNEiLV68WPn5+dqxY4d8Pp/GjBmjuro6tyYnJ0cbNmxQQUGBtm7dqpMnTyotLU1NTU1uTUZGhsrLy1VYWKjCwkKVl5crMzOzM4cLAMAF8xhjTFd9+OOPP6533nlHf/7zn1tsN8bI7/crJydHjz32mKSv9p69Xq+eeeYZTZ8+XYFAQNddd53WrFmj++67T5L06aefKi4uThs3btS4ceO0f/9+DRgwQKWlpUpOTpYklZaWKiUlRe+//74SExPP2dfa2lo5jqNAIKCoqKiLNAPAlSdx0DANyXqu1fZd+dk68N7OTuwRcHnr0j3q1157TcOGDdMPf/hDxcbGasiQIVq+fLnbfujQIVVWVmrs2LHuuvDwcI0cOVLFxcWSpLKyMp06dSqoxu/3Kykpya0pKSmR4zhuSEvS8OHD5TiOW3O2hoYG1dbWBi0AAHS2Lg3qgwcP6oUXXlBCQoJef/11PfTQQ5o5c6Z+85vfSJIqKyslSV6vN+h9Xq/XbausrFRYWJh69erVZk1sbGyzz4+NjXVrzrZgwQL3fLbjOIqLi+vYYAEAaIcuDerTp0/rO9/5jvLy8jRkyBBNnz5dU6dO1QsvvBBU5/F4gl4bY5qtO9vZNS3Vt7WdefPmKRAIuMvhw4fPd1gAAFw0XRrUffr00YABA4LW9e/fX5988okkyefzSVKzvd7q6mp3L9vn86mxsVE1NTVt1lRVVTX7/GPHjjXbWz8jPDxcUVFRQQsAAJ2tS4P69ttv14EDB4LW/eUvf1Hfvn0lSfHx8fL5fCoqKnLbGxsbtWXLFo0YMUKSNHToUIWGhgbVVFRUaM+ePW5NSkqKAoGAtm/f7tZs27ZNgUDArQEAwEYhXfnhjz76qEaMGKG8vDxNnDhR27dv10svvaSXXnpJ0leHq3NycpSXl6eEhAQlJCQoLy9PPXv2VEZGhiTJcRxNmTJFs2fPVnR0tHr37q05c+Zo4MCBGj16tKSv9tLHjx+vqVOnatmyZZKkadOmKS0t7byu+AYAoKt0aVDfdttt2rBhg+bNm6ennnpK8fHxWrp0qR544AG3Zu7cuaqvr9eMGTNUU1Oj5ORkbdq0SZGRkW7NkiVLFBISookTJ6q+vl6jRo3SqlWr1K1bN7dm3bp1mjlzpnt1eHp6uvLz8ztvsAAAtEOX3kd9OeE+auD8cB81cHF1+VeIAgCA1hHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwWJcGdW5urjweT9Di8/ncdmOMcnNz5ff71aNHD6Wmpmrv3r1B22hoaFB2drZiYmIUERGh9PR0HTlyJKimpqZGmZmZchxHjuMoMzNTJ06c6IwhAgDQIV2+R33LLbeooqLCXXbv3u22LVq
"text/plain": [
"<Figure size 500x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.displot((y_test - y_pred), bins = 50) # residual plot"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mse = metrics.mean_squared_error(y_test, y_pred) # mean squared error"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"1188.278372165346"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sqrt(mse) # root mean squared error"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Of the three models--simple regression, multiple regression, and decision tree regression--our second model\n",
"# returned the lowest RMSE value.\n",
"\n",
"# In the next notebook we'll continue with predictions."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"ZSBjGiLtkL8Z",
"SYOM7cJ_kiY1",
"OZNxHMuSkiON",
"umeaYS-SkiGU",
"OTaEVKXAkh-B",
"HvFrMzIGmMOl",
"nNu-6-3emMH5",
"4k8nJyFemMBY",
"E9xy8hetm9wG",
"CCJo3OYWm9oQ",
"Dmo0wF-vnXzD",
"RYSfWcCcnXsb"
],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}