2636 lines
1.0 MiB
Plaintext
2636 lines
1.0 MiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "ZSBjGiLtkL8Z",
|
||
|
"tags": []
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 1: Define the Problem"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {
|
||
|
"id": "SRffAEZMkikd"
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# predict the price of a diamond"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "SYOM7cJ_kiY1"
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 2a: Install the Needed Libraries"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {
|
||
|
"id": "ft8kSvTUkiUd"
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# pass"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "OZNxHMuSkiON"
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 2b: Import the Needed Libraries"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {
|
||
|
"id": "HPsfw4s7kiKk"
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import seaborn as sns\n",
|
||
|
"from sklearn.linear_model import LinearRegression\n",
|
||
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from sklearn.preprocessing import StandardScaler\n",
|
||
|
"from sklearn import metrics\n",
|
||
|
"from joblib import dump, load # sklearn's replacement for pickle"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "umeaYS-SkiGU"
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 3: Load the Data"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {
|
||
|
"id": "loYU2Pg3kiCS"
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = pd.read_csv('./diamonds.csv') # the data is in the current directory"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "OTaEVKXAkh-B"
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 4: Perform Exploratory Data Analysis (EDA)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "2FyPB8fMkh6U"
|
||
|
},
|
||
|
"source": [
|
||
|
"## Show the Data"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {
|
||
|
"id": "x9efqRclkh14"
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" <th>depth</th>\n",
|
||
|
" <th>table</th>\n",
|
||
|
" <th>price</th>\n",
|
||
|
" <th>x</th>\n",
|
||
|
" <th>y</th>\n",
|
||
|
" <th>z</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>Ideal</td>\n",
|
||
|
" <td>E</td>\n",
|
||
|
" <td>SI2</td>\n",
|
||
|
" <td>61.5</td>\n",
|
||
|
" <td>55.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" <td>3.95</td>\n",
|
||
|
" <td>3.98</td>\n",
|
||
|
" <td>2.43</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>0.21</td>\n",
|
||
|
" <td>Premium</td>\n",
|
||
|
" <td>E</td>\n",
|
||
|
" <td>SI1</td>\n",
|
||
|
" <td>59.8</td>\n",
|
||
|
" <td>61.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" <td>3.89</td>\n",
|
||
|
" <td>3.84</td>\n",
|
||
|
" <td>2.31</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>Good</td>\n",
|
||
|
" <td>E</td>\n",
|
||
|
" <td>VS1</td>\n",
|
||
|
" <td>56.9</td>\n",
|
||
|
" <td>65.0</td>\n",
|
||
|
" <td>327</td>\n",
|
||
|
" <td>4.05</td>\n",
|
||
|
" <td>4.07</td>\n",
|
||
|
" <td>2.31</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>0.29</td>\n",
|
||
|
" <td>Premium</td>\n",
|
||
|
" <td>I</td>\n",
|
||
|
" <td>VS2</td>\n",
|
||
|
" <td>62.4</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>334</td>\n",
|
||
|
" <td>4.20</td>\n",
|
||
|
" <td>4.23</td>\n",
|
||
|
" <td>2.63</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0.31</td>\n",
|
||
|
" <td>Good</td>\n",
|
||
|
" <td>J</td>\n",
|
||
|
" <td>SI2</td>\n",
|
||
|
" <td>63.3</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>335</td>\n",
|
||
|
" <td>4.34</td>\n",
|
||
|
" <td>4.35</td>\n",
|
||
|
" <td>2.75</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat cut color clarity depth table price x y z\n",
|
||
|
"0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
|
||
|
"1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
|
||
|
"2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
|
||
|
"3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
|
||
|
"4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 5,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head() # what are x, y and z?"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "yOaCGYX5khxz"
|
||
|
},
|
||
|
"source": [
|
||
|
"## Get Data Info"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {
|
||
|
"id": "YbjJUPR4khtR"
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
"RangeIndex: 53940 entries, 0 to 53939\n",
|
||
|
"Data columns (total 10 columns):\n",
|
||
|
" # Column Non-Null Count Dtype \n",
|
||
|
"--- ------ -------------- ----- \n",
|
||
|
" 0 carat 53940 non-null float64\n",
|
||
|
" 1 cut 53940 non-null object \n",
|
||
|
" 2 color 53940 non-null object \n",
|
||
|
" 3 clarity 53940 non-null object \n",
|
||
|
" 4 depth 53940 non-null float64\n",
|
||
|
" 5 table 53940 non-null float64\n",
|
||
|
" 6 price 53940 non-null int64 \n",
|
||
|
" 7 x 53940 non-null float64\n",
|
||
|
" 8 y 53940 non-null float64\n",
|
||
|
" 9 z 53940 non-null float64\n",
|
||
|
"dtypes: float64(6), int64(1), object(3)\n",
|
||
|
"memory usage: 4.1+ MB\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.info() # 'cut', 'color' and 'clarity' are strings, the rest are numbers"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "6GZsnSiMkhpo"
|
||
|
},
|
||
|
"source": [
|
||
|
"## Find Missing Values"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"carat 0\n",
|
||
|
"cut 0\n",
|
||
|
"color 0\n",
|
||
|
"clarity 0\n",
|
||
|
"depth 0\n",
|
||
|
"table 0\n",
|
||
|
"price 0\n",
|
||
|
"x 0\n",
|
||
|
"y 0\n",
|
||
|
"z 0\n",
|
||
|
"dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.isna().sum() # no missing values"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# let's take a look at the string values"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Ideal 21551\n",
|
||
|
"Premium 13791\n",
|
||
|
"Very Good 12082\n",
|
||
|
"Good 4906\n",
|
||
|
"Fair 1610\n",
|
||
|
"Name: cut, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df['cut'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_cut"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"G 11292\n",
|
||
|
"E 9797\n",
|
||
|
"F 9542\n",
|
||
|
"H 8304\n",
|
||
|
"D 6775\n",
|
||
|
"I 5422\n",
|
||
|
"J 2808\n",
|
||
|
"Name: color, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df['color'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_color"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"SI1 13065\n",
|
||
|
"VS2 12258\n",
|
||
|
"SI2 9194\n",
|
||
|
"VS1 8171\n",
|
||
|
"VVS2 5066\n",
|
||
|
"VVS1 3655\n",
|
||
|
"IF 1790\n",
|
||
|
"I1 741\n",
|
||
|
"Name: clarity, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 11,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df['clarity'].value_counts() # see https://en.wikipedia.org/wiki/Diamond_clarity"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<seaborn.axisgrid.FacetGrid at 0x7f8105d7faf0>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 12,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAHpCAYAAABN+X+UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA3DUlEQVR4nO3dfXRU1b3/8c9IHoA0DCQ0GSIRogaKJiKChNAKUZ6vMXpZq9RCU1wXQa8CpsKlUtpL6rJB6SrQC5UiF4EKlN61Kl56ayOhKNTyEAwkkogRawRCE0JtMgEbkhj27w9/nDLJACEkmR3yfq01azH7fOfM3pnoJ2fPPue4jDFGAADASjcFugMAAODyCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqJvJGKPq6mpx2jkAoD0R1M109uxZud1unT17NtBdAQB0IgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsFhQoDsA6fz588rNzfVpGz58uLp27RqgHgEAbEFQWyA3N1cZL7+hnn1vlyRVlX6sFZJGjRoV0H4BAAKPoLZEz763K2rAkEB3AwBgGb6jBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsFPKhPnTql73znO4qMjFT37t119913Ky8vz9lujFFmZqZiYmLUrVs3paSkqKioyGcftbW1mjNnjnr37q2wsDClpaWptLTUp6ayslLp6elyu91yu91KT09XVVVVewwRAIAWC2hQV1ZW6utf/7qCg4P1hz/8QR988IF+9rOfqWfPnk7N0qVLtWzZMq1atUoHDx6Ux+PRuHHjdPbsWacmIyND27Zt09atW/Xuu+/q3LlzSk1NVUNDg1MzdepU5efnKzs7W9nZ2crPz1d6enp7DhcAgGsWFMg3f+mllxQbG6v169c7bf3793f+bYzRihUrtGjRIk2ePFmStHHjRkVHR2vLli164okn5PV6tW7dOr322msaO3asJGnTpk2KjY3Vzp07NWHCBB09elTZ2dnav3+/kpKSJElr165VcnKyiouLNXDgwCZ9q62tVW1trfO8urq6LX4EAABcUUCPqLdv365hw4bpm9/8pqKiojRkyBCtXbvW2V5SUqLy8nKNHz/eaQsNDdXo0aO1d+9eSVJeXp7q6+t9amJiYpSQkODU7Nu3T2632wlpSRoxYoTcbrdT09iSJUucaXK3263Y2NhWHTsAAM0R0KD+5JNPtHr1asXHx+utt97Sk08+qblz5+pXv/qVJKm8vFySFB0d7fO66OhoZ1t5eblCQkLUq1evK9ZERUU1ef+oqCinprGFCxfK6/U6j5MnT17fYAEAaIGATn1fuHBBw4YNU1ZWliRpyJAhKioq0urVq/Xd737XqXO5XD6vM8Y0aWuscY2/+ivtJzQ0VKGhoc0eCwAAbSGgR9R9+vTRHXfc4dM2aNAgnThxQpLk8XgkqclRb0VFhXOU7fF4VFdXp8rKyivWnD59usn7nzlzpsnROgAANgloUH/9619XcXGxT9tHH32kfv36SZLi4uLk8XiUk5PjbK+rq9Pu3bs1cuRISdLQoUMVHBzsU1NWVqbCwkKnJjk5WV6vV7m5uU7NgQMH5PV6nRoAAGwU0Knv733vexo5cqSysrI0ZcoU5ebm6pVXXtErr7wi6cvp6oyMDGVlZSk+Pl7x8fHKyspS9+7dNXXqVEmS2+3WjBkzNG/ePEVGRioiIkLz589XYmKiswp80KBBmjhxombOnKk1a9ZIkmbNmq
XU1FS/K74BALBFQIP63nvv1bZt27Rw4UI9//zziouL04oVKzRt2jSnZsGCBaqpqdFTTz2lyspKJSUlaceOHQoPD3dqli9frqCgIE2ZMkU1NTUaM2aMNmzYoC5dujg1mzdv1ty5c53V4WlpaVq1alX7DRYAgBZwGWNMoDvREVRXV8vtdsvr9apHjx6tuu89e/Yoc3uhogYMkSRVfHRYmWkJGjVqVKu+DwCg4wn4JUQBAMDlEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgsaBAdwBNNXxRr4KCgibtw4cPV9euXQPQIwBAoBDUFjpb/qlWl5yX53gXp62q9GOtkDRq1KiA9QsA0P4IakuF97lVUQOGBLobAIAA4ztqAAAsRlADAGAxghoAAIsR1AAAWIygBgDAYgQ1AAAWI6gBALAYQQ0AgMUIagAALEZQAwBgMYIaAACLEdQAAFiMoAYAwGIENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsFtCgzszMlMvl8nl4PB5nuzFGmZmZiomJUbdu3ZSSkqKioiKffdTW1mrOnDnq3bu3wsLClJaWptLSUp+ayspKpaeny+12y+12Kz09XVVVVe0xRAAArkvAj6jvvPNOlZWVOY8jR44425YuXaply5Zp1apVOnjwoDwej8aNG6ezZ886NRkZGdq2bZu2bt2qd999V+fOnVNqaqoaGhqcmqlTpyo/P1/Z2dnKzs5Wfn6+0tPT23WcAAC0RFDAOxAU5HMUfZExRitWrNCiRYs0efJkSdLGjRsVHR2tLVu26IknnpDX69W6dev02muvaezYsZKkTZs2KTY2Vjt37tSECRN09OhRZWdna//+/UpKSpIkrV27VsnJySouLtbAgQP99qu2tla1tbXO8+rq6tYeOgAAVxXwI+pjx44pJiZGcXFxevTRR/XJJ59IkkpKSlReXq7x48c7taGhoRo9erT27t0rScrLy1N9fb1PTUxMjBISEpyaffv2ye12OyEtSSNGjJDb7XZq/FmyZIkzVe52uxUbG9uq4wYAoDkCGtRJSUn61a9+pbfeektr165VeXm5Ro4cqc8++0zl5eWSpOjoaJ/XREdHO9vKy8sVEhKiXr16XbEmKiqqyXtHRUU5Nf4sXLhQXq/XeZw8efK6xgoAQEsEdOp70qRJzr8TExOVnJys2267TRs3btSIESMkSS6Xy+c1xpgmbY01rvFXf7X9hIaGKjQ0tFnjAACgrQR86vtSYWFhSkxM1LFjx5zvrRsf9VZUVDhH2R6PR3V1daqsrLxizenTp5u815kzZ5ocrQMAYBurgrq2tlZHjx5Vnz59FBcXJ4/Ho5ycHGd7XV2ddu/erZEjR0qShg4dquDgYJ+asrIyFRYWOjXJycnyer3Kzc11ag4cOCCv1+vUAABgq4BOfc+fP18PPfSQbrnlFlVUVOiFF15QdXW1pk+fLpfLpYyMDGVlZSk+Pl7x8fHKyspS9+7dNXXqVEmS2+3WjBkzNG/ePEVGRioiIkLz589XYmKiswp80KBBmjhxombOnKk1a9ZIkmbNmqXU1NTLrvgGAMAWAQ3q0tJSffvb39bf/vY3ffWrX9WIESO0f/9+9evXT5K0YMEC1dTU6KmnnlJlZaWSkpK0Y8cOhYeHO/tYvny5goKCNGXKFNXU1GjMmDHasGGDunTp4tRs3rxZc+fOdVaHp6WladWqVe07WAAAWsBljDGB7kRHUF1dLbfbLa/Xqx49erTqvvfs2aPM7YWKGjBEkvTRrv9RcM8Yxd3zDaem4qPDykxL0KhRo1r1vQEAdrPqO2oAAOCLoAYAwG
IENQAAFiOoAQCwGEENAIDFCGoAACxGUAMAYDGCGgAAixHUAABYjKAGAMBiBDUAABYjqAEAsBhBDQCAxQhqAAAsRlA
|
||
|
"text/plain": [
|
||
|
"<Figure size 500x500 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# what about price? Let's see the distribution\n",
|
||
|
"\n",
|
||
|
"sns.displot(df['price'], kind = \"hist\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {
|
||
|
"id": "HvFrMzIGmMOl"
|
||
|
},
|
||
|
"source": [
|
||
|
"# Task 5: Perform Data Cleaning"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"metadata": {
|
||
|
"id": "S6cyPRAAmMLI"
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# convert non-numeric values to numbers\n",
|
||
|
"# each dictionary has quality in descending order"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# 'cut'\n",
|
||
|
"\n",
|
||
|
"dict_cut = {'Ideal': 4, 'Premium': 3, 'Very Good': 2, 'Good': 1, 'Fair': 0}\n",
|
||
|
"df['cut'] = df['cut'].replace(dict_cut)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# 'color'\n",
|
||
|
"\n",
|
||
|
"dict_color = {'D': 6, 'E': 5, 'F': 4, 'G': 3, 'H': 2, 'I': 1, 'J': 0}\n",
|
||
|
"df['color'] = df['cut'].replace(dict_color)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# 'clarity'\n",
|
||
|
"\n",
|
||
|
"dict_clarity = {'IF': 7, 'VVS1': 6, 'VVS2': 5, 'VS1': 4, 'VS2': 3, 'SI1': 2, 'SI2': 1, 'I1': 0}\n",
|
||
|
"df['clarity'] = df['cut'].replace(dict_clarity)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" cut color clarity\n",
|
||
|
"0 4 4 4\n",
|
||
|
"1 3 3 3\n",
|
||
|
"2 1 1 1\n",
|
||
|
"3 3 3 3\n",
|
||
|
"4 1 1 1"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 17,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df[['cut', 'color', 'clarity']].head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" <th>depth</th>\n",
|
||
|
" <th>table</th>\n",
|
||
|
" <th>price</th>\n",
|
||
|
" <th>x</th>\n",
|
||
|
" <th>y</th>\n",
|
||
|
" <th>z</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>61.5</td>\n",
|
||
|
" <td>55.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" <td>3.95</td>\n",
|
||
|
" <td>3.98</td>\n",
|
||
|
" <td>2.43</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>0.21</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>59.8</td>\n",
|
||
|
" <td>61.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" <td>3.89</td>\n",
|
||
|
" <td>3.84</td>\n",
|
||
|
" <td>2.31</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>56.9</td>\n",
|
||
|
" <td>65.0</td>\n",
|
||
|
" <td>327</td>\n",
|
||
|
" <td>4.05</td>\n",
|
||
|
" <td>4.07</td>\n",
|
||
|
" <td>2.31</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>0.29</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>62.4</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>334</td>\n",
|
||
|
" <td>4.20</td>\n",
|
||
|
" <td>4.23</td>\n",
|
||
|
" <td>2.63</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0.31</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>63.3</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>335</td>\n",
|
||
|
" <td>4.34</td>\n",
|
||
|
" <td>4.35</td>\n",
|
||
|
" <td>2.75</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat cut color clarity depth table price x y z\n",
|
||
|
"0 0.23 4 4 4 61.5 55.0 326 3.95 3.98 2.43\n",
|
||
|
"1 0.21 3 3 3 59.8 61.0 326 3.89 3.84 2.31\n",
|
||
|
"2 0.23 1 1 1 56.9 65.0 327 4.05 4.07 2.31\n",
|
||
|
"3 0.29 3 3 3 62.4 58.0 334 4.20 4.23 2.63\n",
|
||
|
"4 0.31 1 1 1 63.3 58.0 335 4.34 4.35 2.75"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 18,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head() # now the whole dataset is numeric"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"metadata": {
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# drop x, y, z\n",
|
||
|
"\n",
|
||
|
"df = df.drop(columns = ['x', 'y', 'z'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" <th>depth</th>\n",
|
||
|
" <th>table</th>\n",
|
||
|
" <th>price</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>61.5</td>\n",
|
||
|
" <td>55.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>0.21</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>59.8</td>\n",
|
||
|
" <td>61.0</td>\n",
|
||
|
" <td>326</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>0.23</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>56.9</td>\n",
|
||
|
" <td>65.0</td>\n",
|
||
|
" <td>327</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>0.29</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>62.4</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>334</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0.31</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>63.3</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>335</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat cut color clarity depth table price\n",
|
||
|
"0 0.23 4 4 4 61.5 55.0 326\n",
|
||
|
"1 0.21 3 3 3 59.8 61.0 326\n",
|
||
|
"2 0.23 1 1 1 56.9 65.0 327\n",
|
||
|
"3 0.29 3 3 3 62.4 58.0 334\n",
|
||
|
"4 0.31 1 1 1 63.3 58.0 335"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 20,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 21,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" <th>depth</th>\n",
|
||
|
" <th>table</th>\n",
|
||
|
" <th>price</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" <td>53940.00</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>0.80</td>\n",
|
||
|
" <td>2.90</td>\n",
|
||
|
" <td>2.90</td>\n",
|
||
|
" <td>2.90</td>\n",
|
||
|
" <td>61.75</td>\n",
|
||
|
" <td>57.46</td>\n",
|
||
|
" <td>3932.80</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>0.47</td>\n",
|
||
|
" <td>1.12</td>\n",
|
||
|
" <td>1.12</td>\n",
|
||
|
" <td>1.12</td>\n",
|
||
|
" <td>1.43</td>\n",
|
||
|
" <td>2.23</td>\n",
|
||
|
" <td>3989.44</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>0.20</td>\n",
|
||
|
" <td>0.00</td>\n",
|
||
|
" <td>0.00</td>\n",
|
||
|
" <td>0.00</td>\n",
|
||
|
" <td>43.00</td>\n",
|
||
|
" <td>43.00</td>\n",
|
||
|
" <td>326.00</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>0.40</td>\n",
|
||
|
" <td>2.00</td>\n",
|
||
|
" <td>2.00</td>\n",
|
||
|
" <td>2.00</td>\n",
|
||
|
" <td>61.00</td>\n",
|
||
|
" <td>56.00</td>\n",
|
||
|
" <td>950.00</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>0.70</td>\n",
|
||
|
" <td>3.00</td>\n",
|
||
|
" <td>3.00</td>\n",
|
||
|
" <td>3.00</td>\n",
|
||
|
" <td>61.80</td>\n",
|
||
|
" <td>57.00</td>\n",
|
||
|
" <td>2401.00</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>1.04</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>62.50</td>\n",
|
||
|
" <td>59.00</td>\n",
|
||
|
" <td>5324.25</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>5.01</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>4.00</td>\n",
|
||
|
" <td>79.00</td>\n",
|
||
|
" <td>95.00</td>\n",
|
||
|
" <td>18823.00</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat cut color clarity depth table price\n",
|
||
|
"count 53940.00 53940.00 53940.00 53940.00 53940.00 53940.00 53940.00\n",
|
||
|
"mean 0.80 2.90 2.90 2.90 61.75 57.46 3932.80\n",
|
||
|
"std 0.47 1.12 1.12 1.12 1.43 2.23 3989.44\n",
|
||
|
"min 0.20 0.00 0.00 0.00 43.00 43.00 326.00\n",
|
||
|
"25% 0.40 2.00 2.00 2.00 61.00 56.00 950.00\n",
|
||
|
"50% 0.70 3.00 3.00 3.00 61.80 57.00 2401.00\n",
|
||
|
"75% 1.04 4.00 4.00 4.00 62.50 59.00 5324.25\n",
|
||
|
"max 5.01 4.00 4.00 4.00 79.00 95.00 18823.00"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 21,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.describe().round(2)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Look for outliers"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {
|
||
|
"tags": []
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# data frame of features only\n",
|
||
|
"df2 = df.drop(['price'], axis = 1)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array([[<AxesSubplot:title={'center':'carat'}>,\n",
|
||
|
" <AxesSubplot:title={'center':'cut'}>],\n",
|
||
|
" [<AxesSubplot:title={'center':'color'}>,\n",
|
||
|
" <AxesSubplot:title={'center':'clarity'}>],\n",
|
||
|
" [<AxesSubplot:title={'center':'depth'}>,\n",
|
||
|
" <AxesSubplot:title={'center':'table'}>]], dtype=object)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABlgAAATDCAYAAAAazjjkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAADrF0lEQVR4nOz9fZhV9X0v/L9HGEYgMOEhMMwRlbSEarCJByOiacQAg1akOTYlCT0TTSyaWyPhRo6J8eRkTCM0aNS70BhjvdSIhvx6WxKrljCeKB4u8ImEJBhrk/vgU8OIieOASIcJ7t8fln0cB5SlzGxgXq/rmkv22p+913d91nfG+c57r72rSqVSKQAAAAAAAOyzwyo9AAAAAAAAgIONgAUAAAAAAKAgAQsAAAAAAEBBAhYAAAAAAICCBCwAAAAAAAAFCVgAAAAAAAAKErAAAAAAAAAUJGABAAAAAAAoSMACAAAAAABQkIAFgIpZuHBhfvCDH1R6GAAAAL3Ot771rdxyyy2VHgbAQa2qVCqVKj0IAHqnd73rXfn4xz/ul3oAAIAeNn78+AwfPjwPPPBApYcCcNByBQsA+8WuXbvS3t5e6WEAAAAAQI8QsAD0Mv/yL/+ST33qUxk5cmRqampy5JFH5tOf/nTa29vzwgsv5MILL8yxxx6bd73rXRkxYkQ++tGP5n/9r//V6TmeeuqpVFVVZfHixfn617+eMWPGpKamJvfff3/+/d//PZdcckk++MEPpra2NkOHDs2kSZPywx/+sNNzVFVVZfv27bn11ltTVVWVqqqqTJ48uQc7AQAAcPB5szVdU1NTqqqqujzmlltuSVVVVZ566qkkydFHH53HH388q1evLq/Hjj766J49EIBDQN9KDwCAnvOzn/0sH/7whzN8+PB87Wtfy9ixY7N58+bcdddd2blzZ1588cUkyVe/+tXU1dXl5ZdfzooVKzJ58uT8z//5P7sEIH/7t3+b973vfbn66qszePDgjB07Nu3t7XnxxRezYMGC/Kf/9J+yc+fO3HfffTn77LNz880359Of/nSSZN26dfnoRz+a0047LV/5yleSJIMHD+7RfgAAABxM3mpNt69WrFiRj3/846mtrc23vvWtJElNTU13DRvgkOUzWAB6kSlTpuQnP/lJ/vVf/zXvec973rJ+165dKZVKOf300zN48OD84z/+Y5LXrmAZM2ZM/uAP/iBPPPFEqqur3/I5Pve5z+UnP/lJfvKTn5Tv8xksAAAA++6t1nRNTU254oor8sY/991yyy35zGc+k02bNpWvVPEZLADvnLcIA+glXnnllaxevTqzZs1603Dl29/+dv7zf/7POfzww9O3b99UV1fnf/7P/5knnniiS+3MmTP3GK78wz/8Q0455ZS8613vKj/HTTfdtMfnAAAA4K3t65oOgJ4jYAHoJVpbW7Nr164cccQRe6255ppr8n/9X/9XJk6cmDvvvDMPPfRQHn300Zx++unZsWNHl/pRo0Z12faP//iPmTVrVv7Tf/pPWbZsWdatW5dHH300n/3sZ/Pv//7v+/WYAAAAeot9WdMB0LN8BgtALzF06ND06dMnzz333F5rli1blsmTJ+f666/vtH3btm17rN/ThycuW7YsY8aMyfe///1O97e3t7/NkQMAALAva7rDDz88yWvrr9d/pspvf/vbbh8fQG/kChaAXqJ///459dRT8w//8A97/eW6qqqqywcb/vznP8+6dev2eT9VVVXp169fp3ClpaUlP/zhD7vU1tTU7PHKGAAAADrblzXd7s9X+fnPf95p+z/90z91qbUeA3jnBCwAvcg111yTjo6OTJw4MTfeeGPuv//+LF++PLNnz862bdsyY8aMrFq1Kl/96lfz4x//ONdff32mT5+eMWPG7PM+ZsyYkSeffDIXXnhhfvzjH+fWW2/Nhz/84T2+ndhxxx2XBx54IP/0T/+Uxx57LE8++eT+PFwAAIBDylut6f70T/
80Q4cOzXnnnZcf/OAHufvuu/Pxj388zz77bJfnOu644/Kzn/0s3//+9/Poo4/mF7/4RQWOCODgVlUqlUqVHgQAPeeJJ54oByjbtm1LXV1dPvrRj+bb3/52qqqqcvnll+d73/tefve73+XYY4/N//gf/yMrVqzIAw88kKeeeipJ8tRTT2XMmDG56qqrsmDBgi77+MY3vpFvf/vb2bx5c9773vdm/vz5ee6553LFFVfk9f/b+dnPfpaLLrooP/3pT/PKK6/k1FNPzQMPPNBDnQAAADj4vNmarqamJo8++mjmzZuXn/3sZ3n3u9+dv/qrv8ro0aPzV3/1V9m0aVP5Kpenn346559/ftatW5dt27blqKOOKq/5ANg3AhYAAAAAAICCvEUYAAAAAABAQQIWAAAAAACAggQsAAAAAAAABQlYAAAAAAAAChKwAAAAAAAAFFQ4YHnwwQdz1llnpb6+PlVVVfnBD35Qvq+joyNf/OIXc9xxx2XgwIGpr6/Ppz/96fzmN7/p9Bzt7e25+OKLM3z48AwcODAzZ87Mc88916mmtbU1jY2Nqa2tTW1tbRobG/PSSy91qnnmmWdy1llnZeDAgRk+fHjmzp2bnTt3Fj0kAAAAAACAQvoWfcD27dvzgQ98IJ/5zGfy53/+553ue+WVV/KTn/wkX/nKV/KBD3wgra2tmTdvXmbOnJnHHnusXDdv3rz80z/9U5YvX55hw4blkksuyYwZM7J+/fr06dMnSTJ79uw899xzWblyZZLk/PPPT2NjY/7pn/4pSbJr166ceeaZec973pM1a9bkd7/7Xc4555yUSqUsWbJkn47l1VdfzW9+85sMGjQoVVVVRVsBAABvS6lUyrZt21JfX5/DDnNROeyJ9RoAAJVQaL1WegeSlFasWPGmNY888kgpSenpp58ulUql0ksvvVSqrq4uLV++vFzzb//2b6XDDjustHLlylKpVCr98pe/LCUpPfTQQ+WadevWlZKU/uVf/qVUKpVK9957b+mwww4r/du//Vu55nvf+16ppqam1NbWtk/jf/bZZ0tJfPny5cuXL1++fPmqyNezzz67T7+3Qm9kvebLly9fvnz58uWrkl/7sl4rfAVLUW1tbamqqsq73/3uJMn69evT0dGRhoaGck19fX3Gjx+ftWvXZvr06Vm3bl1qa2szceLEcs1JJ52U2trarF27NuPGjcu6desyfvz41NfXl2umT5+e9vb2rF+/PqeddlqXsbS3t6e9vb18u1QqJUk2bdqUQYMGJXntbc7uv//+nHbaaamurt6vveDA5/z3Xs597+Xc927Of+9V6XO/bdu2jBkzpvw7KNDV7u+PZ599NoMHD+7RfXd0dGTVqlVpaGjw/4d9oF/F6VlxelaMfhWnZ8XoV3F6Vkwl+7V169aMHj16n9Zr3Rqw/Pu//3u+9KUvZfbs2eVfiFtaWtKvX78MGTKkU+3IkSPT0tJSrhkxYkSX5xsxYkSnmpEjR3a6f8iQIenXr1+55o0WLVqUK664osv2devWZcCAAeXbAwYMyMMPP1zgSDmUOP+9l3Pfezn3vZvz33tV8ty/8sorSeJtj+BN7P7+GDx4cEUClgEDBmTw4MH+ALIP9Ks4PStOz4rRr+L0rBj9Kk7PijkQ+rUv67VuC1g6OjryyU9+Mq+++mq+9a1vvWV9qVTqNOA9Df7t1LzeZZddlvnz55dv706iGhoayr+wd3R0pLm5OdOmTTPReyHnv/dy7nsv5753c/57r0qf+61bt/b4PgEAANi/uiVg6ejoyKxZs7Jp06b8+Mc/7vRqo7q6uuzcuTOtra2drmLZsmVLTj755HLN888/3+V5X3jhhfJVK3V1dV1ecdja2pqOjo4uV7bsVlNTk5qami7bq6uruyys97SN3sP5772c+97Lue/dnP/eq1Ln3nwDAAA4+B22v59wd7jyq1/9Kvfdd1+GDRvW6f4JEyakuro6zc3N5W2bN2/Oxo0bywHLpEmT0tbWlk
ceeaRc8/DDD6etra1TzcaNG7N58+ZyzapVq1JTU5MJEybs78MCAAAAAAAoK3wFy8svv5xf//rX5dubNm3Khg0bMnT
|
||
|
"text/plain": [
|
||
|
"<Figure size 2000x1500 with 6 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df2.hist(bins = 50, figsize = (20, 15)) # histograms for each feature"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 24,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<AxesSubplot:>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 24,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGdCAYAAABO2DpVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjbElEQVR4nO3df3BU9f3v8ddmE/KLJAVChEiAQCpYE6T1RwoSfohhDD9MDOnYWp2Oo52pReYqiSg4KOlY0hahXnWkSsfxemcE27CEDgQmQSEs9RsVKjOgYIHySww/EusubMKy2ez9w5voSpDs5pPdbPJ8zDCYs2f3vMPM7j495+xZi8/n8wkAAMCAqHAPAAAA+g7CAgAAGENYAAAAYwgLAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMZEh3qDbW1t+uKLL5SUlCSLxRLqzQMAgCD4fD5duHBB6enpioq6+n6JkIfFF198oYyMjFBvFgAAGHDq1CmNGDHiqreHPCySkpIkfT1YcnJyqDcPoAd5PB7V1NRo1qxZiomJCfc4AAxyOp3KyMjoeB+/mpCHRfvhj+TkZMIC6GM8Ho8SEhKUnJxMWAB91LVOY+DkTQAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwBGeL1e1dXVadeuXaqrq5PX6w33SADCgLAA0G02m01ZWVnKz8/X6tWrlZ+fr6ysLNlstnCPBiDEAgqL5cuXy2Kx+P0ZNmxYT80GIALYbDaVlJQoJydHdrtd69atk91uV05OjkpKSogLoJ8J+LtCbrrpJm3fvr3jZ6vVanQgAJHD6/WqtLRUc+fOVVVVlbxer5qampSbm6uqqioVFRWprKxMhYWFvFYA/UTAYREdHc1eCgCSJLvdruPHj2vdunWKioryO68iKipKS5Ys0eTJk2W32zV9+vTwDQogZAIOi8OHDys9PV2xsbHKzc3VihUrNGbMmKuu73a75Xa7O352Op2Svv4WRI/HE8TIAHqLU6dOSZLGjRvn95xu/3vcuHEd6/F8ByJbV5/DAYVFbm6u3nrrLd1www06e/asnn/+eU2ePFmffPKJhgwZ0ul9KioqVF5efsXympoaJSQkBLJ5AL3MiRMnJElr167tiAhJqq2tlSQdOnSoY73q6urQDwjAmObm5i6tZ/H5fL5gN+JyuTR27FgtXrxYixYt6nSdzvZYZGRkqLGxUcnJycFuGkAv4PV6deONN+qmm27Shg0b5PV6VVtbq/z8fFmtVs2fP1+ffvqpPv30U86xACKc0+lUamqqHA7H975/B3wo5NsSExOVk5Ojw4cPX3Wd2NhYxcbGXrE8JiZGMTEx3dk8gDCLiYnRqlWrVFJSop/97Gd68skn1dLSor1792rlypWqrq5WZWWl4uLiwj0qgG7q6nt2t8LC7Xbr4MGDysvL687DAIhgxcXFqqysVGlpqaZOndqxPDMzU5WVlSouLg7jdABCLaBDIWVlZZo3b55Gjhypc+fO6fnnn1ddXZ3279+vUaNGdekxnE6nUlJSrrkrBUBk8Xq92rFjh7Zu3aqCggLNmDGDwx9AH9LV9++A9lh8/vnn+sUvfqHGxkYNHTpUP/3pT1VfX9/lqADQd1mtVk2bNk0ul0vTpk0jKoB+KqCwWL9+fU/NAQAA+gC+KwQAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAAMAYwgIAABhDWAAAAGMICwAAYAxhAQAAjCEsAACAMYQFAAAwhrAAAADGEBYAjPB6vaqrq9OuXbtUV1cnr9cb7pEAhAFhAaDbbDabsrKylJ+fr9WrVys/P19ZWVmy2WzhHg1AiBEWALrFZrOppKREOTk5stvtWrdunex2u3JyclRSUkJcAP2Mxefz+U
K5QafTqZSUFDkcDiUnJ4dy0wAM83q9ysrKUk5OjqqqquT1elVdXa3Zs2fLarWqqKhIBw4c0OHDh2W1WsM9LoBu6Or7N3ssAATNbrfr+PHjWrp0qaKi/F9OoqKitGTJEh07dkx2uz1MEwIINcICQNAaGhokSdnZ2Z3e3r68fT0AfR9hASBow4cPlyQdOHCg09vbl7evB6DvIywABC0vL0+jR4/WihUr1NbW5ndbW1ubKioqlJmZqby8vDBNCCDUCAsAQbNarVq1apU2b96soqIi1dfXq6WlRfX19SoqKtLmzZv1wgsvcOIm0I9Eh3sAAJGtuLhYlZWVKi0t1dSpUzuWZ2ZmqrKyUsXFxWGcDkCo8XFTAEZ4vV7t2LFDW7duVUFBgWbMmMGeCqAP6er7N3ssABhhtVo1bdo0uVwuTZs2jagA+inOsQAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAABgDGEBAACMISwAGOH1elVXV6ddu3aprq5OXq833CMBCAPCAkC32Ww2ZWVlKT8/X6tXr1Z+fr6ysrJks9nCPRqAECMsAHSLzWZTSUmJcnJyZLfbtW7dOtntduXk5KikpIS4APoZi8/n84Vyg06nUykpKXI4HEpOTg7lpgEY5vV6lZWVpZycHFVVVcnr9aq6ulqzZ8+W1WpVUVGRDhw4oMOHD8tqtYZ7XADd0NX3b/ZYAAia3W7X8ePHtXTpUkVF+b+cREVFacmSJTp27JjsdnuYJgQQaoQFgKA1NDRIkrKzszu9vX15+3oA+j7CAkDQhg8fLkk6cOBAp7e3L29fD0DfR1gACFpeXp5Gjx6tFStWqK2tze+2trY2VVRUKDMzU3l5eWGaEECoRYd7AACRy2q1atWqVSopKdE999yjzMxM/fvf/9b27dt17NgxVVdXq7KykhM3gX6ET4UA6LaioiJt2rTpiuWFhYWqqqoK/UAAjAvJp0IqKipksVj0+OOPd+dhAESwxYsXa9OmTbJYLH7LLRaLNm3apMWLF4dpMgDhEHRYfPTRR3r99dc1YcIEk/MAiCCXL1/WqlWrJElz5szxu0DWnDlzJEmrVq3S5cuXwzkmgBAKKiwuXryoX/7yl1q7dq0GDRpkeiYAEeKVV15RW1ubbr75Zm3atEm5ubmKj49Xbm6uNm3apAkTJqitrU2vvPJKuEcFECJBnby5YMECzZkzR3fddZeef/75713X7XbL7XZ3/Ox0OiVJHo9HHo8nmM0D6CXq6uokSeXl5fJ6vR3P6fa/n3vuOc2fP191dXVauHBh2OYE0H1dfc8OOCzWr1+vf/3rX/roo4+6tH5FRYXKy8uvWF5TU6OEhIRANw+gF3E4HJKkLVu2+C2vra2VJFVXV3es1/7fACJTc3Nzl9YL6FMhp06d0q233qqamhrdfPPNkqTp06dr4sSJevHFFzu9T2d7LDIyMtTY2MinQoAIt337ds2ePVuDBg3S6dOn5fP5VFtbq/z8fFksFl1//fX673//q+rqat11113hHhdANzidTqWmpl7zUyEBhUVVVZXuvfdev8+ke71eWSwWRUVFye12X/Pz6nzcFOg7vF6vhgwZIofDobS0NC1fvlxxcXG6dOmSli9frnPnziklJUVNTU1cywKIcF19/w7oUMjMmTO1f/9+v2UPPfSQxo8fr6eeeooXDqCfsVqteuONNzR//nydP39ev/3tbztua//46RtvvMFrA9CPBPSpkKSkJGVnZ/v9SUxM1JAhQ676JUQA+rbi4mJt2LBBI0eO9Fs+atQobdiwQcXFxWGaDEA4cElvAN1WXFyswsJC7dixQ1u3blVBQYFmzJjBngqgH+p2WOzcudPAGAAindVq1bRp0+RyuTRt2jSiAuin+HZTAABgDGEBAACMISwAAIAxhAUAADCGsAAAAMYQFgAAwBjCAgAAGENYAAAAYwgLAA
BgDGEBAACMISwAAIAxhAUAI7xer+rq6rRr1y7V1dXJ6/WGeyQAYUBYAOg2m82mrKws5efna/Xq1crPz1dWVpZsNlu
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# 'carat' distribution is not normal\n",
|
||
|
"# let's see the box plot\n",
|
||
|
"\n",
|
||
|
"df.boxplot(['carat'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 25,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>53940.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>0.797940</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>0.474011</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>0.200000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>0.400000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>0.700000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>1.040000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>5.010000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat\n",
|
||
|
"count 53940.000000\n",
|
||
|
"mean 0.797940\n",
|
||
|
"std 0.474011\n",
|
||
|
"min 0.200000\n",
|
||
|
"25% 0.400000\n",
|
||
|
"50% 0.700000\n",
|
||
|
"75% 1.040000\n",
|
||
|
"max 5.010000"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 25,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# summary statistics for 'carat' (first step in dealing with its outliers)\n",
|
||
|
"\n",
|
||
|
"df[['carat']].describe()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 26,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"0.24000000000000002\n",
|
||
|
"1.2\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
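|
"# outliers, continued: compute IQR-based limits for 'carat' (NOTE: the 0.25 * IQR multiplier is much tighter than the conventional 1.5 * IQR; confirm this is intended)\n",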
|
||
|
"\n",
|
||
|
"Q1 = df['carat'].quantile(0.25)\n",
|
||
|
"Q3 = df['carat'].quantile(0.75)\n",
|
||
|
"IQR = Q3 - Q1\n",
|
||
|
"\n",
|
||
|
"lower_limit = Q1 - 0.25 * IQR # define boundary for the lower end\n",
|
||
|
"upper_limit = Q3 + 0.25 * IQR # define boundary for the upper end\n",
|
||
|
"\n",
|
||
|
"print(lower_limit)\n",
|
||
|
"print(upper_limit)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 27,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"573"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 27,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# how many outliers on the low end?\n",
|
||
|
"\n",
|
||
|
"len(df[df.carat < lower_limit])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 28,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"9155"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 28,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# how many outliers on the high end?\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"len(df[df.carat > upper_limit])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 29,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# drop the 'carat' outliers: remove rows whose 'carat' is below lower_limit or above upper_limit\n",
|
||
|
"\n",
|
||
|
"df = df[~((df.carat < lower_limit) | (df.carat > upper_limit))]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 30,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>carat</th>\n",
|
||
|
" <th>cut</th>\n",
|
||
|
" <th>color</th>\n",
|
||
|
" <th>clarity</th>\n",
|
||
|
" <th>depth</th>\n",
|
||
|
" <th>table</th>\n",
|
||
|
" <th>price</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>0.29</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>62.4</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>334</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0.31</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>63.3</td>\n",
|
||
|
" <td>58.0</td>\n",
|
||
|
" <td>335</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>7</th>\n",
|
||
|
" <td>0.26</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>61.9</td>\n",
|
||
|
" <td>55.0</td>\n",
|
||
|
" <td>337</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>10</th>\n",
|
||
|
" <td>0.30</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>64.0</td>\n",
|
||
|
" <td>55.0</td>\n",
|
||
|
" <td>339</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>13</th>\n",
|
||
|
" <td>0.31</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>62.2</td>\n",
|
||
|
" <td>54.0</td>\n",
|
||
|
" <td>344</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" carat cut color clarity depth table price\n",
|
||
|
"3 0.29 3 3 3 62.4 58.0 334\n",
|
||
|
"4 0.31 1 1 1 63.3 58.0 335\n",
|
||
|
"7 0.26 2 2 2 61.9 55.0 337\n",
|
||
|
"10 0.30 1 1 1 64.0 55.0 339\n",
|
||
|
"13 0.31 4 4 4 62.2 54.0 344"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 30,
|
||
|
"metadata": {},
|
||