added project1 files

2022-12-06 19:29:33 -05:00 · 2022-12-06 19:29:33 -05:00 · bb8c7a0491
parent 8af9c2e8a0
commit bb8c7a0491
6 changed files with 57004 additions and 0 deletions
--- a/decision_tree.joblib
+++ b/decision_tree.joblib
--- a/diamonds.csv
+++ b/diamonds.csv
--- a/multiple_regression.joblib
+++ b/multiple_regression.joblib
--- a/project_1_eda_and_model_training.ipynb
+++ b/project_1_eda_and_model_training.ipynb
--- a/project_1_predictions.ipynb
+++ b/project_1_predictions.ipynb
@ -0,0 +1,428 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "11e53f32-8614-4020-9fce-b838ad409098",
+   "metadata": {
+    "id": "HPsfw4s7kiKk",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn import metrics\n",
+    "from joblib import dump, load # joblib: a pickle alternative that scikit-learn's docs suggest for saving fitted models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "73cb34f0-0771-4df5-bb29-afee3d2f6350",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load saved models\n",
+    "model_1 = load('simple_regression.joblib')\n",
+    "model_2 = load('multiple_regression.joblib')\n",
+    "model_3 = load('decision_tree.joblib')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3023a4a8-7deb-4370-8ad6-8ab61bba514f",
+   "metadata": {},
+   "source": [
+    "## Predictions: model 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b424b756-c488-426b-95d1-5757ab2dd413",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_x = np.zeros((1, 1)) # 1x1 zero-filled array that will hold the 'carat' value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "851ccc74-7cdf-43af-aed1-d3f3777768a9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0.7]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_x[0, 0] = 0.70 # weight of the diamond\n",
+    "print(test_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "04fc4367-9022-49b5-a5e6-5721c0671f92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_y = np.zeros((1, 1)) # 1x1 zero-filled array that will hold the 'price' value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "17d43990-28a3-4799-b10e-68373208dc66",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[2751.]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_y[0, 0] = 2751.00 # price of the diamond\n",
+    "print(test_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5eedcc23-376a-4fc4-b4a5-9a20c4674b79",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2996.33847049]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tim/Nextcloud/Documents/predictive_analytics/envs/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "prediction = model_1.predict(test_x)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "29a7b8ac-31cf-4193-93e6-cd81417a635e",
+   "metadata": {
+    "id": "cQEsaET7Oj8o"
+   },
+   "outputs": [],
+   "source": [
+    "mse = metrics.mean_squared_error(test_y, prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "86a0249c-7987-4510-994a-e4add582c0c5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "245.338470494803"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.sqrt(mse) # root mean squared error"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7821a20-26bd-4703-ad09-0224aa331c07",
+   "metadata": {},
+   "source": [
+    "## Predictions: model 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "802e6f37-b0e1-403d-865e-489d5b85f53f",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "YVeMEFuEn4wi",
+    "outputId": "bbeb9369-e01a-4358-f517-53d57f238b64"
+   },
+   "outputs": [],
+   "source": [
+    "test_x = np.zeros((1, 6)) # 1x6 zero-filled array that will hold the six feature values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "900cfbc7-ea3e-4f55-97e3-4b6a9f2b0cbb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 0.7  4.   4.   4.  62.  55. ]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_x[0, 0] = 0.70\n",
+    "test_x[0, 1] = 4.00\n",
+    "test_x[0, 2] = 4.00\n",
+    "test_x[0, 3] = 4.00\n",
+    "test_x[0, 4] = 62.00\n",
+    "test_x[0, 5] = 55.00\n",
+    "print(test_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3965a843-809f-492b-9f79-c6577b062c76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scaler = StandardScaler() # instantiate the scaler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d70c4f75-e644-4a4a-b546-d0eed460f76a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
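+    "scaled_X = scaler.fit_transform(test_x) # FIXME: fitting on this single row maps every feature to 0; transform with the scaler that was fitted on the training data instead"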
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7ac2f206-e400-4ac7-be27-1250f48b2d9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[2751.]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# y remains the same\n",
+    "print(test_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "53b6f21f-caa9-4d6f-b90b-2b49c00879cd",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2572.08756099]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tim/Nextcloud/Documents/predictive_analytics/envs/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "prediction = model_2.predict(scaled_X)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "855b89da-82e0-4b67-8ad7-0bcffd9bf2d5",
+   "metadata": {
+    "id": "cQEsaET7Oj8o"
+   },
+   "outputs": [],
+   "source": [
+    "mse = metrics.mean_squared_error(test_y, prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "ef1098b5-3941-4df0-b427-d0089736a65c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "178.91243900831523"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.sqrt(mse) # root mean squared error"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "26996ee9-3b6e-4e37-9f90-7a2fb8c0979d",
+   "metadata": {},
+   "source": [
+    "## Predictions: model 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "2cf9462e-d339-4f0b-bcc5-bacc31890374",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_x remains the same\n",
+    "# scaled_X remains the same\n",
+    "# test_y remains the same"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "be587251-63a5-4e1e-8b95-7da381414699",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1667.]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tim/Nextcloud/Documents/predictive_analytics/envs/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "prediction = model_3.predict(scaled_X)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "753d5621-78ff-4be8-9e11-f74473ba48b6",
+   "metadata": {
+    "id": "cQEsaET7Oj8o"
+   },
+   "outputs": [],
+   "source": [
+    "mse = metrics.mean_squared_error(test_y, prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "e50e4073-8da9-4ca2-9551-eedbe9dbf631",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1084.0"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.sqrt(mse) # root mean squared error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "e76d5e9b-1ac9-477a-9dbc-c30a5df50c4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model 2 returned the smallest RMSE (178.91), followed by model 1 (245.34), then model 3 (1084.0)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/simple_regression.joblib
+++ b/simple_regression.joblib