diff --git a/capstone_project.ipynb b/capstone_project.ipynb
new file mode 100644
index 0000000..a1a556e
--- /dev/null
+++ b/capstone_project.ipynb
@@ -0,0 +1,1890 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "58dfeab5",
+ "metadata": {},
+ "source": [
+ "# Working title: working subtitle\n",
+ "\n",
+ "## initial remarks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "fe05b4a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "39a4ce3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('~/python_class/DOB_Sustainability_Compliance_Map__Local_Law_33.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0e97c85",
+ "metadata": {},
+ "source": [
+ "## Part 1: Data Exploration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "6b430c20",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(21681, 11)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "917a6779",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Building_Class | \n",
+ " Tax_Class | \n",
+ " Building_Count | \n",
+ " DOF_Gross_Square_Footage | \n",
+ " Address | \n",
+ " BoroughName | \n",
+ " BBL | \n",
+ " ENERGY STAR Score | \n",
+ " LetterScore | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 10 | \n",
+ " Y4 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 2598091 | \n",
+ " 920 GRESHAM ROAD | \n",
+ " MANHATTAN | \n",
+ " 1000010010 | \n",
+ " 1 | \n",
+ " D | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 23 | \n",
+ " T2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 24346 | \n",
+ " 20 SOUTH STREET | \n",
+ " MANHATTAN | \n",
+ " 1000020023 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 7501 | \n",
+ " R0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2542563 | \n",
+ " 1 WATER STREET | \n",
+ " MANHATTAN | \n",
+ " 1000047501 | \n",
+ " 61 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Building_Class Tax_Class Building_Count \\\n",
+ "0 1 10 Y4 0 124 \n",
+ "1 2 23 T2 0 1 \n",
+ "2 4 7501 R0 2 1 \n",
+ "\n",
+ " DOF_Gross_Square_Footage Address BoroughName BBL \\\n",
+ "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n",
+ "1 24346 20 SOUTH STREET MANHATTAN 1000020023 \n",
+ "2 2542563 1 WATER STREET MANHATTAN 1000047501 \n",
+ "\n",
+ " ENERGY STAR Score LetterScore \n",
+ "0 1 D \n",
+ "1 0 F \n",
+ "2 61 C "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "38d0ac47",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Block', 'Lot', 'Building_Class', 'Tax_Class', 'Building_Count',\n",
+ " 'DOF_Gross_Square_Footage', 'Address', 'BoroughName', 'BBL',\n",
+ " 'ENERGY STAR Score', 'LetterScore'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "adf4092b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Columns seem to be self-explanatory, except BBL. According to NYC OpenData:\n",
+ "# \"Borough Block and Lot identifier as assigned by NYC Department of Finance\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "276d9619",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "MANHATTAN 7858\n",
+ "BROOKLYN 5469\n",
+ "BRONX 4349\n",
+ "QUEENS 3659\n",
+ "STATEN ISLAND 346\n",
+ "Name: BoroughName, dtype: int64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Is this dataset citywide? or just Manhattan?\n",
+ "\n",
+ "df['BoroughName'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d3c8c305",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Block 0\n",
+ "Lot 0\n",
+ "Building_Class 2\n",
+ "Tax_Class 0\n",
+ "Building_Count 0\n",
+ "DOF_Gross_Square_Footage 0\n",
+ "Address 7\n",
+ "BoroughName 0\n",
+ "BBL 0\n",
+ "ENERGY STAR Score 0\n",
+ "LetterScore 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Missing data?\n",
+ "\n",
+ "df.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "64eb852e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Building_Class | \n",
+ " Tax_Class | \n",
+ " Building_Count | \n",
+ " DOF_Gross_Square_Footage | \n",
+ " Address | \n",
+ " BoroughName | \n",
+ " BBL | \n",
+ " ENERGY STAR Score | \n",
+ " LetterScore | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4254 | \n",
+ " 1595 | \n",
+ " 7501 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1330 5 AVENUE | \n",
+ " MANHATTAN | \n",
+ " 1015950031 | \n",
+ " 64 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 8124 | \n",
+ " 3016 | \n",
+ " 7502 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1926 LONGFELLOW AVENUE | \n",
+ " BRONX | \n",
+ " 2030160038 | \n",
+ " 100 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Building_Class Tax_Class Building_Count \\\n",
+ "4254 1595 7501 NaN 0 0 \n",
+ "8124 3016 7502 NaN 0 0 \n",
+ "\n",
+ " DOF_Gross_Square_Footage Address BoroughName \\\n",
+ "4254 0 1330 5 AVENUE MANHATTAN \n",
+ "8124 0 1926 LONGFELLOW AVENUE BRONX \n",
+ "\n",
+ " BBL ENERGY STAR Score LetterScore \n",
+ "4254 1015950031 64 C \n",
+ "8124 2030160038 100 A "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df['Building_Class'].isna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "cdf678d2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Building_Class | \n",
+ " Tax_Class | \n",
+ " Building_Count | \n",
+ " DOF_Gross_Square_Footage | \n",
+ " Address | \n",
+ " BoroughName | \n",
+ " BBL | \n",
+ " ENERGY STAR Score | \n",
+ " LetterScore | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1228 | \n",
+ " 506 | \n",
+ " 12 | \n",
+ " W3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 49475 | \n",
+ " NaN | \n",
+ " MANHATTAN | \n",
+ " 1005060012 | \n",
+ " 10 | \n",
+ " D | \n",
+ "
\n",
+ " \n",
+ " 7145 | \n",
+ " 1734 | \n",
+ " 1 | \n",
+ " I1 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 1017118 | \n",
+ " NaN | \n",
+ " MANHATTAN | \n",
+ " 1017340001 | \n",
+ " 7 | \n",
+ " D | \n",
+ "
\n",
+ " \n",
+ " 9225 | \n",
+ " 2758 | \n",
+ " 6 | \n",
+ " N9 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 17200 | \n",
+ " NaN | \n",
+ " BRONX | \n",
+ " 2027580006 | \n",
+ " 89 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " 9226 | \n",
+ " 2758 | \n",
+ " 36 | \n",
+ " N9 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 37060 | \n",
+ " NaN | \n",
+ " BRONX | \n",
+ " 2027580036 | \n",
+ " 66 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 13711 | \n",
+ " 1769 | \n",
+ " 72 | \n",
+ " C1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 30720 | \n",
+ " NaN | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 15056 | \n",
+ " 1602 | \n",
+ " 13 | \n",
+ " C1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 14720 | \n",
+ " NaN | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 16381 | \n",
+ " 3755 | \n",
+ " 22 | \n",
+ " C1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 25564 | \n",
+ " NaN | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Building_Class Tax_Class Building_Count \\\n",
+ "1228 506 12 W3 0 1 \n",
+ "7145 1734 1 I1 0 5 \n",
+ "9225 2758 6 N9 0 1 \n",
+ "9226 2758 36 N9 0 1 \n",
+ "13711 1769 72 C1 0 1 \n",
+ "15056 1602 13 C1 0 1 \n",
+ "16381 3755 22 C1 0 1 \n",
+ "\n",
+ " DOF_Gross_Square_Footage Address BoroughName BBL \\\n",
+ "1228 49475 NaN MANHATTAN 1005060012 \n",
+ "7145 1017118 NaN MANHATTAN 1017340001 \n",
+ "9225 17200 NaN BRONX 2027580006 \n",
+ "9226 37060 NaN BRONX 2027580036 \n",
+ "13711 30720 NaN BROOKLYN -2147483648 \n",
+ "15056 14720 NaN BROOKLYN -2147483648 \n",
+ "16381 25564 NaN BROOKLYN -2147483648 \n",
+ "\n",
+ " ENERGY STAR Score LetterScore \n",
+ "1228 10 D \n",
+ "7145 7 D \n",
+ "9225 89 A \n",
+ "9226 66 C \n",
+ "13711 0 F \n",
+ "15056 0 F \n",
+ "16381 0 F "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df['Address'].isna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "e205df03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
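+ "# Missing Address is not a big deal because the other columns are populated for those rows\n",
+ "# (although BBL shows -2147483648 for the three Brooklyn rows, which looks like a placeholder rather than a real ID).\n",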
+ "# But missing Building Class could be significant.\n",
+ "\n",
+ "# The two offending rows also have Building Count = 0.\n",
+ "# How is that possible, since they have Energy Star scores?\n",
+ "\n",
+ "# In the next section we may decide to drop those two rows."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d539a8c",
+ "metadata": {},
+ "source": [
+ "## Part 2: Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "614dbd9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Block int64\n",
+ "Lot int64\n",
+ "Building_Class object\n",
+ "Tax_Class int64\n",
+ "Building_Count int64\n",
+ "DOF_Gross_Square_Footage int64\n",
+ "Address object\n",
+ "BoroughName object\n",
+ "BBL int64\n",
+ "ENERGY STAR Score int64\n",
+ "LetterScore object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Which columns are informative?\n",
+ "\n",
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "6c58a084",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Interesting for analysis:\n",
+ "\n",
+ "# DOF_Gross_Square_Footage\n",
+ "# ENERGY STAR Score\n",
+ "# LetterScore\n",
+ "\n",
+ "# Other columns are less interesting:\n",
+ "\n",
+ "# Building_Count is the number of buildings in one Block.\n",
+ "# A Block can have more than one Lot, but a Lot only has one Block.\n",
+ "# Block, Lot and BBL are identifiers assigned by the city.\n",
+ "\n",
+ "# A good visual reference is the Digital Tax Map put out by the NYC Department of Finance:\n",
+ "# http://gis.nyc.gov/taxmap/map.htm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "14213bd2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Can any identifiers be used as an index?\n",
+ "\n",
+ "df['Block'].is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "1e1a5e9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Lot'].is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "67b7f633",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['BBL'].is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c4469ca8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Since their values are not unique, they cannot be used as an index."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "782190b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Shall we rename or discard any columns from this dataset?\n",
+ "\n",
+ "# BBL could be eliminated. However, there are only 11 columns total, and since df.head() is easily readable on my monitor without scrolling horizontally (as you're doing now), I see no harm in keeping it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "e085ba33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename columns containing whitespace or camelcase\n",
+ "\n",
+ "df.rename(columns = {\"BoroughName\": \"Borough_Name\",\n",
+ " \"ENERGY STAR Score\": \"Energy_Star_Score\",\n",
+ " \"LetterScore\": \"Letter_Score\"\n",
+ " }, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "c4a8ebb7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Building_Class | \n",
+ " Tax_Class | \n",
+ " Building_Count | \n",
+ " DOF_Gross_Square_Footage | \n",
+ " Address | \n",
+ " Borough_Name | \n",
+ " BBL | \n",
+ " Energy_Star_Score | \n",
+ " Letter_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 10 | \n",
+ " Y4 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 2598091 | \n",
+ " 920 GRESHAM ROAD | \n",
+ " MANHATTAN | \n",
+ " 1000010010 | \n",
+ " 1 | \n",
+ " D | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Building_Class Tax_Class Building_Count \\\n",
+ "0 1 10 Y4 0 124 \n",
+ "\n",
+ " DOF_Gross_Square_Footage Address Borough_Name BBL \\\n",
+ "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n",
+ "\n",
+ " Energy_Star_Score Letter_Score \n",
+ "0 1 D "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "38de98e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Unforeseen consequence of renaming: now I have to scroll horizontally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "c0b5504f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename columns once more\n",
+ "\n",
+ "df.rename(columns = {\"DOF_Gross_Square_Footage\": \"Sq_Footage\",\n",
+ " \"Energy_Star_Score\": \"Energy_Score\",\n",
+ " \"Borough_Name\": \"Borough\",\n",
+ " \"Building_Class\": \"Bldg_Class\",\n",
+ " \"Building_Count\": \"Bldg_Count\"\n",
+ " }, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "0d3cf300",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ " Letter_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 10 | \n",
+ " Y4 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 2598091 | \n",
+ " 920 GRESHAM ROAD | \n",
+ " MANHATTAN | \n",
+ " 1000010010 | \n",
+ " 1 | \n",
+ " D | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage Address \\\n",
+ "0 1 10 Y4 0 124 2598091 920 GRESHAM ROAD \n",
+ "\n",
+ " Borough BBL Energy_Score Letter_Score \n",
+ "0 MANHATTAN 1000010010 1 D "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "c1c2e027",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Block 0\n",
+ "Lot 0\n",
+ "Bldg_Class 2\n",
+ "Tax_Class 0\n",
+ "Bldg_Count 0\n",
+ "Sq_Footage 0\n",
+ "Address 7\n",
+ "Borough 0\n",
+ "BBL 0\n",
+ "Energy_Score 0\n",
+ "Letter_Score 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Repeat the search for missing data\n",
+ "\n",
+ "df.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "5debf1d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Ignore the 7 missing addresses, but drop the 2 rows with missing Building Class.\n",
+ "# Building Class is a feature that will be used in the df.groupby() function.\n",
+ "\n",
+ "df.dropna(subset = ['Bldg_Class'], inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "5d2eb339",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Block 0\n",
+ "Lot 0\n",
+ "Bldg_Class 0\n",
+ "Tax_Class 0\n",
+ "Bldg_Count 0\n",
+ "Sq_Footage 0\n",
+ "Address 7\n",
+ "Borough 0\n",
+ "BBL 0\n",
+ "Energy_Score 0\n",
+ "Letter_Score 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "3b5525b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Search for unexpected data\n",
+ "\n",
+ "# df['Energy_Score'].min() # looks good\n",
+ "# df['Energy_Score'].max() # looks good\n",
+ "# df['Sq_Footage'].max() # looks good\n",
+ "df['Sq_Footage'].min()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "c1f3edc4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ "
\n",
+ " \n",
+ " Letter_Score | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " A | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " C | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " D | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " F | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
+ "Letter_Score \n",
+ "A 3 3 3 3 3 3 \n",
+ "B 1 1 1 1 1 1 \n",
+ "C 5 5 5 5 5 5 \n",
+ "D 6 6 6 6 6 6 \n",
+ "F 14 14 14 14 14 14 \n",
+ "\n",
+ " Address Borough BBL Energy_Score \n",
+ "Letter_Score \n",
+ "A 3 3 3 3 \n",
+ "B 1 1 1 1 \n",
+ "C 5 5 5 5 \n",
+ "D 6 6 6 6 \n",
+ "F 14 14 14 14 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# A building cannot have zero square feet of floorspace.\n",
+ "# What's going on?\n",
+ "\n",
+ "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "e27467ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The ones with F can be explained:\n",
+ "# An F grade means that the building owner \"didn’t submit required benchmarking information\",\n",
+ "# according to Local Law 95 of 2019. So it's not that the building has no square footage,\n",
+ "# but that the data was not submitted. Thus the failing grade.\n",
+ "\n",
+ "# We'll leave 0 square feet with F grade untouched.\n",
+ "\n",
+ "# For more information, see https://www1.nyc.gov/site/buildings/codes/benchmarking.page"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "b73e15d9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ "
\n",
+ " \n",
+ " Letter_Score | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " A | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " C | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " D | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
+ "Letter_Score \n",
+ "A 3 3 3 3 3 3 \n",
+ "B 1 1 1 1 1 1 \n",
+ "C 5 5 5 5 5 5 \n",
+ "D 6 6 6 6 6 6 \n",
+ "\n",
+ " Address Borough BBL Energy_Score \n",
+ "Letter_Score \n",
+ "A 3 3 3 3 \n",
+ "B 1 1 1 1 \n",
+ "C 5 5 5 5 \n",
+ "D 6 6 6 6 "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# What to do with the others?\n",
+ "\n",
+ "df[(df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')].groupby(['Letter_Score']).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "2d9bea8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 15 rows remain with 0 square feet of floorspace.\n",
+ "# Can we impute values from the mean square footage for each grade?\n",
+ "\n",
+ "# (There must be an elegant way to do this. What you see below is not.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "6eb73792",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Sq_Footage | \n",
+ "
\n",
+ " \n",
+ " Letter_Score | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " A | \n",
+ " 111197.291071 | \n",
+ "
\n",
+ " \n",
+ " B | \n",
+ " 133270.963702 | \n",
+ "
\n",
+ " \n",
+ " C | \n",
+ " 128833.575964 | \n",
+ "
\n",
+ " \n",
+ " D | \n",
+ " 108170.778312 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Sq_Footage\n",
+ "Letter_Score \n",
+ "A 111197.291071\n",
+ "B 133270.963702\n",
+ "C 128833.575964\n",
+ "D 108170.778312"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First, get averages\n",
+ "\n",
+ "subset0 = df[['Letter_Score', 'Sq_Footage']]\n",
+ "subset1 = subset0[(subset0['Letter_Score'] != 'F') & (subset0['Sq_Footage'] != 0)]\n",
+ "subset1.groupby(['Letter_Score']).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "8666f836",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assign variables, rounding to whole numbers\n",
+ "# The means are derived from subset1 (non-F rows with non-zero Sq_Footage, built in the\n",
+ "# previous cell) rather than hand-copied, so they stay correct if the data changes.\n",
+ "\n",
+ "grade_means = subset1.groupby('Letter_Score')['Sq_Footage'].mean().round().astype(int)\n",
+ "\n",
+ "mean_A = grade_means['A']\n",
+ "mean_B = grade_means['B']\n",
+ "mean_C = grade_means['C']\n",
+ "mean_D = grade_means['D']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "73900853",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace 0 values with mean_A, mean_B, etc.\n",
+ "# One loop instead of four copy-pasted assignments; F grades are deliberately left at 0.\n",
+ "\n",
+ "for grade, grade_mean in {'A': mean_A, 'B': mean_B, 'C': mean_C, 'D': mean_D}.items():\n",
+ "    missing_sq_footage = (df['Letter_Score'] == grade) & (df['Sq_Footage'] == 0)\n",
+ "    df.loc[missing_sq_footage, 'Sq_Footage'] = grade_mean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "37eb13ea",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ "
\n",
+ " \n",
+ " Letter_Score | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " F | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
+ "Letter_Score \n",
+ "F 14 14 14 14 14 14 \n",
+ "\n",
+ " Address Borough BBL Energy_Score \n",
+ "Letter_Score \n",
+ "F 14 14 14 14 "
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Now the only 0 values should be for F grades\n",
+ "\n",
+ "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "913b2ae6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Block int64\n",
+ "Lot int64\n",
+ "Bldg_Class object\n",
+ "Tax_Class int64\n",
+ "Bldg_Count int64\n",
+ "Sq_Footage int64\n",
+ "Address object\n",
+ "Borough object\n",
+ "BBL int64\n",
+ "Energy_Score int64\n",
+ "Letter_Score object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# delete this cell\n",
+ "\n",
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "007f1189",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ " Letter_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 11319 | \n",
+ " 149 | \n",
+ " 7502 | \n",
+ " U7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 138 WILLOUGHBY STREET | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 11611 | \n",
+ " 165 | \n",
+ " 7504 | \n",
+ " U7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 133271 | \n",
+ " 35 HOYT STREET | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 75 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " 13351 | \n",
+ " 5804 | \n",
+ " 2 | \n",
+ " U6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " COLONIAL ROAD | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 14570 | \n",
+ " 5322 | \n",
+ " 4 | \n",
+ " V1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111197 | \n",
+ " 23 OCEAN PARKWAY | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 100 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ " 14668 | \n",
+ " 5799 | \n",
+ " 59 | \n",
+ " D9 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 38315 | \n",
+ " 3641 JOHNSON AVENUE | \n",
+ " BRONX | \n",
+ " 2057990059 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " 15726 | \n",
+ " 4282 | \n",
+ " 100 | \n",
+ " V1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 25-70 REAR WHITESTONE EXPRESSWAY SR WEST | \n",
+ " QUEENS | \n",
+ " -2147483648 | \n",
+ " 0 | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
+ "11319 149 7502 U7 0 0 0 \n",
+ "11611 165 7504 U7 0 0 133271 \n",
+ "13351 5804 2 U6 0 0 0 \n",
+ "14570 5322 4 V1 0 0 111197 \n",
+ "14668 5799 59 D9 0 0 38315 \n",
+ "15726 4282 100 V1 0 0 0 \n",
+ "\n",
+ " Address Borough BBL \\\n",
+ "11319 138 WILLOUGHBY STREET BROOKLYN -2147483648 \n",
+ "11611 35 HOYT STREET BROOKLYN -2147483648 \n",
+ "13351 COLONIAL ROAD BROOKLYN -2147483648 \n",
+ "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 \n",
+ "14668 3641 JOHNSON AVENUE BRONX 2057990059 \n",
+ "15726 25-70 REAR WHITESTONE EXPRESSWAY SR WEST QUEENS -2147483648 \n",
+ "\n",
+ " Energy_Score Letter_Score \n",
+ "11319 0 F \n",
+ "11611 75 B \n",
+ "13351 0 F \n",
+ "14570 100 A \n",
+ "14668 0 F \n",
+ "15726 0 F "
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Unexpected values, continued\n",
+ "\n",
+ "df[df['Bldg_Count'] == 0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "id": "3479152f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Block | \n",
+ " Lot | \n",
+ " Bldg_Class | \n",
+ " Tax_Class | \n",
+ " Bldg_Count | \n",
+ " Sq_Footage | \n",
+ " Address | \n",
+ " Borough | \n",
+ " BBL | \n",
+ " Energy_Score | \n",
+ " Letter_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 11611 | \n",
+ " 165 | \n",
+ " 7504 | \n",
+ " U7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 133271 | \n",
+ " 35 HOYT STREET | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 75 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " 14570 | \n",
+ " 5322 | \n",
+ " 4 | \n",
+ " V1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111197 | \n",
+ " 23 OCEAN PARKWAY | \n",
+ " BROOKLYN | \n",
+ " -2147483648 | \n",
+ " 100 | \n",
+ " A | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
+ "11611 165 7504 U7 0 0 133271 \n",
+ "14570 5322 4 V1 0 0 111197 \n",
+ "\n",
+ " Address Borough BBL Energy_Score Letter_Score \n",
+ "11611 35 HOYT STREET BROOKLYN -2147483648 75 B \n",
+ "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 100 A "
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# How can a block have zero buildings?\n",
+ "# Again, we'll leave the F grades as is.\n",
+ "\n",
+ "df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "id": "6adfd92a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Have a peek at the Department of Finance Tax Map: http://gis.nyc.gov/taxmap/map.htm\n",
+ "\n",
+ "# Looks like Bldg_Count = 1 for both. However, I'm not comfortable with imputing data\n",
+ "# by eyeballing it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "id": "7503be1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's just drop them. Select the rows by condition (zero buildings, non-F grade) rather than\n",
+ "# by hard-coded index labels, so the cell is safe to re-run and does not depend on row order.\n",
+ "\n",
+ "df.drop(df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')].index, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ab6ecc85",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}