1891 lines
53 KiB
Plaintext
1891 lines
53 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "58dfeab5",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Working title: working subtitle\n",
|
||
"\n",
|
||
"## Initial remarks"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "fe05b4a4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import seaborn as sns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "39a4ce3f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = pd.read_csv('~/python_class/DOB_Sustainability_Compliance_Map__Local_Law_33.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e0e97c85",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Part 1: Data Exploration"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "6b430c20",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(21681, 11)"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "917a6779",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Building_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Building_Count</th>\n",
|
||
" <th>DOF_Gross_Square_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>BoroughName</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>ENERGY STAR Score</th>\n",
|
||
" <th>LetterScore</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Y4</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>124</td>\n",
|
||
" <td>2598091</td>\n",
|
||
" <td>920 GRESHAM ROAD</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1000010010</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>T2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>24346</td>\n",
|
||
" <td>20 SOUTH STREET</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1000020023</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7501</td>\n",
|
||
" <td>R0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2542563</td>\n",
|
||
" <td>1 WATER STREET</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1000047501</td>\n",
|
||
" <td>61</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Building_Class Tax_Class Building_Count \\\n",
|
||
"0 1 10 Y4 0 124 \n",
|
||
"1 2 23 T2 0 1 \n",
|
||
"2 4 7501 R0 2 1 \n",
|
||
"\n",
|
||
" DOF_Gross_Square_Footage Address BoroughName BBL \\\n",
|
||
"0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n",
|
||
"1 24346 20 SOUTH STREET MANHATTAN 1000020023 \n",
|
||
"2 2542563 1 WATER STREET MANHATTAN 1000047501 \n",
|
||
"\n",
|
||
" ENERGY STAR Score LetterScore \n",
|
||
"0 1 D \n",
|
||
"1 0 F \n",
|
||
"2 61 C "
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head(3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "38d0ac47",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['Block', 'Lot', 'Building_Class', 'Tax_Class', 'Building_Count',\n",
|
||
" 'DOF_Gross_Square_Footage', 'Address', 'BoroughName', 'BBL',\n",
|
||
" 'ENERGY STAR Score', 'LetterScore'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "adf4092b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Columns seem to be self-explanatory, except BBL. According to NYC OpenData:\n",
|
||
"# \"Borough Block and Lot identifier as assigned by NYC Department of Finance\"."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "276d9619",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"MANHATTAN 7858\n",
|
||
"BROOKLYN 5469\n",
|
||
"BRONX 4349\n",
|
||
"QUEENS 3659\n",
|
||
"STATEN ISLAND 346\n",
|
||
"Name: BoroughName, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Is this dataset citywide? or just Manhattan?\n",
|
||
"\n",
|
||
"df['BoroughName'].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d3c8c305",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Block 0\n",
|
||
"Lot 0\n",
|
||
"Building_Class 2\n",
|
||
"Tax_Class 0\n",
|
||
"Building_Count 0\n",
|
||
"DOF_Gross_Square_Footage 0\n",
|
||
"Address 7\n",
|
||
"BoroughName 0\n",
|
||
"BBL 0\n",
|
||
"ENERGY STAR Score 0\n",
|
||
"LetterScore 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Missing data?\n",
|
||
"\n",
|
||
"df.isna().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "64eb852e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Building_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Building_Count</th>\n",
|
||
" <th>DOF_Gross_Square_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>BoroughName</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>ENERGY STAR Score</th>\n",
|
||
" <th>LetterScore</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>4254</th>\n",
|
||
" <td>1595</td>\n",
|
||
" <td>7501</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1330 5 AVENUE</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1015950031</td>\n",
|
||
" <td>64</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8124</th>\n",
|
||
" <td>3016</td>\n",
|
||
" <td>7502</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1926 LONGFELLOW AVENUE</td>\n",
|
||
" <td>BRONX</td>\n",
|
||
" <td>2030160038</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>A</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Building_Class Tax_Class Building_Count \\\n",
|
||
"4254 1595 7501 NaN 0 0 \n",
|
||
"8124 3016 7502 NaN 0 0 \n",
|
||
"\n",
|
||
" DOF_Gross_Square_Footage Address BoroughName \\\n",
|
||
"4254 0 1330 5 AVENUE MANHATTAN \n",
|
||
"8124 0 1926 LONGFELLOW AVENUE BRONX \n",
|
||
"\n",
|
||
" BBL ENERGY STAR Score LetterScore \n",
|
||
"4254 1015950031 64 C \n",
|
||
"8124 2030160038 100 A "
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df[df['Building_Class'].isna()]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "cdf678d2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Building_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Building_Count</th>\n",
|
||
" <th>DOF_Gross_Square_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>BoroughName</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>ENERGY STAR Score</th>\n",
|
||
" <th>LetterScore</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1228</th>\n",
|
||
" <td>506</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>W3</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>49475</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1005060012</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7145</th>\n",
|
||
" <td>1734</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>I1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1017118</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1017340001</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9225</th>\n",
|
||
" <td>2758</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>N9</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>17200</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>BRONX</td>\n",
|
||
" <td>2027580006</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>A</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9226</th>\n",
|
||
" <td>2758</td>\n",
|
||
" <td>36</td>\n",
|
||
" <td>N9</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>37060</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>BRONX</td>\n",
|
||
" <td>2027580036</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13711</th>\n",
|
||
" <td>1769</td>\n",
|
||
" <td>72</td>\n",
|
||
" <td>C1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>30720</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15056</th>\n",
|
||
" <td>1602</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>C1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>14720</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16381</th>\n",
|
||
" <td>3755</td>\n",
|
||
" <td>22</td>\n",
|
||
" <td>C1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>25564</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Building_Class Tax_Class Building_Count \\\n",
|
||
"1228 506 12 W3 0 1 \n",
|
||
"7145 1734 1 I1 0 5 \n",
|
||
"9225 2758 6 N9 0 1 \n",
|
||
"9226 2758 36 N9 0 1 \n",
|
||
"13711 1769 72 C1 0 1 \n",
|
||
"15056 1602 13 C1 0 1 \n",
|
||
"16381 3755 22 C1 0 1 \n",
|
||
"\n",
|
||
" DOF_Gross_Square_Footage Address BoroughName BBL \\\n",
|
||
"1228 49475 NaN MANHATTAN 1005060012 \n",
|
||
"7145 1017118 NaN MANHATTAN 1017340001 \n",
|
||
"9225 17200 NaN BRONX 2027580006 \n",
|
||
"9226 37060 NaN BRONX 2027580036 \n",
|
||
"13711 30720 NaN BROOKLYN -2147483648 \n",
|
||
"15056 14720 NaN BROOKLYN -2147483648 \n",
|
||
"16381 25564 NaN BROOKLYN -2147483648 \n",
|
||
"\n",
|
||
" ENERGY STAR Score LetterScore \n",
|
||
"1228 10 D \n",
|
||
"7145 7 D \n",
|
||
"9225 89 A \n",
|
||
"9226 66 C \n",
|
||
"13711 0 F \n",
|
||
"15056 0 F \n",
|
||
"16381 0 F "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df[df['Address'].isna()]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "e205df03",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Missing Address is not a big deal because the rest of the values are complete.\n",
|
||
"# But missing Building Class could be significant.\n",
|
||
"\n",
|
||
"# The two offending rows also have Building Count = 0.\n",
|
||
"# How is that possible, since they have Energy Star scores?\n",
|
||
"\n",
|
||
"# In the next section we may decide to drop those two rows."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4d539a8c",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Part 2: Data Cleaning"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "614dbd9f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Block int64\n",
|
||
"Lot int64\n",
|
||
"Building_Class object\n",
|
||
"Tax_Class int64\n",
|
||
"Building_Count int64\n",
|
||
"DOF_Gross_Square_Footage int64\n",
|
||
"Address object\n",
|
||
"BoroughName object\n",
|
||
"BBL int64\n",
|
||
"ENERGY STAR Score int64\n",
|
||
"LetterScore object\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Which columns are informative?\n",
|
||
"\n",
|
||
"df.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "6c58a084",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Interesting for analysis:\n",
|
||
"\n",
|
||
"# DOF_Gross_Square_Footage\n",
|
||
"# ENERGY STAR Score\n",
|
||
"# LetterScore\n",
|
||
"\n",
|
||
"# Other columns are less interesting:\n",
|
||
"\n",
|
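||
"# Building_Count appears to be the number of buildings on the row's tax lot (each row is one Borough-Block-Lot), not in a whole Block -- TODO: confirm against the NYC OpenData data dictionary.\n",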
|
||
"# A Block can have more than one Lot, but a Lot only has one Block.\n",
|
||
"# Block, Lot and BBL are identifiers assigned by the city.\n",
|
||
"\n",
|
||
"# A good visual reference is the Digital Tax Map put out by the NYC Department of Finance:\n",
|
||
"# http://gis.nyc.gov/taxmap/map.htm"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "14213bd2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Can any identifiers be used as an index?\n",
|
||
"\n",
|
||
"df['Block'].is_unique"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "1e1a5e9b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df['Lot'].is_unique"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "67b7f633",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df['BBL'].is_unique"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "c4469ca8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
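||
"# Since their values are not unique, none of them can serve as a unique row index.\n",
"# (BBL is non-unique at least in part because several rows share the invalid value -2147483648;\n",
"# see the rows with missing Address above.)"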
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "782190b5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Shall we rename or discard any columns from this dataset?\n",
|
||
"\n",
|
||
"# BBL could be eliminated. However, there are only 11 columns total, and since df.head() is easily readable on my monitor without scrolling horizontally (as you're doing now), I see no harm in keeping it."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "e085ba33",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Rename columns containing whitespace or camelcase\n",
|
||
"\n",
|
||
"df.rename(columns = {\"BoroughName\": \"Borough_Name\",\n",
|
||
" \"ENERGY STAR Score\": \"Energy_Star_Score\",\n",
|
||
" \"LetterScore\": \"Letter_Score\"\n",
|
||
" }, inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "c4a8ebb7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Building_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Building_Count</th>\n",
|
||
" <th>DOF_Gross_Square_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough_Name</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Star_Score</th>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Y4</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>124</td>\n",
|
||
" <td>2598091</td>\n",
|
||
" <td>920 GRESHAM ROAD</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1000010010</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Building_Class Tax_Class Building_Count \\\n",
|
||
"0 1 10 Y4 0 124 \n",
|
||
"\n",
|
||
" DOF_Gross_Square_Footage Address Borough_Name BBL \\\n",
|
||
"0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n",
|
||
"\n",
|
||
" Energy_Star_Score Letter_Score \n",
|
||
"0 1 D "
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head(1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "38de98e9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Unforeseen consequence of renaming: now I have to scroll horizontally."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "c0b5504f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Rename columns once more\n",
|
||
"\n",
|
||
"df.rename(columns = {\"DOF_Gross_Square_Footage\": \"Sq_Footage\",\n",
|
||
" \"Energy_Star_Score\": \"Energy_Score\",\n",
|
||
" \"Borough_Name\": \"Borough\",\n",
|
||
" \"Building_Class\": \"Bldg_Class\",\n",
|
||
" \"Building_Count\": \"Bldg_Count\"\n",
|
||
" }, inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "0d3cf300",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Y4</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>124</td>\n",
|
||
" <td>2598091</td>\n",
|
||
" <td>920 GRESHAM ROAD</td>\n",
|
||
" <td>MANHATTAN</td>\n",
|
||
" <td>1000010010</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage Address \\\n",
|
||
"0 1 10 Y4 0 124 2598091 920 GRESHAM ROAD \n",
|
||
"\n",
|
||
" Borough BBL Energy_Score Letter_Score \n",
|
||
"0 MANHATTAN 1000010010 1 D "
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head(1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "c1c2e027",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Block 0\n",
|
||
"Lot 0\n",
|
||
"Bldg_Class 2\n",
|
||
"Tax_Class 0\n",
|
||
"Bldg_Count 0\n",
|
||
"Sq_Footage 0\n",
|
||
"Address 7\n",
|
||
"Borough 0\n",
|
||
"BBL 0\n",
|
||
"Energy_Score 0\n",
|
||
"Letter_Score 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Repeat the search for missing data\n",
|
||
"\n",
|
||
"df.isna().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "5debf1d6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Ignore the 7 missing addresses, but drop the 2 rows with missing Building Class.\n",
|
||
"# Building Class is a feature that will be used in the df.groupby() function.\n",
|
||
"\n",
|
||
"df.dropna(subset = ['Bldg_Class'], inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "5d2eb339",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Block 0\n",
|
||
"Lot 0\n",
|
||
"Bldg_Class 0\n",
|
||
"Tax_Class 0\n",
|
||
"Bldg_Count 0\n",
|
||
"Sq_Footage 0\n",
|
||
"Address 7\n",
|
||
"Borough 0\n",
|
||
"BBL 0\n",
|
||
"Energy_Score 0\n",
|
||
"Letter_Score 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.isna().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "3b5525b0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0"
|
||
]
|
||
},
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Search for unexpected data\n",
|
||
"\n",
|
||
"# df['Energy_Score'].min() # looks good\n",
|
||
"# df['Energy_Score'].max() # looks good\n",
|
||
"# df['Sq_Footage'].max() # looks good\n",
|
||
"df['Sq_Footage'].min()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "c1f3edc4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>A</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>B</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>C</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>D</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>F</th>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
|
||
"Letter_Score \n",
|
||
"A 3 3 3 3 3 3 \n",
|
||
"B 1 1 1 1 1 1 \n",
|
||
"C 5 5 5 5 5 5 \n",
|
||
"D 6 6 6 6 6 6 \n",
|
||
"F 14 14 14 14 14 14 \n",
|
||
"\n",
|
||
" Address Borough BBL Energy_Score \n",
|
||
"Letter_Score \n",
|
||
"A 3 3 3 3 \n",
|
||
"B 1 1 1 1 \n",
|
||
"C 5 5 5 5 \n",
|
||
"D 6 6 6 6 \n",
|
||
"F 14 14 14 14 "
|
||
]
|
||
},
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# A building cannot have zero square feet of floorspace.\n",
|
||
"# What's going on?\n",
|
||
"\n",
|
||
"df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "e27467ce",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# The ones with F can be explained:\n",
|
||
"# An F grade means that the building owner \"didn’t submit required benchmarking information\",\n",
|
||
"# according to Local Law 95 of 2019. So it's not that the building has no square footage,\n",
|
||
"# but that the data was not submitted. Thus the failing grade.\n",
|
||
"\n",
|
||
"# We'll leave rows with 0 square feet and an F grade untouched.\n",
|
||
"\n",
|
||
"# For more information, see https://www1.nyc.gov/site/buildings/codes/benchmarking.page"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "b73e15d9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>A</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>B</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>C</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>D</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
|
||
"Letter_Score \n",
|
||
"A 3 3 3 3 3 3 \n",
|
||
"B 1 1 1 1 1 1 \n",
|
||
"C 5 5 5 5 5 5 \n",
|
||
"D 6 6 6 6 6 6 \n",
|
||
"\n",
|
||
" Address Borough BBL Energy_Score \n",
|
||
"Letter_Score \n",
|
||
"A 3 3 3 3 \n",
|
||
"B 1 1 1 1 \n",
|
||
"C 5 5 5 5 \n",
|
||
"D 6 6 6 6 "
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# What to do with the others?\n",
|
||
"\n",
|
||
"df[(df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')].groupby(['Letter_Score']).count()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "2d9bea8f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 15 rows remain with 0 square feet of floorspace.\n",
|
||
"# Can we impute values from the mean square footage for each grade?\n",
|
||
"\n",
|
||
"# (There must be an elegant way to do this. What you see below is not.)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 67,
|
||
"id": "6eb73792",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>A</th>\n",
|
||
" <td>111197.291071</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>B</th>\n",
|
||
" <td>133270.963702</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>C</th>\n",
|
||
" <td>128833.575964</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>D</th>\n",
|
||
" <td>108170.778312</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Sq_Footage\n",
|
||
"Letter_Score \n",
|
||
"A 111197.291071\n",
|
||
"B 133270.963702\n",
|
||
"C 128833.575964\n",
|
||
"D 108170.778312"
|
||
]
|
||
},
|
||
"execution_count": 67,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# First, get averages\n",
|
||
"\n",
|
||
"# Keep only graded (non-F) rows that report a non-zero floor area, then average per grade\n",
|
||
"subset0 = df.loc[:, ['Letter_Score', 'Sq_Footage']]\n",
|
||
"is_graded = subset0['Letter_Score'].ne('F')\n",
|
||
"has_floor_area = subset0['Sq_Footage'].ne(0)\n",
|
||
"subset1 = subset0.loc[is_graded & has_floor_area]\n",
|
||
"subset1.groupby('Letter_Score').mean()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"id": "8666f836",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Assign variables, rounding to whole numbers\n",
|
||
"\n",
|
||
"# Derive the per-grade means from subset1 rather than retyping them from the printed table,\n",
|
||
"# so the values stay correct if the input data changes.\n",
|
||
"grade_means = subset1.groupby('Letter_Score')['Sq_Footage'].mean()\n",
|
||
"mean_A = int(round(grade_means['A']))\n",
|
||
"mean_B = int(round(grade_means['B']))\n",
|
||
"mean_C = int(round(grade_means['C']))\n",
|
||
"mean_D = int(round(grade_means['D']))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 79,
|
||
"id": "73900853",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Replace 0 values with mean_A, mean_B, etc.\n",
|
||
"\n",
|
||
"# One assignment per grade, driven by a grade -> mean lookup (A, B, C, D in that order)\n",
|
||
"mean_by_grade = {'A': mean_A, 'B': mean_B, 'C': mean_C, 'D': mean_D}\n",
|
||
"for grade, grade_mean in mean_by_grade.items():\n",
|
||
"    needs_imputing = (df['Letter_Score'] == grade) & (df['Sq_Footage'] == 0)\n",
|
||
"    df.loc[needs_imputing, 'Sq_Footage'] = grade_mean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 81,
|
||
"id": "37eb13ea",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>F</th>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>14</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
|
||
"Letter_Score \n",
|
||
"F 14 14 14 14 14 14 \n",
|
||
"\n",
|
||
" Address Borough BBL Energy_Score \n",
|
||
"Letter_Score \n",
|
||
"F 14 14 14 14 "
|
||
]
|
||
},
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Now the only 0 values should be for F grades\n",
|
||
"\n",
|
||
"df.loc[df['Sq_Footage'].eq(0)].groupby('Letter_Score').count()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 82,
|
||
"id": "913b2ae6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Block int64\n",
|
||
"Lot int64\n",
|
||
"Bldg_Class object\n",
|
||
"Tax_Class int64\n",
|
||
"Bldg_Count int64\n",
|
||
"Sq_Footage int64\n",
|
||
"Address object\n",
|
||
"Borough object\n",
|
||
"BBL int64\n",
|
||
"Energy_Score int64\n",
|
||
"Letter_Score object\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 82,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# delete this cell\n",
|
||
"\n",
|
||
"df.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 92,
|
||
"id": "007f1189",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>11319</th>\n",
|
||
" <td>149</td>\n",
|
||
" <td>7502</td>\n",
|
||
" <td>U7</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>138 WILLOUGHBY STREET</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11611</th>\n",
|
||
" <td>165</td>\n",
|
||
" <td>7504</td>\n",
|
||
" <td>U7</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>133271</td>\n",
|
||
" <td>35 HOYT STREET</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>B</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13351</th>\n",
|
||
" <td>5804</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>U6</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>COLONIAL ROAD</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14570</th>\n",
|
||
" <td>5322</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>V1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111197</td>\n",
|
||
" <td>23 OCEAN PARKWAY</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>A</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14668</th>\n",
|
||
" <td>5799</td>\n",
|
||
" <td>59</td>\n",
|
||
" <td>D9</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>38315</td>\n",
|
||
" <td>3641 JOHNSON AVENUE</td>\n",
|
||
" <td>BRONX</td>\n",
|
||
" <td>2057990059</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15726</th>\n",
|
||
" <td>4282</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>V1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>25-70 REAR WHITESTONE EXPRESSWAY SR WEST</td>\n",
|
||
" <td>QUEENS</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>F</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
|
||
"11319 149 7502 U7 0 0 0 \n",
|
||
"11611 165 7504 U7 0 0 133271 \n",
|
||
"13351 5804 2 U6 0 0 0 \n",
|
||
"14570 5322 4 V1 0 0 111197 \n",
|
||
"14668 5799 59 D9 0 0 38315 \n",
|
||
"15726 4282 100 V1 0 0 0 \n",
|
||
"\n",
|
||
" Address Borough BBL \\\n",
|
||
"11319 138 WILLOUGHBY STREET BROOKLYN -2147483648 \n",
|
||
"11611 35 HOYT STREET BROOKLYN -2147483648 \n",
|
||
"13351 COLONIAL ROAD BROOKLYN -2147483648 \n",
|
||
"14570 23 OCEAN PARKWAY BROOKLYN -2147483648 \n",
|
||
"14668 3641 JOHNSON AVENUE BRONX 2057990059 \n",
|
||
"15726 25-70 REAR WHITESTONE EXPRESSWAY SR WEST QUEENS -2147483648 \n",
|
||
"\n",
|
||
" Energy_Score Letter_Score \n",
|
||
"11319 0 F \n",
|
||
"11611 75 B \n",
|
||
"13351 0 F \n",
|
||
"14570 100 A \n",
|
||
"14668 0 F \n",
|
||
"15726 0 F "
|
||
]
|
||
},
|
||
"execution_count": 92,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Unexpected values, continued\n",
|
||
"\n",
|
||
"df.loc[df['Bldg_Count'].eq(0)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 93,
|
||
"id": "3479152f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Block</th>\n",
|
||
" <th>Lot</th>\n",
|
||
" <th>Bldg_Class</th>\n",
|
||
" <th>Tax_Class</th>\n",
|
||
" <th>Bldg_Count</th>\n",
|
||
" <th>Sq_Footage</th>\n",
|
||
" <th>Address</th>\n",
|
||
" <th>Borough</th>\n",
|
||
" <th>BBL</th>\n",
|
||
" <th>Energy_Score</th>\n",
|
||
" <th>Letter_Score</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>11611</th>\n",
|
||
" <td>165</td>\n",
|
||
" <td>7504</td>\n",
|
||
" <td>U7</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>133271</td>\n",
|
||
" <td>35 HOYT STREET</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>B</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14570</th>\n",
|
||
" <td>5322</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>V1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111197</td>\n",
|
||
" <td>23 OCEAN PARKWAY</td>\n",
|
||
" <td>BROOKLYN</td>\n",
|
||
" <td>-2147483648</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>A</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n",
|
||
"11611 165 7504 U7 0 0 133271 \n",
|
||
"14570 5322 4 V1 0 0 111197 \n",
|
||
"\n",
|
||
" Address Borough BBL Energy_Score Letter_Score \n",
|
||
"11611 35 HOYT STREET BROOKLYN -2147483648 75 B \n",
|
||
"14570 23 OCEAN PARKWAY BROOKLYN -2147483648 100 A "
|
||
]
|
||
},
|
||
"execution_count": 93,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# How can a lot have zero buildings?\n",
|
||
"# Again, we'll leave the F grades as is.\n",
|
||
"\n",
|
||
"df.loc[df['Bldg_Count'].eq(0) & df['Letter_Score'].ne('F')]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 94,
|
||
"id": "6adfd92a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Have a peek at the Department of Finance Tax Map: http://gis.nyc.gov/taxmap/map.htm\n",
|
||
"\n",
|
||
"# Looks like Bldg_Count = 1 for both. However, I'm not comfortable with imputing data\n",
|
||
"# by eyeballing it."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 95,
|
||
"id": "7503be1f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Let's just drop them\n",
|
||
"\n",
|
||
"# Select the rows by condition instead of hard-coded index labels: the cell then\n",
|
||
"# follows the data and can be re-run safely (a second run finds nothing to drop).\n",
|
||
"zero_bldg_graded = df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')].index\n",
|
||
"df.drop(zero_bldg_graded, inplace = True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ab6ecc85",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|