From cba44b0489fe277be41bcce98ed7c8c4f5e569b1 Mon Sep 17 00:00:00 2001 From: gyoza1 Date: Sun, 31 Jul 2022 16:36:13 -0400 Subject: [PATCH] about halfway through --- capstone_project.ipynb | 1890 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1890 insertions(+) create mode 100644 capstone_project.ipynb diff --git a/capstone_project.ipynb b/capstone_project.ipynb new file mode 100644 index 0000000..a1a556e --- /dev/null +++ b/capstone_project.ipynb @@ -0,0 +1,1890 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "58dfeab5", + "metadata": {}, + "source": [ + "# Working title: working subtitle\n", + "\n", + "## initial remarks" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fe05b4a4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "39a4ce3f", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('~/python_class/DOB_Sustainability_Compliance_Map__Local_Law_33.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "e0e97c85", + "metadata": {}, + "source": [ + "## Part 1: Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6b430c20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21681, 11)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "917a6779", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
1223T2012434620 SOUTH STREETMANHATTAN10000200230F
247501R02125425631 WATER STREETMANHATTAN100004750161C
\n", + "
" + ], + "text/plain": [ + " Block Lot Building_Class Tax_Class Building_Count \\\n", + "0 1 10 Y4 0 124 \n", + "1 2 23 T2 0 1 \n", + "2 4 7501 R0 2 1 \n", + "\n", + " DOF_Gross_Square_Footage Address BoroughName BBL \\\n", + "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n", + "1 24346 20 SOUTH STREET MANHATTAN 1000020023 \n", + "2 2542563 1 WATER STREET MANHATTAN 1000047501 \n", + "\n", + " ENERGY STAR Score LetterScore \n", + "0 1 D \n", + "1 0 F \n", + "2 61 C " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38d0ac47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Block', 'Lot', 'Building_Class', 'Tax_Class', 'Building_Count',\n", + " 'DOF_Gross_Square_Footage', 'Address', 'BoroughName', 'BBL',\n", + " 'ENERGY STAR Score', 'LetterScore'],\n", + " dtype='object')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "adf4092b", + "metadata": {}, + "outputs": [], + "source": [ + "# Columns seem to be self-explanatory, except BBL. According to NYC OpenData:\n", + "# \"Borough Block and Lot identifier as assigned by NYC Department of Finance\"." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "276d9619", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MANHATTAN 7858\n", + "BROOKLYN 5469\n", + "BRONX 4349\n", + "QUEENS 3659\n", + "STATEN ISLAND 346\n", + "Name: BoroughName, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Is this dataset citywide? 
or just Manhattan?\n", + "\n", + "df['BoroughName'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d3c8c305", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Block 0\n", + "Lot 0\n", + "Building_Class 2\n", + "Tax_Class 0\n", + "Building_Count 0\n", + "DOF_Gross_Square_Footage 0\n", + "Address 7\n", + "BoroughName 0\n", + "BBL 0\n", + "ENERGY STAR Score 0\n", + "LetterScore 0\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Missing data?\n", + "\n", + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64eb852e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
425415957501NaN0001330 5 AVENUEMANHATTAN101595003164C
812430167502NaN0001926 LONGFELLOW AVENUEBRONX2030160038100A
\n", + "
" + ], + "text/plain": [ + " Block Lot Building_Class Tax_Class Building_Count \\\n", + "4254 1595 7501 NaN 0 0 \n", + "8124 3016 7502 NaN 0 0 \n", + "\n", + " DOF_Gross_Square_Footage Address BoroughName \\\n", + "4254 0 1330 5 AVENUE MANHATTAN \n", + "8124 0 1926 LONGFELLOW AVENUE BRONX \n", + "\n", + " BBL ENERGY STAR Score LetterScore \n", + "4254 1015950031 64 C \n", + "8124 2030160038 100 A " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['Building_Class'].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cdf678d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
122850612W30149475NaNMANHATTAN100506001210D
714517341I1051017118NaNMANHATTAN10173400017D
922527586N90117200NaNBRONX202758000689A
9226275836N90137060NaNBRONX202758003666C
13711176972C10130720NaNBROOKLYN-21474836480F
15056160213C10114720NaNBROOKLYN-21474836480F
16381375522C10125564NaNBROOKLYN-21474836480F
\n", + "
" + ], + "text/plain": [ + " Block Lot Building_Class Tax_Class Building_Count \\\n", + "1228 506 12 W3 0 1 \n", + "7145 1734 1 I1 0 5 \n", + "9225 2758 6 N9 0 1 \n", + "9226 2758 36 N9 0 1 \n", + "13711 1769 72 C1 0 1 \n", + "15056 1602 13 C1 0 1 \n", + "16381 3755 22 C1 0 1 \n", + "\n", + " DOF_Gross_Square_Footage Address BoroughName BBL \\\n", + "1228 49475 NaN MANHATTAN 1005060012 \n", + "7145 1017118 NaN MANHATTAN 1017340001 \n", + "9225 17200 NaN BRONX 2027580006 \n", + "9226 37060 NaN BRONX 2027580036 \n", + "13711 30720 NaN BROOKLYN -2147483648 \n", + "15056 14720 NaN BROOKLYN -2147483648 \n", + "16381 25564 NaN BROOKLYN -2147483648 \n", + "\n", + " ENERGY STAR Score LetterScore \n", + "1228 10 D \n", + "7145 7 D \n", + "9225 89 A \n", + "9226 66 C \n", + "13711 0 F \n", + "15056 0 F \n", + "16381 0 F " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['Address'].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e205df03", + "metadata": {}, + "outputs": [], + "source": [ + "# Missing Address is not a big deal because the rest of the values are complete.\n", + "# But missing Building Class could be significant.\n", + "\n", + "# The two offending rows also have Building Count = 0.\n", + "# How is that possible, since they have Energy Star scores?\n", + "\n", + "# In the next section we may decide to drop those two rows." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4d539a8c", + "metadata": {}, + "source": [ + "## Part 2: Data Cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "614dbd9f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Block int64\n", + "Lot int64\n", + "Building_Class object\n", + "Tax_Class int64\n", + "Building_Count int64\n", + "DOF_Gross_Square_Footage int64\n", + "Address object\n", + "BoroughName object\n", + "BBL int64\n", + "ENERGY STAR Score int64\n", + "LetterScore object\n", + "dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Which columns are informative?\n", + "\n", + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6c58a084", + "metadata": {}, + "outputs": [], + "source": [ + "# Interesting for analysis:\n", + "\n", + "# DOF_Gross_Square_Footage\n", + "# ENERGY STAR Score\n", + "# LetterScore\n", + "\n", + "# Other columns are less interesting:\n", + "\n", + "# Building_Count is the number of buildings in one Block.\n", + "# A Block can have more than one Lot, but a Lot only has one Block.\n", + "# Block, Lot and BBL are identifiers assigned by the city.\n", + "\n", + "# A good visual reference is the Digital Tax Map put out by the NYC Department of Finance:\n", + "# http://gis.nyc.gov/taxmap/map.htm" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "14213bd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Can any identifiers be used as an index?\n", + "\n", + "df['Block'].is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1e1a5e9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "df['Lot'].is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "67b7f633", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['BBL'].is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c4469ca8", + "metadata": {}, + "outputs": [], + "source": [ + "# Since their values are not unique, they cannot be used as an index." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "782190b5", + "metadata": {}, + "outputs": [], + "source": [ + "# Shall we rename or discard any columns from this dataset?\n", + "\n", + "# BBL could be eliminated. However, there are only 11 columns total, and since df.head() is easily readable on my monitor without scrolling horizontally (as you're doing now), I see no harm in keeping it." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e085ba33", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename columns containing whitespace or camelcase\n", + "\n", + "df.rename(columns = {\"BoroughName\": \"Borough_Name\",\n", + " \"ENERGY STAR Score\": \"Energy_Star_Score\",\n", + " \"LetterScore\": \"Letter_Score\"\n", + " }, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c4a8ebb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBorough_NameBBLEnergy_Star_ScoreLetter_Score
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
\n", + "
" + ], + "text/plain": [ + " Block Lot Building_Class Tax_Class Building_Count \\\n", + "0 1 10 Y4 0 124 \n", + "\n", + " DOF_Gross_Square_Footage Address Borough_Name BBL \\\n", + "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n", + "\n", + " Energy_Star_Score Letter_Score \n", + "0 1 D " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "38de98e9", + "metadata": {}, + "outputs": [], + "source": [ + "# Unforeseen consequence of renaming: now I have to scroll horizontally." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c0b5504f", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename columns once more\n", + "\n", + "df.rename(columns = {\"DOF_Gross_Square_Footage\": \"Sq_Footage\",\n", + " \"Energy_Star_Score\": \"Energy_Score\",\n", + " \"Borough_Name\": \"Borough\",\n", + " \"Building_Class\": \"Bldg_Class\",\n", + " \"Building_Count\": \"Bldg_Count\"\n", + " }, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "0d3cf300", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage Address \\\n", + "0 1 10 Y4 0 124 2598091 920 GRESHAM ROAD \n", + "\n", + " Borough BBL Energy_Score Letter_Score \n", + "0 MANHATTAN 1000010010 1 D " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c1c2e027", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Block 0\n", + "Lot 0\n", + "Bldg_Class 2\n", + "Tax_Class 0\n", + "Bldg_Count 0\n", + "Sq_Footage 0\n", + "Address 7\n", + "Borough 0\n", + "BBL 0\n", + "Energy_Score 0\n", + "Letter_Score 0\n", + "dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Repeat the search for missing data\n", + "\n", + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5debf1d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Ignore the 7 missing addresses, but drop the 2 rows with missing Building Class.\n", + "# Building Class is a feature that will be used in the df.groupby() function.\n", + "\n", + "df.dropna(subset = ['Bldg_Class'], inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "5d2eb339", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Block 0\n", + "Lot 0\n", + "Bldg_Class 0\n", + "Tax_Class 0\n", + "Bldg_Count 0\n", + "Sq_Footage 0\n", + "Address 7\n", + "Borough 0\n", + "BBL 0\n", + "Energy_Score 0\n", + "Letter_Score 0\n", + "dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "3b5525b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# Search for unexpected data\n", + "\n", + "# df['Energy_Score'].min() # looks good\n", + "# df['Energy_Score'].max() # looks good\n", + "# df['Sq_Footage'].max() # looks good\n", + "df['Sq_Footage'].min()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c1f3edc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
A3333333333
B1111111111
C5555555555
D6666666666
F14141414141414141414
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", + "Letter_Score \n", + "A 3 3 3 3 3 3 \n", + "B 1 1 1 1 1 1 \n", + "C 5 5 5 5 5 5 \n", + "D 6 6 6 6 6 6 \n", + "F 14 14 14 14 14 14 \n", + "\n", + " Address Borough BBL Energy_Score \n", + "Letter_Score \n", + "A 3 3 3 3 \n", + "B 1 1 1 1 \n", + "C 5 5 5 5 \n", + "D 6 6 6 6 \n", + "F 14 14 14 14 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A building cannot have zero square feet of floorspace.\n", + "# What's going on?\n", + "\n", + "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e27467ce", + "metadata": {}, + "outputs": [], + "source": [ + "# The ones with F can be explained:\n", + "# An F grade means that the building owner \"didn’t submit required benchmarking information\",\n", + "# according to Local Law 95 of 2019. So it's not that the building has no square footage,\n", + "# but that the data was not submitted. Thus the failing grade.\n", + "\n", + "# We'll leave 0 square feet with F grade untouched.\n", + "\n", + "# For more information, see https://www1.nyc.gov/site/buildings/codes/benchmarking.page" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b73e15d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
A3333333333
B1111111111
C5555555555
D6666666666
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", + "Letter_Score \n", + "A 3 3 3 3 3 3 \n", + "B 1 1 1 1 1 1 \n", + "C 5 5 5 5 5 5 \n", + "D 6 6 6 6 6 6 \n", + "\n", + " Address Borough BBL Energy_Score \n", + "Letter_Score \n", + "A 3 3 3 3 \n", + "B 1 1 1 1 \n", + "C 5 5 5 5 \n", + "D 6 6 6 6 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# What to do with the others?\n", + "\n", + "df[(df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')].groupby(['Letter_Score']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "2d9bea8f", + "metadata": {}, + "outputs": [], + "source": [ + "# 15 rows remain with 0 square feet of floorspace.\n", + "# Can we impute values from the mean square footage for each grade?\n", + "\n", + "# (There must be an elegant way to do this. What you see below is not.)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "6eb73792", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sq_Footage
Letter_Score
A111197.291071
B133270.963702
C128833.575964
D108170.778312
\n", + "
" + ], + "text/plain": [ + " Sq_Footage\n", + "Letter_Score \n", + "A 111197.291071\n", + "B 133270.963702\n", + "C 128833.575964\n", + "D 108170.778312" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First, get averages\n", + "\n", + "subset0 = df[['Letter_Score', 'Sq_Footage']]\n", + "subset1 = subset0[(subset0['Letter_Score'] != 'F') & (subset0['Sq_Footage'] != 0)]\n", + "subset1.groupby(['Letter_Score']).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "8666f836", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the per-grade means from the data (rounded to whole numbers) rather than hard-coding them\n", + "\n", + "mean_sq_footage = (\n", + "    subset1.groupby('Letter_Score')['Sq_Footage']\n", + "    .mean().round().astype('int64')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "73900853", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace 0 values with the mean for the row's grade; F grades (and grades with no known mean) stay 0\n", + "\n", + "needs_imputation = (df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')\n", + "df.loc[needs_imputation, 'Sq_Footage'] = (\n", + "    df.loc[needs_imputation, 'Letter_Score'].map(mean_sq_footage).fillna(0).astype('int64')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "37eb13ea", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
F14141414141414141414
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", + "Letter_Score \n", + "F 14 14 14 14 14 14 \n", + "\n", + " Address Borough BBL Energy_Score \n", + "Letter_Score \n", + "F 14 14 14 14 " + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now the only 0 values should be for F grades\n", + "\n", + "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "913b2ae6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Block int64\n", + "Lot int64\n", + "Bldg_Class object\n", + "Tax_Class int64\n", + "Bldg_Count int64\n", + "Sq_Footage int64\n", + "Address object\n", + "Borough object\n", + "BBL int64\n", + "Energy_Score int64\n", + "Letter_Score object\n", + "dtype: object" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# delete this cell\n", + "\n", + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "007f1189", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
113191497502U7000138 WILLOUGHBY STREETBROOKLYN-21474836480F
116111657504U70013327135 HOYT STREETBROOKLYN-214748364875B
1335158042U6000COLONIAL ROADBROOKLYN-21474836480F
1457053224V10011119723 OCEAN PARKWAYBROOKLYN-2147483648100A
14668579959D900383153641 JOHNSON AVENUEBRONX20579900590F
157264282100V100025-70 REAR WHITESTONE EXPRESSWAY SR WESTQUEENS-21474836480F
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", + "11319 149 7502 U7 0 0 0 \n", + "11611 165 7504 U7 0 0 133271 \n", + "13351 5804 2 U6 0 0 0 \n", + "14570 5322 4 V1 0 0 111197 \n", + "14668 5799 59 D9 0 0 38315 \n", + "15726 4282 100 V1 0 0 0 \n", + "\n", + " Address Borough BBL \\\n", + "11319 138 WILLOUGHBY STREET BROOKLYN -2147483648 \n", + "11611 35 HOYT STREET BROOKLYN -2147483648 \n", + "13351 COLONIAL ROAD BROOKLYN -2147483648 \n", + "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 \n", + "14668 3641 JOHNSON AVENUE BRONX 2057990059 \n", + "15726 25-70 REAR WHITESTONE EXPRESSWAY SR WEST QUEENS -2147483648 \n", + "\n", + " Energy_Score Letter_Score \n", + "11319 0 F \n", + "11611 75 B \n", + "13351 0 F \n", + "14570 100 A \n", + "14668 0 F \n", + "15726 0 F " + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Unexpected values, continued\n", + "\n", + "df[df['Bldg_Count'] == 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "3479152f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
116111657504U70013327135 HOYT STREETBROOKLYN-214748364875B
1457053224V10011119723 OCEAN PARKWAYBROOKLYN-2147483648100A
\n", + "
" + ], + "text/plain": [ + " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", + "11611 165 7504 U7 0 0 133271 \n", + "14570 5322 4 V1 0 0 111197 \n", + "\n", + " Address Borough BBL Energy_Score Letter_Score \n", + "11611 35 HOYT STREET BROOKLYN -2147483648 75 B \n", + "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 100 A " + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# How can a block have zero buildings?\n", + "# Again, we'll leave the F grades as is.\n", + "\n", + "df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "6adfd92a", + "metadata": {}, + "outputs": [], + "source": [ + "# Have a peek at the Department of Finance Tax Map: http://gis.nyc.gov/taxmap/map.htm\n", + "\n", + "# Looks like Bldg_Count = 1 for both. However, I'm not comfortable with imputing data\n", + "# by eyeballing it." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "7503be1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's just drop them (selected by condition rather than by index label, so re-running this cell is safe)\n", + "\n", + "df.drop(df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')].index, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab6ecc85", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}