{ "cells": [ { "cell_type": "markdown", "id": "58dfeab5", "metadata": {}, "source": [ "# Working title: working subtitle\n", "\n", "## initial remarks" ] }, { "cell_type": "code", "execution_count": 1, "id": "fe05b4a4", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "id": "39a4ce3f", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('~/python_class/DOB_Sustainability_Compliance_Map__Local_Law_33.csv')" ] }, { "cell_type": "markdown", "id": "e0e97c85", "metadata": {}, "source": [ "## Part 1: Data Exploration" ] }, { "cell_type": "code", "execution_count": 3, "id": "6b430c20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(21681, 11)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 4, "id": "917a6779", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
1223T2012434620 SOUTH STREETMANHATTAN10000200230F
247501R02125425631 WATER STREETMANHATTAN100004750161C
\n", "
" ], "text/plain": [ " Block Lot Building_Class Tax_Class Building_Count \\\n", "0 1 10 Y4 0 124 \n", "1 2 23 T2 0 1 \n", "2 4 7501 R0 2 1 \n", "\n", " DOF_Gross_Square_Footage Address BoroughName BBL \\\n", "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n", "1 24346 20 SOUTH STREET MANHATTAN 1000020023 \n", "2 2542563 1 WATER STREET MANHATTAN 1000047501 \n", "\n", " ENERGY STAR Score LetterScore \n", "0 1 D \n", "1 0 F \n", "2 61 C " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "id": "38d0ac47", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Block', 'Lot', 'Building_Class', 'Tax_Class', 'Building_Count',\n", " 'DOF_Gross_Square_Footage', 'Address', 'BoroughName', 'BBL',\n", " 'ENERGY STAR Score', 'LetterScore'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "adf4092b", "metadata": {}, "outputs": [], "source": [ "# Columns seem to be self-explanatory, except BBL. According to NYC OpenData:\n", "# \"Borough Block and Lot identifier as assigned by NYC Department of Finance\"." ] }, { "cell_type": "code", "execution_count": 7, "id": "276d9619", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MANHATTAN 7858\n", "BROOKLYN 5469\n", "BRONX 4349\n", "QUEENS 3659\n", "STATEN ISLAND 346\n", "Name: BoroughName, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Is this dataset citywide? 
or just Manhattan?\n", "\n", "df['BoroughName'].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "id": "d3c8c305", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Block 0\n", "Lot 0\n", "Building_Class 2\n", "Tax_Class 0\n", "Building_Count 0\n", "DOF_Gross_Square_Footage 0\n", "Address 7\n", "BoroughName 0\n", "BBL 0\n", "ENERGY STAR Score 0\n", "LetterScore 0\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Missing data?\n", "\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 9, "id": "64eb852e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
425415957501NaN0001330 5 AVENUEMANHATTAN101595003164C
812430167502NaN0001926 LONGFELLOW AVENUEBRONX2030160038100A
\n", "
" ], "text/plain": [ " Block Lot Building_Class Tax_Class Building_Count \\\n", "4254 1595 7501 NaN 0 0 \n", "8124 3016 7502 NaN 0 0 \n", "\n", " DOF_Gross_Square_Footage Address BoroughName \\\n", "4254 0 1330 5 AVENUE MANHATTAN \n", "8124 0 1926 LONGFELLOW AVENUE BRONX \n", "\n", " BBL ENERGY STAR Score LetterScore \n", "4254 1015950031 64 C \n", "8124 2030160038 100 A " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['Building_Class'].isna()]" ] }, { "cell_type": "code", "execution_count": 10, "id": "cdf678d2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBoroughNameBBLENERGY STAR ScoreLetterScore
122850612W30149475NaNMANHATTAN100506001210D
714517341I1051017118NaNMANHATTAN10173400017D
922527586N90117200NaNBRONX202758000689A
9226275836N90137060NaNBRONX202758003666C
13711176972C10130720NaNBROOKLYN-21474836480F
15056160213C10114720NaNBROOKLYN-21474836480F
16381375522C10125564NaNBROOKLYN-21474836480F
\n", "
" ], "text/plain": [ " Block Lot Building_Class Tax_Class Building_Count \\\n", "1228 506 12 W3 0 1 \n", "7145 1734 1 I1 0 5 \n", "9225 2758 6 N9 0 1 \n", "9226 2758 36 N9 0 1 \n", "13711 1769 72 C1 0 1 \n", "15056 1602 13 C1 0 1 \n", "16381 3755 22 C1 0 1 \n", "\n", " DOF_Gross_Square_Footage Address BoroughName BBL \\\n", "1228 49475 NaN MANHATTAN 1005060012 \n", "7145 1017118 NaN MANHATTAN 1017340001 \n", "9225 17200 NaN BRONX 2027580006 \n", "9226 37060 NaN BRONX 2027580036 \n", "13711 30720 NaN BROOKLYN -2147483648 \n", "15056 14720 NaN BROOKLYN -2147483648 \n", "16381 25564 NaN BROOKLYN -2147483648 \n", "\n", " ENERGY STAR Score LetterScore \n", "1228 10 D \n", "7145 7 D \n", "9225 89 A \n", "9226 66 C \n", "13711 0 F \n", "15056 0 F \n", "16381 0 F " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['Address'].isna()]" ] }, { "cell_type": "code", "execution_count": 11, "id": "e205df03", "metadata": {}, "outputs": [], "source": [ "# Missing Address is not a big deal because the rest of the values are complete.\n", "# But missing Building Class could be significant.\n", "\n", "# The two offending rows also have Building Count = 0.\n", "# How is that possible, since they have Energy Star scores?\n", "\n", "# In the next section we may decide to drop those two rows."
] }, { "cell_type": "markdown", "id": "4d539a8c", "metadata": {}, "source": [ "## Part 2: Data Cleaning" ] }, { "cell_type": "code", "execution_count": 12, "id": "614dbd9f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Block int64\n", "Lot int64\n", "Building_Class object\n", "Tax_Class int64\n", "Building_Count int64\n", "DOF_Gross_Square_Footage int64\n", "Address object\n", "BoroughName object\n", "BBL int64\n", "ENERGY STAR Score int64\n", "LetterScore object\n", "dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Which columns are informative?\n", "\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 13, "id": "6c58a084", "metadata": {}, "outputs": [], "source": [ "# Interesting for analysis:\n", "\n", "# DOF_Gross_Square_Footage\n", "# ENERGY STAR Score\n", "# LetterScore\n", "\n", "# Other columns are less interesting:\n", "\n", "# Building_Count is the number of buildings in one Block.\n", "# A Block can have more than one Lot, but a Lot only has one Block.\n", "# Block, Lot and BBL are identifiers assigned by the city.\n", "\n", "# A good visual reference is the Digital Tax Map put out by the NYC Department of Finance:\n", "# http://gis.nyc.gov/taxmap/map.htm" ] }, { "cell_type": "code", "execution_count": 14, "id": "14213bd2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Can any identifiers be used as an index?\n", "\n", "df['Block'].is_unique" ] }, { "cell_type": "code", "execution_count": 15, "id": "1e1a5e9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Lot'].is_unique" ] }, { "cell_type": "code", "execution_count": 16, "id": "67b7f633", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 16, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "df['BBL'].is_unique" ] }, { "cell_type": "code", "execution_count": 17, "id": "c4469ca8", "metadata": {}, "outputs": [], "source": [ "# Since their values are not unique, they cannot be used as an index." ] }, { "cell_type": "code", "execution_count": 18, "id": "782190b5", "metadata": {}, "outputs": [], "source": [ "# Shall we rename or discard any columns from this dataset?\n", "\n", "# BBL could be eliminated. However, there are only 11 columns total, and since df.head() is easily readable on my monitor without scrolling horizontally (as you're doing now), I see no harm in keeping it." ] }, { "cell_type": "code", "execution_count": 19, "id": "e085ba33", "metadata": {}, "outputs": [], "source": [ "# Rename columns containing whitespace or camelcase\n", "\n", "df.rename(columns = {\"BoroughName\": \"Borough_Name\",\n", " \"ENERGY STAR Score\": \"Energy_Star_Score\",\n", " \"LetterScore\": \"Letter_Score\"\n", " }, inplace = True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "c4a8ebb7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBuilding_ClassTax_ClassBuilding_CountDOF_Gross_Square_FootageAddressBorough_NameBBLEnergy_Star_ScoreLetter_Score
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
\n", "
" ], "text/plain": [ " Block Lot Building_Class Tax_Class Building_Count \\\n", "0 1 10 Y4 0 124 \n", "\n", " DOF_Gross_Square_Footage Address Borough_Name BBL \\\n", "0 2598091 920 GRESHAM ROAD MANHATTAN 1000010010 \n", "\n", " Energy_Star_Score Letter_Score \n", "0 1 D " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(1)" ] }, { "cell_type": "code", "execution_count": 21, "id": "38de98e9", "metadata": {}, "outputs": [], "source": [ "# Unforeseen consequence of renaming: now I have to scroll horizontally." ] }, { "cell_type": "code", "execution_count": 22, "id": "c0b5504f", "metadata": {}, "outputs": [], "source": [ "# Rename columns once more\n", "\n", "df.rename(columns = {\"DOF_Gross_Square_Footage\": \"Sq_Footage\",\n", " \"Energy_Star_Score\": \"Energy_Score\",\n", " \"Borough_Name\": \"Borough\",\n", " \"Building_Class\": \"Bldg_Class\",\n", " \"Building_Count\": \"Bldg_Count\"\n", " }, inplace = True)" ] }, { "cell_type": "code", "execution_count": 23, "id": "0d3cf300", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
0110Y401242598091920 GRESHAM ROADMANHATTAN10000100101D
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage Address \\\n", "0 1 10 Y4 0 124 2598091 920 GRESHAM ROAD \n", "\n", " Borough BBL Energy_Score Letter_Score \n", "0 MANHATTAN 1000010010 1 D " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(1)" ] }, { "cell_type": "code", "execution_count": 24, "id": "c1c2e027", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Block 0\n", "Lot 0\n", "Bldg_Class 2\n", "Tax_Class 0\n", "Bldg_Count 0\n", "Sq_Footage 0\n", "Address 7\n", "Borough 0\n", "BBL 0\n", "Energy_Score 0\n", "Letter_Score 0\n", "dtype: int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Repeat the search for missing data\n", "\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 25, "id": "5debf1d6", "metadata": {}, "outputs": [], "source": [ "# Ignore the 7 missing addresses, but drop the 2 rows with missing Building Class.\n", "# Building Class is a feature that will be used in the df.groupby() function.\n", "\n", "df.dropna(subset = ['Bldg_Class'], inplace = True)" ] }, { "cell_type": "code", "execution_count": 26, "id": "5d2eb339", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Block 0\n", "Lot 0\n", "Bldg_Class 0\n", "Tax_Class 0\n", "Bldg_Count 0\n", "Sq_Footage 0\n", "Address 7\n", "Borough 0\n", "BBL 0\n", "Energy_Score 0\n", "Letter_Score 0\n", "dtype: int64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 28, "id": "3b5525b0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Search for unexpected data\n", "\n", "# df['Energy_Score'].min() # looks good\n", "# df['Energy_Score'].max() # looks good\n", "# df['Sq_Footage'].max() # looks good\n", "df['Sq_Footage'].min()" ] }, { 
"cell_type": "code", "execution_count": 29, "id": "c1f3edc4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
A3333333333
B1111111111
C5555555555
D6666666666
F14141414141414141414
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", "Letter_Score \n", "A 3 3 3 3 3 3 \n", "B 1 1 1 1 1 1 \n", "C 5 5 5 5 5 5 \n", "D 6 6 6 6 6 6 \n", "F 14 14 14 14 14 14 \n", "\n", " Address Borough BBL Energy_Score \n", "Letter_Score \n", "A 3 3 3 3 \n", "B 1 1 1 1 \n", "C 5 5 5 5 \n", "D 6 6 6 6 \n", "F 14 14 14 14 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A building cannot have zero square feet of floorspace.\n", "# What's going on?\n", "\n", "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()" ] }, { "cell_type": "code", "execution_count": 30, "id": "e27467ce", "metadata": {}, "outputs": [], "source": [ "# The ones with F can be explained:\n", "# An F grade means that the building owner \"didn’t submit required benchmarking information\",\n", "# according to Local Law 95 of 2019. So it's not that the building has no square footage,\n", "# but that the data was not submitted. Thus the failing grade.\n", "\n", "# We'll leave 0 square feet with F grade untouched.\n", "\n", "# For more information, see https://www1.nyc.gov/site/buildings/codes/benchmarking.page" ] }, { "cell_type": "code", "execution_count": 35, "id": "b73e15d9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
A3333333333
B1111111111
C5555555555
D6666666666
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", "Letter_Score \n", "A 3 3 3 3 3 3 \n", "B 1 1 1 1 1 1 \n", "C 5 5 5 5 5 5 \n", "D 6 6 6 6 6 6 \n", "\n", " Address Borough BBL Energy_Score \n", "Letter_Score \n", "A 3 3 3 3 \n", "B 1 1 1 1 \n", "C 5 5 5 5 \n", "D 6 6 6 6 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What to do with the others?\n", "\n", "df[(df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')].groupby(['Letter_Score']).count()" ] }, { "cell_type": "code", "execution_count": 53, "id": "2d9bea8f", "metadata": {}, "outputs": [], "source": [ "# 15 rows remain with 0 square feet of floorspace.\n", "# Can we impute values from the mean square footage for each grade?\n", "\n", "# (There must be an elegant way to do this. What you see below is not.)" ] }, { "cell_type": "code", "execution_count": 67, "id": "6eb73792", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sq_Footage
Letter_Score
A111197.291071
B133270.963702
C128833.575964
D108170.778312
\n", "
" ], "text/plain": [ " Sq_Footage\n", "Letter_Score \n", "A 111197.291071\n", "B 133270.963702\n", "C 128833.575964\n", "D 108170.778312" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First, get averages\n", "\n", "subset0 = df[['Letter_Score', 'Sq_Footage']]\n", "subset1 = subset0[(subset0['Letter_Score'] != 'F') & (subset0['Sq_Footage'] != 0)]\n", "subset1.groupby(['Letter_Score']).mean()" ] }, { "cell_type": "code", "execution_count": 77, "id": "8666f836", "metadata": {}, "outputs": [], "source": [ "# Per-grade mean square footage, computed from subset1 (non-F, non-zero rows)\n", "# rather than copied by hand from the output above, so it cannot go stale.\n", "# Rounded to whole numbers so that Sq_Footage stays an integer column.\n", "\n", "grade_means = subset1.groupby('Letter_Score')['Sq_Footage'].mean().round().astype('int64')" ] }, { "cell_type": "code", "execution_count": 79, "id": "73900853", "metadata": {}, "outputs": [], "source": [ "# Replace 0 values with the mean of the row's own grade.\n", "# Only grades present in grade_means are filled, so F grades stay untouched\n", "# and re-running this cell is a harmless no-op.\n", "\n", "needs_fill = (df['Sq_Footage'] == 0) & df['Letter_Score'].isin(grade_means.index)\n", "fill_values = df.loc[needs_fill, 'Letter_Score'].map(grade_means).astype('int64')\n", "df.loc[needs_fill, 'Sq_Footage'] = fill_values" ] }, { "cell_type": "code", "execution_count": 81, "id": "37eb13ea", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_Score
Letter_Score
F14141414141414141414
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", "Letter_Score \n", "F 14 14 14 14 14 14 \n", "\n", " Address Borough BBL Energy_Score \n", "Letter_Score \n", "F 14 14 14 14 " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now the only 0 values should be for F grades\n", "\n", "df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()" ] }, { "cell_type": "code", "execution_count": 82, "id": "913b2ae6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Block int64\n", "Lot int64\n", "Bldg_Class object\n", "Tax_Class int64\n", "Bldg_Count int64\n", "Sq_Footage int64\n", "Address object\n", "Borough object\n", "BBL int64\n", "Energy_Score int64\n", "Letter_Score object\n", "dtype: object" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# delete this cell\n", "\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 92, "id": "007f1189", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
113191497502U7000138 WILLOUGHBY STREETBROOKLYN-21474836480F
116111657504U70013327135 HOYT STREETBROOKLYN-214748364875B
1335158042U6000COLONIAL ROADBROOKLYN-21474836480F
1457053224V10011119723 OCEAN PARKWAYBROOKLYN-2147483648100A
14668579959D900383153641 JOHNSON AVENUEBRONX20579900590F
157264282100V100025-70 REAR WHITESTONE EXPRESSWAY SR WESTQUEENS-21474836480F
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", "11319 149 7502 U7 0 0 0 \n", "11611 165 7504 U7 0 0 133271 \n", "13351 5804 2 U6 0 0 0 \n", "14570 5322 4 V1 0 0 111197 \n", "14668 5799 59 D9 0 0 38315 \n", "15726 4282 100 V1 0 0 0 \n", "\n", " Address Borough BBL \\\n", "11319 138 WILLOUGHBY STREET BROOKLYN -2147483648 \n", "11611 35 HOYT STREET BROOKLYN -2147483648 \n", "13351 COLONIAL ROAD BROOKLYN -2147483648 \n", "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 \n", "14668 3641 JOHNSON AVENUE BRONX 2057990059 \n", "15726 25-70 REAR WHITESTONE EXPRESSWAY SR WEST QUEENS -2147483648 \n", "\n", " Energy_Score Letter_Score \n", "11319 0 F \n", "11611 75 B \n", "13351 0 F \n", "14570 100 A \n", "14668 0 F \n", "15726 0 F " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Unexpected values, continued\n", "\n", "df[df['Bldg_Count'] == 0]" ] }, { "cell_type": "code", "execution_count": 93, "id": "3479152f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BlockLotBldg_ClassTax_ClassBldg_CountSq_FootageAddressBoroughBBLEnergy_ScoreLetter_Score
116111657504U70013327135 HOYT STREETBROOKLYN-214748364875B
1457053224V10011119723 OCEAN PARKWAYBROOKLYN-2147483648100A
\n", "
" ], "text/plain": [ " Block Lot Bldg_Class Tax_Class Bldg_Count Sq_Footage \\\n", "11611 165 7504 U7 0 0 133271 \n", "14570 5322 4 V1 0 0 111197 \n", "\n", " Address Borough BBL Energy_Score Letter_Score \n", "11611 35 HOYT STREET BROOKLYN -2147483648 75 B \n", "14570 23 OCEAN PARKWAY BROOKLYN -2147483648 100 A " ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How can a block have zero buildings?\n", "# Again, we'll leave the F grades as is.\n", "\n", "df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')]" ] }, { "cell_type": "code", "execution_count": 94, "id": "6adfd92a", "metadata": {}, "outputs": [], "source": [ "# Have a peek at the Department of Finance Tax Map: http://gis.nyc.gov/taxmap/map.htm\n", "\n", "# Looks like Bldg_Count = 1 for both. However, I'm not comfortable with imputing data\n", "# by eyeballing it." ] }, { "cell_type": "code", "execution_count": 95, "id": "7503be1f", "metadata": {}, "outputs": [], "source": [ "# Let's just drop them.\n", "# Select the rows by the same condition used above instead of hard-coding their\n", "# index labels, so the cell still works if the data changes and can be re-run\n", "# without raising a KeyError.\n", "\n", "unexplained_zero_bldg = (df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')\n", "df.drop(df.index[unexplained_zero_bldg], inplace = True)" ] }, { "cell_type": "code", "execution_count": null, "id": "ab6ecc85", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 5 }